Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
private tokenLookupMap: Map<number, ContextToken>;
private lastTotalCost: number;
private handleHasBeenCalled: boolean = false;
private predictableMatchFound: boolean = false;

get currentCost(): number {
const correctable = this.selectionQueue.peek();
Expand Down Expand Up @@ -276,8 +277,9 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
const correctableToUpdate = this.selectionQueue.dequeue();
const tokenResult = correctableToUpdate?.handleNextNode();

const correctionIsThePredictable = correctableToUpdate == this._predictable;
const delistCorrectable = () => {
if(correctableToUpdate != this._predictable) {
if(!correctionIsThePredictable) {
// Lock the 'correctable' token now that either a valid correction for
// it has been found or all possible corrections are exhausted. We only
// consider a single correction for most of a tokenization's tokens,
Expand All @@ -289,18 +291,24 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
}

if(tokenResult.type == 'none') {
// Transition the node from 'correctable' to 'uncorrectable' - we were
// unable to find valid corrections for it.
const lockedResult = correctableToUpdate.bestExample;
this._generatedTokenResults.set(correctableToUpdate.spaceId, {
matchString: lockedResult.text,
inputSamplingCost: -Math.log(lockedResult.p),
knownCost: MAX_EDIT_THRESHOLD_FACTOR, // we'll use the same threshold at which further search is terminated.
totalCost: -Math.log(lockedResult.p) + MAX_EDIT_THRESHOLD_FACTOR * EDIT_DISTANCE_COST_SCALE
});
// If it's a correction, or if we were unable to find a correction for
// the predictable token - both cases need a 'default' entry.
if(!correctionIsThePredictable || !this.predictableMatchFound) {
// Transition the node from 'correctable' to 'uncorrectable' - we were
// unable to find valid corrections for it.
const lockedResult = correctableToUpdate.bestExample;
this._generatedTokenResults.set(correctableToUpdate.spaceId, {
matchString: lockedResult.text,
inputSamplingCost: -Math.log(lockedResult.p),
knownCost: MAX_EDIT_THRESHOLD_FACTOR, // we'll use the same threshold at which further search is terminated.
totalCost: -Math.log(lockedResult.p) + MAX_EDIT_THRESHOLD_FACTOR * EDIT_DISTANCE_COST_SCALE
});
}

// We can make no further predictions if we've exhausted all search options.
if(correctableToUpdate == this._predictable) {
// If we've reached this case, we're likely at the end of the search
// (unless correction for a correctable is still possible).
if(correctionIsThePredictable) {
this._uncorrectables.push(correctableToUpdate);
delete this._predictable;
} else {
Expand All @@ -312,6 +320,10 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
// for other corrective variations of the 'predictable'.
delistCorrectable();

if(correctionIsThePredictable) {
this.predictableMatchFound = true;
}

// Either way, update the token -> correction-string map with the obtained result.
this._generatedTokenResults.set(correctableToUpdate.spaceId, tokenResult.mapping);
}
Expand Down Expand Up @@ -351,11 +363,20 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
}

// Determine the proper return type and construct the proper return object accordingly.
this._previousResults.push(correctionResults);
return {
type: 'complete',
cost: tokenizationCost,
mapping: correctionResults
};
//
// If there was no result obtained from the predictable and a result was previously found,
// that indicates no further predictions may be found.
if(tokenResult.type != 'none' || !correctionIsThePredictable || !this.predictableMatchFound) {
this._previousResults.push(correctionResults);
return {
type: 'complete',
cost: tokenizationCost,
mapping: correctionResults
};
} else {
return {
type: 'none'
};
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
} from '@keymanapp/lm-worker/test-index';

import Distribution = LexicalModelTypes.Distribution;
import ProbabilityMass = LexicalModelTypes.ProbabilityMass;
import TrieModel = models.TrieModel;
import Transform = LexicalModelTypes.Transform;

Expand Down Expand Up @@ -326,6 +327,70 @@ describe('TokenizationCorrector', () => {
assert.equal(searchResult.type, 'none');
});

it('finds a default correction for a single correctable token without a model match', () => {
  // Builds a probability-1 input that inserts `insert` (deleting nothing) under the given transform id.
  const makeInsertion = (insert: string, id: number): ProbabilityMass<Transform> => ({
    sample: { insert, deleteLeft: 0, id },
    p: 1
  });

  const fixture = buildFixture_therefore();

  // Chain three substitution spurs ('x', 'y', 'z') onto the search module of
  // the fixture's `theref.tail` token; the assertions below expect the
  // resulting token text to be 'therefxyz'.
  const baseToken = fixture.theref.tail;
  const inputX = makeInsertion('x', 123);
  const inputY = makeInsertion('y', 124);
  const inputZ = makeInsertion('z', 125);
  const spurX = new SubstitutionQuotientSpur(baseToken.searchModule, [inputX], inputX);
  const spurXY = new SubstitutionQuotientSpur(spurX, [inputY], inputY);
  const spurXYZ = new SubstitutionQuotientSpur(spurXY, [inputZ], inputZ);

  // A tokenization holding only the extended token, handed to the corrector under test.
  const therefxyz = new ContextToken(spurXYZ);
  const corrector = new TokenizationCorrector(
    new ContextTokenization([therefxyz], null, null),
    1,
    fixture.filter
  );

  // Keep stepping the search for as long as it only reports intermediate progress.
  let outcome: PathResult<TokenizationResultMapping> = corrector.handleNextNode();
  while(outcome.type == 'intermediate') {
    outcome = corrector.handleNextNode();
  }

  assert.equal(outcome.type, 'complete');
  if(outcome.type == 'complete') {
    const { cost, mapping } = outcome;
    const matched = mapping.matchedResult;
    assert.isNotNaN(cost);
    assert.equal(cost, mapping.totalCost);
    assert.equal(matched.length, 1);
    // The single entry should echo the uncorrected token text.
    assert.sameOrderedMembers(matched.map((entry) => entry.matchString), ['therefxyz']);

    // With the entry produced, check the corrector's state: the predictable
    // token slot should be empty, and the stored result for the token
    // should be the one that was returned.
    assert.isNotOk(corrector.predictableToken);
    assert.isTrue(corrector.generatedTokenResults.has(therefxyz));
    assert.equal(corrector.generatedTokenResults.get(therefxyz), matched[0]);
  }

  // One more step must report that nothing further can be suggested.
  outcome = corrector.handleNextNode();
  assert.equal(outcome.type, 'none');
});

it('finds corrections for a group of tokens with two correctable', () => {
const fixture = buildFixture_therefore();

Expand Down
Loading