Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,8 @@ export class ContextState {
// into subsets.
const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0);
// Should gain one per subsetBuilder.subsets entry.
const resultTokenization = baseTokenization.evaluateTransition(tokenizationAnalysis, lexicalModel, trueInput, bestProb);
const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment);
const resultTokenization = realignedTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb);

// ------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,18 @@ export class ContextTokenization {
* The sequence of tokens in the context represented by this instance.
*/
readonly tokens: ContextToken[];

/**
* The tokenization-transition metadata relating this instance to the most likely
* tokenization from a prior state.
* Denotes whether or not the transition to this tokenization added or deleted
* any tokens.
*/
readonly transitionEdits?: TransitionEdge;
readonly transitionEdits?: {
addedNewTokens: boolean,
removedOldTokens: boolean,
// NOTE: slated for removal in an upcoming PR. Exists in this form to
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, #15727.

// facilitate factorization of the changes into smaller bodies of work.
editedTokenCount: number
};

/**
* The portion of edits from the true input keystroke that are not part of the
Expand All @@ -129,13 +136,19 @@ export class ContextTokenization {
constructor(tokens: ContextToken[], alignment: TransitionEdge, taillessTrueKeystroke: Transform);
constructor(
param1: ContextToken[] | ContextTokenization,
alignment?: TransitionEdge,
tokenizationPath?: TransitionEdge,
taillessTrueKeystroke?: Transform
) {
if(!(param1 instanceof ContextTokenization)) {
const tokens = param1;
this.tokens = [].concat(tokens);
this.transitionEdits = alignment;
if(tokenizationPath) {
this.transitionEdits = {
addedNewTokens: tokenizationPath?.inputs[0].sample.has(1) ?? false,
removedOldTokens: (tokenizationPath?.alignment.removedTokenCount ?? 0) > 0,
editedTokenCount: tokenizationPath?.inputs[0].sample.size
}
}
this.taillessTrueKeystroke = taillessTrueKeystroke;
} else {
const priorToClone = param1;
Expand Down Expand Up @@ -489,30 +502,16 @@ export class ContextTokenization {

/**
* Given results from `precomputeTokenizationAfterInput`, this method will
* evaluate the pending transition in tokenization for all associated inputs
* realign this tokenization's range to match the incoming keystroke's context window
* while reusing as many correction-search intermediate results as possible.
* @param transitionEdge Batched results from one or more
* @param alignment Batched results from one or more
* `precomputeTokenizationAfterInput` calls on this instance, all with the
* same alignment values.
* @param lexicalModel The active lexical model
* @param sourceInput The Transform associated with the keystroke triggering
* the transition.
* @param bestProbFromSet The probability of the single most likely input
* transform in the overall transformDistribution associated with the
* keystroke triggering the transition. It need not be represented by the
* TransitionEdge to be built.
* @returns
*/
evaluateTransition(
transitionEdge: TransitionEdge,
lexicalModel: LexicalModel,
sourceInput: Transform,
bestProbFromSet: number
): ContextTokenization {
const { alignment: alignment, inputs } = transitionEdge;
realign(alignment: TransitionEdgeAlignment): ContextTokenization {
const sliceIndex = alignment.edgeWindow.sliceIndex;
const baseTokenization = this.tokens.slice(sliceIndex);
let affectedToken: ContextToken;

const tokenization: ContextToken[] = [];

Expand Down Expand Up @@ -553,29 +552,63 @@ export class ContextTokenization {
tokenization.push(token);
}

return new ContextTokenization(this.tokens.slice(0, sliceIndex).concat(tokenization), null, this.taillessTrueKeystroke);
}

/**
* Given results from `precomputeTokenizationAfterInput`, this method will
* evaluate the pending transition in tokenization for all associated inputs
* while reusing as many correction-search intermediate results as possible.
* @param transitionEdge Batched results from one or more
* `precomputeTokenizationAfterInput` calls on this instance, all with the
* same alignment values.
* @param transitionId The id of the Transform associated with the keystroke
* triggering the transition.
* @param bestProbFromSet The probability of the single most likely input
* transform in the overall transformDistribution associated with the
* keystroke triggering the transition. It need not be represented by the
* tokenizationPath to be built.
* @param appliedSuggestionId
* @returns
*/
evaluateTransition(
transitionEdge: TransitionEdge,
transitionId: number,
bestProbFromSet: number,
appliedSuggestionId?: number
): ContextTokenization {
const { alignment, inputs } = transitionEdge;
const sliceIndex = alignment.edgeWindow.sliceIndex;
const lexicalModel = this.tail.searchModule.model;

let affectedToken: ContextToken;

const tailTokenization = this.tokens.slice(sliceIndex);

// Assumption: inputs.length > 0. (There is at least one input transform.)
const inputTransformKeys = [...inputs[0].sample.keys()];
const baseTailIndex = (tailTokenization.length - 1);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be done after removing the tokens from tailTokenization? Otherwise baseTailIndex might point to an index that is no longer valid.

Copy link
Copy Markdown
Contributor Author

@jahorton jahorton Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this is still correct. Inputs to be applied are tokenized elsewhere, and those tokens are indexed relative to this specific index - the location of the last pre-edit context token. For such cases, the 'first' such token index (and possibly more!), as obtained from inputs[0].sample.keys(), will be negative.

This is enforced in ContextTokenization.mapWhitespacedTokenization and .assembleTransforms, which together produce the key-values obtained by the block of code reviewed here.

let removedTokenCount = alignment.removedTokenCount;
while(removedTokenCount-- > 0) {
inputTransformKeys.pop();
tokenization.pop();
tailTokenization.pop();
}

let appliedLength = 0;
for(let i = 0; i < inputTransformKeys.length; i++) {
const tailRelativeIndex = inputTransformKeys[i];
let distribution = inputs.map((i) => ({sample: i.sample.get(tailRelativeIndex), p: i.p}));
const tokenIndex = (tokenization.length - 1) + tailRelativeIndex;
const tokenIndex = baseTailIndex + tailRelativeIndex;

affectedToken = tokenization[tokenIndex];
affectedToken = tailTokenization[tokenIndex];
if(!affectedToken) {
affectedToken = new ContextToken(lexicalModel);
tokenization.push(affectedToken);
tailTokenization.push(affectedToken);
} else if(KMWString.length(affectedToken.exampleInput) == distribution[0].sample.deleteLeft) {
// If the entire token will be replaced, throw out the old one and start anew.
affectedToken = new ContextToken(lexicalModel);
// Replace the token at the affected index with a brand-new token.
tokenization.splice(tokenIndex, 1, affectedToken);
tailTokenization.splice(tokenIndex, 1, affectedToken);
}

affectedToken.isPartial = true;
Expand All @@ -590,7 +623,7 @@ export class ContextTokenization {

const inputSource: PathInputProperties = {
segment: {
transitionId: sourceInput.id,
transitionId,
start: appliedLength
},
bestProbFromSet: bestProbFromSet,
Expand All @@ -601,17 +634,21 @@ export class ContextTokenization {
inputSource.segment.end = appliedLength;
}

affectedToken = new ContextToken(affectedToken);
affectedToken.addInput(inputSource, distribution);

const tokenize = determineModelTokenizer(lexicalModel);
affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false;
// Do not re-use the previous token; the mutation may have unexpected
// results (say, in unit-testing)
tailTokenization[tokenIndex] = affectedToken;

affectedToken = null;
}

return new ContextTokenization(
this.tokens.slice(0, sliceIndex).concat(tokenization),
null /* tokenMapping */,
this.tokens.slice(0, sliceIndex).concat(tailTokenization),
null,
determineTaillessTrueKeystroke(transitionEdge)
);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ import Transform = LexicalModelTypes.Transform;
// Mark affected tokens with the applied-suggestion transition ID
// for easy future reference.
const tagTokens = (state: ContextState, suggestion: Suggestion) => {
const inputs = state.tokenization.transitionEdits.inputs;
const appliedTokenCount = inputs[0].sample.size;
const edits = state.tokenization.transitionEdits;
const appliedTokenCount = edits.editedTokenCount;
const tokens = state.tokenization.tokens;
for(let i = tokens.length - appliedTokenCount; i < tokens.length; i++) {
tokens[i].appliedTransitionId = suggestion.transformId;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,14 +346,13 @@ export function determineSuggestionAlignment(
const context = transition.base.context;
const postContext = transition.final.context;
const inputTransform = transition.inputDistribution[0].sample;
const inputTransformMap = transitionEdits?.inputs[0].sample;
let deleteLeft: number;

// If the context now has more tokens, the token we'll be 'predicting' didn't originally exist.
const wordbreak = determineModelWordbreaker(lexicalModel);

// Is the token under construction newly-constructed / is there no pre-existing root?
if(tokenization.taillessTrueKeystroke && inputTransformMap?.has(1)) {
if(tokenization.taillessTrueKeystroke && transitionEdits?.addedNewTokens) {
return {
// If the new token is due to whitespace or due to a different input type
// that would likely imply a tokenization boundary, infer 'new word' mode.
Expand All @@ -366,7 +365,7 @@ export function determineSuggestionAlignment(
deleteLeft: 0
};
// If the tokenized context length is shorter... sounds like a backspace (or similar).
} else if (transitionEdits?.alignment.removedTokenCount > 0) {
} else if (transitionEdits?.removedOldTokens) {
/* Ooh, we've dropped context here. Almost certainly from a backspace or
* similar effect. Even if we drop multiple tokens... well, we know exactly
* how many chars were actually deleted - `inputTransform.deleteLeft`. Since
Expand Down
Loading
Loading