fix(Token Splitter Node): Cache tokenizer JSONs in memory (#17201)

oleg
2025-07-10 19:08:29 +02:00
committed by GitHub
parent 36b410abdb
commit 2402926573
5 changed files with 115 additions and 54 deletions


@@ -1,10 +1,9 @@
 import type { TokenTextSplitterParams } from '@langchain/textsplitters';
 import { TextSplitter } from '@langchain/textsplitters';
-import type * as tiktoken from 'js-tiktoken';
 import { hasLongSequentialRepeat } from '@utils/helpers';
 import { getEncoding } from '@utils/tokenizer/tiktoken';
 import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
+import type * as tiktoken from 'js-tiktoken';
 /**
  * Implementation of splitter which looks at tokens.
@@ -52,9 +51,7 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams
 		// Use tiktoken for normal text
 		try {
-			if (!this.tokenizer) {
-				this.tokenizer = await getEncoding(this.encodingName);
-			}
+			this.tokenizer ??= getEncoding(this.encodingName);
 			const splits: string[] = [];
 			const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
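For context on what the hunk above relies on: the splitter now calls a synchronous getEncoding and memoizes the result per instance with ??=, which is why the await could be dropped. Below is a minimal sketch of what an in-memory cache inside '@utils/tokenizer/tiktoken' could look like. The Map-based cache, the './<name>.json' file layout, and the use of 'js-tiktoken/lite' are assumptions for illustration, not the actual code from this commit.

// Sketch of '@utils/tokenizer/tiktoken' with a module-level cache (assumed shape).
import { readFileSync } from 'node:fs';
import { join } from 'node:path';
import type { TiktokenBPE } from 'js-tiktoken/lite';
import { Tiktoken } from 'js-tiktoken/lite';

// One cache per process: each encoding's ranks JSON is parsed at most once.
const encodings = new Map<string, Tiktoken>();

export function getEncoding(encodingName: string): Tiktoken {
	const cached = encodings.get(encodingName);
	if (cached) return cached;

	// Parsing the multi-megabyte BPE ranks JSON is the expensive step the
	// cache skips on repeated executions; the file location is hypothetical.
	const ranks = JSON.parse(
		readFileSync(join(__dirname, `${encodingName}.json`), 'utf8'),
	) as TiktokenBPE;

	const encoding = new Tiktoken(ranks);
	encodings.set(encodingName, encoding);
	return encoding;
}

With a cached, synchronous getEncoding, the this.tokenizer ??= getEncoding(this.encodingName) line in the diff stays cheap: the first call per encoding name pays the JSON parse, and every later call, including across splitter instances, is a Map lookup.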