feat(Token Splitter Node): Replace remote tiktoken encoding with local implementation (#16548)

This commit is contained in:
oleg
2025-06-20 16:08:16 +02:00
committed by GitHub
parent 79650ea55a
commit 2d638023be
11 changed files with 427 additions and 32 deletions

View File

@@ -0,0 +1,57 @@
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
import type { TokenTextSplitterParams } from '@langchain/textsplitters';
import { TextSplitter } from '@langchain/textsplitters';
import type * as tiktoken from 'js-tiktoken';
import { getEncoding } from '@utils/tokenizer/tiktoken';
/**
 * Text splitter that chunks input by token count.
 * This overrides the LangChain TokenTextSplitter so that tokenization
 * goes through the n8n tokenizer utility, which uses local JSON encodings.
 */
export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
	static lc_name() {
		return 'TokenTextSplitter';
	}

	/** Name of the tiktoken encoding passed to `getEncoding`; defaults to `cl100k_base`. */
	encodingName: tiktoken.TiktokenEncoding;

	/** Forwarded as the second argument of `encode`; defaults to an empty list. */
	allowedSpecial: 'all' | string[];

	/** Forwarded as the third argument of `encode`; defaults to `'all'`. */
	disallowedSpecial: 'all' | string[];

	/** Tokenizer instance, resolved on the first `splitText` call and reused afterwards. */
	private tokenizer: tiktoken.Tiktoken | undefined;

	/**
	 * @param fields Optional splitter settings. They are handed to the base
	 * `TextSplitter` constructor as-is; the token-specific ones fall back to
	 * the defaults documented on the corresponding properties.
	 */
	constructor(fields?: Partial<TokenTextSplitterParams>) {
		super(fields);

		const options: Partial<TokenTextSplitterParams> = fields ?? {};
		this.encodingName = options.encodingName ?? 'cl100k_base';
		this.allowedSpecial = options.allowedSpecial ?? [];
		this.disallowedSpecial = options.disallowedSpecial ?? 'all';
	}

	/**
	 * Encodes `text` into tokens and decodes it back in windows of at most
	 * `chunkSize` tokens. Every window after the first begins `chunkOverlap`
	 * tokens before the end of the previous one.
	 *
	 * @param text Text to split.
	 * @returns The decoded chunks in order; empty when `text` encodes to no tokens.
	 */
	async splitText(text: string): Promise<string[]> {
		// Resolve the tokenizer once per instance and keep it for later calls.
		let tokenizer = this.tokenizer;
		if (!tokenizer) {
			tokenizer = await getEncoding(this.encodingName);
			this.tokenizer = tokenizer;
		}

		const tokenIds = tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
		const tokenCount = tokenIds.length;
		const windowSize = this.chunkSize;
		const overlap = this.chunkOverlap;

		const chunks: string[] = [];
		// `cursor` marks where the previous window ended (0 before the first window).
		for (let cursor = 0; cursor < tokenCount; ) {
			// Step back by the overlap for every window except the very first.
			const from = cursor > 0 ? cursor - overlap : cursor;
			const to = Math.min(from + windowSize, tokenCount);

			chunks.push(tokenizer.decode(tokenIds.slice(from, to)));
			cursor = to;
		}

		return chunks;
	}
}