/* eslint-disable n8n-nodes-base/node-dirname-against-convention */ import type { TokenTextSplitterParams } from '@langchain/textsplitters'; import { TextSplitter } from '@langchain/textsplitters'; import type * as tiktoken from 'js-tiktoken'; import { getEncoding } from '@utils/tokenizer/tiktoken'; /** * Implementation of splitter which looks at tokens. * This is override of the LangChain TokenTextSplitter * to use the n8n tokenizer utility which uses local JSON encodings */ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams { static lc_name() { return 'TokenTextSplitter'; } encodingName: tiktoken.TiktokenEncoding; allowedSpecial: 'all' | string[]; disallowedSpecial: 'all' | string[]; private tokenizer: tiktoken.Tiktoken | undefined; constructor(fields?: Partial) { super(fields); this.encodingName = fields?.encodingName ?? 'cl100k_base'; this.allowedSpecial = fields?.allowedSpecial ?? []; this.disallowedSpecial = fields?.disallowedSpecial ?? 'all'; } async splitText(text: string): Promise { if (!this.tokenizer) { this.tokenizer = await getEncoding(this.encodingName); } const splits: string[] = []; const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial); let start_idx = 0; while (start_idx < input_ids.length) { if (start_idx > 0) { start_idx -= this.chunkOverlap; } const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length); const chunk_ids = input_ids.slice(start_idx, end_idx); splits.push(this.tokenizer.decode(chunk_ids)); start_idx = end_idx; } return splits; } }