fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)

Author: oleg
Date: 2025-06-27 16:08:14 +02:00
Committed by: GitHub
Parent: edf0fec444
Commit: c5ec056eb5
7 changed files with 812 additions and 27 deletions
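Background on the fix: js-tiktoken's BPE encoder repeatedly merges adjacent token pairs within each regex-split piece, and highly repetitive input (for example, tens of thousands of the same character with no whitespace) yields one enormous piece whose merge loop grows roughly quadratically, so a single synchronous encode() call can block Node's event loop for a long time. The change below detects such inputs and routes them around tiktoken entirely. As a minimal sketch of the kind of run-length check this could rely on (the real hasLongSequentialRepeat lives in @utils/helpers, one of the other changed files not shown in this excerpt; the function name suffix and the 1000-character threshold are assumptions for illustration):

// Sketch only: the real hasLongSequentialRepeat in @utils/helpers is not shown
// in this excerpt; the name suffix and 1000-character threshold are assumed.
export function hasLongSequentialRepeatSketch(text: string, threshold = 1000): boolean {
	if (text.length < threshold) return false;

	let runLength = 1;
	for (let i = 1; i < text.length; i++) {
		// Track how many identical characters appear back to back.
		runLength = text[i] === text[i - 1] ? runLength + 1 : 1;
		if (runLength >= threshold) return true;
	}
	return false;
}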


@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
 import { TextSplitter } from '@langchain/textsplitters';
 import type * as tiktoken from 'js-tiktoken';
+import { hasLongSequentialRepeat } from '@utils/helpers';
 import { getEncoding } from '@utils/tokenizer/tiktoken';
+import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
 
 /**
  * Implementation of splitter which looks at tokens.
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
 	}
 
 	async splitText(text: string): Promise<string[]> {
-		if (!this.tokenizer) {
-			this.tokenizer = await getEncoding(this.encodingName);
-		}
-		const splits: string[] = [];
-		const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
-		let start_idx = 0;
-		while (start_idx < input_ids.length) {
-			if (start_idx > 0) {
-				start_idx -= this.chunkOverlap;
-			}
-			const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
-			const chunk_ids = input_ids.slice(start_idx, end_idx);
-			splits.push(this.tokenizer.decode(chunk_ids));
-			start_idx = end_idx;
-		}
-		return splits;
+		try {
+			// Validate input
+			if (!text || typeof text !== 'string') {
+				return [];
+			}
+
+			// Check for repetitive content
+			if (hasLongSequentialRepeat(text)) {
+				const splits = estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+				return splits;
+			}
+
+			// Use tiktoken for normal text
+			try {
+				if (!this.tokenizer) {
+					this.tokenizer = await getEncoding(this.encodingName);
+				}
+				const splits: string[] = [];
+				const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
+				let start_idx = 0;
+				let chunkCount = 0;
+				while (start_idx < input_ids.length) {
+					if (start_idx > 0) {
+						start_idx = Math.max(0, start_idx - this.chunkOverlap);
+					}
+					const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
+					const chunk_ids = input_ids.slice(start_idx, end_idx);
+					splits.push(this.tokenizer.decode(chunk_ids));
+					chunkCount++;
+					start_idx = end_idx;
+				}
+				return splits;
+			} catch (tiktokenError) {
+				// Fall back to character-based splitting if tiktoken fails
+				return estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+			}
+		} catch (error) {
+			// Return empty array on complete failure
+			return [];
+		}
 	}
 }
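Both the repetitive-content path and the tiktoken-failure path call estimateTextSplitsByTokens from @utils/tokenizer/token-estimator, another of the seven changed files that is not shown above. A rough sketch of what a character-based estimator of that shape could look like follows; the ~4 characters-per-token ratio and the internals are assumptions for illustration, not the module's actual logic:

// Sketch only: approximates token-based chunking by assuming ~4 characters per
// token, so no tokenizer ever runs on the (possibly pathological) input.
function estimateTextSplitsByTokensSketch(
	text: string,
	chunkSizeTokens: number,
	chunkOverlapTokens: number,
	_encodingName: string, // kept for signature parity with the diff; unused here
): string[] {
	const CHARS_PER_TOKEN = 4;
	const chunkChars = chunkSizeTokens * CHARS_PER_TOKEN;
	const overlapChars = chunkOverlapTokens * CHARS_PER_TOKEN;
	const step = Math.max(1, chunkChars - overlapChars);

	const splits: string[] = [];
	for (let start = 0; start < text.length; start += step) {
		splits.push(text.slice(start, start + chunkChars));
	}
	return splits;
}

With a fallback of this kind, splitText returns in time proportional to the input length for repetitive or failing inputs instead of handing them to tiktoken's merge loop.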