fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)

2025-12-17 10:02:05 +00:00 · 2025-06-27 16:08:14 +02:00
parent edf0fec444
commit c5ec056eb5
7 changed files with 812 additions and 27 deletions
--- a/packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
+++ b/packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
@@ -0,0 +1,176 @@
+/**
+ * Token estimation utilities for handling text without using tiktoken.
+ * This is used as a fallback when tiktoken would be too slow (e.g., with repetitive content).
+ */
+
+import type { TiktokenModel } from 'js-tiktoken';
+
+import { encodingForModel } from './tiktoken';
+import { hasLongSequentialRepeat } from '../helpers';
+
+/**
+ * Model-specific average characters per token ratios.
+ * These are approximate values based on typical English text.
+ */
+const MODEL_CHAR_PER_TOKEN_RATIOS: Record<string, number> = {
+	'gpt-4o': 3.8,
+	'gpt-4': 4.0,
+	'gpt-3.5-turbo': 4.0,
+	cl100k_base: 4.0,
+	o200k_base: 3.5,
+	p50k_base: 4.2,
+	r50k_base: 4.2,
+};
+
+/**
+ * Estimates the number of tokens in a text based on character count.
+ * This is much faster than tiktoken but less accurate.
+ *
+ * @param text The text to estimate tokens for
+ * @param model The model or encoding name (optional)
+ * @returns Estimated number of tokens
+ */
+export function estimateTokensByCharCount(text: string, model: string = 'cl100k_base'): number {
+	try {
+		// Validate input
+		if (!text || typeof text !== 'string' || text.length === 0) {
+			return 0;
+		}
+
+		// Get the ratio for the specific model, or use default
+		const charsPerToken = MODEL_CHAR_PER_TOKEN_RATIOS[model] || 4.0;
+
+		// Validate ratio
+		if (!Number.isFinite(charsPerToken) || charsPerToken <= 0) {
+			// Fallback to default ratio
+			const estimatedTokens = Math.ceil(text.length / 4.0);
+			return estimatedTokens;
+		}
+
+		// Calculate estimated tokens
+		const estimatedTokens = Math.ceil(text.length / charsPerToken);
+
+		return estimatedTokens;
+	} catch (error) {
+		// Return conservative estimate on error
+		return Math.ceil((text?.length || 0) / 4.0);
+	}
+}
+
+/**
+ * Estimates tokens for text splitting purposes.
+ * Returns chunk boundaries based on character positions rather than token positions.
+ *
+ * @param text The text to split
+ * @param chunkSize Target chunk size in tokens
+ * @param chunkOverlap Overlap between chunks in tokens
+ * @param model The model or encoding name (optional)
+ * @returns Array of text chunks
+ */
+export function estimateTextSplitsByTokens(
+	text: string,
+	chunkSize: number,
+	chunkOverlap: number,
+	model: string = 'cl100k_base',
+): string[] {
+	try {
+		// Validate inputs
+		if (!text || typeof text !== 'string' || text.length === 0) {
+			return [];
+		}
+
+		// Validate numeric parameters
+		if (!Number.isFinite(chunkSize) || chunkSize <= 0) {
+			// Return whole text as single chunk if invalid chunk size
+			return [text];
+		}
+
+		// Ensure overlap is valid and less than chunk size
+		const validOverlap =
+			Number.isFinite(chunkOverlap) && chunkOverlap >= 0
+				? Math.min(chunkOverlap, chunkSize - 1)
+				: 0;
+
+		const charsPerToken = MODEL_CHAR_PER_TOKEN_RATIOS[model] || 4.0;
+		const chunkSizeInChars = Math.floor(chunkSize * charsPerToken);
+		const overlapInChars = Math.floor(validOverlap * charsPerToken);
+
+		const chunks: string[] = [];
+		let start = 0;
+
+		while (start < text.length) {
+			const end = Math.min(start + chunkSizeInChars, text.length);
+			chunks.push(text.slice(start, end));
+
+			if (end >= text.length) {
+				break;
+			}
+
+			// Move to next chunk with overlap
+			start = Math.max(end - overlapInChars, start + 1);
+		}
+
+		return chunks;
+	} catch (error) {
+		// Return text as single chunk on error
+		return text ? [text] : [];
+	}
+}
+
+/**
+ * Estimates the total number of tokens for a list of strings.
+ * Uses tiktoken for normal text but falls back to character-based estimation
+ * for repetitive content or on errors.
+ *
+ * @param list Array of strings to estimate tokens for
+ * @param model The model or encoding name to use for estimation
+ * @returns Total estimated number of tokens across all strings
+ */
+export async function estimateTokensFromStringList(
+	list: string[],
+	model: TiktokenModel,
+): Promise<number> {
+	try {
+		// Validate input
+		if (!Array.isArray(list)) {
+			return 0;
+		}
+
+		const encoder = await encodingForModel(model);
+		const encodedListLength = await Promise.all(
+			list.map(async (text) => {
+				try {
+					// Handle null/undefined text
+					if (!text || typeof text !== 'string') {
+						return 0;
+					}
+
+					// Check for repetitive content
+					if (hasLongSequentialRepeat(text)) {
+						const estimatedTokens = estimateTokensByCharCount(text, model);
+						return estimatedTokens;
+					}
+
+					// Use tiktoken for normal text
+					try {
+						const tokens = encoder.encode(text);
+						return tokens.length;
+					} catch (encodingError) {
+						// Fall back to estimation if tiktoken fails
+						return estimateTokensByCharCount(text, model);
+					}
+				} catch (itemError) {
+					// Return 0 for individual item errors
+					return 0;
+				}
+			}),
+		);
+
+		const totalTokens = encodedListLength.reduce((acc, curr) => acc + curr, 0);
+
+		return totalTokens;
+	} catch (error) {
+		// Return 0 on complete failure
+		return 0;
+	}
+}