fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)

2025-12-17 18:12:04 +00:00 · 2025-06-27 16:08:14 +02:00
parent edf0fec444
commit c5ec056eb5
7 changed files with 812 additions and 27 deletions
--- a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts
+++ b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts
@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
 import { TextSplitter } from '@langchain/textsplitters';
 import type * as tiktoken from 'js-tiktoken';

+import { hasLongSequentialRepeat } from '@utils/helpers';
 import { getEncoding } from '@utils/tokenizer/tiktoken';
+import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';

 /**
 * Implementation of splitter which looks at tokens.
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitter
 	}

 	async splitText(text: string): Promise<string[]> {
-		if (!this.tokenizer) {
-			this.tokenizer = await getEncoding(this.encodingName);
-		}
-
-		const splits: string[] = [];
-
-		const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
-
-		let start_idx = 0;
-
-		while (start_idx < input_ids.length) {
-			if (start_idx > 0) {
-				start_idx -= this.chunkOverlap;
+		try {
+			// Validate input
+			if (!text || typeof text !== 'string') {
+				return [];
 			}
-			const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
-			const chunk_ids = input_ids.slice(start_idx, end_idx);
-			splits.push(this.tokenizer.decode(chunk_ids));
-			start_idx = end_idx;
-		}

-		return splits;
+			// Check for repetitive content
+			if (hasLongSequentialRepeat(text)) {
+				const splits = estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+				return splits;
+			}
+
+			// Use tiktoken for normal text
+			try {
+				if (!this.tokenizer) {
+					this.tokenizer = await getEncoding(this.encodingName);
+				}
+
+				const splits: string[] = [];
+				const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
+
+				let start_idx = 0;
+				let chunkCount = 0;
+
+				while (start_idx < input_ids.length) {
+					if (start_idx > 0) {
+						start_idx = Math.max(0, start_idx - this.chunkOverlap);
+					}
+					const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
+					const chunk_ids = input_ids.slice(start_idx, end_idx);
+
+					splits.push(this.tokenizer.decode(chunk_ids));
+
+					chunkCount++;
+					start_idx = end_idx;
+				}
+
+				return splits;
+			} catch (tiktokenError) {
+				// Fall back to character-based splitting if tiktoken fails
+				return estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+			}
+		} catch (error) {
+			// Return empty array on complete failure
+			return [];
+		}
 	}
 }
--- a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts
+++ b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts
@@ -1,7 +1,13 @@
+import { OperationalError } from 'n8n-workflow';
+
+import * as helpers from '../../../../utils/helpers';
 import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
+import * as tokenEstimator from '../../../../utils/tokenizer/token-estimator';
 import { TokenTextSplitter } from '../TokenTextSplitter';

 jest.mock('../../../../utils/tokenizer/tiktoken');
+jest.mock('../../../../utils/helpers');
+jest.mock('../../../../utils/tokenizer/token-estimator');

 describe('TokenTextSplitter', () => {
 	let mockTokenizer: jest.Mocked<{
@@ -15,6 +21,8 @@ describe('TokenTextSplitter', () => {
 			decode: jest.fn(),
 		};
 		(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
+		// Default mock for hasLongSequentialRepeat - no repetition
+		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
 	});

 	afterEach(() => {
@@ -161,5 +169,175 @@ describe('TokenTextSplitter', () => {

 			expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
 		});
+
+		describe('repetitive content handling', () => {
+			it('should use character-based estimation for repetitive content', async () => {
+				const splitter = new TokenTextSplitter({
+					chunkSize: 100,
+					chunkOverlap: 10,
+				});
+
+				const repetitiveText = 'a'.repeat(1000);
+				const estimatedChunks = ['chunk1', 'chunk2', 'chunk3'];
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);
+
+				const result = await splitter.splitText(repetitiveText);
+
+				// Should not call tiktoken
+				expect(tiktokenUtils.getEncoding).not.toHaveBeenCalled();
+				expect(mockTokenizer.encode).not.toHaveBeenCalled();
+
+				// Should use estimation
+				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(repetitiveText);
+				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+					repetitiveText,
+					100,
+					10,
+					'cl100k_base',
+				);
+
+				expect(result).toEqual(estimatedChunks);
+			});
+
+			it('should use tiktoken for non-repetitive content', async () => {
+				const splitter = new TokenTextSplitter({
+					chunkSize: 3,
+					chunkOverlap: 0,
+				});
+
+				const normalText = 'This is normal text without repetition';
+				const mockTokenIds = [1, 2, 3, 4, 5, 6];
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+				mockTokenizer.encode.mockReturnValue(mockTokenIds);
+				mockTokenizer.decode.mockImplementation(() => 'chunk');
+
+				await splitter.splitText(normalText);
+
+				// Should check for repetition
+				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(normalText);
+
+				// Should use tiktoken
+				expect(tiktokenUtils.getEncoding).toHaveBeenCalled();
+				expect(mockTokenizer.encode).toHaveBeenCalled();
+
+				// Should not use estimation
+				expect(tokenEstimator.estimateTextSplitsByTokens).not.toHaveBeenCalled();
+			});
+
+			it('should handle repetitive content with different encodings', async () => {
+				const splitter = new TokenTextSplitter({
+					encodingName: 'o200k_base',
+					chunkSize: 50,
+					chunkOverlap: 5,
+				});
+
+				const repetitiveText = '.'.repeat(500);
+				const estimatedChunks = ['estimated chunk 1', 'estimated chunk 2'];
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);
+
+				const result = await splitter.splitText(repetitiveText);
+
+				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+					repetitiveText,
+					50,
+					5,
+					'o200k_base',
+				);
+				expect(result).toEqual(estimatedChunks);
+			});
+
+			it('should handle edge case with exactly 100 repeating characters', async () => {
+				const splitter = new TokenTextSplitter();
+				const edgeText = 'x'.repeat(100);
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(['single chunk']);
+
+				const result = await splitter.splitText(edgeText);
+
+				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(edgeText);
+				expect(result).toEqual(['single chunk']);
+			});
+
+			it('should handle mixed content with repetitive sections', async () => {
+				const splitter = new TokenTextSplitter();
+				const mixedText = 'Normal text ' + 'z'.repeat(200) + ' more normal text';
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+					'chunk1',
+					'chunk2',
+				]);
+
+				const result = await splitter.splitText(mixedText);
+
+				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(mixedText);
+				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalled();
+				expect(result).toEqual(['chunk1', 'chunk2']);
+			});
+		});
+
+		describe('error handling', () => {
+			it('should return empty array for null input', async () => {
+				const splitter = new TokenTextSplitter();
+				const result = await splitter.splitText(null as any);
+				expect(result).toEqual([]);
+			});
+
+			it('should return empty array for undefined input', async () => {
+				const splitter = new TokenTextSplitter();
+				const result = await splitter.splitText(undefined as any);
+				expect(result).toEqual([]);
+			});
+
+			it('should return empty array for non-string input', async () => {
+				const splitter = new TokenTextSplitter();
+				const result = await splitter.splitText(123 as any);
+				expect(result).toEqual([]);
+			});
+
+			it('should fall back to estimation if tiktoken fails', async () => {
+				const splitter = new TokenTextSplitter();
+				const text = 'This will cause tiktoken to fail';
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+				(tiktokenUtils.getEncoding as jest.Mock).mockRejectedValue(new Error('Tiktoken error'));
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+					'fallback chunk',
+				]);
+
+				const result = await splitter.splitText(text);
+
+				expect(result).toEqual(['fallback chunk']);
+				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+					text,
+					splitter.chunkSize,
+					splitter.chunkOverlap,
+					splitter.encodingName,
+				);
+			});
+
+			it('should fall back to estimation if encode fails', async () => {
+				const splitter = new TokenTextSplitter();
+				const text = 'This will cause encode to fail';
+
+				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+				mockTokenizer.encode.mockImplementation(() => {
+					throw new OperationalError('Encode error');
+				});
+				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+					'fallback chunk',
+				]);
+
+				const result = await splitter.splitText(text);
+
+				expect(result).toEqual(['fallback chunk']);
+			});
+		});
 	});
 });