diff --git a/packages/@n8n/nodes-langchain/nodes/llms/N8nLlmTracing.ts b/packages/@n8n/nodes-langchain/nodes/llms/N8nLlmTracing.ts
index 37495c80be..e006517d16 100644
--- a/packages/@n8n/nodes-langchain/nodes/llms/N8nLlmTracing.ts
+++ b/packages/@n8n/nodes-langchain/nodes/llms/N8nLlmTracing.ts
@@ -13,7 +13,7 @@ import type { IDataObject, ISupplyDataFunctions, JsonObject } from 'n8n-workflow
 import { NodeConnectionTypes, NodeError, NodeOperationError } from 'n8n-workflow';
 
 import { logAiEvent } from '@utils/helpers';
-import { encodingForModel } from '@utils/tokenizer/tiktoken';
+import { estimateTokensFromStringList } from '@utils/tokenizer/token-estimator';
 
 type TokensUsageParser = (llmOutput: LLMResult['llmOutput']) => {
 	completionTokens: number;
@@ -84,13 +84,7 @@ export class N8nLlmTracing extends BaseCallbackHandler {
 
 	async estimateTokensFromStringList(list: string[]) {
 		const embeddingModel = getModelNameForTiktoken(TIKTOKEN_ESTIMATE_MODEL);
-		const encoder = await encodingForModel(embeddingModel);
-
-		const encodedListLength = await Promise.all(
-			list.map(async (text) => encoder.encode(text).length),
-		);
-
-		return encodedListLength.reduce((acc, curr) => acc + curr, 0);
+		return await estimateTokensFromStringList(list, embeddingModel);
 	}
 
 	async handleLLMEnd(output: LLMResult, runId: string) {
diff --git a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts
index 8a38fc2fb0..7f51fe9746 100644
--- a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts
+++ b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/TokenTextSplitter.ts
@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
 import { TextSplitter } from '@langchain/textsplitters';
 import type * as tiktoken from 'js-tiktoken';
 
+import { hasLongSequentialRepeat } from '@utils/helpers';
 import { getEncoding } from '@utils/tokenizer/tiktoken';
+import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
 
 /**
  * Implementation of splitter which looks at tokens.
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitter
 	}
 
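+	/**
+	 * Splits text into token-sized chunks. Highly repetitive input is routed to
+	 * character-based estimation up front (tiktoken degrades badly on such input),
+	 * and estimation is also the fallback if tiktoken itself fails.
+	 *
+	 * Illustrative call (values hypothetical; sizes are in tokens):
+	 *   const splitter = new TokenTextSplitter({ chunkSize: 100, chunkOverlap: 10 });
+	 *   const chunks = await splitter.splitText(documentText);
+	 */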
 	async splitText(text: string): Promise<string[]> {
-		if (!this.tokenizer) {
-			this.tokenizer = await getEncoding(this.encodingName);
-		}
-
-		const splits: string[] = [];
-
-		const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
-
-		let start_idx = 0;
-
-		while (start_idx < input_ids.length) {
-			if (start_idx > 0) {
-				start_idx -= this.chunkOverlap;
+		try {
+			// Validate input
+			if (!text || typeof text !== 'string') {
+				return [];
 			}
-			const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
-			const chunk_ids = input_ids.slice(start_idx, end_idx);
-			splits.push(this.tokenizer.decode(chunk_ids));
-			start_idx = end_idx;
-		}
-		return splits;
+
+			// Check for repetitive content
+			if (hasLongSequentialRepeat(text)) {
+				const splits = estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+				return splits;
+			}
+
+			// Use tiktoken for normal text
+			try {
+				if (!this.tokenizer) {
+					this.tokenizer = await getEncoding(this.encodingName);
+				}
+
+				const splits: string[] = [];
+				const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
+
+				let start_idx = 0;
+				let chunkCount = 0;
+
+				while (start_idx < input_ids.length) {
+					if (start_idx > 0) {
+						start_idx = Math.max(0, start_idx - this.chunkOverlap);
+					}
+					const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
+					const chunk_ids = input_ids.slice(start_idx, end_idx);
+
+					splits.push(this.tokenizer.decode(chunk_ids));
+
+					chunkCount++;
+					start_idx = end_idx;
+				}
+
+				return splits;
+			} catch (tiktokenError) {
+				// Fall back to character-based splitting if tiktoken fails
+				return estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+			}
+		} catch (error) {
+			// Return empty array on complete failure
+			return [];
+		}
 	}
 }
diff --git a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts
index a33b7160a3..af8c3e125d 100644
--- a/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts
+++ b/packages/@n8n/nodes-langchain/nodes/text_splitters/TextSplitterTokenSplitter/tests/TokenTextSplitter.test.ts
@@ -1,7 +1,13 @@
+import { OperationalError } from 'n8n-workflow';
+
+import * as helpers from '../../../../utils/helpers';
 import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
+import * as tokenEstimator from '../../../../utils/tokenizer/token-estimator';
 import { TokenTextSplitter } from '../TokenTextSplitter';
 
 jest.mock('../../../../utils/tokenizer/tiktoken');
+jest.mock('../../../../utils/helpers');
+jest.mock('../../../../utils/tokenizer/token-estimator');
 
 describe('TokenTextSplitter', () => {
 	let mockTokenizer: jest.Mocked<{
@@ -15,6 +21,8 @@ describe('TokenTextSplitter', () => {
 			decode: jest.fn(),
 		};
 		(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
+		// Default mock for hasLongSequentialRepeat - no repetition
+		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
 	});
 
 	afterEach(() => {
@@ -161,5 +169,175 @@ describe('TokenTextSplitter', () => {
 
 		expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
 	});
+
+	describe('repetitive content handling', () => {
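+		// These tests stub hasLongSequentialRepeat directly; the real threshold
+		// behavior (1000 sequential chars by default) is covered in helpers.test.ts.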
+		it('should use character-based estimation for repetitive content', async () => {
+			const splitter = new TokenTextSplitter({
+				chunkSize: 100,
+				chunkOverlap: 10,
+			});
+
+			const repetitiveText = 'a'.repeat(1000);
+			const estimatedChunks = ['chunk1', 'chunk2', 'chunk3'];
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);
+
+			const result = await splitter.splitText(repetitiveText);
+
+			// Should not call tiktoken
+			expect(tiktokenUtils.getEncoding).not.toHaveBeenCalled();
+			expect(mockTokenizer.encode).not.toHaveBeenCalled();
+
+			// Should use estimation
+			expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(repetitiveText);
+			expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+				repetitiveText,
+				100,
+				10,
+				'cl100k_base',
+			);
+
+			expect(result).toEqual(estimatedChunks);
+		});
+
+		it('should use tiktoken for non-repetitive content', async () => {
+			const splitter = new TokenTextSplitter({
+				chunkSize: 3,
+				chunkOverlap: 0,
+			});
+
+			const normalText = 'This is normal text without repetition';
+			const mockTokenIds = [1, 2, 3, 4, 5, 6];
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+			mockTokenizer.encode.mockReturnValue(mockTokenIds);
+			mockTokenizer.decode.mockImplementation(() => 'chunk');
+
+			await splitter.splitText(normalText);
+
+			// Should check for repetition
+			expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(normalText);
+
+			// Should use tiktoken
+			expect(tiktokenUtils.getEncoding).toHaveBeenCalled();
+			expect(mockTokenizer.encode).toHaveBeenCalled();
+
+			// Should not use estimation
+			expect(tokenEstimator.estimateTextSplitsByTokens).not.toHaveBeenCalled();
+		});
+
+		it('should handle repetitive content with different encodings', async () => {
+			const splitter = new TokenTextSplitter({
+				encodingName: 'o200k_base',
+				chunkSize: 50,
+				chunkOverlap: 5,
+			});
+
+			const repetitiveText = '.'.repeat(500);
+			const estimatedChunks = ['estimated chunk 1', 'estimated chunk 2'];
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);
+
+			const result = await splitter.splitText(repetitiveText);
+
+			expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+				repetitiveText,
+				50,
+				5,
+				'o200k_base',
+			);
+			expect(result).toEqual(estimatedChunks);
+		});
+
+		it('should handle edge case with exactly 100 repeating characters', async () => {
+			const splitter = new TokenTextSplitter();
+			const edgeText = 'x'.repeat(100);
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(['single chunk']);
+
+			const result = await splitter.splitText(edgeText);
+
+			expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(edgeText);
+			expect(result).toEqual(['single chunk']);
+		});
+
+		it('should handle mixed content with repetitive sections', async () => {
+			const splitter = new TokenTextSplitter();
+			const mixedText = 'Normal text ' + 'z'.repeat(200) + ' more normal text';
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+				'chunk1',
+				'chunk2',
+			]);
+
+			const result = await splitter.splitText(mixedText);
+
+			expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(mixedText);
+			expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalled();
+			expect(result).toEqual(['chunk1', 'chunk2']);
+		});
+	});
+
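+	// splitText is deliberately non-throwing: invalid input yields an empty array
+	// and tokenizer failures fall back to character-based estimation.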
+	describe('error handling', () => {
+		it('should return empty array for null input', async () => {
+			const splitter = new TokenTextSplitter();
+			const result = await splitter.splitText(null as any);
+			expect(result).toEqual([]);
+		});
+
+		it('should return empty array for undefined input', async () => {
+			const splitter = new TokenTextSplitter();
+			const result = await splitter.splitText(undefined as any);
+			expect(result).toEqual([]);
+		});
+
+		it('should return empty array for non-string input', async () => {
+			const splitter = new TokenTextSplitter();
+			const result = await splitter.splitText(123 as any);
+			expect(result).toEqual([]);
+		});
+
+		it('should fall back to estimation if tiktoken fails', async () => {
+			const splitter = new TokenTextSplitter();
+			const text = 'This will cause tiktoken to fail';
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+			(tiktokenUtils.getEncoding as jest.Mock).mockRejectedValue(new Error('Tiktoken error'));
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+				'fallback chunk',
+			]);
+
+			const result = await splitter.splitText(text);
+
+			expect(result).toEqual(['fallback chunk']);
+			expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
+				text,
+				splitter.chunkSize,
+				splitter.chunkOverlap,
+				splitter.encodingName,
+			);
+		});
+
+		it('should fall back to estimation if encode fails', async () => {
+			const splitter = new TokenTextSplitter();
+			const text = 'This will cause encode to fail';
+
+			(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
+			mockTokenizer.encode.mockImplementation(() => {
+				throw new OperationalError('Encode error');
+			});
+			(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
+				'fallback chunk',
+			]);
+
+			const result = await splitter.splitText(text);
+
+			expect(result).toEqual(['fallback chunk']);
+		});
+	});
 });
 });
diff --git a/packages/@n8n/nodes-langchain/utils/helpers.ts b/packages/@n8n/nodes-langchain/utils/helpers.ts
index 0b61b7946b..e182c8c73d 100644
--- a/packages/@n8n/nodes-langchain/utils/helpers.ts
+++ b/packages/@n8n/nodes-langchain/utils/helpers.ts
@@ -258,3 +258,50 @@ export function unwrapNestedOutput(output: Record<string, unknown>): Record<str
 export function nodeNameToToolName(nodeName: string): string {
 	return nodeName.replace(/[\s.?!=+#@&*()[\]{}:;,<>\/\\'"^%$]/g, '_').replace(/_+/g, '_');
 }
+
+/**
+ * Detects whether a text contains a character that repeats sequentially at least
+ * a threshold number of times. This is used to prevent performance issues with
+ * tiktoken on highly repetitive content.
+ * @param text The text to check
+ * @param threshold The minimum number of sequential repeats to detect (default: 1000)
+ * @returns true if a character repeats sequentially for at least the threshold amount
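+ *
+ * @example
+ * hasLongSequentialRepeat('x'.repeat(1000)); // true with the default threshold
+ * hasLongSequentialRepeat('ababab'.repeat(500)); // false - no single character repeats sequentially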
+ */
+export function hasLongSequentialRepeat(text: string, threshold = 1000): boolean {
+	try {
+		// Validate inputs
+		if (
+			text === null ||
+			typeof text !== 'string' ||
+			text.length === 0 ||
+			threshold <= 0 ||
+			text.length < threshold
+		) {
+			return false;
+		}
+
+		// Use string iterator to avoid creating array copy (memory efficient)
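+		// Note: string iteration yields whole code points, so surrogate-pair
+		// characters such as emoji are counted per visible character, not per UTF-16 unit.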
+		const iterator = text[Symbol.iterator]();
+		let prev = iterator.next();
+
+		if (prev.done) {
+			return false;
+		}
+
+		let count = 1;
+		for (const char of iterator) {
+			if (char === prev.value) {
+				count++;
+				if (count >= threshold) {
+					return true;
+				}
+			} else {
+				count = 1;
+				prev = { value: char, done: false };
+			}
+		}
+
+		return false;
+	} catch (error) {
+		// On any error, return false to allow normal processing
+		return false;
+	}
+}
diff --git a/packages/@n8n/nodes-langchain/utils/tests/helpers.test.ts b/packages/@n8n/nodes-langchain/utils/tests/helpers.test.ts
index a7419c9fc8..4047ac95d6 100644
--- a/packages/@n8n/nodes-langchain/utils/tests/helpers.test.ts
+++ b/packages/@n8n/nodes-langchain/utils/tests/helpers.test.ts
@@ -8,6 +8,7 @@ import { z } from 'zod';
 import {
 	escapeSingleCurlyBrackets,
 	getConnectedTools,
+	hasLongSequentialRepeat,
 	nodeNameToToolName,
 	unwrapNestedOutput,
 } from '../helpers';
@@ -423,3 +424,107 @@ describe('unwrapNestedOutput', () => {
 		expect(unwrapNestedOutput(input)).toEqual(input);
 	});
 });
+
+describe('hasLongSequentialRepeat', () => {
+	it('should return false for text shorter than threshold', () => {
+		const text = 'a'.repeat(99);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
+	});
+
+	it('should return false for normal text without repeats', () => {
+		const text = 'This is a normal text without many sequential repeating characters.';
+		expect(hasLongSequentialRepeat(text)).toBe(false);
+	});
+
+	it('should return true for text with exactly threshold repeats', () => {
+		const text = 'a'.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should return true for text with more than threshold repeats', () => {
+		const text = 'b'.repeat(150);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should detect repeats in the middle of text', () => {
+		const text = 'Normal text ' + 'x'.repeat(100) + ' more normal text';
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should detect repeats at the end of text', () => {
+		const text = 'Normal text at the beginning' + 'z'.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should work with different thresholds', () => {
+		const text = 'a'.repeat(50);
+		expect(hasLongSequentialRepeat(text, 30)).toBe(true);
+		expect(hasLongSequentialRepeat(text, 60)).toBe(false);
+	});
+
+	it('should handle special characters', () => {
+		const text = '.'.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should handle spaces', () => {
+		const text = ' '.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should handle newlines', () => {
+		const text = '\n'.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	it('should not detect non-sequential repeats', () => {
+		const text = 'ababab'.repeat(50); // 300 chars but no sequential repeats
+		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
+	});
+
+	it('should handle mixed content with repeats below threshold', () => {
+		const text = 'aaa' + 'b'.repeat(50) + 'ccc' + 'd'.repeat(40) + 'eee';
+		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
+	});
+
+	it('should handle empty string', () => {
+		expect(hasLongSequentialRepeat('', 100)).toBe(false);
+	});
+
+	it('should work with very large texts', () => {
+		const normalText = 'Lorem ipsum dolor sit amet '.repeat(1000);
+		const textWithRepeat = normalText + 'A'.repeat(100) + normalText;
+		expect(hasLongSequentialRepeat(textWithRepeat, 100)).toBe(true);
+	});
+
+	it('should detect unicode character repeats', () => {
+		const text = '😀'.repeat(100);
+		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
+	});
+
+	describe('error handling', () => {
+		it('should handle null input', () => {
+			expect(hasLongSequentialRepeat(null as any)).toBe(false);
+		});
+
+		it('should handle undefined input', () => {
+			expect(hasLongSequentialRepeat(undefined as any)).toBe(false);
+		});
+
+		it('should handle non-string input', () => {
+			expect(hasLongSequentialRepeat(123 as any)).toBe(false);
+			expect(hasLongSequentialRepeat({} as any)).toBe(false);
+			expect(hasLongSequentialRepeat([] as any)).toBe(false);
+		});
+
+		it('should handle zero or negative threshold', () => {
+			const text = 'a'.repeat(100);
+			expect(hasLongSequentialRepeat(text, 0)).toBe(false);
+			expect(hasLongSequentialRepeat(text, -1)).toBe(false);
+		});
+
+		it('should handle empty string', () => {
+			expect(hasLongSequentialRepeat('', 100)).toBe(false);
+		});
+	});
+});
diff --git a/packages/@n8n/nodes-langchain/utils/tokenizer/tests/token-estimator.test.ts b/packages/@n8n/nodes-langchain/utils/tokenizer/tests/token-estimator.test.ts
new file mode 100644
index 0000000000..cf40041eb9
--- /dev/null
+++ b/packages/@n8n/nodes-langchain/utils/tokenizer/tests/token-estimator.test.ts
@@ -0,0 +1,248 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
+/* eslint-disable @typescript-eslint/no-unsafe-argument */
+import {
+	estimateTokensByCharCount,
+	estimateTextSplitsByTokens,
+	estimateTokensFromStringList,
+} from '../token-estimator';
+
+describe('token-estimator', () => {
+	describe('estimateTokensByCharCount', () => {
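+		// Expected values follow the chars-per-token ratios defined in
+		// MODEL_CHAR_PER_TOKEN_RATIOS (e.g. 4.0 for cl100k_base, 3.8 for gpt-4o).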
+		it('should estimate tokens for text using default model', () => {
+			const text = 'This is a test text with some content.';
+			const result = estimateTokensByCharCount(text);
+			// 38 characters / 4.0 (cl100k_base ratio) = 10 tokens
+			expect(result).toBe(10);
+		});
+
+		it('should estimate tokens for different models', () => {
+			const text = 'Test text'; // 9 characters
+
+			expect(estimateTokensByCharCount(text, 'gpt-4o')).toBe(3); // 9 / 3.8 = 2.37 -> 3
+			expect(estimateTokensByCharCount(text, 'gpt-4')).toBe(3); // 9 / 4.0 = 2.25 -> 3
+			expect(estimateTokensByCharCount(text, 'o200k_base')).toBe(3); // 9 / 3.5 = 2.57 -> 3
+			expect(estimateTokensByCharCount(text, 'p50k_base')).toBe(3); // 9 / 4.2 = 2.14 -> 3
+		});
+
+		it('should use default ratio for unknown models', () => {
+			const text = 'Test text with 24 chars.'; // 24 characters
+			const result = estimateTokensByCharCount(text, 'unknown-model');
+			expect(result).toBe(6); // 24 / 4.0 = 6
+		});
+
+		it('should handle empty text', () => {
+			expect(estimateTokensByCharCount('')).toBe(0);
+			expect(estimateTokensByCharCount('', 'gpt-4')).toBe(0);
+		});
+
+		it('should handle null or undefined text', () => {
+			expect(estimateTokensByCharCount(null as any)).toBe(0);
+			expect(estimateTokensByCharCount(undefined as any)).toBe(0);
+		});
+
+		it('should handle non-string input', () => {
+			expect(estimateTokensByCharCount(123 as any)).toBe(0);
+			expect(estimateTokensByCharCount({} as any)).toBe(0);
+			expect(estimateTokensByCharCount([] as any)).toBe(0);
+		});
+
+		it('should handle very long text', () => {
+			const longText = 'a'.repeat(10000);
+			const result = estimateTokensByCharCount(longText);
+			expect(result).toBe(2500); // 10000 / 4.0 = 2500
+		});
+
+		it('should handle invalid model ratios gracefully', () => {
+			// This would only happen if MODEL_CHAR_PER_TOKEN_RATIOS is corrupted
+			const text = 'Test text'; // 9 characters
+			// Since we can't mock the constant, we test with default fallback
+			const result = estimateTokensByCharCount(text, 'corrupted-model');
+			expect(result).toBe(3); // Falls back to 4.0 ratio
+		});
+
+		it('should round up token estimates', () => {
+			expect(estimateTokensByCharCount('a')).toBe(1); // 1 / 4.0 = 0.25 -> 1
+			expect(estimateTokensByCharCount('ab')).toBe(1); // 2 / 4.0 = 0.5 -> 1
+			expect(estimateTokensByCharCount('abc')).toBe(1); // 3 / 4.0 = 0.75 -> 1
+			expect(estimateTokensByCharCount('abcd')).toBe(1); // 4 / 4.0 = 1
+			expect(estimateTokensByCharCount('abcde')).toBe(2); // 5 / 4.0 = 1.25 -> 2
+		});
+	});
+
+	describe('estimateTextSplitsByTokens', () => {
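+		// Chunk sizes are converted from tokens to characters via the model ratio,
+		// so the exact lengths asserted below assume 4.0 chars/token (cl100k_base)
+		// unless a model argument is given.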
+		it('should split text into chunks based on estimated token size', () => {
+			const text = 'a'.repeat(400); // 400 characters
+			const chunks = estimateTextSplitsByTokens(text, 25, 0); // 25 tokens = 100 chars
+
+			expect(chunks).toHaveLength(4);
+			expect(chunks[0]).toHaveLength(100);
+			expect(chunks[1]).toHaveLength(100);
+			expect(chunks[2]).toHaveLength(100);
+			expect(chunks[3]).toHaveLength(100);
+		});
+
+		it('should handle chunk overlap', () => {
+			const text = 'a'.repeat(200); // 200 characters
+			const chunks = estimateTextSplitsByTokens(text, 25, 5); // 25 tokens = 100 chars, 5 tokens = 20 chars overlap
+
+			expect(chunks).toHaveLength(3);
+			expect(chunks[0]).toBe('a'.repeat(100)); // First chunk: 0-100
+			expect(chunks[1]).toBe('a'.repeat(100)); // Second chunk: 80-180 (20 char overlap)
+			expect(chunks[2]).toBe('a'.repeat(40)); // Third chunk: 160-200
+		});
+
+		it('should handle text shorter than chunk size', () => {
+			const text = 'Short text';
+			const chunks = estimateTextSplitsByTokens(text, 100, 0);
+
+			expect(chunks).toHaveLength(1);
+			expect(chunks[0]).toBe(text);
+		});
+
+		it('should handle empty text', () => {
+			expect(estimateTextSplitsByTokens('', 10, 0)).toEqual([]);
+		});
+
+		it('should handle null or undefined text', () => {
+			expect(estimateTextSplitsByTokens(null as any, 10, 0)).toEqual([]);
+			expect(estimateTextSplitsByTokens(undefined as any, 10, 0)).toEqual([]);
+		});
+
+		it('should handle non-string input', () => {
+			expect(estimateTextSplitsByTokens(123 as any, 10, 0)).toEqual([]);
+			expect(estimateTextSplitsByTokens({} as any, 10, 0)).toEqual([]);
+		});
+
+		it('should handle invalid chunk size', () => {
+			const text = 'Test text';
+			expect(estimateTextSplitsByTokens(text, 0, 0)).toEqual([text]);
+			expect(estimateTextSplitsByTokens(text, -1, 0)).toEqual([text]);
+			expect(estimateTextSplitsByTokens(text, NaN, 0)).toEqual([text]);
+			expect(estimateTextSplitsByTokens(text, Infinity, 0)).toEqual([text]);
+		});
+
+		it('should handle invalid overlap', () => {
+			const text = 'a'.repeat(200);
+			// Negative overlap should be treated as 0
+			const chunks1 = estimateTextSplitsByTokens(text, 25, -10);
+			expect(chunks1).toHaveLength(2);
+
+			// Overlap larger than chunk size should be capped
+			const chunks2 = estimateTextSplitsByTokens(text, 25, 30); // overlap capped to 24
+			expect(chunks2.length).toBeGreaterThan(2);
+		});
+
+		it('should ensure progress even with large overlap', () => {
+			const text = 'a'.repeat(100);
+			// With overlap = chunkSize - 1, we should still make progress
+			const chunks = estimateTextSplitsByTokens(text, 10, 9); // 10 tokens = 40 chars, 9 tokens = 36 chars overlap
+
+			expect(chunks.length).toBeGreaterThan(1);
+			// Verify no infinite loop occurs
+			expect(chunks.length).toBeLessThan(100);
+		});
+
+		it('should work with different models', () => {
+			const text = 'a'.repeat(380); // 380 characters
+			const chunks = estimateTextSplitsByTokens(text, 100, 0, 'gpt-4o'); // 100 tokens * 3.8 = 380 chars
+
+			expect(chunks).toHaveLength(1);
+			expect(chunks[0]).toBe(text);
+		});
+
+		it('should use default model ratio for unknown models', () => {
+			const text = 'a'.repeat(400);
+			const chunks = estimateTextSplitsByTokens(text, 100, 0, 'unknown-model'); // Falls back to 4.0 ratio
+
+			expect(chunks).toHaveLength(1);
+			expect(chunks[0]).toBe(text);
+		});
+
+		it('should handle edge case where text length equals chunk size', () => {
+			const text = 'a'.repeat(100);
+			const chunks = estimateTextSplitsByTokens(text, 25, 0); // 25 tokens = 100 chars
+
+			expect(chunks).toHaveLength(1);
+			expect(chunks[0]).toBe(text);
+		});
+
+		it('should handle unicode text', () => {
+			const text = '你好世界'.repeat(25); // 100 characters (4 chars * 25)
+			const chunks = estimateTextSplitsByTokens(text, 25, 0);
+
+			expect(chunks.length).toBeGreaterThan(0);
+			expect(chunks.join('')).toBe(text);
+		});
+
+		it('should return single chunk on any error in catch block', () => {
+			const text = 'Test text';
+			// Since we can't easily trigger the catch block, we test the expected behavior
+			// The function should return [text] on error
+			const result = estimateTextSplitsByTokens(text, 10, 0);
+			expect(result.length).toBeGreaterThan(0);
+		});
+	});
+
+	describe('estimateTokensFromStringList', () => {
+		// Since this function uses tiktoken which requires external data files,
+		// we'll test it with integration-style tests that don't require mocking
+
+		it('should handle empty list', async () => {
+			const result = await estimateTokensFromStringList([], 'gpt-4');
+			expect(result).toBe(0);
+		});
+
+		it('should handle non-array input', async () => {
+			const result = await estimateTokensFromStringList(null as any, 'gpt-4');
+			expect(result).toBe(0);
+
+			const result2 = await estimateTokensFromStringList('not an array' as any, 'gpt-4');
+			expect(result2).toBe(0);
+		});
+
+		it('should handle null/undefined items in list', async () => {
+			const list = ['Valid text', null, undefined, '', 123 as any];
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toEqual(2);
+		});
+
+		it('should estimate tokens for normal text', async () => {
+			const list = ['Hello world', 'Test text'];
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toBeGreaterThan(0);
+		});
+
+		it('should use character-based estimation for repetitive content', async () => {
+			const list = ['a'.repeat(1500)];
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toBe(375); // 1500 chars / 4.0 = 375 tokens
+		});
+
+		it('should handle mixed content', async () => {
+			const list = ['Normal text content', 'a'.repeat(1500), 'More normal text'];
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toBeGreaterThan(375); // At least the repetitive content tokens
+		});
+
+		it('should work with different models', async () => {
+			const list = ['Test text for different model'];
+			const result1 = await estimateTokensFromStringList(list, 'gpt-4');
+			const result2 = await estimateTokensFromStringList(list, 'gpt-4o');
+			// Both should return positive values
+			expect(result1).toBeGreaterThan(0);
+			expect(result2).toBeGreaterThan(0);
+		});
+
+		it('should handle very long lists', async () => {
+			const list = Array(10000).fill('Sample text');
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toBeGreaterThan(0);
+		});
+
+		it('should handle unicode text', async () => {
+			const list = ['你好世界', '🌍🌎🌏', 'مرحبا بالعالم'];
+			const result = await estimateTokensFromStringList(list, 'gpt-4');
+			expect(result).toBeGreaterThan(0);
+		});
+	});
+});
diff --git a/packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts b/packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
new file mode 100644
index 0000000000..e3d3f8d9f3
--- /dev/null
+++ b/packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
@@ -0,0 +1,176 @@
+/**
+ * Token estimation utilities for handling text without using tiktoken.
+ * These are used as a fallback when tiktoken would be too slow (e.g., with repetitive content).
+ */
+
+import type { TiktokenModel } from 'js-tiktoken';
+
+import { encodingForModel } from './tiktoken';
+import { hasLongSequentialRepeat } from '../helpers';
+
+/**
+ * Model-specific average characters per token ratios.
+ * These are approximate values based on typical English text.
+ */
+const MODEL_CHAR_PER_TOKEN_RATIOS: Record<string, number> = {
+	'gpt-4o': 3.8,
+	'gpt-4': 4.0,
+	'gpt-3.5-turbo': 4.0,
+	cl100k_base: 4.0,
+	o200k_base: 3.5,
+	p50k_base: 4.2,
+	r50k_base: 4.2,
+};
+
+/**
+ * Estimates the number of tokens in a text based on character count.
+ * This is much faster than tiktoken but less accurate.
+ *
+ * @param text The text to estimate tokens for
+ * @param model The model or encoding name (optional)
+ * @returns Estimated number of tokens
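+ *
+ * @example
+ * // 'Test text' is 9 characters: 9 / 4.0 = 2.25, rounded up
+ * estimateTokensByCharCount('Test text'); // 3
+ * estimateTokensByCharCount('Test text', 'gpt-4o'); // 3 (9 / 3.8)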
+ */
+export function estimateTokensByCharCount(text: string, model: string = 'cl100k_base'): number {
+	try {
+		// Validate input
+		if (!text || typeof text !== 'string' || text.length === 0) {
+			return 0;
+		}
+
+		// Get the ratio for the specific model, or use default
+		const charsPerToken = MODEL_CHAR_PER_TOKEN_RATIOS[model] || 4.0;
+
+		// Validate ratio
+		if (!Number.isFinite(charsPerToken) || charsPerToken <= 0) {
+			// Fallback to default ratio
+			const estimatedTokens = Math.ceil(text.length / 4.0);
+			return estimatedTokens;
+		}
+
+		// Calculate estimated tokens
+		const estimatedTokens = Math.ceil(text.length / charsPerToken);
+
+		return estimatedTokens;
+	} catch (error) {
+		// Return conservative estimate on error
+		return Math.ceil((text?.length || 0) / 4.0);
+	}
+}
+
+/**
+ * Splits text into chunks sized by estimated token counts.
+ * Chunk boundaries are derived from character positions (token count multiplied by
+ * the model's average characters per token) rather than true token positions.
+ *
+ * @param text The text to split
+ * @param chunkSize Target chunk size in tokens
+ * @param chunkOverlap Overlap between chunks in tokens
+ * @param model The model or encoding name (optional)
+ * @returns Array of text chunks
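+ *
+ * @example
+ * // 25 tokens ≈ 100 chars at the default 4.0 ratio, so 400 chars yield 4 chunks
+ * estimateTextSplitsByTokens('a'.repeat(400), 25, 0); // four 100-character chunks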
+ */
+export function estimateTextSplitsByTokens(
+	text: string,
+	chunkSize: number,
+	chunkOverlap: number,
+	model: string = 'cl100k_base',
+): string[] {
+	try {
+		// Validate inputs
+		if (!text || typeof text !== 'string' || text.length === 0) {
+			return [];
+		}
+
+		// Validate numeric parameters
+		if (!Number.isFinite(chunkSize) || chunkSize <= 0) {
+			// Return whole text as single chunk if invalid chunk size
+			return [text];
+		}
+
+		// Ensure overlap is valid and less than chunk size
+		const validOverlap =
+			Number.isFinite(chunkOverlap) && chunkOverlap >= 0
+				? Math.min(chunkOverlap, chunkSize - 1)
+				: 0;
+
+		const charsPerToken = MODEL_CHAR_PER_TOKEN_RATIOS[model] || 4.0;
+		const chunkSizeInChars = Math.floor(chunkSize * charsPerToken);
+		const overlapInChars = Math.floor(validOverlap * charsPerToken);
+
+		const chunks: string[] = [];
+		let start = 0;
+
+		while (start < text.length) {
+			const end = Math.min(start + chunkSizeInChars, text.length);
+			chunks.push(text.slice(start, end));
+
+			if (end >= text.length) {
+				break;
+			}
+
+			// Move to next chunk with overlap
+			start = Math.max(end - overlapInChars, start + 1);
+		}
+
+		return chunks;
+	} catch (error) {
+		// Return text as single chunk on error
+		return text ? [text] : [];
+	}
+}
+
+/**
+ * Estimates the total number of tokens for a list of strings.
+ * Uses tiktoken for normal text but falls back to character-based estimation
+ * for repetitive content or on errors.
+ *
+ * @param list Array of strings to estimate tokens for
+ * @param model The model or encoding name to use for estimation
+ * @returns Total estimated number of tokens across all strings
+ */
+export async function estimateTokensFromStringList(
+	list: string[],
+	model: TiktokenModel,
+): Promise<number> {
+	try {
+		// Validate input
+		if (!Array.isArray(list)) {
+			return 0;
+		}
+
+		const encoder = await encodingForModel(model);
+
+		const encodedListLength = await Promise.all(
+			list.map(async (text) => {
+				try {
+					// Handle null/undefined text
+					if (!text || typeof text !== 'string') {
+						return 0;
+					}
+
+					// Check for repetitive content
+					if (hasLongSequentialRepeat(text)) {
+						const estimatedTokens = estimateTokensByCharCount(text, model);
+						return estimatedTokens;
+					}
+
+					// Use tiktoken for normal text
+					try {
+						const tokens = encoder.encode(text);
+						return tokens.length;
+					} catch (encodingError) {
+						// Fall back to estimation if tiktoken fails
+						return estimateTokensByCharCount(text, model);
+					}
+				} catch (itemError) {
+					// Return 0 for individual item errors
+					return 0;
+				}
+			}),
+		);
+
+		const totalTokens = encodedListLength.reduce((acc, curr) => acc + curr, 0);
+
+		return totalTokens;
+	} catch (error) {
+		// Return 0 on complete failure
+		return 0;
+	}
+}