mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-17 10:02:05 +00:00
fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)
This commit is contained in:
@@ -13,7 +13,7 @@ import type { IDataObject, ISupplyDataFunctions, JsonObject } from 'n8n-workflow
|
|||||||
import { NodeConnectionTypes, NodeError, NodeOperationError } from 'n8n-workflow';
|
import { NodeConnectionTypes, NodeError, NodeOperationError } from 'n8n-workflow';
|
||||||
|
|
||||||
import { logAiEvent } from '@utils/helpers';
|
import { logAiEvent } from '@utils/helpers';
|
||||||
import { encodingForModel } from '@utils/tokenizer/tiktoken';
|
import { estimateTokensFromStringList } from '@utils/tokenizer/token-estimator';
|
||||||
|
|
||||||
type TokensUsageParser = (llmOutput: LLMResult['llmOutput']) => {
|
type TokensUsageParser = (llmOutput: LLMResult['llmOutput']) => {
|
||||||
completionTokens: number;
|
completionTokens: number;
|
||||||
@@ -84,13 +84,7 @@ export class N8nLlmTracing extends BaseCallbackHandler {
|
|||||||
|
|
||||||
/**
 * Estimates the total number of tokens contained in a list of strings.
 *
 * Resolves the tiktoken model name for `TIKTOKEN_ESTIMATE_MODEL` and delegates
 * the counting to the shared `estimateTokensFromStringList` helper imported
 * from `@utils/tokenizer/token-estimator`, instead of encoding every string
 * with tiktoken directly in this method.
 *
 * @param list Strings whose tokens should be counted.
 * @returns Whatever the shared estimator resolves to for `list`.
 */
async estimateTokensFromStringList(list: string[]) {
	const embeddingModel = getModelNameForTiktoken(TIKTOKEN_ESTIMATE_MODEL);

	// Inside a method body the bare identifier refers to the imported helper,
	// not to this method, so this is a delegation and not a recursive call.
	return await estimateTokensFromStringList(list, embeddingModel);
}
|
||||||
|
|
||||||
async handleLLMEnd(output: LLMResult, runId: string) {
|
async handleLLMEnd(output: LLMResult, runId: string) {
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
|
|||||||
import { TextSplitter } from '@langchain/textsplitters';
|
import { TextSplitter } from '@langchain/textsplitters';
|
||||||
import type * as tiktoken from 'js-tiktoken';
|
import type * as tiktoken from 'js-tiktoken';
|
||||||
|
|
||||||
|
import { hasLongSequentialRepeat } from '@utils/helpers';
|
||||||
import { getEncoding } from '@utils/tokenizer/tiktoken';
|
import { getEncoding } from '@utils/tokenizer/tiktoken';
|
||||||
|
import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of splitter which looks at tokens.
|
* Implementation of splitter which looks at tokens.
|
||||||
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitter
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Splits `text` into chunks of at most `chunkSize` tokens, where consecutive
 * chunks share `chunkOverlap` tokens.
 *
 * Strategy:
 * 1. Empty or non-string input yields `[]`.
 * 2. Text for which `hasLongSequentialRepeat` returns true is split with
 *    `estimateTextSplitsByTokens` and never reaches tiktoken.
 * 3. All other text is encoded with the lazily created tokenizer and sliced
 *    by token index.
 * 4. If creating or using the tokenizer throws, `estimateTextSplitsByTokens`
 *    is used as a fallback.
 *
 * This method does not throw: any failure outside of step 4 results in `[]`.
 *
 * @param text The text to split.
 * @returns The chunks in document order.
 */
async splitText(text: string): Promise<string[]> {
	try {
		// Validate input
		if (!text || typeof text !== 'string') {
			return [];
		}

		// Check for repetitive content
		if (hasLongSequentialRepeat(text)) {
			return estimateTextSplitsByTokens(
				text,
				this.chunkSize,
				this.chunkOverlap,
				this.encodingName,
			);
		}

		// Use tiktoken for normal text
		try {
			if (!this.tokenizer) {
				this.tokenizer = await getEncoding(this.encodingName);
			}

			const splits: string[] = [];
			const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);

			// Every chunk starts `chunkSize - chunkOverlap` tokens after the previous one.
			// Clamping the stride to at least 1 guarantees forward progress even if the
			// overlap is configured to be as large as the chunk size.
			const stride = Math.max(1, this.chunkSize - this.chunkOverlap);

			for (let start_idx = 0; start_idx < input_ids.length; start_idx += stride) {
				const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
				splits.push(this.tokenizer.decode(input_ids.slice(start_idx, end_idx)));

				// The chunk that reaches the final token is the last one; starting another
				// chunk inside the overlap region would only repeat its tail.
				if (end_idx >= input_ids.length) {
					break;
				}
			}

			return splits;
		} catch (tiktokenError) {
			// Fall back to character-based splitting if tiktoken fails
			return estimateTextSplitsByTokens(
				text,
				this.chunkSize,
				this.chunkOverlap,
				this.encodingName,
			);
		}
	} catch (error) {
		// Return empty array on complete failure
		return [];
	}
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,13 @@
|
|||||||
|
import { OperationalError } from 'n8n-workflow';
|
||||||
|
|
||||||
|
import * as helpers from '../../../../utils/helpers';
|
||||||
import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
|
import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
|
||||||
|
import * as tokenEstimator from '../../../../utils/tokenizer/token-estimator';
|
||||||
import { TokenTextSplitter } from '../TokenTextSplitter';
|
import { TokenTextSplitter } from '../TokenTextSplitter';
|
||||||
|
|
||||||
jest.mock('../../../../utils/tokenizer/tiktoken');
|
jest.mock('../../../../utils/tokenizer/tiktoken');
|
||||||
|
jest.mock('../../../../utils/helpers');
|
||||||
|
jest.mock('../../../../utils/tokenizer/token-estimator');
|
||||||
|
|
||||||
describe('TokenTextSplitter', () => {
|
describe('TokenTextSplitter', () => {
|
||||||
let mockTokenizer: jest.Mocked<{
|
let mockTokenizer: jest.Mocked<{
|
||||||
@@ -15,6 +21,8 @@ describe('TokenTextSplitter', () => {
|
|||||||
decode: jest.fn(),
|
decode: jest.fn(),
|
||||||
};
|
};
|
||||||
(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
|
(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
|
||||||
|
// Default mock for hasLongSequentialRepeat - no repetition
|
||||||
|
(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
afterEach(() => {
|
afterEach(() => {
|
||||||
@@ -161,5 +169,175 @@ describe('TokenTextSplitter', () => {
|
|||||||
|
|
||||||
expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
|
expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// `hasLongSequentialRepeat` and `estimateTextSplitsByTokens` are jest mocks in this
// suite, so these tests verify how `splitText` routes between tiktoken and the
// estimator. They do not exercise the repetition heuristic itself.
describe('repetitive content handling', () => {
	it('should use character-based estimation for repetitive content', async () => {
		const splitter = new TokenTextSplitter({
			chunkSize: 100,
			chunkOverlap: 10,
		});

		const repetitiveText = 'a'.repeat(1000);
		const estimatedChunks = ['chunk1', 'chunk2', 'chunk3'];

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

		const result = await splitter.splitText(repetitiveText);

		// Should not call tiktoken
		expect(tiktokenUtils.getEncoding).not.toHaveBeenCalled();
		expect(mockTokenizer.encode).not.toHaveBeenCalled();

		// Should use estimation
		expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(repetitiveText);
		expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
			repetitiveText,
			100,
			10,
			'cl100k_base',
		);

		expect(result).toEqual(estimatedChunks);
	});

	it('should use tiktoken for non-repetitive content', async () => {
		const splitter = new TokenTextSplitter({
			chunkSize: 3,
			chunkOverlap: 0,
		});

		const normalText = 'This is normal text without repetition';
		const mockTokenIds = [1, 2, 3, 4, 5, 6];

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
		mockTokenizer.encode.mockReturnValue(mockTokenIds);
		mockTokenizer.decode.mockImplementation(() => 'chunk');

		const result = await splitter.splitText(normalText);

		// Should check for repetition
		expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(normalText);

		// Should use tiktoken
		expect(tiktokenUtils.getEncoding).toHaveBeenCalled();
		expect(mockTokenizer.encode).toHaveBeenCalled();

		// Should not use estimation
		expect(tokenEstimator.estimateTextSplitsByTokens).not.toHaveBeenCalled();

		// 6 tokens with chunkSize 3 and no overlap -> two decoded chunks
		expect(mockTokenizer.decode).toHaveBeenCalledTimes(2);
		expect(result).toEqual(['chunk', 'chunk']);
	});

	it('should handle repetitive content with different encodings', async () => {
		const splitter = new TokenTextSplitter({
			encodingName: 'o200k_base',
			chunkSize: 50,
			chunkOverlap: 5,
		});

		const repetitiveText = '.'.repeat(500);
		const estimatedChunks = ['estimated chunk 1', 'estimated chunk 2'];

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

		const result = await splitter.splitText(repetitiveText);

		expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
			repetitiveText,
			50,
			5,
			'o200k_base',
		);
		expect(result).toEqual(estimatedChunks);
	});

	it('should handle edge case with exactly 100 repeating characters', async () => {
		const splitter = new TokenTextSplitter();
		const edgeText = 'x'.repeat(100);

		// The detector is mocked to report repetition regardless of the text length
		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(['single chunk']);

		const result = await splitter.splitText(edgeText);

		expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(edgeText);
		expect(result).toEqual(['single chunk']);
	});

	it('should handle mixed content with repetitive sections', async () => {
		const splitter = new TokenTextSplitter();
		const mixedText = 'Normal text ' + 'z'.repeat(200) + ' more normal text';

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
			'chunk1',
			'chunk2',
		]);

		const result = await splitter.splitText(mixedText);

		expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(mixedText);
		expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalled();
		expect(result).toEqual(['chunk1', 'chunk2']);
	});
});
|
||||||
|
|
||||||
|
// Covers the defensive paths of `splitText`: invalid input yields an empty
// result, and tokenizer failures are routed to the (mocked) estimator.
describe('error handling', () => {
	it('should return empty array for null input', async () => {
		const splitter = new TokenTextSplitter();
		const result = await splitter.splitText(null as any);
		expect(result).toEqual([]);
	});

	it('should return empty array for undefined input', async () => {
		const splitter = new TokenTextSplitter();
		const result = await splitter.splitText(undefined as any);
		expect(result).toEqual([]);
	});

	it('should return empty array for non-string input', async () => {
		const splitter = new TokenTextSplitter();
		const result = await splitter.splitText(123 as any);
		expect(result).toEqual([]);
	});

	it('should fall back to estimation if tiktoken fails', async () => {
		const splitter = new TokenTextSplitter();
		const text = 'This will cause tiktoken to fail';

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
		(tiktokenUtils.getEncoding as jest.Mock).mockRejectedValue(new Error('Tiktoken error'));
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
			'fallback chunk',
		]);

		const result = await splitter.splitText(text);

		expect(result).toEqual(['fallback chunk']);
		expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
			text,
			splitter.chunkSize,
			splitter.chunkOverlap,
			splitter.encodingName,
		);
	});

	it('should fall back to estimation if encode fails', async () => {
		const splitter = new TokenTextSplitter();
		const text = 'This will cause encode to fail';

		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
		mockTokenizer.encode.mockImplementation(() => {
			throw new OperationalError('Encode error');
		});
		(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
			'fallback chunk',
		]);

		const result = await splitter.splitText(text);

		expect(result).toEqual(['fallback chunk']);
		// The fallback must receive the splitter's own settings, as in the
		// getEncoding-failure case
		expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
			text,
			splitter.chunkSize,
			splitter.chunkOverlap,
			splitter.encodingName,
		);
	});
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -258,3 +258,50 @@ export function unwrapNestedOutput(output: Record<string, unknown>): Record<stri
|
|||||||
export function nodeNameToToolName(node: INode): string {
|
export function nodeNameToToolName(node: INode): string {
|
||||||
return node.name.replace(/[\s.?!=+#@&*()[\]{}:;,<>\/\\'"^%$]/g, '_').replace(/_+/g, '_');
|
return node.name.replace(/[\s.?!=+#@&*()[\]{}:;,<>\/\\'"^%$]/g, '_').replace(/_+/g, '_');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Detects whether a text contains a single character repeated back-to-back
 * at least `threshold` times.
 * This is used to prevent performance issues with tiktoken on highly repetitive content.
 *
 * Characters are compared per Unicode code point (string iteration), so an
 * emoji made of a surrogate pair counts as one character. Invalid input never
 * throws; it yields `false` so that callers continue with normal processing.
 *
 * @param text The text to check
 * @param threshold The minimum number of sequential repeats to detect (default: 1000)
 * @returns true if a character repeats sequentially for at least the threshold amount;
 * false otherwise, including for non-string or empty `text` and for a `threshold`
 * that is not a positive number
 */
export function hasLongSequentialRepeat(text: string, threshold = 1000): boolean {
	// Validate inputs explicitly instead of relying on a blanket try/catch
	if (typeof text !== 'string' || text.length === 0) {
		return false;
	}

	// `!(threshold > 0)` also rejects NaN, which every comparison below would miss
	if (typeof threshold !== 'number' || !(threshold > 0)) {
		return false;
	}

	// A run can never be longer than the text itself (UTF-16 length >= code point count)
	if (text.length < threshold) {
		return false;
	}

	let runChar: string | undefined;
	let runLength = 0;

	// `for...of` walks the string lazily by code point, without copying it into an array
	for (const char of text) {
		if (char === runChar) {
			runLength++;
		} else {
			runChar = char;
			runLength = 1;
		}

		// Checked on every character, so a run of length 1 satisfies a threshold of 1
		if (runLength >= threshold) {
			return true;
		}
	}

	return false;
}
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import { z } from 'zod';
|
|||||||
import {
|
import {
|
||||||
escapeSingleCurlyBrackets,
|
escapeSingleCurlyBrackets,
|
||||||
getConnectedTools,
|
getConnectedTools,
|
||||||
|
hasLongSequentialRepeat,
|
||||||
nodeNameToToolName,
|
nodeNameToToolName,
|
||||||
unwrapNestedOutput,
|
unwrapNestedOutput,
|
||||||
} from '../helpers';
|
} from '../helpers';
|
||||||
@@ -423,3 +424,107 @@ describe('unwrapNestedOutput', () => {
|
|||||||
expect(unwrapNestedOutput(input)).toEqual(input);
|
expect(unwrapNestedOutput(input)).toEqual(input);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Unit tests for the repetition detector. Most cases pass an explicit threshold of
// 100 to keep fixtures small; the default threshold is covered separately.
describe('hasLongSequentialRepeat', () => {
	it('should return false for text shorter than threshold', () => {
		const text = 'a'.repeat(99);
		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
	});

	it('should return false for normal text without repeats', () => {
		const text = 'This is a normal text without many sequential repeating characters.';
		expect(hasLongSequentialRepeat(text)).toBe(false);
	});

	it('should return true for text with exactly threshold repeats', () => {
		const text = 'a'.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should return true for text with more than threshold repeats', () => {
		const text = 'b'.repeat(150);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should apply the default threshold of 1000 when none is given', () => {
		expect(hasLongSequentialRepeat('a'.repeat(999))).toBe(false);
		expect(hasLongSequentialRepeat('a'.repeat(1000))).toBe(true);
	});

	it('should detect repeats in the middle of text', () => {
		const text = 'Normal text ' + 'x'.repeat(100) + ' more normal text';
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should detect repeats at the end of text', () => {
		const text = 'Normal text at the beginning' + 'z'.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should work with different thresholds', () => {
		const text = 'a'.repeat(50);
		expect(hasLongSequentialRepeat(text, 30)).toBe(true);
		expect(hasLongSequentialRepeat(text, 60)).toBe(false);
	});

	it('should handle special characters', () => {
		const text = '.'.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should handle spaces', () => {
		const text = ' '.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should handle newlines', () => {
		const text = '\n'.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	it('should not detect non-sequential repeats', () => {
		const text = 'ababab'.repeat(50); // 300 chars but no sequential repeats
		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
	});

	it('should handle mixed content with repeats below threshold', () => {
		const text = 'aaa' + 'b'.repeat(50) + 'ccc' + 'd'.repeat(40) + 'eee';
		expect(hasLongSequentialRepeat(text, 100)).toBe(false);
	});

	it('should handle empty string', () => {
		expect(hasLongSequentialRepeat('', 100)).toBe(false);
	});

	it('should work with very large texts', () => {
		const normalText = 'Lorem ipsum dolor sit amet '.repeat(1000);
		const textWithRepeat = normalText + 'A'.repeat(100) + normalText;
		expect(hasLongSequentialRepeat(textWithRepeat, 100)).toBe(true);
	});

	it('should detect unicode character repeats', () => {
		const text = '😀'.repeat(100);
		expect(hasLongSequentialRepeat(text, 100)).toBe(true);
	});

	// The empty-string case is already covered by the test of the same name above.
	describe('error handling', () => {
		it('should handle null input', () => {
			expect(hasLongSequentialRepeat(null as any)).toBe(false);
		});

		it('should handle undefined input', () => {
			expect(hasLongSequentialRepeat(undefined as any)).toBe(false);
		});

		it('should handle non-string input', () => {
			expect(hasLongSequentialRepeat(123 as any)).toBe(false);
			expect(hasLongSequentialRepeat({} as any)).toBe(false);
			expect(hasLongSequentialRepeat([] as any)).toBe(false);
		});

		it('should handle zero or negative threshold', () => {
			const text = 'a'.repeat(100);
			expect(hasLongSequentialRepeat(text, 0)).toBe(false);
			expect(hasLongSequentialRepeat(text, -1)).toBe(false);
		});
	});
});
|
||||||
|
|||||||
@@ -0,0 +1,248 @@
|
|||||||
|
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||||||
|
/* eslint-disable @typescript-eslint/no-unsafe-argument */
|
||||||
|
import {
|
||||||
|
estimateTokensByCharCount,
|
||||||
|
estimateTextSplitsByTokens,
|
||||||
|
estimateTokensFromStringList,
|
||||||
|
} from '../token-estimator';
|
||||||
|
|
||||||
|
// Tests for the character-based token estimation helpers. Expected values in the
// comments use the characters-per-token ratios quoted next to each assertion.
describe('token-estimator', () => {
	describe('estimateTokensByCharCount', () => {
		it('should estimate tokens for text using default model', () => {
			const text = 'This is a test text with some content.';
			const result = estimateTokensByCharCount(text);
			// 38 characters / 4.0 (cl100k_base ratio) = 9.5 -> 10 tokens
			expect(result).toBe(10);
		});

		it('should estimate tokens for different models', () => {
			const text = 'Test text'; // 9 characters

			expect(estimateTokensByCharCount(text, 'gpt-4o')).toBe(3); // 9 / 3.8 = 2.37 -> 3
			expect(estimateTokensByCharCount(text, 'gpt-4')).toBe(3); // 9 / 4.0 = 2.25 -> 3
			expect(estimateTokensByCharCount(text, 'o200k_base')).toBe(3); // 9 / 3.5 = 2.57 -> 3
			expect(estimateTokensByCharCount(text, 'p50k_base')).toBe(3); // 9 / 4.2 = 2.14 -> 3
		});

		it('should use default ratio for unknown models', () => {
			const text = 'Test text with 24 chars.'; // 24 characters
			const result = estimateTokensByCharCount(text, 'unknown-model');
			expect(result).toBe(6); // 24 / 4.0 = 6
		});

		it('should handle empty text', () => {
			expect(estimateTokensByCharCount('')).toBe(0);
			expect(estimateTokensByCharCount('', 'gpt-4')).toBe(0);
		});

		it('should handle null or undefined text', () => {
			expect(estimateTokensByCharCount(null as any)).toBe(0);
			expect(estimateTokensByCharCount(undefined as any)).toBe(0);
		});

		it('should handle non-string input', () => {
			expect(estimateTokensByCharCount(123 as any)).toBe(0);
			expect(estimateTokensByCharCount({} as any)).toBe(0);
			expect(estimateTokensByCharCount([] as any)).toBe(0);
		});

		it('should handle very long text', () => {
			const longText = 'a'.repeat(10000);
			const result = estimateTokensByCharCount(longText);
			expect(result).toBe(2500); // 10000 / 4.0 = 2500
		});

		it('should round up when falling back to the default ratio for an unrecognised model', () => {
			// The ratio table itself cannot be corrupted from a test, so this exercises
			// the fallback through a model name that has no entry.
			const text = 'Test text'; // 9 characters
			const result = estimateTokensByCharCount(text, 'corrupted-model');
			expect(result).toBe(3); // Falls back to 4.0 ratio: 9 / 4.0 = 2.25 -> 3
		});

		it('should round up token estimates', () => {
			expect(estimateTokensByCharCount('a')).toBe(1); // 1 / 4.0 = 0.25 -> 1
			expect(estimateTokensByCharCount('ab')).toBe(1); // 2 / 4.0 = 0.5 -> 1
			expect(estimateTokensByCharCount('abc')).toBe(1); // 3 / 4.0 = 0.75 -> 1
			expect(estimateTokensByCharCount('abcd')).toBe(1); // 4 / 4.0 = 1
			expect(estimateTokensByCharCount('abcde')).toBe(2); // 5 / 4.0 = 1.25 -> 2
		});
	});

	describe('estimateTextSplitsByTokens', () => {
		it('should split text into chunks based on estimated token size', () => {
			const text = 'a'.repeat(400); // 400 characters
			const chunks = estimateTextSplitsByTokens(text, 25, 0); // 25 tokens = 100 chars

			expect(chunks).toHaveLength(4);
			expect(chunks[0]).toHaveLength(100);
			expect(chunks[1]).toHaveLength(100);
			expect(chunks[2]).toHaveLength(100);
			expect(chunks[3]).toHaveLength(100);
		});

		it('should handle chunk overlap', () => {
			const text = 'a'.repeat(200); // 200 characters
			const chunks = estimateTextSplitsByTokens(text, 25, 5); // 25 tokens = 100 chars, 5 tokens = 20 chars overlap

			expect(chunks).toHaveLength(3);
			expect(chunks[0]).toBe('a'.repeat(100)); // First chunk: 0-100
			expect(chunks[1]).toBe('a'.repeat(100)); // Second chunk: 80-180 (20 char overlap)
			expect(chunks[2]).toBe('a'.repeat(40)); // Third chunk: 160-200
		});

		it('should handle text shorter than chunk size', () => {
			const text = 'Short text';
			const chunks = estimateTextSplitsByTokens(text, 100, 0);

			expect(chunks).toHaveLength(1);
			expect(chunks[0]).toBe(text);
		});

		it('should handle empty text', () => {
			expect(estimateTextSplitsByTokens('', 10, 0)).toEqual([]);
		});

		it('should handle null or undefined text', () => {
			expect(estimateTextSplitsByTokens(null as any, 10, 0)).toEqual([]);
			expect(estimateTextSplitsByTokens(undefined as any, 10, 0)).toEqual([]);
		});

		it('should handle non-string input', () => {
			expect(estimateTextSplitsByTokens(123 as any, 10, 0)).toEqual([]);
			expect(estimateTextSplitsByTokens({} as any, 10, 0)).toEqual([]);
		});

		it('should handle invalid chunk size', () => {
			const text = 'Test text';
			expect(estimateTextSplitsByTokens(text, 0, 0)).toEqual([text]);
			expect(estimateTextSplitsByTokens(text, -1, 0)).toEqual([text]);
			expect(estimateTextSplitsByTokens(text, NaN, 0)).toEqual([text]);
			expect(estimateTextSplitsByTokens(text, Infinity, 0)).toEqual([text]);
		});

		it('should handle invalid overlap', () => {
			const text = 'a'.repeat(200);
			// Negative overlap should be treated as 0
			const chunks1 = estimateTextSplitsByTokens(text, 25, -10);
			expect(chunks1).toHaveLength(2);

			// Overlap larger than chunk size should be capped
			const chunks2 = estimateTextSplitsByTokens(text, 25, 30); // overlap capped to 24
			expect(chunks2.length).toBeGreaterThan(2);
		});

		it('should ensure progress even with large overlap', () => {
			const text = 'a'.repeat(100);
			// With overlap = chunkSize - 1, we should still make progress
			const chunks = estimateTextSplitsByTokens(text, 10, 9); // 10 tokens = 40 chars, 9 tokens = 36 chars overlap

			expect(chunks.length).toBeGreaterThan(1);
			// Verify no infinite loop occurs
			expect(chunks.length).toBeLessThan(100);
		});

		it('should work with different models', () => {
			const text = 'a'.repeat(380); // 380 characters
			const chunks = estimateTextSplitsByTokens(text, 100, 0, 'gpt-4o'); // 100 tokens * 3.8 = 380 chars

			expect(chunks).toHaveLength(1);
			expect(chunks[0]).toBe(text);
		});

		it('should use default model ratio for unknown models', () => {
			const text = 'a'.repeat(400);
			const chunks = estimateTextSplitsByTokens(text, 100, 0, 'unknown-model'); // Falls back to 4.0 ratio

			expect(chunks).toHaveLength(1);
			expect(chunks[0]).toBe(text);
		});

		it('should handle edge case where text length equals chunk size', () => {
			const text = 'a'.repeat(100);
			const chunks = estimateTextSplitsByTokens(text, 25, 0); // 25 tokens = 100 chars

			expect(chunks).toHaveLength(1);
			expect(chunks[0]).toBe(text);
		});

		it('should handle unicode text', () => {
			const text = '你好世界'.repeat(25); // 100 characters (4 chars * 25)
			const chunks = estimateTextSplitsByTokens(text, 25, 0);

			expect(chunks.length).toBeGreaterThan(0);
			expect(chunks.join('')).toBe(text);
		});

		it('should return the whole text as a single chunk when it fits', () => {
			// The internal catch block cannot be triggered from outside; the observable
			// contract checked here is that text within one chunk comes back unchanged.
			const text = 'Test text'; // 9 chars, well below 10 tokens = 40 chars
			const result = estimateTextSplitsByTokens(text, 10, 0);
			expect(result).toEqual([text]);
		});
	});

	describe('estimateTokensFromStringList', () => {
		// Since this function uses tiktoken which requires external data files,
		// we'll test it with integration-style tests that don't require mocking

		it('should handle empty list', async () => {
			const result = await estimateTokensFromStringList([], 'gpt-4');
			expect(result).toBe(0);
		});

		it('should handle non-array input', async () => {
			const result = await estimateTokensFromStringList(null as any, 'gpt-4');
			expect(result).toBe(0);

			const result2 = await estimateTokensFromStringList('not an array' as any, 'gpt-4');
			expect(result2).toBe(0);
		});

		it('should handle null/undefined items in list', async () => {
			const list = ['Valid text', null, undefined, '', 123 as any];
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toEqual(2);
		});

		it('should estimate tokens for normal text', async () => {
			const list = ['Hello world', 'Test text'];
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toBeGreaterThan(0);
		});

		it('should use character-based estimation for repetitive content', async () => {
			const list = ['a'.repeat(1500)];
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toBe(375); // 1500 chars / 4.0 = 375 tokens
		});

		it('should handle mixed content', async () => {
			const list = ['Normal text content', 'a'.repeat(1500), 'More normal text'];
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toBeGreaterThan(375); // At least the repetitive content tokens
		});

		it('should work with different models', async () => {
			const list = ['Test text for different model'];
			const result1 = await estimateTokensFromStringList(list, 'gpt-4');
			const result2 = await estimateTokensFromStringList(list, 'gpt-4o');
			// Both should return positive values
			expect(result1).toBeGreaterThan(0);
			expect(result2).toBeGreaterThan(0);
		});

		it('should handle very long lists', async () => {
			const list = Array(10000).fill('Sample text');
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toBeGreaterThan(0);
		});

		it('should handle unicode text', async () => {
			const list = ['你好世界', '🌍🌎🌏', 'مرحبا بالعالم'];
			const result = await estimateTokensFromStringList(list, 'gpt-4');
			expect(result).toBeGreaterThan(0);
		});
	});
});
|
||||||
176
packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
Normal file
176
packages/@n8n/nodes-langchain/utils/tokenizer/token-estimator.ts
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
/**
 * Token estimation utilities based on character counts rather than tiktoken encoding.
 * They are used as a fallback when tiktoken would be too slow (e.g., with repetitive content)
 * or when encoding fails.
 */
|
||||||
|
|
||||||
|
import type { TiktokenModel } from 'js-tiktoken';
|
||||||
|
|
||||||
|
import { encodingForModel } from './tiktoken';
|
||||||
|
import { hasLongSequentialRepeat } from '../helpers';
|
||||||
|
|
||||||
|
/**
 * Average number of characters per token, keyed by model or encoding name.
 * The figures are approximations for typical English text; names missing
 * from this table are expected to be handled by the caller with a default.
 */
const MODEL_CHAR_PER_TOKEN_RATIOS: Record<string, number> = {
	// Model names
	'gpt-4o': 3.8,
	'gpt-4': 4,
	'gpt-3.5-turbo': 4,

	// Encoding names
	'cl100k_base': 4,
	'o200k_base': 3.5,
	'p50k_base': 4.2,
	'r50k_base': 4.2,
};
|
||||||
|
|
||||||
|
/**
 * Approximates a token count from the length of the text alone.
 * Much cheaper than running tiktoken, at the cost of accuracy.
 *
 * @param text The text whose token count is wanted
 * @param model Model or encoding name used to pick a chars-per-token ratio (optional)
 * @returns The estimated token count; 0 for empty or non-string input
 */
export function estimateTokensByCharCount(text: string, model: string = 'cl100k_base'): number {
	try {
		// Empty strings and anything that is not a string carry no tokens.
		if (typeof text !== 'string' || text.length === 0) {
			return 0;
		}

		// Unknown names have no table entry and use the default of 4 chars per token.
		const tableRatio = MODEL_CHAR_PER_TOKEN_RATIOS[model] || 4.0;

		// A looked-up value that is not a positive finite number is replaced by the default.
		const ratioIsUsable = Number.isFinite(tableRatio) && tableRatio > 0;
		const divisor = ratioIsUsable ? tableRatio : 4.0;

		return Math.ceil(text.length / divisor);
	} catch {
		// Best effort: estimate with the default ratio if anything above failed.
		return Math.ceil((text?.length || 0) / 4.0);
	}
}
|
||||||
|
|
||||||
|
/**
 * Splits text into chunks whose sizes are expressed in tokens, using a
 * character-based approximation instead of real tokenization.
 * Chunk boundaries are character (UTF-16 code unit) offsets derived from the
 * chars-per-token ratio of the given model, not actual token boundaries.
 *
 * @param text The text to split
 * @param chunkSize Target chunk size in tokens
 * @param chunkOverlap Overlap between consecutive chunks in tokens; invalid values are
 *   treated as 0 and values of `chunkSize` or more are capped just below it
 * @param model The model or encoding name (optional)
 * @returns Array of text chunks; `[]` for empty or non-string text, and `[text]` when
 *   `chunkSize` is not a positive finite number or an unexpected error occurs
 */
export function estimateTextSplitsByTokens(
	text: string,
	chunkSize: number,
	chunkOverlap: number,
	model: string = 'cl100k_base',
): string[] {
	try {
		// Nothing to split for empty or non-string input.
		if (!text || typeof text !== 'string' || text.length === 0) {
			return [];
		}

		// An unusable chunk size cannot drive the loop below; return the text untouched.
		if (!Number.isFinite(chunkSize) || chunkSize <= 0) {
			return [text];
		}

		// Keep the overlap inside [0, chunkSize - 1]. The lower bound matters for chunk
		// sizes below one token, where `chunkSize - 1` would otherwise be negative.
		const validOverlap =
			Number.isFinite(chunkOverlap) && chunkOverlap >= 0
				? Math.max(0, Math.min(chunkOverlap, chunkSize - 1))
				: 0;

		// The lookup yields undefined for unknown names and may yield a non-number
		// inherited from Object.prototype (e.g. model === 'constructor'). Fall back to
		// 4 chars per token in both cases, as estimateTokensByCharCount does.
		const ratioFromTable: unknown = MODEL_CHAR_PER_TOKEN_RATIOS[model];
		const charsPerToken =
			typeof ratioFromTable === 'number' && Number.isFinite(ratioFromTable) && ratioFromTable > 0
				? ratioFromTable
				: 4.0;

		// Every chunk holds at least one character, and the overlap is strictly smaller
		// than the chunk, so each iteration makes forward progress and no text is lost.
		const chunkSizeInChars = Math.max(1, Math.floor(chunkSize * charsPerToken));
		const overlapInChars = Math.min(
			Math.floor(validOverlap * charsPerToken),
			chunkSizeInChars - 1,
		);
		const step = chunkSizeInChars - overlapInChars;

		const chunks: string[] = [];

		for (let start = 0; start < text.length; start += step) {
			const end = Math.min(start + chunkSizeInChars, text.length);
			chunks.push(text.slice(start, end));

			// The chunk that reaches the end of the text is the last one; stopping here
			// avoids emitting a trailing chunk made only of overlap.
			if (end >= text.length) {
				break;
			}
		}

		return chunks;
	} catch {
		// Best effort: return the text as a single chunk on unexpected errors.
		return text ? [text] : [];
	}
}
|
||||||
|
|
||||||
|
/**
 * Loads the tiktoken encoder for a model, resolving to `undefined` instead of
 * throwing when the encoder cannot be obtained.
 */
async function loadEncoderOrUndefined(model: TiktokenModel) {
	try {
		return await encodingForModel(model);
	} catch {
		return undefined;
	}
}

/**
 * Estimates the total number of tokens for a list of strings.
 * Uses tiktoken for normal text and falls back to character-based estimation
 * for repetitive content, when the encoder cannot be loaded, or when encoding
 * an individual string fails.
 *
 * @param list Array of strings to estimate tokens for; entries that are not
 *   non-empty strings count as 0 tokens
 * @param model The model or encoding name to use for estimation
 * @returns Total estimated number of tokens across all strings; 0 when `list`
 *   is not an array
 */
export async function estimateTokensFromStringList(
	list: string[],
	model: TiktokenModel,
): Promise<number> {
	if (!Array.isArray(list)) {
		return 0;
	}

	// Load the encoder once up front. If that fails, every string is estimated
	// from its length rather than the whole list being reported as 0 tokens.
	const encoder = await loadEncoderOrUndefined(model);

	let totalTokens = 0;

	for (const text of list) {
		// Skip null/undefined, empty strings and non-string entries.
		if (!text || typeof text !== 'string') {
			continue;
		}

		try {
			// Repetitive content is deliberately kept away from tiktoken and is
			// estimated from its character count instead.
			if (encoder && !hasLongSequentialRepeat(text)) {
				totalTokens += encoder.encode(text).length;
				continue;
			}
		} catch {
			// Fall through to the character-based estimate for this string.
		}

		totalTokens += estimateTokensByCharCount(text, model);
	}

	return totalTokens;
}
|
||||||
Reference in New Issue
Block a user