import { OperationalError } from 'n8n-workflow';

import * as helpers from '../../../../utils/helpers';
import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
import * as tokenEstimator from '../../../../utils/tokenizer/token-estimator';
import { TokenTextSplitter } from '../TokenTextSplitter';

jest.mock('../../../../utils/tokenizer/tiktoken');
jest.mock('../../../../utils/helpers');
jest.mock('../../../../utils/tokenizer/token-estimator');
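// The tokenizer stack is fully mocked above: getEncoding() resolves to a hand-rolled
// encode/decode pair, hasLongSequentialRepeat() controls the repetitive-content branch,
// and estimateTextSplitsByTokens() stands in for the character-based fallback. Each test
// below configures these mocks explicitly, so no real tiktoken encoding is ever loaded.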
describe('TokenTextSplitter', () => {
	let mockTokenizer: jest.Mocked<{
		encode: jest.Mock;
		decode: jest.Mock;
	}>;

	beforeEach(() => {
		mockTokenizer = {
			encode: jest.fn(),
			decode: jest.fn(),
		};
		(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
		// Default mock for hasLongSequentialRepeat - no repetition
		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
	});

	afterEach(() => {
		jest.clearAllMocks();
	});

	describe('constructor', () => {
		it('should initialize with default parameters', () => {
			const splitter = new TokenTextSplitter();

			expect(splitter.encodingName).toBe('cl100k_base');
			expect(splitter.allowedSpecial).toEqual([]);
			expect(splitter.disallowedSpecial).toBe('all');
		});

		it('should initialize with custom parameters', () => {
			const splitter = new TokenTextSplitter({
				encodingName: 'o200k_base',
				allowedSpecial: ['<|special|>'],
				disallowedSpecial: ['<|bad|>'],
				chunkSize: 500,
				chunkOverlap: 50,
			});

			expect(splitter.encodingName).toBe('o200k_base');
			expect(splitter.allowedSpecial).toEqual(['<|special|>']);
			expect(splitter.disallowedSpecial).toEqual(['<|bad|>']);
			expect(splitter.chunkSize).toBe(500);
			expect(splitter.chunkOverlap).toBe(50);
		});

		it('should have correct lc_name', () => {
			expect(TokenTextSplitter.lc_name()).toBe('TokenTextSplitter');
		});
	});
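	// splitText() is expected to lazily fetch the tokenizer via getEncoding(encodingName),
	// call encode(text, allowedSpecial, disallowedSpecial), and decode token windows back
	// into string chunks. The tests below pin down that contract against the mocked
	// tokenizer; they do not exercise real BPE encodings.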
	describe('splitText', () => {
		it('should split text into chunks based on token count', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 3,
				chunkOverlap: 0,
			});

			const inputText = 'Hello world, this is a test';
			const mockTokenIds = [1, 2, 3, 4, 5, 6, 7, 8];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
				const chunks = [
					[1, 2, 3],
					[4, 5, 6],
					[7, 8],
				];
				const chunkTexts = ['Hello world,', ' this is', ' a test'];
				const index = chunks.findIndex(
					(chunk) => chunk.length === tokens.length && chunk.every((val, i) => val === tokens[i]),
				);
				return chunkTexts[index] || '';
			});

			const result = await splitter.splitText(inputText);

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('cl100k_base');
			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, [], 'all');
			expect(result).toEqual(['Hello world,', ' this is', ' a test']);
		});

		it('should handle empty text', async () => {
			const splitter = new TokenTextSplitter();
			mockTokenizer.encode.mockReturnValue([]);

			const result = await splitter.splitText('');

			expect(result).toEqual([]);
		});

		it('should handle text shorter than chunk size', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 10,
				chunkOverlap: 0,
			});

			const inputText = 'Short text';
			const mockTokenIds = [1, 2];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockReturnValue('Short text');

			const result = await splitter.splitText(inputText);

			expect(result).toEqual(['Short text']);
		});

		it('should use custom encoding and special tokens', async () => {
			const splitter = new TokenTextSplitter({
				encodingName: 'o200k_base',
				allowedSpecial: ['<|special|>'],
				disallowedSpecial: ['<|bad|>'],
			});

			const inputText = 'Text with <|special|> tokens';
			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
			mockTokenizer.decode.mockReturnValue('Text with <|special|> tokens');

			await splitter.splitText(inputText);

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('o200k_base');
			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, ['<|special|>'], ['<|bad|>']);
		});

		it('should reuse tokenizer on subsequent calls', async () => {
			const splitter = new TokenTextSplitter();
			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
			mockTokenizer.decode.mockReturnValue('test');

			await splitter.splitText('first call');
			await splitter.splitText('second call');

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledTimes(1);
		});
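		// With chunkSize 2 and chunkOverlap 1, the window over the token ids is expected to
		// advance by chunkSize - chunkOverlap = 1 id per chunk, i.e. [1,2] -> 'One two',
		// [2,3] -> 'two three', ..., [5,6] -> 'five six' (a sliding-window sketch inferred
		// from the assertions below, not the implementation itself).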
		it('should handle large text with multiple chunks and overlap', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 2,
				chunkOverlap: 1,
			});

			const inputText = 'One two three four five six';
			const mockTokenIds = [1, 2, 3, 4, 5, 6];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
				const chunkMap: Record<string, string> = {
					'1,2': 'One two',
					'2,3': 'two three',
					'3,4': 'three four',
					'4,5': 'four five',
					'5,6': 'five six',
				};
				return chunkMap[tokens.join(',')] || '';
			});

			const result = await splitter.splitText(inputText);

			expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
		});
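		// When helpers.hasLongSequentialRepeat() flags the input, the splitter is expected to
		// skip tiktoken entirely and delegate to estimateTextSplitsByTokens(text, chunkSize,
		// chunkOverlap, encodingName), presumably to avoid pathological tokenizer behaviour
		// on highly repetitive text.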
		describe('repetitive content handling', () => {
			it('should use character-based estimation for repetitive content', async () => {
				const splitter = new TokenTextSplitter({
					chunkSize: 100,
					chunkOverlap: 10,
				});

				const repetitiveText = 'a'.repeat(1000);
				const estimatedChunks = ['chunk1', 'chunk2', 'chunk3'];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

				const result = await splitter.splitText(repetitiveText);

				// Should not call tiktoken
				expect(tiktokenUtils.getEncoding).not.toHaveBeenCalled();
				expect(mockTokenizer.encode).not.toHaveBeenCalled();

				// Should use estimation
				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(repetitiveText);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					repetitiveText,
					100,
					10,
					'cl100k_base',
				);

				expect(result).toEqual(estimatedChunks);
			});

			it('should use tiktoken for non-repetitive content', async () => {
				const splitter = new TokenTextSplitter({
					chunkSize: 3,
					chunkOverlap: 0,
				});

				const normalText = 'This is normal text without repetition';
				const mockTokenIds = [1, 2, 3, 4, 5, 6];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				mockTokenizer.encode.mockReturnValue(mockTokenIds);
				mockTokenizer.decode.mockImplementation(() => 'chunk');

				await splitter.splitText(normalText);

				// Should check for repetition
				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(normalText);

				// Should use tiktoken
				expect(tiktokenUtils.getEncoding).toHaveBeenCalled();
				expect(mockTokenizer.encode).toHaveBeenCalled();

				// Should not use estimation
				expect(tokenEstimator.estimateTextSplitsByTokens).not.toHaveBeenCalled();
			});

			it('should handle repetitive content with different encodings', async () => {
				const splitter = new TokenTextSplitter({
					encodingName: 'o200k_base',
					chunkSize: 50,
					chunkOverlap: 5,
				});

				const repetitiveText = '.'.repeat(500);
				const estimatedChunks = ['estimated chunk 1', 'estimated chunk 2'];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

				const result = await splitter.splitText(repetitiveText);

				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					repetitiveText,
					50,
					5,
					'o200k_base',
				);
				expect(result).toEqual(estimatedChunks);
			});

			it('should handle edge case with exactly 100 repeating characters', async () => {
				const splitter = new TokenTextSplitter();
				const edgeText = 'x'.repeat(100);

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(['single chunk']);

				const result = await splitter.splitText(edgeText);

				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(edgeText);
				expect(result).toEqual(['single chunk']);
			});

			it('should handle mixed content with repetitive sections', async () => {
				const splitter = new TokenTextSplitter();
				const mixedText = 'Normal text ' + 'z'.repeat(200) + ' more normal text';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'chunk1',
					'chunk2',
				]);

				const result = await splitter.splitText(mixedText);

				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(mixedText);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalled();
				expect(result).toEqual(['chunk1', 'chunk2']);
			});
		});
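		// Invalid input (null, undefined, non-string) is expected to yield an empty array,
		// and any tokenizer failure - getEncoding() rejecting or encode() throwing - is
		// expected to fall back to the character-based estimator rather than surfacing the error.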
		describe('error handling', () => {
			it('should return empty array for null input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(null as any);
				expect(result).toEqual([]);
			});

			it('should return empty array for undefined input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(undefined as any);
				expect(result).toEqual([]);
			});

			it('should return empty array for non-string input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(123 as any);
				expect(result).toEqual([]);
			});

			it('should fall back to estimation if tiktoken fails', async () => {
				const splitter = new TokenTextSplitter();
				const text = 'This will cause tiktoken to fail';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				(tiktokenUtils.getEncoding as jest.Mock).mockRejectedValue(new Error('Tiktoken error'));
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'fallback chunk',
				]);

				const result = await splitter.splitText(text);

				expect(result).toEqual(['fallback chunk']);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					text,
					splitter.chunkSize,
					splitter.chunkOverlap,
					splitter.encodingName,
				);
			});

			it('should fall back to estimation if encode fails', async () => {
				const splitter = new TokenTextSplitter();
				const text = 'This will cause encode to fail';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				mockTokenizer.encode.mockImplementation(() => {
					throw new OperationalError('Encode error');
				});
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'fallback chunk',
				]);

				const result = await splitter.splitText(text);

				expect(result).toEqual(['fallback chunk']);
			});
		});
	});
});