mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-16 09:36:44 +00:00
feat(Token Splitter Node): Replace remote tiktoken encoding with local implementation (#16548)
This commit is contained in:
125
packages/@n8n/nodes-langchain/utils/tests/tiktoken.test.ts
Normal file
125
packages/@n8n/nodes-langchain/utils/tests/tiktoken.test.ts
Normal file
@@ -0,0 +1,125 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-var-requires */
|
||||
import type { TiktokenEncoding } from 'js-tiktoken/lite';
|
||||
import { Tiktoken } from 'js-tiktoken/lite';
|
||||
|
||||
import { getEncoding, encodingForModel } from '../tokenizer/tiktoken';
|
||||
|
||||
// Swap the tokenizer library for bare jest mocks so the tests can observe how
// `Tiktoken` is constructed and control what `getEncodingNameForModel` returns.
jest.mock('js-tiktoken/lite', () => {
	return {
		Tiktoken: jest.fn(),
		getEncodingNameForModel: jest.fn(),
	};
});

// Stand in for the rank files with tiny marker objects. They are registered as
// virtual mocks, so each test can tell which table was handed to `Tiktoken`.
jest.mock(
	'../tokenizer/cl100k_base.json',
	() => {
		return { mockCl100kBase: 'data' };
	},
	{ virtual: true },
);
jest.mock(
	'../tokenizer/o200k_base.json',
	() => {
		return { mockO200kBase: 'data' };
	},
	{ virtual: true },
);
|
||||
|
||||
describe('tiktoken utils', () => {
	// Marker payloads that the mocked JSON rank modules resolve to.
	const cl100kFixture = { mockCl100kBase: 'data' };
	const o200kFixture = { mockO200kBase: 'data' };

	// The mocked constructor, typed so that jest's mock helpers are reachable.
	const tiktokenCtor = Tiktoken as unknown as jest.Mock;

	/** Makes the mocked `Tiktoken` constructor yield a fresh sentinel object and returns that sentinel. */
	const stubTiktoken = () => {
		const sentinel = {};
		tiktokenCtor.mockReturnValue(sentinel);
		return sentinel;
	};

	beforeEach(() => {
		jest.clearAllMocks();
	});

	describe('getEncoding', () => {
		// Encodings that are backed by their own rank table.
		it.each<[TiktokenEncoding, Record<string, string>]>([
			['cl100k_base', cl100kFixture],
			['o200k_base', o200kFixture],
		])('should return Tiktoken instance for %s encoding', async (encodingName, expectedRanks) => {
			const sentinel = stubTiktoken();

			const result = await getEncoding(encodingName);

			expect(tiktokenCtor).toHaveBeenCalledWith(expectedRanks);
			expect(result).toBe(sentinel);
		});

		// Encodings without a table of their own are served from cl100k_base.
		const aliasedEncodings: TiktokenEncoding[] = ['p50k_base', 'r50k_base', 'gpt2', 'p50k_edit'];

		it.each(aliasedEncodings)('should map %s to cl100k_base encoding', async (encodingName) => {
			const sentinel = stubTiktoken();

			const result = await getEncoding(encodingName);

			expect(tiktokenCtor).toHaveBeenCalledWith(cl100kFixture);
			expect(result).toBe(sentinel);
		});

		it('should return cl100k_base for unknown encoding', async () => {
			const sentinel = stubTiktoken();

			// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
			const result = await getEncoding('unknown_encoding' as unknown as TiktokenEncoding);

			expect(tiktokenCtor).toHaveBeenCalledWith(cl100kFixture);
			expect(result).toBe(sentinel);
		});
	});

	describe('encodingForModel', () => {
		type ModelName = Parameters<typeof encodingForModel>[0];

		/**
		 * Drives `encodingForModel` with the name lookup stubbed to 'cl100k_base' and
		 * checks that the lookup received the model and the cl100k table was used.
		 */
		const expectCl100kFor = async (model: ModelName) => {
			const lookupMock = require('js-tiktoken/lite').getEncodingNameForModel;
			const sentinel = stubTiktoken();
			lookupMock.mockReturnValue('cl100k_base');

			const result = await encodingForModel(model);

			expect(lookupMock).toHaveBeenCalledWith(model);
			expect(tiktokenCtor).toHaveBeenCalledWith(cl100kFixture);
			expect(result).toBe(sentinel);
		};

		it('should call getEncodingNameForModel and return encoding for cl100k_base', async () => {
			await expectCl100kFor('gpt-3.5-turbo');
		});

		it('should handle gpt-4 model with cl100k_base', async () => {
			await expectCl100kFor('gpt-4');
		});
	});
});
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
30
packages/@n8n/nodes-langchain/utils/tokenizer/tiktoken.ts
Normal file
30
packages/@n8n/nodes-langchain/utils/tokenizer/tiktoken.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import type { TiktokenBPE, TiktokenEncoding, TiktokenModel } from 'js-tiktoken/lite';
|
||||
import { Tiktoken, getEncodingNameForModel } from 'js-tiktoken/lite';
|
||||
|
||||
import cl100k_base from './cl100k_base.json';
|
||||
import o200k_base from './o200k_base.json';
|
||||
|
||||
/**
 * Builds a `Tiktoken` tokenizer for the requested encoding from the rank tables
 * bundled next to this module.
 *
 * Only `cl100k_base` and `o200k_base` have their own table. The remaining
 * encoding names (`p50k_base`, `r50k_base`, `gpt2`, `p50k_edit`) are served by
 * the `cl100k_base` table, so tokens produced for them come from the
 * `cl100k_base` vocabulary rather than their native one.
 *
 * @param encoding - Name of the encoding to load.
 * @returns A new `Tiktoken` instance; names that are not a key of the mapping
 * fall back to `cl100k_base`.
 */
export async function getEncoding(encoding: TiktokenEncoding): Promise<Tiktoken> {
	const encodings = {
		cl100k_base: cl100k_base as TiktokenBPE,
		o200k_base: o200k_base as TiktokenBPE,
	};
	const encodingsMap: Record<TiktokenEncoding, TiktokenBPE> = {
		cl100k_base: encodings.cl100k_base,
		p50k_base: encodings.cl100k_base,
		r50k_base: encodings.cl100k_base,
		gpt2: encodings.cl100k_base,
		p50k_edit: encodings.cl100k_base,
		o200k_base: encodings.o200k_base,
	};

	// Use an own-property check rather than `in`: `in` also matches keys inherited
	// from Object.prototype (e.g. 'toString', 'constructor'), which would bypass
	// the fallback and hand a function to the Tiktoken constructor.
	const isKnownEncoding = Object.prototype.hasOwnProperty.call(encodingsMap, encoding);
	const ranks = isKnownEncoding ? encodingsMap[encoding] : encodings.cl100k_base;

	return new Tiktoken(ranks);
}
|
||||
|
||||
/**
 * Resolves the encoding name for a model via `getEncodingNameForModel` and
 * returns the tokenizer that `getEncoding` builds for that name.
 *
 * @param model - Model identifier to look up.
 * @returns The `Tiktoken` instance for the model's encoding.
 */
export async function encodingForModel(model: TiktokenModel) {
	const encodingName = getEncodingNameForModel(model);
	const tokenizer = await getEncoding(encodingName);
	return tokenizer;
}
|
||||
Reference in New Issue
Block a user