feat(Token Splitter Node): Replace remote tiktoken encoding with local implementation (#16548)

This commit is contained in:
oleg
2025-06-20 16:08:16 +02:00
committed by GitHub
parent 79650ea55a
commit 2d638023be
11 changed files with 427 additions and 32 deletions

View File

@@ -0,0 +1,125 @@
/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-var-requires */
import type { TiktokenEncoding } from 'js-tiktoken/lite';
import { Tiktoken } from 'js-tiktoken/lite';
import { getEncoding, encodingForModel } from '../tokenizer/tiktoken';
jest.mock('js-tiktoken/lite', () => ({
Tiktoken: jest.fn(),
getEncodingNameForModel: jest.fn(),
}));
jest.mock('../tokenizer/cl100k_base.json', () => ({ mockCl100kBase: 'data' }), { virtual: true });
jest.mock('../tokenizer/o200k_base.json', () => ({ mockO200kBase: 'data' }), { virtual: true });
describe('tiktoken utils', () => {
beforeEach(() => {
jest.clearAllMocks();
});
describe('getEncoding', () => {
it('should return Tiktoken instance for cl100k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('cl100k_base');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should return Tiktoken instance for o200k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('o200k_base');
expect(Tiktoken).toHaveBeenCalledWith({ mockO200kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should map p50k_base to cl100k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('p50k_base');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should map r50k_base to cl100k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('r50k_base');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should map gpt2 to cl100k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('gpt2');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should map p50k_edit to cl100k_base encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await getEncoding('p50k_edit');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should return cl100k_base for unknown encoding', async () => {
const mockTiktoken = {};
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
const result = await getEncoding('unknown_encoding' as unknown as TiktokenEncoding);
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
});
describe('encodingForModel', () => {
it('should call getEncodingNameForModel and return encoding for cl100k_base', async () => {
const mockGetEncodingNameForModel = require('js-tiktoken/lite').getEncodingNameForModel;
const mockTiktoken = {};
mockGetEncodingNameForModel.mockReturnValue('cl100k_base');
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await encodingForModel('gpt-3.5-turbo');
expect(mockGetEncodingNameForModel).toHaveBeenCalledWith('gpt-3.5-turbo');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
it('should handle gpt-4 model with cl100k_base', async () => {
const mockGetEncodingNameForModel = require('js-tiktoken/lite').getEncodingNameForModel;
const mockTiktoken = {};
mockGetEncodingNameForModel.mockReturnValue('cl100k_base');
(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
const result = await encodingForModel('gpt-4');
expect(mockGetEncodingNameForModel).toHaveBeenCalledWith('gpt-4');
expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
expect(result).toBe(mockTiktoken);
});
});
});