feat(Token Splitter Node): Replace remote tiktoken encoding with local implementation (#16548)
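Summary: the Token Splitter node (and the LLM-tracing token counter) previously used LangChain's encodingForModel from @langchain/core/utils/tiktoken, which resolves BPE ranks remotely at runtime; this commit bundles the cl100k_base and o200k_base encodings as local JSON and adds a small @utils/tokenizer/tiktoken wrapper around js-tiktoken/lite. A sketch of the resulting call path (illustrative only; the real definitions are in the diff below):

// Illustrative sketch, not part of the commit: counting tokens with the
// new local utility instead of the remote LangChain helper.
import { encodingForModel } from '@utils/tokenizer/tiktoken';

async function countTokens(text: string): Promise<number> {
	// 'gpt-4' resolves to the bundled cl100k_base ranks; no network fetch.
	const tokenizer = await encodingForModel('gpt-4');
	return tokenizer.encode(text).length;
}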
@@ -12,7 +12,9 @@
 			"**/dist",
 			"**/package.json",
 			"**/pnpm-lock.yaml",
-			"**/CHANGELOG.md"
+			"**/CHANGELOG.md",
+			"**/cl100k_base.json",
+			"**/o200k_base.json"
 		]
 	},
 	"formatter": {

@@ -8,12 +8,12 @@ import type {
 } from '@langchain/core/load/serializable';
 import type { BaseMessage } from '@langchain/core/messages';
 import type { LLMResult } from '@langchain/core/outputs';
-import { encodingForModel } from '@langchain/core/utils/tiktoken';
 import pick from 'lodash/pick';
 import type { IDataObject, ISupplyDataFunctions, JsonObject } from 'n8n-workflow';
 import { NodeConnectionTypes, NodeError, NodeOperationError } from 'n8n-workflow';
 
 import { logAiEvent } from '@utils/helpers';
+import { encodingForModel } from '@utils/tokenizer/tiktoken';
 
 type TokensUsageParser = (llmOutput: LLMResult['llmOutput']) => {
 	completionTokens: number;
@@ -1,5 +1,4 @@
 /* eslint-disable n8n-nodes-base/node-dirname-against-convention */
-import { TokenTextSplitter } from '@langchain/textsplitters';
 import {
 	NodeConnectionTypes,
 	type INodeType,
@@ -11,6 +10,8 @@ import {
 import { logWrapper } from '@utils/logWrapper';
 import { getConnectionHintNoticeField } from '@utils/sharedFields';
 
+import { TokenTextSplitter } from './TokenTextSplitter';
+
 export class TextSplitterTokenSplitter implements INodeType {
 	description: INodeTypeDescription = {
 		displayName: 'Token Splitter',
@@ -71,9 +72,6 @@ export class TextSplitterTokenSplitter implements INodeType {
 			disallowedSpecial: 'all',
 			encodingName: 'cl100k_base',
 			keepSeparator: false,
-			// allowedSpecial: 'all',
-			// disallowedSpecial: 'all',
-			// encodingName: 'cl100k_base',
 		});
 
 		return {
@@ -0,0 +1,57 @@
+/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
+import type { TokenTextSplitterParams } from '@langchain/textsplitters';
+import { TextSplitter } from '@langchain/textsplitters';
+import type * as tiktoken from 'js-tiktoken';
+
+import { getEncoding } from '@utils/tokenizer/tiktoken';
+
+/**
+ * Implementation of a splitter which looks at tokens.
+ * This is an override of the LangChain TokenTextSplitter
+ * that uses the n8n tokenizer utility, which loads local JSON encodings.
+ */
+export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
+	static lc_name() {
+		return 'TokenTextSplitter';
+	}
+
+	encodingName: tiktoken.TiktokenEncoding;
+
+	allowedSpecial: 'all' | string[];
+
+	disallowedSpecial: 'all' | string[];
+
+	private tokenizer: tiktoken.Tiktoken | undefined;
+
+	constructor(fields?: Partial<TokenTextSplitterParams>) {
+		super(fields);
+
+		this.encodingName = fields?.encodingName ?? 'cl100k_base';
+		this.allowedSpecial = fields?.allowedSpecial ?? [];
+		this.disallowedSpecial = fields?.disallowedSpecial ?? 'all';
+	}
+
+	async splitText(text: string): Promise<string[]> {
+		if (!this.tokenizer) {
+			this.tokenizer = await getEncoding(this.encodingName);
+		}
+
+		const splits: string[] = [];
+
+		const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
+
+		let start_idx = 0;
+
+		while (start_idx < input_ids.length) {
+			if (start_idx > 0) {
+				start_idx -= this.chunkOverlap;
+			}
+			const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
+			const chunk_ids = input_ids.slice(start_idx, end_idx);
+			splits.push(this.tokenizer.decode(chunk_ids));
+			start_idx = end_idx;
+		}
+
+		return splits;
+	}
+}
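The windowing loop above is easiest to see with a small worked example. A usage sketch, not part of the commit, consistent with the unit test that follows: with chunkSize = 2 and chunkOverlap = 1, token ids [1, 2, 3, 4, 5, 6] produce the windows [1,2], [2,3], [3,4], [4,5], [5,6], because after the first chunk start_idx steps back by chunkOverlap before each new window, so consecutive chunks share one token.

// Illustrative usage (not part of the commit); values mirror the unit test below.
import { TokenTextSplitter } from './TokenTextSplitter';

async function demo(): Promise<void> {
	const splitter = new TokenTextSplitter({ chunkSize: 2, chunkOverlap: 1 });
	const chunks = await splitter.splitText('One two three four five six');
	// e.g. ['One two', 'two three', 'three four', 'four five', 'five six']
	console.log(chunks);
}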
@@ -0,0 +1,165 @@
+import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
+import { TokenTextSplitter } from '../TokenTextSplitter';
+
+jest.mock('../../../../utils/tokenizer/tiktoken');
+
+describe('TokenTextSplitter', () => {
+	let mockTokenizer: jest.Mocked<{
+		encode: jest.Mock;
+		decode: jest.Mock;
+	}>;
+
+	beforeEach(() => {
+		mockTokenizer = {
+			encode: jest.fn(),
+			decode: jest.fn(),
+		};
+		(tiktokenUtils.getEncoding as jest.Mock).mockResolvedValue(mockTokenizer);
+	});
+
+	afterEach(() => {
+		jest.clearAllMocks();
+	});
+
+	describe('constructor', () => {
+		it('should initialize with default parameters', () => {
+			const splitter = new TokenTextSplitter();
+
+			expect(splitter.encodingName).toBe('cl100k_base');
+			expect(splitter.allowedSpecial).toEqual([]);
+			expect(splitter.disallowedSpecial).toBe('all');
+		});
+
+		it('should initialize with custom parameters', () => {
+			const splitter = new TokenTextSplitter({
+				encodingName: 'o200k_base',
+				allowedSpecial: ['<|special|>'],
+				disallowedSpecial: ['<|bad|>'],
+				chunkSize: 500,
+				chunkOverlap: 50,
+			});
+
+			expect(splitter.encodingName).toBe('o200k_base');
+			expect(splitter.allowedSpecial).toEqual(['<|special|>']);
+			expect(splitter.disallowedSpecial).toEqual(['<|bad|>']);
+			expect(splitter.chunkSize).toBe(500);
+			expect(splitter.chunkOverlap).toBe(50);
+		});
+
+		it('should have correct lc_name', () => {
+			expect(TokenTextSplitter.lc_name()).toBe('TokenTextSplitter');
+		});
+	});
+
+	describe('splitText', () => {
+		it('should split text into chunks based on token count', async () => {
+			const splitter = new TokenTextSplitter({
+				chunkSize: 3,
+				chunkOverlap: 0,
+			});
+
+			const inputText = 'Hello world, this is a test';
+			const mockTokenIds = [1, 2, 3, 4, 5, 6, 7, 8];
+
+			mockTokenizer.encode.mockReturnValue(mockTokenIds);
+			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
+				const chunks = [
+					[1, 2, 3],
+					[4, 5, 6],
+					[7, 8],
+				];
+				const chunkTexts = ['Hello world,', ' this is', ' a test'];
+				const index = chunks.findIndex(
+					(chunk) => chunk.length === tokens.length && chunk.every((val, i) => val === tokens[i]),
+				);
+				return chunkTexts[index] || '';
+			});
+
+			const result = await splitter.splitText(inputText);
+
+			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('cl100k_base');
+			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, [], 'all');
+			expect(result).toEqual(['Hello world,', ' this is', ' a test']);
+		});
+
+		it('should handle empty text', async () => {
+			const splitter = new TokenTextSplitter();
+			mockTokenizer.encode.mockReturnValue([]);
+
+			const result = await splitter.splitText('');
+
+			expect(result).toEqual([]);
+		});
+
+		it('should handle text shorter than chunk size', async () => {
+			const splitter = new TokenTextSplitter({
+				chunkSize: 10,
+				chunkOverlap: 0,
+			});
+
+			const inputText = 'Short text';
+			const mockTokenIds = [1, 2];
+
+			mockTokenizer.encode.mockReturnValue(mockTokenIds);
+			mockTokenizer.decode.mockReturnValue('Short text');
+
+			const result = await splitter.splitText(inputText);
+
+			expect(result).toEqual(['Short text']);
+		});
+
+		it('should use custom encoding and special tokens', async () => {
+			const splitter = new TokenTextSplitter({
+				encodingName: 'o200k_base',
+				allowedSpecial: ['<|special|>'],
+				disallowedSpecial: ['<|bad|>'],
+			});
+
+			const inputText = 'Text with <|special|> tokens';
+			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
+			mockTokenizer.decode.mockReturnValue('Text with <|special|> tokens');
+
+			await splitter.splitText(inputText);
+
+			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('o200k_base');
+			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, ['<|special|>'], ['<|bad|>']);
+		});
+
+		it('should reuse tokenizer on subsequent calls', async () => {
+			const splitter = new TokenTextSplitter();
+			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
+			mockTokenizer.decode.mockReturnValue('test');
+
+			await splitter.splitText('first call');
+			await splitter.splitText('second call');
+
+			expect(tiktokenUtils.getEncoding).toHaveBeenCalledTimes(1);
+		});
+
+		it('should handle large text with multiple chunks and overlap', async () => {
+			const splitter = new TokenTextSplitter({
+				chunkSize: 2,
+				chunkOverlap: 1,
+			});
+
+			const inputText = 'One two three four five six';
+			const mockTokenIds = [1, 2, 3, 4, 5, 6];
+
+			mockTokenizer.encode.mockReturnValue(mockTokenIds);
+			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
+				const chunkMap: Record<string, string> = {
+					'1,2': 'One two',
+					'2,3': 'two three',
+					'3,4': 'three four',
+					'4,5': 'four five',
+					'5,6': 'five six',
+				};
+				return chunkMap[tokens.join(',')] || '';
+			});
+
+			const result = await splitter.splitText(inputText);
+
+			expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
+		});
+	});
+});
@@ -8,7 +8,7 @@
 		"dev": "pnpm run watch",
 		"typecheck": "tsc --noEmit",
 		"copy-nodes-json": "node ../../nodes-base/scripts/copy-nodes-json.js .",
-		"build": "tsup --tsconfig tsconfig.build.json && pnpm copy-nodes-json && tsc-alias -p tsconfig.build.json && pnpm n8n-copy-static-files && pnpm n8n-generate-metadata",
+		"build": "tsup --tsconfig tsconfig.build.json && pnpm copy-nodes-json && tsc-alias -p tsconfig.build.json && cp utils/tokenizer/*.json dist/utils/tokenizer/ && pnpm n8n-copy-static-files && pnpm n8n-generate-metadata",
 		"format": "biome format --write .",
 		"format:check": "biome ci .",
 		"lint": "eslint nodes credentials utils --quiet",
@@ -198,6 +198,7 @@
 		"html-to-text": "9.0.5",
 		"https-proxy-agent": "catalog:",
 		"jsdom": "23.0.1",
+		"js-tiktoken": "^1.0.12",
 		"langchain": "0.3.28",
 		"lodash": "catalog:",
 		"mammoth": "1.7.2",
packages/@n8n/nodes-langchain/utils/__tests__/tiktoken.test.ts (new file, 125 lines)

@@ -0,0 +1,125 @@
+/* eslint-disable @typescript-eslint/no-unsafe-call */
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
+/* eslint-disable @typescript-eslint/no-unsafe-member-access */
+/* eslint-disable @typescript-eslint/no-var-requires */
+import type { TiktokenEncoding } from 'js-tiktoken/lite';
+import { Tiktoken } from 'js-tiktoken/lite';
+
+import { getEncoding, encodingForModel } from '../tokenizer/tiktoken';
+
+jest.mock('js-tiktoken/lite', () => ({
+	Tiktoken: jest.fn(),
+	getEncodingNameForModel: jest.fn(),
+}));
+
+jest.mock('../tokenizer/cl100k_base.json', () => ({ mockCl100kBase: 'data' }), { virtual: true });
+jest.mock('../tokenizer/o200k_base.json', () => ({ mockO200kBase: 'data' }), { virtual: true });
+
+describe('tiktoken utils', () => {
+	beforeEach(() => {
+		jest.clearAllMocks();
+	});
+
+	describe('getEncoding', () => {
+		it('should return Tiktoken instance for cl100k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('cl100k_base');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should return Tiktoken instance for o200k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('o200k_base');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockO200kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should map p50k_base to cl100k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('p50k_base');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should map r50k_base to cl100k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('r50k_base');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should map gpt2 to cl100k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('gpt2');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should map p50k_edit to cl100k_base encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await getEncoding('p50k_edit');
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should return cl100k_base for unknown encoding', async () => {
+			const mockTiktoken = {};
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
+			const result = await getEncoding('unknown_encoding' as unknown as TiktokenEncoding);
+
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+	});
+
+	describe('encodingForModel', () => {
+		it('should call getEncodingNameForModel and return encoding for cl100k_base', async () => {
+			const mockGetEncodingNameForModel = require('js-tiktoken/lite').getEncodingNameForModel;
+			const mockTiktoken = {};
+
+			mockGetEncodingNameForModel.mockReturnValue('cl100k_base');
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await encodingForModel('gpt-3.5-turbo');
+
+			expect(mockGetEncodingNameForModel).toHaveBeenCalledWith('gpt-3.5-turbo');
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+
+		it('should handle gpt-4 model with cl100k_base', async () => {
+			const mockGetEncodingNameForModel = require('js-tiktoken/lite').getEncodingNameForModel;
+			const mockTiktoken = {};
+
+			mockGetEncodingNameForModel.mockReturnValue('cl100k_base');
+			(Tiktoken as unknown as jest.Mock).mockReturnValue(mockTiktoken);
+
+			const result = await encodingForModel('gpt-4');
+
+			expect(mockGetEncodingNameForModel).toHaveBeenCalledWith('gpt-4');
+			expect(Tiktoken).toHaveBeenCalledWith({ mockCl100kBase: 'data' });
+			expect(result).toBe(mockTiktoken);
+		});
+	});
+});
File diff suppressed because one or more lines are too long
packages/@n8n/nodes-langchain/utils/tokenizer/tiktoken.ts (new file, 30 lines)

@@ -0,0 +1,30 @@
+import type { TiktokenBPE, TiktokenEncoding, TiktokenModel } from 'js-tiktoken/lite';
+import { Tiktoken, getEncodingNameForModel } from 'js-tiktoken/lite';
+
+import cl100k_base from './cl100k_base.json';
+import o200k_base from './o200k_base.json';
+
+export async function getEncoding(encoding: TiktokenEncoding) {
+	const encodings = {
+		cl100k_base: cl100k_base as TiktokenBPE,
+		o200k_base: o200k_base as TiktokenBPE,
+	};
+	const encodingsMap: Record<TiktokenEncoding, TiktokenBPE> = {
+		cl100k_base: encodings.cl100k_base,
+		p50k_base: encodings.cl100k_base,
+		r50k_base: encodings.cl100k_base,
+		gpt2: encodings.cl100k_base,
+		p50k_edit: encodings.cl100k_base,
+		o200k_base: encodings.o200k_base,
+	};
+
+	if (!(encoding in encodingsMap)) {
+		return new Tiktoken(cl100k_base);
+	}
+
+	return new Tiktoken(encodingsMap[encoding]);
+}
+
+export async function encodingForModel(model: TiktokenModel) {
+	return await getEncoding(getEncodingNameForModel(model));
+}
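Worth noting about the mapping above: only the cl100k_base and o200k_base ranks ship as JSON, so the legacy encodings (p50k_base, r50k_base, gpt2, p50k_edit) and any unknown name all resolve to cl100k_base, which makes token counts for those models approximate. A small usage sketch (illustrative, not part of the commit):

import { getEncoding } from '@utils/tokenizer/tiktoken';

async function demo(): Promise<void> {
	// 'p50k_base' is served by the bundled cl100k_base ranks (see the map above).
	const enc = await getEncoding('p50k_base');
	const ids = enc.encode('token splitting'); // number[] of BPE token ids
	console.log(ids.length, enc.decode(ids)); // count, then the round-tripped text
}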
pnpm-lock.yaml (generated, 65 lines)
@@ -777,7 +777,7 @@ importers:
       version: 4.3.0
     '@getzep/zep-cloud':
       specifier: 1.0.12
-      version: 1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6))
+      version: 1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21))
     '@getzep/zep-js':
       specifier: 0.9.0
       version: 0.9.0
@@ -804,7 +804,7 @@ importers:
       version: 0.3.4(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)
     '@langchain/community':
       specifier: 'catalog:'
-      version: 0.3.46(421f5526c191b539c59b7e383489c114)
+      version: 0.3.46(9d9844bc33bc460a2ae2eef819849ab0)
     '@langchain/core':
       specifier: 'catalog:'
       version: 0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))
@@ -904,12 +904,15 @@ importers:
     https-proxy-agent:
       specifier: 'catalog:'
       version: 7.0.6
+    js-tiktoken:
+      specifier: ^1.0.12
+      version: 1.0.12
     jsdom:
       specifier: 23.0.1
       version: 23.0.1
     langchain:
       specifier: 0.3.28
-      version: 0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6)
+      version: 0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21)
     lodash:
       specifier: 'catalog:'
       version: 4.17.21
@@ -15881,7 +15884,7 @@ snapshots:
 
   '@babel/generator@7.26.10':
     dependencies:
-      '@babel/parser': 7.26.10
+      '@babel/parser': 7.27.5
       '@babel/types': 7.26.10
       '@jridgewell/gen-mapping': 0.3.8
       '@jridgewell/trace-mapping': 0.3.25
@@ -16572,7 +16575,7 @@ snapshots:
     dependencies:
       '@babel/code-frame': 7.26.2
       '@babel/generator': 7.26.10
-      '@babel/parser': 7.26.10
+      '@babel/parser': 7.27.5
       '@babel/template': 7.26.9
       '@babel/types': 7.26.10
       debug: 4.4.1(supports-color@8.1.1)
@@ -17000,7 +17003,7 @@ snapshots:
   '@gar/promisify@1.1.3':
     optional: true
 
-  '@getzep/zep-cloud@1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6))':
+  '@getzep/zep-cloud@1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21))':
     dependencies:
       form-data: 4.0.0
       node-fetch: 2.7.0(encoding@0.1.13)
@@ -17009,7 +17012,7 @@ snapshots:
       zod: 3.25.67
     optionalDependencies:
       '@langchain/core': 0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))
-      langchain: 0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6)
+      langchain: 0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21)
     transitivePeerDependencies:
       - encoding
 
@@ -17523,7 +17526,7 @@ snapshots:
       - aws-crt
       - encoding
 
-  '@langchain/community@0.3.46(421f5526c191b539c59b7e383489c114)':
+  '@langchain/community@0.3.46(9d9844bc33bc460a2ae2eef819849ab0)':
     dependencies:
       '@browserbasehq/stagehand': 1.9.0(@playwright/test@1.49.1)(deepmerge@4.3.1)(dotenv@16.5.0)(encoding@0.1.13)(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))(zod@3.25.67)
       '@ibm-cloud/watsonx-ai': 1.1.2
@@ -17535,8 +17538,8 @@ snapshots:
       flat: 5.0.2
       ibm-cloud-sdk-core: 5.3.2
       js-yaml: 4.1.0
-      langchain: 0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6)
-      langsmith: 0.3.30(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))
+      langchain: 0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21)
+      langsmith: 0.3.33(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))
       openai: 4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)
       uuid: 10.0.0
       zod: 3.25.67
@@ -17549,7 +17552,7 @@ snapshots:
       '@aws-sdk/credential-provider-node': 3.808.0
       '@azure/storage-blob': 12.26.0
       '@browserbasehq/sdk': 2.6.0(encoding@0.1.13)
-      '@getzep/zep-cloud': 1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6))
+      '@getzep/zep-cloud': 1.0.12(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(langchain@0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21))
       '@getzep/zep-js': 0.9.0
       '@google-ai/generativelanguage': 2.6.0(encoding@0.1.13)
       '@google-cloud/storage': 7.12.1(encoding@0.1.13)
@@ -21013,7 +21016,7 @@ snapshots:
 
   axios-retry@4.5.0(axios@1.9.0):
     dependencies:
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       is-retry-allowed: 2.2.0
 
   axios@1.8.3:
@@ -21024,6 +21027,14 @@ snapshots:
     transitivePeerDependencies:
       - debug
 
+  axios@1.9.0:
+    dependencies:
+      follow-redirects: 1.15.9(debug@4.3.6)
+      form-data: 4.0.2
+      proxy-from-env: 1.1.0
+    transitivePeerDependencies:
+      - debug
+
   axios@1.9.0(debug@4.3.6):
     dependencies:
       follow-redirects: 1.15.9(debug@4.3.6)
@@ -21042,7 +21053,7 @@ snapshots:
 
   axios@1.9.0(debug@4.4.1):
     dependencies:
-      follow-redirects: 1.15.9(debug@4.3.6)
+      follow-redirects: 1.15.9(debug@4.4.1)
       form-data: 4.0.2
       proxy-from-env: 1.1.0
     transitivePeerDependencies:
@@ -21132,7 +21143,7 @@ snapshots:
 
   babel-walk@3.0.0-canary-5:
     dependencies:
-      '@babel/types': 7.26.10
+      '@babel/types': 7.27.6
 
   balanced-match@1.0.2: {}
 
@@ -21301,7 +21312,7 @@ snapshots:
 
   bundlemon@3.1.0(typescript@5.8.2):
     dependencies:
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       axios-retry: 4.5.0(axios@1.9.0)
       brotli-size: 4.0.0
       bundlemon-utils: 2.0.1
@@ -21771,7 +21782,7 @@ snapshots:
 
   constantinople@4.0.1:
     dependencies:
-      '@babel/parser': 7.26.10
+      '@babel/parser': 7.27.5
       '@babel/types': 7.26.10
 
   content-disposition@1.0.0:
@@ -23253,6 +23264,10 @@ snapshots:
     optionalDependencies:
       debug: 4.4.0
 
+  follow-redirects@1.15.9(debug@4.4.1):
+    optionalDependencies:
+      debug: 4.4.1(supports-color@8.1.1)
+
   for-each@0.3.3:
     dependencies:
       is-callable: 1.2.7
@@ -23894,7 +23909,7 @@ snapshots:
       isstream: 0.1.2
       jsonwebtoken: 9.0.2
       mime-types: 2.1.35
-      retry-axios: 2.6.0(axios@1.9.0(debug@4.4.1))
+      retry-axios: 2.6.0(axios@1.9.0)
       tough-cookie: 4.1.4
     transitivePeerDependencies:
       - supports-color
@@ -23959,7 +23974,7 @@ snapshots:
 
   infisical-node@1.3.0:
     dependencies:
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       dotenv: 16.3.1
       tweetnacl: 1.0.3
       tweetnacl-util: 0.15.1
@@ -24886,7 +24901,7 @@ snapshots:
 
   kuler@2.0.0: {}
 
-  langchain@0.3.28(a1ee45cf0a7ccac06c2d6fcd08fda4e6):
+  langchain@0.3.28(d0079d2993dfd2a4e9e2c2f03851bb21):
     dependencies:
       '@langchain/core': 0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67))
       '@langchain/openai': 0.5.13(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)(ws@8.18.2)
@@ -24909,7 +24924,7 @@ snapshots:
       '@langchain/groq': 0.2.3(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(encoding@0.1.13)
       '@langchain/mistralai': 0.2.1(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))(zod@3.25.67)
       '@langchain/ollama': 0.2.2(@langchain/core@0.3.59(openai@4.103.0(encoding@0.1.13)(ws@8.18.2)(zod@3.25.67)))
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       cheerio: 1.0.0
       handlebars: 4.7.8
     transitivePeerDependencies:
@@ -26723,7 +26738,7 @@ snapshots:
 
   posthog-node@3.2.1:
     dependencies:
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       rusha: 0.8.14
     transitivePeerDependencies:
       - debug
@@ -27300,9 +27315,9 @@ snapshots:
       onetime: 5.1.2
       signal-exit: 3.0.7
 
-  retry-axios@2.6.0(axios@1.9.0(debug@4.4.1)):
+  retry-axios@2.6.0(axios@1.9.0):
     dependencies:
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
 
   retry-request@7.0.2(encoding@0.1.13):
     dependencies:
@@ -27757,7 +27772,7 @@ snapshots:
       asn1.js: 5.4.1
       asn1.js-rfc2560: 5.0.1(asn1.js@5.4.1)
       asn1.js-rfc5280: 3.0.0
-      axios: 1.9.0(debug@4.4.1)
+      axios: 1.9.0
       big-integer: 1.6.52
       bignumber.js: 9.1.2
       binascii: 0.0.2
@@ -29309,7 +29324,7 @@ snapshots:
 
   with@7.0.2:
     dependencies:
-      '@babel/parser': 7.26.10
+      '@babel/parser': 7.27.5
       '@babel/types': 7.26.10
       assert-never: 1.2.1
       babel-walk: 3.0.0-canary-5