feat(Token Splitter Node): Replace remote tiktoken encoding with local implementation (#16548)

This commit is contained in:
oleg
2025-06-20 16:08:16 +02:00
committed by GitHub
parent 79650ea55a
commit 2d638023be
11 changed files with 427 additions and 32 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,30 @@
import type { TiktokenBPE, TiktokenEncoding, TiktokenModel } from 'js-tiktoken/lite';
import { Tiktoken, getEncodingNameForModel } from 'js-tiktoken/lite';
import cl100k_base from './cl100k_base.json';
import o200k_base from './o200k_base.json';
/**
 * Builds a tokenizer for the requested encoding from the BPE rank tables bundled
 * next to this module (`cl100k_base.json` and `o200k_base.json`).
 *
 * Only `o200k_base` resolves to its own table. Every other known encoding name
 * (`cl100k_base`, `p50k_base`, `r50k_base`, `gpt2`, `p50k_edit`) is served by the
 * `cl100k_base` table, so tokenization for those names follows `cl100k_base`
 * rather than a vocabulary of their own. Names that are not keys of the lookup
 * table also fall back to `cl100k_base`.
 *
 * @param encoding - Name of the tiktoken encoding to load.
 * @returns A `Tiktoken` instance; a new one is constructed on every call.
 */
export async function getEncoding(encoding: TiktokenEncoding): Promise<Tiktoken> {
	const cl100k = cl100k_base as TiktokenBPE;
	const o200k = o200k_base as TiktokenBPE;

	const encodingsMap: Record<TiktokenEncoding, TiktokenBPE> = {
		cl100k_base: cl100k,
		p50k_base: cl100k,
		r50k_base: cl100k,
		gpt2: cl100k,
		p50k_edit: cl100k,
		o200k_base: o200k,
	};

	// Own-property check on purpose: the `in` operator also matches inherited keys
	// such as "toString" or "constructor", which would hand a function to the
	// Tiktoken constructor instead of taking the cl100k_base fallback.
	const isKnownEncoding = Object.prototype.hasOwnProperty.call(encodingsMap, encoding);
	const ranks = isKnownEncoding ? encodingsMap[encoding] : cl100k;

	return new Tiktoken(ranks);
}
/**
 * Resolves the tokenizer for a model name.
 *
 * The model is first translated to its encoding name via `getEncodingNameForModel`,
 * and that name is then passed to `getEncoding`.
 *
 * @param model - Model identifier to look up.
 * @returns The `Tiktoken` instance produced by `getEncoding` for the model's encoding.
 */
export async function encodingForModel(model: TiktokenModel) {
	const encodingName = getEncodingNameForModel(model);
	const tokenizer = await getEncoding(encodingName);
	return tokenizer;
}