fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)

Author: oleg
Date: 2025-06-27 16:08:14 +02:00
Committed by: GitHub
Parent: edf0fec444
Commit: c5ec056eb5
7 changed files with 812 additions and 27 deletions
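Background on the fix: js-tiktoken's BPE encoder repeatedly merges adjacent token pairs within each regex-split piece, and highly repetitive input (for example, tens of thousands of the same character with no whitespace) yields one enormous piece whose merge loop grows roughly quadratically, so a single synchronous encode() call can block Node's event loop for a long time. The change below detects such inputs and routes them around tiktoken entirely. As a minimal sketch of the kind of run-length check this could rely on (the real hasLongSequentialRepeat lives in @utils/helpers, one of the other changed files not shown in this excerpt; the function name suffix and the 1000-character threshold are assumptions for illustration):

// Sketch only: the real hasLongSequentialRepeat in @utils/helpers is not shown
// in this excerpt; the name suffix and 1000-character threshold are assumed.
export function hasLongSequentialRepeatSketch(text: string, threshold = 1000): boolean {
	if (text.length < threshold) return false;

	let runLength = 1;
	for (let i = 1; i < text.length; i++) {
		// Track how many identical characters appear back to back.
		runLength = text[i] === text[i - 1] ? runLength + 1 : 1;
		if (runLength >= threshold) return true;
	}
	return false;
}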


@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
 import { TextSplitter } from '@langchain/textsplitters';
 import type * as tiktoken from 'js-tiktoken';
+import { hasLongSequentialRepeat } from '@utils/helpers';
 import { getEncoding } from '@utils/tokenizer/tiktoken';
+import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
 
 /**
  * Implementation of splitter which looks at tokens.
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
 	}
 
 	async splitText(text: string): Promise<string[]> {
-		if (!this.tokenizer) {
-			this.tokenizer = await getEncoding(this.encodingName);
-		}
-		const splits: string[] = [];
-		const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
-		let start_idx = 0;
-		while (start_idx < input_ids.length) {
-			if (start_idx > 0) {
-				start_idx -= this.chunkOverlap;
-			}
-			const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
-			const chunk_ids = input_ids.slice(start_idx, end_idx);
-			splits.push(this.tokenizer.decode(chunk_ids));
-			start_idx = end_idx;
-		}
-		return splits;
+		try {
+			// Validate input
+			if (!text || typeof text !== 'string') {
+				return [];
+			}
+
+			// Check for repetitive content
+			if (hasLongSequentialRepeat(text)) {
+				const splits = estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+				return splits;
+			}
+
+			// Use tiktoken for normal text
+			try {
+				if (!this.tokenizer) {
+					this.tokenizer = await getEncoding(this.encodingName);
+				}
+				const splits: string[] = [];
+				const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
+				let start_idx = 0;
+				let chunkCount = 0;
+				while (start_idx < input_ids.length) {
+					if (start_idx > 0) {
+						start_idx = Math.max(0, start_idx - this.chunkOverlap);
+					}
+					const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
+					const chunk_ids = input_ids.slice(start_idx, end_idx);
+					splits.push(this.tokenizer.decode(chunk_ids));
+					chunkCount++;
+					start_idx = end_idx;
+				}
+				return splits;
+			} catch (tiktokenError) {
+				// Fall back to character-based splitting if tiktoken fails
+				return estimateTextSplitsByTokens(
+					text,
+					this.chunkSize,
+					this.chunkOverlap,
+					this.encodingName,
+				);
+			}
+		} catch (error) {
+			// Return empty array on complete failure
+			return [];
+		}
 	}
 }
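Both the repetitive-content path and the tiktoken-failure path call estimateTextSplitsByTokens from @utils/tokenizer/token-estimator, another of the seven changed files that is not shown above. A rough sketch of what a character-based estimator of that shape could look like follows; the ~4 characters-per-token ratio and the internals are assumptions for illustration, not the module's actual logic:

// Sketch only: approximates token-based chunking by assuming ~4 characters per
// token, so no tokenizer ever runs on the (possibly pathological) input.
function estimateTextSplitsByTokensSketch(
	text: string,
	chunkSizeTokens: number,
	chunkOverlapTokens: number,
	_encodingName: string, // kept for signature parity with the diff; unused here
): string[] {
	const CHARS_PER_TOKEN = 4;
	const chunkChars = chunkSizeTokens * CHARS_PER_TOKEN;
	const overlapChars = chunkOverlapTokens * CHARS_PER_TOKEN;
	const step = Math.max(1, chunkChars - overlapChars);

	const splits: string[] = [];
	for (let start = 0; start < text.length; start += step) {
		splits.push(text.slice(start, start + chunkChars));
	}
	return splits;
}

With a fallback of this kind, splitText returns in time proportional to the input length for repetitive or failing inputs instead of handing them to tiktoken's merge loop.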