fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)

This commit is contained in:
oleg
2025-06-27 16:08:14 +02:00
committed by GitHub
parent edf0fec444
commit c5ec056eb5
7 changed files with 812 additions and 27 deletions

View File

@@ -258,3 +258,50 @@ export function unwrapNestedOutput(output: Record<string, unknown>): Record<stri
/**
 * Builds a tool name from a node's name.
 *
 * Every whitespace character and every punctuation character listed in the
 * pattern below is replaced with an underscore, after which any run of
 * consecutive underscores is collapsed into a single one.
 *
 * @param node The node whose `name` is converted
 * @returns The converted name
 */
export function nodeNameToToolName(node: INode): string {
	// Step 1: swap each disallowed character for an underscore.
	const underscored = node.name.replace(/[\s.?!=+#@&*()[\]{}:;,<>\/\\'"^%$]/g, '_');
	// Step 2: squash underscore runs (including ones already present in the name).
	return underscored.replace(/_+/g, '_');
}
/**
 * Reports whether `text` contains a run of one identical character that is at
 * least `threshold` characters long.
 *
 * Intended as a cheap pre-check so that highly repetitive content can be
 * detected before it is handed to tiktoken, where it causes performance issues.
 *
 * Notes on behavior:
 * - Characters are compared per Unicode code point (string iteration), while
 *   the early length check uses `text.length`.
 * - A run is only reported at the moment a character equals its predecessor,
 *   so with a threshold of 1 a text without adjacent duplicates yields `false`.
 * - Non-string input, an empty string, a non-positive threshold, or any
 *   unexpected error all yield `false`.
 *
 * @param text The text to scan
 * @param threshold Minimum run length that counts as a long repeat (default: 1000)
 * @returns `true` when such a run exists, otherwise `false`
 */
export function hasLongSequentialRepeat(text: string, threshold = 1000): boolean {
	try {
		// Guard clauses: nothing to scan, or the text cannot possibly hold a run that long.
		if (typeof text !== 'string' || text.length === 0) {
			return false;
		}
		if (threshold <= 0 || text.length < threshold) {
			return false;
		}

		// Walk the string directly (no intermediate array) and track the current run.
		let runChar: string | undefined;
		let runLength = 0;
		for (const current of text) {
			if (runLength > 0 && current === runChar) {
				runLength += 1;
				if (runLength >= threshold) {
					return true;
				}
				continue;
			}
			// First character, or the run was broken: start a new run here.
			runChar = current;
			runLength = 1;
		}
		return false;
	} catch {
		// On any error, return false to allow normal processing
		return false;
	}
}