mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-17 18:12:04 +00:00
fix(Token Splitter Node): Prevent tiktoken blocking on repetitive content (#16769)
This commit is contained in:
@@ -3,7 +3,9 @@ import type { TokenTextSplitterParams } from '@langchain/textsplitters';
|
||||
import { TextSplitter } from '@langchain/textsplitters';
|
||||
import type * as tiktoken from 'js-tiktoken';
|
||||
|
||||
import { hasLongSequentialRepeat } from '@utils/helpers';
|
||||
import { getEncoding } from '@utils/tokenizer/tiktoken';
|
||||
import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
|
||||
|
||||
/**
|
||||
* Implementation of splitter which looks at tokens.
|
||||
@@ -32,26 +34,61 @@ export class TokenTextSplitter extends TextSplitter implements TokenTextSplitter
|
||||
}
|
||||
|
||||
async splitText(text: string): Promise<string[]> {
|
||||
if (!this.tokenizer) {
|
||||
this.tokenizer = await getEncoding(this.encodingName);
|
||||
}
|
||||
|
||||
const splits: string[] = [];
|
||||
|
||||
const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
|
||||
|
||||
let start_idx = 0;
|
||||
|
||||
while (start_idx < input_ids.length) {
|
||||
if (start_idx > 0) {
|
||||
start_idx -= this.chunkOverlap;
|
||||
try {
|
||||
// Validate input
|
||||
if (!text || typeof text !== 'string') {
|
||||
return [];
|
||||
}
|
||||
const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
|
||||
const chunk_ids = input_ids.slice(start_idx, end_idx);
|
||||
splits.push(this.tokenizer.decode(chunk_ids));
|
||||
start_idx = end_idx;
|
||||
}
|
||||
|
||||
return splits;
|
||||
// Check for repetitive content
|
||||
if (hasLongSequentialRepeat(text)) {
|
||||
const splits = estimateTextSplitsByTokens(
|
||||
text,
|
||||
this.chunkSize,
|
||||
this.chunkOverlap,
|
||||
this.encodingName,
|
||||
);
|
||||
return splits;
|
||||
}
|
||||
|
||||
// Use tiktoken for normal text
|
||||
try {
|
||||
if (!this.tokenizer) {
|
||||
this.tokenizer = await getEncoding(this.encodingName);
|
||||
}
|
||||
|
||||
const splits: string[] = [];
|
||||
const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
|
||||
|
||||
let start_idx = 0;
|
||||
let chunkCount = 0;
|
||||
|
||||
while (start_idx < input_ids.length) {
|
||||
if (start_idx > 0) {
|
||||
start_idx = Math.max(0, start_idx - this.chunkOverlap);
|
||||
}
|
||||
const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
|
||||
const chunk_ids = input_ids.slice(start_idx, end_idx);
|
||||
|
||||
splits.push(this.tokenizer.decode(chunk_ids));
|
||||
|
||||
chunkCount++;
|
||||
start_idx = end_idx;
|
||||
}
|
||||
|
||||
return splits;
|
||||
} catch (tiktokenError) {
|
||||
// Fall back to character-based splitting if tiktoken fails
|
||||
return estimateTextSplitsByTokens(
|
||||
text,
|
||||
this.chunkSize,
|
||||
this.chunkOverlap,
|
||||
this.encodingName,
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
// Return empty array on complete failure
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user