feat(Default Data Loader Node): Add default text splitter (#15786)

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
This commit is contained in:
Nikhil Kuriakose
2025-06-03 17:14:26 +02:00
committed by GitHub
parent 103bc20243
commit 40850c95b6
4 changed files with 309 additions and 27 deletions

View File

@@ -1,11 +1,13 @@
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
import type { TextSplitter } from '@langchain/textsplitters';
import { RecursiveCharacterTextSplitter, type TextSplitter } from '@langchain/textsplitters';
import {
NodeConnectionTypes,
type INodeType,
type INodeTypeDescription,
type ISupplyDataFunctions,
type SupplyData,
type IDataObject,
type INodeInputConfiguration,
} from 'n8n-workflow';
import { logWrapper } from '@utils/logWrapper';
@@ -20,13 +22,31 @@ import 'mammoth'; // for docx
import 'epub2'; // for epub
import 'pdf-parse'; // for pdf
/**
 * Computes the node's input connections from its current parameters.
 *
 * The source of this function is embedded via `getInputs.toString()` into the
 * `inputs` expression of the node description, so the body is written to be
 * self-contained: it only uses its argument and plain literals.
 *
 * @param parameters - Current node parameters; only `textSplittingMode` is read.
 * @returns A single required "Text Splitter" input when the mode is `'custom'`
 *   or not set (falsy), otherwise an empty list.
 */
function getInputs(parameters: IDataObject) {
	const mode = parameters?.textSplittingMode;

	// An explicit mode other than 'custom' means no splitter has to be connected.
	if (mode && mode !== 'custom') {
		const noInputs: INodeInputConfiguration[] = [];
		return noInputs;
	}

	// Mode is 'custom', or the parameter does not exist (v1 nodes).
	const splitterInput: INodeInputConfiguration = {
		displayName: 'Text Splitter',
		maxConnections: 1,
		type: 'ai_textSplitter',
		required: true,
	};
	return [splitterInput];
}
export class DocumentDefaultDataLoader implements INodeType {
description: INodeTypeDescription = {
displayName: 'Default Data Loader',
name: 'documentDefaultDataLoader',
icon: 'file:binary.svg',
group: ['transform'],
version: 1,
version: [1, 1.1],
defaultVersion: 1.1,
description: 'Load data from previous step in the workflow',
defaults: {
name: 'Default Data Loader',
@@ -45,14 +65,7 @@ export class DocumentDefaultDataLoader implements INodeType {
},
},
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
inputs: [
{
displayName: 'Text Splitter',
maxConnections: 1,
type: NodeConnectionTypes.AiTextSplitter,
required: true,
},
],
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
outputs: [NodeConnectionTypes.AiDocument],
outputNames: ['Document'],
@@ -64,6 +77,31 @@ export class DocumentDefaultDataLoader implements INodeType {
type: 'notice',
default: '',
},
{
displayName: 'Text Splitting',
name: 'textSplittingMode',
type: 'options',
default: 'simple',
required: true,
noDataExpression: true,
displayOptions: {
show: {
'@version': [1.1],
},
},
options: [
{
name: 'Simple',
value: 'simple',
description: 'Uses the Recursive Character Text Splitter with default options',
},
{
name: 'Custom',
value: 'custom',
description: 'Connect a text splitter of your choice',
},
],
},
{
displayName: 'Type of Data',
name: 'dataType',
@@ -284,11 +322,29 @@ export class DocumentDefaultDataLoader implements INodeType {
};
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
const node = this.getNode();
const dataType = this.getNodeParameter('dataType', itemIndex, 'json') as 'json' | 'binary';
const textSplitter = (await this.getInputConnectionData(
NodeConnectionTypes.AiTextSplitter,
0,
)) as TextSplitter | undefined;
let textSplitter: TextSplitter | undefined;
if (node.typeVersion === 1.1) {
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
| 'simple'
| 'custom';
if (textSplittingMode === 'simple') {
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
} else if (textSplittingMode === 'custom') {
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
| TextSplitter
| undefined;
}
} else {
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
| TextSplitter
| undefined;
}
const binaryDataKey = this.getNodeParameter('binaryDataKey', itemIndex, '') as string;
const processor =

View File

@@ -0,0 +1,72 @@
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import type { ISupplyDataFunctions } from 'n8n-workflow';
import { NodeConnectionTypes } from 'n8n-workflow';
import { DocumentDefaultDataLoader } from '../DocumentDefaultDataLoader.node';
// Replace the real text-splitter module with a stub exposing only
// RecursiveCharacterTextSplitter, so tests can assert on constructor arguments.
jest.mock('@langchain/textsplitters', () => {
	type Doc = Record<string, unknown>;

	// Every constructed splitter gets its own `splitDocuments` spy, which tags
	// each incoming document with `split: true`.
	const createSplitterStub = () => ({
		splitDocuments: jest.fn(
			async (docs: Doc[]): Promise<Doc[]> => docs.map((doc) => ({ ...doc, split: true })),
		),
	});

	return {
		RecursiveCharacterTextSplitter: jest.fn().mockImplementation(() => createSplitterStub()),
	};
});
describe('DocumentDefaultDataLoader', () => {
	let loader: DocumentDefaultDataLoader;

	/**
	 * Builds a `getNodeParameter` mock that answers the parameters read by these
	 * tests and returns `undefined` for any other parameter name.
	 */
	const mockNodeParameters = (textSplittingMode: 'simple' | 'custom') => {
		const values = new Map<string, string>([
			['dataType', 'json'],
			['textSplittingMode', textSplittingMode],
			['binaryDataKey', 'data'],
		]);
		return jest.fn().mockImplementation((paramName, _itemIndex) => values.get(paramName));
	};

	/**
	 * Assembles a minimal v1.1 execution context; `extras` adds further mocked
	 * members (e.g. `getInputConnectionData`) for a specific scenario.
	 */
	const buildContext = (
		textSplittingMode: 'simple' | 'custom',
		extras: Record<string, unknown> = {},
	) =>
		({
			getNode: jest.fn(() => ({ typeVersion: 1.1 })),
			getNodeParameter: mockNodeParameters(textSplittingMode),
			...extras,
		}) as unknown as ISupplyDataFunctions;

	beforeEach(() => {
		loader = new DocumentDefaultDataLoader();
		jest.clearAllMocks();
	});

	it('should supply data with recursive char text splitter', async () => {
		const context = buildContext('simple');

		await loader.supplyData.call(context, 0);

		// 'simple' mode must construct the built-in splitter with the default options.
		expect(RecursiveCharacterTextSplitter).toHaveBeenCalledWith({
			chunkSize: 1000,
			chunkOverlap: 200,
		});
	});

	it('should supply data with custom text splitter', async () => {
		const customSplitter = { splitDocuments: jest.fn(async (docs) => docs) };
		const getInputConnectionData = jest.fn(async () => customSplitter);
		const context = buildContext('custom', { getInputConnectionData });

		await loader.supplyData.call(context, 0);

		// 'custom' mode must fetch the splitter from the connected input.
		expect(getInputConnectionData).toHaveBeenCalledWith(NodeConnectionTypes.AiTextSplitter, 0);
	});
});