Files
n8n-enterprise-unlocked/packages/@n8n/nodes-langchain/nodes/document_loaders/DocumentDefaultDataLoader/DocumentDefaultDataLoader.node.ts
Nikhil Kuriakose 40850c95b6 feat(Default Data Loader Node): Add default text splitter (#15786)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2025-06-03 17:14:26 +02:00

360 lines
8.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
import { RecursiveCharacterTextSplitter, type TextSplitter } from '@langchain/textsplitters';
import {
NodeConnectionTypes,
type INodeType,
type INodeTypeDescription,
type ISupplyDataFunctions,
type SupplyData,
type IDataObject,
type INodeInputConfiguration,
} from 'n8n-workflow';
import { logWrapper } from '@utils/logWrapper';
import { N8nBinaryLoader } from '@utils/N8nBinaryLoader';
import { N8nJsonLoader } from '@utils/N8nJsonLoader';
import { metadataFilterField } from '@utils/sharedFields';
// Dependencies needed underneath the hood for the loaders. We add them
// here only to track where what dependency is sued
// import 'd3-dsv'; // for csv
import 'mammoth'; // for docx
import 'epub2'; // for epub
import 'pdf-parse'; // for pdf
function getInputs(parameters: IDataObject) {
const inputs: INodeInputConfiguration[] = [];
const textSplittingMode = parameters?.textSplittingMode;
// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
if (!textSplittingMode || textSplittingMode === 'custom') {
inputs.push({
displayName: 'Text Splitter',
maxConnections: 1,
type: 'ai_textSplitter',
required: true,
});
}
return inputs;
}
export class DocumentDefaultDataLoader implements INodeType {
description: INodeTypeDescription = {
displayName: 'Default Data Loader',
name: 'documentDefaultDataLoader',
icon: 'file:binary.svg',
group: ['transform'],
version: [1, 1.1],
defaultVersion: 1.1,
description: 'Load data from previous step in the workflow',
defaults: {
name: 'Default Data Loader',
},
codex: {
categories: ['AI'],
subcategories: {
AI: ['Document Loaders'],
},
resources: {
primaryDocumentation: [
{
url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.documentdefaultdataloader/',
},
],
},
},
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
outputs: [NodeConnectionTypes.AiDocument],
outputNames: ['Document'],
properties: [
{
displayName:
'This will load data from a previous step in the workflow. <a href="/templates/1962" target="_blank">Example</a>',
name: 'notice',
type: 'notice',
default: '',
},
{
displayName: 'Text Splitting',
name: 'textSplittingMode',
type: 'options',
default: 'simple',
required: true,
noDataExpression: true,
displayOptions: {
show: {
'@version': [1.1],
},
},
options: [
{
name: 'Simple',
value: 'simple',
description: 'Uses the Recursive Character Text Splitter with default options',
},
{
name: 'Custom',
value: 'custom',
description: 'Connect a text splitter of your choice',
},
],
},
{
displayName: 'Type of Data',
name: 'dataType',
type: 'options',
default: 'json',
required: true,
noDataExpression: true,
options: [
{
name: 'JSON',
value: 'json',
description: 'Process JSON data from previous step in the workflow',
},
{
name: 'Binary',
value: 'binary',
description: 'Process binary data from previous step in the workflow',
},
],
},
{
displayName: 'Mode',
name: 'jsonMode',
type: 'options',
default: 'allInputData',
required: true,
displayOptions: {
show: {
dataType: ['json'],
},
},
options: [
{
name: 'Load All Input Data',
value: 'allInputData',
description: 'Use all JSON data that flows into the parent agent or chain',
},
{
name: 'Load Specific Data',
value: 'expressionData',
description:
'Load a subset of data, and/or data from any previous step in the workflow',
},
],
},
{
displayName: 'Mode',
name: 'binaryMode',
type: 'options',
default: 'allInputData',
required: true,
displayOptions: {
show: {
dataType: ['binary'],
},
},
options: [
{
name: 'Load All Input Data',
value: 'allInputData',
description: 'Use all Binary data that flows into the parent agent or chain',
},
{
name: 'Load Specific Data',
value: 'specificField',
description: 'Load data from a specific field in the parent agent or chain',
},
],
},
{
displayName: 'Data Format',
name: 'loader',
type: 'options',
default: 'auto',
required: true,
displayOptions: {
show: {
dataType: ['binary'],
},
},
options: [
{
name: 'Automatically Detect by Mime Type',
value: 'auto',
description: 'Uses the mime type to detect the format',
},
{
name: 'CSV',
value: 'csvLoader',
description: 'Load CSV files',
},
{
name: 'Docx',
value: 'docxLoader',
description: 'Load Docx documents',
},
{
name: 'EPub',
value: 'epubLoader',
description: 'Load EPub files',
},
{
name: 'JSON',
value: 'jsonLoader',
description: 'Load JSON files',
},
{
name: 'PDF',
value: 'pdfLoader',
description: 'Load PDF documents',
},
{
name: 'Text',
value: 'textLoader',
description: 'Load plain text files',
},
],
},
{
displayName: 'Data',
name: 'jsonData',
type: 'string',
typeOptions: {
rows: 6,
},
default: '',
required: true,
description: 'Drag and drop fields from the input pane, or use an expression',
displayOptions: {
show: {
dataType: ['json'],
jsonMode: ['expressionData'],
},
},
},
{
displayName: 'Input Data Field Name',
name: 'binaryDataKey',
type: 'string',
default: 'data',
required: true,
description:
'The name of the field in the agent or chains input that contains the binary file to be processed',
displayOptions: {
show: {
dataType: ['binary'],
},
hide: {
binaryMode: ['allInputData'],
},
},
},
{
displayName: 'Options',
name: 'options',
type: 'collection',
placeholder: 'Add Option',
default: {},
options: [
{
displayName: 'JSON Pointers',
name: 'pointers',
type: 'string',
default: '',
description: 'Pointers to extract from JSON, e.g. "/text" or "/text, /meta/title"',
displayOptions: {
show: {
'/loader': ['jsonLoader', 'auto'],
},
},
},
{
displayName: 'CSV Separator',
name: 'separator',
type: 'string',
description: 'Separator to use for CSV',
default: ',',
displayOptions: {
show: {
'/loader': ['csvLoader', 'auto'],
},
},
},
{
displayName: 'CSV Column',
name: 'column',
type: 'string',
default: '',
description: 'Column to extract from CSV',
displayOptions: {
show: {
'/loader': ['csvLoader', 'auto'],
},
},
},
{
displayName: 'Split Pages in PDF',
description: 'Whether to split PDF pages into separate documents',
name: 'splitPages',
type: 'boolean',
default: true,
displayOptions: {
show: {
'/loader': ['pdfLoader', 'auto'],
},
},
},
{
...metadataFilterField,
displayName: 'Metadata',
description:
'Metadata to add to each document. Could be used for filtering during retrieval',
placeholder: 'Add property',
},
],
},
],
};
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
const node = this.getNode();
const dataType = this.getNodeParameter('dataType', itemIndex, 'json') as 'json' | 'binary';
let textSplitter: TextSplitter | undefined;
if (node.typeVersion === 1.1) {
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
| 'simple'
| 'custom';
if (textSplittingMode === 'simple') {
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
} else if (textSplittingMode === 'custom') {
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
| TextSplitter
| undefined;
}
} else {
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
| TextSplitter
| undefined;
}
const binaryDataKey = this.getNodeParameter('binaryDataKey', itemIndex, '') as string;
const processor =
dataType === 'binary'
? new N8nBinaryLoader(this, 'options.', binaryDataKey, textSplitter)
: new N8nJsonLoader(this, 'options.', textSplitter);
return {
response: logWrapper(processor, this),
};
}
}