mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-16 09:36:44 +00:00
feat(Default Data Loader Node): Add default text splitter (#15786)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
|
||||
import type { TextSplitter } from '@langchain/textsplitters';
|
||||
import { RecursiveCharacterTextSplitter, type TextSplitter } from '@langchain/textsplitters';
|
||||
import {
|
||||
NodeConnectionTypes,
|
||||
type INodeType,
|
||||
type INodeTypeDescription,
|
||||
type ISupplyDataFunctions,
|
||||
type SupplyData,
|
||||
type IDataObject,
|
||||
type INodeInputConfiguration,
|
||||
} from 'n8n-workflow';
|
||||
|
||||
import { logWrapper } from '@utils/logWrapper';
|
||||
@@ -20,13 +22,31 @@ import 'mammoth'; // for docx
|
||||
import 'epub2'; // for epub
|
||||
import 'pdf-parse'; // for pdf
|
||||
|
||||
/**
 * Builds the list of sub-node inputs shown for the Default Data Loader.
 *
 * The node description embeds this function through `getInputs.toString()` inside
 * its `inputs` expression, so only the function's own source text is carried over.
 * Keep the body self-contained: use the argument and literals only, never bindings
 * from module scope. That is why the connection type is written as the plain
 * string 'ai_textSplitter' here.
 *
 * @param parameters Current node parameters; only `textSplittingMode` is read.
 * @returns One required "Text Splitter" input when the mode is 'custom' or unset,
 * otherwise an empty list.
 */
function getInputs(parameters: IDataObject) {
	const inputs: INodeInputConfiguration[] = [];

	const textSplittingMode = parameters?.textSplittingMode;
	// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
	if (!textSplittingMode || textSplittingMode === 'custom') {
		inputs.push({
			displayName: 'Text Splitter',
			maxConnections: 1,
			type: 'ai_textSplitter',
			required: true,
		});
	}

	return inputs;
}
|
||||
|
||||
export class DocumentDefaultDataLoader implements INodeType {
|
||||
description: INodeTypeDescription = {
|
||||
displayName: 'Default Data Loader',
|
||||
name: 'documentDefaultDataLoader',
|
||||
icon: 'file:binary.svg',
|
||||
group: ['transform'],
|
||||
version: 1,
|
||||
version: [1, 1.1],
|
||||
defaultVersion: 1.1,
|
||||
description: 'Load data from previous step in the workflow',
|
||||
defaults: {
|
||||
name: 'Default Data Loader',
|
||||
@@ -45,14 +65,7 @@ export class DocumentDefaultDataLoader implements INodeType {
|
||||
},
|
||||
},
|
||||
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
|
||||
inputs: [
|
||||
{
|
||||
displayName: 'Text Splitter',
|
||||
maxConnections: 1,
|
||||
type: NodeConnectionTypes.AiTextSplitter,
|
||||
required: true,
|
||||
},
|
||||
],
|
||||
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
|
||||
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
|
||||
outputs: [NodeConnectionTypes.AiDocument],
|
||||
outputNames: ['Document'],
|
||||
@@ -64,6 +77,31 @@ export class DocumentDefaultDataLoader implements INodeType {
|
||||
type: 'notice',
|
||||
default: '',
|
||||
},
|
||||
{
|
||||
displayName: 'Text Splitting',
|
||||
name: 'textSplittingMode',
|
||||
type: 'options',
|
||||
default: 'simple',
|
||||
required: true,
|
||||
noDataExpression: true,
|
||||
displayOptions: {
|
||||
show: {
|
||||
'@version': [1.1],
|
||||
},
|
||||
},
|
||||
options: [
|
||||
{
|
||||
name: 'Simple',
|
||||
value: 'simple',
|
||||
description: 'Uses the Recursive Character Text Splitter with default options',
|
||||
},
|
||||
{
|
||||
name: 'Custom',
|
||||
value: 'custom',
|
||||
description: 'Connect a text splitter of your choice',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
displayName: 'Type of Data',
|
||||
name: 'dataType',
|
||||
@@ -284,11 +322,29 @@ export class DocumentDefaultDataLoader implements INodeType {
|
||||
};
|
||||
|
||||
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
|
||||
const node = this.getNode();
|
||||
const dataType = this.getNodeParameter('dataType', itemIndex, 'json') as 'json' | 'binary';
|
||||
const textSplitter = (await this.getInputConnectionData(
|
||||
NodeConnectionTypes.AiTextSplitter,
|
||||
0,
|
||||
)) as TextSplitter | undefined;
|
||||
|
||||
let textSplitter: TextSplitter | undefined;
|
||||
|
||||
if (node.typeVersion === 1.1) {
|
||||
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
|
||||
| 'simple'
|
||||
| 'custom';
|
||||
|
||||
if (textSplittingMode === 'simple') {
|
||||
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
|
||||
} else if (textSplittingMode === 'custom') {
|
||||
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
|
||||
| TextSplitter
|
||||
| undefined;
|
||||
}
|
||||
} else {
|
||||
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
|
||||
| TextSplitter
|
||||
| undefined;
|
||||
}
|
||||
|
||||
const binaryDataKey = this.getNodeParameter('binaryDataKey', itemIndex, '') as string;
|
||||
|
||||
const processor =
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
||||
import type { ISupplyDataFunctions } from 'n8n-workflow';
|
||||
import { NodeConnectionTypes } from 'n8n-workflow';
|
||||
|
||||
import { DocumentDefaultDataLoader } from '../DocumentDefaultDataLoader.node';
|
||||
|
||||
// Replace the real splitter with a constructor spy so the tests can check the
// options it is created with, without running any actual text splitting.
jest.mock('@langchain/textsplitters', () => ({
	RecursiveCharacterTextSplitter: jest.fn().mockImplementation(() => ({
		splitDocuments: jest.fn(
			async (docs: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>> =>
				docs.map((doc) => ({ ...doc, split: true })),
		),
	})),
}));

describe('DocumentDefaultDataLoader', () => {
	let loader: DocumentDefaultDataLoader;

	// Stand-in for a splitter connected through the AiTextSplitter input.
	const customSplitter = { splitDocuments: jest.fn(async (docs: unknown[]) => docs) };

	/**
	 * Creates the minimal execution context that `supplyData` reads from.
	 *
	 * @param typeVersion Node version reported by `getNode()`.
	 * @param textSplittingMode Value returned for the `textSplittingMode` parameter;
	 * leave undefined to mimic a node that does not have the parameter.
	 */
	function createContext(typeVersion: number, textSplittingMode?: 'simple' | 'custom') {
		return {
			getNode: jest.fn(() => ({ typeVersion })),
			getNodeParameter: jest.fn().mockImplementation((paramName: string, _itemIndex: number) => {
				switch (paramName) {
					case 'dataType':
						return 'json';
					case 'textSplittingMode':
						return textSplittingMode;
					case 'binaryDataKey':
						return 'data';
					default:
						return undefined;
				}
			}),
			getInputConnectionData: jest.fn(async () => customSplitter),
		};
	}

	beforeEach(() => {
		loader = new DocumentDefaultDataLoader();
		jest.clearAllMocks();
	});

	it('should supply data with recursive char text splitter', async () => {
		const context = createContext(1.1, 'simple');

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(RecursiveCharacterTextSplitter).toHaveBeenCalledWith({
			chunkSize: 1000,
			chunkOverlap: 200,
		});
		// Simple mode builds its own splitter and must not ask for a connected one.
		expect(context.getInputConnectionData).not.toHaveBeenCalled();
	});

	it('should supply data with custom text splitter', async () => {
		const context = createContext(1.1, 'custom');

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(context.getInputConnectionData).toHaveBeenCalledWith(
			NodeConnectionTypes.AiTextSplitter,
			0,
		);
	});

	it('should read the connected text splitter on version 1 nodes', async () => {
		// Version 1 has no textSplittingMode parameter and always uses the input.
		const context = createContext(1);

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(context.getInputConnectionData).toHaveBeenCalledWith(
			NodeConnectionTypes.AiTextSplitter,
			0,
		);
	});
});
|
||||
@@ -1,24 +1,45 @@
|
||||
/* eslint-disable n8n-nodes-base/node-dirname-against-convention */
|
||||
import { GithubRepoLoader } from '@langchain/community/document_loaders/web/github';
|
||||
import type { CharacterTextSplitter } from '@langchain/textsplitters';
|
||||
import type { TextSplitter } from '@langchain/textsplitters';
|
||||
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
||||
import {
|
||||
NodeConnectionTypes,
|
||||
type INodeType,
|
||||
type INodeTypeDescription,
|
||||
type ISupplyDataFunctions,
|
||||
type SupplyData,
|
||||
type IDataObject,
|
||||
type INodeInputConfiguration,
|
||||
} from 'n8n-workflow';
|
||||
|
||||
import { logWrapper } from '@utils/logWrapper';
|
||||
import { getConnectionHintNoticeField } from '@utils/sharedFields';
|
||||
|
||||
/**
 * Builds the list of sub-node inputs shown for the GitHub Document Loader.
 *
 * The node description embeds this function through `getInputs.toString()` inside
 * its `inputs` expression, so only the function's own source text is carried over.
 * Keep the body self-contained: use the argument and literals only, never bindings
 * from module scope. That is why the connection type is written as the plain
 * string 'ai_textSplitter' here.
 *
 * @param parameters Current node parameters; only `textSplittingMode` is read.
 * @returns One required "Text Splitter" input when the mode is 'custom' or unset,
 * otherwise an empty list.
 */
function getInputs(parameters: IDataObject) {
	const inputs: INodeInputConfiguration[] = [];

	const textSplittingMode = parameters?.textSplittingMode;
	// If text splitting mode is 'custom' or does not exist (v1), we need to add an input for the text splitter
	if (!textSplittingMode || textSplittingMode === 'custom') {
		inputs.push({
			displayName: 'Text Splitter',
			maxConnections: 1,
			type: 'ai_textSplitter',
			required: true,
		});
	}

	return inputs;
}
|
||||
|
||||
export class DocumentGithubLoader implements INodeType {
|
||||
description: INodeTypeDescription = {
|
||||
displayName: 'GitHub Document Loader',
|
||||
name: 'documentGithubLoader',
|
||||
icon: 'file:github.svg',
|
||||
group: ['transform'],
|
||||
version: 1,
|
||||
version: [1, 1.1],
|
||||
defaultVersion: 1.1,
|
||||
description: 'Use GitHub data as input to this chain',
|
||||
defaults: {
|
||||
name: 'GitHub Document Loader',
|
||||
@@ -43,19 +64,38 @@ export class DocumentGithubLoader implements INodeType {
|
||||
},
|
||||
],
|
||||
// eslint-disable-next-line n8n-nodes-base/node-class-description-inputs-wrong-regular-node
|
||||
inputs: [
|
||||
{
|
||||
displayName: 'Text Splitter',
|
||||
maxConnections: 1,
|
||||
type: NodeConnectionTypes.AiTextSplitter,
|
||||
},
|
||||
],
|
||||
inputs: `={{ ((parameter) => { ${getInputs.toString()}; return getInputs(parameter) })($parameter) }}`,
|
||||
inputNames: ['Text Splitter'],
|
||||
// eslint-disable-next-line n8n-nodes-base/node-class-description-outputs-wrong
|
||||
outputs: [NodeConnectionTypes.AiDocument],
|
||||
outputNames: ['Document'],
|
||||
properties: [
|
||||
getConnectionHintNoticeField([NodeConnectionTypes.AiVectorStore]),
|
||||
{
|
||||
displayName: 'Text Splitting',
|
||||
name: 'textSplittingMode',
|
||||
type: 'options',
|
||||
default: 'simple',
|
||||
required: true,
|
||||
noDataExpression: true,
|
||||
displayOptions: {
|
||||
show: {
|
||||
'@version': [1.1],
|
||||
},
|
||||
},
|
||||
options: [
|
||||
{
|
||||
name: 'Simple',
|
||||
value: 'simple',
|
||||
description: 'Uses Recursive Character Text Splitter with default options',
|
||||
},
|
||||
{
|
||||
name: 'Custom',
|
||||
value: 'custom',
|
||||
description: 'Connect a text splitter of your choice',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
displayName: 'Repository Link',
|
||||
name: 'repository',
|
||||
@@ -96,6 +136,7 @@ export class DocumentGithubLoader implements INodeType {
|
||||
|
||||
async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
|
||||
this.logger.debug('Supplying data for Github Document Loader');
|
||||
const node = this.getNode();
|
||||
|
||||
const repository = this.getNodeParameter('repository', itemIndex) as string;
|
||||
const branch = this.getNodeParameter('branch', itemIndex) as string;
|
||||
@@ -104,11 +145,25 @@ export class DocumentGithubLoader implements INodeType {
|
||||
recursive: boolean;
|
||||
ignorePaths: string;
|
||||
};
|
||||
let textSplitter: TextSplitter | undefined;
|
||||
|
||||
const textSplitter = (await this.getInputConnectionData(
|
||||
NodeConnectionTypes.AiTextSplitter,
|
||||
0,
|
||||
)) as CharacterTextSplitter | undefined;
|
||||
if (node.typeVersion === 1.1) {
|
||||
const textSplittingMode = this.getNodeParameter('textSplittingMode', itemIndex, 'simple') as
|
||||
| 'simple'
|
||||
| 'custom';
|
||||
|
||||
if (textSplittingMode === 'simple') {
|
||||
textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });
|
||||
} else if (textSplittingMode === 'custom') {
|
||||
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
|
||||
| TextSplitter
|
||||
| undefined;
|
||||
}
|
||||
} else {
|
||||
textSplitter = (await this.getInputConnectionData(NodeConnectionTypes.AiTextSplitter, 0)) as
|
||||
| TextSplitter
|
||||
| undefined;
|
||||
}
|
||||
|
||||
const { index } = this.addInputData(NodeConnectionTypes.AiDocument, [
|
||||
[{ json: { repository, branch, ignorePaths, recursive } }],
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
|
||||
import type { ISupplyDataFunctions } from 'n8n-workflow';
|
||||
import { NodeConnectionTypes } from 'n8n-workflow';
|
||||
|
||||
import { DocumentGithubLoader } from '../DocumentGithubLoader.node';
|
||||
|
||||
// Replace the real splitter with a constructor spy so the tests can check the
// options it is created with, without running any actual text splitting.
jest.mock('@langchain/textsplitters', () => ({
	RecursiveCharacterTextSplitter: jest.fn().mockImplementation(() => ({
		splitDocuments: jest.fn(
			async (docs: Array<Record<string, unknown>>): Promise<Array<Record<string, unknown>>> =>
				docs.map((doc) => ({ ...doc, split: true })),
		),
	})),
}));
// Stub the GitHub loader so no network request is made; it yields two fixed documents.
jest.mock('@langchain/community/document_loaders/web/github', () => ({
	GithubRepoLoader: jest.fn().mockImplementation(() => ({
		load: jest.fn(async () => [{ pageContent: 'doc1' }, { pageContent: 'doc2' }]),
	})),
}));

const mockLogger = { debug: jest.fn() };

describe('DocumentGithubLoader', () => {
	let loader: DocumentGithubLoader;

	// Stand-in for a splitter connected through the AiTextSplitter input.
	const customSplitter = { splitDocuments: jest.fn(async (docs: unknown[]) => docs) };

	/**
	 * Creates the minimal execution context that `supplyData` reads from.
	 *
	 * @param typeVersion Node version reported by `getNode()`.
	 * @param textSplittingMode Value returned for the `textSplittingMode` parameter;
	 * leave undefined to mimic a node that does not have the parameter.
	 */
	function createContext(typeVersion: number, textSplittingMode?: 'simple' | 'custom') {
		return {
			logger: mockLogger,
			getNode: jest.fn(() => ({ typeVersion })),
			getNodeParameter: jest.fn().mockImplementation((paramName: string, _itemIndex: number) => {
				switch (paramName) {
					case 'repository':
						return 'owner/repo';
					case 'branch':
						return 'main';
					case 'textSplittingMode':
						return textSplittingMode;
					case 'additionalOptions':
						return { recursive: true, ignorePaths: 'docs,tests' };
					default:
						return undefined;
				}
			}),
			getCredentials: jest.fn().mockResolvedValue({
				accessToken: 'token',
				server: 'https://api.github.com',
			}),
			getInputConnectionData: jest.fn(async () => customSplitter),
			addInputData: jest.fn(() => ({ index: 0 })),
			addOutputData: jest.fn(),
		};
	}

	beforeEach(() => {
		loader = new DocumentGithubLoader();
		jest.clearAllMocks();
	});

	it('should supply data with recursive char text splitter', async () => {
		const context = createContext(1.1, 'simple');

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(RecursiveCharacterTextSplitter).toHaveBeenCalledWith({
			chunkSize: 1000,
			chunkOverlap: 200,
		});
		// Simple mode builds its own splitter and must not ask for a connected one.
		expect(context.getInputConnectionData).not.toHaveBeenCalled();
	});

	it('should use custom text splitter when textSplittingMode is custom', async () => {
		const context = createContext(1.1, 'custom');

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(context.getInputConnectionData).toHaveBeenCalledWith(
			NodeConnectionTypes.AiTextSplitter,
			0,
		);
		expect(customSplitter.splitDocuments).toHaveBeenCalled();
	});

	it('should read the connected text splitter on version 1 nodes', async () => {
		// Version 1 has no textSplittingMode parameter and always uses the input.
		const context = createContext(1);

		await loader.supplyData.call(context as unknown as ISupplyDataFunctions, 0);

		expect(context.getInputConnectionData).toHaveBeenCalledWith(
			NodeConnectionTypes.AiTextSplitter,
			0,
		);
		expect(customSplitter.splitDocuments).toHaveBeenCalled();
	});
});
|
||||
Reference in New Issue
Block a user