Mirror of https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git (synced 2025-12-17 01:56:46 +00:00)
refactor: Remove Epub document loader override (no-changelog) (#7874)
- Remove Epub document loader override, use standard Langchain loader
- Use temporary buffer file for document loader processing and pass just the path
- Replace `@gxl/epub-parser` library with `epub2`

Github issue / Community forum post (link here to close automatically):

---------

Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
Co-authored-by: कारतोफ्फेलस्क्रिप्ट™ <aditya@netroy.in>
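The core technique here is spilling Blob contents to a temporary file and handing the loader only the path, since Langchain's stock EPUB loader is path-based. A minimal sketch of that pattern, assuming `tmp-promise` and `langchain` are installed (same imports as in the diff below); the `loadEpubFromBlob` helper is illustrative, not part of the commit:

import { EPubLoader } from 'langchain/document_loaders/fs/epub';
import { createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';
import { file as tmpFile } from 'tmp-promise';
import type { Document } from 'langchain/document';

// Illustrative helper: EPubLoader only accepts a file path, so the Blob's
// bytes are written to a temporary file first.
async function loadEpubFromBlob(blob: Blob): Promise<Document[]> {
	const { path, cleanup } = await tmpFile({ prefix: 'epub-loader-' });
	try {
		const bytes = new Uint8Array(await blob.arrayBuffer());
		// Stream the bytes into the temp file; the loader then gets just the path.
		await pipeline([bytes], createWriteStream(path));
		return await new EPubLoader(path).load();
	} finally {
		// load() reads the file before resolving, so deleting here is safe.
		await cleanup();
	}
}

Because `load()` finishes reading before it resolves, cleanup can run in `finally`; the change below achieves the same guarantee by deferring `cleanupTmpFile` until after the documents are produced.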
EpubLoader.ts (deleted):

@@ -1,82 +0,0 @@
-// Modified version of https://github.com/hwchase17/langchainjs/blob/main/langchain/src/document_loaders/fs/epub.ts
-// to support loading of EPUB files from a Buffer
-import { parseEpub } from '@gxl/epub-parser';
-import { BaseDocumentLoader } from 'langchain/document_loaders/base';
-import { Document } from 'langchain/document';
-import { htmlToText } from 'html-to-text';
-/**
- * A class that extends the `BaseDocumentLoader` class. It represents a
- * document loader that loads documents from EPUB files.
- */
-export class N8nEPubLoader extends BaseDocumentLoader {
-	private splitChapters: boolean;
-
-	constructor(
-		public file: Buffer,
-		{ splitChapters = true } = {},
-	) {
-		super();
-		this.splitChapters = splitChapters;
-	}
-
-	/**
-	 * A protected method that takes an EPUB object as a parameter and returns
-	 * a promise that resolves to an array of objects representing the content
-	 * and metadata of each chapter.
-	 * @param epub The EPUB object to parse.
-	 * @returns A promise that resolves to an array of objects representing the content and metadata of each chapter.
-	 */
-	protected async parse(
-		epub: ReturnType<typeof parseEpub>,
-	): Promise<Array<{ pageContent: string; metadata?: object }>> {
-		// We await it here because epub-parsers doesn't export a type for the
-		// return value of parseEpub.
-		const parsed = await epub;
-
-		const chapters = await Promise.all(
-			(parsed.sections ?? []).map(async (chapter) => {
-				if (!chapter.id) return null as never;
-
-				const html = chapter.htmlString;
-				if (!html) return null as never;
-
-				return {
-					html,
-					title: chapter.id,
-				};
-			}),
-		);
-		return chapters.filter(Boolean).map((chapter) => ({
-			pageContent: htmlToText(chapter.html),
-			metadata: {
-				...(chapter.title && { chapter: chapter.title }),
-			},
-		}));
-	}
-
-	/**
-	 * A method that loads the EPUB file and returns a promise that resolves
-	 * to an array of `Document` instances.
-	 * @returns A promise that resolves to an array of `Document` instances.
-	 */
-	public async load(): Promise<Document[]> {
-		const epub = parseEpub(this.file, { type: 'buffer' });
-		const parsed = await this.parse(epub);
-
-		return this.splitChapters
-			? parsed.map(
-					(chapter) =>
-						new Document({
-							pageContent: chapter.pageContent,
-							metadata: {
-								...chapter.metadata,
-							},
-						}),
-				)
-			: [
-					new Document({
-						pageContent: parsed.map((chapter) => chapter.pageContent).join('\n\n'),
-					}),
-				];
-	}
-}
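The class deleted above reimplemented Langchain's stock EPUB loader on top of `@gxl/epub-parser` solely so it could accept a `Buffer`. The stock loader the diff below switches to (backed by `epub2`) exposes the same `splitChapters` option as the removed class but takes a file path. A minimal usage sketch; the path is illustrative:

import { EPubLoader } from 'langchain/document_loaders/fs/epub';

// splitChapters: true yields one Document per chapter, matching the removed
// class's default behaviour; false yields a single concatenated Document.
const loader = new EPubLoader('/tmp/book.epub', { splitChapters: true });
const docs = await loader.load();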
N8nBinaryLoader.ts:

@@ -1,5 +1,5 @@
-import type { IExecuteFunctions, INodeExecutionData, IBinaryData } from 'n8n-workflow';
-import { NodeOperationError, NodeConnectionType } from 'n8n-workflow';
+import type { IExecuteFunctions, INodeExecutionData } from 'n8n-workflow';
+import { NodeOperationError, NodeConnectionType, BINARY_ENCODING } from 'n8n-workflow';
 
 import type { TextSplitter } from 'langchain/text_splitter';
 import type { Document } from 'langchain/document';
@@ -8,7 +8,11 @@ import { DocxLoader } from 'langchain/document_loaders/fs/docx';
 import { JSONLoader } from 'langchain/document_loaders/fs/json';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
 import { TextLoader } from 'langchain/document_loaders/fs/text';
-import { N8nEPubLoader } from './EpubLoader';
+import { EPubLoader } from 'langchain/document_loaders/fs/epub';
+import { file as tmpFile, type DirectoryResult } from 'tmp-promise';
+import { pipeline } from 'stream/promises';
+import { createWriteStream } from 'fs';
 
 import { getMetadataFiltersValues } from './helpers';
 
 const SUPPORTED_MIME_TYPES = {
@@ -58,11 +62,7 @@ export class N8nBinaryLoader {
 		if (!item) return [];
 
-		// TODO: Should we support traversing the object to find the binary data?
-		const binaryData = item.binary?.[binaryDataKey] as IBinaryData;
-
-		if (!binaryData) {
-			throw new NodeOperationError(this.context.getNode(), 'No binary data set.');
-		}
+		const binaryData = this.context.helpers.assertBinaryData(itemIndex, binaryDataKey);
 
 		const { mimeType } = binaryData;
 
@@ -92,10 +92,18 @@ export class N8nBinaryLoader {
 			);
 		}
 
-		const bufferData = await this.context.helpers.getBinaryDataBuffer(itemIndex, binaryDataKey);
-		const itemBlob = new Blob([new Uint8Array(bufferData)], { type: mimeType });
+		let filePathOrBlob: string | Blob;
+		if (binaryData.id) {
+			filePathOrBlob = this.context.helpers.getBinaryPath(binaryData.id);
+		} else {
+			filePathOrBlob = new Blob([Buffer.from(binaryData.data, BINARY_ENCODING)], {
+				type: mimeType,
+			});
+		}
 
-		let loader: PDFLoader | CSVLoader | N8nEPubLoader | DocxLoader | TextLoader | JSONLoader;
+		let loader: PDFLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader;
+		let cleanupTmpFile: DirectoryResult['cleanup'] | undefined = undefined;
+
 		switch (mimeType) {
 			case 'application/pdf':
 				const splitPages = this.context.getNodeParameter(
@@ -103,7 +111,7 @@ export class N8nBinaryLoader {
 					itemIndex,
 					false,
 				) as boolean;
-				loader = new PDFLoader(itemBlob, {
+				loader = new PDFLoader(filePathOrBlob, {
 					splitPages,
 				});
 				break;
@@ -119,19 +127,39 @@ export class N8nBinaryLoader {
 					',',
 				) as string;
 
-				loader = new CSVLoader(itemBlob, {
+				loader = new CSVLoader(filePathOrBlob, {
 					column: column ?? undefined,
 					separator,
 				});
 				break;
 			case 'application/epub+zip':
-				loader = new N8nEPubLoader(Buffer.from(bufferData));
+				// EPubLoader currently does not accept Blobs https://github.com/langchain-ai/langchainjs/issues/1623
+				let filePath: string;
+				if (filePathOrBlob instanceof Blob) {
+					const tmpFileData = await tmpFile({ prefix: 'epub-loader-' });
+					cleanupTmpFile = tmpFileData.cleanup;
+					try {
+						const bufferData = await filePathOrBlob.arrayBuffer();
+						await pipeline(
+							[new Uint8Array(bufferData)],
+							createWriteStream(tmpFileData.path),
+						);
+						loader = new EPubLoader(tmpFileData.path);
+						break;
+					} catch (error) {
+						await cleanupTmpFile();
+						throw new NodeOperationError(this.context.getNode(), error as Error);
+					}
+				} else {
+					filePath = filePathOrBlob;
+				}
+				loader = new EPubLoader(filePath);
 				break;
 			case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
-				loader = new DocxLoader(itemBlob);
+				loader = new DocxLoader(filePathOrBlob);
 				break;
 			case 'text/plain':
-				loader = new TextLoader(itemBlob);
+				loader = new TextLoader(filePathOrBlob);
 				break;
 			case 'application/json':
 				const pointers = this.context.getNodeParameter(
@@ -140,10 +168,10 @@ export class N8nBinaryLoader {
 					'',
 				) as string;
 				const pointersArray = pointers.split(',').map((pointer) => pointer.trim());
-				loader = new JSONLoader(itemBlob, pointersArray);
+				loader = new JSONLoader(filePathOrBlob, pointersArray);
 				break;
 			default:
-				loader = new TextLoader(itemBlob);
+				loader = new TextLoader(filePathOrBlob);
 		}
 
 		const textSplitter = (await this.context.getInputConnectionData(
@@ -163,6 +191,10 @@ export class N8nBinaryLoader {
 				};
 			});
 		}
 
+		if (cleanupTmpFile) {
+			await cleanupTmpFile();
+		}
+
 		return docs;
 	}
 }
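One detail worth spelling out: the loaders now receive either a filesystem path (when the binary data is stored on disk and carries an `id`) or a `Blob` built from the base64-encoded in-memory payload (n8n's `BINARY_ENCODING` is base64). A standalone sketch of that decision; the `helpers` parameter stands in for n8n's execution helpers and its shape here is an assumption, not the real interface:

// `helpers` is a stand-in for n8n's execution helpers (assumed shape).
function resolveLoaderInput(
	binaryData: { id?: string; data: string; mimeType: string },
	helpers: { getBinaryPath: (id: string) => string },
): string | Blob {
	if (binaryData.id) {
		// Filesystem-backed binary data: reuse the stored file, no copy needed.
		return helpers.getBinaryPath(binaryData.id);
	}
	// In-memory binary data is base64-encoded, so decode it and wrap it in a
	// Blob for the loaders that accept one.
	return new Blob([Buffer.from(binaryData.data, 'base64')], {
		type: binaryData.mimeType,
	});
}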