refactor: Remove Epub document loader override (no-changelog) (#7874)

- Remove Epub document loader override, use standard Langchain loader
- Write binary data to a temporary file for document loader processing and
pass just the path
- Replace the `@gxl/epub-parser` library with `epub2`

Github issue / Community forum post (link here to close automatically):

---------

Signed-off-by: Oleg Ivaniv <me@olegivaniv.com>
Co-authored-by: कारतोफ्फेलस्क्रिप्ट™ <aditya@netroy.in>
This commit is contained in:
oleg
2023-11-30 11:59:37 +01:00
committed by GitHub
parent 5f4a9524ec
commit e834f14991
6 changed files with 133 additions and 498 deletions

View File

@@ -1,82 +0,0 @@
// Modified version of https://github.com/hwchase17/langchainjs/blob/main/langchain/src/document_loaders/fs/epub.ts
// to support loading of EPUB files from a Buffer
import { parseEpub } from '@gxl/epub-parser';
import { BaseDocumentLoader } from 'langchain/document_loaders/base';
import { Document } from 'langchain/document';
import { htmlToText } from 'html-to-text';
/**
* A class that extends the `BaseDocumentLoader` class. It represents a
* document loader that loads documents from EPUB files.
*/
export class N8nEPubLoader extends BaseDocumentLoader {
	// When true (the default), each chapter becomes its own Document.
	private splitChapters: boolean;

	/**
	 * @param file Raw EPUB file contents held in memory.
	 * @param options.splitChapters Emit one Document per chapter (default true);
	 * when false, all chapters are joined into a single Document.
	 */
	constructor(
		public file: Buffer,
		{ splitChapters = true } = {},
	) {
		super();
		this.splitChapters = splitChapters;
	}

	/**
	 * Extracts the chapters of a parsed EPUB as plain-text page content.
	 *
	 * @param epub The pending result of `parseEpub`. Awaited here because
	 * `@gxl/epub-parser` does not export a type for its resolved value.
	 * @returns One entry per chapter that has both an id and HTML content;
	 * chapters missing either are silently skipped.
	 */
	protected async parse(
		epub: ReturnType<typeof parseEpub>,
	): Promise<Array<{ pageContent: string; metadata?: object }>> {
		const parsed = await epub;

		// No per-chapter async work happens here, so a plain map plus a
		// type-predicate filter replaces the original Promise.all over an
		// async callback and its `null as never` type-assertion trick.
		const chapters = (parsed.sections ?? [])
			.map((section) =>
				section.id && section.htmlString
					? { html: section.htmlString, title: section.id }
					: null,
			)
			.filter(
				(chapter): chapter is { html: string; title: string } => chapter !== null,
			);

		return chapters.map((chapter) => ({
			pageContent: htmlToText(chapter.html),
			metadata: {
				// Only attach the chapter title when it is non-empty.
				...(chapter.title && { chapter: chapter.title }),
			},
		}));
	}

	/**
	 * Loads the EPUB from the buffer passed to the constructor.
	 *
	 * @returns One Document per chapter when `splitChapters` is true,
	 * otherwise a single Document joining all chapters with blank lines.
	 */
	public async load(): Promise<Document[]> {
		const epub = parseEpub(this.file, { type: 'buffer' });
		const parsed = await this.parse(epub);

		if (this.splitChapters) {
			return parsed.map(
				(chapter) =>
					new Document({
						pageContent: chapter.pageContent,
						metadata: {
							...chapter.metadata,
						},
					}),
			);
		}
		return [
			new Document({
				pageContent: parsed.map((chapter) => chapter.pageContent).join('\n\n'),
			}),
		];
	}
}

View File

@@ -1,5 +1,5 @@
import type { IExecuteFunctions, INodeExecutionData, IBinaryData } from 'n8n-workflow';
import { NodeOperationError, NodeConnectionType } from 'n8n-workflow';
import type { IExecuteFunctions, INodeExecutionData } from 'n8n-workflow';
import { NodeOperationError, NodeConnectionType, BINARY_ENCODING } from 'n8n-workflow';
import type { TextSplitter } from 'langchain/text_splitter';
import type { Document } from 'langchain/document';
@@ -8,7 +8,11 @@ import { DocxLoader } from 'langchain/document_loaders/fs/docx';
import { JSONLoader } from 'langchain/document_loaders/fs/json';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { N8nEPubLoader } from './EpubLoader';
import { EPubLoader } from 'langchain/document_loaders/fs/epub';
import { file as tmpFile, type DirectoryResult } from 'tmp-promise';
import { pipeline } from 'stream/promises';
import { createWriteStream } from 'fs';
import { getMetadataFiltersValues } from './helpers';
const SUPPORTED_MIME_TYPES = {
@@ -58,11 +62,7 @@ export class N8nBinaryLoader {
if (!item) return [];
// TODO: Should we support traversing the object to find the binary data?
const binaryData = item.binary?.[binaryDataKey] as IBinaryData;
if (!binaryData) {
throw new NodeOperationError(this.context.getNode(), 'No binary data set.');
}
const binaryData = this.context.helpers.assertBinaryData(itemIndex, binaryDataKey);
const { mimeType } = binaryData;
@@ -92,10 +92,18 @@ export class N8nBinaryLoader {
);
}
const bufferData = await this.context.helpers.getBinaryDataBuffer(itemIndex, binaryDataKey);
const itemBlob = new Blob([new Uint8Array(bufferData)], { type: mimeType });
let filePathOrBlob: string | Blob;
if (binaryData.id) {
filePathOrBlob = this.context.helpers.getBinaryPath(binaryData.id);
} else {
filePathOrBlob = new Blob([Buffer.from(binaryData.data, BINARY_ENCODING)], {
type: mimeType,
});
}
let loader: PDFLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader;
let cleanupTmpFile: DirectoryResult["cleanup"] | undefined = undefined;
let loader: PDFLoader | CSVLoader | N8nEPubLoader | DocxLoader | TextLoader | JSONLoader;
switch (mimeType) {
case 'application/pdf':
const splitPages = this.context.getNodeParameter(
@@ -103,7 +111,7 @@ export class N8nBinaryLoader {
itemIndex,
false,
) as boolean;
loader = new PDFLoader(itemBlob, {
loader = new PDFLoader(filePathOrBlob, {
splitPages,
});
break;
@@ -119,19 +127,39 @@ export class N8nBinaryLoader {
',',
) as string;
loader = new CSVLoader(itemBlob, {
loader = new CSVLoader(filePathOrBlob, {
column: column ?? undefined,
separator,
});
break;
case 'application/epub+zip':
loader = new N8nEPubLoader(Buffer.from(bufferData));
// EPubLoader currently does not accept Blobs https://github.com/langchain-ai/langchainjs/issues/1623
let filePath: string;
if (filePathOrBlob instanceof Blob) {
const tmpFileData = await tmpFile({ prefix: 'epub-loader-' });
cleanupTmpFile = tmpFileData.cleanup;
try {
const bufferData = await filePathOrBlob.arrayBuffer();
await pipeline(
[new Uint8Array(bufferData)],
createWriteStream(tmpFileData.path),
);
loader = new EPubLoader(tmpFileData.path);
break
} catch (error) {
await cleanupTmpFile();
throw new NodeOperationError(this.context.getNode(), error as Error);
}
} else {
filePath = filePathOrBlob;
}
loader = new EPubLoader(filePath);
break;
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
loader = new DocxLoader(itemBlob);
loader = new DocxLoader(filePathOrBlob);
break;
case 'text/plain':
loader = new TextLoader(itemBlob);
loader = new TextLoader(filePathOrBlob);
break;
case 'application/json':
const pointers = this.context.getNodeParameter(
@@ -140,10 +168,10 @@ export class N8nBinaryLoader {
'',
) as string;
const pointersArray = pointers.split(',').map((pointer) => pointer.trim());
loader = new JSONLoader(itemBlob, pointersArray);
loader = new JSONLoader(filePathOrBlob, pointersArray);
break;
default:
loader = new TextLoader(itemBlob);
loader = new TextLoader(filePathOrBlob);
}
const textSplitter = (await this.context.getInputConnectionData(
@@ -163,6 +191,10 @@ export class N8nBinaryLoader {
};
});
}
if (cleanupTmpFile) {
await cleanupTmpFile();
}
return docs;
}
}