mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-17 01:56:46 +00:00
feat(core): Improvements/overhaul for nodes working with binary data (#7651)
Github issue / Community forum post (link here to close automatically): --------- Co-authored-by: Giulio Andreini <andreini@netseven.it> Co-authored-by: Marcus <marcus@n8n.io>
This commit is contained in:
194
packages/nodes-base/utils/binary.ts
Normal file
194
packages/nodes-base/utils/binary.ts
Normal file
@@ -0,0 +1,194 @@
|
||||
import { pathToFileURL } from 'url';

import type { IBinaryData, IDataObject, IExecuteFunctions, INodeExecutionData } from 'n8n-workflow';
import { NodeOperationError, BINARY_ENCODING } from 'n8n-workflow';

import type { WorkBook, WritingOptions } from 'xlsx';
import { utils as xlsxUtils, write as xlsxWrite } from 'xlsx';

import { flattenObject } from '@utils/utilities';

import get from 'lodash/get';
import iconv from 'iconv-lite';

import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist';
||||
// Spreadsheet formats supported for JSON-to-binary conversion (xlsx bookType values).
export type JsonToSpreadsheetBinaryFormat = 'csv' | 'html' | 'rtf' | 'ods' | 'xls' | 'xlsx';

// Options for converting JSON items into a spreadsheet file.
export type JsonToSpreadsheetBinaryOptions = {
	// When false, rows are written without a header row (xlsx skipHeader).
	headerRow?: boolean;
	// Enable output compression; only honored for 'xlsx' and 'ods'.
	compression?: boolean;
	fileName?: string;
	sheetName?: string;
};

// Options for converting a JSON value into a binary file.
export type JsonToBinaryOptions = {
	fileName?: string;
	// Dot-path into the item data to read the value from (resolved via lodash get).
	sourceKey?: string;
	// Text encoding passed to iconv when the value is not base64 (defaults to utf8).
	encoding?: string;
	// Prepend a byte-order mark when encoding text.
	addBOM?: boolean;
	mimeType?: string;
	// When true, the value already holds BINARY_ENCODING (base64) file content.
	dataIsBase64?: boolean;
	// Index of the item being processed, used for error reporting.
	itemIndex?: number;
};

// Helper types derived from the pdfjs-dist getDocument() API surface.
type PdfDocument = Awaited<ReturnType<Awaited<typeof readPDF>>['promise']>;
type PdfPage = Awaited<ReturnType<Awaited<PdfDocument['getPage']>>>;
type PdfTextContent = Awaited<ReturnType<PdfPage['getTextContent']>>;
export async function convertJsonToSpreadsheetBinary(
|
||||
this: IExecuteFunctions,
|
||||
items: INodeExecutionData[],
|
||||
fileFormat: JsonToSpreadsheetBinaryFormat,
|
||||
options: JsonToSpreadsheetBinaryOptions,
|
||||
defaultFileName = 'spreadsheet',
|
||||
): Promise<IBinaryData> {
|
||||
const itemData: IDataObject[] = [];
|
||||
for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
|
||||
itemData.push(flattenObject(items[itemIndex].json));
|
||||
}
|
||||
|
||||
let sheetToJsonOptions;
|
||||
if (options.headerRow === false) {
|
||||
sheetToJsonOptions = { skipHeader: true };
|
||||
}
|
||||
|
||||
const sheet = xlsxUtils.json_to_sheet(itemData, sheetToJsonOptions);
|
||||
|
||||
const writingOptions: WritingOptions = {
|
||||
bookType: fileFormat,
|
||||
bookSST: false,
|
||||
type: 'buffer',
|
||||
};
|
||||
|
||||
if (['xlsx', 'ods'].includes(fileFormat) && options.compression) {
|
||||
writingOptions.compression = true;
|
||||
}
|
||||
|
||||
// Convert the data in the correct format
|
||||
const sheetName = (options.sheetName as string) || 'Sheet';
|
||||
const workbook: WorkBook = {
|
||||
SheetNames: [sheetName],
|
||||
Sheets: {
|
||||
[sheetName]: sheet,
|
||||
},
|
||||
};
|
||||
|
||||
const buffer: Buffer = xlsxWrite(workbook, writingOptions);
|
||||
const fileName =
|
||||
options.fileName !== undefined ? options.fileName : `${defaultFileName}.${fileFormat}`;
|
||||
const binaryData = await this.helpers.prepareBinaryData(buffer, fileName);
|
||||
|
||||
return binaryData;
|
||||
}
|
||||
|
||||
export async function createBinaryFromJson(
|
||||
this: IExecuteFunctions,
|
||||
data: IDataObject | IDataObject[],
|
||||
options: JsonToBinaryOptions,
|
||||
): Promise<IBinaryData> {
|
||||
let value;
|
||||
if (options.sourceKey) {
|
||||
value = get(data, options.sourceKey) as IDataObject;
|
||||
} else {
|
||||
value = data;
|
||||
}
|
||||
|
||||
if (value === undefined) {
|
||||
throw new NodeOperationError(this.getNode(), `The value in "${options.sourceKey}" is not set`, {
|
||||
itemIndex: options.itemIndex || 0,
|
||||
});
|
||||
}
|
||||
|
||||
let buffer: Buffer;
|
||||
if (!options.dataIsBase64) {
|
||||
let valueAsString = value as unknown as string;
|
||||
|
||||
if (typeof value === 'object') {
|
||||
options.mimeType = 'application/json';
|
||||
valueAsString = JSON.stringify(value);
|
||||
}
|
||||
|
||||
buffer = iconv.encode(valueAsString, options.encoding || 'utf8', {
|
||||
addBOM: options.addBOM,
|
||||
});
|
||||
} else {
|
||||
buffer = Buffer.from(value as unknown as string, BINARY_ENCODING);
|
||||
}
|
||||
|
||||
const binaryData = await this.helpers.prepareBinaryData(
|
||||
buffer,
|
||||
options.fileName,
|
||||
options.mimeType,
|
||||
);
|
||||
|
||||
if (!binaryData.fileName) {
|
||||
const fileExtension = binaryData.fileExtension ? `.${binaryData.fileExtension}` : '';
|
||||
binaryData.fileName = `file${fileExtension}`;
|
||||
}
|
||||
|
||||
return binaryData;
|
||||
}
|
||||
|
||||
const parseText = (textContent: PdfTextContent) => {
|
||||
let lastY = undefined;
|
||||
const text = [];
|
||||
for (const item of textContent.items) {
|
||||
if ('str' in item) {
|
||||
if (lastY == item.transform[5] || !lastY) {
|
||||
text.push(item.str);
|
||||
} else {
|
||||
text.push(`\n${item.str}`);
|
||||
}
|
||||
lastY = item.transform[5];
|
||||
}
|
||||
}
|
||||
return text.join('');
|
||||
};
|
||||
|
||||
export async function extractDataFromPDF(
|
||||
this: IExecuteFunctions,
|
||||
binaryPropertyName: string,
|
||||
password?: string,
|
||||
maxPages?: number,
|
||||
joinPages = true,
|
||||
itemIndex = 0,
|
||||
) {
|
||||
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
|
||||
|
||||
const params: { password?: string; url?: URL; data?: ArrayBuffer } = { password };
|
||||
|
||||
if (binaryData.id) {
|
||||
const binaryPath = this.helpers.getBinaryPath(binaryData.id);
|
||||
params.url = new URL(`file://${binaryPath}`);
|
||||
} else {
|
||||
params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer;
|
||||
}
|
||||
|
||||
const document = await readPDF(params).promise;
|
||||
const { info, metadata } = await document
|
||||
.getMetadata()
|
||||
.catch(() => ({ info: null, metadata: null }));
|
||||
|
||||
const pages = [];
|
||||
if (maxPages !== 0) {
|
||||
let pagesToRead = document.numPages;
|
||||
if (maxPages && maxPages < document.numPages) {
|
||||
pagesToRead = maxPages;
|
||||
}
|
||||
for (let i = 1; i <= pagesToRead; i++) {
|
||||
const page = await document.getPage(i);
|
||||
const text = await page.getTextContent().then(parseText);
|
||||
pages.push(text);
|
||||
}
|
||||
}
|
||||
|
||||
const text = joinPages ? pages.join('\n\n') : pages;
|
||||
|
||||
const returnData = {
|
||||
numpages: document.numPages,
|
||||
numrender: document.numPages,
|
||||
info,
|
||||
metadata: metadata?.getAll(),
|
||||
text,
|
||||
version: pdfJsVersion,
|
||||
};
|
||||
|
||||
return returnData;
|
||||
}
|
||||
Reference in New Issue
Block a user