feat(core): Improvements/overhaul for nodes working with binary data (#7651)

Github issue / Community forum post (link here to close automatically):

---------

Co-authored-by: Giulio Andreini <andreini@netseven.it>
Co-authored-by: Marcus <marcus@n8n.io>
This commit is contained in:
Michael Kret
2024-01-03 13:08:16 +02:00
committed by GitHub
parent 259323b97e
commit 5e16dd4ab4
119 changed files with 4477 additions and 1201 deletions

View File

@@ -1,35 +1,15 @@
import {
BINARY_ENCODING,
NodeOperationError,
type IExecuteFunctions,
type INodeExecutionData,
type INodeType,
type INodeTypeDescription,
} from 'n8n-workflow';
import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist';
type Document = Awaited<ReturnType<Awaited<typeof readPDF>>['promise']>;
type Page = Awaited<ReturnType<Awaited<Document['getPage']>>>;
type TextContent = Awaited<ReturnType<Page['getTextContent']>>;
const parseText = (textContent: TextContent) => {
let lastY = undefined;
const text = [];
for (const item of textContent.items) {
if ('str' in item) {
if (lastY == item.transform[5] || !lastY) {
text.push(item.str);
} else {
text.push(`\n${item.str}`);
}
lastY = item.transform[5];
}
}
return text.join('');
};
import { extractDataFromPDF } from '@utils/binary';
export class ReadPDF implements INodeType {
description: INodeTypeDescription = {
hidden: true,
displayName: 'Read PDF',
// eslint-disable-next-line n8n-nodes-base/node-class-description-name-miscased
name: 'readPDF',
@@ -45,7 +25,7 @@ export class ReadPDF implements INodeType {
outputs: ['main'],
properties: [
{
displayName: 'Binary Property',
displayName: 'Input Binary Field',
name: 'binaryPropertyName',
type: 'string',
default: 'data',
@@ -84,46 +64,26 @@ export class ReadPDF implements INodeType {
for (let itemIndex = 0; itemIndex < length; itemIndex++) {
try {
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
const params: { password?: string; url?: URL; data?: ArrayBuffer } = {};
let password;
if (this.getNodeParameter('encrypted', itemIndex) === true) {
params.password = this.getNodeParameter('password', itemIndex) as string;
password = this.getNodeParameter('password', itemIndex) as string;
}
if (binaryData.id) {
const binaryPath = this.helpers.getBinaryPath(binaryData.id);
params.url = new URL(`file://${binaryPath}`);
} else {
params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer;
}
const document = await readPDF(params).promise;
const { info, metadata } = await document
.getMetadata()
.catch(() => ({ info: null, metadata: null }));
const pages = [];
for (let i = 1; i <= document.numPages; i++) {
const page = await document.getPage(i);
const text = await page.getTextContent().then(parseText);
pages.push(text);
}
const json = await extractDataFromPDF.call(
this,
binaryPropertyName,
password,
undefined,
undefined,
itemIndex,
);
returnData.push({
binary: items[itemIndex].binary,
json: {
numpages: document.numPages,
numrender: document.numPages,
info,
metadata: metadata?.getAll(),
text: pages.join('\n\n'),
version: pdfJsVersion,
},
json,
});
} catch (error) {
console.log(error);
if (this.continueOnFail()) {
returnData.push({
json: {
@@ -135,7 +95,7 @@ export class ReadPDF implements INodeType {
});
continue;
}
throw error;
throw new NodeOperationError(this.getNode(), error, { itemIndex });
}
}
return [returnData];