feat(Read PDF Node): Replace pdf-parse with pdfjs, and add support for streaming and encrypted PDFs (#6640)

This commit is contained in:
कारतोफ्फेलस्क्रिप्ट™
2023-07-18 20:07:29 +02:00
committed by GitHub
parent acda7f269f
commit 0a31b8e2b4
11 changed files with 267 additions and 131 deletions

View File

@@ -1,12 +1,32 @@
import type {
IExecuteFunctions,
IDataObject,
INodeExecutionData,
INodeType,
INodeTypeDescription,
import {
BINARY_ENCODING,
type IExecuteFunctions,
type INodeExecutionData,
type INodeType,
type INodeTypeDescription,
} from 'n8n-workflow';
import pdf from 'pdf-parse';
import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist';
type Document = Awaited<ReturnType<Awaited<typeof readPDF>>['promise']>;
type Page = Awaited<ReturnType<Awaited<Document['getPage']>>>;
type TextContent = Awaited<ReturnType<Page['getTextContent']>>;
const parseText = (textContent: TextContent) => {
let lastY = undefined;
const text = [];
for (const item of textContent.items) {
if ('str' in item) {
if (lastY == item.transform[5] || !lastY) {
text.push(item.str);
} else {
text.push(`\n${item.str}`);
}
lastY = item.transform[5];
}
}
return text.join('');
};
export class ReadPDF implements INodeType {
description: INodeTypeDescription = {
@@ -32,6 +52,26 @@ export class ReadPDF implements INodeType {
required: true,
description: 'Name of the binary property from which to read the PDF file',
},
{
displayName: 'Encrypted',
name: 'encrypted',
type: 'boolean',
default: false,
required: true,
},
{
displayName: 'Password',
name: 'password',
type: 'string',
typeOptions: { password: true },
default: '',
description: 'Password to decrypt the PDF file with',
displayOptions: {
show: {
encrypted: [true],
},
},
},
],
};
@@ -40,27 +80,50 @@ export class ReadPDF implements INodeType {
const returnData: INodeExecutionData[] = [];
const length = items.length;
let item: INodeExecutionData;
for (let itemIndex = 0; itemIndex < length; itemIndex++) {
try {
item = items[itemIndex];
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
if (item.binary === undefined) {
item.binary = {};
const params: { password?: string; url?: URL; data?: ArrayBuffer } = {};
if (this.getNodeParameter('encrypted', itemIndex) === true) {
params.password = this.getNodeParameter('password', itemIndex) as string;
}
const binaryDataBuffer = await this.helpers.getBinaryDataBuffer(
itemIndex,
binaryPropertyName,
);
returnData.push({
binary: item.binary,
if (binaryData.id) {
const binaryPath = this.helpers.getBinaryPath(binaryData.id);
params.url = new URL(`file://${binaryPath}`);
} else {
params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer;
}
json: (await pdf(binaryDataBuffer)) as unknown as IDataObject,
const document = await readPDF(params).promise;
const { info, metadata } = await document
.getMetadata()
.catch(() => ({ info: null, metadata: null }));
const pages = [];
for (let i = 1; i <= document.numPages; i++) {
const page = await document.getPage(i);
const text = await page.getTextContent().then(parseText);
pages.push(text);
}
returnData.push({
binary: items[itemIndex].binary,
json: {
numpages: document.numPages,
numrender: document.numPages,
info,
metadata: metadata?.getAll(),
text: pages.join('\n\n'),
version: pdfJsVersion,
},
});
} catch (error) {
console.log(error);
if (this.continueOnFail()) {
returnData.push({
json: {