mirror of
https://github.com/Abdulazizzn/n8n-enterprise-unlocked.git
synced 2025-12-17 01:56:46 +00:00
feat(Read PDF Node): Replace pdf-parse with pdfjs, and add support for streaming and encrypted PDFs (#6640)
This commit is contained in:
committed by
GitHub
parent
acda7f269f
commit
0a31b8e2b4
@@ -1,12 +1,32 @@
|
||||
import type {
|
||||
IExecuteFunctions,
|
||||
IDataObject,
|
||||
INodeExecutionData,
|
||||
INodeType,
|
||||
INodeTypeDescription,
|
||||
import {
|
||||
BINARY_ENCODING,
|
||||
type IExecuteFunctions,
|
||||
type INodeExecutionData,
|
||||
type INodeType,
|
||||
type INodeTypeDescription,
|
||||
} from 'n8n-workflow';
|
||||
|
||||
import pdf from 'pdf-parse';
|
||||
import { getDocument as readPDF, version as pdfJsVersion } from 'pdfjs-dist';
|
||||
|
||||
/** Value that the `promise` of a `readPDF(...)` loading task resolves to. */
type Document = Awaited<ReturnType<typeof readPDF>['promise']>;

/** Value that `Document.getPage(...)` resolves to. */
type Page = Awaited<ReturnType<Document['getPage']>>;

/** Value that `Page.getTextContent()` resolves to. */
type TextContent = Awaited<ReturnType<Page['getTextContent']>>;
|
||||
|
||||
/**
 * Flattens the text content of one page into a single string.
 *
 * Entries of `textContent.items` that have no `str` property are ignored.
 * The strings of the remaining entries are concatenated in order; a newline
 * is inserted before an entry whose `transform[5]` (used here as the item's
 * vertical position) differs from that of the previous text entry.
 *
 * @param textContent Text content of a page, as resolved by `getTextContent()`.
 * @returns The concatenated text, with `\n` between runs of differing vertical position.
 */
const parseText = (textContent: TextContent): string => {
	let lastY: number | undefined;
	const text: string[] = [];

	for (const item of textContent.items) {
		if (!('str' in item)) continue;

		const y: number = item.transform[5];
		// Compare against `undefined` explicitly: a previous position of 0 is a
		// valid value and must not be mistaken for "no previous item".
		const sameLine = lastY === undefined || lastY === y;
		text.push(sameLine ? item.str : `\n${item.str}`);
		lastY = y;
	}

	return text.join('');
};
|
||||
|
||||
export class ReadPDF implements INodeType {
|
||||
description: INodeTypeDescription = {
|
||||
@@ -32,6 +52,26 @@ export class ReadPDF implements INodeType {
|
||||
required: true,
|
||||
description: 'Name of the binary property from which to read the PDF file',
|
||||
},
|
||||
{
|
||||
displayName: 'Encrypted',
|
||||
name: 'encrypted',
|
||||
type: 'boolean',
|
||||
default: false,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
displayName: 'Password',
|
||||
name: 'password',
|
||||
type: 'string',
|
||||
typeOptions: { password: true },
|
||||
default: '',
|
||||
description: 'Password to decrypt the PDF file with',
|
||||
displayOptions: {
|
||||
show: {
|
||||
encrypted: [true],
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
@@ -40,27 +80,50 @@ export class ReadPDF implements INodeType {
|
||||
|
||||
const returnData: INodeExecutionData[] = [];
|
||||
const length = items.length;
|
||||
let item: INodeExecutionData;
|
||||
|
||||
for (let itemIndex = 0; itemIndex < length; itemIndex++) {
|
||||
try {
|
||||
item = items[itemIndex];
|
||||
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', itemIndex);
|
||||
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
|
||||
|
||||
if (item.binary === undefined) {
|
||||
item.binary = {};
|
||||
const params: { password?: string; url?: URL; data?: ArrayBuffer } = {};
|
||||
|
||||
if (this.getNodeParameter('encrypted', itemIndex) === true) {
|
||||
params.password = this.getNodeParameter('password', itemIndex) as string;
|
||||
}
|
||||
|
||||
const binaryDataBuffer = await this.helpers.getBinaryDataBuffer(
|
||||
itemIndex,
|
||||
binaryPropertyName,
|
||||
);
|
||||
returnData.push({
|
||||
binary: item.binary,
|
||||
if (binaryData.id) {
|
||||
const binaryPath = this.helpers.getBinaryPath(binaryData.id);
|
||||
params.url = new URL(`file://${binaryPath}`);
|
||||
} else {
|
||||
params.data = Buffer.from(binaryData.data, BINARY_ENCODING).buffer;
|
||||
}
|
||||
|
||||
json: (await pdf(binaryDataBuffer)) as unknown as IDataObject,
|
||||
const document = await readPDF(params).promise;
|
||||
const { info, metadata } = await document
|
||||
.getMetadata()
|
||||
.catch(() => ({ info: null, metadata: null }));
|
||||
|
||||
const pages = [];
|
||||
for (let i = 1; i <= document.numPages; i++) {
|
||||
const page = await document.getPage(i);
|
||||
const text = await page.getTextContent().then(parseText);
|
||||
pages.push(text);
|
||||
}
|
||||
|
||||
returnData.push({
|
||||
binary: items[itemIndex].binary,
|
||||
json: {
|
||||
numpages: document.numPages,
|
||||
numrender: document.numPages,
|
||||
info,
|
||||
metadata: metadata?.getAll(),
|
||||
text: pages.join('\n\n'),
|
||||
version: pdfJsVersion,
|
||||
},
|
||||
});
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
if (this.continueOnFail()) {
|
||||
returnData.push({
|
||||
json: {
|
||||
|
||||
Reference in New Issue
Block a user