feat(Read PDF Node): Replace pdf-parse with pdfjs, and add support for streaming and encrypted PDFs (#6640)

This commit is contained in:
कारतोफ्फेलस्क्रिप्ट™
2023-07-18 20:07:29 +02:00
committed by GitHub
parent acda7f269f
commit 0a31b8e2b4
11 changed files with 267 additions and 131 deletions

View File

@@ -0,0 +1,87 @@
{
"nodes": [
{
"name": "When clicking \"Execute Workflow\"",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"parameters": {},
"position": [660, 580]
},
{
"name": "Read sample-encrypted.pdf",
"type": "n8n-nodes-base.readBinaryFile",
"typeVersion": 1,
"parameters": {
"filePath": "C:\\Test\\sample-encrypted.pdf"
},
"position": [880, 780]
},
{
"name": "Read PDF (encrypted)",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"parameters": {
"encrypted": true,
"password": "ReaderPassword"
},
"position": [1100, 780]
}
],
"pinData": {
"Read PDF (encrypted)": [
{
"binary": {
"data": {
"fileExtension": "pdf",
"fileName": "sample-encrypted.pdf",
"fileSize": "18.9 kB",
"mimeType": "application/pdf"
}
},
"json": {
"numpages": 1,
"numrender": 1,
"info": {
"PDFFormatVersion": "1.7",
"Language": null,
"EncryptFilterName": "Standard",
"IsLinearized": false,
"IsAcroFormPresent": false,
"IsXFAPresent": false,
"IsCollectionPresent": false,
"IsSignaturesPresent": false,
"ModDate": "D:20230210122750Z",
"Producer": "iLovePDF",
"Title": "sample"
},
"text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.",
"version": "2.16.105"
}
}
]
},
"connections": {
"When clicking \"Execute Workflow\"": {
"main": [
[
{
"node": "Read sample-encrypted.pdf",
"type": "main",
"index": 0
}
]
]
},
"Read sample-encrypted.pdf": {
"main": [
[
{
"node": "Read PDF (encrypted)",
"type": "main",
"index": 0
}
]
]
}
}
}

View File

@@ -1,60 +1,11 @@
/* eslint-disable @typescript-eslint/no-loop-func */
import * as Helpers from '@test/nodes/Helpers';
import type { WorkflowTestData } from '@test/nodes/types';
import { executeWorkflow } from '@test/nodes/ExecuteWorkflow';
import path from 'path';
import { getWorkflowFilenames, initBinaryDataManager, testWorkflows } from '@test/nodes/Helpers';
// Test suite for the Read PDF node.
// NOTE(review): this text follows a diff hunk header (`@@ -1,60 +1,11 @@`) and carries no
// +/- markers, so it appears to interleave the removed and the added version of the suite
// (both a `beforeEach` and a `beforeAll` hook are present) — confirm against the real file.
describe('Test Read PDF Node', () => {
// Hook variant 1: before every test, await Helpers.initBinaryDataManager().
beforeEach(async () => {
await Helpers.initBinaryDataManager();
// Workflow filenames obtained from the helper for this test directory (__dirname);
// presumably the *.workflow.json fixtures located next to this file — verify in Helpers.
const workflows = getWorkflowFilenames(__dirname);
// Hook variant 2: await initBinaryDataManager() once, before all tests in the suite.
beforeAll(async () => {
await initBinaryDataManager();
});
// Hand-built test case: load the workflow JSON fixture ...
const workflow = Helpers.readJsonFileSync('nodes/ReadPdf/test/ReadPDF.workflow.json');
// ... and point the node named 'Read Binary File' at the sample.pdf that sits beside this
// test file. NOTE(review): `find` yields undefined when no node has that name, in which case
// the next line would throw.
const node = workflow.nodes.find((n: any) => n.name === 'Read Binary File');
node.parameters.filePath = path.join(__dirname, 'sample.pdf');
// Test definition: the workflow to run plus the JSON output expected from the 'Read PDF' node.
const testData: WorkflowTestData = {
description: 'nodes/ReadPdf/test/ReadPDF.workflow.json',
input: {
workflowData: workflow,
},
output: {
nodeData: {
// Expected items of the 'Read PDF' node: a single item whose json describes the parsed PDF.
'Read PDF': [
[
{
json: {
numpages: 1,
numrender: 1,
info: {
PDFFormatVersion: '1.4',
IsAcroFormPresent: false,
IsXFAPresent: false,
Title: 'sample',
Producer: 'iLovePDF',
ModDate: 'D:20230210122750Z',
},
metadata: null,
text: '\n\nN8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.',
version: '1.10.100',
},
},
],
],
},
},
};
// Helpers.setup(testData) supplies the nodeTypes argument that executeWorkflow takes below.
const nodeTypes = Helpers.setup(testData);
// Run the workflow and compare the node output with the expectation declared above.
test(testData.description, async () => {
const { result } = await executeWorkflow(testData, nodeTypes);
const resultNodeData = Helpers.getResultNodeData(result, testData);
// delete binary data because we test against json only
delete resultNodeData[0].resultData[0]![0].binary;
expect(resultNodeData[0].resultData).toEqual(testData.output.nodeData['Read PDF']);
// The execution result must also be flagged as finished.
expect(result.finished).toEqual(true);
});
// Helper-driven variant: hand the collected workflow filenames to testWorkflows;
// presumably it registers one test per workflow file — verify in Helpers.
testWorkflows(workflows);
});

View File

@@ -1,47 +1,75 @@
{
"name": "Read PDF node unit test",
"nodes": [
{
"parameters": {},
"id": "0c9db33c-dd15-4088-9d12-b9f3b8f1fa96",
"name": "When clicking \"Execute Workflow\"",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [960, 540]
},
{
"parameters": {},
"id": "86abdc3b-206d-4b67-a37f-6b67b6bd3bbc",
"name": "Read PDF",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"position": [1400, 540]
"position": [660, 580]
},
{
"name": "Read sample.pdf",
"type": "n8n-nodes-base.readBinaryFile",
"typeVersion": 1,
"parameters": {
"filePath": "C:\\Test\\sample.pdf"
},
"id": "2f6d241e-44a4-4213-b49a-166201946a89",
"name": "Read Binary File",
"type": "n8n-nodes-base.readBinaryFile",
"position": [880, 580]
},
{
"name": "Read PDF",
"type": "n8n-nodes-base.readPDF",
"typeVersion": 1,
"position": [1180, 540]
"parameters": {},
"position": [1100, 580]
}
],
"pinData": {},
"pinData": {
"Read PDF": [
{
"binary": {
"data": {
"fileExtension": "pdf",
"fileName": "sample.pdf",
"fileSize": "17.8 kB",
"mimeType": "application/pdf"
}
},
"json": {
"numpages": 1,
"numrender": 1,
"info": {
"PDFFormatVersion": "1.4",
"Language": null,
"EncryptFilterName": null,
"IsLinearized": false,
"IsAcroFormPresent": false,
"IsXFAPresent": false,
"IsCollectionPresent": false,
"IsSignaturesPresent": false,
"Title": "sample",
"Producer": "iLovePDF",
"ModDate": "D:20230210122750Z"
},
"text": "N8N\nSample PDF\nLorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor\ninvidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et\njusto duo dolores et ea rebum.",
"version": "2.16.105"
}
}
]
},
"connections": {
"When clicking \"Execute Workflow\"": {
"main": [
[
{
"node": "Read Binary File",
"node": "Read sample.pdf",
"type": "main",
"index": 0
}
]
]
},
"Read Binary File": {
"Read sample.pdf": {
"main": [
[
{
@@ -52,13 +80,5 @@
]
]
}
},
"active": false,
"settings": {},
"versionId": "9802b48d-727a-40ef-ad87-d544a9a648a7",
"id": "188",
"meta": {
"instanceId": "104a4d08d8897b8bdeb38aaca515021075e0bd8544c983c2bb8c86e6a8e6081c"
},
"tags": []
}
}