Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: reading multiple pdf files with a single PDFParser object #371

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion jest.config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"testMatch": ["**/test/_test_.*"],
"testMatch": ["**/test/_test_*"],
"testEnvironment": "node",
"bail": false,
"testFailureExitCode": 1
Expand Down
13 changes: 13 additions & 0 deletions lib/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,10 @@ export default class PDFJSClass extends EventEmitter {

raiseReadyEvent(data) {
process.nextTick(() => this.emit("pdfjs_parseDataReady", data));
if(data===null){
//reset the state of the PDF reader
this.resetCurrentObject();
}
return data;
}

Expand Down Expand Up @@ -299,6 +303,7 @@ export default class PDFJSClass extends EventEmitter {
};

this.pages.push(page);

this.emit("data", page);

if (this.needRawText) {
Expand Down Expand Up @@ -401,6 +406,14 @@ export default class PDFJSClass extends EventEmitter {
return { Pages: this.pages };
}

resetCurrentObject(){
if (this.pdfDocument) this.pdfDocument.destroy();
this.pdfDocument = null;

this.pages = [];
this.rawTextContents = [];
}

destroy() {
this.removeAllListeners();

Expand Down
9 changes: 9 additions & 0 deletions pdfparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,15 @@ export default class PDFParser extends EventEmitter {
return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded());
}

/**
* Destroys the current instance of PDFJS and sets a new one
* @param {boolean} needRawText - Whether raw text is needed or not
*/
resetPDFJS(needRawText){
this.#PDFJS.destroy();
this.#PDFJS=new PDFJS(needRawText);
}

/**
* Destroy the PDFParser instance.
*/
Expand Down
52 changes: 52 additions & 0 deletions test/_test_testMultipleDataPDF.cjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
const assert = require("assert");
const fs = require("fs");

const PDFParser = require("../dist/pdfparser.cjs");
// we want to read two (or more) PDF files without recreating a reference to PDFParser
describe("Multiple PDFs with same structure", () => {
  /**
   * Parses one buffer with the given parser and settles on the first
   * "pdfParser_dataReady" / "pdfParser_dataError" event.
   *
   * Listeners are registered BEFORE parsing starts so no event can be missed,
   * and both are detached once the promise settles so that handlers from one
   * run are not still attached to the shared parser during the next run.
   *
   * @param {PDFParser} parser - The parser instance that is reused across runs.
   * @param {Buffer} pdfBuffer - Raw bytes of the PDF to parse.
   * @returns {Promise<object>} Resolves with the data-ready payload, rejects
   *   with the data-error payload.
   */
  const parseBufferOnce = (parser, pdfBuffer) =>
    new Promise((resolve, reject) => {
      const detach = () => {
        parser.off("pdfParser_dataReady", onReady);
        parser.off("pdfParser_dataError", onError);
      };
      const onReady = (evtData) => {
        detach();
        resolve(evtData);
      };
      const onError = (evtData) => {
        detach();
        reject(evtData);
      };

      parser.on("pdfParser_dataReady", onReady);
      parser.on("pdfParser_dataError", onError);
      parser.parseBuffer(pdfBuffer, 5);
    });

  test("Read different values", async () => {
    // the target PDFs for this test have only 3 values: Name, Surname and BirthDate.
    // you can find the PDFs in test/pdf/mpf
    const parser = new PDFParser();
    const firstPDFLocation = `${__dirname}/pdf/mpf/testPDF.pdf`;
    const secondPDFLocation = `${__dirname}/pdf/mpf/testPDF2.pdf`;
    const firstPDFBuffer = fs.readFileSync(firstPDFLocation);
    const secondPDFBuffer = fs.readFileSync(secondPDFLocation);

    // we need to check if buffers are indeed different, otherwise it's useless!
    // Compare the bytes: `.not.toBe` only checks object identity, which is
    // always true for two separately read Buffers.
    expect(firstPDFBuffer.equals(secondPDFBuffer)).toBe(false);

    // Parse sequentially with the SAME parser instance.
    const firstData = await parseBufferOnce(parser, firstPDFBuffer);
    const secondData = await parseBufferOnce(parser, secondPDFBuffer);

    // first, make sure the files are read
    expect(firstData).toBeDefined();
    expect(firstData.Pages[0]).toBeDefined();
    expect(firstData.Pages[0].Fields).toBeDefined();
    expect(secondData).toBeDefined();
    expect(secondData.Pages[0]).toBeDefined();
    expect(secondData.Pages[0].Fields).toBeDefined();

    // then, we check if the files have the correct values
    expect(firstData.Pages[0].Fields[0].V).toBe("Mario");
    expect(firstData.Pages[0].Fields[1].V).toBe("Rossi");
    expect(firstData.Pages[0].Fields[2].V).toBe("01/01/1990");
    expect(secondData.Pages[0].Fields[0].V).toBe("Luigi");
    expect(secondData.Pages[0].Fields[1].V).toBe("Verdi");
    expect(secondData.Pages[0].Fields[2].V).toBe("01/01/1991");
  });
});
Binary file added test/pdf/mpf/testPDF.pdf
Binary file not shown.
Binary file added test/pdf/mpf/testPDF2.pdf
Binary file not shown.