pdf2json
pdf2json copied to clipboard
Can't process multiple pdf files
Is there an easy way to process multiple files? I have this:
// Parse every PDF found in fileFolder one after another, then delete the
// processed files and answer the request.
// NOTE(review): fileFolder and res are not defined in this snippet; they are
// assumed to come from the surrounding code (e.g. a request handler).
// NOTE(review): a single PDFParser instance is reused for every file here,
// which is the setup this question reports as always yielding the same PDF.
var PDFParser = require('pdf2json');
var pdfParser = new PDFParser();
var fs = require('fs');
var fileNames = [];
var fileCont = 0;

// Collect the PDF file names, then kick off parsing with the first one.
fs.readdir(fileFolder, function(err, files){
  if (err) {
    // Fail with the real error instead of a TypeError on files.length.
    throw err;
  }
  for (var i = files.length - 1; i >= 0; i--) {
    if (files[i].indexOf('.pdf') !== -1){
      fileNames.push(files[i]);
    }
  }
  if (fileNames.length === 0) {
    // Nothing to parse: answer the same way as when all files are done.
    return res.json({
      data: 'ok '
    });
  }
  // Prefix with fileFolder, as the later loadPDF and unlink calls do.
  pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
});

// Runs once per parsed file: advance to the next file or finish up.
pdfParser.on('pdfParser_dataReady', function(data){
  //Do all my stuff and insert in db...
  fileCont++;
  if (fileCont === fileNames.length){
    // All files handled: remove them and send the response.
    for (var i = fileNames.length - 1; i >= 0; i--) {
      // fs.unlink requires a callback; log failures instead of ignoring them.
      fs.unlink(fileFolder + fileNames[i], function(unlinkErr){
        if (unlinkErr) {
          console.error(unlinkErr);
        }
      });
    }
    return res.json({
      data: 'ok '
    });
  }
  pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
});
But it is failing, as the dataReady event is always firing on the same PDF file.
I came across this today. I think the problem is that the pdfParser object stores some data from the previous readings and can not be cleared. I solved it with an async function that instantiates the pdfParser for every file and encapsulates the event handlers:
/**
 * Parse a single PDF with its own PDFParser instance and hand the parsed
 * data to processData.
 *
 * @param {string} path - Path of the PDF file passed to loadPDF.
 * @param {function} cb - Node-style callback: called with the error on
 *   failure, or with null once processData has returned.
 */
function readOneFile(path, cb){
  // A fresh parser per file, so no state is shared between files.
  const pdfParser = new PDFParser();
  pdfParser.on("pdfParser_dataError", errData => {
    cb(errData);
  });
  pdfParser.on("pdfParser_dataReady", pdfData => {
    try {
      processData(pdfData);
    } catch (err) {
      // Report processing failures through cb so the caller's series
      // does not stall waiting for a callback that never comes.
      cb(err);
      return;
    }
    // Kept outside the try block so an exception thrown by cb itself
    // cannot trigger a second cb call.
    cb(null);
  });
  pdfParser.loadPDF(path);
}
Then you can call it with the regular async library, or turn it into a Promise, etc.:
// Feed each PDF path to readOneFile via async.eachSeries; the final
// callback logs the error, if one was reported.
async.eachSeries(["./file1.pdf", "./file2.pdf"], readOneFile, function (err) {
  if (!err) {
    return;
  }
  console.log(err);
});