pdf2json icon indicating copy to clipboard operation
pdf2json copied to clipboard

Can't process multiple pdf files

Open axul opened this issue 9 years ago • 1 comments

Is there an easy way to process multiple files? I have this:

 var PDFParser = require('pdf2json');
 // NOTE: this single parser instance is reused for every file; the next load
 // is only started from inside the 'pdfParser_dataReady' handler below.
 var pdfParser = new PDFParser();
 var fs = require('fs');
 var fileNames = [];  // names of the '.pdf' entries found in fileFolder
 var fileCont = 0;    // index into fileNames of the file currently being parsed

 // fileFolder is defined outside this snippet. It is concatenated directly with
 // the file name, so it presumably ends with a path separator -- TODO confirm.
 fs.readdir(fileFolder, function(err, files){
      // Surface the real readdir failure instead of a TypeError on `files.length`.
      if (err) {
           throw err;
      }

      // Collect the PDF names, walking the listing from the last entry to the first.
      for (var i = files.length - 1; i >= 0; i--) {
           if (files[i].indexOf('.pdf') !== -1){
                fileNames.push(files[i]);
           }
      }

      // Start the chain with the first collected file, using the same
      // fileFolder prefix as every later load.
      pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
 });

 pdfParser.on('pdfParser_dataReady', function(data){
      //Do all my stuff and insert in db...

      fileCont++;

      // Every collected file has been parsed: delete them and answer the request.
      if (fileCont === fileNames.length){
            for (var i = fileNames.length - 1; i >= 0; i--) {
                 // fs.unlink requires a callback; report failures instead of dropping them.
                 fs.unlink(fileFolder + fileNames[i], function(unlinkErr){
                      if (unlinkErr) {
                           console.error(unlinkErr);
                      }
                 });
            }
           // `res` comes from the enclosing scope (not shown in this snippet).
           return res.json({
                data: 'ok '
           });
      }

      // Otherwise move on to the next file.
      pdfParser.loadPDF(fileFolder + fileNames[fileCont]);
 });

But it is failing, because the dataReady event always fires for the same PDF file.

axul avatar Nov 07 '16 13:11 axul

I came across this today. I think the problem is that the pdfParser object stores some data from previous reads that cannot be cleared. I solved it with a callback-based function that instantiates a new pdfParser for every file and encapsulates the event handlers:

/**
 * Parse a single PDF with its own PDFParser instance and report the outcome
 * through a Node-style callback.
 *
 * A new parser is created on every call, so no parser object is shared
 * between files.
 *
 * @param {string} path - Path of the PDF file handed to `loadPDF`.
 * @param {function(*=): void} cb - Called exactly once per parser event:
 *   with the error payload on "pdfParser_dataError", with the exception if
 *   `processData` throws, or with `null` after `processData` succeeds.
 */
function readOneFile(path, cb){
    const pdfParser = new PDFParser();
    pdfParser.on("pdfParser_dataError", errData => {
        cb(errData);
    });
    pdfParser.on("pdfParser_dataReady", pdfData => {
        // Forward a failure in processData to the callback; otherwise the
        // exception escapes the event handler and cb is never invoked.
        try {
            processData(pdfData);
        } catch (err) {
            cb(err);
            return;
        }
        // Kept outside the try block so an exception thrown by cb itself
        // does not trigger a second cb call.
        cb(null);
    });
    pdfParser.loadPDF(path);
}

Then you can call it with the regular `async` library, or turn it into a Promise, etc.:

// Hand each path to readOneFile through the `async` library's eachSeries;
// the last argument is the completion callback, which logs any error it is given.
async.eachSeries(
    ["./file1.pdf", "./file2.pdf"],
    readOneFile,
    function reportFailure(err) {
        if (!err) {
            return;
        }
        console.log(err);
    }
);

ersoma avatar Feb 28 '17 20:02 ersoma