How can I read the contents of a PDF and export it to .txt?

3

Let me explain what I have: this application does something similar to the linked one, the difference being that it reads a .txt file and processes its text. Up to that point everything works. Now I would like to do the same with a PDF, but when I display the contents of the PDF, strange characters are shown. My question is: is it possible to read a PDF file and convert it to .txt?

     // Once the DOM is ready, put the reader into the paused state.
     $(function() {
       spritz_pause();
     });

        /**
         * Change handler for the file input: reads the first selected file as
         * text and hands the result to mostrarContenido(). Does nothing when no
         * file was selected.
         *
         * @param {Event} e - Change event whose target exposes a `files` list.
         */
        function leerArchivo(e) {
          const archivo = e.target.files[0];
          if (archivo) {
            const lector = new FileReader();
            // The load event's target carries the decoded text in `result`.
            lector.onload = (evento) => {
              mostrarContenido(evento.target.result);
            };
            lector.readAsText(archivo);
          }
        }

        /**
         * Load freshly read file text into the demo textarea and restart the
         * reader from the first word.
         *
         * @param {string} contenido - Text read from the selected file.
         */
        function mostrarContenido(contenido) {
          var elemento = document.getElementById('contenido-archivo');
          // #contenido-archivo is a <textarea>. Assign `value`, not `innerHTML`:
          // innerHTML would parse the file text as markup (mangling `<`, `&`, ...)
          // and only changes the default content, so it stops updating the field
          // once the user has typed in it. words_set() reads the value back.
          elemento.value = contenido;
          clearInterval(spritz);
          words_set();
          // Restart from the first word of the new text.
          i = 0;
          spritz_play();
        }

        // Run leerArchivo whenever the selection of the #file-input control
        // changes (third argument false: listener is not registered for capture).
        document.getElementById('file-input')
        .addEventListener('change', leerArchivo, false);


        // Initial reading speed (words per minute) as read from the #spritz_wpm input.
        var wpm = $('#spritz_wpm').val();
        // Delay between words in milliseconds: 60000 ms per minute divided by wpm.
        var interval = 60000/wpm;
        // True while playback is paused; written by spritz_pause() and spritz_play().
        var paused = false;
        // jQuery set of the .spritz-word element(s) that word_show() renders into.
        var $space = $('.spritz-word');
        // Index of the next word the timer will show. Starts at 1 because word 0 is
        // rendered directly by the word_show(0) call in the startup code.
        var i = 1;

        /* TEXT PARSING */
        /**
         * Rebuild the global `words` list from the current `.demo-text` value.
         *
         * The text is trimmed, a space is inserted after a hyphen/dash that is
         * followed by a word character, line breaks and sentence ends are tagged
         * with placeholder tokens, runs of spaces/tabs are collapsed, and the
         * result is split on single spaces. Placeholder tokens become a
         * three-space "word" that acts as a pause; the first entry is left as is.
         */
        function words_set() {
          var marker = /{linebreak}|{period}/g;
          var raw = $('.demo-text').val().trim();
          var spaced = raw
            .replace(/([-—])(\w)/g, '$1 $2')
            .replace(/[\r\n]/g, ' {linebreak} ')
            .replace(/\. /g, '. {period} ')
            .replace(/[ \t]{2,}/g, ' ');
          words = spaced.split(' ').map(function(token, index) {
            // Index 0 is deliberately skipped, matching the original loop start.
            return index === 0 ? token : token.replace(marker, '   ');
          });
        }
        /* ON EACH WORD */
        /**
         * Render word number `i` into the display as three cells: the letters
         * before the focus letter, the focus letter itself, and the rest.
         *
         * A missing or empty word renders three empty cells instead of throwing
         * or printing "undefined", and the word is HTML-escaped so text containing
         * `<`, `>` or `&` is shown literally rather than parsed as markup.
         *
         * @param {number} i - Index into the global `words` array.
         */
        function word_show(i) {
          var escapeHtml = function(text) {
            return text
              .replace(/&/g, '&amp;')
              .replace(/</g, '&lt;')
              .replace(/>/g, '&gt;');
          };
          var word = words[i] == null ? '' : String(words[i]);
          // Focus letter sits roughly 40% into the word.
          var stop = Math.round((word.length+1)*0.4)-1;
          // charAt() yields '' (not undefined) when `stop` is outside the word.
          $space.html(
            '<div>' + escapeHtml(word.slice(0, stop)) + '</div>' +
            '<div>' + escapeHtml(word.charAt(stop)) + '</div>' +
            '<div>' + escapeHtml(word.slice(stop + 1)) + '</div>'
          );
        }
        /* ITERATION FUNCTION */
        /**
         * Start the word timer: every `interval` milliseconds show word `i` and
         * advance `i`. When the end of `words` is reached the timer is stopped
         * and, one interval later, the display is cleared and playback is paused.
         *
         * The timer id is stored in the global `spritz` so other code can cancel it.
         */
        function word_update() {
          spritz = setInterval(function() {
            // Bounds check first: `i` may already be at or past the end (for
            // example a one-word text with i = 1). Showing that index would
            // throw on every tick and the end condition would never be reached.
            if (i < words.length) {
              word_show(i);
              i++;
            }
            if (i >= words.length) {
              clearInterval(spritz);
              // Leave the last word visible for one more interval before clearing.
              setTimeout(function() {
                $space.html('');
                spritz_pause();
              }, interval);
            }
          }, interval);
        }

        /* PAUSING FUNCTIONS */
        /**
         * Pause playback: cancel the word timer stored in the global `spritz`,
         * set the `paused` flag and add the `paused` class to the #spritz_pause
         * button.
         */
        function spritz_pause() {
          clearInterval(spritz);
          paused = true;
          $('#spritz_pause').addClass('paused');
        }
        /**
         * Resume playback: start a new word timer via word_update(), clear the
         * `paused` flag and remove the `paused` class from the #spritz_pause
         * button. It does not cancel an existing timer; callers do that first.
         */
        function spritz_play() {
          word_update();
          paused = false;
          $('#spritz_pause').removeClass('paused');
        }
        /**
         * Toggle playback: pause when currently playing, play when currently
         * paused (as indicated by the global `paused` flag).
         */
        function spritz_flip() {
          if (!paused) {
            spritz_pause();
            return;
          }
          spritz_play();
        }

        /* INITIATE: parse the initial textarea content, show the first word
           immediately, then start the timer for the following words. */
        words_set();
        word_show(0);
        word_update();

        /* CHANGE SPEED: recompute the per-word delay when the wpm input changes
           and, if playing, restart the timer so the new speed applies at once. */
        $('#spritz_wpm').on('input', function() {
          var wpm = parseFloat($('#spritz_wpm').val());
          // While the user is typing the field can be empty, 0 or negative.
          // 60000 / that would give Infinity/NaN/negative delays, which timers
          // treat as "as fast as possible", so keep the previous speed instead.
          if (!isFinite(wpm) || wpm <= 0) {
            return;
          }
          interval = 60000 / wpm;
          if (!paused) {
            clearInterval(spritz);
            word_update();
          }
        });

        /* REFRESH TEXT: on click of #spritz_change, stop the current timer,
           re-parse the textarea content and play again from the first word. */
        $('#spritz_change').on('click', function() {
          clearInterval(spritz);
          words_set();
          i = 0;
          spritz_play();
        });

        /* PAUSE BUTTON AND SPACE BAR */
        // Clicking the button toggles playback. Returning false stops the
        // event's default action and propagation.
        $('#spritz_pause').on('click', function() {
          spritz_flip();
          return false;
        });
        // The space bar toggles playback, except when it is released inside a
        // form control. Otherwise typing a space in the textarea or the wpm
        // field would toggle playback, and pressing space on the focused
        // button would toggle twice (once via its click, once here), cancelling
        // itself out.
        $(document).on('keyup', function(e) {
          if (e.which !== 32 || $(e.target).is('input, textarea, select, button')) {
            return;
          }
          spritz_flip();
        });


        /* LIGHT/DARK THEME: clicking a .light element toggles the `night` class
           on <html>. NOTE(review): the HTML shown alongside this script has no
           element with class "light" - confirm where the toggle lives. */
        $('.light').on('click', function() {
          $('html').toggleClass('night');
          return false;
        });
     /* Reader frame: top and bottom rules around the word. position: relative
        makes it the containing block for the absolutely positioned markers. */
     .spritz {
          position: relative;
          border-top: 2px solid #000;
          border-bottom: 2px solid #000;
          padding: 2rem 0 1.2rem;

        }

        /* Small marker bars at 40% from the left, the same split used for the
           first .spritz-word cell. NOTE(review): no top/bottom offsets are set,
           so both pseudo-elements use their static positions - confirm whether
           `top: 0` / `bottom: 0` were intended. */
        .spritz:before, .spritz:after {
          content: "";
          position: absolute;
          left: 40%;
          height: 0.2rem;
          width: 10px;
          margin-left: -1px;
          background-color: #000;
        }

        /* Typography of the displayed word. */
        .spritz-word {
          font-size: 5rem;
          line-height: 5rem;
          height: 5.5rem;
          font-weight: 600;
        }

        /* The three <div> cells written by the script sit side by side. */
        .spritz-word div {
          display: table-cell;
        }
        /* Leading part of the word: right-aligned in the left 40%. */
        .spritz-word div:first-child {
          width: 40%;
          text-align: right;
        }
        /* Focus letter: highlighted in red. */
        .spritz-word div:nth-child(2) {
          color: #e60000;
        }
        /* Lighter red for the focus letter when the `night` class is active. */
        .night .spritz-word div:nth-child(2) {
          color: #ff4d4d;
        }
        /* Trailing part of the word: left-aligned in the remaining 60%. */
        .spritz-word div:last-child {
          width: 60%;
          text-align: left;
        }
        /* Rows are flex containers with centred content and vertical spacing. */
        .row{
          margin: 5rem auto;
          justify-content: center;
          display: flex;
        }
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<!DOCTYPE html>
      <html>
      <head>
        <meta charset="utf-8">
        <!-- A document title must not be empty; reuse the page heading. -->
        <title>Ejemplo Lector Como Spritz</title>
        <link rel="stylesheet" href="css/bootstrap.css">
      </head>

      <body>
        <h1 class="text-center">Ejemplo Lector Como Spritz</h1>

        <!-- Reader display: the script renders the current word into .spritz-word -->
        <div class="row">
          <div class="spritz col-md-3">
            <div class="spritz-word"></div>
          </div>
        </div>

        <!-- Controls: file picker, words per minute, pause/play and refresh -->
        <div class="row">
          <div class="settings col-md-4">
            <input type="file" id="file-input" class="form-control" />
            <input class="form-control" id="spritz_wpm" type="number" value="300" step="50" min="50"/>
            <button class="btn btn-danger pause" id="spritz_pause"><i class="glyphicon glyphicon-play"></i></button>
            <button class="btn btn-danger pause" id="spritz_change"><i class="glyphicon glyphicon-refresh"></i></button>

          </div>
        </div>
        <!-- The stray closing </div> that followed here had no matching opening tag and was removed. -->

        <!-- Source text that the reader parses; file contents are loaded into it -->
        <div class="container-fluid"><textarea class="demo-text form-control" id="contenido-archivo" rows="15">_</textarea></div>

      </body>
      </html>
    
asked by Luis Fernando 16.08.2017 в 00:27
source

1 answer

2

With PDF.js it is possible, and by using a Blob you can create a file and offer it for download through a link with the download attribute.

Code ES8

/**
 * Read a File/Blob and resolve with its content as a binary string (one
 * character per byte), obtained by decoding the base64 payload of a data URL.
 *
 * @param {Blob} file - The file chosen by the user.
 * @returns {Promise<string>} Binary string with the file's bytes; '' for an
 *   empty file.
 *   Rejects with the reader's error when the file cannot be read, or with the
 *   decoding error when the payload is not valid base64.
 */
const fileToBinary = file => new Promise((resolve, reject) => {
  const reader = new FileReader()
  reader.onload = () => {
    try {
      // result looks like "data:<mime>;base64,<payload>"; an empty file may
      // yield no payload at all, which is treated as empty content.
      const b64 = reader.result.split(',')[1] || ''
      resolve(atob(b64))
    } catch (err) {
      // An exception thrown inside an event handler would otherwise leave the
      // promise pending forever.
      reject(err)
    }
  }
  // Without this, a failed read would never settle the promise.
  reader.onerror = () => reject(reader.error)
  reader.readAsDataURL(file)
})

// NOTE(review): presumably makes PDF.js do its parsing on the main thread
// instead of a web worker - confirm against the PDF.js version in use.
PDFJS.disableWorker = true

/**
 * Open a PDF with PDF.js.
 *
 * @param {*} source - Either a URL (when `isURL` is truthy) or the raw
 *   document data.
 * @param {boolean} [isURL] - Truthy to pass `source` through as is; otherwise
 *   it is wrapped as `{ data: source }`.
 * @returns {*} Whatever `PDFJS.getDocument` returns for that argument.
 */
const getDocument = (source, isURL) =>
  PDFJS.getDocument(isURL ? source : { data: source })

/**
 * Load every page of a document.
 *
 * Page requests are independent of each other, so they are all issued up
 * front and awaited together instead of one after another.
 *
 * @param {{numPages: number, getPage: function(number): Promise}} doc -
 *   Document exposing `numPages` and a 1-based `getPage(n)`.
 * @returns {Promise<Array>} Pages in document order (page 1 first); an empty
 *   array when the document has no pages.
 */
const getPages = async (doc) =>
  Promise.all(
    Array.from({ length: doc.numPages }, (_, index) => doc.getPage(index + 1))
  )

/**
 * Extract the text of every page of a PDF.
 *
 * Each page's text items are concatenated in order. Joining the `str` fields
 * (rather than reducing without an initial value) also handles pages with no
 * text items, with a single item, or whose first item is an empty string.
 *
 * @param {*} source - PDF data, or a URL when `isURL` is true.
 * @param {boolean} [isURL=false] - Forwarded to getDocument() to say whether
 *   `source` is a URL.
 * @returns {Promise<string[]>} One string per page, in page order.
 */
const convertToText = async (source, isURL = false) => {
  const doc = await getDocument(source, isURL)
  const pages = await getPages(doc)
  return Promise.all(pages.map(async (page) => {
    const content = await page.getTextContent()
    return content.items.map(item => item.str).join('')
  }))
}

/**
 * Convert an uploaded PDF to plain text and expose it through the #download
 * link as a downloadable text Blob.
 *
 * @param {Blob} pdf - The PDF file chosen by the user.
 * @returns {Promise<void>} Settles once the link points at the new text file.
 */
async function processPDF(pdf) {
  const binary = await fileToBinary(pdf)
  const contents = await convertToText(binary, false)
  const text = contents.join('\n')
  const download = document.getElementById('download')
  // Release the object URL from a previous conversion so repeated uploads do
  // not keep every generated text file alive.
  if (download.href && download.href.startsWith('blob:')) {
    URL.revokeObjectURL(download.href)
  }
  // The original MIME type had a trailing space ('text/plain '); also declare
  // the encoding, since string Blob parts are stored as UTF-8.
  const blob = new Blob([text], { type: 'text/plain;charset=utf-8' })
  download.href = URL.createObjectURL(blob)
  // Reveal the link only once it points at the new file.
  download.classList.remove('hidden')
}

The previous code allows the user to upload a PDF and then it is converted to base64 to finally convert it to a binary form so that it can be processed by PDF.js. Finally, create a Blob from the text and build a URL for download.

Demo

You can see a functional example in this Pen .

P.S.

If you want to use the code in production, first transpile it to ES5 with Babel .

    
answered by 06.09.2017 / 00:01
source