Convert a pdf to tiff using tesseract

2

I need to convert a PDF file to tiff using the library Tesseract.js

The application will be in HTML and JavaScript. What I intend is to choose the PDF and, at the same time, convert it to TIFF. What I have for now is the following:

<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Document</title>
  </head>
  <body>
    <!-- Tesseract.js 1.0.10, loaded from the CDN. -->
    <script src="https://cdn.rawgit.com/naptha/tesseract.js/1.0.10/dist/tesseract.js"></script>

    <!-- The label is tied to the file input through for/id, so clicking it opens the file picker. -->
    <label for="fileInput">
      <div>Click this div and select a file</div>
    </label>
    <input type="file" id="fileInput">
  </body>
</html>

That code allows me to select a file from the PC. I found a video, "Using Tesseract-OCR to extract text from images", in which this is done from the console. Can the same be done from a browser, using the code I show above?

    
asked by Pedro Ávila 15.12.2017 в 00:08
source

1 answer

0

The first thing that needs to be clarified is that Tesseract.js is not able to read PDF files, only images.

Starting from that base, you need to add the action of converting the file that is selected in the input. This is done by adding an onchange event handler, which can be done in several ways:

  • In the same input element:
  • <input type="file" id="fileInput" onchange="procesarFichero()"/>
  • Adding the handler directly to the input element, right after it has been rendered on the page:
  • <input type="file" id="fileInput">
    <script>
    	// Look the input up once, then register the change handler on it.
    	// procesarFichero must already be defined when this script runs.
    	var fileInput = document.getElementById('fileInput');
    	fileInput.addEventListener('change', procesarFichero);
    </script>

    Registering the handler is necessary, but what really matters is the function that handles the event, which should look like this:

    /**
     * Change handler for the file input: runs Tesseract on the selected file.
     *
     * Looks up #fileInput and, when exactly one file is selected, passes it to
     * Tesseract.recognize. Progress messages and the final result are written
     * to the console. Does nothing when the input is missing or the selection
     * is not exactly one file.
     */
    function procesarFichero(){
    	var input = document.getElementById('fileInput');

    	// Guard clause: nothing to do without exactly one selected file.
    	if(input == null || input.files == null || input.files.length != 1){
    		return;
    	}

    	// The File object also exposes name, size and type if they are needed.
    	var selected = input.files.item(0);

    	function onProgress(message){
    		console.log('Progress is: ', message)
    	}

    	function onDone(data){
    		console.log(data);
    	}

    	Tesseract.recognize(selected).progress(onProgress).then(onDone);
    }

    This way you will be using Tesseract.js correctly. For more information, see the Tesseract.js documentation.

    Applying the changes on the HTML code that you have given, it would stay like this:

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Document</title>
    </head>
    <body>
    <!-- Tesseract.js 1.0.10; it must load before the inline script below uses the Tesseract global. -->
    <script src="https://cdn.rawgit.com/naptha/tesseract.js/1.0.10/dist/tesseract.js"></script>
    <label for="fileInput">
    	<div>Click this div and select a file</div>
    </label>
    <!-- Selecting a file triggers recognition through procesarFichero(). -->
    <input type="file" id="fileInput" onchange="procesarFichero()"/>
    <br />
    <br />
    <span>Text from image:</span>
    <br />
    <!-- Receives the recognized text once the job finishes. -->
    <textarea id="remoseTextarea" rows="30" cols="50"></textarea>
    <script>
    	// True while a recognition job is running; the controls are disabled for that time.
    	var isWorking = false;

    	/**
    	 * Change handler for #fileInput: recognizes the selected file and
    	 * writes the resulting text into #remoseTextarea.
    	 *
    	 * The controls are disabled only once a file is known to be selected,
    	 * and are re-enabled on success and on failure, so an empty selection
    	 * or an unreadable file can never leave the page locked.
    	 */
    	function procesarFichero(){
    		var input = document.getElementById('fileInput');

    		// Validate before touching the UI: a change event without exactly
    		// one file must not disable the controls.
    		if(input == null || input.files == null || input.files.length != 1){
    			return;
    		}
    		var file = input.files.item(0);

    		isWorking = true;
    		checkInputs(isWorking);

    		Tesseract.recognize(file).progress(function(message){
    			console.log('Progress is: ', message)
    		}).then(function(data){
    			console.log(data);
    			document.getElementById('remoseTextarea').value = data.text;
    			isWorking = false;
    			checkInputs(isWorking);
    		}).catch(function(error){
    			// Recognition failed: report it and give the controls back
    			// so the user can pick another file.
    			console.error(error);
    			isWorking = false;
    			checkInputs(isWorking);
    		});
    	}

    	/**
    	 * Enables or disables the file input and the result textarea together.
    	 *
    	 * @param {boolean} disableFields true to disable both controls, false to enable them.
    	 */
    	function checkInputs(disableFields){
    		document.getElementById('fileInput').disabled = disableFields;
    		document.getElementById('remoseTextarea').disabled = disableFields;
    	}
    </script>
    </body>
    </html>
        
    answered by 16.12.2017 / 11:20
    source