I am trying to extract the text of a PDF by coordinates in a visual way
public override void BeginGetText(Image page, uint pageNumber)
{
_PageNumber = (int)pageNumber;
using (PdfReader reader = new PdfReader(FileName))
{
iTextSharp.text.Rectangle mediabox = reader.GetPageSize((int)pageNumber);
//iTextSharp.text.Rectangle cropbox = reader.GetCropBox((int)pageNumber);
_FactorX = page.Width / (mediabox.Width + 0.0F);
_FactorY = page.Height / (mediabox.Height + 0.0F);
}
}
That way I get the conversion factor, but I've set it to 72p to be 1 (so that's not it)
public override string GetText(int x, int y, int w, int h)
{
using (PdfReader reader = new PdfReader(FileName))
{
float
fx = x / _FactorX, fy = y / _FactorY,
fw = w / _FactorX, fh = h / _FactorY;
System.util.RectangleJ rect = new System.util.RectangleJ(fx, fy, fw, fh);
ITextExtractionStrategy textExtractionStrategy = new FilteredTextRenderListener(new LocationTextExtractionStrategy(),
new RenderFilter[] { new RegionTextRenderFilter(rect) });
string ret = PdfTextExtractor.GetTextFromPage(reader, _PageNumber, textExtractionStrategy);
return ret;
}
}
and this way I extract it, the problem is that it does not return the text of those positions, in fact if I put the whole page (588x842) if you take it out, but if I put from 0.0 to 588,400 to remove the upper half, I take out the lower one.
Any ideas? I only get it to work if I put it in full page