Console: Convert an image to a searchable PDF document.
Posted: Tue Sep 18, 2018 8:34 am
Here is an example that shows how to convert an image to a searchable PDF document:
The source code of the console application for VintaSoft Imaging .NET SDK 12 can be downloaded from here.
Code: Select all
namespace ConsoleApp1
{
    /// <summary>
    /// Console sample: runs Tesseract OCR on the image "OCR.tif" and writes the
    /// image together with the recognition result to the PDF document "OCR.pdf".
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Initializes the OCR engine, preprocesses the input image,
        /// recognizes its text and stores the outcome as a PDF page.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string sourceImagePath = "OCR.tif";
            string ocrEngineDirectory = @"..\..\TesseractOCR";

            // the engine is created from the directory that holds the Tesseract OCR files
            using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(ocrEngineDirectory))
            {
                // initialize the engine with settings for English text
                ocrEngine.Init(
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(
                        Vintasoft.Imaging.Ocr.OcrLanguage.English));

                // open the image that contains the text
                using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(sourceImagePath))
                {
                    // clean the image up before recognition
                    CleanUpImage(sourceImage);

                    // hand the image to the engine and run the recognition
                    ocrEngine.SetImage(sourceImage);
                    Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = ocrEngine.Recognize();

                    // store the image and the recognition result as a PDF page
                    WritePdf(sourceImage, recognizedPage);

                    // detach the image from the engine
                    ocrEngine.ClearImage();
                }

                // shut the engine down
                ocrEngine.Shutdown();
            }
        }

        /// <summary>
        /// Preprocesses the image in place: runs the despeckle command first,
        /// then the line removal command.
        /// </summary>
        /// <param name="image">The image to modify.</param>
        private static void CleanUpImage(Vintasoft.Imaging.VintasoftImage image)
        {
            // noise removal
            var noiseRemover = new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
            noiseRemover.ExecuteInPlace(image);

            // line removal
            var lineRemover = new Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand();
            lineRemover.ExecuteInPlace(image);
        }

        /// <summary>
        /// Creates the PDF document "OCR.pdf", adds one page built from the image
        /// and its OCR result, and saves the document.
        /// </summary>
        /// <param name="image">The image to add to the page.</param>
        /// <param name="ocrResult">The OCR result for <paramref name="image"/>.</param>
        private static void WritePdf(
            Vintasoft.Imaging.VintasoftImage image,
            Vintasoft.Imaging.Ocr.Results.OcrPage ocrResult)
        {
            using (var pdf = new Vintasoft.Imaging.Pdf.PdfDocument("OCR.pdf", Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                // configure the builder that turns image + OCR result into a PDF page
                var pageBuilder = new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdf)
                {
                    ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto,
                    PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText
                };

                // append the page and persist the document
                pageBuilder.AddPage(image, ocrResult);
                pdf.SaveChanges();
            }
        }
    }
}