Console: Convert an image to a searchable PDF document.

Post by **Alex** » Tue Sep 18, 2018 8:34 am

Here is C# example that shows how to convert an image to a searchable PDF document:


namespace ConsoleApp1
{
    class Program
    {
        static void Main(string[] args)
        {
            string imageFilePath = "OCR.tif";

            string tesseractOcrPath = @"..\..\TesseractOCR";
            // create the OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(tesseractOcrPath))
            {
                // specify that OCR engine will recognize English text
                Vintasoft.Imaging.Ocr.OcrLanguage language = Vintasoft.Imaging.Ocr.OcrLanguage.English;
                // create the OCR engine settings
                Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings settings = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(language);
                // initialize the OCR engine
                tesseractOcr.Init(settings);

                // load an image with text
                using (Vintasoft.Imaging.VintasoftImage image = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
                {
                    // preprocess image before text recognition

                    // remove noise from image
                    Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand = new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
                    despeckleCommand.ExecuteInPlace(image);
                    // remove lines from image
                    Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand lineRemovalCommand = new Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand();
                    lineRemovalCommand.ExecuteInPlace(image);


                    // specify the image, where text must be recognized
                    tesseractOcr.SetImage(image);

                    // recognize text in the image
                    Vintasoft.Imaging.Ocr.Results.OcrPage ocrResult = tesseractOcr.Recognize();

                    // create PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument("OCR.pdf", Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                    {
                        // create PDF document builder
                        Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                            new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                        // add OCR result to the PDF document
                        documentBuilder.AddPage(image, ocrResult);

                        // save changes in PDF document
                        pdfDocument.SaveChanges();
                    }

                    // clear the image
                    tesseractOcr.ClearImage();
                }
                // shutdown the OCR engine
                tesseractOcr.Shutdown();
            }
        }
    }
}

Source codes of console application for VintaSoft Imaging .NET SDK 14.0 can be downloaded from here.