Console: Convert an image to a searchable PDF document.

Posted: Tue Sep 18, 2018 8:34 am
by Alex
Here is an example that shows how to convert an image to a searchable PDF document:

Code: Select all

namespace ConsoleApp1
{
    /// <summary>
    /// Console sample: recognizes text in an image with the Tesseract OCR engine
    /// and saves the image together with the recognized text as a searchable PDF.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "OCR.tif", cleans it up, runs OCR on it and writes the result to "OCR.pdf".
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string imageFilePath = "OCR.tif";

            string tesseractOcrPath = @"..\..\TesseractOCR";
            // create the OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(tesseractOcrPath))
            {
                // specify that OCR engine will recognize English text
                Vintasoft.Imaging.Ocr.OcrLanguage language = Vintasoft.Imaging.Ocr.OcrLanguage.English;
                // create the OCR engine settings
                Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings settings = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(language);
                // initialize the OCR engine
                tesseractOcr.Init(settings);

                // load an image with text
                using (Vintasoft.Imaging.VintasoftImage image = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
                {
                    // preprocess image before text recognition

                    // remove noise from image
                    Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand = new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
                    despeckleCommand.ExecuteInPlace(image);
                    // remove lines from image
                    Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand lineRemovalCommand = new Vintasoft.Imaging.ImageProcessing.Document.LineRemovalCommand();
                    lineRemovalCommand.ExecuteInPlace(image);

                    // specify the image, where text must be recognized
                    tesseractOcr.SetImage(image);

                    // recognize text in the image
                    Vintasoft.Imaging.Ocr.Results.OcrPage ocrResult = tesseractOcr.Recognize();

                    // create PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument("OCR.pdf", Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                    {
                        // create PDF document builder
                        Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                            new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                        // ImageOverText: the page image is placed over the recognized text layer
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                        // add OCR result to the PDF document
                        documentBuilder.AddPage(image, ocrResult);

                        // save changes in PDF document
                        pdfDocument.SaveChanges();
                    }

                    // clear the image (release the engine's reference before the image is disposed)
                    tesseractOcr.ClearImage();
                }

                // shutdown the OCR engine
                tesseractOcr.Shutdown();
            }
        }
    }
}
The source code of this console application for VintaSoft Imaging .NET SDK 12 can be downloaded from here.