Console: Convert an image-only PDF document to a searchable PDF document.

Post by **Alex** » Fri Aug 18, 2017 11:49 am

To convert an image-only PDF document to a searchable PDF document, perform the following steps:

Render each image-only PDF page at a resolution of 300 dpi or higher.
Prepare the rendered image for text recognition if necessary. For example, you can remove noise from the image.
Recognize the text in the image.
Filter the recognition result if necessary.
Add the recognized text to the PDF page together with the image (in the example below, the image is placed over the text).

Here is a C# example that shows how to convert an image-only PDF document to a searchable PDF document:

using System;
using System.Collections.Generic;

namespace ConsoleApplication1
{
    /// <summary>
    /// Console sample that converts an image-only PDF document to a searchable PDF document:
    /// every page image is recognized with the Tesseract OCR engine, low confidence words
    /// are removed from the result, and the image plus recognized text are added
    /// to a new PDF document.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: converts "imageOnlyPdfDocument.pdf" to "searchablePdfDocument.pdf"
        /// using the English OCR language.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            // convert an image-only PDF document to a searchable PDF document
            ConvertImageOnlyPdfToSearchablePdf(
                "imageOnlyPdfDocument.pdf",
                Vintasoft.Imaging.Ocr.OcrLanguage.English,
                "searchablePdfDocument.pdf");
        }

        /// <summary>
        /// Converts an image-only PDF document to a searchable PDF document.
        /// </summary>
        /// <param name="imageOnlyPdfFilename">A filename of source image-only PDF file.</param>
        /// <param name="ocrLanguage">OCR language.</param>
        /// <param name="searchablePdfFilename">A filename of destination searchable PDF file.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown if <paramref name="imageOnlyPdfFilename"/> or
        /// <paramref name="searchablePdfFilename"/> is <c>null</c>.
        /// </exception>
        public static void ConvertImageOnlyPdfToSearchablePdf(
            string imageOnlyPdfFilename,
            Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
            string searchablePdfFilename)
        {
            // fail early, before any image collection, document or OCR engine is created
            if (imageOnlyPdfFilename == null)
            {
                throw new ArgumentNullException("imageOnlyPdfFilename");
            }
            if (searchablePdfFilename == null)
            {
                throw new ArgumentNullException("searchablePdfFilename");
            }

            // create an image collection
            using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
            {
                try
                {
                    // add pages from image-only PDF document into image collection
                    images.Add(imageOnlyPdfFilename);

                    // create a searchable PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument document =
                        new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                    {
                        Console.WriteLine("Create OCR engine...");
                        // create the Tesseract OCR engine
                        // NOTE(review): the relative path is presumably the directory with
                        // the Tesseract OCR files - confirm it matches the deployment layout
                        using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(@"..\..\TesseractOCR\"))
                        {
                            Console.WriteLine("Initialize OCR engine...");
                            // init the Tesseract OCR engine
                            tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                            try
                            {
                                // recognize each image and add it as a page of PDF document
                                AddRecognizedPages(images, tesseractOcr, document);
                            }
                            finally
                            {
                                // shutdown OCR engine, even if recognition of a page failed
                                tesseractOcr.Shutdown();
                            }

                            Console.WriteLine("Save changes in PDF document...");
                            // save changes in PDF document
                            document.SaveChanges();
                        }
                    }
                }
                finally
                {
                    // clear and dispose images in image collection,
                    // even if the conversion failed
                    images.ClearAndDisposeItems();
                }
            }
        }

        /// <summary>
        /// Recognizes text in each image of image collection and adds the image
        /// with recognized text as a new page of PDF document.
        /// </summary>
        /// <param name="images">Images to recognize.</param>
        /// <param name="tesseractOcr">Initialized Tesseract OCR engine.</param>
        /// <param name="document">Destination PDF document.</param>
        static void AddRecognizedPages(
            Vintasoft.Imaging.ImageCollection images,
            Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr,
            Vintasoft.Imaging.Pdf.PdfDocument document)
        {
            // create a PDF document builder
            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
            // specify that the best image compression must be calculated automatically
            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
            // specify that image must be placed over text
            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

            // for each image in image collection
            foreach (Vintasoft.Imaging.VintasoftImage image in images)
            {
                Console.WriteLine("Recognize text in image...");
                // recognize text on image
                Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.Recognize(image);

                // remove low confidence words from OCR result
                RemoveLowConfidenceWords(ocrPage);

                Console.WriteLine("Add page to a PDF document...");
                // add recognized OCR page to the PDF document
                documentBuilder.AddPage(image, ocrPage);
            }
        }

        /// <summary>
        /// Preprocess an image before text recognition.
        /// </summary>
        /// <remarks>
        /// This method is not called by the conversion code in this class; it shows the optional
        /// preprocessing step. NOTE(review): the command is executed in place, so the image
        /// passed in is presumably modified - confirm this is acceptable if the same image
        /// is also added to the PDF document.
        /// </remarks>
        /// <param name="image">Image to preprocess.</param>
        static void PreprocessImageBeforeOcr(Vintasoft.Imaging.VintasoftImage image)
        {
            // remove noise from image
            Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand =
                new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
            despeckleCommand.ExecuteInPlace(image);
        }

        /// <summary>
        /// Removes low confidence words from OCR page.
        /// </summary>
        /// <param name="ocrPage">OCR page.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown if <paramref name="ocrPage"/> is <c>null</c>.
        /// </exception>
        static void RemoveLowConfidenceWords(Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage)
        {
            if (ocrPage == null)
            {
                throw new ArgumentNullException("ocrPage");
            }

            // minimum confidence: words with lower confidence are removed
            // NOTE(review): presumably confidence is measured on a 0..100 scale - confirm
            const float MinConfidence = 75.0f;

            // get all words in recognized text
            Vintasoft.Imaging.Ocr.Results.OcrObject[] ocrObjects =
                ocrPage.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word);
            // create list of words to remove
            List<Vintasoft.Imaging.Ocr.Results.OcrObject> removeObjects =
                new List<Vintasoft.Imaging.Ocr.Results.OcrObject>();
            // for each word
            foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrObjects)
            {
                // if word confidence is less than minimum confidence
                if (word.Confidence < MinConfidence)
                {
                    // add word to a list of words to remove
                    removeObjects.Add(word);
                }
            }

            // remove words with low confidence and validate recognition results
            Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(ocrPage);
            editor.RemoveObjects(removeObjects.ToArray());
            editor.ValidateResults();
        }

    }
}

The source code of the console application for VintaSoft Imaging .NET SDK 14.0 can be downloaded from here.