OCR: How to convert an image-only PDF document to a searchable PDF document?

In This Topic

The following C# and VB.NET code shows how to convert an image-only PDF document to a searchable PDF document:

/// <summary>
/// Converts an image-only PDF document to a searchable PDF document.
/// </summary>
/// <remarks>
/// Every page of the source document is loaded into an image collection,
/// recognized with the Tesseract OCR engine and added to a new PDF document
/// (PDF 1.4) in "image over text" mode.
/// </remarks>
/// <param name="ocrLanguage">OCR language.</param>
/// <param name="imageOnlyPdfFilename">A filename of source image-only PDF file.</param>
/// <param name="ocrResolution">The resolution used to render the pages of
/// <paramref name="imageOnlyPdfFilename"/> for OCR. If <c>null</c>, the rendering
/// settings of the image collection are left unchanged.</param>
/// <param name="searchablePdfFilename">A filename of destination searchable PDF file.</param>
public static void ConvertImageOnlyPdfToSearchablePdf(
    Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
    string imageOnlyPdfFilename,
    Vintasoft.Imaging.Resolution ocrResolution,
    string searchablePdfFilename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add pages from image-only PDF document into image collection
            images.Add(imageOnlyPdfFilename);

            // apply the requested OCR resolution to the rendering of the PDF pages;
            // a null resolution keeps the previous behavior (no explicit rendering settings)
            if (ocrResolution != null)
            {
                images.SetRenderingSettings(
                    new Vintasoft.Imaging.Codecs.Decoders.RenderingSettings(ocrResolution));
            }

            // create a searchable PDF document
            using (Vintasoft.Imaging.Pdf.PdfDocument document =
                new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                System.Console.WriteLine("Create OCR engine...");
                // create the Tesseract OCR engine
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    System.Console.WriteLine("Initialize OCR engine...");
                    // init the Tesseract OCR engine
                    tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                    try
                    {
                        // create a PDF document builder
                        Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                            new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
                        // specify that the best image compression must be calculated automatically
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                        // specify that the image must be placed over the text
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                        // for each image in image collection
                        foreach (Vintasoft.Imaging.VintasoftImage image in images)
                        {
                            System.Console.WriteLine("Recognize text in image...");
                            // recognize text on image
                            Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                            System.Console.WriteLine("Add page to a PDF document...");
                            // add recognized OCR page to the PDF document
                            documentBuilder.AddPage(image, page);
                        }
                    }
                    finally
                    {
                        // shutdown OCR engine, even if recognition of a page failed,
                        // so that every successful Init is paired with a Shutdown
                        tesseractOcr.Shutdown();
                    }

                    System.Console.WriteLine("Save changes in PDF document...");
                    // save changes in PDF document
                    document.SaveChanges();
                }
            }
        }
        finally
        {
            // clear and dispose images in image collection;
            // done in "finally" so the loaded page images are released on failure too
            images.ClearAndDisposeItems();
        }
    }
}

VB.NET

''' <summary>
''' Converts an image-only PDF document to a searchable PDF document.
''' </summary>
''' <remarks>
''' Every page of the source document is loaded into an image collection,
''' recognized with the Tesseract OCR engine and added to a new PDF document
''' (PDF 1.4) in "image over text" mode.
''' </remarks>
''' <param name="ocrLanguage">OCR language.</param>
''' <param name="imageOnlyPdfFilename">A filename of source image-only PDF file.</param>
''' <param name="ocrResolution">The resolution used to render the pages of
''' <paramref name="imageOnlyPdfFilename"/> for OCR. If <c>Nothing</c>, the rendering
''' settings of the image collection are left unchanged.</param>
''' <param name="searchablePdfFilename">A filename of destination searchable PDF file.</param>
Public Shared Sub ConvertImageOnlyPdfToSearchablePdf(ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, imageOnlyPdfFilename As String, ocrResolution As Vintasoft.Imaging.Resolution, searchablePdfFilename As String)
    ' create an image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add pages from image-only PDF document into image collection
            images.Add(imageOnlyPdfFilename)

            ' apply the requested OCR resolution to the rendering of the PDF pages;
            ' a Nothing resolution keeps the previous behavior (no explicit rendering settings)
            If ocrResolution IsNot Nothing Then
                images.SetRenderingSettings(New Vintasoft.Imaging.Codecs.Decoders.RenderingSettings(ocrResolution))
            End If

            ' create a searchable PDF document
            Using document As New Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                System.Console.WriteLine("Create OCR engine...")
                ' create the Tesseract OCR engine
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    System.Console.WriteLine("Initialize OCR engine...")
                    ' init the Tesseract OCR engine
                    tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage))

                    Try
                        ' create a PDF document builder
                        Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document)
                        ' specify that the best image compression must be calculated automatically
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                        ' specify that the image must be placed over the text
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                        ' for each image in image collection
                        For Each image As Vintasoft.Imaging.VintasoftImage In images
                            System.Console.WriteLine("Recognize text in image...")
                            ' recognize text on image
                            Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                            System.Console.WriteLine("Add page to a PDF document...")
                            ' add recognized OCR page to the PDF document
                            documentBuilder.AddPage(image, page)
                        Next
                    Finally
                        ' shutdown OCR engine, even if recognition of a page failed,
                        ' so that every successful Init is paired with a Shutdown
                        tesseractOcr.Shutdown()
                    End Try

                    System.Console.WriteLine("Save changes in PDF document...")
                    ' save changes in PDF document
                    document.SaveChanges()
                End Using
            End Using
        Finally
            ' clear and dispose images in image collection;
            ' done in "Finally" so the loaded page images are released on failure too
            images.ClearAndDisposeItems()
        End Try
    End Using
End Sub

Send Feedback