VintaSoft Imaging .NET SDK 12.4: Documentation for .NET developer
In This Topic
    OCR: How to convert an image-only PDF document to a searchable PDF document?
    In This Topic
    Here is C#/VB.NET code that shows how to convert an image-only PDF document to a searchable PDF document:
    /// <summary>
    /// Converts an image-only PDF document to a searchable PDF document:
    /// every page of the source file is recognized with the Tesseract OCR engine
    /// and added, together with the recognized text, to a new PDF document.
    /// </summary>
    /// <param name="ocrLanguage">The language used to initialize the OCR engine.</param>
    /// <param name="imageOnlyPdfFilename">The filename of the source image-only PDF file.</param>
    /// <param name="ocrResolution">The resolution intended for OCR of <paramref name="imageOnlyPdfFilename"/>.
    /// NOTE(review): this value is not used by the method body - confirm whether it should be
    /// applied to the rendering settings of the loaded pages.</param>
    /// <param name="searchablePdfFilename">The filename of the destination searchable PDF file.</param>
    public static void ConvertImageOnlyPdfToSearchablePdf(
        Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
        string imageOnlyPdfFilename,
        Vintasoft.Imaging.Resolution ocrResolution,
        string searchablePdfFilename)
    {
        // create an image collection
        using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
        {
            try
            {
                // add pages from image-only PDF document into image collection
                images.Add(imageOnlyPdfFilename);

                // create a searchable PDF document
                using (Vintasoft.Imaging.Pdf.PdfDocument document =
                    new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    System.Console.WriteLine("Create OCR engine...");
                    // create the Tesseract OCR engine
                    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                    {
                        System.Console.WriteLine("Initialize OCR engine...");
                        // init the Tesseract OCR engine
                        tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                        try
                        {
                            // create a PDF document builder
                            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                                new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
                            // specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                            // specify that the image must be placed over the text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                            // for each image in image collection
                            foreach (Vintasoft.Imaging.VintasoftImage image in images)
                            {
                                System.Console.WriteLine("Recognize text in image...");
                                // recognize text on image
                                Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                                System.Console.WriteLine("Add page to a PDF document...");
                                // add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, page);
                            }
                        }
                        finally
                        {
                            // shutdown the initialized OCR engine, even if recognition
                            // or page creation failed
                            tesseractOcr.Shutdown();
                        }

                        System.Console.WriteLine("Save changes in PDF document...");
                        // save changes in PDF document
                        document.SaveChanges();
                    }
                }
            }
            finally
            {
                // clear and dispose images in image collection,
                // even if the conversion failed part-way through
                images.ClearAndDisposeItems();
            }
        }
    }
    
    ''' <summary>
    ''' Converts an image-only PDF document to a searchable PDF document:
    ''' every page of the source file is recognized with the Tesseract OCR engine
    ''' and added, together with the recognized text, to a new PDF document.
    ''' </summary>
    ''' <param name="ocrLanguage">The language used to initialize the OCR engine.</param>
    ''' <param name="imageOnlyPdfFilename">The filename of the source image-only PDF file.</param>
    ''' <param name="ocrResolution">The resolution intended for OCR of <paramref name="imageOnlyPdfFilename"/>.
    ''' NOTE(review): this value is not used by the method body - confirm whether it should be
    ''' applied to the rendering settings of the loaded pages.</param>
    ''' <param name="searchablePdfFilename">The filename of the destination searchable PDF file.</param>
    Public Shared Sub ConvertImageOnlyPdfToSearchablePdf(ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, imageOnlyPdfFilename As String, ocrResolution As Vintasoft.Imaging.Resolution, searchablePdfFilename As String)
        ' create an image collection
        Using images As New Vintasoft.Imaging.ImageCollection()
            Try
                ' add pages from image-only PDF document into image collection
                images.Add(imageOnlyPdfFilename)

                ' create a searchable PDF document
                Using document As New Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                    System.Console.WriteLine("Create OCR engine...")
                    ' create the Tesseract OCR engine
                    Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                        System.Console.WriteLine("Initialize OCR engine...")
                        ' init the Tesseract OCR engine
                        tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage))

                        Try
                            ' create a PDF document builder
                            Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document)
                            ' specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                            ' specify that the image must be placed over the text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                            ' for each image in image collection
                            For Each image As Vintasoft.Imaging.VintasoftImage In images
                                System.Console.WriteLine("Recognize text in image...")
                                ' recognize text on image
                                Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                                System.Console.WriteLine("Add page to a PDF document...")
                                ' add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, page)
                            Next
                        Finally
                            ' shutdown the initialized OCR engine, even if recognition
                            ' or page creation failed
                            tesseractOcr.Shutdown()
                        End Try

                        System.Console.WriteLine("Save changes in PDF document...")
                        ' save changes in PDF document
                        document.SaveChanges()
                    End Using
                End Using
            Finally
                ' clear and dispose images in image collection,
                ' even if the conversion failed part-way through
                images.ClearAndDisposeItems()
            End Try
        End Using
    End Sub