VintaSoft Imaging .NET SDK 12.4: Documentation for .NET developer
In This Topic
    OCR: Save the OCR result to a searchable PDF document
    In This Topic
    The OCR result can be saved to a searchable PDF document in one of 3 modes:


    In "Text over image" mode:
    In "Image over text" mode:
    In "Text only" mode:

    The SearchablePdfGenerator class is a processing command that allows you to add recognized pages to a searchable PDF document.

    Here is C#/VB.NET code that shows how to convert an image file to a searchable PDF document:
    /// <summary>
    /// Recognizes text in an image file with the Tesseract OCR engine and
    /// writes the result to a newly created searchable PDF document.
    /// </summary>
    /// <param name="sourceFilePath">Path of the source image file; it is added to the image collection that is passed to the PDF generator.</param>
    /// <param name="pageCreationMode">The mode in which the PDF pages are created.</param>
    /// <param name="ocrLanguage">The OCR language used to construct the Tesseract OCR settings.</param>
    /// <param name="pdfFilename">Path of the destination PDF file; the file is opened with <see cref="System.IO.FileMode.Create"/>.</param>
    public static void ConvertImagesToSearchablePdf(
        string sourceFilePath,
        Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode pageCreationMode,
        Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
        string pdfFilename)
    {
        // the two page creation modes that need extra configuration below
        bool isTextOverImageMode =
            pageCreationMode == Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.TextOverImage;
        bool isTextOnlyMode =
            pageCreationMode == Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.Text;

        // collection that holds the source image(s)
        using (Vintasoft.Imaging.ImageCollection sourceImages = new Vintasoft.Imaging.ImageCollection())
        {
            // load the source file into the collection
            sourceImages.Add(sourceFilePath);

            try
            {
                // Tesseract OCR engine, disposed when recognition is finished
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr ocrEngine =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    // manager that drives the OCR engine
                    Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                        new Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine);

                    // generator that turns recognized images into PDF pages
                    Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator generator =
                        new Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator(engineManager);
                    generator.SourceImages = sourceImages;
                    generator.PageCreationMode = pageCreationMode;

                    // OCR settings: requested language, page segmentation with orientation detection
                    Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings ocrSettings =
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(ocrLanguage);
                    ocrSettings.RecognitionRegionType =
                        Vintasoft.Imaging.Ocr.RecognitionRegionType.RecognizePageWithPageSegmentationAndOrientationDetection;
                    // symbol regions correction is enabled only in "TextOverImage" mode
                    ocrSettings.UseSymbolRegionsCorrection = isTextOverImageMode;
                    generator.OcrEngineSettings = ocrSettings;

                    // "Text" mode additionally needs the text color and the font name
                    if (isTextOnlyMode)
                    {
                        generator.TextColor = System.Drawing.Color.Black;
                        generator.TextOnlyFontName = "Arial";
                    }

                    // Command_PdfPageAdded saves the document periodically while pages are added
                    generator.PdfPageAdded += Command_PdfPageAdded;
                    // PdfGenerator_ImageProcessingStarted preprocesses each image before recognition
                    generator.ImageProcessingStarted += PdfGenerator_ImageProcessingStarted;

                    // create the destination PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument(
                        pdfFilename, System.IO.FileMode.Create, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_16))
                    {
                        // recognize the images and append the resulting pages
                        generator.Execute(pdfDocument);

                        // final save of the PDF document
                        pdfDocument.SaveChanges();
                    }
                }
            }
            finally
            {
                // always remove and dispose the loaded images, even if generation failed
                sourceImages.ClearAndDisposeItems();
            }
        }
    }
    
    /// <summary>
    /// Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.ImageProcessingStarted"/> event:
    /// deskews and auto-orients a copy of the image and substitutes that copy as the image to process.
    /// </summary>
    /// <param name="sender">The event source (not used).</param>
    /// <param name="e">The event data; its <c>Image</c> property is replaced by the preprocessed copy, or set to <c>null</c> if preprocessing fails.</param>
    private static void PdfGenerator_ImageProcessingStarted(object sender, Vintasoft.Imaging.Pdf.Ocr.OcrImageProcessingEventArgs e)
    {
        // preprocessing is applied to a clone, not to e.Image itself
        Vintasoft.Imaging.VintasoftImage preprocessedImage =
            (Vintasoft.Imaging.VintasoftImage)e.Image.Clone();

        try
        {
            // straighten the page
            new Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand()
                .ExecuteInPlace(preprocessedImage);

            // correct the text orientation
            new Vintasoft.Imaging.ImageProcessing.Document.AutoTextOrientationCommand()
                .ExecuteInPlace(preprocessedImage);

            // hand the preprocessed copy over for processing
            // (per the original sample note, the processing command disposes it after use)
            e.Image = preprocessedImage;
        }
        catch
        {
            // preprocessing failed: release the copy ...
            preprocessedImage.Dispose();

            // ... and skip this image
            e.Image = null;
        }
    }
    
    /// <summary>
    /// Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.PdfPageAdded"/> event:
    /// saves the PDF document each time its page count reaches a multiple of 10.
    /// </summary>
    /// <param name="sender">The event source (not used).</param>
    /// <param name="e">The event data that provides the PDF document being generated.</param>
    private static void Command_PdfPageAdded(object sender, Vintasoft.Imaging.Pdf.Ocr.PdfPageAddedEventArgs e)
    {
        // number of pages between intermediate saves
        const int pagesPerSave = 10;

        bool isSavePoint = (e.Document.Pages.Count % pagesPerSave) == 0;
        if (isSavePoint)
        {
            e.Document.SaveChanges();
        }
    }
    
    ''' <summary>
    ''' Recognizes text in an image file with the Tesseract OCR engine and
    ''' writes the result to a newly created searchable PDF document.
    ''' </summary>
    ''' <param name="sourceFilePath">Path of the source image file; it is added to the image collection that is passed to the PDF generator.</param>
    ''' <param name="pageCreationMode">The mode in which the PDF pages are created.</param>
    ''' <param name="ocrLanguage">The OCR language used to construct the Tesseract OCR settings.</param>
    ''' <param name="pdfFilename">Path of the destination PDF file; the file is opened with <see cref="System.IO.FileMode.Create"/>.</param>
    Public Shared Sub ConvertImagesToSearchablePdf(sourceFilePath As String, pageCreationMode As Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode, ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, pdfFilename As String)
        ' the two page creation modes that need extra configuration below
        Dim isTextOverImageMode As Boolean = (pageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.TextOverImage)
        Dim isTextOnlyMode As Boolean = (pageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.Text)

        ' collection that holds the source image(s)
        Using sourceImages As New Vintasoft.Imaging.ImageCollection()
            ' load the source file into the collection
            sourceImages.Add(sourceFilePath)

            Try
                ' Tesseract OCR engine, disposed when recognition is finished
                Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    ' manager that drives the OCR engine
                    Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine)

                    ' generator that turns recognized images into PDF pages
                    Dim generator As New Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator(engineManager)
                    generator.SourceImages = sourceImages
                    generator.PageCreationMode = pageCreationMode

                    ' OCR settings: requested language, page segmentation with orientation detection
                    Dim ocrSettings As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(ocrLanguage)
                    ocrSettings.RecognitionRegionType = Vintasoft.Imaging.Ocr.RecognitionRegionType.RecognizePageWithPageSegmentationAndOrientationDetection
                    ' symbol regions correction is enabled only in "TextOverImage" mode
                    ocrSettings.UseSymbolRegionsCorrection = isTextOverImageMode
                    generator.OcrEngineSettings = ocrSettings

                    ' "Text" mode additionally needs the text color and the font name
                    If isTextOnlyMode Then
                        generator.TextColor = System.Drawing.Color.Black
                        generator.TextOnlyFontName = "Arial"
                    End If

                    ' Command_PdfPageAdded saves the document periodically while pages are added
                    AddHandler generator.PdfPageAdded, AddressOf Command_PdfPageAdded
                    ' PdfGenerator_ImageProcessingStarted preprocesses each image before recognition
                    AddHandler generator.ImageProcessingStarted, AddressOf PdfGenerator_ImageProcessingStarted

                    ' create the destination PDF document
                    Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, System.IO.FileMode.Create, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_16)
                        ' recognize the images and append the resulting pages
                        generator.Execute(pdfDocument)

                        ' final save of the PDF document
                        pdfDocument.SaveChanges()
                    End Using
                End Using
            Finally
                ' always remove and dispose the loaded images, even if generation failed
                sourceImages.ClearAndDisposeItems()
            End Try
        End Using
    End Sub
    
    ''' <summary>
    ''' Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.ImageProcessingStarted"/> event:
    ''' deskews and auto-orients a copy of the image and substitutes that copy as the image to process.
    ''' </summary>
    ''' <param name="sender">The event source (not used).</param>
    ''' <param name="e">The event data; its <c>Image</c> property is replaced by the preprocessed copy, or set to <c>Nothing</c> if preprocessing fails.</param>
    Private Shared Sub PdfGenerator_ImageProcessingStarted(sender As Object, e As Vintasoft.Imaging.Pdf.Ocr.OcrImageProcessingEventArgs)
        ' preprocessing is applied to a clone, not to e.Image itself
        Dim preprocessedImage As Vintasoft.Imaging.VintasoftImage = DirectCast(e.Image.Clone(), Vintasoft.Imaging.VintasoftImage)

        Try
            ' straighten the page
            Call New Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand().ExecuteInPlace(preprocessedImage)

            ' correct the text orientation
            Call New Vintasoft.Imaging.ImageProcessing.Document.AutoTextOrientationCommand().ExecuteInPlace(preprocessedImage)

            ' hand the preprocessed copy over for processing
            ' (per the original sample note, the processing command disposes it after use)
            e.Image = preprocessedImage
        Catch
            ' preprocessing failed: release the copy ...
            preprocessedImage.Dispose()

            ' ... and skip this image
            e.Image = Nothing
        End Try
    End Sub
    
    ''' <summary>
    ''' Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.PdfPageAdded"/> event:
    ''' saves the PDF document each time its page count reaches a multiple of 10.
    ''' </summary>
    ''' <param name="sender">The event source (not used).</param>
    ''' <param name="e">The event data that provides the PDF document being generated.</param>
    Private Shared Sub Command_PdfPageAdded(sender As Object, e As Vintasoft.Imaging.Pdf.Ocr.PdfPageAddedEventArgs)
        ' number of pages between intermediate saves
        Const pagesPerSave As Integer = 10

        Dim isSavePoint As Boolean = ((e.Document.Pages.Count Mod pagesPerSave) = 0)
        If isSavePoint Then
            e.Document.SaveChanges()
        End If
    End Sub