VintaSoft Imaging .NET SDK 12.0
In This Topic
    OCR: Save the OCR result to a searchable PDF document
    In This Topic
    An OCR result can be saved to a searchable PDF document.
    The searchable PDF document may contain the text located above the image or just text. To create a searchable PDF document from OCR results, use the PdfDocumentBuilder class.

    Here is an example that shows how to save the OCR result to a PDF file that contains both the page image and the recognized text (the sample uses the ImageOverText page creation mode):
    // NOTE: the class name keeps its original spelling so that existing references still resolve.
    class ScanAndConvertToSearchablePdfDcoument
    {
        /// <summary>
        /// Acquires images from the default TWAIN device,
        /// recognizes text on each acquired image and
        /// saves the result as a searchable PDF document.
        /// </summary>
        /// <param name="language">Language used to create the OCR engine settings.</param>
        /// <param name="outputPdfFilename">File name passed to the PDF document constructor.</param>
        /// <remarks>
        /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
        /// and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
        /// for executing this sample.
        /// </remarks>
        public static void ScanImagesAndSaveAsSearchablePdfDocument(
            Vintasoft.Imaging.Ocr.OcrLanguage language,
            string outputPdfFilename)
        {
            System.Console.WriteLine("Create TWAIN device manager...");
            using (Vintasoft.Twain.DeviceManager deviceManager = 
                new Vintasoft.Twain.DeviceManager())
            {
                // create PDF document
                using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = 
                    new Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    // create PDF document builder that adds pages to the PDF document
                    Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder = 
                        new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;
    
                    System.Console.WriteLine("Create Tesseract OCR engine...");
                    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = 
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                    {
                        // create OCR engine manager
                        Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                            new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr);
    
                        // create OCR settings for the requested language
                        Vintasoft.Imaging.Ocr.OcrEngineSettings settings = 
                            new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);
    
                        System.Console.WriteLine("Open TWAIN device manager...");
                        deviceManager.Open();
    
                        // use the default device of the device manager
                        Vintasoft.Twain.Device device = deviceManager.DefaultDevice;
    
                        // acquire images until the modal acquisition reports the None state
                        Vintasoft.Twain.AcquireModalState acquireState;
                        do
                        {
                            System.Console.WriteLine("Acquire image from scanner...");
                            acquireState = device.AcquireModal();
                            if (acquireState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                            {
                                try
                                {
                                    // create VintasoftImage
                                    // NOTE(review): the "true" argument presumably transfers bitmap ownership to the image - confirm
                                    using (Vintasoft.Imaging.VintasoftImage image =
                                        Vintasoft.Imaging.VintasoftImageGdiExtensions.Create(device.AcquiredImage.GetAsBitmap(), true))
                                    {
                                        // preprocess image
                                        // BorderClear, Despeckle, Deskew, Segmentation
                                        System.Console.WriteLine("Preprocess the image...");
                                        Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand preprocessing =
                                            new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                                        preprocessing.Binarization = null;
                                        preprocessing.ExecuteInPlace(image);
    
                                        // recognize image using the text regions found by the preprocessing command
                                        System.Console.WriteLine("Recognize the image...");
                                        Vintasoft.Imaging.Ocr.Results.OcrPage page = engineManager.Recognize(image, settings,
                                            preprocessing.SegmentationTextRegions);
    
                                        // add page to PDF document
                                        System.Console.WriteLine("Add page to PDF document...");
                                        documentBuilder.AddPage(image, page);
                                    }
                                }
                                finally
                                {
                                    // dispose the acquired image even if preprocessing,
                                    // recognition or page creation failed
                                    device.AcquiredImage.Dispose();
                                }
                            }
                        }
                        while (acquireState != Vintasoft.Twain.AcquireModalState.None);
    
                        System.Console.WriteLine("Save changes in PDF document...");
                        pdfDocument.SaveChanges();
                    }
                }
            }
        }
    }
    
    ' NOTE: the class name keeps its original spelling so that existing references still resolve.
    Class ScanAndConvertToSearchablePdfDcoument
        ''' <summary>
        ''' Acquires images from the default TWAIN device,
        ''' recognizes text on each acquired image and
        ''' saves the result as a searchable PDF document.
        ''' </summary>
        ''' <param name="language">Language used to create the OCR engine settings.</param>
        ''' <param name="outputPdfFilename">File name passed to the PDF document constructor.</param>
        ''' <remarks>
        ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
        ''' and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
        ''' for executing this sample.
        ''' </remarks>
        Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
            System.Console.WriteLine("Create TWAIN device manager...")
            Using deviceManager As New Vintasoft.Twain.DeviceManager()
                ' create PDF document
                Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                    ' create PDF document builder that adds pages to the PDF document
                    Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument)
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText
    
                    System.Console.WriteLine("Create Tesseract OCR engine...")
                    Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                        ' create OCR engine manager
                        Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr)
    
                        ' create OCR settings for the requested language
                        Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)
    
                        System.Console.WriteLine("Open TWAIN device manager...")
                        deviceManager.Open()
    
                        ' use the default device of the device manager
                        Dim device As Vintasoft.Twain.Device = deviceManager.DefaultDevice
    
                        ' acquire images until the modal acquisition reports the None state
                        Dim acquireState As Vintasoft.Twain.AcquireModalState
                        Do
                            System.Console.WriteLine("Acquire image from scanner...")
                            acquireState = device.AcquireModal()
                            If acquireState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                                Try
                                    ' create VintasoftImage
                                    ' NOTE(review): the "True" argument presumably transfers bitmap ownership to the image - confirm
                                    Using image As Vintasoft.Imaging.VintasoftImage = Vintasoft.Imaging.VintasoftImageGdiExtensions.Create(device.AcquiredImage.GetAsBitmap(), True)
                                        ' preprocess image
                                        ' BorderClear, Despeckle, Deskew, Segmentation
                                        System.Console.WriteLine("Preprocess the image...")
                                        Dim preprocessing As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                                        preprocessing.Binarization = Nothing
                                        preprocessing.ExecuteInPlace(image)
    
                                        ' recognize image using the text regions found by the preprocessing command
                                        System.Console.WriteLine("Recognize the image...")
                                        Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions)
    
                                        ' add page to PDF document
                                        System.Console.WriteLine("Add page to PDF document...")
                                        documentBuilder.AddPage(image, page)
                                    End Using
                                Finally
                                    ' dispose the acquired image even if preprocessing,
                                    ' recognition or page creation failed
                                    device.AcquiredImage.Dispose()
                                End Try
                            End If
                        Loop While acquireState <> Vintasoft.Twain.AcquireModalState.None
    
                        System.Console.WriteLine("Save changes in PDF document...")
                        pdfDocument.SaveChanges()
                    End Using
                End Using
            End Using
        End Sub
    End Class