VintaSoft Imaging .NET SDK v8.6
In This Topic
    OCR: How to acquire an image from a scanner and generate a searchable PDF document?
    In This Topic

    Here is an example that shows how to acquire an image from a scanner, run OCR and create a searchable PDF document based on the OCR results:

    ' The project, which uses this code, must have references to the following assemblies:
    ' - Vintasoft.Imaging
    ' - Vintasoft.Imaging.DocCleanup
    ' - Vintasoft.Imaging.Ocr
    ' - Vintasoft.Imaging.Ocr.Tesseract
    ' - Vintasoft.Imaging.Pdf
    ' - Vintasoft.Imaging.Pdf.Ocr
    ' - Vintasoft.Twain
    
    Class ScanAndConvertToSearchablePdfDcoument
        ''' <summary>
        ''' Acquires images from the default TWAIN scanner,
        ''' recognizes text on each acquired image and
        ''' saves the result as a searchable PDF document.
        ''' </summary>
        ''' <param name="language">OCR language used to create the OCR engine settings.</param>
        ''' <param name="outputPdfFilename">Filename passed to the constructor of the PDF document that receives the pages.</param>
        ''' <exception cref="System.InvalidOperationException">
        ''' Thrown if the TWAIN device manager does not provide a default device.
        ''' </exception>
        ''' <remarks>
        ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
        ''' and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
        ''' for executing this sample.
        ''' </remarks>
        Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
            System.Console.WriteLine("Create TWAIN device manager...")
            Using deviceManager As New Vintasoft.Twain.DeviceManager()
                ' create PDF document
                Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                    ' create PDF document builder that places the image over the recognized text
                    Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument)
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                    System.Console.WriteLine("Create Tesseract OCR engine...")
                    Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                        ' create OCR engine manager
                        Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr)

                        ' create OCR settings for the requested language
                        Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)

                        System.Console.WriteLine("Open TWAIN device manager...")
                        deviceManager.Open()

                        Dim device As Vintasoft.Twain.Device = deviceManager.DefaultDevice
                        ' fail with a clear message instead of a NullReferenceException on the first use of the device
                        If device Is Nothing Then
                            Throw New System.InvalidOperationException("Default TWAIN device is not found.")
                        End If

                        ' acquire images until the modal acquisition reports the None state
                        Dim acquireState As Vintasoft.Twain.AcquireModalState
                        Do
                            System.Console.WriteLine("Acquire image from scanner...")
                            acquireState = device.AcquireModal()
                            If acquireState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                                Try
                                    ' create VintasoftImage
                                    Using image As New Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), True)
                                        ' preprocess image
                                        ' BorderClear, Despeckle, Deskew, Segmentation
                                        System.Console.WriteLine("Preprocess the image...")
                                        Dim preprocessing As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                                        ' binarization step is switched off for the preprocessing
                                        preprocessing.Binarization = Nothing
                                        preprocessing.ExecuteInPlace(image)

                                        ' recognize image using the text regions found by the preprocessing
                                        System.Console.WriteLine("Recognize the image...")
                                        Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions)

                                        ' add page to PDF document
                                        System.Console.WriteLine("Add page to PDF document...")
                                        documentBuilder.AddPage(image, page)
                                    End Using
                                Finally
                                    ' dispose the acquired image even if preprocessing, recognition or page creation failed
                                    device.AcquiredImage.Dispose()
                                End Try
                            End If
                        Loop While acquireState <> Vintasoft.Twain.AcquireModalState.None

                        System.Console.WriteLine("Save changes in PDF document...")
                        pdfDocument.SaveChanges()
                    End Using
                End Using
            End Using
        End Sub
    End Class
                  
    
    // The project, which uses this code, must have references to the following assemblies:
    // - Vintasoft.Imaging
    // - Vintasoft.Imaging.DocCleanup
    // - Vintasoft.Imaging.Ocr
    // - Vintasoft.Imaging.Ocr.Tesseract
    // - Vintasoft.Imaging.Pdf
    // - Vintasoft.Imaging.Pdf.Ocr
    // - Vintasoft.Twain
    
    class ScanAndConvertToSearchablePdfDcoument
    {
        /// <summary>
        /// Acquires images from the default TWAIN scanner,
        /// recognizes text on each acquired image and
        /// saves the result as a searchable PDF document.
        /// </summary>
        /// <param name="language">OCR language used to create the OCR engine settings.</param>
        /// <param name="outputPdfFilename">Filename passed to the constructor of the PDF document that receives the pages.</param>
        /// <exception cref="System.InvalidOperationException">
        /// Thrown if the TWAIN device manager does not provide a default device.
        /// </exception>
        /// <remarks>
        /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
        /// and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
        /// for executing this sample.
        /// </remarks>
        public static void ScanImagesAndSaveAsSearchablePdfDocument(
            Vintasoft.Imaging.Ocr.OcrLanguage language,
            string outputPdfFilename)
        {
            System.Console.WriteLine("Create TWAIN device manager...");
            using (Vintasoft.Twain.DeviceManager deviceManager = 
                new Vintasoft.Twain.DeviceManager())
            {
                // create PDF document
                using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument = 
                    new Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    // create PDF document builder that places the image over the recognized text
                    Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder = 
                        new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                    documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                    documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                    System.Console.WriteLine("Create Tesseract OCR engine...");
                    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = 
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                    {
                        // create OCR engine manager
                        Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                            new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr);

                        // create OCR settings for the requested language
                        Vintasoft.Imaging.Ocr.OcrEngineSettings settings = 
                            new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);

                        System.Console.WriteLine("Open TWAIN device manager...");
                        deviceManager.Open();

                        Vintasoft.Twain.Device device = deviceManager.DefaultDevice;
                        // fail with a clear message instead of a NullReferenceException on the first use of the device
                        if (device == null)
                        {
                            throw new System.InvalidOperationException("Default TWAIN device is not found.");
                        }

                        // acquire images until the modal acquisition reports the None state
                        Vintasoft.Twain.AcquireModalState acquireState;
                        do
                        {
                            System.Console.WriteLine("Acquire image from scanner...");
                            acquireState = device.AcquireModal();
                            if (acquireState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                            {
                                try
                                {
                                    // create VintasoftImage
                                    using (Vintasoft.Imaging.VintasoftImage image =
                                        new Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), true))
                                    {
                                        // preprocess image
                                        // BorderClear, Despeckle, Deskew, Segmentation
                                        System.Console.WriteLine("Preprocess the image...");
                                        Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand preprocessing =
                                            new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                                        // binarization step is switched off for the preprocessing
                                        preprocessing.Binarization = null;
                                        preprocessing.ExecuteInPlace(image);

                                        // recognize image using the text regions found by the preprocessing
                                        System.Console.WriteLine("Recognize the image...");
                                        Vintasoft.Imaging.Ocr.Results.OcrPage page = engineManager.Recognize(image, settings,
                                            preprocessing.SegmentationTextRegions);

                                        // add page to PDF document
                                        System.Console.WriteLine("Add page to PDF document...");
                                        documentBuilder.AddPage(image, page);
                                    }
                                }
                                finally
                                {
                                    // dispose the acquired image even if preprocessing, recognition or page creation failed
                                    device.AcquiredImage.Dispose();
                                }
                            }
                        }
                        while (acquireState != Vintasoft.Twain.AcquireModalState.None);

                        System.Console.WriteLine("Save changes in PDF document...");
                        pdfDocument.SaveChanges();
                    }
                }
            }
        }
    }