VintaSoft Imaging .NET SDK v8.8 for .NET Framework
In This Topic
    OCR: Save the OCR results
    In This Topic

    An OCR result can be saved to a text file or to a searchable PDF document. A searchable PDF document may contain the text located above the image, or just the text. To create a searchable PDF document from OCR results, use the PdfDocumentBuilder class.

    Here is an example that shows how to save the OCR result to a text file as formatted text:

    ' The project, which uses this code, must have references to the following assemblies:
    ' - Vintasoft.Imaging
    ' - Vintasoft.Imaging.Ocr
    ' - Vintasoft.Imaging.Ocr.Tesseract
    
    ' image with the text to recognize
    Dim sourceImagePath As String = "D:\TestImage.png"
    ' the text file is created in the directory of the image and has the same base name
    Dim outputFolder As String = System.IO.Path.GetDirectoryName(sourceImagePath)
    Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(sourceImagePath) & ".txt"
    Dim outputTextPath As String = System.IO.Path.Combine(outputFolder, outputFileName)

    ' create the Tesseract OCR engine
    Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
        ' initialize the OCR engine with settings for recognition of English text
        ocrEngine.Init(New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

        ' load the image
        Using sourceImage As New Vintasoft.Imaging.VintasoftImage(sourceImagePath)
            ' pass the image to the OCR engine
            ocrEngine.SetImage(sourceImage)

            ' recognize the text and get it as formatted text
            Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = ocrEngine.Recognize()
            Dim formattedText As String = recognizedPage.GetFormattedText()

            ' write the formatted text to the file using UTF-8 encoding
            System.IO.File.WriteAllText(outputTextPath, formattedText, System.Text.Encoding.UTF8)

            ' clear the image in the OCR engine
            ocrEngine.ClearImage()
        End Using

        ' shutdown the OCR engine
        ocrEngine.Shutdown()
    End Using
                  
    
    // The project, which uses this code, must have references to the following assemblies:
    // - Vintasoft.Imaging
    // - Vintasoft.Imaging.Ocr
    // - Vintasoft.Imaging.Ocr.Tesseract
    
    // image with the text to recognize
    string sourceImagePath = @"D:\TestImage.png";
    // the text file is created in the directory of the image and has the same base name
    string outputFolder = System.IO.Path.GetDirectoryName(sourceImagePath);
    string outputFileName = System.IO.Path.GetFileNameWithoutExtension(sourceImagePath) + ".txt";
    string outputTextPath = System.IO.Path.Combine(outputFolder, outputFileName);

    // create the Tesseract OCR engine
    using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
    {
        // initialize the OCR engine with settings for recognition of English text
        ocrEngine.Init(
            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

        // load the image
        using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(sourceImagePath))
        {
            // pass the image to the OCR engine
            ocrEngine.SetImage(sourceImage);

            // recognize the text and get it as formatted text
            Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = ocrEngine.Recognize();
            string formattedText = recognizedPage.GetFormattedText();

            // write the formatted text to the file using UTF-8 encoding
            System.IO.File.WriteAllText(outputTextPath, formattedText, System.Text.Encoding.UTF8);

            // clear the image in the OCR engine
            ocrEngine.ClearImage();
        }

        // shutdown the OCR engine
        ocrEngine.Shutdown();
    }
                    
    


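    Here is an example that shows how to acquire images from a scanner and save the OCR results to a searchable PDF document that contains the page image together with the recognized text (the sample uses the ImageOverText page creation mode):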
    ' The project, which uses this code, must have references to the following assemblies:
    ' - Vintasoft.Imaging
    ' - Vintasoft.Imaging.DocCleanup
    ' - Vintasoft.Imaging.Ocr
    ' - Vintasoft.Imaging.Ocr.Tesseract
    ' - Vintasoft.Imaging.Pdf
    ' - Vintasoft.Imaging.Pdf.Ocr
    ' - Vintasoft.Twain
    
    Class ScanAndConvertToSearchablePdfDcoument
        ''' <summary>
        ''' Acquires images from the default TWAIN device, recognizes text in each
        ''' acquired image and saves the results as a searchable PDF document.
        ''' </summary>
        ''' <param name="language">The language that is passed to the OCR engine settings.</param>
        ''' <param name="outputPdfFilename">The file name that is passed to the PDF document constructor.</param>
        ''' <remarks>
        ''' The following products are necessary for executing this sample:
        ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll) and
        ''' VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll).
        ''' </remarks>
        Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
            System.Console.WriteLine("Create TWAIN device manager...")
            Using twainManager As New Vintasoft.Twain.DeviceManager()
                ' create the PDF document that receives the recognized pages
                Using searchablePdf As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                    ' create and configure the builder that adds pages to the PDF document
                    Dim pdfBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(searchablePdf)
                    pdfBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                    pdfBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                    System.Console.WriteLine("Create Tesseract OCR engine...")
                    Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                        ' the OCR engine is used through the OCR engine manager
                        Dim ocrManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine)
                        Dim ocrSettings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)

                        System.Console.WriteLine("Open TWAIN device manager...")
                        twainManager.Open()

                        ' use the default TWAIN device as the image source
                        Dim scanner As Vintasoft.Twain.Device = twainManager.DefaultDevice

                        ' acquire images until the device reports the "None" state
                        Dim scanState As Vintasoft.Twain.AcquireModalState
                        Do
                            System.Console.WriteLine("Acquire image from scanner...")
                            scanState = scanner.AcquireModal()
                            If scanState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                                RecognizeAcquiredImage(scanner, ocrManager, ocrSettings, pdfBuilder)
                            End If
                        Loop Until scanState = Vintasoft.Twain.AcquireModalState.None

                        System.Console.WriteLine("Save changes in PDF document...")
                        searchablePdf.SaveChanges()
                    End Using
                End Using
            End Using
        End Sub

        ''' <summary>
        ''' Preprocesses and recognizes the image, which is currently acquired by the device,
        ''' adds the result as a page to the PDF document and disposes the acquired image.
        ''' </summary>
        Private Shared Sub RecognizeAcquiredImage(device As Vintasoft.Twain.Device, engineManager As Vintasoft.Imaging.Ocr.OcrEngineManager, settings As Vintasoft.Imaging.Ocr.OcrEngineSettings, documentBuilder As Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder)
            ' create VintasoftImage from the bitmap of the acquired image
            Using scannedImage As New Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), True)
                ' preprocess the image in place
                ' (BorderClear, Despeckle, Deskew, Segmentation)
                System.Console.WriteLine("Preprocess the image...")
                Dim preprocessingCommand As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                preprocessingCommand.Binarization = Nothing
                preprocessingCommand.ExecuteInPlace(scannedImage)

                ' recognize text in the text regions found by the preprocessing command
                System.Console.WriteLine("Recognize the image...")
                Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(scannedImage, settings, preprocessingCommand.SegmentationTextRegions)

                ' add the image and the recognition result to the PDF document
                System.Console.WriteLine("Add page to PDF document...")
                documentBuilder.AddPage(scannedImage, recognizedPage)
            End Using

            ' dispose the acquired image
            device.AcquiredImage.Dispose()
        End Sub
    End Class
                  
    
    // The project, which uses this code, must have references to the following assemblies:
    // - Vintasoft.Imaging
    // - Vintasoft.Imaging.DocCleanup
    // - Vintasoft.Imaging.Ocr
    // - Vintasoft.Imaging.Ocr.Tesseract
    // - Vintasoft.Imaging.Pdf
    // - Vintasoft.Imaging.Pdf.Ocr
    // - Vintasoft.Twain
    
    class ScanAndConvertToSearchablePdfDcoument
    {
        /// <summary>
        /// Acquires images from the default TWAIN device, recognizes text in each
        /// acquired image and saves the results as a searchable PDF document.
        /// </summary>
        /// <param name="language">The language that is passed to the OCR engine settings.</param>
        /// <param name="outputPdfFilename">The file name that is passed to the PDF document constructor.</param>
        /// <remarks>
        /// The following products are necessary for executing this sample:
        /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
        /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
        /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
        /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
        /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll) and
        /// VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll).
        /// </remarks>
        public static void ScanImagesAndSaveAsSearchablePdfDocument(
            Vintasoft.Imaging.Ocr.OcrLanguage language,
            string outputPdfFilename)
        {
            System.Console.WriteLine("Create TWAIN device manager...");
            using (var twainManager = new Vintasoft.Twain.DeviceManager())
            {
                // create the PDF document that receives the recognized pages
                using (var searchablePdf = new Vintasoft.Imaging.Pdf.PdfDocument(
                    outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    // create and configure the builder that adds pages to the PDF document
                    var pdfBuilder = new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(searchablePdf);
                    pdfBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                    pdfBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                    System.Console.WriteLine("Create Tesseract OCR engine...");
                    using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                    {
                        // the OCR engine is used through the OCR engine manager
                        var ocrManager = new Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine);
                        var ocrSettings = new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);

                        System.Console.WriteLine("Open TWAIN device manager...");
                        twainManager.Open();

                        // use the default TWAIN device as the image source
                        Vintasoft.Twain.Device scanner = twainManager.DefaultDevice;

                        // acquire images until the device reports the "None" state
                        Vintasoft.Twain.AcquireModalState scanState;
                        do
                        {
                            System.Console.WriteLine("Acquire image from scanner...");
                            scanState = scanner.AcquireModal();
                            if (scanState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                            {
                                RecognizeAcquiredImage(scanner, ocrManager, ocrSettings, pdfBuilder);
                            }
                        }
                        while (scanState != Vintasoft.Twain.AcquireModalState.None);

                        System.Console.WriteLine("Save changes in PDF document...");
                        searchablePdf.SaveChanges();
                    }
                }
            }
        }

        /// <summary>
        /// Preprocesses and recognizes the image, which is currently acquired by the device,
        /// adds the result as a page to the PDF document and disposes the acquired image.
        /// </summary>
        private static void RecognizeAcquiredImage(
            Vintasoft.Twain.Device device,
            Vintasoft.Imaging.Ocr.OcrEngineManager engineManager,
            Vintasoft.Imaging.Ocr.OcrEngineSettings settings,
            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder)
        {
            // create VintasoftImage from the bitmap of the acquired image
            using (var scannedImage = new Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), true))
            {
                // preprocess the image in place
                // (BorderClear, Despeckle, Deskew, Segmentation)
                System.Console.WriteLine("Preprocess the image...");
                var preprocessingCommand = new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                preprocessingCommand.Binarization = null;
                preprocessingCommand.ExecuteInPlace(scannedImage);

                // recognize text in the text regions found by the preprocessing command
                System.Console.WriteLine("Recognize the image...");
                Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = engineManager.Recognize(
                    scannedImage, settings, preprocessingCommand.SegmentationTextRegions);

                // add the image and the recognition result to the PDF document
                System.Console.WriteLine("Add page to PDF document...");
                documentBuilder.AddPage(scannedImage, recognizedPage);
            }

            // dispose the acquired image
            device.AcquiredImage.Dispose();
        }
    }
                    
    


    OCR results can be exported to and imported from the hOCR format.

    Here is an example that shows how to save the OCR result to a file in hOCR format:
    ' The project, which uses this code, must have references to the following assemblies:
    ' - Vintasoft.Imaging
    ' - Vintasoft.Imaging.Ocr
    ' - Vintasoft.Imaging.Ocr.Tesseract
    
    ' path to the image with the text to recognize
    Dim imageFilePath As String = "D:\TestImage.png"
    ' create the OCR engine
    Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
        ' initialize the OCR engine with settings for recognition of English text
        ocrEngine.Init(New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

        ' load the image
        Using sourceImage As New Vintasoft.Imaging.VintasoftImage(imageFilePath)
            ' pass the image to the OCR engine
            ocrEngine.SetImage(sourceImage)

            ' recognize the text in the image
            Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = ocrEngine.Recognize()

            ' the hOCR file is created in the directory of the image and has the same base name
            Dim outputFolder As String = System.IO.Path.GetDirectoryName(imageFilePath)
            Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".hocr"
            Dim outputHocrPath As String = System.IO.Path.Combine(outputFolder, outputFileName)

            ' create the file and export the OCR result to it using the hOCR codec
            Using outputStream As System.IO.Stream = System.IO.File.Open(outputHocrPath, System.IO.FileMode.Create)
                Dim codec As New Vintasoft.Imaging.Ocr.Results.HOcrCodec()
                codec.Export(recognizedPage, outputStream)
            End Using

            ' clear the image in the OCR engine
            ocrEngine.ClearImage()
        End Using

        ' shutdown the OCR engine
        ocrEngine.Shutdown()
    End Using
                  
    
    // The project, which uses this code, must have references to the following assemblies:
    // - Vintasoft.Imaging
    // - Vintasoft.Imaging.Ocr
    // - Vintasoft.Imaging.Ocr.Tesseract
    
    // path to the image with the text to recognize
    string imageFilePath = @"D:\TestImage.png";
    // create the OCR engine
    using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
    {
        // initialize the OCR engine with settings for recognition of English text
        ocrEngine.Init(
            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

        // load the image
        using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
        {
            // pass the image to the OCR engine
            ocrEngine.SetImage(sourceImage);

            // recognize the text in the image
            Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = ocrEngine.Recognize();

            // the hOCR file is created in the directory of the image and has the same base name
            string outputFolder = System.IO.Path.GetDirectoryName(imageFilePath);
            string outputFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".hocr";
            string outputHocrPath = System.IO.Path.Combine(outputFolder, outputFileName);

            // create the file and export the OCR result to it using the hOCR codec
            using (System.IO.Stream outputStream = System.IO.File.Open(outputHocrPath, System.IO.FileMode.Create))
            {
                var codec = new Vintasoft.Imaging.Ocr.Results.HOcrCodec();
                codec.Export(recognizedPage, outputStream);
            }

            // clear the image in the OCR engine
            ocrEngine.ClearImage();
        }

        // shutdown the OCR engine
        ocrEngine.Shutdown();
    }