VintaSoft Imaging .NET SDK 14.0: Documentation for .NET developer
In This Topic
    OCR: Obtain and edit the OCR results
    In This Topic
    The result of text recognition in an image is returned as an OCR page (an instance of the OcrPage class). The OCR page contains text regions (instances of the OcrTextRegion class). Each text region contains paragraphs (instances of the OcrParagraph class). Each paragraph contains text lines (instances of the OcrTextLine class). Each text line contains words (instances of the OcrWord class). Each word contains symbols (instances of the OcrSymbol class).

    All elements (OcrPage, OcrTextRegion, OcrParagraph, OcrTextLine, OcrWord, OcrSymbol) contain:

    The OcrWord class also contains the font name of the recognized text (OcrWord.Font).

    The OcrPage class allows you to obtain the OCR result as formatted text (OcrPage.GetFormattedText) or as non-formatted text (OcrPage.GetText).


    It is often necessary to post-process the OCR results, for example, to remove words with low confidence or to merge two or more results into one.
    The OcrResultsEditor class is intended for this kind of processing of OCR results.

    Here is C#/VB.NET code that shows how to remove words with low confidence from OCR results:
    /// <summary>
    /// Recognizes text in all images of the specified file,
    /// removes words with low confidence from the recognized text and
    /// returns the remaining text.
    /// </summary>
    /// <param name="filename">The name of the file containing image to OCR.</param>
    /// <returns>
    /// The non-formatted recognized text of all images in the file;
    /// the text of each image is followed by a line break.
    /// </returns>
    public string RecognizeTextAndFilterRecognitionResult(string filename)
    {
        // minimum confidence, words with lower confidence are removed
        const float MIN_CONFIDENCE = 75.0f;

        // create image collection
        using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
        {
            try
            {
                // add images from file to image collection
                images.Add(filename);

                // create tesseract OCR engine
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    // create tesseract OCR settings and initialize the engine
                    Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                        new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English);
                    tesseractOcr.Init(settings);

                    // create result builder
                    System.Text.StringBuilder result = new System.Text.StringBuilder();

                    // for each image in image collection
                    foreach (Vintasoft.Imaging.VintasoftImage image in images)
                    {
                        // recognize the image
                        Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                        // get all words in recognized text
                        Vintasoft.Imaging.Ocr.Results.OcrObject[] words = page.GetObjects(
                            Vintasoft.Imaging.Ocr.OcrObjectType.Word);
                        // select words, which confidence is less than minimum confidence
                        Vintasoft.Imaging.Ocr.Results.OcrObject[] lowConfidenceWords = System.Array.FindAll(
                            words,
                            word => word.Confidence < MIN_CONFIDENCE);

                        // remove words with low confidence and validate recognition results
                        Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                            new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page);
                        editor.RemoveObjects(lowConfidenceWords);
                        editor.ValidateResults();

                        // add recognized text, followed by a line break, to result
                        result.AppendLine(page.GetText());
                    }

                    // return result
                    return result.ToString();
                }
            }
            finally
            {
                // dispose images and clear image collection,
                // this must be done even if text recognition throws an exception
                images.ClearAndDisposeItems();
            }
        }
    }
    
    ''' <summary>
    ''' Recognizes text in all images of the specified file,
    ''' removes words with low confidence from the recognized text and
    ''' returns the remaining text.
    ''' </summary>
    ''' <param name="filename">The name of the file containing image to OCR.</param>
    ''' <returns>
    ''' The non-formatted recognized text of all images in the file;
    ''' the text of each image is followed by a line break.
    ''' </returns>
    Public Function RecognizeTextAndFilterRecognitionResult(filename As String) As String
        ' minimum confidence, words with lower confidence are removed
        Const MIN_CONFIDENCE As Single = 75F

        ' create image collection
        Using images As New Vintasoft.Imaging.ImageCollection()
            Try
                ' add images from file to image collection
                images.Add(filename)

                ' create tesseract OCR engine
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    ' create tesseract OCR settings and initialize the engine
                    Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English)
                    tesseractOcr.Init(settings)

                    ' create result builder
                    Dim result As New System.Text.StringBuilder()

                    ' for each image in image collection
                    For Each image As Vintasoft.Imaging.VintasoftImage In images
                        ' recognize the image
                        Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                        ' get all words in recognized text
                        Dim words As Vintasoft.Imaging.Ocr.Results.OcrObject() = page.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word)
                        ' select words, which confidence is less than minimum confidence
                        Dim lowConfidenceWords As Vintasoft.Imaging.Ocr.Results.OcrObject() = System.Array.FindAll(
                            words,
                            Function(word As Vintasoft.Imaging.Ocr.Results.OcrObject) word.Confidence < MIN_CONFIDENCE)

                        ' remove words with low confidence and validate recognition results
                        Dim editor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page)
                        editor.RemoveObjects(lowConfidenceWords)
                        editor.ValidateResults()

                        ' add recognized text, followed by a line break, to result
                        result.AppendLine(page.GetText())
                    Next

                    ' return result
                    Return result.ToString()
                End Using
            Finally
                ' dispose images and clear image collection,
                ' this must be done even if text recognition throws an exception
                images.ClearAndDisposeItems()
            End Try
        End Using
    End Function
    


    Here is C#/VB.NET code that shows how to recognize text in two images (with different resolutions) of the same document and merge the OCR results:
    // Recognizes German text on the first page of a PDF document at two different scales,
    // merges both OCR results into one and saves the merged text next to the PDF document.
    string imageFilePath = @"D:\TestImage.pdf";
    // create the main OCR engine
    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
    {
        // create an array for 2 additional OCR engines
        Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr[] additionalEngines = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr[2];
        try
        {
            // fill the array with additional OCR engines
            for (int i = 0; i < additionalEngines.Length; i++)
                // create the additional OCR engine
                additionalEngines[i] = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr();

            // create the OCR engine manager, which uses the main and additional OCR engines
            Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr, additionalEngines);

            // load a PDF document from file
            using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument =
                new Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath))
            {
                // if PDF document is empty, there is nothing to recognize
                if (pdfDocument.Pages.Count == 0)
                    return;

                // create the OCR engine settings and
                // specify that German text will be recognized
                Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                    new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.German);

                // get the first PDF page
                Vintasoft.Imaging.Pdf.Tree.PdfPage pdfPage = pdfDocument.Pages[0];
                // scales, which should be applied to the PDF page before text recognition
                float[] scales = new float[] { 0.5f, 1.5f };
                // an array that contains the scaled OCR results (one result per scale)
                Vintasoft.Imaging.Ocr.Results.OcrPage[] scaledOcrResults =
                    new Vintasoft.Imaging.Ocr.Results.OcrPage[scales.Length];

                // for each scale
                for (int i = 0; i < scales.Length; i++)
                {
                    // render the scaled PDF page
                    using (Vintasoft.Imaging.VintasoftImage renderedImage = pdfPage.Render(scales[i]))
                    {
                        // recognize text in scaled PDF page
                        scaledOcrResults[i] = engineManager.Recognize(renderedImage, settings);
                    }
                }

                // if there is at least one OCR result (i.e. at least one scale is specified)
                if (scaledOcrResults.Length > 0)
                {
                    // "downscale" the OCR results, this is necessary
                    // because we scaled PDF page before text recognition

                    // for each recognition result
                    for (int i = 0; i < scaledOcrResults.Length; i++)
                    {
                        // the result, which is recognized at scale 1, does not need to be "downscaled"
                        if (scales[i] == 1f)
                            continue;

                        // create the OCR results editor for the scaled OCR result
                        Vintasoft.Imaging.Ocr.Results.OcrResultsEditor scaleEditor =
                            new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(scaledOcrResults[i]);

                        // calculate the "downscale" factor (inverse of the applied scale)
                        Vintasoft.Imaging.Scale downScale = new Vintasoft.Imaging.Scale(1f / scales[i], 1f / scales[i]);
                        // "downscale" the OCR result
                        scaleEditor.ScaleOcrPage(downScale);
                    }


                    // combine the scaled OCR results into the final OCR result

                    // set the first scaled OCR result as the final OCR result
                    Vintasoft.Imaging.Ocr.Results.OcrPage finalOcrResult = scaledOcrResults[0];

                    // create the OCR results editor for the final OCR result
                    Vintasoft.Imaging.Ocr.Results.OcrResultsEditor mergeEditor =
                        new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult);
                    // for each scaled OCR result starting from the second
                    for (int i = 1; i < scaledOcrResults.Length; i++)
                    {
                        // add the scaled OCR result to the OCR results editor
                        mergeEditor.AddRegions(scaledOcrResults[i]);
                    }
                    // validate the final OCR result
                    mergeEditor.ValidateResults();


                    // get the recognized text from the final OCR result
                    string ocrPageContent = finalOcrResult.GetText();

                    // the text file has the same path as the PDF document but ".txt" extension
                    string textFilePath = System.IO.Path.ChangeExtension(imageFilePath, ".txt");
                    // save the recognition results
                    System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8);
                }
            }
        }
        finally
        {
            // for each additional OCR engine
            foreach (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr additionalEngine in additionalEngines)
            {
                // engine can be null if creation of engines failed halfway
                if (additionalEngine != null)
                    // dispose the additional OCR engine
                    additionalEngine.Dispose();
            }
        }
    }
    
    ' Recognizes German text on the first page of a PDF document at two different scales,
    ' merges both OCR results into one and saves the merged text next to the PDF document.
    Dim imageFilePath As String = "D:\TestImage.pdf"
    ' create the main OCR engine
    Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
        ' create an array for 2 additional OCR engines
        Dim additionalEngines As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr() = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(1) {}
        Try
            ' fill the array with additional OCR engines
            For i As Integer = 0 To additionalEngines.Length - 1
                ' create the additional OCR engine
                additionalEngines(i) = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
            Next

            ' create the OCR engine manager, which uses the main and additional OCR engines
            Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr, additionalEngines)

            ' load a PDF document from file
            Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath)
                ' if PDF document is empty, there is nothing to recognize
                If pdfDocument.Pages.Count = 0 Then
                    Return
                End If

                ' create the OCR engine settings and
                ' specify that German text will be recognized
                Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.German)

                ' get the first PDF page
                Dim pdfPage As Vintasoft.Imaging.Pdf.Tree.PdfPage = pdfDocument.Pages(0)
                ' scales, which should be applied to the PDF page before text recognition
                Dim scales As Single() = New Single() {0.5F, 1.5F}
                ' an array that contains the scaled OCR results (one result per scale)
                Dim scaledOcrResults As Vintasoft.Imaging.Ocr.Results.OcrPage() = New Vintasoft.Imaging.Ocr.Results.OcrPage(scales.Length - 1) {}

                ' for each scale
                For i As Integer = 0 To scales.Length - 1
                    ' render the scaled PDF page
                    Using renderedImage As Vintasoft.Imaging.VintasoftImage = pdfPage.Render(scales(i))
                        ' recognize text in scaled PDF page
                        scaledOcrResults(i) = engineManager.Recognize(renderedImage, settings)
                    End Using
                Next

                ' if there is at least one OCR result (i.e. at least one scale is specified)
                If scaledOcrResults.Length > 0 Then
                    ' "downscale" the OCR results, this is necessary
                    ' because we scaled PDF page before text recognition

                    ' for each recognition result
                    For i As Integer = 0 To scaledOcrResults.Length - 1
                        ' the result, which is recognized at scale 1, does not need to be "downscaled"
                        If scales(i) = 1F Then
                            Continue For
                        End If

                        ' create the OCR results editor for the scaled OCR result
                        Dim scaleEditor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(scaledOcrResults(i))

                        ' calculate the "downscale" factor (inverse of the applied scale)
                        Dim downScale As New Vintasoft.Imaging.Scale(1F / scales(i), 1F / scales(i))
                        ' "downscale" the OCR result
                        scaleEditor.ScaleOcrPage(downScale)
                    Next


                    ' combine the scaled OCR results into the final OCR result

                    ' set the first scaled OCR result as the final OCR result
                    Dim finalOcrResult As Vintasoft.Imaging.Ocr.Results.OcrPage = scaledOcrResults(0)

                    ' create the OCR results editor for the final OCR result
                    Dim mergeEditor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult)
                    ' for each scaled OCR result starting from the second
                    For i As Integer = 1 To scaledOcrResults.Length - 1
                        ' add the scaled OCR result to the OCR results editor
                        mergeEditor.AddRegions(scaledOcrResults(i))
                    Next
                    ' validate the final OCR result
                    mergeEditor.ValidateResults()


                    ' get the recognized text from the final OCR result
                    Dim ocrPageContent As String = finalOcrResult.GetText()

                    ' the text file has the same path as the PDF document but ".txt" extension
                    Dim textFilePath As String = System.IO.Path.ChangeExtension(imageFilePath, ".txt")
                    ' save the recognition results
                    System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8)
                End If
            End Using
        Finally
            ' for each additional OCR engine
            For Each additionalEngine As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr In additionalEngines
                ' engine can be Nothing if creation of engines failed halfway
                If additionalEngine IsNot Nothing Then
                    ' dispose the additional OCR engine
                    additionalEngine.Dispose()
                End If
            Next
        End Try
    End Using