VintaSoft Imaging .NET SDK 12.4: Documentation for .NET developer
Vintasoft.Imaging.Ocr.Tesseract Namespace / TesseractOcr Class / AnalyzeLayout() Method
Syntax Exceptions Remarks Example Requirements SeeAlso
In This Topic
    AnalyzeLayout() Method (TesseractOcr)
    In This Topic
    Analyzes the layout of paragraphs, lines, words and symbols on the document image without performing text recognition.
    Syntax

    Return Value

    The result of the layout analysis, returned as an OcrPage.
    Exceptions
    Exception | Description
    Thrown if this object is disposed.
    Thrown if Tesseract OCR engine does not have image for processing.
    Remarks

    Use this method if you need to analyze the layout of paragraphs, lines, words and symbols on a document image but do not need to recognize the text in the image.

    This method works faster than the Recognize method.

    This method provides good results for non-rotated document images. If the document image is rotated, it can be deskewed first using the Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand class.

    Example

    This C#/VB.NET code shows how to analyze page layout using Tesseract OCR engine.

    
    ''' <summary>
    ''' Analyzes page layout using Tesseract OCR engine and, for each image of the file,
    ''' prints the count of regions, paragraphs, lines, words and symbols to the console.
    ''' </summary>
    ''' <param name="filename">The name of document image file.</param>
    ''' <remarks>
    ''' Only text regions contribute to the paragraph, line, word and symbol counts;
    ''' the region count includes every region of the page.
    ''' The method waits for a key press after the results of each image are printed.
    ''' </remarks>
    Public Shared Sub AnalyzePageLayoutUsingTesseractOCR(filename As String)
        ' create an image collection
        Using images As New Vintasoft.Imaging.ImageCollection()
            Try
                ' add images from file to the image collection
                images.Add(filename)

                System.Console.WriteLine("Create Tesseract OCR engine...")
                ' create the Tesseract OCR engine
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    System.Console.WriteLine("Initialize OCR engine...")
                    ' init the Tesseract OCR engine for the English language
                    tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

                    ' for each image in image collection
                    For Each image As Vintasoft.Imaging.VintasoftImage In images
                        System.Console.WriteLine("Recognize the image...")

                        ' set image for Tesseract OCR engine
                        tesseractOcr.SetImage(image)

                        ' analyze page layout (no text recognition) and get result as OCR page
                        Dim ocrPage As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.AnalyzeLayout()

                        ' clear image in Tesseract OCR engine
                        tesseractOcr.ClearImage()

                        ' calculate count of regions, paragraphs, lines, words, symbols

                        Dim regionCount As Integer = ocrPage.Regions.Count
                        Dim paragraphCount As Integer = 0
                        Dim lineCount As Integer = 0
                        Dim wordCount As Integer = 0
                        Dim symbolCount As Integer = 0

                        For Each region As Vintasoft.Imaging.Ocr.Results.OcrRegion In ocrPage.Regions
                            Dim textRegion As Vintasoft.Imaging.Ocr.Results.OcrTextRegion = TryCast(region, Vintasoft.Imaging.Ocr.Results.OcrTextRegion)
                            ' TryCast returns Nothing for a region that is not a text region;
                            ' such a region has no paragraphs to count, so skip it
                            If textRegion Is Nothing Then
                                Continue For
                            End If

                            paragraphCount += textRegion.Paragraphs.Count
                            For Each paragraph As Vintasoft.Imaging.Ocr.Results.OcrParagraph In textRegion.Paragraphs
                                lineCount += paragraph.TextLines.Count
                                For Each line As Vintasoft.Imaging.Ocr.Results.OcrTextLine In paragraph.TextLines
                                    wordCount += line.Words.Count
                                    For Each word As Vintasoft.Imaging.Ocr.Results.OcrWord In line.Words
                                        symbolCount += word.Symbols.Count
                                    Next
                                Next
                            Next
                        Next

                        ' output information about count of regions, paragraphs, lines, words, symbols

                        System.Console.WriteLine("Layout result:")
                        System.Console.WriteLine("- Region count: {0}", regionCount)
                        System.Console.WriteLine("- Paragraph count: {0}", paragraphCount)
                        System.Console.WriteLine("- Line count: {0}", lineCount)
                        System.Console.WriteLine("- Word count: {0}", wordCount)
                        System.Console.WriteLine("- Symbol count: {0}", symbolCount)
                        System.Console.WriteLine()
                        ' wait for a key press before processing the next image
                        System.Console.ReadKey()
                    Next

                    ' shutdown the Tesseract OCR engine
                    tesseractOcr.Shutdown()
                End Using
            Finally
                ' free images even if the layout analysis failed
                images.ClearAndDisposeItems()
            End Try
        End Using
    End Sub
    
    
    
    /// <summary>
    /// Analyzes page layout using Tesseract OCR engine and, for each image of the file,
    /// prints the count of regions, paragraphs, lines, words and symbols to the console.
    /// </summary>
    /// <param name="filename">The name of document image file.</param>
    /// <remarks>
    /// Only text regions contribute to the paragraph, line, word and symbol counts;
    /// the region count includes every region of the page.
    /// The method waits for a key press after the results of each image are printed.
    /// </remarks>
    public static void AnalyzePageLayoutUsingTesseractOCR(string filename)
    {
        // create an image collection
        using (Vintasoft.Imaging.ImageCollection images =
            new Vintasoft.Imaging.ImageCollection())
        {
            try
            {
                // add images from file to the image collection
                images.Add(filename);

                System.Console.WriteLine("Create Tesseract OCR engine...");
                // create the Tesseract OCR engine
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    System.Console.WriteLine("Initialize OCR engine...");
                    // init the Tesseract OCR engine for the English language
                    tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

                    // for each image in image collection
                    foreach (Vintasoft.Imaging.VintasoftImage image in images)
                    {
                        System.Console.WriteLine("Recognize the image...");

                        // set image for Tesseract OCR engine
                        tesseractOcr.SetImage(image);

                        // analyze page layout (no text recognition) and get result as OCR page
                        Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.AnalyzeLayout();

                        // clear image in Tesseract OCR engine
                        tesseractOcr.ClearImage();

                        // calculate count of regions, paragraphs, lines, words, symbols

                        int regionCount = ocrPage.Regions.Count;
                        int paragraphCount = 0;
                        int lineCount = 0;
                        int wordCount = 0;
                        int symbolCount = 0;

                        foreach (Vintasoft.Imaging.Ocr.Results.OcrRegion region in ocrPage.Regions)
                        {
                            Vintasoft.Imaging.Ocr.Results.OcrTextRegion textRegion =
                                region as Vintasoft.Imaging.Ocr.Results.OcrTextRegion;
                            // "as" yields null for a region that is not a text region;
                            // such a region has no paragraphs to count, so skip it
                            if (textRegion == null)
                            {
                                continue;
                            }

                            paragraphCount += textRegion.Paragraphs.Count;
                            foreach (Vintasoft.Imaging.Ocr.Results.OcrParagraph paragraph in textRegion.Paragraphs)
                            {
                                lineCount += paragraph.TextLines.Count;
                                foreach (Vintasoft.Imaging.Ocr.Results.OcrTextLine line in paragraph.TextLines)
                                {
                                    wordCount += line.Words.Count;
                                    foreach (Vintasoft.Imaging.Ocr.Results.OcrWord word in line.Words)
                                    {
                                        symbolCount += word.Symbols.Count;
                                    }
                                }
                            }
                        }

                        // output information about count of regions, paragraphs, lines, words, symbols

                        System.Console.WriteLine("Layout result:");
                        System.Console.WriteLine("- Region count: {0}", regionCount);
                        System.Console.WriteLine("- Paragraph count: {0}", paragraphCount);
                        System.Console.WriteLine("- Line count: {0}", lineCount);
                        System.Console.WriteLine("- Word count: {0}", wordCount);
                        System.Console.WriteLine("- Symbol count: {0}", symbolCount);
                        System.Console.WriteLine();
                        // wait for a key press before processing the next image
                        System.Console.ReadKey();
                    }

                    // shutdown the Tesseract OCR engine
                    tesseractOcr.Shutdown();
                }
            }
            finally
            {
                // free images even if the layout analysis failed
                images.ClearAndDisposeItems();
            }
        }
    }
    
    

    Requirements

    Target Platforms: .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5

    See Also