AnalyzeLayout() Method (TesseractOcr)
In This Topic
Analyzes the page layout (the layout of paragraphs, lines, words, and symbols on a document image) without table detection or text recognition.
Syntax
Return Value
The result of the layout analysis as an
OcrPage object.
Exceptions
Exception | Description |
System.ObjectDisposedException | Thrown if this object is disposed. |
OcrException | Thrown if the Tesseract OCR engine does not have an image to process. |
Remarks
Use this method if you need to analyze the layout of paragraphs, lines, words, and symbols on a document image but need neither to detect tables in the image nor to recognize text from it.
This method works faster than the Recognize method.
This method provides good results for non-rotated document images. If a document image is rotated, it can be deskewed using the Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand class.
Example
This C#/VB.NET code shows how to analyze the page layout without table detection using the Tesseract OCR engine.
''' <summary>
''' Analyzes page layout without tables detection using Tesseract OCR engine.
''' </summary>
''' <param name="filename">The name of document image file.</param>
''' <remarks>
''' For each image loaded from the file, the counts of regions, paragraphs,
''' lines, words and symbols are written to the console, and the method then
''' waits for a key press before processing the next image.
''' </remarks>
Public Shared Sub AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(filename As String)
    ' create an image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add images from file to the image collection
            images.Add(filename)

            System.Console.WriteLine("Create Tesseract OCR engine...")
            ' create the Tesseract OCR engine
            Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                System.Console.WriteLine("Initialize OCR engine...")
                ' init the Tesseract OCR engine for recognition of English characters
                tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

                ' for each image in image collection
                For Each image As Vintasoft.Imaging.VintasoftImage In images
                    System.Console.WriteLine("Recognize the image...")

                    ' set image for Tesseract OCR engine
                    tesseractOcr.SetImage(image)

                    Dim ocrPage As Vintasoft.Imaging.Ocr.Results.OcrPage
                    Try
                        ' analyze page layout without tables detection and get result as OCR page
                        ocrPage = tesseractOcr.AnalyzeLayout()
                    Finally
                        ' clear image in Tesseract OCR engine, even if the analysis failed,
                        ' so the engine does not keep a reference to the image
                        tesseractOcr.ClearImage()
                    End Try

                    ' calculate count of regions, paragraphs, lines, words, symbols
                    Dim regionCount As Integer = ocrPage.Regions.Count
                    Dim paragraphCount As Integer = 0
                    Dim lineCount As Integer = 0
                    Dim wordCount As Integer = 0
                    Dim symbolCount As Integer = 0
                    For Each region As Vintasoft.Imaging.Ocr.Results.OcrRegion In ocrPage.Regions
                        Dim textRegion As Vintasoft.Imaging.Ocr.Results.OcrTextRegion = TryCast(region, Vintasoft.Imaging.Ocr.Results.OcrTextRegion)
                        ' TryCast returns Nothing for a region that is not a text region;
                        ' such a region is counted as a region but has no paragraphs to count
                        If textRegion Is Nothing Then
                            Continue For
                        End If

                        paragraphCount += textRegion.Paragraphs.Count
                        For Each paragraph As Vintasoft.Imaging.Ocr.Results.OcrParagraph In textRegion.Paragraphs
                            lineCount += paragraph.TextLines.Count
                            For Each line As Vintasoft.Imaging.Ocr.Results.OcrTextLine In paragraph.TextLines
                                wordCount += line.Words.Count
                                For Each word As Vintasoft.Imaging.Ocr.Results.OcrWord In line.Words
                                    symbolCount += word.Symbols.Count
                                Next
                            Next
                        Next
                    Next

                    ' output information about count of regions, paragraphs, lines, words, symbols
                    System.Console.WriteLine("Layout result:")
                    System.Console.WriteLine(String.Format("- Region count: {0}", regionCount))
                    System.Console.WriteLine(String.Format("- Paragraph count: {0}", paragraphCount))
                    System.Console.WriteLine(String.Format("- Line count: {0}", lineCount))
                    System.Console.WriteLine(String.Format("- Word count: {0}", wordCount))
                    System.Console.WriteLine(String.Format("- Symbol count: {0}", symbolCount))
                    System.Console.WriteLine()

                    ' wait for a key press before processing the next image
                    System.Console.ReadKey()
                Next

                ' shutdown the Tesseract OCR engine
                tesseractOcr.Shutdown()
            End Using
        Finally
            ' free images, also when an exception interrupted the processing
            images.ClearAndDisposeItems()
        End Try
    End Using
End Sub
/// <summary>
/// Analyzes page layout without tables detection using Tesseract OCR engine.
/// </summary>
/// <param name="filename">The name of document image file.</param>
/// <remarks>
/// For each image loaded from the file, the counts of regions, paragraphs,
/// lines, words and symbols are written to the console, and the method then
/// waits for a key press before processing the next image.
/// </remarks>
public static void AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(string filename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images =
        new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to the image collection
            images.Add(filename);

            System.Console.WriteLine("Create Tesseract OCR engine...");
            // create the Tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                System.Console.WriteLine("Initialize OCR engine...");
                // init the Tesseract OCR engine for recognition of English characters
                tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    System.Console.WriteLine("Recognize the image...");

                    // set image for Tesseract OCR engine
                    tesseractOcr.SetImage(image);

                    Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage;
                    try
                    {
                        // analyze page layout without tables detection and get result as OCR page
                        ocrPage = tesseractOcr.AnalyzeLayout();
                    }
                    finally
                    {
                        // clear image in Tesseract OCR engine, even if the analysis failed,
                        // so the engine does not keep a reference to the image
                        tesseractOcr.ClearImage();
                    }

                    // calculate count of regions, paragraphs, lines, words, symbols
                    int regionCount = ocrPage.Regions.Count;
                    int paragraphCount = 0;
                    int lineCount = 0;
                    int wordCount = 0;
                    int symbolCount = 0;
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrRegion region in ocrPage.Regions)
                    {
                        Vintasoft.Imaging.Ocr.Results.OcrTextRegion textRegion =
                            region as Vintasoft.Imaging.Ocr.Results.OcrTextRegion;
                        // "as" yields null for a region that is not a text region;
                        // such a region is counted as a region but has no paragraphs to count
                        if (textRegion == null)
                        {
                            continue;
                        }

                        paragraphCount += textRegion.Paragraphs.Count;
                        foreach (Vintasoft.Imaging.Ocr.Results.OcrParagraph paragraph in textRegion.Paragraphs)
                        {
                            lineCount += paragraph.TextLines.Count;
                            foreach (Vintasoft.Imaging.Ocr.Results.OcrTextLine line in paragraph.TextLines)
                            {
                                wordCount += line.Words.Count;
                                foreach (Vintasoft.Imaging.Ocr.Results.OcrWord word in line.Words)
                                {
                                    symbolCount += word.Symbols.Count;
                                }
                            }
                        }
                    }

                    // output information about count of regions, paragraphs, lines, words, symbols
                    System.Console.WriteLine("Layout result:");
                    System.Console.WriteLine(string.Format("- Region count: {0}", regionCount));
                    System.Console.WriteLine(string.Format("- Paragraph count: {0}", paragraphCount));
                    System.Console.WriteLine(string.Format("- Line count: {0}", lineCount));
                    System.Console.WriteLine(string.Format("- Word count: {0}", wordCount));
                    System.Console.WriteLine(string.Format("- Symbol count: {0}", symbolCount));
                    System.Console.WriteLine();

                    // wait for a key press before processing the next image
                    System.Console.ReadKey();
                }

                // shutdown the Tesseract OCR engine
                tesseractOcr.Shutdown();
            }
        }
        finally
        {
            // free images, also when an exception interrupted the processing
            images.ClearAndDisposeItems();
        }
    }
}
Requirements
Target Platforms: .NET 9; .NET 8; .NET 7; .NET 6; .NET Framework 4.8, 4.7, 4.6, 4.5, 4.0, 3.5
See Also