OCR: How to analyze layout of document image using Tesseract OCR in .NET
In This Topic
Text recognition in a document image consists of two steps. The first step analyzes the layout of the document image, i.e. it determines the positions of paragraphs, text lines, words and symbols in the document image. The second step performs character recognition in the document image and groups the recognized characters into paragraphs, text lines, and words.
VintaSoft Imaging .NET SDK with
VintaSoft OCR .NET Plug-in allows you to recognize text in a document image using the
TesseractOcr.Recognize method. This method performs both steps of text recognition, i.e. it analyzes the layout and performs character recognition.
Sometimes there is a need to analyze the position of paragraphs, text lines, words and symbols in the document image without recognition of characters.
VintaSoft Imaging .NET SDK with
VintaSoft OCR .NET Plug-in allows you to analyze the layout of a document image (determine the positions of paragraphs, text lines, words and symbols in the document image) using the
TesseractOcr.AnalyzeLayout method. The
TesseractOcr.AnalyzeLayout method works faster than the
TesseractOcr.Recognize method, because the
TesseractOcr.AnalyzeLayout method only analyzes the layout of the document image and does not perform character recognition.
Here is C#/VB.NET code that shows how to analyze the layout of document image using Tesseract OCR engine (
TesseractOcr.AnalyzeLayout method):
/// <summary>
/// Analyzes page layout without tables detection using Tesseract OCR engine.
/// </summary>
/// <remarks>
/// For each image loaded from the file, the layout is analyzed and the counts of
/// regions, paragraphs, text lines, words and symbols are written to the console.
/// The method waits for a key press after each image.
/// Regions that are not text regions contribute to the region count only.
/// </remarks>
/// <param name="filename">The name of document image file.</param>
public static void AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(string filename)
{
    // create an image collection
    using (Vintasoft.Imaging.ImageCollection images =
        new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to the image collection
            images.Add(filename);

            System.Console.WriteLine("Create Tesseract OCR engine...");
            // create the Tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                System.Console.WriteLine("Initialize OCR engine...");
                // init the Tesseract OCR engine for recognition of English characters
                tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English));

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    System.Console.WriteLine("Recognize the image...");

                    // set image for Tesseract OCR engine
                    tesseractOcr.SetImage(image);
                    // analyze page layout without tables detection and get result as OCR page
                    Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.AnalyzeLayout();
                    // clear image in Tesseract OCR engine
                    tesseractOcr.ClearImage();

                    // calculate count of regions, paragraphs, lines, words, symbols
                    int regionCount = ocrPage.Regions.Count;
                    int paragraphCount = 0;
                    int lineCount = 0;
                    int wordCount = 0;
                    int symbolCount = 0;
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrRegion region in ocrPage.Regions)
                    {
                        Vintasoft.Imaging.Ocr.Results.OcrTextRegion textRegion =
                            region as Vintasoft.Imaging.Ocr.Results.OcrTextRegion;
                        // the "as" cast yields null for a region that is not a text region;
                        // such a region has no paragraphs to count, so skip it instead of
                        // failing with NullReferenceException
                        if (textRegion == null)
                            continue;

                        paragraphCount += textRegion.Paragraphs.Count;
                        foreach (Vintasoft.Imaging.Ocr.Results.OcrParagraph paragraph in textRegion.Paragraphs)
                        {
                            lineCount += paragraph.TextLines.Count;
                            foreach (Vintasoft.Imaging.Ocr.Results.OcrTextLine line in paragraph.TextLines)
                            {
                                wordCount += line.Words.Count;
                                foreach (Vintasoft.Imaging.Ocr.Results.OcrWord word in line.Words)
                                {
                                    symbolCount += word.Symbols.Count;
                                }
                            }
                        }
                    }

                    // output information about count of regions, paragraphs, lines, words, symbols
                    System.Console.WriteLine("Layout result:");
                    System.Console.WriteLine(string.Format("- Region count: {0}", regionCount));
                    System.Console.WriteLine(string.Format("- Paragraph count: {0}", paragraphCount));
                    System.Console.WriteLine(string.Format("- Line count: {0}", lineCount));
                    System.Console.WriteLine(string.Format("- Word count: {0}", wordCount));
                    System.Console.WriteLine(string.Format("- Symbol count: {0}", symbolCount));
                    System.Console.WriteLine();

                    // wait for a key press before processing the next image
                    System.Console.ReadKey();
                }

                // shutdown the Tesseract OCR engine
                tesseractOcr.Shutdown();
            }
        }
        finally
        {
            // free images, even if loading, initialization or layout analysis failed
            images.ClearAndDisposeItems();
        }
    }
}
''' <summary>
''' Analyzes page layout without tables detection using Tesseract OCR engine.
''' </summary>
''' <remarks>
''' For each image loaded from the file, the layout is analyzed and the counts of
''' regions, paragraphs, text lines, words and symbols are written to the console.
''' The method waits for a key press after each image.
''' Regions that are not text regions contribute to the region count only.
''' </remarks>
''' <param name="filename">The name of document image file.</param>
Public Shared Sub AnalyzePageLayoutWithoutTablesDetectionUsingTesseractOCR(filename As String)
    ' create an image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add images from file to the image collection
            images.Add(filename)

            System.Console.WriteLine("Create Tesseract OCR engine...")
            ' create the Tesseract OCR engine
            Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                System.Console.WriteLine("Initialize OCR engine...")
                ' init the Tesseract OCR engine for recognition of English characters
                tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

                ' for each image in image collection
                For Each image As Vintasoft.Imaging.VintasoftImage In images
                    System.Console.WriteLine("Recognize the image...")

                    ' set image for Tesseract OCR engine
                    tesseractOcr.SetImage(image)
                    ' analyze page layout without tables detection and get result as OCR page
                    Dim ocrPage As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.AnalyzeLayout()
                    ' clear image in Tesseract OCR engine
                    tesseractOcr.ClearImage()

                    ' calculate count of regions, paragraphs, lines, words, symbols
                    Dim regionCount As Integer = ocrPage.Regions.Count
                    Dim paragraphCount As Integer = 0
                    Dim lineCount As Integer = 0
                    Dim wordCount As Integer = 0
                    Dim symbolCount As Integer = 0
                    For Each region As Vintasoft.Imaging.Ocr.Results.OcrRegion In ocrPage.Regions
                        Dim textRegion As Vintasoft.Imaging.Ocr.Results.OcrTextRegion = TryCast(region, Vintasoft.Imaging.Ocr.Results.OcrTextRegion)
                        ' TryCast yields Nothing for a region that is not a text region;
                        ' such a region has no paragraphs to count, so skip it instead of
                        ' failing with NullReferenceException
                        If textRegion Is Nothing Then
                            Continue For
                        End If

                        paragraphCount += textRegion.Paragraphs.Count
                        For Each paragraph As Vintasoft.Imaging.Ocr.Results.OcrParagraph In textRegion.Paragraphs
                            lineCount += paragraph.TextLines.Count
                            For Each line As Vintasoft.Imaging.Ocr.Results.OcrTextLine In paragraph.TextLines
                                wordCount += line.Words.Count
                                For Each word As Vintasoft.Imaging.Ocr.Results.OcrWord In line.Words
                                    symbolCount += word.Symbols.Count
                                Next
                            Next
                        Next
                    Next

                    ' output information about count of regions, paragraphs, lines, words, symbols
                    System.Console.WriteLine("Layout result:")
                    System.Console.WriteLine(String.Format("- Region count: {0}", regionCount))
                    System.Console.WriteLine(String.Format("- Paragraph count: {0}", paragraphCount))
                    System.Console.WriteLine(String.Format("- Line count: {0}", lineCount))
                    System.Console.WriteLine(String.Format("- Word count: {0}", wordCount))
                    System.Console.WriteLine(String.Format("- Symbol count: {0}", symbolCount))
                    System.Console.WriteLine()

                    ' wait for a key press before processing the next image
                    System.Console.ReadKey()
                Next

                ' shutdown the Tesseract OCR engine
                tesseractOcr.Shutdown()
            End Using
        Finally
            ' free images, even if loading, initialization or layout analysis failed
            images.ClearAndDisposeItems()
        End Try
    End Using
End Sub