Identify and recognize a document that contains completed form with text
In This Topic
If you want to identify and recognize document that contains completed form with text, you need to do the following steps:
- Specify the OCR engine that must be used for recognition of text in text fields - this can be done using the OcrFieldTemplate.OcrEngineManager property.
- Identify a template for completed form
- Recognize field values in completed form
Here is C#/VB.NET code that demonstrates how to identify and recognize a completed form that contains text.
/// <summary>
/// Recognizes the form with OCR fields.
/// </summary>
/// <param name="formRecognitionManager">The form recognition manager.</param>
/// <param name="image">The image.</param>
public static void RecognizeFormWithOcrFields(
Vintasoft.Imaging.FormsProcessing.FormRecognitionManager formRecognitionManager,
Vintasoft.Imaging.VintasoftImage image)
{
// check whether OCR engine manager of the OCR field templates is initialized
// (this initialization can be moved to the start of your application)
if (Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager == null)
{
// get or create text OCR engine
Vintasoft.Imaging.Ocr.OcrEngine textOcrEngine = GetOcrEngine();
// create Handwritten digits OCR engine
Vintasoft.Imaging.Ocr.OcrEngine handwrittenDigitsOcrEngine = new Vintasoft.Imaging.Ocr.ML.HandwrittenDigits.HandwrittenDigitsOcrEngine();
// create and set OCR engine manager of the OCR field templates
Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager =
new Vintasoft.Imaging.Ocr.OcrEngineManager(textOcrEngine, handwrittenDigitsOcrEngine);
}
// recognize filled form in an image
Vintasoft.Imaging.FormsProcessing.FormRecognitionResult recognitionResult =
formRecognitionManager.Recognize(image);
// get the result of image comparison
Vintasoft.Imaging.FormsProcessing.TemplateMatching.ImageImprintCompareResult imageCompareResult =
recognitionResult.TemplateMatchingResult.ImageCompareResult;
// if result is not reliable
if (!imageCompareResult.IsReliable)
{
// matching template is not found
System.Console.WriteLine("Matching template is not found.");
}
else
{
// get recognized page
Vintasoft.Imaging.FormsProcessing.FormRecognition.FormPage recognizedPage = recognitionResult.RecognizedPage;
// get form field count
if (recognizedPage.Items.Count == 0)
{
System.Console.WriteLine("No form fields were recognized.");
}
else
{
System.Console.WriteLine(string.Format(
"Recognized form field count: {0}",
recognizedPage.Items.Count));
// for each recognized form field
foreach (Vintasoft.Imaging.FormsProcessing.FormRecognition.FormField recognizedField in recognizedPage.Items)
{
if (recognizedField is Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField)
{
Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField ocrField =
(Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField)recognizedField;
// write field info
System.Console.WriteLine(string.Format(
" OCR field: name: {0}; value: {1}; confidence: {2:F1}%",
ocrField.Name,
ocrField.Value,
ocrField.Confidence * 100));
Vintasoft.Imaging.Ocr.Results.OcrPage ocrResult = ocrField.OcrResult;
// get all words
Vintasoft.Imaging.Ocr.Results.OcrObject[] words = ocrResult.GetWords(75, 75);
// write words info
for (int i = 0; i < words.Length; i++)
{
Vintasoft.Imaging.Ocr.Results.OcrObject word = words[i];
System.Console.WriteLine(string.Format(
" OCR word: {0}; confidence: {1:F1}%",
word.ToString(),
word.Confidence));
}
}
}
}
}
}
/// <summary>
/// Gets the OCR engine used for OCR field recognition.
/// </summary>
/// <remarks>
/// To create a Tesseract OCR engine,
/// add a reference to Vintasoft.Imaging.Ocr.Tesseract.dll
/// into your project.
/// </remarks>
private static Vintasoft.Imaging.Ocr.OcrEngine GetOcrEngine()
{
// full path to the Tesseract5.Vintasoft.xXX.dll files
// NOTE: specify here the actual path to the Tesseract OCR dll files
string tesseractDllDirectory = @"C:\Program Files\VintaSoft\VintaSoft Imaging .NET\Bin\TesseractOCR\";
// create Tesseract OCR engine (Vintasoft.Imaging.Ocr.Tesseract.dll is required)
return new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(tesseractDllDirectory);
}
''' <summary>
''' Recognizes the form with OCR fields.
''' </summary>
''' <param name="formRecognitionManager">The form recognition manager.</param>
''' <param name="image">The image.</param>
Public Shared Sub RecognizeFormWithOcrFields(formRecognitionManager As Vintasoft.Imaging.FormsProcessing.FormRecognitionManager, image As Vintasoft.Imaging.VintasoftImage)
' check whether OCR engine manager of the OCR field templates is initialized
' (this initialization can be moved to the start of your application)
If Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager Is Nothing Then
' get or create text OCR engine
Dim textOcrEngine As Vintasoft.Imaging.Ocr.OcrEngine = GetOcrEngine()
' create Handwritten digits OCR engine
Dim handwrittenDigitsOcrEngine As Vintasoft.Imaging.Ocr.OcrEngine = New Vintasoft.Imaging.Ocr.ML.HandwrittenDigits.HandwrittenDigitsOcrEngine()
' create and set OCR engine manager of the OCR field templates
Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrFieldTemplate.OcrEngineManager = New Vintasoft.Imaging.Ocr.OcrEngineManager(textOcrEngine, handwrittenDigitsOcrEngine)
End If
' recognize filled form in an image
Dim recognitionResult As Vintasoft.Imaging.FormsProcessing.FormRecognitionResult = formRecognitionManager.Recognize(image)
' get the result of image comparison
Dim imageCompareResult As Vintasoft.Imaging.FormsProcessing.TemplateMatching.ImageImprintCompareResult = recognitionResult.TemplateMatchingResult.ImageCompareResult
' if result is not reliable
If Not imageCompareResult.IsReliable Then
' matching template is not found
System.Console.WriteLine("Matching template is not found.")
Else
' get recognized page
Dim recognizedPage As Vintasoft.Imaging.FormsProcessing.FormRecognition.FormPage = recognitionResult.RecognizedPage
' get form field count
If recognizedPage.Items.Count = 0 Then
System.Console.WriteLine("No form fields were recognized.")
Else
System.Console.WriteLine(String.Format("Recognized form field count: {0}", recognizedPage.Items.Count))
' for each recognized form field
For Each recognizedField As Vintasoft.Imaging.FormsProcessing.FormRecognition.FormField In recognizedPage.Items
If TypeOf recognizedField Is Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField Then
Dim ocrField As Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField = DirectCast(recognizedField, Vintasoft.Imaging.FormsProcessing.FormRecognition.Ocr.OcrField)
' write field info
System.Console.WriteLine(String.Format(" OCR field: name: {0}; value: {1}; confidence: {2:F1}%", ocrField.Name, ocrField.Value, ocrField.Confidence * 100))
Dim ocrResult As Vintasoft.Imaging.Ocr.Results.OcrPage = ocrField.OcrResult
' get all words
Dim words As Vintasoft.Imaging.Ocr.Results.OcrObject() = ocrResult.GetWords(75, 75)
' write words info
For i As Integer = 0 To words.Length - 1
Dim word As Vintasoft.Imaging.Ocr.Results.OcrObject = words(i)
System.Console.WriteLine(String.Format(" OCR word: {0}; confidence: {1:F1}%", word.ToString(), word.Confidence))
Next
End If
Next
End If
End If
End Sub
''' <summary>
''' Gets the OCR engine used for OCR field recognition.
''' </summary>
''' <remarks>
''' To create a Tesseract OCR engine,
''' add a reference to Vintasoft.Imaging.Ocr.Tesseract.dll
''' into your project.
''' </remarks>
Private Shared Function GetOcrEngine() As Vintasoft.Imaging.Ocr.OcrEngine
' full path to the Tesseract5.Vintasoft.xXX.dll files
' NOTE: specify here the actual path to the Tesseract OCR dll files
Dim tesseractDllDirectory As String = "C:\Program Files\VintaSoft\VintaSoft Imaging .NET\Bin\TesseractOCR\"
' create Tesseract OCR engine (Vintasoft.Imaging.Ocr.Tesseract.dll is required)
Return New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(tesseractDllDirectory)
End Function