The result of text recognition in an image is returned as an OCR page (an instance of the OcrPage class). An OCR page contains text regions (instances of the OcrTextRegion class). A text region contains paragraphs (instances of the OcrParagraph class). A paragraph contains text lines (instances of the OcrTextLine class). A text line contains words (instances of the OcrWord class). A word contains symbols (instances of the OcrSymbol class).
All elements (OcrPage, OcrTextRegion, OcrParagraph, OcrTextLine, OcrWord, OcrSymbol) contain:
' The project, which uses this code, must have references to the following assemblies:
' - Vintasoft.Imaging
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract

''' <summary>
''' Recognizes text in images,
''' removes words with low confidence from recognized text and
''' returns recognized text.
''' </summary>
''' <param name="filename">The name of the file containing the images to OCR.</param>
''' <returns>
''' The filtered text of all images from the file;
''' the text of each image is followed by a line break.
''' </returns>
Public Function RecognizeTextAndFilterRecognitionResult(filename As String) As String
    ' words recognized with a confidence below this value are removed
    Const MIN_CONFIDENCE As Single = 75F

    ' create image collection
    Using images As New Vintasoft.Imaging.ImageCollection()
        Try
            ' add images from file to image collection
            images.Add(filename)

            ' create tesseract OCR engine
            Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                ' create tesseract OCR settings (English text) and initialize the engine
                Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English)
                tesseractOcr.Init(settings)

                ' create result builder
                Dim result As New System.Text.StringBuilder()

                ' for each image in image collection
                For Each image As Vintasoft.Imaging.VintasoftImage In images
                    ' recognize the image
                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                    ' get all words in recognized text
                    Dim ocrObjects As Vintasoft.Imaging.Ocr.Results.OcrObject() = page.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word)

                    ' collect the words whose confidence is less than the minimum confidence;
                    ' they are removed in one call after the scan is complete
                    Dim removeObjects As New System.Collections.Generic.List(Of Vintasoft.Imaging.Ocr.Results.OcrObject)()
                    For Each word As Vintasoft.Imaging.Ocr.Results.OcrObject In ocrObjects
                        If word.Confidence < MIN_CONFIDENCE Then
                            removeObjects.Add(word)
                        End If
                    Next

                    ' remove words with low confidence and validate recognition results
                    Dim editor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page)
                    editor.RemoveObjects(removeObjects.ToArray())
                    editor.ValidateResults()

                    ' get recognized text
                    Dim text As String = page.GetText()

                    ' add recognized text to result
                    result.Append(text)
                    result.AppendLine()
                Next

                ' return result
                Return result.ToString()
            End Using
        Finally
            ' dispose images and clear image collection;
            ' done in Finally so the images are released even if recognition throws
            images.ClearAndDisposeItems()
        End Try
    End Using
End Function
// The project, which uses this code, must have references to the following assemblies:
// - Vintasoft.Imaging
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract

/// <summary>
/// Recognizes text in images,
/// removes words with low confidence from recognized text and
/// returns recognized text.
/// </summary>
/// <param name="filename">The name of the file containing the images to OCR.</param>
/// <returns>
/// The filtered text of all images from the file;
/// the text of each image is followed by a line break.
/// </returns>
public string RecognizeTextAndFilterRecognitionResult(string filename)
{
    // words recognized with a confidence below this value are removed
    const float MIN_CONFIDENCE = 75.0f;

    // create image collection
    using (Vintasoft.Imaging.ImageCollection images =
        new Vintasoft.Imaging.ImageCollection())
    {
        try
        {
            // add images from file to image collection
            images.Add(filename);

            // create tesseract OCR engine
            using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                // create tesseract OCR settings (English text) and initialize the engine
                Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                    new Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English);
                tesseractOcr.Init(settings);

                // create result builder
                System.Text.StringBuilder result = new System.Text.StringBuilder();

                // for each image in image collection
                foreach (Vintasoft.Imaging.VintasoftImage image in images)
                {
                    // recognize the image
                    Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                    // get all words in recognized text
                    Vintasoft.Imaging.Ocr.Results.OcrObject[] ocrObjects = page.GetObjects(
                        Vintasoft.Imaging.Ocr.OcrObjectType.Word);

                    // collect the words whose confidence is less than the minimum confidence;
                    // they are removed in one call after the scan is complete
                    System.Collections.Generic.List<Vintasoft.Imaging.Ocr.Results.OcrObject> removeObjects =
                        new System.Collections.Generic.List<Vintasoft.Imaging.Ocr.Results.OcrObject>();
                    foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrObjects)
                    {
                        if (word.Confidence < MIN_CONFIDENCE)
                        {
                            removeObjects.Add(word);
                        }
                    }

                    // remove words with low confidence and validate recognition results
                    Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
                        new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(page);
                    editor.RemoveObjects(removeObjects.ToArray());
                    editor.ValidateResults();

                    // get recognized text
                    string text = page.GetText();

                    // add recognized text to result
                    result.Append(text);
                    result.AppendLine();
                }

                // return result
                return result.ToString();
            }
        }
        finally
        {
            // dispose images and clear image collection;
            // done in finally so the images are released even if recognition throws
            images.ClearAndDisposeItems();
        }
    }
}
' The project, which uses this code, must have references to the following assemblies:
' - Vintasoft.Imaging
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract
' - Vintasoft.Imaging.Pdf

' Recognizes German text on the first page of a PDF document at two different scales,
' merges both recognition results into one OCR page and saves the text
' to a ".txt" file located next to the PDF document.
Dim imageFilePath As String = "D:\TestImage.pdf"

' primary OCR engine
Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' two slots for additional OCR engines; the slots are filled inside "Try"
    ' so that "Finally" can dispose every engine that has been created
    Dim additionalEngines As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr() = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(1) {}
    Try
        ' create the additional OCR engines
        Dim engineIndex As Integer = 0
        While engineIndex < additionalEngines.Length
            additionalEngines(engineIndex) = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
            engineIndex += 1
        End While

        ' the manager performs recognition using the primary and the additional engines
        Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr, additionalEngines)

        ' open the PDF document
        Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath)
            ' OCR engine settings: German text will be recognized
            Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(Vintasoft.Imaging.Ocr.OcrLanguage.German)

            ' nothing to recognize in an empty PDF document
            If pdfDocument.Pages.Count = 0 Then
                Return
            End If

            ' only the first PDF page is processed
            Dim firstPage As Vintasoft.Imaging.Pdf.Tree.PdfPage = pdfDocument.Pages(0)

            ' scales applied to the PDF page before text recognition
            Dim scales As Single() = New Single() {0.5F, 1.5F}

            ' one OCR result per scale
            Dim scaledOcrResults As Vintasoft.Imaging.Ocr.Results.OcrPage() = New Vintasoft.Imaging.Ocr.Results.OcrPage(scales.Length - 1) {}

            ' render the page at each scale and recognize text in the rendered image
            For scaleIndex As Integer = 0 To scales.Length - 1
                Using scaledImage As Vintasoft.Imaging.VintasoftImage = firstPage.Render(scales(scaleIndex))
                    scaledOcrResults(scaleIndex) = engineManager.Recognize(scaledImage, settings)
                End Using
            Next

            ' if there are recognition results
            If scaledOcrResults.Length > 0 Then
                ' bring every OCR result back to the coordinate space of the unscaled page:
                ' the page was scaled before recognition, so the result is scaled by the inverse factor
                For resultIndex As Integer = 0 To scaledOcrResults.Length - 1
                    Dim appliedScale As Single = scales(resultIndex)
                    If appliedScale <> 1F Then
                        Dim scaleEditor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(scaledOcrResults(resultIndex))
                        Dim inverseScale As New Vintasoft.Imaging.Scale(1F / appliedScale, 1F / appliedScale)
                        scaleEditor.ScaleOcrPage(inverseScale)
                    End If
                Next

                ' the first OCR result becomes the final OCR result,
                ' the regions of all the other OCR results are added to it
                Dim finalOcrResult As Vintasoft.Imaging.Ocr.Results.OcrPage = scaledOcrResults(0)
                Dim mergeEditor As New Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult)
                Dim mergeIndex As Integer = 1
                While mergeIndex < scaledOcrResults.Length
                    mergeEditor.AddRegions(scaledOcrResults(mergeIndex))
                    mergeIndex += 1
                End While

                ' validate the final OCR result
                mergeEditor.ValidateResults()

                ' get the recognized text from the final OCR result
                Dim ocrPageContent As String = finalOcrResult.GetText()

                ' the text file has the name of the PDF document and is stored in the same directory
                Dim outputDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
                Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".txt"
                Dim textFilePath As String = System.IO.Path.Combine(outputDirectory, outputFileName)

                ' save the recognition results as UTF-8 text
                System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8)
            End If
        End Using
    Finally
        ' dispose every additional OCR engine that has been created
        For Each additionalEngine As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr In additionalEngines
            If additionalEngine IsNot Nothing Then
                additionalEngine.Dispose()
            End If
        Next
    End Try
End Using
// The project, which uses this code, must have references to the following assemblies:
// - Vintasoft.Imaging
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract
// - Vintasoft.Imaging.Pdf

// Recognizes German text on the first page of a PDF document at two different scales,
// merges both recognition results into one OCR page and saves the text
// to a ".txt" file located next to the PDF document.
string imageFilePath = @"D:\TestImage.pdf";

// primary OCR engine
using (var tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // two slots for additional OCR engines; the slots are filled inside "try"
    // so that "finally" can dispose every engine that has been created
    var additionalEngines = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr[2];
    try
    {
        // create the additional OCR engines
        int engineIndex = 0;
        while (engineIndex < additionalEngines.Length)
        {
            additionalEngines[engineIndex] = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr();
            engineIndex++;
        }

        // the manager performs recognition using the primary and the additional engines
        var engineManager = new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr, additionalEngines);

        // open the PDF document
        using (var pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument(imageFilePath))
        {
            // OCR engine settings: German text will be recognized
            var settings = new Vintasoft.Imaging.Ocr.OcrEngineSettings(
                Vintasoft.Imaging.Ocr.OcrLanguage.German);

            // nothing to recognize in an empty PDF document
            if (pdfDocument.Pages.Count == 0)
            {
                return;
            }

            // only the first PDF page is processed
            Vintasoft.Imaging.Pdf.Tree.PdfPage firstPage = pdfDocument.Pages[0];

            // scales applied to the PDF page before text recognition
            float[] scales = new float[] { 0.5f, 1.5f };

            // one OCR result per scale
            var scaledOcrResults = new Vintasoft.Imaging.Ocr.Results.OcrPage[scales.Length];

            // render the page at each scale and recognize text in the rendered image
            for (int scaleIndex = 0; scaleIndex < scales.Length; scaleIndex++)
            {
                using (Vintasoft.Imaging.VintasoftImage scaledImage = firstPage.Render(scales[scaleIndex]))
                {
                    scaledOcrResults[scaleIndex] = engineManager.Recognize(scaledImage, settings);
                }
            }

            // if there are recognition results
            if (scaledOcrResults.Length > 0)
            {
                // bring every OCR result back to the coordinate space of the unscaled page:
                // the page was scaled before recognition, so the result is scaled by the inverse factor
                for (int resultIndex = 0; resultIndex < scaledOcrResults.Length; resultIndex++)
                {
                    float appliedScale = scales[resultIndex];
                    if (appliedScale != 1f)
                    {
                        var scaleEditor = new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(
                            scaledOcrResults[resultIndex]);
                        var inverseScale = new Vintasoft.Imaging.Scale(1f / appliedScale, 1f / appliedScale);
                        scaleEditor.ScaleOcrPage(inverseScale);
                    }
                }

                // the first OCR result becomes the final OCR result,
                // the regions of all the other OCR results are added to it
                Vintasoft.Imaging.Ocr.Results.OcrPage finalOcrResult = scaledOcrResults[0];
                var mergeEditor = new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(finalOcrResult);
                int mergeIndex = 1;
                while (mergeIndex < scaledOcrResults.Length)
                {
                    mergeEditor.AddRegions(scaledOcrResults[mergeIndex]);
                    mergeIndex++;
                }

                // validate the final OCR result
                mergeEditor.ValidateResults();

                // get the recognized text from the final OCR result
                string ocrPageContent = finalOcrResult.GetText();

                // the text file has the name of the PDF document and is stored in the same directory
                string outputDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
                string outputFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".txt";
                string textFilePath = System.IO.Path.Combine(outputDirectory, outputFileName);

                // save the recognition results as UTF-8 text
                System.IO.File.WriteAllText(textFilePath, ocrPageContent, System.Text.Encoding.UTF8);
            }
        }
    }
    finally
    {
        // dispose every additional OCR engine that has been created
        foreach (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr additionalEngine in additionalEngines)
        {
            if (additionalEngine != null)
            {
                additionalEngine.Dispose();
            }
        }
    }
}