An OCR result can be saved to a text file or to a searchable PDF document. A page of the searchable PDF document may contain the recognized text placed over the page image, or the text only. To create a searchable PDF document from OCR results, use the PdfDocumentBuilder class.
Here is an example that shows how to save an OCR result to a text file as formatted text:
' Assemblies that the project must reference to run this snippet:
' - Vintasoft.Imaging
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract

Dim imageFilePath As String = "D:\TestImage.png"

' the text file goes into the image's directory and reuses the image's base name
Dim outputDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".txt"
Dim textFilePath As String = System.IO.Path.Combine(outputDirectory, outputFileName)

' the OCR engine is disposed when this Using block ends
Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' initialize the engine with settings for English text
    tesseractOcr.Init(New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

    Using sourceImage As New Vintasoft.Imaging.VintasoftImage(imageFilePath)
        ' give the image to the engine and run the recognition
        tesseractOcr.SetImage(sourceImage)
        Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize()

        ' write the formatted text of the recognized page as UTF-8
        System.IO.File.WriteAllText(textFilePath, recognizedPage.GetFormattedText(), System.Text.Encoding.UTF8)

        ' clear the image that was set on the engine
        tesseractOcr.ClearImage()
    End Using

    ' shut the engine down before it is disposed
    tesseractOcr.Shutdown()
End Using
// Assemblies that the project must reference to run this snippet:
// - Vintasoft.Imaging
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract

string imageFilePath = @"D:\TestImage.png";

// the text file goes into the image's directory and reuses the image's base name
string outputDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
string outputFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".txt";
string textFilePath = System.IO.Path.Combine(outputDirectory, outputFileName);

// the OCR engine is disposed when this using block ends
using (var tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // initialize the engine with settings for English text
    tesseractOcr.Init(
        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(
            Vintasoft.Imaging.Ocr.OcrLanguage.English));

    using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
    {
        // give the image to the engine and run the recognition
        tesseractOcr.SetImage(sourceImage);
        Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = tesseractOcr.Recognize();

        // write the formatted text of the recognized page as UTF-8
        System.IO.File.WriteAllText(
            textFilePath,
            recognizedPage.GetFormattedText(),
            System.Text.Encoding.UTF8);

        // clear the image that was set on the engine
        tesseractOcr.ClearImage();
    }

    // shut the engine down before it is disposed
    tesseractOcr.Shutdown();
}
' The project, which uses this code, must have references to the following assemblies:
' - Vintasoft.Twain
' - Vintasoft.Imaging
' - Vintasoft.Imaging.DocCleanup
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract
' - Vintasoft.Imaging.Pdf
' - Vintasoft.Imaging.Pdf.Ocr

Class ScanAndConvertToSearchablePdfDcoument

    ''' <summary>
    ''' Acquires images from scanner,
    ''' recognizes text on each acquired image and
    ''' saves the result as a searchable PDF document.
    ''' </summary>
    ''' <param name="language">The language that is passed to the OCR engine settings.</param>
    ''' <param name="outputPdfFilename">The file name that is passed to the PDF document constructor.</param>
    ''' <remarks>
    ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    ''' and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    ''' for executing this sample.
    ''' </remarks>
    Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
        System.Console.WriteLine("Create TWAIN device manager...")
        Using deviceManager As New Vintasoft.Twain.DeviceManager()
            ' create PDF document
            Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                ' create PDF document builder
                Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument)
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                System.Console.WriteLine("Create Tesseract OCR engine...")
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    ' create OCR engine manager
                    Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr)
                    Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)

                    System.Console.WriteLine("Open TWAIN device manager...")
                    deviceManager.Open()
                    Dim device As Vintasoft.Twain.Device = deviceManager.DefaultDevice

                    ' keep acquiring until the device reports the None state
                    Dim acquireState As Vintasoft.Twain.AcquireModalState
                    Do
                        System.Console.WriteLine("Acquire image from scanner...")
                        acquireState = device.AcquireModal()
                        If acquireState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                            Try
                                ' create VintasoftImage
                                Using image As New Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), True)
                                    ' preprocess image
                                    ' BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...")
                                    Dim preprocessing As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                                    preprocessing.Binarization = Nothing
                                    preprocessing.ExecuteInPlace(image)

                                    ' recognize image
                                    System.Console.WriteLine("Recognize the image...")
                                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions)

                                    ' add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...")
                                    documentBuilder.AddPage(image, page)
                                End Using
                            Finally
                                ' dispose the acquired image even if preprocessing,
                                ' recognition or page creation has failed
                                device.AcquiredImage.Dispose()
                            End Try
                        End If
                    Loop While acquireState <> Vintasoft.Twain.AcquireModalState.None

                    System.Console.WriteLine("Save changes in PDF document...")
                    pdfDocument.SaveChanges()
                End Using
            End Using
        End Using
    End Sub

End Class
// The project, which uses this code, must have references to the following assemblies:
// - Vintasoft.Twain
// - Vintasoft.Imaging
// - Vintasoft.Imaging.DocCleanup
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract
// - Vintasoft.Imaging.Pdf
// - Vintasoft.Imaging.Pdf.Ocr

class ScanAndConvertToSearchablePdfDcoument
{

    /// <summary>
    /// Acquires images from scanner,
    /// recognizes text on each acquired image and
    /// saves the result as a searchable PDF document.
    /// </summary>
    /// <param name="language">The language that is passed to the OCR engine settings.</param>
    /// <param name="outputPdfFilename">The file name that is passed to the PDF document constructor.</param>
    /// <remarks>
    /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    /// and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    /// for executing this sample.
    /// </remarks>
    public static void ScanImagesAndSaveAsSearchablePdfDocument(
        Vintasoft.Imaging.Ocr.OcrLanguage language,
        string outputPdfFilename)
    {
        System.Console.WriteLine("Create TWAIN device manager...");
        using (Vintasoft.Twain.DeviceManager deviceManager = new Vintasoft.Twain.DeviceManager())
        {
            // create PDF document
            using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument =
                new Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                // create PDF document builder
                Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                    new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                System.Console.WriteLine("Create Tesseract OCR engine...");
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    // create OCR engine manager
                    Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                        new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr);
                    Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                        new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);

                    System.Console.WriteLine("Open TWAIN device manager...");
                    deviceManager.Open();
                    Vintasoft.Twain.Device device = deviceManager.DefaultDevice;

                    // keep acquiring until the device reports the None state
                    Vintasoft.Twain.AcquireModalState acquireState;
                    do
                    {
                        System.Console.WriteLine("Acquire image from scanner...");
                        acquireState = device.AcquireModal();
                        if (acquireState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                        {
                            try
                            {
                                // create VintasoftImage
                                using (Vintasoft.Imaging.VintasoftImage image =
                                    new Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), true))
                                {
                                    // preprocess image
                                    // BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...");
                                    Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand preprocessing =
                                        new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                                    preprocessing.Binarization = null;
                                    preprocessing.ExecuteInPlace(image);

                                    // recognize image
                                    System.Console.WriteLine("Recognize the image...");
                                    Vintasoft.Imaging.Ocr.Results.OcrPage page =
                                        engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions);

                                    // add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...");
                                    documentBuilder.AddPage(image, page);
                                }
                            }
                            finally
                            {
                                // dispose the acquired image even if preprocessing,
                                // recognition or page creation has failed
                                device.AcquiredImage.Dispose();
                            }
                        }
                    }
                    while (acquireState != Vintasoft.Twain.AcquireModalState.None);

                    System.Console.WriteLine("Save changes in PDF document...");
                    pdfDocument.SaveChanges();
                }
            }
        }
    }

}
' Assemblies that the project must reference to run this snippet:
' - Vintasoft.Imaging
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract

Dim imageFilePath As String = "D:\TestImage.png"

' the hOCR file goes into the image's directory and reuses the image's base name
Dim outputDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".hocr"
Dim hocrFilePath As String = System.IO.Path.Combine(outputDirectory, outputFileName)

' the OCR engine is disposed when this Using block ends
Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' initialize the engine with settings for English text
    tesseractOcr.Init(New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

    Using sourceImage As New Vintasoft.Imaging.VintasoftImage(imageFilePath)
        ' give the image to the engine and run the recognition
        tesseractOcr.SetImage(sourceImage)
        Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize()

        ' create (or overwrite) the output file
        Using outputStream As System.IO.Stream = System.IO.File.Open(hocrFilePath, System.IO.FileMode.Create)
            ' export the recognized page to the stream using the HOcr codec
            Dim codec As New Vintasoft.Imaging.Ocr.Results.HOcrCodec()
            codec.Export(recognizedPage, outputStream)
        End Using

        ' clear the image that was set on the engine
        tesseractOcr.ClearImage()
    End Using

    ' shut the engine down before it is disposed
    tesseractOcr.Shutdown()
End Using
// Assemblies that the project must reference to run this snippet:
// - Vintasoft.Imaging
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract

string imageFilePath = @"D:\TestImage.png";

// the hOCR file goes into the image's directory and reuses the image's base name
string outputDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
string outputFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".hocr";
string hocrFilePath = System.IO.Path.Combine(outputDirectory, outputFileName);

// the OCR engine is disposed when this using block ends
using (var tesseractOcr = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // initialize the engine with settings for English text
    tesseractOcr.Init(
        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(
            Vintasoft.Imaging.Ocr.OcrLanguage.English));

    using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
    {
        // give the image to the engine and run the recognition
        tesseractOcr.SetImage(sourceImage);
        Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = tesseractOcr.Recognize();

        // create (or overwrite) the output file
        using (System.IO.Stream outputStream = System.IO.File.Open(hocrFilePath, System.IO.FileMode.Create))
        {
            // export the recognized page to the stream using the HOcr codec
            var codec = new Vintasoft.Imaging.Ocr.Results.HOcrCodec();
            codec.Export(recognizedPage, outputStream);
        }

        // clear the image that was set on the engine
        tesseractOcr.ClearImage();
    }

    // shut the engine down before it is disposed
    tesseractOcr.Shutdown();
}