An OCR result can be saved to a searchable PDF document. The searchable PDF document may contain the text located above the image, or the text only. To create a searchable PDF document from OCR results, use the PdfDocumentBuilder class.
Here is an example that shows how to save an OCR result to a PDF file as text above the image:
' The project, which uses this code, must have references to the following assemblies:
' - Vintasoft.Twain
' - Vintasoft.Imaging
' - Vintasoft.Imaging.DocCleanup
' - Vintasoft.Imaging.Ocr
' - Vintasoft.Imaging.Ocr.Tesseract
' - Vintasoft.Imaging.Pdf
' - Vintasoft.Imaging.Pdf.Ocr

Class ScanAndConvertToSearchablePdfDcoument

    ''' <summary>
    ''' Acquires images from scanner,
    ''' recognizes text on images and
    ''' saves the result as a searchable PDF document.
    ''' </summary>
    ''' <param name="language">OCR language used to create the OCR engine settings.</param>
    ''' <param name="outputPdfFilename">File name passed to the PdfDocument constructor.</param>
    ''' <exception cref="System.ArgumentNullException">
    ''' Thrown if <paramref name="outputPdfFilename"/> is Nothing.
    ''' </exception>
    ''' <remarks>
    ''' VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    ''' VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    ''' VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    ''' VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    ''' VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    ''' and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    ''' for executing this sample.
    ''' </remarks>
    Public Shared Sub ScanImagesAndSaveAsSearchablePdfDocument(language As Vintasoft.Imaging.Ocr.OcrLanguage, outputPdfFilename As String)
        ' fail early, before any device or file resources are created
        If outputPdfFilename Is Nothing Then
            Throw New System.ArgumentNullException("outputPdfFilename")
        End If

        System.Console.WriteLine("Create TWAIN device manager...")
        Using deviceManager As New Vintasoft.Twain.DeviceManager()
            ' create PDF document
            Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                ' create PDF document builder
                Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument)
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                ' NOTE(review): the accompanying text says "text above the image",
                ' but the mode used here is ImageOverText - confirm the intended mode
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                System.Console.WriteLine("Create Tesseract OCR engine...")
                Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                    ' create OCR engine manager
                    Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr)
                    Dim settings As New Vintasoft.Imaging.Ocr.OcrEngineSettings(language)

                    System.Console.WriteLine("Open TWAIN device manager...")
                    deviceManager.Open()
                    Dim device As Vintasoft.Twain.Device = deviceManager.DefaultDevice

                    ' keep acquiring until the device reports the None state
                    Dim acquireState As Vintasoft.Twain.AcquireModalState
                    Do
                        System.Console.WriteLine("Acquire image from scanner...")
                        acquireState = device.AcquireModal()

                        If acquireState = Vintasoft.Twain.AcquireModalState.ImageAcquired Then
                            Try
                                ' create VintasoftImage
                                Using image As New Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), True)
                                    ' preprocess image
                                    ' BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...")
                                    Dim preprocessing As New Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand()
                                    ' presumably disables the binarization step so the image
                                    ' stored in PDF is not binarized - confirm against SDK docs
                                    preprocessing.Binarization = Nothing
                                    preprocessing.ExecuteInPlace(image)

                                    ' recognize image
                                    System.Console.WriteLine("Recognize the image...")
                                    Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions)

                                    ' add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...")
                                    documentBuilder.AddPage(image, page)
                                End Using
                            Finally
                                ' dispose the acquired image even if preprocessing,
                                ' recognition or page creation failed
                                device.AcquiredImage.Dispose()
                            End Try
                        End If
                    Loop While acquireState <> Vintasoft.Twain.AcquireModalState.None

                    System.Console.WriteLine("Save changes in PDF document...")
                    pdfDocument.SaveChanges()
                End Using
            End Using
        End Using
    End Sub

End Class
// The project, which uses this code, must have references to the following assemblies:
// - Vintasoft.Twain
// - Vintasoft.Imaging
// - Vintasoft.Imaging.DocCleanup
// - Vintasoft.Imaging.Ocr
// - Vintasoft.Imaging.Ocr.Tesseract
// - Vintasoft.Imaging.Pdf
// - Vintasoft.Imaging.Pdf.Ocr

class ScanAndConvertToSearchablePdfDcoument
{

    /// <summary>
    /// Acquires images from scanner,
    /// recognizes text on images and
    /// saves the result as a searchable PDF document.
    /// </summary>
    /// <param name="language">OCR language used to create the OCR engine settings.</param>
    /// <param name="outputPdfFilename">File name passed to the PdfDocument constructor.</param>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown if <paramref name="outputPdfFilename"/> is <c>null</c>.
    /// </exception>
    /// <remarks>
    /// VintaSoft Twain .NET SDK (Vintasoft.Twain.dll),
    /// VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll),
    /// VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll),
    /// VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll),
    /// VintaSoft PDF .NET Plug-in (Vintasoft.Imaging.Pdf.dll, Vintasoft.Imaging.Pdf.Ocr.dll)
    /// and VintaSoft PDF .NET Plug-in (Writer) (Vintasoft.Pdf.dll) are necessary
    /// for executing this sample.
    /// </remarks>
    public static void ScanImagesAndSaveAsSearchablePdfDocument(
        Vintasoft.Imaging.Ocr.OcrLanguage language,
        string outputPdfFilename)
    {
        // fail early, before any device or file resources are created
        if (outputPdfFilename == null)
        {
            throw new System.ArgumentNullException("outputPdfFilename");
        }

        System.Console.WriteLine("Create TWAIN device manager...");
        using (Vintasoft.Twain.DeviceManager deviceManager = new Vintasoft.Twain.DeviceManager())
        {
            // create PDF document
            using (Vintasoft.Imaging.Pdf.PdfDocument pdfDocument =
                new Vintasoft.Imaging.Pdf.PdfDocument(outputPdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
            {
                // create PDF document builder
                Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                    new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(pdfDocument);
                documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                // NOTE(review): the accompanying text says "text above the image",
                // but the mode used here is ImageOverText - confirm the intended mode
                documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                System.Console.WriteLine("Create Tesseract OCR engine...");
                using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                {
                    // create OCR engine manager
                    Vintasoft.Imaging.Ocr.OcrEngineManager engineManager =
                        new Vintasoft.Imaging.Ocr.OcrEngineManager(tesseractOcr);
                    Vintasoft.Imaging.Ocr.OcrEngineSettings settings =
                        new Vintasoft.Imaging.Ocr.OcrEngineSettings(language);

                    System.Console.WriteLine("Open TWAIN device manager...");
                    deviceManager.Open();
                    Vintasoft.Twain.Device device = deviceManager.DefaultDevice;

                    // keep acquiring until the device reports the None state
                    Vintasoft.Twain.AcquireModalState acquireState;
                    do
                    {
                        System.Console.WriteLine("Acquire image from scanner...");
                        acquireState = device.AcquireModal();

                        if (acquireState == Vintasoft.Twain.AcquireModalState.ImageAcquired)
                        {
                            try
                            {
                                // create VintasoftImage
                                using (Vintasoft.Imaging.VintasoftImage image =
                                    new Vintasoft.Imaging.VintasoftImage(device.AcquiredImage.GetAsBitmap(), true))
                                {
                                    // preprocess image
                                    // BorderClear, Despeckle, Deskew, Segmentation
                                    System.Console.WriteLine("Preprocess the image...");
                                    Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand preprocessing =
                                        new Vintasoft.Imaging.ImageProcessing.Document.OcrPreprocessingCommand();
                                    // presumably disables the binarization step so the image
                                    // stored in PDF is not binarized - confirm against SDK docs
                                    preprocessing.Binarization = null;
                                    preprocessing.ExecuteInPlace(image);

                                    // recognize image
                                    System.Console.WriteLine("Recognize the image...");
                                    Vintasoft.Imaging.Ocr.Results.OcrPage page =
                                        engineManager.Recognize(image, settings, preprocessing.SegmentationTextRegions);

                                    // add page to PDF document
                                    System.Console.WriteLine("Add page to PDF document...");
                                    documentBuilder.AddPage(image, page);
                                }
                            }
                            finally
                            {
                                // dispose the acquired image even if preprocessing,
                                // recognition or page creation failed
                                device.AcquiredImage.Dispose();
                            }
                        }
                    }
                    while (acquireState != Vintasoft.Twain.AcquireModalState.None);

                    System.Console.WriteLine("Save changes in PDF document...");
                    pdfDocument.SaveChanges();
                }
            }
        }
    }
}