OCR: Save the OCR result to a searchable PDF document
In This Topic
OCR result can be saved to a searchable PDF document in 3 modes:
- Text over image.
- Image over text. In this mode, the SDK draws invisible text under the image. Text symbols are drawn using a glyphless font.
- Text only.
In "Text over image" mode:
- The created PDF page has a background. The background is the source image with the text symbols erased.
- Text symbols are drawn over image.
- Text symbols are drawn using OCR fonts, which are created using outlines of recognized text symbols.
- Text in created PDF document can be selected in any PDF reader.
- Text in created PDF document can be edited in any PDF editor.
- The rendered PDF page is equal to the source image.
In "Image over text" mode:
- Created PDF page has background. Background equals to the source image.
- Text symbols are drawn under image.
- Text symbols are drawn using glyphless font.
- Text in created PDF document can be selected in any PDF reader.
- Text in the created PDF document can be edited only if the PDF editor allows editing hidden text.
- The rendered PDF page is equal to the source image.
In "Text only" mode:
- Created PDF page does not have background.
- Text symbols are drawn using single font, which is selected by user.
- Text in created PDF document can be selected in any PDF reader.
- Text in created PDF document can be edited in any PDF editor.
- In most cases, the rendered PDF page is not equal to the source image, because the text font contains symbols with "ideal" outlines, whereas the source image usually contains symbols whose outlines are not "ideal".
The SearchablePdfGenerator class is a processing command that allows you to add recognized pages to a searchable PDF document.
Here is C#/VB.NET code that shows how to convert an image file to a searchable PDF document:
/// <summary>
/// Converts an image file to a searchable PDF document using the Tesseract OCR engine.
/// </summary>
/// <param name="sourceFilePath">Path of the source image file.</param>
/// <param name="pageCreationMode">Mode in which the PDF pages are created.</param>
/// <param name="ocrLanguage">OCR language that should be used for text recognition.</param>
/// <param name="pdfFilename">Path of the destination PDF file.</param>
public static void ConvertImagesToSearchablePdf(
    string sourceFilePath,
    Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode pageCreationMode,
    Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
    string pdfFilename)
{
    // collection that holds the images to recognize
    using (var sourceImages = new Vintasoft.Imaging.ImageCollection())
    {
        // load the source file into the collection
        sourceImages.Add(sourceFilePath);
        try
        {
            // Tesseract OCR engine performs the recognition
            using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
            {
                // the generator works with the engine through an OCR engine manager
                var engineManager = new Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine);
                var generator = new Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator(engineManager);

                // tell the generator what to process and how pages must be created
                generator.SourceImages = sourceImages;
                generator.PageCreationMode = pageCreationMode;

                // OCR settings: symbol regions correction is enabled only in "TextOverImage" mode
                var ocrSettings = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(ocrLanguage)
                {
                    RecognitionRegionType =
                        Vintasoft.Imaging.Ocr.RecognitionRegionType.RecognizePageWithPageSegmentationAndOrientationDetection,
                    UseSymbolRegionsCorrection =
                        pageCreationMode == Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.TextOverImage
                };
                generator.OcrEngineSettings = ocrSettings;

                // "Text" mode needs an explicit text color and font name
                if (pageCreationMode == Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.Text)
                {
                    generator.TextColor = System.Drawing.Color.Black;
                    generator.TextOnlyFontName = "Arial";
                }

                // save changes to the PDF document while pages are being added
                generator.PdfPageAdded += Command_PdfPageAdded;
                // preprocess each image before it is recognized
                generator.ImageProcessingStarted += PdfGenerator_ImageProcessingStarted;

                // create the destination PDF document (PDF 1.6)
                using (var pdfDocument = new Vintasoft.Imaging.Pdf.PdfDocument(
                    pdfFilename, System.IO.FileMode.Create, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_16))
                {
                    // generate the PDF pages and save the document
                    generator.Execute(pdfDocument);
                    pdfDocument.SaveChanges();
                }
            }
        }
        finally
        {
            // remove and dispose the loaded images
            sourceImages.ClearAndDisposeItems();
        }
    }
}
/// <summary>
/// Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.ImageProcessingStarted"/> event:
/// replaces the image to process with a deskewed and orientation-corrected copy.
/// </summary>
/// <param name="sender">The event source.</param>
/// <param name="e">Event data; its <c>Image</c> property is replaced by this handler.</param>
private static void PdfGenerator_ImageProcessingStarted(object sender, Vintasoft.Imaging.Pdf.Ocr.OcrImageProcessingEventArgs e)
{
    // work on a copy of the image
    var workingCopy = (Vintasoft.Imaging.VintasoftImage)e.Image.Clone();
    try
    {
        // preprocessing step 1: deskew
        new Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand().ExecuteInPlace(workingCopy);

        // preprocessing step 2: automatic text orientation
        new Vintasoft.Imaging.ImageProcessing.Document.AutoTextOrientationCommand().ExecuteInPlace(workingCopy);

        // hand the preprocessed copy over (the processing command will dispose image after use)
        e.Image = workingCopy;
    }
    catch
    {
        // preprocessing failed: release the copy and skip this image
        workingCopy.Dispose();
        e.Image = null;
    }
}
/// <summary>
/// Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.PdfPageAdded"/> event:
/// saves the PDF document each time its page count reaches a multiple of 10.
/// </summary>
/// <param name="sender">The event source.</param>
/// <param name="e">Event data that provides the PDF document being generated.</param>
private static void Command_PdfPageAdded(object sender, Vintasoft.Imaging.Pdf.Ocr.PdfPageAddedEventArgs e)
{
    var pdfDocument = e.Document;

    // nothing to do unless the page count is a multiple of 10
    if (pdfDocument.Pages.Count % 10 != 0)
    {
        return;
    }

    pdfDocument.SaveChanges();
}
''' <summary>
''' Converts an image file to a searchable PDF document using the Tesseract OCR engine.
''' </summary>
''' <param name="sourceFilePath">Path of the source image file.</param>
''' <param name="pageCreationMode">Mode in which the PDF pages are created.</param>
''' <param name="ocrLanguage">OCR language that should be used for text recognition.</param>
''' <param name="pdfFilename">Path of the destination PDF file.</param>
Public Shared Sub ConvertImagesToSearchablePdf(sourceFilePath As String, pageCreationMode As Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode, ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, pdfFilename As String)
    ' collection that holds the images to recognize
    Using sourceImages As New Vintasoft.Imaging.ImageCollection()
        ' load the source file into the collection
        sourceImages.Add(sourceFilePath)
        Try
            ' Tesseract OCR engine performs the recognition
            Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                ' the generator works with the engine through an OCR engine manager
                Dim engineManager As New Vintasoft.Imaging.Ocr.OcrEngineManager(ocrEngine)
                Dim generator As New Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator(engineManager)

                ' tell the generator what to process and how pages must be created
                generator.SourceImages = sourceImages
                generator.PageCreationMode = pageCreationMode

                ' OCR settings: symbol regions correction is enabled only in "TextOverImage" mode
                Dim ocrSettings As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(ocrLanguage)
                With ocrSettings
                    .RecognitionRegionType = Vintasoft.Imaging.Ocr.RecognitionRegionType.RecognizePageWithPageSegmentationAndOrientationDetection
                    .UseSymbolRegionsCorrection = (pageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.TextOverImage)
                End With
                generator.OcrEngineSettings = ocrSettings

                ' "Text" mode needs an explicit text color and font name
                If pageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.Text Then
                    generator.TextColor = System.Drawing.Color.Black
                    generator.TextOnlyFontName = "Arial"
                End If

                ' save changes to the PDF document while pages are being added
                AddHandler generator.PdfPageAdded, AddressOf Command_PdfPageAdded
                ' preprocess each image before it is recognized
                AddHandler generator.ImageProcessingStarted, AddressOf PdfGenerator_ImageProcessingStarted

                ' create the destination PDF document (PDF 1.6)
                Using pdfDocument As New Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, System.IO.FileMode.Create, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_16)
                    ' generate the PDF pages and save the document
                    generator.Execute(pdfDocument)
                    pdfDocument.SaveChanges()
                End Using
            End Using
        Finally
            ' remove and dispose the loaded images
            sourceImages.ClearAndDisposeItems()
        End Try
    End Using
End Sub
''' <summary>
''' Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.ImageProcessingStarted"/> event:
''' replaces the image to process with a deskewed and orientation-corrected copy.
''' </summary>
''' <param name="sender">The event source.</param>
''' <param name="e">Event data; its <c>Image</c> property is replaced by this handler.</param>
Private Shared Sub PdfGenerator_ImageProcessingStarted(sender As Object, e As Vintasoft.Imaging.Pdf.Ocr.OcrImageProcessingEventArgs)
    ' work on a copy of the image
    Dim workingCopy As Vintasoft.Imaging.VintasoftImage = DirectCast(e.Image.Clone(), Vintasoft.Imaging.VintasoftImage)
    Try
        ' preprocessing step 1: deskew
        Dim deskew As New Vintasoft.Imaging.ImageProcessing.Document.DeskewCommand()
        deskew.ExecuteInPlace(workingCopy)

        ' preprocessing step 2: automatic text orientation
        Dim autoOrientation As New Vintasoft.Imaging.ImageProcessing.Document.AutoTextOrientationCommand()
        autoOrientation.ExecuteInPlace(workingCopy)

        ' hand the preprocessed copy over (the processing command will dispose image after use)
        e.Image = workingCopy
    Catch
        ' preprocessing failed: release the copy and skip this image
        workingCopy.Dispose()
        e.Image = Nothing
    End Try
End Sub
''' <summary>
''' Handles the <see cref="Vintasoft.Imaging.Pdf.Ocr.SearchablePdfGenerator.PdfPageAdded"/> event:
''' saves the PDF document each time its page count reaches a multiple of 10.
''' </summary>
''' <param name="sender">The event source.</param>
''' <param name="e">Event data that provides the PDF document being generated.</param>
Private Shared Sub Command_PdfPageAdded(sender As Object, e As Vintasoft.Imaging.Pdf.Ocr.PdfPageAddedEventArgs)
    ' nothing to do unless the page count is a multiple of 10
    If e.Document.Pages.Count Mod 10 <> 0 Then
        Return
    End If

    e.Document.SaveChanges()
End Sub