- Render the image-only PDF page at a resolution of 300 dpi or higher.
- Prepare the rendered image for text recognition if necessary. For example, you can remove noise from the image.
- Recognize the text in the image.
- Filter the recognition result if necessary.
- Add the text over the image on the PDF page.
Here is an example that shows how to convert an image-only PDF document to a searchable PDF document:
Code: Select all
using System;
using System.Collections.Generic;
namespace ConsoleApplication1
{
/// <summary>
/// Console sample that converts an image-only PDF document to a searchable
/// PDF document by running OCR on every page image.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point: converts "imageOnlyPdfDocument.pdf" to
    /// "searchablePdfDocument.pdf" using English as the OCR language.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // convert an image-only PDF document to a searchable PDF document
        ConvertImageOnlyPdfToSearchablePdf(
            "imageOnlyPdfDocument.pdf",
            Vintasoft.Imaging.Ocr.OcrLanguage.English,
            "searchablePdfDocument.pdf");
    }

    /// <summary>
    /// Converts an image-only PDF document to a searchable PDF document.
    /// </summary>
    /// <remarks>
    /// Each page image is recognized with the Tesseract OCR engine, words with
    /// low confidence are removed from the result, and the image together with
    /// the remaining recognized text is added as a page of the destination document.
    /// The OCR engine is shut down and the loaded images are disposed even if
    /// recognition or saving fails.
    /// </remarks>
    /// <param name="imageOnlyPdfFilename">A filename of source image-only PDF file.</param>
    /// <param name="ocrLanguage">OCR language.</param>
    /// <param name="searchablePdfFilename">A filename of destination searchable PDF file.</param>
    public static void ConvertImageOnlyPdfToSearchablePdf(
        string imageOnlyPdfFilename,
        Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
        string searchablePdfFilename)
    {
        // create an image collection
        using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
        {
            try
            {
                // add pages from image-only PDF document into image collection
                images.Add(imageOnlyPdfFilename);

                // create a searchable PDF document
                using (Vintasoft.Imaging.Pdf.PdfDocument document =
                    new Vintasoft.Imaging.Pdf.PdfDocument(searchablePdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    Console.WriteLine("Create OCR engine...");
                    // create the Tesseract OCR engine
                    // NOTE(review): the path is relative to the current working directory and
                    // presumably points to the folder with the Tesseract OCR files - confirm for your layout
                    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr(@"..\..\TesseractOCR\"))
                    {
                        Console.WriteLine("Initialize OCR engine...");
                        // init the Tesseract OCR engine
                        tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                        try
                        {
                            // create a PDF document builder
                            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                                new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);
                            // specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                            // specify that image must be placed over text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                            // for each image in image collection
                            foreach (Vintasoft.Imaging.VintasoftImage image in images)
                            {
                                Console.WriteLine("Recognize text in image...");
                                // recognize text on image
                                Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage = tesseractOcr.Recognize(image);

                                // remove low confidence words from OCR result
                                RemoveLowConfidenceWords(ocrPage);

                                Console.WriteLine("Add page to a PDF document...");
                                // add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, ocrPage);
                            }
                        }
                        finally
                        {
                            // shutdown OCR engine (also when recognition of a page has failed)
                            tesseractOcr.Shutdown();
                        }

                        Console.WriteLine("Save changes in PDF document...");
                        // save changes in PDF document
                        document.SaveChanges();
                    }
                }
            }
            finally
            {
                // clear and dispose images in image collection (also when conversion has failed)
                images.ClearAndDisposeItems();
            }
        }
    }

    /// <summary>
    /// Preprocess an image before text recognition.
    /// </summary>
    /// <remarks>
    /// Optional helper: the conversion method in this sample does not call it.
    /// The image is changed in place by the despeckle command.
    /// </remarks>
    /// <param name="image">Image to preprocess.</param>
    static void PreprocessImageBeforeOcr(Vintasoft.Imaging.VintasoftImage image)
    {
        // remove noise from image
        Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand despeckleCommand =
            new Vintasoft.Imaging.ImageProcessing.Document.DespeckleCommand();
        despeckleCommand.ExecuteInPlace(image);
    }

    /// <summary>
    /// Removes low confidence words from OCR page.
    /// </summary>
    /// <param name="ocrPage">OCR page.</param>
    static void RemoveLowConfidenceWords(Vintasoft.Imaging.Ocr.Results.OcrPage ocrPage)
    {
        // minimum confidence: words with a lower confidence are removed
        // NOTE(review): presumably confidence is measured on a 0..100 scale - confirm
        const float MinConfidence = 75.0f;

        // get all words in recognized text
        Vintasoft.Imaging.Ocr.Results.OcrObject[] ocrObjects =
            ocrPage.GetObjects(Vintasoft.Imaging.Ocr.OcrObjectType.Word);

        // create list of words to remove
        List<Vintasoft.Imaging.Ocr.Results.OcrObject> removeObjects =
            new List<Vintasoft.Imaging.Ocr.Results.OcrObject>();

        // for each word
        foreach (Vintasoft.Imaging.Ocr.Results.OcrObject word in ocrObjects)
        {
            // if word confidence is less than minimum confidence
            if (word.Confidence < MinConfidence)
            {
                // add word to a list of words to remove
                removeObjects.Add(word);
            }
        }

        // validate recognition results (remove words with low confidence)
        Vintasoft.Imaging.Ocr.Results.OcrResultsEditor editor =
            new Vintasoft.Imaging.Ocr.Results.OcrResultsEditor(ocrPage);
        editor.RemoveObjects(removeObjects.ToArray());
        editor.ValidateResults();
    }
}
}