OCR: How to convert a TIFF file to a searchable PDF document?
In This Topic
Here is C#/VB.NET code that shows how to convert a TIFF file to a searchable PDF document:
/// <summary>
/// Converts a TIFF file to a searchable PDF document (PDF 1.4, PDF/A-1a, PDF/A-1b, PDF/A-2a, PDF/A-2b, PDF/A-2u, PDF/A-3a, PDF/A-3u, PDF/A-4, PDF/A-4e or PDF/A-4f).
/// </summary>
/// <remarks>
/// Each image of the TIFF file is recognized with the Tesseract OCR engine and added to the
/// PDF document as a page where the image is placed over the recognized text.
/// If <paramref name="documentConformance"/> is not <c>Undefined</c>, the document is then
/// passed to a PDF/A converter; otherwise the changes are saved directly.
/// Any exception is caught, its message is written to the console and the method waits for
/// a key press - errors are NOT propagated to the caller.
/// </remarks>
/// <param name="ocrLanguage">An OCR language.</param>
/// <param name="tiffFilename">A filename of source TIFF file.</param>
/// <param name="pdfFilename">A filename of destination PDF file.</param>
/// <param name="documentConformance">Conformance of destination PDF document.</param>
public static void ConvertTiffToSearchablePdf(
    Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
    string tiffFilename,
    string pdfFilename,
    Vintasoft.Imaging.Pdf.PdfDocumentConformance documentConformance)
{
    try
    {
        // create an image collection
        using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
        {
            try
            {
                // add images from TIFF file into image collection
                images.Add(tiffFilename);

                // create a searchable PDF document
                using (Vintasoft.Imaging.Pdf.PdfDocument document =
                    new Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                {
                    System.Console.WriteLine("Create OCR engine...");
                    // create the Tesseract OCR engine
                    using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                    {
                        System.Console.WriteLine("Initialize OCR engine...");
                        // init the Tesseract OCR engine
                        tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                        // create a PDF document builder
                        Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                            new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);

                        // if PDF document conformance is PDF/A-1a or PDF/A-2a or PDF/A-3a
                        if (documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_1a ||
                            documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_2a ||
                            documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_3a)
                        {
                            // specify that document builder must add marked content (add structure elements) to a PDF document
                            documentBuilder.AddMarkedContent = true;
                        }

                        // specify that the best image compression must be calculated automatically
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;
                        // specify that image must be placed over text
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                        // for each image in image collection
                        foreach (Vintasoft.Imaging.VintasoftImage image in images)
                        {
                            System.Console.WriteLine("Recognize text in image...");
                            // recognize text on image
                            Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                            System.Console.WriteLine("Add page to a PDF document...");
                            // add recognized OCR page to the PDF document
                            documentBuilder.AddPage(image, page);
                        }

                        // shutdown OCR engine
                        tesseractOcr.Shutdown();

                        // if PDF document conformance is specified
                        if (documentConformance != Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined)
                        {
                            System.Console.WriteLine("Convert PDF document to {0}...", documentConformance);

                            // create PDF/A document converter
                            Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter converter =
                                (Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter)Vintasoft.Imaging.Pdf.Processing.PdfDocumentConverter.Create(documentConformance);
                            if (converter == null)
                            {
                                throw new System.NotImplementedException("PDF/A converter not found.");
                            }

                            // set ICC profiles
                            // NOTE(review): relative filenames - presumably resolved against the current
                            // working directory; confirm the profiles are deployed next to the application
                            converter.DefaultCmykIccProfileFilename = "DefaultCmyk.icc";
                            converter.DefaultRgbIccProfileFilename = "DefaultRgb.icc";

                            // create processing state
                            using (Vintasoft.Imaging.Processing.ProcessingState processingState = new Vintasoft.Imaging.Processing.ProcessingState())
                            {
                                // convert PDF document
                                // NOTE(review): SaveChanges() is not called on this path; the converter is
                                // presumed to persist the converted document - confirm against the SDK docs
                                Vintasoft.Imaging.Processing.ConversionProfileResult result = converter.Convert(document, processingState);
                                // if PDF document is not converted
                                if (!result.IsSuccessful)
                                {
                                    // throw error
                                    throw result.CreateConversionException();
                                }
                            }
                        }
                        else
                        {
                            // save changes in PDF document
                            System.Console.WriteLine("Save changes in PDF document...");
                            document.SaveChanges();
                        }
                    }
                }
            }
            finally
            {
                // clear and dispose images in image collection;
                // done in 'finally' so the images are released even when OCR or conversion fails
                images.ClearAndDisposeItems();
            }
        }
    }
    catch (System.Exception ex)
    {
        // sample-style error handling: report the error and wait for a key press
        System.Console.WriteLine(ex.Message);
        System.Console.ReadKey();
    }
}
''' <summary>
''' Converts a TIFF file to a searchable PDF document (PDF 1.4, PDF/A-1a, PDF/A-1b, PDF/A-2a, PDF/A-2b, PDF/A-2u, PDF/A-3a, PDF/A-3u, PDF/A-4, PDF/A-4e or PDF/A-4f).
''' </summary>
''' <remarks>
''' Each image of the TIFF file is recognized with the Tesseract OCR engine and added to the
''' PDF document as a page where the image is placed over the recognized text.
''' If <paramref name="documentConformance"/> is not <c>Undefined</c>, the document is then
''' passed to a PDF/A converter; otherwise the changes are saved directly.
''' Any exception is caught, its message is written to the console and the method waits for
''' a key press - errors are NOT propagated to the caller.
''' </remarks>
''' <param name="ocrLanguage">An OCR language.</param>
''' <param name="tiffFilename">A filename of source TIFF file.</param>
''' <param name="pdfFilename">A filename of destination PDF file.</param>
''' <param name="documentConformance">Conformance of destination PDF document.</param>
Public Shared Sub ConvertTiffToSearchablePdf(ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, tiffFilename As String, pdfFilename As String, documentConformance As Vintasoft.Imaging.Pdf.PdfDocumentConformance)
    Try
        ' create an image collection
        Using images As New Vintasoft.Imaging.ImageCollection()
            Try
                ' add images from TIFF file into image collection
                images.Add(tiffFilename)

                ' create a searchable PDF document
                Using document As New Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                    System.Console.WriteLine("Create OCR engine...")
                    ' create the Tesseract OCR engine
                    Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                        System.Console.WriteLine("Initialize OCR engine...")
                        ' init the Tesseract OCR engine
                        tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage))

                        ' create a PDF document builder
                        Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document)

                        ' if PDF document conformance is PDF/A-1a or PDF/A-2a or PDF/A-3a
                        If documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_1a OrElse
                           documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_2a OrElse
                           documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_3a Then
                            ' specify that document builder must add marked content (add structure elements) to a PDF document
                            documentBuilder.AddMarkedContent = True
                        End If

                        ' specify that the best image compression must be calculated automatically
                        documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto
                        ' specify that image must be placed over text
                        documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                        ' for each image in image collection
                        For Each image As Vintasoft.Imaging.VintasoftImage In images
                            System.Console.WriteLine("Recognize text in image...")
                            ' recognize text on image
                            Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                            System.Console.WriteLine("Add page to a PDF document...")
                            ' add recognized OCR page to the PDF document
                            documentBuilder.AddPage(image, page)
                        Next

                        ' shutdown OCR engine
                        tesseractOcr.Shutdown()

                        ' if PDF document conformance is specified
                        If documentConformance <> Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined Then
                            System.Console.WriteLine("Convert PDF document to {0}...", documentConformance)

                            ' create PDF/A document converter
                            Dim converter As Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter = DirectCast(Vintasoft.Imaging.Pdf.Processing.PdfDocumentConverter.Create(documentConformance), Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter)
                            If converter Is Nothing Then
                                Throw New System.NotImplementedException("PDF/A converter not found.")
                            End If

                            ' set ICC profiles
                            ' NOTE(review): relative filenames - presumably resolved against the current
                            ' working directory; confirm the profiles are deployed next to the application
                            converter.DefaultCmykIccProfileFilename = "DefaultCmyk.icc"
                            converter.DefaultRgbIccProfileFilename = "DefaultRgb.icc"

                            ' create processing state
                            Using processingState As New Vintasoft.Imaging.Processing.ProcessingState()
                                ' convert PDF document
                                ' NOTE(review): SaveChanges() is not called on this path; the converter is
                                ' presumed to persist the converted document - confirm against the SDK docs
                                Dim result As Vintasoft.Imaging.Processing.ConversionProfileResult = converter.Convert(document, processingState)
                                ' if PDF document is not converted
                                If Not result.IsSuccessful Then
                                    ' throw error
                                    Throw result.CreateConversionException()
                                End If
                            End Using
                        Else
                            ' save changes in PDF document
                            System.Console.WriteLine("Save changes in PDF document...")
                            document.SaveChanges()
                        End If
                    End Using
                End Using
            Finally
                ' clear and dispose images in image collection;
                ' done in 'Finally' so the images are released even when OCR or conversion fails
                images.ClearAndDisposeItems()
            End Try
        End Using
    Catch ex As System.Exception
        ' sample-style error handling: report the error and wait for a key press
        System.Console.WriteLine(ex.Message)
        System.Console.ReadKey()
    End Try
End Sub