VintaSoft Imaging .NET SDK 14.0: Documentation for .NET developer
In This Topic
    OCR: How to convert a TIFF file to a searchable PDF document?
    In This Topic
    Here is C#/VB.NET code that shows how to convert a TIFF file to a searchable PDF document:
    /// <summary>
    /// Converts a TIFF file to a searchable PDF document (PDF 1.4, PDF/A-1a, PDF/A-1b, PDF/A-2a, PDF/A-2b, PDF/A-2u, PDF/A-3a, PDF/A-3u, PDF/A-4, PDF/A-4e or PDF/A-4f).
    /// </summary>
    /// <remarks>
    /// Every image loaded from the TIFF file is recognized with the Tesseract OCR engine and
    /// added to a new PDF 1.4 document as a page where the image is placed over the recognized text.
    /// If <paramref name="documentConformance"/> is not
    /// <see cref="Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined"/>, the document is then
    /// processed by the matching PDF/A converter; otherwise the changes are saved directly.
    /// <br />
    /// Errors are NOT propagated to the caller: any exception is caught, its message is written
    /// to the console and the method waits for a key press.
    /// </remarks>
    /// <param name="ocrLanguage">An OCR language.</param>
    /// <param name="tiffFilename">A filename of source TIFF file.</param>
    /// <param name="pdfFilename">A filename of destination PDF file.</param>
    /// <param name="documentConformance">Conformance of destination PDF document.</param>
    public static void ConvertTiffToSearchablePdf(
        Vintasoft.Imaging.Ocr.OcrLanguage ocrLanguage,
        string tiffFilename,
        string pdfFilename,
        Vintasoft.Imaging.Pdf.PdfDocumentConformance documentConformance)
    {
        try
        {
            // create an image collection
            using (Vintasoft.Imaging.ImageCollection images = new Vintasoft.Imaging.ImageCollection())
            {
                try
                {
                    // add images from TIFF file into image collection
                    images.Add(tiffFilename);

                    // create a searchable PDF document
                    using (Vintasoft.Imaging.Pdf.PdfDocument document =
                        new Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14))
                    {
                        System.Console.WriteLine("Create OCR engine...");
                        // create the Tesseract OCR engine
                        using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr tesseractOcr =
                            new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
                        {
                            System.Console.WriteLine("Initialize OCR engine...");
                            // init the Tesseract OCR engine
                            tesseractOcr.Init(new Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage));

                            // create a PDF document builder
                            Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder documentBuilder =
                                new Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document);

                            // if PDF document conformance is PDF/A-1a or PDF/A-2a or PDF/A-3a
                            if (documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_1a ||
                                documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_2a ||
                                documentConformance == Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_3a)
                            {
                                // specify that document builder must add marked content (add structure elements) to a PDF document
                                documentBuilder.AddMarkedContent = true;
                            }

                            // specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto;

                            // specify that image must be placed over text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText;

                            // for each image in image collection
                            foreach (Vintasoft.Imaging.VintasoftImage image in images)
                            {
                                System.Console.WriteLine("Recognize text in image...");
                                // recognize text on image
                                Vintasoft.Imaging.Ocr.Results.OcrPage page = tesseractOcr.Recognize(image);

                                System.Console.WriteLine("Add page to a PDF document...");
                                // add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, page);
                            }

                            // shutdown OCR engine (recognition is finished, the engine is not needed for conversion)
                            tesseractOcr.Shutdown();

                            // if PDF document conformance is specified
                            if (documentConformance != Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined)
                            {
                                System.Console.WriteLine("Convert PDF document to {0}...", documentConformance);
                                // create PDF/A document converter
                                Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter converter =
                                    (Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter)Vintasoft.Imaging.Pdf.Processing.PdfDocumentConverter.Create(documentConformance);
                                if (converter == null)
                                    throw new System.NotImplementedException("PDF/A converter not found.");

                                // set ICC profiles (file names are relative paths, resolved against the current directory)
                                converter.DefaultCmykIccProfileFilename = "DefaultCmyk.icc";
                                converter.DefaultRgbIccProfileFilename = "DefaultRgb.icc";

                                // create processing state
                                using (Vintasoft.Imaging.Processing.ProcessingState processingState = new Vintasoft.Imaging.Processing.ProcessingState())
                                {
                                    // convert PDF document
                                    // NOTE(review): SaveChanges() is not called on this path - presumably the converter
                                    // persists the converted document itself; confirm against the SDK documentation
                                    Vintasoft.Imaging.Processing.ConversionProfileResult result = converter.Convert(document, processingState);

                                    // if PDF document is not converted
                                    if (!result.IsSuccessful)
                                    {
                                        // throw error
                                        throw result.CreateConversionException();
                                    }
                                }
                            }
                            else
                            {
                                // save changes in PDF document
                                System.Console.WriteLine("Save changes in PDF document...");
                                document.SaveChanges();
                            }
                        }
                    }
                }
                finally
                {
                    // clear and dispose images in image collection;
                    // done in "finally" so the images are released even if OCR, page creation or conversion fails
                    images.ClearAndDisposeItems();
                }
            }
        }
        catch (System.Exception ex)
        {
            // sample-style error handling: report the error and wait for the user, do not rethrow
            System.Console.WriteLine(ex.Message);
            System.Console.ReadKey();
        }
    }
    
    ''' <summary>
    ''' Converts a TIFF file to a searchable PDF document (PDF 1.4, PDF/A-1a, PDF/A-1b, PDF/A-2a, PDF/A-2b, PDF/A-2u, PDF/A-3a, PDF/A-3u, PDF/A-4, PDF/A-4e or PDF/A-4f).
    ''' </summary>
    ''' <remarks>
    ''' Every image loaded from the TIFF file is recognized with the Tesseract OCR engine and
    ''' added to a new PDF 1.4 document as a page where the image is placed over the recognized text.
    ''' If <paramref name="documentConformance"/> is not
    ''' <see cref="Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined"/>, the document is then
    ''' processed by the matching PDF/A converter; otherwise the changes are saved directly.
    ''' <br />
    ''' Errors are NOT propagated to the caller: any exception is caught, its message is written
    ''' to the console and the method waits for a key press.
    ''' </remarks>
    ''' <param name="ocrLanguage">An OCR language.</param>
    ''' <param name="tiffFilename">A filename of source TIFF file.</param>
    ''' <param name="pdfFilename">A filename of destination PDF file.</param>
    ''' <param name="documentConformance">Conformance of destination PDF document.</param>
    Public Shared Sub ConvertTiffToSearchablePdf(ocrLanguage As Vintasoft.Imaging.Ocr.OcrLanguage, tiffFilename As String, pdfFilename As String, documentConformance As Vintasoft.Imaging.Pdf.PdfDocumentConformance)
        Try
            ' create an image collection
            Using images As New Vintasoft.Imaging.ImageCollection()
                Try
                    ' add images from TIFF file into image collection
                    images.Add(tiffFilename)

                    ' create a searchable PDF document
                    Using document As New Vintasoft.Imaging.Pdf.PdfDocument(pdfFilename, Vintasoft.Imaging.Pdf.PdfFormat.Pdf_14)
                        System.Console.WriteLine("Create OCR engine...")
                        ' create the Tesseract OCR engine
                        Using tesseractOcr As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
                            System.Console.WriteLine("Initialize OCR engine...")
                            ' init the Tesseract OCR engine
                            tesseractOcr.Init(New Vintasoft.Imaging.Ocr.OcrEngineSettings(ocrLanguage))

                            ' create a PDF document builder
                            Dim documentBuilder As New Vintasoft.Imaging.Pdf.Ocr.PdfDocumentBuilder(document)

                            ' if PDF document conformance is PDF/A-1a or PDF/A-2a or PDF/A-3a
                            If documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_1a OrElse documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_2a OrElse documentConformance = Vintasoft.Imaging.Pdf.PdfDocumentConformance.PdfA_3a Then
                                ' specify that document builder must add marked content (add structure elements) to a PDF document
                                documentBuilder.AddMarkedContent = True
                            End If

                            ' specify that the best image compression must be calculated automatically
                            documentBuilder.ImageCompression = Vintasoft.Imaging.Pdf.PdfCompression.Auto

                            ' specify that image must be placed over text
                            documentBuilder.PageCreationMode = Vintasoft.Imaging.Pdf.Ocr.PdfPageCreationMode.ImageOverText

                            ' for each image in image collection
                            For Each image As Vintasoft.Imaging.VintasoftImage In images
                                System.Console.WriteLine("Recognize text in image...")
                                ' recognize text on image
                                Dim page As Vintasoft.Imaging.Ocr.Results.OcrPage = tesseractOcr.Recognize(image)

                                System.Console.WriteLine("Add page to a PDF document...")
                                ' add recognized OCR page to the PDF document
                                documentBuilder.AddPage(image, page)
                            Next

                            ' shutdown OCR engine (recognition is finished, the engine is not needed for conversion)
                            tesseractOcr.Shutdown()

                            ' if PDF document conformance is specified
                            If documentConformance <> Vintasoft.Imaging.Pdf.PdfDocumentConformance.Undefined Then
                                System.Console.WriteLine("Convert PDF document to {0}...", documentConformance)
                                ' create PDF/A document converter
                                Dim converter As Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter = DirectCast(Vintasoft.Imaging.Pdf.Processing.PdfDocumentConverter.Create(documentConformance), Vintasoft.Imaging.Pdf.Processing.PdfA.PdfAConverter)
                                If converter Is Nothing Then
                                    Throw New System.NotImplementedException("PDF/A converter not found.")
                                End If

                                ' set ICC profiles (file names are relative paths, resolved against the current directory)
                                converter.DefaultCmykIccProfileFilename = "DefaultCmyk.icc"
                                converter.DefaultRgbIccProfileFilename = "DefaultRgb.icc"

                                ' create processing state
                                Using processingState As New Vintasoft.Imaging.Processing.ProcessingState()
                                    ' convert PDF document
                                    ' NOTE(review): SaveChanges() is not called on this path - presumably the converter
                                    ' persists the converted document itself; confirm against the SDK documentation
                                    Dim result As Vintasoft.Imaging.Processing.ConversionProfileResult = converter.Convert(document, processingState)

                                    ' if PDF document is not converted
                                    If Not result.IsSuccessful Then
                                        ' throw error
                                        Throw result.CreateConversionException()
                                    End If
                                End Using
                            Else
                                ' save changes in PDF document
                                System.Console.WriteLine("Save changes in PDF document...")
                                document.SaveChanges()
                            End If
                        End Using
                    End Using
                Finally
                    ' clear and dispose images in image collection;
                    ' done in "Finally" so the images are released even if OCR, page creation or conversion fails
                    images.ClearAndDisposeItems()
                End Try
            End Using
        Catch ex As System.Exception
            ' sample-style error handling: report the error and wait for the user, do not rethrow
            System.Console.WriteLine(ex.Message)
            System.Console.ReadKey()
        End Try
    End Sub