Console: Convert a multipage TIFF file to a searchable PDF document.

Post by **Alex** » Wed Nov 16, 2016 10:47 am

This topic contains a C# code sample that shows how to convert a multipage TIFF file to a searchable PDF document:

using System.Collections.Generic;
using System.IO;
using Vintasoft.Imaging;
using Vintasoft.Imaging.ImageProcessing.Document;
using Vintasoft.Imaging.ImageProcessing.Info;
using Vintasoft.Imaging.Ocr;
using Vintasoft.Imaging.Ocr.Results;
using Vintasoft.Imaging.Ocr.Tesseract;
using Vintasoft.Imaging.Pdf;
using Vintasoft.Imaging.Pdf.Ocr;

namespace ConvertMultipageTiffToSearchablePdfDocument
{
    /// <summary>
    /// Console sample: recognizes text on every page of "multipage.tif" and writes
    /// the pages, together with the recognized text, to "searchable.pdf".
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: loads the TIFF file, runs OCR on each page and builds the PDF document.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            // create Tesseract OCR
            using (TesseractOcr tesseractOcr = new TesseractOcr(@"..\..\TesseractOCR"))
            {
                // create OCR engine manager
                OcrEngineManager engineManager = new OcrEngineManager(tesseractOcr);
                // create OCR settings
                OcrEngineSettings ocrSettings = new OcrEngineSettings(OcrLanguage.Turkish);

                // create image collection
                ImageCollection images = new ImageCollection();
                try
                {
                    // add multipage TIFF file to the image collection
                    images.Add("multipage.tif");

                    // create a dictionary: image => OCR page
                    Dictionary<VintasoftImage, OcrPage> imagesToOcrPages = new Dictionary<VintasoftImage, OcrPage>();
                    // for each image in image collection
                    for (int i = 0; i < images.Count; i++)
                    {
                        VintasoftImage image = images[i];
                        // save information about recognized text in dictionary
                        imagesToOcrPages.Add(image, RecognizeText(engineManager, ocrSettings, image));
                    }

                    // save images and recognized text to the PDF document
                    CreateSearchablePdf("searchable.pdf", images, imagesToOcrPages);
                }
                finally
                {
                    // clear image collection and dispose images, even if OCR or PDF creation failed
                    images.ClearAndDisposeItems();
                }
            }
        }

        /// <summary>
        /// Cleans up a copy of the image and recognizes text on that copy.
        /// </summary>
        /// <param name="engineManager">OCR engine manager used for recognition.</param>
        /// <param name="ocrSettings">OCR engine settings.</param>
        /// <param name="image">Source image; it is cloned and is not modified by this method.</param>
        /// <returns>The OCR page returned by the OCR engine manager.</returns>
        private static OcrPage RecognizeText(
            OcrEngineManager engineManager,
            OcrEngineSettings ocrSettings,
            VintasoftImage image)
        {
            // All cleanup commands work on a clone, so the original image that is later
            // written to the PDF document stays untouched, and segmentation and recognition
            // see exactly the same pixels.
            using (VintasoftImage clonedImage = (VintasoftImage)image.Clone())
            {
                // remove tables from image
                LineRemovalCommand lineRemovalCommand = new LineRemovalCommand(RemovingLinesType.Tables);
                lineRemovalCommand.ExecuteInPlace(clonedImage);

                // remove halftone from image
                HalftoneRemovalCommand halftoneRemovalCommand = new HalftoneRemovalCommand();
                halftoneRemovalCommand.ExecuteInPlace(clonedImage);

                // clear border on image
                BorderClearCommand borderClearCommand = new BorderClearCommand();
                borderClearCommand.ExecuteInPlace(clonedImage);

                // remove hole punches from image
                HolePunchRemovalCommand holePunchRemovalCommand = new HolePunchRemovalCommand();
                holePunchRemovalCommand.ExecuteInPlace(clonedImage);

                // remove noise from image
                DespeckleCommand despeckleCommand = new DespeckleCommand();
                despeckleCommand.ExecuteInPlace(clonedImage);

                // deskew image
                // NOTE(review): only the copy is deskewed, so recognized text coordinates refer
                // to the deskewed copy while the PDF page shows the original image - confirm
                // that this is acceptable for strongly skewed scans.
                DeskewCommand deskewCommand = new DeskewCommand();
                deskewCommand.ExecuteInPlace(clonedImage);

                // detect regions (text, images, etc) on image
                DocumentSegmentationCommand documentSegmentationCommand = new DocumentSegmentationCommand();
                documentSegmentationCommand.BorderSize = 30;
                documentSegmentationCommand.ExecuteInPlace(clonedImage);

                // recognize text in image regions
                return engineManager.Recognize(clonedImage, ocrSettings, documentSegmentationCommand.Regions);
            }
        }

        /// <summary>
        /// Creates a PDF document where each image becomes a page and the recognized text,
        /// if any, is added to the page in "image over text" mode.
        /// </summary>
        /// <param name="pdfFilename">Path of the PDF file to create.</param>
        /// <param name="images">Images to add as PDF pages.</param>
        /// <param name="imagesToOcrPages">Dictionary: image => OCR page.</param>
        private static void CreateSearchablePdf(
            string pdfFilename,
            ImageCollection images,
            Dictionary<VintasoftImage, OcrPage> imagesToOcrPages)
        {
            // create new PDF document
            using (PdfDocument document = new PdfDocument(pdfFilename, System.IO.FileMode.Create, PdfFormat.Pdf_14))
            {
                // create PDF document builder
                PdfDocumentBuilder documentBuilder = new PdfDocumentBuilder(document);
                // specify that PDF document must contain image over text
                documentBuilder.PageCreationMode = PdfPageCreationMode.ImageOverText;

                // create file font programs controller
                FileFontProgramsController fileFontProgramsController = new FileFontProgramsController(true);
                // get a stream with the "Times New Roman" font program;
                // "using" disposes the stream even if font creation throws (a null stream is allowed)
                using (Stream fontStream = fileFontProgramsController.GetTrueTypeFontProgram(null, "Times New Roman"))
                {
                    // if stream with font program is found
                    if (fontStream != null)
                    {
                        // set Times New Roman font as text font in PDF document
                        documentBuilder.Font = document.FontManager.CreateCIDFontFromTrueTypeFont(fontStream);
                    }
                }

                // for each image in image collection
                for (int i = 0; i < images.Count; i++)
                {
                    VintasoftImage image = images[i];
                    OcrPage ocrPage;
                    // if image has recognized text
                    if (imagesToOcrPages.TryGetValue(image, out ocrPage))
                    {
                        // add image with text as a new page of PDF document
                        documentBuilder.AddPage(image, ocrPage);
                    }
                    // if image does NOT have recognized text
                    else
                    {
                        // add image as a new page of PDF document
                        documentBuilder.AddPage(image, null);
                    }
                }

                // pack fonts in PDF document for removing unused characters from fonts
                document.FontManager.PackAllFonts();

                // pack PDF document
                document.Pack();
            }
        }
    }
}

VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll), VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll), VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll, Vintasoft.Imaging.Pdf.Ocr.dll, Tesseract5.Vintasoft.x86.dll/Tesseract5.Vintasoft.x64.dll) and VintaSoft PDF .NET Plug-in (Reader+Writer) (Vintasoft.Imaging.Pdf.dll) are necessary for executing this sample.