Console: Convert a multipage TIFF file to a searchable PDF document.

Code samples for VintaSoft Imaging .NET SDK. Here you can request a code sample.

Moderator: Alex

Post Reply
Site Admin
Posts: 2044
Joined: Thu Jul 10, 2008 2:21 pm

Console: Convert a multipage TIFF file to a searchable PDF document.

Post by Alex »

This topic contains a code sample that shows how to convert a multipage TIFF file to a searchable PDF document:

Code: Select all

using System.Collections.Generic;
using System.IO;
using Vintasoft.Imaging;
using Vintasoft.Imaging.ImageProcessing.Document;
using Vintasoft.Imaging.ImageProcessing.Info;
using Vintasoft.Imaging.Ocr;
using Vintasoft.Imaging.Ocr.Results;
using Vintasoft.Imaging.Ocr.Tesseract;
using Vintasoft.Imaging.Pdf;
using Vintasoft.Imaging.Pdf.Ocr;

namespace ConvertMultipageTiffToSearchablePdfDocument
{
    /// <summary>
    /// Console sample: runs OCR over every page of a multipage TIFF file and
    /// writes the pages, together with the recognized text, to "searchable.pdf".
    /// </summary>
    /// <remarks>
    /// NOTE(review): the published listing had lost its braces and several
    /// statements. Lines marked "restored" below were reconstructed from the
    /// comments of the listing; confirm them against the SDK documentation.
    /// </remarks>
    class Program
    {
        /// <summary>
        /// Entry point: recognizes text on each TIFF page and builds the PDF document.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // create Tesseract OCR
            using (TesseractOcr tesseractOcr = new TesseractOcr(@"..\..\TesseractOCR"))
            {
                // create OCR engine manager
                OcrEngineManager engineManager = new OcrEngineManager(tesseractOcr);
                // create OCR settings
                OcrEngineSettings ocrSettings = new OcrEngineSettings(OcrLanguage.Turkish);

                // create image collection
                ImageCollection images = new ImageCollection();
                // add multipage TIFF file to the image collection
                // (restored; the file name is a placeholder - TODO confirm)
                images.Add("multipage.tif");

                // create a dictionary: image => OCR page
                Dictionary<VintasoftImage, OcrPage> imagesToOcrPages = new Dictionary<VintasoftImage, OcrPage>();
                // for each image in image collection
                for (int i = 0; i < images.Count; i++)
                {
                    // get image
                    VintasoftImage image = images[i];
                    // clone image, so the cleanup below changes only the copy
                    // that is passed to OCR and not the image added to the PDF page
                    using (VintasoftImage clonedImage = (VintasoftImage)image.Clone())
                    {
                        // remove tables from image
                        LineRemovalCommand lineRemovalCommand = new LineRemovalCommand(RemovingLinesType.Tables);
                        lineRemovalCommand.ExecuteInPlace(clonedImage); // restored

                        // remove halftone from image
                        HalftoneRemovalCommand halftoneRemovalCommand = new HalftoneRemovalCommand();
                        halftoneRemovalCommand.ExecuteInPlace(clonedImage); // restored

                        // clear border on image
                        BorderClearCommand borderClearCommand = new BorderClearCommand();
                        borderClearCommand.ExecuteInPlace(clonedImage); // restored

                        // remove hole punches from image
                        HolePunchRemovalCommand holePunchRemovalCommand = new HolePunchRemovalCommand();
                        holePunchRemovalCommand.ExecuteInPlace(clonedImage); // restored

                        // remove noise from image
                        DespeckleCommand despeckleCommand = new DespeckleCommand();
                        despeckleCommand.ExecuteInPlace(clonedImage); // restored

                        // deskew image
                        DeskewCommand deskewCommand = new DeskewCommand();
                        deskewCommand.ExecuteInPlace(clonedImage); // restored

                        // detect regions (text, images, etc) on image
                        DocumentSegmentationCommand documentSegmentationCommand = new DocumentSegmentationCommand();
                        documentSegmentationCommand.BorderSize = 30;
                        // the command must run before its Regions are read below
                        documentSegmentationCommand.ExecuteInPlace(clonedImage); // restored

                        // recognize text in image regions
                        OcrPage ocrPage = engineManager.Recognize(clonedImage, ocrSettings, documentSegmentationCommand.Regions);

                        // save information about recognized text in dictionary
                        imagesToOcrPages.Add(image, ocrPage);
                    }
                }

                // create new PDF document
                using (PdfDocument document = new PdfDocument("searchable.pdf", System.IO.FileMode.Create, PdfFormat.Pdf_14))
                {
                    // create PDF document builder
                    PdfDocumentBuilder documentBuilder = new PdfDocumentBuilder(document);
                    // specify that PDF document must contain image over text
                    documentBuilder.PageCreationMode = PdfPageCreationMode.ImageOverText;

                    // create file font programs controller
                    FileFontProgramsController fileFontProgramsController = new FileFontProgramsController(true);
                    // get a stream, which contains font program, which contains all recognized characters;
                    // the using statement disposes the stream with font program (it accepts a null resource)
                    using (Stream fontStream = fileFontProgramsController.GetTrueTypeFontProgram(null, "Times New Roman"))
                    {
                        // if stream with font program is found
                        if (fontStream != null)
                        {
                            // set Times New Roman font as text font in PDF document
                            documentBuilder.Font = document.FontManager.CreateCIDFontFromTrueTypeFont(fontStream);
                        }
                    }

                    // for each image in image collection
                    for (int i = 0; i < images.Count; i++)
                    {
                        // get image
                        VintasoftImage image = images[i];
                        OcrPage recognizedPage;
                        // if image has recognized text
                        if (imagesToOcrPages.TryGetValue(image, out recognizedPage))
                        {
                            // add image with text as a new page of PDF document
                            documentBuilder.AddPage(image, recognizedPage);
                        }
                        // if image does NOT have recognized text
                        else
                        {
                            // add image as a new page of PDF document
                            documentBuilder.AddPage(image, null);
                        }
                    }

                    // pack fonts in PDF document for removing unused characters from fonts
                    document.FontManager.PackAllFonts(); // restored

                    // pack PDF document
                    document.Pack(); // restored
                }

                // clear image collection and dispose images
                images.ClearAndDisposeItems(); // restored
            }
        }
    }
}
The following products are required to run this sample: VintaSoft Imaging .NET SDK (Standard) (Vintasoft.Imaging.dll); VintaSoft Document Cleanup .NET Plug-in (Vintasoft.Imaging.DocCleanup.dll); VintaSoft OCR .NET Plug-in (Vintasoft.Imaging.Ocr.dll, Vintasoft.Imaging.Ocr.Tesseract.dll, Vintasoft.Imaging.Pdf.Ocr.dll, Tesseract4.Vintasoft.x86.dll/Tesseract4.Vintasoft.x64.dll); and VintaSoft PDF .NET Plug-in (Reader+Writer) (Vintasoft.Pdf.dll).
Post Reply