OCR: How to export the OCR result to an hOCR file?
In This Topic
OCR results can be exported to an hOCR file.
Here is an example that shows how to export an OCR result to an hOCR file:
// path to the image, in which the text must be recognized
string imageFilePath = @"D:\TestImage.png";

// the OCR engine is disposed when this block is left
using (Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr ocrEngine =
    new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // initialize the OCR engine with settings for recognizing English text
    ocrEngine.Init(
        new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(
            Vintasoft.Imaging.Ocr.OcrLanguage.English));

    // load the source image
    using (Vintasoft.Imaging.VintasoftImage sourceImage =
        new Vintasoft.Imaging.VintasoftImage(imageFilePath))
    {
        // pass the image to the OCR engine and recognize the text in it
        ocrEngine.SetImage(sourceImage);
        Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = ocrEngine.Recognize();

        // the hOCR file is placed next to the image and gets the same base name
        string imageDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
        string hocrFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".hocr";
        string outputPath = System.IO.Path.Combine(imageDirectory, hocrFileName);

        // create (or overwrite) the output file and export the OCR result to it
        using (System.IO.Stream outputStream = System.IO.File.Open(outputPath, System.IO.FileMode.Create))
        {
            Vintasoft.Imaging.Ocr.Results.HOcrCodec codec = new Vintasoft.Imaging.Ocr.Results.HOcrCodec();
            codec.Export(recognizedPage, outputStream);
        }

        // detach the image from the OCR engine
        ocrEngine.ClearImage();
    }

    // shutdown the OCR engine
    ocrEngine.Shutdown();
}
' path to the image, in which the text must be recognized
Dim imageFilePath As String = "D:\TestImage.png"

' the OCR engine is disposed when this block is left
Using ocrEngine As New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' initialize the OCR engine with settings for recognizing English text
    ocrEngine.Init(New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English))

    ' load the source image
    Using sourceImage As New Vintasoft.Imaging.VintasoftImage(imageFilePath)
        ' pass the image to the OCR engine and recognize the text in it
        ocrEngine.SetImage(sourceImage)
        Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = ocrEngine.Recognize()

        ' the hOCR file is placed next to the image and gets the same base name
        Dim imageDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
        Dim hocrFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".hocr"
        Dim outputPath As String = System.IO.Path.Combine(imageDirectory, hocrFileName)

        ' create (or overwrite) the output file and export the OCR result to it
        Using outputStream As System.IO.Stream = System.IO.File.Open(outputPath, System.IO.FileMode.Create)
            Dim codec As New Vintasoft.Imaging.Ocr.Results.HOcrCodec()
            codec.Export(recognizedPage, outputStream)
        End Using

        ' detach the image from the OCR engine
        ocrEngine.ClearImage()
    End Using

    ' shutdown the OCR engine
    ocrEngine.Shutdown()
End Using