OCR: How to export the OCR result to an hOCR file?
In This Topic
OCR results can be exported to an hOCR file.
Here is C#/VB.NET code that shows how to export the OCR result to an hOCR file:
string imageFilePath = @"D:\TestImage.png";

// Sample flow: initialize the Tesseract OCR engine, recognize the text in one image
// and write the recognition result to an hOCR file located next to the image.
using (var ocrEngine = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr())
{
    // configure the engine to recognize English text and initialize it
    var engineSettings = new Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(
        Vintasoft.Imaging.Ocr.OcrLanguage.English);
    ocrEngine.Init(engineSettings);

    // open the image that contains the text
    using (var sourceImage = new Vintasoft.Imaging.VintasoftImage(imageFilePath))
    {
        // pass the image to the engine and run the recognition
        ocrEngine.SetImage(sourceImage);
        Vintasoft.Imaging.Ocr.Results.OcrPage recognizedPage = ocrEngine.Recognize();

        // build the output path: same directory and base name as the image, ".hocr" extension
        string outputDirectory = System.IO.Path.GetDirectoryName(imageFilePath);
        string outputFileName = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) + ".hocr";
        string hocrFilePath = System.IO.Path.Combine(outputDirectory, outputFileName);

        // open the output file (FileMode.Create) and export the OCR result into it
        using (System.IO.Stream hocrStream = System.IO.File.Open(hocrFilePath, System.IO.FileMode.Create))
        {
            var codec = new Vintasoft.Imaging.Ocr.Results.HOcrCodec();
            codec.Export(recognizedPage, hocrStream);
        }

        // detach the image from the engine before the image is disposed
        ocrEngine.ClearImage();
    }

    // shut the engine down before it is disposed
    ocrEngine.Shutdown();
}
Dim imageFilePath As String = "D:\TestImage.png"

' Sample flow: initialize the Tesseract OCR engine, recognize the text in one image
' and write the recognition result to an hOCR file located next to the image.
Using ocrEngine As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr = New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcr()
    ' configure the engine to recognize English text and initialize it
    Dim engineSettings As Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings = _
        New Vintasoft.Imaging.Ocr.Tesseract.TesseractOcrSettings(Vintasoft.Imaging.Ocr.OcrLanguage.English)
    ocrEngine.Init(engineSettings)

    ' open the image that contains the text
    Using sourceImage As Vintasoft.Imaging.VintasoftImage = New Vintasoft.Imaging.VintasoftImage(imageFilePath)
        ' pass the image to the engine and run the recognition
        ocrEngine.SetImage(sourceImage)
        Dim recognizedPage As Vintasoft.Imaging.Ocr.Results.OcrPage = ocrEngine.Recognize()

        ' build the output path: same directory and base name as the image, ".hocr" extension
        Dim outputDirectory As String = System.IO.Path.GetDirectoryName(imageFilePath)
        Dim outputFileName As String = System.IO.Path.GetFileNameWithoutExtension(imageFilePath) & ".hocr"
        Dim hocrFilePath As String = System.IO.Path.Combine(outputDirectory, outputFileName)

        ' open the output file (FileMode.Create) and export the OCR result into it
        Using hocrStream As System.IO.Stream = System.IO.File.Open(hocrFilePath, System.IO.FileMode.Create)
            Dim codec As Vintasoft.Imaging.Ocr.Results.HOcrCodec = New Vintasoft.Imaging.Ocr.Results.HOcrCodec()
            codec.Export(recognizedPage, hocrStream)
        End Using

        ' detach the image from the engine before the image is disposed
        ocrEngine.ClearImage()
    End Using

    ' shut the engine down before it is disposed
    ocrEngine.Shutdown()
End Using