OCR Images Using Microsoft Office 2003 SDK
Universal Document Converter is virtual printer software that saves any document you print as a raster PDF or an image file. You can use a post-print feature of Universal Document Converter to apply additional processing to every output file. The example below is just one of many post-print processing solutions.
// Microsoft Office Document Imaging Library (MODI), which is
// supplied with the Office 2003 package, allows you to easily integrate
// OCR functionality into your own applications. For example, you can
// use recognized text for indexing documents in your database.
// Important notice: MS Office 2000 and XP do not contain MODI!
//
// 1) Open your project in Microsoft Visual C++ 6.0
//
// 2) Press "Ctrl+W" in Visual C++ to open ClassWizard
//
// 3) In ClassWizard window press "Add Class->From a type library"
// button and select "MDIVWCTL.DLL" file. By default this file
// should be in folder:
// "C:\Program Files\Common Files\Microsoft Shared\MODI\11.0"
//
// 4) Save the files "mdivwctl.h" and "mdivwctl.cpp", then add
// this include to your "stdafx.h" file: #include "mdivwctl.h"
//
// 5) You must initialize COM before you call any COM method.
// Call "::CoInitialize(0);" before using COM and
// "::CoUninitialize();" after using COM.
#include "mdivwctl.h"
// OCR language selectors. OCRImageFile() passes one of these as the first
// argument of IDocument::OCR().
// Enumerators are listed in ascending numeric order.
// NOTE(review): the numbers are presumably the ones MODI itself defines --
// confirm against the MODI documentation before changing any of them.
enum MiLANGUAGES
{
    miLANG_CZECH               = 5,
    miLANG_DANISH              = 6,
    miLANG_GERMAN              = 7,
    miLANG_GREEK               = 8,
    miLANG_ENGLISH             = 9,
    miLANG_SPANISH             = 10,
    miLANG_FINNISH             = 11,
    miLANG_FRENCH              = 12,
    miLANG_HUNGARIAN           = 14,
    miLANG_ITALIAN             = 16,
    miLANG_JAPANESE            = 17,
    miLANG_KOREAN              = 18,
    miLANG_DUTCH               = 19,
    miLANG_NORWEGIAN           = 20,
    miLANG_POLISH              = 21,
    miLANG_PORTUGUESE          = 22,
    miLANG_RUSSIAN             = 25,
    miLANG_SWEDISH             = 29,
    miLANG_TURKISH             = 31,
    miLANG_CHINESE_TRADITIONAL = 1028,
    miLANG_SYSDEFAULT          = 2048,
    miLANG_CHINESE_SIMPLIFIED  = 2052
};
// Output file format selectors.
// NOTE(review): not referenced by the sample code shown here; presumably
// meant for MODI calls that save a document -- confirm against the MODI
// documentation.
enum MiFILE_FORMAT
{
    miFILE_FORMAT_DEFAULTVALUE  = -1,
    miFILE_FORMAT_TIFF          = 1,
    miFILE_FORMAT_TIFF_LOSSLESS = 2,
    miFILE_FORMAT_MDI           = 4
};
// Compression level selectors (low / medium / high).
// NOTE(review): not referenced by the sample code shown here; presumably
// paired with MiFILE_FORMAT when saving a document -- confirm against the
// MODI documentation.
enum MiCOMP_LEVEL
{
    miCOMP_LEVEL_LOW    = 0,
    miCOMP_LEVEL_MEDIUM = 1,
    miCOMP_LEVEL_HIGH   = 2
};
// Runs MODI OCR (English) on the image file sImgFilePath and writes the
// recognized text of every page to the text file sOutFilePath. Each page is
// wrapped in " --< page N of M begin >--" / " --< page N of M end >--"
// marker lines.
//
// COM must already be initialized by the caller (see the setup notes at the
// top of this sample).
//
// Returns TRUE when at least one page was processed and the output file was
// written without error. Returns FALSE when the "MODI.Document" object
// cannot be created, the output file cannot be opened or written, or the
// document contains no pages.
//
// Exceptions thrown by the MODI dispatch calls are not swallowed: the output
// file is closed and the exception is rethrown to the caller.
BOOL OCRImageFile( CString sImgFilePath, CString sOutFilePath )
{
    // Stack object instead of new/delete, so the wrapper is destroyed on
    // every exit path, including when a dispatch call throws.
    IDocument doc;
    if( !doc.CreateDispatch( "MODI.Document" ) )
        return FALSE;

    long num = 0;
    FILE *fpOut = NULL;

    try
    {
        doc.Create( sImgFilePath );
        doc.OCR( miLANG_ENGLISH, 0, 0 );

        fpOut = fopen( sOutFilePath, "wt" );
        if( fpOut != NULL )
        {
            IImages images = doc.GetImages();
            num = images.GetCount();

            for( long i = 0; i < num; i++ )
            {
                IImage image = images.GetItem( i );
                ILayout layout = image.GetLayout();
                CString sText = layout.GetText();

                fprintf( fpOut, " --< page %ld of %ld begin >--\n", i + 1, num );
                // Recognized text is data, not a format string: a '%' in the
                // OCR result must be written verbatim, never interpreted.
                fputs( sText, fpOut );
                fprintf( fpOut, " --< page %ld of %ld end >--\n", i + 1, num );
            }
        }
    }
    catch( ... )
    {
        // Do not leak the file handle; let the caller see the original error.
        if( fpOut != NULL )
            fclose( fpOut );
        throw;
    }

    // Close the file before touching the document again, so that a failing
    // Close() below cannot leave the output file open.
    BOOL bWritten = FALSE;
    if( fpOut != NULL )
    {
        bWritten = ( ferror( fpOut ) == 0 ) ? TRUE : FALSE;
        if( fclose( fpOut ) != 0 )
            bWritten = FALSE;
    }

    doc.Close( 0 );
    doc.ReleaseDispatch();

    return ( num > 0 && bWritten ) ? TRUE : FALSE;
}