OCR Images Using Microsoft Office 2003 SDK

Universal Document Converter is virtual printer software that saves any document you print as a raster PDF or an image file. You can use a post-print feature of Universal Document Converter to apply additional processing to every output file. The example below is just one of many post-print processing solutions.

// Microsoft Office Document Imaging Library (MODI) which is
// pupplied with the Office 2003 package, allows you easily integrate
// OCR functionality into your own applications. For example, you can
// use recognized text for indexing documents in your database.
// Important notice: MS Office 2000 or XP does not contain MODI!
//
// 1) Open your project in Microsoft Visual C++ 6.0
//
// 2) Press "Ctrl+W" in Visual C++ to open ClassWizard
//
// 3) In ClassWizard window press "Add Class->From a type library"
//    button and select "MDIVWCTL.DLL" file. By default this file
//    should be in folder:
//    "C:\Program Files\Common Files\Microsoft Shared\MODI\11.0"
//
// 4) Save files "mdivwctl.h" and "mdivwctl.cpp" and add into
//    your "stdafx.h" file this include: #include "mdivwctl.h"
//
// 5) You must initialize the COM before you call any COM method.
//    Please call "::CoInitialize(0);" before using COM and
//    ::CoUninitialize(); after using COM.

#include "mdivwctl.h"

enum MiLANGUAGES 
{ 
  miLANG_CHINESE_SIMPLIFIED = 2052,
  miLANG_CHINESE_TRADITIONAL = 1028,
  miLANG_CZECH = 5,
  miLANG_DANISH = 6, 
  miLANG_DUTCH = 19,
  miLANG_ENGLISH = 9,
  miLANG_FINNISH = 11,
  miLANG_FRENCH = 12,
  miLANG_GERMAN = 7,
  miLANG_GREEK = 8, 
  miLANG_HUNGARIAN = 14,
  miLANG_ITALIAN = 16,
  miLANG_JAPANESE = 17,
  miLANG_KOREAN = 18,
  miLANG_NORWEGIAN = 20,
  miLANG_POLISH = 21,
  miLANG_PORTUGUESE = 22,
  miLANG_RUSSIAN = 25,
  miLANG_SPANISH = 10,
  miLANG_SWEDISH = 29,
  miLANG_SYSDEFAULT = 2048,
  miLANG_TURKISH = 31
};

enum MiFILE_FORMAT
{
  miFILE_FORMAT_DEFAULTVALUE = -1,
  miFILE_FORMAT_TIFF = 1,
  miFILE_FORMAT_TIFF_LOSSLESS = 2, 
  miFILE_FORMAT_MDI = 4
};

enum MiCOMP_LEVEL 
{
  miCOMP_LEVEL_LOW = 0,
  miCOMP_LEVEL_MEDIUM = 1, 
  miCOMP_LEVEL_HIGH = 2
};

BOOL OCRImageFile( CString sImgFilePath, CString sOutFilePath )
{
  IDocument *pDoc = new IDocument;
  pDoc->CreateDispatch( "MODI.Document" );

  pDoc->Create( sImgFilePath );
  pDoc->OCR( miLANG_ENGLISH, 0, 0 );
////
  FILE    *fpOut = fopen( sOutFilePath, "wt" );
  IImages images = pDoc->GetImages();
  long	  num = images.GetCount();

  for( int i = 0; i < num; i++ )
  {
    IImage  image = images.GetItem(i);
    ILayout layout = image.GetLayout();

    fprintf( fpOut, " --< page %d of %d begin >--\n", i + 1, num );
    fprintf( fpOut, layout.GetText() );
    fprintf( fpOut, " --< page %d of %d end >--\n", i + 1, num );
  }
////
  pDoc->Close(0);

  pDoc->ReleaseDispatch();
  delete pDoc;

  fclose( fpOut );
////
  return (num > 0) ? TRUE : FALSE;
}


  • «Universal Document Converter  is a instrument that allows me to do my CRM consultant work much faster. It converts Documents for my clients,  in various Formats such as  PDF, docx, and even image diagrams in  JPEG, BMP, PCX, PNG, with the best conversion quality!»