// DebugSupport.cpp : Defines the entry point for the console application. // #include "stdafx.h" int g_StreamIndex = 0; FILE *g_fpOutputFile = NULL; IStorage *g_pDebugFile = NULL; WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech ); bool ParseCommandLine( int argc, char* argv[] ); void ExtractSentenceBreaks( void ); void ExtractNormalizedText( void ); void ExtractLexLookup( void ); void ExtractPOSPossibilities( void ); void ExtractMorphology( void ); int main(int argc, char* argv[]) { bool fSuccess = false; CoInitialize( NULL ); fSuccess = ParseCommandLine( argc, argv ); if ( fSuccess ) { switch ( g_StreamIndex ) { case STREAM_SENTENCEBREAKS: ExtractSentenceBreaks(); break; case STREAM_NORMALIZEDTEXT: ExtractNormalizedText(); break; case STREAM_LEXLOOKUP: ExtractLexLookup(); break; case STREAM_POSPOSSIBILITIES: ExtractPOSPossibilities(); break; case STREAM_MORPHOLOGY: ExtractMorphology(); break; } } CoUninitialize(); return 0; } bool ParseCommandLine( int argc, char* argv[] ) { bool fSuccess = true; //--- Check number of parameters if ( argc < 4 ) { goto USAGE; } //--- Check streamname validity fSuccess = false; WCHAR StreamName[MAX_PATH]; if ( !MultiByteToWideChar( CP_ACP, 0, argv[2], strlen( argv[2] ) + 1, StreamName, MAX_PATH ) ) { goto MISC_ERROR; } else { for ( int i = 0; i < STREAM_LASTTYPE; i++ ) { if ( wcscmp( StreamName, StreamTypeStrings[i].pStr ) == 0 ) { fSuccess = true; g_StreamIndex = i; break; } } } if ( !fSuccess ) { goto USAGE; } //--- Try to open debug info file WCHAR DebugFilename[MAX_PATH]; if ( !MultiByteToWideChar( CP_ACP, 0, argv[1], strlen( argv[1] ) + 1, DebugFilename, MAX_PATH ) ) { goto MISC_ERROR; } if ( FAILED( StgOpenStorage( DebugFilename, NULL, STGM_READ | STGM_SHARE_DENY_WRITE, NULL, 0, &g_pDebugFile ) ) ) { goto MISC_ERROR; } //--- Try to open file for output WCHAR OutputFilename[MAX_PATH]; if ( !MultiByteToWideChar( CP_ACP, 0, argv[3], strlen( argv[3] ) + 1, OutputFilename, MAX_PATH ) ) { goto MISC_ERROR; } g_fpOutputFile = _wfopen( OutputFilename, L"w" ); if ( !g_fpOutputFile ) { printf( "\n\nUnable to open file: %s\n", argv[3] ); goto MISC_ERROR; } return true; USAGE: printf( "\n\nUSAGE:\n\n\tDebugSupport [debug filename] [streamname] [output filename]\n" ); printf( "\tStream names are:\n\t\tSentenceBreaks\n\t\tNormalizedText\n\t\tMorphology" ); printf( "\n\t\tLexLookup\n\n" ); return false; MISC_ERROR: printf( "\n\n\tERROR in ParseCommandLine(...)\n\n" ); return false; } //--- Just print the original text out, with a newline character between each sentence. void ExtractSentenceBreaks( void ) { IStream *pStgStream = NULL; if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, &pStgStream) == S_OK ) { DebugSentItem Item, EmptyItem; ULONG cbRead = 0, ulOffset = 0; bool fResetOffset = true; while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) && cbRead == sizeof( Item ) ) { //--- Check for delimiter if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 ) { fwprintf( g_fpOutputFile, L"\n" ); } else { //--- Print item fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText ); } } } } //--- Just print the normalized text of each item out, separated by single spaces, //--- with a newline character between each sentence. void ExtractNormalizedText( void ) { IStream *pStgStream = NULL; if ( g_pDebugFile->OpenStream( StreamTypeStrings[5].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, &pStgStream) == S_OK ) { DebugSentItem Item, EmptyItem; ULONG cbRead = 0; while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) && cbRead == sizeof( Item ) ) { //--- Check for delimiter if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 ) { fwprintf( g_fpOutputFile, L"\n" ); } else { //--- Print item if ( Item.ItemInfo.Type != eALPHA_WORD && Item.ItemInfo.Type != eOPEN_PARENTHESIS && Item.ItemInfo.Type != eOPEN_BRACKET && Item.ItemInfo.Type != eOPEN_BRACE && Item.ItemInfo.Type != eCLOSE_PARENTHESIS && Item.ItemInfo.Type != eCLOSE_BRACKET && Item.ItemInfo.Type != eCLOSE_BRACE && Item.ItemInfo.Type != eSINGLE_QUOTE && Item.ItemInfo.Type != eDOUBLE_QUOTE && Item.ItemInfo.Type != ePERIOD && Item.ItemInfo.Type != eEXCLAMATION && Item.ItemInfo.Type != eQUESTION && Item.ItemInfo.Type != eCOMMA && Item.ItemInfo.Type != eSEMICOLON && Item.ItemInfo.Type != eCOLON && Item.ItemInfo.Type != eHYPHEN ) { fwprintf( g_fpOutputFile, L"[ " ); } for ( ULONG i = 0; i < Item.ulNumWords; i++ ) { if ( Item.Words[i].ulWordLen > 0 ) { fwprintf( g_fpOutputFile, L"%s ", Item.Words[i].WordText ); } else { fwprintf( g_fpOutputFile, L"%s ", Item.ItemSrcText ); } } if ( Item.ItemInfo.Type != eALPHA_WORD && Item.ItemInfo.Type != eOPEN_PARENTHESIS && Item.ItemInfo.Type != eOPEN_BRACKET && Item.ItemInfo.Type != eOPEN_BRACE && Item.ItemInfo.Type != eCLOSE_PARENTHESIS && Item.ItemInfo.Type != eCLOSE_BRACKET && Item.ItemInfo.Type != eCLOSE_BRACE && Item.ItemInfo.Type != eSINGLE_QUOTE && Item.ItemInfo.Type != eDOUBLE_QUOTE && Item.ItemInfo.Type != ePERIOD && Item.ItemInfo.Type != eEXCLAMATION && Item.ItemInfo.Type != eQUESTION && Item.ItemInfo.Type != eCOMMA && Item.ItemInfo.Type != eSEMICOLON && Item.ItemInfo.Type != eCOLON && Item.ItemInfo.Type != eHYPHEN ) { fwprintf( g_fpOutputFile, L"] " ); } } } } } //--- Print the text of each item, and then its Pronunciation and Part of Speech. //--- Separate each with a newline character. void ExtractLexLookup( void ) { IStream *pStgStream = NULL; if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, &pStgStream) == S_OK ) { DebugSentItem Item, EmptyItem; ULONG cbRead = 0; while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) && cbRead == sizeof( Item ) ) { if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 ) { fwprintf( g_fpOutputFile, L"\n" ); } else { //--- Print Normalization delimiter if ( Item.ItemInfo.Type != eALPHA_WORD && Item.ItemInfo.Type != eOPEN_PARENTHESIS && Item.ItemInfo.Type != eOPEN_BRACKET && Item.ItemInfo.Type != eOPEN_BRACE && Item.ItemInfo.Type != eCLOSE_PARENTHESIS && Item.ItemInfo.Type != eCLOSE_BRACKET && Item.ItemInfo.Type != eCLOSE_BRACE && Item.ItemInfo.Type != eSINGLE_QUOTE && Item.ItemInfo.Type != eDOUBLE_QUOTE && Item.ItemInfo.Type != ePERIOD && Item.ItemInfo.Type != eEXCLAMATION && Item.ItemInfo.Type != eQUESTION && Item.ItemInfo.Type != eCOMMA && Item.ItemInfo.Type != eSEMICOLON && Item.ItemInfo.Type != eCOLON && Item.ItemInfo.Type != eHYPHEN ) { fwprintf( g_fpOutputFile, L"[ " ); } for ( ULONG i = 0; i < Item.ulNumWords; i++ ) { //--- Print item if ( Item.Words[i].WordText[0] != 0 ) { fwprintf ( g_fpOutputFile, L"%s ", Item.Words[i].WordText ); } else { fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText ); } //--- Print pronunciation //CComPtr pPhoneConv; //if ( SUCCEEDED( SpCreatePhoneConverter(1033, NULL, NULL, &pPhoneConv) ) ) //{ // if ( SUCCEEDED( pPhoneConv->IdToPhone( Item.Words[i].WordPron, Item.Words[i].WordPron ) ) ) // { // fwprintf( g_fpOutputFile, L"%s", Item.Words[i].WordPron ); // for ( long j = 0; j < (long)( (long)45 - (long)wcslen( Item.Words[i].WordPron ) ); j++ ) // { // fwprintf( g_fpOutputFile, L" " ); // } // } //} //--- Print POS fwprintf ( g_fpOutputFile, L"(%s) ", ConvertPOSToString( Item.Words[i].eWordPartOfSpeech ) ); } //--- Print Normalization delimiter if ( Item.ItemInfo.Type != eALPHA_WORD && Item.ItemInfo.Type != eOPEN_PARENTHESIS && Item.ItemInfo.Type != eOPEN_BRACKET && Item.ItemInfo.Type != eOPEN_BRACE && Item.ItemInfo.Type != eCLOSE_PARENTHESIS && Item.ItemInfo.Type != eCLOSE_BRACKET && Item.ItemInfo.Type != eCLOSE_BRACE && Item.ItemInfo.Type != eSINGLE_QUOTE && Item.ItemInfo.Type != eDOUBLE_QUOTE && Item.ItemInfo.Type != ePERIOD && Item.ItemInfo.Type != eEXCLAMATION && Item.ItemInfo.Type != eQUESTION && Item.ItemInfo.Type != eCOMMA && Item.ItemInfo.Type != eSEMICOLON && Item.ItemInfo.Type != eCOLON && Item.ItemInfo.Type != eHYPHEN ) { fwprintf( g_fpOutputFile, L"] " ); } } } } } void ExtractPOSPossibilities( void ) { IStream *pStgStream = NULL; if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, &pStgStream) == S_OK ) { DebugPronRecord PronRecord, EmptyPronRecord; ULONG cbRead = 0; while ( SUCCEEDED( pStgStream->Read( (void*) &PronRecord, sizeof( PronRecord ), &cbRead ) ) && cbRead == sizeof( PronRecord ) ) { //--- Check for delimiter if ( memcmp( &PronRecord, &EmptyPronRecord, sizeof( PronRecord ) ) == 0 ) { fwprintf( g_fpOutputFile, L"\n" ); } else { fwprintf( g_fpOutputFile, PronRecord.orthStr ); fwprintf( g_fpOutputFile, L" [ " ); fwprintf( g_fpOutputFile, L"%s - ", ConvertPOSToString( PronRecord.POSchoice ) ); for ( ULONG i = 0; i < PronRecord.pronArray[0].POScount; i++ ) { fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[0].POScode[i] ) ); } for ( i = 0; i < PronRecord.pronArray[1].POScount; i++ ) { fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[1].POScode[i] ) ); } fwprintf( g_fpOutputFile, L" ]\n" ); } } } } void ExtractMorphology( void ) { IStream *pStgStream = NULL; if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE, 0, &pStgStream ) == S_OK ) { CComPtr pPhoneConv; if ( SUCCEEDED( SpCreatePhoneConverter( 1033, NULL, NULL, &pPhoneConv ) ) ) { WCHAR Buffer[SP_MAX_WORD_LENGTH], EmptyBuffer[SP_MAX_WORD_LENGTH]; ULONG cbRead = 0; ZeroMemory( EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) ); BOOL fRoot = true; while ( SUCCEEDED( pStgStream->Read( (void*) &Buffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ), &cbRead ) ) && cbRead == SP_MAX_WORD_LENGTH * sizeof( WCHAR ) ) { //--- Check for delimiter if ( memcmp( &Buffer, &EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) ) == 0 ) { fwprintf( g_fpOutputFile, L"\n" ); fRoot = true; } else if ( fRoot ) { fwprintf( g_fpOutputFile, L"%s ", Buffer ); fRoot = false; } else { if ( SUCCEEDED( pPhoneConv->IdToPhone( Buffer, Buffer ) ) ) { fwprintf( g_fpOutputFile, L"- %s ", Buffer ); } } } } } } WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech ) { switch (dwPartOfSpeech) { case MS_NotOverriden: return L"Noun"; case MS_Unknown: return L"Unknown"; case MS_Punctuation: return L"Punctuation"; case MS_Noun: return L"Noun"; case MS_Verb: return L"Verb"; case MS_Modifier: return L"Modifier"; case MS_Function: return L"Function"; case MS_Interjection: return L"Interj"; case MS_Pron: return L"Pron"; case MS_SubjPron: return L"SubjPron"; case MS_ObjPron: return L"ObjPron"; case MS_RelPron: return L"RelPron"; // case MS_PPron: // return L"PPron"; // case MS_IPron: // return L"IPron"; // case MS_RPron: // return L"RPron"; // case MS_DPron: // return L"DPron"; case MS_Adj: return L"Adj"; case MS_Adv: return L"Adv"; case MS_VAux: return L"VAux"; // case MS_RVAux: // return L"RVAux"; case MS_Conj: return L"Conj"; case MS_CConj: return L"CConj"; case MS_Interr: return L"WHWord"; case MS_Det: return L"Det"; case MS_Contr: return L"Contr"; // case MS_VPart: // return L"VPart"; case MS_Prep: return L"Prep"; // case MS_Quant: // return L"Quant"; default: return L"Unknown"; } }