467 lines
16 KiB
C++
467 lines
16 KiB
C++
// DebugSupport.cpp : Defines the entry point for the console application.
|
|
//
|
|
|
|
#include "stdafx.h"
|
|
|
|
int g_StreamIndex = 0;
|
|
FILE *g_fpOutputFile = NULL;
|
|
IStorage *g_pDebugFile = NULL;
|
|
|
|
//--- Forward declarations.

//--- Helpers
WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech );
bool   ParseCommandLine( int argc, char* argv[] );

//--- One extraction routine per stream type dispatched from main()
void   ExtractSentenceBreaks( void );
void   ExtractNormalizedText( void );
void   ExtractLexLookup( void );
void   ExtractPOSPossibilities( void );
void   ExtractMorphology( void );
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
bool fSuccess = false;
|
|
CoInitialize( NULL );
|
|
|
|
fSuccess = ParseCommandLine( argc, argv );
|
|
if ( fSuccess )
|
|
{
|
|
switch ( g_StreamIndex )
|
|
{
|
|
case STREAM_SENTENCEBREAKS:
|
|
ExtractSentenceBreaks();
|
|
break;
|
|
case STREAM_NORMALIZEDTEXT:
|
|
ExtractNormalizedText();
|
|
break;
|
|
case STREAM_LEXLOOKUP:
|
|
ExtractLexLookup();
|
|
break;
|
|
case STREAM_POSPOSSIBILITIES:
|
|
ExtractPOSPossibilities();
|
|
break;
|
|
case STREAM_MORPHOLOGY:
|
|
ExtractMorphology();
|
|
break;
|
|
}
|
|
}
|
|
|
|
CoUninitialize();
|
|
return 0;
|
|
}
|
|
|
|
bool ParseCommandLine( int argc, char* argv[] )
|
|
{
|
|
bool fSuccess = true;
|
|
|
|
//--- Check number of parameters
|
|
if ( argc < 4 )
|
|
{
|
|
goto USAGE;
|
|
}
|
|
|
|
//--- Check streamname validity
|
|
fSuccess = false;
|
|
WCHAR StreamName[MAX_PATH];
|
|
if ( !MultiByteToWideChar( CP_ACP, 0, argv[2], strlen( argv[2] ) + 1, StreamName, MAX_PATH ) )
|
|
{
|
|
goto MISC_ERROR;
|
|
}
|
|
else
|
|
{
|
|
for ( int i = 0; i < STREAM_LASTTYPE; i++ )
|
|
{
|
|
if ( wcscmp( StreamName, StreamTypeStrings[i].pStr ) == 0 )
|
|
{
|
|
fSuccess = true;
|
|
g_StreamIndex = i;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if ( !fSuccess )
|
|
{
|
|
goto USAGE;
|
|
}
|
|
|
|
//--- Try to open debug info file
|
|
WCHAR DebugFilename[MAX_PATH];
|
|
if ( !MultiByteToWideChar( CP_ACP, 0, argv[1], strlen( argv[1] ) + 1, DebugFilename, MAX_PATH ) )
|
|
{
|
|
goto MISC_ERROR;
|
|
}
|
|
|
|
if ( FAILED( StgOpenStorage( DebugFilename, NULL, STGM_READ | STGM_SHARE_DENY_WRITE,
|
|
NULL, 0, &g_pDebugFile ) ) )
|
|
{
|
|
goto MISC_ERROR;
|
|
}
|
|
|
|
//--- Try to open file for output
|
|
WCHAR OutputFilename[MAX_PATH];
|
|
if ( !MultiByteToWideChar( CP_ACP, 0, argv[3], strlen( argv[3] ) + 1, OutputFilename, MAX_PATH ) )
|
|
{
|
|
goto MISC_ERROR;
|
|
}
|
|
|
|
g_fpOutputFile = _wfopen( OutputFilename, L"w" );
|
|
if ( !g_fpOutputFile )
|
|
{
|
|
printf( "\n\nUnable to open file: %s\n", argv[3] );
|
|
goto MISC_ERROR;
|
|
}
|
|
|
|
return true;
|
|
|
|
USAGE:
|
|
printf( "\n\nUSAGE:\n\n\tDebugSupport [debug filename] [streamname] [output filename]\n" );
|
|
printf( "\tStream names are:\n\t\tSentenceBreaks\n\t\tNormalizedText\n\t\tMorphology" );
|
|
printf( "\n\t\tLexLookup\n\n" );
|
|
|
|
return false;
|
|
|
|
MISC_ERROR:
|
|
printf( "\n\n\tERROR in ParseCommandLine(...)\n\n" );
|
|
|
|
return false;
|
|
}
|
|
|
|
//--- Just print the original text out, with a newline character between each sentence.
|
|
void ExtractSentenceBreaks( void )
|
|
{
|
|
IStream *pStgStream = NULL;
|
|
|
|
if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
|
|
0, &pStgStream) == S_OK )
|
|
{
|
|
DebugSentItem Item, EmptyItem;
|
|
ULONG cbRead = 0, ulOffset = 0;
|
|
bool fResetOffset = true;
|
|
|
|
while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
|
|
cbRead == sizeof( Item ) )
|
|
{
|
|
//--- Check for delimiter
|
|
if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"\n" );
|
|
}
|
|
else
|
|
{
|
|
//--- Print item
|
|
fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Just print the normalized text of each item out, separated by single spaces,
|
|
//--- with a newline character between each sentence.
|
|
void ExtractNormalizedText( void )
|
|
{
|
|
IStream *pStgStream = NULL;
|
|
|
|
if ( g_pDebugFile->OpenStream( StreamTypeStrings[5].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
|
|
0, &pStgStream) == S_OK )
|
|
{
|
|
DebugSentItem Item, EmptyItem;
|
|
ULONG cbRead = 0;
|
|
|
|
while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
|
|
cbRead == sizeof( Item ) )
|
|
{
|
|
//--- Check for delimiter
|
|
if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"\n" );
|
|
}
|
|
else
|
|
{
|
|
//--- Print item
|
|
if ( Item.ItemInfo.Type != eALPHA_WORD &&
|
|
Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eOPEN_BRACKET &&
|
|
Item.ItemInfo.Type != eOPEN_BRACE &&
|
|
Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACKET &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACE &&
|
|
Item.ItemInfo.Type != eSINGLE_QUOTE &&
|
|
Item.ItemInfo.Type != eDOUBLE_QUOTE &&
|
|
Item.ItemInfo.Type != ePERIOD &&
|
|
Item.ItemInfo.Type != eEXCLAMATION &&
|
|
Item.ItemInfo.Type != eQUESTION &&
|
|
Item.ItemInfo.Type != eCOMMA &&
|
|
Item.ItemInfo.Type != eSEMICOLON &&
|
|
Item.ItemInfo.Type != eCOLON &&
|
|
Item.ItemInfo.Type != eHYPHEN )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"[ " );
|
|
}
|
|
for ( ULONG i = 0; i < Item.ulNumWords; i++ )
|
|
{
|
|
if ( Item.Words[i].ulWordLen > 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"%s ", Item.Words[i].WordText );
|
|
}
|
|
else
|
|
{
|
|
fwprintf( g_fpOutputFile, L"%s ", Item.ItemSrcText );
|
|
}
|
|
}
|
|
if ( Item.ItemInfo.Type != eALPHA_WORD &&
|
|
Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eOPEN_BRACKET &&
|
|
Item.ItemInfo.Type != eOPEN_BRACE &&
|
|
Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACKET &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACE &&
|
|
Item.ItemInfo.Type != eSINGLE_QUOTE &&
|
|
Item.ItemInfo.Type != eDOUBLE_QUOTE &&
|
|
Item.ItemInfo.Type != ePERIOD &&
|
|
Item.ItemInfo.Type != eEXCLAMATION &&
|
|
Item.ItemInfo.Type != eQUESTION &&
|
|
Item.ItemInfo.Type != eCOMMA &&
|
|
Item.ItemInfo.Type != eSEMICOLON &&
|
|
Item.ItemInfo.Type != eCOLON &&
|
|
Item.ItemInfo.Type != eHYPHEN )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"] " );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Print the text of each item, and then its Pronunciation and Part of Speech.
|
|
//--- Separate each with a newline character.
|
|
void ExtractLexLookup( void )
|
|
{
|
|
IStream *pStgStream = NULL;
|
|
|
|
if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
|
|
0, &pStgStream) == S_OK )
|
|
{
|
|
DebugSentItem Item, EmptyItem;
|
|
ULONG cbRead = 0;
|
|
|
|
while ( SUCCEEDED( pStgStream->Read( (void*) &Item, sizeof( Item ), &cbRead ) ) &&
|
|
cbRead == sizeof( Item ) )
|
|
{
|
|
if ( memcmp( &Item, &EmptyItem, sizeof( Item ) ) == 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"\n" );
|
|
}
|
|
else
|
|
{
|
|
//--- Print Normalization delimiter
|
|
if ( Item.ItemInfo.Type != eALPHA_WORD &&
|
|
Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eOPEN_BRACKET &&
|
|
Item.ItemInfo.Type != eOPEN_BRACE &&
|
|
Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACKET &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACE &&
|
|
Item.ItemInfo.Type != eSINGLE_QUOTE &&
|
|
Item.ItemInfo.Type != eDOUBLE_QUOTE &&
|
|
Item.ItemInfo.Type != ePERIOD &&
|
|
Item.ItemInfo.Type != eEXCLAMATION &&
|
|
Item.ItemInfo.Type != eQUESTION &&
|
|
Item.ItemInfo.Type != eCOMMA &&
|
|
Item.ItemInfo.Type != eSEMICOLON &&
|
|
Item.ItemInfo.Type != eCOLON &&
|
|
Item.ItemInfo.Type != eHYPHEN )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"[ " );
|
|
}
|
|
for ( ULONG i = 0; i < Item.ulNumWords; i++ )
|
|
{
|
|
//--- Print item
|
|
if ( Item.Words[i].WordText[0] != 0 )
|
|
{
|
|
fwprintf ( g_fpOutputFile, L"%s ", Item.Words[i].WordText );
|
|
}
|
|
else
|
|
{
|
|
fwprintf ( g_fpOutputFile, L"%s ", Item.ItemSrcText );
|
|
}
|
|
//--- Print pronunciation
|
|
//CComPtr<ISpPhoneConverter> pPhoneConv;
|
|
//if ( SUCCEEDED( SpCreatePhoneConverter(1033, NULL, NULL, &pPhoneConv) ) )
|
|
//{
|
|
// if ( SUCCEEDED( pPhoneConv->IdToPhone( Item.Words[i].WordPron, Item.Words[i].WordPron ) ) )
|
|
// {
|
|
// fwprintf( g_fpOutputFile, L"%s", Item.Words[i].WordPron );
|
|
// for ( long j = 0; j < (long)( (long)45 - (long)wcslen( Item.Words[i].WordPron ) ); j++ )
|
|
// {
|
|
// fwprintf( g_fpOutputFile, L" " );
|
|
// }
|
|
// }
|
|
//}
|
|
//--- Print POS
|
|
fwprintf ( g_fpOutputFile, L"(%s) ", ConvertPOSToString( Item.Words[i].eWordPartOfSpeech ) );
|
|
}
|
|
//--- Print Normalization delimiter
|
|
if ( Item.ItemInfo.Type != eALPHA_WORD &&
|
|
Item.ItemInfo.Type != eOPEN_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eOPEN_BRACKET &&
|
|
Item.ItemInfo.Type != eOPEN_BRACE &&
|
|
Item.ItemInfo.Type != eCLOSE_PARENTHESIS &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACKET &&
|
|
Item.ItemInfo.Type != eCLOSE_BRACE &&
|
|
Item.ItemInfo.Type != eSINGLE_QUOTE &&
|
|
Item.ItemInfo.Type != eDOUBLE_QUOTE &&
|
|
Item.ItemInfo.Type != ePERIOD &&
|
|
Item.ItemInfo.Type != eEXCLAMATION &&
|
|
Item.ItemInfo.Type != eQUESTION &&
|
|
Item.ItemInfo.Type != eCOMMA &&
|
|
Item.ItemInfo.Type != eSEMICOLON &&
|
|
Item.ItemInfo.Type != eCOLON &&
|
|
Item.ItemInfo.Type != eHYPHEN )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"] " );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void ExtractPOSPossibilities( void )
|
|
{
|
|
IStream *pStgStream = NULL;
|
|
|
|
if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
|
|
0, &pStgStream) == S_OK )
|
|
{
|
|
DebugPronRecord PronRecord, EmptyPronRecord;
|
|
ULONG cbRead = 0;
|
|
|
|
while ( SUCCEEDED( pStgStream->Read( (void*) &PronRecord, sizeof( PronRecord ), &cbRead ) ) &&
|
|
cbRead == sizeof( PronRecord ) )
|
|
{
|
|
//--- Check for delimiter
|
|
if ( memcmp( &PronRecord, &EmptyPronRecord, sizeof( PronRecord ) ) == 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"\n" );
|
|
}
|
|
else
|
|
{
|
|
fwprintf( g_fpOutputFile, PronRecord.orthStr );
|
|
fwprintf( g_fpOutputFile, L" [ " );
|
|
fwprintf( g_fpOutputFile, L"%s - ", ConvertPOSToString( PronRecord.POSchoice ) );
|
|
for ( ULONG i = 0; i < PronRecord.pronArray[0].POScount; i++ )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[0].POScode[i] ) );
|
|
}
|
|
for ( i = 0; i < PronRecord.pronArray[1].POScount; i++ )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"%s,", ConvertPOSToString( (DWORD)PronRecord.pronArray[1].POScode[i] ) );
|
|
}
|
|
fwprintf( g_fpOutputFile, L" ]\n" );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void ExtractMorphology( void )
|
|
{
|
|
IStream *pStgStream = NULL;
|
|
|
|
if ( g_pDebugFile->OpenStream( StreamTypeStrings[g_StreamIndex].pStr, 0, STGM_READ | STGM_SHARE_EXCLUSIVE,
|
|
0, &pStgStream ) == S_OK )
|
|
{
|
|
CComPtr<ISpPhoneConverter> pPhoneConv;
|
|
if ( SUCCEEDED( SpCreatePhoneConverter( 1033, NULL, NULL, &pPhoneConv ) ) )
|
|
{
|
|
WCHAR Buffer[SP_MAX_WORD_LENGTH], EmptyBuffer[SP_MAX_WORD_LENGTH];
|
|
ULONG cbRead = 0;
|
|
ZeroMemory( EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) );
|
|
BOOL fRoot = true;
|
|
|
|
while ( SUCCEEDED( pStgStream->Read( (void*) &Buffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ), &cbRead ) ) &&
|
|
cbRead == SP_MAX_WORD_LENGTH * sizeof( WCHAR ) )
|
|
{
|
|
//--- Check for delimiter
|
|
if ( memcmp( &Buffer, &EmptyBuffer, SP_MAX_WORD_LENGTH * sizeof( WCHAR ) ) == 0 )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"\n" );
|
|
fRoot = true;
|
|
}
|
|
else if ( fRoot )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"%s ", Buffer );
|
|
fRoot = false;
|
|
}
|
|
else
|
|
{
|
|
if ( SUCCEEDED( pPhoneConv->IdToPhone( Buffer, Buffer ) ) )
|
|
{
|
|
fwprintf( g_fpOutputFile, L"- %s ", Buffer );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Maps a part-of-speech code to the short label used in the output files.
//--- Codes that are not in the table yield L"Unknown".  The returned pointer
//--- refers to a string literal and must not be written through.
//---
//--- NOTE(review): MS_NotOverriden is labelled "Noun", the same as MS_Noun -
//--- confirm this is intentional.
//--- Labels for MS_PPron, MS_IPron, MS_RPron, MS_DPron, MS_RVAux, MS_VPart and
//--- MS_Quant are intentionally absent (they were disabled in the original).
WCHAR* ConvertPOSToString( DWORD dwPartOfSpeech )
{
    static const struct
    {
        DWORD        dwCode;
        const WCHAR* pszLabel;
    }
    s_Labels[] =
    {
        { (DWORD) MS_NotOverriden, L"Noun"        },
        { (DWORD) MS_Unknown,      L"Unknown"     },
        { (DWORD) MS_Punctuation,  L"Punctuation" },
        { (DWORD) MS_Noun,         L"Noun"        },
        { (DWORD) MS_Verb,         L"Verb"        },
        { (DWORD) MS_Modifier,     L"Modifier"    },
        { (DWORD) MS_Function,     L"Function"    },
        { (DWORD) MS_Interjection, L"Interj"      },
        { (DWORD) MS_Pron,         L"Pron"        },
        { (DWORD) MS_SubjPron,     L"SubjPron"    },
        { (DWORD) MS_ObjPron,      L"ObjPron"     },
        { (DWORD) MS_RelPron,      L"RelPron"     },
        { (DWORD) MS_Adj,          L"Adj"         },
        { (DWORD) MS_Adv,          L"Adv"         },
        { (DWORD) MS_VAux,         L"VAux"        },
        { (DWORD) MS_Conj,         L"Conj"        },
        { (DWORD) MS_CConj,        L"CConj"       },
        { (DWORD) MS_Interr,       L"WHWord"      },
        { (DWORD) MS_Det,          L"Det"         },
        { (DWORD) MS_Contr,        L"Contr"       },
        { (DWORD) MS_Prep,         L"Prep"        },
    };

    for ( size_t n = 0; n < sizeof( s_Labels ) / sizeof( s_Labels[0] ); n++ )
    {
        if ( s_Labels[n].dwCode == dwPartOfSpeech )
        {
            return const_cast<WCHAR*>( s_Labels[n].pszLabel );
        }
    }

    return const_cast<WCHAR*>( L"Unknown" );
}
|