//+--------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1997 - 1999 // // File: IWBreak.cxx // // Contents: Korean Word Breaker glue code // // History: weibz, 10-Sep-1997 created // //---------------------------------------------------------------------------- #include #include "iwbreak.hxx" #define MAXFORMS 20 extern long gulcInstances; extern HSTM g_hStm; extern BOOL g_fLoad; //extern CRITICAL_SECTION ThCritSect; //+--------------------------------------------------------------------------- // // Member: CWordBreaker::CWordBreaker // // Synopsis: Constructor for the CWordBreaker class. // // Arguments: [lcid] -- locale id // //---------------------------------------------------------------------------- CWordBreaker::CWordBreaker( LCID lcid ) : _cRefs(1), _lcid(lcid) { InterlockedIncrement( &gulcInstances ); } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::~CWordBreaker // // Synopsis: Destructor for the CWordBreaker class. // // Notes: All termination/deallocation is done by embedded smart pointers // //---------------------------------------------------------------------------- CWordBreaker::~CWordBreaker() { InterlockedDecrement( &gulcInstances ); } //+------------------------------------------------------------------------- // // Method: CWordBreaker::QueryInterface // // Synopsis: Rebind to other interface // // Arguments: [riid] -- IID of new interface // [ppvObject] -- New interface * returned here // // Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject) { // // Optimize QueryInterface by only checking minimal number of bytes. // // IID_IUnknown = 00000000-0000-0000-C000-000000000046 // IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6 // -------- // | // +--- Unique! // Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 ); Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 ); IUnknown *pUnkTemp; SCODE sc = S_OK; switch( riid.Data1 ) { case 0x00000000: if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 ) pUnkTemp = (IUnknown *)this; else sc = E_NOINTERFACE; break; case 0xD53552C8: if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 ) pUnkTemp = (IUnknown *)(IWordBreaker *)this; else sc = E_NOINTERFACE; break; default: pUnkTemp = 0; sc = E_NOINTERFACE; break; } if( 0 != pUnkTemp ) { *ppvObject = (void * )pUnkTemp; pUnkTemp->AddRef(); } else *ppvObject = 0; return(sc); } //+------------------------------------------------------------------------- // // Method: CWordBreaker::AddRef // // Synopsis: Increments refcount // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CWordBreaker::AddRef() { return InterlockedIncrement( &_cRefs ); } //+------------------------------------------------------------------------- // // Method: CWordBreaker::Release // // Synopsis: Decrement refcount. Delete if necessary. // //-------------------------------------------------------------------------- ULONG STDMETHODCALLTYPE CWordBreaker::Release() { unsigned long uTmp = InterlockedDecrement( &_cRefs ); if ( 0 == uTmp ) delete this; return(uTmp); } //+------------------------------------------------------------------------- // // Method: CWordBreaker::Init // // Synopsis: Initialize word-breaker // // Arguments: [fQuery] -- TRUE if query-time // [ulMaxTokenSize] -- Maximum size token stored by caller // [pfLicense] -- Set to true if use restricted // // Returns: Status code // //-------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CWordBreaker::Init( BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense ) { if ( NULL == pfLicense ) return E_INVALIDARG; if (IsBadWritePtr(pfLicense, sizeof(DWORD))) return E_INVALIDARG; if ( !StemInit() ) return LANGUAGE_E_DATABASE_NOT_FOUND; *pfLicense = TRUE; _fQuery = fQuery; _ulMaxTokenSize = ulMaxTokenSize; return S_OK; } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::ComposePhrase // // Synopsis: Convert a noun and a modifier into a phrase. // // Arguments: [pwcNoun] -- pointer to noun. // [cwcNoun] -- count of chars in pwcNoun // [pwcModifier] -- pointer to word modifying pwcNoun // [cwcModifier] -- count of chars in pwcModifier // [ulAttachmentType] -- relationship between pwcNoun &pwcModifier // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CWordBreaker::ComposePhrase( WCHAR const *pwcNoun, ULONG cwcNoun, WCHAR const *pwcModifier, ULONG cwcModifier, ULONG ulAttachmentType, WCHAR *pwcPhrase, ULONG *pcwcPhrase ) { // // Need to code in later // if ( _fQuery ) return( E_NOTIMPL ); else return ( WBREAK_E_QUERY_ONLY ); } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::GetLicenseToUse // // Synopsis: Returns a pointer to vendors license information // // Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense ) { static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998"; if ( NULL == ppwcsLicense ) { return E_INVALIDARG; } if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) { return E_INVALIDARG; } *ppwcsLicense = wcsCopyright; return( S_OK ); } //+--------------------------------------------------------------------------- // // Member: CWordBreaker::BreakText // // Synopsis: Break input stream into words. // // Arguments: [pTextSource] -- source of Unicode text // [pWordSink] -- sink for collecting words // [pPhraseSink] -- sink for collecting phrases // // History: 10-Sep-1997, WeibZ, Created. // // Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN // we process the buffer in chunks of length MAX_II_BUFFER_LEN. // //---------------------------------------------------------------------------- SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource, IWordSink *pWordSink, IPhraseSink *pPhraseSink ) { SCODE sc = S_OK; ULONG cwc; WT Type; BOOL Ret_ProcToken; if ( NULL == pTextSource ) { // OutputDebugString("\nPTextSources is Null\n"); return E_INVALIDARG; } if ( NULL == pWordSink ) { // BUGBUG, propagate the null word sink error code return sc; } // BUGBUG, need to normalize nums within T-Hammer, pass as flag? // turn on noun phrase analysis if there is a phrase sink if ( 0 != pPhraseSink ) { // BUGBUG, do we need to pass a separate flag to T-Hammer for this? // ignore the phrase sink for now // return sc; } if (pTextSource->iEnd == pTextSource->iCur) { return S_OK; } Assert( pTextSource->iCur < pTextSource->iEnd ); __try { do { while ( pTextSource->iCur < pTextSource->iEnd ) { cwc = pTextSource->iEnd - pTextSource->iCur; Tokenize( cwc, TRUE, pTextSource, &Type); if ( Type != WT_REACHEND ) { Ret_ProcToken = ProcessTokens( pTextSource, Type, pWordSink, pPhraseSink ); if ( !Ret_ProcToken ) { // Process_Tokens return FALSE, so return here return E_UNEXPECTED; } pTextSource->iCur += _cchTextProcessed; } else break; } } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) ); while ( pTextSource->iCur < pTextSource->iEnd ) { cwc = pTextSource->iEnd - pTextSource->iCur; Tokenize( cwc, FALSE, pTextSource, &Type); Ret_ProcToken = ProcessTokens( pTextSource, Type, pWordSink, pPhraseSink ); if ( !Ret_ProcToken ) { // Process_Tokens return FALSE, so return here return E_UNEXPECTED; } pTextSource->iCur += _cchTextProcessed; } } __except(1) { sc = E_UNEXPECTED; } return sc; } void CWordBreaker::Tokenize( unsigned cwc, BOOL bMoreText, TEXT_SOURCE *pTextSource, WT *Type) { ULONG i; BYTE ct; BOOL fRomanWord = FALSE; BOOL fHanguelWord = FALSE; CONST WCHAR *pwcInput, *pwcStem; _cchTextProcessed = 0; *Type = WT_START; pwcStem = pwcInput = pTextSource->awcBuffer + pTextSource->iCur; for (i=0; i< cwc; i++, pwcInput++) { ct = GetCharType(*pwcInput); if ( (ct != WS) && (ct != PS) && (ct != HG) ) ct = CH; switch (ct) { case CH : // check to see if there is a Hanguel word before this char if (fHanguelWord) { _cchTextProcessed = (DWORD)(pwcInput - pwcStem); return; } if (!fRomanWord) { pwcStem = pwcInput; fRomanWord = TRUE; *Type = WT_ROMAJI; } break; case HG : // check to see if there is an English word before this char if ( fRomanWord ) { _cchTextProcessed = (DWORD)(pwcInput - pwcStem); return; } if (!fHanguelWord) { pwcStem = pwcInput; fHanguelWord = TRUE; *Type = WT_HANGUEL; } break; case WS : if (fRomanWord || fHanguelWord) { _cchTextProcessed = (DWORD)(pwcInput - pwcStem); return; } *Type = WT_WORD_SEP; _cchTextProcessed = 1; return; case PS : if (fRomanWord || fHanguelWord) { _cchTextProcessed = (DWORD)(pwcInput - pwcStem); return; } *Type = WT_PHRASE_SEP; _cchTextProcessed = 1; return; } } if ( bMoreText ) { _cchTextProcessed = 0; *Type = WT_REACHEND; } else _cchTextProcessed = cwc; } BOOL CWordBreaker::ProcessTokens( TEXT_SOURCE *pTextSource, WT Type, IWordSink *pWordSink, IPhraseSink *pPhraseSink ) { CONST WCHAR *pwcStem; if ( Type == WT_PHRASE_SEP) { pWordSink->PutBreak (WORDREP_BREAK_EOS); return TRUE; } if ( Type == WT_ROMAJI) { ULONG i; pwcStem = pTextSource->awcBuffer + pTextSource->iCur; #ifdef KORDBG OutputDebugString("\n"); for (i=0; i< _cchTextProcessed; i++) { char ctmp[2]; ctmp[0] = pwcStem[i] & 0xff; ctmp[1] = '\0'; OutputDebugString(ctmp); } OutputDebugString(" "); #endif (pWordSink->PutWord)(_cchTextProcessed, pwcStem, _cchTextProcessed, pTextSource->iCur); return TRUE; } if ( Type == WT_HANGUEL ) { WCHAR TokenWord[80]; ULONG i; WDOB sob; // EnterCriticalSection(&ThCritSect); sob.wordlist = (LPWSTR)LocalAlloc(LPTR, 200); sob.sch = 200; if (sob.wordlist == NULL ) return FALSE; pwcStem = pTextSource->awcBuffer + pTextSource->iCur; for (i=0; i<_cchTextProcessed; i++) { #ifdef KORDBG WORD wtmp; char ctmp[80]; wtmp = pwcStem[i]; sprintf(ctmp, "%4x ", wtmp); OutputDebugString(ctmp); #endif TokenWord[i] = pwcStem[i]; } TokenWord[_cchTextProcessed] = L'\0'; #ifdef KORDBG OutputDebugString("\nBefore StemmerDecomposeW\n"); #endif if (StemmerDecomposeW(g_hStm, TokenWord, &sob) == NULL) { ULONG wInLexLen; WORD winfo; ULONG num, len, j, k; WCHAR *pWordList, *pVerb; ULONG NumEf; BOOL fExist; WCHAR *pwszStart[MAXFORMS]; NumEf = 0; do { num = sob.num; pWordList = sob.wordlist; for (j=0; j 00. // so the length should be len + 2 = len + 1 + 1. pVerb = (LPWSTR)LocalAlloc(LPTR, (len+2)*sizeof(WCHAR)); if (pVerb == NULL ) { if (sob.wordlist) LocalFree(sob.wordlist); for (k=0; k 00 Winfo. // so the length should be len+3 = len + 1 + 1 + 1. pwszStart[NumEf]=(LPWSTR)LocalAlloc(LPTR,(len+3)*sizeof(WCHAR)); if ( pwszStart[NumEf] == NULL ) { // alloc error, so return here if (sob.wordlist) LocalFree(sob.wordlist); if ( pVerb != NULL ) LocalFree(pVerb); for (k=0; kPutAltWord(wInLexLen, pwszStart[i], _cchTextProcessed, pTextSource->iCur); } // handle the last one. wInLexLen = wcslen(pwszStart[NumEf-1]); pWordSink->PutWord(wInLexLen, pwszStart[NumEf-1], _cchTextProcessed, pTextSource->iCur); } else { if ( NumEf == 1 ) { // handle this only one. wInLexLen = wcslen(pwszStart[NumEf-1]); pWordSink->PutWord(wInLexLen, pwszStart[NumEf-1], _cchTextProcessed, pTextSource->iCur); } else { ULONG uNum_Noun; ULONG uIndex[MAXFORMS]; uNum_Noun = 0; for (i=0; iPutAltWord(wInLexLen, pwszStart[i], _cchTextProcessed, pTextSource->iCur); } // handle the last one. wInLexLen = wcslen(pwszStart[NumEf-1]); pWordSink->PutWord(wInLexLen, pwszStart[NumEf-1], _cchTextProcessed, pTextSource->iCur); } if (uNum_Noun == 1) { // there is only One Noun, and we just use this one to query. ULONG index; index = uIndex[0]; wInLexLen = wcslen(pwszStart[index]); pWordSink->PutWord(wInLexLen, pwszStart[index], _cchTextProcessed, pTextSource->iCur); } if ( uNum_Noun > 1 ) { // there are more than one Noun, Use all those Noun to query. ULONG index; for (i=0; iPutAltWord(wInLexLen, pwszStart[index], _cchTextProcessed, pTextSource->iCur); } // handle the last Noun. index = uIndex[uNum_Noun-1]; wInLexLen = wcslen(pwszStart[index]); pWordSink->PutWord(wInLexLen, pwszStart[index], _cchTextProcessed, pTextSource->iCur); } } } #ifdef KORDBG { char ctmp[80]; OutputDebugString("\nStemmerDecomposeW Correct\n"); OutputDebugString(" the Num of Stemm is "); sprintf(ctmp, "%4x ", NumEf); OutputDebugString(ctmp); OutputDebugString("\n"); } for (i=0; i< NumEf; i++) { WORD wtmp; char ctmp[80]; wInLexLen = wcslen(pwszStart[i]); for (j=0; jPutWord(_cchTextProcessed, TokenWord, _cchTextProcessed, pTextSource->iCur); } LocalFree(sob.wordlist); // LeaveCriticalSection (&ThCritSect); } return TRUE; }