898 lines
26 KiB
C++
898 lines
26 KiB
C++
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Microsoft Windows
|
||
|
// Copyright (C) Microsoft Corporation, 1997 - 1999
|
||
|
//
|
||
|
// File: IWBreak.cxx
|
||
|
//
|
||
|
// Contents: Korean Word Breaker glue code
|
||
|
//
|
||
|
// History: weibz, 10-Sep-1997 created
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
#include <pch.cxx>
|
||
|
|
||
|
#include "iwbreak.hxx"
|
||
|
|
||
|
#define MAXFORMS 20
|
||
|
|
||
|
extern long gulcInstances;
|
||
|
extern HSTM g_hStm;
|
||
|
extern BOOL g_fLoad;
|
||
|
//extern CRITICAL_SECTION ThCritSect;
|
||
|
|
||
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Member: CWordBreaker::CWordBreaker
|
||
|
//
|
||
|
// Synopsis: Constructor for the CWordBreaker class.
|
||
|
//
|
||
|
// Arguments: [lcid] -- locale id
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
CWordBreaker::CWordBreaker( LCID lcid )
|
||
|
: _cRefs(1),
|
||
|
_lcid(lcid)
|
||
|
{
|
||
|
|
||
|
InterlockedIncrement( &gulcInstances );
|
||
|
|
||
|
}
|
||
|
|
||
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Member: CWordBreaker::~CWordBreaker
|
||
|
//
|
||
|
// Synopsis: Destructor for the CWordBreaker class.
|
||
|
//
|
||
|
// Notes: All termination/deallocation is done by embedded smart pointers
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
CWordBreaker::~CWordBreaker()
|
||
|
{
|
||
|
InterlockedDecrement( &gulcInstances );
|
||
|
|
||
|
|
||
|
}
|
||
|
|
||
|
//+-------------------------------------------------------------------------
|
||
|
//
|
||
|
// Method: CWordBreaker::QueryInterface
|
||
|
//
|
||
|
// Synopsis: Rebind to other interface
|
||
|
//
|
||
|
// Arguments: [riid] -- IID of new interface
|
||
|
// [ppvObject] -- New interface * returned here
|
||
|
//
|
||
|
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
|
||
|
//
|
||
|
//--------------------------------------------------------------------------
|
||
|
|
||
|
SCODE STDMETHODCALLTYPE
|
||
|
CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
|
||
|
{
|
||
|
//
|
||
|
// Optimize QueryInterface by only checking minimal number of bytes.
|
||
|
//
|
||
|
// IID_IUnknown = 00000000-0000-0000-C000-000000000046
|
||
|
// IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6
|
||
|
// --------
|
||
|
// |
|
||
|
// +--- Unique!
|
||
|
//
|
||
|
|
||
|
Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 );
|
||
|
Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 );
|
||
|
|
||
|
IUnknown *pUnkTemp;
|
||
|
SCODE sc = S_OK;
|
||
|
|
||
|
switch( riid.Data1 )
|
||
|
{
|
||
|
case 0x00000000:
|
||
|
if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 )
|
||
|
pUnkTemp = (IUnknown *)this;
|
||
|
else
|
||
|
sc = E_NOINTERFACE;
|
||
|
break;
|
||
|
|
||
|
case 0xD53552C8:
|
||
|
if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 )
|
||
|
pUnkTemp = (IUnknown *)(IWordBreaker *)this;
|
||
|
else
|
||
|
sc = E_NOINTERFACE;
|
||
|
break;
|
||
|
|
||
|
default:
|
||
|
pUnkTemp = 0;
|
||
|
sc = E_NOINTERFACE;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if( 0 != pUnkTemp )
|
||
|
{
|
||
|
*ppvObject = (void * )pUnkTemp;
|
||
|
pUnkTemp->AddRef();
|
||
|
}
|
||
|
else
|
||
|
*ppvObject = 0;
|
||
|
return(sc);
|
||
|
}
|
||
|
|
||
|
|
||
|
//+-------------------------------------------------------------------------
|
||
|
//
|
||
|
// Method: CWordBreaker::AddRef
|
||
|
//
|
||
|
// Synopsis: Increments refcount
|
||
|
//
|
||
|
//--------------------------------------------------------------------------
|
||
|
|
||
|
ULONG STDMETHODCALLTYPE
|
||
|
CWordBreaker::AddRef()
|
||
|
{
|
||
|
return InterlockedIncrement( &_cRefs );
|
||
|
}
|
||
|
|
||
|
//+-------------------------------------------------------------------------
|
||
|
//
|
||
|
// Method: CWordBreaker::Release
|
||
|
//
|
||
|
// Synopsis: Decrement refcount. Delete if necessary.
|
||
|
//
|
||
|
//--------------------------------------------------------------------------
|
||
|
|
||
|
ULONG STDMETHODCALLTYPE
|
||
|
CWordBreaker::Release()
|
||
|
{
|
||
|
unsigned long uTmp = InterlockedDecrement( &_cRefs );
|
||
|
|
||
|
if ( 0 == uTmp )
|
||
|
delete this;
|
||
|
|
||
|
|
||
|
return(uTmp);
|
||
|
}
|
||
|
|
||
|
//+-------------------------------------------------------------------------
|
||
|
//
|
||
|
// Method: CWordBreaker::Init
|
||
|
//
|
||
|
// Synopsis: Initialize word-breaker
|
||
|
//
|
||
|
// Arguments: [fQuery] -- TRUE if query-time
|
||
|
// [ulMaxTokenSize] -- Maximum size token stored by caller
|
||
|
// [pfLicense] -- Set to true if use restricted
|
||
|
//
|
||
|
// Returns: Status code
|
||
|
//
|
||
|
//--------------------------------------------------------------------------
|
||
|
|
||
|
SCODE STDMETHODCALLTYPE
|
||
|
CWordBreaker::Init(
|
||
|
BOOL fQuery,
|
||
|
ULONG ulMaxTokenSize,
|
||
|
BOOL *pfLicense )
|
||
|
{
|
||
|
if ( NULL == pfLicense )
|
||
|
return E_INVALIDARG;
|
||
|
|
||
|
|
||
|
if (IsBadWritePtr(pfLicense, sizeof(DWORD)))
|
||
|
return E_INVALIDARG;
|
||
|
|
||
|
if ( !StemInit() )
|
||
|
return LANGUAGE_E_DATABASE_NOT_FOUND;
|
||
|
|
||
|
*pfLicense = TRUE;
|
||
|
_fQuery = fQuery;
|
||
|
_ulMaxTokenSize = ulMaxTokenSize;
|
||
|
|
||
|
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Member: CWordBreaker::ComposePhrase
|
||
|
//
|
||
|
// Synopsis: Convert a noun and a modifier into a phrase.
|
||
|
//
|
||
|
// Arguments: [pwcNoun] -- pointer to noun.
|
||
|
// [cwcNoun] -- count of chars in pwcNoun
|
||
|
// [pwcModifier] -- pointer to word modifying pwcNoun
|
||
|
// [cwcModifier] -- count of chars in pwcModifier
|
||
|
// [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
SCODE STDMETHODCALLTYPE
|
||
|
CWordBreaker::ComposePhrase(
|
||
|
WCHAR const *pwcNoun,
|
||
|
ULONG cwcNoun,
|
||
|
WCHAR const *pwcModifier,
|
||
|
ULONG cwcModifier,
|
||
|
ULONG ulAttachmentType,
|
||
|
WCHAR *pwcPhrase,
|
||
|
ULONG *pcwcPhrase )
|
||
|
{
|
||
|
//
|
||
|
// Need to code in later
|
||
|
//
|
||
|
if ( _fQuery )
|
||
|
return( E_NOTIMPL );
|
||
|
else
|
||
|
return ( WBREAK_E_QUERY_ONLY );
|
||
|
}
|
||
|
|
||
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Member: CWordBreaker::GetLicenseToUse
|
||
|
//
|
||
|
// Synopsis: Returns a pointer to vendors license information
|
||
|
//
|
||
|
// Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
SCODE STDMETHODCALLTYPE
|
||
|
CWordBreaker::GetLicenseToUse(
|
||
|
const WCHAR **ppwcsLicense )
|
||
|
{
|
||
|
|
||
|
static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
|
||
|
|
||
|
|
||
|
if ( NULL == ppwcsLicense ) {
|
||
|
return E_INVALIDARG;
|
||
|
}
|
||
|
|
||
|
if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
|
||
|
return E_INVALIDARG;
|
||
|
}
|
||
|
|
||
|
*ppwcsLicense = wcsCopyright;
|
||
|
return( S_OK );
|
||
|
}
|
||
|
|
||
|
|
||
|
//+---------------------------------------------------------------------------
|
||
|
//
|
||
|
// Member: CWordBreaker::BreakText
|
||
|
//
|
||
|
// Synopsis: Break input stream into words.
|
||
|
//
|
||
|
// Arguments: [pTextSource] -- source of Unicode text
|
||
|
// [pWordSink] -- sink for collecting words
|
||
|
// [pPhraseSink] -- sink for collecting phrases
|
||
|
//
|
||
|
// History: 10-Sep-1997, WeibZ, Created.
|
||
|
//
|
||
|
// Notes: Since the input buffer may be greater than MAX_II_BUFFER_LEN
|
||
|
// we process the buffer in chunks of length MAX_II_BUFFER_LEN.
|
||
|
//
|
||
|
//----------------------------------------------------------------------------
|
||
|
|
||
|
SCODE STDMETHODCALLTYPE CWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
|
||
|
IWordSink *pWordSink,
|
||
|
IPhraseSink *pPhraseSink )
|
||
|
{
|
||
|
SCODE sc = S_OK;
|
||
|
ULONG cwc;
|
||
|
WT Type;
|
||
|
BOOL Ret_ProcToken;
|
||
|
|
||
|
|
||
|
|
||
|
if ( NULL == pTextSource ) {
|
||
|
// OutputDebugString("\nPTextSources is Null\n");
|
||
|
return E_INVALIDARG;
|
||
|
}
|
||
|
|
||
|
if ( NULL == pWordSink )
|
||
|
{
|
||
|
// BUGBUG, propagate the null word sink error code
|
||
|
return sc;
|
||
|
}
|
||
|
|
||
|
// BUGBUG, need to normalize nums within T-Hammer, pass as flag?
|
||
|
|
||
|
// turn on noun phrase analysis if there is a phrase sink
|
||
|
if ( 0 != pPhraseSink )
|
||
|
{
|
||
|
// BUGBUG, do we need to pass a separate flag to T-Hammer for this?
|
||
|
// ignore the phrase sink for now
|
||
|
// return sc;
|
||
|
}
|
||
|
|
||
|
if (pTextSource->iEnd == pTextSource->iCur) {
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
Assert( pTextSource->iCur < pTextSource->iEnd );
|
||
|
|
||
|
|
||
|
__try
|
||
|
{
|
||
|
do
|
||
|
{
|
||
|
while ( pTextSource->iCur < pTextSource->iEnd )
|
||
|
{
|
||
|
|
||
|
cwc = pTextSource->iEnd - pTextSource->iCur;
|
||
|
|
||
|
Tokenize( cwc, TRUE, pTextSource, &Type);
|
||
|
|
||
|
if ( Type != WT_REACHEND )
|
||
|
{
|
||
|
Ret_ProcToken = ProcessTokens( pTextSource, Type,
|
||
|
pWordSink, pPhraseSink );
|
||
|
|
||
|
if ( !Ret_ProcToken ) {
|
||
|
// Process_Tokens return FALSE, so return here
|
||
|
return E_UNEXPECTED;
|
||
|
}
|
||
|
|
||
|
pTextSource->iCur += _cchTextProcessed;
|
||
|
}
|
||
|
else
|
||
|
break;
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
} while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
|
||
|
|
||
|
while ( pTextSource->iCur < pTextSource->iEnd )
|
||
|
{
|
||
|
|
||
|
cwc = pTextSource->iEnd - pTextSource->iCur;
|
||
|
|
||
|
Tokenize( cwc, FALSE, pTextSource, &Type);
|
||
|
|
||
|
Ret_ProcToken = ProcessTokens( pTextSource, Type,
|
||
|
pWordSink, pPhraseSink );
|
||
|
|
||
|
if ( !Ret_ProcToken ) {
|
||
|
// Process_Tokens return FALSE, so return here
|
||
|
return E_UNEXPECTED;
|
||
|
}
|
||
|
|
||
|
pTextSource->iCur += _cchTextProcessed;
|
||
|
}
|
||
|
|
||
|
} __except(1) {
|
||
|
|
||
|
sc = E_UNEXPECTED;
|
||
|
}
|
||
|
|
||
|
return sc;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
void CWordBreaker::Tokenize( unsigned cwc,
|
||
|
BOOL bMoreText,
|
||
|
TEXT_SOURCE *pTextSource,
|
||
|
WT *Type)
|
||
|
{
|
||
|
|
||
|
ULONG i;
|
||
|
BYTE ct;
|
||
|
BOOL fRomanWord = FALSE;
|
||
|
BOOL fHanguelWord = FALSE;
|
||
|
CONST WCHAR *pwcInput, *pwcStem;
|
||
|
|
||
|
|
||
|
_cchTextProcessed = 0;
|
||
|
*Type = WT_START;
|
||
|
|
||
|
pwcStem = pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
|
||
|
|
||
|
for (i=0; i< cwc; i++, pwcInput++) {
|
||
|
|
||
|
ct = GetCharType(*pwcInput);
|
||
|
|
||
|
if ( (ct != WS) && (ct != PS) && (ct != HG) )
|
||
|
ct = CH;
|
||
|
|
||
|
|
||
|
switch (ct) {
|
||
|
case CH :
|
||
|
// check to see if there is a Hanguel word before this char
|
||
|
if (fHanguelWord) {
|
||
|
_cchTextProcessed = (DWORD)(pwcInput - pwcStem);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (!fRomanWord) {
|
||
|
pwcStem = pwcInput;
|
||
|
fRomanWord = TRUE;
|
||
|
*Type = WT_ROMAJI;
|
||
|
}
|
||
|
break;
|
||
|
case HG :
|
||
|
|
||
|
// check to see if there is an English word before this char
|
||
|
if ( fRomanWord ) {
|
||
|
_cchTextProcessed = (DWORD)(pwcInput - pwcStem);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (!fHanguelWord) {
|
||
|
pwcStem = pwcInput;
|
||
|
fHanguelWord = TRUE;
|
||
|
*Type = WT_HANGUEL;
|
||
|
}
|
||
|
break;
|
||
|
case WS :
|
||
|
if (fRomanWord || fHanguelWord) {
|
||
|
_cchTextProcessed = (DWORD)(pwcInput - pwcStem);
|
||
|
return;
|
||
|
}
|
||
|
*Type = WT_WORD_SEP;
|
||
|
_cchTextProcessed = 1;
|
||
|
return;
|
||
|
|
||
|
case PS :
|
||
|
if (fRomanWord || fHanguelWord) {
|
||
|
_cchTextProcessed = (DWORD)(pwcInput - pwcStem);
|
||
|
return;
|
||
|
}
|
||
|
*Type = WT_PHRASE_SEP;
|
||
|
_cchTextProcessed = 1;
|
||
|
return;
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( bMoreText ) {
|
||
|
_cchTextProcessed = 0;
|
||
|
*Type = WT_REACHEND;
|
||
|
}
|
||
|
else
|
||
|
_cchTextProcessed = cwc;
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
BOOL CWordBreaker::ProcessTokens( TEXT_SOURCE *pTextSource,
|
||
|
WT Type,
|
||
|
IWordSink *pWordSink,
|
||
|
IPhraseSink *pPhraseSink )
|
||
|
{
|
||
|
|
||
|
CONST WCHAR *pwcStem;
|
||
|
|
||
|
if ( Type == WT_PHRASE_SEP)
|
||
|
{
|
||
|
pWordSink->PutBreak (WORDREP_BREAK_EOS);
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
if ( Type == WT_ROMAJI)
|
||
|
{
|
||
|
ULONG i;
|
||
|
|
||
|
pwcStem = pTextSource->awcBuffer + pTextSource->iCur;
|
||
|
|
||
|
#ifdef KORDBG
|
||
|
OutputDebugString("\n");
|
||
|
for (i=0; i< _cchTextProcessed; i++)
|
||
|
{
|
||
|
char ctmp[2];
|
||
|
|
||
|
ctmp[0] = pwcStem[i] & 0xff;
|
||
|
ctmp[1] = '\0';
|
||
|
OutputDebugString(ctmp);
|
||
|
}
|
||
|
OutputDebugString(" ");
|
||
|
#endif
|
||
|
|
||
|
(pWordSink->PutWord)(_cchTextProcessed,
|
||
|
pwcStem,
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
if ( Type == WT_HANGUEL )
|
||
|
{
|
||
|
WCHAR TokenWord[80];
|
||
|
ULONG i;
|
||
|
WDOB sob;
|
||
|
|
||
|
// EnterCriticalSection(&ThCritSect);
|
||
|
|
||
|
sob.wordlist = (LPWSTR)LocalAlloc(LPTR, 200);
|
||
|
sob.sch = 200;
|
||
|
|
||
|
|
||
|
if (sob.wordlist == NULL )
|
||
|
return FALSE;
|
||
|
|
||
|
pwcStem = pTextSource->awcBuffer + pTextSource->iCur;
|
||
|
|
||
|
|
||
|
for (i=0; i<_cchTextProcessed; i++)
|
||
|
{
|
||
|
#ifdef KORDBG
|
||
|
WORD wtmp;
|
||
|
char ctmp[80];
|
||
|
|
||
|
wtmp = pwcStem[i];
|
||
|
sprintf(ctmp, "%4x ", wtmp);
|
||
|
OutputDebugString(ctmp);
|
||
|
#endif
|
||
|
|
||
|
TokenWord[i] = pwcStem[i];
|
||
|
}
|
||
|
|
||
|
TokenWord[_cchTextProcessed] = L'\0';
|
||
|
|
||
|
#ifdef KORDBG
|
||
|
OutputDebugString("\nBefore StemmerDecomposeW\n");
|
||
|
#endif
|
||
|
|
||
|
if (StemmerDecomposeW(g_hStm, TokenWord, &sob) == NULL)
|
||
|
{
|
||
|
ULONG wInLexLen;
|
||
|
WORD winfo;
|
||
|
ULONG num, len, j, k;
|
||
|
WCHAR *pWordList, *pVerb;
|
||
|
ULONG NumEf;
|
||
|
BOOL fExist;
|
||
|
WCHAR *pwszStart[MAXFORMS];
|
||
|
|
||
|
NumEf = 0;
|
||
|
|
||
|
do
|
||
|
{
|
||
|
num = sob.num;
|
||
|
pWordList = sob.wordlist;
|
||
|
|
||
|
for (j=0; j<num; j++)
|
||
|
{
|
||
|
len = wcslen(pWordList);
|
||
|
memcpy(&winfo,pWordList+len+1,2);
|
||
|
|
||
|
switch (winfo & 0x0f00)
|
||
|
{
|
||
|
case POS_NOUN :
|
||
|
case POS_ADJECTIVE :
|
||
|
case POS_PRONOUN :
|
||
|
case POS_ADVERB :
|
||
|
case POS_NUMBER :
|
||
|
|
||
|
fExist = FALSE;
|
||
|
for (k=0; k<NumEf; k++)
|
||
|
{
|
||
|
if ( wcscmp(pWordList,pwszStart[k]) == 0 )
|
||
|
fExist = TRUE;
|
||
|
}
|
||
|
|
||
|
if ( !fExist ) // this stem does not exist so far.
|
||
|
{
|
||
|
// it will contain: WordList 00 Winfo, so the length
|
||
|
// should be len + 2 = len + 1 + 1.
|
||
|
|
||
|
pwszStart[NumEf]=(LPWSTR)LocalAlloc(LPTR,(len+2)*sizeof(WCHAR));
|
||
|
|
||
|
if ( pwszStart[NumEf] == NULL )
|
||
|
{
|
||
|
// alloc error, so return here
|
||
|
if (sob.wordlist)
|
||
|
LocalFree(sob.wordlist);
|
||
|
|
||
|
for (k=0; k<NumEf; k++)
|
||
|
{
|
||
|
if ( pwszStart[k] != NULL)
|
||
|
LocalFree(pwszStart[k]);
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
wcscpy(pwszStart[NumEf], pWordList);
|
||
|
pwszStart[NumEf][len] = L'\0';
|
||
|
pwszStart[NumEf][len+1] = winfo & 0x0f00 ;
|
||
|
|
||
|
NumEf++;
|
||
|
|
||
|
}
|
||
|
|
||
|
break;
|
||
|
|
||
|
case POS_VERB : // for Verb, we will handle it specially.
|
||
|
// Append a flag char Da (U+B2E4) to
|
||
|
// the root form of a verb word.
|
||
|
|
||
|
fExist = FALSE;
|
||
|
|
||
|
|
||
|
// it will contain: WordList <Da> 00.
|
||
|
// so the length should be len + 2 = len + 1 + 1.
|
||
|
|
||
|
pVerb = (LPWSTR)LocalAlloc(LPTR, (len+2)*sizeof(WCHAR));
|
||
|
|
||
|
if (pVerb == NULL )
|
||
|
{
|
||
|
if (sob.wordlist)
|
||
|
LocalFree(sob.wordlist);
|
||
|
|
||
|
for (k=0; k<NumEf; k++)
|
||
|
{
|
||
|
if ( pwszStart[k] != NULL )
|
||
|
LocalFree(pwszStart[k]);
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
wcscpy(pVerb, pWordList);
|
||
|
|
||
|
pVerb[len] = VERBCHAR;
|
||
|
pVerb[len+1] = L'\0';
|
||
|
|
||
|
for (k=0; k<NumEf; k++)
|
||
|
{
|
||
|
if ( wcscmp(pVerb,pwszStart[k]) == 0 )
|
||
|
fExist = TRUE;
|
||
|
}
|
||
|
|
||
|
if ( !fExist ) // this stem does not exist so far.
|
||
|
{
|
||
|
|
||
|
// it will contain: Wordlist <Da> 00 Winfo.
|
||
|
// so the length should be len+3 = len + 1 + 1 + 1.
|
||
|
|
||
|
pwszStart[NumEf]=(LPWSTR)LocalAlloc(LPTR,(len+3)*sizeof(WCHAR));
|
||
|
|
||
|
if ( pwszStart[NumEf] == NULL )
|
||
|
{
|
||
|
// alloc error, so return here
|
||
|
if (sob.wordlist)
|
||
|
LocalFree(sob.wordlist);
|
||
|
|
||
|
if ( pVerb != NULL )
|
||
|
LocalFree(pVerb);
|
||
|
|
||
|
for (k=0; k<NumEf; k++)
|
||
|
{
|
||
|
if ( pwszStart[k] != NULL)
|
||
|
LocalFree(pwszStart[k]);
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
wcscpy(pwszStart[NumEf], pVerb);
|
||
|
pwszStart[NumEf][len+1] = L'\0';
|
||
|
pwszStart[NumEf][len+2] = winfo & 0x0f00;
|
||
|
|
||
|
NumEf++;
|
||
|
}
|
||
|
|
||
|
if ( pVerb != NULL )
|
||
|
LocalFree(pVerb);
|
||
|
|
||
|
break;
|
||
|
|
||
|
case POS_AUXVERB :
|
||
|
case POS_AUXADJ :
|
||
|
case POS_SPECIFIER :
|
||
|
case POS_PREFIX :
|
||
|
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
pWordList += len + 3;
|
||
|
}
|
||
|
} while (StemmerDecomposeMoreW(g_hStm, TokenWord, &sob) == NULL) ;
|
||
|
|
||
|
//
|
||
|
// BUGBUG: Can this legitimately happen? We're seeing it indexing microsoft.com.
|
||
|
//
|
||
|
|
||
|
if ( 0 == NumEf )
|
||
|
{
|
||
|
#if DBG == 1
|
||
|
OutputDebugString( "BOGUS WORD: " );
|
||
|
|
||
|
for ( WCHAR * pwc = &TokenWord[0]; 0 != *pwc; pwc++ )
|
||
|
{
|
||
|
char ctmp[6];
|
||
|
sprintf(ctmp, "%4x ", *pwc);
|
||
|
OutputDebugString(ctmp);
|
||
|
}
|
||
|
|
||
|
OutputDebugString( "\n" );
|
||
|
#endif
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
if ( !_fQuery )
|
||
|
{
|
||
|
for (i=0; i< (NumEf-1); i++ )
|
||
|
{
|
||
|
wInLexLen = wcslen(pwszStart[i]);
|
||
|
pWordSink->PutAltWord(wInLexLen,
|
||
|
pwszStart[i],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
// handle the last one.
|
||
|
wInLexLen = wcslen(pwszStart[NumEf-1]);
|
||
|
pWordSink->PutWord(wInLexLen,
|
||
|
pwszStart[NumEf-1],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ( NumEf == 1 )
|
||
|
{
|
||
|
// handle this only one.
|
||
|
wInLexLen = wcslen(pwszStart[NumEf-1]);
|
||
|
pWordSink->PutWord(wInLexLen,
|
||
|
pwszStart[NumEf-1],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ULONG uNum_Noun;
|
||
|
ULONG uIndex[MAXFORMS];
|
||
|
|
||
|
uNum_Noun = 0;
|
||
|
|
||
|
for (i=0; i<NumEf; i++)
|
||
|
{
|
||
|
wInLexLen = wcslen(pwszStart[i]);
|
||
|
|
||
|
if ( (pwszStart[i][wInLexLen+1] == POS_NOUN) ||
|
||
|
(pwszStart[i][wInLexLen+1] == POS_PRONOUN) ||
|
||
|
(pwszStart[i][wInLexLen+1] == POS_NUMBER) )
|
||
|
{
|
||
|
uIndex[uNum_Noun] = i;
|
||
|
uNum_Noun ++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if ( uNum_Noun == 0 )
|
||
|
{
|
||
|
// there is no Noun form
|
||
|
|
||
|
for (i=0; i< (NumEf-1); i++ )
|
||
|
{
|
||
|
wInLexLen = wcslen(pwszStart[i]);
|
||
|
pWordSink->PutAltWord(wInLexLen,
|
||
|
pwszStart[i],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
// handle the last one.
|
||
|
wInLexLen = wcslen(pwszStart[NumEf-1]);
|
||
|
pWordSink->PutWord(wInLexLen,
|
||
|
pwszStart[NumEf-1],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
if (uNum_Noun == 1)
|
||
|
{
|
||
|
// there is only One Noun, and we just use this one to query.
|
||
|
ULONG index;
|
||
|
|
||
|
index = uIndex[0];
|
||
|
wInLexLen = wcslen(pwszStart[index]);
|
||
|
pWordSink->PutWord(wInLexLen,
|
||
|
pwszStart[index],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
if ( uNum_Noun > 1 )
|
||
|
{
|
||
|
// there are more than one Noun, Use all those Noun to query.
|
||
|
|
||
|
ULONG index;
|
||
|
|
||
|
for (i=0; i<uNum_Noun-1; i++)
|
||
|
{
|
||
|
index = uIndex[i];
|
||
|
wInLexLen = wcslen(pwszStart[index]);
|
||
|
pWordSink->PutAltWord(wInLexLen,
|
||
|
pwszStart[index],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
// handle the last Noun.
|
||
|
|
||
|
index = uIndex[uNum_Noun-1];
|
||
|
wInLexLen = wcslen(pwszStart[index]);
|
||
|
pWordSink->PutWord(wInLexLen,
|
||
|
pwszStart[index],
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#ifdef KORDBG
|
||
|
{
|
||
|
char ctmp[80];
|
||
|
|
||
|
OutputDebugString("\nStemmerDecomposeW Correct\n");
|
||
|
OutputDebugString(" the Num of Stemm is ");
|
||
|
sprintf(ctmp, "%4x ", NumEf);
|
||
|
OutputDebugString(ctmp);
|
||
|
OutputDebugString("\n");
|
||
|
}
|
||
|
|
||
|
for (i=0; i< NumEf; i++)
|
||
|
{
|
||
|
WORD wtmp;
|
||
|
char ctmp[80];
|
||
|
|
||
|
wInLexLen = wcslen(pwszStart[i]);
|
||
|
for (j=0; j<wInLexLen; j++)
|
||
|
{
|
||
|
wtmp = pwszStart[i][j];
|
||
|
sprintf(ctmp, "%4x ", wtmp);
|
||
|
OutputDebugString(ctmp);
|
||
|
}
|
||
|
OutputDebugString(" Type: ");
|
||
|
|
||
|
wtmp = pwszStart[i][wInLexLen+1];
|
||
|
wtmp = wtmp & 0x0f00;
|
||
|
|
||
|
sprintf(ctmp, "%4x \n", wtmp);
|
||
|
OutputDebugString(ctmp);
|
||
|
}
|
||
|
|
||
|
OutputDebugString("\n");
|
||
|
#endif
|
||
|
|
||
|
// Free the memory.
|
||
|
for (i=0; i<NumEf; i++)
|
||
|
{
|
||
|
if ( pwszStart[i] != NULL )
|
||
|
LocalFree(pwszStart[i]);
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
#ifdef KORDBG
|
||
|
OutputDebugString("\nStemmerDecomposeW NOT right\n");
|
||
|
for (i=0; i<_cchTextProcessed; i++)
|
||
|
{
|
||
|
WORD wtmp;
|
||
|
char ctmp[80];
|
||
|
|
||
|
wtmp = TokenWord[i];
|
||
|
sprintf(ctmp, "%4x ", wtmp);
|
||
|
OutputDebugString(ctmp);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
pWordSink->PutWord(_cchTextProcessed,
|
||
|
TokenWord,
|
||
|
_cchTextProcessed,
|
||
|
pTextSource->iCur);
|
||
|
}
|
||
|
|
||
|
LocalFree(sob.wordlist);
|
||
|
// LeaveCriticalSection (&ThCritSect);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|