617 lines
17 KiB
C++
617 lines
17 KiB
C++
//+---------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1991 - 2000
//
//  File:       KEYMAK.CXX
//
//  Contents:   Key maker
//
//  Classes:    CKeyMaker
//
//  History:    31-Jan-92   BartoszM    Created
//              24-Apr-95   SitaramR    Removed US/Fake stemmer and added
//                                      Infosoft stemmer
//
//  Notes:      The filtering pipeline is hidden in the Data Repository
//              object which serves as a sink for the filter.
//              The sink for the Data Repository is the Key Repository.
//              The language dependent part of the pipeline
//              is obtained from the Language List object and is called
//              Language Dependent Key Maker. It consists of:
//
//                  Word Breaker
//                  Stemmer (optional)
//                  Normalizer
//                  Noise List
//
//              Each object serves as a sink for its predecessor,
//              Key Repository is the final sink.
//
//----------------------------------------------------------------------------
|
|
|
|
#include <pch.cxx>
|
|
#pragma hdrstop
|
|
|
|
#include <lang.hxx>
|
|
#include <keymak.hxx>
|
|
#include <noise.hxx>
|
|
#include <norm.hxx>
|
|
#include <stemsink.hxx>
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CKeyMaker::CKeyMaker
|
|
//
|
|
// Synopsis: Constructs a language-dependant key maker object
|
|
//
|
|
// Effects: Creates a noiselist, normalizer and borrows a wordbreaker, stemmer
|
|
//
|
|
// Arguments: [locale] -- language locale
|
|
// [krep] -- key repository to place completed keys in
|
|
// [pPhraseSink] -- sink for collecting phrases
|
|
// [fQuery] -- true if this is during querying
|
|
// [ulFuzzy] -- fuzzy level of query
|
|
//
|
|
// History: 05-June-91 t-WadeR Created.
|
|
// 12-Oct-92 AmyA Added Unicode support
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CKeyMaker::CKeyMaker( LCID locale,
|
|
PROPID pid,
|
|
PKeyRepository& krep,
|
|
IPhraseSink *pPhraseSink,
|
|
BOOL fQuery,
|
|
ULONG ulFuzzy,
|
|
CLangList & langList )
|
|
: _pPhraseSink(pPhraseSink),
|
|
_fQuery( fQuery ),
|
|
_sLang( locale, pid, &langList, fQuery ? LANG_LOAD_ALL : LANG_LOAD_NO_STEMMER ),
|
|
_lcid( locale ),
|
|
_pid( pid )
|
|
{
|
|
krep.GetSourcePosBuffers (&_pcwcSrcPos, &_pcwcSrcLen );
|
|
|
|
CStringTable* noiseTable;
|
|
|
|
//
|
|
// Don't remove noise words if we're doing prefix matching. The noise
|
|
// *word* is potentially only a prefix for a non-noise word.
|
|
//
|
|
|
|
if (GENERATE_METHOD_PREFIX == ulFuzzy )
|
|
noiseTable = 0;
|
|
else
|
|
noiseTable = _sLang->GetNoiseTable();
|
|
|
|
if ( noiseTable != 0 )
|
|
_xNoiseList.Set( new CNoiseList( *noiseTable, krep ) );
|
|
else
|
|
_xNoiseList.Set( new CNoiseListEmpty( krep, ulFuzzy ) );
|
|
|
|
_xWordRep.Set( new CNormalizer( _xNoiseList.GetReference() ) );
|
|
|
|
// Get Normalizer's buffer length
|
|
_cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();
|
|
|
|
// get stemmer (optional)
|
|
if ( ulFuzzy == GENERATE_METHOD_STEMMED )
|
|
{
|
|
IStemmer *pStemmer = _sLang->GetStemmer();
|
|
|
|
if ( pStemmer )
|
|
{
|
|
BOOL fCopyright;
|
|
SCODE sc = pStemmer->Init( _cwcMaxNormBuf, &fCopyright );
|
|
|
|
if ( FAILED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IStemmer::Init returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
|
|
if ( fCopyright )
|
|
{
|
|
WCHAR const * pLicense;
|
|
sc = pStemmer->GetLicenseToUse( &pLicense );
|
|
|
|
if ( SUCCEEDED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
|
|
}
|
|
else
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IStemmer::GetLicenseToUse returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
}
|
|
|
|
_xWordRep2.Set( _xWordRep.Acquire() );
|
|
_xWordRep.Set( new CStemmerSink( pStemmer, _xWordRep2.GetReference() ) );
|
|
}
|
|
else
|
|
{
|
|
ciDebugOut(( DEB_ERROR,
|
|
"Fuzzy2 query, but no stemmer available for locale 0x%x\n",
|
|
locale ));
|
|
}
|
|
}
|
|
|
|
//
|
|
// Initialize word breaker
|
|
//
|
|
_pWBreak = _sLang->GetWordBreaker();
|
|
|
|
Win4Assert( _pWBreak );
|
|
|
|
BOOL fCopyright;
|
|
SCODE sc = _pWBreak->Init( fQuery, _cwcMaxNormBuf, &fCopyright );
|
|
|
|
if ( FAILED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
|
|
if ( fCopyright )
|
|
{
|
|
WCHAR const * pLicense;
|
|
sc = _pWBreak->GetLicenseToUse( &pLicense );
|
|
|
|
if ( SUCCEEDED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
|
|
}
|
|
else
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
}
|
|
} //CKeyMaker
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CKeyMaker::CKeyMaker
|
|
//
|
|
// Synopsis: Constructs key maker for noise word list initialization.
|
|
//
|
|
// Arguments: [pWBreak] -- word breaker
|
|
// [Noise] -- noise word list
|
|
//
|
|
// History: 05-June-91 t-WadeR Created.
|
|
// 12-Oct-92 AmyA Added Unicode support
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CKeyMaker::CKeyMaker( IWordBreaker * pWBreak, PNoiseList & Noise )
|
|
: _pWBreak( pWBreak ),
|
|
_pPhraseSink(0),
|
|
_fQuery(FALSE)
|
|
{
|
|
_xWordRep.Set( new CNormalizer( Noise ) );
|
|
|
|
// Get Normalizer's buffer length
|
|
_cwcMaxNormBuf = _xWordRep->GetMaxBufferLen();
|
|
|
|
_pcwcSrcPos = 0; // We don't use them!
|
|
_pcwcSrcLen = 0;
|
|
|
|
//
|
|
// Initialize word breaker
|
|
//
|
|
Win4Assert( _pWBreak );
|
|
|
|
BOOL fCopyright;
|
|
SCODE sc = _pWBreak->Init( FALSE, _cwcMaxNormBuf, &fCopyright );
|
|
|
|
if ( FAILED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IWordBreaker::Init returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
|
|
if ( fCopyright )
|
|
{
|
|
WCHAR const * pLicense;
|
|
sc = _pWBreak->GetLicenseToUse( &pLicense );
|
|
|
|
if ( SUCCEEDED(sc) )
|
|
{
|
|
ciDebugOut(( DEB_WORDS, "%ws\n", pLicense ));
|
|
}
|
|
else
|
|
{
|
|
ciDebugOut(( DEB_ERROR, "IWordBreaker::GetLicenseToUse returned 0x%x\n", sc ));
|
|
THROW( CException( sc ) );
|
|
}
|
|
}
|
|
} //CKeyMaker
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CKeyMaker::~CKeyMaker
|
|
//
|
|
// Synopsis: destroys a key maker object
|
|
//
|
|
// History: 05-June-91 t-WadeR Created.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
CKeyMaker::~CKeyMaker()
|
|
{
|
|
}
|
|
|
|
//
|
|
// The following are needed to make midl happy. There are no other interfaces
|
|
// to bind to. Inheritance from IUnknown is unnecessary.
|
|
//
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::QueryInterface(REFIID riid, void * * ppvObject)
|
|
{
|
|
*ppvObject = 0;
|
|
return( E_NOTIMPL );
|
|
}
|
|
|
|
//
// AddRef: no reference counting is performed; always reports 1.
//

ULONG STDMETHODCALLTYPE CKeyMaker::AddRef()
{
    return 1;
}
|
|
|
|
//
// Release: no reference counting is performed; always reports 1 and never
// deletes the object.
//

ULONG STDMETHODCALLTYPE CKeyMaker::Release()
{
    return 1;
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::PutWord
|
|
//
|
|
// Synopsis: Store word in word repository
|
|
//
|
|
// Arguments: [cwc] -- Count of characters in [pwcInBuf]
|
|
// [pwcInBuf] -- Word
|
|
// [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
|
|
// [cwcSrcPos] -- position of word in pTextSource buffer
|
|
//
|
|
// History: 19-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::PutWord( ULONG cwc,
|
|
WCHAR const *pwcInBuf,
|
|
ULONG cwcSrcLen,
|
|
ULONG cwcSrcPos )
|
|
{
|
|
SCODE sc = S_OK;
|
|
|
|
// validate PutWord call
|
|
if ( !_altWordsEnforcer.IsPutWordOk() )
|
|
{
|
|
Win4Assert( !"CKeyMaker::PutWord - invalid state" );
|
|
ciDebugOut(( DEB_ITRACE, "PutWord: %.*ws\n", cwc, pwcInBuf ));
|
|
|
|
return E_FAIL;
|
|
}
|
|
|
|
CTranslateSystemExceptions translate;
|
|
|
|
TRY
|
|
{
|
|
if ( cwc > _cwcMaxNormBuf )
|
|
{
|
|
sc = LANGUAGE_S_LARGE_WORD;
|
|
cwc = _cwcMaxNormBuf;
|
|
}
|
|
|
|
if ( cwc > 0 )
|
|
{
|
|
#if CIDBG == 1
|
|
if ( ciInfoLevel & DEB_WORDS )
|
|
{
|
|
//
|
|
// Check for 'printable' characters.
|
|
//
|
|
|
|
BOOL fOk = TRUE;
|
|
|
|
for ( unsigned i = 0; i < cwc; i++ )
|
|
{
|
|
if ( pwcInBuf[i] > 0xFF )
|
|
{
|
|
fOk = FALSE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if ( fOk )
|
|
ciDebugOut(( DEB_WORDS,
|
|
"PutWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
|
|
cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
|
|
else
|
|
{
|
|
ciDebugOut(( DEB_WORDS, "PutWord:" ));
|
|
|
|
for ( i = 0; i < cwc; i++ )
|
|
ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME, " %04X", pwcInBuf[i] ));
|
|
|
|
ciDebugOut(( DEB_WORDS | DEB_NOCOMPNAME,
|
|
" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
|
|
_xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
|
|
}
|
|
}
|
|
#endif // CIDBG
|
|
|
|
//
|
|
// No internal call to PutAltWord for performance reasons.
|
|
//
|
|
|
|
if (0 != _pcwcSrcPos)
|
|
{
|
|
Win4Assert ( 0 != _pcwcSrcLen );
|
|
*_pcwcSrcLen = cwcSrcLen;
|
|
*_pcwcSrcPos = cwcSrcPos;
|
|
}
|
|
|
|
_xWordRep->ProcessWord( pwcInBuf, cwc );
|
|
}
|
|
}
|
|
CATCH( CException, e )
|
|
{
|
|
sc = e.GetErrorCode();
|
|
}
|
|
END_CATCH;
|
|
|
|
return sc;
|
|
} //PutWord
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::PutAltWord
|
|
//
|
|
// Synopsis: Store alternate word in word repository.
|
|
//
|
|
// Effects: Identical to PutWord except occurrence count is not
|
|
// incremented.
|
|
//
|
|
// Arguments: [cwc] -- Count of characters in [pwcInBuf]
|
|
// [pwcInBuf] -- Word
|
|
// [cwcSrcLen] -- count of characters in pTextSource buffer (see IWordBreaker::BreakText)
|
|
// [cwcSrcPos] -- position of word in pTextSource buffer
|
|
//
|
|
// History: 19-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::PutAltWord( ULONG cwc,
|
|
WCHAR const *pwcInBuf,
|
|
ULONG cwcSrcLen,
|
|
ULONG cwcSrcPos )
|
|
{
|
|
SCODE sc = S_OK;
|
|
|
|
// validate PutWord call
|
|
|
|
if ( !_altWordsEnforcer.IsPutAltWordOk() )
|
|
{
|
|
Win4Assert( !"CKeyMaker::PutAltWord - invalid state" );
|
|
ciDebugOut(( DEB_ITRACE, "PutAltWord: %.*ws\n", cwc, pwcInBuf ));
|
|
|
|
return E_FAIL;
|
|
}
|
|
|
|
CTranslateSystemExceptions translate;
|
|
|
|
TRY
|
|
{
|
|
//
|
|
// What is to be done if two large, alternate words end up with the
|
|
// same (truncated) prefix after truncation ?
|
|
// This is fixed in Babylon and isn't a problem here.
|
|
//
|
|
if ( cwc > _cwcMaxNormBuf )
|
|
{
|
|
sc = LANGUAGE_S_LARGE_WORD;
|
|
cwc = _cwcMaxNormBuf;
|
|
}
|
|
|
|
if ( cwc > 0 )
|
|
{
|
|
ciDebugOut(( DEB_WORDS,
|
|
"PutAltWord: \"%.*ws\" Occ = %d cwcSrcLen = %d, cwcSrcPos = %d\n",
|
|
cwc, pwcInBuf, _xWordRep->GetOccurrence(), cwcSrcLen, cwcSrcPos ));
|
|
|
|
if (0 != _pcwcSrcPos)
|
|
{
|
|
Win4Assert ( 0 != _pcwcSrcLen );
|
|
*_pcwcSrcLen = cwcSrcLen;
|
|
*_pcwcSrcPos = cwcSrcPos;
|
|
}
|
|
|
|
_xWordRep->ProcessAltWord( pwcInBuf, cwc );
|
|
}
|
|
}
|
|
CATCH( CException, e )
|
|
{
|
|
sc = e.GetErrorCode();
|
|
}
|
|
END_CATCH;
|
|
|
|
return sc;
|
|
} //PutAltWord
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::StartAltPhrase
|
|
//
|
|
// Synopsis: Pass on StartAltPhrase to word repository
|
|
//
|
|
// History: 24-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::StartAltPhrase()
|
|
{
|
|
SCODE sc = S_OK;
|
|
|
|
CTranslateSystemExceptions translate;
|
|
|
|
TRY
|
|
{
|
|
if ( _fQuery )
|
|
{
|
|
// validate StartAltPhrase call
|
|
if ( !_altWordsEnforcer.IsStartAltPhraseOk() || !_altPhrasesEnforcer.IsStartAltPhraseOk() )
|
|
{
|
|
Win4Assert( !"CKeyMaker::StartAltPhrase - invalid state" );
|
|
|
|
THROW( CException( E_FAIL ) );
|
|
}
|
|
|
|
_xWordRep->StartAltPhrase();
|
|
}
|
|
else
|
|
sc = WBREAK_E_QUERY_ONLY;
|
|
}
|
|
CATCH( CException, e )
|
|
{
|
|
sc = e.GetErrorCode();
|
|
}
|
|
END_CATCH;
|
|
|
|
return sc;
|
|
} //StartAltPhrase
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::EndAltPhrase
|
|
//
|
|
// Synopsis: Pass on EndAltPhrase to word repository
|
|
//
|
|
// History: 24-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::EndAltPhrase()
|
|
{
|
|
SCODE sc = S_OK;
|
|
|
|
CTranslateSystemExceptions translate;
|
|
|
|
TRY
|
|
{
|
|
if ( _fQuery )
|
|
{
|
|
// validate EndAltPhrase call
|
|
if ( !_altWordsEnforcer.IsEndAltPhraseOk() || !_altPhrasesEnforcer.IsEndAltPhraseOk() )
|
|
{
|
|
Win4Assert( !"CKeyMaker::EndAltPhrase - invalid state" );
|
|
|
|
THROW( CException( E_FAIL ) );
|
|
}
|
|
|
|
_xWordRep->EndAltPhrase();
|
|
}
|
|
else
|
|
sc = WBREAK_E_QUERY_ONLY;
|
|
}
|
|
CATCH( CException, e )
|
|
{
|
|
sc = e.GetErrorCode();
|
|
}
|
|
END_CATCH;
|
|
|
|
return sc;
|
|
} //EndAltPhrase
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::PutBreak
|
|
//
|
|
// Synopsis: Increment the occurrence count appropriately
|
|
//
|
|
// History: 24-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE CKeyMaker::PutBreak( WORDREP_BREAK_TYPE breakType )
|
|
{
|
|
// We are modeling PutBreak by a skip of the appropriate number of noise words
|
|
|
|
switch ( breakType )
|
|
{
|
|
case WORDREP_BREAK_EOW:
|
|
_xWordRep->SkipNoiseWords( 1 );
|
|
break;
|
|
|
|
case WORDREP_BREAK_EOS:
|
|
_xWordRep->SkipNoiseWords( 8 );
|
|
break;
|
|
|
|
case WORDREP_BREAK_EOP:
|
|
_xWordRep->SkipNoiseWords( 128 );
|
|
break;
|
|
|
|
case WORDREP_BREAK_EOC:
|
|
_xWordRep->SkipNoiseWords( 1024 );
|
|
break;
|
|
|
|
default:
|
|
ciDebugOut(( DEB_ERROR,
|
|
"CKeyMaker::PutBreak -- Bad break type %d\n",
|
|
breakType ));
|
|
return( E_FAIL );
|
|
}
|
|
|
|
return( S_OK );
|
|
} //PutBreak
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CKeyMaker::Supports
|
|
//
|
|
// Synopsis: Checks if the pid/lang are supported by the language object
|
|
//
|
|
// Arguments: [pid] -- The property ID
|
|
// [lcid] -- The locale
|
|
//
|
|
// Returns: TRUE if it is supported
|
|
//
|
|
// History: 24-Apr-1994 KyleP Created
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
BOOL CKeyMaker::Supports( PROPID pid, LCID lcid )
|
|
{
|
|
if ( (lcid == _lcid) && (pid == _pid) )
|
|
return TRUE;
|
|
else
|
|
return _sLang.Supports( pid, lcid );
|
|
} //Supports
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CKeyMaker::NormalizeWStr - Public
|
|
//
|
|
// Synopsis: Normalizes a UniCode string
|
|
//
|
|
// Arguments: [pwcInBuf] -- input buffer
|
|
// [cwcInBuf] -- count of chars in pwcInBuf
|
|
// [pbOutBuf] -- output buffer.
|
|
// [pcbOutBuf] - pointer to output count of bytes.
|
|
//
|
|
// History: 10-Feb-2000 KitmanH Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CKeyMaker::NormalizeWStr( WCHAR const *pwcInBuf,
|
|
ULONG cwcInBuf,
|
|
BYTE *pbOutBuf,
|
|
unsigned *pcbOutBuf )
|
|
{
|
|
_xWordRep->NormalizeWStr( pwcInBuf,
|
|
cwcInBuf,
|
|
pbOutBuf,
|
|
pcbOutBuf );
|
|
}
|