611 lines
16 KiB
C++
611 lines
16 KiB
C++
/*************************************************************************
|
|
* @doc SHROOM EXTERNAL API *
|
|
* *
|
|
* ENGSTEM.CPP *
|
|
* *
|
|
* Copyright (C) Microsoft Corporation 1997 *
|
|
* All Rights reserved. *
|
|
* *
|
|
* This file contains the implementation of CITEngStemmer methods. *
|
|
* CITEngStemmer is a pluggable word stemmer object. *
|
|
* Although all the word breaking interface methods that accept text *
|
|
* require it to be Unicode, CITEngStemmer still only supports ANSI *
|
|
* internally. *
|
|
* *
|
|
**************************************************************************
|
|
* *
|
|
* Written By : Bill Aloof *
|
|
* Current Owner: billa *
|
|
* *
|
|
**************************************************************************/
|
|
|
|
#include <mvopsys.h>
|
|
|
|
#ifdef _DEBUG
|
|
static char s_aszModule[] = __FILE__; /* For error report */
|
|
#endif
|
|
|
|
#include <atlinc.h> // includes for ATL.
|
|
#include <_mvutil.h>
|
|
#include <mem.h>
|
|
#include <orkin.h>
|
|
#include <mvsearch.h>
|
|
#include "common.h"
|
|
#include <iterror.h>
|
|
#include <itstem.h>
|
|
#include <itwbrkid.h>
|
|
#include "engstem.h"
|
|
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
// Constructor and Destructor
|
|
//---------------------------------------------------------------------------
|
|
|
|
|
|
// Constructs a stemmer in the uninitialized state: neither scratch buffer
// is allocated yet and all persisted/control state is cleared.
CITEngStemmer::CITEngStemmer()
{
    // Scratch buffers are only allocated on demand through ReallocBuffer.
    m_hmem1 = NULL;
    m_hmem2 = NULL;
    m_cbBuf1Cur = 0;
    m_cbBuf2Cur = 0;

    // Reset m_stemctl, the initialized/dirty flags and the persisted-items
    // mask.
    ClearMembers();
}
|
|
|
|
// Destroys the stemmer. All cleanup (freeing both scratch buffers and
// resetting members) is delegated to Close().
CITEngStemmer::~CITEngStemmer()
{
    this->Close();
}
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
// IStemmer Method Implementations
|
|
//---------------------------------------------------------------------------
|
|
|
|
|
|
/********************************************************************
|
|
* @method STDMETHODIMP | IStemmer | Init |
|
|
* Gives the stemmer object a chance to initialize itself beyond
|
|
* what it did during IPersistStreamInit::InitNew or ::Load.
|
|
* @parm ULONG | ulMaxTokenSize | Max term length requested by caller
|
|
* @parm BOOL* | pfLicense | Whether the stemmer is subject to a license
|
|
*
|
|
* @rvalue E_POINTER | pfLicense was NULL
|
|
*
|
|
********************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::Init(ULONG ulMaxTokenSize, BOOL *pfLicense)
{
    // The license flag is the only output; refuse to run without it.
    if (pfLicense == NULL)
        return (SetErrReturn(E_POINTER));

    HRESULT hrInit;

    if (!m_fInitialized)
    {
        // Neither IPersistStreamInit::InitNew nor ::Load has been called,
        // so initialize ourselves now.  This lets Tripoli clients use us
        // without any code changes on their part, and still leaves the
        // caller a chance to set the lcid correctly afterwards.
        hrInit = InitNew();
    }
    else
    {
        // Already initialized: the caller has had the opportunity to set
        // the lcid, so verify now that it names an English locale.
        const LANGID langidPrimary =
                    PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid));

        if (langidPrimary == LANG_ENGLISH)
            hrInit = S_OK;
        else
            hrInit = E_FAIL;
    }

    // This stemmer is never subject to a license; the flag is only written
    // when initialization succeeded.
    if (SUCCEEDED(hrInit))
        *pfLicense = FALSE;

    // NOTE: ulMaxTokenSize is deliberately ignored.  We don't truncate
    // terms internally; the word sink is supposed to be prepared to
    // truncate anyway.

    return (hrInit);
}
|
|
|
|
|
|
/********************************************************************
|
|
* @method STDMETHODIMP | IStemmer | StemWord |
|
|
* stems the input word and calls the methods of IStemSink with the results.
|
|
*
|
|
* @parm WCHAR const | *pwcInBuf | Input Unicode word.
|
|
* @parm ULONG | cwc | count of Unicode characters in the input word.
|
|
* @parm IStemSink | *pStemSink | Pointer to the stemmer sink.
|
|
*
|
|
*
|
|
*
|
|
* @rvalue E_WORDTOOLONG | cwc is larger than 0x7FFF
|
|
* @rvalue E_POINTER | Either the input buffer or *pStemSink is NULL.
|
|
* @rvalue S_OK | The operation completed successfully.
|
|
*
|
|
********************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::StemWord(WCHAR const *pwcInBuf, ULONG cwc, IStemSink *pStemSink)
{
    HRESULT hr = S_OK;

    if (pwcInBuf == NULL || pStemSink == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Only English locales are supported by this stemmer.
    if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) != LANG_ENGLISH)
        return (SetErrReturn(E_FAIL));

    // The ANSI word carries a WORD length prefix, so the worst-case
    // converted size (two bytes per input character) must fit in a WORD.
    if (cwc > 0x7FFF)
        return (SetErrReturn(E_WORDTOOLONG));

    // Both scratch buffers are shared members; hold the lock for the whole
    // conversion/stem/conversion sequence.
    m_cs.Lock();

    // We allocate enough space for a worst case Unicode ---> MBCS conversion
    // and allow an extra word for a length prefix that we will add later.
    // This is probably overly cautious because we shouldn't be seeing any
    // DBCS anyway (we're an English stemmer).
    if (SUCCEEDED(hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                (sizeof(WCHAR) * cwc) + sizeof(WORD))))
    {
        LPBYTE  lpbRawWord;

        lpbRawWord = (LPBYTE) _GLOBALLOCK(m_hmem1);

        // REVIEW (billa): Need to make sure that the word being stemmed is in
        // lower case.

        // Convert the raw word to ANSI.  The byte count returned by the
        // conversion is stored as the WORD length prefix; the characters
        // follow it.  A zero return means the conversion failed (or the
        // input was empty).
        if ((*((WORD *)lpbRawWord) =
                (WORD) WideCharToMultiByte(m_stemctl.dwCodePageID, 0,
                            pwcInBuf, cwc, (char *)lpbRawWord + sizeof(WORD),
                            (m_cbBuf1Cur - sizeof(WORD)), NULL, NULL)) > 0)
        {
            // We want the buffer we allocate for the stemmed word to be larger
            // than the raw word length so that we can handle the rare case
            // where the stemmed word has grown.  We can just use the raw word
            // buffer size because it included a lot of extra padding.
            if (SUCCEEDED(hr = ReallocBuffer(&m_hmem2, &m_cbBuf2Cur,
                                                            m_cbBuf1Cur)))
            {
                LPBYTE  lpbStemWord;

                lpbStemWord = (LPBYTE) _GLOBALLOCK(m_hmem2);

                if (SUCCEEDED(hr = FStem(lpbStemWord, lpbRawWord)))
                {
                    WCHAR   *lpwchStem;
                    DWORD   cwchStem;
                    DWORD   cbStemWord;

                    // The raw word is no longer needed; reuse its buffer
                    // for the Unicode form of the stem.
                    _GLOBALUNLOCK(m_hmem1);
                    cwchStem = cbStemWord = (DWORD)(*((WORD *)lpbStemWord));
                    hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                            sizeof (WCHAR) * cbStemWord);

                    // Relock buffer even if we've failed the realloc
                    // so that the unlock we do later is valid.  An
                    // unconditional relock is OK because ReallocBuffer
                    // won't invalidate the original m_hmem1 if it fails.
                    lpwchStem = (WCHAR *) _GLOBALLOCK(m_hmem1);

                    // Only convert when the realloc succeeded.  Otherwise
                    // the buffer may be smaller than cwchStem characters
                    // and the allocation failure must be reported to the
                    // caller rather than overwritten below.
                    if (SUCCEEDED(hr))
                    {
                        // Convert the stem word back to Unicode so that we
                        // can call the stem sink.
                        if ((cwchStem =
                                MultiByteToWideChar(m_stemctl.dwCodePageID, 0,
                                    (LPCSTR)lpbStemWord + sizeof(WORD),
                                    cbStemWord, lpwchStem, cwchStem)) > 0)
                        {
                            // Send the stemmed word to the stem sink.
                            hr = pStemSink->PutWord(lpwchStem, cwchStem);
                        }
                        else
                            hr = E_UNEXPECTED;
                    }
                }

                _GLOBALUNLOCK(m_hmem2);
            }
        }
        else
            hr = E_UNEXPECTED;

        _GLOBALUNLOCK(m_hmem1);
    }

    m_cs.Unlock();

    return (hr);
}
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmer | GetLicenseToUse |
|
|
*
|
|
* Not yet implemented
|
|
*
|
|
****************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::GetLicenseToUse(WCHAR const **ppwcsLicense)
{
    // Not implemented: ppwcsLicense is neither validated nor written.
    const HRESULT hrResult = E_NOTIMPL;

    return (hrResult);
}
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
// IStemmerConfig Method Implementations
|
|
//---------------------------------------------------------------------------
|
|
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmerConfig | SetLocaleInfo |
|
|
* Sets locale information that affects the stemming
|
|
* behavior of IStemmer::StemWord.
|
|
* @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
|
|
* @parm LCID | lcid | Win32 locale identifier specified at build time.
|
|
*
|
|
* @rvalue S_OK | Locale described by the parameters is supported
|
|
* @rvalue E_INVALIDARG | Locale described by the parameters is not supported.
|
|
*
|
|
*
|
|
****************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Reject any locale whose primary language is not English before
    // touching our state.
    const LANGID langidPrimary = PRIMARYLANGID(LANGIDFROMLCID(lcid));

    if (langidPrimary != LANG_ENGLISH)
        return (SetErrReturn(E_INVALIDARG));

    // Update the control structure under the lock and flag the object as
    // needing to be saved.
    m_cs.Lock();
    {
        m_stemctl.lcid = lcid;
        m_stemctl.dwCodePageID = dwCodePageID;
        m_fDirty = TRUE;
    }
    m_cs.Unlock();

    return (S_OK);
}
|
|
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmerConfig | GetLocaleInfo |
|
|
* Gets locale information that affects the stemming
|
|
* behavior of IStemmer::StemWord.
|
|
* @parm DWORD | *pdwCodePageID | Pointer to code page identifier
|
|
* @parm LCID | *plcid | Pointer to Win32 locale identifier.
|
|
*
|
|
* @rvalue S_OK | The locale information was returned successfully.
* @rvalue E_POINTER | Either pdwCodePageID or plcid was NULL.
* @rvalue E_NOTOPEN | The stemmer has not been initialized yet.
|
|
*
|
|
*
|
|
****************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
{
    // Both outputs are mandatory.
    if (plcid == NULL || pdwCodePageID == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Copy both values while holding the lock so that they come from the
    // same SetLocaleInfo update.
    m_cs.Lock();
    {
        *plcid = m_stemctl.lcid;
        *pdwCodePageID = m_stemctl.dwCodePageID;
    }
    m_cs.Unlock();

    return (S_OK);
}
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmerConfig | SetControlInfo |
|
|
* Sets information that controls certain aspects of stemming.
|
|
*
|
|
* @parm DWORD | grfStemFlags | Flags that control stemming behavior.
|
|
* @parm DWORD | dwReserved | Reserved for future use.
|
|
*
|
|
* @rvalue S_OK | The operation completed successfully.
|
|
*
|
|
* @comm
|
|
* In the future, additional information may be passed in through
|
|
* dwReserved.
|
|
****************************************************************/
|
|
|
|
STDMETHODIMP
CITEngStemmer::SetControlInfo(DWORD grfStemFlags, DWORD dwReserved)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // The complement of the supported set.  No flags are supported at
    // present, so every bit counts as unsupported and only a value of 0
    // gets past this check.
    const DWORD grfUnsupportedMask = ~(0);

    if ((grfStemFlags & grfUnsupportedMask) != 0)
        return (SetErrReturn(E_INVALIDARG));

    // Store the flags under the lock and flag the object as needing to be
    // saved.  dwReserved is ignored.
    m_cs.Lock();
    {
        m_stemctl.grfStemFlags = grfStemFlags;
        m_fDirty = TRUE;
    }
    m_cs.Unlock();

    return (S_OK);
}
|
|
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmerConfig | GetControlInfo |
|
|
* Gets information that controls stemming behavior.
|
|
*
|
|
* @parm DWORD | *pgrfStemFlags | Pointer to flags that control stemming behavior.
|
|
* @parm DWORD | *pdwReserved | Reserved for future use.
|
|
*
|
|
* @rvalue S_OK | The operation completed successfully.
|
|
*
|
|
****************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::GetControlInfo(DWORD *pgrfStemFlags, DWORD *pdwReserved)
{
    // Only the flags pointer is required; pdwReserved is never examined
    // or written.
    if (pgrfStemFlags == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    const DWORD grfCurrent = m_stemctl.grfStemFlags;

    *pgrfStemFlags = grfCurrent;

    return (S_OK);
}
|
|
|
|
|
|
/*****************************************************************
|
|
* @method STDMETHODIMP | IStemmerConfig | LoadExternalStemmerData |
|
|
* Loads external stemmer data, such as word part lists.
|
|
*
|
|
* @parm IStream | *pStream | Pointer to stream object containing
|
|
* stemmer data.
|
|
* @parm DWORD | dwExtDataType | Data type.
|
|
*
|
|
* @comm
|
|
* Not implemented yet.
|
|
****************************************************************/
|
|
STDMETHODIMP
CITEngStemmer::LoadExternalStemmerData(IStream *pStream, DWORD dwExtDataType)
{
    // Not implemented: neither argument is examined.  An uninitialized
    // object still reports E_NOTOPEN in preference to E_NOTIMPL.
    HRESULT hrResult = E_NOTIMPL;

    if (!m_fInitialized)
        hrResult = SetErrReturn(E_NOTOPEN);

    return (hrResult);
}
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
// IPersistStreamInit Method Implementations
|
|
//---------------------------------------------------------------------------
|
|
|
|
|
|
// IPersistStreamInit::GetClassID -- copies CLSID_ITEngStemmer to *pclsid.
// Returns E_POINTER when pclsid is NULL, S_OK otherwise.
STDMETHODIMP
CITEngStemmer::GetClassID(CLSID *pclsid)
{
    if (pclsid != NULL)
    {
        *pclsid = CLSID_ITEngStemmer;
        return (S_OK);
    }

    return (SetErrReturn(E_POINTER));
}
|
|
|
|
|
|
// IPersistStreamInit::IsDirty -- S_OK when m_fDirty is set, S_FALSE when it
// is clear, E_NOTOPEN when the object has not been initialized.
STDMETHODIMP
CITEngStemmer::IsDirty(void)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (m_fDirty)
        return (S_OK);

    return (S_FALSE);
}
|
|
|
|
|
|
// IPersistStreamInit::Load -- restores the stemmer's state from pStream.
//
// Stream layout as read here: a DWORD version (must equal
// VERSION_ENGSTEMMER), a DWORD of persisted-item flags and, when the
// ITSTDBRK_PERSISTED_STEMCTL flag is set, a STEMCTL structure.  When the
// STEMCTL is absent the structure is filled in with defaults.
//
// Returns E_POINTER if pStream is NULL, E_ALREADYOPEN if the object has
// already been initialized, E_BADFORMAT on a short read, E_BADVERSION on a
// version mismatch, or the failure reported by pStream->Read.
STDMETHODIMP
CITEngStemmer::Load(IStream *pStream)
{
    HRESULT hr;
    DWORD   dwVersion;
    DWORD   grfPersistedItems = 0;
    DWORD   cbRead;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::InitNew.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock on this early exit too; returning with it still
        // held would leave it locked for every subsequent call.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    // Read and validate the header.  The chain stops at the first failure,
    // leaving the failing code in hr.  A flags value of 0 is not an error;
    // it just means there is nothing further to read.
    if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
                                                                &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        SUCCEEDED(hr = ((dwVersion == VERSION_ENGSTEMMER) ? S_OK :
                                                            E_BADVERSION)) &&
        SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
                                                sizeof(DWORD), &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        grfPersistedItems != 0)
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0)
        {
            if (SUCCEEDED(hr =
                pStream->Read((LPVOID) &m_stemctl, sizeof(STEMCTL), &cbRead)))
                hr = ((cbRead == sizeof(STEMCTL)) ? S_OK : E_BADFORMAT);
        }
        else
        {
            // It is a surprise not to find the STEMCTL structure in the
            // stream, but we can continue on because we will initialize the
            // structure with good defaults before we exit this routine.
            ITASSERT(FALSE);
        }
    }

    if (SUCCEEDED(hr))
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            InitStemCtl();

            // Set flag in case we're asked to save.
            grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
        }

        m_grfPersistedItems = grfPersistedItems;
        m_fInitialized = TRUE;
    }
    else
    {
        // Free any persisted items which may have been loaded successfully.
        Close();
    }

    m_cs.Unlock();
    return (hr);
}
|
|
|
|
|
|
// IPersistStreamInit::Save -- writes the stemmer's state to pStream as a
// DWORD version, the DWORD persisted-items mask and, when the
// ITSTDBRK_PERSISTED_STEMCTL flag is set, the STEMCTL structure.  The dirty
// flag is cleared only when every write succeeded and fClearDirty is set.
//
// Returns E_POINTER if pStream is NULL, E_NOTOPEN if the object has not
// been initialized, otherwise the result of the last stream write.
STDMETHODIMP
CITEngStemmer::Save(IStream *pStream, BOOL fClearDirty)
{
    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    HRESULT hrWrite;
    DWORD   cbDone;
    DWORD   dwStreamVersion = VERSION_ENGSTEMMER;

    m_cs.Lock();

    // Header: version first, then the persisted-items mask.
    hrWrite = pStream->Write((LPVOID) &dwStreamVersion, sizeof(DWORD),
                                                                &cbDone);

    if (SUCCEEDED(hrWrite))
    {
        hrWrite = pStream->Write((LPVOID) &m_grfPersistedItems,
                                                sizeof(DWORD), &cbDone);
    }

    if (SUCCEEDED(hrWrite))
    {
        if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            // We should always be writing the STEMCTL structure, but if for
            // some reason the flag to write it is not set, we can still
            // continue because at load time we will tolerate the absence of
            // the struct.
            ITASSERT(FALSE);
        }
        else
        {
            hrWrite = pStream->Write((LPVOID) &m_stemctl, sizeof(STEMCTL),
                                                                &cbDone);
        }
    }

    if (fClearDirty && SUCCEEDED(hrWrite))
        m_fDirty = FALSE;

    m_cs.Unlock();

    return (hrWrite);
}
|
|
|
|
|
|
// IPersistStreamInit::GetSizeMax -- not implemented; pcbSizeMax is never
// written.
STDMETHODIMP
CITEngStemmer::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
{
    const HRESULT hrResult = E_NOTIMPL;

    return (hrResult);
}
|
|
|
|
|
|
// IPersistStreamInit::InitNew -- puts the stemmer into its default
// initialized state: m_stemctl receives the defaults from InitStemCtl and
// the STEMCTL structure is marked as an item to persist.
//
// Returns E_ALREADYOPEN if the object has already been initialized (by a
// previous InitNew or Load), S_OK otherwise.
STDMETHODIMP
CITEngStemmer::InitNew(void)
{
    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::Load.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock on this early exit too; returning with it still
        // held would leave it locked for every subsequent call.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    InitStemCtl();
    m_grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
    m_fInitialized = TRUE;

    m_cs.Unlock();
    return (S_OK);
}
|
|
|
|
|
|
//---------------------------------------------------------------------------
|
|
// Private Method Implementations
|
|
//---------------------------------------------------------------------------
|
|
|
|
|
|
// Resizes one of the scratch buffers through ReallocBufferHmem while
// holding m_cs.  Requests smaller than cbAnsiBufInit are rounded up to
// cbAnsiBufInit.  Returns whatever ReallocBufferHmem returns.
HRESULT
CITEngStemmer::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew)
{
    HRESULT hrRealloc;

    m_cs.Lock();
    {
        hrRealloc = ReallocBufferHmem(phmemBuf, pcbBufCur,
                                        max(cbBufNew, cbAnsiBufInit));
    }
    m_cs.Unlock();

    return (hrRealloc);
}
|
|
|
|
|
|
void
|
|
CITEngStemmer::ClearMembers(void)
|
|
{
|
|
MEMSET(&m_stemctl, NULL, sizeof(STEMCTL));
|
|
m_fInitialized = m_fDirty = FALSE;
|
|
m_grfPersistedItems = 0;
|
|
}
|
|
|
|
|
|
void
|
|
CITEngStemmer::InitStemCtl(void)
|
|
{
|
|
m_stemctl.dwCodePageID = GetACP();
|
|
|
|
// If the user default language is not English, we'll store the
|
|
// value and check it in IStemmer::Init and ::StemWord.
|
|
m_stemctl.lcid = GetUserDefaultLCID();
|
|
m_stemctl.grfStemFlags = 0;
|
|
}
|
|
|
|
|
|
void
|
|
CITEngStemmer::Close(void)
|
|
{
|
|
if (m_hmem1 != NULL)
|
|
{
|
|
_GLOBALFREE(m_hmem1);
|
|
m_hmem1 = NULL;
|
|
m_cbBuf1Cur = 0;
|
|
}
|
|
|
|
if (m_hmem2 != NULL)
|
|
{
|
|
_GLOBALFREE(m_hmem2);
|
|
m_hmem2 = NULL;
|
|
m_cbBuf2Cur = 0;
|
|
}
|
|
|
|
ClearMembers();
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|