windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/breakers/engstem.cpp
2020-09-26 16:20:57 +08:00

611 lines
16 KiB
C++

/*************************************************************************
* @doc SHROOM EXTERNAL API *
* *
* ENGSTEM.CPP *
* *
* Copyright (C) Microsoft Corporation 1997 *
* All Rights reserved. *
* *
* This file contains the implementation of CITEngStemmer methods. *
* CITEngStemmer is a pluggable word stemmer object. *
* Although all the word breaking interface methods that accept text *
* require it to be Unicode, CITEngStemmer still only supports ANSI *
* internally. *
* *
**************************************************************************
* *
* Written By : Bill Aloof *
* Current Owner: billa *
* *
**************************************************************************/
#include <mvopsys.h>
#ifdef _DEBUG
static char s_aszModule[] = __FILE__; /* For error report */
#endif
#include <atlinc.h> // includes for ATL.
#include <_mvutil.h>
#include <mem.h>
#include <orkin.h>
#include <mvsearch.h>
#include "common.h"
#include <iterror.h>
#include <itstem.h>
#include <itwbrkid.h>
#include "engstem.h"
//---------------------------------------------------------------------------
// Constructor and Destructor
//---------------------------------------------------------------------------
// Constructs a stemmer in the "not initialized" state: neither conversion
// buffer is allocated yet, and the control structure, the initialized/dirty
// flags and the persisted-items mask are all reset by ClearMembers().
CITEngStemmer::CITEngStemmer()
{
    m_hmem1 = NULL;
    m_hmem2 = NULL;
    m_cbBuf1Cur = 0;
    m_cbBuf2Cur = 0;
    ClearMembers();
}
// Destructor: Close() frees both conversion buffers (if allocated) and
// resets the member state.
CITEngStemmer::~CITEngStemmer()
{
    Close();
}
//---------------------------------------------------------------------------
// IStemmer Method Implementations
//---------------------------------------------------------------------------
/********************************************************************
* @method STDMETHODIMP | IStemmer | Init |
* Gives the stemmer object a chance to initialize itself beyond
* what it did during IPersistStreamInit::InitNew or ::Load.
* @parm ULONG | ulMaxTokenSize | Max term length requested by caller
* @parm BOOL* | pfLicense | Whether the stemmer is subject to a license
*
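* @rvalue S_OK | The stemmer is ready for use; *pfLicense is set to FALSE
* @rvalue E_POINTER | pfLicense was NULL
* @rvalue E_FAIL | Already initialized with a locale whose primary language is not English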
*
********************************************************************/
STDMETHODIMP
CITEngStemmer::Init(ULONG ulMaxTokenSize, BOOL *pfLicense)
{
    if (pfLicense == NULL)
        return (SetErrReturn(E_POINTER));

    HRESULT hrResult;

    if (!m_fInitialized)
    {
        // Neither IPersistStreamInit::InitNew nor ::Load has been called, so
        // initialize with defaults now.  This lets Tripoli clients use the
        // stemmer without code changes, and still leaves the caller free to
        // set the locale afterwards.
        hrResult = InitNew();
    }
    else if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) == LANG_ENGLISH)
    {
        // Already initialized, so the caller has had the opportunity to set
        // the lcid; it names an English locale, which is what we support.
        hrResult = S_OK;
    }
    else
    {
        // Already initialized, but with a non-English locale.
        hrResult = E_FAIL;
    }

    // This stemmer is never subject to a license.
    if (SUCCEEDED(hrResult))
        *pfLicense = FALSE;

    // NOTE: ulMaxTokenSize is intentionally unused.  Terms are not truncated
    // internally; the word sink is expected to be prepared to truncate.
    return (hrResult);
}
/********************************************************************
* @method STDMETHODIMP | IStemmer | StemWord |
* stems the input word and calls the methods of IStemSink with the results.
*
* @parm WCHAR const | *pwcInBuf | Input Unicode word.
* @parm ULONG | cwc | count of Unicode characters in the input word.
* @parm IStemSink | *pStemSink | Pointer to the stemmer sink.
*
*
*
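* @rvalue E_POINTER | Either pwcInBuf or pStemSink is NULL.
* @rvalue E_NOTOPEN | The stemmer has not been initialized.
* @rvalue E_FAIL | The configured locale's primary language is not English.
* @rvalue E_WORDTOOLONG | cwc is larger than 0x7FFF
* @rvalue E_UNEXPECTED | A Unicode/ANSI conversion of the word failed.
* @rvalue S_OK | The operation completed successfully.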
*
********************************************************************/
STDMETHODIMP
CITEngStemmer::StemWord(WCHAR const *pwcInBuf, ULONG cwc, IStemSink *pStemSink)
{
    HRESULT hr = S_OK;

    // Argument and state validation; none of these paths take the lock.
    if (pwcInBuf == NULL || pStemSink == NULL)
        return (SetErrReturn(E_POINTER));
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));
    if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) != LANG_ENGLISH)
        return (SetErrReturn(E_FAIL));
    // The converted byte count is stored in a WORD length prefix below, so
    // the input is capped such that (sizeof(WCHAR) * cwc) still fits.
    if (cwc > 0x7FFF)
        return (SetErrReturn(E_WORDTOOLONG));

    // NOTE(review): ReallocBuffer also takes m_cs while we hold it here;
    // presumably the lock is recursive -- confirm against its declaration.
    m_cs.Lock();

    // We allocate enough space for a worst case Unicode ---> MBCS conversion
    // and allow an extra word for a length prefix that we will add later.
    // This is probably overly cautious because we shouldn't be seeing any
    // DBCS anyway (we're an English stemmer).
    if (SUCCEEDED(hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                (sizeof(WCHAR) * cwc) + sizeof(WORD))))
    {
        LPBYTE lpbRawWord = (LPBYTE) _GLOBALLOCK(m_hmem1);

        // REVIEW (billa): Need to make sure that the word being stemmed is in
        // lower case.

        // Convert the raw word to ANSI.  The buffer layout is a WORD byte
        // count followed by the converted characters.
        if ((*((WORD *)lpbRawWord) =
                (WORD) WideCharToMultiByte(m_stemctl.dwCodePageID, 0,
                            pwcInBuf, cwc, (char *)lpbRawWord + sizeof(WORD),
                            (m_cbBuf1Cur - sizeof(WORD)), NULL, NULL)) > 0)
        {
            // We want the buffer we allocate for the stemmed word to be larger
            // than the raw word length so that we can handle the rare case
            // where the stemmed word has grown.  We can just use the raw word
            // buffer size because it included a lot of extra padding.
            if (SUCCEEDED(hr = ReallocBuffer(&m_hmem2, &m_cbBuf2Cur,
                                                            m_cbBuf1Cur)))
            {
                LPBYTE lpbStemWord = (LPBYTE) _GLOBALLOCK(m_hmem2);

                if (SUCCEEDED(hr = FStem(lpbStemWord, lpbRawWord)))
                {
                    WCHAR *lpwchStem;
                    DWORD cwchStem;
                    DWORD cbStemWord;

                    // The raw word is no longer needed; reuse its buffer to
                    // hold the Unicode form of the stem.
                    _GLOBALUNLOCK(m_hmem1);
                    cwchStem = cbStemWord = (DWORD)(*((WORD *)lpbStemWord));
                    hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur,
                                            sizeof (WCHAR) * cbStemWord);

                    // Relock buffer even if we've failed the realloc
                    // so that the unlock we do later is valid.  An
                    // unconditional relock is OK because ReallocBuffer
                    // won't invalidate the original m_hmem1 if it fails.
                    lpwchStem = (WCHAR *) _GLOBALLOCK(m_hmem1);

                    // Only convert when the buffer is known to be large
                    // enough.  Previously a failed realloc was ignored (hr was
                    // overwritten below) and the conversion was still told the
                    // buffer held cbStemWord WCHARs, which could overrun it.
                    if (SUCCEEDED(hr))
                    {
                        // Convert the stem word back to Unicode so that we
                        // can call the stem sink.
                        if ((cwchStem =
                                MultiByteToWideChar(m_stemctl.dwCodePageID, 0,
                                    (LPCSTR)lpbStemWord + sizeof(WORD),
                                    cbStemWord, lpwchStem, cwchStem)) > 0)
                        {
                            // Send the stemmed word to the stem sink.
                            hr = pStemSink->PutWord(lpwchStem, cwchStem);
                        }
                        else
                            hr = E_UNEXPECTED;
                    }
                }

                _GLOBALUNLOCK(m_hmem2);
            }
        }
        else
            hr = E_UNEXPECTED;      // Unicode ---> ANSI conversion failed.

        _GLOBALUNLOCK(m_hmem1);
    }

    m_cs.Unlock();
    return (hr);
}
/*****************************************************************
* @method STDMETHODIMP | IStemmer | GetLicenseToUse |
*
* Not yet implemented
*
****************************************************************/
STDMETHODIMP
CITEngStemmer::GetLicenseToUse(WCHAR const **ppwcsLicense)
{
    // Not implemented: ppwcsLicense is neither examined nor written.
    return E_NOTIMPL;
}
//---------------------------------------------------------------------------
// IStemmerConfig Method Implementations
//---------------------------------------------------------------------------
/*****************************************************************
* @method STDMETHODIMP | IStemmerConfig | SetLocaleInfo |
* Sets locale information that affects the stemming
* behavior of IStemmer::StemWord.
* @parm DWORD | dwCodePageID | ANSI code page no. specified at build time.
* @parm LCID | lcid | Win32 locale identifier specified at build time.
*
* @rvalue S_OK | Locale described by the parameters is supported
* @rvalue E_INVALIDARG | Locale described by the parameters is not supported.
*
*
****************************************************************/
STDMETHODIMP
CITEngStemmer::SetLocaleInfo(DWORD dwCodePageID, LCID lcid)
{
    // Configuration is only accepted after InitNew or Load has succeeded.
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Only locales whose primary language is English are accepted.
    const BOOL fIsEnglish =
                (PRIMARYLANGID(LANGIDFROMLCID(lcid)) == LANG_ENGLISH);
    if (!fIsEnglish)
        return (SetErrReturn(E_INVALIDARG));

    // Record the new locale under the lock and flag the object as needing
    // to be saved.
    m_cs.Lock();
    m_stemctl.lcid = lcid;
    m_stemctl.dwCodePageID = dwCodePageID;
    m_fDirty = TRUE;
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
* @method STDMETHODIMP | IStemmerConfig | GetLocaleInfo |
* Gets locale information that affects the stemming
* behavior of IStemmer::StemWord.
* @parm DWORD | *pdwCodePageID | Pointer to code page identifier
* @parm LCID | *plcid | Pointer to Win32 locale identifier.
*
* @rvalue S_OK | The locale information was returned successfully.
* @rvalue E_POINTER | pdwCodePageID or plcid was NULL.
* @rvalue E_NOTOPEN | The stemmer has not been initialized.
*
*
****************************************************************/
STDMETHODIMP
CITEngStemmer::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid)
{
    // Both output pointers are required.
    if (plcid == NULL || pdwCodePageID == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // Read both values under the lock so the pair comes from a single
    // SetLocaleInfo call.
    m_cs.Lock();
    *plcid = m_stemctl.lcid;
    *pdwCodePageID = m_stemctl.dwCodePageID;
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
* @method STDMETHODIMP | IStemmerConfig | SetControlInfo |
* Sets information that controls certain aspects of stemming.
*
* @parm DWORD | grfStemFlags | Flags that control stemming behavior.
* @parm DWORD | dwReserved | Reserved for future use.
*
* @rvalue S_OK | The operation completed successfully.
*
* @comm
* In the future, additional information may be passed in through
* dwReserved.
****************************************************************/
STDMETHODIMP
CITEngStemmer::SetControlInfo(DWORD grfStemFlags, DWORD dwReserved)
{
    // No stem flags are supported at present, so every bit is in the
    // "unsupported" mask and only a value of 0 is accepted.
    const DWORD grfFlagsUnsupported = ~(0);

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (grfStemFlags & grfFlagsUnsupported)
        return (SetErrReturn(E_INVALIDARG));

    // Store the flags and mark the object dirty.  dwReserved is ignored.
    m_cs.Lock();
    m_stemctl.grfStemFlags = grfStemFlags;
    m_fDirty = TRUE;
    m_cs.Unlock();

    return (S_OK);
}
/*****************************************************************
* @method STDMETHODIMP | IStemmerConfig | GetControlInfo |
* Gets information that controls stemming behavior.
*
* @parm DWORD | *pgrfStemFlags | Pointer to flags that control stemming behavior.
* @parm DWORD | *pdwReserved | Reserved for future use.
*
* @rvalue S_OK | The operation completed successfully.
*
****************************************************************/
STDMETHODIMP
CITEngStemmer::GetControlInfo(DWORD *pgrfStemFlags, DWORD *pdwReserved)
{
    // Only the flags pointer is required; pdwReserved is never written.
    if (pgrfStemFlags == NULL)
        return (SetErrReturn(E_POINTER));

    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    // NOTE(review): unlike GetLocaleInfo, this read is not done under m_cs.
    *pgrfStemFlags = m_stemctl.grfStemFlags;

    return (S_OK);
}
/*****************************************************************
* @method STDMETHODIMP | IStemmerConfig | LoadExternalStemmerData |
* Loads external stemmer data, such as word part lists.
*
* @parm IStream | *pStream | Pointer to stream object containing
* stemmer data.
* @parm DWORD | dwExtDataType | Data type.
*
* @comm
* Not implemented yet.
****************************************************************/
STDMETHODIMP
CITEngStemmer::LoadExternalStemmerData(IStream *pStream, DWORD dwExtDataType)
{
    // Report "not open" ahead of "not implemented" so that an uninitialized
    // object is diagnosed consistently with the other configuration methods.
    if (m_fInitialized)
        return (E_NOTIMPL);

    return (SetErrReturn(E_NOTOPEN));
}
//---------------------------------------------------------------------------
// IPersistStreamInit Method Implementations
//---------------------------------------------------------------------------
// IPersist::GetClassID -- returns CLSID_ITEngStemmer through pclsid.
// Returns E_POINTER when pclsid is NULL, otherwise S_OK.
STDMETHODIMP
CITEngStemmer::GetClassID(CLSID *pclsid)
{
    if (pclsid != NULL)
    {
        *pclsid = CLSID_ITEngStemmer;
        return (S_OK);
    }

    return (SetErrReturn(E_POINTER));
}
// IPersistStreamInit::IsDirty -- returns S_OK when the dirty flag is set,
// S_FALSE when it is clear, and E_NOTOPEN when the object has not been
// initialized.
STDMETHODIMP
CITEngStemmer::IsDirty(void)
{
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    if (m_fDirty)
        return (S_OK);

    return (S_FALSE);
}
// IPersistStreamInit::Load -- initializes the stemmer from a stream laid out
// as: DWORD version, DWORD persisted-items mask, then (if the mask has
// ITSTDBRK_PERSISTED_STEMCTL set) a STEMCTL structure.
//
// Returns:
//   E_POINTER      pStream is NULL.
//   E_ALREADYOPEN  The object was already initialized (InitNew or Load).
//   E_BADFORMAT    A read returned fewer bytes than requested.
//   E_BADVERSION   The stream version is not VERSION_ENGSTEMMER.
//   Any failure code returned by IStream::Read.
//   S_OK           The object is now initialized.
STDMETHODIMP
CITEngStemmer::Load(IStream *pStream)
{
    HRESULT hr;
    DWORD dwVersion = 0;
    DWORD grfPersistedItems = 0;
    DWORD cbRead = 0;

    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));

    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::InitNew.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock on this early-out path; returning with it held
        // would leave m_cs permanently owned by the calling thread.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    // Read and validate the header.  The trailing mask test skips the item
    // loading below (without failing) when nothing was persisted.
    if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD),
                                                                &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        SUCCEEDED(hr = ((dwVersion == VERSION_ENGSTEMMER) ? S_OK :
                                                            E_BADVERSION)) &&
        SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems,
                                                sizeof(DWORD), &cbRead)) &&
        SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) &&
        grfPersistedItems != 0)
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0)
        {
            if (SUCCEEDED(hr =
                pStream->Read((LPVOID) &m_stemctl, sizeof(STEMCTL), &cbRead)))
                hr = ((cbRead == sizeof(STEMCTL)) ? S_OK : E_BADFORMAT);
        }
        else
        {
            // It is a surprise not to find the STEMCTL structure in the stream,
            // but we can continue on because we will initialize the structure
            // with good defaults before we exit this routine.
            ITASSERT(FALSE);
        }
    }

    if (SUCCEEDED(hr))
    {
        if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            InitStemCtl();

            // Set flag in case we're asked to save.
            grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
        }

        m_grfPersistedItems = grfPersistedItems;
        m_fInitialized = TRUE;
    }
    else
    {
        // Free any persisted items which may have been loaded successfully.
        Close();
    }

    m_cs.Unlock();
    return (hr);
}
// IPersistStreamInit::Save -- writes the version DWORD, the persisted-items
// mask and (when the mask says so) the STEMCTL structure to pStream, in the
// order that Load reads them.  The dirty flag is cleared only when every
// write succeeded and fClearDirty is set.
// Returns E_POINTER for a NULL stream, E_NOTOPEN when uninitialized,
// otherwise the result of the last IStream::Write attempted.
STDMETHODIMP
CITEngStemmer::Save(IStream *pStream, BOOL fClearDirty)
{
    if (pStream == NULL)
        return (SetErrReturn(E_POINTER));
    if (!m_fInitialized)
        return (SetErrReturn(E_NOTOPEN));

    HRESULT hrWrite;
    DWORD cbWritten;
    DWORD dwVersion;

    m_cs.Lock();

    // Header: version first, then the mask of what follows.
    dwVersion = VERSION_ENGSTEMMER;
    hrWrite = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD), &cbWritten);

    if (SUCCEEDED(hrWrite))
    {
        hrWrite = pStream->Write((LPVOID) &m_grfPersistedItems,
                                                sizeof(DWORD), &cbWritten);
    }

    if (SUCCEEDED(hrWrite))
    {
        if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0)
        {
            // We should always be writing the STEMCTL structure, but if for
            // some reason the flag to write it is not set, we can still
            // continue because at load time we will tolerate the absence of
            // the struct.
            ITASSERT(FALSE);
        }
        else
        {
            hrWrite = pStream->Write((LPVOID) &m_stemctl, sizeof(STEMCTL),
                                                                &cbWritten);
        }
    }

    if (fClearDirty && SUCCEEDED(hrWrite))
        m_fDirty = FALSE;

    m_cs.Unlock();
    return (hrWrite);
}
// IPersistStreamInit::GetSizeMax -- not implemented; pcbSizeMax is not
// written.
STDMETHODIMP
CITEngStemmer::GetSizeMax(ULARGE_INTEGER *pcbSizeMax)
{
    return E_NOTIMPL;
}
// IPersistStreamInit::InitNew -- initializes the stemmer with default
// control information (see InitStemCtl) and marks the STEMCTL structure as
// an item to be persisted by Save.
// Returns E_ALREADYOPEN if the object was already initialized by InitNew or
// Load, otherwise S_OK.
STDMETHODIMP
CITEngStemmer::InitNew(void)
{
    // Lock before checking m_fInitialized to make sure we don't compete
    // with a call to ::Load.
    m_cs.Lock();

    if (m_fInitialized)
    {
        // Release the lock on this early-out path; returning with it held
        // would leave m_cs permanently owned by the calling thread.
        m_cs.Unlock();
        return (SetErrReturn(E_ALREADYOPEN));
    }

    InitStemCtl();
    m_grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL;
    m_fInitialized = TRUE;

    m_cs.Unlock();
    return (S_OK);
}
//---------------------------------------------------------------------------
// Private Method Implementations
//---------------------------------------------------------------------------
// Resizes one of the conversion buffers under the object lock by forwarding
// to ReallocBufferHmem.  The size requested is never smaller than
// cbAnsiBufInit.  Returns whatever ReallocBufferHmem returns.
HRESULT
CITEngStemmer::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew)
{
    m_cs.Lock();
    const HRESULT hrRealloc =
        ReallocBufferHmem(phmemBuf, pcbBufCur, max(cbBufNew, cbAnsiBufInit));
    m_cs.Unlock();

    return (hrRealloc);
}
// Resets the persisted-items mask, the dirty and initialized flags, and
// zero-fills the stemmer control structure.  The conversion buffers are not
// touched here.
void
CITEngStemmer::ClearMembers(void)
{
    m_grfPersistedItems = 0;
    m_fDirty = FALSE;
    m_fInitialized = FALSE;
    MEMSET(&m_stemctl, NULL, sizeof(STEMCTL));
}
// Fills the stemmer control structure with defaults: no stem flags, the
// system ANSI code page, and the user default locale.
void
CITEngStemmer::InitStemCtl(void)
{
    m_stemctl.grfStemFlags = 0;
    m_stemctl.dwCodePageID = GetACP();

    // The user default language may not be English.  The value is stored
    // regardless and is checked later in IStemmer::Init and ::StemWord.
    m_stemctl.lcid = GetUserDefaultLCID();
}
// Frees whichever conversion buffers are currently allocated, zeroing their
// handles and recorded sizes, then resets the remaining member state via
// ClearMembers().
void
CITEngStemmer::Close(void)
{
    // First buffer.
    if (m_hmem1)
    {
        _GLOBALFREE(m_hmem1);
        m_cbBuf1Cur = 0;
        m_hmem1 = NULL;
    }

    // Second buffer.
    if (m_hmem2)
    {
        _GLOBALFREE(m_hmem2);
        m_cbBuf2Cur = 0;
        m_hmem2 = NULL;
    }

    ClearMembers();
}