/************************************************************************* * @doc SHROOM EXTERNAL API * * * * ENGSTEM.CPP * * * * Copyright (C) Microsoft Corporation 1997 * * All Rights reserved. * * * * This file contains the implementation of CITEngStemmer methods. * * CITEngStemmer is a pluggable word stemer object. * * Although all the word breaking interface methods that accept text * * require it to be Unicode, CITEngStemmer still only supports ANSI * * internally. * * * ************************************************************************** * * * Written By : Bill Aloof * * Current Owner: billa * * * **************************************************************************/ #include #ifdef _DEBUG static char s_aszModule[] = __FILE__; /* For error report */ #endif #include // includes for ATL. #include <_mvutil.h> #include #include #include #include "common.h" #include #include #include #include "engstem.h" //--------------------------------------------------------------------------- // Constructor and Destructor //--------------------------------------------------------------------------- CITEngStemmer::CITEngStemmer() { ClearMembers(); m_hmem1 = m_hmem2 = NULL; m_cbBuf1Cur = m_cbBuf2Cur = 0; } CITEngStemmer::~CITEngStemmer() { Close(); } //--------------------------------------------------------------------------- // IStemmer Method Implementations //--------------------------------------------------------------------------- /******************************************************************** * @method STDMETHODIMP | IStemmer | Init | * Gives the stemmer object a chance to initialize itself beyond * what it did during IPersistStreamInit::InitNew or ::Load. * @parm ULONG | ulMaxTokenSize | Max term length requested by caller * @parm BOOL* | pfLicense | Whether the stemmer is subject to a license * * @rvalue E_POINTER | pfLicense was NULL * ********************************************************************/ STDMETHODIMP CITEngStemmer::Init(ULONG ulMaxTokenSize, BOOL *pfLicense) { HRESULT hr; if (pfLicense == NULL) return (SetErrReturn(E_POINTER)); // If we haven't been initialized yet (i.e. no call was made to either // IPersistStreamInit::InitNew or Load), we'll initialize ourselves now. // This allows Tripoli clients to use us without any code changes on their // part. If we have already been initialized, the caller has had a chance // to correctly set the lcid, so we check it now; otherwise, we want to // still give the caller a chance to set it correctly. if (m_fInitialized) hr = (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) == LANG_ENGLISH ? S_OK : E_FAIL); else hr = InitNew(); if (SUCCEEDED(hr)) *pfLicense = FALSE; // NOTE: We don't support internal truncation of terms based on // ulMaxTokenSize. This is OK since the word sink is supposed to be // prepared to have to truncate anyway. return (hr); } /******************************************************************** * @method STDMETHODIMP | IStemmer | StemWord | * stems the input word and calls the methods of IStemSink with the results. * * @parm WCHAR const | *pwcInBuf | Input Unicode word. * @parm ULONG | cwc | count of Unicode characters in the input word. * @parm IStemSink | *pStemSink | Pointer to the stemmer sink. * * * * @rvalue E_WORDTOOLONG | cwc is larger than 0x7FFF * @rvalue E_POINTER | Either the input buffer or *pStemSink is NULL. * @rvalue S_OK | The operation completed successfully. * ********************************************************************/ STDMETHODIMP CITEngStemmer::StemWord(WCHAR const *pwcInBuf, ULONG cwc, IStemSink *pStemSink) { HRESULT hr = S_OK; if (pwcInBuf == NULL || pStemSink == NULL) return (SetErrReturn(E_POINTER)); if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); if (PRIMARYLANGID(LANGIDFROMLCID(m_stemctl.lcid)) != LANG_ENGLISH) return (SetErrReturn(E_FAIL)); if (cwc > 0x7FFF) return (SetErrReturn(E_WORDTOOLONG)); m_cs.Lock(); // We allocate enough space for a worst case Unicode ---> MBCS conversion // and allow an extra word for a length prefix that we will add later. // This is probably overly cautious because we shouldn't be seeing any // DBCS anyway (we're an English stemmer). if (SUCCEEDED(hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur, (sizeof(WCHAR) * cwc) + sizeof(WORD)))) { LPBYTE lpbRawWord; lpbRawWord = (LPBYTE) _GLOBALLOCK(m_hmem1); // REVIEW (billa): Need to make sure that the word being stemmed is in // lower case. // Convert the raw word to ANSI. if ((*((WORD *)lpbRawWord) = (WORD) WideCharToMultiByte(m_stemctl.dwCodePageID, NULL, pwcInBuf, cwc, (char *)lpbRawWord + sizeof(WORD), (m_cbBuf1Cur - sizeof(WORD)), NULL, NULL)) > 0) { // We want the buffer we allocate for the stemmed word to be larger // than the raw word length so that we can handle the rare case // where the stemmed word has grown. We can just use the raw word // buffer size because it included a lot of extra padding. if (SUCCEEDED(hr = ReallocBuffer(&m_hmem2, &m_cbBuf2Cur, m_cbBuf1Cur))) { LPBYTE lpbStemWord; lpbStemWord = (LPBYTE) _GLOBALLOCK(m_hmem2); if (SUCCEEDED(hr = FStem(lpbStemWord, lpbRawWord))) { WCHAR *lpwchStem; DWORD cwchStem; DWORD cbStemWord; _GLOBALUNLOCK(m_hmem1); cwchStem = cbStemWord = (DWORD)(*((WORD *)lpbStemWord)); hr = ReallocBuffer(&m_hmem1, &m_cbBuf1Cur, sizeof (WCHAR) * cbStemWord); // Relock buffer even if we've failed the realloc // so that the unlock we do later is valid. An // unconditional relock is OK because ReallocBuffer // won't invalidate the original m_hmem1 if it fails. lpwchStem = (WCHAR *) _GLOBALLOCK(m_hmem1); // Convert the stem word back to Unicode so that we can // call the stem sink. if ((cwchStem = MultiByteToWideChar(m_stemctl.dwCodePageID, NULL, (LPCSTR)lpbStemWord + sizeof(WORD), cbStemWord, lpwchStem, cwchStem)) > 0) { // Send the raw word to the word sink. hr = pStemSink->PutWord(lpwchStem, cwchStem); } else hr = E_UNEXPECTED; } _GLOBALUNLOCK(m_hmem2); } } else hr = E_UNEXPECTED; _GLOBALUNLOCK(m_hmem1); } m_cs.Unlock(); return (hr); } /***************************************************************** * @method STDMETHODIMP | IStemmer | GetLicenseToUse | * * Not yet implemented * ****************************************************************/ STDMETHODIMP CITEngStemmer::GetLicenseToUse(WCHAR const **ppwcsLicense) { return (E_NOTIMPL); } //--------------------------------------------------------------------------- // IStemmerConfig Method Implementations //--------------------------------------------------------------------------- /***************************************************************** * @method STDMETHODIMP | IStemmerConfig | SetLocaleInfo | * Sets locale information that affects the stemming * behavior of IStemmer::StemWord. * @parm DWORD | dwCodePageID | ANSI code page no. specified at build time. * @parm LCID | lcid | Win32 locale identifier specified at build time. * * @rvalue S_OK | Locale described by the parameters is supported * @rvalue E_INVALIDARG | Locale described by the parameters is not supported. * * ****************************************************************/ STDMETHODIMP CITEngStemmer::SetLocaleInfo(DWORD dwCodePageID, LCID lcid) { if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); if (PRIMARYLANGID(LANGIDFROMLCID(lcid)) != LANG_ENGLISH) return (SetErrReturn(E_INVALIDARG)); m_cs.Lock(); m_stemctl.dwCodePageID = dwCodePageID; m_stemctl.lcid = lcid; m_fDirty = TRUE; m_cs.Unlock(); return (S_OK); } /***************************************************************** * @method STDMETHODIMP | IStemmerConfig | GetLocaleInfo | * Gets locale information that affects the stemming * behavior of IStemmer::StemWord. * @parm DWORD | *pdwCodePageID | Pointer to code page identifier * @parm LCID | *plcid | Pointer to Win32 locale identifier. * * @rvalue S_OK | Locale described by the parameters is supported * @rvalue E_INVALIDARG | Locale described by the parameters is not supported. * * ****************************************************************/ STDMETHODIMP CITEngStemmer::GetLocaleInfo(DWORD *pdwCodePageID, LCID *plcid) { if (pdwCodePageID == NULL || plcid == NULL) return (SetErrReturn(E_POINTER)); if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); m_cs.Lock(); *pdwCodePageID = m_stemctl.dwCodePageID; *plcid = m_stemctl.lcid; m_cs.Unlock(); return (S_OK); } /***************************************************************** * @method STDMETHODIMP | IStemmerConfig | SetControlInfo | * Sets information that controls certain aspects of stemming. * * @parm DWORD | grfStemFlags | Flags that control stemming behavior. * @parm DWORD | dwReserved | Reserved for future use. * * @rvalue S_OK | The operation completed successfully. * * @comm * In the future, additional information may be passed in through * dwReserved. ****************************************************************/ STDMETHODIMP CITEngStemmer::SetControlInfo(DWORD grfStemFlags, DWORD dwReserved) { DWORD grfFlagsUnsupported; if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); grfFlagsUnsupported = ~(0); if ((grfStemFlags & grfFlagsUnsupported) != 0) return (SetErrReturn(E_INVALIDARG)); m_cs.Lock(); m_stemctl.grfStemFlags = grfStemFlags; m_fDirty = TRUE; m_cs.Unlock(); return (S_OK); } /***************************************************************** * @method STDMETHODIMP | IStemmerConfig | GetControlInfo | * Gets information that controls stemming behavior. * * @parm DWORD | *pgrfStemFlags | Pointer to flags that control stemming behavior. * @parm DWORD | *pdwReserved | Reserved for future use. * * @rvalue S_OK | The operation completed successfully. * ****************************************************************/ STDMETHODIMP CITEngStemmer::GetControlInfo(DWORD *pgrfStemFlags, DWORD *pdwReserved) { if (pgrfStemFlags == NULL) return (SetErrReturn(E_POINTER)); if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); *pgrfStemFlags = m_stemctl.grfStemFlags; return (S_OK); } /***************************************************************** * @method STDMETHODIMP | IStemmerConfig | LoadExternalStemmerData | * Loads external stemmer data, such as word part lists. * * @parm IStream | *pStream | Pointer to stream object containing * stenner data. * @parm DWORD | dwExtDataType | Data type. * * @comm * Not implemented yet. ****************************************************************/ STDMETHODIMP CITEngStemmer::LoadExternalStemmerData(IStream *pStream, DWORD dwExtDataType) { if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); return (E_NOTIMPL); } //--------------------------------------------------------------------------- // IPersistStreamInit Method Implementations //--------------------------------------------------------------------------- STDMETHODIMP CITEngStemmer::GetClassID(CLSID *pclsid) { if (pclsid == NULL) return (SetErrReturn(E_POINTER)); *pclsid = CLSID_ITEngStemmer; return (S_OK); } STDMETHODIMP CITEngStemmer::IsDirty(void) { if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); return (m_fDirty ? S_OK : S_FALSE); } STDMETHODIMP CITEngStemmer::Load(IStream *pStream) { HRESULT hr; DWORD dwVersion; DWORD grfPersistedItems; DWORD cbRead; if (pStream == NULL) return (SetErrReturn(E_POINTER)); // Lock before checking m_fInitialized to make sure we don't compete // with a call to ::InitNew. m_cs.Lock(); if (m_fInitialized) return (SetErrReturn(E_ALREADYOPEN)); if (SUCCEEDED(hr = pStream->Read((LPVOID) &dwVersion, sizeof(DWORD), &cbRead)) && SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) && SUCCEEDED(hr = ((dwVersion == VERSION_ENGSTEMMER) ? S_OK : E_BADVERSION)) && SUCCEEDED(hr = pStream->Read((LPVOID) &grfPersistedItems, sizeof(DWORD), &cbRead)) && SUCCEEDED(hr = ((cbRead == sizeof(DWORD)) ? S_OK : E_BADFORMAT)) && grfPersistedItems != 0) { if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0) { if (SUCCEEDED(hr = pStream->Read((LPVOID) &m_stemctl, sizeof(STEMCTL), &cbRead))) hr = ((cbRead == sizeof(STEMCTL)) ? S_OK : E_BADFORMAT); } else { // It is a surprise not to find the STEMCTL structure in the stream, // but we can continue on because we will initialize the structure // with good defaults before we exit this routine. ITASSERT(FALSE); } } if (SUCCEEDED(hr)) { if ((grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) == 0) { InitStemCtl(); // Set flag in case we're asked to save. grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL; } m_grfPersistedItems = grfPersistedItems; m_fInitialized = TRUE; } else // Free any peristed items which may have been loaded successfully. Close(); m_cs.Unlock(); return (hr); } STDMETHODIMP CITEngStemmer::Save(IStream *pStream, BOOL fClearDirty) { HRESULT hr; DWORD dwVersion; DWORD cbWritten; if (pStream == NULL) return (SetErrReturn(E_POINTER)); if (!m_fInitialized) return (SetErrReturn(E_NOTOPEN)); m_cs.Lock(); dwVersion = VERSION_ENGSTEMMER; if (SUCCEEDED(hr = pStream->Write((LPVOID) &dwVersion, sizeof(DWORD), &cbWritten)) && SUCCEEDED(hr = pStream->Write((LPVOID) &m_grfPersistedItems, sizeof(DWORD), &cbWritten))) { if ((m_grfPersistedItems & ITSTDBRK_PERSISTED_STEMCTL) != 0) hr = pStream->Write((LPVOID) &m_stemctl, sizeof(STEMCTL), &cbWritten); else { // We should always be writing the STEMCTL structure, but if for // some reason the flag to write it is not set, we can still continue // because at load time we will tolerate the absence of the struct. ITASSERT(FALSE); } } if (SUCCEEDED(hr) && fClearDirty) m_fDirty = FALSE; m_cs.Unlock(); return (hr); } STDMETHODIMP CITEngStemmer::GetSizeMax(ULARGE_INTEGER *pcbSizeMax) { return (E_NOTIMPL); } STDMETHODIMP CITEngStemmer::InitNew(void) { // Lock before checking m_fInitialized to make sure we don't compete // with a call to ::Load. m_cs.Lock(); if (m_fInitialized) return (SetErrReturn(E_ALREADYOPEN)); InitStemCtl(); m_grfPersistedItems |= ITSTDBRK_PERSISTED_STEMCTL; m_fInitialized = TRUE; m_cs.Unlock(); return (S_OK); } //--------------------------------------------------------------------------- // Private Method Implementations //--------------------------------------------------------------------------- HRESULT CITEngStemmer::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew) { HRESULT hr = S_OK; m_cs.Lock(); hr = ReallocBufferHmem(phmemBuf, pcbBufCur, max(cbBufNew, cbAnsiBufInit)); m_cs.Unlock(); return (hr); } void CITEngStemmer::ClearMembers(void) { MEMSET(&m_stemctl, NULL, sizeof(STEMCTL)); m_fInitialized = m_fDirty = FALSE; m_grfPersistedItems = 0; } void CITEngStemmer::InitStemCtl(void) { m_stemctl.dwCodePageID = GetACP(); // If the user default language is not English, we'll store the // value and check it in IStemmer::Init and ::StemWord. m_stemctl.lcid = GetUserDefaultLCID(); m_stemctl.grfStemFlags = 0; } void CITEngStemmer::Close(void) { if (m_hmem1 != NULL) { _GLOBALFREE(m_hmem1); m_hmem1 = NULL; m_cbBuf1Cur = 0; } if (m_hmem2 != NULL) { _GLOBALFREE(m_hmem2); m_hmem2 = NULL; m_cbBuf2Cur = 0; } ClearMembers(); }