/************************************************************************* * @doc SHROOM INTERNAL API * * * * IDXOBR.CPP * * * * Copyright (C) Microsoft Corporation 1997 * * All Rights reserved. * * * * This file contains the implementation of CITIndexObjBridge, * * which is a class used by CITIndexLocal to allow the old .c * * search internals to call the new COM-based breaker and stemmer * * objects. * * * * ************************************************************************** * * * Written By : Bill Aloof * * Current Owner: billa * * * **************************************************************************/ #include #ifdef _DEBUG static char s_aszModule[] = __FILE__; /* For error report */ #endif #include // MediaView (InfoTech) includes #include #include #include #include #include #include #include "indeximp.h" #include "queryimp.h" #include "mvsearch.h" #include "idxobr.h" #include "common.h" //--------------------------------------------------------------------------- // Constructor and Destructor //--------------------------------------------------------------------------- CITIndexObjBridge::CITIndexObjBridge() { m_cRef = 0; m_piwbrk = NULL; m_piwbrkc = NULL; m_pistem = NULL; m_piitstwdl = NULL; m_pexbrkpm = NULL; m_fNormWord = FALSE; m_dwCodePageID = 0; m_hmemSrc = m_hmemDestNorm = m_hmemDestRaw = NULL; m_cbBufSrcCur = m_cbBufDestNormCur = m_cbBufDestRawCur = 0; m_lpsipbTermHit = NULL; } CITIndexObjBridge::~CITIndexObjBridge() { if (m_cRef > 0) { ITASSERT(FALSE); } if (m_hmemSrc != NULL) { _GLOBALFREE(m_hmemSrc); m_hmemSrc = NULL; m_cbBufSrcCur = 0; } if (m_hmemDestNorm != NULL) { _GLOBALFREE(m_hmemDestNorm); m_hmemDestNorm = NULL; m_cbBufDestNormCur = 0; } if (m_hmemDestRaw != NULL) { _GLOBALFREE(m_hmemDestRaw); m_hmemDestRaw = NULL; m_cbBufDestRawCur = 0; } if (m_piwbrk != NULL) { m_piwbrk->Release(); m_piwbrk = NULL; } if (m_piwbrkc != NULL) { m_piwbrkc->Release(); m_piwbrkc = NULL; } if (m_pistem != NULL) { m_pistem->Release(); m_pistem = NULL; 
} if (m_piitstwdl != NULL) { m_piitstwdl->Release(); m_piitstwdl = NULL; } MVStopListDispose(m_lpsipbTermHit); } //--------------------------------------------------------------------------- // IUnknown Method Implementations //--------------------------------------------------------------------------- // NOTE: This implementation of IUnknown assumes that this object is used // only in a local context, meaning that no piece of code will hold onto // an IUnknown pointer obtained via QueryInterface beyond the scope that // an instance of this object was created in. For example, this object // will very likely be created/destroyed in the same method. That's why // there's no controlling IUnknown for us to forward AddRef's and Release's // to. It is also the reason that IUnknown::Release doesn't call the // class's destructor when the ref count goes to 0. STDMETHODIMP CITIndexObjBridge::QueryInterface(REFIID riid, LPVOID *ppvObj) { HRESULT hr = S_OK; void *pvObj = NULL; if (ppvObj == NULL) return (SetErrReturn(E_POINTER)); if (riid == IID_IWordSink) pvObj = (void *)((PIWRDSNK) this); else if (riid == IID_IStemSink) pvObj = (void *)((PISTEMSNK) this); else if (riid == IID_IUnknown) pvObj = (void *)((IUnknown *) ((PIWRDSNK) this)); if (pvObj != NULL) *ppvObj = pvObj; else hr = E_NOINTERFACE; return (hr); } STDMETHODIMP_(ULONG) CITIndexObjBridge::AddRef(void) { return (++m_cRef); } STDMETHODIMP_(ULONG) CITIndexObjBridge::Release(void) { if (m_cRef > 0) --m_cRef; else { ITASSERT(FALSE); } return (m_cRef); } //--------------------------------------------------------------------------- // IWordSink Method Implementations //--------------------------------------------------------------------------- /***************************************************************** * @method STDMETHODIMP | IWordSink | PutWord | * This method notifies IWordSink of a new word. * * @parm WCHAR const | *pwcInBuf | Pointer to the word to add. * @parm ULONG |cwc | Count of characters in the word. 
* @parm ULONG |cwcSrcLen | Count of characters in the pTextSource buffer
*   (see IWordBreaker::BreakText) that corresponds to the output word.
* @parm ULONG |cwcSrcPos | The position of the word in the pTextSource
*   buffer that corresponds to the output word.
*
* @rvalue S_OK | The operation completed successfully.
* @rvalue E_POINTER | Input buffer is NULL.
*
* @comm The values of cwcSrcLen and cwcSrcPos are used by the
* ISearch interface, which given a query and a text source, will highlight
* all hits within the text source that match the query.  The location
* of the text to be highlighted is computed from cwcSrcLen and
* cwcSrcPos.  Since pwcInBuf is constant and should not be
* modified by PutWord, it can point directly into pTextSource.
* Values of cwc larger than the ulMaxTokenSize specified in
* IWordBreaker::Init will result in LANGUAGE_S_LARGE_WORD.
* @comm Text sent to PutWord should match the source text as closely
* as possible, including capitalization and accents.
*
* @comm You need to call this method for every word retrieved from
* pTextSource
except those for which the call * has been made. The word sink automatically adds an end of word break * (EOW) after this token. * ****************************************************************/ STDMETHODIMP CITIndexObjBridge::PutWord(WCHAR const *pwcInBuf, ULONG cwc, ULONG cwcSrcLen, ULONG cwcSrcPos) { HRESULT hr; DWORD cbAnsi; if (pwcInBuf == NULL) return (E_POINTER); cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD); if (SUCCEEDED(hr = ReallocBuffer(&m_hmemDestRaw, &m_cbBufDestRawCur, cbAnsi))) { char *lpchBufRaw; lpchBufRaw = (char *) _GLOBALLOCK(m_hmemDestRaw); if ((*((WORD *)lpchBufRaw) = (WORD) WideCharToMultiByte(m_dwCodePageID, NULL, pwcInBuf, cwc, lpchBufRaw + sizeof(WORD), cbAnsi - sizeof(WORD), NULL, NULL)) > 0) { char *lpchBufNorm; lpchBufNorm = (char *) _GLOBALLOCK(m_hmemDestNorm); if (!m_fNormWord) MEMCPY(lpchBufNorm, lpchBufRaw, *((WORD *)lpchBufRaw) + sizeof(WORD)); ITASSERT(m_pexbrkpm != NULL); if (m_pexbrkpm->lpfnOutWord != NULL) { DWORD ibAnsiOffset; WCHAR *lpwchSrc; lpwchSrc = (WCHAR *) _GLOBALLOCK(m_hmemSrc); // Compute the ANSI offset of the beginning of the raw word. // The ANSI buffer we pass won't get written to - we just // pass a pointer just in case the routine requires a non-NULL // for that param (documentation doesn't say). ibAnsiOffset = WideCharToMultiByte(m_dwCodePageID, NULL, lpwchSrc, cwcSrcPos, lpchBufRaw, 0, NULL, NULL); // Call the supplied word callback function. hr = m_pexbrkpm->lpfnOutWord((LPBYTE)lpchBufRaw, (LPBYTE)lpchBufNorm, ibAnsiOffset, m_pexbrkpm->lpvUser); _GLOBALUNLOCK(m_hmemSrc); } _GLOBALUNLOCK(m_hmemDestNorm); } else hr = E_UNEXPECTED; _GLOBALUNLOCK(m_hmemDestRaw); } return (hr); } /**************************************************************** * @method STDMETHODIMP | IWordSink | PutAltWord | * Allows the word breaker to put more than one word in the same place. * @parm WCHAR const | *pwcInBuf | Pointer to the word to add. * @parm ULONG |cwc | Count of characters in the word. 
* @parm ULONG |cwcSrcLen | count of characters in pTextSource buffer * (see ) that corresponds to the output word. * @parm ULONG |cwcSrcPos | the position of the word in pTextSource * buffer that corresponds to the output word * * @rvalue S_OK | The operation completed successfully. * @rvalue E_POINTER | Input buffer is NULL. * * @comm * When you need to add more than one word in the same place, use * PutAltWord for all alternative words except the last one. Use * PutWord for the final alternative, indicating movement to the next position. * @ex The phrase "Where is Kyle's document" would be stored as: | * pWSink->PutWord( L"Where", 5, 5, 0 ); * pWSink->PutWord( L"is", 2, 2, 6 ); * pWSink->PutAltWord( L"Kyle", 4, 6, 9 ); * pWSink->PutWord( L"Kyle's", 6, 6, 9 ); * pWSink->PutWord( L"document", 8, 8, 16 ); * * ***************************************************************/ STDMETHODIMP CITIndexObjBridge::PutAltWord(WCHAR const *pwcInBuf, ULONG cwc, ULONG cwcSrcLen, ULONG cwcSrcPos) { HRESULT hr; DWORD cbAnsi; if (pwcInBuf == NULL) return (E_POINTER); cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD); if (SUCCEEDED(hr = ReallocBuffer(&m_hmemDestNorm, &m_cbBufDestNormCur, cbAnsi))) { char *lpchBuf; lpchBuf = (char *) _GLOBALLOCK(m_hmemDestNorm); if ((*((WORD *)lpchBuf) = (WORD) WideCharToMultiByte(m_dwCodePageID, NULL, pwcInBuf, cwc, lpchBuf + sizeof(WORD), cbAnsi - sizeof(WORD), NULL, NULL)) > 0) { m_fNormWord = TRUE; } else hr = E_UNEXPECTED; _GLOBALUNLOCK(m_hmemDestNorm); } return (hr); } /**************************************************************** * @method STDMETHODIMP | IWordSink | StartAltPhrase | * This method is not implemented. ***************************************************************/ STDMETHODIMP CITIndexObjBridge::StartAltPhrase(void) { return (E_NOTIMPL); } /**************************************************************** * @method STDMETHODIMP | IWordSink | EndAltPhrase| * This method is not implemented. 
***************************************************************/ STDMETHODIMP CITIndexObjBridge::EndAltPhrase(void) { return (E_NOTIMPL); } /**************************************************************** * @method STDMETHODIMP | IWordSink | PutBreak | * This method is not implemented. * * @parm WORDREP_BREAK_TYPE | breakType | Specifies break type * * ***************************************************************/ STDMETHODIMP CITIndexObjBridge::PutBreak(WORDREP_BREAK_TYPE breakType) { return (E_NOTIMPL); } //--------------------------------------------------------------------------- // IStemSink Method Implementations //--------------------------------------------------------------------------- /**************************************************************** * @method STDMETHODIMP | IStemSink | PutWord | * Notifies IStemSink of a word that is similar to the input word * of method. * * @parm WCHAR const | *pwcInBuf | Pointer to the word * @parm ULONG | cwc | Number of characters in the word * * @rvalue E_POINTER | The input buffer is NULL. * ***************************************************************/ STDMETHODIMP CITIndexObjBridge::PutWord(WCHAR const *pwcInBuf, ULONG cwc) { HRESULT hr; DWORD cbAnsi; if (pwcInBuf == NULL) return (E_POINTER); cbAnsi = (sizeof(WCHAR) * cwc) + sizeof(WORD); if (SUCCEEDED(hr = ReallocBuffer(&m_hmemDestNorm, &m_cbBufDestNormCur, cbAnsi))) { char *lpchBuf; lpchBuf = (char *) _GLOBALLOCK(m_hmemDestNorm); if ((*((WORD *)lpchBuf) = (WORD) WideCharToMultiByte(m_dwCodePageID, NULL, pwcInBuf, cwc, lpchBuf + sizeof(WORD), cbAnsi - sizeof(WORD), NULL, NULL)) == 0) hr = E_UNEXPECTED; _GLOBALUNLOCK(m_hmemDestNorm); } return (hr); } /**************************************************************** * @method STDMETHODIMP | IStemSink | PutAltWord | * Notifies IStemSink of a word that is similar to the input word * of method. 
* @parm WCHAR const | *pwcInBuf | Pointer to the word * @parm ULONG | cwc | Number of characters in the word * * @rvalue S_OK | This method always returns success. * * @comm * InfoTech Search only supports getting back one stemmed version * of the raw word. Any others are ignored. * @xref ***************************************************************/ STDMETHODIMP CITIndexObjBridge::PutAltWord(WCHAR const *pwcInBuf, ULONG cwc) { // We only support getting back one stemmed version of the raw word, // so we ignore all the others. return (S_OK); } //--------------------------------------------------------------------------- // Other Public Method Implementations //--------------------------------------------------------------------------- // By the time this method is called, we assume the breaker has been fully // initialized via IWordBreakerConfig (if present) and via IWordBreaker::Init. STDMETHODIMP CITIndexObjBridge::SetWordBreaker(PIWBRK piwbrk) { LCID lcid; if (piwbrk == NULL) return (SetErrReturn(E_POINTER)); if (m_piwbrk != NULL) return (SetErrReturn(E_ALREADYINIT)); // Pick up IWordBreakerConfig if its there, otherwise we'll go without it. // Do the same for IStemmer if we got IWordBreakerConfig. if (SUCCEEDED(piwbrk->QueryInterface(IID_IWordBreakerConfig, (LPVOID *) &m_piwbrkc))) m_piwbrkc->GetWordStemmer(&m_pistem); // Pick up IITStopWordList if its there, otherwise we'll go without it. piwbrk->QueryInterface(IID_IITStopWordList, (LPVOID *) &m_piitstwdl); if (m_piwbrkc == NULL || FAILED(m_piwbrkc->GetLocaleInfo(&m_dwCodePageID, &lcid))) m_dwCodePageID = GetACP(); (m_piwbrk = piwbrk)->AddRef(); return (S_OK); } // NOTE: If CITIndexObjBridge::BreakText was going to provide more than // one buffer's worth of text to the COM breaker, then the very first members of // CITIndexObjBridge would be made to match those of TEXT_SOURCE so that // FillTextSource callback could call back into us (by casting the TEXT_SOURCE // param passed to it). 
Otherwise, we would have no way of providing // object-oriented breaking - we would have to resort to using globals. SCODE __stdcall FillTextSource(TEXT_SOURCE *pTextSource) { // We always return failure to signify no more text. return E_FAIL; } STDMETHODIMP CITIndexObjBridge::BreakText(PEXBRKPM pexbrkpm) { HRESULT hr = S_OK; if (m_piwbrk == NULL) return (E_UNEXPECTED); if (pexbrkpm == NULL) return (SetErrReturn(E_POINTER)); if (pexbrkpm->lpbBuf == NULL) return (SetErrReturn(E_INVALIDARG)); // Configure word breaker if we got IWordBreakerConfig; otherwise, // check values in *pexbrkpm to see if they are compatible with defaults. if (m_piwbrkc != NULL) { DWORD grfBreakFlags; if (SUCCEEDED(hr = m_piwbrkc->SetBreakWordType(pexbrkpm->dwBreakWordType)) && SUCCEEDED(hr = m_piwbrkc->GetControlInfo(&grfBreakFlags, NULL))) { SetGrfFlag(&grfBreakFlags, IITWBC_BREAK_ACCEPT_WILDCARDS, (pexbrkpm->fFlags & ACCEPT_WILDCARD)); hr = m_piwbrkc->SetControlInfo(grfBreakFlags, NULL); } } else { if (pexbrkpm->dwBreakWordType != IITWBC_BREAKTYPE_TEXT) hr = E_NOTSUPPORTED; } if (SUCCEEDED(hr)) { DWORD cwch; m_fNormWord = FALSE; m_pexbrkpm = pexbrkpm; cwch = pexbrkpm->cbBufCount; if (SUCCEEDED(hr = ReallocBuffer(&m_hmemSrc, &m_cbBufSrcCur, sizeof(WCHAR) * cwch))) { WCHAR *lpwchBuf; lpwchBuf = (WCHAR *) _GLOBALLOCK(m_hmemSrc); // Convert the text source buffer to Unicode. if ((cwch = MultiByteToWideChar(m_dwCodePageID, NULL, (LPCSTR) pexbrkpm->lpbBuf, pexbrkpm->cbBufCount, lpwchBuf, cwch)) > 0) { TEXT_SOURCE txtsrc; txtsrc.pfnFillTextBuffer = FillTextSource; txtsrc.awcBuffer = lpwchBuf; txtsrc.iCur = 0; txtsrc.iEnd = cwch; // Send the Unicode text buffer to the breaker. hr = m_piwbrk->BreakText(&txtsrc, (PIWRDSNK) this, NULL); } else hr = E_UNEXPECTED; _GLOBALUNLOCK(m_hmemSrc); } m_pexbrkpm = NULL; } return (hr); } // The stop word is in WORD length prefix format. 
// Converts the MBCS stop word to Unicode and asks the breaker's stop word
// list whether it contains it.
//
//  lpbStopWord - WORD byte-length prefix followed by the MBCS characters.
//
// Returns E_POINTER for a NULL word, E_NOTIMPL if the breaker exposed no
// IITStopWordList, E_UNEXPECTED if the word cannot be converted (this
// includes a zero-length word), otherwise the result of LookupWord.
STDMETHODIMP
CITIndexObjBridge::LookupStopWord(LPBYTE lpbStopWord)
{
    HRESULT hr;
    DWORD   cwch;
    DWORD   cbAnsi;
    WCHAR  *lpwchBuf;

    if (lpbStopWord == NULL)
        return (SetErrReturn(E_POINTER));

    if (m_piitstwdl == NULL)
        return (SetErrReturn(E_NOTIMPL));

    // One WCHAR per MBCS byte is always enough for the Unicode form.
    cwch = cbAnsi = (DWORD)(*((WORD *) lpbStopWord));

    if (FAILED(hr = ReallocBuffer(&m_hmemSrc, &m_cbBufSrcCur,
                                  sizeof(WCHAR) * cwch)))
        return (hr);

    lpwchBuf = (WCHAR *) _GLOBALLOCK(m_hmemSrc);

    // Convert the stop word to Unicode.
    cwch = MultiByteToWideChar(m_dwCodePageID, 0,
                               (LPCSTR) lpbStopWord + sizeof(WORD), cbAnsi,
                               lpwchBuf, cwch);
    if (cwch > 0)
    {
        // Lookup the stop word.
        hr = m_piitstwdl->LookupWord(lpwchBuf, cwch);
    }
    else
        hr = E_UNEXPECTED;

    _GLOBALUNLOCK(m_hmemSrc);

    return (hr);
}


// Stem the raw word and return result in lpbStemWord.
// Both word buffers are in WORD length prefix format.
//
// The raw word is converted to Unicode and handed to the stemmer with this
// object as the IStemSink; IStemSink::PutWord leaves the stem in the
// normalized destination buffer, from where it is copied to lpbStemWord.
//
// Returns E_POINTER for NULL buffers, E_NOSTEMMER if the breaker supplied no
// stemmer, E_UNEXPECTED if the raw word cannot be converted or the stemmer
// reported success without any sink buffer ever having been allocated,
// E_WORDTOOLONG if the stem is longer than the raw word, or the stemmer's
// own failure code.
//
// NOTE(review): if the stemmer returns success without calling the sink, the
// normalized buffer may still hold an earlier word; nothing visible here
// distinguishes that case - confirm against the stemmer contract.
STDMETHODIMP
CITIndexObjBridge::StemWord(LPBYTE lpbStemWord, LPBYTE lpbRawWord)
{
    HRESULT hr;
    DWORD   cwch;
    DWORD   cbAnsi;
    WCHAR  *lpwchBuf;

    if (lpbStemWord == NULL || lpbRawWord == NULL)
        return (SetErrReturn(E_POINTER));

    if (m_pistem == NULL)
        return (SetErrReturn(E_NOSTEMMER));

    cwch = cbAnsi = (DWORD)(*((WORD *) lpbRawWord));

    if (FAILED(hr = ReallocBuffer(&m_hmemSrc, &m_cbBufSrcCur,
                                  sizeof(WCHAR) * cwch)))
        return (hr);

    lpwchBuf = (WCHAR *) _GLOBALLOCK(m_hmemSrc);

    // Convert the word to be stemmed to Unicode.
    cwch = MultiByteToWideChar(m_dwCodePageID, 0,
                               (LPCSTR) lpbRawWord + sizeof(WORD), cbAnsi,
                               lpwchBuf, cwch);
    if (cwch == 0)
        hr = E_UNEXPECTED;
    else if (SUCCEEDED(hr = m_pistem->StemWord(lpwchBuf, cwch,
                                               (PISTEMSNK) this)))
    {
        if (m_hmemDestNorm == NULL)
        {
            // The stemmer never delivered a word through the sink and the
            // buffer was never allocated, so there is nothing to copy.
            hr = E_UNEXPECTED;
        }
        else
        {
            char   *lpchStemBuf;
            WORD    cbStemWord;

            lpchStemBuf = (char *) _GLOBALLOCK(m_hmemDestNorm);

            // Copy stem word from the normalized word destination buffer
            // (where our implementation of IStemSink::PutWord put it) to
            // lpbStemWord as long as it is not longer than the raw word.
            if ((cbStemWord = *((WORD *) lpchStemBuf)) <= cbAnsi)
                MEMCPY(lpbStemWord, lpchStemBuf, cbStemWord + sizeof(WORD));
            else
                hr = E_WORDTOOLONG;

            _GLOBALUNLOCK(m_hmemDestNorm);
        }
    }

    _GLOBALUNLOCK(m_hmemSrc);

    return (hr);
}


// On entry, lpbTermHit is a WORD-prefixed MBCS string.
// On exit, *ppvTermHit is a WORD-prefixed Unicode string.
//
// The term is converted to Unicode, added to the (lazily created) term-hit
// list, and *ppvTermHit receives a pointer to the copy stored in that list.
// While the list is being built the stored length prefix is in BYTES; see
// AdjustQueryResultTerms.
//
// Returns E_POINTER for NULL parameters, E_NOTINIT if no code page has been
// established yet, E_UNEXPECTED if the term cannot be converted, or a
// failure from the list / buffer routines.
STDMETHODIMP
CITIndexObjBridge::AddQueryResultTerm(LPBYTE lpbTermHit, LPVOID *ppvTermHit)
{
    DWORD   cwch;
    DWORD   cbAnsi;
    HRESULT hr = S_OK;
    WCHAR  *lpwchBuf;

    if (lpbTermHit == NULL || ppvTermHit == NULL)
        return (SetErrReturn(E_POINTER));

    if (m_dwCodePageID == 0)
        return (SetErrReturn(E_NOTINIT));

    cwch = cbAnsi = (DWORD)(*((WORD *) lpbTermHit));

    // Create the term-hit list the first time a term is added.
    if (m_lpsipbTermHit == NULL)
    {
        m_lpsipbTermHit = MVStopListInitiate(IDXOBR_TERMHASH_SIZE, &hr);
        if (m_lpsipbTermHit == NULL)
        {
            // Never report success without a list (and without having set
            // *ppvTermHit), even if the initiate routine left hr untouched.
            if (SUCCEEDED(hr))
                hr = E_OUTOFMEMORY;
            return (hr);
        }
    }

    // When allocating the buffer, add 1 char to leave room for the
    // Unicode string's WORD prefix.
    if (FAILED(hr = ReallocBuffer(&m_hmemSrc, &m_cbBufSrcCur,
                                  sizeof(WCHAR) * (cwch + 1))))
        return (hr);

    lpwchBuf = (WCHAR *) _GLOBALLOCK(m_hmemSrc);

    // Convert lpbTermHit to Unicode before searching or storing it;
    // leave space in the Unicode buffer for the WORD length prefix.
    cwch = MultiByteToWideChar(m_dwCodePageID, 0,
                               (LPCSTR) lpbTermHit + sizeof(WORD), cbAnsi,
                               lpwchBuf + 1, cwch);
    if (cwch > 0)
    {
        // Store the Unicode string length, but restate it in bytes
        // since the stopword list lookup code assumes MBCS.
        *lpwchBuf = (WCHAR)((WORD) cwch * sizeof(WCHAR));

        // Add the word to the list and then get a pointer to it.
        if (SUCCEEDED(hr = MVStopListAddWord(m_lpsipbTermHit,
                                             (LPBYTE) lpwchBuf)))
        {
            hr = MVStopListFindWordPtr(m_lpsipbTermHit,
                                       (LST) lpwchBuf, (LST *) ppvTermHit);
        }
    }
    else
        hr = E_UNEXPECTED;

    _GLOBALUNLOCK(m_hmemSrc);

    return (hr);
}


// This method should only be called after a query term hit list has been
// completely built.  It will iterate over all the terms and reduce the
// length prefixes from byte-based to WCHAR-based - i.e. the lengths
// will be divided by two.
Once this method has been called, it will // no longer be possible to search for terms in the term list. // We do this so that the direct pointer refs to terms that end up in the // query result list point to correct WCHAR-based length prefixes. STDMETHODIMP CITIndexObjBridge::AdjustQueryResultTerms(void) { if (m_lpsipbTermHit != NULL) { LST lstWord; LONG lWordInfo = -1L; LPVOID pvWordInfo = NULL; while (SUCCEEDED(MVStopListEnumWords(m_lpsipbTermHit, &lstWord, &lWordInfo, &pvWordInfo))) { ITASSERT(*((WORD *)lstWord) % sizeof(WCHAR) == 0); *((WORD *)lstWord) /= sizeof(WCHAR); } } return (S_OK); } //--------------------------------------------------------------------------- // Private Method Implementations //--------------------------------------------------------------------------- HRESULT CITIndexObjBridge::ReallocBuffer(HGLOBAL *phmemBuf, DWORD *pcbBufCur, DWORD cbBufNew) { return (ReallocBufferHmem(phmemBuf, pcbBufCur, max(cbBufNew, cbConvBufInit))); }