//#define _DUMPALL
/*************************************************************************
*                                                                        *
*  SIMILAR.C                                                             *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1996                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent:                                                        *
*                                                                        *
*  Search Core Engine: Find Similar functionality                        *
*                                                                        *
**************************************************************************
*
*  Revision History:
*
*      09/24/96  kevynct  Started from algorithm notes (4 hrs)
*      09/25/96  kevynct  Implemented skeleton of ProcessSimilarityTerm (1 hr)
*      09/26/96  kevynct  More work on inner loop and relevant list (5 hrs)
*      09/27/96  kevynct  Query parsing, weighting, and sorting (6 hrs)
*      10/01/96  kevynct  Incorporate into MV2.0b (10 min)
*      10/02/96  kevynct  Clean-up query code, start resolve query code (4 hrs)
*      10/03/96  kevynct  Resolve query code (2 hrs)
*      10/11/96  kevynct  Start bucket routines (2 hrs)
*      10/13/96  kevynct  Finish bucket routines, write node processor, cleanup (6 hrs)
*      10/14/96  kevynct  Clean-up, remove compilation errors, debugging (6 hrs)
*      10/24/96  kevynct  Convert to two-phase query resolution (3 hrs)
*      10/25/96  kevynct  Fix sort by cTopics, debug new query resolution, try new weighting (2 hrs)
*      11/26/96  kevynct  Testing, fix and improve weighting and accumulation: aliases, digits (8 hrs)
*      12/02/96  kevynct  More weighting tests (8 hrs)
*
*  Work remaining:
*
*      Investigate field and stemming support
*
*      Use probabilistic upperbounds for pruning.  Remove single-term nodes after each term process
*      Test current bucket method vs. exact scores w/ heap
*
**************************************************************************
*
*  Current Owner: KevynCT
*
*************************************************************************/

#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <orkin.h>
#include <mvsearch.h>
#include <math.h>
#include <groups.h>
#include "common.h"
#include "search.h"

#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__;  // Used by error return functions.
#endif

#define FGetDword(a,b,c) (*DecodeTable[b.cschScheme])(a, b, c)
#define IS_DIGIT(p) ((p) >= '0' && (p) <= '9')

// These are in case the doc scoring is approximate: they tell which
// direction to err on the side of.
#define ROUND_DOWN  0
#define ROUND_UP    1

#define SCORE_BLOCK_SIZE    32
#define NUM_SCORE_BLOCKS    (MAX_WEIGHT/SCORE_BLOCK_SIZE)

typedef struct tagDocScoreList {
    HANDLE hMem;
    int cScoresLeft;
    int iBucketLowest;
    int iHighestScore;
    int rgiScores[NUM_SCORE_BLOCKS + 1];
} DSL, FAR *_LPDSL;
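
// Note on the bucket scheme: document scores in [0, MAX_WEIGHT] are
// quantized into buckets of SCORE_BLOCK_SIZE points, so rgiScores[i]
// counts the retained documents whose score lies in
// [i*SCORE_BLOCK_SIZE, (i+1)*SCORE_BLOCK_SIZE). E.g. with
// SCORE_BLOCK_SIZE == 32, a score of 70 falls in bucket 70/32 == 2.
// iBucketLowest tracks the lowest non-empty bucket once cScoresLeft
// reaches 0, and iHighestScore holds the exact best score seen so far.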

PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs); // ftsearch.c
PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode,
    LST lstTermWord, LST lstBtreeWord, DWORD dwBtreeFieldId, char []); // ftsearch.c
PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord); // qtparse.c

__inline LPVOID InitDocScoreList(int cScores);
__inline void FreeDocScoreList(LPV lpDocScores);
__inline int GetMaxDocScore(_LPDSL lpDocScores);
__inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp);
BOOL UpdateDocScoreList(_LPDSL lpDocScores, int iOldScore, int i);
__inline BOOL IsDocScoreListFull(_LPDSL lpdsl);
__inline WORD AddWeights(DWORD w1, DWORD w2);
int GetSortedDocScore(_LPDSL lpDocScores, int iThis, BOOL fRoundUp);
#if defined(_DEBUG)
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo);
#endif
__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc);

PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr);
PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree);
PRIVATE VOID PASCAL SetStringWeights (LPQI lpQueryInfo);
PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
    LFO lfoWordOffset, LPQI lpqi);

__inline LPVOID InitDocScoreList(int cScores)
{
    _LPDSL lpdsl;

    if ((lpdsl = (_LPDSL)GlobalLockedStructMemAlloc(sizeof(DSL))) == NULL)
        return NULL;

    lpdsl->cScoresLeft = cScores;
    lpdsl->iHighestScore = 0;
    lpdsl->iBucketLowest = -1;
    return (LPV)lpdsl;
}

__inline void FreeDocScoreList(LPV lpDocScores)
{
    if ((_LPDSL)lpDocScores)
        GlobalLockedStructMemFree((_LPDSL)lpDocScores);
}

__inline int GetMaxDocScore(_LPDSL lpDocScores)
{
    return lpDocScores->iHighestScore;
}

__inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp)
{
    if (lpDocScores->iBucketLowest >= 0)
        return (lpDocScores->iBucketLowest + !!fRoundUp) * SCORE_BLOCK_SIZE;

    return 0;
}
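
// GetSortedDocScore returns (to bucket precision) the score of the
// cThis-th best document currently in the list: it walks buckets downward
// from the one holding iHighestScore, subtracting each bucket's count from
// cThis until the bucket containing the cThis-th document is reached.
// E.g. with counts {bucket 5: 3 docs, bucket 2: 4 docs}, cThis == 5 stops
// in bucket 2, yielding 64 (ROUND_DOWN) or 96 (ROUND_UP).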
int GetSortedDocScore(_LPDSL lpdsl, int cThis, BOOL fRoundUp)
{
    LPINT lpi, lpiFirst;

    if (lpdsl->iHighestScore < 0)
        return 0;

    lpiFirst = &lpdsl->rgiScores[0];

    for (lpi = &lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE];
        lpi >= lpiFirst; cThis -= *lpi, lpi--)
    {
        if (cThis <= *lpi)
            return ((lpi - lpiFirst) + !!fRoundUp) * SCORE_BLOCK_SIZE;
    }
    return (!!fRoundUp * SCORE_BLOCK_SIZE);
}

#if defined(_DEBUG)
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo)
{
    LPINT lpi, lpiMax;
    int iT = 0;
    int i;

    lpi = &lpdsl->rgiScores[0];
    lpiMax = lpi + NUM_SCORE_BLOCKS;
    for (i = 0; lpi < lpiMax; lpi++, i++)
    {
        if (*lpi)
        {
            _DPF2("Score %d (count %d)\n", i, *lpi);
        }
        iT += *lpi;
    }
    _DPF1("%d topics in scorelist\n", iT);

    return TRUE;
}
#endif
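
// UpdateDocScoreList maintains the top-N "club" of document scores as a
// bucket histogram. Once the list is full (cScoresLeft == 0), a new or
// improved score is admitted only if its bucket beats iBucketLowest; the
// displaced entry (the updated document's old bucket, else the lowest
// bucket) is decremented, and iBucketLowest is revised upward when its
// bucket empties. Returns TRUE if the list was updated. Pass a negative
// iOldScore (e.g. -1) for a document not yet in the list.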
BOOL UpdateDocScoreList(_LPDSL lpdsl, int iOldScore, int iScore)
{
    int iThis = iScore/SCORE_BLOCK_SIZE;
    // iOldScore < 0 means the doc is not yet in the list; map it to -1
    // explicitly, since -1/SCORE_BLOCK_SIZE truncates to 0 and would be
    // mistaken for bucket 0.
    int iOld = (iOldScore < 0) ? -1 : iOldScore/SCORE_BLOCK_SIZE;

    if (lpdsl->cScoresLeft <= 0)
    {
        // already full, figure out which buckets need updating
        if (iThis > lpdsl->iBucketLowest)
        {
            // if we're updating an existing entry, remove that;
            // otherwise remove the lowest one
            if (iOld >= 0 && iOld >= lpdsl->iBucketLowest)
                lpdsl->rgiScores[iOld]--;
            else
                lpdsl->rgiScores[lpdsl->iBucketLowest]--;

            // then make sure lowest one is still non-empty; if not,
            // revise upwards
            if (lpdsl->rgiScores[lpdsl->iBucketLowest] <= 0)
            {
                for (lpdsl->iBucketLowest++; lpdsl->iBucketLowest <= iThis; lpdsl->iBucketLowest++)
                    if (lpdsl->rgiScores[lpdsl->iBucketLowest])
                        break;
add_new_doc:
                if (lpdsl->iBucketLowest >= 0)
                    lpdsl->iBucketLowest = min(lpdsl->iBucketLowest, iThis);
                else
                    lpdsl->iBucketLowest = iThis;
            }

            // then add the new entry
            lpdsl->rgiScores[iThis]++;
update_highest_score:
            if (iScore > lpdsl->iHighestScore)
                lpdsl->iHighestScore = iScore;

#if defined(_DEBUG) && defined(_DUMPALL)
            //DumpDocScoreList(lpdsl, NULL);
#endif
            Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
            return TRUE;
        }
        else
            if (iThis == lpdsl->iBucketLowest)
                goto update_highest_score;

        Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
        return FALSE;
    }

    // doc score list is not yet full, so automatically add if new,
    // remove old if update
    if (iOld >= 0 && iOld >= lpdsl->iBucketLowest)
        lpdsl->rgiScores[iOld]--;
    else
        lpdsl->cScoresLeft--;
    goto add_new_doc;
}

__inline BOOL IsDocScoreListFull(_LPDSL lpdsl)
{
    return (lpdsl->cScoresLeft <= 0);
}
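
// AddWeights is a saturating add: weights accumulate but clamp at
// MAX_WEIGHT instead of wrapping. Assuming MAX_WEIGHT is 65535 (as the
// 0xFFFF weights elsewhere in this file suggest), 60000 + 30000 yields
// 65535 rather than a wrapped 16-bit sum.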
__inline WORD AddWeights(DWORD w1, DWORD w2)
{
    return (WORD)min(MAX_WEIGHT, w1 + w2);
}

/*************************************************************************
 *  @doc    EXTERNAL API RETRIEVAL
 *
 *  @func   LPHL FAR PASCAL | MVIndexFindSimilar |
 *      Given a query, which typically represents a document text stream,
 *      returns a hitlist containing topics determined to be similar to
 *      the query using nearest-neighbour searching.
 *
 *  @parm   LPIDX | lpidx |
 *      Pointer to index information.
 *
 *  @parm   LPPARSE_PARMS | lpParms |
 *      Pointer to query parse parameters (query buffer, breaker table,
 *      group)
 *
 *  @parm   PSRCHINFO | pSrchInfo |
 *      Pointer to search information data
 *
 *  @parm   _LPGROUP | lpResGroup |
 *      Pointer to resulting group
 *
 *  @parm   LPVOID | pCallback |
 *      Pointer to callback struct FCALLBACK_MSG (optional)
 *
 *  @parm   PHRESULT | phr |
 *      Pointer to error buffer
 *
 *  @rdesc  Pointer to a hitlist structure on success, even if there are
 *      no hits (use MVHitListEntries() to find out how many hits were
 *      returned). Returns NULL on failure; the error buffer (see
 *      IndexOpen()) will describe the cause of the failure. There is one
 *      special case in which the function returns a non-NULL pointer even
 *      though an error occurred: when the result cannot be written to
 *      disk but everything is still in memory.
 *
 *************************************************************************/
// bugbug: handle wildcards
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (_LPIDX lpidx,
    LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, _LPGROUP lpResGroup,
    LPVOID pCallback, PHRESULT phr)
{
    HRESULT fRet;           // Return from this function.
    LPRETV lpRetV;          // Retrieval memory/files.
    GHANDLE hRetv;
    //OCCF occf;            // Index occurrence flags temporary variable.
    _LPHL lphl;             // Pointer to hitlist
    _LPQTNODE lpTreeTop;
    HANDLE hTreeTop = NULL;
    _LPQT lpqt;

    if (lpidx == NULL || lpParms == NULL || pSrchInfo == NULL)
    {
        /* We got some bad arguments!! */
        SetErrCode (phr, E_INVALIDARG);
        return NULL;
    }

    if (NULL == (lpqt = TokenizeFlatQuery(lpParms, pSrchInfo, phr)))
    {
        // errb was set
        return NULL;
    }

    fRet = E_FAIL;  // Assume things will go wrong

    // Transfer all the information about the index to the query tree
    lpqt->foIdxRoot = lpidx->ih.foIdxRoot;      /* Top node offset */
    lpqt->dwBlockSize = lpidx->ih.dwBlockSize;  /* Index block size */
    lpqt->cIdxLevels = lpidx->ih.cIdxLevels;    /* Index's depth */
    lpqt->occf = lpidx->ih.occf;
    lpqt->idxf = lpidx->ih.idxf;
    lpqt->ckeyTopicId = lpidx->ih.ckeyTopicId;
    lpqt->ckeyOccCount = lpidx->ih.ckeyOccCount;
    lpqt->ckeyWordCount = lpidx->ih.ckeyWordCount;
    lpqt->ckeyOffset = lpidx->ih.ckeyOffset;

    if (pSrchInfo->dwMemAllowed)
    {
        // allocate document result list
        // no occurrence info is returned for similarity query
        SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed /
            (sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));

        SetBlockCount (lpqt->lpOccMemBlock, 1);
    }

    if (pCallback)
        MVSearchSetCallback(lpqt, pCallback);

    /* Allocate hitlist */
    if ((lphl = (_LPHL)GlobalLockedStructMemAlloc(sizeof (HL))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        SetErrCode(phr, fRet);
exit00:
        if (lpqt)
        {
            FreeDocScoreList(lpqt->lpDocScores);
            MVQueryFree(lpqt);
        }

        if (lphl && fRet != S_OK && fRet != E_TOOMANYTOPICS)
        {
            MVHitListDispose(lphl);
            lphl = NULL;
        }
        return (LPHL)lphl;
    }
    lphl->lLastTopicId = 0xffffffff;
    lphl->lcMaxTopic = lpidx->ih.lcTopics;

    /* Allocate a return value structure */

    if ((hRetv = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        sizeof(RETV))) == NULL)
    {
        SetErrCode(phr, E_OUTOFMEMORY);
        goto exit00;
    }

    lpRetV = (LPRETV)_GLOBALLOCK(hRetv);
    lpRetV->lpqt = lpqt;

    if ((fRet = TopNodeRead(lpidx)) != S_OK)
    {
        SetErrCode (phr, fRet);
exit02:
        FreeHandle(hRetv);
        goto exit00;
    }

    //
    // Count the number of occurrence fields present. My retrieval
    // occurrence record is going to cost 4 bytes per field.
    //

    //occf = lpqt->occf;
    //for (lpRetV->cOccFields = 0; occf; lpRetV->cOccFields++)
    //    occf &= occf - 1;

    lpqt->dwOccSize = lpRetV->dwOccSize = 0;
    //sizeof(OCCURENCE) + lpRetV->cOccFields * sizeof (DWORD);

    lpRetV->fRank = TRUE;   //((pSrchInfo->Flag &
                            //(QUERYRESULT_RANK | QUERYRESULT_NORMALIZE)) != 0);

    // Set pointers to the various buffers
    lpRetV->LeafInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->LeafInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->LeafInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->LeafInfo.hfpbIdx = lpidx->hfpbIdxSubFile;   // Index file to read from

    lpRetV->DataInfo.pTopNode = lpidx->lrgbTopNode;
    lpRetV->DataInfo.pStemNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pLeafNode = lpRetV->pNodeBuf;
    lpRetV->DataInfo.pDataNode = lpRetV->pDataBuf;
    lpRetV->DataInfo.hfpbIdx = lpidx->hfpbIdxSubFile;   // Index file to read from
    lpRetV->lcid = lpidx->ih.lcid;

    // Save search information
    lpRetV->SrchInfo = *pSrchInfo;
    if (pSrchInfo->dwValue == 0)
        lpRetV->SrchInfo.dwValue = (DWORD)(-1);
    else
        lpRetV->SrchInfo.dwValue = lpidx->ih.lcTopics/pSrchInfo->dwValue;
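
    // At this point SrchInfo.dwValue holds lcTopics/dwValue, i.e. a
    // topic-frequency ceiling: when LARGEQUERY_SEARCH is set, terms that
    // appear in more than this many topics are dropped as noise words
    // (see the cTopic test in GetWordInfoList below).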

    // this is a dummy node that we pass in to hold all term results
    if ((lpTreeTop = (_LPQTNODE)_GLOBALLOCK( \
        hTreeTop = _GLOBALALLOC(GHND, sizeof (QTNODE)))) == NULL)
    {
        SetErrCode(phr, fRet = E_OUTOFMEMORY);
        goto exit02;
    }
    QTN_FLAG(lpTreeTop) = EXACT_MATCH;
    lpTreeTop->pNext = NULL;
    lpTreeTop->pPrev = NULL;
    lpTreeTop->lpTopicList = NULL;

    if ((fRet = ResolveFlatQuery(lpqt, lpTreeTop, lpRetV)) != S_OK)
    {
        SetErrCode (phr, fRet);

        /* Free the Topic and Occurrence memory blocks since they are
         * not freed by QueryTreeFree(), or MVHitListDispose() at this
         * point
         */

        if (fRet != E_TOOMANYTOPICS)
        {
            BlockFree ((LPV)lpqt->lpTopicMemBlock);
            BlockFree ((LPV)lpqt->lpOccMemBlock);
            lpqt->lpTopicMemBlock = NULL;
            lpqt->lpOccMemBlock = NULL;
exit03:
            if (hTreeTop)
            {
                _GLOBALUNLOCK(hTreeTop);
                _GLOBALFREE(hTreeTop);
            }
            goto exit02;
        }
    }

    /* Create a group if requested */
    if ((pSrchInfo->Flag & QUERYRESULT_GROUPCREATE) && lpResGroup)
    {
        LPITOPIC lpCurTopic;    /* Topic's current pointer */
        LPB lpbGrpBitVect;
        DWORD maxTopicId;

        /* Initialize the pointer */
        lpbGrpBitVect = lpResGroup->lpbGrpBitVect;

        maxTopicId = lpResGroup->dwSize * 8;
        for (lpCurTopic = QTN_TOPICLIST(lpTreeTop); lpCurTopic;
            lpCurTopic = lpCurTopic->pNext)
        {
            /* Set the bit */
            if (lpCurTopic->dwTopicId < maxTopicId)
            {
                lpbGrpBitVect[(DWORD)(lpCurTopic->dwTopicId / 8)] |= 1 <<
                    (lpCurTopic->dwTopicId % 8);
            }
        }
    }

    if ((pSrchInfo->Flag & QUERYRESULT_UIDSORT) == 0)
    {
        /* Sort the result depending on ranking or not */
        if (lpRetV->fRank)
            SortResult ((LPQT)lpqt, lpTreeTop, WEIGHT_BASED);
        else
            SortResult ((LPQT)lpqt, lpTreeTop, HIT_COUNT_BASED);
    }

    /* Update HitList info structure, cut off the unwanted list */
    if (lphl->lpTopicList = lpTreeTop->lpTopicList)
        lphl->lcReturnedTopics = lphl->lcTotalNumOfTopics = lpTreeTop->cTopic;

    // Only return the number of topics that the user requested;
    // if dwTopicCount == 0, it means that the user wants all of them

    if (pSrchInfo->dwTopicCount != 0 &&
        pSrchInfo->dwTopicCount < lphl->lcReturnedTopics)
        lphl->lcReturnedTopics = pSrchInfo->dwTopicCount;

    lphl->lpOccMemBlock = lpqt->lpOccMemBlock;
    lphl->lpTopicMemBlock = lpqt->lpTopicMemBlock;

#if 1
    /* WARNING: The following code should be commented out for
     * diskless devices. No returned error is checked, since
     * if disk writes fail, everything is still in memory
     */

    if ((pSrchInfo->Flag & QUERYRESULT_IN_MEM) == 0)
    {
        if ((fRet = MVHitListFlush (lphl, lphl->lcReturnedTopics)) != S_OK)
            SetErrCode (phr, fRet);
    }
#endif

    fRet = S_OK;
    goto exit03;
}

PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr)
{
    HRESULT fRet;           // Return value.
    HANDLE hqi;             // Handle to "lpqi".
    HANDLE hibi;            // Handle to internal breaker info
    HANDLE hQuery;          // Handle to secondary query buffer
    LPQI lpQueryInfo;       // Query information.
    LPIBI lpibi;            // Pointer to internal breaker info
    LPB lpbQueryBuf;        // Copy of query's buffer
    _LPQT lpQueryTree;      // Query tree pointer
    BRK_PARMS brkParms;     // Breaker info parms
    LPCHARTAB lpCharTabInfo;// Pointer to character table's info

    /* LPPARSE_PARMS structure break-out variables */
    BYTE FAR CONST *lpbQuery;   // Query buffer
    DWORD cbQuery;              // Query length
    LPBRKLIST lpfnTable;        // DType function table
    LPGROUP lpGroup;            // Group

    lpbQuery = lpParms->lpbQuery;
    cbQuery = lpParms->cbQuery;
    lpfnTable = lpParms->lpfnTable;
    lpGroup = lpParms->lpGroup;

    if (lpfnTable == NULL)
    {
        SetErrCode(phr, E_BADBREAKER);
        return NULL;
    }

    if (cbQuery == 0 || lpbQuery == NULL) {
        SetErrCode(phr, E_NULLQUERY);
        return NULL;
    }

    lpQueryTree = NULL;
    hqi = hibi = hQuery = NULL;
    fRet = E_FAIL;

    if ((hqi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        (LCB)sizeof(QUERY_INFO))) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpQueryInfo = (LPQI)_GLOBALLOCK(hqi);
    lpQueryInfo->lperrb = phr;
    lpQueryInfo->lpOpSymTab = NULL; // not used for similarity
    lpQueryInfo->cOpEntry = 0;

    /* Allocate a breaker info block used by different breakers */
    if ((hibi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
        (LCB)sizeof(IBI))) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpibi = (LPIBI)_GLOBALLOCK(hibi);

    /* Set the default breaker function, and stop list */
#ifndef CW
    lpQueryInfo->lpfnBreakFunc = lpfnTable[0].lpfnBreakFunc;
#endif
    lpQueryInfo->lpStopListInfo = lpfnTable[0].lpStopListInfo;

    if ((lpCharTabInfo = lpQueryInfo->lpCharTab =
        lpfnTable[0].lpCharTab) == NULL)
    {
        /* Default character and ligature tables */

        lpCharTabInfo = lpQueryInfo->lpCharTab = MVCharTableGetDefault (phr);
        if (lpCharTabInfo == NULL)
        {
            fRet = SetErrCode(phr, E_NOHANDLE);
            goto ErrFreeAll;
        }
        lpQueryInfo->fFlag |= FREE_CHARTAB;
    }

    /* Change the class of '*' and '?' to wildcard */

    ((LPCMAP)lpCharTabInfo->lpCMapTab)['*'].Class = CLASS_WILDCARD;
    ((LPCMAP)lpCharTabInfo->lpCMapTab)['?'].Class = CLASS_WILDCARD;

    switch (lpCharTabInfo->fFlag)
    {
        case USE_DEF_LIGATURE:
            lpCharTabInfo->wcLigature = DEF_LIGATURE_COUNT;
            lpCharTabInfo->lpLigature = LigatureTable;
            break;

        case NO_LIGATURE:
            lpCharTabInfo->wcLigature = 0;
            lpCharTabInfo->lpLigature = NULL;
    }

    // not used for similarity
    lpQueryInfo->lpStack = NULL;

    /* Allocate a query tree */
    if ((lpQueryTree = (_LPQT)QueryTreeAlloc()) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }

    /* Associate the query tree with the query. In the future, this will
     * ensure the capability to have several queries and query trees
     * at once
     */
    lpQueryInfo->lpQueryTree = (LPQT)lpQueryTree;

    /* Default arguments */

    lpQueryTree->iDefaultOp = (BYTE)OR_OP;
    lpQueryTree->lpGroup = lpGroup;             // Use default Group
    lpQueryTree->dwFieldId = 0;//DW_NIL_FIELD;  // No fieldid search
    lpQueryTree->cStruct.dwKey = CALLBACKKEY;

    lpQueryTree->fFlag = 0;
    lpQueryTree->wProxDist = 0;

    if (NULL == (lpQueryTree->lpDocScores = InitDocScoreList(pSrchInfo->dwTopicCount)))
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }

    /* Copy the query into a temporary buffer since we are going to make
       changes to it
     */
    if ((hQuery = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)cbQuery + 2)) == NULL)
    {
        fRet = SetErrCode(phr, E_OUTOFMEMORY);
        goto ErrFreeAll;
    }
    lpbQueryBuf = lpQueryInfo->lpbQuery = (LPB)_GLOBALLOCK(hQuery);
    lpbQueryBuf[cbQuery] = ' ';     // Add a space to help low-level transformation
    lpbQueryBuf[cbQuery + 1] = 0;   // Zero-terminated string (safety bytes)
    MEMCPY(lpbQueryBuf, lpbQuery, cbQuery);

    //
    // Word-break between here and there.
    //

    brkParms.lpInternalBreakInfo = lpibi;
    brkParms.lpbBuf = lpbQueryBuf;
    brkParms.cbBufCount = cbQuery;
    brkParms.lcbBufOffset = 0;
    brkParms.lpvUser = lpQueryInfo;
    brkParms.lpfnOutWord = (FWORDCB)FFlatCallBack;
    brkParms.lpStopInfoBlock = lpQueryInfo->lpStopListInfo;
    brkParms.lpCharTab = lpQueryInfo->lpCharTab;
    brkParms.fFlags = ACCEPT_WILDCARD;

    if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
        != S_OK)
    {
        fRet = SetErrCode(phr, (WORD)fRet);
        goto ErrFreeAll;
    }

    /* Flush the word breaker */
    brkParms.lpbBuf = NULL;
    brkParms.cbBufCount = 0;

    if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
        != S_OK)
    {
        fRet = SetErrCode(phr, fRet);
        goto ErrFreeAll;
    }

    /* Set the position of the pointer to report a missing term at
       the end of the query. -1 since the offset starts at 0
     */
    lpQueryInfo->dwOffset = cbQuery - 1;

    fRet = S_OK;

ErrFreeAll:
    /* Free the charmap table (only valid to inspect if hqi was allocated) */
    if (hqi && (lpQueryInfo->fFlag & FREE_CHARTAB))
        MVCharTableDispose (lpQueryInfo->lpCharTab);

    /* Free query info */
    if (hqi)
    {
        FreeHandle(hqi);
    }

    /* Free internal breaker info */
    if (hibi)
    {
        FreeHandle(hibi);
    }

    /* Free internal query buffer info */
    if (hQuery)
    {
        FreeHandle(hQuery);
    }

    if (fRet == S_OK)
        return lpQueryTree;

    if (lpQueryTree)
    {
        BlockFree(lpQueryTree->lpStringBlock);
        BlockFree(lpQueryTree->lpWordInfoBlock);
        BlockFree(lpQueryTree->lpOccMemBlock);
        BlockFree(lpQueryTree->lpTopicMemBlock);
        BlockFree(lpQueryTree->lpNodeBlock);

        FreeDocScoreList(lpQueryTree->lpDocScores);
        /* Free Query tree block */
        FreeHandle ((HANDLE)lpQueryTree->cStruct.dwReserved);
    }
    return NULL;
}

/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT FAR PASCAL | ProcessTerm |
 *      This function searches the index for the given word's data.
 *  @parm   _LPQT | lpqt |
 *      Pointer to index structure
 *  @parm   LPRETV | lpRetV |
 *      Pointer to "globals"
 *  @parm   _LPQTNODE | lpQtNode |
 *      Current node in the query tree containing important data:
 *      - The number of topics
 *      - The location of the data
 *      - The size of the data
 *      - Pointer to the next word (for wildcard search)
 *  @rdesc  S_OK or other errors
 *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL ProcessTerm(_LPQT lpqt, LPRETV lpRetV,
    _LPQTNODE lpResQuery, _LPQTNODE lpQtNode, STRING_TOKEN FAR *lpToken)
{
    DWORD dwTopicIDDelta;   // Topic-ID delta from previous sub-list.
    DWORD dwOccs;           // Number of occurrences in this sub-list.
    DWORD dwTmp;            // Scratch variable.
    WORD wWeight;           // Term-weight associated with this sub-list.
    WORD wWeightMax;
    DWORD dwTopicID;        // TopicId
    WORD wImportance;
    DWORD dwLength;         // Length of the word
    TOPIC_LIST FAR *lpResTopicList; // Result TopicList
    HRESULT fRet;           // Returned value
    PNODEINFO pDataInfo;
    DWORD dwTopicCount;
    _LPQT lpQueryTree;      // Query tree
    OCCF occf;
    BYTE fSkipOccList = FALSE;
    _LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);

    pDataInfo = &lpRetV->DataInfo;
    if ((pDataInfo->dwDataSizeLeft = lpQtNode->cbData) == 0)
        return(S_OK);   // There is nothing to process

    // Initialize variables
    occf = lpqt->occf;
    wImportance = QTN_TOKEN(lpQtNode)->wWeight;
    lpResTopicList = NULL;
    lpQueryTree = lpRetV->lpqt;
    dwTopicCount = lpQtNode->cTopic;
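
    // Default per-posting weight when the index carries no normalized
    // term weights: split a full weight of 65535 evenly across the term's
    // topic count, so rare terms contribute more per document. E.g. a
    // term in 2 topics contributes 32767 per posting; one in 10000 topics
    // contributes only 6.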
    wWeight = (WORD)(65535L/(lpToken ? lpToken->dwTopicCount : dwTopicCount));

    // Reset the topic count for lpQtNode so that it will not affect the
    // result in case lpResQuery == NULL

    lpQtNode->cTopic = 0;

    if (lpResQuery == NULL)
        lpResQuery = lpQtNode;

    // Initialize the data buffer node values
    pDataInfo->pBuffer = pDataInfo->pDataNode;
    pDataInfo->nodeOffset = lpQtNode->foData;

    // Read the data block
    if ((fRet = ReadNewData(pDataInfo)) != S_OK)
        return(fRet);

    dwTopicID = 0L;     // Init occurrence record
    dwLength = 0;

    // for each document in posting
    for (; dwTopicCount; dwTopicCount--)
    {
        /* Check for interrupt now and then */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
                lpqt->cStruct.Callback.dwFlags,
                lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(fRet);
        }

        // Byte align
        if (pDataInfo->ibit != cbitBYTE - 1)
        {
            pDataInfo->ibit = cbitBYTE - 1;
            pDataInfo->pCurPtr ++;
        }

        // Get value from which I will calculate current doc-ID.
        if ((fRet = FGetDword(pDataInfo, lpqt->ckeyTopicId,
            &dwTopicIDDelta)) != S_OK)
        {
exit0:
            return fRet;
        }

        dwTopicID += dwTopicIDDelta;
        //
        // Get term-weight if present. I'm going to get this
        // even if I'm not doing ranking, because it's in the
        // index, and I have to get around it somehow.
        //
        if (lpqt->idxf & IDXF_NORMALIZE)
        {
            if ((fRet = FGetBits(pDataInfo, &dwTmp, sizeof (USHORT) * cbitBYTE))
                != S_OK)
                goto exit0;

            if (wImportance != MAX_WEIGHT)
                dwTmp = (dwTmp * wImportance) / 65535;

            // BUGBUG: we actually want the weights for all aliased terms
            // to be considered at once.
            wWeight = (WORD)dwTmp;
        }

        // always skip any occurrence info
        if (occf & (OCCF_OFFSET | OCCF_COUNT))
        {
            // Figure out how many occurrences there are in this
            // sub-list.
            //
            if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOccCount,
                &dwOccs)) != S_OK)
                goto exit0;

            if ((fRet = SkipOccList (lpqt, pDataInfo, dwOccs)) != S_OK)
                goto exit0;
        }

        // If this search includes a group, and the doc is not in the
        // group then ignore it
        if (lpQueryTree->lpGroup
            && FGroupLookup(lpQueryTree->lpGroup, dwTopicID) == FALSE)
            continue;

        // calculate relevance upper bound Dr = Ds + sum(Qi) for this document
        if (lpResTopicList = TopicNodeSearch(lpQueryTree, lpResQuery, dwTopicID))
            wWeightMax = lpResTopicList->wWeight;
        else
            wWeightMax = 0;

        wWeightMax = AddWeights(wWeightMax, wWeight);
        wWeightMax = AddWeights(wWeightMax, QTN_TOKEN(lpQtNode)->wWeightRemain);
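
        // wWeightMax is now an optimistic ceiling on this document's final
        // score: its accumulated score so far, plus this term's
        // contribution, plus wWeightRemain (the most all not-yet-processed
        // terms could add). If even that ceiling cannot reach the lowest
        // retained score once the top-N list is full, the document can
        // never enter the result set and is dropped early.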
        if (wWeightMax < GetMinDocScore(lpDocScores, ROUND_DOWN)
            &&
            IsDocScoreListFull(lpDocScores))
        {
            // do not alloc; remove D from the result list if present
            if (lpResTopicList)
            {
                register LPITOPIC lpPrev, lpTmp;

                // find lpPrev
                // UNDONE: look into removing necessity for this loop
                for (lpPrev = NULL, lpTmp = (LPITOPIC)lpQtNode->lpTopicList; lpTmp;
                    lpTmp = lpTmp->pNext) {
                    if (lpTmp == (LPITOPIC)lpResTopicList)
                        break;
                    lpPrev = lpTmp;
                }

                TopicNodeFree(lpQueryTree, lpResQuery, lpPrev, lpResTopicList);
#if defined(_DEBUG) && defined(_DUMPALL)
                _DPF3("Remove topic %lu, wWeightMax = %lu, MinDocScore = %u\n", dwTopicID, \
                    wWeightMax, GetMinDocScore(lpDocScores, ROUND_DOWN));
#endif
            }
            // no need to update top-N docs since this wasn't one of them
            continue;
        }

        if (lpResTopicList)
        {
            WORD wOldWeight = lpResTopicList->wWeight;

            // Calc new Ds for this doc and if good enough for the club, ensure that
            // the club invariant is maintained; else leave it, since it could still
            // become a club member in the future
            lpResTopicList->wWeight = AddWeights(lpResTopicList->wWeight, wWeight);
            if (lpResTopicList->wWeight > GetMinDocScore(lpDocScores, ROUND_DOWN))
                UpdateDocScoreList(lpDocScores, wOldWeight, lpResTopicList->wWeight);

#if defined(_DEBUG) && defined(_DUMPALL)
            _DPF3("Update topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
                wWeightMax, lpResTopicList->wWeight);
#endif

            continue;
        }

        // a new document counter: possible club member, or not enough
        // total documents yet
        if ((lpResTopicList = TopicNodeAllocate(lpQueryTree)) == NULL)
        {
            fRet = E_TOOMANYTOPICS;
            goto exit0;
        }
        lpResTopicList->dwTopicId = dwTopicID;
        lpResTopicList->lpOccur = NULL;
        lpResTopicList->lcOccur = 0;
        lpResTopicList->wWeight = wWeight;

        /* Add the new TopicID node into TopicList */
        TopicNodeInsert (lpQueryTree, lpResQuery, lpResTopicList);
        UpdateDocScoreList(lpDocScores, -1, lpResTopicList->wWeight);

#if defined(_DEBUG) && defined(_DUMPALL)
        _DPF3("New topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
            wWeightMax, lpResTopicList->wWeight);
#endif

    } // end for each topic in posting

    fRet = S_OK;

    return fRet;
}

PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
    HRESULT fRet;
    PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
    LPB astBTreeWord = lpRetV->pBTreeWord;
    DWORD dwTotalTopic;
    LPB lstModified = lpRetV->pModifiedWord;
    ERRB errb;
    WORD cByteMatched = 0;
    STRING_TOKEN FAR *lpStrList;    /* Pointer to strings table */
    STRING_TOKEN FAR *lpPrev;       /* Pointer to strings table */
    _LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);
    LPWORDINFO lpwiT;
    LPWORDINFO lpwiPrev;

    // first collect the word info for each token
    for (lpStrList = lpqt->lpStrList, lpPrev = NULL;
        lpStrList; lpStrList = lpStrList->pNext)
    {
        // accumulate the list of terms to have data read
        if ((fRet = GetWordInfoList(lpqt, lpStrList, lpCurQtNode, lpRetV)) != S_OK)
        {
            return SetErrCode (&errb, fRet);
        }

        // if no word info was available, remove the token from the list;
        // it won't get freed until end of query, but who cares - it makes
        // the rest of the processing faster
        if (!lpStrList->lpwi)
        {
            if (lpPrev)
                lpPrev->pNext = lpStrList->pNext;
            else
                lpqt->lpStrList = lpStrList->pNext;

            // NOTE: lpPrev must remain unchanged when deleting!
            continue;
        }

        // cycle through all the instances of this term's lookalikes
        // (e.g. multiple aliases) and add up the total topic count,
        // since we don't want to treat aliases as rare, even though
        // they may be.
        lpStrList->dwTopicCount = lpStrList->lpwi->cTopic;
        for (lpwiT = lpStrList->lpwi->pNext, lpwiPrev = NULL; lpwiT;
            lpwiPrev = lpwiT, lpwiT = lpwiT->pNext)
            lpStrList->dwTopicCount += lpwiT->cTopic;

        lpPrev = lpStrList;
    } // for next term

    // sort string list by descending term rarity
    SortStringWeights(lpqt);

    dwTotalTopic = 0;

    for (lpStrList = lpqt->lpStrList;
        lpStrList; lpStrList = lpStrList->pNext)
    {
        LPWORDINFO lpwiT;

        if (lpStrList->lpwi == NULL)
            continue;

#if defined(_DEBUG) && defined(_DUMPALL)
        {
            char szTemp[256];

            STRNCPY(szTemp, lpStrList->lpString + 2, *(LPWORD)lpStrList->lpString);
            szTemp[*(LPWORD)lpStrList->lpString] = 0;
            _DPF1("Term: '%s'\n", szTemp);
        }
#endif

        // We can terminate the query processing if the upper bound on the
        // smallest current doc score is lteq the current score of the R-th
        // biggest doc score, since any further computation will at most
        // result in a re-ordering of the bottom (N - R) documents.
        // However, this leaves the remaining documents only partially
        // sorted by relevancy, which may or may not be acceptable.

        if (AddWeights(GetMinDocScore(lpDocScores, ROUND_UP),
            lpStrList->wWeightRemain) <= GetSortedDocScore(lpDocScores,
            (int)lpRetV->SrchInfo.dwTopicFullCalc, ROUND_DOWN))
            break;

        lpqt->lpTopicStartSearch = NULL;
        lpqt->lpOccStartSearch = NULL;

        QTN_TOKEN(lpCurQtNode) = lpStrList;

        for (lpwiT = lpStrList->lpwi; lpwiT; lpwiT = lpwiT->pNext)
        {
            // TO DO: replace with WORDINFO in curqt node
            lpCurQtNode->cTopic = lpwiT->cTopic;
            lpCurQtNode->foData = lpwiT->foData;
            lpCurQtNode->cbData = lpwiT->cbData;
            lpCurQtNode->wRealLength = lpwiT->wRealLength;

            if ((fRet = ProcessTerm(lpqt, lpRetV,
                NULL, lpCurQtNode, lpStrList)) != S_OK)
            {
                // kevynct: no need to overwrite count on error since
                // we may be attempting to continue
                lpCurQtNode->cTopic += dwTotalTopic;
                return(fRet);
            }

            // Accumulate the topic count, since cTopic will be destroyed
            // if there are more searches for this node (such as wildcard)
            dwTotalTopic += lpCurQtNode->cTopic;
        }
    }

    lpCurQtNode->cTopic = dwTotalTopic;

    return S_OK;
}

__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc)
{
    lpwiDest->cTopic += lpwiSrc->cTopic;
}

// adds zero or more WORDINFO nodes for the passed-in string
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
    int cLevel;
    int cMaxLevel;
    int fCheckFieldId;
    LST lstSearchStr;
    LPB lpCurPtr;
    int nCmp;
    HRESULT fRet;
    int f1stIsWild;
    LPB lpMaxAddress;
    PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
    DWORD dwTemp;
    LPB astBTreeWord = lpRetV->pBTreeWord;
    WORD wLen;
    DWORD dwFieldID;
    LPB lstModified = lpRetV->pModifiedWord;
    BYTE fStemmed;
    LPB pBTreeWord;
    ERRB errb;
    WORD cByteMatched = 0;
    WORDINFO wi;
    LPWORDINFO lpwi;

    fStemmed = 0;

    lstSearchStr = lpStrToken->lpString;
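
    // The token strings here are Pascal-style: the first WORD holds the
    // character count and the text itself starts at offset 2 (hence the
    // lstSearchStr[2] checks below for a leading wildcard), with a NUL
    // appended after the text as a safety byte.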
    f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
        lstSearchStr[2] == WILDCARD_STAR);

    // Make sure to turn off stemming if there are any wildcard characters

    for (nCmp = *((LPW)lstSearchStr) + 1; nCmp >= 2; nCmp--)
    {
        if (lstSearchStr[nCmp] == '*' || lstSearchStr[nCmp] == '?')
        {
            fStemmed = FALSE;
            break;
        }
    }

    // Turn off stemming for short words
    if (*(LPW)lstSearchStr < 3)
        fStemmed = FALSE;

    pLeafInfo->nodeOffset = lpqt->foIdxRoot;
    pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
    pLeafInfo->dwBlockSize = lpqt->dwBlockSize;

    // BUGBUG: we don't handle stemming for now.
    MEMCPY (lstModified, lstSearchStr,
        *((LPW)lstSearchStr) + sizeof (SHORT));
    // Zero terminated for wildcard search
    lstModified [*((LPW)lstModified) + 2] = 0;

    pBTreeWord = lpRetV->pBTreeWord;

    /* Change all '*' and '?' to 0. This will
     * ensure that things get compared correctly with
     * the top node's entries
     */
    for (nCmp = *((LPW)lstModified) + 1; nCmp >= 2; nCmp--)
    {
        if (lstModified[nCmp] == '*' || lstModified[nCmp] == '?')
        {
            lstModified[nCmp] = 0;
            *(LPW)lstModified = nCmp - 2;
        }
    }

    /*
     * Point node-resolution variables at the right things. This
     * sets these up to read b-tree nodes. Fields not set here are
     * set as appropriate elsewhere.
     */

    /* Set the flag */
    fCheckFieldId = (lpqt->occf & OCCF_FIELDID) && (lpCurQtNode->dwFieldId != DW_NIL_FIELD);

    astBTreeWord[0] = 0;
    cMaxLevel = lpqt->cIdxLevels - 1;

    /*
       First we have to find which tree level the word is in. The number of
       searches is at most equal to the number of tree levels. The
       structure of the directory node is a sequence of:
        - Words: PASCAL strings
        - Data offset: tells us where the offset of the record is
          in the index file
     */
    for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
    {
        //
        // Get a node.
        //
        if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
        {
            return SetErrCode (&errb, fRet);
        }
        lpMaxAddress = pLeafInfo->pMaxAddress;
        lpCurPtr = pLeafInfo->pCurPtr;

        //
        // Loop through it. This compares the word I'm
        // looking for against the word in the b-tree.
        // If the word in the b-tree is >= the word I'm
        // looking for, I'm done.
        //
        // If I run off the end of the node, there can be
        // no match for this term, so I skip the entire
        // process.
        //
        for (;;)
        {
            if (lpCurPtr >= lpMaxAddress)
                return S_OK;

            lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);

            if (fStemmed)
            {
                if ((fRet = FStem (pBTreeWord, astBTreeWord)) !=
                    S_OK)
                    return(S_OK);
            }

            /* Read in NodeId record */
            lpCurPtr += ReadFileOffset (&pLeafInfo->nodeOffset, lpCurPtr);

            if (f1stIsWild)
                break;
            if (StrCmpPascal2(lstModified, pBTreeWord) <= 0)
                break;
        }
    }

    /* At this point, pLeafInfo->nodeOffset is the node id of the leaf that
       is supposed to contain the searched word. Read in the leaf node
     */
    if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
    {
        return fRet;
    }

    lpCurPtr = pLeafInfo->pCurPtr;
    lpMaxAddress = pLeafInfo->pMaxAddress;

    //
    // Second step is to deal with the leaf node(s). I'm going to
    // find and capture some occurrence lists. I'll probably have to
    // ignore some bogus ones first.
    //

    // Reset the word
    if (fStemmed)
    {
        MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
            *(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
    }
    else
    {
        MEMCPY (lstModified, lstSearchStr,
            *((LPW)lstSearchStr) + sizeof (SHORT));
    }

    for (;;)
    {
        // Check for out of data
        if (lpCurPtr >= lpMaxAddress)
        {
            // Get the offset of the next node
            ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
            if (FoIsNil (pLeafInfo->nodeOffset))
            {
                return S_OK;
            }

            // Read the next node
            if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel))
                != S_OK)
            {
                return SetErrCode (&errb, fRet);
            }
            lpCurPtr =
                pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
            lpMaxAddress = pLeafInfo->pMaxAddress;
        }

        /* Check for interrupt now and then */
        if ((++lpqt->cInterruptCount) == 0)
        {
            if (lpqt->fInterrupt == E_INTERRUPT)
                return E_INTERRUPT;
            if (*lpqt->cStruct.Callback.MessageFunc &&
                (fRet = (*lpqt->cStruct.Callback.MessageFunc)(
                lpqt->cStruct.Callback.dwFlags,
                lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
                return(fRet);
        }

        // Extract the word
        lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);

        if (fStemmed)
        {
            if ((fRet = FStem (pBTreeWord, astBTreeWord)) != S_OK)
                return(fRet);
        }

        if (lpqt->occf & OCCF_FIELDID)
            lpCurPtr += CbByteUnpack (&dwFieldID, lpCurPtr);

        nCmp = CompareTerm (lpCurQtNode, lstModified, pBTreeWord, fCheckFieldId ?
            dwFieldID : lpCurQtNode->dwFieldId, lpRetV->pLeadByteTable);

        switch (nCmp)
        {
            case KEEP_SEARCHING:
                // Skip TopicCount
                lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                // Skip data offset
                lpCurPtr += FOFFSET_SIZE;
                // Skip DataSize
                lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                break;

            case STRING_MATCH:

                lpCurPtr += CbByteUnpack (&wi.cTopic, lpCurPtr);
                lpCurPtr += ReadFileOffset (&wi.foData, lpCurPtr);
                lpCurPtr += CbByteUnpack (&wi.cbData, lpCurPtr);
                wi.wRealLength = wLen;  // BUGBUG doublecheck this

                // Check the topic count. This can be 0 if the word has
                // been deleted from the index
                if (wi.cTopic == 0)
                    break;

                // long search optimization: clip noise words.
                // Johnms - eliminate frequent words.
                // typically, you eliminate if in more than 1/7 of documents.

                if ((lpRetV->SrchInfo.Flag & LARGEQUERY_SEARCH)
                    &&
                    lpRetV->SrchInfo.dwValue < wi.cTopic
                    )
                {
                    break;
                }

                // allocate WORDINFO node
                if ((lpwi = BlockGetElement(lpqt->lpWordInfoBlock)) == NULL)
                    return E_OUTOFMEMORY;

                *lpwi = wi;

                lpwi->pNext = lpStrToken->lpwi;
                lpStrToken->lpwi = lpwi;

                // Save the info
                pLeafInfo->pCurPtr = lpCurPtr;
                break;

            case NOT_FOUND: // No unconditional "break" above.
                if (fStemmed && (strncmp (lstSearchStr + 2, pBTreeWord + 2,
                    cByteMatched) == 0))
                {
                    // Continue searching in case stemming is messed up
                    // by a non-alphabetic word, such as the sequence:
                    // subtopic subtopic2 subtopics
                    lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                    // Skip data offset
                    lpCurPtr += FOFFSET_SIZE;
                    // Skip DataSize
                    lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
                    break;
                }
                return S_OK;
        }
    }

}

/*************************************************************************
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL FAR | FFlatCallBack |
 *      This callback function is called by the various breakers after
 *      fetching a token. The token is checked for wildcard characters.
 *
 *  @parm   LST | lstRawWord |
 *      Pointer to unnormalized string
 *
 *  @parm   LST | lstNormWord |
 *      Pointer to normalized string. This Pascal string's size should be
 *      at least *lstNormWord + 2
 *
 *  @parm   LFO | lfoWordOffset |
 *      Offset into the query buffer. It is used to mark the location
 *      where a parsing error has occurred
 *
 *  @parm   LPQI | lpqi |
 *      Pointer to query info structure. This has all "global" variables
 *
 *  @rdesc  S_OK if succeeded, else various errors.
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
    LFO lfoWordOffset, LPQI lpqi)
{
    /* Add an extra 0 to make sure that AllocWord() gets the needed 0
     * for WildCardCompare()
     */
    lstNormWord[*(LPW)(lstNormWord) + 2] = 0;

    // add the token to the string list
    if (AllocWord(lpqi->lpQueryTree, lstNormWord) == NULL)
        return E_OUTOFMEMORY;

    return S_OK;
}

// For now, perform a simple insertion sort on the string list.
// bugbug: use heapsort or a faster method for long lists.
// We sort by weight decreasing, i.e. by total topic count increasing
// (rare terms first).
PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree)
{
    STRING_TOKEN FAR *pStr, *pStrNext, *pT, *pTPrev;
    STRING_TOKEN FAR *pStrHead = lpQueryTree->lpStrList;
    DWORD dwSum, dwT;
    DWORD dwMaxWeight;
    WORD wWeightT;
    int nCmp;
    FLOAT rLog;
    FLOAT rLogSquared;
    FLOAT rSigma;
    FLOAT rTerm;
    BOOL fNormalize = FALSE;    // Normalize was for testing only.

    if (fNormalize)
    {
        rSigma = (float)0.0;

        // for each term:
        for (pStr = pStrHead; pStr; pStr = pStr->pNext)
        {
            FLOAT fOcc;

            // we have to guard against the possibility of the log resulting in
            // a value <= 0.0. Very rare, but possible in the future. This happens
            // if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
            if (pStr->dwTopicCount >= cNintyFiveMillion)
                rLog = cVerySmallWt;    // log10(100 mil / 95 mil) == 0.02
            else
                //rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
                rLog = (float) (8.0 - log10((double)pStr->dwTopicCount));

            rLogSquared = rLog*rLog;

            // Update sigma value
            // NOTE: We are bounding dwOccCount by a value of cTFThreshold.
            // The RHS of the equation below has an upper bound of 2 power 30.
            fOcc = (float) min(cTFThreshold, pStr->cUsed);
            rSigma += fOcc*fOcc*rLogSquared;
        }

        rSigma = (float)sqrt(rSigma);
    }

    // calculate final weights and corrections
    dwSum = dwMaxWeight = 0L;
    for (pStr = pStrHead; pStr; pStr = pStr->pNext)
    {
        BOOL fNumber;

        // once sigma is known, each term's proper weight can be calculated
        if (fNormalize)
        {
            FLOAT rWeight;

            // log10(x/y) == log10(x) - log10(y). Since x in our case is a known
            // constant, 100,000,000, I'm replacing that with its equivalent log10
            // value of 8.0 and subtracting log10(y) from it
            rTerm = (float) (8.0 - log10((double) pStr->dwTopicCount));
            // In extreme cases, rTerm could be 0 or even negative (when
            // dwTopicCount approaches or exceeds 100,000,000)
            if (rTerm <= (float) 0.0)
                rTerm = cVerySmallWt;   // very small value. == log(100 mil / 95 mil)
            // NOTE: rWeight for the doc term would be as follows:
            // rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
            //
            // Since rTerm needs to be recomputed again for the query term weight computation,
            // and since rTerm will be the same value for the current term ('cos N and n of log(N/n)
            // are the same (N = 100 million and n is whatever the doc term freq is for the term),
            // we will factor in the second rTerm at index time. This way, we don't have to deal
            // with rTerm at search time (reduces computation and shortens query time)
            //
            // MV 2.0 initially did the same thing. However, BinhN removed the second rTerm
            // because he decided to remove the rTerm altogether from the query term weight. He
            // did that to keep the scores reasonably high.

            rWeight = ((float) min(cTFThreshold, pStr->cUsed))
                * rTerm * rTerm / rSigma;
            // without the additional rTerm, we would probably be between 0.0 and 1.0
            if (rWeight > rTerm)
                wWeightT = 0xFFFF;
            else
                wWeightT = (WORD) ((float)0xFFFF * rWeight / rTerm);
        }
        else
            wWeightT = 65535;
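
        // Heuristic rarity weight, roughly an IDF proxy: a term appearing
        // in a single topic gets 16383 + 49152 == 65535 (the maximum),
        // while very common terms approach the floor of 16383. E.g. a
        // term in 3 topics weighs 16383 + 16384 == 32767.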
        pStr->wWeight = (WORD)(16383 + 49152 / pStr->dwTopicCount);

        // perform any special weight adjustments here
        // BUGBUG: use NextChar here, and use charmap here
        // numbers four digits or less get downgraded
        fNumber = TRUE;
        for (nCmp = *((LPWORD)pStr->lpString) + 1; nCmp >= 2; nCmp--)
            if (nCmp > 5 || !IS_DIGIT(pStr->lpString[nCmp]))
            {
                fNumber = FALSE;
                break;
            }

        if (fNumber)
            pStr->wWeight = pStr->wWeight / 256;

        //pStr->wTermWeight = (WORD)(pStr->wWeight * wWeightT / 65535L);

        dwMaxWeight = max(dwMaxWeight, pStr->wWeight);
        dwSum += pStr->wWeight;
    }

    // now sort 'em
    for (pStr = pStrHead; pStr;)
    {
        if (NULL == (pStrNext = pStr->pNext))
            break;

        if (pStrNext->wWeight <= pStr->wWeight)
        {
            pStr = pStr->pNext;
            continue;
        }

        // find element in already-sorted section
        for (pT = pStrHead, pTPrev = NULL; pT; pTPrev = pT, pT = pT->pNext)
        {
            if (pT->wWeight <= pStrNext->wWeight)
            {
                pStr->pNext = pStrNext->pNext;
                pStrNext->pNext = pT;

                if (pTPrev)
                    pTPrev->pNext = pStrNext;
                else
                    pStrHead = pStrNext;

                break;
            }
        }
    }
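
    // Compute each term's wWeightRemain: an upper bound on the total
    // weight that the terms after it (in sorted order) can still
    // contribute, scaled to the 0..65535 weight range. ProcessTerm and
    // ResolveFlatQuery use this to prune documents and to stop the whole
    // query early once the remaining terms cannot change the top ranks.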
    dwT = 0;
    for (pStr = pStrHead; pStr; pStr = pStr->pNext)
    {
        dwT += pStr->wWeight;

        if (dwSum > dwT)
            pStr->wWeightRemain = AddWeights(0, (WORD)((dwSum - dwT) * 65535.0 / dwSum));
        else
            pStr->wWeightRemain = 1;
    }

    lpQueryTree->lpStrList = pStrHead;
}