//#define _DUMPALL
/*************************************************************************
* *
* SIMILAR.C *
* *
* Copyright (C) Microsoft Corporation 1990-1996 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Module Intent: *
* *
* Search Core Engine: Find Similar functionality *
* *
**************************************************************************
*
* Revision History:
*
* 09/24/96 kevynct Started from algorithm notes (4 hrs)
* 09/25/96 kevynct Implemented skeleton of ProcessSimilarityTerm (1 hr)
* 09/26/96 kevynct More work on inner loop and relevant list (5 hrs)
* 09/27/96 kevynct Query parsing, weighting, and sorting (6 hrs)
* 10/01/96 kevynct Incorporate into MV2.0b (10 min)
* 10/02/96 kevynct Clean-up query code, start resolve query code (4 hrs)
* 10/03/96 kevynct Resolve query code (2 hrs)
* 10/11/96 kevynct Start bucket routines (2 hrs)
* 10/13/96 kevynct Finish bucket routines, write node processor, cleanup (6 hrs)
* 10/14/96 kevynct Clean-up, remove compilation errors, debugging (6 hrs)
* 10/24/96 kevynct Convert to two-phase query resolution (3 hrs)
* 10/25/96 kevynct Fix sort by cTopics, debug new query resolution, try new weighting (2 hrs)
* 11/26/96 kevynct Testing, fix and improve weighting and accumulation: aliases, digits (8 hrs)
* 12/2/96 kevynct More weighting tests (8 hrs)
* Work remaining:
*
* Investigate field and stemming support
*
* Use probabilistic upper bounds for pruning. Remove single-term nodes after each term is processed
* Test current bucket method vs. exact scores w/ heap
*
**************************************************************************
*
* Current Owner: KevynCT
*
**************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <orkin.h>
#include <mvsearch.h>
#include <math.h>
#include <groups.h>
#include "common.h"
#include "search.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__; // Used by error return functions.
#endif
#define FGetDword(a,b,c) (*DecodeTable[b.cschScheme])(a, b, c)
#define IS_DIGIT(p) ((p) >= '0' && (p) <= '9')
// these are used when the doc scoring is approximate: they tell which
// direction to err in.
#define ROUND_DOWN 0
#define ROUND_UP 1
#define SCORE_BLOCK_SIZE 32
#define NUM_SCORE_BLOCKS (MAX_WEIGHT/SCORE_BLOCK_SIZE)
typedef struct tagDocScoreList {
HANDLE hMem;
int cScoresLeft;
int iBucketLowest;
int iHighestScore;
int rgiScores[NUM_SCORE_BLOCKS + 1];
} DSL, FAR *_LPDSL;
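// A minimal sketch of the bucket arithmetic, compiled out; it assumes
// MAX_WEIGHT is 65535 (the WORD weight range used throughout this file),
// so scores are tracked only to SCORE_BLOCK_SIZE resolution.
#if 0
int iScore = 1000;
int iBucket = iScore / SCORE_BLOCK_SIZE; // 1000/32 == 31
int iLow = iBucket * SCORE_BLOCK_SIZE; // 992: the ROUND_DOWN edge
int iHigh = (iBucket + 1) * SCORE_BLOCK_SIZE; // 1024: the ROUND_UP edge
// rgiScores[31] counts how many of the current top-N docs score within
// [992, 1024); iBucketLowest is the lowest non-empty bucket.
#endif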
PUBLIC HRESULT PASCAL FAR SkipOccList(_LPQT lpqt, PNODEINFO pNodeInfo, DWORD dwOccs); // ftsearch.c
PUBLIC int PASCAL FAR CompareTerm(_LPQTNODE lpQtNode,
LST lstTermWord, LST lstBtreeWord, DWORD dwBtreeFieldId, char []); // ftsearch.c
PUBLIC STRING_TOKEN FAR *PASCAL AllocWord(_LPQT lpQueryTree, LST lstWord); // qtparse.c
__inline LPVOID InitDocScoreList(int cScores);
__inline void FreeDocScoreList(LPV lpDocScores);
__inline int GetMaxDocScore(_LPDSL lpDocScores);
__inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp);
BOOL UpdateDocScoreList(_LPDSL lpDocScores, int iOldScore, int iScore);
__inline BOOL IsDocScoreListFull(_LPDSL lpdsl);
__inline WORD AddWeights(DWORD w1, DWORD w2);
int GetSortedDocScore(_LPDSL lpDocScores, int iThis, BOOL fRoundUp);
#if defined(_DEBUG)
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo);
#endif
__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc);
PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr);
PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV);
PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree);
PRIVATE VOID PASCAL SetStringWeights (LPQI lpQueryInfo);
PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
LFO lfoWordOffset, LPQI lpqi);
__inline LPVOID InitDocScoreList(int cScores)
{
_LPDSL lpdsl;
if ((lpdsl = (_LPDSL)GlobalLockedStructMemAlloc(sizeof(DSL))) == NULL)
return NULL;
lpdsl->cScoresLeft = cScores;
lpdsl->iHighestScore = 0;
lpdsl->iBucketLowest = -1;
return (LPV)lpdsl;
}
__inline void FreeDocScoreList(LPV lpDocScores)
{
if (lpDocScores)
GlobalLockedStructMemFree((_LPDSL)lpDocScores);
}
__inline int GetMaxDocScore(_LPDSL lpDocScores)
{
return lpDocScores->iHighestScore;
}
__inline int GetMinDocScore(_LPDSL lpDocScores, BOOL fRoundUp)
{
if (lpDocScores->iBucketLowest >= 0)
return (lpDocScores->iBucketLowest + !!fRoundUp) * SCORE_BLOCK_SIZE;
return 0;
}
int GetSortedDocScore(_LPDSL lpdsl, int cThis, BOOL fRoundUp)
{
LPINT lpi, lpiFirst;
if (lpdsl->iHighestScore < 0)
return 0;
lpiFirst= &lpdsl->rgiScores[0];
for (lpi = &lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE];
lpi >= lpiFirst; cThis -= *lpi, lpi--)
{
if (cThis <= *lpi)
return ((lpi - lpiFirst) + !!fRoundUp) * SCORE_BLOCK_SIZE;
}
return (!!fRoundUp * SCORE_BLOCK_SIZE);
}
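// Illustrative walk-through of the backward bucket scan above, compiled
// out; the bucket contents are made-up values.
#if 0
DSL dsl = {0};
dsl.rgiScores[40] = 3; // three docs scoring in [1280, 1312)
dsl.rgiScores[31] = 5; // five docs scoring in [992, 1024)
dsl.iHighestScore = 40 * SCORE_BLOCK_SIZE + 7;
// Asking for the 4th-best score: 4 > 3 at bucket 40, so the scan carries
// a remainder of 1 down to bucket 31, where 1 <= 5 holds.
Assert(GetSortedDocScore(&dsl, 4, ROUND_DOWN) == 31 * SCORE_BLOCK_SIZE);
#endif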
#if defined(_DEBUG)
BOOL DumpDocScoreList(_LPDSL lpdsl, PSRCHINFO pSrchInfo)
{
LPINT lpi, lpiMax;
int iT = 0;
int i;
lpi = &lpdsl->rgiScores[0];
lpiMax = lpi + NUM_SCORE_BLOCKS;
for (i = 0;lpi < lpiMax;lpi++, i++)
{
if (*lpi)
{
_DPF2("Score %d (count %d)\n", i, *lpi);
}
iT += *lpi;
}
_DPF1("%d topics in scorelist\n", iT);
return TRUE;
}
#endif
BOOL UpdateDocScoreList(_LPDSL lpdsl, int iOldScore, int iScore)
{
int iThis = iScore/SCORE_BLOCK_SIZE;
// iOldScore < 0 means "doc not yet in the list"; map it to bucket -1
// explicitly, since C division truncates toward zero and -1/32 would
// otherwise alias bucket 0.
int iOld = (iOldScore < 0) ? -1 : iOldScore/SCORE_BLOCK_SIZE;
if (lpdsl->cScoresLeft <= 0)
{
// already full, figure out which buckets need updating
if (iThis > lpdsl->iBucketLowest)
{
// if we're updating an existing entry, remove its old score;
// otherwise displace the lowest one
if (iOld >= 0 && iOld >= lpdsl->iBucketLowest)
lpdsl->rgiScores[iOld]--;
else if (lpdsl->iBucketLowest >= 0)
lpdsl->rgiScores[lpdsl->iBucketLowest]--;
// then make sure the lowest bucket is still non-empty; if not,
// revise upwards
if (lpdsl->iBucketLowest >= 0 &&
lpdsl->rgiScores[lpdsl->iBucketLowest] <= 0)
{
for (lpdsl->iBucketLowest++; lpdsl->iBucketLowest <= iThis; lpdsl->iBucketLowest++)
if (lpdsl->rgiScores[lpdsl->iBucketLowest])
break;
add_new_doc:
if (lpdsl->iBucketLowest >= 0)
lpdsl->iBucketLowest = min(lpdsl->iBucketLowest, iThis);
else
lpdsl->iBucketLowest = iThis;
}
// then add the new entry
lpdsl->rgiScores[iThis]++;
update_highest_score:
if (iScore > lpdsl->iHighestScore)
lpdsl->iHighestScore = iScore;
#if defined(_DEBUG) && defined(_DUMPALL)
//DumpDocScoreList(lpdsl, NULL);
#endif
Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
return TRUE;
}
else if (iThis == lpdsl->iBucketLowest)
goto update_highest_score;
Assert(lpdsl->rgiScores[lpdsl->iHighestScore/SCORE_BLOCK_SIZE] >= 0);
return FALSE;
}
// doc score list is not yet full: remove the old score if this is an
// update, otherwise count the new doc against the remaining capacity
if (iOld >= 0 && iOld >= lpdsl->iBucketLowest)
lpdsl->rgiScores[iOld]--;
else
lpdsl->cScoresLeft--;
goto add_new_doc;
}
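// A hypothetical call sequence showing the top-N maintenance protocol,
// compiled out. ProcessTerm() passes -1 as iOldScore for a doc that is
// not yet in the list, and the doc's previous weight when re-scoring it.
#if 0
_LPDSL lpdsl = (_LPDSL)InitDocScoreList(2); // keep the top 2 docs
UpdateDocScoreList(lpdsl, -1, 500); // new doc, list not yet full
UpdateDocScoreList(lpdsl, -1, 900); // new doc, list becomes full
UpdateDocScoreList(lpdsl, -1, 100); // rejected: bucket 3 < lowest (15)
UpdateDocScoreList(lpdsl, 500, 1200); // re-score the first doc: bucket 15
// empties, so iBucketLowest is revised up to 28 (the 900-score doc)
FreeDocScoreList(lpdsl);
#endif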
__inline BOOL IsDocScoreListFull(_LPDSL lpdsl)
{
return (lpdsl->cScoresLeft <= 0);
}
__inline WORD AddWeights(DWORD w1, DWORD w2)
{
return (WORD)min(MAX_WEIGHT, w1 + w2);
}
/*************************************************************************
* @doc EXTERNAL API RETRIEVAL
*
* @func LPHL FAR PASCAL | MVIndexFindSimilar |
* Given a query, typically the text stream of a document, returns a
* hitlist of topics judged similar to the query by nearest-neighbour
* searching.
*
* @parm LPIDX | lpidx |
* Pointer to index information.
*
* @parm LPQT | lpqt |
* Pointer to query tree (returned by MVQueryParse())
*
* @parm PSRCHINFO | pSrchInfo |
* Pointer to search information data
*
* @parm _LPGROUP | lpResGroup |
* Pointer to resulting group
*
* @parm LPVOID | pCallback |
* Pointer to callback struct FCALLBACK_MSG (optional)
*
* @parm PHRESULT | phr |
* Pointer to error buffer
*
* @rdesc Pointer to a hitlist structure on success, even if there are
* no hits (use MVHitListEntries() to find out how many hits were
* returned). Returns NULL on failure; the error buffer (see
* IndexOpen()) will describe the cause of the failure. There is one
* special case in which the function returns a non-NULL pointer even
* though an error occurred: when the result cannot be written to
* disk, everything is still in memory.
*
*************************************************************************/
// bugbug: handle wildcards
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (_LPIDX lpidx,
LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, _LPGROUP lpResGroup,
LPVOID pCallback, PHRESULT phr)
{
HRESULT fRet; // Return from this function.
LPRETV lpRetV; // Retrieval memory/files.
GHANDLE hRetv;
//OCCF occf; // Index occurrence flags temporary variable.
_LPHL lphl; // Pointer to hitlist
_LPQTNODE lpTreeTop;
HANDLE hTreeTop = NULL;
_LPQT lpqt;
if (lpidx == NULL || lpParms == NULL || pSrchInfo == NULL)
{
/* We get some bad arguments!! */
SetErrCode (phr, E_INVALIDARG);
return NULL;
}
if (NULL == (lpqt = TokenizeFlatQuery(lpParms, pSrchInfo, phr)))
{
// errb was set
return NULL;
}
fRet = E_FAIL; // Assume thing will go wrong
// Transfer all the information about the index to the query tree
lpqt->foIdxRoot = lpidx->ih.foIdxRoot; /* Top node offset */
lpqt->dwBlockSize = lpidx->ih.dwBlockSize; /* Index block size */
lpqt->cIdxLevels = lpidx->ih.cIdxLevels; /* Index's depth */
lpqt->occf = lpidx->ih.occf;
lpqt->idxf = lpidx->ih.idxf;
lpqt->ckeyTopicId = lpidx->ih.ckeyTopicId;
lpqt->ckeyOccCount = lpidx->ih.ckeyOccCount;
lpqt->ckeyWordCount = lpidx->ih.ckeyWordCount;
lpqt->ckeyOffset = lpidx->ih.ckeyOffset;
if (pSrchInfo->dwMemAllowed)
{
// allocate document result list
// no occurrence info is returned for similarity query
SetBlockCount (lpqt->lpTopicMemBlock, (WORD)(pSrchInfo->dwMemAllowed /
(sizeof(TOPIC_LIST) * cTOPIC_PER_BLOCK)));
SetBlockCount (lpqt->lpOccMemBlock, 1);
}
if (pCallback)
MVSearchSetCallback(lpqt, pCallback);
/* Allocate hitlist */
if ((lphl = (_LPHL)GlobalLockedStructMemAlloc(sizeof (HL))) == NULL)
{
fRet = E_OUTOFMEMORY;
SetErrCode(phr, fRet);
exit00:
if (lpqt)
{
FreeDocScoreList(lpqt->lpDocScores);
MVQueryFree(lpqt);
}
if (lphl && fRet != S_OK && fRet != E_TOOMANYTOPICS)
{
MVHitListDispose(lphl);
lphl = NULL;
}
return (LPHL)lphl;
}
lphl->lLastTopicId = 0xffffffff;
lphl->lcMaxTopic = lpidx->ih.lcTopics;
/* Allocate a return value structure */
if ((hRetv = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
sizeof(RETV))) == NULL)
{
SetErrCode(phr, E_OUTOFMEMORY);
goto exit00;
}
lpRetV = (LPRETV)_GLOBALLOCK(hRetv);
lpRetV->lpqt = lpqt;
if ((fRet = TopNodeRead(lpidx)) != S_OK)
{
SetErrCode (phr, fRet);
exit02:
FreeHandle(hRetv);
goto exit00;
}
//
// Count the number of occurrence fields present. My retrieval
// occurrence record is going to cost 4 bytes per field.
//
//occf = lpqt->occf;
//for (lpRetV->cOccFields = 0; occf; lpRetV->cOccFields++)
// occf &= occf - 1;
lpqt->dwOccSize = lpRetV->dwOccSize = 0;
//sizeof(OCCURENCE) + lpRetV->cOccFields * sizeof (DWORD);
lpRetV->fRank = TRUE; //((pSrchInfo->Flag &
//(QUERYRESULT_RANK | QUERYRESULT_NORMALIZE)) != 0);
// Set pointer to various buffer
lpRetV->LeafInfo.pTopNode = lpidx->lrgbTopNode;
lpRetV->LeafInfo.pStemNode = lpRetV->pNodeBuf;
lpRetV->LeafInfo.pLeafNode = lpRetV->pNodeBuf;
lpRetV->LeafInfo.pDataNode = lpRetV->pDataBuf;
lpRetV->LeafInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
lpRetV->DataInfo.pTopNode = lpidx->lrgbTopNode;
lpRetV->DataInfo.pStemNode = lpRetV->pNodeBuf;
lpRetV->DataInfo.pLeafNode = lpRetV->pNodeBuf;
lpRetV->DataInfo.pDataNode = lpRetV->pDataBuf;
lpRetV->DataInfo.hfpbIdx = lpidx->hfpbIdxSubFile; // Index file to read from
lpRetV->lcid = lpidx->ih.lcid;
// Save search information
lpRetV->SrchInfo = *pSrchInfo;
if (pSrchInfo->dwValue == 0)
lpRetV->SrchInfo.dwValue = (DWORD)(-1);
else
lpRetV->SrchInfo.dwValue = lpidx->ih.lcTopics/pSrchInfo->dwValue;
// this is a dummy node that we pass in to hold all term results
if ((lpTreeTop = (_LPQTNODE)_GLOBALLOCK( \
hTreeTop = _GLOBALALLOC(GHND, sizeof (QTNODE)))) == NULL)
{
SetErrCode(phr, fRet = E_OUTOFMEMORY);
goto exit02;
}
QTN_FLAG(lpTreeTop) = EXACT_MATCH;
lpTreeTop->pNext = NULL;
lpTreeTop->pPrev = NULL;
lpTreeTop->lpTopicList = NULL;
if ( (fRet = ResolveFlatQuery(lpqt, lpTreeTop, lpRetV)) != S_OK)
{
SetErrCode (phr, fRet);
/* Free the Topic and Occurrence memory blocks since they are
* not freed by QueryTreeFree(), or MVHitListDispose() at this
* point
*/
if (fRet != E_TOOMANYTOPICS)
{
BlockFree ((LPV)lpqt->lpTopicMemBlock);
BlockFree ((LPV)lpqt->lpOccMemBlock);
lpqt->lpTopicMemBlock = NULL;
lpqt->lpOccMemBlock = NULL;
exit03:
if (hTreeTop)
{
_GLOBALUNLOCK(hTreeTop);
_GLOBALFREE(hTreeTop);
}
goto exit02;
}
}
/* Create a group if requested */
if ((pSrchInfo->Flag & QUERYRESULT_GROUPCREATE) && lpResGroup)
{
LPITOPIC lpCurTopic; /* Topic's current pointer */
LPB lpbGrpBitVect;
DWORD maxTopicId;
/* Initialize the pointer */
lpbGrpBitVect = lpResGroup->lpbGrpBitVect;
maxTopicId = lpResGroup->dwSize * 8;
for (lpCurTopic = QTN_TOPICLIST(lpTreeTop); lpCurTopic;
lpCurTopic = lpCurTopic->pNext)
{
/* Set the bit */
if (lpCurTopic->dwTopicId < maxTopicId)
{
lpbGrpBitVect[(DWORD)(lpCurTopic->dwTopicId / 8)] |= 1 <<
(lpCurTopic->dwTopicId % 8);
}
}
}
if ((pSrchInfo->Flag & QUERYRESULT_UIDSORT) == 0)
{
/* Sort the result depending on ranking or not */
if (lpRetV->fRank)
SortResult ((LPQT)lpqt, lpTreeTop, WEIGHT_BASED);
else
SortResult ((LPQT)lpqt, lpTreeTop, HIT_COUNT_BASED);
}
/* Update HitList info structure, cut off the unwanted list */
if (lphl->lpTopicList = lpTreeTop->lpTopicList)
lphl->lcReturnedTopics = lphl->lcTotalNumOfTopics = lpTreeTop->cTopic;
// Only return the number of topics that the user requested
// if dwTopicCount == 0, it means that the user wants to return all
if (pSrchInfo->dwTopicCount != 0 &&
pSrchInfo->dwTopicCount < lphl->lcReturnedTopics)
lphl->lcReturnedTopics = pSrchInfo->dwTopicCount;
lphl->lpOccMemBlock = lpqt->lpOccMemBlock;
lphl->lpTopicMemBlock = lpqt->lpTopicMemBlock;
#if 1
/* WARNING: The following code should be commented out for
* diskless devices. No returned error is checked, since
* if disk writes fail, everything is still in memory
*/
if ((pSrchInfo->Flag & QUERYRESULT_IN_MEM) == 0)
{
if ((fRet = MVHitListFlush (lphl, lphl->lcReturnedTopics)) != S_OK)
SetErrCode (phr, fRet);
}
#endif
fRet = S_OK;
goto exit03;
}
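// A hedged usage sketch, compiled out: lpidx, lpbDocText, cbDocText and
// lpfnBreakerTable are assumed to exist, and the field values below are
// illustrative only; the fields themselves are the ones this function
// actually reads (see TokenizeFlatQuery() and ProcessTerm()).
#if 0
PARSE_PARMS parse = {0};
SRCHINFO si = {0};
ERRB errb;
LPHL lphl;
parse.lpbQuery = (BYTE FAR *)lpbDocText; // document text stream
parse.cbQuery = cbDocText;
parse.lpfnTable = lpfnBreakerTable; // per-DType breaker function table
parse.lpGroup = NULL; // no group restriction
si.dwTopicCount = 50; // keep the 50 most similar topics
si.dwTopicFullCalc = 10; // exact scores for the top 10 only
si.Flag = QUERYRESULT_IN_MEM; // skip the disk flush
if ((lphl = MVIndexFindSimilar(lpidx, &parse, &si, NULL, NULL, &errb)) != NULL)
{
// walk the hitlist here, then:
MVHitListDispose(lphl);
}
#endif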
PRIVATE LPQT TokenizeFlatQuery(LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, PHRESULT phr)
{
HRESULT fRet; // Return value.
HANDLE hqi; // Handle to "lpqi".
HANDLE hibi; // Handle to internal breaker info
HANDLE hQuery; // Handle to secondary query buffer
LPQI lpQueryInfo; // Query information.
LPIBI lpibi; // Pointer to internal breaker info
LPB lpbQueryBuf; // Copy of query's buffer
_LPQT lpQueryTree; // Query tree pointer
BRK_PARMS brkParms; // Breaker info parms
LPCHARTAB lpCharTabInfo;// Pointer to character table's info
/* LPPARSE_PARMS structure break-out variables */
BYTE FAR CONST *lpbQuery; // Query buffer
DWORD cbQuery; // Query length
LPBRKLIST lpfnTable; // DType function table
LPGROUP lpGroup; // Group
lpbQuery = lpParms->lpbQuery;
cbQuery = lpParms->cbQuery;
lpfnTable = lpParms->lpfnTable;
lpGroup = lpParms->lpGroup;
if (lpfnTable == NULL)
{
SetErrCode(phr, E_BADBREAKER);
return NULL;
}
if (cbQuery == 0 || lpbQuery == NULL) {
SetErrCode(phr, E_NULLQUERY);
return NULL;
}
lpQueryTree = NULL;
hqi = hibi = hQuery = NULL;
fRet = E_FAIL;
if ((hqi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
(LCB)sizeof(QUERY_INFO))) == NULL)
{
fRet = SetErrCode(phr, E_OUTOFMEMORY);
goto ErrFreeAll;
}
lpQueryInfo = (LPQI)_GLOBALLOCK(hqi);
lpQueryInfo->lperrb = phr;
lpQueryInfo->lpOpSymTab = NULL; // not used for similarity
lpQueryInfo->cOpEntry = 0;
/* Allocate a breaker info block used by different breakers */
if ((hibi = (GHANDLE)_GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
(LCB)sizeof(IBI))) == NULL)
{
fRet = SetErrCode(phr, E_OUTOFMEMORY);
goto ErrFreeAll;
}
lpibi = (LPIBI)_GLOBALLOCK(hibi);
/* Set the default breaker function, and stop list */
#ifndef CW
lpQueryInfo->lpfnBreakFunc = lpfnTable[0].lpfnBreakFunc;
#endif
lpQueryInfo->lpStopListInfo = lpfnTable[0].lpStopListInfo;
if ((lpCharTabInfo = lpQueryInfo->lpCharTab =
lpfnTable[0].lpCharTab) == NULL)
{
/* Default character and ligature tables */
lpCharTabInfo = lpQueryInfo->lpCharTab = MVCharTableGetDefault (phr);
if (lpCharTabInfo == NULL)
{
fRet = SetErrCode(phr, E_NOHANDLE);
goto ErrFreeAll;
}
lpQueryInfo->fFlag |= FREE_CHARTAB;
}
/* Change the character class of '*' and '?' to wildcard */
((LPCMAP)lpCharTabInfo->lpCMapTab)['*'].Class = CLASS_WILDCARD;
((LPCMAP)lpCharTabInfo->lpCMapTab)['?'].Class = CLASS_WILDCARD;
switch (lpCharTabInfo->fFlag)
{
case USE_DEF_LIGATURE:
lpCharTabInfo->wcLigature = DEF_LIGATURE_COUNT;
lpCharTabInfo->lpLigature = LigatureTable;
break;
case NO_LIGATURE:
lpCharTabInfo->wcLigature = 0;
lpCharTabInfo->lpLigature = NULL;
}
// not used for similarity
lpQueryInfo->lpStack = NULL;
/* Allocate a query tree */
if ((lpQueryTree = (_LPQT)QueryTreeAlloc()) == NULL)
{
fRet = SetErrCode(phr, E_OUTOFMEMORY);
goto ErrFreeAll;
}
/* Associate the query tree with the query. In the future, this will
* ensure the capability to have several queries and query trees
* at once
*/
lpQueryInfo->lpQueryTree = (LPQT)lpQueryTree;
/* Default arguments */
lpQueryTree->iDefaultOp = (BYTE)OR_OP;
lpQueryTree->lpGroup = lpGroup; // Use default Group
lpQueryTree->dwFieldId = 0;//DW_NIL_FIELD; // No fieldid search
lpQueryTree->cStruct.dwKey = CALLBACKKEY;
lpQueryTree->fFlag = 0;
lpQueryTree->wProxDist = 0;
if (NULL == (lpQueryTree->lpDocScores = InitDocScoreList(pSrchInfo->dwTopicCount)))
{
fRet = SetErrCode(phr, E_OUTOFMEMORY);
goto ErrFreeAll;
}
/* Copy the query into a temporary buffer since we are going to make
changes to it
*/
if ((hQuery = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)cbQuery + 2)) == NULL)
{
// free everything allocated so far (hibi, the query tree, ...)
// rather than leaking it on this path
fRet = SetErrCode(phr, E_OUTOFMEMORY);
goto ErrFreeAll;
}
lpbQueryBuf = lpQueryInfo->lpbQuery = (LPB)_GLOBALLOCK(hQuery);
lpbQueryBuf[cbQuery] = ' '; // Add a space to help the low-level transformation
lpbQueryBuf[cbQuery + 1] = 0; // Zero-terminated string (safety bytes)
MEMCPY(lpbQueryBuf, lpbQuery, cbQuery);
//
// Word-break between here and there.
//
brkParms.lpInternalBreakInfo = lpibi;
brkParms.lpbBuf = lpbQueryBuf;
brkParms.cbBufCount = cbQuery;
brkParms.lcbBufOffset = 0;
brkParms.lpvUser = lpQueryInfo;
brkParms.lpfnOutWord = (FWORDCB)FFlatCallBack;
brkParms.lpStopInfoBlock = lpQueryInfo->lpStopListInfo;
brkParms.lpCharTab = lpQueryInfo->lpCharTab;
brkParms.fFlags = ACCEPT_WILDCARD;
if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
!= S_OK)
{
fRet = SetErrCode(phr, (WORD)fRet);
goto ErrFreeAll;
}
/* Flush the word breaker */
brkParms.lpbBuf = NULL;
brkParms.cbBufCount = 0;
if ((fRet = (*lpQueryInfo->lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
!= S_OK)
{
fRet = SetErrCode(phr, fRet);
goto ErrFreeAll;
}
/* Set the offset used to report a missing term at the end of the
query; -1 since offsets start at 0
*/
lpQueryInfo->dwOffset = cbQuery - 1;
fRet = S_OK;
ErrFreeAll:
/* Free the charmap table */
if (lpQueryInfo->fFlag & FREE_CHARTAB)
MVCharTableDispose (lpQueryInfo->lpCharTab);
/* Free query info */
if (hqi)
FreeHandle(hqi);
/* Free internal breaker info */
if (hibi)
FreeHandle(hibi);
/* Free internal query buffer info */
if (hQuery)
FreeHandle(hQuery);
if (fRet == S_OK)
return lpQueryTree;
if (lpQueryTree)
{
BlockFree(lpQueryTree->lpStringBlock);
BlockFree(lpQueryTree->lpWordInfoBlock);
BlockFree(lpQueryTree->lpOccMemBlock);
BlockFree(lpQueryTree->lpTopicMemBlock);
BlockFree(lpQueryTree->lpNodeBlock);
FreeDocScoreList(lpQueryTree->lpDocScores);
/* Free Query tree block */
FreeHandle ((HANDLE)lpQueryTree->cStruct.dwReserved);
}
return NULL;
}
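// The overall find-similar pipeline, for orientation: TokenizeFlatQuery()
// word-breaks the query through FFlatCallBack(), which collects tokens
// into lpqt->lpStrList; ResolveFlatQuery() then looks up each token's
// index info (GetWordInfoList), sorts the tokens by weight
// (SortStringWeights), and accumulates per-topic scores one term at a
// time (ProcessTerm) until the early-termination test fires.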
/*************************************************************************
* @doc INTERNAL
*
* @func HRESULT FAR PASCAL | ProcessTerm |
* This function searches the index for the given word's data.
* @parm _LPQT | lpqt |
* Pointer to index structure
* @parm LPRETV | lpRetV |
* Pointer to "globals"
* @parm _LPQTNODE | lpCurQtNode |
* Current node in the query tree containing important data
* - The number of topics
* - The location of the data
* - The size of the data
* - Pointer to the next word (for wildcard search)
* @rdesc S_OK or other errors
*************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL ProcessTerm(_LPQT lpqt, LPRETV lpRetV,
_LPQTNODE lpResQuery, _LPQTNODE lpQtNode, STRING_TOKEN FAR *lpToken)
{
DWORD dwTopicIDDelta; // Topic-ID delta from previous sub-list.
DWORD dwOccs; // Number of occurrences in this sub-list.
DWORD dwTmp; // Scratch variable.
WORD wWeight; // Term-weight associated with this sub-list.
WORD wWeightMax;
DWORD dwTopicID; // TopicId
WORD wImportance;
DWORD dwLength; // Length of the word
TOPIC_LIST FAR *lpResTopicList; // Result TopicList
HRESULT fRet; // Returned value
PNODEINFO pDataInfo;
DWORD dwTopicCount;
_LPQT lpQueryTree; // Query tree
OCCF occf;
BYTE fSkipOccList = FALSE;
_LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);
pDataInfo = &lpRetV->DataInfo;
if ((pDataInfo->dwDataSizeLeft = lpQtNode->cbData) == 0)
return(S_OK); // There is nothing to process
// Initialize variables
occf = lpqt->occf;
wImportance = QTN_TOKEN(lpQtNode)->wWeight;
lpResTopicList = NULL;
lpQueryTree = lpRetV->lpqt;
dwTopicCount = lpQtNode->cTopic;
wWeight = (WORD)(65535L/(lpToken ? lpToken->dwTopicCount : dwTopicCount));
// Reset the topic count for lpQtNode so that it will not affect the
// result in case lpResQuery == NULL
lpQtNode->cTopic = 0;
if (lpResQuery == NULL)
lpResQuery = lpQtNode;
// Initialize the data buffer node values
pDataInfo->pBuffer = pDataInfo->pDataNode;
pDataInfo->nodeOffset = lpQtNode->foData;
// Read the data block
if ((fRet = ReadNewData(pDataInfo)) != S_OK)
return(fRet);
dwTopicID = 0L; // Init occurrence record
dwLength = 0;
// for each document in posting
for (; dwTopicCount; dwTopicCount--)
{
/* Check for interrupt now and then */
if ((++lpqt->cInterruptCount) == 0)
{
if (lpqt->fInterrupt == E_INTERRUPT)
return E_INTERRUPT;
if (*lpqt->cStruct.Callback.MessageFunc &&
(fRet = (*lpqt->cStruct.Callback.MessageFunc)(
lpqt->cStruct.Callback.dwFlags,
lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
return(fRet);
}
// Byte align
if (pDataInfo->ibit != cbitBYTE - 1)
{
pDataInfo->ibit = cbitBYTE - 1;
pDataInfo->pCurPtr ++;
}
// Get value from which I will calculate current doc-ID.
if ((fRet = FGetDword(pDataInfo, lpqt->ckeyTopicId,
&dwTopicIDDelta)) != S_OK)
{
exit0:
return fRet;
}
dwTopicID += dwTopicIDDelta;
//
// Get term-weight if present. I'm going to get this
// even if I'm not doing ranking, because it's in the
// index, and I have to get around it somehow.
//
if (lpqt->idxf & IDXF_NORMALIZE)
{
if ((fRet = FGetBits(pDataInfo, &dwTmp, sizeof (USHORT) * cbitBYTE))
!= S_OK)
goto exit0;
if (wImportance != MAX_WEIGHT)
dwTmp = (dwTmp * wImportance) / 65535;
// BUGBUG: we actually want the weights for all aliased terms
// to be considered at once.
wWeight = (WORD)dwTmp;
}
// always skip any occurrence info
if (occf & (OCCF_OFFSET | OCCF_COUNT))
{
// Figure out how many occurences there are in this
// sub-list.
//
if ((fRet = FGetDword(pDataInfo, lpqt->ckeyOccCount,
&dwOccs)) != S_OK)
goto exit0;
if ((fRet = SkipOccList (lpqt, pDataInfo, dwOccs)) != S_OK)
goto exit0;
}
// If this search includes a group, and the doc is not in the
// group then ignore it
if (lpQueryTree->lpGroup
&& FGroupLookup(lpQueryTree->lpGroup, dwTopicID) == FALSE)
continue;
// calculate the relevance upper bound Dr = Ds + sum(Qi) for this
// document (a numeric sketch follows this function)
if (lpResTopicList = TopicNodeSearch(lpQueryTree, lpResQuery, dwTopicID))
wWeightMax = lpResTopicList->wWeight;
else
wWeightMax = 0;
wWeightMax = AddWeights(wWeightMax, wWeight);
wWeightMax = AddWeights(wWeightMax, QTN_TOKEN(lpQtNode)->wWeightRemain);
if (wWeightMax < GetMinDocScore(lpDocScores, ROUND_DOWN)
&&
IsDocScoreListFull(lpDocScores))
{
// do not allocate a node for D; remove D from the result list if present
if (lpResTopicList)
{
register LPITOPIC lpPrev, lpTmp;
// find lpPrev
// UNDONE: look into removing necessity for this loop
for (lpPrev = NULL, lpTmp = (LPITOPIC)lpQtNode->lpTopicList; lpTmp;
lpTmp = lpTmp->pNext) {
if (lpTmp == (LPITOPIC)lpResTopicList)
break;
lpPrev = lpTmp;
}
TopicNodeFree(lpQueryTree, lpResQuery, lpPrev, lpResTopicList);
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF3("Remove topic %lu, wWeightMax = %lu, MinDocScore = %u\n", dwTopicID, \
wWeightMax, GetMinDocScore(lpDocScores, ROUND_DOWN));
#endif
}
// no need to update top-N docs since this wasn't one of them
continue;
}
if (lpResTopicList)
{
WORD wOldWeight = lpResTopicList->wWeight;
// Calc new Ds for this doc and if good enough for the club, ensure that
// club invariant is maintained, else leave it since it could still become
// a club member in the future
lpResTopicList->wWeight = AddWeights(lpResTopicList->wWeight, wWeight);
if (lpResTopicList->wWeight > GetMinDocScore(lpDocScores, ROUND_DOWN))
UpdateDocScoreList(lpDocScores, wOldWeight, lpResTopicList->wWeight);
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF3("Update topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
wWeightMax, lpResTopicList->wWeight);
#endif
continue;
}
// a new document encountered: a possible club member, or there aren't
// enough total documents yet
if ((lpResTopicList = TopicNodeAllocate(lpQueryTree)) == NULL)
{
fRet = E_TOOMANYTOPICS;
goto exit0;
}
lpResTopicList->dwTopicId = dwTopicID;
lpResTopicList->lpOccur = NULL;
lpResTopicList->lcOccur = 0;
lpResTopicList->wWeight = wWeight;
/* Add the new TopicID node into TopicList */
TopicNodeInsert (lpQueryTree, lpResQuery, lpResTopicList);
UpdateDocScoreList(lpDocScores, -1, lpResTopicList->wWeight);
#if defined(_DEBUG) && defined(_DUMPALL)
_DPF3("New topic %lu, wWeightMax = %lu, wWeight = %u\n", dwTopicID, \
wWeightMax, lpResTopicList->wWeight);
#endif
} // end for each topic in posting
fRet = S_OK;
return fRet;
}
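// A numeric sketch of the pruning test above, with made-up weights.
// Suppose doc D has accumulated Ds == 20000, the current term can
// contribute at most wWeight == 3000, and the unprocessed terms can
// contribute at most wWeightRemain == 4000. The upper bound is then
// Dr = AddWeights(AddWeights(20000, 3000), 4000) == 27000. If the doc
// score list is full and its lowest bucket starts at 28000, D can never
// re-enter the top N, so its topic node is freed and the doc is skipped.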
PRIVATE HRESULT PASCAL NEAR ResolveFlatQuery(_LPQT lpqt, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
HRESULT fRet;
PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
LPB astBTreeWord = lpRetV->pBTreeWord;
DWORD dwTotalTopic;
LPB lstModified = lpRetV->pModifiedWord;
ERRB errb;
WORD cByteMatched = 0;
STRING_TOKEN FAR *lpStrList; /* Pointer to strings table */
STRING_TOKEN FAR *lpPrev; /* Pointer to strings table */
_LPDSL lpDocScores = (_LPDSL)(lpqt->lpDocScores);
LPWORDINFO lpwiT;
// first collect the word info for each token
for (lpStrList = lpqt->lpStrList, lpPrev = NULL;
lpStrList; lpStrList = lpStrList->pNext)
{
// accumulate the list of terms to have data read
if ((fRet = GetWordInfoList(lpqt, lpStrList, lpCurQtNode, lpRetV)) != S_OK)
{
return SetErrCode (&errb, fRet);
}
// if no word info was available, remove the token from the list
// it won't get freed until end of query, but who cares - it makes
// the rest of the processing faster
if (!lpStrList->lpwi)
{
if (lpPrev)
lpPrev->pNext = lpStrList->pNext;
else
lpqt->lpStrList = lpStrList->pNext;
// NOTE: lpPrev must remain unchanged when deleting!
continue;
}
// cycle through all the instances of this term's lookalikes
// (e.g. multiple aliases) and add up the total topic count
// since we don't want to treat aliases as rare, even though
// they may be.
lpStrList->dwTopicCount = lpStrList->lpwi->cTopic;
for (lpwiT = lpStrList->lpwi->pNext; lpwiT; lpwiT = lpwiT->pNext)
lpStrList->dwTopicCount += lpwiT->cTopic;
lpPrev = lpStrList;
} // for next term
// sort string list by descending term rarity
SortStringWeights(lpqt);
dwTotalTopic = 0;
for (lpStrList = lpqt->lpStrList;
lpStrList; lpStrList = lpStrList->pNext)
{
LPWORDINFO lpwiT;
if (lpStrList->lpwi == NULL)
continue;
#if defined(_DEBUG) && defined(_DUMPALL)
{
char szTemp[256];
STRNCPY(szTemp, lpStrList->lpString + 2, *(LPWORD)lpStrList->lpString);
szTemp[*(LPWORD)lpStrList->lpString] = 0;
_DPF1("Term: '%s'\n", szTemp);
}
#endif
// We can terminate the query processing once the upper bound on the
// smallest current doc score is <= the current score of the R-th
// biggest doc score, since any further computation can at most
// re-order the bottom (N - R) documents. However, this leaves the
// remaining documents only partially sorted by relevance, which may
// or may not be acceptable. (A numeric sketch follows this function.)
if (AddWeights(GetMinDocScore(lpDocScores, ROUND_UP),
lpStrList->wWeightRemain) <= GetSortedDocScore(lpDocScores,
(int)lpRetV->SrchInfo.dwTopicFullCalc, ROUND_DOWN))
break;
lpqt->lpTopicStartSearch = NULL;
lpqt->lpOccStartSearch = NULL;
QTN_TOKEN(lpCurQtNode) = lpStrList;
for (lpwiT = lpStrList->lpwi; lpwiT; lpwiT = lpwiT->pNext)
{
// TO DO: replace with WORDINFO in curqt node
lpCurQtNode->cTopic = lpwiT->cTopic;
lpCurQtNode->foData = lpwiT->foData;
lpCurQtNode->cbData = lpwiT->cbData;
lpCurQtNode->wRealLength = lpwiT->wRealLength;
if ((fRet = ProcessTerm(lpqt, lpRetV,
NULL, lpCurQtNode, lpStrList)) != S_OK)
{
// kevynct: no need to overwrite count on error since
// we may be attempting to continue
lpCurQtNode->cTopic += dwTotalTopic;
return(fRet);
}
// Accumulate the topic count, since cTopic will be destroyed
// if there are more searches for this node (such as wildcard)
dwTotalTopic += lpCurQtNode->cTopic;
}
}
lpCurQtNode->cTopic = dwTotalTopic;
return S_OK;
}
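// Numeric sketch of the early-out above, with made-up values. With
// R == dwTopicFullCalc == 10, suppose GetSortedDocScore() reports the
// 10th-best score as at least 30016 (ROUND_DOWN), GetMinDocScore()
// bounds the lowest kept score by 25024 (ROUND_UP), and the best any
// doc can still gain from the remaining terms is wWeightRemain == 4000.
// Then 25024 + 4000 <= 30016, so no remaining term can promote a doc
// into the top 10 and the term loop stops; docs below rank R remain
// only partially ordered.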
__inline void MergeWordInfoCounts(WORDINFO FAR *lpwiDest, WORDINFO FAR *lpwiSrc)
{
lpwiDest->cTopic += lpwiSrc->cTopic;
}
// adds zero or more WORDINFO nodes for the passed-in string
PRIVATE HRESULT GetWordInfoList(_LPQT lpqt, STRING_TOKEN FAR *lpStrToken, _LPQTNODE lpCurQtNode, LPRETV lpRetV)
{
int cLevel;
int cMaxLevel;
int fCheckFieldId;
LST lstSearchStr;
LPB lpCurPtr;
int nCmp;
HRESULT fRet;
int f1stIsWild;
LPB lpMaxAddress;
PNODEINFO pLeafInfo = &lpRetV->LeafInfo;
DWORD dwTemp;
LPB astBTreeWord = lpRetV->pBTreeWord;
WORD wLen;
DWORD dwFieldID;
LPB lstModified = lpRetV->pModifiedWord;
BYTE fStemmed;
LPB pBTreeWord;
ERRB errb;
WORD cByteMatched = 0;
WORDINFO wi;
LPWORDINFO lpwi;
fStemmed = 0;
lstSearchStr = lpStrToken->lpString;
f1stIsWild = (lstSearchStr[2] == WILDCARD_CHAR ||
lstSearchStr[2] == WILDCARD_STAR);
// Make sure to turn off stemming if there are any wildcard characters
for (nCmp = *((LPW)lstSearchStr) + 1; nCmp >= 2; nCmp--)
{
if (lstSearchStr[nCmp] == '*' || lstSearchStr[nCmp] == '?')
{
fStemmed = FALSE;
break;
}
}
// Turn off stemming for short words
if (*(LPW)lstSearchStr < 3)
fStemmed = FALSE;
pLeafInfo->nodeOffset = lpqt->foIdxRoot;
pLeafInfo->iLeafLevel = lpqt->cIdxLevels - 1;
pLeafInfo->dwBlockSize = lpqt->dwBlockSize;
// BUGBUG: we don't handle stemming for now.
MEMCPY (lstModified, lstSearchStr,
*((LPW)lstSearchStr) + sizeof (SHORT));
// Zero terminated for wildcard search
lstModified [*((LPW)lstModified) + 2] = 0;
pBTreeWord = lpRetV->pBTreeWord;
/* Change all '*' and '?' to 0. This will
* ensure that things get compared correctly with
* the top node's entries
*/
for (nCmp = *((LPW)lstModified) + 1; nCmp >= 2; nCmp--)
{
if (lstModified[nCmp] == '*' || lstModified[nCmp] == '?')
{
lstModified[nCmp] = 0;
*(LPW)lstModified = nCmp - 2;
}
}
/*
* Point node-resolution variables at the right things. This
* sets these up to read b-tree nodes. Fields not set here are
* set as appropriate elsewhere.
*/
/* Set the flag */
fCheckFieldId = (lpqt->occf & OCCF_FIELDID) && (lpCurQtNode->dwFieldId != DW_NIL_FIELD);
astBTreeWord[0] = 0;
cMaxLevel = lpqt->cIdxLevels - 1;
/*
First we have to find which tree level the word is in. The number of
searches is at most the number of tree levels. The structure of a
directory node is a sequence of:
- Words: PASCAL strings
- Data offset: tells us the offset of the
record in the index file
*/
for (cLevel = 0; cLevel < cMaxLevel ; cLevel++)
{
//
// Get a node.
//
if ((fRet = ReadStemNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
{
return SetErrCode (&errb, fRet);
}
lpMaxAddress = pLeafInfo->pMaxAddress;
lpCurPtr = pLeafInfo->pCurPtr;
//
// Loop through it. This compares the word I'm
// looking for against the word in the b-tree.
// If the word in the b-tree is >= the word I'm
// looking for, I'm done.
//
// If I run off the end of the node, there can be
// no match for this term, so I skip the entire
// process.
//
for (;;)
{
if (lpCurPtr >= lpMaxAddress)
return S_OK;
lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
if (fStemmed)
{
// propagate the stemming error instead of silently returning
// S_OK, matching the leaf-node loop below
if ((fRet = FStem (pBTreeWord, astBTreeWord)) != S_OK)
return fRet;
}
/* Read in NodeId record */
lpCurPtr += ReadFileOffset (&pLeafInfo->nodeOffset, lpCurPtr);
if (f1stIsWild)
break;
if (StrCmpPascal2(lstModified, pBTreeWord) <= 0)
break;
}
}
/* At this point, pLeafInfo->nodeOffset is the node id of the leaf that
is supposed to contain the searched word. Read in the leaf node
*/
if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel)) != S_OK)
{
return fRet;
}
lpCurPtr = pLeafInfo->pCurPtr;
lpMaxAddress = pLeafInfo->pMaxAddress;
//
// Second step is to deal with the leaf node(s). I'm going to
// find and capture some occurrence lists. I'll probably have to
// ignore some bogus ones first.
//
// Reset the word
if (fStemmed)
{
MEMCPY (lstModified, lpRetV->pStemmedQueryWord,
*(LPW)lpRetV->pStemmedQueryWord + sizeof(WORD));
}
else
{
MEMCPY (lstModified, lstSearchStr,
*((LPW)lstSearchStr) + sizeof (SHORT));
}
for (;;)
{
// Check for out of data
if (lpCurPtr >= lpMaxAddress)
{
// Get the offset of the next node
ReadFileOffset (&pLeafInfo->nodeOffset, pLeafInfo->pBuffer);
if (FoIsNil (pLeafInfo->nodeOffset))
{
return S_OK;
}
// Read the next node
if ((fRet = ReadLeafNode ((PNODEINFO)pLeafInfo, cLevel))
!= S_OK)
{
return SetErrCode (&errb, fRet);
}
lpCurPtr =
pLeafInfo->pBuffer + FOFFSET_SIZE + sizeof (SHORT);
lpMaxAddress = pLeafInfo->pMaxAddress;
}
/* Check for interrupt now and then */
if ((++lpqt->cInterruptCount) == 0)
{
if (lpqt->fInterrupt == E_INTERRUPT)
return E_INTERRUPT;
if (*lpqt->cStruct.Callback.MessageFunc &&
(fRet = (*lpqt->cStruct.Callback.MessageFunc)(
lpqt->cStruct.Callback.dwFlags,
lpqt->cStruct.Callback.pUserData, NULL)) != S_OK)
return(fRet);
}
// Extract the word
lpCurPtr = ExtractWord(astBTreeWord, lpCurPtr, &wLen);
if (fStemmed)
{
if ((fRet = FStem (pBTreeWord, astBTreeWord)) != S_OK)
return(fRet);
}
if (lpqt->occf & OCCF_FIELDID)
lpCurPtr += CbByteUnpack (&dwFieldID, lpCurPtr);
nCmp = CompareTerm (lpCurQtNode, lstModified, pBTreeWord, fCheckFieldId ?
dwFieldID : lpCurQtNode->dwFieldId, lpRetV->pLeadByteTable);
switch (nCmp)
{
case KEEP_SEARCHING:
// Skip TopicCount
lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
// Skip data offset
lpCurPtr += FOFFSET_SIZE;
// Skip DataSize
lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
break;
case STRING_MATCH:
lpCurPtr += CbByteUnpack (&wi.cTopic, lpCurPtr);
lpCurPtr += ReadFileOffset (&wi.foData, lpCurPtr);
lpCurPtr += CbByteUnpack (&wi.cbData, lpCurPtr);
wi.wRealLength = wLen;// BUGBUG doublecheck this
// Check for Topic count. This can be 0 if the word has been deleted
// from the index
if (wi.cTopic == 0)
break;
// long search optimization: clip noise words.
// Johnms - eliminate frequent words; typically, a word is
// eliminated if it appears in more than 1/7 of the documents.
if ((lpRetV->SrchInfo.Flag & LARGEQUERY_SEARCH)
&&
lpRetV->SrchInfo.dwValue < wi.cTopic
)
{
break;
}
// allocate WORDINFO node
if ((lpwi = BlockGetElement(lpqt->lpWordInfoBlock)) == NULL)
return E_OUTOFMEMORY;
*lpwi = wi;
lpwi->pNext = lpStrToken->lpwi;
lpStrToken->lpwi = lpwi;
// Save the info
pLeafInfo->pCurPtr = lpCurPtr;
break;
case NOT_FOUND: // No unconditional "break" above.
if (fStemmed && (strncmp (lstSearchStr+ 2, pBTreeWord + 2,
cByteMatched) == 0))
{
// Continue searching in case stemming is messed up
// by non-alphabetic word, such as the sequence:
// subtopic subtopic2 subtopics
lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
// Skip data offset
lpCurPtr += FOFFSET_SIZE;
// Skip DataSize
lpCurPtr += CbByteUnpack (&dwTemp, lpCurPtr);
break;
}
return S_OK;
}
}
}
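// Sketch of the leaf-entry layout this function decodes, inferred from
// the reads above (variable-size fields are CbByteUnpack-compressed):
// word - PASCAL string, extracted via ExtractWord()
// fieldid - present only when the index has OCCF_FIELDID
// cTopic - topic count (0 if the word was deleted from the index)
// foData - FOFFSET_SIZE-byte file offset of the posting data
// cbData - size of the posting data
// A STRING_TOKEN can end up with several WORDINFO nodes chained on lpwi
// (e.g. one per alias), which ResolveFlatQuery() processes in turn.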
/*************************************************************************
* @doc INTERNAL
*
* @func HRESULT PASCAL FAR | FFlatCallBack |
* This callback function is called by the various breakers after
* fetching a token. The token is added to the query's string list.
*
* @parm LST | lstRawWord |
* Pointer to unnormalized string
*
* @parm LST | lstNormWord |
* Pointer to normalized string. This pascal string's size should be
* at least *lstNormWord+2
*
* @parm LFO | lfoWordOffset |
* Offset into the query buffer. It is used to mark the location
* where a parsing error occurred
*
* @parm LPQI | lpqi |
* Pointer to query info structure. This has all "global" variables
*
* @rdesc S_OK if succeeded, else various errors.
*************************************************************************/
PUBLIC HRESULT PASCAL FAR EXPORT_API FFlatCallBack (LST lstRawWord, LST lstNormWord,
LFO lfoWordOffset, LPQI lpqi)
{
/* Add extra 0 to make sure that AllocWord() gets the needed 0
* for WildCardCompare()
*/
lstNormWord[*(LPW)(lstNormWord) + 2] = 0;
// add the token to the string list
if (AllocWord(lpqi->lpQueryTree, lstNormWord) == NULL)
return E_OUTOFMEMORY;
return S_OK;
}
// for now, perform simple insertion sort on the string list
// bugbug: use heapsort or faster method for long lists
// for now, we sort by total topic count decreasing (rare terms first)
PRIVATE VOID PASCAL SortStringWeights(_LPQT lpQueryTree)
{
STRING_TOKEN FAR *pStr, *pStrNext, *pT, *pTPrev;
STRING_TOKEN FAR *pStrHead = lpQueryTree->lpStrList;
DWORD dwSum, dwT;
DWORD dwMaxWeight;
WORD wWeightT;
int nCmp;
FLOAT rLog;
FLOAT rLogSquared;
FLOAT rSigma;
FLOAT rTerm;
BOOL fNormalize = FALSE; // Normalize was for testing only.
if (fNormalize)
{
rSigma = (float)0.0;
// for each term:
for (pStr = pStrHead; pStr; pStr = pStr->pNext)
{
FLOAT fOcc;
// we have to guard against the possibility of the log resulting in
// a value <= 0.0. Very rare, but possible in the future. This happens
// if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
if (pStr->dwTopicCount >= cNintyFiveMillion)
rLog = cVerySmallWt; // log10(100 mil/ 95 mil) == 0.02
else
//rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
rLog = (float) (8.0 - log10((double)pStr->dwTopicCount));
rLogSquared = rLog*rLog;
// Update sigma value
// NOTE : We are bounding the occurrence count by cTFThreshold.
// The RHS of the equation below has an upper bound of 2^30.
fOcc = (float) min(cTFThreshold, pStr->cUsed);
rSigma += fOcc*fOcc*rLogSquared;
}
rSigma = (float)sqrt(rSigma);
}
// calculate final weights and corrections
dwSum = dwMaxWeight = 0L;
for (pStr = pStrHead; pStr; pStr = pStr->pNext)
{
BOOL fNumber;
// once sigma is known, each term's proper weight can be calculated
if (fNormalize)
{
FLOAT rWeight;
// log10(x/y) == log10 (x) - log10 (y). Since x in our case is a known constant,
// 100,000,000, I'm replacing that with its equivalent log10 value of 8.0 and subtracting
// the log10(y) from it
rTerm = (float) (8.0 - log10((double) pStr->dwTopicCount));
// In extreme cases, rTerm could be 0 or even negative (when
// dwTopicCount approaches or exceeds 100,000,000)
if (rTerm <= (float) 0.0)
rTerm = cVerySmallWt; // very small value, == log(100 mil / 95 mil)
// NOTE : rWeight for the doc term would be as follows:
// rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
//
// Since rTerm needs to be recomputed again for the query term weight computation,
// and since rTerm will be the same value for the current term ('cos N and n of log(N/n)
// are the same (N = 100 million and n is whatever the doc term freq is for the term),
// we will factor in the second rTerm at index time. This way, we don't have to deal
// with rTerm at search time (reduces computation and query time shortens)
//
// MV 2.0 initially did the same thing. However, BinhN removed the second rTerm
// because he decided to remove the rTerm altogether from the query term weight. He
// did that to keep the scores reasonably high.
rWeight = ((float) min(cTFThreshold, pStr->cUsed))
* rTerm * rTerm / rSigma;
// without the additional rTerm, we would probably be between 0.0 and 1.0
if (rWeight > rTerm)
wWeightT = 0xFFFF;
else
wWeightT = (WORD) ((float)0xFFFF * rWeight / rTerm);
}
else
wWeightT = 65535;
pStr->wWeight = (WORD)(16383 + 49152 / pStr->dwTopicCount);
// perform any special weight adjustments here
// BUGBUG: use NextChar here, and use charmap here
// numbers four digits or less get downgraded
fNumber = TRUE;
for (nCmp = *((LPWORD)pStr->lpString) + 1; nCmp >= 2; nCmp--)
if (nCmp > 5 || !IS_DIGIT(pStr->lpString[nCmp]))
{
fNumber = FALSE;
break;
}
if (fNumber)
pStr->wWeight = pStr->wWeight / 256;
//pStr->wTermWeight = (WORD)(pStr->wWeight * wWeightT / 65535L);
dwMaxWeight = max(dwMaxWeight, pStr->wWeight);
dwSum += pStr->wWeight;
}
// now sort 'em
for (pStr = pStrHead; pStr;)
{
if (NULL == (pStrNext = pStr->pNext))
break;
if (pStrNext->wWeight <= pStr->wWeight)
{
pStr = pStr->pNext;
continue;
}
// find element in already-sorted section
for (pT = pStrHead, pTPrev = NULL; pT; pTPrev = pT, pT = pT->pNext)
{
if (pT->wWeight <= pStrNext->wWeight)
{
pStr->pNext = pStrNext->pNext;
pStrNext->pNext = pT;
if (pTPrev)
pTPrev->pNext = pStrNext;
else
pStrHead = pStrNext;
break;
}
}
}
dwT = 0;
for (pStr = pStrHead; pStr; pStr = pStr->pNext)
{
dwT += pStr->wWeight;
if (dwSum > dwT)
pStr->wWeightRemain = AddWeights(0, (WORD)((dwSum - dwT) * 65535.0 / dwSum));
else
pStr->wWeightRemain = 1;
}
lpQueryTree->lpStrList = pStrHead;
}
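// Worked example of the rarity weight above: wWeight is 16383 + 49152/n
// for a term appearing in n topics, so n == 1 gives 65535, n == 2 gives
// 40959, and a very common term tends toward 16383; an all-digit term of
// four or fewer digits is further divided by 256. Each term's
// wWeightRemain is then the weight mass of all terms sorted after it,
// rescaled to 0..65535, which ProcessTerm() uses as the optimistic
// "best the unseen terms could still contribute" bound.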