windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/index3.c
2020-09-26 16:20:57 +08:00

2059 lines
66 KiB
C

#define VER3
/*************************************************************************
* *
* INDEX.C *
* *
* Copyright (C) Microsoft Corporation 1990-1994 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Module Intent *
* This is the second stage of the index building process. After all *
* of the words have been added in stage 1, IndexBuild will be called. *
* IndexBuild starts the second stage. We will merge-sort the temp file *
* generated in phase 1 to create a second temp file to send to phase 3. *
* *
**************************************************************************
* *
* Current Owner: BinhN *
* *
**************************************************************************/
#include <mvopsys.h>
#include <mem.h>
#include <memory.h>
#include <io.h>
#include <math.h>
#include <mvsearch.h>
#include <orkin.h>
#include "common.h"
#include "index.h"
#ifdef _DEBUG
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
#endif
// Size in bytes of the buffer used to stage output before it is written
// to the external-sort temp file. The value depends on the target:
// just under 64K when _32BIT is not defined, just under 1M otherwise.
#ifndef _32BIT
#define ESOUTPUT_BUFFER 0xFFFC // Size of output file buffer
// This must be at least the size of the largest word + 12,
// or word + 14 if OCCF_LENGTH is set
#else
#define ESOUTPUT_BUFFER 0xFFFFC // Size of output file buffer
// This must be at least the size of the largest word + 12,
// or word + 14 if OCCF_LENGTH is set
#endif
// Flag bits for the 'flag' argument of ESMemory2Disk():
// FLUSH_NEW_RECORD  - the flush is caused by a new record; flush everything
// FLUSH_EXCEPT_LAST - partial flush; the last topic is kept in memory
#define FLUSH_NEW_RECORD 1
#define FLUSH_EXCEPT_LAST 2
/*************************************************************************
*
* INTERNAL PRIVATE FUNCTIONS
*
* All of them should be declared near
*
*************************************************************************/
PRIVATE HRESULT NEAR PASCAL FillInputBuffer (LPESB, HFPB);
PRIVATE HRESULT NEAR PASCAL ESFlushBuffer (LPESI);
PRIVATE HRESULT NEAR PASCAL ESFillBuffer (_LPIPB, LPESB);
PRIVATE HRESULT NEAR PASCAL ESMemory2Disk (_LPIPB, PMERGEHEADER, int);
PRIVATE HRESULT NEAR PASCAL ProcessFiles (_LPIPB lpipb, LPMERGEPARAMS);
PRIVATE int NEAR PASCAL CompareRecordBuffers (_LPIPB, LPB, LPB);
PRIVATE VOID NEAR PASCAL PQueueUp (_LPIPB, LPESB FAR *, LONG);
PRIVATE VOID NEAR PASCAL PQueueDown (_LPIPB);
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER, PTOPICDATA, int);
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA, PTOPICDATA, int);
PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO lpBlockCombo);
PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi);
PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize);
PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
LPB lpbIsiFile, PHRESULT phr);
PRIVATE VOID PASCAL NEAR SaveEsiTemp (LPIPB lpipb, LPESI lpesi);
PRIVATE VOID PASCAL NEAR UpdateEsiTemp (LPIPB lpipb);
PRIVATE BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId);
/*************************************************************************
*
* INTERNAL PUBLIC FUNCTIONS
*
* All of them should be declared far, unless we know they belong to
* the same segment. They should be included in some include file
*
*************************************************************************/
PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb);
PUBLIC HRESULT FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
/*************************************************************************
 *
 *  @doc    EXTERNAL API INDEX
 *
 *  @func   HRESULT FAR PASCAL | MVIndexBuild |
 *      This function will build an index file based on the information
 *      collected in the Index parameter block.
 *
 *  @parm   HFPB | hSysFile |
 *      If it is non-null, it is the handle of an already opened system file.
 *      In this case the index is a subfile of the opened system file.
 *      If it is 0, the index file is a regular DOS file
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to Index Parameter Block. This structure contains all the
 *      information necessary to build the index file
 *
 *  @parm   HFPB | hfpb |
 *      Index hfpb if pstrFile is NULL
 *
 *  @parm   LPSTR | pstrFile |
 *      Index filename if hfpb is NULL
 *
 *  @rdesc  S_OK, or other errors:
 *      - E_INVALIDARG if lpipb is NULL, or if both hfpb and pstrFile
 *        are NULL
 *      - S_OK without producing an index when there is nothing to
 *        process (no external sort blocks)
 *      - otherwise the error reported by FlushTree, AllocSigmaTable,
 *        MergeSortTreeFile or BuildBTree
 *
 *  @xref   MVIndexInitiate()
 *
 *  @comm
 *      This operates in three main steps:
 *
 *      1. Send finish to first phase to dump the buffer. Then merge-sort
 *         that file into a temporary index. Keep statistics on the
 *         information written to this temporary index.
 *
 *      2. Analyze the statistics gathered during the temporary index
 *         building phase. This analysis results in the choice of
 *         compression processes that will be used in the next step.
 *
 *      3. Permanent index building phase. During this phase, the
 *         temporary index is read, compressed, and written to a
 *         permanent index file. Unlike the temporary index, the
 *         permanent index contains directory nodes as well as leaf
 *         nodes.
 *
 *      When term weighting (IDXF_NORMALIZE) is on, the sigma and log
 *      tables are released on every exit path taken after they have
 *      been allocated, including a failed merge-sort.
 *
 *************************************************************************/
PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexBuild (HFPB hSysFile,
    _LPIPB lpipb, HFPB hfpb, LPSTR pstrFile)
{
    ERRB     errb;
    PHRESULT phr = &errb;
    BYTE     bKeyIndex = CKEY_OCC_BASE; // Index into cKey array for compression
    HRESULT  fRet;                      // Return value from this function.
    DWORD    loop;

    // Sanity check
    if (lpipb == NULL || (NULL == hfpb && NULL == pstrFile))
        return E_INVALIDARG;

    // Flush the internal sort
    // Flushes any records in the tree to disk
    fRet = FlushTree(lpipb);

    // Free all memory blocks (done whether or not the flush succeeded)
    FreeISI (lpipb);

    if (fRet != S_OK)
        return(fRet);

    if (lpipb->esi.cesb == 0)
        // Nothing to process, there will be no index file
        return S_OK;

    if (lpipb->idxf & KEEP_TEMP_FILE)
        SaveEsiTemp (lpipb, &lpipb->esi);

    // If we're doing term-weighting, set up a huge array to contain the
    // sigma terms. The size of the array depends on the total # of topics
    // We also create an array of LOG values to save calculations later
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        if ((fRet = AllocSigmaTable (lpipb)) != S_OK)
            return(fRet);
    }

    if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
    {
        // The weighting tables are already allocated at this point, so
        // leave through the common release code instead of returning.
        fRet = SetErrCode (phr, fRet);
        goto FreeWeightTables;
    }

    if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
        FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);

    // If we are doing term-weighting we have to square root all sigma values
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // ISBU_IR_CHANGE not needed here 'cos computing sqrt is necessary
        // in both cases
        for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
            lpipb->wi.hrgsigma[loop] =
                (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
    }

    // Analyze data to get the best compression scheme

    // TopicId
    // Note: We can't use fixed field compression for topic, since they
    // can be modified by update. A fixed field format may become
    // insufficient to store larger values of topic differences
    VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
        &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // Occurrence Count
    VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
        &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // One compression key per occurrence field that is present, assigned
    // in order starting at CKEY_OCC_BASE
    if (lpipb->occf & OCCF_COUNT)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->occf & OCCF_OFFSET)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->idxf & KEEP_TEMP_FILE)
        UpdateEsiTemp (lpipb);

    // Build the permanent index
    fRet = BuildBTree(hSysFile, lpipb, lpipb->esi.aszTempName, hfpb, pstrFile);

FreeWeightTables:
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // ESMemory2Disk can leave hSigma NULL when growing the table
        // fails, so only release it when it is still set.
        if (lpipb->wi.hSigma != NULL)
        {
            FreeHandle (lpipb->wi.hSigma);
        }
        FreeHandle (lpipb->wi.hLog);
    }
    return fRet;
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | FillInputBuffer |
 *      Fills the buffer by reading from the specified file.
 *
 *  @parm   PESB | pEsb |
 *      Pointer to external sort block to fill. Up to pEsb->dwEsbSize
 *      bytes are read at file offset pEsb->lfo into pEsb->lrgbMem.
 *      On success lfo is advanced by the number of bytes read,
 *      dwEsbSize is set to that number and ibBuf is reset to 0.
 *
 *  @parm   HFPB | hFile |
 *      Handle to the input file
 *
 *  @rdesc  S_OK, or errors if failed. When no bytes are read the block
 *      is left untouched and the error code written by FileSeekRead is
 *      returned; if FileSeekRead wrote none, the result is S_OK.
 *
 *************************************************************************/
HRESULT NEAR PASCAL FillInputBuffer(LPESB pEsb, HFPB hFile)
{
    // Pre-set so that the value returned below is never indeterminate
    // when FileSeekRead reads nothing without reporting an error.
    ERRB  errb = S_OK;
    DWORD dwBytesRead;

    // Read in data
    dwBytesRead = FileSeekRead (hFile, (LPB)pEsb->lrgbMem, pEsb->lfo,
        pEsb->dwEsbSize, &errb);
    if (dwBytesRead == 0)
        return errb;

    // Update utility variables
    pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
    pEsb->dwEsbSize = (CB)dwBytesRead;
    pEsb->ibBuf = 0;
    return S_OK;
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ESFlushBuffer |
 *      Writes the pending contents of the output buffer to the output
 *      file and marks the buffer as empty.
 *
 *  @parm   LPESI | pEsi |
 *      Pointer to ESI block. The first pEsi->ibBuf bytes of
 *      pEsi->pOutputBuffer are written to pEsi->hfpb. On success
 *      pEsi->lfoTempOffset is advanced by that amount and pEsi->ibBuf
 *      is reset to 0.
 *
 *  @rdesc  S_OK, or the error code filled in by FileWrite when the
 *      number of bytes written differs from the number requested
 *
 *************************************************************************/
HRESULT NEAR PASCAL ESFlushBuffer(LPESI pEsi)
{
    ERRB  errb;
    DWORD cbPending = pEsi->ibBuf;      // Bytes waiting in the buffer
    DWORD cbWritten;

    cbWritten = (DWORD)FileWrite (pEsi->hfpb, pEsi->pOutputBuffer,
        cbPending, &errb);

    // Anything other than a complete write is a failure
    if (cbWritten != cbPending)
        return errb;

    // The buffer is empty again; remember how far the file has grown
    pEsi->ibBuf = 0;
    pEsi->lfoTempOffset = FoAddDw (pEsi->lfoTempOffset, cbPending);
    return S_OK;
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ESFillBuffer |
 *      Updates the input buffer with new data from the input file.
 *      The bytes that have not been consumed yet (from pEsb->ibBuf up
 *      to pEsb->dwEsbSize) are moved to the start of the buffer and the
 *      space freed by the consumed bytes is refilled from
 *      lpipb->isi.hfpb, never reading past pEsb->lfoMax.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   LPESB | pEsb |
 *      Pointer to ESB block to be filled. On success lfo is advanced
 *      by the number of bytes read, ibBuf is reset to 0 and dwEsbSize
 *      becomes the carried-over byte count plus the bytes read.
 *
 *  @rdesc  S_OK, or other errors. Reading zero bytes is not an error
 *      by itself; it only fails when FileSeekRead also reports an
 *      error code.
 *************************************************************************/
HRESULT NEAR PASCAL ESFillBuffer(_LPIPB lpipb, LPESB pEsb)
{
    DWORD dwBytesRead;
    DWORD dwExtra = pEsb->dwEsbSize - pEsb->ibBuf;  // Unconsumed bytes
    DWORD dwRoom  = pEsb->dwEsbSize - dwExtra;      // Space to refill
    // Pre-set to S_OK: the check after the read looks at errb whenever
    // zero bytes come back, which is the normal case once the run is
    // exhausted. It must not be indeterminate if FileSeekRead leaves it
    // untouched on that path.
    ERRB  errb = S_OK;

    // Read either the entire buffer size or whatever is left
    dwBytesRead = DwSubFo (pEsb->lfoMax, pEsb->lfo);
    if (dwBytesRead > dwRoom)
        dwBytesRead = dwRoom;

    // Save unprocessed information to beginning of buffer
    if (dwExtra)
        MEMMOVE ((LPB)pEsb->lrgbMem, pEsb->lrgbMem + pEsb->ibBuf, dwExtra);

    // Read in the new data, right behind the carried-over bytes
    dwBytesRead = FileSeekRead (lpipb->isi.hfpb,
        (LPB)(pEsb->lrgbMem + dwExtra), pEsb->lfo, dwBytesRead, &errb);
    if (dwBytesRead == 0 && errb != S_OK)
        return(errb);

    pEsb->lfo = FoAddDw(pEsb->lfo, dwBytesRead);
    pEsb->ibBuf = 0;
    pEsb->dwEsbSize = dwBytesRead + dwExtra;
    return(S_OK);
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   HRESULT FAR PASCAL | MergeSortTreeFile |
 *      Sorts the file generated from the tree output into one
 *      list of sorted elements.
 *
 *      The function opens the internal-sort temp file, gives every
 *      external sort block (ESB) its own input buffer primed with the
 *      start of its run, builds a priority queue over the ESBs and
 *      hands the actual merge to ProcessFiles().
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   LPMERGEPARAMS | lpmp |
 *      Passed through unchanged to ProcessFiles(). MVIndexBuild()
 *      passes NULL.
 *
 *  @rdesc  S_OK, or other errors:
 *      - E_INVALIDARG if lpipb is NULL
 *      - the FileOpen error if the input file cannot be opened
 *      - E_OUTOFMEMORY if a buffer or the queue cannot be allocated
 *      - the FileSeekRead error if an input buffer cannot be primed
 *      - otherwise whatever ProcessFiles() returns
 *
 *  @comm
 *      Once the input file is open, every exit path (success included)
 *      calls FreeEsi(), closes the input file and clears
 *      lpipb->isi.hfpb.
 *
 *************************************************************************/
PUBLIC HRESULT PASCAL FAR MergeSortTreeFile (_LPIPB lpipb, LPMERGEPARAMS lpmp)
{
    // Local replacement variables
    LPESI       pEsi;                   // Pointer to external sort info
    LPISI       pIsi;                   // Pointer to internal sort info
    HFPB        hInputFile;             // Handle to input file
    ERRB        errb;
    PHRESULT    phr = &errb;
    LPESB FAR*  lrgPriorityQueue;       // Pointer to Priority Queue
    // Count of entries in Queue. Kept as a LONG (the type PQueueUp and
    // ProcessFiles work with) so that it cannot wrap while cesb, a
    // DWORD, is still in range.
    LONG        uiQueueSize = 0;
    DWORD       dwBufferSize;

    // Working variables
    HRESULT     fRet;
    LPESB       pEsb;                   // Temp pointer to linked list

    // Sanity check
    if (lpipb == NULL)
        return E_INVALIDARG;

    // Variables initialization
    pEsi = &lpipb->esi;
    pIsi = &lpipb->isi;

    // Open input file
    if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName,
        REGULAR_FILE, READ, phr)) == NULL)
        return *phr;
    hInputFile = pIsi->hfpb;

    // Allocate & fill input buffers
    for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
    {
        DWORD cbRead;

        // Three quarters of the allowed memory, split evenly over the ESBs
        dwBufferSize = (lpipb->dwMemAllowed * 6) / (8 * pEsi->cesb);

        // Allocate buffer space
        if ((pEsb->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            dwBufferSize)) == NULL)
        {
            fRet = E_OUTOFMEMORY;
            goto cleanup;
        }
        pEsb->lrgbMem = (LRGB)_GLOBALLOCK (pEsb->hMem);

        // Never read past the end of this ESB's run
        if ((cbRead = DwSubFo(pEsb->lfoMax, pEsb->lfo)) > dwBufferSize)
            cbRead = dwBufferSize;

        // Fill buffer from disk
        if (FileSeekRead (hInputFile, pEsb->lrgbMem, pEsb->lfo,
            cbRead, phr) != (LONG)cbRead)
        {
            fRet = *phr;
            // Release this buffer here and clear the handle
            _GLOBALUNLOCK(pEsb->hMem);
            _GLOBALFREE(pEsb->hMem);
            pEsb->hMem = NULL;
            goto cleanup;
        }

        pEsb->dwEsbSize = cbRead;
        pEsb->ibBuf = 0;
        pEsb->lfo = FoAddDw (pEsb->lfo, cbRead);
    }

    // Allocate a priority queue array. The size of the array
    // is the number of external sort info blocks plus 1, since
    // location 0 is not used.
    if ((pEsi->hPriorityQueue = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (DWORD)(pEsi->cesb + 1) * sizeof (LPB))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto cleanup;
    }
    pEsi->lrgPriorityQueue =
        (LPESB FAR *)_GLOBALLOCK (pEsi->hPriorityQueue);
    lrgPriorityQueue = pEsi->lrgPriorityQueue;

    // Attach input buffers to Priority Queue
    // Remembering to start at offset 1 NOT 0 (PQ's have a null 0 element)
    for (pEsb = pEsi->lpesbRoot; pEsb != NULL; pEsb = pEsb->lpesbNext)
    {
        lrgPriorityQueue[++uiQueueSize] = pEsb;
        PQueueUp (lpipb, lrgPriorityQueue, uiQueueSize);
    }
    pEsi->uiQueueSize = uiQueueSize;

    // Do the merge
    fRet = ProcessFiles(lpipb, lpmp);

    _GLOBALUNLOCK (pEsi->hPriorityQueue);
    _GLOBALFREE (pEsi->hPriorityQueue);
    pEsi->hPriorityQueue = NULL;

cleanup:
    // Common exit for success and failure
    FreeEsi (lpipb);
    FileClose(hInputFile);
    pIsi->hfpb = NULL;
    return fRet;
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ESMemory2Disk |
 *      Copies temp record to output buffer.
 *
 *      The record header (length, word, optional word length and field
 *      id, topic count) is written the first time a record is flushed.
 *      Then every topic of pHeader's topic list is written as a
 *      byte-packed topic-id delta followed, when occurrence fields are
 *      in use, by its occurrence count and byte-packed occurrence
 *      deltas. The output buffer is flushed through ESFlushBuffer()
 *      whenever it runs full. Topic and occurrence nodes that have been
 *      written are returned to the free lists of lpipb->TopicBlock and
 *      lpipb->OccBlock.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   PMERGEHEADER | pHeader |
 *      Pointer to header to flush
 *
 *  @parm   int | flag |
 *      - if FLUSH_NEW_RECORD, the flush is due to new record, we flush
 *        everything, else we may do a partial flush only
 *      - if FLUSH_EXCEPT_LAST, we don't flush the last topic
 *
 *  @rdesc  S_OK, or other errors (an ESFlushBuffer or FileSeekWrite
 *      failure, or E_OUTOFMEMORY when the sigma table cannot be grown)
 *
 *  @comm
 *      If the sigma table cannot be grown, the existing table and its
 *      handle are left in place so the caller can still release them.
 *************************************************************************/
PRIVATE HRESULT NEAR PASCAL ESMemory2Disk
    (_LPIPB lpipb, PMERGEHEADER pHeader, int flag)
{
    // Local replacement variables
    LPESI    pEsi = &lpipb->esi;
    // Soft limit: writes are only started at or below this address
    LPB      pMax = pEsi->pOutputBuffer + ESOUTPUT_BUFFER - 2 * sizeof(DWORD);
    DWORD    dwOccCount;
    LPB      pOutputBuffer = pEsi->pOutputBuffer;
    ERRB     errb;
    PHRESULT phr = &errb;
    HRESULT  fRet;
    BYTE     cNumOcc;
    OCCF     occf;

    // Working variables
    PTOPICDATA pTopic;          // Temp var to traverse the topic linked list
    DWORD    loop, sub;         // Various loop counters
    DWORD    dwTopicIdDelta;
    DWORD    OccDelta[5];       // Delta base for all occurrence data
    DWORD    LastOcc[5];
    FLOAT    rLog;              // (1/n) - IDXF_NORMALIZE is set
    FLOAT    rLogSquared;       // (1/n)^2 - IDXF_NORMALIZE is set
    LPB      pStart;
    LPB      pCurPtr;

    // Set up pointers
    pStart = pCurPtr = pOutputBuffer + pEsi->ibBuf;

    // Variable replacement
    occf = lpipb->occf;

    // Size of string
    loop = pHeader->dwStrLen;

    // Make sure the string, FileId, Topic Count and Record Size fit
    // We add in an extra DWORD for 5 byte compression problems and
    // to cover the Word Length if there is one.
    if ((pStart + loop + sizeof (DWORD) * 5) >= pMax)
    {
        if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
            return(fRet);
        pStart = pCurPtr = pOutputBuffer;
    }

    if (pHeader->fEmitRecord == FALSE)
    {
        // The record header has never been emitted: emit it now and
        // remember that we did
        pHeader->fEmitRecord = TRUE;

        // Skip record size field
        pCurPtr += sizeof (DWORD);

        // Pascal string
        MEMCPY (pCurPtr, pHeader->lpbWord, loop);
        pCurPtr += loop;

        // Word Length
        if (occf & OCCF_LENGTH)
            pCurPtr += CbBytePack (pCurPtr, pHeader->dwWordLength);

        // FieldId
        if (occf & OCCF_FIELDID)
            pCurPtr += CbBytePack (pCurPtr, pHeader->dwFieldId);

        // Topic Count
        if (flag & FLUSH_NEW_RECORD)
        {
            // This is the whole record. dwTopicCount value is correct
            SETLONG((LPUL)pCurPtr, pHeader->dwTopicCount);
        }
        else
        {
            // Partial flush: the final count is not known yet.
            // Save the offset for backpatching
            pHeader->foTopicCount = FoAddDw (pEsi->lfoTempOffset,
                (DWORD)(pCurPtr - pOutputBuffer));
            pHeader->pTopicCount = pCurPtr;
        }
        pCurPtr += sizeof(DWORD);

        // Write Record Length
        *(LPUL)pStart = (DWORD)(pCurPtr - pStart - sizeof (DWORD));
    }
    else if (flag & FLUSH_NEW_RECORD)
    {
        // The record header was emitted by an earlier partial flush
        // (pHeader->fEmitRecord == TRUE).
        // We need to backpatch the topic count
        if (FoCompare(pHeader->foTopicCount, pEsi->lfoTempOffset) >= 0)
        {
            // Everything is still in memory, just do local backpatch
            SETLONG((LPUL)(pHeader->pTopicCount), pHeader->dwTopicCount);
        }
        else
        {
            // Do backpatch in the file by seeking back to the right
            // place
            if (FileSeekWrite(pEsi->hfpb, &pHeader->dwTopicCount,
                pHeader->foTopicCount, sizeof(DWORD), phr) != sizeof(DWORD))
                return(*phr);

            // Restore the current file offset
            FileSeek(pEsi->hfpb, pEsi->lfoTempOffset, 0, phr);
        }
    }

    // Convert all occ data to delta values & compress them
    pTopic = pHeader->pTopic;
    cNumOcc = lpipb->ucNumOccDataFields;

    for (; pTopic;)
    {
        POCCDATA   pOccData;
        PTOPICDATA pReleased;

        // On a partial flush the last topic stays in memory
        if ((flag & FLUSH_EXCEPT_LAST) && pTopic->pNext == NULL)
            break;

        // Set TopicId delta
        dwTopicIdDelta = pTopic->dwTopicId - pHeader->dwLastTopicId;
        pHeader->dwLastTopicId = pTopic->dwTopicId;

        // Save bit size to the statistics array
        lpipb->BitCount[CKEY_TOPIC_ID][CbitBitsDw (dwTopicIdDelta)] += 1;

        // Write TopicID Delta
        if (pCurPtr > pMax)
        {
            pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
            if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                return(fRet);
            pCurPtr = pOutputBuffer;
        }
        pCurPtr += CbBytePack (pCurPtr, dwTopicIdDelta);

        if (cNumOcc == 0)
        {
            // No occurrence fields: the topic id is all there is to write
            pReleased = pTopic;
            pTopic = pTopic->pNext;

            // Add the released to the freed linked list
            pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
            lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
            lpipb->TopicBlock.dwCount--;
            continue;
        }

        // Note: dwOccCount is also used by the sigma update further down
        if ((dwOccCount = pTopic->dwOccCount) != 0)
        {
            // Reset count occdata delta for every new topic
            MEMSET (OccDelta, 0, 5 * sizeof (DWORD));
            MEMSET (LastOcc, 0, 5 * sizeof (DWORD));

            // Copy Occurrence Count
            if (pCurPtr > pMax)
            {
                pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
                if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                    return(fRet);
                pCurPtr = pOutputBuffer;
            }
            pCurPtr += CbBytePack (pCurPtr, dwOccCount);

            // Save bit size to the statistics array
            // NOTE(review): index 1 is presumably CKEY_OCC_COUNT, which is
            // what MVIndexBuild analyses for the occurrence count - confirm
            lpipb->BitCount[1][CbitBitsDw (dwOccCount)] += 1;

            // Repeat for each occurrence block
            for (pOccData = pTopic->pOccData,
                sub = dwOccCount; sub > 0 && pOccData; --sub)
            {
                LPDW     lpDw;
                int      iIndex;
                POCCDATA pOccReleased;

                if (pCurPtr + 5 * sizeof(DWORD) > pMax)
                {
                    pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
                    if ((fRet = ESFlushBuffer (pEsi)) != S_OK)
                        return(fRet);
                    pStart = pCurPtr = pOutputBuffer;
                }

                lpDw = &pOccData->OccData[0];
                iIndex = CKEY_OCC_BASE;

                if (occf & OCCF_COUNT)
                {
                    // Convert each value to a delta value
                    OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
                    LastOcc[iIndex] = *lpDw;
                    lpDw++;

                    // Save to bit size to the statistics array
                    lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;

                    // Compress occurrence field to buffer
                    pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
                    iIndex++;
                }

                if (occf & OCCF_OFFSET)
                {
                    // Convert each value to a delta value
                    OccDelta[iIndex] = *lpDw - LastOcc[iIndex];
                    LastOcc[iIndex] = *lpDw;
                    lpDw++;

                    // Save to bit size to the statistics array
                    lpipb->BitCount[iIndex][CbitBitsDw (OccDelta[iIndex])] += 1;

                    // Compress occurrence field to buffer
                    pCurPtr += CbBytePack (pCurPtr, OccDelta[iIndex]);
                    iIndex++;
                }

                // Return the occurrence node to the free list
                pOccReleased = pOccData;
                pOccData = pOccData->pNext;
                pOccReleased->pNext = (POCCDATA)lpipb->OccBlock.pFreeList;
                lpipb->OccBlock.pFreeList = (PLIST)pOccReleased;
                lpipb->OccBlock.dwCount--;
            }

            // Check for mismatch between count and links
#ifdef _DEBUG
            if (sub)
                SetErrCode (phr, E_ASSERT);
            if (pOccData)
                SetErrCode (phr, E_ASSERT);
#endif
        }

        // Update the sigma values if we are doing term weighing
        // erinfox: remove test against flag. Sometimes sigma never
        // got calculated for a topic and that caused a divide by zero
        // later on.
        if ((lpipb->idxf & IDXF_NORMALIZE) /* && (flag & FLUSH_NEW_RECORD)*/)
        {
            if (pTopic->dwTopicId > lpipb->dwMaxTopicId)
            {
                HANDLE hNewSigma;

                // Increase the size of the sigma table. This can happen
                // when updating with new topics
                _GLOBALUNLOCK (lpipb->wi.hSigma);

                // Keep the result in a temporary: when the reallocation
                // fails the old handle must not be overwritten, otherwise
                // the table is lost and hrgsigma is left stale.
                hNewSigma = _GLOBALREALLOC (lpipb->wi.hSigma,
                    (pTopic->dwTopicId + 1) * sizeof(float),
                    DLLGMEM_ZEROINIT);
                if (hNewSigma == NULL)
                {
                    // Put the old table back in its locked state
                    lpipb->wi.hrgsigma =
                        (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
                    return (SetErrCode(phr, E_OUTOFMEMORY));
                }
                lpipb->wi.hSigma = hNewSigma;
                lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK(lpipb->wi.hSigma);
                lpipb->dwMaxTopicId = pTopic->dwTopicId ;
            }

            if (lpipb->bState == INDEXING_STATE)
            {
#ifndef ISBU_IR_CHANGE
                FLOAT fOcc;

                if (pHeader->dwTopicCount >= cLOG_MAX)
                {
                    // we have to guard against the possibility of the log resulting in
                    // a value <= 0.0. Very rare, but possible in the future. This happens
                    // if dwTopicCount approaches or exceeds the N we are using (N == 100 million)
                    if (pHeader->dwTopicCount >= cNintyFiveMillion)
                        rLog = cVerySmallWt;    // log10(100 mil/ 95 mil) == 0.02
                    else
                        //rLog = (float) log10(cHundredMillion/(double)pHeader->dwTopicCount);
                        rLog = (float) (8.0 - log10((double)pHeader->dwTopicCount));

                    rLogSquared = rLog*rLog;
                }
                else
                    // Small counts come from the precomputed table
                    rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];

                // Update sigma value
                // NOTE : We are bounding dwOccCount by a value of cTFThreshold
                // The RHS of the equation below has an upperbound of 2 power 30.
                fOcc = (float) min(cTFThreshold, dwOccCount);
                lpipb->wi.hrgsigma[pTopic->dwTopicId] += (SIGMA) fOcc*fOcc*rLogSquared;
                //(SIGMA) (fOcc * fOcc * rLogSquared/(float)0xFFFF);
#else
                // Failed for update : UNDONE
                if (pHeader->dwTopicCount >= cLOG_MAX)
                {
                    rLog = (float)1.0 / (float)pHeader->dwTopicCount;
                    rLogSquared = rLog * rLog;
                }
                else
                    rLogSquared = lpipb->wi.lrgrLog[(WORD)pHeader->dwTopicCount];

                // Update sigma value
                lpipb->wi.hrgsigma[pTopic->dwTopicId] +=
                    (SIGMA)(dwOccCount * dwOccCount) * rLogSquared;
#endif // ISBU_IR_CHANGE
            }
        }

        pReleased = pTopic;
        pTopic = pTopic->pNext;

        // Add the released to the freed linked list
        pReleased->pNext = (PTOPICDATA)lpipb->TopicBlock.pFreeList;
        lpipb->TopicBlock.pFreeList = (PLIST)pReleased;
        lpipb->TopicBlock.dwCount--;
    }

    // Whatever was not written (NULL, or the last topic on a partial
    // flush) becomes the whole topic list
    pHeader->pTopic = pHeader->pLastTopic = pTopic;

    // Update output offset
    pEsi->ibBuf = (DWORD)(pCurPtr - pOutputBuffer);
    return(S_OK);
}
/*************************************************************************
 *
 *  @doc    INDEX
 *
 *  @func   HRESULT NEAR PASCAL | ProcessFiles |
 *      Sorts the file generated from the tree output into one
 *      list of sorted elements.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to index parameter block
 *
 *  @parm   LPMERGEPARAMS | lpmp |
 *      Optional (may be NULL). When given, every topic for which
 *      FindTopic(lpmp, id) returns TRUE is skipped instead of merged.
 *
 *  @rdesc  S_OK, or errors if failed
 *
 *  @notes
 *      This function processes the input buffers and uses dynamic
 *      memory allocation to sort each word as it comes in. Once a
 *      word stops repeating, it is flushed to disk and the memory is
 *      reset for the next word.
 *
 *      The output buffer, the output file handle, the merge header and
 *      the word buffer acquired here are released on every exit path,
 *      success included. The cleanup labels at the end of the function
 *      undo the acquisitions in reverse order; a failure jumps to the
 *      label matching what has been acquired so far.
 *************************************************************************/
HRESULT NEAR PASCAL ProcessFiles(_LPIPB lpipb, LPMERGEPARAMS lpmp)
{
    // Local replacement variables
    LPESI   pEsi = &lpipb->esi;
    LPESB FAR * lrgPriorityQueue = pEsi->lrgPriorityQueue;
    LONG    uiQueueSize = pEsi->uiQueueSize;
    LPB     pQueuePtr;
    WORD    cNumOcc = lpipb->ucNumOccDataFields;
    int     occf = lpipb->occf;
    LPB     pBufMax;
    HANDLE  hWord;
    LPB     lpbWord;
    DWORD   dwUniqueTerm = 0;       // Used for callback function
#ifdef _DEBUG
    BYTE    astWord[300];
    BYTE    astLastWord[300];
#endif

    // Working variables
    PMERGEHEADER pHeader;           // Pointer to merge header
    LPESB   pEsb;                   // Temp ESB pointer
    PTOPICDATA pNewTopic;           // Used to create new topic
    DWORD   loop;                   // Temp loop counter
    HANDLE  hHeader;
    HFPB    hOutputFile;            // Handle to output file
    int     fRet;                   // Return value
    USHORT  uStringSize;            // Size of Pascal String
    ERRB    errb;
    PHRESULT phr = &errb;
    static long Count = 0;

    // Setup Block Manager
    if ((fRet = ESBBlockAllocate (lpipb, lpipb->dwMemAllowed / 4)) != S_OK)
        return(fRet);

    // Allocate output buffer
    if ((pEsi->hBuf = _GLOBALALLOC
        (DLLGMEM_ZEROINIT, ESOUTPUT_BUFFER)) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto exit1;
    }
    pEsi->pOutputBuffer = (LPB)_GLOBALLOCK (pEsi->hBuf);
    pEsi->ibBuf = 0;

    // Create output file
    GETTEMPFILENAME ((char)0, "eso", 0, pEsi->aszTempName);
    if ((pEsi->hfpb = FileOpen(NULL, pEsi->aszTempName,
        REGULAR_FILE, WRITE, &errb)) == NULL)
    {
        fRet = E_FILECREATE;
        goto exit2;
    }
    hOutputFile = pEsi->hfpb;

    // Setup new record in memory
    if ((hHeader = _GLOBALALLOC
        (DLLGMEM_ZEROINIT, sizeof (MERGEHEADER))) == NULL)
    {
        fRet = E_OUTOFMEMORY;
        goto exit3;
    }
    pHeader = (PMERGEHEADER)_GLOBALLOCK (hHeader);

    // Allocate buffer for a word, which include 64K + sizeof(WORD) + slack
    if ((hWord = _GLOBALALLOC(DLLGMEM_ZEROINIT, 0x10004)) == NULL)
    {
        // fRet still holds S_OK from ESBBlockAllocate at this point; it
        // has to be set or the failure is reported as success.
        fRet = E_OUTOFMEMORY;
        goto exit4;
    }
    pHeader->lpbWord = lpbWord = (LPB)_GLOBALLOCK(hWord);

#ifdef _DEBUG
    astWord[0] = 0;
#endif

    // Process all input buffers
    do
    {
        DWORD dwWordLength;
        DWORD dwFieldId;
        LPB   lpStart;
        DWORD dwTopicCount;

#ifdef _DEBUG
        Count++;
#endif
        // Grab smallest record and send to buffer
        pEsb = lrgPriorityQueue[1];

        // Set the fill limit
        pBufMax = pEsb->lrgbMem + pEsb->dwEsbSize - 256;

        if ((pQueuePtr = pEsb->lrgbMem + pEsb->ibBuf) >= pBufMax)
        {
            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                goto exit5;
            pQueuePtr = pEsb->lrgbMem;
        }

        // Skip the record length; what follows is the record beginning
        pQueuePtr += sizeof(DWORD);
        lpStart = pQueuePtr;

        // Get string (2-byte length prefix plus the characters)
        uStringSize = GETWORD ((LPUW)pQueuePtr) + sizeof (SHORT);
        pQueuePtr += uStringSize;

#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        if (occf & OCCF_LENGTH)
            pQueuePtr += CbByteUnpack (&dwWordLength, pQueuePtr);
        else
            dwWordLength = 0;

#ifdef _DEBUG
        if (pQueuePtr >= pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        if (occf & OCCF_FIELDID)
            pQueuePtr += CbByteUnpack (&dwFieldId, pQueuePtr);
        else
            dwFieldId = 0;

#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        // Is the word in the buffer equal to the new word?
        // If it is not then flush the old word
        if (*(LPUW)pHeader->lpbWord)
        {
            // fRet doubles as a "word differs" flag here
            fRet = (StrCmp2BytePascal (pHeader->lpbWord, lpStart)
                || dwWordLength > pHeader->dwWordLength);

            if (fRet == 0) // Same word, reduce the unique words count
                lpipb->dwUniqueWord--;

            if (fRet || dwFieldId > pHeader->dwFieldId)
            {
#if defined(_DEBUG) && !defined(_MAC)
                // Word out of order
                if (StrCmp2BytePascal (pHeader->lpbWord, lpStart) > 0)
                    assert(FALSE);
#endif
                // The previous word is complete: write all of it
                if ((fRet = ESMemory2Disk (lpipb, pHeader,
                    FLUSH_NEW_RECORD)) != S_OK)
                    goto exit5;

                // Reset pHeader
                MEMSET (pHeader, 0, sizeof (MERGEHEADER));

                // Set the word buffer
                pHeader->lpbWord = lpbWord;
#ifdef _DEBUG
                STRCPY(astLastWord, astWord);
#endif

                // Call the user callback every once in a while
                if (!(++dwUniqueTerm % 8192L)
                    && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
                {
                    PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
                    CALLBACKINFO Info;

                    Info.dwPhase = 2;
                    Info.dwIndex = (DWORD)((float)dwUniqueTerm / lpipb->dwUniqueWord * 100);
                    fRet = (*pCallbackInfo->MessageFunc)
                        (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
                    if (S_OK != fRet)
                        goto exit5;
                }
            }
        }

        // Update the data
        pHeader->dwFieldId = dwFieldId;
        pHeader->dwWordLength = dwWordLength;
        pHeader->dwStrLen = uStringSize;

        // Copy word and header info
        MEMCPY (pHeader->lpbWord, (LPB)lpStart, uStringSize);

#ifdef _DEBUG
        if (uStringSize >= 300)
            uStringSize = 300;
        MEMCPY (astWord, lpStart + 2, uStringSize - 2);
        astWord[uStringSize - 2] = 0;
        //if (STRCMP(astWord, "87db") == 0)
        //  _asm int 3;
#endif

        pQueuePtr += CbByteUnpack (&dwTopicCount, pQueuePtr);
        pHeader->dwTopicCount += dwTopicCount;

#ifdef _DEBUG
        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
            SetErrCode (phr, E_ASSERT);
#endif

        pNewTopic = NULL;

        // Copy topic(s) to memory
        for (loop = dwTopicCount; loop > 0; loop--)
        {
            DWORD dwTopicId;

            // Get the topic id
            pQueuePtr += CbByteUnpack (&dwTopicId, pQueuePtr);

            // kevynct: if there is a to-delete list, and this topic is on it, skip it
            if (lpmp && FindTopic(lpmp, dwTopicId))
            {
                // Get the occ count
                if (cNumOcc)
                {
                    DWORD dwOccCount;
                    DWORD dwT;

                    pQueuePtr += CbByteUnpack (&dwOccCount, pQueuePtr);
#ifdef _DEBUG
                    if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                        SetErrCode (phr, E_ASSERT);
#endif
                    for (; dwOccCount > 0; dwOccCount--)
                    {
                        // Fill up the buffer if run out of data
                        if (pQueuePtr >= pBufMax)
                        {
                            pEsb->ibBuf = (DWORD)(pQueuePtr - pEsb->lrgbMem);
                            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                                goto exit5;
                            pQueuePtr = pEsb->lrgbMem;
                        }

                        // Skip cNumOcc packed fields; every case falls
                        // through to the next one on purpose
                        switch (cNumOcc)
                        {
                            case 5:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                            case 4:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                            case 3:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                            case 2:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                            case 1:
                                pQueuePtr += CbByteUnpack (&dwT, pQueuePtr);
                        }
#ifdef _DEBUG
                        if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                            SetErrCode (phr, E_ASSERT);
#endif
                    } // end occ loop
                } // end if occ non-zero

                pHeader->dwTopicCount--;
                continue;
            } // end of to-delete condition

            // Allocate a topicdata node, unless one is left over from the
            // previous iteration
            if ((pNewTopic == NULL) &&
                (pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
            {
                // Out of nodes: write out what we have (except the last
                // topic) to get nodes back, then try again
                if ((fRet = ESMemory2Disk(lpipb, pHeader,
                    FLUSH_EXCEPT_LAST)) != S_OK)
                    goto exit5;

                if ((pNewTopic = GetBlockNode (&lpipb->TopicBlock)) == NULL)
                {
                    // Extremely weird, since we just release a bunch of
                    // memory
                    fRet = E_ASSERT;
                    goto exit5;
                }
            }

            pNewTopic->dwTopicId = dwTopicId;

#ifdef _DEBUG
            if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                SetErrCode (phr, E_ASSERT);
#endif
            // Set the other fields
            pNewTopic->pOccData = pNewTopic->pLastOccData = NULL;

            // Get the occ count
            if (cNumOcc)
            {
                DWORD    dwOccCount;
                POCCDATA pOccData;
                LPDW     lpDw;

                pQueuePtr += CbByteUnpack (&pNewTopic->dwOccCount,
                    pQueuePtr);

#ifdef _DEBUG
                if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                    SetErrCode (phr, E_ASSERT);
#endif
                for (dwOccCount = pNewTopic->dwOccCount; dwOccCount > 0;
                    dwOccCount--)
                {
                    // Get all occ fields
                    if ((pOccData = (POCCDATA)GetBlockNode
                        (&lpipb->OccBlock)) == NULL )
                    {
                        if ((fRet = ESMemory2Disk(lpipb, pHeader,
                            FLUSH_EXCEPT_LAST)) != S_OK)
                            goto exit5;

                        if ((pOccData =
                            (POCCDATA)GetBlockNode(&lpipb->OccBlock)) == NULL)
                        {
                            // Extremely weird, since we just release a bunch of
                            // memory, unless there are so many duplicates of the same word
                            // in the topic
                            fRet = E_TOOMANYDUPS;
                            goto exit5;
                        }
                    }

                    // Fill up the buffer if run out of data
                    if (pQueuePtr >= pBufMax)
                    {
                        pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);
                        if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                            goto exit5;
                        pQueuePtr = pEsb->lrgbMem;
                    }

                    // Unpack cNumOcc fields; every case falls through to
                    // the next one on purpose
                    lpDw = (LPDW)&pOccData->OccData;
                    switch (cNumOcc)
                    {
                        case 5:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                        case 4:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                        case 3:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                        case 2:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                        case 1:
                            pQueuePtr += CbByteUnpack (lpDw++, pQueuePtr);
                    }
#ifdef _DEBUG
                    if (pQueuePtr > pEsb->lrgbMem + pEsb->dwEsbSize)
                        SetErrCode (phr, E_ASSERT);
#endif
                    // Attach to the linked list
                    // Note that we are assuming that the occurrences are
                    // already sorted, so no checking is done here
                    if (pNewTopic->pOccData == NULL)
                    {
                        pNewTopic->pLastOccData = pNewTopic->pOccData
                            = pOccData;
                    }
                    else
                    {
                        // Add to the end of the linked list
                        pNewTopic->pLastOccData->pNext = pOccData;
                        pNewTopic->pLastOccData = pOccData;
                    }
                    pOccData->pNext = NULL;
                }
            }

            // A non-NULL result is kept in pNewTopic and reused for the
            // next topic; the topic count drops by one in that case
            // (presumably the topic was folded into an existing node -
            // confirm in MergeTopicNode)
            if ((pNewTopic = MergeTopicNode (pHeader, pNewTopic,
                cNumOcc)) != NULL)
                pHeader->dwTopicCount --;
        }

        // Update the offset
        pEsb->ibBuf = (DWORD) (pQueuePtr - pEsb->lrgbMem);

        // If next record doesn't fit in buffer
        // Then reset to beginning and load data
        if (pEsb->dwEsbSize - pEsb->ibBuf <= sizeof(DWORD) ||
            pEsb->dwEsbSize - pEsb->ibBuf <= GETLONG((LPUL)pQueuePtr) +
            2 * sizeof(DWORD))
        {
            if ((fRet = ESFillBuffer (lpipb, pEsb)) != S_OK)
                goto exit5;
        }

        // Adjust priority queue
        if (uiQueueSize > 1)
        {
            if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
                pEsb->ibBuf >= pEsb->dwEsbSize)
            {
                // Replace first record with last
                lrgPriorityQueue[1] = lrgPriorityQueue[uiQueueSize];
                lrgPriorityQueue[uiQueueSize] = NULL;
                uiQueueSize--;
                pEsi->uiQueueSize = uiQueueSize;
            }
#if 0
            else
            {   // If the stream still has input add it back into the Queue
                lrgPriorityQueue[uiQueueSize] = pEsb;
                PQueueUp(lpipb, lrgPriorityQueue, uiQueueSize);
            }
#endif
            PQueueDown(lpipb); // Maintain sort order
        }
        else if (DwSubFo (pEsb->lfo, pEsb->lfoMax) != 0 &&
            pEsb->ibBuf >= pEsb->dwEsbSize)
        {
            // Last stream is done: write out the final word
            uiQueueSize--;
            pEsi->uiQueueSize = uiQueueSize;
            if ((fRet = ESMemory2Disk (lpipb, pHeader,
                FLUSH_NEW_RECORD)) != S_OK)
                goto exit5;
        }
    } while (uiQueueSize);

    fRet = ESFlushBuffer(pEsi);

    // Cleanup, in reverse order of acquisition. The success path falls
    // through all of it as well.
exit5:
    _GLOBALUNLOCK(hWord);
    _GLOBALFREE(hWord);
exit4:
    _GLOBALUNLOCK(hHeader);
    _GLOBALFREE (hHeader);
exit3:
    FileClose (hOutputFile);
exit2:
    FreeHandle (pEsi->hBuf);
    pEsi->hBuf = NULL;
exit1:
    return fRet;
}
/*************************************************************************
 *  @doc    INTERNAL INDEXING
 *
 *  @func   BOOL | FindTopic |
 *      Reports whether a topic id is present in lpmp->rgTopicId. The
 *      position of the most recent hit is remembered in
 *      lpmp->lpTopicIdLast and the next scan resumes from there.
 *
 *  @parm   LPMERGEPARAMS | lpmp |
 *      Merge parameters holding the id table (rgTopicId, dwCount) and
 *      the cached cursor (lpTopicIdLast)
 *
 *  @parm   DWORD | dwTopicId |
 *      Topic id to look for
 *
 *  @rdesc  TRUE if the id is in the table, FALSE otherwise
 *
 *  @comm   The first/last range test only makes sense for a table kept
 *      in ascending order -- presumably guaranteed by whoever fills
 *      rgTopicId; confirm against the caller.
 *************************************************************************/
BOOL PASCAL NEAR FindTopic(LPMERGEPARAMS lpmp, DWORD dwTopicId)
{
    LPDW pLast;     // Last valid entry of the table
    LPDW pScan;     // Scan cursor

    Assert(lpmp->dwCount > 0);
    Assert(lpmp->lpTopicIdLast >= lpmp->rgTopicId);
    Assert(lpmp->lpTopicIdLast < lpmp->rgTopicId + lpmp->dwCount);

    // Reject ids that lie below the first or above the last entry
    if (lpmp->rgTopicId[0] > dwTopicId)
        return FALSE;
    pLast = lpmp->rgTopicId + lpmp->dwCount - 1;
    if (*pLast < dwTopicId)
        return FALSE;

    // Fast path: same id as the previous hit
    if (*lpmp->lpTopicIdLast == dwTopicId)
        return TRUE;

    // The cached position is already past the wanted id: rewind
    if (*lpmp->lpTopicIdLast > dwTopicId)
        lpmp->lpTopicIdLast = lpmp->rgTopicId;

    // Linear scan from the cached position up to and including pLast
    pScan = lpmp->lpTopicIdLast;
    while (pScan <= pLast)
    {
        if (*pScan == dwTopicId)
        {
            lpmp->lpTopicIdLast = pScan;
            return TRUE;
        }
        pScan++;
    }
    return FALSE;
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   int | CompareRecordBuffers |
 *      Ordering function used by PQueueUp/PQueueDown on two packed
 *      records. Keys are compared in this order: the word (2-byte
 *      length Pascal string), the word length (only if OCCF_LENGTH),
 *      the field id (only if OCCF_FIELDID), the first topic id, and
 *      finally occurrence data.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block (supplies occf and
 *      ucNumOccDataFields)
 *
 *  @parm   LPB | pBufferA |
 *      Start of the first record (points at its record-length DWORD)
 *
 *  @parm   LPB | pBufferB |
 *      Start of the second record (points at its record-length DWORD)
 *
 *  @rdesc
 *      If pBufferA < pBufferB  return < 0
 *      If pBufferA == pBufferB return = 0
 *      If pBufferA > pBufferB  return > 0
 *
 *  @comm
 *      NOTE(review): every numeric key is compared as
 *      (int)(dwA - dwB). The subtraction is done on DWORDs, so two
 *      values that are 2^31 or more apart yield the opposite sign.
 *      Left as is because the sort order of existing temp files may
 *      depend on it.
 *************************************************************************/
int PASCAL NEAR CompareRecordBuffers (_LPIPB lpipb, LPB pBufferA, LPB pBufferB)
{
    int     fOccFlags = lpipb->occf;
    int     cFields = lpipb->ucNumOccDataFields;
    int     nDiff;          // Result of the latest comparison
    int     cbWord;         // Bytes taken by the word and its length prefix
    DWORD   dwA;
    DWORD   dwB;
    DWORD   cPasses;        // Remaining passes of the occurrence loop

    // Step over the record-length DWORD that starts each record
    pBufferA += sizeof (DWORD);
    pBufferB += sizeof (DWORD);

    // Primary key: the word itself
    nDiff = StrCmp2BytePascal(pBufferA, pBufferB);
    if (nDiff != 0)
        return nDiff;

    // The words compared equal, so A's length is used to advance both
    cbWord = GETWORD ((LPUW)pBufferA) + sizeof (SHORT);
    pBufferA += cbWord;
    pBufferB += cbWord;

    // Optional key: word length
    if (fOccFlags & OCCF_LENGTH)
    {
        pBufferA += CbByteUnpack (&dwA, pBufferA);
        pBufferB += CbByteUnpack (&dwB, pBufferB);
        nDiff = (int)(dwA - dwB);
        if (nDiff != 0)
            return nDiff;
    }

    // Optional key: field id
    if (fOccFlags & OCCF_FIELDID)
    {
        pBufferA += CbByteUnpack (&dwA, pBufferA);
        pBufferB += CbByteUnpack (&dwB, pBufferB);
        nDiff = (int)(dwA - dwB);
        if (nDiff != 0)
            return nDiff;
    }

    // The topic count is decoded only to get past it
    pBufferA += CbByteUnpack (&dwA, pBufferA);
    pBufferB += CbByteUnpack (&dwB, pBufferB);

    // Id of the first topic
    pBufferA += CbByteUnpack (&dwA, pBufferA);
    pBufferB += CbByteUnpack (&dwB, pBufferB);
    nDiff = (int)(dwA - dwB);
    if (nDiff != 0)
        return nDiff;

    // Occurrence counts: loop over the smaller one. The difference is
    // kept in nDiff and becomes the result when the loop body never
    // performs a comparison.
    pBufferA += CbByteUnpack (&dwA, pBufferA);
    pBufferB += CbByteUnpack (&dwB, pBufferB);
    nDiff = (int)(dwA - dwB);
    cPasses = (nDiff < 0) ? dwA : dwB;

    while (cPasses != 0)
    {
        // NOTE(review): each pass decodes and compares exactly ONE packed
        // value from each record, whatever cNumOcc (1..5) is; it does not
        // walk all cNumOcc fields of an occurrence. Confirm this is the
        // intended depth of comparison.
        switch (cFields)
        {
            case 5:
            case 4:
            case 3:
            case 2:
            case 1:
                pBufferA += CbByteUnpack (&dwA, pBufferA);
                pBufferB += CbByteUnpack (&dwB, pBufferB);
                nDiff = (int)(dwA - dwB);
                if (nDiff != 0)
                    return nDiff;
                break;
        }
        cPasses--;
    }
    return nDiff;
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   VOID | PQueueUp |
 *      Sift-up step of the priority queue (a heap stored from slot 1,
 *      parent of slot i is slot i/2). The node at <p index> is moved
 *      toward the root for as long as its parent compares greater than
 *      it according to CompareRecordBuffers.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block, forwarded to the comparator
 *
 *  @parm   LPESB FAR * | lrgPriorityQueue |
 *      The queue array
 *
 *  @parm   LONG | index |
 *      Slot of the node that was just inserted
 *
 *  @comm   When the node has no parent (index/2 == 0) the function
 *      returns immediately, before the optional SetQueue debug hook.
 *
 *************************************************************************/
VOID PASCAL NEAR PQueueUp
    (_LPIPB lpipb, LPESB FAR *lrgPriorityQueue, LONG index)
{
    LPESB   pMoving = lrgPriorityQueue [index];     // Node being placed
    LPESB   pParent;                                // Its current parent
    WORD    iParent = (WORD) (index/2);             // Slot of that parent

    if (iParent == 0)
        return;

    while (iParent != 0)
    {
        pParent = lrgPriorityQueue [iParent];

        // Stop as soon as the parent is not greater than the moving node
        if (CompareRecordBuffers (lpipb,
            (LPB)pParent->lrgbMem + pParent->ibBuf,
            (LPB)pMoving->lrgbMem + pMoving->ibBuf) <= 0)
            break;

        // Pull the parent down into the hole and climb one level
        lrgPriorityQueue [index] = pParent;
        index = iParent;
        iParent = (WORD)(index/2);
    }
    lrgPriorityQueue[index] = pMoving;
#if BINHN
    SetQueue (&lpipb->esi);
#endif
}
/*************************************************************************
 *
 *  @doc    INTERNAL INDEXING
 *
 *  @func   VOID | PQueueDown |
 *      Sift-down step of the priority queue held in lpipb->esi. The
 *      node in slot 1 is pushed toward the leaves until it compares
 *      less than the smaller of its children (CompareRecordBuffers),
 *      which restores the "parent is less than its children" property
 *      after the top slot has been replaced.
 *
 *  @parm   _LPIPB | lpipb |
 *      Pointer to the index parameter block; the queue and its size
 *      are taken from lpipb->esi
 *
 *************************************************************************/
PRIVATE VOID PASCAL NEAR PQueueDown (_LPIPB lpipb)
{
    LPESI   lpesi = &lpipb->esi;
    LPESB   FAR *rgQueue = lpesi->lrgPriorityQueue;
    LPESB   pSinking = rgQueue[1];          // Node looking for its place
    LPESB   pLeft;
    LPESB   pRight;
    LPESB   pSmallest;
    int     iLast = lpesi->uiQueueSize;     // Highest occupied slot
    int     iLastParent = iLast / 2;        // Highest slot having a child
    int     iHole = 1;                      // Slot currently being filled
    int     iChild;

    while (iHole <= iLastParent)
    {
        iChild = iHole * 2;

        // With two children present, continue with the smaller one
        if (iChild < iLast)
        {
            pRight = rgQueue[iChild + 1];
            if (pRight != NULL)
            {
                pLeft = rgQueue[iChild];
                if (CompareRecordBuffers
                    (lpipb, (LPB)pLeft->lrgbMem + pLeft->ibBuf,
                    (LPB)pRight->lrgbMem + pRight->ibBuf) >= 0)
                    iChild++;
            }
        }

        if (iChild > iLast)
            break;

        // Heap property holds once the sinking node is below the child
        pSmallest = rgQueue [iChild];
        if (CompareRecordBuffers (lpipb,
            (LPB)pSinking->lrgbMem + pSinking->ibBuf,
            (LPB)pSmallest->lrgbMem + pSmallest->ibBuf) < 0)
            break;

        // Promote the child and descend
        rgQueue [iHole] = pSmallest;
        iHole = iChild;
    }
    rgQueue [iHole] = pSinking;
#if _BINHN
    SetQueue (lpesi);
#endif
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   PTOPICDATA | MergeTopicNode |
 *      Links a topic node into the header's topic list, which is kept
 *      in ascending dwTopicId order, or merges it into an existing node
 *      carrying the same topic id.
 *
 *  @parm   PMERGEHEADER | pHeader |
 *      Owner of the list (pTopic is the head, pLastTopic the tail)
 *
 *  @parm   PTOPICDATA | pNewTopic |
 *      Node to add
 *
 *  @parm   int | cNumOcc |
 *      Number of occurrence fields; when zero no occurrence lists are
 *      merged for a duplicate topic
 *
 *  @rdesc  NULL when pNewTopic has been linked into the list (the
 *      caller needs a fresh node next time); pNewTopic when its topic id
 *      was already present, in which case the node itself is free for
 *      reuse.
 *
 *  @comm   Topic ids are compared directly. The previous code looked at
 *      the sign of (idLast - idNew) stored in an int, which gives the
 *      wrong answer for ids 2^31 or more apart and disagreed with the
 *      direct comparisons used by the insertion search below.
 *************************************************************************/
PRIVATE PTOPICDATA PASCAL NEAR MergeTopicNode (PMERGEHEADER pHeader,
    PTOPICDATA pNewTopic, int cNumOcc)
{
    PTOPICDATA pTopic, pPrevTopic;

    if ((pTopic = pHeader->pLastTopic) == NULL)
    {
        // The list is empty
        pHeader->pTopic = pHeader->pLastTopic = pNewTopic;
        pNewTopic->pNext = NULL;
        return(NULL);
    }

    if (pTopic->dwTopicId < pNewTopic->dwTopicId)
    {
        // Usual case: ids arrive in order, so append after the tail
        pNewTopic->pNext = NULL;
        pHeader->pLastTopic->pNext = pNewTopic;
        pHeader->pLastTopic = pNewTopic;

        // The node is now owned by the list
        return NULL;
    }

    if (pTopic->dwTopicId == pNewTopic->dwTopicId)
    {
        // Same topic as the tail. Return pNewTopic for reuse
        if (cNumOcc)
            MergeOccurrence (pTopic, pNewTopic, cNumOcc);
        return(pNewTopic);
    }

    // The new id is below the tail's id: the input is out of order.
    // Look for the first node whose id is not below the new one. The
    // tail itself is never tested by the loop; it is already known to
    // be greater.
    pTopic = pHeader->pTopic;
    pPrevTopic = NULL;

    for (; pTopic->pNext; pTopic = pTopic->pNext)
    {
        if (pTopic->dwTopicId >= pNewTopic->dwTopicId)
        {
            /* We pass the inserted point */
            break;
        }
        pPrevTopic = pTopic;
    }

    if (pTopic->dwTopicId == pNewTopic->dwTopicId)
    {
        // Same topic. Return pNewTopic for reuse
        if (cNumOcc)
            MergeOccurrence (pTopic, pNewTopic, cNumOcc);
        return(pNewTopic);
    }

    if (pPrevTopic == NULL)
    {
        /* Insert at the beginning */
        pNewTopic->pNext = pHeader->pTopic;
        pHeader->pTopic = pNewTopic;
    }
    else
    {
        /* Insert in the middle, right in front of pTopic */
        pNewTopic->pNext = pPrevTopic->pNext;
        pPrevTopic->pNext = pNewTopic;
    }

    // Re-derive the tail by walking forward from the insertion point
    while (pTopic->pNext)
    {
        pTopic = pTopic->pNext;
    }
    pHeader->pLastTopic = pTopic;
    return(NULL);
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   void | MergeOccurrence |
 *      Joins the occurrence list of pNewTopic to the one of pOldTopic.
 *      Only whole-list concatenation is done:
 *        - if the last old occurrence is <= the first new occurrence,
 *          the new list is appended;
 *        - otherwise, if the first new occurrence is <= the first old
 *          occurrence, the new list is put in front.
 *      In both cases pOldTopic->dwOccCount grows by the new count.
 *
 *  @parm   PTOPICDATA | pOldTopic |
 *      Topic already in the list; receives the occurrences
 *
 *  @parm   PTOPICDATA | pNewTopic |
 *      Topic whose occurrence list is handed over
 *
 *  @parm   int | cOccNum |
 *      Number of fields per occurrence, passed to CompareOccurrence
 *
 *  @comm   When neither ordering holds nothing is linked; E_ASSERT is
 *      recorded in a local ERRB only, so the caller is not told.
 *************************************************************************/
PRIVATE VOID NEAR MergeOccurrence (PTOPICDATA pOldTopic,
    PTOPICDATA pNewTopic, int cOccNum)
{
    ERRB errScratch;

    if (CompareOccurrence (&pOldTopic->pLastOccData->OccData[0],
        &pNewTopic->pOccData->OccData[0], cOccNum) > 0)
    {
        // Unusual: the new list does not simply follow the old one
        if (CompareOccurrence (&pNewTopic->pOccData->OccData[0],
            &pOldTopic->pOccData->OccData[0], cOccNum) > 0)
        {
            // The two lists interleave; this is not handled
            SetErrCode (&errScratch, E_ASSERT);
            return;
        }

        // The new list goes in front of the old one
        pNewTopic->pLastOccData->pNext = pOldTopic->pOccData;
        pOldTopic->pOccData = pNewTopic->pOccData;
    }
    else
    {
        // Expected case: chain the new list after the old one
        pOldTopic->pLastOccData->pNext = pNewTopic->pOccData;
        pOldTopic->pLastOccData = pNewTopic->pLastOccData;
    }

    pOldTopic->dwOccCount += pNewTopic->dwOccCount;
}
/*====================================================================*/
#ifdef BINHN
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   VOID | SetQueue |
 *      Debug aid: for the first queue slots (at most 20, and fewer than
 *      pEsi->cesb) stores in pEsi->lpbQueueStr[] a pointer 6 bytes into
 *      the current record of each non-empty slot.
 *
 *  @parm   LPESI | pEsi |
 *      External sort info owning the queue and the lpbQueueStr array
 *
 *  @comm   The offset 6 presumably skips the 4-byte record length and
 *      the 2-byte word length so the pointer lands on the word's
 *      characters -- confirm against the record layout.
 *************************************************************************/
PRIVATE VOID PASCAL NEAR SetQueue (LPESI pEsi)
{
    LPESB   FAR *rgQueue = pEsi->lrgPriorityQueue;
    LPESB   pEntry;
    unsigned int iSlot;

    for (iSlot = 0; iSlot < 20; iSlot++)
    {
        if (iSlot >= pEsi->cesb)
            break;

        pEntry = rgQueue[iSlot];
        if (!pEntry)
            continue;       // Empty slot: leave its string pointer alone

        pEsi->lpbQueueStr[iSlot] = pEntry->lrgbMem + pEntry->ibBuf + 6;
    }
}
#endif
/************************************************************************
 *  @doc    PRIVATE
 *  @func   HRESULT PASCAL NEAR | ESBBlockAllocate |
 *      Creates the two block managers used while merging: one for
 *      TOPICDATA nodes (lpipb->TopicBlock) and one for occurrence nodes
 *      (lpipb->OccBlock), and primes their free lists.
 *
 *  @parm   _LPIPB | lpipb |
 *      Index parameter block receiving the block managers
 *
 *  @parm   DWORD | lMemSize |
 *      Memory allocated for the indexer
 *
 *  @rdesc  S_OK, or E_OUTOFMEMORY. On failure no topic or occurrence
 *      block manager is left allocated.
 *
 *  @comm   The topic pool is capped at 2/5 of lMemSize.
 *      NOTE(review): the occurrence pool cap is derived from the whole
 *      of lMemSize, not from the remaining 3/5 -- confirm that the two
 *      pools together are allowed to exceed lMemSize.
 *      NOTE(review): (MAX_BLOCK_SIZE * n) / n evaluates to
 *      MAX_BLOCK_SIZE; if the intent was to round the block size down
 *      to a multiple of the element size the operands are swapped.
 ************************************************************************/
PRIVATE HRESULT PASCAL NEAR ESBBlockAllocate (_LPIPB lpipb, DWORD lMemSize)
{
    DWORD dwTopicSize;
    WORD  OccNodeSize = sizeof (OCCDATA) - 1 + sizeof(DWORD) *
        lpipb->ucNumOccDataFields;  // About 24bytes

    // Round the occurrence node size up to a multiple of 4
    OccNodeSize = (OccNodeSize + 3) & ~3;

    // Share of the budget given to topic nodes
    dwTopicSize = (lMemSize * 2) / 5;

    // Allocate a block manager for topic node
    if ((lpipb->TopicBlock.pBlockMgr =
        BlockInitiate ((MAX_BLOCK_SIZE * sizeof(TOPICDATA)/sizeof(TOPICDATA)),
        sizeof(TOPICDATA),
        (WORD)(dwTopicSize/MAX_BLOCK_SIZE),
        USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
    {
        return SetErrCode (NULL, E_OUTOFMEMORY);
    }
    lpipb->TopicBlock.pFreeList =
        (PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr);

    // Allocate a block manager for occ node
    if ((lpipb->OccBlock.pBlockMgr =
        BlockInitiate((MAX_BLOCK_SIZE * OccNodeSize)/OccNodeSize,
        OccNodeSize, (WORD)(lMemSize / MAX_BLOCK_SIZE),
        USE_VIRTUAL_MEMORY | THREADED_ELEMENT)) == NULL)
    {
        // Give back the topic pool created above; it used to be left
        // allocated on this path
        BlockFree(lpipb->TopicBlock.pBlockMgr);
        lpipb->TopicBlock.pBlockMgr = NULL;
        lpipb->TopicBlock.pFreeList = NULL;

        // NOTE(review): kept from the original code, which released the
        // B-tree node pool here even though this function does not
        // create it -- confirm whether that is intended.
        BlockFree(lpipb->BTNodeBlock.pBlockMgr);
        lpipb->BTNodeBlock.pBlockMgr = NULL;

        return SetErrCode (NULL, E_OUTOFMEMORY);
    }
    lpipb->OccBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr);
    return (S_OK);
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   LPV | GetBlockNode |
 *      Takes one node off the free list of a block combo. When the
 *      list is empty the block manager is asked to grow and its linked
 *      list becomes the new free list.
 *
 *  @parm   PBLKCOMBO | pBlockCombo |
 *      Block manager, free list and running node count
 *
 *  @rdesc  The node, or NULL if BlockGrowth did not return S_OK.
 *      dwCount is incremented for every node handed out. The node's
 *      pNext link is returned as is (not cleared).
 *************************************************************************/
PRIVATE LPV PASCAL NEAR GetBlockNode (PBLKCOMBO pBlockCombo)
{
    PLIST pNode = pBlockCombo->pFreeList;

    if (pNode == NULL)
    {
        // Out of nodes: extend the pool and restart from its list
        if (BlockGrowth (pBlockCombo->pBlockMgr) != S_OK)
            return (NULL);
        pNode = (PLIST)BlockGetLinkedList(pBlockCombo->pBlockMgr);
    }

    // Unlink the head of the free list and account for it
    pBlockCombo->pFreeList = pNode->pNext;
    pBlockCombo->dwCount ++;
    return (pNode);
}
/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   HRESULT FAR PASCAL | BuildIndexFile |
 *      This function is for debugging purposes only. In normal indexing,
 *      it will never be called. Since collecting words and indexing can
 *      take a long time, debugging the index phase can take several
 *      hours per shot. To shorten that, the intermediate files are
 *      saved, which are:
 *          - the internal sorted result file, which contains all words
 *            and their occurrences sorted
 *          - the external sorted result file, which is a snapshot of the
 *            IPB structure followed by its ESB blocks
 *      The only steps left are then processing the occurrence list and
 *      building the permanent index, which is what this function does.
 *
 *      To use the function, add the following lines in the app:
 *
 *      extern HRESULT PASCAL FAR BuildIndexFile (HFPB, LPSTR, LPB, LPB,
 *          PINDEXINFO);
 *
 *      int fDotest;
 *
 *      if (fDotest) {
 *          return BuildIndexFile (NULL, (LPSTR)"c:/tmp/test.mvb!MVINDEX",
 *              (LPB)"c:/tmp/esi.tmp", (LPB)"c:/tmp/iso.tmp", pIndexInfo);
 *      }
 *
 *  @parm   HFPB | hfpb |
 *      HFPB for index file if pstrIndexFile is NULL
 *
 *  @parm   LPSTR | pstrIndexFile |
 *      The .MVB + index file, usually with the format TEST.MVB!MVINDEX
 *
 *  @parm   LPB | lpbEsiFile |
 *      The external sort info file
 *
 *  @parm   LPB | lpbIsiFile |
 *      The internal sorted info filename
 *
 *  @parm   PINDEXINFO | pIndexInfo |
 *      IndexInfo, handed to MVIndexInitiate
 *
 *  @rdesc  S_OK if succeeded, else other non-zero error codes
 *
 *  @comm   Every exit taken after MVIndexInitiate succeeded goes through
 *      the cleanup labels at the bottom, so the parameter block is
 *      always disposed and the sigma/log tables are released once they
 *      have both been allocated.
 *************************************************************************/
PUBLIC HRESULT PASCAL EXPORT_API FAR BuildIndexFile
    (HFPB hfpb, LPSTR pstrIndexFile,
    LPB lpbEsiFile, LPB lpbIsiFile, PINDEXINFO pIndexInfo)
{
    _LPIPB  lpipb;
    LPESI   lpesi;
    HRESULT fRet;
    ERRB    errb;
    DWORD   loop;
    FLOAT   rLog;
    BYTE    bKeyIndex = 0;

    if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
        return E_FAIL;

    lpesi = &lpipb->esi;

    if (LoadEsiTemp (lpipb, lpesi, lpbEsiFile, lpbIsiFile, NULL) != S_OK)
    {
        fRet = E_FAIL;
        goto exit0;
    }

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // Allocate a huge buffer to contain all the sigma terms
        if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
        {
            // Nothing but lpipb to release at this point
            SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
            goto exit0;
        }
        lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

        // Small buffer containing pre-calculated values
        if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
        {
            SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
            FreeHandle (lpipb->wi.hSigma);
            goto exit0;
        }
        lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);

        // Initialize the array. Entry 0 is not written here and keeps
        // whatever the DLLGMEM_ZEROINIT allocation put there.
        for (loop = cLOG_MAX - 1; loop > 0; --loop)
        {
#ifndef ISBU_IR_CHANGE
            rLog = (float) log10(cHundredMillion/(double)loop);
#else
            rLog = (float)1.0 / (float)loop;
#endif // ISBU_IR_CHANGE
            lpipb->wi.lrgrLog[loop] = rLog * rLog;
        }
    }

    if ((fRet = MergeSortTreeFile (lpipb, NULL)) != S_OK)
    {
        SetErrCode (&errb, fRet);
        goto exit1;
    }

    if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
        FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE);

    // If we are doing term-weighting we have to square root all sigma values
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // ISBU_IR_CHANGE not necessary 'cos sqrt computation is necessary in both cases
        for (loop = 0; loop < lpipb->dwMaxTopicId + 1; ++loop)
            lpipb->wi.hrgsigma[loop] =
                (float)sqrt ((double)lpipb->wi.hrgsigma[loop]);
    }

    // Analyze data to get the best compression scheme
    // TopicId
    VGetBestScheme(&lpipb->cKey[CKEY_TOPIC_ID],
        &lpipb->BitCount[CKEY_TOPIC_ID][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // Occurrence Count
    VGetBestScheme(&lpipb->cKey[CKEY_OCC_COUNT],
        &lpipb->BitCount[CKEY_OCC_COUNT][0], lcbitBITSTREAM_ILLEGAL, TRUE);

    // NOTE(review): bKeyIndex starts at 0, so the per-field schemes below
    // use cKey/BitCount slots 0, 1, ... -- confirm these do not collide
    // with CKEY_TOPIC_ID / CKEY_OCC_COUNT.
    if (lpipb->occf & OCCF_COUNT)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    if (lpipb->occf & OCCF_OFFSET)
    {
        VGetBestScheme(&lpipb->cKey[bKeyIndex],
            &lpipb->BitCount[bKeyIndex][0], lcbitBITSTREAM_ILLEGAL, TRUE);
        bKeyIndex++;
    }

    // Tell the user callback that phase 2 is complete
    if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
    {
        PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
        CALLBACKINFO Info;

        Info.dwPhase = 2;
        Info.dwIndex = 100;
        fRet = (*pCallbackInfo->MessageFunc)
            (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
        if (S_OK != fRet)
            goto exit1;
    }

    // Build the permanent index
    fRet = BuildBTree(NULL, lpipb, lpipb->esi.aszTempName, hfpb, pstrIndexFile);

exit1:
    // Reached only when both tables exist (or normalization is off)
    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        FreeHandle (lpipb->wi.hLog);
        FreeHandle (lpipb->wi.hSigma);
    }

exit0:
    MVIndexDispose (lpipb);
    return fRet;
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   VOID | SaveEsiTemp |
 *      Debug helper: writes a snapshot of the index parameter block
 *      followed by every ESB of the external-sort list to a new
 *      temporary file, then records that file's name in
 *      lpipb->szEsiTemp.
 *
 *  @parm   _LPIPB | lpipb |
 *      Index parameter block to snapshot; szEsiTemp is updated on
 *      success only
 *
 *  @parm   LPESI | lpesi |
 *      External sort info whose lpesbRoot list is written out
 *
 *  @comm   Best effort: nothing is reported to the caller. If any write
 *      is short (the IPB image included) the file is closed and deleted
 *      and szEsiTemp is left untouched.
 *      NOTE(review): szEsi is 100 bytes; check that this is large enough
 *      for whatever GETTEMPFILENAME expands to on the target platform.
 *************************************************************************/
PRIVATE VOID PASCAL NEAR SaveEsiTemp (_LPIPB lpipb, LPESI lpesi)
{
    GHANDLE hfpb;
    LPESB   lpesb;
    char    szEsi[100];

    GETTEMPFILENAME ((char)0, "foo", 0, szEsi);

    if ((hfpb = FileOpen(NULL, szEsi, REGULAR_FILE, READ_WRITE, NULL)) == NULL)
        return;

    // The IPB image comes first; a partial image makes the file useless
    if (FileWrite(hfpb, lpipb, sizeof(IPB), NULL) != sizeof(IPB))
        goto failed;

    // Then one image per ESB, in list order
    for (lpesb = lpesi->lpesbRoot; lpesb; lpesb = lpesb->lpesbNext)
    {
        if (FileWrite(hfpb, lpesb, sizeof(ESB), NULL) != sizeof(ESB))
            goto failed;
    }

    FileClose (hfpb);
    MEMCPY (lpipb->szEsiTemp, szEsi, sizeof(szEsi));
    return;

failed:
    FileClose (hfpb);
    FileUnlink (NULL, szEsi, REGULAR_FILE);
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   VOID | UpdateEsiTemp |
 *      Debug helper: opens the file named by lpipb->szEsiTemp and writes
 *      the current index parameter block image to it.
 *
 *  @parm   _LPIPB | lpipb |
 *      Index parameter block to write; also supplies the file name
 *
 *  @comm   Best effort: if the file cannot be opened nothing happens,
 *      and the result of the write is not examined.
 *************************************************************************/
PRIVATE VOID PASCAL NEAR UpdateEsiTemp (_LPIPB lpipb)
{
    GHANDLE hfTemp = FileOpen(NULL, lpipb->szEsiTemp, REGULAR_FILE,
        READ_WRITE, NULL);

    if (hfTemp != NULL)
    {
        FileWrite(hfTemp, lpipb, sizeof(IPB), NULL);
        FileClose (hfTemp);
    }
}
/*************************************************************************
 *  @doc    PRIVATE
 *  @func   BOOL | LoadEsiTemp |
 *      Debug helper, counterpart of SaveEsiTemp: reads back a saved IPB
 *      image followed by ESB images, copies the statistics fields of the
 *      saved IPB into lpipb, and rebuilds the ESB list of lpesi.
 *
 *  @parm   _LPIPB | lpipb |
 *      Index parameter block to fill in
 *
 *  @parm   LPESI | lpesi |
 *      External sort info; every ESB read is pushed at the head of
 *      lpesbRoot and counted in cesb
 *
 *  @parm   LPB | lpbEsiFile |
 *      Name of the saved external sort info file
 *
 *  @parm   LPB | lpbIsiFile |
 *      Name of the internal sort file, copied into lpipb->isi.aszTempName
 *
 *  @parm   PHRESULT | phr |
 *      Error buffer passed to SetErrCode (callers in this file pass NULL)
 *
 *  @rdesc  S_OK on success; E_NOTEXIST if the file cannot be opened or
 *      does not hold a complete IPB image; E_OUTOFMEMORY if an ESB
 *      cannot be allocated. (The declared type is BOOL, but the values
 *      are HRESULT codes and callers compare against S_OK.)
 *
 *  @comm   The file is closed exactly once on every path that opened it.
 *      ESBs linked before an allocation failure stay in the list.
 *************************************************************************/
PRIVATE BOOL PASCAL LoadEsiTemp (_LPIPB lpipb, LPESI lpesi, LPB lpbEsiFile,
    LPB lpbIsiFile, PHRESULT phr)
{
    LPESB   lpesb;
    HFILE   hFile;
    ESB     esb;
    HANDLE  hesb;
    HRESULT fRet = S_OK;
    IPB     ipb;
    LPISI   pIsi = &lpipb->isi;         // Pointer to internal sort info

    /* Copy the internal sort info filename, terminator included, so the
     * result is a proper string whatever the buffer held before
     */
    MEMCPY (pIsi->aszTempName, lpbIsiFile, lstrlen(lpbIsiFile) + 1);

    /* Read in the external sort buffer info */
    if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
        return E_NOTEXIST;

    /* Read old IPB info. A short read means there is no usable snapshot;
     * it is reported the same way as a missing file so that no
     * uninitialized data is copied into lpipb
     */
    if (_lread (hFile, &ipb, sizeof(IPB)) != sizeof(IPB))
    {
        fRet = E_NOTEXIST;
        goto exit0;
    }

    /* Transfer meaningful data */
    lpipb->dwIndexedWord = ipb.dwIndexedWord;
    lpipb->dwUniqueWord = ipb.dwUniqueWord;
    lpipb->dwByteCount = ipb.dwByteCount;
    lpipb->dwOccOffbits = ipb.dwOccOffbits;
    lpipb->dwOccExtbits = ipb.dwOccExtbits;
    lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
    lpipb->dwMaxWCount = ipb.dwMaxWCount;
    lpipb->dwMaxOffset = ipb.dwMaxOffset;
    lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
    lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
    lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
    lpipb->lcTopics = ipb.lcTopics;
    lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
    // lpipb->dwMemAllowed = ipb.dwMemAllowed;
    lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
    lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
    lpipb->dwMaxWLen = ipb.dwMaxWLen;
    lpipb->idxf = ipb.idxf;

    /* One ESB per complete image; a partial trailing image ends the loop */
    while ((_lread (hFile, &esb, sizeof(ESB))) == sizeof(ESB))
    {
        if ((hesb = _GLOBALALLOC(GMEM_MOVEABLE | GMEM_ZEROINIT,
            sizeof(ESB))) == NULL)
        {
            fRet = SetErrCode (phr,E_OUTOFMEMORY);
            break;
        }
        lpesb = (LPESB)_GLOBALLOCK (hesb);

        /* Copy the ESB information */
        *lpesb = esb;

        /* Update the structure: the saved handle and link are stale */
        lpesb->hStruct = hesb;
        lpesb->lpesbNext = lpesi->lpesbRoot;
        lpesi->lpesbRoot= lpesb;
        lpesi->cesb ++;
    }

exit0:
    _lclose (hFile);
    return fRet;
}
/*************************************************************************
 *  @doc    INTERNAL INDEXING
 *  @func   HRESULT | AllocSigmaTable |
 *      Allocates the two tables used for term weighting and stores them
 *      in lpipb->wi: the sigma table, with dwMaxTopicId + 1 zeroed
 *      entries, and the log table of cLOG_MAX pre-computed values.
 *
 *  @parm   _LPIPB | lpipb |
 *      Index parameter block; dwMaxTopicId is read, wi.hSigma,
 *      wi.hrgsigma, wi.hLog and wi.lrgrLog are written
 *
 *  @rdesc  S_OK, or E_OUTOFMEMORY. If the log table cannot be allocated
 *      the sigma table is released and wi.hSigma / wi.hrgsigma are
 *      reset to NULL so that no freed handle is left behind in lpipb.
 *************************************************************************/
HRESULT FAR PASCAL AllocSigmaTable (_LPIPB lpipb)
{
    ERRB  errb;
    DWORD loop;
    float rLog;

    // One sigma term per topic id, zero-initialized
    if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
        return SetErrCode (&errb, E_OUTOFMEMORY);
    lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

    // Small buffer containing pre-calculated values
    if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
        (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
    {
        FreeHandle (lpipb->wi.hSigma);
        // Do not leave a released handle / stale pointer in the block
        lpipb->wi.hSigma = NULL;
        lpipb->wi.hrgsigma = NULL;
        return SetErrCode (&errb, E_OUTOFMEMORY);
    }
    lpipb->wi.lrgrLog = (FLOAT FAR *)_GLOBALLOCK (lpipb->wi.hLog);

    // Initialize the array. Entry 0 is not written here and keeps
    // whatever the DLLGMEM_ZEROINIT allocation put there.
    for (loop = cLOG_MAX - 1; loop > 0; --loop)
    {
#ifndef ISBU_IR_CHANGE
        rLog = (float) log10(cHundredMillion/(double)loop);
#else
        rLog = (float)1.0 / (float)loop;
#endif // ISBU_IR_CHANGE
        lpipb->wi.lrgrLog[loop] = rLog * rLog;
    }
    return(S_OK);
}