1778 lines
58 KiB
C
1778 lines
58 KiB
C
|
/*************************************************************************
|
||
|
* *
|
||
|
* PERMIND2.C *
|
||
|
* *
|
||
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
||
|
* All Rights reserved. *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Module Intent *
|
||
|
* This is the final stage of the index building process. This module *
|
||
|
* converts the input data into a permanent B-Tree file. *
|
||
|
* *
|
||
|
* Stem node structure: *
|
||
|
* CbLeft |* Word | PointerToNode *| Slack *
|
||
|
* *
|
||
|
* Leaf node structure: *
|
||
|
* NxtBlkPtr|CbLeft|*Word|FieldId|TopicCnt|PointerToNode|DataSize*|Slack *
|
||
|
* *
|
||
|
* Data node structure: *
|
||
|
* |* Topic | OccBlkCnt |* OccBlk *| *| Slack *
|
||
|
* *
|
||
|
* Fields between |* *| repeat based on count values *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Current Owner: BinhN *
|
||
|
* *
|
||
|
**************************************************************************/
|
||
|
|
||
|
#include <mvopsys.h>
|
||
|
#include <mem.h>
|
||
|
#include <memory.h>
|
||
|
#include <math.h>
|
||
|
#include <orkin.h>
|
||
|
#include <mvsearch.h>
|
||
|
#include "common.h"
|
||
|
#include "index.h"
|
||
|
|
||
|
#ifdef _DEBUG
|
||
|
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* PRIVATE PUBLIC FUNCTIONS
|
||
|
*
|
||
|
* All of them should be declared far, unless we know they belong to
|
||
|
* the same segment. They should be included in some include file
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC HRESULT FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR);
|
||
|
PUBLIC PNODEINFO FAR PASCAL AllocBTreeNode (_LPIPB);
|
||
|
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode);
|
||
|
PUBLIC int FAR PASCAL PrefixCompressWord (LPB, LPB, LPB, int);
|
||
|
PUBLIC HRESULT FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE);
|
||
|
PUBLIC DWORD FAR PASCAL WriteDataNode (_LPIPB, DWORD, PHRESULT);
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* PRIVATE PRIVATE FUNCTIONS
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE HRESULT NEAR PASCAL AddRecordToLeaf (_LPIPB);
|
||
|
PRIVATE HRESULT NEAR PASCAL AddRecordToStem (_LPIPB, LPB);
|
||
|
PRIVATE int NEAR PASCAL CompressDword (PFILEDATA, DWORD);
|
||
|
PRIVATE HRESULT NEAR PASCAL WriteStemNode (_LPIPB, PNODEINFO);
|
||
|
PRIVATE HRESULT NEAR PASCAL WriteLeafNode (_LPIPB);
|
||
|
PRIVATE HRESULT NEAR PASCAL FlushAllNodes (_LPIPB);
|
||
|
// Compression functions
|
||
|
// PRIVATE HRESULT NEAR PASCAL FAddDword (PFILEDATA, DWORD, CKEY);
|
||
|
PRIVATE HRESULT NEAR PASCAL FWriteBool(PFILEDATA, BOOL);
|
||
|
|
||
|
|
||
|
// This table is used to avoid the calculation "(1L << v) - 1". Instead
|
||
|
// you say "argdwBits[v]", which should be faster. The table is useful
|
||
|
// other places, too.
|
||
|
DWORD argdwBits[] =
|
||
|
{
|
||
|
0x00000000, 0x00000001, 0x00000003, 0x00000007,
|
||
|
0x0000000F, 0x0000001F, 0x0000003F, 0x0000007F,
|
||
|
0x000000FF, 0x000001FF, 0x000003FF, 0x000007FF,
|
||
|
0x00000FFF, 0x00001FFF, 0x00003FFF, 0x00007FFF,
|
||
|
0x0000FFFF, 0x0001FFFF, 0x0003FFFF, 0x0007FFFF,
|
||
|
0x000FFFFF, 0x001FFFFF, 0x003FFFFF, 0x007FFFFF,
|
||
|
0x00FFFFFF, 0x01FFFFFF, 0x03FFFFFF, 0x07FFFFFF,
|
||
|
0x0FFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF,
|
||
|
0xFFFFFFFF,
|
||
|
};
|
||
|
|
||
|
PRIVATE HRESULT PASCAL NEAR WriteBitStreamDWord (PFILEDATA, DWORD, int);
|
||
|
PRIVATE HRESULT PASCAL NEAR WriteFixedDWord (PFILEDATA, DWORD, int);
|
||
|
PRIVATE HRESULT PASCAL NEAR WriteBellDWord (PFILEDATA, DWORD, int);
|
||
|
|
||
|
// Dispatch table of DWORD encoders.  The FAddDword macro indexes it with the
// cschScheme field of a compression key and passes the key's ucCenter as the
// third argument of the selected encoder.
// NOTE(review): the last slot is NULL and FAddDword calls through the table
// without a check, so a key selecting that slot would call a null pointer.
// Presumably that scheme value never reaches FAddDword -- confirm.
FENCODE EncodeTable[] =
|
||
|
{
|
||
|
WriteBitStreamDWord,
|
||
|
WriteFixedDWord,
|
||
|
WriteBellDWord,
|
||
|
NULL,
|
||
|
};
|
||
|
|
||
|
|
||
|
// FAddDword(p, dw, key): encode the DWORD dw into file data p using the
// encoder that EncodeTable holds for key.cschScheme, passing key.ucCenter
// as the encoder's third argument.  Evaluates to the encoder's HRESULT.
// Note that "key" is expanded twice, so it must not have side effects.
#define FAddDword(p,dw,key) (EncodeTable[(key).cschScheme]((p), (dw), (key).ucCenter))
|
||
|
// Extra bytes allocated past the nominal size of the input and output
// buffers in BuildBTree; also used there as the "refill the input buffer"
// threshold.
#define SAFE_SLACK 256
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func HRESULT | BuildBTree |
|
||
|
* Allocates required memory and opens input files to create a B-Tree.
|
||
|
* Parses incoming words and calls AddRecordToLeaf to process them.
|
||
|
*
|
||
|
* @parm HFPB | hfpbSysFile |
|
||
|
* If not NULL, handle to an already opened sysfile
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to the index parameter block
|
||
|
*
|
||
|
* @parm LPB | lpszTemp |
|
||
|
* Filename of the temporary input file
|
||
|
*
|
||
|
* @parm LPB | lpszPerm |
|
||
|
* Filename of the permanent B-Tree file
|
||
|
*
|
||
|
* @rdesc Returns S_OK on success or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
HRESULT FAR PASCAL BuildBTree (HFPB hfpbFileSys, _LPIPB lpipb,
|
||
|
LPB lpszTemp, HFPB hfpbPerm, LPSTR lszFilename/*IStream *pistmPerm*/)
|
||
|
{
|
||
|
PFILEDATA pOutFile; // Pointer to output data
|
||
|
PFILEDATA pInFile; // Pointer to input data
|
||
|
DWORD dwBytesRead = 0; // Checks for EOF
|
||
|
DWORD dwLeftover; // Used to adjust input buffer
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData; // Structure defining BTree
|
||
|
PIH20 pHeader = &pTreeData->Header; // Replacement variable
|
||
|
HRESULT fRet; // Return value
|
||
|
PNODEINFO pNode; // Pointer to current input node
|
||
|
ERRB errb= S_OK;
|
||
|
PHRESULT phr = &errb;
|
||
|
int iIndex; // Index into the compressed key
|
||
|
DWORD dwUniqueTerm = 0; // Callback variable
|
||
|
BOOL fOpenedFile; // TRUE if we have to close the file
|
||
|
|
||
|
// Open input file
|
||
|
pInFile = &lpipb->InFile;
|
||
|
if ((pInFile->fFile = FileOpen (NULL, lpszTemp,
|
||
|
REGULAR_FILE, READ, phr)) == NULL)
|
||
|
return *phr;
|
||
|
|
||
|
// Allocate input buffer
|
||
|
pInFile->dwMax = FILE_BUFFER;
|
||
|
if ((pInFile->hMem =
|
||
|
_GLOBALALLOC (DLLGMEM_ZEROINIT, pInFile->dwMax + SAFE_SLACK)) == NULL)
|
||
|
{
|
||
|
fRet = E_OUTOFMEMORY;
|
||
|
exit0:
|
||
|
FileClose (pInFile->fFile);
|
||
|
if ((lpipb->idxf & KEEP_TEMP_FILE) == 0)
|
||
|
FileUnlink (NULL, lpszTemp, REGULAR_FILE);
|
||
|
return fRet;
|
||
|
}
|
||
|
pInFile->pMem = _GLOBALLOCK (pInFile->hMem);
|
||
|
pInFile->pCurrent = pInFile->pMem;
|
||
|
|
||
|
pOutFile = &lpipb->OutFile;
|
||
|
|
||
|
/* Open subfile if necessary, (and system file if necessary) */
|
||
|
pOutFile->fFile = hfpbPerm;
|
||
|
if ((fOpenedFile = FsTypeFromHfpb(hfpbPerm) != FS_SUBFILE) &&
|
||
|
(pOutFile->fFile = (HANDLE)FileOpen
|
||
|
(hfpbPerm, lszFilename, hfpbPerm ? FS_SUBFILE : REGULAR_FILE,
|
||
|
READ, phr)) == 0)
|
||
|
{
|
||
|
SetErrCode (&fRet, E_FILENOTFOUND);
|
||
|
exit1:
|
||
|
FreeHandle (pInFile->hMem);
|
||
|
goto exit0;
|
||
|
}
|
||
|
|
||
|
// Allocate output buffer, at least enough for one block
|
||
|
pOutFile->dwMax = FILE_BUFFER;
|
||
|
if (pOutFile->dwMax < (LONG)lpipb->BTreeData.Header.dwBlockSize)
|
||
|
pOutFile->dwMax = lpipb->BTreeData.Header.dwBlockSize;
|
||
|
if ((pOutFile->hMem = _GLOBALALLOC (DLLGMEM_ZEROINIT,
|
||
|
pOutFile->dwMax + SAFE_SLACK)) == NULL)
|
||
|
{
|
||
|
fRet = E_OUTOFMEMORY;
|
||
|
exit2:
|
||
|
if (fOpenedFile)
|
||
|
FileClose (hfpbPerm);
|
||
|
goto exit1;
|
||
|
}
|
||
|
pOutFile->pMem = _GLOBALLOCK (pOutFile->hMem);
|
||
|
|
||
|
// Skip 1K to hold header infomation
|
||
|
pOutFile->pCurrent = pOutFile->pMem + FILE_HEADER;
|
||
|
pOutFile->cbLeft = pOutFile->dwMax - FILE_HEADER;
|
||
|
pOutFile->foPhysicalOffset.dwOffset = FILE_HEADER;
|
||
|
pOutFile->ibit = cbitBYTE - 1;
|
||
|
|
||
|
// Allocate first leaf node
|
||
|
if ((pTreeData->rgpNodeInfo[0] = AllocBTreeNode (lpipb)) == NULL)
|
||
|
{
|
||
|
fRet = E_OUTOFMEMORY;
|
||
|
exit3:
|
||
|
FreeHandle (pOutFile->hMem);
|
||
|
goto exit2;
|
||
|
}
|
||
|
pHeader->nidLast = 1;
|
||
|
pHeader->cIdxLevels = 1;
|
||
|
|
||
|
// pNode points to the leaf node structure
|
||
|
pNode = pTreeData->rgpNodeInfo[0];
|
||
|
pNode->Slack = LEAF_SLACK;
|
||
|
|
||
|
// Set the bytes left in node block
|
||
|
pNode->cbLeft = lpipb->BTreeData.Header.dwBlockSize - FOFFSET_SIZE -
|
||
|
sizeof(WORD);
|
||
|
|
||
|
// Set the word length flag
|
||
|
if (lpipb->occf & OCCF_LENGTH)
|
||
|
pTreeData->fOccfLength = 1;
|
||
|
|
||
|
#if 0
|
||
|
// Save some math time if we're doing term-weighting
|
||
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
||
|
{
|
||
|
MEMSET (pTreeData->argbLog, (BYTE)0, cLOG_MAX * sizeof (BYTE));
|
||
|
if ((hLog = _GLOBALALLOC (GMEM_MOVEABLE,
|
||
|
(CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
|
||
|
{
|
||
|
fRet = E_OUTOFMEMORY;
|
||
|
goto exit3;
|
||
|
}
|
||
|
pTreeData->lrgrLog = (float FAR *)_GLOBALLOCK (hLog);
|
||
|
}
|
||
|
else
|
||
|
hLog = NULL;
|
||
|
#endif
|
||
|
|
||
|
// Load the input buffer & repeat until all records are processed
|
||
|
pInFile->dwMax = pInFile->cbLeft =
|
||
|
FileRead (pInFile->fFile, pInFile->pMem, pInFile->dwMax, phr);
|
||
|
do
|
||
|
{
|
||
|
// Call the user callback every once in a while
|
||
|
if (!(++dwUniqueTerm % 8192L)
|
||
|
&& (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS))
|
||
|
{
|
||
|
PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
|
||
|
CALLBACKINFO Info;
|
||
|
|
||
|
Info.dwPhase = 3;
|
||
|
Info.dwIndex = (DWORD)((float)dwUniqueTerm / lpipb->dwUniqueWord * 100);
|
||
|
fRet = (*pCallbackInfo->MessageFunc)
|
||
|
(ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
|
||
|
if (S_OK != fRet)
|
||
|
goto exit4;
|
||
|
}
|
||
|
|
||
|
if ((fRet = AddRecordToLeaf (lpipb)) != S_OK)
|
||
|
goto exit4;
|
||
|
|
||
|
// pInFile->pCurrent points to the record size
|
||
|
// 256 is just an arbitrary number of slack to minimize out of data
|
||
|
|
||
|
// kevynct: pCurrent points to a record length which does not include
|
||
|
// the DWORD record len size, so we add this when checking. Actually, we
|
||
|
// add twice that to be safe.
|
||
|
if (pInFile->cbLeft <= SAFE_SLACK ||
|
||
|
(LONG)(GETLONG ((LPUL)(pInFile->pCurrent)) + 2 * sizeof(DWORD)) >= pInFile->cbLeft)
|
||
|
{
|
||
|
MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
|
||
|
if ((pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem +
|
||
|
pInFile->cbLeft, pInFile->dwMax - pInFile->cbLeft, phr)) < 0)
|
||
|
{
|
||
|
fRet = *phr;
|
||
|
exit4:
|
||
|
// Free log block used for term-weighting
|
||
|
#if 0
|
||
|
FreeHandle (hLog);
|
||
|
#endif
|
||
|
|
||
|
// Free all node blocks
|
||
|
dwLeftover = 0;
|
||
|
while (pTreeData->rgpNodeInfo[dwLeftover] != NULL)
|
||
|
{
|
||
|
FreeBTreeNode(pTreeData->rgpNodeInfo[dwLeftover++]);
|
||
|
}
|
||
|
goto exit3;
|
||
|
}
|
||
|
|
||
|
pInFile->dwMax = pInFile->cbLeft;
|
||
|
pInFile->pCurrent = pInFile->pMem;
|
||
|
}
|
||
|
} while (fRet == S_OK && pInFile->cbLeft);
|
||
|
|
||
|
// Flush anything left in the output buffer
|
||
|
if ((fRet = FlushAllNodes (lpipb)) != S_OK)
|
||
|
goto exit4;
|
||
|
|
||
|
|
||
|
// Write out the sigma table
|
||
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
||
|
{
|
||
|
pHeader->WeightTabOffset = pOutFile->foPhysicalOffset;
|
||
|
pHeader->WeightTabSize = (LCB)((lpipb->dwMaxTopicId + 1) *
|
||
|
sizeof (SIGMA));
|
||
|
if (FileWrite (pOutFile->fFile, lpipb->wi.hrgsigma,
|
||
|
pHeader->WeightTabSize, phr) != (LONG)pHeader->WeightTabSize)
|
||
|
{
|
||
|
fRet = *phr;
|
||
|
goto exit4;
|
||
|
}
|
||
|
pOutFile->foStartOffset = FoAddDw(pOutFile->foStartOffset,
|
||
|
pHeader->WeightTabSize);
|
||
|
}
|
||
|
|
||
|
// Copy info to header
|
||
|
pHeader->FileStamp = INDEX_STAMP;
|
||
|
pHeader->version = VERCURRENT;
|
||
|
pHeader->occf = lpipb->occf;
|
||
|
pHeader->idxf = lpipb->idxf;
|
||
|
pHeader->lcTopics = lpipb->lcTopics;
|
||
|
pHeader->dwMaxTopicId = lpipb->dwMaxTopicId;
|
||
|
pHeader->dwMaxFieldId = lpipb->dwMaxFieldId;
|
||
|
|
||
|
pHeader->dwMaxWCount = lpipb->dwMaxWCount;
|
||
|
pHeader->dwMaxOffset = lpipb->dwMaxOffset;
|
||
|
pHeader->dwMaxWLen = lpipb->dwMaxWLen;
|
||
|
pHeader->dwTotalWords = lpipb->dwIndexedWord; // Total indexed words
|
||
|
pHeader->dwUniqueWords = lpipb->dwUniqueWord; // Total unique words
|
||
|
pHeader->dwTotal2bWordLen = lpipb->dwTotal2bWordLen;
|
||
|
pHeader->dwTotal3bWordLen = lpipb->dwTotal3bWordLen;
|
||
|
pHeader->dwUniqueWordLen = lpipb->dwTotalUniqueWordLen;
|
||
|
|
||
|
pHeader->ckeyTopicId = lpipb->cKey[CKEY_TOPIC_ID];
|
||
|
pHeader->ckeyOccCount = lpipb->cKey[CKEY_OCC_COUNT];
|
||
|
iIndex = CKEY_OCC_BASE;
|
||
|
if (pHeader->occf & OCCF_COUNT)
|
||
|
pHeader->ckeyWordCount = lpipb->cKey[iIndex++];
|
||
|
if (pHeader->occf & OCCF_OFFSET)
|
||
|
pHeader->ckeyOffset = lpipb->cKey[iIndex];
|
||
|
|
||
|
if (FileSeekWrite (pOutFile->fFile, (LPB)pHeader, MakeFo (0, 0),
|
||
|
sizeof (IH20), phr) != sizeof (IH20))
|
||
|
{
|
||
|
fRet = *phr;
|
||
|
goto exit4;
|
||
|
}
|
||
|
|
||
|
// Call the user callback every once in a while
|
||
|
if (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)
|
||
|
{
|
||
|
PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo;
|
||
|
CALLBACKINFO Info;
|
||
|
|
||
|
Info.dwPhase = 3;
|
||
|
Info.dwIndex = 100;
|
||
|
fRet = (*pCallbackInfo->MessageFunc)
|
||
|
(ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info);
|
||
|
if (S_OK != fRet)
|
||
|
goto exit4;
|
||
|
}
|
||
|
fRet = S_OK;
|
||
|
goto exit4;
|
||
|
} /* BuildBTree */
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func HRESULT | AddRecordToLeaf |
|
||
|
* Add the record pointed to by pDtreeData->OutFile->pCurrent to the B-Tree
|
||
|
* contained in the structure.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to the index parameter block
|
||
|
*
|
||
|
* @rdesc Returns S_OK on success or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
#ifdef _DEBUG
|
||
|
static BYTE LastWord[4000] = {0};
|
||
|
static BYTE CurWord[4000] = {0};
|
||
|
#endif
|
||
|
|
||
|
HRESULT PASCAL AddRecordToLeaf (_LPIPB lpipb)
|
||
|
{
|
||
|
// Local Replacement Variables
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PFILEDATA pOutFile = &lpipb->OutFile; // Output data
|
||
|
PFILEDATA pInFile = &lpipb->InFile; // Input data
|
||
|
HFPB fOutput = pOutFile->fFile; // Output file
|
||
|
HFPB fInput = lpipb->InFile.fFile; // Input file
|
||
|
LPB pInCurPtr = lpipb->InFile.pCurrent; // Input buffer
|
||
|
PNODEINFO pNode;
|
||
|
LPB lpbWord; // Pointer to the word string
|
||
|
OCCF occf = lpipb->occf;
|
||
|
|
||
|
// Working Variables
|
||
|
DWORD dwTopicCount; // Number of topic in record
|
||
|
DWORD dwFieldId;
|
||
|
DWORD dwBlockSize; // Size of the entire occ block
|
||
|
LPB pDest;
|
||
|
WORD uStringSize;
|
||
|
ERRB errb;
|
||
|
|
||
|
|
||
|
// We always start from the leaf node
|
||
|
pNode = pTreeData->rgpNodeInfo[0];
|
||
|
|
||
|
// Set pointer to working buffer
|
||
|
pDest = pNode->pTmpResult;
|
||
|
|
||
|
// Advance input buffer to the word string
|
||
|
pInCurPtr += sizeof (DWORD);
|
||
|
lpbWord = pInCurPtr;
|
||
|
|
||
|
// Insert the word into the buffer
|
||
|
pDest += PrefixCompressWord (pDest, pInCurPtr,
|
||
|
pNode->pLastWord, pTreeData->fOccfLength);
|
||
|
|
||
|
// Get the word length
|
||
|
uStringSize = GETWORD((LPUW)pInCurPtr);
|
||
|
lpipb->dwTotalUniqueWordLen += uStringSize;
|
||
|
|
||
|
// Adjust for the word length storage
|
||
|
uStringSize += sizeof(SHORT);
|
||
|
|
||
|
// Skip the word
|
||
|
pInCurPtr += uStringSize;
|
||
|
|
||
|
#ifdef _DEBUG
|
||
|
STRCPY (LastWord, CurWord);
|
||
|
MEMCPY (CurWord, lpbWord + 2, GETWORD((LPUW)lpbWord));
|
||
|
CurWord[GETWORD((LPUW)lpbWord)] = 0;
|
||
|
if (STRCMP (LastWord, CurWord) > 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
// if (STRCMP (CurWord, "forbidden") == 0)
|
||
|
// _asm int 3;
|
||
|
#endif
|
||
|
|
||
|
// If OccfLength is set skip it now
|
||
|
// (It has already been appended to the compressed word)
|
||
|
if (pTreeData->fOccfLength)
|
||
|
pInCurPtr += CbByteUnpack(&dwBlockSize, pInCurPtr);
|
||
|
|
||
|
// Copy the FieldID
|
||
|
if (occf & OCCF_FIELDID)
|
||
|
{
|
||
|
CbByteUnpack (&dwFieldId, pInCurPtr);
|
||
|
do {
|
||
|
*pDest++ = *pInCurPtr;
|
||
|
} while (*pInCurPtr++ & 0x80);
|
||
|
}
|
||
|
|
||
|
// Get Topic Count
|
||
|
#if 0
|
||
|
CbByteUnpack (&dwTopicCount, pInCurPtr);
|
||
|
do
|
||
|
{
|
||
|
*pDest++ = *pInCurPtr;
|
||
|
} while (*pInCurPtr++ & 0x80);
|
||
|
#else
|
||
|
dwTopicCount = GETLONG((LPUL)pInCurPtr);
|
||
|
pInCurPtr += sizeof(DWORD);
|
||
|
pDest += CbBytePack(pDest, dwTopicCount);
|
||
|
#endif
|
||
|
|
||
|
// Check to see if this entry will fit in the leaf node
|
||
|
// We can't write the data block until we know where the entry
|
||
|
// will be stored. We must add in FOFFSET_SIZE to our current location
|
||
|
// to determine size. We ignore the block size field, so we might encroach
|
||
|
// on the slack by a few bytes.
|
||
|
if (pNode->cbLeft - pNode->Slack < (SHORT)(pDest -pNode->pTmpResult +FOFFSET_SIZE))
|
||
|
{
|
||
|
HRESULT fRet;
|
||
|
|
||
|
if ((fRet = AddRecordToStem (lpipb, lpbWord)) != S_OK)
|
||
|
return(fRet);
|
||
|
|
||
|
// If the prefix count is zero, no problem
|
||
|
// Else we have to re-copy the word, since we are in a new leaf node
|
||
|
if (0 != pNode->pTmpResult[1])
|
||
|
{
|
||
|
dwBlockSize = PrefixCompressWord (pNode->pTmpResult, lpbWord,
|
||
|
pNode->pLastWord, pTreeData->fOccfLength);
|
||
|
pDest = pNode->pTmpResult + dwBlockSize;
|
||
|
if (occf & OCCF_FIELDID)
|
||
|
pDest += CbBytePack (pDest, dwFieldId);
|
||
|
pDest += CbBytePack (pDest, dwTopicCount);
|
||
|
}
|
||
|
}
|
||
|
// Save new word as last word
|
||
|
MEMCPY (pNode->pLastWord, lpbWord, uStringSize + 2);
|
||
|
|
||
|
// Set pointer to beginning of data block
|
||
|
pDest += CopyFileOffset (pDest, pOutFile->foPhysicalOffset);
|
||
|
|
||
|
// Update the bytes left
|
||
|
pInFile->cbLeft -= (LONG) (pInCurPtr - pInFile->pCurrent);
|
||
|
#ifdef _DEBUG
|
||
|
if (pInFile->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
// Compress data block to output buffer and store it's compressed size
|
||
|
pInFile->pCurrent = pInCurPtr;
|
||
|
if ((dwBlockSize = WriteDataNode (lpipb, dwTopicCount, &errb)) == 0)
|
||
|
return errb;
|
||
|
pDest += CbBytePack (pDest, dwBlockSize);
|
||
|
|
||
|
// Copy the temp buffer to the real node
|
||
|
dwBlockSize = (DWORD)(pDest - pNode->pTmpResult);
|
||
|
MEMCPY (pNode->pCurPtr, pNode->pTmpResult, dwBlockSize);
|
||
|
pNode->pCurPtr += dwBlockSize;
|
||
|
pNode->cbLeft -= (WORD)dwBlockSize;
|
||
|
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func DWORD | AddRecordToStem |
|
||
|
* Add a key to a stem node, creating/flushing nodes as necessary.
|
||
|
*
|
||
|
* @parm LPB | lpbWord |
|
||
|
* The word to add the the stem node (last word in the full leaf node)
|
||
|
*
|
||
|
* @rdesc S_OK if successful, or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
HRESULT PASCAL AddRecordToStem (_LPIPB lpipb, LPB lpbWord)
|
||
|
{
|
||
|
SHORT CurLevel = 0;
|
||
|
PNODEINFO pStemNode;
|
||
|
PNODEINFO pLastNode;
|
||
|
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PNODEINFO pLeafNode = pTreeData->rgpNodeInfo[0];
|
||
|
LPB pLastWord;
|
||
|
int cbTemp;
|
||
|
ERRB errb = S_OK;
|
||
|
HRESULT fRet;
|
||
|
|
||
|
// Move up through stem nodes until space can be found/made
|
||
|
pStemNode = pLeafNode;
|
||
|
do
|
||
|
{
|
||
|
pLastWord = pStemNode->pLastWord;
|
||
|
pStemNode = pTreeData->rgpNodeInfo[++CurLevel];
|
||
|
if (pStemNode == NULL)
|
||
|
{ // Create a new stem node
|
||
|
if ((pStemNode = pTreeData->rgpNodeInfo[CurLevel] =
|
||
|
AllocBTreeNode (lpipb)) == NULL)
|
||
|
return SetErrCode (NULL, E_OUTOFMEMORY);
|
||
|
pStemNode->Slack = STEM_SLACK;
|
||
|
pStemNode->cbLeft = lpipb->BTreeData.Header.dwBlockSize
|
||
|
- sizeof(WORD);
|
||
|
if (++pTreeData->Header.cIdxLevels > MAX_TREE_HEIGHT)
|
||
|
return E_TREETOOBIG;
|
||
|
}
|
||
|
pTreeData->Header.nidLast++;
|
||
|
} while (pStemNode->cbLeft - pStemNode->Slack <
|
||
|
(SHORT)(GETWORD ((LPUW)pLastWord) + sizeof (SHORT) + FOFFSET_SIZE));
|
||
|
|
||
|
// Work back down through the nodes clearing them to disk
|
||
|
while (CurLevel > 1)
|
||
|
{
|
||
|
pLastNode = pTreeData->rgpNodeInfo[--CurLevel];
|
||
|
pLastWord = pLastNode->pLastWord;
|
||
|
// Copy word to stem node
|
||
|
if ((cbTemp = PrefixCompressWord (pStemNode->pCurPtr, pLastWord,
|
||
|
pStemNode->pLastWord, pTreeData->fOccfLength)) == 0)
|
||
|
{
|
||
|
return errb;
|
||
|
}
|
||
|
pStemNode->pCurPtr += cbTemp;
|
||
|
|
||
|
// Update the last word in the stem node
|
||
|
MEMCPY (pStemNode->pLastWord, pLastWord, GETWORD((LPUW)pLastWord)+ 2*sizeof(WORD));
|
||
|
|
||
|
// Set pointer in stem node
|
||
|
CopyFileOffset (pStemNode->pCurPtr,
|
||
|
lpipb->OutFile.foPhysicalOffset);
|
||
|
pStemNode->pCurPtr += FOFFSET_SIZE;
|
||
|
pStemNode->cbLeft -= FOFFSET_SIZE + cbTemp;
|
||
|
#ifdef _DEBUG
|
||
|
if (pStemNode->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
pStemNode = pTreeData->rgpNodeInfo[CurLevel];
|
||
|
if ((fRet = WriteStemNode (lpipb, pStemNode)) != S_OK)
|
||
|
return(fRet);
|
||
|
|
||
|
}
|
||
|
|
||
|
// Clear the leaf node into the first stem node & reset it
|
||
|
|
||
|
// Copy last word to stem node
|
||
|
if ((cbTemp = PrefixCompressWord (pStemNode->pCurPtr,
|
||
|
pLeafNode->pLastWord, pStemNode->pLastWord,
|
||
|
pTreeData->fOccfLength)) == 0)
|
||
|
{
|
||
|
return errb;
|
||
|
}
|
||
|
pStemNode->pCurPtr += cbTemp;
|
||
|
pStemNode->cbLeft -= cbTemp;
|
||
|
#ifdef _DEBUG
|
||
|
if (pStemNode->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
|
||
|
// Update the last word in the stem node
|
||
|
MEMCPY (pStemNode->pLastWord, pLeafNode->pLastWord,
|
||
|
GETWORD((LPUW)(pLeafNode->pLastWord))+2*sizeof(WORD));
|
||
|
|
||
|
// Set pointer to the leaf node
|
||
|
CopyFileOffset (pStemNode->pCurPtr, lpipb->OutFile.foPhysicalOffset);
|
||
|
pStemNode->pCurPtr += FOFFSET_SIZE;
|
||
|
pStemNode->cbLeft -= FOFFSET_SIZE;
|
||
|
#ifdef _DEBUG
|
||
|
if (pStemNode->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
// Flush leaf node to output buffer and reset it
|
||
|
return WriteLeafNode (lpipb);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func int | CompressDword |
|
||
|
* Compresses the input stream into the output buffer using a high
|
||
|
* bit encoding method. If the buffer is full it will be flushed to
|
||
|
* a file.
|
||
|
*
|
||
|
* @parm PFILEDATA | pOutput |
|
||
|
* Pointer to output buffer info
|
||
|
*
|
||
|
* @parm LPDWORD | pSrc |
|
||
|
* Pointer to the uncompressed input stream
|
||
|
*
|
||
|
* @rdesc Returns the number of compressed bytes buffered
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
int PASCAL CompressDword (PFILEDATA pOutput, DWORD dwValue)
|
||
|
{
|
||
|
LPB pDest = pOutput->pCurrent;
|
||
|
int cBytes = 0; // Count of compressed bytes
|
||
|
ERRB errb;
|
||
|
|
||
|
// Any room left in output buffer?
|
||
|
if (sizeof(DWORD) * 2 >= pOutput->cbLeft)
|
||
|
{
|
||
|
DWORD dwSize;
|
||
|
|
||
|
FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
(dwSize = (DWORD)(pDest - pOutput->pMem)), &errb);
|
||
|
pDest = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
|
||
|
while (dwValue)
|
||
|
{
|
||
|
*pDest = (BYTE)(dwValue & 0x7F);
|
||
|
cBytes++;
|
||
|
dwValue >>= 7;
|
||
|
if (dwValue != 0)
|
||
|
*pDest |= 0x80;
|
||
|
pDest++;
|
||
|
}
|
||
|
pOutput->pCurrent = pDest;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, (DWORD)cBytes);
|
||
|
pOutput->cbLeft -= cBytes;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
return cBytes;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func DWORD | WriteDataNode |
|
||
|
* Compresses the input stream into the output buffer. If the buffer
|
||
|
* is full it will be flushed to a file.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to global buffer
|
||
|
*
|
||
|
* @parm DWORD | dwTopicCount |
|
||
|
* The number of topics in the input stream
|
||
|
*
|
||
|
* @parm PHRESULT | phr |
|
||
|
* Error buffer
|
||
|
*
|
||
|
* @rdesc Returns the number of compressed bytes written
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC DWORD PASCAL FAR WriteDataNode (_LPIPB lpipb,
|
||
|
DWORD dwTopicCount, PHRESULT phr)
|
||
|
{
|
||
|
// Local replacement Variables
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PFILEDATA pOutput = &lpipb->OutFile; // Output data structure
|
||
|
PFILEDATA pInFile = &lpipb->InFile; // Input data structre
|
||
|
HFPB fFile = pOutput->fFile; // Output file handle
|
||
|
|
||
|
// Working Variables
|
||
|
DWORD dwBlockSize; // Size of block to compress
|
||
|
DWORD dwEncodedSize = 0; // Size of encoded block
|
||
|
DWORD dwTopicIdDelta; // Really only used for weight values
|
||
|
DWORD TopicLoop;
|
||
|
DWORD dwSlackSize;
|
||
|
DWORD loop;
|
||
|
DWORD dwTemp;
|
||
|
FILEOFFSET foStart; // Physical beginning of bit compression block
|
||
|
FLOAT rTerm; // Only used when IDXF_NORMALIZE is set
|
||
|
FLOAT rWeight; // Only used when IDXF_NORMALIZE is set
|
||
|
WORD wWeight; // Only used when IDXF_NORMALIZE is set
|
||
|
DWORD dwTopicId = 0; // Only used when IDXF_NORMALIZE is set
|
||
|
int cbTemp; // # of compressed bytes that uncompressed
|
||
|
OCCF occf = lpipb->occf;
|
||
|
HRESULT fRet;
|
||
|
|
||
|
foStart = pOutput->foPhysicalOffset;
|
||
|
wWeight = 0; // UNDONE: Don't need it
|
||
|
|
||
|
for (TopicLoop = dwTopicCount; TopicLoop > 0; --TopicLoop)
|
||
|
{
|
||
|
// Move to the byte boundary
|
||
|
if (pOutput->ibit != cbitBYTE - 1)
|
||
|
{
|
||
|
pOutput->ibit = cbitBYTE - 1;
|
||
|
|
||
|
if (--pOutput->cbLeft)
|
||
|
{
|
||
|
pOutput->pCurrent++;
|
||
|
pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwTemp = (DWORD)(pOutput->pCurrent - pOutput->pMem), phr) != (LONG)dwTemp)
|
||
|
return(0);
|
||
|
pOutput->pCurrent = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwTemp);
|
||
|
#ifdef _DEBUG
|
||
|
MEMSET (pOutput->pMem, 0, pOutput->dwMax);
|
||
|
#endif
|
||
|
}
|
||
|
}
|
||
|
// Store TopicId as necessary
|
||
|
if (pInFile->cbLeft < 2 * sizeof (DWORD))
|
||
|
{
|
||
|
MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
|
||
|
pInFile->cbLeft += FileRead (pInFile->fFile, pInFile->pMem + pInFile->cbLeft,
|
||
|
pInFile->dwMax - pInFile->cbLeft, phr);
|
||
|
pInFile->dwMax = pInFile->cbLeft;
|
||
|
pInFile->pCurrent = pInFile->pMem;
|
||
|
}
|
||
|
cbTemp = CbByteUnpack (&dwTopicIdDelta, pInFile->pCurrent);
|
||
|
dwTopicId += dwTopicIdDelta; // Get the real TopicID
|
||
|
if ((fRet = FAddDword (pOutput, dwTopicIdDelta,
|
||
|
lpipb->cKey[CKEY_TOPIC_ID])) != S_OK)
|
||
|
{
|
||
|
SetErrCode(phr, fRet);
|
||
|
return(0);
|
||
|
|
||
|
}
|
||
|
|
||
|
pInFile->pCurrent += cbTemp;
|
||
|
pInFile->cbLeft -= cbTemp;
|
||
|
|
||
|
if (occf & OCCF_HAVE_OCCURRENCE)
|
||
|
{
|
||
|
|
||
|
// Get number of occ data records for this topic
|
||
|
if (pInFile->cbLeft < 2 * sizeof (DWORD))
|
||
|
{
|
||
|
MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
|
||
|
pInFile->cbLeft += FileRead (pInFile->fFile,
|
||
|
pInFile->pMem + pInFile->cbLeft,
|
||
|
pInFile->dwMax - pInFile->cbLeft, phr);
|
||
|
pInFile->dwMax = pInFile->cbLeft;
|
||
|
pInFile->pCurrent = pInFile->pMem;
|
||
|
}
|
||
|
cbTemp = CbByteUnpack (&dwBlockSize, pInFile->pCurrent);
|
||
|
pInFile->pCurrent += cbTemp;
|
||
|
pInFile->cbLeft -= cbTemp;
|
||
|
}
|
||
|
|
||
|
// If we are term weighing we have to calculate the weight
|
||
|
if (lpipb->idxf & IDXF_NORMALIZE)
|
||
|
{
|
||
|
#ifndef ISBU_IR_CHANGE
|
||
|
// log10(x/y) == log10 (x) - log10 (y). Since x in our case is a known constant,
|
||
|
// 100,000,000, I'm replacing that with its equivalent log10 value of 8.0 and subtracting
|
||
|
// the log10(y) from it
|
||
|
rTerm = (float) (8.0 - log10((double) dwTopicCount));
|
||
|
// In extreme cases, rTerm could be 0 or even -ve (when dwTopicCount approaches or
|
||
|
// exceeds 100,000,000)
|
||
|
if (rTerm <= (float) 0.0)
|
||
|
rTerm = cVerySmallWt; // very small value. == log(100 mil/ 95 mil)
|
||
|
// NOTE : rWeight for the doc term would be as follows:
|
||
|
// rWeight = float(min(4096, dwBlockSize)) * rTerm / lpipb->wi.hrgsigma[dwTopicId]
|
||
|
//
|
||
|
// Since rTerm needs to be recomputed again for the query term weight computation,
|
||
|
// and since rTerm will be the same value for the current term ('cos N and n of log(N/n)
|
||
|
// are the same (N = 100 million and n is whatever the doc term freq is for the term),
|
||
|
// we will factor in the second rTerm at index time. This way, we don't have to deal
|
||
|
// with rTerm at search time (reduces computation and query time shortens)
|
||
|
//
|
||
|
// MV 2.0 initially did the same thing. However, BinhN removed the second rTerm
|
||
|
// because he decided to remove the rTerm altogether from the query term weight. He
|
||
|
// did that to keep the scores reasonably high.
|
||
|
|
||
|
rWeight = ((float) min(cTFThreshold, dwBlockSize)) * rTerm * rTerm / lpipb->wi.hrgsigma[dwTopicId];
|
||
|
// without the additional rTerm, we would probably be between 0.0 and 1.0
|
||
|
if (rWeight > rTerm)
|
||
|
wWeight = 0xFFFF;
|
||
|
else
|
||
|
wWeight = (WORD) ((float)0xFFFF * rWeight / rTerm);
|
||
|
#else
|
||
|
rTerm = (float) (65535.0 * 8) / (float)dwTopicCount;
|
||
|
|
||
|
rWeight = (float)dwBlockSize * rTerm / lpipb->wi.hrgsigma[dwTopicId];
|
||
|
if (rWeight >= 65535.0)
|
||
|
wWeight = 65335;
|
||
|
else
|
||
|
wWeight = (WORD)rWeight;
|
||
|
#endif // ISBU_IR_CHANGE
|
||
|
|
||
|
// Write the weight to the output buffer
|
||
|
if ((fRet = FWriteBits (&lpipb->OutFile, (DWORD)wWeight,
|
||
|
(BYTE)(sizeof (WORD) * cbitBYTE))) != S_OK)
|
||
|
{
|
||
|
SetErrCode (phr, fRet);
|
||
|
return(0);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Don't do anything else if there is nothing else to do!!!
|
||
|
if ((occf & OCCF_HAVE_OCCURRENCE) == 0)
|
||
|
continue;
|
||
|
|
||
|
// Write the OccCount
|
||
|
if ((fRet = FAddDword (pOutput, dwBlockSize,
|
||
|
lpipb->cKey[CKEY_OCC_COUNT])) != S_OK)
|
||
|
{
|
||
|
SetErrCode (phr, fRet);
|
||
|
return(0);
|
||
|
}
|
||
|
|
||
|
// Encode the occ block
|
||
|
for (loop = dwBlockSize; loop > 0; loop--)
|
||
|
{
|
||
|
int iIndex;
|
||
|
|
||
|
iIndex = CKEY_OCC_BASE;
|
||
|
|
||
|
// Make sure input buffer holds enough data
|
||
|
if (pInFile->cbLeft < 5 * sizeof (DWORD))
|
||
|
{
|
||
|
MEMMOVE (pInFile->pMem, pInFile->pCurrent, pInFile->cbLeft);
|
||
|
pInFile->cbLeft += FileRead (pInFile->fFile,
|
||
|
pInFile->pMem + pInFile->cbLeft,
|
||
|
pInFile->dwMax - pInFile->cbLeft, phr);
|
||
|
pInFile->dwMax = pInFile->cbLeft;
|
||
|
pInFile->pCurrent = pInFile->pMem;
|
||
|
}
|
||
|
|
||
|
if (occf & OCCF_COUNT)
|
||
|
{
|
||
|
cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent);
|
||
|
pInFile->pCurrent += cbTemp;
|
||
|
pInFile->cbLeft -= cbTemp;
|
||
|
if ((fRet = FAddDword (pOutput, dwTemp, lpipb->cKey[iIndex])) !=
|
||
|
S_OK)
|
||
|
{
|
||
|
SetErrCode (phr, fRet);
|
||
|
return(0);
|
||
|
}
|
||
|
iIndex++;
|
||
|
}
|
||
|
if (occf & OCCF_OFFSET)
|
||
|
{
|
||
|
cbTemp = CbByteUnpack (&dwTemp, pInFile->pCurrent);
|
||
|
pInFile->pCurrent += cbTemp;
|
||
|
pInFile->cbLeft -= cbTemp;
|
||
|
if ((fRet = FAddDword (pOutput, dwTemp, lpipb->cKey[iIndex])) !=
|
||
|
S_OK)
|
||
|
{
|
||
|
SetErrCode (phr, fRet);
|
||
|
return(0);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
// Advance to next byte (we are partially through a byte now)
|
||
|
pOutput->ibit = cbitBYTE - 1;
|
||
|
pOutput->pCurrent++;
|
||
|
pOutput->foPhysicalOffset = FoAddDw (pOutput->foPhysicalOffset, 1);
|
||
|
pOutput->cbLeft--;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
dwEncodedSize += DwSubFo (pOutput->foPhysicalOffset, foStart);
|
||
|
|
||
|
// Leave slack space, but not for uncommon words
|
||
|
if (dwTopicCount <= 2)
|
||
|
dwSlackSize = 0;
|
||
|
else
|
||
|
dwSlackSize = dwEncodedSize / 10;
|
||
|
|
||
|
dwEncodedSize += dwSlackSize;
|
||
|
// Keep a running total of all allocated slack space
|
||
|
pTreeData->Header.dwSlackCount += dwSlackSize;
|
||
|
|
||
|
while (dwSlackSize)
|
||
|
{
|
||
|
if (pOutput->cbLeft < (LONG)dwSlackSize)
|
||
|
{ // The slack block doesn't fit in the output buffer
|
||
|
// Write as much as we can then flush the buffer and write the rest
|
||
|
// MEMSET (pOutput->pCurrent, 0, pOutput->cbLeft);
|
||
|
DWORD dwSize;
|
||
|
|
||
|
dwSlackSize -= pOutput->cbLeft;
|
||
|
if (0 == FileWrite (fFile, pOutput->pMem,
|
||
|
dwSize = pOutput->dwMax, phr))
|
||
|
{
|
||
|
return 0;
|
||
|
}
|
||
|
pOutput->pCurrent = pOutput->pMem;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, pOutput->cbLeft);
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
else
|
||
|
{ // The slack fits, no problems
|
||
|
MEMSET (pOutput->pCurrent, 0, dwSlackSize);
|
||
|
pOutput->pCurrent += dwSlackSize;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, dwSlackSize);
|
||
|
pOutput->cbLeft -= dwSlackSize;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
dwSlackSize = 0;
|
||
|
}
|
||
|
}
|
||
|
return dwEncodedSize;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func void | WriteStemNode |
|
||
|
* Flushes a stem node in the BTree to the output buffer. Once flushed,
|
||
|
* the node is reset to the beginning and filled with zeros.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer the IPB structure
|
||
|
*
|
||
|
* @parm PNODEINFO | pNode |
|
||
|
* Pointer to the node to flush
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE HRESULT PASCAL WriteStemNode (_LPIPB lpipb, PNODEINFO pNode)
|
||
|
{
|
||
|
// Local Replacement Variable
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PFILEDATA pOutput = &lpipb->OutFile; // Output structure
|
||
|
LPB pDest; // Output buffer
|
||
|
LPB pStart = pNode->pBuffer; // Start of node buffer
|
||
|
|
||
|
// Local Working Variables
|
||
|
DWORD dwBytesLeft; // Bytes left to write
|
||
|
ERRB errb;
|
||
|
|
||
|
#if 0 // Use 2-bytes for cbLeft to simplify the work of update
|
||
|
// Compress CbLeft to output buffer
|
||
|
dwBytesLeft = lpipb->BTreeData.Header.dwBlockSize - FOFFSET_SIZE -
|
||
|
CompressDword (pOutput, (DWORD)pNode->cbLeft);
|
||
|
#else
|
||
|
*(LPUW)(pOutput->pCurrent) = (WORD)pNode->cbLeft;
|
||
|
pOutput->pCurrent += sizeof(WORD);
|
||
|
pOutput->cbLeft -= sizeof(WORD);
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, (DWORD)sizeof(WORD));
|
||
|
dwBytesLeft = lpipb->BTreeData.Header.dwBlockSize - sizeof(WORD);
|
||
|
#endif
|
||
|
pDest = pOutput->pCurrent;
|
||
|
|
||
|
// Keep a running total of all allocated slack space
|
||
|
pTreeData->Header.dwSlackCount += pNode->cbLeft;
|
||
|
|
||
|
// This is why the buffer must be >= BTREE_NODE_SIZE
|
||
|
// This could be put in a loop to avoid that restriction, but it
|
||
|
// is probably not worth it. (See also WriteLeafNode)
|
||
|
if (pOutput->cbLeft < (LONG)dwBytesLeft)
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
if (FileWrite (pOutput->fFile,
|
||
|
pOutput->pMem, dwSize = (DWORD)(pDest - pOutput->pMem), &errb) != dwSize)
|
||
|
return(errb);
|
||
|
|
||
|
pDest = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
MEMCPY (pDest, pStart, dwBytesLeft);
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, dwBytesLeft);
|
||
|
pOutput->cbLeft -= dwBytesLeft;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
// Set the external variable
|
||
|
pOutput->pCurrent = pDest + dwBytesLeft;
|
||
|
|
||
|
// Set to all zeros so we know when we have reached the end of data later
|
||
|
MEMSET (pNode->pBuffer, 0, lpipb->BTreeData.Header.dwBlockSize);
|
||
|
pNode->cbLeft = lpipb->BTreeData.Header.dwBlockSize - sizeof(WORD);
|
||
|
pNode->pCurPtr = pNode->pBuffer;
|
||
|
*(PUSHORT)pNode->pLastWord = 0;
|
||
|
return(S_OK);
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func void | WriteLeafNode |
|
||
|
* Flushes a leaf node in the BTree to the output buffer. Once flushed,
|
||
|
* the node is reset to the beginning and filled with zeros.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to index block
|
||
|
*
|
||
|
* @rdesc S_OK or other errors
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE HRESULT PASCAL NEAR WriteLeafNode (_LPIPB lpipb)
|
||
|
{
|
||
|
// Local Replacement Variables
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PFILEDATA pOutput = &lpipb->OutFile; // Output data structure
|
||
|
LPB pDest = pOutput->pCurrent; // Output buffer
|
||
|
FILEOFFSET OffsetPointer = pTreeData->OffsetPointer;
|
||
|
FILEOFFSET foPhysicalOffset = pOutput->foPhysicalOffset;
|
||
|
PNODEINFO pNode = pTreeData->rgpNodeInfo[0]; // Leaf node
|
||
|
LPB pStart = pNode->pBuffer; // Beginning of the node buffer
|
||
|
// Working Variables
|
||
|
DWORD dwLeft;
|
||
|
FILEOFFSET StartOffset; // Physical offset of the begining
|
||
|
// of the output buffer
|
||
|
ERRB errb;
|
||
|
|
||
|
// Backpatch the current offset to the last nodes pointer
|
||
|
if (!FoIsNil (OffsetPointer))
|
||
|
{
|
||
|
// Is the backpatch location in the output buffer?
|
||
|
if (FoCompare (OffsetPointer,
|
||
|
(StartOffset = FoSubFo (foPhysicalOffset,
|
||
|
MakeFo ((DWORD)(pDest - pOutput->pMem), 0)))) >= 0)
|
||
|
{
|
||
|
CopyFileOffset (pOutput->pMem + DwSubFo
|
||
|
(OffsetPointer, StartOffset), foPhysicalOffset);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (FileSeekWrite (pOutput->fFile, &foPhysicalOffset,
|
||
|
OffsetPointer, sizeof (DWORD), &errb) !=
|
||
|
sizeof (DWORD))
|
||
|
return(errb);
|
||
|
|
||
|
FileSeek (pOutput->fFile, StartOffset, 0, NULL);
|
||
|
}
|
||
|
}
|
||
|
// Set the backpatch location for next time
|
||
|
pTreeData->OffsetPointer = foPhysicalOffset;
|
||
|
|
||
|
// Skip the record pointer for this record (will be backpatched next time)
|
||
|
|
||
|
if (pOutput->cbLeft <= 0 )
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwSize = (DWORD)(pDest - pOutput->pMem), &errb) != dwSize)
|
||
|
return(errb);
|
||
|
|
||
|
pDest = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
MEMSET (pDest, 0, FOFFSET_SIZE);
|
||
|
pOutput->cbLeft -= FOFFSET_SIZE;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
pOutput->pCurrent = pDest + FOFFSET_SIZE;
|
||
|
pOutput->foPhysicalOffset = FoAddDw (foPhysicalOffset, FOFFSET_SIZE);
|
||
|
|
||
|
#if 0 // Use 2-bytes for cbLeft to simplify the work of update
|
||
|
// Compress CbLeft to output buffer
|
||
|
dwLeft = lpipb->BTreeData.Header.dwBlockSize - FOFFSET_SIZE -
|
||
|
CompressDword (pOutput, (DWORD)pNode->cbLeft);
|
||
|
#else
|
||
|
*(LPUW)(pOutput->pCurrent) = (WORD)pNode->cbLeft;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, (DWORD)sizeof(WORD));
|
||
|
pOutput->cbLeft -= sizeof(WORD);
|
||
|
dwLeft = lpipb->BTreeData.Header.dwBlockSize - FOFFSET_SIZE - sizeof(WORD);
|
||
|
pOutput->pCurrent += sizeof(WORD);
|
||
|
#endif
|
||
|
pDest = pOutput->pCurrent;
|
||
|
|
||
|
// Keep a running total of all allocated slack space
|
||
|
pTreeData->Header.dwSlackCount += pNode->cbLeft;
|
||
|
|
||
|
// This is why the buffer must be >= BTREE_NODE_SIZE
|
||
|
// This could be put in a loop to avoid that restriction, but it
|
||
|
// is probably not worth it. (See also WriteStemNode)
|
||
|
if (pOutput->cbLeft < (LONG)dwLeft)
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwSize = (DWORD)(pDest - pOutput->pMem), &errb) != dwSize)
|
||
|
return(errb);
|
||
|
|
||
|
pDest = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
MEMCPY (pDest, pStart, dwLeft);
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, dwLeft);
|
||
|
pOutput->cbLeft -= dwLeft;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
pOutput->pCurrent = pDest + dwLeft;
|
||
|
|
||
|
// Reset buffer back to beginning
|
||
|
MEMSET (pNode->pBuffer, 0, lpipb->BTreeData.Header.dwBlockSize);
|
||
|
pNode->pCurPtr = pNode->pBuffer;
|
||
|
|
||
|
// Set the bytes left in node block
|
||
|
pNode->cbLeft = lpipb->BTreeData.Header.dwBlockSize -
|
||
|
FOFFSET_SIZE - sizeof(WORD);
|
||
|
*(PUSHORT)pNode->pLastWord = 0;
|
||
|
return(S_OK);
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func PNODEINFO | AllocBTreeNode |
|
||
|
* Allocates memory for the node structure as well as the data buffer
|
||
|
* contained in the structure.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to index parameter block
|
||
|
*
|
||
|
* @rdesc Returns a pointer to the newly allocated node
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb)
|
||
|
{
|
||
|
PNODEINFO pNode;
|
||
|
|
||
|
// Allocate node structure
|
||
|
if ((pNode = GlobalLockedStructMemAlloc (sizeof (NODEINFO))) == NULL)
|
||
|
{
|
||
|
exit0:
|
||
|
SetErrCode (NULL, E_OUTOFMEMORY);
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
// Allocate data buffer
|
||
|
if ((pNode->hMem =
|
||
|
_GLOBALALLOC (DLLGMEM_ZEROINIT,
|
||
|
pNode->dwBlockSize = lpipb->BTreeData.Header.dwBlockSize)) == NULL)
|
||
|
{
|
||
|
exit1:
|
||
|
GlobalLockedStructMemFree(pNode);
|
||
|
goto exit0;
|
||
|
}
|
||
|
pNode->pCurPtr = pNode->pBuffer = (LPB)_GLOBALLOCK (pNode->hMem);
|
||
|
|
||
|
// Allocate a buffer with the maximum word length, which is the block
|
||
|
// size
|
||
|
|
||
|
if ((pNode->hLastWord =
|
||
|
_GLOBALALLOC (DLLGMEM_ZEROINIT, pNode->dwBlockSize)) == NULL)
|
||
|
{
|
||
|
exit2:
|
||
|
FreeHandle (pNode->hMem);
|
||
|
goto exit1;
|
||
|
}
|
||
|
pNode->pLastWord = (LPB)_GLOBALLOCK (pNode->hLastWord);
|
||
|
|
||
|
// Alllocate temporary result buffer.
|
||
|
|
||
|
if ((pNode->hTmp =
|
||
|
_GLOBALALLOC (DLLGMEM_ZEROINIT, pNode->dwBlockSize)) == NULL)
|
||
|
{
|
||
|
FreeHandle (pNode->hLastWord);
|
||
|
goto exit2;
|
||
|
}
|
||
|
pNode->pTmpResult = (LPB)_GLOBALLOCK (pNode->hTmp);
|
||
|
|
||
|
return pNode;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func VOID | FreeBTreeNode |
|
||
|
* Free all memory allocated for the node
|
||
|
*
|
||
|
* @parm PNODEINFO | pNode |
|
||
|
* BTree node to be freed
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode)
|
||
|
{
|
||
|
if (pNode == NULL)
|
||
|
return;
|
||
|
FreeHandle (pNode->hTmp);
|
||
|
FreeHandle (pNode->hMem);
|
||
|
FreeHandle (pNode->hLastWord);
|
||
|
GlobalLockedStructMemFree(pNode);
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func HRESULT | PrefixCompressWord |
|
||
|
* Adds a word to a record based on the last word in the node.
|
||
|
*
|
||
|
* @parm LPB | pDest |
|
||
|
* Pointer to the destination buffer
|
||
|
*
|
||
|
* @parm LPB | lpbWord |
|
||
|
* Pointer to the word string to add to node. The format is:
|
||
|
* - 2-byte: string length
|
||
|
* - n-byte: the string itself
|
||
|
* - cbBytePack: real word length
|
||
|
*
|
||
|
* @parm LPB | pLastWord |
|
||
|
* Pointer to the last word entered in the destination buffer
|
||
|
*
|
||
|
* @parm int | fOccfLengthSet |
|
||
|
* Set to 1 if OCCF_LENGTH field is set, else 0
|
||
|
*
|
||
|
* @parm PHRESULT | pErrb |
|
||
|
* Pointer to error structure
|
||
|
*
|
||
|
* @rdesc returns number of bytes written to the destination buffer
|
||
|
* @rcomm
|
||
|
* Strings are compressed based on how many beginning bytes
|
||
|
* (prefix) it has in common woth the previous word. The format is
|
||
|
* - String's length : 2-byte CbPacked
|
||
|
* - Prefix length : 1-byte (0 - 127). If the high bit is set
|
||
|
* another word length is to follow the word
|
||
|
* - Word : n-byte without the prefix
|
||
|
* - Word's real length - 2-byte CbPacked: only exist if the
|
||
|
* prefix length high bit is set
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC int PASCAL FAR PrefixCompressWord
|
||
|
(LPB pDest, LPB lpbWord, LPB pLastWord, int fOccfLengthSet)
|
||
|
{
|
||
|
// Working Variables
|
||
|
int bPrefix; // The number of prefix bytes that match
|
||
|
unsigned int wPostfix; // Bytes left over that don't match
|
||
|
USHORT cbMinWordLen; // Smallest word size between the two words
|
||
|
LPB pStart = pDest; // Starting position
|
||
|
DWORD dwRealLength; // The real length of the word
|
||
|
|
||
|
|
||
|
// Get the minimum word length
|
||
|
wPostfix = GETWORD ((LPUW)lpbWord);
|
||
|
if ((cbMinWordLen = GETWORD ((LPUW)pLastWord)) > wPostfix)
|
||
|
cbMinWordLen = (USHORT) wPostfix;
|
||
|
|
||
|
// Add one to adjust for two byte word headers (saves an add in the loop)
|
||
|
cbMinWordLen++;
|
||
|
|
||
|
for (bPrefix = 2; bPrefix <= cbMinWordLen; bPrefix++)
|
||
|
{
|
||
|
if (lpbWord[bPrefix] != pLastWord[bPrefix])
|
||
|
break;
|
||
|
}
|
||
|
// Adjust back to the real value
|
||
|
bPrefix -= 2;
|
||
|
|
||
|
// Prefix must be <= 127 (high bit is used to indicate fOccfLength field)
|
||
|
if (bPrefix > 127)
|
||
|
bPrefix = 127;
|
||
|
|
||
|
cbMinWordLen = (USHORT) wPostfix; // Save the word length
|
||
|
wPostfix -= bPrefix;
|
||
|
|
||
|
// Add wLen to wPostfix to get total byte count then write it.
|
||
|
// The extra byte is for the prefix byte
|
||
|
pDest += (USHORT)CbBytePack (pDest, (DWORD)(wPostfix + 1));
|
||
|
|
||
|
// If WordLen == string length then don't write WordLen
|
||
|
if (fOccfLengthSet)
|
||
|
{
|
||
|
CbByteUnpack (&dwRealLength, lpbWord + sizeof(WORD) + cbMinWordLen );
|
||
|
if (dwRealLength == cbMinWordLen)
|
||
|
fOccfLengthSet = FALSE;
|
||
|
}
|
||
|
|
||
|
// Write prefix size
|
||
|
// If fOccfLengthSet is set, set high bit of bPrefix
|
||
|
if (fOccfLengthSet)
|
||
|
*pDest = bPrefix | 0x80;
|
||
|
else
|
||
|
*pDest = (BYTE) bPrefix;
|
||
|
pDest++;
|
||
|
|
||
|
// Copy the postfix string over
|
||
|
MEMCPY (pDest, lpbWord + (bPrefix + sizeof (SHORT)), wPostfix);
|
||
|
pDest += wPostfix;
|
||
|
|
||
|
// if fOccfLengthSet is set append WordLen to end of word
|
||
|
// (WordLen field follows word in input stream)
|
||
|
if (fOccfLengthSet)
|
||
|
pDest += CbBytePack (pDest, dwRealLength);
|
||
|
|
||
|
return (int)(pDest - pStart);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func void | FlushAllNodes |
|
||
|
* Flushes the remaining nodes to disk when the tree is completely built.
|
||
|
*
|
||
|
* @parm _LPIPB | lpipb |
|
||
|
* Pointer to index block
|
||
|
*
|
||
|
* @rdesc S_OK on success or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
HRESULT PASCAL FlushAllNodes (_LPIPB lpipb)
|
||
|
{
|
||
|
PBTREEDATA pTreeData = &lpipb->BTreeData;
|
||
|
PFILEDATA pOutput = &lpipb->OutFile;
|
||
|
|
||
|
PNODEINFO pLeafNode;
|
||
|
PNODEINFO pStemNode;
|
||
|
int WordSize;
|
||
|
BYTE curLevel = 0;
|
||
|
ERRB errb = S_OK;
|
||
|
HRESULT fRet;
|
||
|
|
||
|
pStemNode = pTreeData->rgpNodeInfo[0];
|
||
|
|
||
|
while (pTreeData->rgpNodeInfo[++curLevel] != NULL)
|
||
|
{
|
||
|
pLeafNode = pStemNode;
|
||
|
pStemNode = pTreeData->rgpNodeInfo[curLevel];
|
||
|
|
||
|
if ((WordSize = PrefixCompressWord (pStemNode->pCurPtr,
|
||
|
pLeafNode->pLastWord, pStemNode->pLastWord,
|
||
|
pTreeData->fOccfLength)) == 0)
|
||
|
{
|
||
|
return errb;
|
||
|
}
|
||
|
// Save new word as last word
|
||
|
MEMCPY (pStemNode->pLastWord, pLeafNode->pLastWord,
|
||
|
GETWORD ((LPUW)pLeafNode->pLastWord) + 2);
|
||
|
|
||
|
pStemNode->pCurPtr += WordSize;
|
||
|
pStemNode->cbLeft -= WordSize;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
CopyFileOffset (pStemNode->pCurPtr,
|
||
|
lpipb->OutFile.foPhysicalOffset);
|
||
|
pStemNode->pCurPtr += FOFFSET_SIZE;
|
||
|
pStemNode->cbLeft -= FOFFSET_SIZE;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
if (curLevel == 1)
|
||
|
{
|
||
|
if ((fRet = WriteLeafNode (lpipb)) != S_OK)
|
||
|
return(fRet);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ((fRet = WriteStemNode (lpipb, pLeafNode)) != S_OK)
|
||
|
return(fRet);
|
||
|
}
|
||
|
}
|
||
|
// Set the pointer to the top stem node
|
||
|
pTreeData->Header.foIdxRoot = pOutput->foPhysicalOffset;
|
||
|
pTreeData->Header.nidIdxRoot = pOutput->foPhysicalOffset.dwOffset;
|
||
|
|
||
|
if (curLevel == 1)
|
||
|
{
|
||
|
if ((fRet = WriteLeafNode (lpipb)) != S_OK)
|
||
|
return(fRet);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if ((fRet = WriteStemNode (lpipb, pStemNode)) != S_OK)
|
||
|
return(fRet);
|
||
|
}
|
||
|
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
|
||
|
// Flush the output buffer
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem), &errb) != dwSize)
|
||
|
return(errb);
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
}
|
||
|
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
// WriteBitStreamDWord
//
// Bitstream scheme: writes "dw" one-bits followed by a zero-bit. The
// one-bits are handed to FWriteBits in runs of at most
// cbitBYTE * sizeof(DWORD) bits, each run taken from argdwBits[run length].
// ckeyCenter is not used by this scheme.
//
// Returns the first non-S_OK result of FWriteBits, otherwise the result of
// writing the terminating zero-bit.
PRIVATE HRESULT PASCAL NEAR WriteBitStreamDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    const DWORD cbitMaxRun = cbitBYTE * sizeof(DWORD);
    BYTE        cbitRun;
    HRESULT     fRet;

    while (dw != 0)
    {
        // Take a full run, or whatever is left if that is shorter
        cbitRun = (BYTE)((dw < cbitMaxRun) ? dw : cbitMaxRun);
        dw -= cbitRun;

        fRet = FWriteBits(pOutput, argdwBits[cbitRun], cbitRun);
        if (fRet != S_OK)
            return fRet;
    }

    // Terminating zero-bit
    return FWriteBool(pOutput, 0);
}
|
||
|
|
||
|
// WriteFixedDWord
//
// Fixed scheme: hands "dw" to FWriteBits with a width of (ckeyCenter + 1)
// bits and returns FWriteBits' result.
PRIVATE HRESULT PASCAL NEAR WriteFixedDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    BYTE cbitWidth = (BYTE)(ckeyCenter + 1);

    return FWriteBits (pOutput, dw, cbitWidth);
}
|
||
|
|
||
|
// WriteBellDWord
//
// The "BELL" scheme. With cbit = CbitBitsDw(dw):
//   - cbit <= ckeyCenter: a zero-bit is written, followed by "dw" in
//     ckeyCenter bits.
//   - otherwise: (cbit - ckeyCenter) bits taken from
//     argdwBits[cbit - ckeyCenter] are written (the one-bits), followed by
//     "dw" masked with argdwBits[cbit - 1] (i.e. with its high bit zeroed)
//     in cbit bits.
//
// Returns the first non-S_OK result of the bit writers, else the result of
// the last write.
PRIVATE HRESULT PASCAL NEAR WriteBellDWord (PFILEDATA pOutput, DWORD dw,
    int ckeyCenter)
{
    BYTE    cbitNeeded = (BYTE)CbitBitsDw(dw);
    HRESULT fRet;

    if (cbitNeeded > ckeyCenter)
    {
        // Large value: it won't fit in "ckeyCenter" bits. First write
        // out "cbitNeeded - ckeyCenter" one-bits.
        fRet = FWriteBits(pOutput, argdwBits[cbitNeeded - ckeyCenter],
            (BYTE)(cbitNeeded - ckeyCenter));
        if (fRet != S_OK)
            return fRet;

        // Then the value itself in "cbitNeeded" bits, high bit zeroed
        return FWriteBits(pOutput, dw & argdwBits[cbitNeeded - 1],
            cbitNeeded);
    }

    // Small value: a zero, then "ckeyCenter" bits of the value, which
    // is guaranteed to be enough
    fRet = FWriteBool(pOutput, 0);
    if (fRet != S_OK)
        return fRet;

    return FWriteBits(pOutput, dw, (BYTE)(ckeyCenter));
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func HRESULT | FWriteBits |
|
||
|
* Writes a bunch of bits into the output buffer.
|
||
|
*
|
||
|
* @parm PFILEDATA | pOutput |
|
||
|
* Pointer to the output data structure
|
||
|
*
|
||
|
* @parm DWORD | dwVal |
|
||
|
* DWORD value to write
|
||
|
*
|
||
|
* @parm BYTE | cbits |
|
||
|
* Number of bits to write from dwVal
|
||
|
*
|
||
|
* @rdesc Returns S_OK on success or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC HRESULT FAR PASCAL FWriteBits (PFILEDATA pOutput, DWORD dwVal, BYTE cBits)
|
||
|
{
|
||
|
BYTE cbitThisPassBits;
|
||
|
BYTE bThis;
|
||
|
ERRB errb;
|
||
|
static DWORD Count = 0;
|
||
|
|
||
|
// Loop until no bits left
|
||
|
for (; cBits;)
|
||
|
{
|
||
|
|
||
|
if (pOutput->ibit < 0)
|
||
|
{
|
||
|
pOutput->pCurrent++;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, 1);
|
||
|
pOutput->cbLeft--;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
// Room left in output buffer?
|
||
|
if (pOutput->cbLeft <= 256)
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem), &errb) !=
|
||
|
dwSize)
|
||
|
return(errb);
|
||
|
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->pCurrent = pOutput->pMem;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset,
|
||
|
dwSize);
|
||
|
#ifdef _DEBUG
|
||
|
// MEMSET (pOutput->pMem, 0, pOutput->dwMax);
|
||
|
// Count++;
|
||
|
// if (!FoEquals(pOutput->foStartOffset, pOutput->foPhysicalOffset))
|
||
|
// _asm int 3;
|
||
|
#endif
|
||
|
}
|
||
|
pOutput->ibit = cbitBYTE - 1;
|
||
|
}
|
||
|
else
|
||
|
{ // Write some bits.
|
||
|
cbitThisPassBits = (pOutput->ibit + 1 < cBits) ?
|
||
|
pOutput->ibit + 1 : cBits;
|
||
|
bThis = (pOutput->ibit == cbitBYTE - 1) ?
|
||
|
0 : *pOutput->pCurrent;
|
||
|
bThis |= ((dwVal >> (cBits - cbitThisPassBits)) <<
|
||
|
(pOutput->ibit - cbitThisPassBits + 1));
|
||
|
*pOutput->pCurrent = (BYTE)bThis;
|
||
|
pOutput->ibit -= cbitThisPassBits;
|
||
|
cBits -= (BYTE)cbitThisPassBits;
|
||
|
}
|
||
|
}
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* @doc PRIVATE INDEXING
|
||
|
*
|
||
|
* @func HRESULT | FWriteBool |
|
||
|
* Writes a single bit into the output buffer.
|
||
|
*
|
||
|
* @parm PFILEDATA | pOutput |
|
||
|
* Pointer to the output data structure
|
||
|
*
|
||
|
* @parm BOOL | dwVal |
|
||
|
* BOOL value to write
|
||
|
*
|
||
|
* @rdesc Returns S_OK on success or errors if failed
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE HRESULT NEAR PASCAL FWriteBool (PFILEDATA pOutput, BOOL fVal)
|
||
|
{
|
||
|
HRESULT fRet = E_FAIL;
|
||
|
ERRB errb;
|
||
|
|
||
|
if (pOutput->ibit < 0)
|
||
|
{ // This byte is full, point to a new byte
|
||
|
pOutput->pCurrent++;
|
||
|
pOutput->foPhysicalOffset =
|
||
|
FoAddDw (pOutput->foPhysicalOffset, 1);
|
||
|
pOutput->cbLeft--;
|
||
|
#ifdef _DEBUG
|
||
|
if (pOutput->cbLeft <= 0)
|
||
|
SetErrCode (NULL, E_ASSERT);
|
||
|
#endif
|
||
|
|
||
|
// Room left in output buffer?
|
||
|
if (pOutput->cbLeft <= sizeof(DWORD))
|
||
|
{
|
||
|
LONG dwSize;
|
||
|
if (FileWrite (pOutput->fFile, pOutput->pMem,
|
||
|
dwSize = (DWORD)(pOutput->pCurrent - pOutput->pMem), &errb) != dwSize)
|
||
|
return(errb);
|
||
|
|
||
|
pOutput->pCurrent = pOutput->pMem;
|
||
|
pOutput->cbLeft = pOutput->dwMax;
|
||
|
pOutput->foStartOffset = FoAddDw(pOutput->foStartOffset, dwSize);
|
||
|
#ifdef _DEBUG
|
||
|
MEMSET (pOutput->pMem, 0, pOutput->dwMax);
|
||
|
#endif
|
||
|
}
|
||
|
pOutput->ibit = cbitBYTE - 1;
|
||
|
}
|
||
|
if (pOutput->ibit == cbitBYTE - 1) // Zero out a brand-new byte.
|
||
|
*pOutput->pCurrent = (BYTE)0;
|
||
|
if (fVal) // Write my boolean.
|
||
|
*pOutput->pCurrent |= 1 << pOutput->ibit;
|
||
|
pOutput->ibit--;
|
||
|
return S_OK; // Fine.
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
 *
 *  @doc    PRIVATE INDEXING
 *
 *  @func   HRESULT | BuildBtreeFromEso |
 *      Builds the permanent index from an existing external sort
 *      output file. A fresh index parameter block is created, the
 *      statistics saved in the external sort info file are copied into
 *      it, the normalization buffers are allocated when IDXF_NORMALIZE
 *      is set, and BuildBTree is then called.
 *
 *  @parm   HFPB | hfpb |
 *      Passed through to BuildBTree
 *
 *  @parm   LPSTR | pstrFilename |
 *      Passed through to BuildBTree
 *
 *  @parm   LPB | lpbEsiFile |
 *      Name of the external sort info file; it starts with the saved
 *      IPB structure
 *
 *  @parm   LPB | lpbEsoFile |
 *      External sort output file, passed through to BuildBTree
 *
 *  @parm   PINDEXINFO | pIndexInfo |
 *      Index information used to create the index parameter block
 *
 *  @rdesc  The result of BuildBTree, or E_OUTOFMEMORY if an allocation
 *      fails, E_NOTEXIST if the info file can't be opened, E_FAIL if the
 *      saved IPB can't be read completely
 *
 *************************************************************************/

HRESULT PASCAL FAR BuildBtreeFromEso (HFPB hfpb,
    LPSTR pstrFilename, LPB lpbEsiFile,
    LPB lpbEsoFile, PINDEXINFO pIndexInfo)
{
    _LPIPB  lpipb;
    HRESULT fRet;
    ERRB    errb;
    IPB     ipb;
    HFILE   hFile;
    UINT    cbRead;

    if ((lpipb = MVIndexInitiate(pIndexInfo, NULL)) == NULL)
        return E_OUTOFMEMORY;

    /* Read in the external sort buffer info */

    if ((hFile = _lopen (lpbEsiFile, READ)) == HFILE_ERROR)
    {
        // Don't leak the parameter block that was just created
        MVIndexDispose (lpipb);
        return E_NOTEXIST;
    }

    /* Read old IPB info. The file is not needed afterwards, so the
     * handle is closed right away (_lclose comes from the same header
     * as _lopen / _lread).
     */
    cbRead = _lread (hFile, &ipb, sizeof(IPB));
    _lclose (hFile);

    if (cbRead != sizeof(IPB))
    {
        // A failed or short read would leave ipb (partly) uninitialized
        MVIndexDispose (lpipb);
        return E_FAIL;
    }

    /* Transfer meaningful data */

    lpipb->dwIndexedWord = ipb.dwIndexedWord;
    lpipb->dwUniqueWord = ipb.dwUniqueWord;
    lpipb->dwByteCount = ipb.dwByteCount;
    lpipb->dwOccOffbits = ipb.dwOccOffbits;
    lpipb->dwOccExtbits = ipb.dwOccExtbits;
    lpipb->dwMaxFieldId = ipb.dwMaxFieldId;
    lpipb->dwMaxWCount = ipb.dwMaxWCount;
    lpipb->dwMaxOffset = ipb.dwMaxOffset;
    lpipb->dwTotal3bWordLen = ipb.dwTotal3bWordLen;
    lpipb->dwTotal2bWordLen = ipb.dwTotal2bWordLen;
    lpipb->dwTotalUniqueWordLen = ipb.dwTotalUniqueWordLen;
    lpipb->lcTopics = ipb.lcTopics;
    lpipb->dwMaxTopicId = ipb.dwMaxTopicId;
    // dwMemAllowed is deliberately not taken from the saved IPB
    lpipb->dwMaxRecordSize = ipb.dwMaxRecordSize;
    lpipb->dwMaxEsbRecSize = ipb.dwMaxEsbRecSize;
    lpipb->dwMaxWLen = ipb.dwMaxWLen;
    lpipb->idxf = ipb.idxf;

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        // One SIGMA entry per topic id (0 .. dwMaxTopicId)
        if ((lpipb->wi.hSigma = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (LCB)((lpipb->dwMaxTopicId + 1) * sizeof (SIGMA)))) == NULL)
        {
            fRet = SetErrCode (&errb, E_OUTOFMEMORY);
            MVIndexDispose (lpipb);
            return fRet;
        }
        lpipb->wi.hrgsigma = (HRGSIGMA)_GLOBALLOCK (lpipb->wi.hSigma);

        if ((lpipb->wi.hLog = _GLOBALALLOC (DLLGMEM_ZEROINIT,
            (CB)(cLOG_MAX * sizeof (FLOAT)))) == NULL)
        {
            SetErrCode (&errb, (HRESULT)(fRet = E_OUTOFMEMORY));
            {
                FreeHandle (lpipb->wi.hSigma);
            }
            MVIndexDispose (lpipb);
            return fRet;
        }
    }

    // Build the permanent index. Its result is what the caller gets;
    // it must not be replaced by S_OK on the way out.
    fRet = BuildBTree(NULL, lpipb, lpbEsoFile, hfpb, pstrFilename);

    if (lpipb->idxf & IDXF_NORMALIZE)
    {
        FreeHandle (lpipb->wi.hLog);
    }

    // As before, hSigma is released on every exit path after the build,
    // including the one where this function did not allocate it
    {
        FreeHandle (lpipb->wi.hSigma);
    }
    MVIndexDispose (lpipb);
    return fRet;
}
|