454 lines
19 KiB
C
454 lines
19 KiB
C
/*************************************************************************
|
|
* *
|
|
* IINDEX.H *
|
|
* *
|
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
|
* All Rights reserved. *
|
|
* *
|
|
**************************************************************************
|
|
* *
|
|
* Current Owner: BinhN *
|
|
* *
|
|
**************************************************************************/
|
|
|
|
|
|
/******************************************
|
|
* Internal sort stuff.
|
|
******************************************/
|
|
#ifdef _32BIT
|
|
#define MAX_BLOCK_SIZE (DWORD)0x80000
|
|
#else
|
|
#define MAX_BLOCK_SIZE (DWORD)0x0000FF00
|
|
#endif
|
|
|
|
typedef struct _list
|
|
{
|
|
struct _list FAR * pNext;
|
|
} FAR *PLIST;
|
|
|
|
// - - - - - - - - -
|
|
|
|
// Tree data types
|
|
typedef struct OCCDATA
|
|
{
|
|
struct OCCDATA FAR *pNext; // Linked-list chain
|
|
DWORD OccData[1]; // Array of n-DWORD
|
|
} OCCDATA,
|
|
FAR *POCCDATA;
|
|
|
|
typedef struct TOPICDATA
|
|
{
|
|
struct TOPICDATA FAR *pNext; // Linked-list chain 4
|
|
DWORD dwOccCount; // Count of occurrences in list 4
|
|
DWORD dwTopicId; // TopicId for this topic 4
|
|
POCCDATA pOccData; // First OccData in list 4
|
|
POCCDATA pLastOccData; // Last inserted OccData 4
|
|
} TOPICDATA, // = 20
|
|
FAR *PTOPICDATA;
|
|
|
|
typedef struct STRDATA
|
|
{
|
|
PTOPICDATA pTopic; // First Topic in list 4
|
|
PTOPICDATA pLastTopic; // Last inserted Topic 4
|
|
LPB pText; // Sort word as a Pascal string 4
|
|
DWORD dwField; // Field Id for the sort word 4
|
|
DWORD dwTopicCount; // Count of Topics in list 4
|
|
DWORD dwWordLength; // Word length (from OCC data) 4
|
|
} STRDATA, // = 24
|
|
FAR *PSTRDATA;
|
|
|
|
typedef struct BTNODE
|
|
{
|
|
enum TREECOLOR {RED, BLACK} color; // Color of node - for balancing 4
|
|
struct BTNODE FAR *pParent; // Pointer to parent node 4
|
|
struct BTNODE FAR *pLeft; // Pointer to left child node 4
|
|
struct BTNODE FAR *pRight; // Pointer to right child node 4
|
|
STRDATA StringData; // Pointer to string data 24
|
|
} BTNODE, // = 32
|
|
FAR *PBTNODE;
|
|
|
|
|
|
typedef struct MERGEHEADER
|
|
{
|
|
DWORD dwRecordSize;
|
|
LPB lpbWord; // Pascal string
|
|
DWORD dwFieldId; // Field Id
|
|
DWORD dwWordLength; // Real life word length
|
|
DWORD dwStrLen; // Current string length
|
|
DWORD dwTopicCount; // Topic count
|
|
DWORD dwLastTopicId; // Last topic id
|
|
PTOPICDATA pTopic; // Pointer to first Topic in list
|
|
PTOPICDATA pLastTopic; // Last inserted Topic
|
|
FILEOFFSET foTopicCount; // Backpatching address
|
|
LPB pTopicCount; // Pointer to topic count location
|
|
BYTE fEmitRecord; // Flag to denote rec is emitted
|
|
BYTE Pad1; // Padding for DWORD aligned
|
|
} MERGEHEADER, FAR *PMERGEHEADER;
|
|
|
|
|
|
// Typedefs for an external sort buffer. Each of these has associated
|
|
// with it a large (easily > 1meg) block of sorted words. A few of
|
|
// these words will end up in an internal buffer. These external sort
|
|
// buffers will be formed into a chain, one chain will have associated
|
|
// with it in total all of the words that are going to be sorted. A
|
|
// merge will be performed on the words associated with the chain to
|
|
// produce a final sorted list of words.
|
|
|
|
typedef struct InternalSortInfo
|
|
{
|
|
HFPB hfpb; // Handle to temp file
|
|
PBTNODE pBalanceTree; // Root node of the balanced tree
|
|
FILEOFFSET lfo; // File offset
|
|
FILEOFFSET lfoRecBackPatch; // Backpatching record offset
|
|
DWORD dwRecLength; // Record (data associated with 1 word) length
|
|
HANDLE hSortBuffer; // Handle to sort buffer
|
|
BYTE FAR *pSortBuffer; // Memory buffer for file output
|
|
BYTE FAR *pStartRec; // Record start point in the buffer
|
|
BYTE FAR *pCurPtr; // Current insertion point in the buffer
|
|
DWORD dwMaxEsbRecSize; // Maximum record size of current ESB
|
|
BYTE DeepLevel; // Deepest level of the tree
|
|
BYTE Pad1;
|
|
BYTE Pad2;
|
|
BYTE Pad3;
|
|
BYTE aszTempName[_MAX_PATH]; // Temp file for tree flush, ericjut: change from cbMAX_PATH to _MAX_PATH
|
|
} ISI,
|
|
FAR *LPISI;
|
|
|
|
typedef HANDLE HESB;
|
|
|
|
typedef struct ExternalSortBuffer
|
|
{
|
|
HANDLE hStruct; // This structure's handle. MUST BE 1ST!!
|
|
struct ExternalSortBuffer FAR *lpesbNext; // Next buffer in the list.
|
|
FILEOFFSET lfo; // This starts out as an offset in the
|
|
// temp file at which the first word
|
|
// associated with this buffer will
|
|
// be found. As words are disposed
|
|
// of it will increment.
|
|
FILEOFFSET lfoMax; // This is the offset of the end of
|
|
// the area of the temp file that
|
|
// contains words for this external
|
|
// sort buffer.
|
|
DWORD dwEsbSize; // Actual size of the internal buffer.
|
|
DWORD ibBuf; // Pointer to the current record in
|
|
// the internal buffer.
|
|
HANDLE hMem; // Handle to buffered block.
|
|
LRGB lrgbMem; // Pointer to buffered block.
|
|
} ESB, FAR *LPESB;
|
|
|
|
|
|
// - - - - - - - - -
|
|
|
|
// Information about the external sort process as a while.
|
|
|
|
typedef struct ExternalSortInfo
|
|
{
|
|
FILEOFFSET lfoTempOffset; // Current size of the output file
|
|
HFPB hfpb; // Handle to ouput file
|
|
LPFBI lpfbiTemp; // Temp file buffer
|
|
DWORD cesb; // Number of ESB blocks allocated
|
|
LPESB lpesbRoot; // First buffer in the external-buffer linked-list
|
|
DWORD cbEsbBuf; // The size of each ESB buffer.
|
|
DWORD uiQueueSize; // Priority queue's size
|
|
GHANDLE hPriorityQueue; // Handle to Priority Queue
|
|
LPESB FAR *lrgPriorityQueue; // Priority Queue
|
|
|
|
// Output buffer handling
|
|
HANDLE hBuf; // Handle to output buiffer
|
|
LPB pOutputBuffer; // Pointer to output buffer
|
|
DWORD ibBuf; // Buffer index
|
|
WORD fFlag; // Various flag
|
|
WORD pad;
|
|
LPB lpbQueueStr [cbMAX_PATH];
|
|
BYTE aszTempName[_MAX_PATH]; // Temp sorted result name
|
|
} ESI,
|
|
FAR *LPESI;
|
|
|
|
|
|
// Information kept that pertains directly to "tfc" term-weighting.
|
|
|
|
typedef float SIGMA;
|
|
typedef SIGMA HUGE *HPSIGMA;
|
|
typedef SIGMA HUGE *HRGSIGMA;
|
|
|
|
typedef DWORD LISIGMA;
|
|
|
|
#define LASTWORD_SIZE 1024 // Size of last word buffer in each node
|
|
|
|
typedef struct BTREEDATA
|
|
{
|
|
// Array of tree blocks
|
|
PNODEINFO rgpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
|
|
PNODEINFO rgpTmpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
|
|
|
|
FILEOFFSET OffsetPointer; // File offset of the last nodes
|
|
// pointer to the next node (for traversal)
|
|
IH20 Header;
|
|
DWORD NID; // Number of nodes allocated
|
|
|
|
FLOAT rLogN; // Used for term-weighting
|
|
FLOAT FAR *lrgrLog; // This will be an array of numbers that
|
|
// contains a common weighting sub-expression
|
|
BYTE argbLog[cLOG_MAX]; // An array of 8-bit flags. If one of
|
|
// these is non-zero the corresponding
|
|
// value in lrgrLog is valid
|
|
|
|
BYTE fOccfLength; // Word Length field flag
|
|
BYTE padding[3]; // Maintain DWORD alignment
|
|
|
|
} BTREEDATA, FAR *PBTREEDATA;
|
|
#define lisigmaMAX ((LISIGMA)524288L) // This value is arbitrary
|
|
// but should not be allowed
|
|
// to grow, if possible.
|
|
|
|
typedef struct WeightInfo
|
|
{
|
|
HRGSIGMA hrgsigma; // Pointer to array of sigma elements.
|
|
HANDLE hSigma; // Handle to "hrgsigma".
|
|
FLOAT FAR *lrgrLog; // Array of LOG values to speed up processing
|
|
HANDLE hLog; // Handle to "
|
|
} WI;
|
|
|
|
|
|
typedef struct BLKCOMBO
|
|
{
|
|
LPV pBlockMgr;
|
|
PLIST pFreeList;
|
|
DWORD dwCount;
|
|
} BLKCOMBO, FAR *PBLKCOMBO;
|
|
|
|
typedef struct
|
|
{
|
|
DWORD dwPhase; // Current indexing phase
|
|
// 1: Collection phase
|
|
// 2: Sort and coalate phase
|
|
// 3: Permament index building phase
|
|
DWORD dwIndex; // Completion index
|
|
} CALLBACKINFO, FAR *PCALLBACKINFO;
|
|
// - - - - - - - - -
|
|
|
|
// Nerve information about the indexing process. Most memory allocated
|
|
// and files created are in some way attached to one of these.
|
|
|
|
typedef struct IndexParamBlock
|
|
{
|
|
HANDLE hStruct; // This structure's handle. MUST BE 1ST
|
|
DWORD dwKey; // Key for callback
|
|
FCALLBACK_MSG CallbackInfo; // User callback info
|
|
|
|
//
|
|
// Miscellaneous.
|
|
//
|
|
WI wi; // Term-weighting information.
|
|
FILEOFFSET foMaxOffset; // Maximum offset of the file (file size)
|
|
|
|
// Useful information to be used
|
|
DWORD lcTopics; // The number of unique documents
|
|
DWORD dwMaxTopicId; // Use to hold compare value for lcTopics
|
|
DWORD dwMemAllowed; // Size of memory allocated for index
|
|
DWORD dwMaxRecordSize; // Maximum record size in collecting word
|
|
DWORD dwMaxEsbRecSize; // Current ESB maximum record size
|
|
DWORD dwMaxWLen; // Maximum word's length value
|
|
DWORD dwLastIndexedTopic; // For word collection
|
|
HFREELIST hFreeList; // Handle to the Index FreeList
|
|
//
|
|
// Callbacks.
|
|
//
|
|
FCOMPARE lpfnCompare; // Compare function for sort
|
|
LPV lpvSortParm; // Sort parameters
|
|
|
|
// Sort information.
|
|
//
|
|
ISI isi; // Internal sort information.
|
|
ESI esi; // External sort information.
|
|
|
|
|
|
LPV pDataBlock; // Block manager for string
|
|
BLKCOMBO BTNodeBlock; // Block manager for btnode
|
|
BLKCOMBO TopicBlock; // Block manager for topic block
|
|
BLKCOMBO OccBlock; // Block manager for occurrence
|
|
PLIST pOccFreeList; // Free list of occurrence nodes
|
|
|
|
BTREEDATA BTreeData; // BTree data info
|
|
|
|
// Input/output file
|
|
FILEDATA InFile; // File info for input file
|
|
FILEDATA OutFile; // File info for output file
|
|
|
|
PNODEINFO pIndexDataNode;
|
|
|
|
// Various buffer used for update
|
|
HANDLE hTmpBuf; // Temp buf for word record
|
|
LPB pTmpBuf;
|
|
LPB pWord; // Pointer to word record
|
|
HFPB hfpbIdxFile;
|
|
|
|
HANDLE hData;
|
|
LPB pDataBuffer; // Buffer for new data
|
|
DWORD dwDataSize; // Size of the buffer data
|
|
|
|
DWORD BitCount[7][33]; // Array to hold the bit count for bit
|
|
// compression scheme.
|
|
// [0] = TopicID, [1] = OccCount, [2]-[6] = Occs
|
|
|
|
// Statistics informations
|
|
DWORD dwIndexedWord; // Total of indexed words (statistics)
|
|
DWORD dwUniqueWord; // How many unique words indexed (statistics)
|
|
DWORD dwByteCount; // How many bytes indexed (statistics)
|
|
DWORD dwOccOffbits; // How many bits for offset (statistics)
|
|
DWORD dwOccExtbits; // How many bits for extent (statistics)
|
|
DWORD dwMaxFieldId; // Maximum field value
|
|
DWORD dwMaxWCount; // Maximum word count value
|
|
DWORD dwMaxOffset; // Maximum offset value
|
|
DWORD dwTotal3bWordLen; // Total length of all words > 2 bytes
|
|
DWORD dwTotal2bWordLen; // Total length of all words <= 2 bytes
|
|
DWORD dwTotalUniqueWordLen; // Total length of all unique words
|
|
|
|
CKEY cKey[5]; // Compression keys (2-bytes * 5)
|
|
|
|
// BYTE ucNumOccFields; // The number of bits set in "occf".
|
|
WORD idxf; // Index characteristic flags.
|
|
WORD occf; // A flag byte that keeps track of
|
|
// which occurence element fields
|
|
// should be indexed.
|
|
BYTE ucNumOccDataFields; // The number of bits set that are saved in OCCDATA
|
|
BYTE fOccComp; // Set to 1 if Occurrences need to be sorted
|
|
// in collect2.(They are added out of order)
|
|
BYTE cMaxLevel;
|
|
BYTE bState;
|
|
BYTE szEsiTemp[cbMAX_PATH]; // Temp ESI
|
|
} IPB,
|
|
FAR *_LPIPB;
|
|
|
|
|
|
// bState values
|
|
|
|
#define INDEXING_STATE 0 // We are doing indexing
|
|
#define UPDATING_STATE 1 // We are updating the index
|
|
#define DELETING_STATE 2 // We are deleting data from teh index
|
|
|
|
// - - - - - - - - -
|
|
|
|
// These defines indicate how many bits per word occurence list are
|
|
// wasted through the adoption of either the "fixed", "high bit
|
|
// replacement" or "bitstream" compression schemes. This wasted space
|
|
// is wasted through the insertion of one or more flag bits into the
|
|
// data-stream.
|
|
|
|
#define cbitWASTED_FIXED (1 + CBIT_WIDTH_BITS)
|
|
// If the first bit is set, it means that the
|
|
// "fixed" scheme was adopted, so the total
|
|
// number of bits that was necessary to
|
|
// indicate this was one. More bits are
|
|
// used to store the "width" value that is
|
|
// associated with this scheme. This has
|
|
// been the most commonly used compression
|
|
// scheme in practice.
|
|
#define cbitWASTED_BELL (2 + CBIT_WIDTH_BITS)
|
|
// If the first bit wasn't set, and the second
|
|
// one was, it indicates that the "bell"
|
|
// scheme was used. The total wasted to
|
|
// indicated this scheme was two bits, plus
|
|
// the "width" value (the "center")
|
|
// associated with this scheme.
|
|
#define cbitWASTED_BITSTREAM (2)
|
|
// If neither the first bit nor the second bit
|
|
// were set, the bitstream scheme was used.
|
|
// The total wasted space was also two bits,
|
|
// the same as for the "bell" scheme. This
|
|
// has been the least-used scheme in
|
|
// practice.
|
|
|
|
#define lcbitBITSTREAM_ILLEGAL ((DWORD)-1L)
|
|
// This value indicates that the function
|
|
// is not allowed to select the "bitstream"
|
|
// compression scheme.
|
|
|
|
#define cbitCENTER_MAX ((CBIT)33)
|
|
// Legal "center" values are 0..32. This is
|
|
// weird because you'd expect it to be
|
|
// 0..31 but it's not.
|
|
|
|
// - - - - - - - - -
|
|
|
|
// This structure is used in the occurence-list building phase of
|
|
// indexing. The structure includes information local to a single
|
|
// occurence list.
|
|
|
|
typedef struct OccurenceListInfo
|
|
{
|
|
DWORD lcSublists; // The number of sub-lists in this
|
|
// occurence list.
|
|
CKEY ckey; // The manner in which doc-ID deltas
|
|
// are compressed in this list.
|
|
} OLI,
|
|
FAR *LPOLI;
|
|
|
|
typedef struct MergeParams
|
|
{
|
|
DWORD FAR *rgTopicId;
|
|
DWORD dwCount;
|
|
DWORD FAR *lpTopicIdLast; // internal use, last position saved
|
|
} MERGEPARAMS, FAR *LPMERGEPARAMS;
|
|
// - - - - - - - - -
|
|
|
|
// Convert occurence list file to a final index file.
|
|
|
|
/*******************************************************************
|
|
* *
|
|
* FUNCTIONS PROTOTYPES *
|
|
* *
|
|
*******************************************************************/
|
|
|
|
/*********************************************************************
|
|
* *
|
|
* SORT FUNCTIONS (SORT.C) *
|
|
* *
|
|
*********************************************************************/
|
|
|
|
PUBLIC ERR PASCAL FAR HugeDataSort(LPV HUGE *, DWORD, FCOMPARE, LPV,
|
|
INTERRUPT_FUNC, LPV);
|
|
PUBLIC VOID PASCAL FAR HugeInsertionSort (LPV HUGE *, DWORD, FCOMPARE, LPV);
|
|
PUBLIC ERR PASCAL FAR PriorityQueueRemove (LPESI, FCOMPARE, LPV);
|
|
PUBLIC ERR PASCAL FAR PriorityQueueCreate (LPESI, FCOMPARE, LPV);
|
|
PUBLIC ERR PASCAL NEAR IndexSort (LPW, LPB, int);
|
|
PUBLIC ERR PASCAL NEAR IndexMergeSort (HFILE FAR *, LSZ, LPW, LPB, int, int);
|
|
|
|
/*********************************************************************
|
|
* *
|
|
* ENCODING FUNCTIONS (ENCODE.C) *
|
|
* *
|
|
*********************************************************************/
|
|
|
|
PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD);
|
|
PUBLIC VOID PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF);
|
|
PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD);
|
|
PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD);
|
|
PUBLIC void NEAR PASCAL VGetBestScheme(LPCKEY, LRGDW, DWORD, int);
|
|
PUBLIC CB PASCAL FAR CbBytePack(LPB, DWORD);
|
|
|
|
/*********************************************************************
|
|
* *
|
|
* INDEXING FUNCTIONS *
|
|
* *
|
|
*********************************************************************/
|
|
|
|
PUBLIC VOID PASCAL FAR FreeISI (LPIPB);
|
|
PUBLIC void NEAR PASCAL FreeEsi(LPIPB);
|
|
PUBLIC LCB FAR PASCAL LcbGetFreeMemory(LPERRB);
|
|
PUBLIC ERR FAR PASCAL SortFlushISI (_LPIPB);
|
|
PUBLIC int PASCAL FAR WordRecCompare(LPB, LPB, LPV);
|
|
PUBLIC ERR FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
|
|
PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int);
|
|
PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB);
|
|
ERR FAR PASCAL FlushTree(_LPIPB lpipb);
|
|
PUBLIC ERR FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR);
|
|
PUBLIC ERR FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE);
|
|
PUBLIC ERR PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ);
|
|
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
|
|
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode);
|
|
PUBLIC ERR PASCAL FAR ReadNewNode (HFPB, PNODEINFO, int);
|
|
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
|
|
PUBLIC ERR PASCAL FAR SkipOldData (_LPIPB, PNODEINFO);
|
|
PUBLIC ERR FAR PASCAL AllocSigmaTable (_LPIPB lpipb);
|