/*************************************************************************
*                                                                        *
*  IINDEX.H                                                              *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************/

/******************************************
 *  Internal sort stuff.
 ******************************************/

// Upper bound for a block: 0x80000 when _32BIT is defined, 0xFF00 otherwise.
#ifdef _32BIT
#define MAX_BLOCK_SIZE  (DWORD)0x80000
#else
#define MAX_BLOCK_SIZE  (DWORD)0x0000FF00
#endif

// Generic singly-linked list node; PLIST is a FAR pointer to one.
typedef struct _list
{
    struct _list FAR * pNext;
} FAR *PLIST;

//  -   -   -   -   -   -   -   -   -

//  Tree data types
//
//  The numbers in parentheses at the end of the field comments below are
//  the original author's per-field byte sizes.  They presume 4-byte
//  pointers and DWORDs.

// One occurrence record in a topic's occurrence chain.
typedef struct OCCDATA
{
    struct OCCDATA FAR *pNext;      // Linked-list chain
    DWORD OccData[1];               // Array of n-DWORD
} OCCDATA, FAR *POCCDATA;

// One topic in a word's topic chain, with its own chain of occurrences.
typedef struct TOPICDATA
{
    struct TOPICDATA FAR *pNext;    // Linked-list chain                 (4)
    DWORD dwOccCount;               // Count of occurrences in list      (4)
    DWORD dwTopicId;                // TopicId for this topic            (4)
    POCCDATA pOccData;              // First OccData in list             (4)
    POCCDATA pLastOccData;          // Last inserted OccData             (4)
} TOPICDATA,                        //                                  = 20
    FAR *PTOPICDATA;

// Per-word data stored in each tree node.
typedef struct STRDATA
{
    PTOPICDATA pTopic;              // First Topic in list               (4)
    PTOPICDATA pLastTopic;          // Last inserted Topic               (4)
    LPB   pText;                    // Sort word as a Pascal string      (4)
    DWORD dwField;                  // Field Id for the sort word        (4)
    DWORD dwTopicCount;             // Count of Topics in list           (4)
    DWORD dwWordLength;             // Word length (from OCC data)       (4)
} STRDATA,                          //                                  = 24
    FAR *PSTRDATA;

// Node of the balanced (red/black colored) sort tree.
typedef struct BTNODE
{
    enum TREECOLOR {RED, BLACK} color;  // Color of node - for balancing (4)
    struct BTNODE FAR *pParent;     // Pointer to parent node            (4)
    struct BTNODE FAR *pLeft;       // Pointer to left child node        (4)
    struct BTNODE FAR *pRight;      // Pointer to right child node       (4)
    STRDATA StringData;             // Embedded string data (a STRDATA
                                    // by value, not a pointer)         (24)
} BTNODE,                           // NOTE(review): the per-field sizes
                                    // listed above sum to 40; the
                                    // original annotation said 32.
    FAR *PBTNODE;

// Header describing the word record currently being merged.
typedef struct MERGEHEADER
{
    DWORD dwRecordSize;
    LPB   lpbWord;                  // Pascal string
    DWORD dwFieldId;                // Field Id
    DWORD dwWordLength;             // Real life word length
    DWORD dwStrLen;                 // Current string length
    DWORD dwTopicCount;             // Topic count
    DWORD dwLastTopicId;            // Last topic id
    PTOPICDATA pTopic;              // Pointer to first Topic in list
    PTOPICDATA pLastTopic;          // Last inserted Topic
    FILEOFFSET foTopicCount;        // Backpatching address
    LPB   pTopicCount;              // Pointer to topic count location
    BYTE  fEmitRecord;              // Flag to denote rec is emitted
    BYTE  Pad1;                     // Padding for DWORD alignment
} MERGEHEADER, FAR *PMERGEHEADER;

//  Typedefs for an external sort buffer.  Each of these has associated
//  with it a large (easily > 1meg) block of sorted words.  A few of
//  these words will end up in an internal buffer.  These external sort
//  buffers will be formed into a chain, one chain will have associated
//  with it in total all of the words that are going to be sorted.  A
//  merge will be performed on the words associated with the chain to
//  produce a final sorted list of words.

// State of the internal (in-memory tree) sort.
typedef struct InternalSortInfo
{
    HFPB    hfpb;                   // Handle to temp file
    PBTNODE pBalanceTree;           // Root node of the balanced tree
    FILEOFFSET lfo;                 // File offset
    FILEOFFSET lfoRecBackPatch;     // Backpatching record offset
    DWORD   dwRecLength;            // Record (data associated with 1 word) length
    HANDLE  hSortBuffer;            // Handle to sort buffer
    BYTE FAR *pSortBuffer;          // Memory buffer for file output
    BYTE FAR *pStartRec;            // Record start point in the buffer
    BYTE FAR *pCurPtr;              // Current insertion point in the buffer
    DWORD   dwMaxEsbRecSize;        // Maximum record size of current ESB
    BYTE    DeepLevel;              // Deepest level of the tree
    BYTE    Pad1;                   // Pad1..Pad3: explicit padding bytes
    BYTE    Pad2;
    BYTE    Pad3;
    BYTE    aszTempName[_MAX_PATH]; // Temp file for tree flush, ericjut: change from cbMAX_PATH to _MAX_PATH
} ISI, FAR *LPISI;

typedef HANDLE HESB;

// One external sort buffer; these are chained through lpesbNext.
typedef struct ExternalSortBuffer
{
    HANDLE hStruct;                 // This structure's handle. MUST BE 1ST!!
    struct ExternalSortBuffer FAR *lpesbNext;   // Next buffer in the list.
    FILEOFFSET lfo;                 // This starts out as an offset in the
                                    // temp file at which the first word
                                    // associated with this buffer will
                                    // be found.  As words are disposed
                                    // of it will increment.
    FILEOFFSET lfoMax;              // This is the offset of the end of
                                    // the area of the temp file that
                                    // contains words for this external
                                    // sort buffer.
    DWORD  dwEsbSize;               // Actual size of the internal buffer.
    DWORD  ibBuf;                   // Position of the current record in
                                    // the internal buffer (held as a DWORD).
    HANDLE hMem;                    // Handle to buffered block.
    LRGB   lrgbMem;                 // Pointer to buffered block.
} ESB, FAR *LPESB;

//  -   -   -   -   -   -   -   -   -

//  Information about the external sort process as a whole.

typedef struct ExternalSortInfo
{
    FILEOFFSET lfoTempOffset;       // Current size of the output file
    HFPB    hfpb;                   // Handle to output file
    LPFBI   lpfbiTemp;              // Temp file buffer
    DWORD   cesb;                   // Number of ESB blocks allocated
    LPESB   lpesbRoot;              // First buffer in the external-buffer linked-list
    DWORD   cbEsbBuf;               // The size of each ESB buffer.
    DWORD   uiQueueSize;            // Priority queue's size
    GHANDLE hPriorityQueue;         // Handle to Priority Queue
    LPESB FAR *lrgPriorityQueue;    // Priority Queue

    // Output buffer handling
    HANDLE  hBuf;                   // Handle to output buffer
    LPB     pOutputBuffer;          // Pointer to output buffer
    DWORD   ibBuf;                  // Buffer index
    WORD    fFlag;                  // Various flags
    WORD    pad;
    LPB     lpbQueueStr [cbMAX_PATH];   // NOTE(review): this is an array of
                                        // cbMAX_PATH LPB elements, not a
                                        // cbMAX_PATH-byte buffer - confirm
                                        // that is the intended size.
    BYTE    aszTempName[_MAX_PATH]; // Temp sorted result name
} ESI, FAR *LPESI;

//  Information kept that pertains directly to "tfc" term-weighting.

typedef float SIGMA;
typedef SIGMA HUGE *HPSIGMA;
typedef SIGMA HUGE *HRGSIGMA;
typedef DWORD LISIGMA;

// Size of last word buffer in each node
#define LASTWORD_SIZE   1024

// B-tree construction state, plus the term-weighting log cache.
typedef struct BTREEDATA
{
    // Array of tree blocks
    PNODEINFO rgpNodeInfo[MAX_TREE_HEIGHT];     // Array of tree nodes
    PNODEINFO rgpTmpNodeInfo[MAX_TREE_HEIGHT];  // Array of tree nodes
    FILEOFFSET OffsetPointer;       // File offset of the last nodes
                                    // pointer to the next node (for traversal)
    IH20    Header;
    DWORD   NID;                    // Number of nodes allocated
    FLOAT   rLogN;                  // Used for term-weighting
    FLOAT FAR *lrgrLog;             // This will be an array of numbers that
                                    // contains a common weighting sub-expression
    BYTE    argbLog[cLOG_MAX];      // An array of 8-bit flags.  If one of
                                    // these is non-zero the corresponding
                                    // value in lrgrLog is valid
    BYTE    fOccfLength;            // Word Length field flag
    BYTE    padding[3];             // Maintain DWORD alignment
} BTREEDATA, FAR *PBTREEDATA;

// This value is arbitrary but should not be allowed to grow, if possible.
#define lisigmaMAX  ((LISIGMA)524288L)

// Term-weighting tables and the handles they were allocated under.
typedef struct WeightInfo
{
    HRGSIGMA hrgsigma;              // Pointer to array of sigma elements.
    HANDLE  hSigma;                 // Handle to "hrgsigma".
    FLOAT FAR *lrgrLog;             // Array of LOG values to speed up processing
    HANDLE  hLog;                   // Handle to "lrgrLog" (presumably - the
                                    // original comment was cut off after
                                    // the opening quote; confirm)
} WI;

// A block manager paired with a free list and a count.
typedef struct BLKCOMBO
{
    LPV   pBlockMgr;
    PLIST pFreeList;
    DWORD dwCount;
} BLKCOMBO, FAR *PBLKCOMBO;

// Progress information for the callback.
typedef struct
{
    DWORD dwPhase;                  // Current indexing phase
                                    //   1: Collection phase
                                    //   2: Sort and collate phase
                                    //   3: Permanent index building phase
    DWORD dwIndex;                  // Completion index
} CALLBACKINFO, FAR *PCALLBACKINFO;

//  -   -   -   -   -   -   -   -   -

//  Nerve information about the indexing process.  Most memory allocated
//  and files created are in some way attached to one of these.

typedef struct IndexParamBlock
{
    HANDLE  hStruct;                // This structure's handle. MUST BE 1ST
    DWORD   dwKey;                  // Key for callback
    FCALLBACK_MSG CallbackInfo;     // User callback info

    //
    //  Miscellaneous.
    //
    WI      wi;                     // Term-weighting information.
    FILEOFFSET foMaxOffset;         // Maximum offset of the file (file size)

    // Useful information to be used
    DWORD   lcTopics;               // The number of unique documents
    DWORD   dwMaxTopicId;           // Used to hold compare value for lcTopics
    DWORD   dwMemAllowed;           // Size of memory allocated for index
    DWORD   dwMaxRecordSize;        // Maximum record size in collecting word
    DWORD   dwMaxEsbRecSize;        // Current ESB maximum record size
    DWORD   dwMaxWLen;              // Maximum word's length value
    DWORD   dwLastIndexedTopic;     // For word collection
    HFREELIST hFreeList;            // Handle to the Index FreeList

    //
    //  Callbacks.
    //
    FCOMPARE lpfnCompare;           // Compare function for sort
    LPV     lpvSortParm;            // Sort parameters

    //  Sort information.
    //
    ISI     isi;                    // Internal sort information.
    ESI     esi;                    // External sort information.

    LPV      pDataBlock;            // Block manager for string
    BLKCOMBO BTNodeBlock;           // Block manager for btnode
    BLKCOMBO TopicBlock;            // Block manager for topic block
    BLKCOMBO OccBlock;              // Block manager for occurrence
    PLIST    pOccFreeList;          // Free list of occurrence nodes
    BTREEDATA BTreeData;            // BTree data info

    // Input/output file
    FILEDATA InFile;                // File info for input file
    FILEDATA OutFile;               // File info for output file
    PNODEINFO pIndexDataNode;

    // Various buffers used for update
    HANDLE  hTmpBuf;                // Temp buf for word record
    LPB     pTmpBuf;
    LPB     pWord;                  // Pointer to word record
    HFPB    hfpbIdxFile;
    HANDLE  hData;
    LPB     pDataBuffer;            // Buffer for new data
    DWORD   dwDataSize;             // Size of the buffer data

    DWORD   BitCount[7][33];        // Array to hold the bit count for bit
                                    // compression scheme.
                                    // [0] = TopicID, [1] = OccCount, [2]-[6] = Occs

    // Statistics information
    DWORD   dwIndexedWord;          // Total of indexed words (statistics)
    DWORD   dwUniqueWord;           // How many unique words indexed (statistics)
    DWORD   dwByteCount;            // How many bytes indexed (statistics)
    DWORD   dwOccOffbits;           // How many bits for offset (statistics)
    DWORD   dwOccExtbits;           // How many bits for extent (statistics)
    DWORD   dwMaxFieldId;           // Maximum field value
    DWORD   dwMaxWCount;            // Maximum word count value
    DWORD   dwMaxOffset;            // Maximum offset value
    DWORD   dwTotal3bWordLen;       // Total length of all words > 2 bytes
    DWORD   dwTotal2bWordLen;       // Total length of all words <= 2 bytes
    DWORD   dwTotalUniqueWordLen;   // Total length of all unique words

    CKEY    cKey[5];                // Compression keys (2-bytes * 5)
    // NOTE(review): the next line reads as a commented-out (disabled) field;
    // confirm against version history before relying on this layout.
    // BYTE ucNumOccFields;         // The number of bits set in "occf".

    WORD    idxf;                   // Index characteristic flags.
    WORD    occf;                   // Flags that keep track of
                                    // which occurrence element fields
                                    // should be indexed.
    BYTE    ucNumOccDataFields;     // The number of bits set that are saved in OCCDATA
    BYTE    fOccComp;               // Set to 1 if Occurrences need to be sorted
                                    // in collect2. (They are added out of order)
    BYTE    cMaxLevel;
    BYTE    bState;                 // One of the *_STATE values defined below
    BYTE    szEsiTemp[cbMAX_PATH];  // Temp ESI
} IPB, FAR *_LPIPB;

// bState values
#define INDEXING_STATE  0   // We are doing indexing
#define UPDATING_STATE  1   // We are updating the index
#define DELETING_STATE  2   // We are deleting data from the index

//  -   -   -   -   -   -   -   -   -

//  These defines indicate how many bits per word occurrence list are
//  wasted through the adoption of either the "fixed", "high bit
//  replacement" or "bitstream" compression schemes.  This wasted space
//  is wasted through the insertion of one or more flag bits into the
//  data-stream.

// If the first bit is set, it means that the
// "fixed" scheme was adopted, so the total
// number of bits that was necessary to
// indicate this was one.  More bits are
// used to store the "width" value that is
// associated with this scheme.  This has
// been the most commonly used compression
// scheme in practice.
#define cbitWASTED_FIXED        (1 + CBIT_WIDTH_BITS)

// If the first bit wasn't set, and the second
// one was, it indicates that the "bell"
// scheme was used.  The total wasted to
// indicate this scheme was two bits, plus
// the "width" value (the "center")
// associated with this scheme.
#define cbitWASTED_BELL         (2 + CBIT_WIDTH_BITS)

// If neither the first bit nor the second bit
// were set, the bitstream scheme was used.
// The total wasted space was also two bits,
// the same as for the "bell" scheme.  This
// has been the least-used scheme in
// practice.
#define cbitWASTED_BITSTREAM    (2)

// This value indicates that the function
// is not allowed to select the "bitstream"
// compression scheme.
#define lcbitBITSTREAM_ILLEGAL  ((DWORD)-1L)

// Legal "center" values are 0..32.  This is
// weird because you'd expect it to be
// 0..31 but it's not.
#define cbitCENTER_MAX          ((CBIT)33)

// - - - - - - - - - // This structure is used in the occurence-list building phase of // indexing. The structure includes information local to a single // occurence list. typedef struct OccurenceListInfo { DWORD lcSublists; // The number of sub-lists in this // occurence list. CKEY ckey; // The manner in which doc-ID deltas // are compressed in this list. } OLI, FAR *LPOLI; typedef struct MergeParams { DWORD FAR *rgTopicId; DWORD dwCount; DWORD FAR *lpTopicIdLast; // internal use, last position saved } MERGEPARAMS, FAR *LPMERGEPARAMS; // - - - - - - - - - // Convert occurence list file to a final index file. /******************************************************************* * * * FUNCTIONS PROTOTYPES * * * *******************************************************************/ /********************************************************************* * * * SORT FUNCTIONS (SORT.C) * * * *********************************************************************/ PUBLIC ERR PASCAL FAR HugeDataSort(LPV HUGE *, DWORD, FCOMPARE, LPV, INTERRUPT_FUNC, LPV); PUBLIC VOID PASCAL FAR HugeInsertionSort (LPV HUGE *, DWORD, FCOMPARE, LPV); PUBLIC ERR PASCAL FAR PriorityQueueRemove (LPESI, FCOMPARE, LPV); PUBLIC ERR PASCAL FAR PriorityQueueCreate (LPESI, FCOMPARE, LPV); PUBLIC ERR PASCAL NEAR IndexSort (LPW, LPB, int); PUBLIC ERR PASCAL NEAR IndexMergeSort (HFILE FAR *, LSZ, LPW, LPB, int, int); /********************************************************************* * * * ENCODING FUNCTIONS (ENCODE.C) * * * *********************************************************************/ PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD); PUBLIC VOID PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF); PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD); PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD); PUBLIC void NEAR PASCAL VGetBestScheme(LPCKEY, LRGDW, DWORD, int); PUBLIC CB PASCAL FAR CbBytePack(LPB, DWORD); /********************************************************************* * * * INDEXING 
FUNCTIONS * * * *********************************************************************/ PUBLIC VOID PASCAL FAR FreeISI (LPIPB); PUBLIC void NEAR PASCAL FreeEsi(LPIPB); PUBLIC LCB FAR PASCAL LcbGetFreeMemory(LPERRB); PUBLIC ERR FAR PASCAL SortFlushISI (_LPIPB); PUBLIC int PASCAL FAR WordRecCompare(LPB, LPB, LPV); PUBLIC ERR FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS); PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int); PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB); ERR FAR PASCAL FlushTree(_LPIPB lpipb); PUBLIC ERR FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR); PUBLIC ERR FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE); PUBLIC ERR PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ); PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb); PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode); PUBLIC ERR PASCAL FAR ReadNewNode (HFPB, PNODEINFO, int); PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb); PUBLIC ERR PASCAL FAR SkipOldData (_LPIPB, PNODEINFO); PUBLIC ERR FAR PASCAL AllocSigmaTable (_LPIPB lpipb);