windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/index.h

454 lines
19 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
/*************************************************************************
* *
* IINDEX.H *
* *
* Copyright (C) Microsoft Corporation 1990-1994 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Current Owner: BinhN *
* *
**************************************************************************/
/******************************************
* Internal sort stuff.
******************************************/
// Maximum block size for the internal sort code (see the section banner
// above). The value depends on whether _32BIT is defined.
//
// The whole expansion is parenthesized, like the other cast-style
// constants in this header (lisigmaMAX, lcbitBITSTREAM_ILLEGAL,
// cbitCENTER_MAX), so the macro always acts as a single operand -- for
// example "sizeof MAX_BLOCK_SIZE" now parses as intended.
#ifdef _32BIT
#define MAX_BLOCK_SIZE ((DWORD)0x80000)
#else
#define MAX_BLOCK_SIZE ((DWORD)0x0000FF00)
#endif
// Minimal singly-linked list link. PLIST is a FAR pointer to one link.
struct _list
{
    struct _list FAR *pNext;    // Following link in the chain
};
typedef struct _list FAR *PLIST;
// - - - - - - - - -
// Tree data types
// One occurrence record, chained into a linked list (TOPICDATA points at
// the first and last records of such a list).
struct OCCDATA
{
    struct OCCDATA FAR *pNext;  // Next record in the linked list
    DWORD OccData[1];           // Array of n DWORDs (declared with one element)
};
typedef struct OCCDATA OCCDATA;
typedef struct OCCDATA FAR *POCCDATA;
// A topic entry together with its list of occurrences. The numbers in
// parentheses are the original author's per-field byte tallies.
struct TOPICDATA
{
    struct TOPICDATA FAR *pNext;    // Next topic in the chain              (4)
    DWORD    dwOccCount;            // Number of occurrences in the list    (4)
    DWORD    dwTopicId;             // Id of this topic                     (4)
    POCCDATA pOccData;              // Head of the OccData list             (4)
    POCCDATA pLastOccData;          // Most recently inserted OccData       (4)
};                                  // total                              = 20
typedef struct TOPICDATA TOPICDATA;
typedef struct TOPICDATA FAR *PTOPICDATA;
// Per-word data: the sort word itself plus the list of topics it occurs
// in. The numbers in parentheses are the original author's byte tallies.
struct STRDATA
{
    PTOPICDATA pTopic;          // Head of the topic list                   (4)
    PTOPICDATA pLastTopic;      // Most recently inserted topic             (4)
    LPB        pText;           // The sort word, stored as a Pascal string (4)
    DWORD      dwField;         // Field id of the sort word                (4)
    DWORD      dwTopicCount;    // Number of topics in the list             (4)
    DWORD      dwWordLength;    // Word length (taken from the OCC data)    (4)
};                              // total                                  = 24
typedef struct STRDATA STRDATA;
typedef struct STRDATA FAR *PSTRDATA;
// Node of the binary tree of sort words. Each node carries a RED/BLACK
// color used for balancing, links to its parent and children, and the
// word's STRDATA stored by value. The numbers in parentheses are
// per-field byte tallies in the style of the original comments.
struct BTNODE
{
    enum TREECOLOR {RED, BLACK} color;  // Node color, used for balancing   (4)
    struct BTNODE FAR *pParent;         // Parent node                      (4)
    struct BTNODE FAR *pLeft;           // Left child                       (4)
    struct BTNODE FAR *pRight;          // Right child                      (4)
    STRDATA StringData;                 // Embedded string data, by value  (24)
};  // The tallies above sum to 40; the original comment said 32.
typedef struct BTNODE BTNODE;
typedef struct BTNODE FAR *PBTNODE;
// Header describing the word record currently being merged: the word,
// its topic list and the places where the topic count gets backpatched.
struct MERGEHEADER
{
    DWORD      dwRecordSize;
    LPB        lpbWord;         // The word, as a Pascal string
    DWORD      dwFieldId;       // Field id
    DWORD      dwWordLength;    // Real-life word length
    DWORD      dwStrLen;        // Length of the current string
    DWORD      dwTopicCount;    // Number of topics
    DWORD      dwLastTopicId;   // Id of the last topic
    PTOPICDATA pTopic;          // Head of the topic list
    PTOPICDATA pLastTopic;      // Most recently inserted topic
    FILEOFFSET foTopicCount;    // Address used for backpatching
    LPB        pTopicCount;     // Location of the topic count
    BYTE       fEmitRecord;     // Set when the record has been emitted
    BYTE       Pad1;            // Padding (original note: for DWORD alignment)
};
typedef struct MERGEHEADER MERGEHEADER;
typedef struct MERGEHEADER FAR *PMERGEHEADER;
// Typedefs for an external sort buffer. Each of these has associated
// with it a large (easily > 1meg) block of sorted words. A few of
// these words will end up in an internal buffer. These external sort
// buffers will be formed into a chain, one chain will have associated
// with it in total all of the words that are going to be sorted. A
// merge will be performed on the words associated with the chain to
// produce a final sorted list of words.
// State of the internal sort: the balanced tree, the temp file it is
// flushed to, and the buffer used for file output.
struct InternalSortInfo
{
    HFPB       hfpb;                // Temp file handle
    PBTNODE    pBalanceTree;        // Root of the balanced tree
    FILEOFFSET lfo;                 // File offset
    FILEOFFSET lfoRecBackPatch;     // Record offset kept for backpatching
    DWORD      dwRecLength;         // Length of a record (data tied to 1 word)
    HANDLE     hSortBuffer;         // Handle of the sort buffer
    BYTE FAR  *pSortBuffer;         // Memory buffer for file output
    BYTE FAR  *pStartRec;           // Where the record starts in the buffer
    BYTE FAR  *pCurPtr;             // Current insertion point in the buffer
    DWORD      dwMaxEsbRecSize;     // Largest record size of the current ESB
    BYTE       DeepLevel;           // Deepest level of the tree
    BYTE       Pad1;                // Pad1..Pad3: padding bytes
    BYTE       Pad2;
    BYTE       Pad3;
    BYTE       aszTempName[_MAX_PATH];  // Temp file for the tree flush
                                        // (ericjut: changed from cbMAX_PATH
                                        // to _MAX_PATH)
};
typedef struct InternalSortInfo ISI;
typedef struct InternalSortInfo FAR *LPISI;
// Handle type for an external sort buffer.
typedef HANDLE HESB;

// One external sort buffer: a range [lfo, lfoMax) of the temp file plus
// the in-memory block that buffers part of it. Buffers are chained
// through lpesbNext.
struct ExternalSortBuffer
{
    HANDLE hStruct;     // Handle of this structure. MUST BE THE FIRST FIELD!
    struct ExternalSortBuffer FAR *lpesbNext;   // Next buffer in the list
    FILEOFFSET lfo;     // Initially the temp-file offset of the first word
                        // belonging to this buffer; it is incremented as
                        // words are disposed of.
    FILEOFFSET lfoMax;  // Offset of the end of the temp-file area that holds
                        // the words for this external sort buffer.
    DWORD  dwEsbSize;   // Actual size of the internal buffer
    DWORD  ibBuf;       // Position of the current record in the internal
                        // buffer
    HANDLE hMem;        // Handle of the buffered block
    LRGB   lrgbMem;     // Pointer to the buffered block
};
typedef struct ExternalSortBuffer ESB;
typedef struct ExternalSortBuffer FAR *LPESB;
// - - - - - - - - -
// Information about the external sort process as a whole: the output
// file, the list of ESBs, the priority queue and the output buffer.
struct ExternalSortInfo
{
    FILEOFFSET lfoTempOffset;       // Current size of the output file
    HFPB       hfpb;                // Output file handle
    LPFBI      lpfbiTemp;           // Temp file buffer
    DWORD      cesb;                // Count of ESB blocks allocated
    LPESB      lpesbRoot;           // Head of the external-buffer linked list
    DWORD      cbEsbBuf;            // Size of each ESB buffer
    DWORD      uiQueueSize;         // Size of the priority queue
    GHANDLE    hPriorityQueue;      // Handle of the priority queue
    LPESB FAR *lrgPriorityQueue;    // The priority queue itself

    // Output buffer handling
    HANDLE     hBuf;                // Handle of the output buffer
    LPB        pOutputBuffer;       // Pointer to the output buffer
    DWORD      ibBuf;               // Index into the buffer
    WORD       fFlag;               // Various flags
    WORD       pad;
    LPB        lpbQueueStr [cbMAX_PATH];    // NOTE(review): an array of
                                            // cbMAX_PATH pointers; its use
                                            // is not documented here
    BYTE       aszTempName[_MAX_PATH];      // Name of the temp sorted result
};
typedef struct ExternalSortInfo ESI;
typedef struct ExternalSortInfo FAR *LPESI;
// Information kept that pertains directly to "tfc" term-weighting.
// A sigma value is a float; the two pointer typedefs below name the same
// type (HUGE pointer to SIGMA).
typedef float SIGMA;
typedef SIGMA HUGE *HPSIGMA; // HUGE pointer to a SIGMA
typedef SIGMA HUGE *HRGSIGMA; // HUGE pointer to SIGMA; WI.hrgsigma uses it for an array of sigma elements
typedef DWORD LISIGMA; // DWORD value; presumably an index/count into a sigma array -- TODO confirm
#define LASTWORD_SIZE 1024 // Size of last word buffer in each node
// State kept while working with the B-tree: the per-level node arrays,
// the index header and the cached values used for term-weighting.
struct BTREEDATA
{
    // Arrays of tree blocks, MAX_TREE_HEIGHT entries each
    PNODEINFO  rgpNodeInfo[MAX_TREE_HEIGHT];    // Array of tree nodes
    PNODEINFO  rgpTmpNodeInfo[MAX_TREE_HEIGHT]; // Array of tree nodes
    FILEOFFSET OffsetPointer;   // File offset of the last node's pointer to
                                // the next node (for traversal)
    IH20       Header;
    DWORD      NID;             // Number of nodes allocated
    FLOAT      rLogN;           // Used for term-weighting
    FLOAT FAR *lrgrLog;         // Array of numbers holding a common
                                // weighting sub-expression
    BYTE       argbLog[cLOG_MAX];   // Array of 8-bit flags: a non-zero entry
                                    // means the corresponding value in
                                    // lrgrLog is valid
    BYTE       fOccfLength;     // Word-length field flag
    BYTE       padding[3];      // Keeps DWORD alignment
};
typedef struct BTREEDATA BTREEDATA;
typedef struct BTREEDATA FAR *PBTREEDATA;
// Upper limit expressed as a LISIGMA. The value is arbitrary, but it
// should not be allowed to grow, if possible.
#define lisigmaMAX ((LISIGMA)524288L)

// Term-weighting storage: the sigma array and the table of LOG values,
// each paired with the handle of its memory.
struct WeightInfo
{
    HRGSIGMA   hrgsigma;    // Pointer to the array of sigma elements
    HANDLE     hSigma;      // Handle of "hrgsigma"
    FLOAT FAR *lrgrLog;     // Array of LOG values that speeds up processing
    HANDLE     hLog;        // Presumably the handle of "lrgrLog" (the
                            // original comment was truncated) -- confirm
};
typedef struct WeightInfo WI;
// Groups a block manager pointer with a free list and a count. IPB keeps
// one of these each for btnodes, topic blocks and occurrences.
struct BLKCOMBO
{
    LPV   pBlockMgr;    // Block manager (opaque pointer)
    PLIST pFreeList;    // Free list
    DWORD dwCount;      // Count (what is counted is not stated here)
};
typedef struct BLKCOMBO BLKCOMBO;
typedef struct BLKCOMBO FAR *PBLKCOMBO;
// Progress information: the current phase and a completion index.
typedef struct
{
    DWORD dwPhase;  // Current indexing phase:
                    //   1: Collection phase
                    //   2: Sort and collate phase
                    //   3: Permanent index building phase
    DWORD dwIndex;  // Completion index
} CALLBACKINFO;
typedef CALLBACKINFO FAR *PCALLBACKINFO;
// - - - - - - - - -
// Central information about the indexing process. Most of the memory
// allocated and most of the files created are attached, in some way, to
// one of these.
struct IndexParamBlock
{
    HANDLE        hStruct;          // Handle of this structure. MUST BE 1ST
    DWORD         dwKey;            // Key for the callback
    FCALLBACK_MSG CallbackInfo;     // User callback info

    //
    // Miscellaneous.
    //
    WI         wi;                  // Term-weighting information
    FILEOFFSET foMaxOffset;         // Maximum offset of the file (file size)

    // Useful information to be used
    DWORD      lcTopics;            // Number of unique documents
    DWORD      dwMaxTopicId;        // Holds the compare value for lcTopics
    DWORD      dwMemAllowed;        // Size of memory allocated for the index
    DWORD      dwMaxRecordSize;     // Largest record size while collecting words
    DWORD      dwMaxEsbRecSize;     // Largest record size of the current ESB
    DWORD      dwMaxWLen;           // Largest word-length value
    DWORD      dwLastIndexedTopic;  // For word collection
    HFREELIST  hFreeList;           // Handle of the index FreeList

    //
    // Callbacks.
    //
    FCOMPARE   lpfnCompare;         // Comparison function for the sort
    LPV        lpvSortParm;         // Sort parameters

    //
    // Sort information.
    //
    ISI        isi;                 // Internal sort information
    ESI        esi;                 // External sort information
    LPV        pDataBlock;          // Block manager for strings
    BLKCOMBO   BTNodeBlock;         // Block manager for btnodes
    BLKCOMBO   TopicBlock;          // Block manager for topic blocks
    BLKCOMBO   OccBlock;            // Block manager for occurrences
    PLIST      pOccFreeList;        // Free list of occurrence nodes
    BTREEDATA  BTreeData;           // B-tree data info

    // Input/output files
    FILEDATA   InFile;              // File info for the input file
    FILEDATA   OutFile;             // File info for the output file
    PNODEINFO  pIndexDataNode;

    // Various buffers used for update
    HANDLE     hTmpBuf;             // Temp buffer for a word record
    LPB        pTmpBuf;
    LPB        pWord;               // Pointer to the word record
    HFPB       hfpbIdxFile;
    HANDLE     hData;
    LPB        pDataBuffer;         // Buffer for new data
    DWORD      dwDataSize;          // Size of the buffer data
    DWORD      BitCount[7][33];     // Bit counts for the bit compression
                                    // scheme:
                                    // [0] = TopicID, [1] = OccCount,
                                    // [2]-[6] = Occs

    // Statistics
    DWORD      dwIndexedWord;       // Total number of indexed words
    DWORD      dwUniqueWord;        // Number of unique words indexed
    DWORD      dwByteCount;         // Number of bytes indexed
    DWORD      dwOccOffbits;        // Number of bits for the offset
    DWORD      dwOccExtbits;        // Number of bits for the extent
    DWORD      dwMaxFieldId;        // Largest field value
    DWORD      dwMaxWCount;         // Largest word-count value
    DWORD      dwMaxOffset;         // Largest offset value
    DWORD      dwTotal3bWordLen;    // Total length of all words > 2 bytes
    DWORD      dwTotal2bWordLen;    // Total length of all words <= 2 bytes
    DWORD      dwTotalUniqueWordLen;    // Total length of all unique words
    CKEY       cKey[5];             // Compression keys (2-bytes * 5)
    // BYTE ucNumOccFields; // The number of bits set in "occf".
    WORD       idxf;                // Index characteristic flags
    WORD       occf;                // Flags tracking which occurrence element
                                    // fields should be indexed
    BYTE       ucNumOccDataFields;  // Number of set bits that are saved in
                                    // OCCDATA
    BYTE       fOccComp;            // 1 if occurrences need to be sorted in
                                    // collect2 (they are added out of order)
    BYTE       cMaxLevel;
    BYTE       bState;
    BYTE       szEsiTemp[cbMAX_PATH];   // Temp ESI
};
typedef struct IndexParamBlock IPB;
typedef struct IndexParamBlock FAR *_LPIPB;
// bState values (for the bState field of IPB)
#define INDEXING_STATE 0 // We are doing indexing
#define UPDATING_STATE 1 // We are updating the index
#define DELETING_STATE 2 // We are deleting data from the index
// - - - - - - - - -
// These defines indicate how many bits per word occurrence list are
// wasted through the adoption of either the "fixed", "high bit
// replacement" or "bitstream" compression schemes. This space
// is wasted through the insertion of one or more flag bits into the
// data-stream.
#define cbitWASTED_FIXED (1 + CBIT_WIDTH_BITS)
// If the first bit is set, it means that the
// "fixed" scheme was adopted, so the total
// number of bits that was necessary to
// indicate this was one. More bits are
// used to store the "width" value that is
// associated with this scheme. This has
// been the most commonly used compression
// scheme in practice.
#define cbitWASTED_BELL (2 + CBIT_WIDTH_BITS)
// If the first bit wasn't set, and the second
// one was, it indicates that the "bell"
// scheme was used. The total wasted to
// indicate this scheme was two bits, plus
// the "width" value (the "center")
// associated with this scheme.
#define cbitWASTED_BITSTREAM (2)
// If neither the first bit nor the second bit
// were set, the bitstream scheme was used.
// The total wasted space was also two bits,
// the same as for the "bell" scheme. This
// has been the least-used scheme in
// practice.
#define lcbitBITSTREAM_ILLEGAL ((DWORD)-1L)
// This value indicates that the function
// is not allowed to select the "bitstream"
// compression scheme.
#define cbitCENTER_MAX ((CBIT)33)
// Legal "center" values are 0..32. This is
// weird because you'd expect it to be
// 0..31 but it's not.
// - - - - - - - - -
// Used during the occurrence-list building phase of indexing. Holds the
// information that is local to a single occurrence list.
struct OccurenceListInfo
{
    DWORD lcSublists;   // Number of sub-lists in this occurrence list
    CKEY  ckey;         // How the doc-ID deltas in this list are compressed
};
typedef struct OccurenceListInfo OLI;
typedef struct OccurenceListInfo FAR *LPOLI;
// Parameters passed to MergeSortTreeFile: an array of topic ids, its
// element count, and a saved position for internal use.
struct MergeParams
{
    DWORD FAR *rgTopicId;       // Array of topic ids
    DWORD      dwCount;         // Count (presumably of rgTopicId entries
                                // -- TODO confirm)
    DWORD FAR *lpTopicIdLast;   // Internal use: last position saved
};
typedef struct MergeParams MERGEPARAMS;
typedef struct MergeParams FAR *LPMERGEPARAMS;
// - - - - - - - - -
// Convert occurrence list file to a final index file.
/*******************************************************************
* *
* FUNCTIONS PROTOTYPES *
* *
*******************************************************************/
/*********************************************************************
* *
* SORT FUNCTIONS (SORT.C) *
* *
*********************************************************************/
// Sorting entry points; the banner above attributes them to SORT.C.
// Parameters are unnamed in this header, so their meaning has to be read
// from the definitions.
//
// HugeDataSort / HugeInsertionSort: operate on a HUGE array of LPV with a
// DWORD (presumably the element count -- TODO confirm), an FCOMPARE
// comparison callback and an LPV (presumably passed through to the
// callback -- TODO confirm). HugeDataSort additionally takes an
// INTERRUPT_FUNC with its own LPV and returns an ERR.
PUBLIC ERR PASCAL FAR HugeDataSort(LPV HUGE *, DWORD, FCOMPARE, LPV,
INTERRUPT_FUNC, LPV);
PUBLIC VOID PASCAL FAR HugeInsertionSort (LPV HUGE *, DWORD, FCOMPARE, LPV);
// Priority-queue helpers working on the external sort info (LPESI).
PUBLIC ERR PASCAL FAR PriorityQueueRemove (LPESI, FCOMPARE, LPV);
PUBLIC ERR PASCAL FAR PriorityQueueCreate (LPESI, FCOMPARE, LPV);
// NEAR sort routines; both return an ERR.
PUBLIC ERR PASCAL NEAR IndexSort (LPW, LPB, int);
PUBLIC ERR PASCAL NEAR IndexMergeSort (HFILE FAR *, LSZ, LPW, LPB, int, int);
/*********************************************************************
* *
* ENCODING FUNCTIONS (ENCODE.C) *
* *
*********************************************************************/
// Encoding entry points; the banner above attributes them to ENCODE.C.
// Parameters are unnamed in this header.
//
// NOTE(review): OccurrencePack takes a WORD where OccurrenceUnpack takes
// an OCCF; presumably both carry the same occurrence-field flags --
// confirm against ENCODE.C.
PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD);
PUBLIC VOID PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF);
PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD);
// Takes a DWORD and returns a CBIT (a bit count, judging by the name).
PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD);
PUBLIC void NEAR PASCAL VGetBestScheme(LPCKEY, LRGDW, DWORD, int);
// Takes a byte pointer and a DWORD; returns a CB.
PUBLIC CB PASCAL FAR CbBytePack(LPB, DWORD);
/*********************************************************************
* *
* INDEXING FUNCTIONS *
* *
*********************************************************************/
// Indexing entry points. Parameters are mostly unnamed in this header.
//
// NOTE(review): some prototypes take LPIPB and others _LPIPB. Only _LPIPB
// is defined in this header (as a FAR pointer to IPB); LPIPB must come
// from another header -- confirm the two are compatible.

// Release of the internal / external sort state attached to an IPB
// (judging by the names -- confirm against the definitions).
PUBLIC VOID PASCAL FAR FreeISI (LPIPB);
PUBLIC void NEAR PASCAL FreeEsi(LPIPB);
PUBLIC LCB FAR PASCAL LcbGetFreeMemory(LPERRB);
PUBLIC ERR FAR PASCAL SortFlushISI (_LPIPB);

// Comparison helpers; each returns an int.
PUBLIC int PASCAL FAR WordRecCompare(LPB, LPB, LPV);
PUBLIC ERR FAR PASCAL MergeSortTreeFile (_LPIPB, LPMERGEPARAMS);
PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int);
PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB);

// NOTE(review): FlushTree is the only prototype here without PUBLIC.
ERR FAR PASCAL FlushTree(_LPIPB lpipb);
PUBLIC ERR FAR PASCAL BuildBTree (HFPB, _LPIPB, LPB, HFPB, LPSTR);
PUBLIC ERR FAR PASCAL FWriteBits(PFILEDATA, DWORD, BYTE);
PUBLIC ERR PASCAL FAR IndexOpenRW (LPIPB, HFPB, LSZ);

// B-tree node helpers. AllocBTreeNode was declared twice in this list;
// the redundant second declaration has been removed.
PUBLIC PNODEINFO PASCAL FAR AllocBTreeNode (_LPIPB lpipb);
PUBLIC VOID PASCAL FAR FreeBTreeNode (PNODEINFO pNode);
PUBLIC ERR PASCAL FAR ReadNewNode (HFPB, PNODEINFO, int);
PUBLIC ERR PASCAL FAR SkipOldData (_LPIPB, PNODEINFO);
PUBLIC ERR FAR PASCAL AllocSigmaTable (_LPIPB lpipb);