/* Source path: windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/search.h
 * Listing metadata: 579 lines, 21 KiB, C; history entry dated 2020-09-26 03:20:57 -05:00.
 */
/*************************************************************************
* *
* ISEARCH.H *
* *
* Copyright (C) Microsoft Corporation 1990-1992 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Module Intent *
* Common defines internal to the searcher. None of this stuff is *
* available outside the search engine. *
* *
**************************************************************************
* *
* Current Owner: BinhN *
* *
**************************************************************************
* *
* Released by Development: (date) *
* *
*************************************************************************/
// Critical structures that get messed up under /Zp8, so 1-byte packing is
// forced here; it stays in effect until the "#pragma pack()" at the end of
// this header.
#pragma pack(1)
// Occurrence-information flag bits (occif*). Each value is a distinct bit.
#define occifNONE 0x0000 // No flags.
#define occifEOF 0x0001 // End of file.
#define occifWRITTEN 0x0002 // This occurrence has been written.
#define occifMATCH 0x0004 // This occurrence is a match.
#define occifHAS_MATCH 0x0008 // Don't do combination this pass.
/**************************************************************************
*
* SYMBOLS STRUCTURE
*
**************************************************************************/
#define cWordsPerToken 5 // a guess at average number of stemmed or wildcard variants
// Singly linked list node (chained through pNext) holding the word data for
// one indexed word. STRING_TOKEN.lpwi points at a list of these.
typedef struct WORDINFO
{
struct WORDINFO FAR *pNext; // Next WORDINFO in the list
// Term frequency
DWORD cTopic;
// Word data information
FILEOFFSET foData; // presumably file offset of the word's data -- confirm
DWORD cbData; // presumably byte size of that data -- confirm
WORD wRealLength; // Real word length
} WORDINFO, FAR *LPWORDINFO;
// One entry of the query's string table (see QTREE.lpStrList), chained
// through pNext. QTNODE.u.pToken refers to one of these.
typedef struct STRING_TOKEN {
struct STRING_TOKEN FAR *pNext; // Next token in the string table
LPB lpString; // String itself
WORD cUsed; // Times this string appears in the query
WORD wWeight; // Weight of the word
WORD wWeightRemain; // Sum of term weights AFTER this one in the list
LPWORDINFO lpwi; // List of word data for this token
DWORD dwTopicCount; // NOTE(review): undocumented; presumably a topic count for this token -- confirm
} STRING_TOKEN;
/* Set the default size of a string block. Assuming that a word length
 * is 6, allocate enough memory for 20 words.
 *
 * The whole expansion is parenthesized so the macro behaves as a single
 * value inside any larger expression (for example as a divisor, or next to
 * another multiplication); without the outer parentheses
 * "x / STRING_BLOCK_SIZE" would parse as "(x / (size + 6)) * 20".
 */
#define STRING_BLOCK_SIZE ((sizeof(STRING_TOKEN) + 6) * 20)
/* String flags */
/* NOTE(review): TERM_RANGE_MATCH (0x03) equals EXACT_MATCH | WILDCARD_MATCH,
 * so these look like enumerated values rather than independent bits --
 * confirm at the use sites before testing them with '&'. */
#define EXACT_MATCH 0x01
#define WILDCARD_MATCH 0x02
#define TERM_RANGE_MATCH 0x03
/**************************************************************************
*
* QUERY TREE STRUCTURE & FUNCTIONS
*
**************************************************************************/
#define MAX_QUERY_NODE 0xFFFF // Maximum number of tokens in a query
// Node kind values; they fit in the low nibble selected by NODE_TYPE_MASK.
#define TOPICLIST_NODE 0x01 // The node is a topicId node
#define OCCURENCE_NODE 0x02 // The node is an occurrence node
#define NODE_TYPE_MASK 0x0f
// Modifier bit that lies outside NODE_TYPE_MASK.
#define DONT_FREE 0x10 // Don't free the node after unlink
#define STACK_SIZE 15 // Maximum level of stack
// This is an information buffer structure that you pass to "HitListGetTopic"
// which fills in its fields. You can look at the fields not marked as
// "internal", and also pass it to other API functions.
// TOPIC_INFO expands to three member declarations and is meant to be used
// inside a struct body (TOPIC_LIST does so). The last member has no trailing
// semicolon; the user of the macro supplies it.
#define TOPIC_INFO \
WORD wWeight; /* Topic weight. */ \
DWORD dwTopicId; /* Topic-ID associated with this hit. */ \
DWORD lcOccur /* Number of occurrences (hits) */
// Generic singly linked node consisting of just a next pointer. The free
// lists in QTREE (lpTopicFreeList, lpOccFreeList) are typed as LPSLINK.
// NOTE(review): the struct tag is SNGLINK but the typedef name is spelled
// SGNLINK; left untouched because other files may reference either name.
typedef struct SNGLINK
{
struct SNGLINK FAR *pNext; // Next node in the list
} SGNLINK, FAR *LPSLINK;
// Internal Occurrence structure: one node of an occurrence list, chained
// through pNext.
// Be careful when changing its fields: MARKER (declared right after this
// struct) must keep pNext and fFlag at the same locations and must not be
// larger than this struct.
typedef struct OCCURENCE
{
struct OCCURENCE FAR *pNext; /* Next occurrence in the list */
WORD fFlag; /* Various flags */
WORD cLength; /* Word length */
DWORD dwCount; /* Word Count, needed for phrase */
DWORD dwOffset; /* Word offset, needed for hilite */
DWORD dwFieldId; /* Field id (_DEBUG only) */
WORD wWeight; /* Hit weight */
LPVOID lpvTerm; /* Pointer to a term in WORD-prefix length
                 * Unicode format, i.e. a "wide ST".
                 */
/* Extra trailing padding on MIPS and PPC builds only (presumably for
 * alignment under 1-byte packing -- confirm). */
#if defined (_MIPS) || defined (ppc)
WORD wPad;
DWORD dwPad;
#endif
} OCCURENCE, FAR *LPIOCC;
/* Marker node. It shares its first two fields with OCCURENCE (note that
 * pNext is typed as an OCCURENCE pointer) and additionally links to the
 * previous and next markers.
 * Be careful when changing its fields:
 * 1/ Its size must be <= the size of an OCCURENCE
 * 2/ The locations of pNext and fFlag must be the same in the two
 *    structures */
typedef struct MARKER
{
struct OCCURENCE FAR *pNext; /* Same slot as OCCURENCE.pNext */
WORD fFlag; /* Various flags */
WORD Unused; /* Unused */
struct MARKER FAR *pPrevMark; /* Previous marker */
struct MARKER FAR *pNextMark; /* Next marker */
} MARKER, FAR *LPMARKER;
/* Occurrence flags: distinct bits (presumably stored in OCCURENCE.fFlag /
 * MARKER.fFlag -- confirm at the use sites). */
#define TO_BE_KEPT 0x01
#define TO_BE_SKIPPED 0x02
#define TO_BE_COMPARED 0x04
#define IS_LAST_NODE 0x08
#define IS_MARKER_NODE 0x10
/* Internal Topic List structure: one node per topic, chained through pNext,
 * with a pointer to that topic's occurrence list followed by the TOPIC_INFO
 * members (wWeight, dwTopicId, lcOccur). */
typedef struct TOPIC_LIST
{
struct TOPIC_LIST FAR * pNext; // Next topic node
LPIOCC lpOccur; // Pointer to occurrence lists
unsigned short fFlag; // Various flags, such as TO_BE_KEPT
TOPIC_INFO; // Expands to wWeight, dwTopicId and lcOccur
} TOPIC_LIST;
typedef TOPIC_LIST FAR *LPITOPIC;
/* TopicId node flags.
 * TO_BE_KEPT and IS_MARKER_NODE are defined earlier in this header as well,
 * with identical values, so these redefinitions are benign; the two
 * definitions of each must stay in sync. */
#define TO_BE_KEPT 0x01
#define HAS_MARKER 0x02
#define IS_MARKER_NODE 0x10
#define WRITTEN_TO_DISK 0x20
/* Accessors for a topic-list node: DL_NEXT yields the node's pNext link and
 * DL_OCCUR its occurrence list. Both expand to lvalues.
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. "p + n" or "c ? a : b") is cast as a whole, instead of the
 * cast binding to its first operand only. */
#define DL_NEXT(p) (((LPITOPIC)(p))->pNext)
#define DL_OCCUR(p) (((LPITOPIC)(p))->lpOccur)
/* QueryInfo flags: distinct bits (presumably kept in QueryInfo.fFlag --
 * confirm). Bit 0x0008 is not assigned here. */
#define IN_PHRASE 0x0001
#define FREE_CHARTAB 0x0002
#define FORCED_PHRASE 0x0004
#define CW_PHRASE 0x0010 // Must match COMPOUNDWORD_PHRASE in medv20.h
/* Query info node: the state handed to the query parser (QueryTreeBuild
 * takes an LPQI). It bundles the query text, the tree being built, and the
 * breaker / stop-list / character-table / operator-table context. */
typedef struct QueryInfo
{
LPB lpbQuery; // Query expression bytes.
LPB pWhitespace; // Working variable for implicit phrase
LPQT lpQueryTree; // Query tree.
LPV lpStack; // Pointer to operator's stack
DWORD dwOffset; // Current offset
BREAKER_FUNC lpfnBreakFunc; // Word-breaker callback
LPSIPB lpStopListInfo; // Associated stop list info
LPCHARTAB lpCharTab; // Pointer to character table
LPOPSYM lpOpSymTab; // Operator symbol table
LPERRB lperrb; // Error buffer
DWORD fFlag; // Flag
WORD cOpEntry; // Number of operator entries
WORD Pad; // Explicit padding
} QUERY_INFO,
FAR *LPQI;
/* Parameter of a unary operator: either a pointer to a structure or a
 * plain DWORD value, sharing the same storage. */
typedef union NodeParm
{
VOID FAR *lpStruct; // Pointer form of the parameter
DWORD dwValue; // Numeric form of the parameter
} NODE_PARM;
/* Query Tree node kinds (presumably the values of QTNODE.NodeType --
 * confirm at the use sites). */
#define OPERATOR_NODE 1 // Operator
#define TERM_NODE 2 // A string to be searched for
#define NULL_NODE 3 // A string that can't be found
#define EXPRESSION_NODE 4 // The node contains the result
#define STOP_NODE 5 // A stop word
/* Query tree node. The pPrev/pNext pair doubles as left/right child links
 * (see the QTN_LEFT / QTN_RIGHT accessors that follow this struct). The
 * field layout is fixed by the 1-byte packing in effect for this header;
 * do not reorder fields. */
typedef struct QTNODE
{
struct QTNODE FAR *pNext; /* Next / right node */
struct QTNODE FAR *pPrev; /* Previous / left node */
LPITOPIC lpTopicList; /* Topic linked list */
/* Per-node payload; which member is valid presumably depends on the node
 * kind / operator -- confirm at the use sites. */
union
{
STRING_TOKEN FAR *pToken; /* Word associated with this node */
WORD wProxDist; /* Proximity distance */
VOID FAR *lpStruct; /* Structure associated with unary op */
} u;
DWORD cTopic; // Number of TopicId lists of this node
// This number will change when merging
// gets involved
// Max and Min topic id. This is useful for speeding up retrieval
DWORD dwMaxTopicId; // Max topic Id in the list
DWORD dwMinTopicId; // Min topic id in the list
// Word data information
FILEOFFSET foData;
DWORD cbData;
/* Characteristics associated with the node */
LPB lpHiString; /* Hi limit of the string (for THRU) */
LPGROUP lpGroup; /* Group associated with this node */
DWORD dwFieldId; /* FieldID associated with term */
WORD NodeType; /* What type (operator, term, etc) */
WORD OpVal; /* Operator value */
WORD iCurOff; // Offset to the beginning of the word
WORD wRealLength; // Real word length
LPVOID lpvIndexedTerm; /* Pointer to the term in the index that
                        * currently matches this node. The term's
                        * string is in WORD-prefix length Unicode
                        * format, i.e. a "wide ST".
                        */
// General info
WORD fFlag; /* Various flags */
WORD Offset; /* Offset from the beginning of the query */
WORD wBrkDtype; /* Breaker's dtype (for THRU) */
WORD Pad; /* Explicit padding */
} QTNODE, FAR *_LPQTNODE;
/* Field accessors for a query-tree node. Each one casts its argument to a
 * QTNODE pointer and yields the named field as an lvalue.
 * QTN_LEFT / QTN_PREV both map to pPrev and QTN_RIGHT / QTN_NEXT both map to
 * pNext. QTN_TOKEN and QTN_PARMS select different members of the same union.
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. "p + n" or "c ? a : b") is cast as a whole, instead of the
 * cast binding to its first operand only. */
#define QTN_LEFT(p) (((QTNODE FAR *)(p))->pPrev)
#define QTN_RIGHT(p) (((QTNODE FAR *)(p))->pNext)
#define QTN_PREV(p) (((QTNODE FAR *)(p))->pPrev)
#define QTN_NEXT(p) (((QTNODE FAR *)(p))->pNext)
#define QTN_NODETYPE(p) (((QTNODE FAR *)(p))->NodeType)
#define QTN_OPVAL(p) (((QTNODE FAR *)(p))->OpVal)
#define QTN_TOPICLIST(p) (((QTNODE FAR *)(p))->lpTopicList)
#define QTN_TOKEN(p) (((QTNODE FAR *)(p))->u.pToken)
#define QTN_PARMS(p) (((QTNODE FAR *)(p))->u.lpStruct)
#define QTN_FLAG(p) (((QTNODE FAR *)(p))->fFlag)
#define QTN_HITERM(p) (((QTNODE FAR *)(p))->lpHiString)
#define QTN_OFFSET(p) (((QTNODE FAR *)(p))->Offset)
#define QTN_FIELDID(p) (((QTNODE FAR *)(p))->dwFieldId)
#define QTN_DTYPE(p) (((QTNODE FAR *)(p))->wBrkDtype)
#define QTN_GROUP(p) (((QTNODE FAR *)(p))->lpGroup)
/* Block of query's nodes. We allocate 16 nodes per block.
 * The expansion is parenthesized so the macro behaves as a single value
 * inside any larger expression; without the parentheses
 * "x / QUERY_BLOCK_SIZE" would parse as "(x / sizeof(QTNODE)) * 16". */
#define QUERY_BLOCK_SIZE (sizeof(QTNODE) * 16)
/* Per-block counts for topic and occurrence nodes (presumably the number of
 * nodes carved out of each allocated memory block -- confirm). */
#define cTOPIC_PER_BLOCK 500
#define cOCC_PER_BLOCK 1000
/* Query tree's flags: distinct bits. ALL_ANDORNOT is tested against a
 * query tree's fFlag by the DO_FAST_MERGE macro in this header. */
#define TO_BE_SORTED 0x0001
#define HAS_NEAR_RESULT 0x0002
#define ALL_OR 0x0004
#define ALL_AND 0x0008
#define PROCESSED 0x0010
#define ALL_ANDORNOT 0x0020
/* Query tree structure: the top-level object for one parsed query. It owns
 * the memory blocks for strings, word info, topic nodes, occurrence nodes
 * and tree nodes, and carries the index parameters used during search.
 * The field layout is fixed by the 1-byte packing in effect for this
 * header; do not reorder fields. */
typedef struct QTREE
{
CUSTOMSTRUCT cStruct; /* Structure's handle, MUST BE 1ST FIELD!! */
LONG cQuery; /* Note: this can't be unsigned */
DWORD dwcOccFields; /* Occurrence fields count */
DWORD dwOccSize; /* Occurrence node size */
/* Unary operator related fields */
LPGROUP lpGroup; /* Group associated with all terms */
DWORD dwFieldId; /* Field-ID assigned to all followed terms.*/
WORD wProxDist; /* Proximity distance */
WORD iDefaultOp; /* Default operator. */
WORD wBrkDtype; /* Breaker's dtype (for THRU) */
WORD fFlag; /* Various querytree flags */
/* String table */
LPV lpStringBlock; /* String's memory block */
STRING_TOKEN FAR *lpStrList;/* Pointer to strings table */
LPV lpWordInfoBlock; /* presumably memory block for WORDINFO nodes -- confirm */
/* Topic list related global variables */
LPV lpTopicMemBlock; /* Pointer to Topic memory block */
LPITOPIC lpTopicStartSearch; /* Starting node for searching */
LPSLINK lpTopicFreeList; /* Pointer to free doc list */
DWORD dwTopicNodeCnt; /* presumably count of topic nodes -- confirm */
/* Occ list related global variables */
LPV lpOccMemBlock; /* Pointer to Occ memory block */
LPIOCC lpOccStartSearch; /* Starting occurrence for searching */
LPSLINK lpOccFreeList; /* Pointer to free occurrence list */
DWORD dwOccNodeCnt; /* presumably count of occurrence nodes -- confirm */
/* Buffer for the tree's nodes */
LPV lpNodeBlock; /* Nodes memory block */
_LPQTNODE lpTopNode; /* Pointer to top node */
/* Index information */
FILEOFFSET foIdxRoot; /* Top node offset */
DWORD dwBlockSize; /* Index block size */
WORD TreeDepth; /* Depth of tree */
WORD cIdxLevels; /* Index's depth */
OCCF occf;
IDXF idxf;
CKEY ckeyTopicId; // 2-bytes
CKEY ckeyOccCount; // 2-bytes
CKEY ckeyWordCount;
CKEY ckeyOffset;
/* MAGIC value... (HQUERY_MAGIC is defined right after this struct) */
LONG magic;
/* Interrupt flag for online use. Online apps don't have callbacks
 * so we have to provide an API to set this flag
 */
BYTE cInterruptCount; /* Interrupt checking */
BYTE fInterrupt;
/* Similarity stuff */
LPV lpDocScores;
} QTREE,
FAR *_LPQT;
// Signature constant (presumably stored in QTREE.magic to validate a query
// handle -- confirm at the use sites).
#define HQUERY_MAGIC 0x04121956
// This defines the "type" of word term node.
#define TERM_EXACT 1 // "Standard" term.
#define TERM_PREFIX 2 // "Wildcard" term.
#define TERM_RANGE 3 // Range term. This says "give me everything
// between some low bound and some high
// bound."
/*
 * This defines the value or type of operator node. It corresponds to
 * the OpVal field of struct OPSYM.
 *
 * Values 0..MAX_OPERATOR (8) are the operators proper; only values up to
 * MAX_DEFAULT_OP (OR_OP) may serve as the default operator. STOP_OP (9)
 * and the parse tokens (50..53) lie outside the 0..MAX_OPERATOR range.
 */
#define AND_OP 0 // AND operator
#define OR_OP 1 // OR operator
#define NOT_OP 2 // NOT operator
#define PHRASE_OP 3 // PHRASE operator
#define NEAR_OP 4 // NEAR operator
#define MAX_DEFAULT_OP OR_OP // Maximum value of default operator
#define RANGE_OP 5
#define GROUP_OP 6
#define FIELD_OP 7
#define BRKR_OP 8
#define MAX_OPERATOR 8
#define STOP_OP 9 // stop word
// Parser token values
#define QUOTE 50
#define RIGHT_PAREN 51
#define LEFT_PAREN 52
#define TERM_TOKEN 53
/* Operator type (low-nibble bits) */
#define BINARY_OP 0x01
#define UNARY_OP 0x02
#define PARSE_TOKEN 0x04
/* Operator attribute (high-nibble bits, so they can be OR-ed with a type) */
#define COMMUTATIVE 0x10 // a * b = b * a
#define ASSOCIATIVE 0x20 // a * (b * c) = (a * b) * c
// NOTE(review): the property written below (a * a = a) is usually called
// idempotence; the name ZERO is kept for compatibility.
#define ZERO 0x40 // a * a = a
// Attribute table defined elsewhere (presumably indexed by operator value
// and holding the bits above -- confirm).
extern WORD OperatorAttributeTable[];
/*
 * These are properties of a binary node expression.
 */
#define EXPRESSION_TERM 1 // One branch is an expression, one
// is a term
#define EXPRESSION_EXPRESSION 2 // Both branches are expressions
// Signature of an operator handler. The OrHandler / AndHandler / NotHandler /
// NearHandler / PhraseHandler prototypes in this header match the first
// (active) form. The #else branch is a disabled, untyped (LPV) variant.
#if 1
typedef ERR (PASCAL NEAR *FNHANDLER) (LPQT, _LPQTNODE, LPITOPIC, LPV, int);
#else
typedef ERR (PASCAL NEAR *FNHANDLER) (LPQT, LPV, LPV, LPV, int);
#endif
// Result ordering criteria (presumably the values accepted by the WORD
// argument of SortResult -- confirm).
#define ORDERED_BASED 1 // Based on topic-ID order
#define HIT_COUNT_BASED 2 // Based on the doc's hit count
#define WEIGHT_BASED 3 // Based on the doc's weight
// Retrieval working state (ResolveTree takes an LPRETV). Besides a few
// scalars it embeds several fixed-size byte buffers, so the structure is
// large.
typedef struct RetVars
{
LPQT lpqt; // Pointer to query tree for global variables
LPBYTE pLeadByteTable; // Pointer to lead byte table for DBCS
DWORD dwTopicCount; // Number of topics
DWORD dwFieldID; // Current fieldid
DWORD cOccFields; // presumably occurrence fields count, cf. QTREE.dwcOccFields -- confirm
DWORD dwOccSize; // presumably occurrence node size, cf. QTREE.dwOccSize -- confirm
LPB lpbCur; // presumably current position in a buffer being decoded -- confirm
NODEINFO LeafInfo;
NODEINFO DataInfo;
SRCHINFO SrchInfo; // Search information
LCID lcid; // WIN32 locale ID specified at build time
WORD wWordLength; // Word length
WORD fFlag; // General flags
BYTE pBTreeWord[CB_MAX_WORD_LEN]; // Buffer for the decoded word
BYTE pModifiedWord[CB_MAX_WORD_LEN]; // Buffer for the modified word
BYTE pStemmedBTreeWord[CB_MAX_WORD_LEN]; // Stemmed BTree word
BYTE pStemmedQueryWord[CB_MAX_WORD_LEN]; // Stemmed searched word
BYTE fRank; // If non-zero the result is ranked.
BYTE pNodeBuf[BTREE_NODE_SIZE]; // Generic b-tree node buffer.
BYTE pDataBuf[FILE_BUFFER]; // Data buffer of FILE_BUFFER bytes
} RETV,
FAR *LPRETV;
/**************************************************************************
*
* OPEN INDEX STRUCTURE
*
**************************************************************************/
// State of an open index: header, sub-file handle, cached top node and
// related tables.
typedef struct Idx
{
GHANDLE hStruct; // Handle to this structure.
DWORD dwKey;
FCALLBACK_MSG Callback;
LPBYTE pLeadByteTable; // Pointer to table of DBCS lead bytes
HANDLE hLeadByteTable; // Handle paired with pLeadByteTable
IH20 ih; // Index header.
HFPB hfpbIdxSubFile; // Index file handle. If this is NULL, the
// index isn't open, else it is.
//HFPB hfpbSysFile; // Handle of Index File system
GHANDLE hTopNode; // Handle to "lrgbTopNode".
LRGB lrgbTopNode; // Pointer to the index top node.
// NOTE(review): declared as a single FLOAT although the comment calls it a
// table -- confirm the FLOAT typedef / intended type.
FLOAT fSigmaTable; // Sigma table
HANDLE hSigma; // Handle of sigma table
LPERRB lperrb; // Pointer to error block
WORD wSlackSize; // Size of slack in a node
WORD Pad; // Explicit padding
} IDX,
FAR *_LPIDX; // The "_" indicates that this is
// a private structure that needs
// to be available publicly. The
// public will call this an "LPIDX".
/*************************************************************************
*
* Hitlist structure
*
*************************************************************************/
// Result of a query: the topic list plus the file-backed caches for topic
// and occurrence data. The fields up to lpTopicMemBlock come before the
// "internal use only" marker; everything after it is internal.
typedef struct HitList
{
GHANDLE hStruct; // Structure's handle. MUST BE 1ST FIELD
DWORD lcReturnedTopics; // The number of Topics returned. (what the user wants)
DWORD lcTotalNumOfTopics; // The total number of topics hit by the query.
LPITOPIC lpTopicList; // Start of the TopicList
LPBLK lpOccMemBlock; // Pointer to Occ memory block
LPBLK lpTopicMemBlock; // Pointer to Topic memory block
/* All the following fields are for internal use only */
LPVOID lpHttpQ; // for online search only
DWORD lcMaxTopic; // Max TopicId number (internal)
LPITOPIC lpLastTopic; // Last accessed Topic pointer (internal)
DWORD lLastTopicId; // Last accessed TopicId (internal)
/* Topic list cache */
GHANDLE hTopic; // Handle to Topic file
GHANDLE hTopicCache; // Handle to Topic cache
LPITOPIC lpTopicCache; // Cache for Topic info
DWORD dwTopicCacheStart; // Starting Topic number of the cache
DWORD dwTopicInCacheCount; // Number of topics currently in cache
/* Occurrence cache */
GHANDLE hOcc; // Handle to occurrences file
GHANDLE hOccCache; // Handle to Occ cache
DWORD dwCurTopic; // Current doc that the hit list belongs to
LPIOCC lpOccCache; // Cache for Occ info
DWORD dwOccCacheStart; // Starting Occ number of the cache
/* Various hitlist info for hitlist merge */
struct HitList FAR * lpMainHitList;
struct HitList FAR * lpUpdateHitList;
BYTE lszTopicName[cbMAX_PATH]; // Topic filename
BYTE lszOccName[cbMAX_PATH]; // Occ filename
} HL, FAR *_LPHL;
/* Evaluates to 1 when the search request carries QUERYRESULT_SKIPOCCINFO in
 * its Flag field AND the query tree has ALL_ANDORNOT set in fFlag; evaluates
 * to 0 otherwise. The second test is skipped when the first one fails. */
#define DO_FAST_MERGE(pSrch, lpqt) \
    ((((pSrch)->Flag & QUERYRESULT_SKIPOCCINFO) != 0) && \
     (((lpqt)->fFlag & ALL_ANDORNOT) != 0))
/*************************************************************************
*
* Global Variables
*
* Those variables should be read only
*************************************************************************/
// Tables defined elsewhere in the search engine; treat them as read-only.
extern FNHANDLER HandlerFuncTable[]; // Operator handler functions (FNHANDLER)
extern OPSYM OperatorSymbolTable[]; // Operator symbol entries
extern OPSYM FlatOpSymbolTable[]; // Second operator symbol table (usage not visible here)
extern BYTE LigatureTable[]; // Byte table for ligatures (format not visible here)
/*************************************************************************
*
* Functions Prototypes
*
*************************************************************************/
/* qtparse.c */
// Query-tree allocation and construction entry points. Note that
// QueryTreeAddToken takes the private _LPQT type while the other two
// functions use LPQT.
PUBLIC LPQT PASCAL NEAR QueryTreeAlloc(void);
PUBLIC ERR PASCAL NEAR QueryTreeAddToken (_LPQT, int, LST, DWORD, BOOL);
PUBLIC LPQT PASCAL NEAR QueryTreeBuild (LPQI);
// Debug helper, only declared for DOS debug builds.
#if defined(_DEBUG) && DOS_ONLY
PUBLIC ERR PASCAL FAR PrintList(LPQT);
#endif
/* search.c */
// ResolveTree takes the open index, a query-tree node, the retrieval state
// and a BOOL option; FGroupLookup takes a group and a DWORD (presumably a
// topic id -- confirm against the implementation).
PUBLIC ERR PASCAL NEAR ResolveTree(LPIDX, _LPQTNODE, LPRETV, BOOL);
PUBLIC BOOL NEAR PASCAL FGroupLookup(LPGROUP, DWORD);
/* qtlist */
// Topic-node list primitives (allocate / free / insert / search).
PUBLIC TOPIC_LIST FAR* PASCAL NEAR TopicNodeAllocate(LPQT);
PUBLIC VOID PASCAL NEAR TopicNodeFree (LPQT, _LPQTNODE, LPITOPIC, LPITOPIC);
PUBLIC ERR PASCAL NEAR TopicNodeInsert (LPQT, _LPQTNODE, LPITOPIC);
PUBLIC LPITOPIC PASCAL NEAR TopicNodeSearch(LPQT, _LPQTNODE, DWORD);
// Occurrence-node list primitives (allocate / search / insert / compare).
PUBLIC LPIOCC PASCAL NEAR OccNodeAllocate(LPQT);
PUBLIC LPIOCC PASCAL NEAR OccNodeSearch(LPQT, LPITOPIC , LPIOCC );
PUBLIC ERR PASCAL NEAR OccNodeInsert(LPQT, LPITOPIC, LPIOCC);
PUBLIC int PASCAL NEAR OccCompare(LPIOCC, LPIOCC);
// Generic node removal and whole-tree release.
PUBLIC VOID PASCAL NEAR RemoveNode(LPQT, LPV, LPSLINK, LPSLINK, int);
PUBLIC VOID PASCAL NEAR FreeTree (_LPQTNODE);
/* combine.c */
// Topic-list marking, pruning, merging and sorting helpers.
PUBLIC VOID PASCAL NEAR RemoveUnmarkedTopicList (LPQT, _LPQTNODE, BOOL);
PUBLIC VOID PASCAL NEAR RemoveUnmarkedNearTopicList (_LPQT, _LPQTNODE);
PUBLIC VOID PASCAL NEAR MarkTopicList (_LPQTNODE);
PUBLIC VOID PASCAL NEAR MergeOccurence(LPQT, LPITOPIC , LPITOPIC);
PUBLIC VOID PASCAL NEAR SortResult (LPQT, _LPQTNODE, WORD);
// Operator handlers. All five share the FNHANDLER signature
// (LPQT, _LPQTNODE, LPITOPIC, LPV, int).
PUBLIC ERR PASCAL NEAR OrHandler(LPQT, _LPQTNODE, LPITOPIC, LPV, int);
PUBLIC ERR PASCAL NEAR AndHandler(LPQT, _LPQTNODE, LPITOPIC, LPV, int);
PUBLIC ERR PASCAL NEAR NotHandler(LPQT, _LPQTNODE, LPITOPIC, LPV, int);
PUBLIC ERR PASCAL NEAR NearHandler(LPQT, _LPQTNODE, LPITOPIC, LPV, int);
PUBLIC ERR PASCAL NEAR PhraseHandler(LPQT, _LPQTNODE, LPITOPIC, LPV, int);
PUBLIC VOID PASCAL NEAR NearHandlerCleanUp (LPQT, _LPQTNODE);
PUBLIC ERR PASCAL NEAR TopicListSort (_LPQTNODE lpQtNode, BOOL fFlag);
// End of the critical structures that get messed up under /Zp8: restore the
// default structure packing, undoing the "#pragma pack(1)" at the top of
// this header.
#pragma pack()