885 lines
34 KiB
C
885 lines
34 KiB
C
|
/*************************************************************************
|
||
|
* *
|
||
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
||
|
* All Rights reserved. *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Module Intent *
|
||
|
* All typedefs and defines needed for user's retrieval *
|
||
|
* *
|
||
|
*************************************************************************/
|
||
|
|
||
|
#ifndef __MVSEARCH_H_
|
||
|
#define __MVSEARCH_H_
|
||
|
#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
#pragma pack(1) // Guard against Zp problems.
|
||
|
|
||
|
#include <iterror.h>
|
||
|
/*************************************************************************
|
||
|
* Basic defines.
|
||
|
*************************************************************************/
|
||
|
|
||
|
#ifdef _32BIT
|
||
|
#define EXPORT_API
|
||
|
#define HUGE
|
||
|
|
||
|
#else
|
||
|
#define EXPORT_API _export
|
||
|
#define HUGE huge
|
||
|
#endif
|
||
|
|
||
|
#ifdef PRIVATE
|
||
|
#undef PRIVATE
|
||
|
#endif
|
||
|
|
||
|
#define PRIVATE static
|
||
|
|
||
|
#ifdef PUBLIC
|
||
|
#undef PUBLIC
|
||
|
#endif
|
||
|
|
||
|
#define PUBLIC
|
||
|
|
||
|
#define cbMAX_PATH (CB)256 // Maximum pathname length.
|
||
|
|
||
|
/* Maximum year's value that can be passed to MediaView's FBreakEpoch() */
|
||
|
|
||
|
#define MAX_YEAR ((unsigned long)0xFFFFFFFF / 366)
|
||
|
|
||
|
/* Maximum word length that is accepted by MediaView's breaker. */
|
||
|
#define CB_MAX_WORD_LEN ((CB)1000) // Longest legal word.
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Typedef
|
||
|
*************************************************************************/
|
||
|
|
||
|
#ifndef LPB
|
||
|
typedef BYTE FAR * LPB;
|
||
|
#endif
|
||
|
|
||
|
typedef WORD IB; // Index into an array of bytes.
|
||
|
#ifdef _32BIT
|
||
|
typedef DWORD CB; // Count of bytes.
|
||
|
#else
|
||
|
typedef WORD CB; // Count of bytes.
|
||
|
#endif
|
||
|
typedef DWORD LCB; // Count of bytes.
|
||
|
typedef WORD CBIT; // Count of bits.
|
||
|
typedef WORD FAR * LPW; // pointer to word
|
||
|
|
||
|
typedef BYTE FAR *LSZ; // 0-terminated string far pointer
|
||
|
typedef BYTE FAR *LST; // Pascal style string far pointer
|
||
|
typedef void FAR *LPV; // Far void pointer
|
||
|
typedef DWORD LFO; // 32-bit file offset.
|
||
|
typedef DWORD LCF; // 32-bit file count bytes
|
||
|
typedef void NEAR *NPV; // Void near pointers.
|
||
|
|
||
|
typedef NPV NPIBI; // Near
|
||
|
typedef WORD IDXF;
|
||
|
typedef LPV LPIBI; // Far
|
||
|
typedef LPV LPSIPB; // Stop information parameter block.
|
||
|
|
||
|
typedef LPV LPCAT; // Pointer to catalog
|
||
|
typedef LPV LPGROUP; // pointer to a group.
|
||
|
typedef LPV LPIDX; // Pointer index block.
|
||
|
typedef LPV LPQT; // Pointer to Query tree.
|
||
|
typedef LPV LPIPB; // Pointer to Index parameter block.
|
||
|
typedef LPV LPWHEEL; // Pointer to wheel parameter block.
|
||
|
typedef LPV LPHL; // Pointer to hitlist block.
|
||
|
typedef LPV LPCTAB; // Pointer to chartab
|
||
|
typedef LPV LPOPTAB; // Pointer to operator table
|
||
|
typedef LPV LPBRKI; // Pointer to breaker info
|
||
|
typedef WORD OCCF;
|
||
|
|
||
|
typedef HANDLE GHANDLE;
|
||
|
typedef HANDLE HGPOUP; // Handle to Group list. NOTE(review): the name looks like a typo for "HGROUP"; left unchanged because it is part of the public interface.
|
||
|
typedef DWORD IDGROUP; // Group's ID
|
||
|
|
||
|
typedef GHANDLE HIDX;
|
||
|
typedef GHANDLE HFPB;
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Word-breaker API and associated defines.
|
||
|
*************************************************************************/
|
||
|
|
||
|
typedef HANDLE HIBI; // "Internal break info". The individual
|
||
|
// word breakers allocate this
|
||
|
/*
|
||
|
* FWORDCB
|
||
|
* Call back function needed for MediaView breaker. All the LST strings
|
||
|
* are special 2-byte length preceded strings.
|
||
|
*
|
||
|
* LST lstRawWord:
|
||
|
* Words as they appear originally. MediaView only uses the length that
|
||
|
* is needed for highlighting
|
||
|
*
|
||
|
* LST lstNormWord:
|
||
|
* Normalized word, which will be indexed. Normalized are words that
|
||
|
* are modified (such as stemmed, changed to lower case, etc)
|
||
|
*
|
||
|
* DWORD dwOffset:
|
||
|
* Offset in the topic (or from the beginning of the buffer passed to
|
||
|
* MediaView breakers) where the word occurs
|
||
|
*
|
||
|
* LPV lpUser:
|
||
|
* User's data, propagated down to the user's callback function
|
||
|
*/
|
||
|
typedef ERR (FAR PASCAL * FWORDCB)(LST lstRawWord, LST lstNormWord,
|
||
|
DWORD dwOffset, LPV lpUser);
|
||
|
|
||
|
/* BREAKER_INIT
|
||
|
* This is the breaker's initialization routine. This routine will
|
||
|
* be called only by MediaView's Title Builder before any calls to
|
||
|
* the breaker are made
|
||
|
*/
|
||
|
typedef LPIBI (FAR PASCAL * BREAKER_INIT)(VOID);
|
||
|
|
||
|
/* BREAKER_FREE
|
||
|
* Termination routine for the breaker. This will allow the breaker
|
||
|
* to free any internal buffer used by it
|
||
|
*/
|
||
|
typedef void (FAR PASCAL * BREAKER_FREE)(LPIBI);
|
||
|
|
||
|
/*
|
||
|
* Breaker function's parameter structure:
|
||
|
*
|
||
|
* BRK_PARMS structure
|
||
|
* LPIBI lpInternalBreakInfo:
|
||
|
* This points to internal information associated with a breaker
|
||
|
* (such as memory buffer, flags, etc). It is solely used by
|
||
|
* that breaker
|
||
|
*
|
||
|
* BYTE FAR * lpbBuf;
|
||
|
* Buffer containing the strings to be broken into individual words
|
||
|
* This buffer is allocated by the application
|
||
|
*
|
||
|
* DWORD cbBufCount;
|
||
|
* The size of the buffer
|
||
|
*
|
||
|
* DWORD lcbBufOffset;
|
||
|
* The offset of the strings from the topic. This is needed if
|
||
|
* OCCF_OFFSET is used, since the MV breaker will return offsets
|
||
|
* of the words based on this offset
|
||
|
*
|
||
|
* LPV lpvUser
|
||
|
* Anything that the application's callback function needs. The way
|
||
|
* the breaker works is that:
|
||
|
* - The application calls the breaker with some buffers to be broken
|
||
|
* into words
|
||
|
* - For each word the breaker will call the app's callback function
|
||
|
* to return the word and its associated information (length, offset)
|
||
|
*
|
||
|
* FWORDCB lpfnOutWord;
|
||
|
* Pointer to application callback function
|
||
|
*
|
||
|
* LPSIPB lpStopInfoBlock;
|
||
|
* Stop word information. This contains a list of words that the
|
||
|
* application wants the breaker to ignore. This pertains to
|
||
|
* MediaView's breaker only
|
||
|
*
|
||
|
* LPVOID lpCharTab;
|
||
|
* Character table information. This pertains to MediaView's breaker
|
||
|
* only
|
||
|
*
|
||
|
* WORD fFlags;
|
||
|
* Internal flags set and used by MediaView's breaker only
|
||
|
*
|
||
|
* } BRK_PARMS, FAR *LPBRK_PARMS;
|
||
|
*/
|
||
|
|
||
|
typedef struct BRK_PARMS // Parameter block passed to breaker functions (see BREAKER_FUNC)
|
||
|
{
|
||
|
LPIBI lpInternalBreakInfo; // Breaker-private state (memory buffer, flags, etc.); used solely by that breaker
|
||
|
BYTE FAR * lpbBuf; // Application-allocated buffer holding the strings to break into words
|
||
|
DWORD cbBufCount; // Size of lpbBuf
|
||
|
DWORD lcbBufOffset; // Offset of the strings from the topic; word offsets are reported relative to it
|
||
|
LPV lpvUser; // Application data needed by the callback function
|
||
|
FWORDCB lpfnOutWord; // Application callback, called for each word found
|
||
|
LPSIPB lpStopInfoBlock; // Stop words the breaker should ignore (MediaView's breaker only)
|
||
|
LPVOID lpCharTab; // Character table information (MediaView's breaker only)
|
||
|
WORD fFlags; // Internal flags set and used by MediaView's breaker only
|
||
|
WORD Pad; // Padding for DWORD alignment
|
||
|
} BRK_PARMS, FAR *LPBRK_PARMS;
|
||
|
|
||
|
/* BREAKER_FUNC
|
||
|
* Breaker's function prototype for various MediaView's breaker functions
|
||
|
* such as FBreakWords()
|
||
|
*/
|
||
|
typedef ERR (FAR PASCAL * BREAKER_FUNC) (LPBRK_PARMS);
|
||
|
|
||
|
/*
|
||
|
* BRKLIST structure
|
||
|
* For internal use only
|
||
|
*/
|
||
|
typedef struct BreakList // Internal per-breaker record (see BRKLIST note above)
|
||
|
{
|
||
|
HANDLE hnd; // Handle to this structure
|
||
|
HANDLE hLib; // NOTE(review): presumably the handle of the library that provides the breaker -- confirm
|
||
|
BREAKER_FUNC lpfnBreakFunc; // Breaker function (takes an LPBRK_PARMS, returns ERR)
|
||
|
LPSIPB lpStopListInfo; // Stop information parameter block
|
||
|
LPVOID lpCharTab; // NOTE(review): presumably the character table, cf. BRK_PARMS.lpCharTab -- confirm
|
||
|
} BRKLIST, FAR *LPBRKLIST;
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* The following breakers functions are internal functions
|
||
|
* They can serve as template prototypes for user's functions
|
||
|
*************************************************************************/
|
||
|
PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void);
|
||
|
PUBLIC void EXPORT_API FAR PASCAL BreakerFree(LPIBI);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakNumber(LPBRK_PARMS);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakDate(LPBRK_PARMS);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakTime(LPBRK_PARMS);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakEpoch(LPBRK_PARMS);
|
||
|
// This exists only to enable MVJK to link statically.
|
||
|
// We must have the same function names for the static build.
|
||
|
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms);
|
||
|
|
||
|
|
||
|
// (EX)ternal (BR)ea(K)er (P)ara(M)eter structure that old .c search code
|
||
|
// can pass to ExtBreakText() in order to configure and call the
|
||
|
// new COM breakers. The breaker control params have been purposely defined
|
||
|
// to mimic those in BRK_PARMS as much as possible. Note that the ones
|
||
|
// missing are now internal to the COM breaker implementation.
|
||
|
typedef struct _exbrkpm // Parameter block passed to ExtBreakText()
|
||
|
{
|
||
|
// This section is to be filled in by the .c caller of the breaker.
|
||
|
DWORD dwBreakWordType; // Kind of text to break: regular text, number, date, etc.
|
||
|
LPBYTE lpbBuf; // Text buffer.
|
||
|
DWORD cbBufCount; // No. of chars in buffer.
|
||
|
LPVOID lpvUser; // Caller data that gets passed through
|
||
|
// to *lpfnOutWord.
|
||
|
FWORDCB lpfnOutWord; // Pointer to word callback function.
|
||
|
WORD fFlags; // Breaker flags.
|
||
|
|
||
|
// This section is owned by the index COM object and should not be
|
||
|
// modified by the .c caller.
|
||
|
LPVOID lpvIndexObjBridge; // Bridge pointer owned by the index COM object; the Ext*() functions below take a parameter of the same name
|
||
|
} EXBRKPM, *PEXBRKPM;
|
||
|
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtBreakText(PEXBRKPM pexbrkpm);
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtStemWord(LPVOID lpvIndexObjBridge,
|
||
|
LPBYTE lpbStemWord, LPBYTE lpbRawWord);
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtLookupStopWord(
|
||
|
LPVOID lpvIndexObjBridge, LPBYTE lpbStopWord);
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtAddQueryResultTerm(
|
||
|
LPVOID lpvIndexObjBridge, LPBYTE lpbTermHit, LPVOID *ppvTermHit);
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API EXTERNAL
|
||
|
* @func ERR FAR PASCAL | fInterrupt |
|
||
|
* Function to support interrupt (cancel) feature.
|
||
|
* @parm LPVOID | lpV |
|
||
|
* Parameter used by the callback interrupt function
|
||
|
* @rdesc ERR_SUCCESS if there is no interrupt, else ERR_INTERRUPT
|
||
|
*************************************************************************/
|
||
|
typedef ERR (FAR PASCAL *INTERRUPT_FUNC)(LPVOID);
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API EXTERNAL
|
||
|
* @func VOID FAR PASCAL | fStatus |
|
||
|
* Function to support status messaging feature.
|
||
|
* @parm LPSTR | lpStr |
|
||
|
* Message to be displayed
|
||
|
* @rdesc None (STATUS_FUNC is declared as returning VOID)
|
||
|
*************************************************************************/
|
||
|
typedef VOID (FAR PASCAL *STATUS_FUNC)(LPSTR);
|
||
|
|
||
|
|
||
|
#define BREAKERBUFFERSIZE 1024 // Size of breaker's state info struct
|
||
|
|
||
|
/*
|
||
|
* Breaker Table Constants
|
||
|
*/
|
||
|
#define MAXNUMBRKRS 16 // maximum number of breakers.
|
||
|
#define MAXBRKRLEN 1024 // maximum size of breaker line in |SYSTEM.
|
||
|
|
||
|
#ifndef ISBU_IR_CONSTS
|
||
|
#define ISBU_IR_CONSTS
|
||
|
|
||
|
#define cHundredMillion ((float) 100000000.0)
|
||
|
#define cVerySmallWt ((float) 0.02)
|
||
|
#define cNintyFiveMillion 95000000
|
||
|
#define cTFThreshold 4096
|
||
|
|
||
|
#endif // ISBU_IR_CONSTS
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Stop list retrieval API.
|
||
|
*************************************************************************/
|
||
|
|
||
|
|
||
|
typedef ERR (FAR PASCAL * STOPLKUP)(LPSIPB, LST);
|
||
|
|
||
|
PUBLIC LPSIPB EXPORT_API FAR PASCAL MVStopListInitiate (WORD wTabSize,
|
||
|
LPERRB lperrb);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListIndexLoad(HFPB hSysFile,
|
||
|
LPSIPB lpsipb, LSZ szStopFilename);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListLookup(LPSIPB lpsipb,
|
||
|
LST sPascalString);
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVStopListDispose(LPSIPB lpsipb);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListAddWord(LPSIPB lpsipb,
|
||
|
LST sPascalString);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListLoad(HFPB hfpbIn, LPSIPB lpsipb,
|
||
|
LSZ szFilename, BREAKER_FUNC lpfnBreakerFunc,
|
||
|
LPV lpCharTab);
|
||
|
PUBLIC ERR EXPORT_API PASCAL FAR MVStopFileBuild (HFPB hSysFile,
|
||
|
LPSIPB lpsipb, LSZ szStopfilename);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListEnumWords(LPSIPB lpsipb,
|
||
|
LST *plstWord, LONG *plWordInfo, LPVOID *ppvWordInfo);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVStopListFindWordPtr(LPSIPB lpsipb,
|
||
|
LST lstWord, LST *plstWordInList);
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Query's defines and API
|
||
|
*************************************************************************/
|
||
|
|
||
|
/* User's operators for retrieval */
|
||
|
|
||
|
#define AND_OP 0
|
||
|
#define OR_OP 1
|
||
|
#define NOT_OP 2
|
||
|
#define PHRASE_OP 3
|
||
|
#define NEAR_OP 4
|
||
|
|
||
|
#define DEF_PROX_DIST 8 // Default prox distance
|
||
|
|
||
|
// In strings with embedded font tag, this number denotes that the next byte
|
||
|
// is an index into a charmap table
|
||
|
|
||
|
#define EMBEDFONT_BYTE_TAG 3
|
||
|
#define TL_QKEY 0x8000 // Use with cDefOp to treat operators as words
|
||
|
|
||
|
/*
|
||
|
* Parser function's parameter structure:
|
||
|
*
|
||
|
* PARSE_PARMS structure
|
||
|
* This structure contains all the information necessary for
|
||
|
* parsing a line.
|
||
|
*
|
||
|
* LPCSTR lpbQuery:
|
||
|
* Pointer to buffer containing the query to be parsed
|
||
|
*
|
||
|
* EXBRKPM *pexbrkpm:
|
||
|
* External breaker parameter structure (see EXBRKPM)
|
||
|
*
|
||
|
* DWORD cbQuery:
|
||
|
* Query's buffer length
|
||
|
*/
|
||
|
|
||
|
typedef struct PARSE_PARMS /* Parameter block passed to MVQueryParse() */
|
||
|
{
|
||
|
LPCSTR lpbQuery; /* Pointer to query buffer */
|
||
|
EXBRKPM *pexbrkpm; /* External breaker param structure */
|
||
|
DWORD cbQuery; /* Query buffer's length */
|
||
|
|
||
|
/* Note: all the following fields may be gone in the future if
|
||
|
* we provide new operators to support them
|
||
|
*/
|
||
|
|
||
|
LPGROUP lpGroup; /* Group */
|
||
|
LPVOID lpOpTab; /* Operator table */
|
||
|
WORD wCompoundWord; /* NOTE(review): not documented in this header -- presumably a compound-word handling flag; confirm */
|
||
|
WORD cProxDist; /* Proximity distance (DEF_PROX_DIST is the default) */
|
||
|
WORD cDefOp; /* Default operator; TL_QKEY is used with it to treat operators as words */
|
||
|
char padding[2]; /* Explicit padding bytes */
|
||
|
} PARSE_PARMS, FAR *LPPARSE_PARMS;
|
||
|
|
||
|
PUBLIC LPQT EXPORT_API FAR PASCAL MVQueryParse(LPPARSE_PARMS, LPERRB);
|
||
|
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVQueryFree(LPQT);
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Index & Hitlist Retrieval API
|
||
|
*************************************************************************/
|
||
|
|
||
|
|
||
|
/*
|
||
|
* This is an information buffer structure that you pass to "MVHitListGetTopic"
|
||
|
* which fills in its fields. You can look at the fields not marked as
|
||
|
* "internal", and also pass it to other API functions.
|
||
|
*/
|
||
|
|
||
|
typedef struct TopicInfo // Per-topic result record; MVHitListGetTopic() takes a PTOPICINFO
|
||
|
{
|
||
|
DWORD dwTopicId; // Topic-ID associated with this hit.
|
||
|
DWORD lcHits; // Number of hits in this document.
|
||
|
union // Internal data: the two members share the same storage
|
||
|
{
|
||
|
DWORD liFirstHit; // Index in the ROCC file of the first
|
||
|
// hit in this document (internal).
|
||
|
LPV lpTopicList; // Pointer to TopicList (internal)
|
||
|
};
|
||
|
WORD wWeight; // Document-weight.
|
||
|
WORD Pad; // Padding
|
||
|
} TOPICINFO,
|
||
|
FAR *PTOPICINFO;
|
||
|
|
||
|
/*
|
||
|
* This is an information buffer structure that you pass to "MVHitListGetHit"
|
||
|
* which fills in its fields.
|
||
|
*/
|
||
|
|
||
|
typedef struct HitInfo // Per-hit record; MVHitListGetHit() takes an LPHIT
|
||
|
{
|
||
|
DWORD dwOffset; // Byte-offset associated with this hit.
|
||
|
DWORD dwFieldId; // Field-ID associated with this hit.
|
||
|
DWORD dwCount; // NOTE(review): presumably the word-count of this hit (cf. OCC.dwCount) -- confirm
|
||
|
DWORD dwLength; // Word-length associated with this hit.
|
||
|
LPVOID lpvTerm; // Pointer to a term in WORD-prefix length
|
||
|
// Unicode format, i.e. a "wide ST".
|
||
|
} HIT,
|
||
|
FAR *LPHIT;
|
||
|
|
||
|
|
||
|
// SLOP is extra bytes to handle diacritic. Each byte represents an
|
||
|
// occurrence of one diacritic. 5 of them should be more than enough to
|
||
|
// handle all diacritics in a word. This set up will allow us to
|
||
|
// simplify the checking by just doing it for RawWord only
|
||
|
|
||
|
#define SLOP 5
|
||
|
|
||
|
/* FBreakWord states */
|
||
|
|
||
|
#define SCAN_WHITE_STATE 0
|
||
|
#define SCAN_WORD_STATE 1
|
||
|
#define SCAN_NUM_STATE 2
|
||
|
#define SCAN_SEP_STATE 3
|
||
|
#define SCAN_LEADBYTE_STATE 4
|
||
|
#define SCAN_SBKANA_STATE 5
|
||
|
|
||
|
/* Other breaker functions' states */
|
||
|
|
||
|
#define INITIAL_STATE 0
|
||
|
#define COLLECTING_STATE 1
|
||
|
|
||
|
// The following defines have an impact on the speed of the breaker.
|
||
|
// The character class that appears the most should have the lowest
|
||
|
// value (eg. 1), since the compiler will generate DEC AX, JE Lab
|
||
|
// We want that class to be executed first. Since most documents have
|
||
|
// more lower case (normalized) characters, CLASS_NORM should be 1
|
||
|
|
||
|
#define NO_CLASS 0
|
||
|
#define CLASS_NORM 0x01 // The char is already normalized
|
||
|
#define CLASS_CHAR 0x02 // The char needs to be normalized
|
||
|
#define CLASS_DIGIT 0x03 // The char is a digit
|
||
|
#define CLASS_NSTRIP 0x04 // Strip from number (like comma)
|
||
|
#define CLASS_NKEEP 0x05 // Keep with number (eg. decimal point)
|
||
|
#define CLASS_STRIP 0x06 // Strip from the word (eg. apostrophe)
|
||
|
#define CLASS_TYPE 0x07 // Reserved
|
||
|
#define CLASS_CONTEXTNSTRIP 0x08 // Stripped or not depending on context
|
||
|
#define CLASS_WILDCARD 0x09 // This is a wildcard char
|
||
|
#define CLASS_LEADBYTE 0x0A // This is a DBCS lead-byte
|
||
|
#define CLASS_SBKANA 0x0B // This is a single Kana byte
|
||
|
#define CLASS_LIGATURE 0x0C // This is a ligature char
|
||
|
|
||
|
|
||
|
/* Map to extract the class for special characters */
|
||
|
#define SPECIAL_CHAR_MAP 0xFF00
|
||
|
|
||
|
#define CLASS_BULLET 0x0100
|
||
|
#define CLASS_ENDASH 0x0200
|
||
|
#define CLASS_EMDASH 0x0300
|
||
|
#define CLASS_LQUOTE 0x0400
|
||
|
#define CLASS_RQUOTE 0x0500
|
||
|
#define CLASS_LDBLQUOTE 0x0600
|
||
|
#define CLASS_RDBLQUOTE 0x0700
|
||
|
#define CLASS_TERMINATOR 0x0800 // Ignore whatever from this char on
|
||
|
// for word wheel sorting
|
||
|
|
||
|
/* Special reserved wildcard character */
|
||
|
#define WILDCARD_STAR '*'
|
||
|
#define WILDCARD_CHAR '?'
|
||
|
|
||
|
/*
|
||
|
Those fields will be imbedded in the beginning of the date, time, etc.
|
||
|
string. This ensures that only same types are compared together
|
||
|
correctly
|
||
|
*/
|
||
|
|
||
|
/* All special data types will have that byte at the beginning */
|
||
|
#define SPECIAL_TYPE 0x1
|
||
|
|
||
|
#ifdef TEST
|
||
|
#define DATE_FORMAT 0x3131
|
||
|
#define EPOCH_FORMAT 0x3231
|
||
|
#define TIME_FORMAT 0x3331
|
||
|
#define NUMBER_FORMAT 0x3431
|
||
|
#else
|
||
|
#define DATE_FORMAT 0x01
|
||
|
#define EPOCH_FORMAT 0x02
|
||
|
#define TIME_FORMAT 0x03
|
||
|
#define NUMBER_FORMAT 0x04
|
||
|
#endif
|
||
|
|
||
|
/* Sign byte of number */
|
||
|
#define NEGATIVE '1'
|
||
|
#define POSITIVE '2'
|
||
|
|
||
|
// - - - - - - - - -
|
||
|
typedef struct InternalBreakInfo // Breaker-private working state
|
||
|
{
|
||
|
HANDLE hibi; // Handle to this structure
|
||
|
LCB lcb; // Byte offset of the start of the word that's
|
||
|
// being constructed. This is equal to the
|
||
|
// user-specified offset of the start of the
|
||
|
// block being processed plus the offset into
|
||
|
// the block at which the word starts. More
|
||
|
// simply, this is the "byte offset" field of
|
||
|
// the occurrence element.
|
||
|
CB cbNormPunctLen; // When processing something that I'm calling
|
||
|
// a "number", I have to deal with characters
|
||
|
// such as ".", which I have to strip if
|
||
|
// they're at the end of the word, but have
|
||
|
// to keep if they're in the middle. This
|
||
|
// keeps track of the number of characters
|
||
|
// that I have to remove if the word ends.
|
||
|
// For instance, for the word "162..", this
|
||
|
// value will be 2 at the time the word ends,
|
||
|
// which would tell me to remove the last two
|
||
|
// characters (the "..").
|
||
|
CB cbRawPunctLen; // This works like "cbNormPunctLen", with a
|
||
|
// small difference. This field handles
|
||
|
// characters that are stripped, but which
|
||
|
// affect the "length" of the "number" being
|
||
|
// processed. An example case is the ","
|
||
|
// character. For instance, "12,345" is 6
|
||
|
// characters long, even though the "," gets
|
||
|
// stripped, but "12345," is five characters
|
||
|
// long, because the trailing comma doesn't
|
||
|
// affect the length.
|
||
|
BYTE state; // NOTE(review): presumably one of the SCAN_*_STATE / INITIAL_STATE / COLLECTING_STATE values defined above -- confirm
|
||
|
BYTE fGotType; // Flag to denote we have got the 1st byte
|
||
|
// of a 2-byte special type
|
||
|
BYTE astNormWord[CB_MAX_WORD_LEN + 1 + SLOP]; // Normalized-word buffer; SLOP adds extra bytes for diacritics
|
||
|
BYTE astRawWord[CB_MAX_WORD_LEN + 1]; // Raw-word buffer (no SLOP bytes)
|
||
|
} IBI,
|
||
|
FAR *_LPIBI;
|
||
|
|
||
|
|
||
|
PUBLIC LPIDX EXPORT_API FAR PASCAL MVIndexOpen(HANDLE, LSZ, LPERRB);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVSearchSetCallback (LPQT, LPVOID);
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVIndexClose(LPIDX);
|
||
|
|
||
|
typedef struct SearchInfo // Search parameters; MVIndexSearch() takes a PSRCHINFO
|
||
|
{
|
||
|
DWORD dwMemAllowed; // Maximum memory allowed
|
||
|
DWORD dwTopicCount; // Maximum number of topics that the user wants returned
|
||
|
DWORD Flag; // Search flags. NOTE(review): presumably the QUERYRESULT_* / *_SEARCH bits defined below -- confirm
|
||
|
DWORD dwValue; // Internal use (should be set to 0)
|
||
|
DWORD dwTopicFullCalc; // Maximum topics of dwTopicCount which are guaranteed
|
||
|
// to have fully calculated top N similarity scores.
|
||
|
LPVOID lpvIndexObjBridge; // Allows internal .c query code to indirectly call
|
||
|
// pluggable COM stemmers.
|
||
|
} SRCHINFO, FAR *PSRCHINFO;
|
||
|
|
||
|
// Parameter passed to indexSearch. This should match with medv.h
|
||
|
|
||
|
#define QUERYRESULT_RANK 0x0100 // Ranked the result. If not highest hit 1st
|
||
|
//#define QUERYRESULT_UNSORTED 0x0200 // Result topics are 1st in 1st out (in UID order)
|
||
|
#define QUERYRESULT_UIDSORT 0x0200 // Topics are returned in UID order
|
||
|
#define QUERYRESULT_IN_MEM 0x0400 // Result should be kept in mem
|
||
|
#define QUERYRESULT_GROUPCREATE 0x0800 // Create a group from the hitlist
|
||
|
#define QUERYRESULT_NORMALIZE 0x1000 // Normalize result. Short topic 1st
|
||
|
#define QUERYRESULT_LONGFIRST 0x2000 // Long topic 1st (not supported yet)
|
||
|
#define QUERYRESULT_ALPHASORT 0x4000 // Alphabetical sort (not supported yet)
|
||
|
#define QUERYRESULT_SKIPOCCINFO 0x8000 // Topic list only, no occurrence info
|
||
|
|
||
|
#define STEMMED_SEARCH 0x00010000 // Perform runtime stemming (English only)
|
||
|
#define LARGEQUERY_SEARCH 0x00020000 // Perform large query search
|
||
|
#define SIMILAR_SEARCH 0x00040000 // Perform "find similar" search:
|
||
|
// currently memory-optimized ranked boolean OR
|
||
|
#define QUERY_GETTERMS 0x00080000 // Return with each set of occurrence
|
||
|
// data a pointer to the term string
|
||
|
// that the data is associated with.
|
||
|
|
||
|
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexSearch(LPIDX, LPQT, PSRCHINFO,
|
||
|
LPGROUP, LPERRB);
|
||
|
|
||
|
#ifndef SIMILARITY
|
||
|
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (LPIDX lpidx,
|
||
|
LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, LPGROUP lpResGroup, LPVOID pCallback,
|
||
|
LPERRB lperrb);
|
||
|
#endif // SIMILARITY
|
||
|
|
||
|
PUBLIC ERR EXPORT_API PASCAL FAR MVHitListFlush (LPHL, DWORD);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetTopic(LPHL, DWORD, PTOPICINFO);
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetHit(LPHL, PTOPICINFO, DWORD, LPHIT);
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVHitListDispose(LPHL);
|
||
|
PUBLIC DWORD EXPORT_API PASCAL FAR MVHitListEntries (LPHL);
|
||
|
PUBLIC LONG EXPORT_API PASCAL FAR MVHitListMax (LPHL);
|
||
|
PUBLIC ERR EXPORT_API PASCAL FAR MVHitListGroup (LPVOID, LPHL);
|
||
|
|
||
|
PUBLIC void EXPORT_API PASCAL FAR MVGetIndexInfoLpidx(LPIDX, struct IndexInfo *);
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Character Table Retrieval API
|
||
|
*************************************************************************/
|
||
|
/*
 * Character-table routines.
 *
 * MVCharTableDispose() is declared once here; the group previously repeated
 * the identical declaration a second time, which added nothing.
 * The three loaders each return an LPCTAB (pointer to chartab).
 * NOTE(review): the LPERRB argument presumably receives error details on
 * failure -- confirm against the implementation.
 */
PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableDispose (LPCTAB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableIndexLoad(HFPB, LSZ, LPERRB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableLoad (HFPB, LPB, LPERRB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableGetDefault (LPERRB);
PUBLIC ERR EXPORT_API PASCAL FAR MVCharTableFileBuild (HFPB, LPCTAB, LSZ);
PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableSetWildcards (LPCTAB);
|
||
|
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Operator Table Index & Retrieval API
|
||
|
*************************************************************************/
|
||
|
PUBLIC LPOPTAB EXPORT_API PASCAL FAR MVOpTableLoad (LSZ, LPERRB);
|
||
|
PUBLIC VOID EXPORT_API PASCAL FAR MVOpTableDispose (LPOPTAB);
|
||
|
PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableIndexLoad(HANDLE, LSZ, LPERRB);
|
||
|
PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableGetDefault(LPERRB);
|
||
|
PUBLIC ERR EXPORT_API PASCAL FAR MVOpTableFileBuild(HFPB, LPOPTAB, LSZ);
|
||
|
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* Index API
|
||
|
*************************************************************************/
|
||
|
|
||
|
/*************************************************************************
|
||
|
INDEXINFO structure
|
||
|
This structure is used to give the indexer information on how the
|
||
|
index should be built given a certain amount of resources
|
||
|
|
||
|
DWORD dwMemSize :
|
||
|
Specify approximately how much memory the indexer is allowed to use.
|
||
|
The rule of thumb is that if more memory is allowed indexing speed
|
||
|
will be increased, if this much memory is given to the indexer by the
|
||
|
operating system. The catch is that the operating system can not
|
||
|
allocate such amount of memory for the indexer, so when some allowed
|
||
|
limit is reached, the O/S will start swapping to disk (using Virtual
|
||
|
Memory). This will slow down the indexer significantly. It is best to
|
||
|
try to do some trial/error testing to see how much memory will give
|
||
|
an optimal indexing speed.
|
||
|
If dwMemSize = 0, a minimum 4M is used
|
||
|
|
||
|
DWORD dwBlockSize:
|
||
|
Desired block size used by the index. The minimum size is 4096. By
|
||
|
varying the block size, the user can make the overall size of the index
|
||
|
smaller or larger, the retrieval speed slower or faster. Again, there
|
||
|
is no strict rule. A large block will cause slowness in comparison,
|
||
|
while a small block will cause extra seeks to be performed
|
||
|
|
||
|
DWORD Occf:
|
||
|
Occurrence field flags. Those describe what data are to be included
|
||
|
in the index. The currently supported flags are:
|
||
|
OCCF_FIELDID
|
||
|
OCCF_TOPICID
|
||
|
OCCF_COUNT
|
||
|
OCCF_OFFSET
|
||
|
OCCF_LENGTH
|
||
|
Those flags can be OR'ed together
|
||
|
|
||
|
DWORD Idxf
|
||
|
Flags to denote how the index is built in term of ranking. The only
|
||
|
supported flag is
|
||
|
IDXF_NORMALIZE
|
||
|
which tells the indexer to do normalized ranking. This causes the
|
||
|
indexer to generate a huge table proportional with the maximum
|
||
|
topic id to reside in memory at index time, and to be saved with the
|
||
|
index. This flag may cause the index's size to increase significantly,
|
||
|
especially when the topic ids are non-sequential (ie. random). Using
|
||
|
this flag with non-sequential topic ids may cause the indexer to
|
||
|
fail because of lack of memory. Normalized searches will have a tendency
|
||
|
to return short topics first
|
||
|
|
||
|
LCID lcid
|
||
|
This is used mostly for runtime stemming. Currently, runtime
|
||
|
stemming is supported for English only. Any other language doesn't
|
||
|
support runtime stemming yet
|
||
|
*************************************************************************/
|
||
|
typedef struct IndexInfo // Index build parameters; MVIndexInitiate() takes a PINDEXINFO
|
||
|
{
|
||
|
DWORD dwMemSize; // Approximate memory the indexer is allowed to use
|
||
|
DWORD dwBlockSize; // Unit block size of the index
|
||
|
DWORD Occf; // Occurrence field flags (OCCF_*, can be OR'ed together)
|
||
|
DWORD Idxf; // Various index flags (IDXF_*)
|
||
|
|
||
|
//--------------- New Members for File Version 4.0 ----------------
|
||
|
DWORD dwCodePageID; // ANSI code page no. specified at build time
|
||
|
LCID lcid; // WIN32 locale ID specified at build time
|
||
|
DWORD dwBreakerInstID; // breaker instance that was used to parse
|
||
|
// terms for the index at build time.
|
||
|
} INDEXINFO, FAR *PINDEXINFO;
|
||
|
|
||
|
// Various idxf flags. They can be OR'ed together
|
||
|
|
||
|
#define IDXF_NONE ((IDXF)0x0000) // Nothing, just do straight boolean.
|
||
|
#define IDXF_NORMALIZE ((IDXF)0x0001) // Index is normalized
|
||
|
#define IDXF_NOSLACK ((IDXF)0x0002) // Not supported yet
|
||
|
#define KEEP_TEMP_FILE ((IDXF)0x0100) // Keep flag
|
||
|
|
||
|
#ifdef OLD_LANG_INFO
|
||
|
// Various language flags
|
||
|
|
||
|
#define LANGUAGE_ENGLISH 0x00
|
||
|
#define LANGUAGE_JAPANESE 0x01
|
||
|
#define LANGUAGE_TRAD_CHINESE 0x02 // Mapped to old LANGUAGE_CHINESE
|
||
|
#define LANGUAGE_KOREAN 0x03
|
||
|
#define LANGUAGE_ANSI 0x04
|
||
|
#define LANGUAGE_SIMP_CHINESE 0x05
|
||
|
|
||
|
#define SZ_LANGUAGE_ENGLISH L"English"
|
||
|
#define SZ_LANGUAGE_JAPANESE L"Japanese"
|
||
|
#define SZ_LANGUAGE_TRAD_CHINESE L"TradChinese"
|
||
|
#define SZ_LANGUAGE_KOREAN L"Korean"
|
||
|
#define SZ_LANGUAGE_ANSI L"ANSI"
|
||
|
#define SZ_LANGUAGE_SIMP_CHINESE L"SimpChinese"
|
||
|
|
||
|
#define CSZ_LANGUAGE_ENGLISH 7 // strlen (SZ_LANGUAGE_ENGLISH)
|
||
|
#define CSZ_LANGUAGE_JAPANESE 8 // strlen (SZ_LANGUAGE_JAPANESE)
|
||
|
#define CSZ_LANGUAGE_TRAD_CHINESE 11 // strlen (SZ_LANGUAGE_TRAD_CHINESE)
|
||
|
#define CSZ_LANGUAGE_KOREAN 6 // strlen (SZ_LANGUAGE_KOREAN)
|
||
|
#define CSZ_LANGUAGE_ANSI 4 // strlen (SZ_LANGUAGE_ANSI)
|
||
|
#define CSZ_LANGUAGE_SIMP_CHINESE 11 // strlen (SZ_LANGUAGE_SIMP_CHINESE)
|
||
|
|
||
|
#endif // OLD_LANG_INFO - obsoleted by IT 4.0's use of locale ids.
|
||
|
|
||
|
|
||
|
/*
|
||
|
* Occurrence field flags. These are initially used to indicate which
|
||
|
* occurrence fields are to be indexed. Once an index is created, the
|
||
|
* flags used to create an index are stored in it. During retrieval
|
||
|
* these flags are examined in order to determine how to decode the
|
||
|
* index format. Any combination of these fields is legal
|
||
|
* OCCF_TOPICID field must be always present.
|
||
|
*
|
||
|
* OCCF_NONE
|
||
|
* Basically, this is equivalent to OCCF_TOPICID, since the indexer
|
||
|
* always turns OCCF_TOPICID on
|
||
|
*
|
||
|
* OCCF_FIELDID
|
||
|
* Save information about the field that the word belongs to. This must
|
||
|
* be set if the application is using VFLD
|
||
|
*
|
||
|
* OCCF_TOPICID
|
||
|
* Topic ID is to be saved. Basically, a topic id is just a number
|
||
|
* associated with the topic to uniquely identify it. This flag is always
|
||
|
* set by the indexer
|
||
|
*
|
||
|
* OCCF_COUNT
|
||
|
* Save information about the positions of the words relative to the
|
||
|
* first word in a topic. This flag must be set if the application wants
|
||
|
* to use NEAR or phrase operator in the search
|
||
|
*
|
||
|
* OCCF_OFFSET
|
||
|
* Save the information about the offset of the word in the topic. This
|
||
|
* is mostly used if the application wants to do search result highlighting
|
||
|
*
|
||
|
* OCCF_LENGTH
|
||
|
* Save the information about the length of the word in the topic. This
|
||
|
* is mostly used if the application wants to do search result highlighting
|
||
|
* The length of a word can be different from the real length if stemming
|
||
|
* or aliasing is used
|
||
|
*
|
||
|
* OCCF_LANGUAGE
|
||
|
* Currently ignored. This is for future multi-lingual topics support
|
||
|
*/
|
||
|
|
||
|
#define OCCF_NONE ((OCCF)0x0000) // No flags. Equivalent to OCCF_TOPICID, which the indexer always turns on.
|
||
|
#define OCCF_FIELDID ((OCCF)0x0001) // Field-ID is present. Must be set if the application uses VFLD.
|
||
|
#define OCCF_TOPICID ((OCCF)0x0002) // Topic ID is present. The indexer
|
||
|
// always sets this flag.
|
||
|
#define OCCF_COUNT ((OCCF)0x0004) // Word-count (word position in the topic) is present. Needed for NEAR and phrase searches.
|
||
|
#define OCCF_OFFSET ((OCCF)0x0008) // Byte-offset of the word in the topic is present. Mostly used for hit highlighting.
|
||
|
#define OCCF_LENGTH ((OCCF)0x0010) // Word-length is present. Mostly used for hit highlighting.
|
||
|
#define OCCF_LANGUAGE ((OCCF)0x0020) // Language. Currently ignored; reserved for multi-lingual topic support.
|
||
|
|
||
|
#define OCCF_HAVE_OCCURRENCE (OCCF_OFFSET | OCCF_COUNT) // Mask combining the byte-offset and word-count flags.
|
||
|
|
||
|
/*
|
||
|
OCC structure
|
||
|
This structure contains all the information related to a word to be
|
||
|
added to the index
|
||
|
|
||
|
DWORD dwFieldId:
|
||
|
Field Id value associated with the word. This field is added to
|
||
|
the index if OCCF_FIELDID flag is set
|
||
|
|
||
|
DWORD dwTopicID:
|
||
|
Id of the topic that the word belongs to. Used if OCCF_TOPICID is set
|
||
|
|
||
|
DWORD dwCount:
|
||
|
Starting at 0, this is the position of the word compared to the first
|
||
|
word in the topic (e.g. the 11th word in the topic). Used if OCCF_COUNT
|
||
|
is set
|
||
|
|
||
|
DWORD dwOffset:
|
||
|
Starting at 0, this is the offset of the word compared to the first
|
||
|
byte in the topic. Used if OCCF_OFFSET is set
|
||
|
|
||
|
WORD wWordLen:
|
||
|
Real length of the word (before any stemming or aliasing). Used if OCCF_LENGTH is
|
||
|
set
|
||
|
|
||
|
WORD wLanguage:
|
||
|
Currently ignored
|
||
|
|
||
|
This structure is passed to MVIndexAddWord
|
||
|
*/
|
||
|
/*
 * OCC: everything the indexer needs to know about one word occurrence.
 * Which members are actually used depends on the OCCF_* flags noted on
 * each member. The struct tag is spelled "Occurence" (sic); it is part
 * of the declared interface and is therefore left unchanged.
 */
typedef struct Occurence
|
||
|
{
|
||
|
DWORD dwFieldId; // Field ID associated with the word (OCCF_FIELDID).
|
||
|
DWORD dwTopicID; // ID of the topic the word belongs to (OCCF_TOPICID).
|
||
|
DWORD dwCount; // 0-based position of the word within the topic (OCCF_COUNT).
|
||
|
DWORD dwOffset; // 0-based byte offset of the word within the topic (OCCF_OFFSET).
|
||
|
WORD wWordLen; // Real length of the word (OCCF_LENGTH).
|
||
|
WORD wLanguage; // Language; currently ignored (not supported yet).
|
||
|
} OCC, FAR *LPOCC;
|
||
|
|
||
|
// Returns an LPIPB, the type every other MVIndex* prototype below takes.
// NOTE(review): failures are presumably reported through the LPERRB argument - confirm.
PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO, LPERRB);
|
||
|
// Receives the OCC describing a word to be added to the index.
// NOTE(review): the LST argument is presumably the word text itself - confirm.
PUBLIC ERR EXPORT_API FAR PASCAL MVIndexAddWord(LPIPB, LST, LPOCC);
|
||
|
// NOTE(review): presumably builds the index from the words added so far; the roles of
// the two HFPB arguments and the LPSTR are not visible in this header - confirm.
PUBLIC ERR EXPORT_API FAR PASCAL MVIndexBuild(HFPB, LPIPB, HFPB, LPSTR);
|
||
|
// NOTE(review): the DWORD array and the trailing DWORD look like a list of topic IDs
// and its element count - confirm against the implementation.
PUBLIC ERR EXPORT_API FAR PASCAL MVIndexTopicDelete(HFPB, LPIPB, LSZ,
|
||
|
DWORD FAR [], DWORD);
|
||
|
|
||
|
// NOTE(review): presumably updates the existing index named by the LSZ argument - confirm.
PUBLIC ERR EXPORT_API FAR PASCAL MVIndexUpdate(HFPB, LPIPB, LSZ);
|
||
|
// Same leading parameters as MVIndexUpdate, plus a DWORD array and a DWORD
// (the same trailing pair that MVIndexTopicDelete takes).
PUBLIC ERR EXPORT_API FAR PASCAL MVIndexUpdateEx(HFPB, LPIPB, LSZ, DWORD FAR [], DWORD);
|
||
|
// NOTE(review): presumably releases the LPIPB obtained from MVIndexInitiate - confirm.
PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose(LPIPB);
|
||
|
|
||
|
/*************************************************************************
|
||
|
 * Structures for internal use only
|
||
|
*************************************************************************/
|
||
|
|
||
|
|
||
|
/*****************************************************************************
|
||
|
*
|
||
|
* MatchCache Structure
|
||
|
*
|
||
|
* The layout engine expects the matches to be in sorted order and
|
||
|
* non-overlapping. Neither assumption is maintained by the search
|
||
|
* engine. So, when the layout asks for matches for a particular
|
||
|
* topic, they are sorted and combined and stored in a match cache.
|
||
|
*
|
||
|
*****************************************************************************/
|
||
|
|
||
|
/*
 * Cached matches for one topic. match[] is declared with a single element
 * while lMatches records the number of matches.
 * NOTE(review): the structure is presumably allocated with room for
 * lMatches HIT entries - confirm at the allocation site.
 */
typedef struct _MATCHCACHE
|
||
|
{
|
||
|
long lTopic; // the topic number
|
||
|
long lItem; // the item in the topic list
|
||
|
long lMatches; // the number of matches (entries in match[])
|
||
|
HIT match[1]; // array of match information; declared with one element
|
||
|
} MATCHCACHE, NEAR *PMATCHCACHE, FAR *LPMATCHCACHE;
|
||
|
|
||
|
#pragma pack() // Restore default packing; ends the pack(1) region opened at the top of this header.
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#endif //__MVSEARCH_H_
|
||
|
|