windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/inc/mvsearch.h

/*************************************************************************
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*   All typedefs and defines needed for user's retrieval                 *
*                                                                        *
*************************************************************************/

#ifndef __MVSEARCH_H_
#define __MVSEARCH_H_
#ifdef __cplusplus
extern "C" {
#endif

#pragma pack(1) // Guard against Zp problems.

#include <iterror.h>
/*************************************************************************
 *                          Basic defines.
 *************************************************************************/

/*
 *  Linkage decoration.  When _32BIT is defined both macros expand to
 *  nothing; otherwise they expand to the _export / huge keywords.
 */
#if defined(_32BIT)
#define EXPORT_API
#define HUGE
#else
#define EXPORT_API      _export
#define HUGE            huge
#endif

/*
 *  Force this header's own meaning of PRIVATE and PUBLIC.  #undef on a
 *  name that is not currently defined is a no-op, so no #ifdef guard is
 *  required around it.
 */
#undef  PRIVATE
#define PRIVATE static

#undef  PUBLIC
#define PUBLIC

/*
 * Maximum pathname length.  The whole expansion is parenthesised (as is
 * already done for CB_MAX_WORD_LEN) so that the macro behaves as a single
 * primary expression, e.g. under sizeof or next to a postfix operator.
 */
#define cbMAX_PATH              ((CB)256)

/* Maximum year's value that can be passed to MediaView's FBreakEpoch() */

#define MAX_YEAR        ((unsigned long)0xFFFFFFFF / 366)

/* Maximum word length that is accepted by MediaView's breaker. */
#define CB_MAX_WORD_LEN ((CB)1000)       // Longest legal word.


/*************************************************************************
 *                          Typedef
 *************************************************************************/

// LPB: far pointer to bytes.
// NOTE(review): #ifndef only tests macro names, so this guard skips the
// typedef only when LPB has been #define'd elsewhere; it cannot detect an
// earlier typedef of the same name.
#ifndef LPB
typedef BYTE FAR * LPB;
#endif

typedef WORD    IB;                     // Index into an array of bytes.
// CB is a DWORD when _32BIT is defined and a WORD otherwise.
#ifdef _32BIT
typedef DWORD    CB;                     // Count of bytes.
#else
typedef WORD    CB;                     // Count of bytes.
#endif
typedef DWORD   LCB;                    // Count of bytes (always a DWORD).
typedef WORD    CBIT;                   // Count of bits.
typedef WORD FAR * LPW;                 // Far pointer to a WORD.

typedef BYTE    FAR *LSZ;               // 0-terminated string far pointer
typedef BYTE    FAR *LST;               // Pascal style string far pointer
typedef void    FAR *LPV;               // Far void pointer
typedef DWORD   LFO;                    // 32-bit file offset.
typedef DWORD   LCF;                    // 32-bit file count bytes
typedef void    NEAR *NPV;              // Void near pointers.

// Opaque pointer types: every one below is an untyped (void) pointer, so
// the compiler will not catch one kind being passed where another is
// expected.
typedef NPV     NPIBI;                  // Near pointer to internal break info
typedef WORD    IDXF;                   // Index flags (see the IDXF_* values)
typedef LPV     LPIBI;                  // Far pointer to internal break info
typedef LPV     LPSIPB;                 // Stop information parameter block.

typedef LPV     LPCAT;                  // Pointer to catalog
typedef LPV     LPGROUP;                // pointer to a group.
typedef LPV     LPIDX;                  // Pointer index block.
typedef LPV     LPQT;                   // Pointer to Query tree.
typedef LPV     LPIPB;                  // Pointer to Index parameter block.
typedef LPV     LPWHEEL;                // Pointer to wheel parameter block.
typedef LPV     LPHL;                   // Pointer to hitlist block.
typedef LPV     LPCTAB;                 // Pointer to chartab
typedef LPV     LPOPTAB;                // Pointer to operator table
typedef LPV     LPBRKI;                 // Pointer to breaker info
typedef WORD    OCCF;                   // Occurrence field flags (OCCF_* values)

typedef HANDLE  GHANDLE;                // Generic handle
// NOTE(review): the name below is spelled HGPOUP in the source; presumably
// HGROUP was intended.  Left unchanged because it is part of the interface.
typedef HANDLE  HGPOUP;                 // Handle to Group list
typedef DWORD   IDGROUP;                // Group's ID

typedef GHANDLE HIDX;                   // Handle type; presumably for an index
typedef GHANDLE HFPB;                   // Handle type used for the file arguments
                                        //  (hSysFile, hfpbIn) of the API below

/*************************************************************************
 *             Word-breaker API and associated defines.
 *************************************************************************/

typedef HANDLE  HIBI;           // "Internal break info".  The individual
                                //  word breakers allocate this
/*
 *  FWORDCB
 *  Callback function needed by the MediaView breaker; it is invoked with
 *  each word the breaker produces. All the LST strings are special
 *  2-byte length preceded strings.
 *
 *  LST lstRawWord:
 *      Word as it originally appears. MediaView only uses its length,
 *      which is needed for highlighting
 *
 *  LST lstNormWord:
 *      Normalized word, which will be indexed. Normalized words are words
 *      that have been modified (stemmed, changed to lower case, etc)
 *
 *  DWORD dwOffset:
 *      Offset in the topic (or from the beginning of the buffer passed to
 *      MediaView breakers) where the word occurs
 *
 *  LPV lpUser:
 *      User's data, propagated down to the user's callback function
 *
 *  Returns an ERR code.
 */
typedef ERR     (FAR PASCAL * FWORDCB)(LST lstRawWord, LST lstNormWord,
    DWORD dwOffset, LPV lpUser);

/*  BREAKER_INIT
 *      This is the breaker's initialization routine. This routine will
 *      be called only by MediaView's Title Builder before any calls to
 *      the breaker are made. It takes no arguments and returns the
 *      breaker's internal break info pointer (LPIBI).
 */
typedef LPIBI   (FAR PASCAL * BREAKER_INIT)(VOID);

/*  BREAKER_FREE
 *      Termination routine for the breaker. This will allow the breaker
 *      to free any internal buffer used by it. It receives an LPIBI,
 *      the type returned by BREAKER_INIT.
 */
typedef void    (FAR PASCAL * BREAKER_FREE)(LPIBI);

/*
 *  Breaker function's parameter structure:
 *
 *  BRK_PARMS structure
 *      LPIBI   lpInternalBreakInfo:
 *          This points to internal information associated with a breaker
 *          (such as memory buffer, flags, etc). It is solely used by
 *          that breaker
 *
 *      BYTE FAR * lpbBuf;
 *          Buffer containing the strings to be broken into individual words
 *          This buffer is allocated by the application
 *
 *      DWORD   cbBufCount;
 *          The size of the buffer
 *
 *      DWORD   lcbBufOffset;
 *          The offset of the strings from the topic. This is needed if
 *          OCCF_OFFSET is used, since the MV breaker will return offsets
 *          of the words based on this offset
 *
 *      LPV lpvUser
 *          Anything that the application's callback function needs. The way
 *          the breaker works is that:
 *          - The application calls the breaker with some buffers to be broken
 *          into words
 *          - For each word the breaker will call the app's callback function
 *          to return the word and its associated information (length, offset)
 *
 *      FWORDCB lpfnOutWord;
 *          Pointer to application callback function
 *
 *      LPSIPB  lpStopInfoBlock;
 *          Stop word information. This contains a list of words that the
 *          application wants the breaker to ignore. This pertains to
 *          MediaView's breaker only
 *
 *      LPVOID  lpCharTab;
 *          Character table information. This pertains to MediaView's breaker
 *          only
 *
 *      WORD    fFlags;
 *          Internal flags set and used by MediaView's breaker only
 *
 *  } BRK_PARMS, FAR *LPBRK_PARMS;
 */

/*
 *  BRK_PARMS
 *      Parameter block handed to a breaker function (see BREAKER_FUNC).
 *      The struct is defined first and the two public type names are
 *      introduced afterwards; the layout is unchanged.
 */
struct BRK_PARMS
{
    LPIBI       lpInternalBreakInfo;    // Breaker-private state
    BYTE FAR   *lpbBuf;                 // Text to be broken into words
    DWORD       cbBufCount;             // Size of lpbBuf
    DWORD       lcbBufOffset;           // Offset of the text within the topic
    LPV         lpvUser;                // Passed through to lpfnOutWord
    FWORDCB     lpfnOutWord;            // Per-word application callback
    LPSIPB      lpStopInfoBlock;        // Stop word info (MediaView breaker only)
    LPVOID      lpCharTab;              // Character table (MediaView breaker only)
    WORD        fFlags;                 // Internal flags (MediaView breaker only)
    WORD        Pad;                    // Padding to make DWORD align
};

typedef struct BRK_PARMS      BRK_PARMS;
typedef struct BRK_PARMS FAR *LPBRK_PARMS;

/*  BREAKER_FUNC
 *      Signature shared by MediaView's breaker entry points such as
 *      FBreakWords(): takes a far pointer to a BRK_PARMS block and
 *      returns an ERR code.
 */
typedef ERR     (FAR PASCAL * BREAKER_FUNC) (LPBRK_PARMS);

/*
 *  BRKLIST
 *      For internal use only.
 */
struct BreakList
{
    HANDLE          hnd;            // Handle to this structure
    HANDLE          hLib;           // NOTE(review): presumably the handle of the
                                    //  library that supplies lpfnBreakFunc - confirm
    BREAKER_FUNC    lpfnBreakFunc;  // Breaker entry point
    LPSIPB          lpStopListInfo; // Stop list information
    LPVOID          lpCharTab;      // Character table
};

typedef struct BreakList      BRKLIST;
typedef struct BreakList FAR *LPBRKLIST;


/*************************************************************************
 *
 *  The following breaker functions are internal functions.
 *  They can serve as template prototypes for user-supplied functions:
 *  BreakerInitiate matches BREAKER_INIT, BreakerFree matches BREAKER_FREE
 *  and the FBreak* functions match BREAKER_FUNC.
 *************************************************************************/
PUBLIC  LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void);
PUBLIC  void EXPORT_API FAR PASCAL BreakerFree(LPIBI);
PUBLIC  ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS);
PUBLIC  ERR EXPORT_API FAR PASCAL FBreakNumber(LPBRK_PARMS);
PUBLIC  ERR EXPORT_API FAR PASCAL FBreakDate(LPBRK_PARMS);
PUBLIC  ERR EXPORT_API FAR PASCAL FBreakTime(LPBRK_PARMS);
PUBLIC  ERR EXPORT_API FAR PASCAL FBreakEpoch(LPBRK_PARMS);
// This exists only to enable MVJK to link statically.
// We must have the same function names for the static build.
// Unlike the prototypes above, it is declared without EXPORT_API.
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms);


// EXBRKPM - (EX)ternal (BR)ea(K)er (P)ara(M)eter structure.
// Old .c search code fills this in and passes it to ExtBreakText() in
// order to configure and call the new COM breakers.  The breaker control
// members deliberately mirror those of BRK_PARMS as far as possible; the
// BRK_PARMS members that have no counterpart here are now internal to the
// COM breaker implementation.
struct _exbrkpm
{
	// ---- Supplied by the .c caller of the breaker ----

	// Reg. text, number, date, etc.
	DWORD	dwBreakWordType;
	// Text buffer.
	LPBYTE	lpbBuf;
	// No. of chars in buffer.
	DWORD	cbBufCount;
	// Caller data that gets passed through to *lpfnOutWord.
	LPVOID	lpvUser;
	// Pointer to word callback function.
	FWORDCB	lpfnOutWord;
	// Breaker flags.
	WORD	fFlags;

	// ---- Owned by the index COM object ----

	// Must not be modified by the .c caller.
	LPVOID	lpvIndexObjBridge;
};

typedef struct _exbrkpm  EXBRKPM;
typedef struct _exbrkpm *PEXBRKPM;

// Bridge entry points through which the .c search code reaches the COM
// implementation.  All of them return an HRESULT; apart from
// ExtBreakText(), which receives the bridge inside EXBRKPM, each takes the
// lpvIndexObjBridge pointer as its first argument.

// Break the text described by *pexbrkpm into words.
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtBreakText(PEXBRKPM pexbrkpm);
// NOTE(review): presumably stems lpbRawWord into lpbStemWord - confirm
// against the implementation.
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtStemWord(LPVOID lpvIndexObjBridge,
									LPBYTE lpbStemWord, LPBYTE lpbRawWord);
// Stop word lookup for lpbStopWord.
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtLookupStopWord(
							LPVOID lpvIndexObjBridge, LPBYTE lpbStopWord);
// ppvTermHit is an out parameter (pointer to a pointer).
PUBLIC HRESULT EXPORT_API FAR PASCAL ExtAddQueryResultTerm(
			LPVOID lpvIndexObjBridge, LPBYTE lpbTermHit, LPVOID *ppvTermHit);


/*************************************************************************
 *  @doc    API EXTERNAL
 *  @func   ERR FAR PASCAL | fInterrupt |
 *      Function to support interrupt (cancel) feature.
 *  @parm   LPVOID | lpV |
 *      Parameter used by the callback interrupt function
 *  @rdesc  ERR_SUCCESS if there is no interrupt, else ERR_INTERRUPT
 *************************************************************************/
typedef ERR    (FAR PASCAL *INTERRUPT_FUNC)(LPVOID);

/*************************************************************************
 *  @doc    API EXTERNAL
 *  @func   VOID FAR PASCAL | fStatus |
 *      Function to support status messaging feature.
 *  @parm   LPSTR | lpStr |
 *      Message to be displayed
 *  @rdesc  None; the callback type is declared as returning VOID.
 *************************************************************************/
typedef VOID (FAR PASCAL *STATUS_FUNC)(LPSTR);


// Size of breaker's state info struct
#define BREAKERBUFFERSIZE       1024

/*
 *       Breaker Table Constants
 */
// Maximum number of breakers.
#define MAXNUMBRKRS             16
// Maximum size of breaker line in |SYSTEM.
#define MAXBRKRLEN              1024

/*
 *  Numeric constants kept behind their own guard so that they are only
 *  defined once even if another header supplies the same set.
 *  (The spelling "Ninty" is part of the public macro name.)
 */
#if !defined(ISBU_IR_CONSTS)
#define ISBU_IR_CONSTS

#define cHundredMillion         ((float) 100000000.0)
#define cVerySmallWt            ((float) 0.02)
#define cNintyFiveMillion       95000000
#define cTFThreshold            4096

#endif // ISBU_IR_CONSTS

/*************************************************************************
 *                     Stop list retrieval API.
 *************************************************************************/


// STOPLKUP: pointer to a stop word lookup function.  It takes a stop
// information block and a Pascal style string and returns an ERR code;
// MVStopListLookup() below has this signature.
typedef ERR (FAR PASCAL * STOPLKUP)(LPSIPB, LST);

// Create a stop information block.  wTabSize is presumably the size of
// the internal table (TODO confirm); error details go to *lperrb.
PUBLIC  LPSIPB EXPORT_API FAR PASCAL MVStopListInitiate (WORD wTabSize,
    LPERRB lperrb);
// Load the stop list named szStopFilename using the hSysFile handle.
PUBLIC  ERR EXPORT_API FAR PASCAL MVStopListIndexLoad(HFPB hSysFile,
    LPSIPB lpsipb, LSZ szStopFilename);
// Look a Pascal style string up in the stop list.
PUBLIC  ERR EXPORT_API FAR PASCAL MVStopListLookup(LPSIPB lpsipb,
    LST sPascalString);
// Dispose of a stop information block.
PUBLIC  void EXPORT_API FAR PASCAL MVStopListDispose(LPSIPB lpsipb);
// Add a Pascal style string to the stop list.
PUBLIC  ERR EXPORT_API FAR PASCAL MVStopListAddWord(LPSIPB lpsipb,
    LST sPascalString);
// Load a stop list from szFilename; the caller supplies the breaker
// function and character table to be used.
PUBLIC  ERR EXPORT_API FAR PASCAL MVStopListLoad(HFPB hfpbIn, LPSIPB lpsipb,
    LSZ szFilename, BREAKER_FUNC lpfnBreakerFunc,
    LPV lpCharTab);
// Build the stop file named szStopfilename from the stop list.
PUBLIC  ERR EXPORT_API PASCAL FAR MVStopFileBuild (HFPB hSysFile,
    LPSIPB lpsipb, LSZ szStopfilename);
// Enumerate the words of the stop list.  All arguments after lpsipb are
// pointers through which results can be returned; the iteration protocol
// is not visible from this header.
PUBLIC	ERR EXPORT_API FAR PASCAL MVStopListEnumWords(LPSIPB lpsipb,
						LST *plstWord, LONG *plWordInfo, LPVOID *ppvWordInfo);
// Find lstWord in the stop list; *plstWordInList is an out parameter.
PUBLIC	ERR EXPORT_API FAR PASCAL MVStopListFindWordPtr(LPSIPB lpsipb,
											LST lstWord, LST *plstWordInList);

/*************************************************************************
 *                      Query's defines and API
 *************************************************************************/

/* User's operators for retrieval */

#define AND_OP      0
#define OR_OP       1
#define NOT_OP      2
#define PHRASE_OP   3
#define NEAR_OP     4

#define DEF_PROX_DIST   8               // Default prox(imity) distance

// In strings with embedded font tag, this number denotes that the next byte
// is an index into a charmap table

#define EMBEDFONT_BYTE_TAG              3
// Flag bit for PARSE_PARMS.cDefOp.  It lies above the operator codes
// listed above (0..4), so it can be combined with any of them.
#define TL_QKEY 0x8000    // Use with cDefOp to treat operators as words

/*
 *  Parser function's parameter structure:
 *
 *  PARSE_PARMS structure
 *  This structure contains all the information necessary for
 *  parsing a query; it is the argument of MVQueryParse().
 *
 *  LPCSTR lpbQuery:
 *      Pointer to buffer containing the query to be parsed
 *
 *  EXBRKPM *pexbrkpm:
 *      External breaker parameter structure
 *
 *  DWORD cbQuery:
 *      Query's buffer length
 *
 *  Note: all the members after cbQuery may be gone in the future if
 *  new operators are provided to support them.
 *
 *  LPGROUP lpGroup:
 *      Group
 *
 *  LPVOID lpOpTab:
 *      Operator table
 *
 *  WORD wCompoundWord:
 *      (not documented in the original source)
 *
 *  WORD cProxDist:
 *      Proximity distance
 *
 *  WORD cDefOp:
 *      Default operator
 *
 *  char padding[2]:
 *      Explicit padding bytes
 */
struct PARSE_PARMS
{
    LPCSTR      lpbQuery;
    EXBRKPM    *pexbrkpm;
    DWORD       cbQuery;

    LPGROUP     lpGroup;
    LPVOID      lpOpTab;
    WORD        wCompoundWord;
    WORD        cProxDist;
    WORD        cDefOp;
    char        padding[2];
};

typedef struct PARSE_PARMS      PARSE_PARMS;
typedef struct PARSE_PARMS FAR *LPPARSE_PARMS;

// Parse the query described by a PARSE_PARMS block and return a pointer
// to the resulting query tree (LPQT).  The LPERRB argument is an error
// block pointer.
PUBLIC LPQT EXPORT_API FAR PASCAL MVQueryParse(LPPARSE_PARMS, LPERRB);

// Free a query tree (LPQT, the type returned by MVQueryParse()).
PUBLIC  void EXPORT_API FAR PASCAL MVQueryFree(LPQT);


/*************************************************************************
 *               Index & Hitlist Retrieval API
 *************************************************************************/


/*
 *      Information buffer that the caller passes to "HitListGetTopic",
 *      which fills in its fields.  Fields not marked "internal" may be
 *      inspected, and the buffer can be passed on to other API functions.
 */

struct TopicInfo
{
    // Topic-ID associated with this hit.
    DWORD   dwTopicId;
    // Number of hits in this document.
    DWORD   lcHits;
    // Both members of this unnamed union are internal.
    union
    {
        // Index in the ROCC file of the first hit in this document.
        DWORD   liFirstHit;
        // Pointer to TopicList.
        LPV     lpTopicList;
    };
    // Document-weight.
    WORD    wWeight;
    WORD    Pad;
};

typedef struct TopicInfo      TOPICINFO;
typedef struct TopicInfo FAR *PTOPICINFO;

/*
 *      Information buffer that the caller passes to "HitListGetHit",
 *      which fills in its fields.
 */

struct HitInfo
{
    // Byte-offset associated with this hit.
    DWORD   dwOffset;
    // Field-ID associated with this hit.
    DWORD   dwFieldId;
    // NOTE(review): undocumented in the original; presumably the
    // word-count position of the hit (compare OCC.dwCount) - confirm.
    DWORD   dwCount;
    // Word-length associated with this hit.
    DWORD   dwLength;
    // Pointer to a term in WORD-prefix length Unicode format,
    // i.e. a "wide ST".
	LPVOID	lpvTerm;
};

typedef struct HitInfo      HIT;
typedef struct HitInfo FAR *LPHIT;


//  SLOP is the number of extra bytes reserved to handle diacritics. Each
//  byte represents an occurrence of one diacritic. 5 of them should be more
//  than enough to handle all diacritics in a word. This setup allows us to
//  simplify the length checking by doing it for RawWord only (the
//  normalized word buffer in IBI is declared SLOP bytes larger than the
//  raw word buffer).

#define SLOP    5

/* FBreakWord states */

#define SCAN_WHITE_STATE    0
#define SCAN_WORD_STATE     1
#define SCAN_NUM_STATE      2
#define SCAN_SEP_STATE      3
#define SCAN_LEADBYTE_STATE 4
#define SCAN_SBKANA_STATE   5

/* Other breaker functions' states (the values overlap the SCAN_* ones) */

#define INITIAL_STATE       0
#define COLLECTING_STATE    1

//  The following defines have an impact on the speed of the breaker.
//  The character class that appears the most should have the lowest
//  value (eg. 1), since the compiler will generate DEC AX, JE Lab
//  We want that class to be executed first. Since most documents have
//  more lower case (normalized) characters, CLASS_NORM should be 1
//
//  These basic classes are consecutive values in the low byte; the
//  "special character" classes further down use the high byte.

#define NO_CLASS        0
#define CLASS_NORM      0x01       // The char is already normalized
#define CLASS_CHAR      0x02       // The char needs to be normalized
#define CLASS_DIGIT     0x03       // The char is a digit
#define CLASS_NSTRIP    0x04       // Strip from number (like comma)
#define CLASS_NKEEP     0x05       // Keep with number (eg. decimal point)
#define CLASS_STRIP     0x06       // Strip from the word (eg. apostrophe)
#define CLASS_TYPE      0x07       // Reserved
#define CLASS_CONTEXTNSTRIP 0x08   // Stripped or not depending on context
#define CLASS_WILDCARD  0x09       // This is a wildcard char
#define CLASS_LEADBYTE  0x0A       // This is a DBCS lead-byte
#define CLASS_SBKANA    0x0B       // This is a single Kana byte
#define CLASS_LIGATURE  0x0C       // This is a ligature char


/* Map (bit mask) to extract the class for special characters: the
   special classes below occupy only the bits covered by this mask */
#define  SPECIAL_CHAR_MAP 0xFF00

#define CLASS_BULLET    0x0100
#define CLASS_ENDASH    0x0200
#define CLASS_EMDASH    0x0300
#define CLASS_LQUOTE    0x0400
#define CLASS_RQUOTE    0x0500
#define CLASS_LDBLQUOTE 0x0600
#define CLASS_RDBLQUOTE 0x0700
#define CLASS_TERMINATOR 0x0800     // Ignore whatever from this char on
                                    // for word wheel sorting

/* Reserved wildcard characters */
#define WILDCARD_STAR   '*'
#define WILDCARD_CHAR   '?'

/*
    The values below are embedded at the beginning of date, time, etc.
    strings.  This ensures that only values of the same type are
    compared with each other.
 */

/* Every special data type starts with this byte */
#define SPECIAL_TYPE    0x1

/*
    Type tags.  Regular builds use the values 1..4; when TEST is defined
    two-byte values are used instead (0x31 is the ASCII code of '1').
 */
#ifndef TEST
#define DATE_FORMAT     0x01
#define EPOCH_FORMAT    0x02
#define TIME_FORMAT     0x03
#define NUMBER_FORMAT   0x04
#else
#define DATE_FORMAT     0x3131
#define EPOCH_FORMAT    0x3231
#define TIME_FORMAT     0x3331
#define NUMBER_FORMAT   0x3431
#endif

/* Sign byte of a number */
#define NEGATIVE    '1'
#define POSITIVE    '2'

//  -   -   -   -   -   -   -   -   -
/*
 *  IBI - internal state kept by a breaker between calls.
 */
struct InternalBreakInfo
{
    // Handle to this structure.
    HANDLE  hibi;

    // Byte offset of the start of the word that's being constructed.
    // This is equal to the user-specified offset of the start of the
    // block being processed plus the offset into the block at which the
    // word starts.  More simply, this is the "byte offset" field of the
    // occurrence element.
    LCB     lcb;

    // When processing a "number", characters such as "." have to be
    // stripped if they are at the end of the word but kept if they are in
    // the middle.  This counts the characters that must be removed if the
    // word ends now.  For instance, for the word "162.." the value is 2
    // at the time the word ends, meaning the last two characters (the
    // "..") are to be removed.
    CB      cbNormPunctLen;

    // Works like cbNormPunctLen, with a small difference: it handles
    // characters that are stripped but which still affect the "length" of
    // the "number" being processed, such as ",".  For instance, "12,345"
    // is 6 characters long even though the "," gets stripped, but
    // "12345," is five characters long because the trailing comma doesn't
    // affect the length.
    CB      cbRawPunctLen;

    // Current breaker state (see the *_STATE values).
    BYTE    state;

    // Flag to denote that the 1st byte of a 2-byte special type has been
    // seen.
    BYTE    fGotType;

    // Normalized word; SLOP extra bytes are reserved for diacritics.
    BYTE    astNormWord[CB_MAX_WORD_LEN + 1 + SLOP];

    // Raw word.
    BYTE    astRawWord[CB_MAX_WORD_LEN + 1];
};

typedef struct InternalBreakInfo      IBI;
typedef struct InternalBreakInfo FAR *_LPIBI;


// Open an index and return a pointer to its index block (LPIDX).  The
// LSZ argument is a 0-terminated string (presumably the index name -
// confirm) and the LPERRB argument is an error block pointer.
PUBLIC  LPIDX EXPORT_API FAR PASCAL MVIndexOpen(HANDLE, LSZ, LPERRB);
// Attach a callback to a query tree.  The callback is passed as an
// untyped LPVOID, so its expected signature is not visible here.
PUBLIC  ERR  EXPORT_API FAR PASCAL MVSearchSetCallback (LPQT, LPVOID);
// Close an index (takes the LPIDX type returned by MVIndexOpen()).
PUBLIC  void EXPORT_API FAR PASCAL MVIndexClose(LPIDX);

/*
 *  SRCHINFO - search options handed to MVIndexSearch() and
 *  MVIndexFindSimilar().
 */
struct SearchInfo
{
    // Maximum memory allowed.
    DWORD   dwMemAllowed;
    // Maximum topics that the user wants to return.
    DWORD   dwTopicCount;
    // Search flags.
    DWORD   Flag;
    // Internal use (should be set to 0).
    DWORD   dwValue;
    // Maximum topics of dwTopicCount which are guaranteed to have fully
    // calculated top N similarity scores.
    DWORD   dwTopicFullCalc;
    // Allows internal .c query code to indirectly call pluggable COM
    // stemmers.
    LPVOID  lpvIndexObjBridge;
};

typedef struct SearchInfo      SRCHINFO;
typedef struct SearchInfo FAR *PSRCHINFO;

// Parameter passed to indexSearch. This should match with medv.h
// NOTE(review): presumably these bits are combined into SRCHINFO.Flag
// ("Search flags") - confirm.  The QUERYRESULT_* values fit in 16 bits;
// the *_SEARCH / QUERY_* values below them use bits 16 and up.

#define QUERYRESULT_RANK        0x0100 // Ranked the result. If not highest hit 1st
//#define QUERYRESULT_UNSORTED    0x0200 // Result topics are 1st in 1st out (in UID order)
#define QUERYRESULT_UIDSORT     0x0200 // Topics are returned in UID order
#define QUERYRESULT_IN_MEM      0x0400 // Result should be kept in mem
#define QUERYRESULT_GROUPCREATE 0x0800 // Create a group from the hitlist
#define QUERYRESULT_NORMALIZE   0x1000 // Normalize result. Short topic 1st
#define QUERYRESULT_LONGFIRST   0x2000 // Long topic 1st (not supported yet)
#define QUERYRESULT_ALPHASORT   0x4000 // Alphabetical sort (not supported yet)
#define QUERYRESULT_SKIPOCCINFO 0x8000 // Topic list only, no occurrence info

#define STEMMED_SEARCH      0x00010000 // Perform runtime stemming (English only)
#define LARGEQUERY_SEARCH	0x00020000 // Perform large query search
#define SIMILAR_SEARCH		0x00040000 // Perform "find similar" search:
									   // currently memory-optimized ranked boolean OR
#define QUERY_GETTERMS		0x00080000	// Return with each set of occurrence
										// data a pointer to the term string
										// that the data is associated with.

// Run the query tree (LPQT) against the index (LPIDX) with the options in
// *PSRCHINFO and return a hitlist pointer (LPHL).  The LPGROUP and LPERRB
// arguments are a group pointer and an error block pointer.
PUBLIC  LPHL EXPORT_API FAR PASCAL MVIndexSearch(LPIDX, LPQT, PSRCHINFO,
    LPGROUP, LPERRB);

// "Find similar" search.  It takes the unparsed query (PARSE_PARMS)
// instead of a query tree and returns a hitlist pointer.
// NOTE(review): the prototype is visible only when SIMILARITY is NOT
// defined - confirm that this polarity is intended.
#ifndef SIMILARITY
PUBLIC LPHL EXPORT_API FAR PASCAL MVIndexFindSimilar (LPIDX lpidx,
  LPPARSE_PARMS lpParms, PSRCHINFO pSrchInfo, LPGROUP lpResGroup, LPVOID pCallback,
  LPERRB lperrb);
#endif // SIMILARITY

/*
 *  Hit list API.  Every function operates on the hitlist pointer (LPHL)
 *  returned by a search.
 */
PUBLIC ERR EXPORT_API PASCAL FAR MVHitListFlush (LPHL, DWORD);
/* Fills in the caller's TOPICINFO buffer for the topic selected by the
   DWORD argument. */
PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetTopic(LPHL, DWORD, PTOPICINFO);
/* Fills in the caller's HIT buffer for one hit of the given topic. */
PUBLIC ERR EXPORT_API FAR PASCAL MVHitListGetHit(LPHL, PTOPICINFO, DWORD, LPHIT);
PUBLIC void EXPORT_API FAR PASCAL MVHitListDispose(LPHL);
PUBLIC DWORD EXPORT_API PASCAL FAR MVHitListEntries (LPHL);
PUBLIC LONG EXPORT_API PASCAL FAR MVHitListMax (LPHL);
PUBLIC ERR EXPORT_API PASCAL FAR MVHitListGroup (LPVOID, LPHL);

/*
 *  struct IndexInfo is only defined further down in this header.  Declare
 *  the tag at file scope before using it in a prototype: in C, a struct
 *  tag that first appears inside a parameter list has prototype scope, so
 *  the parameter would name a different, incompatible type from the real
 *  struct IndexInfo (INDEXINFO) and callers would get type mismatch
 *  diagnostics.  The forward declaration is harmless if the tag is
 *  already known.
 */
struct IndexInfo;
PUBLIC void EXPORT_API PASCAL FAR MVGetIndexInfoLpidx(LPIDX, struct IndexInfo *);

/*************************************************************************
 *                     Character Table Retrieval API
 *************************************************************************/
/* Each function is declared exactly once (the original list declared
   MVCharTableDispose twice).  LPCTAB is the opaque character table
   pointer returned by the Load/GetDefault functions; the LPERRB arguments
   are error block pointers. */
PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableDispose (LPCTAB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableIndexLoad(HFPB, LSZ, LPERRB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableLoad (HFPB, LPB, LPERRB);
PUBLIC LPCTAB EXPORT_API FAR PASCAL MVCharTableGetDefault (LPERRB);
PUBLIC ERR EXPORT_API PASCAL FAR MVCharTableFileBuild (HFPB, LPCTAB, LSZ);
PUBLIC VOID EXPORT_API FAR PASCAL MVCharTableSetWildcards (LPCTAB);


/*************************************************************************
 *                 Operator Table Index & Retrieval API
 *************************************************************************/
// LPOPTAB is the opaque operator table pointer.  The three functions that
// return one take an error block pointer (LPERRB) as their last argument.
PUBLIC LPOPTAB EXPORT_API PASCAL FAR MVOpTableLoad (LSZ, LPERRB);
PUBLIC VOID EXPORT_API PASCAL FAR MVOpTableDispose (LPOPTAB);
PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableIndexLoad(HANDLE, LSZ, LPERRB);
PUBLIC LPOPTAB EXPORT_API FAR PASCAL MVOpTableGetDefault(LPERRB);
// Build an operator table file; the LSZ argument is a 0-terminated string
// (presumably the file name - confirm).
PUBLIC ERR EXPORT_API PASCAL FAR MVOpTableFileBuild(HFPB, LPOPTAB, LSZ);


/*************************************************************************
 *                          Index API
 *************************************************************************/

/*************************************************************************
    INDEXINFO structure
    This structure tells the indexer how the index should be built given
    a certain amount of resources.

    DWORD   dwMemSize:
        Approximately how much memory the indexer is allowed to use.
        As a rule of thumb, allowing more memory increases indexing
        speed, provided the operating system can really give that much
        memory to the indexer. The catch is that when the operating
        system cannot allocate such an amount, it will start swapping to
        disk (using Virtual Memory) once some limit is reached, which
        slows the indexer down significantly. It is best to do some
        trial and error testing to find the amount of memory that gives
        an optimal indexing speed.
        If dwMemSize = 0, a minimum of 4M is used.

    DWORD   dwBlockSize:
        Desired block size used by the index. The minimum size is 4096.
        By varying the block size, the user can make the overall size of
        the index smaller or larger and the retrieval speed slower or
        faster. Again, there is no strict rule. A large block will cause
        slowness in comparison, while a small block will cause extra
        seeks to be performed.

    DWORD   Occf:
        Occurrence field flags. These describe what data are to be
        included in the index. The currently supported flags are:
            OCCF_FIELDID
            OCCF_TOPICID
            OCCF_COUNT
            OCCF_OFFSET
            OCCF_LENGTH
        These flags can be OR'ed together.

    DWORD   Idxf:
        Flags that denote how the index is built in terms of ranking.
        The only supported flag is
            IDXF_NORMALIZE
        which tells the indexer to do normalized ranking. This causes the
        indexer to generate a huge table, proportional to the maximum
        topic id, that resides in memory at index time and is saved with
        the index. This flag may cause the index's size to increase
        significantly, especially when the topic ids are non-sequential
        (ie. random). Using this flag with non-sequential topic ids may
        cause the indexer to fail because of lack of memory. Normalized
        searches have a tendency to return short topics first.

    ---- New members for file version 4.0 ----

    DWORD   dwCodePageID:
        ANSI code page number specified at build time.

    LCID    lcid:
        WIN32 locale ID specified at build time. This is used mostly for
        runtime stemming. Currently, runtime stemming is supported for
        English only. Other languages don't support runtime stemming yet.

    DWORD   dwBreakerInstID:
        Breaker instance that was used to parse terms for the index at
        build time.
 *************************************************************************/
struct IndexInfo
{
    DWORD   dwMemSize;          // Memory allocated for indexing to use
    DWORD   dwBlockSize;        // Unit block size of the index
    DWORD   Occf;               // Occurrence field flags
    DWORD   Idxf;               // Various index flags

    DWORD   dwCodePageID;       // ANSI code page no.
    LCID    lcid;               // WIN32 locale ID
    DWORD   dwBreakerInstID;    // Breaker instance used at build time
};

typedef struct IndexInfo      INDEXINFO;
typedef struct IndexInfo FAR *PINDEXINFO;

// Various idxf flags. They can be OR'ed together.  Each constant is cast
// to IDXF (a WORD), whereas the INDEXINFO.Idxf member that holds them is
// declared as a DWORD.

#define     IDXF_NONE       ((IDXF)0x0000)   // Nothing, just do straight boolean.
#define     IDXF_NORMALIZE  ((IDXF)0x0001)   // Index is normalized
#define     IDXF_NOSLACK    ((IDXF)0x0002)   // Not supported yet
#define     KEEP_TEMP_FILE  ((IDXF)0x0100)   // Keep flag

#ifdef OLD_LANG_INFO
// Various language flags

#define LANGUAGE_ENGLISH        0x00
#define LANGUAGE_JAPANESE       0x01
#define LANGUAGE_TRAD_CHINESE   0x02 // Mapped to old LANGUAGE_CHINESE
#define LANGUAGE_KOREAN         0x03
#define LANGUAGE_ANSI           0x04
#define LANGUAGE_SIMP_CHINESE   0x05

// Language names.  These are wide-character (L"...") string literals.

#define SZ_LANGUAGE_ENGLISH         L"English"
#define SZ_LANGUAGE_JAPANESE        L"Japanese"
#define SZ_LANGUAGE_TRAD_CHINESE    L"TradChinese"
#define SZ_LANGUAGE_KOREAN          L"Korean"
#define SZ_LANGUAGE_ANSI            L"ANSI"
#define SZ_LANGUAGE_SIMP_CHINESE    L"SimpChinese"

// Length of each name above in characters, not bytes, excluding the
// terminator.  These must be kept in step with the literals by hand.

#define CSZ_LANGUAGE_ENGLISH        7   // chars in SZ_LANGUAGE_ENGLISH
#define CSZ_LANGUAGE_JAPANESE       8   // chars in SZ_LANGUAGE_JAPANESE
#define CSZ_LANGUAGE_TRAD_CHINESE   11  // chars in SZ_LANGUAGE_TRAD_CHINESE
#define CSZ_LANGUAGE_KOREAN         6   // chars in SZ_LANGUAGE_KOREAN
#define CSZ_LANGUAGE_ANSI           4   // chars in SZ_LANGUAGE_ANSI
#define CSZ_LANGUAGE_SIMP_CHINESE   11  // chars in SZ_LANGUAGE_SIMP_CHINESE

#endif	// OLD_LANG_INFO - obsoleted by IT 4.0's use of locale ids.


/*
 *  Occurrence field flags.  These are initially used to indicate which
 *  occurrence fields are to be indexed.  Once an index is created, the
 *  flags used to create it are stored in it.  During retrieval
 *  these flags are examined in order to determine how to decode the
 *  index format.  Any combination of these fields is legal, but the
 *  OCCF_TOPICID field must always be present.
 *
 *  OCCF_NONE
 *      Basically, this is equivalent to OCCF_TOPICID, since the indexer
 *      always turn OCCF_TOPICID on
 *
 *  OCCF_FIELDID
 *      Save information about the field that the word belongs to. This must
 *      be set if the application is using VFLD
 *
 *  OCCF_TOPICID
 *      Topic ID is to be saved. Basically, a topic id is just a number
 *      associated with the topic to uniquely identify it. This flag is always
 *      set by the indexer
 *
 *  OCCF_COUNT
 *      Save information about the positions of the words relative to the
 *      first word in a topic. This flag must be set if the application wants
 *      to use NEAR or phrase operator in the search
 *
 *  OCCF_OFFSET
 *      Save the information about the offset of the word in the topic. This
 *      is mostly used if the application wants to do search result highlighting
 *
 *  OCCF_LENGTH
 *      Save the information about the length of the word in the topic. This
 *      is mostly used if the application wants to do search result highlighting
 *      The length of a word can be different from the real length if stemming
 *      or aliasing is used
 *
 *  OCCF_LANGUAGE
 *      Currently ignored. This is for future multi-lingual topics support
 */

// Occurrence field flags.  Each is a distinct bit cast to OCCF (a WORD),
// so they can be OR'ed together.
#define OCCF_NONE       ((OCCF)0x0000)    // Blank.
#define OCCF_FIELDID    ((OCCF)0x0001)    // Field-ID is present.
#define OCCF_TOPICID    ((OCCF)0x0002)    // Topic ID is present.  This should
                                          //  always be set.
#define OCCF_COUNT      ((OCCF)0x0004)    // Word-count is present.
#define OCCF_OFFSET     ((OCCF)0x0008)    // Byte-offset is present.
#define OCCF_LENGTH     ((OCCF)0x0010)    // Word-length is present.
#define OCCF_LANGUAGE   ((OCCF)0x0020)    // Language (not supported yet)

// Mask of the flags that carry per-occurrence position data (byte offset
// or word count).  A nonzero (flags & OCCF_HAVE_OCCURRENCE) means at
// least one of the two is present.
#define OCCF_HAVE_OCCURRENCE (OCCF_OFFSET | OCCF_COUNT)

/*
    OCC structure
    This structure contains all the information related to a word to be
    added to the index

    DWORD   dwFieldId:
        Field Id value associated with the word. This field is added to
        the index if OCCF_FIELDID flag is set

    DWORD   dwTopicID:
        Id of the topic that the word belongs to. Used if OCCF_TOPICID is set

    DWORD   dwCount:
        Starting at 0, this is the position of the word compared to the first
        word in the topic (eg.the 11th word in the topic). Used if OCCF_COUNT
        is set

    DWORD   dwOffset:
        Starting at 0, this is the offset of the word compared to the first
        byte in the topic. Used if OCCF_OFFSET is set

    WORD    wWordLen:
        Real length of the word (unstemmed or alias). Used if OCCF_LENGTH is
        set

    WORD    wLanguage:
        Currently ignored

    This structure is passed to MVIndexAddWord
 */
/*
 *  OCC - everything the indexer is told about one occurrence of a word.
 *  Passed to MVIndexAddWord().  (The struct tag keeps its historical
 *  spelling "Occurence".)
 */
struct Occurence
{
    // Field-ID; used if OCCF_FIELDID is set.
    DWORD   dwFieldId;
    // Topic ID; used if OCCF_TOPICID is set.
    DWORD   dwTopicID;
    // Word-count, starting at 0; used if OCCF_COUNT is set.
    DWORD   dwCount;
    // Byte-offset, starting at 0; used if OCCF_OFFSET is set.
    DWORD   dwOffset;
    // Word-length; used if OCCF_LENGTH is set.
    WORD    wWordLen;
    // Language (not supported yet).
    WORD    wLanguage;
};

typedef struct Occurence      OCC;
typedef struct Occurence FAR *LPOCC;

// Index building API.  LPIPB is the opaque index parameter block returned
// by MVIndexInitiate() and consumed by all the other functions.

// Start an index build with the settings in *PINDEXINFO.
PUBLIC  LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO, LPERRB);
// Add one word (a Pascal style string) together with its occurrence data.
PUBLIC  ERR EXPORT_API FAR PASCAL MVIndexAddWord(LPIPB, LST, LPOCC);
PUBLIC  ERR EXPORT_API FAR PASCAL MVIndexBuild(HFPB, LPIPB, HFPB, LPSTR);
// The DWORD array and the DWORD after it are presumably the list of topic
// ids to delete and its element count - TODO confirm.
PUBLIC  ERR EXPORT_API FAR PASCAL MVIndexTopicDelete(HFPB, LPIPB, LSZ,
    DWORD FAR [], DWORD);

PUBLIC  ERR EXPORT_API FAR PASCAL MVIndexUpdate(HFPB, LPIPB, LSZ);
// Same leading arguments as MVIndexUpdate() plus a DWORD array and a DWORD.
PUBLIC  ERR EXPORT_API FAR PASCAL MVIndexUpdateEx(HFPB, LPIPB, LSZ, DWORD FAR [], DWORD);
// Dispose of an index parameter block.
PUBLIC  void EXPORT_API FAR PASCAL MVIndexDispose(LPIPB);

/*************************************************************************
 *              Structures for internal uses only
 *************************************************************************/


/*****************************************************************************
*
*       MatchCache Structure
*
*   The layout engine expects the matches to be in sorted order and
*   non-overlapping.  Neither assumption is maintained by the search
*   engine.  So, when the layout asks for matches for a particular
*   topic, they are sorted and combined and stored in a match cache.
*
*****************************************************************************/

/*
 *  MATCHCACHE - matches for one topic.
 *  match is declared with a single element.
 *  NOTE(review): presumably the structure is allocated with room for
 *  lMatches entries - confirm against the allocating code.
 */
struct _MATCHCACHE
{
    // The topic number.
    long    lTopic;
    // The item in the topic list.
    long    lItem;
    // The number of matches.
    long    lMatches;
    // Array of match information.
    HIT     match[1];
};

typedef struct _MATCHCACHE       MATCHCACHE;
typedef struct _MATCHCACHE NEAR *PMATCHCACHE;
typedef struct _MATCHCACHE FAR  *LPMATCHCACHE;

#pragma pack()  // Guard against Zp problems.

#ifdef __cplusplus
}
#endif

#endif //__MVSEARCH_H_