//+--------------------------------------------------------------------------
//
//  Copyright (C) 1994, Microsoft Corporation.  All Rights Reserved.
//
//  File:       thammerp.h
//
//  History:    07-Jun-95   PatHal      Created.
//
//---------------------------------------------------------------------------

#ifndef _THAMMERP_H_
#define  _THAMMERP_H_

// Mode flags for the fMode parameter of the EnumTokens / Tokenize APIs.
// They control the content of the pichOffsets, pwszStem, and pwszToken
// strings sent to the callback procedure.  Note: the default is the fastest.

// "Selection Chunks": used for Word Smart Selection.  The offsets sent to the
// callback do not necessarily correspond to morpheme boundaries; the chunk
// boundary offsets are encoded in the Japanese morphology that exists in
// T-Hammer as a resource.
#define TOKENIZE_MODE_SELECTION_OFFSETS         0x00000010

// "Stem Chunks": used for Auto Summarization.  The offsets sent to the
// callback correspond to stems (and one containing all bound morphemes).
// LATER!  How should prefixes be handled?  If they are removed from the
// output, the last offset of one call will no longer equal the first offset
// of the next call.
#define TOKENIZE_MODE_STEM_OFFSETS              0x00000020

// "Summarization Stems": used for Auto Summarization.  The output is the
// "stem" portion of the Bunsetsu Phrase; e.g. for Japanese "oyogu" the form
// that is output is the stem "oyo".
#define TOKENIZE_MODE_SUMMARIZATION_OFFSETS     0x00000040

// "Break Compounds": special mode instructing T-Hammer to break compound
// nouns in the stem.  Use it together with "Summarization Stems".  Off by
// default, i.e. compounds are not broken unless this flag is set.
#define TOKENIZE_MODE_BREAK_COMPOUNDS           0x00000080

// "ChBreak Unknowns": special mode controlling tokenization of unknown
// strings.  When set, T-Hammer is forced to output unknown stems on a
// per-character basis.  When clear (the default), an unknown string (for
// example, a proper name) is output as a single contiguous chunk.
#define TOKENIZE_MODE_CHBREAK_UNKNOWNS          0x00000100

// "Stem Info": used for Dictionary Form and for obtaining POS and MCat info
// for all.  Each callback contains one stem.  Prefixes are ignored.
#define TOKENIZE_MODE_STEM_INFO                 0x00000200

// "Sentence Offsets": returns sentence breaks only - no further analysis is
// performed.  Useful for segmenting corpora before adding them to a test
// database (e.g. Babble).
#define TOKENIZE_MODE_SENTENCE_OFFSETS          0x00000400

// "Best Break": the default.  Only the single most probable breaks are
// output.
#define TOKENIZE_MODE_BEST_BREAK                0x00001000

// "Alternate Breaks": tells the tokenizer to output all possible breaks.
#define TOKENIZE_MODE_ALTERNATE_BREAKS          0x00020000

// "Bunsetsu Phrases": also a default.  Outputs phrase breaks.
// Warning: for a word-tagged corpus, use "Break Morpheme".
#define TOKENIZE_MODE_BUNSETSU_PHRASES          0x00040000

// "Best Tags": also a default.  Outputs only the single most probable tag
// for a given segmentation.  Break ambiguity and tag ambiguity are orthogonal
// attributes of the output string, so the flags can be OR'ed together.
#define TOKENIZE_MODE_BEST_TAGS                 0x00080000

// "Alternate Tags": tells the tokenizer to output all possible taggings for
// each break.  Warning: for some languages there are many more tag
// alternatives than break alternatives, so the output will be quite verbose.
#define TOKENIZE_MODE_ALTERNATE_TAGS            0x00100000

// "Debug" (TOKENIZE_MODE_VERBOSE): tells the tokenizer to output morpheme
// label information for alternates and tag strings for morphemes.
#define TOKENIZE_MODE_VERBOSE                   0x00200000


// "Output DebugLog": only meaningful for debug builds.  With this flag set,
// a debug T-Hammer outputs verbose tracing information for the morphology
// and stem analysis to a separate file named debug.utf.  CAUTION: that file
// is typically 500 times the size of the source corpus, so be forewarned.
#define TOKENIZE_MODE_OUTPUT_DEBUGLOG           0x00800000

// "Disable PL": the Primary Lexicon is disabled.  This mode is for debug
// purposes only; retail versions don't have it.
#define TOKENIZE_MODE_DISABLE_PL                0x01000000

// Instrumentation switches for collecting scoring statistics.
// The first switch is for Postfix Score Info.
#define TOKENIZE_MODE_SCOREINFO_POSTFIX         0x02000000
// The second switch is for SPB Scoring Info.
#define TOKENIZE_MODE_SCOREINFO_SPB             0x04000000

// Output morpheme records for the FE-Morph API.
#define TOKENIZE_MODE_MORPHEME_RECORDS          0x10000000

// Output multiple selection analyses for the tagging tool.
#define TOKENIZE_MODE_SELECTION_OFFSETS_EX      0x20000000

// Output summarization offsets with POS for spelling variant (conversion).
#define TOKENIZE_MODE_SUMMARIZATION_OFFSETS_EX1 0x40000000

// Output words in their dictionary form.
#define TOKENIZE_MODE_DICTIONARY_FORM           0x80000000

// NOTE(review): the comment on this flag used to repeat the
// DICTIONARY_FORM text.  Judging by the name it presumably asks the
// tokenizer to output morphemes separately - confirm against the
// implementation.  Its value fills the gap between TOKENIZE_MODE_VERBOSE
// (0x00200000) and TOKENIZE_MODE_OUTPUT_DEBUGLOG (0x00800000).
#define TOKENIZE_MODE_SEPARATE_MORPHEMES        0x00400000

// The EnumPhrases and batch-processing Tokenize APIs are only used in the debug build.


//+--------------------------------------------------------------------------
// defines and typedefs for "Record" subsystem
//---------------------------------------------------------------------------

// Attribute indices (presumably the values carried in TH_ATTRVAL::iAttr and
// accepted by GetAttr's iAttr argument - confirm).  IATTR_NIL is index 0.
#define IATTR_NIL       0
#define IATTR_SPB       1
#define IATTR_STEM      2
#define IATTR_POS       3
#define IATTR_MCAT      4
#define IATTR_FT        5
#define IATTR_LT        6

// Handle type used for record attribute values (see TH_ATTRVAL::hVal).
typedef UINT TH_HANDLE;

// The null handle value.  The replacement list is fully parenthesized so the
// cast always expands as a single operand, whatever operators surround the
// macro at the point of use.
#define TH_NULL_HANDLE ((TH_HANDLE)0)

// One attribute/value pair of a TH_RECORD: an attribute index together with
// the handle of its value.
// NOTE(review): iAttr is presumably one of the IATTR_* indices - confirm.
typedef struct tagTH_ATTRVAL
{
    UINT      iAttr; // attribute index
    TH_HANDLE hVal;  // value handle
} TH_ATTRVAL, *PTH_ATTRVAL;

// A record: a variable-length array of attribute/value pairs plus a
// variable-length array of DWORD bit values.
// NOTE(review): cAttrMax / cBitMax are presumably the allocated capacities
// and cAttrVals / cBitVals the number of entries in use - confirm.
typedef struct tagTH_RECORD
{
    UINT cAttrMax;
    UINT cAttrVals;
    TH_ATTRVAL *pAttrVals; // variable length attribute value array

    UINT cBitMax;
    UINT cBitVals;
    DWORD *pBitVals;  // variable length bit value array
} TH_RECORD, *PTH_RECORD;

// Value type tags.  Numbering starts at 1; TH_TYPE_MAX is one past the last
// defined type.
typedef enum tagTH_TYPE
{
    TH_TYPE_INT = 1,
    TH_TYPE_STR = 2,
    TH_TYPE_REG = 3,
    // add more here
    TH_TYPE_MAX
} TH_TYPE;

// Split a 32-bit value into its high word (TYPEOF) and low word (INDEXOF).
// NOTE(review): presumably applied to a TH_HANDLE, with the high word
// holding a TH_TYPE tag and the low word an index - confirm.
#define TYPEOF(x)  HIWORD(x)
#define INDEXOF(x) LOWORD(x)

// pcai - 6/18/97 Makes it clear for MCat's
//
// MCAT is a one-byte type; SV_INFO uses it for its "index of MCat" field.
typedef BYTE MCAT;
// Length limits for the SV routines.  SV_WORD_IREAD_MAX sizes the
// SV_INFO::aiwRead reading-index array and is defined equal to
// SV_WORD_LEN_MAX.
#define SV_WORD_LEN_MAX     0x10
#define SV_WORD_IREAD_MAX   SV_WORD_LEN_MAX


//+--------------------------------------------------------------------------
//  Routine:    EnumPhrasesCallback
//
//  Synopsis: Sends delimited output (tokens) to test app callback procedure
//
//  Parameters:
//      pwszToken - pointer to wide character token string,
//      fTokenType - flag describing the types of tag in pwszToken (the
//                   TOKEN_TYPE_* values defined later in this file),
//      lpData - client defined data (presumably the lpData value that was
//               handed to EnumPhrases - confirm).
//
// Returns:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumPhrasesCallback (
//     PWSTR pwszToken,
//     DWORD fTokenType,
//     LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_PHRASES_CALLBACK)(
    IN PWSTR pwszToken,
    IN DWORD fTokenType,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumPhrases (corresponds to mode 4 of tokenize test harness)
//
//  Synopsis:  This is the entry point for tokenizing phrases.  Sends tokenized
//  phrases which can either be offsets or zero-delimited strings to the callback
//  (ENUM_PHRASES_CALLBACK, defined above)
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fBeginEndHPBMode - flag describing the callback mode  (see above),
//     NOTE(review): this parameter used to be documented under the name
//     fTokenizeMode; confirm which flag values it actually accepts.
//  pcbEnumPhrases - pointer to callback procedure handling token
//     enumeration,
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumPhrases(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_PHRASES_CALLBACK pcbEnumPhrases,
    IN LPARAM lpData);

// Function-pointer type with the same signature as EnumPhrases.
typedef INT (APIENTRY *LP_ENUM_PHRASES)(
   IN PCWSTR pwszText,
   IN DWORD cchText,
   IN DWORD fBeginEndHPBMode,
   IN ENUM_PHRASES_CALLBACK pcbEnumPhrases,
   IN LPARAM lpData);

// T-Hammer uses the following values to set the fTokenType parameter when
// calling back to EnumTokOutputProc.  The Tokenize test app uses this type
// information to control the comparison to the re-tokenized corpus as well
// as to format the output in general.

// "Phrase": the end of the pwszToken string marks a phrase boundary.  This
// is the default.
#define TOKEN_TYPE_PHRASE       0x01

// "Morpheme": an intra-phrase morpheme boundary (including the stem).
#define TOKEN_TYPE_MORPHEME     0x02

// "Alternate": the current pwszToken string is an alternate (primary tokens
// are sent before alternates).
#define TOKEN_TYPE_ALTERNATE    0x04

// "Hard Break": an unambiguous text boundary.  Note that between punctuation
// types the output is either all alternate or all non-alternate.  Any
// bitwise OR combination of the following types is possible.
#define TOKEN_TYPE_HARDBREAK    0x08

// "Label": the token should not be used to compare to the test corpus, but
// should be output parenthetically (or stored as the morpheme name), for
// example with enclosing parens.
#define TOKEN_TYPE_LABEL        0x10

// "Stem": this morpheme is part of the head, which corresponds to a
// "jiritsugo" for Japanese.  This is used for coloring in the tagtool.
#define TOKEN_TYPE_STEM         0x20

//+--------------------------------------------------------------------------
//  Routine:    Tokenize
//
//  Synopsis: Internal word-breaker entry point for executing tokenization.
//  Returns array of delimited offsets in pichBreaks
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fTokenizeMode - tokenize mode flags (presumably the TOKENIZE_MODE_*
//       values defined earlier in this file - confirm),
//  pichBreaks - pointer to return buffer, which is filled with delimiter
//       (breaks) offset information,
//  cBreaks - on input, size of the pichBreaks buffer; on return, the number
//       of breaks actually used
//
//  Returns:
//       TH_ERROR_SUCCESS - if the call completed successfully
//       TH_ERROR_INVALID_INPUT - if the input buffer was bad
//       TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//           NOTE(review): this entry point takes no callback - confirm
//           whether this code can really be returned.
//
//  Note: Tokenize will never fail with NOHPBs, since it assumes that
//  the beginning and ends are HPBs
//
//  Notes:
//      Like lstrlen, this function try/excepts on the input buffer and
//      returns FALSE when an exception involving an invalid memory
//      dereference occurs.
//      NOTE(review): the declared return type is INT and the values listed
//      above are TH_ERROR_* codes, so "FALSE" is ambiguous - confirm the
//      value actually returned in that case.
//
//  Open Issue:
//  1.  Do we need to change the name of this API?  "Tokenize" is a generic
//       name - maybe we should save it for a more general-purpose API.
//---------------------------------------------------------------------------
INT
APIENTRY
Tokenize(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fTokenizeMode,
    OUT PDWORD pichBreaks,
    IN OUT PDWORD cBreaks);

typedef DWORD (APIENTRY *LP_TOKENIZE)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fTokenizeMode,
    OUT PDWORD pichBreaks,
    IN OUT PDWORD cBreaks);

//+--------------------------------------------------------------------------
//  Routine:    EnumSummarizationOffsetsEx
//
//  Temporary private entry point to overload Summarization and get back the
//  number of cch processed.  Please refer to EnumSummarizationOffsets (thammer.h)
//  for details
//
//  pcchTextProcessed is the extra parameter through which that character
//  count is passed back.
//  NOTE(review): ENUM_SUMMARIZATION_OFFSETS_CALLBACK is not declared in this
//  header; presumably thammer.h must be included first - confirm.
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSummarizationOffsetsEx(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsetsExCallback
//
//  Synopsis: same as EnumSelectionOffsetsCallback with an added parameter
//  that allows multiple analyses to be sent back to client
//
//  Parameters:
//      ...
//      fInfo - dword bit mask that contains info on whether an analysis is primary
//              and/or spb initial (SELN_OFFSETS_INFO_* bits below)
//              NOTE(review): the text says "spb initial" but the flag is
//              named SELN_OFFSETS_INFO_SPB_END - confirm which is meant.
//      ...
//---------------------------------------------------------------------------
// BOOL
// EnumSelectionOffsetsExCallback (
//    IN CONST DWORD *pichOffsets,
//    IN DWORD cOffsets,
//    IN DWORD fInfo,
//    IN OUT LPARAM lpData);

// Bits of the callback's fInfo mask.
#define SELN_OFFSETS_INFO_PRIMARY    0x00000001
#define SELN_OFFSETS_INFO_SPB_END    0x00000002

typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_EX_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN CONST DWORD fInfo,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsetsEx
//
//  Synopsis:  Same as EnumSelectionOffsets, but takes an "extended" callback
//   (ENUM_SELECTION_OFFSETS_EX_CALLBACK; see above for details)
//
//  NOTE(review): EnumSelectionOffsets itself is not declared in this header;
//  presumably it lives in thammer.h - confirm.
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSelectionOffsetsEx(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx,
    IN LPARAM lpData);

// Function-pointer type with the same signature as EnumSelectionOffsetsEx.
typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS_EX)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:  EnumSPBRecordsCallback
//
//  Synopsis: Callback type taken by EnumSPBRecords; it receives an array of
//      TH_RECORDs together with a score.
//
//  Parameters:
//      pRec - points to an array of TH_RECORDs
//      cRec - number of TH_RECORD structs in the pRec[] array
//      dwFlags - flag bits; SPBRECS_SENTEDGE is defined below
//                (NOTE(review): presumably it marks a sentence edge - confirm)
//      iScoreIPL - well-formedness score of the given sentence.
//      pvData - points to client defined data
//
//  Returns: BOOL.  NOTE(review): the meaning of the return value is not
//      documented here - confirm against the implementation.
//---------------------------------------------------------------------------
// BOOL
// EnumSPBRecordsCallback(
//    IN PTH_RECORD pRec,
//    IN DWORD cRec,
//    IN DWORD dwFlags,
//    IN DWORD iScoreIPL,
//    IN PVOID pvData);

#define SPBRECS_SENTEDGE 0x00000001

typedef BOOL (CALLBACK * ENUM_SPB_RECORD_CALLBACK)(
    IN PTH_RECORD pRec,
    IN DWORD cRec,
    IN DWORD dwFlags,
    IN DWORD iScoreIPL,
    IN PVOID pvData);

//+--------------------------------------------------------------------------
//  Routine:  EnumSPBRecords
//
//  Synopsis: Analyzes a sentence and reports TH_RECORDs through the supplied
//      callback.
//
//  Parameters:
//      pwszText - points to a sentence to analyze
//      fMode - mode flags; see the inline note on the parameter
//      pcbEnumSPBRecordsCB - callback function pointer.
//      pvData - points to client defined data
//
//  Returns: INT.  NOTE(review): the return values are not documented here;
//      presumably TH_ERROR_* codes like the other entry points - confirm.
//---------------------------------------------------------------------------
INT APIENTRY
EnumSPBRecords(
    IN PCWSTR pwszText,
    IN DWORD fMode,               // for TOKENIZE_MODE_DICTIONARY_FORM
    IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB,
    IN PVOID pvData);

// Function-pointer type with the same signature as EnumSPBRecords.
typedef INT (APIENTRY *LP_ENUM_SPB_RECORDS)(
    IN PCWSTR pwszText,
    IN DWORD fMode,               // for TOKENIZE_MODE_DICTIONARY_FORM
    IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB,
    IN PVOID pvData);


// Takes a value handle and returns a wide string pointer (presumably the
// string value the handle refers to - confirm).
// NOTE(review): the prototype carries no APIENTRY while LP_GET_STRING_VAL
// does; unless the build's default calling convention is the same as
// APIENTRY the two disagree - confirm.
PCWSTR
GetStringVal(
    TH_HANDLE hVal);

typedef PCWSTR (APIENTRY *LP_GET_STRING_VAL)(
    TH_HANDLE hVal);

// Takes a value handle and returns a DWORD (presumably the integer value
// the handle refers to - confirm).
// NOTE(review): the prototype carries no APIENTRY while LP_GET_INTEGER_VAL
// does; unless the build's default calling convention is the same as
// APIENTRY the two disagree - confirm.
DWORD
GetIntegerVal(
    TH_HANDLE hVal);

typedef DWORD (APIENTRY *LP_GET_INTEGER_VAL)(
    TH_HANDLE hVal);

// Takes a record and an attribute index and returns a value handle
// (presumably the handle stored for that attribute, iAttr being one of the
// IATTR_* indices - confirm).
// NOTE(review): "const PTH_RECORD" makes the pointer itself const, not the
// TH_RECORD it points to, so this signature does not promise that the record
// is left unmodified.
// NOTE(review): the prototype carries no APIENTRY while LP_GET_ATTR does;
// unless the build's default calling convention is the same as APIENTRY the
// two disagree - confirm.
TH_HANDLE
GetAttr(
    const PTH_RECORD pRec,
    UINT iAttr);

typedef TH_HANDLE (APIENTRY *LP_GET_ATTR) (
    const PTH_RECORD pRec,
    UINT iAttr);

//+--------------------------------------------------------------------------
//  Routine:  FxCallback
//
//  Synopsis: Callback type taken by Fx (its pfnFxCallback parameter).
//
//  Parameters:
//      iFilter   - filter index (NOTE(review): presumably an index into the
//                  filter array given to Fx - confirm)
//      pwzFilter - wide string for the filter
//      cRec      - number of records (presumably the length of pRec - confirm)
//      pRec      - pointer to TH_RECORD(s)
//      pvData    - client data pointer (presumably the pvData given to Fx)
//
//  Returns: BOOL; the meaning is not documented here.
//---------------------------------------------------------------------------
// BOOL WINAPI
// FxCallback(
//    DWORD iFilter,
//    WCHAR *pwzFilter,
//    DWORD cRec,
//    TH_RECORD *pRec,
//    VOID *pvData);
//
typedef BOOL (WINAPI *LP_FXCB)(
    IN DWORD iFilter,
    IN WCHAR *pwzFilter,
    DWORD cRec,
    TH_RECORD *pRec,
    IN VOID *pvData);

//+--------------------------------------------------------------------------
//  Routine:  Fx
//
//  Synopsis: Takes an array of filters, a phrase and an LP_FXCB callback.
//      NOTE(review): the behaviour is not documented in this header;
//      presumably the filters are applied to the phrase and matches are
//      reported through the callback - confirm.
//
//  Parameters:
//      ppvFilter     - array of filter pointers
//      cFilter       - count for ppvFilter (presumably its length - confirm)
//      pwzPhrase     - wide string phrase
//      pfnFxCallback - callback of type LP_FXCB
//      pvData        - client data pointer
//
//  Returns: BOOL; the meaning is not documented here.
//---------------------------------------------------------------------------
BOOL WINAPI
Fx(
    IN PVOID *ppvFilter,
    IN DWORD cFilter,
    IN PCWSTR pwzPhrase,
    IN LP_FXCB pfnFxCallback,
    IN PVOID pvData);

// Function-pointer type with the same signature as Fx.
typedef BOOL (WINAPI *LP_FX)(
    IN PVOID *ppvFilter,
    IN DWORD cFilter,
    IN PCWSTR pwzPhrase,
    IN LP_FXCB pfnFxCallback,
    IN PVOID pvData);

// SV-related structs and functions

//+--------------------------------------------------------------------------
//  Structure:  SV_INFO
//
//  Synopsis:   This structure is used by the SVAPI functions.
//
//  The three bit-fields (18 + 6 + 8 bits) add up to 32 bits.
//---------------------------------------------------------------------------
typedef struct _SVINFO
{
    unsigned sid   : 18;        // sense id
    unsigned svid  : 6;         // id for spelling variant
    unsigned cRead : 8;         // # of elements in the reading chain for this sid
    BYTE bIPL;                  // IPL of this spelling variant
    MCAT mcat;                  // index of MCat

    // array of reading indices for the sid
    // if aiwRead[i] < READING_BASE, it represents a kana
    // otherwise, (aiwRead[i] - READING_BASE) is the
    // index into the reading table
    // (READING_BASE is not defined in this header)
    WORD aiwRead[SV_WORD_IREAD_MAX];

} SV_INFO;

//+--------------------------------------------------------------------------
//  Routine:  SV_FindSid
//
//  Fill SV_INFO structure by searching for pwzWord in the SV-lexicon.
//  This routine is called in order to normalize/convert/reconvert
//  a word to its matching sense id(s).  This routine returns one or more
//  SV_INFO records, each being a match to the word.
//
//  Parameters: pwzWord = pointer to word for which an sid is needed
//              pwzMcat = MCat string.
//                        If pwzMcat is not valid, search for the best sid
//                        in all MCat's.
//              asvi    = array of SV_INFO for receiving sid, svid
//                        and reading indices
//                          (must be pre-allocated by the caller)
//              csviMax = Max # of matches desired.
//                        SV_FINDSID_GET_FIRST: only the first match
//                        SV_FINDSID_GET_BEST: only the best match (lowest IPL)
//                        Otherwise, the first csviMax matches.
//
//  Returns:    SV_FINDSID_RET_NONE - no matches are found
//              otherwise returns the number of matches returned in asvi
//              NOTE(review): SV_FINDSID_ERROR is also defined in this header
//              among the return values; presumably returned on failure -
//              confirm.
//
//---------------------------------------------------------------------------
DWORD APIENTRY
SV_FindSid(
        IN CONST WCHAR *pwzWord,
        IN CONST WCHAR *pwzMcat,
    IN OUT SV_INFO *asvi,
    IN DWORD csviMax);

// Function-pointer type with the same signature as SV_FindSid.
typedef DWORD (APIENTRY *LP_SV_FINDSID) (
        IN CONST WCHAR *pwzWord,
        IN CONST WCHAR *pwzMcat,
        IN OUT SV_INFO *asvi,
    IN DWORD csviMax);

//+--------------------------------------------------------------------------
//  Routine:  SV_GetOrtho
//
//  Get sv orthography given SV_INFO structure.
//
//  Parameters: psvi     = pointer to the SV_INFO to look up
//                          (must contain meaningful information)
//                          NOTE(review): this was previously described as
//                          "for receiving return information" although the
//                          parameter is marked IN - confirm.
//              pwzOrtho = pointer to word for receiving the sv's orthography
//                          (must be pre-allocated by the caller)
//                          NOTE(review): the required buffer size is not
//                          stated; presumably at least SV_WORD_LEN_MAX
//                          characters - confirm.
//
//  Returns:    TRUE - successful
//              FALSE - unsuccessful
//---------------------------------------------------------------------------
BOOL APIENTRY
SV_GetOrtho(
        IN SV_INFO *psvi,
        IN OUT WCHAR *pwzOrtho);

// Function-pointer type with the same signature as SV_GetOrtho.
typedef BOOL (APIENTRY *LP_SV_GETORTHO) (
        IN SV_INFO *psvi,
        IN OUT WCHAR *pwzOrtho);

//+--------------------------------------------------------------------------
//  Routine:  TurnOn_FindSVStems
//
//  Synopsis:   Turn on the flag for using FindSVStems().
//
//  Parameters: none
//
//  Returns:    none
//---------------------------------------------------------------------------
BOOL APIENTRY SV_EnableFindSVStems();

typedef VOID (APIENTRY *LP_SV_ENABLE_FINDSVSTEMS) ();

// Spelling variant ids (presumably values for SV_INFO::svid - confirm).
#define SVID_ALL_KANJI          0
#define SVID_ALL_KANA           1

// Special values for the csviMax argument of SV_FindSid.
#define SV_FINDSID_GET_FIRST    1           // only the first match
#define SV_FINDSID_GET_BEST     0xFFFFFFFF  // only the best match (lowest IPL)

// SV_FindSid's return values
#define SV_FINDSID_RET_NONE     0           // no matches are found
#define SV_FINDSID_ERROR        0xFFFFFFFF

#endif // _THAMMERP_H_