windows-nt/Source/XPSP1/NT/inetsrv/intlwb/chs/thammer.h

//+--------------------------------------------------------------------------
//
//  Copyright (C) 1994, 1995, 1996 Microsoft Corporation.  All Rights Reserved.
//
//  File:       thammer.h
//
//  This include file defines 3 exported APIs and their callbacks that export
//  word-breaking functionality for non-spaced Asian languages (Japanese, Chinese)
//
//  Summary of exports:
//      EnumSelectionOffsets - This function returns the offsets for the
//          selection chunks as specified in the Selection Profile (set at compile-time)
//      EnumSummarizationOffsets - This function returns the offsets for the
//          prefix (if any), the stem, and bound morphemes (fuzokugo).
//      EnumStemOffsets - This function returns the offsets for the stem only.
//          Offsets corresponding to any prefix or postfix characters will not
//          be returned.
//
//  History:                    pathal          Created.
//              25-Jun-97       pathal          Add TH_ERROR_INIT_FAILED
//              05-Jul-97       pathal          Add EnumSentenceOffsets, etc.
//---------------------------------------------------------------------------

// Return errors: the following error codes can be returned from any of
// T-Hammer's exported APIs (EnumSelectionOffsets, EnumSummarizationOffsets,
// and EnumStemOffsets)
//

#define TH_ERROR_SUCCESS 0
#define TH_ERROR_NOHPBS 1
#define TH_ERROR_INVALID_INPUT 2
#define TH_ERROR_INVALID_CALLBACK 3
#define TH_ERROR_INIT_FAILED 4
#define TH_ERROR_NOT_IMPLEMENTED 5

// Offset delimiter: the following code is used to delimit the end of a list of
// token offsets returned to one of the Enum* callback routines.  This is not
// an error code.

#define TH_SELECTION_INVALID_OFFSET 0xFFFFFFFF

// TOKENIZE_MODE: Begin and End HPB Modes
//
// Begin and End HPB modes signify that a hard phrase break comes before the
// first character in the string and/or follows after the last character in the string
// If these flags are not set, then the default behavior of EnumTokens is to start
// enumerating tokens to the right of the leftmost HPB, which probably won't
// be at the first character (unless it is a punctuation symbol) and to conclude
// enumeration at the rightmost HPB, which likely will not be the true end of the
// string.  So, these flags in affect force HPBs at the 0th and nth offsets, where
// n is the number of characters in the input buffer
//
// WARNNIG: Since Tokenize operates in batch mode, it assumes that the
// start and end of the input buffer are HPBs. These flags are only used for
// EnumTokens
//
#define TOKENIZE_MODE_BEGIN_HPB             0x00000001
#define TOKENIZE_MODE_END_HPB             0x00000002

// Note on HPBs:  HPB = hard phrase break.
// HPBs are statistically determined from analyzing a tagged corpora.
// Roughly, they cor-respond to places where you csn break with 100%
// precision (=confidence). Mostly this is around punctuation characters
// and certain conspicuous [case markers | character type] bigrams.


// When the Hide Punctuation mode is set in the tokenize flag parameter
// T-Hammer strips punctuation out of the Stem Offsets and Summarization Offsets
// callback
//
#define TOKENIZE_MODE_HIDE_PUNCTUATION    0x00000004

//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsetsCallback
//
//  Synopsis: client-side callback that receives a list of offsets for selection chunks
//
//  Parameters:
//      pichOffsets - pointer to first element in an array of offsets into client
//          text buffer. NOTE: callback is not allowed to stash pichChunks for
//          later processing.  pichChunks will not persist between successive
//          callbacks.  If the callback wants to use the data pointed to by pich
//          it must copy it to its own store
//      cOffsets - number of offsets passed to client (always > 1)
//      lpData - client defined data
//
// Return:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumSelectionOffsetsCallback (
//    IN CONST DWORD *pichOffsets,
//    IN DWORD cOffsets,
//    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsets
//
//  Synopsis:  This is the main entry point for tokenizing text.  Sends tokens,
//  which can either be offsets or zero delimited strings to callback.
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fBeginEndHPBMode - flag describing the callback mode  (see above),
//  pcbEnumSelectionOffsets - pointer to callback procedure handling token
//     enumeration,
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSelectionOffsets(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSummarizationOffsetsCallback
//
//  Synopsis: client-side callback that receives a list of offsets for each stem
//      in the free morpheme (jiritsugo) phrase.   Last offset is always contains
//      the complete string of bound morphemes (fuzokugo).  For example,
//      for "kaisan shite nai sou desu", offsets are returned for "kaisan" and
//      "shite nai sou desu".  So, counting the first initial offset, there are three
//      offsets.
//
//  Parameters:
//      pichOffsets - pointer to first element in an array of offsets into client
//          text buffer. NOTE: callback is not allowed to stash pichOffsets for
//          later processing.  pichOffsets will not persist between successive
//          callbacks.  If the callback wants to use the data pointed to by pich
//          it must copy it to its own store
//      cOffsets - number of offsets passed to client (always > 1)
//      lpData - client defined data
//
// Return:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumSummarizationOffsets (
//    IN CONST DWORD *pichOffsets,
//    IN DWORD cOffsets,
//    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN PCWSTR pwzPOS,
    IN PCWSTR pwzMCat,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSummarizationOffsets
//
//  Synopsis:  This is the entry point for returning offsets for tokens used
//  in summarization.   These tokens correspond to stems and bound morphemes
//  (fuzokugo) in the text.  A list of offsets (and a count) is sent to the
//  EnumSummarizationOffsets callback (see above)
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fTokenizeMode - flag describing the callback mode  (see above),
//     pEnumTokOutputProc - pointer to callback procedure handling token
//     enumeration,
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSummarizationOffsets(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
    IN LPARAM lpData);

INT
APIENTRY
EnumSummarizationOffsetsEx1(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS_EX1)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumStemOffsetsCallback
//
//  Synopsis: client-side callback that receives a zero--terminated stem per SPB
//
//  Parameters:
//      pwszStem - zero terminated stem string
//      lpData - client defined data
//
// Return:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumStemOffsetsCallback (
//    IN WCHAR *pwszStem,
//    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_STEM_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumStemOffsets
//
//  Synopsis:  This is the entry point for tokenizing stems.  Sends offsets,
//  for stems to the EnumStemOffsets callback (see above)
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fTokenizeMode - flag describing the callback mode  (see above),
//     pEnumTokOutputProc - pointer to callback procedure handling token
//     enumeration,
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumStemOffsets(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_STEM_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumStemInfoCallback
//
//  Synopsis: client-side callback that receives offsets and stem information
//
//  Parameters:
//      ichOffset - offset to first character in stem
//      cchLen - length of the stem
//      pwszPOS - string containing POS info
//      pwszMCat - string containing MCat info
//      pwszDictionaryForm - string containing Dictionary Form
//      lpData - client defined data
//
// Return:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumStemInfoCallback (
//    IN CONST DWORD ichOffset,
//    IN CONST DWORD cchLen,
//    IN PCWSTR pwszPOS,
//    IN PCWSTR pwszMCat,
//    IN PCWSTR pwszDictionaryForm,
//    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_STEM_INFO_CALLBACK)(
    IN CONST DWORD ichOffset,
    IN CONST DWORD cchLen,
    IN PCWSTR pwszPOS,
    IN PCWSTR pwszMCat,
    IN PCWSTR pwszDictionaryForm,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumStemInfo
//
//  Synopsis:  Call this routine to get information about stems.
//  For example, if you want the dictionary form, part-of-speech or
//  MCat information for a stem, then this is the API for you
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fTokenizeMode - flag describing the callback mode  (see above),
//  pcbEnumStemInfo - pointer to callback procedure handling stem info
//      enumeration
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumStemInfo(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_STEM_INFO)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSentenceOffsetsCallback
//
//  Synopsis: client-side callback that receives a list of offsets for sentence breaks
//
//  Parameters:
//      ichOffsetStart - offset to start of sentence
//      ichOffsetEnd - offset to end of sentence (includes terminating punctuation)
//      lpData - client defined data
//
// Return:
//  TRUE - to abort token enumeration
//  FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumSentenceOffsetsCallback (
//    IN DWORD ichOffsetStart,
//    IN DWORD ichOffsetEnd,
//    IN OUT LPARAM lpData);

typedef BOOL (CALLBACK * ENUM_SENTENCE_OFFSETS_CALLBACK)(
    IN DWORD ichOffsetStart,
    IN DWORD ichOffsetEnd,
    IN OUT LPARAM lpData);

//+--------------------------------------------------------------------------
//  Routine:    EnumSentenceOffsets
//
//  Synopsis:  This is the main entry point for breaking sentences.
//      Sends offsets delimiting sentences to the callback.
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  cchText - count of characters in text buffer,
//  fTokenizeMode - not used.  later this will be used to control how
//      partial sentences are handled.
//  pEnumSentenceOffsetsCallback - pointer to callback procedure handling offsets
//  lpData - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS - if the call completed successfully
//      TH_ERROR_NOHPBS - if there were no HPBs
//      TH_ERROR_INVALID_INPUT - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSentenceOffsets(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fTokenizeMode,
    IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
    IN LPARAM lpData);

typedef INT (APIENTRY *LP_ENUM_SENTENCE_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fTokenizeMode,
    IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
    IN LPARAM lpData);


//+--------------------------------------------------------------------------
//  Routine:    FEMorphCallback
//
//  Synopsis:  The callback that gets a text stream from T-Hammer.
//
//  Parameters:
//      pwszWMorphRecs - a pointer to wide character text stream,
//                   which contains mophological analyses of a given sentence
//      pvData - pointer to private data
//
//  Returns:
//      TRUE if no more analysis is needed
//---------------------------------------------------------------------------
// BOOL
// FEMorphCallback(
//    IN PWSTR pwszMorphRecs);

typedef BOOL (CALLBACK * FEMORPH_CALLBACK)(
    IN PWSTR pwszMorphRecs,
    IN VOID *pvData);

//+--------------------------------------------------------------------------
//  Routine:    FEMorph
//
//  Synopsis:  This is the entry point for NLPWIN morpheme analysis.
//  Sends a morpheme record string back to the lex callback in NLPWIN
//
//  Parameters:
//  pwszText - pointer to wide-character text buffer to be tokenized,
//  pcbFEMorphCB - pointer to callback procedure handling morph rec enumeration
//  pvData - pointer to private data
//
//  Returns:
//---------------------------------------------------------------------------
INT
APIENTRY
FEMorph(
    IN PCWSTR pwszText,
    IN FEMORPH_CALLBACK pcbFEMorphCB,
    IN VOID *pvData);

typedef INT (APIENTRY *LP_FEMORPH)(
    IN PCWSTR pwszText,
    IN FEMORPH_CALLBACK pcbFEMorphCB,
    IN VOID *pvData);