windows-nt/Source/XPSP1/NT/inetsrv/intlwb/chs/thammer.h

474 lines
17 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
//+--------------------------------------------------------------------------
//
// Copyright (C) 1994, 1995, 1996 Microsoft Corporation. All Rights Reserved.
//
// File: thammer.h
//
// This include file defines 3 exported APIs and their callbacks that export
// word-breaking functionality for non-spaced Asian languages (Japanese, Chinese)
//
// Summary of exports:
// EnumSelectionOffsets - This function returns the offsets for the
// selection chunks as specified in the Selection Profile (set at compile-time)
// EnumSummarizationOffsets - This function returns the offsets for the
// prefix (if any), the stem, and bound morphemes (fuzokugo).
// EnumStemOffsets - This function returns the offsets for the stem only.
// Offsets corresponding to any prefix or postfix characters will not
// be returned.
//
// History: pathal Created.
// 25-Jun-97 pathal Add TH_ERROR_INIT_FAILED
// 05-Jul-97 pathal Add EnumSentenceOffsets, etc.
//---------------------------------------------------------------------------
// Return errors: status codes handed back by T-Hammer's exported APIs.
// The original author names EnumSelectionOffsets, EnumSummarizationOffsets
// and EnumStemOffsets; the "Returns" sections of the other Enum* routines
// declared in this header list the same codes.
//
#define TH_ERROR_SUCCESS            0   // the call completed successfully
#define TH_ERROR_NOHPBS             1   // there were no HPBs (hard phrase breaks)
#define TH_ERROR_INVALID_INPUT      2   // the input buffer was bad
#define TH_ERROR_INVALID_CALLBACK   3   // the input callback was bad
#define TH_ERROR_INIT_FAILED        4   // NOTE(review): presumably initialization failed; not described in this header
#define TH_ERROR_NOT_IMPLEMENTED    5   // NOTE(review): presumably the request is not implemented; not described in this header
// Offset delimiter: marks the end of a list of token offsets handed to one
// of the Enum* callback routines. It is a sentinel value, not an error code.
#define TH_SELECTION_INVALID_OFFSET     0xFFFFFFFF
// TOKENIZE_MODE: Begin and End HPB modes
//
// Setting the Begin and/or End HPB mode signifies that a hard phrase break
// (HPB) comes before the first character in the string and/or follows the
// last character in the string. In effect, these flags force HPBs at the
// 0th and nth offsets, where n is the number of characters in the input
// buffer.
//
// When the flags are not set, the default behavior of EnumTokens is to start
// enumerating tokens to the right of the leftmost HPB - which probably won't
// be at the first character (unless it is a punctuation symbol) - and to
// conclude enumeration at the rightmost HPB, which likely will not be the
// true end of the string.
//
// WARNING: Since Tokenize operates in batch mode, it assumes that the start
// and end of the input buffer are HPBs. These flags are only used for
// EnumTokens.
//
#define TOKENIZE_MODE_BEGIN_HPB         0x00000001
#define TOKENIZE_MODE_END_HPB           0x00000002
// Note on HPBs: HPB = hard phrase break.
// HPBs are statistically determined from analyzing a tagged corpus.
// Roughly, they correspond to places where you can break with 100%
// precision (= confidence). Mostly this is around punctuation characters
// and certain conspicuous [case markers | character type] bigrams.
//
// Hide Punctuation mode: when this bit is set in the tokenize flag
// parameter, T-Hammer strips punctuation out of what is passed to the
// Stem Offsets and Summarization Offsets callbacks.
//
#define TOKENIZE_MODE_HIDE_PUNCTUATION  0x00000004
//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsetsCallback
//
//  Synopsis:   Client-side callback that receives a list of offsets for
//      selection chunks.
//
//  Parameters:
//      pichOffsets - pointer to the first element in an array of offsets
//          into the client text buffer. NOTE: the array will not persist
//          between successive callbacks, so the callback is not allowed to
//          stash pichOffsets for later processing. A callback that wants to
//          use the data later must copy it to its own store.
//      cOffsets    - number of offsets passed to the client (always > 1)
//      lpData      - client defined data
//
//  Return:
//      TRUE  - to abort token enumeration
//      FALSE - to continue
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      EnumSelectionOffsetsCallback(
//          IN CONST DWORD *pichOffsets,
//          IN CONST DWORD  cOffsets,
//          IN OUT LPARAM   lpData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD  cOffsets,
    IN OUT LPARAM   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumSelectionOffsets
//
//  Synopsis:   Main entry point for tokenizing text. Tokens are sent to the
//      callback. The original comment says tokens "can either be offsets or
//      zero delimited strings"; the callback type accepted here receives
//      offsets only (NOTE(review): no string form is visible in this header).
//
//  Parameters:
//      pwszText                - pointer to wide-character text buffer to be
//                                tokenized
//      cchText                 - count of characters in text buffer
//      fBeginEndHPBMode        - flag describing the callback mode
//                                (NOTE(review): presumably the
//                                TOKENIZE_MODE_* bits - confirm)
//      pcbEnumSelectionOffsets - pointer to callback procedure handling
//                                token enumeration
//      lpData                  - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS          - if the call completed successfully
//      TH_ERROR_NOHPBS           - if there were no HPBs
//      TH_ERROR_INVALID_INPUT    - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT APIENTRY EnumSelectionOffsets(
    IN PCWSTR                           pwszText,
    IN DWORD                            cchText,
    IN DWORD                            fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_CALLBACK  pcbEnumSelectionOffsets,
    IN LPARAM                           lpData);

// Pointer-to-function type with the same signature as EnumSelectionOffsets.
typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS)(
    IN PCWSTR                           pwszText,
    IN DWORD                            cchText,
    IN DWORD                            fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_CALLBACK  pcbEnumSelectionOffsets,
    IN LPARAM                           lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumSummarizationOffsetsCallback
//
//  Synopsis:   Client-side callback that receives a list of offsets for
//      each stem in the free morpheme (jiritsugo) phrase. The last chunk
//      always contains the complete string of bound morphemes (fuzokugo).
//      For example, for "kaisan shite nai sou desu", offsets are returned
//      for "kaisan" and "shite nai sou desu". So, counting the initial
//      offset, there are three offsets.
//
//  Parameters:
//      pichOffsets - pointer to the first element in an array of offsets
//          into the client text buffer. NOTE: the array will not persist
//          between successive callbacks, so the callback is not allowed to
//          stash pichOffsets for later processing. A callback that wants to
//          use the data later must copy it to its own store.
//      cOffsets    - number of offsets passed to the client (always > 1)
//      pwzPOS      - (EX1 callback only) not described by the original
//          comment. NOTE(review): presumably a string containing POS info,
//          as documented for EnumStemInfoCallback - confirm.
//      pwzMCat     - (EX1 callback only) not described by the original
//          comment. NOTE(review): presumably a string containing MCat info,
//          as documented for EnumStemInfoCallback - confirm.
//      lpData      - client defined data
//
//  Return:
//      TRUE  - to abort token enumeration
//      FALSE - to continue
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      EnumSummarizationOffsetsCallback(
//          IN CONST DWORD *pichOffsets,
//          IN CONST DWORD  cOffsets,
//          IN OUT LPARAM   lpData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD  cOffsets,
    IN OUT LPARAM   lpData);

// Extended form of the callback: the same leading arguments, plus two
// wide-character strings placed before the client data.
typedef BOOL (CALLBACK * ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD  cOffsets,
    IN PCWSTR       pwzPOS,
    IN PCWSTR       pwzMCat,
    IN OUT LPARAM   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumSummarizationOffsets, EnumSummarizationOffsetsEx1
//
//  Synopsis:   Entry point for returning offsets for the tokens used in
//      summarization. These tokens correspond to stems and bound morphemes
//      (fuzokugo) in the text. A list of offsets (and a count) is sent to
//      the summarization offsets callback. EnumSummarizationOffsetsEx1 has
//      the same parameter list except that it takes an
//      ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK.
//
//  Parameters:
//      pwszText         - pointer to wide-character text buffer to be
//                         tokenized
//      cchText          - count of characters in text buffer
//      fBeginEndHPBMode - flag describing the callback mode (named
//                         fTokenizeMode in the original comment)
//      pcbEnumSummarizationOffsets / pcbEnumSummarizationOffsetsEx1 -
//                         pointer to callback procedure handling token
//                         enumeration
//      lpData           - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS          - if the call completed successfully
//      TH_ERROR_NOHPBS           - if there were no HPBs
//      TH_ERROR_INVALID_INPUT    - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT APIENTRY EnumSummarizationOffsets(
    IN PCWSTR                               pwszText,
    IN DWORD                                cchText,
    IN DWORD                                fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK  pcbEnumSummarizationOffsets,
    IN LPARAM                               lpData);

INT APIENTRY EnumSummarizationOffsetsEx1(
    IN PCWSTR                                   pwszText,
    IN DWORD                                    cchText,
    IN DWORD                                    fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK  pcbEnumSummarizationOffsetsEx1,
    IN LPARAM                                   lpData);

// Pointer-to-function types with the same signatures as the two routines
// declared above.
typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS)(
    IN PCWSTR                               pwszText,
    IN DWORD                                cchText,
    IN DWORD                                fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK  pcbEnumSummarizationOffsets,
    IN LPARAM                               lpData);

typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS_EX1)(
    IN PCWSTR                                   pwszText,
    IN DWORD                                    cchText,
    IN DWORD                                    fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK  pcbEnumSummarizationOffsetsEx1,
    IN LPARAM                                   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumStemOffsetsCallback
//
//  Synopsis:   Client-side callback that receives stem offsets.
//      NOTE(review): the original comment described a callback receiving a
//      zero-terminated stem string (pwszStem) per SPB, but the typedef below
//      takes an array of offsets and a count. The parameter descriptions
//      here follow the typedef - confirm against the implementation.
//
//  Parameters:
//      pichOffsets - pointer to the first element in an array of offsets
//          (NOTE(review): presumably offsets into the client text buffer,
//          as for the other offsets callbacks in this header)
//      cOffsets    - number of offsets in the array
//      lpData      - client defined data
//
//  Return:
//      TRUE  - to abort token enumeration
//      FALSE - to continue
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      EnumStemOffsetsCallback(
//          IN CONST DWORD *pichOffsets,
//          IN CONST DWORD  cOffsets,
//          IN OUT LPARAM   lpData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * ENUM_STEM_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD  cOffsets,
    IN OUT LPARAM   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumStemOffsets
//
//  Synopsis:   Entry point for tokenizing stems. Sends offsets for stems to
//      the EnumStemOffsets callback.
//
//  Parameters:
//      pwszText           - pointer to wide-character text buffer to be
//                           tokenized
//      cchText            - count of characters in text buffer
//      fBeginEndHPBMode   - flag describing the callback mode (named
//                           fTokenizeMode in the original comment)
//      pcbEnumStemOffsets - pointer to callback procedure handling token
//                           enumeration
//      pcchTextProcessed  - not described by the original comment.
//                           NOTE(review): by its name, presumably the count
//                           of characters of pwszText that were processed -
//                           confirm against the implementation.
//      lpData             - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS          - if the call completed successfully
//      TH_ERROR_NOHPBS           - if there were no HPBs
//      TH_ERROR_INVALID_INPUT    - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT APIENTRY EnumStemOffsets(
    IN PCWSTR                       pwszText,
    IN DWORD                        cchText,
    IN DWORD                        fBeginEndHPBMode,
    IN ENUM_STEM_OFFSETS_CALLBACK   pcbEnumStemOffsets,
    IN OUT DWORD                   *pcchTextProcessed,
    IN LPARAM                       lpData);

// Pointer-to-function type with the same signature as EnumStemOffsets.
typedef INT (APIENTRY *LP_ENUM_STEM_OFFSETS)(
    IN PCWSTR                       pwszText,
    IN DWORD                        cchText,
    IN DWORD                        fBeginEndHPBMode,
    IN ENUM_STEM_OFFSETS_CALLBACK   pcbEnumStemOffsets,
    IN OUT DWORD                   *pcchTextProcessed,
    IN LPARAM                       lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumStemInfoCallback
//
//  Synopsis:   Client-side callback that receives offsets and stem
//      information.
//
//  Parameters:
//      ichOffset          - offset to the first character in the stem
//      cchLen             - length of the stem
//      pwszPOS            - string containing POS info
//      pwszMCat           - string containing MCat info
//      pwszDictionaryForm - string containing the Dictionary Form
//      lpData             - client defined data
//
//  Return:
//      TRUE  - to abort token enumeration
//      FALSE - to continue
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      EnumStemInfoCallback(
//          IN CONST DWORD  ichOffset,
//          IN CONST DWORD  cchLen,
//          IN PCWSTR       pwszPOS,
//          IN PCWSTR       pwszMCat,
//          IN PCWSTR       pwszDictionaryForm,
//          IN OUT LPARAM   lpData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * ENUM_STEM_INFO_CALLBACK)(
    IN CONST DWORD  ichOffset,
    IN CONST DWORD  cchLen,
    IN PCWSTR       pwszPOS,
    IN PCWSTR       pwszMCat,
    IN PCWSTR       pwszDictionaryForm,
    IN OUT LPARAM   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumStemInfo
//
//  Synopsis:   Call this routine to get information about stems. For
//      example, if you want the dictionary form, part-of-speech or MCat
//      information for a stem, then this is the API to use.
//
//  Parameters:
//      pwszText          - pointer to wide-character text buffer to be
//                          tokenized
//      cchText           - count of characters in text buffer
//      fBeginEndHPBMode  - flag describing the callback mode (named
//                          fTokenizeMode in the original comment)
//      pcbEnumStemInfo   - pointer to callback procedure handling stem info
//                          enumeration
//      pcchTextProcessed - not described by the original comment.
//                          NOTE(review): by its name, presumably the count
//                          of characters of pwszText that were processed -
//                          confirm against the implementation.
//      lpData            - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS          - if the call completed successfully
//      TH_ERROR_NOHPBS           - if there were no HPBs
//      TH_ERROR_INVALID_INPUT    - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT APIENTRY EnumStemInfo(
    IN PCWSTR                   pwszText,
    IN DWORD                    cchText,
    IN DWORD                    fBeginEndHPBMode,
    IN ENUM_STEM_INFO_CALLBACK  pcbEnumStemInfo,
    IN OUT DWORD               *pcchTextProcessed,
    IN LPARAM                   lpData);

// Pointer-to-function type with the same signature as EnumStemInfo.
typedef INT (APIENTRY *LP_ENUM_STEM_INFO)(
    IN PCWSTR                   pwszText,
    IN DWORD                    cchText,
    IN DWORD                    fBeginEndHPBMode,
    IN ENUM_STEM_INFO_CALLBACK  pcbEnumStemInfo,
    IN OUT DWORD               *pcchTextProcessed,
    IN LPARAM                   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumSentenceOffsetsCallback
//
//  Synopsis:   Client-side callback that receives the offsets delimiting a
//      sentence break.
//
//  Parameters:
//      ichOffsetStart - offset to the start of the sentence
//      ichOffsetEnd   - offset to the end of the sentence (includes the
//                       terminating punctuation)
//      lpData         - client defined data
//
//  Return:
//      TRUE  - to abort token enumeration
//      FALSE - to continue
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      EnumSentenceOffsetsCallback(
//          IN DWORD        ichOffsetStart,
//          IN DWORD        ichOffsetEnd,
//          IN OUT LPARAM   lpData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * ENUM_SENTENCE_OFFSETS_CALLBACK)(
    IN DWORD        ichOffsetStart,
    IN DWORD        ichOffsetEnd,
    IN OUT LPARAM   lpData);
//+--------------------------------------------------------------------------
//  Routine:    EnumSentenceOffsets
//
//  Synopsis:   Main entry point for breaking sentences. Sends offsets
//      delimiting sentences to the callback.
//
//  Parameters:
//      pwszText               - pointer to wide-character text buffer to be
//                               tokenized
//      cchText                - count of characters in text buffer
//      fTokenizeMode          - not used. Later this will be used to control
//                               how partial sentences are handled.
//      pcbEnumSentenceOffsets - pointer to callback procedure handling
//                               offsets
//      lpData                 - client defined data
//
//  Returns:
//      TH_ERROR_SUCCESS          - if the call completed successfully
//      TH_ERROR_NOHPBS           - if there were no HPBs
//      TH_ERROR_INVALID_INPUT    - if the input buffer was bad
//      TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT APIENTRY EnumSentenceOffsets(
    IN PCWSTR                           pwszText,
    IN DWORD                            cchText,
    IN DWORD                            fTokenizeMode,
    IN ENUM_SENTENCE_OFFSETS_CALLBACK   pcbEnumSentenceOffsets,
    IN LPARAM                           lpData);

// Pointer-to-function type with the same signature as EnumSentenceOffsets.
typedef INT (APIENTRY *LP_ENUM_SENTENCE_OFFSETS)(
    IN PCWSTR                           pwszText,
    IN DWORD                            cchText,
    IN DWORD                            fTokenizeMode,
    IN ENUM_SENTENCE_OFFSETS_CALLBACK   pcbEnumSentenceOffsets,
    IN LPARAM                           lpData);
//+--------------------------------------------------------------------------
//  Routine:    FEMorphCallback
//
//  Synopsis:   The callback that gets a text stream from T-Hammer.
//
//  Parameters:
//      pwszMorphRecs - pointer to a wide-character text stream that
//                      contains the morphological analyses of a given
//                      sentence
//      pvData        - pointer to private data
//
//  Returns:
//      TRUE if no more analysis is needed
//
//  Shape of a client implementation:
//      BOOL CALLBACK
//      FEMorphCallback(
//          IN PWSTR    pwszMorphRecs,
//          IN VOID    *pvData);
//---------------------------------------------------------------------------
typedef BOOL (CALLBACK * FEMORPH_CALLBACK)(
    IN PWSTR    pwszMorphRecs,
    IN VOID    *pvData);
//+--------------------------------------------------------------------------
//  Routine:    FEMorph
//
//  Synopsis:   Entry point for NLPWIN morpheme analysis. Sends a morpheme
//      record string back to the lex callback in NLPWIN.
//
//  Parameters:
//      pwszText     - pointer to wide-character text buffer to be tokenized
//      pcbFEMorphCB - pointer to callback procedure handling morph rec
//                     enumeration
//      pvData       - pointer to private data
//
//  Returns:
//      Not documented by the original author. NOTE(review): presumably one
//      of the TH_ERROR_* codes like the Enum* routines - confirm against
//      the implementation.
//---------------------------------------------------------------------------
INT APIENTRY FEMorph(
    IN PCWSTR           pwszText,
    IN FEMORPH_CALLBACK pcbFEMorphCB,
    IN VOID            *pvData);

// Pointer-to-function type with the same signature as FEMorph.
typedef INT (APIENTRY *LP_FEMORPH)(
    IN PCWSTR           pwszText,
    IN FEMORPH_CALLBACK pcbFEMorphCB,
    IN VOID            *pvData);