474 lines
17 KiB
C
474 lines
17 KiB
C
//+--------------------------------------------------------------------------
|
|
//
|
|
// Copyright (C) 1994, 1995, 1996 Microsoft Corporation. All Rights Reserved.
|
|
//
|
|
// File: thammer.h
|
|
//
|
|
// This include file defines the exported APIs and their callbacks that export
|
|
// word-breaking functionality for non-spaced Asian languages (Japanese, Chinese)
|
|
//
|
|
// Summary of exports:
|
|
// EnumSelectionOffsets - This function returns the offsets for the
|
|
// selection chunks as specified in the Selection Profile (set at compile-time)
|
|
// EnumSummarizationOffsets - This function returns the offsets for the
|
|
// prefix (if any), the stem, and bound morphemes (fuzokugo).
|
|
// EnumStemOffsets - This function returns the offsets for the stem only.
|
|
// Offsets corresponding to any prefix or postfix characters will not
|
|
// be returned.
|
|
//
|
|
// History: pathal Created.
|
|
// 25-Jun-97 pathal Add TH_ERROR_INIT_FAILED
|
|
// 05-Jul-97 pathal Add EnumSentenceOffsets, etc.
|
|
//---------------------------------------------------------------------------
|
|
|
|
// Return errors: the following error codes can be returned from any of
|
|
// T-Hammer's exported APIs (EnumSelectionOffsets, EnumSummarizationOffsets,
|
|
// and EnumStemOffsets)
|
|
//
|
|
|
|
// Status codes returned by the exported T-Hammer entry points.
#define TH_ERROR_SUCCESS 0           // the call completed successfully
#define TH_ERROR_NOHPBS 1            // there were no hard phrase breaks (HPBs)
#define TH_ERROR_INVALID_INPUT 2     // the input buffer was bad
#define TH_ERROR_INVALID_CALLBACK 3  // the input callback was bad
#define TH_ERROR_INIT_FAILED 4       // presumably: initialization failed (not described in this header)
#define TH_ERROR_NOT_IMPLEMENTED 5   // presumably: requested functionality is not implemented (not described here)
|
|
|
|
// Offset delimiter: the following code is used to delimit the end of a list of
|
|
// token offsets returned to one of the Enum* callback routines. This is not
|
|
// an error code.
|
|
|
|
#define TH_SELECTION_INVALID_OFFSET 0xFFFFFFFF  // marks the end of an offset list passed to an Enum* callback; not an error code
|
|
|
|
// TOKENIZE_MODE: Begin and End HPB Modes
|
|
//
|
|
// Begin and End HPB modes signify that a hard phrase break comes before the
|
|
// first character in the string and/or follows after the last character in the string
|
|
// If these flags are not set, then the default behavior of EnumTokens is to start
|
|
// enumerating tokens to the right of the leftmost HPB, which probably won't
|
|
// be at the first character (unless it is a punctuation symbol) and to conclude
|
|
// enumeration at the rightmost HPB, which likely will not be the true end of the
|
|
// string. So, these flags in effect force HPBs at the 0th and nth offsets, where
|
|
// n is the number of characters in the input buffer
|
|
//
|
|
// WARNING: Since Tokenize operates in batch mode, it assumes that the
|
|
// start and end of the input buffer are HPBs. These flags are only used for
|
|
// EnumTokens
|
|
//
|
|
// Tokenize-mode bit flags: force a hard phrase break (HPB) before the first
// character and/or after the last character of the input buffer.
#define TOKENIZE_MODE_BEGIN_HPB 0x00000001  // HPB precedes the first character
#define TOKENIZE_MODE_END_HPB 0x00000002    // HPB follows the last character
|
|
|
|
// Note on HPBs: HPB = hard phrase break.
|
|
// HPBs are statistically determined from analyzing a tagged corpora.
|
|
// Roughly, they correspond to places where you can break with 100%
|
|
// precision (=confidence). Mostly this is around punctuation characters
|
|
// and certain conspicuous [case markers | character type] bigrams.
|
|
|
|
|
|
// When the Hide Punctuation mode is set in the tokenize flag parameter
|
|
// T-Hammer strips punctuation out of the Stem Offsets and Summarization Offsets
|
|
// callback
|
|
//
|
|
#define TOKENIZE_MODE_HIDE_PUNCTUATION 0x00000004  // strip punctuation from the Stem Offsets and Summarization Offsets callbacks
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSelectionOffsetsCallback
|
|
//
|
|
// Synopsis: client-side callback that receives a list of offsets for selection chunks
|
|
//
|
|
// Parameters:
|
|
// pichOffsets - pointer to first element in an array of offsets into client
|
|
// text buffer. NOTE: callback is not allowed to stash pichOffsets for
|
|
// later processing. pichOffsets will not persist between successive
|
|
// callbacks. If the callback wants to use the data pointed to by pichOffsets,
|
|
// it must copy it to its own store
|
|
// cOffsets - number of offsets passed to client (always > 1)
|
|
// lpData - client defined data
|
|
//
|
|
// Return:
|
|
// TRUE - to abort token enumeration
|
|
// FALSE - to continue
|
|
//---------------------------------------------------------------------------
|
|
// BOOL
|
|
// EnumSelectionOffsetsCallback (
|
|
// IN CONST DWORD *pichOffsets,
|
|
// IN DWORD cOffsets,
|
|
// IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumSelectionOffsets.
//   pichOffsets - first element of an array of offsets into the client text
//                 buffer; must be copied if needed after the callback returns
//   cOffsets    - number of offsets in the array
//   lpData      - client defined data
// Returns TRUE to abort token enumeration, FALSE to continue.
typedef BOOL (CALLBACK *ENUM_SELECTION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSelectionOffsets
|
|
//
|
|
// Synopsis: This is the main entry point for tokenizing text. Sends tokens,
|
|
// which can either be offsets or zero delimited strings to callback.
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// cchText - count of characters in text buffer,
|
|
// fBeginEndHPBMode - flag describing the callback mode (see above),
|
|
// pcbEnumSelectionOffsets - pointer to callback procedure handling token
|
|
// enumeration,
|
|
// lpData - client defined data
|
|
//
|
|
// Returns:
|
|
// TH_ERROR_SUCCESS - if the call completed successfully
|
|
// TH_ERROR_NOHPBS - if there were no HPBs
|
|
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
|
|
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
EnumSelectionOffsets(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fBeginEndHPBMode,
|
|
IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
|
|
IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumSelectionOffsets.
typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SELECTION_OFFSETS_CALLBACK pcbEnumSelectionOffsets,
    IN LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSummarizationOffsetsCallback
|
|
//
|
|
// Synopsis: client-side callback that receives a list of offsets for each stem
|
|
// in the free morpheme (jiritsugo) phrase. The last offset always contains
|
|
// the complete string of bound morphemes (fuzokugo). For example,
|
|
// for "kaisan shite nai sou desu", offsets are returned for "kaisan" and
|
|
// "shite nai sou desu". So, counting the first initial offset, there are three
|
|
// offsets.
|
|
//
|
|
// Parameters:
|
|
// pichOffsets - pointer to first element in an array of offsets into client
|
|
// text buffer. NOTE: callback is not allowed to stash pichOffsets for
|
|
// later processing. pichOffsets will not persist between successive
|
|
// callbacks. If the callback wants to use the data pointed to by pichOffsets,
|
|
// it must copy it to its own store
|
|
// cOffsets - number of offsets passed to client (always > 1)
|
|
// lpData - client defined data
|
|
//
|
|
// Return:
|
|
// TRUE - to abort token enumeration
|
|
// FALSE - to continue
|
|
//---------------------------------------------------------------------------
|
|
// BOOL
|
|
// EnumSummarizationOffsetsCallback (
|
|
// IN CONST DWORD *pichOffsets,
|
|
// IN DWORD cOffsets,
|
|
// IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumSummarizationOffsets.
//   pichOffsets - first element of an array of offsets into the client text
//                 buffer; must be copied if needed after the callback returns
//   cOffsets    - number of offsets in the array
//   lpData      - client defined data
// Returns TRUE to abort token enumeration, FALSE to continue.
typedef BOOL (CALLBACK *ENUM_SUMMARIZATION_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumSummarizationOffsetsEx1. Takes the same
// offset array, count and client data as ENUM_SUMMARIZATION_OFFSETS_CALLBACK,
// plus two strings.
//   pwzPOS  - presumably part-of-speech (POS) info, as in
//             ENUM_STEM_INFO_CALLBACK -- TODO confirm
//   pwzMCat - presumably MCat info -- TODO confirm
typedef BOOL (CALLBACK *ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN PCWSTR pwzPOS,
    IN PCWSTR pwzMCat,
    IN OUT LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSummarizationOffsets
|
|
//
|
|
// Synopsis: This is the entry point for returning offsets for tokens used
|
|
// in summarization. These tokens correspond to stems and bound morphemes
|
|
// (fuzokugo) in the text. A list of offsets (and a count) is sent to the
|
|
// EnumSummarizationOffsets callback (see above)
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// cchText - count of characters in text buffer,
|
|
// fBeginEndHPBMode - flag describing the callback mode (see above),
|
|
// pcbEnumSummarizationOffsets - pointer to callback procedure handling token
|
|
// enumeration,
|
|
// lpData - client defined data
|
|
//
|
|
// Returns:
|
|
// TH_ERROR_SUCCESS - if the call completed successfully
|
|
// TH_ERROR_NOHPBS - if there were no HPBs
|
|
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
|
|
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
EnumSummarizationOffsets(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fBeginEndHPBMode,
|
|
IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
|
|
IN LPARAM lpData);
|
|
|
|
INT
|
|
APIENTRY
|
|
EnumSummarizationOffsetsEx1(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fBeginEndHPBMode,
|
|
IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
|
|
IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumSummarizationOffsets.
typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
    IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumSummarizationOffsetsEx1.
typedef INT (APIENTRY *LP_ENUM_SUMMARIZATION_OFFSETS_EX1)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_SUMMARIZATION_OFFSETS_EX1_CALLBACK pcbEnumSummarizationOffsetsEx1,
    IN LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumStemOffsetsCallback
|
|
//
|
|
// Synopsis: client-side callback that receives a list of offsets for stems
//
// Parameters:
// pichOffsets - pointer to first element in an array of offsets into client
// text buffer
// cOffsets - number of offsets passed to client
// lpData - client defined data
//
// Return:
// TRUE - to abort token enumeration
// FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumStemOffsetsCallback (
// IN CONST DWORD *pichOffsets,
// IN DWORD cOffsets,
// IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumStemOffsets. It receives an array of
// offsets and a count, not a stem string.
//   pichOffsets - first element of the offset array
//   cOffsets    - number of offsets in the array
//   lpData      - client defined data
// Returns TRUE to abort token enumeration, FALSE to continue.
typedef BOOL (CALLBACK *ENUM_STEM_OFFSETS_CALLBACK)(
    IN CONST DWORD *pichOffsets,
    IN CONST DWORD cOffsets,
    IN OUT LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumStemOffsets
|
|
//
|
|
// Synopsis: This is the entry point for tokenizing stems. Sends offsets,
|
|
// for stems to the EnumStemOffsets callback (see above)
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// cchText - count of characters in text buffer,
|
|
// fBeginEndHPBMode - flag describing the callback mode (see above),
|
|
// pcbEnumStemOffsets - pointer to callback procedure handling token
|
|
// enumeration,
|
|
// lpData - client defined data
|
|
//
|
|
// Returns:
|
|
// TH_ERROR_SUCCESS - if the call completed successfully
|
|
// TH_ERROR_NOHPBS - if there were no HPBs
|
|
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
|
|
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
EnumStemOffsets(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fBeginEndHPBMode,
|
|
IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
|
|
IN OUT DWORD *pcchTextProcessed,
|
|
IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumStemOffsets.
typedef INT (APIENTRY *LP_ENUM_STEM_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_OFFSETS_CALLBACK pcbEnumStemOffsets,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumStemInfoCallback
|
|
//
|
|
// Synopsis: client-side callback that receives offsets and stem information
|
|
//
|
|
// Parameters:
|
|
// ichOffset - offset to first character in stem
|
|
// cchLen - length of the stem
|
|
// pwszPOS - string containing POS info
|
|
// pwszMCat - string containing MCat info
|
|
// pwszDictionaryForm - string containing Dictionary Form
|
|
// lpData - client defined data
|
|
//
|
|
// Return:
|
|
// TRUE - to abort token enumeration
|
|
// FALSE - to continue
|
|
//---------------------------------------------------------------------------
|
|
// BOOL
|
|
// EnumStemInfoCallback (
|
|
// IN CONST DWORD ichOffset,
|
|
// IN CONST DWORD cchLen,
|
|
// IN PCWSTR pwszPOS,
|
|
// IN PCWSTR pwszMCat,
|
|
// IN PCWSTR pwszDictionaryForm,
|
|
// IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumStemInfo.
//   ichOffset          - offset to the first character in the stem
//   cchLen             - length of the stem
//   pwszPOS            - string containing POS info
//   pwszMCat           - string containing MCat info
//   pwszDictionaryForm - string containing the dictionary form
//   lpData             - client defined data
// Returns TRUE to abort token enumeration, FALSE to continue.
typedef BOOL (CALLBACK *ENUM_STEM_INFO_CALLBACK)(
    IN CONST DWORD ichOffset,
    IN CONST DWORD cchLen,
    IN PCWSTR pwszPOS,
    IN PCWSTR pwszMCat,
    IN PCWSTR pwszDictionaryForm,
    IN OUT LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumStemInfo
|
|
//
|
|
// Synopsis: Call this routine to get information about stems.
|
|
// For example, if you want the dictionary form, part-of-speech or
|
|
// MCat information for a stem, then this is the API for you
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// cchText - count of characters in text buffer,
|
|
// fBeginEndHPBMode - flag describing the callback mode (see above),
|
|
// pcbEnumStemInfo - pointer to callback procedure handling stem info
|
|
// enumeration
|
|
// lpData - client defined data
|
|
//
|
|
// Returns:
|
|
// TH_ERROR_SUCCESS - if the call completed successfully
|
|
// TH_ERROR_NOHPBS - if there were no HPBs
|
|
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
|
|
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
EnumStemInfo(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fBeginEndHPBMode,
|
|
IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
|
|
IN OUT DWORD *pcchTextProcessed,
|
|
IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumStemInfo.
typedef INT (APIENTRY *LP_ENUM_STEM_INFO)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fBeginEndHPBMode,
    IN ENUM_STEM_INFO_CALLBACK pcbEnumStemInfo,
    IN OUT DWORD *pcchTextProcessed,
    IN LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSentenceOffsetsCallback
|
|
//
|
|
// Synopsis: client-side callback that receives a list of offsets for sentence breaks
|
|
//
|
|
// Parameters:
|
|
// ichOffsetStart - offset to start of sentence
|
|
// ichOffsetEnd - offset to end of sentence (includes terminating punctuation)
|
|
// lpData - client defined data
|
|
//
|
|
// Return:
|
|
// TRUE - to abort token enumeration
|
|
// FALSE - to continue
|
|
//---------------------------------------------------------------------------
|
|
// BOOL
|
|
// EnumSentenceOffsetsCallback (
|
|
// IN DWORD ichOffsetStart,
|
|
// IN DWORD ichOffsetEnd,
|
|
// IN OUT LPARAM lpData);
|
|
|
|
// Client callback type used with EnumSentenceOffsets.
//   ichOffsetStart - offset to the start of the sentence
//   ichOffsetEnd   - offset to the end of the sentence (includes terminating
//                    punctuation)
//   lpData         - client defined data
// Returns TRUE to abort token enumeration, FALSE to continue.
typedef BOOL (CALLBACK *ENUM_SENTENCE_OFFSETS_CALLBACK)(
    IN DWORD ichOffsetStart,
    IN DWORD ichOffsetEnd,
    IN OUT LPARAM lpData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: EnumSentenceOffsets
|
|
//
|
|
// Synopsis: This is the main entry point for breaking sentences.
|
|
// Sends offsets delimiting sentences to the callback.
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// cchText - count of characters in text buffer,
|
|
// fTokenizeMode - not used. later this will be used to control how
|
|
// partial sentences are handled.
|
|
// pcbEnumSentenceOffsets - pointer to callback procedure handling offsets
|
|
// lpData - client defined data
|
|
//
|
|
// Returns:
|
|
// TH_ERROR_SUCCESS - if the call completed successfully
|
|
// TH_ERROR_NOHPBS - if there were no HPBs
|
|
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
|
|
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
EnumSentenceOffsets(
|
|
IN PCWSTR pwszText,
|
|
IN DWORD cchText,
|
|
IN DWORD fTokenizeMode,
|
|
IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
|
|
IN LPARAM lpData);
|
|
|
|
// Function-pointer type with the same signature as EnumSentenceOffsets.
typedef INT (APIENTRY *LP_ENUM_SENTENCE_OFFSETS)(
    IN PCWSTR pwszText,
    IN DWORD cchText,
    IN DWORD fTokenizeMode,
    IN ENUM_SENTENCE_OFFSETS_CALLBACK pcbEnumSentenceOffsets,
    IN LPARAM lpData);
|
|
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: FEMorphCallback
|
|
//
|
|
// Synopsis: The callback that gets a text stream from T-Hammer.
|
|
//
|
|
// Parameters:
|
|
// pwszMorphRecs - a pointer to wide character text stream,
|
|
// which contains morphological analyses of a given sentence
|
|
// pvData - pointer to private data
|
|
//
|
|
// Returns:
|
|
// TRUE if no more analysis is needed
|
|
//---------------------------------------------------------------------------
|
|
// BOOL
|
|
// FEMorphCallback(
|
|
// IN PWSTR pwszMorphRecs,
// IN VOID *pvData);
|
|
|
|
// Client callback type used with FEMorph.
//   pwszMorphRecs - wide-character text stream containing the morphological
//                   analyses of a given sentence
//   pvData        - pointer to private data
// Returns TRUE if no more analysis is needed.
typedef BOOL (CALLBACK *FEMORPH_CALLBACK)(
    IN PWSTR pwszMorphRecs,
    IN VOID *pvData);
|
|
|
|
//+--------------------------------------------------------------------------
|
|
// Routine: FEMorph
|
|
//
|
|
// Synopsis: This is the entry point for NLPWIN morpheme analysis.
|
|
// Sends a morpheme record string back to the lex callback in NLPWIN
|
|
//
|
|
// Parameters:
|
|
// pwszText - pointer to wide-character text buffer to be tokenized,
|
|
// pcbFEMorphCB - pointer to callback procedure handling morph rec enumeration
|
|
// pvData - pointer to private data
|
|
//
|
|
// Returns:
|
|
//---------------------------------------------------------------------------
|
|
INT
|
|
APIENTRY
|
|
FEMorph(
|
|
IN PCWSTR pwszText,
|
|
IN FEMORPH_CALLBACK pcbFEMorphCB,
|
|
IN VOID *pvData);
|
|
|
|
// Function-pointer type with the same signature as FEMorph.
typedef INT (APIENTRY *LP_FEMORPH)(
    IN PCWSTR pwszText,
    IN FEMORPH_CALLBACK pcbFEMorphCB,
    IN VOID *pvData);
|