windows-nt/Source/XPSP1/NT/inetsrv/intlwb/chs/thammerp.h

610 lines
20 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
//+--------------------------------------------------------------------------
//
// Copyright (C) 1994, Microsoft Corporation. All Rights Reserved.
//
// File: thammerp.h
//
// History: 07-Jun-95 PatHal Created.
//
//---------------------------------------------------------------------------
#ifndef _THAMMERP_H_
#define _THAMMERP_H_
// Following values used for setting fMode parameters when calling
// the EnumTokens or Tokenize api. These are used to control the content
// of the pichOffsets, pwszStem, and pwszToken strings sent to the callback
// procedure. Note: default is fastest.
// "Selection Chunks" mode is used for Word Smart Selection. Offsets sent
// to the callback do not necessarily correspond to morphemes boundaries.
// The chunk boundary offsets are encoded in the Japanese morphology
// that exists in T-Hammer as a resource.
#define TOKENIZE_MODE_SELECTION_OFFSETS 0x00000010
// "Stem Chunks" mode is used for Auto Summarization. Offsets sent
// to the callback correspond to stems (and one containing all bound morphemes)
// LATER! How should prefixes be handled? If we remove them from the
// output then the last offset of one call will no longer equal the first offset from
// the next call.
#define TOKENIZE_MODE_STEM_OFFSETS 0x00000020
// " Summarization Stems" mode is used for Auto Summarization. Output
// is the "stem" portion of the Bunsetsu Phrase. For example, for Japanese
// "oyogu" the outputted form would be the stem "oyo".
#define TOKENIZE_MODE_SUMMARIZATION_OFFSETS 0x00000040
// "Break Compounds" is a special mode that instructs t-hammer to break
// compound nouns in the stem. Use this with "Summarization Stems".
// The default is to not break the compounds (i.e. this is off by default).
#define TOKENIZE_MODE_BREAK_COMPOUNDS 0x00000080
// "ChBreak Unknowns" is a special mode that controls tokenization of unknown
// strings. When set this forces T-Hammer to output unknown stems on a per
// character basis. By default, this is not set which means that an unknown string
// (for example, a proper name) is outputted as a single contiguous chunk
#define TOKENIZE_MODE_CHBREAK_UNKNOWNS 0x00000100
// "Stem Info" mode is used for Dictionary Form and for obtaining POS and MCat
// info for all. Each callback contains one stem. Prefixes are ignored
#define TOKENIZE_MODE_STEM_INFO 0x00000200
// "Sentence Offsets" mode is used to return sentence breaks - no further
// analysis is performed. This is useful to segment corpora before adding
// to a test database (e.g. Babble)
#define TOKENIZE_MODE_SENTENCE_OFFSETS 0x00000400
// "Best Break" is the default. Only the single most
// probable breaks will be output.
#define TOKENIZE_MODE_BEST_BREAK 0x00001000
// "Alternate Breaks" instructs tokenizer to output all possible
// breaks
#define TOKENIZE_MODE_ALTERNATE_BREAKS 0x00020000
// "Bunsetsu Phrases" is a default, too. Outputs phrase breaks.
// Warning: for a word-tagged corpus, use "Break Morpheme"
#define TOKENIZE_MODE_BUNSETSU_PHRASES 0x00040000
// "Best Tags" is also default. Outputs only one most probable
// tag for a given segmentation. Break ambiguity and tag ambiguity are
// orthogonal attributes of the output string, hence you can "or" them together
#define TOKENIZE_MODE_BEST_TAGS 0x00080000
// "Alternate Tags" instructs tokenizer to output all possible taggings for
// each break. Warning: for some languages, there will be many more tag
// alternatives than break alternatives, so the output will be quite verbose.
#define TOKENIZE_MODE_ALTERNATE_TAGS 0x00100000
// "Debug" instructs tokenizer to output morpheme label information for
// alternates and tag strings for morphemes
#define TOKENIZE_MODE_VERBOSE 0x00200000
// "Output DebugLog" is only meaningful useful for debug builds. Setting
// this flag on and calling debug T-Hammer has the effect that T-Hammer
// outputs verbose tracing information for the morphology and stem analysis
// to a separate file named debug.utf. CAUTION: this file is typically 500 times
// the size of the source corpus in size, so be forewarned.
#define TOKENIZE_MODE_OUTPUT_DEBUGLOG 0x00800000
// "Disable PL" means the Primary Lexicon will be disabled. This mode is for
// debug purposes only. Retail versions don't have it.
#define TOKENIZE_MODE_DISABLE_PL 0x01000000
// Instrumentation switches for collecting scoring statistics
// First switch is for Postfix Score Info
#define TOKENIZE_MODE_SCOREINFO_POSTFIX 0x02000000
// Second switch is for SPB Scoring Info
#define TOKENIZE_MODE_SCOREINFO_SPB 0x04000000
// Output morpheme records for FE-Morph API
#define TOKENIZE_MODE_MORPHEME_RECORDS 0x10000000
// Output multiple selection analyses for tagging tool
#define TOKENIZE_MODE_SELECTION_OFFSETS_EX 0x20000000
// Output summarization offsets with POS for spelling variant(conversion)
#define TOKENIZE_MODE_SUMMARIZATION_OFFSETS_EX1 0x40000000
// Output words in their dictionary form
#define TOKENIZE_MODE_DICTIONARY_FORM 0x80000000
// Output words in their dictionary form
#define TOKENIZE_MODE_SEPARATE_MORPHEMES 0x00400000
// The EnumPhrases and batch-processing Tokenize api are only used in the debug build
//+--------------------------------------------------------------------------
// defines and typedefs for "Record" subsystem
//---------------------------------------------------------------------------
#define IATTR_NIL 0
#define IATTR_SPB 1
#define IATTR_STEM 2
#define IATTR_POS 3
#define IATTR_MCAT 4
#define IATTR_FT 5
#define IATTR_LT 6
#define TH_NULL_HANDLE (TH_HANDLE)0
typedef UINT TH_HANDLE;
typedef struct tagTH_ATTRVAL
{
UINT iAttr; // attribute index
TH_HANDLE hVal; // value handle
} TH_ATTRVAL, *PTH_ATTRVAL;
typedef struct tagTH_RECORD
{
UINT cAttrMax;
UINT cAttrVals;
TH_ATTRVAL *pAttrVals; // variable length attribute value array
UINT cBitMax;
UINT cBitVals;
DWORD *pBitVals; // variable length bit value array
} TH_RECORD, *PTH_RECORD;
typedef enum tagTH_TYPE
{
TH_TYPE_INT = 1,
TH_TYPE_STR,
TH_TYPE_REG,
// add more here
TH_TYPE_MAX
} TH_TYPE;
#define TYPEOF(x) HIWORD(x)
#define INDEXOF(x) LOWORD(x)
// pcai - 6/18/97 Makes it clear for MCat's
//
typedef BYTE MCAT;
#define SV_WORD_LEN_MAX 0x10
#define SV_WORD_IREAD_MAX SV_WORD_LEN_MAX
//+--------------------------------------------------------------------------
// Routine: EnumPhrasesCallback
//
// Synopsis: Sends delimited output (tokens) to test app callback procedure
//
// Parameters:
// pwszToken- pointer to wide character token string,
// fTokenType - flag describing the types of tag in pwszToken (see above).
//
// Returns:
// TRUE - to abort token enumeration
// FALSE - to continue
//---------------------------------------------------------------------------
// BOOL
// EnumPhrasesCallback (
// PWSTR pwszToken,
// DWORD fTokenType);
typedef BOOL (CALLBACK * ENUM_PHRASES_CALLBACK)(
IN PWSTR pwszToken,
IN DWORD fTokenType,
IN OUT LPARAM lpData);
//+--------------------------------------------------------------------------
// Routine: EnumPhrases (corresponds to mode 4 of tokenize test harness)
//
// Synopsis: This is the entry point for tokenizing phrases. Sends tokenized
// phrases which can either be offsets or zero-delimited strings to the callback
// (defined below)
//
// Parameters:
// pwszText - pointer to wide-character text buffer to be tokenized,
// cchText - count of characters in text buffer,
// fTokenizeMode - flag describing the callback mode (see above),
// pEnumTokOutputProc - pointer to callback procedure handling token
// enumeration,
// lpData - client defined data
//
// Returns:
// TH_ERROR_SUCCESS - if the call completed successfully
// TH_ERROR_NOHPBS - if there were no HPBs
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//---------------------------------------------------------------------------
INT
APIENTRY
EnumPhrases(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fBeginEndHPBMode,
IN ENUM_PHRASES_CALLBACK pcbEnumPhrases,
IN LPARAM lpData);
typedef INT (APIENTRY *LP_ENUM_PHRASES)(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fBeginEndHPBMode,
IN ENUM_PHRASES_CALLBACK pcbEnumPhrases,
IN LPARAM lpData);
// T-Hammer uses the folg. values to set the fTokenType parameter
// when calling back to EnumTokOutputProc. The Tokenize test app
// uses this type information to control the comparison to the re-
// tokenized corpus as well as to format the output in general.
// "Phrase" signifies that the the end of the pwszToken string marks a
// phrase boundary. This is the default.
#define TOKEN_TYPE_PHRASE 0x01
// "Morpheme" signifies an intra-phrase morpheme
// boundary (including the stem).
#define TOKEN_TYPE_MORPHEME 0x02
// "Alternate" signifies that the current pwszToken string is an alternate
// (primary tokens are sent before alternates).
#define TOKEN_TYPE_ALTERNATE 0x04
// "Hard Break" signifies an unambiguous text boundary.
// Note that between punctuation types the output is either all alternate
// or all non-alternate. Any bitwise OR combination of the following
// types is possible.
#define TOKEN_TYPE_HARDBREAK 0x08
// "Label" means the token should not be used to compare to test corpus,
// but should be output parenthetically (or stored as the morpheme name),
// for example with enclosing parens
#define TOKEN_TYPE_LABEL 0x10
// "Stem" signifies that this morpheme is part of the head which corresponds
// to a "jiritsugo" for Japanese. This is used for coloring in the tagtool
#define TOKEN_TYPE_STEM 0x20
//+--------------------------------------------------------------------------
// Routine: Tokenize
//
// Synopsis: Internal word-breaker entry point for executing tokenization.
// Returns array of delimited offsets in pibBreaks
//
// Parameters:
// pwszText - pointer to wide-character text buffer to be tokenized,
// cchText - count of characters in text buffer,
// pichBreaks - pointer to return buffer, which is filled with delimiter (breaks) offset information
// pcBreaks - size of previous buffer; number of actual breaks used is returned
//
// Returns:
// TH_ERROR_SUCCESS - if the call completed successfully
// TH_ERROR_INVALID_INPUT - if the input buffer was bad
// TH_ERROR_INVALID_CALLBACK - if the input callback was bad
//
// Note: Tokenize will never fail with NOHPBs, since it assumes that
// the beginning and ends are HPBs
//
// Notes:
// Like lstrlen, this function try/excepts on the input buffer and returns FALSE when an exception
// involving invalid memory dereferencing.
//
// Open Issue:
// 1. Do we need to change the name of this API? "Tokenize" is a generic
// name - maybe we should save it for a more general-purpose API.
//---------------------------------------------------------------------------
INT
APIENTRY
Tokenize(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fTokenizeMode,
OUT PDWORD pichBreaks,
IN OUT PDWORD cBreaks);
typedef DWORD (APIENTRY *LP_TOKENIZE)(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fTokenizeMode,
OUT PDWORD pichBreaks,
IN OUT PDWORD cBreaks);
//+--------------------------------------------------------------------------
// Routine: EnumSummarizationOffsetsEx
//
// Temporary private entry point to overload Summarization and get back the
// number of cch procesed. Please refer to EnumSummarizationOffsets (thammer.h)
// for details
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSummarizationOffsetsEx(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fBeginEndHPBMode,
IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets,
IN OUT DWORD *pcchTextProcessed,
IN LPARAM lpData);
//+--------------------------------------------------------------------------
// Routine: EnumSelectionOffsetsExCallback
//
// Synopsis: same as EnumSelectionOffsetsCallback with an added parameter
// that allows mutliple analyses to be sent back to client
//
// Parameters:
// ...
// fInfo - dword bit mask that contains info on whether an analysis is primary
// and/or spb initial
// ...
//---------------------------------------------------------------------------
// BOOL
// EnumSelectionOffsetsExCallback (
// IN CONST DWORD *pichOffsets,
// IN DWORD cOffsets,
// IN DWORD fInfo,
// IN OUT LPARAM lpData);
#define SELN_OFFSETS_INFO_PRIMARY 0x00000001
#define SELN_OFFSETS_INFO_SPB_END 0x00000002
typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_EX_CALLBACK)(
IN CONST DWORD *pichOffsets,
IN CONST DWORD cOffsets,
IN CONST DWORD fInfo,
IN OUT LPARAM lpData);
//+--------------------------------------------------------------------------
// Routine: EnumSelectionOffsetsEx
//
// Synopsis: Same as EnumSelectionOffsets, but takes an "extended" callback
// (see above for details)
//---------------------------------------------------------------------------
INT
APIENTRY
EnumSelectionOffsetsEx(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fBeginEndHPBMode,
IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx,
IN LPARAM lpData);
typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS_EX)(
IN PCWSTR pwszText,
IN DWORD cchText,
IN DWORD fBeginEndHPBMode,
IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx,
IN LPARAM lpData);
//+--------------------------------------------------------------------------
// Routine: EnumSPBRecordsCallback
//
// Synopsis:
//
// Parameters:
// pRec - points to an array of TH_RECORDs
// cRec - number of TH_RECORD structs in the pRec[] array
// iScoreIPL - well-formedness score of the given sentence.
// pvData - points to client defined data
//
// Returns:
//---------------------------------------------------------------------------
// BOOL
// EnumSPBRecordsCallback(
// IN PTH_RECORD pRec,
// IN DWORD cRec,
// IN DWORD dwFlags,
// IN DWORD iScoreIPL,
// IN PVOID pvData);
#define SPBRECS_SENTEDGE 0x00000001
typedef BOOL (CALLBACK * ENUM_SPB_RECORD_CALLBACK)(
IN PTH_RECORD pRec,
IN DWORD cRec,
IN DWORD dwFlags,
IN DWORD iScoreIPL,
IN PVOID pvData);
//+--------------------------------------------------------------------------
// Routine: EnumSPBRecords
//
// Synopsis:
//
// Parameters:
// pwszText - points to a sentence to analyze
// pcbEnumSPBRecordsCB - callback function pointer.
// lpData - points to client defined data
//
// Returns:
//---------------------------------------------------------------------------
INT APIENTRY
EnumSPBRecords(
IN PCWSTR pwszText,
IN DWORD fMode, // for TOKENIZE_MODE_DICTIONARY_FORM
IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB,
IN PVOID pvData);
typedef INT (APIENTRY *LP_ENUM_SPB_RECORDS)(
IN PCWSTR pwszText,
IN DWORD fMode, // for TOKENIZE_MODE_DICTIONARY_FORM
IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB,
IN PVOID pvData);
PCWSTR
GetStringVal(
TH_HANDLE hVal);
typedef PCWSTR (APIENTRY *LP_GET_STRING_VAL)(
TH_HANDLE hVal);
DWORD
GetIntegerVal(
TH_HANDLE hVal);
typedef DWORD (APIENTRY *LP_GET_INTEGER_VAL)(
TH_HANDLE hVal);
TH_HANDLE
GetAttr(
const PTH_RECORD pRec,
UINT iAttr);
typedef TH_HANDLE (APIENTRY *LP_GET_ATTR) (
const PTH_RECORD pRec,
UINT iAttr);
//+--------------------------------------------------------------------------
// Routine: FxCallback
//
// Synopsis:
//
// Parameters:
//
// Returns:
//---------------------------------------------------------------------------
// BOOL WINAPI
// FxCallback(
// DWORD iFilter,
// WCHAR *pwzFilter,
// DWORD cRec,
// TH_RECORD *pRec,
// VOID *pvData);
//
typedef BOOL (WINAPI *LP_FXCB)(
IN DWORD iFilter,
IN WCHAR *pwzFilter,
DWORD cRec,
TH_RECORD *pRec,
IN VOID *pvData);
//+--------------------------------------------------------------------------
// Routine: Fx
//
// Synopsis:
//
// Parameters:
//
// Returns:
//---------------------------------------------------------------------------
BOOL WINAPI
Fx(
IN PVOID *ppvFilter,
IN DWORD cFilter,
IN PCWSTR pwzPhrase,
IN LP_FXCB pfnFxCallback,
IN PVOID pvData);
typedef BOOL (WINAPI *LP_FX)(
IN PVOID *ppvFilter,
IN DWORD cFilter,
IN PCWSTR pwzPhrase,
IN LP_FXCB pfnFxCallback,
IN PVOID pvData);
// SV-related structs and functions
//+--------------------------------------------------------------------------
// Structure: SV_INFO
//
// Synopsis: This structure is used by the SVAPI functions.
//
//---------------------------------------------------------------------------
typedef struct _SVINFO
{
unsigned sid : 18; // sense id
unsigned svid : 6; // id for spelling variant
unsigned cRead : 8; // # of elements in the reading chain for this sid
BYTE bIPL; // IPL of this spelling variant
MCAT mcat; // index of MCat
// array of reading indices for the sid
// if awRead[i] < READING_BASE, it represents a kana
// otherwise, (awRead[i] - READING_BASE) is the
// index into the reading table
WORD aiwRead[SV_WORD_IREAD_MAX];
} SV_INFO;
//+--------------------------------------------------------------------------
// Routine: SVFindSid
//
// Fill SV_INFO structure by searching for pwzWord in the SV-lexicon.
// This routine is called in order to normalize/convert/reconvert
// a word to its matching sense id(s). This routine returns one or more
// SV_INFO records, each being a match to the word.
//
// Parameters: pwzWord = pointer to word for which an sid is needed
// pwzMcat = MCat string.
// If pwzMcat is not valid , search for the best sid
// in all MCat's.
// asvi = array of SV_INFO for receiving sid, svid
// and reading indices
// (must be pre-allocated by the caller)
// csviMax = Max # of matches desired.
// SVFINDSID_GET_FIRST: only the first match
// SVFINDSID_GET_BEST: only the best match(lowest IPL)
// Otherwise, the first bMax matches.
//
// Returns: SVFINDSID_RET_NONE - no matches are found
// otherwise returns the number of matches returned in asvi
//
//---------------------------------------------------------------------------
DWORD APIENTRY
SV_FindSid(
IN CONST WCHAR *pwzWord,
IN CONST WCHAR *pwzMcat,
IN OUT SV_INFO *asvi,
IN DWORD csviMax);
typedef DWORD (APIENTRY *LP_SV_FINDSID) (
IN CONST WCHAR *pwzWord,
IN CONST WCHAR *pwzMcat,
IN OUT SV_INFO *asvi,
IN DWORD csviMax);
//+--------------------------------------------------------------------------
// Routine: SVFindSid
//
// Get sv orthography given SV_INFO structure.
//
// Parameters: pwzWord = pointer to word for receiving the sv's orthography
// (must be pre-allocated by the caller)
// psvi = pointer to SV_INFO for receiving return information
// (must contain meaningful information)
//
// Returns: TRUE - successful
// FALSE - unsuccessful
//---------------------------------------------------------------------------
BOOL APIENTRY
SV_GetOrtho(
IN SV_INFO *psvi,
IN OUT WCHAR *pwzOrtho);
typedef BOOL (APIENTRY *LP_SV_GETORTHO) (
IN SV_INFO *psvi,
IN OUT WCHAR *pwzOrtho);
//+--------------------------------------------------------------------------
// Routine: TurnOn_FindSVStems
//
// Synopsis: Turn on the flag for using FindSVStems().
//
// Parameters: none
//
// Returns: none
//---------------------------------------------------------------------------
BOOL APIENTRY SV_EnableFindSVStems();
typedef VOID (APIENTRY *LP_SV_ENABLE_FINDSVSTEMS) ();
#define SVID_ALL_KANJI 0
#define SVID_ALL_KANA 1
#define SV_FINDSID_GET_FIRST 1
#define SV_FINDSID_GET_BEST 0xFFFFFFFF
// SVFindSid's return values
#define SV_FINDSID_RET_NONE 0
#define SV_FINDSID_ERROR 0xFFFFFFFF
#endif // _THAMMERP_H_