//+-------------------------------------------------------------------------- // // Copyright (C) 1994, Microsoft Corporation. All Rights Reserved. // // File: thammerp.h // // History: 07-Jun-95 PatHal Created. // //--------------------------------------------------------------------------- #ifndef _THAMMERP_H_ #define _THAMMERP_H_ // Following values used for setting fMode parameters when calling // the EnumTokens or Tokenize api. These are used to control the content // of the pichOffsets, pwszStem, and pwszToken strings sent to the callback // procedure. Note: default is fastest. // "Selection Chunks" mode is used for Word Smart Selection. Offsets sent // to the callback do not necessarily correspond to morphemes boundaries. // The chunk boundary offsets are encoded in the Japanese morphology // that exists in T-Hammer as a resource. #define TOKENIZE_MODE_SELECTION_OFFSETS 0x00000010 // "Stem Chunks" mode is used for Auto Summarization. Offsets sent // to the callback correspond to stems (and one containing all bound morphemes) // LATER! How should prefixes be handled? If we remove them from the // output then the last offset of one call will no longer equal the first offset from // the next call. #define TOKENIZE_MODE_STEM_OFFSETS 0x00000020 // " Summarization Stems" mode is used for Auto Summarization. Output // is the "stem" portion of the Bunsetsu Phrase. For example, for Japanese // "oyogu" the outputted form would be the stem "oyo". #define TOKENIZE_MODE_SUMMARIZATION_OFFSETS 0x00000040 // "Break Compounds" is a special mode that instructs t-hammer to break // compound nouns in the stem. Use this with "Summarization Stems". // The default is to not break the compounds (i.e. this is off by default). #define TOKENIZE_MODE_BREAK_COMPOUNDS 0x00000080 // "ChBreak Unknowns" is a special mode that controls tokenization of unknown // strings. When set this forces T-Hammer to output unknown stems on a per // character basis. By default, this is not set which means that an unknown string // (for example, a proper name) is outputted as a single contiguous chunk #define TOKENIZE_MODE_CHBREAK_UNKNOWNS 0x00000100 // "Stem Info" mode is used for Dictionary Form and for obtaining POS and MCat // info for all. Each callback contains one stem. Prefixes are ignored #define TOKENIZE_MODE_STEM_INFO 0x00000200 // "Sentence Offsets" mode is used to return sentence breaks - no further // analysis is performed. This is useful to segment corpora before adding // to a test database (e.g. Babble) #define TOKENIZE_MODE_SENTENCE_OFFSETS 0x00000400 // "Best Break" is the default. Only the single most // probable breaks will be output. #define TOKENIZE_MODE_BEST_BREAK 0x00001000 // "Alternate Breaks" instructs tokenizer to output all possible // breaks #define TOKENIZE_MODE_ALTERNATE_BREAKS 0x00020000 // "Bunsetsu Phrases" is a default, too. Outputs phrase breaks. // Warning: for a word-tagged corpus, use "Break Morpheme" #define TOKENIZE_MODE_BUNSETSU_PHRASES 0x00040000 // "Best Tags" is also default. Outputs only one most probable // tag for a given segmentation. Break ambiguity and tag ambiguity are // orthogonal attributes of the output string, hence you can "or" them together #define TOKENIZE_MODE_BEST_TAGS 0x00080000 // "Alternate Tags" instructs tokenizer to output all possible taggings for // each break. Warning: for some languages, there will be many more tag // alternatives than break alternatives, so the output will be quite verbose. #define TOKENIZE_MODE_ALTERNATE_TAGS 0x00100000 // "Debug" instructs tokenizer to output morpheme label information for // alternates and tag strings for morphemes #define TOKENIZE_MODE_VERBOSE 0x00200000 // "Output DebugLog" is only meaningful useful for debug builds. Setting // this flag on and calling debug T-Hammer has the effect that T-Hammer // outputs verbose tracing information for the morphology and stem analysis // to a separate file named debug.utf. CAUTION: this file is typically 500 times // the size of the source corpus in size, so be forewarned. #define TOKENIZE_MODE_OUTPUT_DEBUGLOG 0x00800000 // "Disable PL" means the Primary Lexicon will be disabled. This mode is for // debug purposes only. Retail versions don't have it. #define TOKENIZE_MODE_DISABLE_PL 0x01000000 // Instrumentation switches for collecting scoring statistics // First switch is for Postfix Score Info #define TOKENIZE_MODE_SCOREINFO_POSTFIX 0x02000000 // Second switch is for SPB Scoring Info #define TOKENIZE_MODE_SCOREINFO_SPB 0x04000000 // Output morpheme records for FE-Morph API #define TOKENIZE_MODE_MORPHEME_RECORDS 0x10000000 // Output multiple selection analyses for tagging tool #define TOKENIZE_MODE_SELECTION_OFFSETS_EX 0x20000000 // Output summarization offsets with POS for spelling variant(conversion) #define TOKENIZE_MODE_SUMMARIZATION_OFFSETS_EX1 0x40000000 // Output words in their dictionary form #define TOKENIZE_MODE_DICTIONARY_FORM 0x80000000 // Output words in their dictionary form #define TOKENIZE_MODE_SEPARATE_MORPHEMES 0x00400000 // The EnumPhrases and batch-processing Tokenize api are only used in the debug build //+-------------------------------------------------------------------------- // defines and typedefs for "Record" subsystem //--------------------------------------------------------------------------- #define IATTR_NIL 0 #define IATTR_SPB 1 #define IATTR_STEM 2 #define IATTR_POS 3 #define IATTR_MCAT 4 #define IATTR_FT 5 #define IATTR_LT 6 #define TH_NULL_HANDLE (TH_HANDLE)0 typedef UINT TH_HANDLE; typedef struct tagTH_ATTRVAL { UINT iAttr; // attribute index TH_HANDLE hVal; // value handle } TH_ATTRVAL, *PTH_ATTRVAL; typedef struct tagTH_RECORD { UINT cAttrMax; UINT cAttrVals; TH_ATTRVAL *pAttrVals; // variable length attribute value array UINT cBitMax; UINT cBitVals; DWORD *pBitVals; // variable length bit value array } TH_RECORD, *PTH_RECORD; typedef enum tagTH_TYPE { TH_TYPE_INT = 1, TH_TYPE_STR, TH_TYPE_REG, // add more here TH_TYPE_MAX } TH_TYPE; #define TYPEOF(x) HIWORD(x) #define INDEXOF(x) LOWORD(x) // pcai - 6/18/97 Makes it clear for MCat's // typedef BYTE MCAT; #define SV_WORD_LEN_MAX 0x10 #define SV_WORD_IREAD_MAX SV_WORD_LEN_MAX //+-------------------------------------------------------------------------- // Routine: EnumPhrasesCallback // // Synopsis: Sends delimited output (tokens) to test app callback procedure // // Parameters: // pwszToken- pointer to wide character token string, // fTokenType - flag describing the types of tag in pwszToken (see above). // // Returns: // TRUE - to abort token enumeration // FALSE - to continue //--------------------------------------------------------------------------- // BOOL // EnumPhrasesCallback ( // PWSTR pwszToken, // DWORD fTokenType); typedef BOOL (CALLBACK * ENUM_PHRASES_CALLBACK)( IN PWSTR pwszToken, IN DWORD fTokenType, IN OUT LPARAM lpData); //+-------------------------------------------------------------------------- // Routine: EnumPhrases (corresponds to mode 4 of tokenize test harness) // // Synopsis: This is the entry point for tokenizing phrases. Sends tokenized // phrases which can either be offsets or zero-delimited strings to the callback // (defined below) // // Parameters: // pwszText - pointer to wide-character text buffer to be tokenized, // cchText - count of characters in text buffer, // fTokenizeMode - flag describing the callback mode (see above), // pEnumTokOutputProc - pointer to callback procedure handling token // enumeration, // lpData - client defined data // // Returns: // TH_ERROR_SUCCESS - if the call completed successfully // TH_ERROR_NOHPBS - if there were no HPBs // TH_ERROR_INVALID_INPUT - if the input buffer was bad // TH_ERROR_INVALID_CALLBACK - if the input callback was bad //--------------------------------------------------------------------------- INT APIENTRY EnumPhrases( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fBeginEndHPBMode, IN ENUM_PHRASES_CALLBACK pcbEnumPhrases, IN LPARAM lpData); typedef INT (APIENTRY *LP_ENUM_PHRASES)( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fBeginEndHPBMode, IN ENUM_PHRASES_CALLBACK pcbEnumPhrases, IN LPARAM lpData); // T-Hammer uses the folg. values to set the fTokenType parameter // when calling back to EnumTokOutputProc. The Tokenize test app // uses this type information to control the comparison to the re- // tokenized corpus as well as to format the output in general. // "Phrase" signifies that the the end of the pwszToken string marks a // phrase boundary. This is the default. #define TOKEN_TYPE_PHRASE 0x01 // "Morpheme" signifies an intra-phrase morpheme // boundary (including the stem). #define TOKEN_TYPE_MORPHEME 0x02 // "Alternate" signifies that the current pwszToken string is an alternate // (primary tokens are sent before alternates). #define TOKEN_TYPE_ALTERNATE 0x04 // "Hard Break" signifies an unambiguous text boundary. // Note that between punctuation types the output is either all alternate // or all non-alternate. Any bitwise OR combination of the following // types is possible. #define TOKEN_TYPE_HARDBREAK 0x08 // "Label" means the token should not be used to compare to test corpus, // but should be output parenthetically (or stored as the morpheme name), // for example with enclosing parens #define TOKEN_TYPE_LABEL 0x10 // "Stem" signifies that this morpheme is part of the head which corresponds // to a "jiritsugo" for Japanese. This is used for coloring in the tagtool #define TOKEN_TYPE_STEM 0x20 //+-------------------------------------------------------------------------- // Routine: Tokenize // // Synopsis: Internal word-breaker entry point for executing tokenization. // Returns array of delimited offsets in pibBreaks // // Parameters: // pwszText - pointer to wide-character text buffer to be tokenized, // cchText - count of characters in text buffer, // pichBreaks - pointer to return buffer, which is filled with delimiter (breaks) offset information // pcBreaks - size of previous buffer; number of actual breaks used is returned // // Returns: // TH_ERROR_SUCCESS - if the call completed successfully // TH_ERROR_INVALID_INPUT - if the input buffer was bad // TH_ERROR_INVALID_CALLBACK - if the input callback was bad // // Note: Tokenize will never fail with NOHPBs, since it assumes that // the beginning and ends are HPBs // // Notes: // Like lstrlen, this function try/excepts on the input buffer and returns FALSE when an exception // involving invalid memory dereferencing. // // Open Issue: // 1. Do we need to change the name of this API? "Tokenize" is a generic // name - maybe we should save it for a more general-purpose API. //--------------------------------------------------------------------------- INT APIENTRY Tokenize( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fTokenizeMode, OUT PDWORD pichBreaks, IN OUT PDWORD cBreaks); typedef DWORD (APIENTRY *LP_TOKENIZE)( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fTokenizeMode, OUT PDWORD pichBreaks, IN OUT PDWORD cBreaks); //+-------------------------------------------------------------------------- // Routine: EnumSummarizationOffsetsEx // // Temporary private entry point to overload Summarization and get back the // number of cch procesed. Please refer to EnumSummarizationOffsets (thammer.h) // for details //--------------------------------------------------------------------------- INT APIENTRY EnumSummarizationOffsetsEx( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fBeginEndHPBMode, IN ENUM_SUMMARIZATION_OFFSETS_CALLBACK pcbEnumSummarizationOffsets, IN OUT DWORD *pcchTextProcessed, IN LPARAM lpData); //+-------------------------------------------------------------------------- // Routine: EnumSelectionOffsetsExCallback // // Synopsis: same as EnumSelectionOffsetsCallback with an added parameter // that allows mutliple analyses to be sent back to client // // Parameters: // ... // fInfo - dword bit mask that contains info on whether an analysis is primary // and/or spb initial // ... //--------------------------------------------------------------------------- // BOOL // EnumSelectionOffsetsExCallback ( // IN CONST DWORD *pichOffsets, // IN DWORD cOffsets, // IN DWORD fInfo, // IN OUT LPARAM lpData); #define SELN_OFFSETS_INFO_PRIMARY 0x00000001 #define SELN_OFFSETS_INFO_SPB_END 0x00000002 typedef BOOL (CALLBACK * ENUM_SELECTION_OFFSETS_EX_CALLBACK)( IN CONST DWORD *pichOffsets, IN CONST DWORD cOffsets, IN CONST DWORD fInfo, IN OUT LPARAM lpData); //+-------------------------------------------------------------------------- // Routine: EnumSelectionOffsetsEx // // Synopsis: Same as EnumSelectionOffsets, but takes an "extended" callback // (see above for details) //--------------------------------------------------------------------------- INT APIENTRY EnumSelectionOffsetsEx( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fBeginEndHPBMode, IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx, IN LPARAM lpData); typedef INT (APIENTRY *LP_ENUM_SELECTION_OFFSETS_EX)( IN PCWSTR pwszText, IN DWORD cchText, IN DWORD fBeginEndHPBMode, IN ENUM_SELECTION_OFFSETS_EX_CALLBACK pcbEnumSelectionOffsetsEx, IN LPARAM lpData); //+-------------------------------------------------------------------------- // Routine: EnumSPBRecordsCallback // // Synopsis: // // Parameters: // pRec - points to an array of TH_RECORDs // cRec - number of TH_RECORD structs in the pRec[] array // iScoreIPL - well-formedness score of the given sentence. // pvData - points to client defined data // // Returns: //--------------------------------------------------------------------------- // BOOL // EnumSPBRecordsCallback( // IN PTH_RECORD pRec, // IN DWORD cRec, // IN DWORD dwFlags, // IN DWORD iScoreIPL, // IN PVOID pvData); #define SPBRECS_SENTEDGE 0x00000001 typedef BOOL (CALLBACK * ENUM_SPB_RECORD_CALLBACK)( IN PTH_RECORD pRec, IN DWORD cRec, IN DWORD dwFlags, IN DWORD iScoreIPL, IN PVOID pvData); //+-------------------------------------------------------------------------- // Routine: EnumSPBRecords // // Synopsis: // // Parameters: // pwszText - points to a sentence to analyze // pcbEnumSPBRecordsCB - callback function pointer. // lpData - points to client defined data // // Returns: //--------------------------------------------------------------------------- INT APIENTRY EnumSPBRecords( IN PCWSTR pwszText, IN DWORD fMode, // for TOKENIZE_MODE_DICTIONARY_FORM IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB, IN PVOID pvData); typedef INT (APIENTRY *LP_ENUM_SPB_RECORDS)( IN PCWSTR pwszText, IN DWORD fMode, // for TOKENIZE_MODE_DICTIONARY_FORM IN ENUM_SPB_RECORD_CALLBACK pcbEnumSPBRecordsCB, IN PVOID pvData); PCWSTR GetStringVal( TH_HANDLE hVal); typedef PCWSTR (APIENTRY *LP_GET_STRING_VAL)( TH_HANDLE hVal); DWORD GetIntegerVal( TH_HANDLE hVal); typedef DWORD (APIENTRY *LP_GET_INTEGER_VAL)( TH_HANDLE hVal); TH_HANDLE GetAttr( const PTH_RECORD pRec, UINT iAttr); typedef TH_HANDLE (APIENTRY *LP_GET_ATTR) ( const PTH_RECORD pRec, UINT iAttr); //+-------------------------------------------------------------------------- // Routine: FxCallback // // Synopsis: // // Parameters: // // Returns: //--------------------------------------------------------------------------- // BOOL WINAPI // FxCallback( // DWORD iFilter, // WCHAR *pwzFilter, // DWORD cRec, // TH_RECORD *pRec, // VOID *pvData); // typedef BOOL (WINAPI *LP_FXCB)( IN DWORD iFilter, IN WCHAR *pwzFilter, DWORD cRec, TH_RECORD *pRec, IN VOID *pvData); //+-------------------------------------------------------------------------- // Routine: Fx // // Synopsis: // // Parameters: // // Returns: //--------------------------------------------------------------------------- BOOL WINAPI Fx( IN PVOID *ppvFilter, IN DWORD cFilter, IN PCWSTR pwzPhrase, IN LP_FXCB pfnFxCallback, IN PVOID pvData); typedef BOOL (WINAPI *LP_FX)( IN PVOID *ppvFilter, IN DWORD cFilter, IN PCWSTR pwzPhrase, IN LP_FXCB pfnFxCallback, IN PVOID pvData); // SV-related structs and functions //+-------------------------------------------------------------------------- // Structure: SV_INFO // // Synopsis: This structure is used by the SVAPI functions. // //--------------------------------------------------------------------------- typedef struct _SVINFO { unsigned sid : 18; // sense id unsigned svid : 6; // id for spelling variant unsigned cRead : 8; // # of elements in the reading chain for this sid BYTE bIPL; // IPL of this spelling variant MCAT mcat; // index of MCat // array of reading indices for the sid // if awRead[i] < READING_BASE, it represents a kana // otherwise, (awRead[i] - READING_BASE) is the // index into the reading table WORD aiwRead[SV_WORD_IREAD_MAX]; } SV_INFO; //+-------------------------------------------------------------------------- // Routine: SVFindSid // // Fill SV_INFO structure by searching for pwzWord in the SV-lexicon. // This routine is called in order to normalize/convert/reconvert // a word to its matching sense id(s). This routine returns one or more // SV_INFO records, each being a match to the word. // // Parameters: pwzWord = pointer to word for which an sid is needed // pwzMcat = MCat string. // If pwzMcat is not valid , search for the best sid // in all MCat's. // asvi = array of SV_INFO for receiving sid, svid // and reading indices // (must be pre-allocated by the caller) // csviMax = Max # of matches desired. // SVFINDSID_GET_FIRST: only the first match // SVFINDSID_GET_BEST: only the best match(lowest IPL) // Otherwise, the first bMax matches. // // Returns: SVFINDSID_RET_NONE - no matches are found // otherwise returns the number of matches returned in asvi // //--------------------------------------------------------------------------- DWORD APIENTRY SV_FindSid( IN CONST WCHAR *pwzWord, IN CONST WCHAR *pwzMcat, IN OUT SV_INFO *asvi, IN DWORD csviMax); typedef DWORD (APIENTRY *LP_SV_FINDSID) ( IN CONST WCHAR *pwzWord, IN CONST WCHAR *pwzMcat, IN OUT SV_INFO *asvi, IN DWORD csviMax); //+-------------------------------------------------------------------------- // Routine: SVFindSid // // Get sv orthography given SV_INFO structure. // // Parameters: pwzWord = pointer to word for receiving the sv's orthography // (must be pre-allocated by the caller) // psvi = pointer to SV_INFO for receiving return information // (must contain meaningful information) // // Returns: TRUE - successful // FALSE - unsuccessful //--------------------------------------------------------------------------- BOOL APIENTRY SV_GetOrtho( IN SV_INFO *psvi, IN OUT WCHAR *pwzOrtho); typedef BOOL (APIENTRY *LP_SV_GETORTHO) ( IN SV_INFO *psvi, IN OUT WCHAR *pwzOrtho); //+-------------------------------------------------------------------------- // Routine: TurnOn_FindSVStems // // Synopsis: Turn on the flag for using FindSVStems(). // // Parameters: none // // Returns: none //--------------------------------------------------------------------------- BOOL APIENTRY SV_EnableFindSVStems(); typedef VOID (APIENTRY *LP_SV_ENABLE_FINDSVSTEMS) (); #define SVID_ALL_KANJI 0 #define SVID_ALL_KANA 1 #define SV_FINDSID_GET_FIRST 1 #define SV_FINDSID_GET_BEST 0xFFFFFFFF // SVFindSid's return values #define SV_FINDSID_RET_NONE 0 #define SV_FINDSID_ERROR 0xFFFFFFFF #endif // _THAMMERP_H_