//+--------------------------------------------------------------------------- // // // CThaiBreakTree - class CThaiBreakTree // // History: // created 7/99 aarayas // // ©1999 Microsoft Corporation //---------------------------------------------------------------------------- #ifndef _CTHAIBREAKTREE_H_ #define _CTHAIBREAKTREE_H_ #include #include #include "CThaiTrieIter.hpp" #include "CThaiSentTrieIter.hpp" #include "CThaiTrigramTrieIter.hpp" #include "CBreakTree.hpp" #include "lextable.hpp" #define MAXTHAIBREAKNODE 255000 #define MAXUNSIGNEDINT 4294967295 #define UNABLETOCREATENODE MAXUNSIGNEDINT class CThaiWordBreak; class ThaiBreakNode { public: ThaiBreakNode() {}; int iPos; BYTE iBreakLen; DWORD dwTAG; unsigned int NextBreak; unsigned int Down; }; class CThaiBreakTree : public CBreakTree { friend class CThaiWordBreak; public: CThaiBreakTree(); ~CThaiBreakTree(); #if defined (NGRAM_ENABLE) void Init(CTrie* pTrie, CTrie* pSentTrie, CTrie* pTrigramTrie); #else void Init(CTrie* pTrie, CTrie* pTrigramTrie); #endif #if defined (NGRAM_ENABLE) inline void Reset(); inline bool MoveNext(); inline bool MoveDown(); inline unsigned int CreateNode(int iPos, BYTE iBreakLen, DWORD dwPOS); bool GenerateTree(WCHAR* pszBegin, WCHAR* pszEnd); bool MaximalMatching(); #endif int Soundex(WCHAR* word); unsigned int TrigramBreak(WCHAR* pwchBegin, WCHAR* pwchEnd); int FindAltWord(WCHAR* wzWord,unsigned int iWordLen, BYTE Alt, BYTE* pBreakPos); protected: #if defined (NGRAM_ENABLE) inline DWORD DeterminePurgeOrUnknown(unsigned int iCurrentNode, unsigned int iBreakLen); inline void DeterminePurgeEndingSentence(WCHAR* pszBeginWord, unsigned int iNode); #endif inline unsigned int Maximum(unsigned int x, unsigned y) { if (x > y) return x; else return y;} unsigned int GetWeight(WCHAR* pszBegin); unsigned int GetWeight(WCHAR* pszBegin, DWORD* pdwTag); float BigramProbablity(DWORD dwTag1,DWORD dwTag2); DWORD TrigramProbablity(DWORD dwTag1,DWORD dwTag2,DWORD dwTag3); unsigned int SoundexSearch(WCHAR* pszBegin); inline bool ShouldMerge(WCHAR* pwszPrevWord, unsigned int iPrevWordLen, unsigned int iMergeWordLen, DWORD dwPrevTag); bool Traverse(unsigned int iLevel, unsigned int iCurrentNode, unsigned int iNumUnknown); unsigned int GetCluster(WCHAR* pszIndex); void MaximalMatchingAddBreakToList(unsigned int iNumBreak); inline void AddBreakToList(unsigned int iNumBreak, unsigned int iNumUnknown); inline bool CompareSentenceStructure(unsigned int iNumBreak, unsigned int iNumUnknown); bool IsSentenceStruct(WCHAR* pos, unsigned int iPosLen); ThaiBreakNode* breakTree; CThaiTrieIter thaiTrieIter; CThaiTrieIter thaiTrieIter1; CThaiSentTrieIter thaiSentIter; CThaiTrigramTrieIter thaiTrigramIter; WCHAR* pszBegin; WCHAR* pszEnd; unsigned int iNodeIndex; unsigned int iNumNode; // Array of break and part-of-speech use for Traverse the Tree. BYTE* breakArray; DWORD* tagArray; WCHAR* POSArray; unsigned int iNumUnknownMaximalPOSArray; // Array of break for use with maximal matching array; unsigned int maxToken; unsigned int maxLevel; BYTE* maximalMatchingBreakArray; DWORD* maximalMatchingTAGArray; WCHAR* maximalMatchingPOSArray; // Array of break for use with trigram array. BYTE* trigramBreakArray; }; #endif