520 lines
11 KiB
C++
520 lines
11 KiB
C++
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
//
|
|||
|
// CTrie - class CTrie encapsulation for Trie data structure.
|
|||
|
//
|
|||
|
// History:
|
|||
|
// created 6/99 aarayas
|
|||
|
//
|
|||
|
// (c) 1999 Microsoft Corporation
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
#include "ctrie.hpp"
|
|||
|
|
|||
|
#define VERSIONMAJOR 1
|
|||
|
#define VERSIONMINOR 0
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: constructor
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
CTrieIter::CTrieIter()
|
|||
|
{
|
|||
|
// Initialize local variables.
|
|||
|
Reset();
|
|||
|
wc = 0;
|
|||
|
fWordEnd = FALSE;
|
|||
|
fRestricted = FALSE;
|
|||
|
frq = 0;
|
|||
|
dwTag = 0;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: copy constructor
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
CTrieIter::CTrieIter(const CTrieIter& trieIter)
|
|||
|
{
|
|||
|
// Copy all variables from Initial trie.
|
|||
|
memcpy(&trieScan, &trieIter.trieScan, sizeof(TRIESCAN));
|
|||
|
pTrieCtrl = trieIter.pTrieCtrl;
|
|||
|
wc = trieIter.wc;
|
|||
|
fWordEnd = trieIter.fWordEnd;
|
|||
|
fRestricted = trieIter.fRestricted;
|
|||
|
frq = trieIter.frq;
|
|||
|
dwTag = trieIter.dwTag;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Initialize variables.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
void CTrieIter::Init(CTrie* ctrie)
|
|||
|
{
|
|||
|
// Initialize TrieCtrl
|
|||
|
pTrieCtrl = ctrie->pTrieCtrl;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Initialize variables.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 3/00 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
void CTrieIter::Init(TRIECTRL* pTrieCtrl1)
|
|||
|
{
|
|||
|
// Initialize TrieCtrl
|
|||
|
pTrieCtrl = pTrieCtrl1;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Bring interation index to the first node.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
void CTrieIter::Reset()
|
|||
|
{
|
|||
|
// Reset Trie.
|
|||
|
memset(&trieScan, 0, sizeof(TRIESCAN));
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Move Iteration index down one node.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
BOOL CTrieIter::Down()
|
|||
|
{
|
|||
|
// Move the Trie down one node.
|
|||
|
return TrieGetNextState(pTrieCtrl, &trieScan);
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Move Iteration index right one node.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
BOOL CTrieIter::Right()
|
|||
|
{
|
|||
|
// Move the Trie right one node.
|
|||
|
return TrieGetNextNode(pTrieCtrl, &trieScan);
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrieIter
|
|||
|
//
|
|||
|
// Synopsis: Bring interation index to the first node.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
void CTrieIter::GetNode()
|
|||
|
{
|
|||
|
wc = trieScan.wch;
|
|||
|
fWordEnd = (trieScan.wFlags & TRIE_NODE_VALID) &&
|
|||
|
(!(trieScan.wFlags & TRIE_NODE_TAGGED) ||
|
|||
|
(trieScan.aTags[0].dwData & iDialectMask));
|
|||
|
|
|||
|
if (fWordEnd)
|
|||
|
{
|
|||
|
fRestricted = (trieScan.wFlags & TRIE_NODE_TAGGED) &&
|
|||
|
(trieScan.aTags[0].dwData & iRestrictedMask);
|
|||
|
frq = (BYTE) (trieScan.wFlags & TRIE_NODE_TAGGED ?
|
|||
|
(trieScan.aTags[0].dwData & 0x300) >> iFrqShift :
|
|||
|
frqpenNormal);
|
|||
|
|
|||
|
posTag = (DWORD) (trieScan.wFlags & TRIE_NODE_TAGGED ?
|
|||
|
(trieScan.aTags[0].dwData & iPosMask) >> iPosShift :
|
|||
|
0);
|
|||
|
|
|||
|
dwTag = (DWORD) (trieScan.wFlags & TRIE_NODE_TAGGED ?
|
|||
|
trieScan.aTags[0].dwData :
|
|||
|
0);
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: Constructor
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
CTrie::CTrie()
|
|||
|
{
|
|||
|
pMapFile = NULL;
|
|||
|
pTrieCtrl = NULL;
|
|||
|
pTrieScan = NULL;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: Destructor
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
CTrie::~CTrie()
|
|||
|
{
|
|||
|
UnInit();
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: Initialize Trie.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
PTEC CTrie::Init(WCHAR* szFileName)
|
|||
|
{
|
|||
|
// Declare and Initialize all local variables.
|
|||
|
PTEC ptec = ptecModuleError;
|
|||
|
|
|||
|
// The function assume that pMapFile and pTrieCtrl is NULL, else there are possible memory leak.
|
|||
|
// possibility of this could be call Initilization without Terminating.
|
|||
|
assert(pMapFile == NULL);
|
|||
|
assert(pTrieCtrl == NULL);
|
|||
|
|
|||
|
// Initialize pMapFile and pTrieCtrl to NULL.
|
|||
|
pMapFile = NULL;
|
|||
|
pTrieCtrl = NULL;
|
|||
|
|
|||
|
pMapFile = OpenMapFileW(szFileName);
|
|||
|
|
|||
|
if (pMapFile == NULL)
|
|||
|
{
|
|||
|
// Unable to load map files, return invalid read error.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecFileRead);
|
|||
|
}
|
|||
|
else if (pMapFile->pvMap == NULL)
|
|||
|
{
|
|||
|
// Return Invalid format and close the files.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecInvalidFormat);
|
|||
|
CloseMapFile(pMapFile);
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
BYTE *pmap = (BYTE *) pMapFile->pvMap;
|
|||
|
|
|||
|
// find the header
|
|||
|
LEXHEADER *plxhead = (LEXHEADER *) pmap;
|
|||
|
pmap += sizeof(LEXHEADER);
|
|||
|
|
|||
|
// verify that it's a valid lex file
|
|||
|
if (!(plxhead->lxid == lxidSpeller && plxhead->vendorid == vendoridMicrosoft &&
|
|||
|
PROOFMAJORVERSION(plxhead->version) == VERSIONMAJOR ))
|
|||
|
{
|
|||
|
// If we reached here than the lexicon is no in a valid Thai wordbreak format.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecInvalidFormat);
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
// Make sure the language matches check the first dialect of the lexicon.
|
|||
|
// CTrie also support both Thai and Vietnamese language.
|
|||
|
if ( (plxhead->lidArray[0] != lidThai) && (plxhead->lidArray[0] != lidViet) )
|
|||
|
{
|
|||
|
// If we reached here than we are not using Thai lexicon.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecInvalidLanguage);
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
// The size of the copyright notice
|
|||
|
int cCopyright = 0;
|
|||
|
WCHAR* pwzCopyright = NULL;
|
|||
|
int cLexSup = 0;
|
|||
|
|
|||
|
cCopyright = * (int *) pmap;
|
|||
|
pmap += sizeof(int);
|
|||
|
|
|||
|
// The copyright notice itself
|
|||
|
pwzCopyright = (WCHAR *) pmap;
|
|||
|
pmap += cCopyright * sizeof(WCHAR);
|
|||
|
|
|||
|
// Skip Supplemental data for Thai word break.
|
|||
|
cLexSup = * (int *) pmap;
|
|||
|
pmap += sizeof(int);
|
|||
|
pmap += cLexSup;
|
|||
|
|
|||
|
pTrieCtrl = TrieInit(pmap);
|
|||
|
if (pTrieCtrl)
|
|||
|
{
|
|||
|
// We were able to load and point to the Trie okay.
|
|||
|
//MessageBoxW(0,L"Was able to initialize Trie",pwsz,MB_OK);
|
|||
|
pTrieScan = new CTrieIter();
|
|||
|
pTrieScan->Init(this);
|
|||
|
ptec = ptecNoErrors;
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
// We were not able to initailize main lexicon.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecInvalidMainLex);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
return ptec;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: Initialize Trie.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 2/2000 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
PTEC CTrie::InitRc(LPBYTE pmap)
|
|||
|
{
|
|||
|
// Declare and Initialize all local variables.
|
|||
|
PTEC ptec = ptecModuleError;
|
|||
|
|
|||
|
// The function assume that pMapFile and pTrieCtrl is NULL, else there are possible memory leak.
|
|||
|
// possibility of this could be call Initilization without Terminating.
|
|||
|
assert(pMapFile == NULL);
|
|||
|
assert(pTrieCtrl == NULL);
|
|||
|
assert(pTrieScan == NULL);
|
|||
|
|
|||
|
// Initialize pMapFile and pTrieCtrl to NULL.
|
|||
|
pMapFile = NULL;
|
|||
|
pTrieCtrl = NULL;
|
|||
|
pTrieScan = NULL;
|
|||
|
|
|||
|
LEXHEADER *plxhead = (LEXHEADER *) pmap;
|
|||
|
pmap += sizeof(LEXHEADER);
|
|||
|
|
|||
|
// The size of the copyright notice
|
|||
|
int cCopyright = 0;
|
|||
|
WCHAR* pwzCopyright = NULL;
|
|||
|
int cLexSup = 0;
|
|||
|
|
|||
|
cCopyright = * (int *) pmap;
|
|||
|
pmap += sizeof(int);
|
|||
|
|
|||
|
// The copyright notice itself
|
|||
|
pwzCopyright = (WCHAR *) pmap;
|
|||
|
pmap += cCopyright * sizeof(WCHAR);
|
|||
|
|
|||
|
// Skip Supplemental data for Thai word break.
|
|||
|
cLexSup = * (int *) pmap;
|
|||
|
pmap += sizeof(int);
|
|||
|
pmap += cLexSup;
|
|||
|
|
|||
|
pTrieCtrl = TrieInit(pmap);
|
|||
|
if (pTrieCtrl)
|
|||
|
{
|
|||
|
// We were able to load and point to the Trie okay.
|
|||
|
//MessageBoxW(0,L"Was able to initialize Trie",L"ThWB",MB_OK);
|
|||
|
pTrieScan = new CTrieIter();
|
|||
|
pTrieScan->Init(this);
|
|||
|
ptec = ptecNoErrors;
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
// We were not able to initailize main lexicon.
|
|||
|
ptec = retcode(ptecIOErrorMainLex, ptecInvalidMainLex);
|
|||
|
}
|
|||
|
|
|||
|
return ptec;
|
|||
|
}
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: UnInitialize Trie.
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
void CTrie::UnInit()
|
|||
|
{
|
|||
|
// Free up memory allocated by Trie.
|
|||
|
if (pTrieCtrl)
|
|||
|
{
|
|||
|
TrieFree(pTrieCtrl);
|
|||
|
pTrieCtrl = NULL;
|
|||
|
}
|
|||
|
|
|||
|
// Close the map files.
|
|||
|
if (pMapFile)
|
|||
|
{
|
|||
|
CloseMapFile(pMapFile);
|
|||
|
pMapFile = NULL;
|
|||
|
}
|
|||
|
|
|||
|
if (pTrieScan)
|
|||
|
{
|
|||
|
delete pTrieScan;
|
|||
|
pTrieScan = NULL;
|
|||
|
}
|
|||
|
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
//+---------------------------------------------------------------------------
|
|||
|
//
|
|||
|
// Class: CTrie
|
|||
|
//
|
|||
|
// Synopsis: searches for the given string in the trie
|
|||
|
//
|
|||
|
// Arguments:
|
|||
|
//
|
|||
|
// Modifies:
|
|||
|
//
|
|||
|
// History: created 6/99 aarayas
|
|||
|
//
|
|||
|
// Notes:
|
|||
|
//
|
|||
|
//----------------------------------------------------------------------------
|
|||
|
BOOL CTrie::Find(WCHAR* szWord, DWORD* pdwPOS)
|
|||
|
{
|
|||
|
// Declare and initialize all local variables.
|
|||
|
int i = 0;
|
|||
|
|
|||
|
if (pTrieScan == NULL)
|
|||
|
return FALSE;
|
|||
|
|
|||
|
pTrieScan->Reset();
|
|||
|
|
|||
|
if (!pTrieScan->Down())
|
|||
|
return FALSE;
|
|||
|
|
|||
|
while (TRUE)
|
|||
|
{
|
|||
|
pTrieScan->GetNode();
|
|||
|
if (pTrieScan->wc == szWord[i])
|
|||
|
{
|
|||
|
i++;
|
|||
|
if (pTrieScan->fWordEnd && szWord[i] == '\0')
|
|||
|
{
|
|||
|
*pdwPOS = pTrieScan->posTag;
|
|||
|
return TRUE;
|
|||
|
}
|
|||
|
else if (szWord[i] == '\0') break;
|
|||
|
// Move down the Trie Branch.
|
|||
|
else if (!pTrieScan->Down()) break;
|
|||
|
}
|
|||
|
// Move right of the Trie Branch
|
|||
|
else if (!pTrieScan->Right()) break;
|
|||
|
}
|
|||
|
*pdwPOS = POS_UNKNOWN;
|
|||
|
return FALSE;
|
|||
|
}
|