windows-nt/Source/XPSP1/NT/inetsrv/intlwb/cht2/srcs/chtbrkr.cpp

#include <windows.h>
#include "lexicon.h"
#include "rulelex.h"
#include "LexMgr.h"
#include "CHTBrKr.h"
#include "BaseLex.h"


// Construct an empty word breaker.  Nothing is allocated here: every owned
// pointer starts out NULL and both length counters start at zero.  The
// allocations themselves are performed by InitData().
CCHTWordBreaker::CCHTWordBreaker(void)
{
    // Lexicon objects.
    m_pcLexicon = NULL;
    m_pcRuleLex = NULL;

    // Word lattice and its per-position bookkeeping arrays.
    m_ppWordLattice = NULL;
    m_pdwCandidateNumber = NULL;
    m_pdwMaxWordLength = NULL;
    m_dwLatticeLength = 0;
    m_dwSentenceLength = 0;

    // Segmentation result.
    m_psBreakResult = NULL;
}
    
// Release everything the word breaker owns: both lexicons, the word lattice
// (rows, row table and the two per-position arrays) and the break result
// structure together with its three arrays.
CCHTWordBreaker::~CCHTWordBreaker(void)
{
    DWORD i;

    if (m_pcLexicon) {
        delete m_pcLexicon;
        m_pcLexicon = NULL;
    }
    // The rule lexicon is created with new in InitData(), so it is owned here.
    if (m_pcRuleLex) {
        delete m_pcRuleLex;
        m_pcRuleLex = NULL;
    }
    if (m_ppWordLattice) {
        // The number of allocated rows is m_dwLatticeLength; the sentence
        // length only describes the last text processed and may differ.
        for (i = 0; i < m_dwLatticeLength; ++i) {
            delete [] m_ppWordLattice[i];
        }
        delete [] m_ppWordLattice;
        m_ppWordLattice = NULL;
    }
    // All arrays below were allocated with new[], hence delete [].
    if (m_pdwCandidateNumber) {
        delete [] m_pdwCandidateNumber;
        m_pdwCandidateNumber = NULL;
    }
    if (m_pdwMaxWordLength) {
        delete [] m_pdwMaxWordLength;
        m_pdwMaxWordLength = NULL;
    }
    if (m_psBreakResult) {
        delete [] m_psBreakResult->puWordLen;
        delete [] m_psBreakResult->pbTerminalCode;
        delete [] m_psBreakResult->puWordAttrib;
        delete m_psBreakResult;
        m_psBreakResult = NULL;
    }
    m_dwSentenceLength = 0;
    m_dwLatticeLength = 0;
}

// Allocate a word lattice able to hold dwLength text positions.
//
// For every position a row of MAX_CHAR_PER_WORD candidate nodes is allocated,
// plus one entry in the candidate-count and max-word-length arrays.
//
// dwLength - number of text positions the lattice must cover.
//
// Returns TRUE on success (m_dwLatticeLength is then dwLength).  On failure
// returns FALSE with every partially allocated piece released, all lattice
// pointers NULL and m_dwLatticeLength zero.
//
// NOTE(review): the NULL checks assume operator new reports failure by
// returning NULL (as the toolchain this file was written for did) - confirm
// for the compiler in use.
BOOL CCHTWordBreaker::AllocLattice(
    DWORD dwLength)
{
    BOOL fRet = FALSE;
    DWORD i;

    // Start from a known-empty state so that the failure path below only
    // ever sees pointers produced by this call.
    DestroyLattice();

    m_pdwMaxWordLength = new DWORD[dwLength];
    if (!m_pdwMaxWordLength) { goto _exit; }

    m_pdwCandidateNumber = new DWORD[dwLength];
    if (!m_pdwCandidateNumber) { goto _exit; }

    m_ppWordLattice = new PSLatticeNode[dwLength];
    if (!m_ppWordLattice) { goto _exit; }
    // Clear the row table first so a partial failure can be unwound safely.
    for (i = 0; i < dwLength; ++i) {
        m_ppWordLattice[i] = NULL;
    }
    for (i = 0; i < dwLength; ++i) {
        m_ppWordLattice[i] = new SLatticeNode[MAX_CHAR_PER_WORD];
        if (!m_ppWordLattice[i]) { goto _exit; }
        m_pdwCandidateNumber[i] = 0;
    }
    m_dwLatticeLength = dwLength;
    fRet = TRUE;
_exit:
    if (!fRet) {
        // m_dwLatticeLength has not been set yet, so the rows allocated so
        // far must be released here using dwLength as the row count.
        if (m_ppWordLattice) {
            for (i = 0; i < dwLength; ++i) {
                delete [] m_ppWordLattice[i];
            }
            delete [] m_ppWordLattice;
            m_ppWordLattice = NULL;
        }
        if (m_pdwCandidateNumber) {
            delete [] m_pdwCandidateNumber;
            m_pdwCandidateNumber = NULL;
        }
        if (m_pdwMaxWordLength) {
            delete [] m_pdwMaxWordLength;
            m_pdwMaxWordLength = NULL;
        }
        m_dwLatticeLength = 0;
    }
    return fRet;
}

// Free the word lattice: the candidate-count array, the max-word-length
// array, every lattice row and the row table itself.  All lattice pointers
// are left NULL and m_dwLatticeLength is reset to zero, so the function may
// be called repeatedly.
void CCHTWordBreaker::DestroyLattice()
{
    DWORD i;

    // Everything here was allocated with new[], hence delete [].
    if (m_pdwCandidateNumber) {
        delete [] m_pdwCandidateNumber;
        m_pdwCandidateNumber = NULL;
    }
    if (m_pdwMaxWordLength) {
        delete [] m_pdwMaxWordLength;
        m_pdwMaxWordLength = NULL;
    }
    if (m_ppWordLattice) {
        for (i = 0; i < m_dwLatticeLength; ++i) {
            delete [] m_ppWordLattice[i];
        }
        // Release the row table too, not only the rows it points at.
        delete [] m_ppWordLattice;
        m_ppWordLattice = NULL;
    }
    m_dwLatticeLength = 0;
}


// Create and initialise everything the word breaker needs: the main lexicon
// (initialised from hInstance), the rule lexicon, the break result structure
// with its three arrays, and a lattice of LATTICE_LENGHT positions.
//
// hInstance - passed through to CCHTLexicon::InitData().
//
// Returns TRUE when every step succeeded.  On any failure returns FALSE and
// releases whatever had been created so far.
BOOL CCHTWordBreaker::InitData(
    HINSTANCE hInstance)
{
    BOOL fRet = FALSE;

    m_pcLexicon = new CCHTLexicon;
    if (!m_pcLexicon) { goto _exit; }
    // Test the lexicon result directly: fRet must stay FALSE until the very
    // end, otherwise a later failure would be reported as success.
    if (!m_pcLexicon->InitData(hInstance)) { goto _exit; }

    m_pcRuleLex = new CRuleLexicon;
    if (!m_pcRuleLex)  { goto _exit; }

    m_psBreakResult = new SBreakResult;
    if (!m_psBreakResult) { goto _exit; }
    // Zero the structure so the array pointers are NULL until allocated.
    FillMemory(m_psBreakResult, sizeof(SBreakResult), 0);
    m_psBreakResult->puWordLen = new UINT[LATTICE_LENGHT];
    if (!m_psBreakResult->puWordLen) { goto _exit; }
    m_psBreakResult->pbTerminalCode = new BYTE[LATTICE_LENGHT];
    if (!m_psBreakResult->pbTerminalCode) { goto _exit; }
    m_psBreakResult->puWordAttrib = new UINT[LATTICE_LENGHT];
    if (!m_psBreakResult->puWordAttrib) { goto _exit; }

    if (!AllocLattice(LATTICE_LENGHT)) { goto _exit; }
    fRet = TRUE;
_exit:
    if (!fRet) {
        if (m_pcLexicon) {
            delete m_pcLexicon;
            m_pcLexicon = NULL;
        }
        if (m_pcRuleLex) {
            delete m_pcRuleLex;
            m_pcRuleLex = NULL;
        }
        if (m_psBreakResult) {
            // Arrays were allocated with new[]; the structure itself must be
            // freed as well before the pointer is dropped.
            delete [] m_psBreakResult->puWordLen;
            delete [] m_psBreakResult->pbTerminalCode;
            delete [] m_psBreakResult->puWordAttrib;
            delete m_psBreakResult;
            m_psBreakResult = NULL;
        }
        DestroyLattice();
    }
    return fRet;
}

// Make sure the lattice and the break result arrays can hold dwNewLength
// text positions.
//
// If the current lattice is already large enough nothing happens.  Otherwise
// the lattice is destroyed and reallocated at dwNewLength; if that fails a
// lattice of the default LATTICE_LENGHT is allocated instead so the object
// remains usable for shorter texts.  The result arrays are then reallocated
// to match the lattice that is actually in place.
//
// dwNewLength - required number of text positions.
//
// Returns TRUE when both the lattice and the result arrays can hold
// dwNewLength positions, FALSE otherwise.
BOOL CCHTWordBreaker::LatticeGrow(
    DWORD dwNewLength)
{
    BOOL fRet = FALSE;
    DWORD dwResultLength;

    if (dwNewLength <= m_dwLatticeLength) {
        return TRUE;
    }

    DestroyLattice();
    if (AllocLattice(dwNewLength)) {
        fRet = TRUE;
    } else {
        // Best effort: fall back to the default size.  The call still fails.
        AllocLattice(LATTICE_LENGHT);
    }

    if (m_psBreakResult) {
        // The arrays were allocated with new[], hence delete [].
        delete [] m_psBreakResult->puWordLen;
        delete [] m_psBreakResult->pbTerminalCode;
        delete [] m_psBreakResult->puWordAttrib;
        m_psBreakResult->puWordLen = NULL;
        m_psBreakResult->pbTerminalCode = NULL;
        m_psBreakResult->puWordAttrib = NULL;

        // Size the result arrays after the lattice that now exists: the
        // requested length on success, the fallback length otherwise.
        dwResultLength = fRet ? dwNewLength : m_dwLatticeLength;

        m_psBreakResult->puWordLen = new UINT[dwResultLength];
        m_psBreakResult->pbTerminalCode = new BYTE[dwResultLength];
        m_psBreakResult->puWordAttrib  = new UINT[dwResultLength];

        if (!m_psBreakResult->puWordLen ||
            !m_psBreakResult->pbTerminalCode ||
            !m_psBreakResult->puWordAttrib) {
            // Without result arrays nothing can be written.  Drop the
            // lattice as well (this resets m_dwLatticeLength to zero) so
            // that the next call with a non-empty text retries the whole
            // allocation instead of writing through a NULL pointer.
            delete [] m_psBreakResult->puWordLen;
            delete [] m_psBreakResult->pbTerminalCode;
            delete [] m_psBreakResult->puWordAttrib;
            m_psBreakResult->puWordLen = NULL;
            m_psBreakResult->pbTerminalCode = NULL;
            m_psBreakResult->puWordAttrib = NULL;
            DestroyLattice();
            fRet = FALSE;
        }
    }

    return fRet;
}

// Segment a run of text into words.
//
// The lattice is grown to nTextLen positions if necessary, filled from the
// lexicons by BuildLattice() and resolved into a word sequence by
// GetResult().  When fBreakWithParser is set and the file is compiled with
// PARSER defined, adjacent words whose terminal codes form a rule-lexicon
// entry are merged into a single word marked ATTR_RULE_WORD.
//
// lpcwszText       - text to segment (nTextLen characters).
// nTextLen         - number of characters in lpcwszText.
// pcBaseLex        - optional additional lexicon, may be NULL.
// dwMaxWordLen     - longest word length tried at each position.
// fBreakWithParser - request the rule-based merge pass.
//
// Returns the number of words written to m_psBreakResult; 0 when nothing
// could be segmented (no result structure, invalid arguments or allocation
// failure).
DWORD CCHTWordBreaker::BreakText(
    LPCWSTR   lpcwszText,
    INT       nTextLen,
    CBaseLex* pcBaseLex,
    DWORD     dwMaxWordLen,
    BOOL      fBreakWithParser)
{
    // The result structure only exists after a successful InitData().
    if (!m_psBreakResult) { return 0; }

    m_psBreakResult->dwWordNumber = 0;

    // A negative length would turn into a huge DWORD below; a NULL text with
    // a positive length cannot be read.  Treat both as "nothing to break".
    if (nTextLen < 0 || (!lpcwszText && nTextLen > 0)) { goto _exit; }

    if (!LatticeGrow(nTextLen)) { goto _exit; }

    if (BuildLattice(lpcwszText, nTextLen, pcBaseLex, dwMaxWordLen)) {
        GetResult();
        // process Surrogate Char begin
/*
        INT nCurrentIndex;
        DWORD dwSurIndex;
        nCurrentIndex = 0;
        for (dwSurIndex = 0; dwSurIndex < m_psBreakResult->dwWordNumber; ++dwSurIndex) {
            if (m_psBreakResult->puWordLen[dwSurIndex] == 1) { // High word of surrogate char should be breaked into signal char word
                if (lpcwszText[nCurrentIndex] >= 0xd800 && lpcwszText[nCurrentIndex] <= 0xdbff) { // High word is 
                    if (nCurrentIndex >= nTextLen - 1) { // Should be an error
                    } else if (lpcwszText[nCurrentIndex + 1] >= 0xdc00 && lpcwszText[nCurrentIndex + 1] <= 0xdfff) { // Is surrogate char
                        DWORD dwMoveDataNum;
                        dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwSurIndex + 1 + 1);
                        m_psBreakResult->puWordLen[dwSurIndex] = 2;
                        CopyMemory(&(m_psBreakResult->puWordLen[dwSurIndex + 1]), &(m_psBreakResult->puWordLen[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));
                        CopyMemory(&(m_psBreakResult->pbTerminalCode[dwSurIndex + 1]), &(m_psBreakResult->pbTerminalCode[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(BYTE));
                        CopyMemory(&(m_psBreakResult->puWordAttrib[dwSurIndex + 1]), &(m_psBreakResult->puWordAttrib[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));
                        m_psBreakResult->dwWordNumber -= 1;
                        //nCurrentIndex -= 1;
                    } else {// Should be an error
                    }
                }
            }
            nCurrentIndex += m_psBreakResult->puWordLen[dwSurIndex];
        } */
        // process Surrogate Char end

        if (fBreakWithParser) {
#ifdef PARSER
            DWORD i, dwBeginIndex, dwParseLen;
            DWORD dwMoveDataNum;
            PWORD pwTerminalCode;

            // Wide-character copy of the terminal codes, one entry per word,
            // in the form the rule lexicon is queried with.
            // NOTE(review): this assumes every terminal code byte converts
            // to exactly one wide character under code page 950 - confirm.
            pwTerminalCode = new WORD[m_psBreakResult->dwWordNumber];
            if (pwTerminalCode) {
                MultiByteToWideChar(950, MB_PRECOMPOSED, (const char *)m_psBreakResult->pbTerminalCode,
                    m_psBreakResult->dwWordNumber, pwTerminalCode, m_psBreakResult->dwWordNumber);
                for (dwBeginIndex = 0; dwBeginIndex < m_psBreakResult->dwWordNumber; dwBeginIndex += 1) {
                    // A blank terminal code never starts a rule word.
                    if (m_psBreakResult->pbTerminalCode[dwBeginIndex] == ' ') { continue; }
                    // Longest run of non-blank terminal codes starting here.
                    for (dwParseLen = 1; dwBeginIndex + dwParseLen < m_psBreakResult->dwWordNumber; ++dwParseLen) {
                        if (m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen] == ' ') {
                            break;
                        }
                    }
                    // Shrink the run until the rule lexicon accepts it.
                    for ( ; dwParseLen > 1; --dwParseLen) {
                        if (m_pcRuleLex->IsAWord(&pwTerminalCode[dwBeginIndex], dwParseLen)) {
                            break;
                        }
                    }
                    if (dwParseLen > 1) { // adjust break result
                        for (i = 1; i < dwParseLen; ++i) {
                            m_psBreakResult->puWordLen[dwBeginIndex] += m_psBreakResult->puWordLen[dwBeginIndex + i];
                        }
                        m_psBreakResult->puWordAttrib[dwBeginIndex] = ATTR_RULE_WORD;
                        dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwBeginIndex + dwParseLen);
                        // Source and destination overlap whenever the tail is
                        // longer than the gap being closed, so MoveMemory is
                        // required rather than CopyMemory.
                        MoveMemory(&(m_psBreakResult->puWordLen[dwBeginIndex + 1]),
                            &(m_psBreakResult->puWordLen[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(UINT));
                        MoveMemory(&(m_psBreakResult->pbTerminalCode[dwBeginIndex + 1]),
                            &(m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(BYTE));
                        MoveMemory(&(m_psBreakResult->puWordAttrib[dwBeginIndex + 1]),
                            &(m_psBreakResult->puWordAttrib[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(UINT));
                        // Keep the wide terminal codes index-aligned with the
                        // compacted result arrays for the following lookups.
                        MoveMemory(&pwTerminalCode[dwBeginIndex + 1],
                            &pwTerminalCode[dwBeginIndex + dwParseLen],
                            dwMoveDataNum * sizeof(WORD));
                        m_psBreakResult->dwWordNumber -= (dwParseLen - 1);
                    }
                }
                delete [] pwTerminalCode;
            }
#endif
        }// if support parser
    } // if build lattice success

_exit:
    return m_psBreakResult->dwWordNumber;
}

// Resolve the lattice built by BuildLattice() into a word sequence.
//
// Starting at position 0, every path of up to three consecutive candidate
// words is enumerated, scored with GetScore() and ranked with
// CompareScore().  The first word of the best path is appended to
// m_psBreakResult and the scan continues right after that word, until
// m_dwSentenceLength characters are consumed.
//
// Returns the number of words written to m_psBreakResult.
DWORD CCHTWordBreaker::GetResult(void)
{
    DWORD dwLen = 0;
    DWORD dwWordLen;
    SLocalPath sLocalPath[2];       // [uBestIndex] best path so far, [uCandIndex] path being built
    UINT  uBestIndex = 0, uCandIndex, uLocalPathIndex;
    DWORD dw2ndIndex, dw3rdIndex;
    DWORD i, j, k;


    m_psBreakResult->dwWordNumber = 0;

    uCandIndex = (uBestIndex + 1) % 2;    
    
    while (dwLen < m_dwSentenceLength) {
        uLocalPathIndex = 0;
        if (m_pdwCandidateNumber[dwLen] == 1) {
            // Only one candidate: no search needed.  Use the candidate's own
            // length so the emitted length always matches the word whose
            // attribute and terminal code are reported.
            sLocalPath[uBestIndex].dwLength[0] = m_ppWordLattice[dwLen][0].uLen;
            sLocalPath[uBestIndex].bTerminalCode[0] = m_ppWordLattice[dwLen][0].bTerminalCode;
            sLocalPath[uBestIndex].wAttribute[0] = m_ppWordLattice[dwLen][0].wAttr;
        } else {
            FillMemory(&sLocalPath[uBestIndex], sizeof(SLocalPath), 0);
            // First word of the path.
            for (i = 0; i < m_pdwCandidateNumber[dwLen]; ++i) {
                FillMemory(&sLocalPath[uCandIndex], sizeof(SLocalPath), 0);
                ++sLocalPath[uCandIndex].uStep;
                sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dwLen][i].uLen;
                sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wCount;
                sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wAttr;
                sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dwLen][i].bTerminalCode;
                dw2ndIndex = dwLen + m_ppWordLattice[dwLen][i].uLen; 
                if (dw2ndIndex < m_dwSentenceLength) {
                    // Second word of the path.
                    for (j = 0; j < m_pdwCandidateNumber[dw2ndIndex]; ++j) {
                        ++sLocalPath[uCandIndex].uStep;
                        sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].uLen;
                        sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wCount;
                        sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wAttr;
                        sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw2ndIndex][j].bTerminalCode;
                        dw3rdIndex = dw2ndIndex + m_ppWordLattice[dw2ndIndex][j].uLen;
                        if (dw3rdIndex < m_dwSentenceLength) {
                            // Third word of the path.
                            for (k = 0; k < m_pdwCandidateNumber[dw3rdIndex]; ++k) {
                                ++sLocalPath[uCandIndex].uStep;
                                sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].uLen;
                                sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wCount;
                                sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wAttr;
                                sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw3rdIndex][k].bTerminalCode;
                                GetScore(&(sLocalPath[uCandIndex])); 
                                if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
                                    CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
                                }
                                // Pop the third word before trying the next one.
                                --uLocalPathIndex;
                                --sLocalPath[uCandIndex].uStep;
                            }
                        } else {
                            // The text ends after two words: score the two-word path.
                            GetScore(&(sLocalPath[uCandIndex])); 
                            if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
                                CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
                            }
                        }
                        // Pop the second word.
                        --uLocalPathIndex;
                        --sLocalPath[uCandIndex].uStep;
                    }
                } else {
                    // The text ends after one word: score the one-word path.
                    GetScore(&(sLocalPath[uCandIndex])); 
                    if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
                        CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
                    }
                }
                // Pop the first word.
                --uLocalPathIndex;
                --sLocalPath[uCandIndex].uStep;
            }
        }
        // Always advance by at least one character; a zero-length best word
        // (e.g. a position without candidates) would otherwise loop forever.
        dwWordLen = sLocalPath[uBestIndex].dwLength[0];
        if (dwWordLen == 0) {
            dwWordLen = 1;
        }
        m_psBreakResult->puWordLen[m_psBreakResult->dwWordNumber] = dwWordLen;
        m_psBreakResult->pbTerminalCode[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].bTerminalCode[0];
        m_psBreakResult->puWordAttrib[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].wAttribute[0];
        ++m_psBreakResult->dwWordNumber;
        dwLen += dwWordLen;
    }

    return m_psBreakResult->dwWordNumber;
}

// Rank two scored local paths.
//
// Returns 1 when psLocalPath1 is the better path, -1 when psLocalPath2 is
// better and 0 when they tie on every criterion.  Criteria, in order:
//   1. larger uPathLength wins
//   2. smaller uStep wins
//   3. smaller fVariance wins
//   4. smaller uCompoundNum wins
//   5. smaller uDMNum wins
//   6. larger wUniCountSum wins
INT CCHTWordBreaker::CompareScore(
    PSLocalPath psLocalPath1,
    PSLocalPath psLocalPath2)
{
    if (psLocalPath1->uPathLength != psLocalPath2->uPathLength) {
        return (psLocalPath1->uPathLength > psLocalPath2->uPathLength) ? 1 : -1;
    }

    if (psLocalPath1->uStep != psLocalPath2->uStep) {
        return (psLocalPath1->uStep < psLocalPath2->uStep) ? 1 : -1;
    }

    if (psLocalPath1->fVariance > psLocalPath2->fVariance) {
        return -1;
    }
    if (psLocalPath1->fVariance < psLocalPath2->fVariance) {
        return 1;
    }

    if (psLocalPath1->uCompoundNum != psLocalPath2->uCompoundNum) {
        return (psLocalPath1->uCompoundNum < psLocalPath2->uCompoundNum) ? 1 : -1;
    }

    if (psLocalPath1->uDMNum != psLocalPath2->uDMNum) {
        return (psLocalPath1->uDMNum < psLocalPath2->uDMNum) ? 1 : -1;
    }

    if (psLocalPath1->wUniCountSum != psLocalPath2->wUniCountSum) {
        return (psLocalPath1->wUniCountSum > psLocalPath2->wUniCountSum) ? 1 : -1;
    }

    return 0;
}

// Compute the score fields of a local path from its first uStep words.
//
// Fills in:
//   uPathLength  - sum of the word lengths
//   wUniCountSum - sum of the per-word counts (wUnicount)
//   uCompoundNum - number of words carrying ATTR_COMPOUND
//   uDMNum       - number of words carrying ATTR_DM
//   fVariance    - sum over the words of |word length - mean word length|
//
// psLocalPath - path to score; uStep and the per-word arrays are read, the
//               fields listed above are overwritten.
void CCHTWordBreaker::GetScore(
    PSLocalPath psLocalPath)
{
    UINT uWord;
    double dMeanLength;

    // Start every accumulated field from zero.
    psLocalPath->uPathLength = 0;
    psLocalPath->wUniCountSum = 0;
    psLocalPath->uCompoundNum = 0;
    psLocalPath->uDMNum = 0;
    psLocalPath->fVariance = 0;

    // First pass: totals and attribute counts.
    for (uWord = 0; uWord < psLocalPath->uStep; ++uWord) {
        psLocalPath->uPathLength += psLocalPath->dwLength[uWord];
        psLocalPath->wUniCountSum += psLocalPath->wUnicount[uWord];
        if (psLocalPath->wAttribute[uWord] & ATTR_COMPOUND) {
            psLocalPath->uCompoundNum++;
        }
        if (psLocalPath->wAttribute[uWord] & ATTR_DM) {
            psLocalPath->uDMNum++;
        }
    }

    // Second pass: accumulate each word's absolute distance from the mean.
    dMeanLength = (double)psLocalPath->uPathLength / psLocalPath->uStep;
    for (uWord = 0; uWord < psLocalPath->uStep; ++uWord) {
        psLocalPath->fVariance += (dMeanLength > psLocalPath->dwLength[uWord])
            ? (dMeanLength - psLocalPath->dwLength[uWord])
            : (psLocalPath->dwLength[uWord] - dMeanLength);
    }
}

         
// Fill the word lattice for a text.
//
// For every start position each word length from 1 up to dwMaxWordLen (and
// not running past the end of the text) is looked up, first in the main
// lexicon and, if that misses and pcBaseLex is given, in pcBaseLex.  Every
// hit becomes a candidate node at that position.  A position without any hit
// receives a single one-character node with zero count/attribute and a
// blank terminal code.  Finally m_dwSentenceLength is set to dwTextLen.
//
// lpcwszText   - text to analyse (dwTextLen characters).
// dwTextLen    - number of characters; the lattice must already be at least
//                this long.
// pcBaseLex    - optional additional lexicon, may be NULL.
// dwMaxWordLen - longest word length to try.
//
// Always returns TRUE.
//
// NOTE(review): at most one candidate is added per word length, so a row
// receives up to dwMaxWordLen nodes - presumably callers keep dwMaxWordLen
// within the row capacity used when the lattice was allocated; confirm.
BOOL CCHTWordBreaker::BuildLattice(
    LPCWSTR   lpcwszText,
    DWORD     dwTextLen,
    CBaseLex* pcBaseLex,
    DWORD     dwMaxWordLen)
{ 
    DWORD dwStart, dwWordLen, dwSlot;
    BOOL  fFound;

    FillMemory(m_pdwCandidateNumber, sizeof(DWORD) * dwTextLen, 0);
    // we should use head link
    for (dwStart = 0; dwStart < dwTextLen; ++dwStart) {
        m_pdwMaxWordLength[dwStart] = 1;
        for (dwWordLen = 1;
             dwWordLen <= dwMaxWordLen && dwWordLen <= dwTextLen - dwStart;
             ++dwWordLen) {
            // Next free candidate slot at this position; the lookups write
            // straight into it and it is only claimed on a hit.
            dwSlot = m_pdwCandidateNumber[dwStart];
            fFound = FALSE;
            if (m_pcLexicon->GetWordInfo(&lpcwszText[dwStart], dwWordLen,
                &(m_ppWordLattice[dwStart][dwSlot].wCount),
                &(m_ppWordLattice[dwStart][dwSlot].wAttr),
                &(m_ppWordLattice[dwStart][dwSlot].bTerminalCode))) {
                fFound = TRUE;
            } else if (pcBaseLex && pcBaseLex->GetWordInfo(&lpcwszText[dwStart], dwWordLen,
                &(m_ppWordLattice[dwStart][dwSlot].wAttr))) {
                // The base lexicon supplies only the attribute.
                m_ppWordLattice[dwStart][dwSlot].wCount = APLEXICON_COUNT;
                m_ppWordLattice[dwStart][dwSlot].bTerminalCode = ' ';
                fFound = TRUE;
            }
            if (fFound) {
                m_ppWordLattice[dwStart][dwSlot].uLen = dwWordLen;
                ++m_pdwCandidateNumber[dwStart];
                if (dwWordLen > m_pdwMaxWordLength[dwStart]) {
                    m_pdwMaxWordLength[dwStart] = dwWordLen;
                }
            }
        }
        if (m_pdwCandidateNumber[dwStart] == 0) {
            // No lexicon knows anything here: single-character fallback.
            m_ppWordLattice[dwStart][0].uLen = 1;
            m_ppWordLattice[dwStart][0].wCount = 0;
            m_ppWordLattice[dwStart][0].wAttr = 0;
            m_ppWordLattice[dwStart][0].fVariance = 0;
            m_ppWordLattice[dwStart][0].bTerminalCode = ' ';
            m_pdwCandidateNumber[dwStart] = 1;
        }
    }
    m_dwSentenceLength = dwTextLen;
    return TRUE;
}
/*
DWORD CCHTWordBreaker::LongestRuleWord(
    DWORD dwIndex)
{
    DWORD dwRet = 0, i;
    
    for (i = 0; i < m_pdwCandidateNumber[dwIndex]; ++i) {
        if (m_ppWordLattice[dwIndex][i].bAttr & ATTR_RULE_WORD) {
            if (m_ppWordLattice[dwIndex][i].uLen > dwRet) {
                dwRet = m_ppWordLattice[dwIndex][i].uLen;
            }
            
        }
    }
    return dwRet;
}
*/
Add source files 2020-09-26 03:20:57 -05:00			`#include <windows.h>`
			`#include "lexicon.h"`
			`#include "rulelex.h"`
			`#include "LexMgr.h"`
			`#include "CHTBrKr.h"`
			`#include "BaseLex.h"`


			`CCHTWordBreaker::CCHTWordBreaker(void)`
			`{`
			`m_pcLexicon = NULL;`
			`m_ppWordLattice = NULL;`
			`m_pdwCandidateNumber = NULL;`
			`m_dwSentenceLength = 0;`
			`m_dwLatticeLength = 0;`
			`m_pdwMaxWordLength = NULL;`
			`m_psBreakResult = NULL;`
			`m_pcRuleLex = NULL;`
			`}`

			`CCHTWordBreaker::~CCHTWordBreaker(void)`
			`{`
			`DWORD i;`

			`if (m_pcLexicon) {`
			`delete m_pcLexicon;`
			`m_pcLexicon = NULL;`
			`}`
			`if (m_ppWordLattice) {`
			`for (i = 0; i < m_dwSentenceLength; ++i) {`
			`if (m_ppWordLattice[i]) {`
			`delete m_ppWordLattice[i];`
			`}`
			`}`
			`delete m_ppWordLattice;`
			`m_ppWordLattice = NULL;`
			`}`
			`if (m_pdwCandidateNumber) {`
			`delete m_pdwCandidateNumber;`
			`m_pdwCandidateNumber = NULL;`
			`}`
			`if (m_pdwMaxWordLength) {`
			`delete m_pdwMaxWordLength;`
			`m_pdwMaxWordLength = NULL;`
			`}`
			`if (m_psBreakResult) {`
			`if (m_psBreakResult->puWordLen) {`
			`delete m_psBreakResult->puWordLen;`
			`}`
			`if (m_psBreakResult->pbTerminalCode) {`
			`delete m_psBreakResult->pbTerminalCode;`
			`}`
			`if (m_psBreakResult->puWordAttrib) {`
			`delete m_psBreakResult->puWordAttrib;`
			`}`
			`}`
			`m_dwSentenceLength = 0;`
			`m_dwLatticeLength = 0;`
			`}`

			`BOOL CCHTWordBreaker::AllocLattice(`
			`DWORD dwLength)`
			`{`
			`BOOL fRet = FALSE;`
			`DWORD i;`

			`m_pdwMaxWordLength= new DWORD[dwLength];`
			`if (!m_pdwMaxWordLength) { goto _exit; }`

			`m_pdwCandidateNumber = new DWORD[dwLength];`
			`if (!m_pdwCandidateNumber) { goto _exit; }`

			`m_ppWordLattice = new PSLatticeNode[dwLength];`
			`if (!m_ppWordLattice) { goto _exit; }`
			`for (i = 0; i < dwLength; ++i) {`
			`m_ppWordLattice[i] = NULL;`
			`}`
			`for (i = 0; i < dwLength; ++i) {`
			`m_ppWordLattice[i] = new SLatticeNode[MAX_CHAR_PER_WORD];`
			`if (!m_ppWordLattice[i]) { goto _exit; }`
			`m_pdwCandidateNumber[i] = 0;`
			`}`
			`m_dwLatticeLength = dwLength;`
			`fRet = TRUE;`
			`_exit:`
			`if (!fRet) {`
			`DestroyLattice();`
			`}`
			`return fRet;`
			`}`

			`void CCHTWordBreaker::DestroyLattice()`
			`{`
			`DWORD i;`

			`if (m_pdwCandidateNumber) {`
			`delete m_pdwCandidateNumber;`
			`m_pdwCandidateNumber = NULL;`
			`}`
			`if (m_pdwMaxWordLength) {`
			`delete m_pdwMaxWordLength;`
			`m_pdwMaxWordLength = NULL;`
			`}`
			`if (m_ppWordLattice) {`
			`for (i = 0; i < m_dwLatticeLength; ++i) {`
			`if (m_ppWordLattice[i]) {`
			`delete m_ppWordLattice[i];`
			`}`
			`}`
			`m_ppWordLattice = NULL;`
			`m_dwLatticeLength = 0;`
			`}`
			`}`


			`BOOL CCHTWordBreaker::InitData(`
			`HINSTANCE hInstance)`
			`{`
			`BOOL fRet = FALSE;`

			`m_pcLexicon = new CCHTLexicon;`
			`if (!m_pcLexicon) { goto _exit; }`
			`fRet = m_pcLexicon->InitData(hInstance);`
			`if (!fRet) { goto _exit; }`

			`m_pcRuleLex = new CRuleLexicon;`
			`if (!m_pcRuleLex) { goto _exit; }`

			`m_psBreakResult = new SBreakResult;`
			`if (!m_psBreakResult) { goto _exit; }`
			`FillMemory(m_psBreakResult, sizeof(SBreakResult), 0);`
			`m_psBreakResult->puWordLen = new UINT[LATTICE_LENGHT];`
			`m_psBreakResult->pbTerminalCode = new BYTE[LATTICE_LENGHT];`
			`m_psBreakResult->puWordAttrib = new UINT[LATTICE_LENGHT];`
			`if (!AllocLattice(LATTICE_LENGHT)) { goto _exit; }`
			`fRet = TRUE;`
			`_exit:`
			`if (!fRet) {`
			`if (m_pcLexicon) {`
			`delete m_pcLexicon;`
			`m_pcLexicon = NULL;`
			`}`
			`if (m_pcRuleLex) {`
			`delete m_pcRuleLex;`
			`m_pcRuleLex = NULL;`
			`}`
			`if (m_psBreakResult) {`
			`if (m_psBreakResult->puWordLen) {`
			`delete m_psBreakResult->puWordLen;`
			`}`
			`if (m_psBreakResult->pbTerminalCode) {`
			`delete m_psBreakResult->pbTerminalCode;`
			`}`
			`if (m_psBreakResult->puWordAttrib) {`
			`delete m_psBreakResult->puWordAttrib;`
			`}`
			`m_psBreakResult = NULL;`
			`}`
			`DestroyLattice();`
			`}`
			`return fRet;`
			`}`

			`BOOL CCHTWordBreaker::LatticeGrow(`
			`DWORD dwNewLength)`
			`{`
			`BOOL fRet = FALSE;`

			`if (dwNewLength <= m_dwLatticeLength) {`
			`fRet = TRUE;`
			`goto _exit;`
			`}`
			`DestroyLattice();`
			`if (AllocLattice(dwNewLength)) {`
			`fRet = TRUE;`
			`} else {`
			`AllocLattice(LATTICE_LENGHT);`
			`}`
			`if (m_psBreakResult) {`
			`if (m_psBreakResult->puWordLen) {`
			`delete m_psBreakResult->puWordLen;`
			`}`
			`if (m_psBreakResult->pbTerminalCode) {`
			`delete m_psBreakResult->pbTerminalCode;`
			`}`
			`if (m_psBreakResult->puWordAttrib) {`
			`delete m_psBreakResult->puWordAttrib;`
			`}`
			`m_psBreakResult->puWordLen = new UINT[dwNewLength];`
			`m_psBreakResult->pbTerminalCode = new BYTE[dwNewLength];`
			`m_psBreakResult->puWordAttrib = new UINT[dwNewLength];`
			`}`

			`_exit:`
			`return fRet;`
			`}`

			`DWORD CCHTWordBreaker::BreakText(`
			`LPCWSTR lpcwszText,`
			`INT nTextLen,`
			`CBaseLex* pcBaseLex,`
			`DWORD dwMaxWordLen,`
			`BOOL fBreakWithParser)`
			`{`
			`m_psBreakResult->dwWordNumber = 0;`

			`if (!LatticeGrow(nTextLen)) { goto _exit; }`

			`if (BuildLattice(lpcwszText, nTextLen, pcBaseLex, dwMaxWordLen)) {`
			`GetResult();`
			`// process Surrogate Char begin`
			`/*`
			`INT nCurrentIndex;`
			`DWORD dwSurIndex;`
			`nCurrentIndex = 0;`
			`for (dwSurIndex = 0; dwSurIndex < m_psBreakResult->dwWordNumber; ++dwSurIndex) {`
			`if (m_psBreakResult->puWordLen[dwSurIndex] == 1) { // High word of surrogate char should be breaked into signal char word`
			`if (lpcwszText[nCurrentIndex] >= 0xd800 && lpcwszText[nCurrentIndex] <= 0xdbff) { // High word is`
			`if (nCurrentIndex >= nTextLen - 1) { // Should be an error`
			`} else if (lpcwszText[nCurrentIndex + 1] >= 0xdc00 && lpcwszText[nCurrentIndex + 1] <= 0xdfff) { // Is surrogate char`
			`DWORD dwMoveDataNum;`
			`dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwSurIndex + 1 + 1);`
			`m_psBreakResult->puWordLen[dwSurIndex] = 2;`
			`CopyMemory(&(m_psBreakResult->puWordLen[dwSurIndex + 1]), &(m_psBreakResult->puWordLen[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));`
			`CopyMemory(&(m_psBreakResult->pbTerminalCode[dwSurIndex + 1]), &(m_psBreakResult->pbTerminalCode[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(BYTE));`
			`CopyMemory(&(m_psBreakResult->puWordAttrib[dwSurIndex + 1]), &(m_psBreakResult->puWordAttrib[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));`
			`m_psBreakResult->dwWordNumber -= 1;`
			`//nCurrentIndex -= 1;`
			`} else {// Should be an error`
			`}`
			`}`
			`}`
			`nCurrentIndex += m_psBreakResult->puWordLen[dwSurIndex];`
			`} */`
			`// process Surrogate Char end`

			`if (fBreakWithParser) {`
			`#ifdef PARSER`
			`DWORD i, dwBeginIndex, dwParseLen;`
			`PWORD pwTerminalCode;`
			`pwTerminalCode = NULL;`
			`dwParseLen = 0;`
			`pwTerminalCode = new WORD[m_psBreakResult->dwWordNumber];`
			`MultiByteToWideChar(950, MB_PRECOMPOSED, (const char *)m_psBreakResult->pbTerminalCode,`
			`m_psBreakResult->dwWordNumber, pwTerminalCode, m_psBreakResult->dwWordNumber);`
			`for (dwBeginIndex = 0; dwBeginIndex < m_psBreakResult->dwWordNumber; dwBeginIndex += 1) {`
			`if (m_psBreakResult->pbTerminalCode[dwBeginIndex] == ' ') { continue; }`
			`for (dwParseLen = 1; dwBeginIndex + dwParseLen < m_psBreakResult->dwWordNumber; ++dwParseLen) {`
			`if (m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen] == ' ') {`
			`break;`
			`}`
			`}`
			`for ( ; dwParseLen > 1; --dwParseLen) {`
			`if (m_pcRuleLex->IsAWord(&pwTerminalCode[dwBeginIndex], dwParseLen)) {`
			`break;`
			`}`
			`}`
			`if (dwParseLen > 1) { // adjust break result`
			`for (i = 1; i < dwParseLen; ++i) {`
			`m_psBreakResult->puWordLen[dwBeginIndex] += m_psBreakResult->puWordLen[dwBeginIndex + i];`
			`}`
			`m_psBreakResult->puWordAttrib[dwBeginIndex] = ATTR_RULE_WORD;`
			`DWORD dwMoveDataNum;`
			`dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwBeginIndex + dwParseLen);`
			`CopyMemory(&(m_psBreakResult->puWordLen[dwBeginIndex + 1]),`
			`&(m_psBreakResult->puWordLen[dwBeginIndex + dwParseLen]),`
			`dwMoveDataNum * sizeof(UINT));`
			`CopyMemory(&(m_psBreakResult->pbTerminalCode[dwBeginIndex + 1]),`
			`&(m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen]),`
			`dwMoveDataNum * sizeof(BYTE));`
			`CopyMemory(&(m_psBreakResult->puWordAttrib[dwBeginIndex + 1]),`
			`&(m_psBreakResult->puWordAttrib[dwBeginIndex + dwParseLen]),`
			`dwMoveDataNum * sizeof(UINT));`
			`m_psBreakResult->dwWordNumber -= (dwParseLen - 1);`
			`}`
			`}`
			`if (pwTerminalCode) {`
			`delete [] pwTerminalCode;`
			`}`
			`#endif`
			`}// if support parser`
			`} // if build lattice success`

			`_exit:`
			`return m_psBreakResult->dwWordNumber;`
			`}`

			`DWORD CCHTWordBreaker::GetResult(void)`
			`{`
			`DWORD dwRet = 0;`
			`DWORD dwLen = 0;`
			`SLocalPath sLocalPath[2];`
			`UINT uBestIndex = 0, uCandIndex, uLocalPathIndex;`
			`DWORD dw2ndIndex, dw3rdIndex;`
			`DWORD i, j, k;`


			`m_psBreakResult->dwWordNumber = 0;`

			`uCandIndex = (uBestIndex + 1) % 2;`

			`while (dwLen < m_dwSentenceLength) {`
			`uLocalPathIndex = 0;`
			`if (m_pdwCandidateNumber[dwLen] == 1) {`
			`sLocalPath[uBestIndex].dwLength[0] = 1;`
			`sLocalPath[uBestIndex].bTerminalCode[0] = m_ppWordLattice[dwLen][0].bTerminalCode;`
			`sLocalPath[uBestIndex].wAttribute[0] = m_ppWordLattice[dwLen][0].wAttr;`
			`} else {`
			`FillMemory(&sLocalPath[uBestIndex], sizeof(SLocalPath), 0);`
			`for (i = 0; i < m_pdwCandidateNumber[dwLen]; ++i) {`
			`FillMemory(&sLocalPath[uCandIndex], sizeof(SLocalPath), 0);`
			`++sLocalPath[uCandIndex].uStep;`
			`sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dwLen][i].uLen;`
			`sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wCount;`
			`sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wAttr;`
			`sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dwLen][i].bTerminalCode;`
			`dw2ndIndex = dwLen + m_ppWordLattice[dwLen][i].uLen;`
			`if (dw2ndIndex < m_dwSentenceLength) {`
			`for (j = 0; j < m_pdwCandidateNumber[dw2ndIndex]; ++j) {`
			`++sLocalPath[uCandIndex].uStep;`
			`sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].uLen;`
			`sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wCount;`
			`sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wAttr;`
			`sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw2ndIndex][j].bTerminalCode;`
			`dw3rdIndex = dw2ndIndex + m_ppWordLattice[dw2ndIndex][j].uLen;`
			`if (dw3rdIndex < m_dwSentenceLength) {`
			`for (k = 0; k < m_pdwCandidateNumber[dw3rdIndex]; ++k) {`
			`++sLocalPath[uCandIndex].uStep;`
			`sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].uLen;`
			`sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wCount;`
			`sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wAttr;`
			`sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw3rdIndex][k].bTerminalCode;`
			`GetScore(&(sLocalPath[uCandIndex]));`
			`if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {`
			`CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));`
			`}`
			`--uLocalPathIndex;`
			`--sLocalPath[uCandIndex].uStep;`
			`}`
			`} else {`
			`GetScore(&(sLocalPath[uCandIndex]));`
			`if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {`
			`CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));`
			`}`
			`}`
			`--uLocalPathIndex;`
			`--sLocalPath[uCandIndex].uStep;`
			`}`
			`} else {`
			`GetScore(&(sLocalPath[uCandIndex]));`
			`if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {`
			`CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));`
			`}`
			`}`
			`--uLocalPathIndex;`
			`--sLocalPath[uCandIndex].uStep;`
			`}`
			`}`
			`m_psBreakResult->puWordLen[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].dwLength[0];`
			`m_psBreakResult->pbTerminalCode[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].bTerminalCode[0];`
			`m_psBreakResult->puWordAttrib[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].wAttribute[0];`
			`++m_psBreakResult->dwWordNumber;`
			`dwLen += sLocalPath[uBestIndex].dwLength[0];`
			`}`

			`return m_psBreakResult->dwWordNumber;`
			`}`

			`INT CCHTWordBreaker::CompareScore(`
			`PSLocalPath psLocalPath1,`
			`PSLocalPath psLocalPath2)`
			`{`
			`if (psLocalPath1->uPathLength > psLocalPath2->uPathLength) {`
			`return 1;`
			`} else if (psLocalPath1->uPathLength < psLocalPath2->uPathLength) {`
			`return -1;`
			`} else if (psLocalPath1->uStep < psLocalPath2->uStep) {`
			`return 1;`
			`} else if (psLocalPath1->uStep > psLocalPath2->uStep) {`
			`return -1;`
			`} else {`
			`}`

			`if (psLocalPath1->fVariance > psLocalPath2->fVariance) {`
			`return -1;`
			`}`
			`if (psLocalPath1->fVariance < psLocalPath2->fVariance) {`
			`return 1;`
			`}`
			`if (psLocalPath1->uCompoundNum > psLocalPath2->uCompoundNum) {`
			`return -1;`
			`}`
			`if (psLocalPath1->uCompoundNum < psLocalPath2->uCompoundNum) {`
			`return 1;`
			`}`
			`if (psLocalPath1->uDMNum > psLocalPath2->uDMNum) {`
			`return -1;`
			`}`
			`if (psLocalPath1->uDMNum < psLocalPath2->uDMNum) {`
			`return 1;`
			`}`
			`if (psLocalPath1->wUniCountSum > psLocalPath2->wUniCountSum) {`
			`return 1;`
			`}`
			`if (psLocalPath1->wUniCountSum < psLocalPath2->wUniCountSum) {`
			`return -1;`
			`}`
			`return 0;`
			`}`

			`void CCHTWordBreaker::GetScore(`
			`PSLocalPath psLocalPath)`
			`{`
			`UINT i;`
			`double fAverageSum;`

			`psLocalPath->uCompoundNum = 0;`
			`psLocalPath->uDMNum = 0;`
			`psLocalPath->uPathLength = 0;`
			`psLocalPath->fVariance = 0;`
			`psLocalPath->wUniCountSum = 0;`

			`for (i = 0; i < psLocalPath->uStep; ++i) {`
			`if (psLocalPath->wAttribute[i] & ATTR_COMPOUND) {`
			`psLocalPath->uCompoundNum++;`
			`}`
			`if (psLocalPath->wAttribute[i] & ATTR_DM) {`
			`psLocalPath->uDMNum++;`
			`}`
			`psLocalPath->uPathLength += psLocalPath->dwLength[i];`
			`psLocalPath->wUniCountSum += psLocalPath->wUnicount[i];`
			`}`
			`fAverageSum = (double)psLocalPath->uPathLength / psLocalPath->uStep;`
			`for (i = 0; i < psLocalPath->uStep; ++i) {`
			`if (fAverageSum > psLocalPath->dwLength[i]) {`
			`psLocalPath->fVariance += (fAverageSum - psLocalPath->dwLength[i]);`
			`} else {`
			`psLocalPath->fVariance += (psLocalPath->dwLength[i] - fAverageSum);`
			`}`
			`}`
			`}`


			`BOOL CCHTWordBreaker::BuildLattice(`
			`LPCWSTR lpcwszText,`
			`DWORD dwTextLen,`
			`CBaseLex* pcBaseLex,`
			`DWORD dwMaxWordLen)`
			`{`
			`DWORD i, j;`

			`FillMemory(m_pdwCandidateNumber, sizeof(DWORD) * dwTextLen, 0);`
			`// we should use head link`
			`for (i = 0; i < dwTextLen; ++i) {`
			`m_pdwMaxWordLength[i] = 1;`
			`for (j = i; (j - i + 1) <= dwMaxWordLen && j < dwTextLen; ++j) {`
			`if (m_pcLexicon->GetWordInfo(&lpcwszText[i], (j - i + 1),`
			`&(m_ppWordLattice[i][m_pdwCandidateNumber[i]].wCount),`
			`&(m_ppWordLattice[i][m_pdwCandidateNumber[i]].wAttr),`
			`&(m_ppWordLattice[i][m_pdwCandidateNumber[i]].bTerminalCode))) {`
			`m_ppWordLattice[i][m_pdwCandidateNumber[i]++].uLen = (j - i + 1);`
			`if (j - i + 1 > m_pdwMaxWordLength[i]) {`
			`m_pdwMaxWordLength[i] = j - i + 1 ;`
			`}`
			`} else if (pcBaseLex && pcBaseLex->GetWordInfo(&lpcwszText[i], (j - i + 1),`
			`&(m_ppWordLattice[i][m_pdwCandidateNumber[i]].wAttr))) {`
			`m_ppWordLattice[i][m_pdwCandidateNumber[i]].wCount = APLEXICON_COUNT;`
			`m_ppWordLattice[i][m_pdwCandidateNumber[i]].bTerminalCode = ' ';`
			`m_ppWordLattice[i][m_pdwCandidateNumber[i]++].uLen = (j - i + 1);`
			`if (j - i + 1 > m_pdwMaxWordLength[i]) {`
			`m_pdwMaxWordLength[i] = j - i + 1 ;`
			`}`
			`} else {`
			`}`
			`}`
			`if (!m_pdwCandidateNumber[i]) {`
			`m_ppWordLattice[i][0].uLen = 1;`
			`m_ppWordLattice[i][0].wCount = 0;`
			`m_ppWordLattice[i][0].wAttr = 0;`
			`m_ppWordLattice[i][0].fVariance = 0;`
			`m_ppWordLattice[i][0].bTerminalCode = ' ';`
			`++m_pdwCandidateNumber[i];`
			`}`
			`}`
			`m_dwSentenceLength = dwTextLen;`
			`return TRUE;`
			`}`
			`/*`
			`DWORD CCHTWordBreaker::LongestRuleWord(`
			`DWORD dwIndex)`
			`{`
			`DWORD dwRet = 0, i;`

			`for (i = 0; i < m_pdwCandidateNumber[dwIndex]; ++i) {`
			`if (m_ppWordLattice[dwIndex][i].bAttr & ATTR_RULE_WORD) {`
			`if (m_ppWordLattice[dwIndex][i].uLen > dwRet) {`
			`dwRet = m_ppWordLattice[dwIndex][i].uLen;`
			`}`

			`}`
			`}`
			`return dwRet;`
			`}`
			`*/`