windows-nt/Source/XPSP1/NT/inetsrv/intlwb/cht2/srcs/chtbrkr.cpp
2020-09-26 16:20:57 +08:00

504 lines
18 KiB
C++

#include <windows.h>
#include "lexicon.h"
#include "rulelex.h"
#include "LexMgr.h"
#include "CHTBrKr.h"
#include "BaseLex.h"
// Constructs an empty word breaker. Nothing is allocated here; every owned
// pointer starts out NULL and both length counters start at zero.
CCHTWordBreaker::CCHTWordBreaker(void)
{
    // Lexicon objects.
    m_pcLexicon = NULL;
    m_pcRuleLex = NULL;

    // Word lattice and its per-position bookkeeping arrays.
    m_ppWordLattice      = NULL;
    m_pdwCandidateNumber = NULL;
    m_pdwMaxWordLength   = NULL;

    // Output buffers for the break result.
    m_psBreakResult = NULL;

    // Nothing has been allocated or analysed yet.
    m_dwLatticeLength  = 0;
    m_dwSentenceLength = 0;
}
// Releases everything the breaker owns: both lexicon objects, the word
// lattice with its bookkeeping arrays, and the break-result buffers.
CCHTWordBreaker::~CCHTWordBreaker(void)
{
    DWORD i;

    // delete / delete [] on a NULL pointer is a no-op, so no guards are needed.
    delete m_pcLexicon;
    m_pcLexicon = NULL;

    // The rule lexicon is created in InitData() and must be released here too.
    delete m_pcRuleLex;
    m_pcRuleLex = NULL;

    if (m_ppWordLattice) {
        // The number of allocated rows is m_dwLatticeLength. The length of the
        // last analysed sentence (m_dwSentenceLength) may be smaller, so using
        // it as the bound would leak the remaining rows.
        for (i = 0; i < m_dwLatticeLength; ++i) {
            delete [] m_ppWordLattice[i];
        }
        delete [] m_ppWordLattice;
        m_ppWordLattice = NULL;
    }

    // All of these were allocated with new[], so they need delete [].
    delete [] m_pdwCandidateNumber;
    m_pdwCandidateNumber = NULL;
    delete [] m_pdwMaxWordLength;
    m_pdwMaxWordLength = NULL;

    if (m_psBreakResult) {
        delete [] m_psBreakResult->puWordLen;
        delete [] m_psBreakResult->pbTerminalCode;
        delete [] m_psBreakResult->puWordAttrib;
        // The SBreakResult structure itself is heap-allocated in InitData().
        delete m_psBreakResult;
        m_psBreakResult = NULL;
    }

    m_dwSentenceLength = 0;
    m_dwLatticeLength = 0;
}
// Allocates a word lattice with dwLength positions.
//
// For every position this allocates a row of MAX_CHAR_PER_WORD candidate
// nodes, plus one entry in the candidate-count and max-word-length arrays.
// The caller is expected to have released any previous lattice first
// (LatticeGrow() calls DestroyLattice() before calling this).
//
// Returns TRUE on success. On failure everything allocated so far is released
// through DestroyLattice() and FALSE is returned.
BOOL CCHTWordBreaker::AllocLattice(
    DWORD dwLength)
{
    BOOL  fRet = FALSE;
    DWORD i;

    m_pdwMaxWordLength = new DWORD[dwLength];
    if (!m_pdwMaxWordLength) { goto _exit; }
    m_pdwCandidateNumber = new DWORD[dwLength];
    if (!m_pdwCandidateNumber) { goto _exit; }
    m_ppWordLattice = new PSLatticeNode[dwLength];
    if (!m_ppWordLattice) { goto _exit; }

    // Clear every row pointer first so cleanup can tell allocated rows from
    // rows that were never reached.
    for (i = 0; i < dwLength; ++i) {
        m_ppWordLattice[i] = NULL;
    }

    // Record the row count before allocating the rows. DestroyLattice() walks
    // m_dwLatticeLength entries, so if a row allocation fails part-way it
    // will still visit (and free) the rows already allocated instead of
    // leaking them.
    m_dwLatticeLength = dwLength;

    for (i = 0; i < dwLength; ++i) {
        m_ppWordLattice[i] = new SLatticeNode[MAX_CHAR_PER_WORD];
        if (!m_ppWordLattice[i]) { goto _exit; }
        m_pdwCandidateNumber[i] = 0;
    }
    fRet = TRUE;

_exit:
    if (!fRet) {
        DestroyLattice();
    }
    return fRet;
}
// Frees the word lattice and its two per-position bookkeeping arrays, leaving
// the object with no lattice (all pointers NULL, m_dwLatticeLength == 0).
// Safe to call when nothing is allocated.
void CCHTWordBreaker::DestroyLattice()
{
    DWORD i;

    // These arrays come from new[], so they must be released with delete [].
    // delete [] on NULL is a no-op.
    delete [] m_pdwCandidateNumber;
    m_pdwCandidateNumber = NULL;
    delete [] m_pdwMaxWordLength;
    m_pdwMaxWordLength = NULL;

    if (m_ppWordLattice) {
        for (i = 0; i < m_dwLatticeLength; ++i) {
            delete [] m_ppWordLattice[i];
        }
        // Release the array of row pointers as well, not just the rows.
        delete [] m_ppWordLattice;
        m_ppWordLattice = NULL;
    }
    m_dwLatticeLength = 0;
}
// Creates and initialises everything the breaker needs before BreakText()
// can be used: the CHT lexicon (initialised with hInstance), the rule
// lexicon, the break-result buffers and a lattice of LATTICE_LENGHT positions.
//
// Returns TRUE only if every step succeeded. On any failure all partially
// created state is released, the members are reset to NULL, and FALSE is
// returned.
BOOL CCHTWordBreaker::InitData(
    HINSTANCE hInstance)
{
    // fRet stays FALSE until the very end. Keeping the lexicon's own result
    // out of fRet matters: if it were stored here, a later allocation failure
    // would jump to _exit with fRet already TRUE and report success.
    BOOL fRet = FALSE;

    m_pcLexicon = new CCHTLexicon;
    if (!m_pcLexicon) { goto _exit; }
    if (!m_pcLexicon->InitData(hInstance)) { goto _exit; }

    m_pcRuleLex = new CRuleLexicon;
    if (!m_pcRuleLex) { goto _exit; }

    m_psBreakResult = new SBreakResult;
    if (!m_psBreakResult) { goto _exit; }
    // Zero the structure so the cleanup path can safely inspect its pointers.
    FillMemory(m_psBreakResult, sizeof(SBreakResult), 0);
    m_psBreakResult->puWordLen = new UINT[LATTICE_LENGHT];
    m_psBreakResult->pbTerminalCode = new BYTE[LATTICE_LENGHT];
    m_psBreakResult->puWordAttrib = new UINT[LATTICE_LENGHT];
    if (!m_psBreakResult->puWordLen ||
        !m_psBreakResult->pbTerminalCode ||
        !m_psBreakResult->puWordAttrib) {
        goto _exit;
    }

    if (!AllocLattice(LATTICE_LENGHT)) { goto _exit; }
    fRet = TRUE;

_exit:
    if (!fRet) {
        delete m_pcLexicon;
        m_pcLexicon = NULL;
        delete m_pcRuleLex;
        m_pcRuleLex = NULL;
        if (m_psBreakResult) {
            // The buffers come from new[]; the structure itself from new.
            delete [] m_psBreakResult->puWordLen;
            delete [] m_psBreakResult->pbTerminalCode;
            delete [] m_psBreakResult->puWordAttrib;
            delete m_psBreakResult;
            m_psBreakResult = NULL;
        }
        DestroyLattice();
    }
    return fRet;
}
// Makes sure the lattice and the break-result buffers can hold dwNewLength
// positions.
//
// Returns TRUE if the current lattice is already large enough or was grown
// successfully. Returns FALSE if memory could not be obtained. If the lattice
// itself could not be grown, a lattice of the default size (LATTICE_LENGHT)
// is re-created on a best-effort basis so that later, shorter inputs can
// still be processed.
BOOL CCHTWordBreaker::LatticeGrow(
    DWORD dwNewLength)
{
    BOOL  fRet;
    UINT* puNewWordLen = NULL;
    BYTE* pbNewTerminalCode = NULL;
    UINT* puNewWordAttrib = NULL;

    if (dwNewLength <= m_dwLatticeLength) {
        return TRUE;
    }

    // Obtain the larger result buffers first and keep the old ones until the
    // replacements are known to exist. This way a failed allocation never
    // leaves the result structure pointing at freed or NULL buffers, and the
    // lattice is never left larger than the result buffers.
    if (m_psBreakResult) {
        puNewWordLen = new UINT[dwNewLength];
        pbNewTerminalCode = new BYTE[dwNewLength];
        puNewWordAttrib = new UINT[dwNewLength];
        if (!puNewWordLen || !pbNewTerminalCode || !puNewWordAttrib) {
            delete [] puNewWordLen;
            delete [] pbNewTerminalCode;
            delete [] puNewWordAttrib;
            return FALSE;
        }
    }

    DestroyLattice();
    fRet = AllocLattice(dwNewLength);
    if (!fRet) {
        // Fall back to the default size; the result is deliberately ignored,
        // the caller is told about the failure through fRet.
        AllocLattice(LATTICE_LENGHT);
    }

    if (m_psBreakResult) {
        // Swap in the new buffers (allocated with new[], hence delete []).
        delete [] m_psBreakResult->puWordLen;
        delete [] m_psBreakResult->pbTerminalCode;
        delete [] m_psBreakResult->puWordAttrib;
        m_psBreakResult->puWordLen = puNewWordLen;
        m_psBreakResult->pbTerminalCode = pbNewTerminalCode;
        m_psBreakResult->puWordAttrib = puNewWordAttrib;
    }
    return fRet;
}
// Breaks lpcwszText (nTextLen UTF-16 code units) into words.
//
// Parameters:
//   lpcwszText       - text to segment.
//   nTextLen         - number of WCHARs in lpcwszText; negative values are
//                      rejected.
//   pcBaseLex        - optional extra lexicon, forwarded to BuildLattice();
//                      may be NULL.
//   dwMaxWordLen     - longest word length to consider, forwarded to
//                      BuildLattice().
//   fBreakWithParser - when TRUE and the file is compiled with PARSER
//                      defined, adjacent words whose terminal codes form a
//                      rule word are merged into one word.
//
// Returns the number of words found; the words themselves are left in
// m_psBreakResult. Returns 0 if the breaker is not initialised or memory
// could not be obtained.
DWORD CCHTWordBreaker::BreakText(
    LPCWSTR   lpcwszText,
    INT       nTextLen,
    CBaseLex* pcBaseLex,
    DWORD     dwMaxWordLen,
    BOOL      fBreakWithParser)
{
    // Without result storage (InitData() not called or failed) there is
    // nowhere to put the answer.
    if (!m_psBreakResult) {
        return 0;
    }
    m_psBreakResult->dwWordNumber = 0;

    // A negative length would turn into a huge DWORD below and needlessly
    // tear down the existing lattice.
    if (nTextLen < 0) { goto _exit; }
    if (!LatticeGrow(nTextLen)) { goto _exit; }
    if (!m_psBreakResult->puWordLen ||
        !m_psBreakResult->pbTerminalCode ||
        !m_psBreakResult->puWordAttrib) {
        goto _exit;
    }

    if (BuildLattice(lpcwszText, nTextLen, pcBaseLex, dwMaxWordLen)) {
        GetResult();
        // process Surrogate Char begin
        /*
        INT nCurrentIndex;
        DWORD dwSurIndex;
        nCurrentIndex = 0;
        for (dwSurIndex = 0; dwSurIndex < m_psBreakResult->dwWordNumber; ++dwSurIndex) {
            if (m_psBreakResult->puWordLen[dwSurIndex] == 1) { // High word of surrogate char should be breaked into signal char word
                if (lpcwszText[nCurrentIndex] >= 0xd800 && lpcwszText[nCurrentIndex] <= 0xdbff) { // High word is
                    if (nCurrentIndex >= nTextLen - 1) { // Should be an error
                    } else if (lpcwszText[nCurrentIndex + 1] >= 0xdc00 && lpcwszText[nCurrentIndex + 1] <= 0xdfff) { // Is surrogate char
                        DWORD dwMoveDataNum;
                        dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwSurIndex + 1 + 1);
                        m_psBreakResult->puWordLen[dwSurIndex] = 2;
                        CopyMemory(&(m_psBreakResult->puWordLen[dwSurIndex + 1]), &(m_psBreakResult->puWordLen[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));
                        CopyMemory(&(m_psBreakResult->pbTerminalCode[dwSurIndex + 1]), &(m_psBreakResult->pbTerminalCode[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(BYTE));
                        CopyMemory(&(m_psBreakResult->puWordAttrib[dwSurIndex + 1]), &(m_psBreakResult->puWordAttrib[dwSurIndex + 1 + 1]), dwMoveDataNum * sizeof(UINT));
                        m_psBreakResult->dwWordNumber -= 1;
                        //nCurrentIndex -= 1;
                    } else {// Should be an error
                    }
                }
            }
            nCurrentIndex += m_psBreakResult->puWordLen[dwSurIndex];
        } */
        // process Surrogate Char end
        if (fBreakWithParser) {
#ifdef PARSER
            DWORD i, dwBeginIndex, dwParseLen;
            PWORD pwTerminalCode;

            dwParseLen = 0;
            // Wide-character copy of the terminal codes, as required by
            // CRuleLexicon::IsAWord().
            pwTerminalCode = new WORD[m_psBreakResult->dwWordNumber];
            if (pwTerminalCode) {
                MultiByteToWideChar(950, MB_PRECOMPOSED, (const char *)m_psBreakResult->pbTerminalCode,
                    m_psBreakResult->dwWordNumber, pwTerminalCode, m_psBreakResult->dwWordNumber);
                for (dwBeginIndex = 0; dwBeginIndex < m_psBreakResult->dwWordNumber; dwBeginIndex += 1) {
                    // A blank terminal code never starts a rule word.
                    if (m_psBreakResult->pbTerminalCode[dwBeginIndex] == ' ') { continue; }
                    // Extend the run up to the next blank terminal code ...
                    for (dwParseLen = 1; dwBeginIndex + dwParseLen < m_psBreakResult->dwWordNumber; ++dwParseLen) {
                        if (m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen] == ' ') {
                            break;
                        }
                    }
                    // ... then shrink it until the rule lexicon accepts it.
                    for ( ; dwParseLen > 1; --dwParseLen) {
                        if (m_pcRuleLex->IsAWord(&pwTerminalCode[dwBeginIndex], dwParseLen)) {
                            break;
                        }
                    }
                    if (dwParseLen > 1) { // adjust break result
                        DWORD dwMoveDataNum;

                        // Fold the lengths of the merged words into the first one.
                        for (i = 1; i < dwParseLen; ++i) {
                            m_psBreakResult->puWordLen[dwBeginIndex] += m_psBreakResult->puWordLen[dwBeginIndex + i];
                        }
                        m_psBreakResult->puWordAttrib[dwBeginIndex] = ATTR_RULE_WORD;

                        // Close the gap left by the merged entries. Source and
                        // destination overlap within the same array, so
                        // MoveMemory is required; CopyMemory is undefined for
                        // overlapping blocks.
                        dwMoveDataNum = m_psBreakResult->dwWordNumber - (dwBeginIndex + dwParseLen);
                        MoveMemory(&(m_psBreakResult->puWordLen[dwBeginIndex + 1]),
                            &(m_psBreakResult->puWordLen[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(UINT));
                        MoveMemory(&(m_psBreakResult->pbTerminalCode[dwBeginIndex + 1]),
                            &(m_psBreakResult->pbTerminalCode[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(BYTE));
                        MoveMemory(&(m_psBreakResult->puWordAttrib[dwBeginIndex + 1]),
                            &(m_psBreakResult->puWordAttrib[dwBeginIndex + dwParseLen]),
                            dwMoveDataNum * sizeof(UINT));
                        // Keep the wide copy index-aligned with the compacted
                        // result arrays; otherwise later IsAWord() lookups
                        // would read codes belonging to different words than
                        // the ones the run detection above is looking at.
                        MoveMemory(&pwTerminalCode[dwBeginIndex + 1],
                            &pwTerminalCode[dwBeginIndex + dwParseLen],
                            dwMoveDataNum * sizeof(WORD));
                        m_psBreakResult->dwWordNumber -= (dwParseLen - 1);
                    }
                }
                delete [] pwTerminalCode;
            }
#endif
        }// if support parser
    } // if build lattice success
_exit:
    return m_psBreakResult->dwWordNumber;
}
// Chooses the final segmentation from the lattice and stores it in
// m_psBreakResult.
//
// The sentence is consumed left to right. At each position, chains of up to
// three consecutive candidate words are enumerated from the lattice, each
// chain is scored with GetScore() and the best one (per CompareScore()) is
// remembered. Only the FIRST word of the best chain is committed to the
// result; the search then restarts from the position right after that word.
//
// Returns the number of words written to m_psBreakResult.
DWORD CCHTWordBreaker::GetResult(void)
{
// NOTE(review): dwRet is never used.
DWORD dwRet = 0;
// dwLen is the current position in the sentence (characters consumed so far).
DWORD dwLen = 0;
// Two scratch paths: one holds the best chain found so far, the other the
// chain currently being built.
SLocalPath sLocalPath[2];
UINT uBestIndex = 0, uCandIndex, uLocalPathIndex;
DWORD dw2ndIndex, dw3rdIndex;
DWORD i, j, k;
m_psBreakResult->dwWordNumber = 0;
// uBestIndex is 0, so this selects slot 1 for the candidate chain.
uCandIndex = (uBestIndex + 1) % 2;
while (dwLen < m_dwSentenceLength) {
// Depth (0..2) of the word currently being appended to the candidate chain.
uLocalPathIndex = 0;
if (m_pdwCandidateNumber[dwLen] == 1) {
// Only one candidate here: take it without searching.
// NOTE(review): the length is hard-coded to 1 rather than taken from the
// node's uLen - presumably the single candidate is always a one-character
// word; confirm against BuildLattice().
sLocalPath[uBestIndex].dwLength[0] = 1;
sLocalPath[uBestIndex].bTerminalCode[0] = m_ppWordLattice[dwLen][0].bTerminalCode;
sLocalPath[uBestIndex].wAttribute[0] = m_ppWordLattice[dwLen][0].wAttr;
} else {
// Start from an all-zero "best" path so any scored chain can replace it.
FillMemory(&sLocalPath[uBestIndex], sizeof(SLocalPath), 0);
// First word of the chain: every candidate starting at dwLen.
for (i = 0; i < m_pdwCandidateNumber[dwLen]; ++i) {
FillMemory(&sLocalPath[uCandIndex], sizeof(SLocalPath), 0);
++sLocalPath[uCandIndex].uStep;
sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dwLen][i].uLen;
sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wCount;
sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dwLen][i].wAttr;
sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dwLen][i].bTerminalCode;
// Second word: every candidate starting right after the first word.
dw2ndIndex = dwLen + m_ppWordLattice[dwLen][i].uLen;
if (dw2ndIndex < m_dwSentenceLength) {
for (j = 0; j < m_pdwCandidateNumber[dw2ndIndex]; ++j) {
++sLocalPath[uCandIndex].uStep;
sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].uLen;
sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wCount;
sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw2ndIndex][j].wAttr;
sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw2ndIndex][j].bTerminalCode;
// Third word: every candidate starting right after the second word.
dw3rdIndex = dw2ndIndex + m_ppWordLattice[dw2ndIndex][j].uLen;
if (dw3rdIndex < m_dwSentenceLength) {
for (k = 0; k < m_pdwCandidateNumber[dw3rdIndex]; ++k) {
++sLocalPath[uCandIndex].uStep;
sLocalPath[uCandIndex].dwLength[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].uLen;
sLocalPath[uCandIndex].wUnicount[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wCount;
sLocalPath[uCandIndex].wAttribute[uLocalPathIndex] = m_ppWordLattice[dw3rdIndex][k].wAttr;
sLocalPath[uCandIndex].bTerminalCode[uLocalPathIndex++] = m_ppWordLattice[dw3rdIndex][k].bTerminalCode;
// Full three-word chain: score it and keep it if it beats the best so far.
GetScore(&(sLocalPath[uCandIndex]));
if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
}
// Pop the third word before trying the next one.
--uLocalPathIndex;
--sLocalPath[uCandIndex].uStep;
}
} else {
// The sentence ends after two words: score the two-word chain.
GetScore(&(sLocalPath[uCandIndex]));
if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
}
}
// Pop the second word before trying the next one.
--uLocalPathIndex;
--sLocalPath[uCandIndex].uStep;
}
} else {
// The sentence ends after the first word: score the one-word chain.
GetScore(&(sLocalPath[uCandIndex]));
if (CompareScore(&(sLocalPath[uCandIndex]), &(sLocalPath[uBestIndex])) > 0) {
CopyMemory(&sLocalPath[uBestIndex], &sLocalPath[uCandIndex], sizeof(SLocalPath));
}
}
// Pop the first word before trying the next candidate.
--uLocalPathIndex;
--sLocalPath[uCandIndex].uStep;
}
}
// Commit only the first word of the best chain, then advance past it.
m_psBreakResult->puWordLen[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].dwLength[0];
m_psBreakResult->pbTerminalCode[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].bTerminalCode[0];
m_psBreakResult->puWordAttrib[m_psBreakResult->dwWordNumber] = sLocalPath[uBestIndex].wAttribute[0];
++m_psBreakResult->dwWordNumber;
dwLen += sLocalPath[uBestIndex].dwLength[0];
}
return m_psBreakResult->dwWordNumber;
}
// Ranks two scored paths against each other.
//
// Returns 1 if psLocalPath1 is the better path, -1 if psLocalPath2 is, and 0
// if they tie on every criterion. The criteria are applied in this order,
// each one only breaking ties left by the previous ones:
//   1. larger total path length wins
//   2. fewer steps (words) wins
//   3. smaller variance wins
//   4. fewer compound words wins
//   5. fewer DM words wins
//   6. larger unigram-count sum wins
INT CCHTWordBreaker::CompareScore(
    PSLocalPath psLocalPath1,
    PSLocalPath psLocalPath2)
{
    const SLocalPath& lhs = *psLocalPath1;
    const SLocalPath& rhs = *psLocalPath2;

    if (lhs.uPathLength != rhs.uPathLength) {
        return (lhs.uPathLength > rhs.uPathLength) ? 1 : -1;
    }
    if (lhs.uStep != rhs.uStep) {
        return (lhs.uStep < rhs.uStep) ? 1 : -1;
    }

    // The variance is a floating-point value, so both orderings are tested
    // explicitly instead of relying on "!=".
    if (lhs.fVariance > rhs.fVariance) {
        return -1;
    }
    if (lhs.fVariance < rhs.fVariance) {
        return 1;
    }

    if (lhs.uCompoundNum != rhs.uCompoundNum) {
        return (lhs.uCompoundNum < rhs.uCompoundNum) ? 1 : -1;
    }
    if (lhs.uDMNum != rhs.uDMNum) {
        return (lhs.uDMNum < rhs.uDMNum) ? 1 : -1;
    }
    if (lhs.wUniCountSum != rhs.wUniCountSum) {
        return (lhs.wUniCountSum > rhs.wUniCountSum) ? 1 : -1;
    }
    return 0;
}
// Computes the scoring fields of a path from its first uStep words:
//   uCompoundNum - number of words carrying ATTR_COMPOUND
//   uDMNum       - number of words carrying ATTR_DM
//   uPathLength  - sum of the word lengths
//   wUniCountSum - sum of the per-word counts
//   fVariance    - sum of the absolute differences between each word length
//                  and the mean word length
// All five fields are reset first, so the function can be called repeatedly
// on the same path.
void CCHTWordBreaker::GetScore(
    PSLocalPath psLocalPath)
{
    SLocalPath& path = *psLocalPath;
    UINT        uWord;
    double      fMeanLength;

    path.uPathLength  = 0;
    path.wUniCountSum = 0;
    path.uCompoundNum = 0;
    path.uDMNum       = 0;
    path.fVariance    = 0;

    // Pass 1: totals and attribute counts.
    for (uWord = 0; uWord < path.uStep; ++uWord) {
        if (path.wAttribute[uWord] & ATTR_COMPOUND) {
            path.uCompoundNum++;
        }
        if (path.wAttribute[uWord] & ATTR_DM) {
            path.uDMNum++;
        }
        path.uPathLength  += path.dwLength[uWord];
        path.wUniCountSum += path.wUnicount[uWord];
    }

    // Pass 2: accumulate |mean - length| over all words.
    fMeanLength = (double)path.uPathLength / path.uStep;
    for (uWord = 0; uWord < path.uStep; ++uWord) {
        path.fVariance += (fMeanLength > path.dwLength[uWord])
            ? (fMeanLength - path.dwLength[uWord])
            : (path.dwLength[uWord] - fMeanLength);
    }
}
// Fills the word lattice for the given text.
//
// For every start position i, each substring of length 1..dwMaxWordLen is
// looked up first in the main lexicon and, failing that, in pcBaseLex (if
// one is supplied). Every hit is appended to row i of the lattice. A position
// with no hit at all gets a single one-character placeholder node so that
// the sentence can always be segmented.
//
// Parameters:
//   lpcwszText   - text to analyse.
//   dwTextLen    - number of WCHARs in lpcwszText; must not exceed the
//                  allocated lattice length.
//   pcBaseLex    - optional secondary lexicon; may be NULL.
//   dwMaxWordLen - longest word length to try; clamped to MAX_CHAR_PER_WORD.
//
// Returns TRUE on success, FALSE if the text does not fit in the lattice.
// On success m_dwSentenceLength is set to dwTextLen.
BOOL CCHTWordBreaker::BuildLattice(
    LPCWSTR   lpcwszText,
    DWORD     dwTextLen,
    CBaseLex* pcBaseLex,
    DWORD     dwMaxWordLen)
{
    DWORD i, dwWordLen;

    // Every array indexed below has m_dwLatticeLength entries.
    if (dwTextLen > m_dwLatticeLength) {
        return FALSE;
    }
    // Each lattice row holds MAX_CHAR_PER_WORD nodes and at most one node is
    // added per word length, so capping the word length keeps the candidate
    // index inside the row.
    if (dwMaxWordLen > (DWORD)MAX_CHAR_PER_WORD) {
        dwMaxWordLen = (DWORD)MAX_CHAR_PER_WORD;
    }

    FillMemory(m_pdwCandidateNumber, sizeof(DWORD) * dwTextLen, 0);
    // we should use head link
    for (i = 0; i < dwTextLen; ++i) {
        m_pdwMaxWordLength[i] = 1;
        for (dwWordLen = 1; dwWordLen <= dwMaxWordLen && i + dwWordLen <= dwTextLen; ++dwWordLen) {
            // Next free node of row i; the lookups fill it in directly and it
            // is only "committed" by bumping the candidate count.
            SLatticeNode& sNode = m_ppWordLattice[i][m_pdwCandidateNumber[i]];
            BOOL fFound = FALSE;

            if (m_pcLexicon->GetWordInfo(&lpcwszText[i], dwWordLen,
                    &(sNode.wCount), &(sNode.wAttr), &(sNode.bTerminalCode))) {
                fFound = TRUE;
            } else if (pcBaseLex && pcBaseLex->GetWordInfo(&lpcwszText[i], dwWordLen,
                    &(sNode.wAttr))) {
                // The secondary lexicon only supplies the attribute; use a
                // fixed count and a blank terminal code.
                sNode.wCount = APLEXICON_COUNT;
                sNode.bTerminalCode = ' ';
                fFound = TRUE;
            }

            if (fFound) {
                sNode.uLen = dwWordLen;
                ++m_pdwCandidateNumber[i];
                if (dwWordLen > m_pdwMaxWordLength[i]) {
                    m_pdwMaxWordLength[i] = dwWordLen;
                }
            }
        }
        if (!m_pdwCandidateNumber[i]) {
            // Nothing matched: fall back to a single-character word.
            m_ppWordLattice[i][0].uLen = 1;
            m_ppWordLattice[i][0].wCount = 0;
            m_ppWordLattice[i][0].wAttr = 0;
            m_ppWordLattice[i][0].fVariance = 0;
            m_ppWordLattice[i][0].bTerminalCode = ' ';
            ++m_pdwCandidateNumber[i];
        }
    }
    m_dwSentenceLength = dwTextLen;
    return TRUE;
}
/*
DWORD CCHTWordBreaker::LongestRuleWord(
DWORD dwIndex)
{
DWORD dwRet = 0, i;
for (i = 0; i < m_pdwCandidateNumber[dwIndex]; ++i) {
if (m_ppWordLattice[dwIndex][i].bAttr & ATTR_RULE_WORD) {
if (m_ppWordLattice[dwIndex][i].uLen > dwRet) {
dwRet = m_ppWordLattice[dwIndex][i].uLen;
}
}
}
return dwRet;
}
*/