windows-nt/Source/XPSP1/NT/inetsrv/intlwb/cht2/srcs/propnoun.cpp
2020-09-26 16:20:57 +08:00

447 lines
7.5 KiB
C++
Raw Permalink Blame History

#include <windows.h>
#include <assert.h>
#include "PropNoun.H"
// Comparator (qsort/bsearch signature) for CharProb records.
//
// item1, item2 - pointers to the two CharProb records to compare.
//
// Returns -1, 1 or 0 when item1's dwUnicode is respectively less than,
// greater than or equal to item2's dwUnicode. No other field is examined.
int __cdecl CharCompare(
    const void *item1,
    const void *item2)
{
    PCharProb pLeft = (PCharProb) item1;
    PCharProb pRight = (PCharProb) item2;

    if (pLeft->dwUnicode < pRight->dwUnicode) {
        return -1;
    }
    if (pLeft->dwUnicode > pRight->dwUnicode) {
        return 1;
    }
    return 0;
}
// Comparator (qsort/bsearch signature) for NUL-terminated wide strings.
//
// item1, item2 - pointers to the two wide strings to compare.
//
// The strings are compared byte-by-byte with memcmp over the byte length of
// the LONGER of the two (terminators excluded), so the resulting order is
// memcmp's byte order rather than a locale-aware collation.
//
// Returns memcmp's result: negative, zero or positive.
int __cdecl UnicodeCompare(
    const void *item1,
    const void *item2)
{
    int cbLeft = lstrlenW((LPWSTR) item1) * sizeof(WCHAR);
    int cbRight = lstrlenW((LPWSTR) item2) * sizeof(WCHAR);

    // Compare across the longer string's length.
    int cbCompare = cbRight;
    if (cbLeft > cbRight) {
        cbCompare = cbLeft;
    }

    return memcmp(item1, item2, cbCompare);
}
// Comparator (qsort/bsearch signature) for EngName records.
//
// item1, item2 - pointers to the two EngName records to compare.
//
// Records are ordered by wPrevUnicode first; wNextUnicode breaks ties.
// Returns -1, 1 or 0 for less-than, greater-than or equal.
int __cdecl EngNameCompare(
    const void *item1,
    const void *item2)
{
    PEngName pLeft = (PEngName) item1;
    PEngName pRight = (PEngName) item2;

    // Primary key.
    if (pLeft->wPrevUnicode != pRight->wPrevUnicode) {
        return (pLeft->wPrevUnicode > pRight->wPrevUnicode) ? 1 : -1;
    }

    // Secondary key, only reached when the primary keys match.
    if (pLeft->wNextUnicode != pRight->wNextUnicode) {
        return (pLeft->wNextUnicode > pRight->wNextUnicode) ? 1 : -1;
    }

    return 0;
}
// Constructs the detector in an "unloaded" state.
//
// hInstance - module handle stored in m_hInstance; InitData() passes it to
//             FindResource/LoadResource/SizeofResource.
//
// The threshold starts at FL_PROPER_NAME_THRESHOLD, and every data pointer,
// count and handle starts at NULL/0. InitData() must be called to load the
// character-probability table before the lookup methods are useful.
// NOTE(review): nothing visible in this file ever assigns m_pEngNameData a
// non-NULL value (the loader in InitData() is commented out).
CProperNoun::CProperNoun(
HINSTANCE hInstance) :
m_dProperNameThreshold(FL_PROPER_NAME_THRESHOLD),
m_pCharProb(NULL),
m_dwTotalCharProbNum(0),
m_pEngNameData(NULL),
m_hProcessHeap(0),
m_hInstance(hInstance)
{
}
// Destructor. Intentionally empty: the code in this file performs no
// allocation of its own (m_pCharProb is the pointer returned by LockResource
// in InitData(), and m_hProcessHeap is only the handle from GetProcessHeap()).
CProperNoun::~CProperNoun()
{
}
// Loads the data the detector needs.
//
// Steps:
//   1. Records the process heap handle in m_hProcessHeap.
//   2. Finds, loads and locks the "CNAME" resource of type "BIN" in
//      m_hInstance; the locked pointer becomes m_pCharProb and the resource
//      size divided by sizeof(CharProb) becomes m_dwTotalCharProbNum.
//   3. Sorts m_pwszSurname with UnicodeCompare so it can be binary-searched.
//
// Returns TRUE on success, FALSE as soon as the resource cannot be found,
// loaded or locked (in which case the surname table is not sorted).
BOOL CProperNoun::InitData()
{
    m_hProcessHeap = GetProcessHeap();

    // Locate the character-probability resource.
    HRSRC hCharProbRes = FindResource(m_hInstance, TEXT("CNAME"), TEXT("BIN"));
    if (!hCharProbRes) {
        return FALSE;
    }

    // Load it and obtain a pointer to its bytes.
    HGLOBAL hCharProbData = LoadResource(m_hInstance, hCharProbRes);
    if (!hCharProbData) {
        return FALSE;
    }

    m_pCharProb = (PCharProb) LockResource(hCharProbData);
    if (!m_pCharProb) {
        return FALSE;
    }

    m_dwTotalCharProbNum = SizeofResource(m_hInstance, hCharProbRes) / sizeof(CharProb);

    /*
    Disabled: loading of the "ENAME" resource into m_pEngNameData.

    // Find resource
    hResource = FindResource(m_hInstance, TEXT("ENAME"),
    TEXT("BIN"));
    if (!hResource) { goto _exit; }
    // Load resource
    hGlobal = LoadResource(m_hInstance, hResource);
    if (!hGlobal) { goto _exit; }
    m_pEngNameData = (PEngNameData) LockResource(hGlobal);
    m_pEngNameData->pwUnicode = (PWORD) ((PBYTE) m_pEngNameData +
    sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
    sizeof(m_pEngNameData->dwTotalEngNamePairNum));
    m_pEngNameData->pEngNamePair = (PEngName) ((PBYTE) m_pEngNameData +
    sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
    sizeof(m_pEngNameData->dwTotalEngNamePairNum) +
    sizeof(m_pEngNameData->pwUnicode[0]) * m_pEngNameData->dwTotalEngUnicodeNum);
    // m_pEngName = (PEngName) LockResource(hGlobal);
    // m_dwTotalEngNameNum = SizeofResource(m_hInstance, hResource) / sizeof(EngName);
    */

    // Put the surname table into UnicodeCompare order for bsearch().
    qsort(m_pwszSurname, m_dwTotalSurnameNum, sizeof(m_pwszSurname[0]), UnicodeCompare);

    return TRUE;
}
// Reports whether a run of characters is recognized as a proper noun.
//
// lpwszChar - the characters to test.
// uCount    - number of characters in lpwszChar.
//
// Returns TRUE when IsAChineseName() accepts the run or, failing that, when
// IsAEnglishName() accepts it; FALSE otherwise. IsAEnglishName() is only
// consulted when the Chinese-name test fails.
BOOL CProperNoun::IsAProperNoun(
    LPWSTR lpwszChar,
    UINT uCount)
{
    if (IsAChineseName(lpwszChar, uCount)) {
        return TRUE;
    }
    if (IsAEnglishName(lpwszChar, uCount)) {
        return TRUE;
    }
    return FALSE;
}
// Reports whether a run of characters looks like a Chinese personal name.
//
// lpcwszChar - the characters to test.
// uCount     - number of characters in lpcwszChar.
//
// The first character must be found in m_pwszSurname (binary search using
// UnicodeCompare, so InitData() must have sorted the table). A probability
// is then formed as the product, over characters 1..uCount-1, of each
// character's flProbability from m_pCharProb, or FL_DEFAULT_CHAR_PROBABILITY
// when the character is not in that table.
//
// Returns TRUE when the first character is a known surname and the product
// is >= m_dProperNameThreshold; FALSE otherwise, including for a NULL pointer
// or an empty run.
BOOL CProperNoun::IsAChineseName(
    LPCWSTR lpcwszChar,
    UINT uCount)
{
    // An empty run has no first character to look up; reading lpcwszChar[0]
    // would be out of bounds and could misreport an empty run as a name.
    if (!lpcwszChar || uCount == 0) {
        return FALSE;
    }

    // Search key laid out like one row of m_pwszSurname: the candidate
    // character followed by NULs. Kept on the stack (not static) so that
    // concurrent callers do not share and overwrite one buffer.
    WCHAR wszKey[3] = { lpcwszChar[0], 0, 0 };

    // Find surname
    if (!bsearch(wszKey, m_pwszSurname, m_dwTotalSurnameNum, sizeof(m_pwszSurname[0]),
                 UnicodeCompare)) {
        return FALSE;
    }

    // Calculate probability to be a proper noun
    FLOAT flProbability = 1;
    CharProb CProb;
    for (UINT i = 1; i < uCount; ++i) {
        CProb.dwUnicode = lpcwszChar[i];
        PCharProb pCharProb = (PCharProb) bsearch(&CProb, m_pCharProb,
            m_dwTotalCharProbNum, sizeof(m_pCharProb[0]), CharCompare);
        if (pCharProb) {
            flProbability *= pCharProb->flProbability;
        } else {
            // Character not in the table: fall back to the default weight.
            flProbability *= (FLOAT) FL_DEFAULT_CHAR_PROBABILITY;
        }
    }

    if (flProbability >= m_dProperNameThreshold) {
        return TRUE;
    }
    return FALSE;
}
// Reports whether a run of characters matches a known English-name pair.
//
// lpwszChar - the characters to test.
// uCount    - number of characters in lpwszChar.
//
// The first and last characters of the run form an EngName key that is
// binary-searched (EngNameCompare) in m_pEngNameData->pEngNamePair.
//
// Returns TRUE when the pair is found. Returns FALSE when it is not found,
// when the run is NULL/empty, or when no English-name data is loaded.
BOOL CProperNoun::IsAEnglishName(
    LPCWSTR lpwszChar,
    UINT uCount)
{
    // m_pEngNameData is NULL after construction and must not be dereferenced
    // unless something has loaded it.
    if (!m_pEngNameData || !m_pEngNameData->pEngNamePair) {
        return FALSE;
    }

    // With uCount == 0 the index uCount - 1 would wrap around to UINT_MAX.
    if (!lpwszChar || uCount == 0) {
        return FALSE;
    }

    // Local key (not static) so concurrent callers do not share one buffer.
    EngName Name;
    Name.wPrevUnicode = lpwszChar[0];
    Name.wNextUnicode = lpwszChar[uCount - 1];

    // The element count is the pair count: dwTotalEngUnicodeNum sizes the
    // separate pwUnicode array, not pEngNamePair.
    // NOTE(review): confirm dwTotalEngNamePairNum against the ENAME resource
    // layout once that data is actually loaded.
    if (bsearch(&Name, m_pEngNameData->pEngNamePair, m_pEngNameData->dwTotalEngNamePairNum,
                sizeof(EngName), EngNameCompare)) {
        return TRUE;
    }
    return FALSE;
}
// Surname table: each row is a WCHAR[3], i.e. a NUL-terminated wide string of
// at most two characters. InitData() sorts the rows with UnicodeCompare and
// IsAChineseName() binary-searches them, so the order written here does not
// have to be the search order.
// NOTE(review): the literals below look damaged by a character-set
// conversion (they contain replacement-character markers and stray ASCII
// bytes). Presumably the file was authored in a legacy multi-byte Chinese
// code page; restore the table from a pristine copy before building.
WCHAR CProperNoun::m_pwszSurname[][3] = {
L"<EFBFBD>B",
L"<EFBFBD>R",
L"<EFBFBD>_",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>C",
L"<EFBFBD>K",
L"<EFBFBD>T",
L"<EFBFBD>]",
L"<EFBFBD>q",
L"<EFBFBD>v",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>V",
L"<EFBFBD>w",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>E",
L"<EFBFBD>d",
L"<EFBFBD>f",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>H",
L"<EFBFBD>L",
L"<EFBFBD>f",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>P",
L"<EFBFBD>s",
L"<EFBFBD>u",
L"<EFBFBD>x",
L"<EFBFBD>}",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>L",
L"<EFBFBD>Z",
L"<EFBFBD>k",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>J",
L"<EFBFBD>\\",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>I",
L"<EFBFBD>R",
L"<EFBFBD>_",
L"<EFBFBD>d",
L"<EFBFBD>h",
L"<EFBFBD>q",
L"<EFBFBD>x",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>J",
L"<EFBFBD>S",
L"<EFBFBD>]",
L"<EFBFBD>p",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>L",
L"<EFBFBD>V",
L"<EFBFBD>]",
L"<EFBFBD>c",
L"<EFBFBD>u",
L"<EFBFBD>}",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>Z",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>K",
L"<EFBFBD>q",
L"<EFBFBD>|",
L"<EFBFBD>}",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>O",
L"<EFBFBD>Z",
L"<EFBFBD>d",
L"<EFBFBD>h",
L"<EFBFBD>i",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>\\",
L"<EFBFBD>s",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>^",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>J",
L"<EFBFBD>q",
L"<EFBFBD>{",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>O",
L"<EFBFBD>P",
L"<EFBFBD>R",
L"<EFBFBD>d",
L"<EFBFBD>k",
L"<EFBFBD>s",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>q",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>Q",
L"<EFBFBD>l",
L"<EFBFBD>p",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>a",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>p",
L"<EFBFBD>u",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>B",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>G",
L"<EFBFBD>H",
L"<EFBFBD>|",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>P",
L"<EFBFBD>c",
L"<EFBFBD>p",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>F",
L"<EFBFBD>N",
L"<EFBFBD>R",
L"<EFBFBD>d",
L"<EFBFBD>j",
L"<EFBFBD>s",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>t",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"£",
L"²",
L"¿",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>C",
L"<EFBFBD>Q",
L"<EFBFBD>e",
L"ù",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>Y",
L"<EFBFBD>u",
L"ĩ",
L"Ī",
L"Ĭ",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>U",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD><EFBFBD>",
L"<EFBFBD>e",
L"<EFBFBD>s",
L"м",
L"<EFBFBD>\\",
L"<EFBFBD>k"
};
// Number of rows in m_pwszSurname, derived from the array's size so it
// tracks the table automatically. Used as the element count for the
// qsort() in InitData() and the bsearch() in IsAChineseName().
DWORD CProperNoun::m_dwTotalSurnameNum = sizeof(m_pwszSurname) / sizeof(m_pwszSurname[0]);