447 lines
7.5 KiB
C++
447 lines
7.5 KiB
C++
#include <windows.h>
|
||
#include <assert.h>
|
||
#include "PropNoun.H"
|
||
|
||
// qsort/bsearch comparator for CharProb records.
//
// item1, item2: pointers to CharProb entries (passed as const void * by the
//               C runtime).
// Returns -1, 0 or 1 according to how item1's dwUnicode orders against
// item2's dwUnicode.
int __cdecl CharCompare(
    const void *item1,
    const void *item2)
{
    PCharProb pLeft  = (PCharProb) item1;
    PCharProb pRight = (PCharProb) item2;

    // Equal keys are handled first so only one ordered comparison remains.
    if (pLeft->dwUnicode == pRight->dwUnicode) {
        return 0;
    }

    return (pLeft->dwUnicode > pRight->dwUnicode) ? 1 : -1;
}
|
||
|
||
// qsort/bsearch comparator for NUL-terminated wide strings.
//
// item1, item2: pointers to NUL-terminated WCHAR strings (passed as
//               const void * by the C runtime).
// Returns the memcmp() result (<0, 0, >0) of the two strings. The comparison
// is byte-wise, so the resulting order is memcmp's byte order rather than a
// locale-aware collation; it only has to be the same order for sorting and
// for searching.
int __cdecl UnicodeCompare(
    const void *item1,
    const void *item2)
{
    const int cchFirst   = lstrlenW((LPCWSTR) item1);
    const int cchSecond  = lstrlenW((LPCWSTR) item2);
    const int cchShorter = (cchFirst < cchSecond) ? cchFirst : cchSecond;

    // Compare the shorter string's characters plus its terminator. That span
    // is valid in both buffers, and when the lengths differ the terminator of
    // the shorter string is guaranteed to mismatch the longer one, so the
    // result is decided without ever reading past the end of either string
    // (comparing the longer length could touch memory beyond the shorter one).
    return memcmp(item1, item2, ((size_t) cchShorter + 1) * sizeof(WCHAR));
}
|
||
|
||
// qsort/bsearch comparator for EngName records.
//
// item1, item2: pointers to EngName entries (passed as const void * by the
//               C runtime).
// Orders by wPrevUnicode first and, when those are equal, by wNextUnicode.
// Returns -1, 0 or 1.
int __cdecl EngNameCompare(
    const void *item1,
    const void *item2)
{
    PEngName pLeft  = (PEngName) item1;
    PEngName pRight = (PEngName) item2;

    // Primary key.
    if (pLeft->wPrevUnicode != pRight->wPrevUnicode) {
        return (pLeft->wPrevUnicode > pRight->wPrevUnicode) ? 1 : -1;
    }

    // Secondary key, consulted only when the primary keys tie.
    if (pLeft->wNextUnicode != pRight->wNextUnicode) {
        return (pLeft->wNextUnicode > pRight->wNextUnicode) ? 1 : -1;
    }

    return 0;
}
|
||
|
||
// Constructs an object with no data loaded.
//
// hInstance: module handle stored in m_hInstance; InitData() uses it to
//            locate the resources it loads.
//
// Only the threshold and the instance handle get meaningful values here; the
// table pointers, the table count and the heap handle start out empty.
CProperNoun::CProperNoun(HINSTANCE hInstance)
    : m_dProperNameThreshold(FL_PROPER_NAME_THRESHOLD),
      m_pCharProb(NULL),
      m_dwTotalCharProbNum(0),
      m_pEngNameData(NULL),
      m_hProcessHeap(NULL),
      m_hInstance(hInstance)
{
}
|
||
|
||
// Destructor. Intentionally empty: nothing is released here.
// NOTE(review): m_pCharProb is obtained via LockResource() in InitData();
// presumably that memory is owned by the module and needs no explicit free --
// confirm against the Win32 documentation.
CProperNoun::~CProperNoun()
{
}
|
||
|
||
// Loads the data the name checks rely on.
//
// Steps:
//   1. Records the process heap handle in m_hProcessHeap.
//   2. Finds, loads and locks the "CNAME" resource of type "BIN" from
//      m_hInstance and exposes it as the m_pCharProb table, with
//      m_dwTotalCharProbNum set to the resource size divided by
//      sizeof(CharProb).
//   3. Sorts the built-in surname table with UnicodeCompare so it can be
//      searched with bsearch().
//
// Returns TRUE on success, FALSE as soon as any resource step fails (the
// surname table is not sorted in that case).
BOOL CProperNoun::InitData()
{
    m_hProcessHeap = GetProcessHeap();

    // Find resource
    HRSRC hCharProbRes = FindResource(m_hInstance, TEXT("CNAME"), TEXT("BIN"));
    if (hCharProbRes == NULL) {
        return FALSE;
    }

    // Load resource
    HGLOBAL hCharProbData = LoadResource(m_hInstance, hCharProbRes);
    if (hCharProbData == NULL) {
        return FALSE;
    }

    m_pCharProb = (PCharProb) LockResource(hCharProbData);
    if (m_pCharProb == NULL) {
        return FALSE;
    }

    m_dwTotalCharProbNum =
        SizeofResource(m_hInstance, hCharProbRes) / sizeof(CharProb);

    // Loading of the "ENAME" resource is disabled, so this function leaves
    // m_pEngNameData untouched. The disabled code is kept for reference.
    /*
    // Find resource
    hResource = FindResource(m_hInstance, TEXT("ENAME"),
        TEXT("BIN"));
    if (!hResource) { goto _exit; }

    // Load resource
    hGlobal = LoadResource(m_hInstance, hResource);
    if (!hGlobal) { goto _exit; }

    m_pEngNameData = (PEngNameData) LockResource(hGlobal);
    m_pEngNameData->pwUnicode = (PWORD) ((PBYTE) m_pEngNameData +
        sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
        sizeof(m_pEngNameData->dwTotalEngNamePairNum));
    m_pEngNameData->pEngNamePair = (PEngName) ((PBYTE) m_pEngNameData +
        sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
        sizeof(m_pEngNameData->dwTotalEngNamePairNum) +
        sizeof(m_pEngNameData->pwUnicode[0]) * m_pEngNameData->dwTotalEngUnicodeNum);

    // m_pEngName = (PEngName) LockResource(hGlobal);
    // m_dwTotalEngNameNum = SizeofResource(m_hInstance, hResource) / sizeof(EngName);
    */

    // Put the surname table into the order bsearch() expects.
    qsort(m_pwszSurname, m_dwTotalSurnameNum, sizeof(m_pwszSurname[0]),
        UnicodeCompare);

    return TRUE;
}
|
||
|
||
// Reports whether the given characters are accepted by either name check.
//
// lpwszChar: characters to test.
// uCount:    number of characters in lpwszChar.
// Returns TRUE when IsAChineseName() accepts the input; otherwise TRUE only
// if IsAEnglishName() accepts it. The English check is not run once the
// Chinese check has succeeded.
BOOL CProperNoun::IsAProperNoun(
    LPWSTR lpwszChar,
    UINT   uCount)
{
    if (IsAChineseName(lpwszChar, uCount)) {
        return TRUE;
    }

    if (IsAEnglishName(lpwszChar, uCount)) {
        return TRUE;
    }

    return FALSE;
}
|
||
|
||
// Decides whether the given characters look like a Chinese personal name.
//
// lpcwszChar: characters to test; the first one is treated as the surname.
// uCount:     number of characters in lpcwszChar.
//
// The first character must be present in the surname table. Each following
// character then contributes its probability from the m_pCharProb table (or
// FL_DEFAULT_CHAR_PROBABILITY when it is not listed); the product of those
// probabilities must reach m_dProperNameThreshold. With a single character
// the product stays at 1.
//
// Returns TRUE when both conditions hold, FALSE otherwise. A NULL pointer or
// a zero count yields FALSE instead of reading lpcwszChar[0].
//
// The surname table must already be sorted with UnicodeCompare (InitData()
// does this), because it is searched with bsearch().
BOOL CProperNoun::IsAChineseName(
    LPCWSTR lpcwszChar,
    UINT    uCount)
{
    if (lpcwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Search key shaped like one row of the surname table: the candidate
    // surname character followed by terminators. A local buffer is used so
    // calls do not share mutable state.
    WCHAR wszSurname[3] = { lpcwszChar[0], L'\0', L'\0' };

    // Find surname
    if (bsearch(wszSurname, m_pwszSurname, m_dwTotalSurnameNum,
            sizeof(m_pwszSurname[0]), UnicodeCompare) == NULL) {
        return FALSE;
    }

    // Calculate probability to be a proper noun
    FLOAT    flProbability = 1;
    CharProb ProbKey;

    for (UINT i = 1; i < uCount; ++i) {
        ProbKey.dwUnicode = lpcwszChar[i];

        PCharProb pFound = (PCharProb) bsearch(&ProbKey, m_pCharProb,
            m_dwTotalCharProbNum, sizeof(m_pCharProb[0]), CharCompare);

        if (pFound != NULL) {
            flProbability *= pFound->flProbability;
        } else {
            flProbability *= (FLOAT) FL_DEFAULT_CHAR_PROBABILITY;
        }
    }

    return (flProbability >= m_dProperNameThreshold) ? TRUE : FALSE;
}
|
||
|
||
// Decides whether the given characters match a known English-name pair.
//
// lpwszChar: characters to test.
// uCount:    number of characters in lpwszChar.
//
// Builds an EngName key from the first and the last character and looks it
// up in m_pEngNameData->pEngNamePair using EngNameCompare.
//
// Returns TRUE when the pair is found. Returns FALSE when it is not found,
// when the input is NULL or empty, or when no English-name data is loaded.
// The last case matters because the constructor sets m_pEngNameData to NULL
// and the code that would load it in InitData() is disabled; without the
// guard this function would dereference a NULL pointer.
BOOL CProperNoun::IsAEnglishName(
    LPCWSTR lpwszChar,
    UINT    uCount)
{
    if (m_pEngNameData == NULL || m_pEngNameData->pEngNamePair == NULL) {
        return FALSE;
    }

    // uCount - 1 below would wrap around for an empty input.
    if (lpwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Local key: only the two fields read by EngNameCompare need values.
    EngName Key;

    Key.wPrevUnicode = lpwszChar[0];
    Key.wNextUnicode = lpwszChar[uCount - 1];

    // The element count of the pair array is dwTotalEngNamePairNum;
    // dwTotalEngUnicodeNum counts the pwUnicode entries that precede the
    // pairs in the data layout, not the pairs themselves.
    if (bsearch(&Key, m_pEngNameData->pEngNamePair,
            m_pEngNameData->dwTotalEngNamePairNum, sizeof(EngName),
            EngNameCompare) != NULL) {
        return TRUE;
    }

    return FALSE;
}
|
||
|
||
WCHAR CProperNoun::m_pwszSurname[][3] = {
|
||
L"<EFBFBD>B",
|
||
L"<EFBFBD>R",
|
||
L"<EFBFBD>_",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>C",
|
||
L"<EFBFBD>K",
|
||
L"<EFBFBD>T",
|
||
L"<EFBFBD>]",
|
||
L"<EFBFBD>q",
|
||
L"<EFBFBD>v",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>V",
|
||
L"<EFBFBD>w",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>E",
|
||
L"<EFBFBD>d",
|
||
L"<EFBFBD>f",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>H",
|
||
L"<EFBFBD>L",
|
||
L"<EFBFBD>f",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>P",
|
||
L"<EFBFBD>s",
|
||
L"<EFBFBD>u",
|
||
L"<EFBFBD>x",
|
||
L"<EFBFBD>}",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>L",
|
||
L"<EFBFBD>Z",
|
||
L"<EFBFBD>k",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>J",
|
||
L"<EFBFBD>\\",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>I",
|
||
L"<EFBFBD>R",
|
||
L"<EFBFBD>_",
|
||
L"<EFBFBD>d",
|
||
L"<EFBFBD>h",
|
||
L"<EFBFBD>q",
|
||
L"<EFBFBD>x",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>J",
|
||
L"<EFBFBD>S",
|
||
L"<EFBFBD>]",
|
||
L"<EFBFBD>p",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>L",
|
||
L"<EFBFBD>V",
|
||
L"<EFBFBD>]",
|
||
L"<EFBFBD>c",
|
||
L"<EFBFBD>u",
|
||
L"<EFBFBD>}",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>Z",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>K",
|
||
L"<EFBFBD>q",
|
||
L"<EFBFBD>|",
|
||
L"<EFBFBD>}",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>O",
|
||
L"<EFBFBD>Z",
|
||
L"<EFBFBD>d",
|
||
L"<EFBFBD>h",
|
||
L"<EFBFBD>i",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>\\",
|
||
L"<EFBFBD>s",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>^",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>J",
|
||
L"<EFBFBD>q",
|
||
L"<EFBFBD>{",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>O",
|
||
L"<EFBFBD>P",
|
||
L"<EFBFBD>R",
|
||
L"<EFBFBD>d",
|
||
L"<EFBFBD>k",
|
||
L"<EFBFBD>s",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>q",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>Q",
|
||
L"<EFBFBD>l",
|
||
L"<EFBFBD>p",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>a",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>p",
|
||
L"<EFBFBD>u",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>B",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>G",
|
||
L"<EFBFBD>H",
|
||
L"<EFBFBD>|",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>P",
|
||
L"<EFBFBD>c",
|
||
L"<EFBFBD>p",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>F",
|
||
L"<EFBFBD>N",
|
||
L"<EFBFBD>R",
|
||
L"<EFBFBD>d",
|
||
L"<EFBFBD>j",
|
||
L"<EFBFBD>s",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>t",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"£",
|
||
L"²",
|
||
L"¿",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>C",
|
||
L"<EFBFBD>Q",
|
||
L"<EFBFBD>e",
|
||
L"ù",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>Y",
|
||
L"<EFBFBD>u",
|
||
L"ĩ",
|
||
L"Ī",
|
||
L"Ĭ",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>U",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD><EFBFBD>",
|
||
L"<EFBFBD>e",
|
||
L"<EFBFBD>s",
|
||
L"м",
|
||
L"<EFBFBD>\\",
|
||
L"<EFBFBD>k"
|
||
};
|
||
|
||
// Number of rows in m_pwszSurname, derived from the array's size so it
// follows the initializer list automatically.
DWORD CProperNoun::m_dwTotalSurnameNum =
    sizeof(m_pwszSurname) / sizeof(*m_pwszSurname);