447 lines
7.5 KiB
C++
447 lines
7.5 KiB
C++
|
#include <windows.h>
|
|||
|
#include <assert.h>
|
|||
|
#include "PropNoun.H"
|
|||
|
|
|||
|
// Comparison callback in the qsort()/bsearch() style for CharProb records.
//
// item1, item2 - pointers to CharProb records.
//
// Returns 1 when the first record's dwUnicode is greater, -1 when it is
// smaller and 0 when both records carry the same dwUnicode.
int __cdecl CharCompare(
    const void *item1,
    const void *item2)
{
    const CharProb *pLeft  = static_cast<const CharProb *>(item1);
    const CharProb *pRight = static_cast<const CharProb *>(item2);

    if (pLeft->dwUnicode == pRight->dwUnicode) {
        return 0;
    }

    return (pLeft->dwUnicode > pRight->dwUnicode) ? 1 : -1;
}
|
|||
|
|
|||
|
// Comparison callback in the qsort()/bsearch() style for NUL-terminated
// wide-character strings.
//
// item1, item2 - pointers to NUL-terminated WCHAR strings.
//
// Returns the memcmp() result over the raw bytes of the two strings:
// negative, zero or positive. The ordering is byte-wise rather than numeric
// per WCHAR; it only has to be the same ordering for sorting and searching.
int __cdecl UnicodeCompare(
    const void *item1,
    const void *item2)
{
    // lstrlenW() reports 0 for a NULL pointer, so treat NULL as an empty
    // string instead of handing a NULL pointer to memcmp().
    static const WCHAR wszEmpty[1] = { 0 };
    LPCWSTR pwszFirst  = item1 ? (LPCWSTR) item1 : wszEmpty;
    LPCWSTR pwszSecond = item2 ? (LPCWSTR) item2 : wszEmpty;

    const int cbFirst   = lstrlenW(pwszFirst) * sizeof(WCHAR);
    const int cbSecond  = lstrlenW(pwszSecond) * sizeof(WCHAR);
    const int cbShorter = (cbFirst < cbSecond) ? cbFirst : cbSecond;

    // Compare the common prefix plus ONE more WCHAR. When the lengths differ
    // that extra WCHAR is the shorter string's terminator against a non-zero
    // character of the longer one, so the first differing byte (and therefore
    // the sign of the result) is the same as when comparing the longer
    // length - but memcmp() is never asked to read past the terminator of the
    // shorter string, which may be the end of its buffer.
    return memcmp(pwszFirst, pwszSecond, cbShorter + sizeof(WCHAR));
}
|
|||
|
|
|||
|
// Comparison callback in the qsort()/bsearch() style for EngName records.
//
// item1, item2 - pointers to EngName records.
//
// Records are ordered by wPrevUnicode first; wNextUnicode breaks ties.
// Returns 1, -1 or 0 accordingly.
int __cdecl EngNameCompare(
    const void *item1,
    const void *item2)
{
    const EngName *pLeft  = static_cast<const EngName *>(item1);
    const EngName *pRight = static_cast<const EngName *>(item2);

    // Primary key.
    if (pLeft->wPrevUnicode != pRight->wPrevUnicode) {
        return (pLeft->wPrevUnicode > pRight->wPrevUnicode) ? 1 : -1;
    }

    // Secondary key, consulted only when the primary keys are equal.
    if (pLeft->wNextUnicode != pRight->wNextUnicode) {
        return (pLeft->wNextUnicode > pRight->wNextUnicode) ? 1 : -1;
    }

    return 0;
}
|
|||
|
|
|||
|
// Constructs an object with no data loaded yet.
//
// hInstance - module handle stored in m_hInstance; InitData() passes it to
//             the resource APIs.
//
// The threshold starts at FL_PROPER_NAME_THRESHOLD; the data pointers,
// the character-probability count and the heap handle start out empty.
CProperNoun::CProperNoun(HINSTANCE hInstance)
    : m_dProperNameThreshold(FL_PROPER_NAME_THRESHOLD)
    , m_pCharProb(NULL)
    , m_dwTotalCharProbNum(0)
    , m_pEngNameData(NULL)
    , m_hProcessHeap(0)
    , m_hInstance(hInstance)
{
}
|
|||
|
|
|||
|
// Destructor. The body is intentionally empty: it performs no explicit
// cleanup of the members set up by the constructor or InitData().
CProperNoun::~CProperNoun()
{
}
|
|||
|
|
|||
|
// Loads the character-probability table from the module's resources and
// sorts the built-in surname table.
//
// Returns TRUE when the "CNAME"/"BIN" resource was found, loaded and locked;
// FALSE as soon as any of those steps fails (the surname table is then left
// unsorted).
BOOL CProperNoun::InitData()
{
    m_hProcessHeap = GetProcessHeap();

    // Locate the character-probability resource.
    HRSRC hCharRes = FindResource(m_hInstance, TEXT("CNAME"), TEXT("BIN"));
    if (hCharRes == NULL) {
        return FALSE;
    }

    // Load it and obtain a pointer to its bytes.
    HGLOBAL hCharData = LoadResource(m_hInstance, hCharRes);
    if (hCharData == NULL) {
        return FALSE;
    }

    m_pCharProb = (PCharProb) LockResource(hCharData);
    if (m_pCharProb == NULL) {
        return FALSE;
    }

    // The resource is treated as a packed array of CharProb records.
    m_dwTotalCharProbNum = SizeofResource(m_hInstance, hCharRes) / sizeof(CharProb);

    // Disabled loader for the English-name data, kept for reference. While it
    // stays disabled this function never assigns m_pEngNameData.
    /*
    // Find resource
    hResource = FindResource(m_hInstance, TEXT("ENAME"),
        TEXT("BIN"));
    if (!hResource) { goto _exit; }

    // Load resource
    hGlobal = LoadResource(m_hInstance, hResource);
    if (!hGlobal) { goto _exit; }

    m_pEngNameData = (PEngNameData) LockResource(hGlobal);
    m_pEngNameData->pwUnicode = (PWORD) ((PBYTE) m_pEngNameData +
        sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
        sizeof(m_pEngNameData->dwTotalEngNamePairNum));
    m_pEngNameData->pEngNamePair = (PEngName) ((PBYTE) m_pEngNameData +
        sizeof(m_pEngNameData->dwTotalEngUnicodeNum) +
        sizeof(m_pEngNameData->dwTotalEngNamePairNum) +
        sizeof(m_pEngNameData->pwUnicode[0]) * m_pEngNameData->dwTotalEngUnicodeNum);

    // m_pEngName = (PEngName) LockResource(hGlobal);
    // m_dwTotalEngNameNum = SizeofResource(m_hInstance, hResource) / sizeof(EngName);
    */

    // Put the surname table into UnicodeCompare order so that it can be
    // searched with bsearch() using the same comparator.
    qsort(m_pwszSurname, m_dwTotalSurnameNum, sizeof(m_pwszSurname[0]), UnicodeCompare);

    return TRUE;
}
|
|||
|
|
|||
|
// Reports whether the given characters look like a proper noun.
//
// lpwszChar - characters to classify.
// uCount    - number of characters in lpwszChar.
//
// Returns TRUE when either IsAChineseName() or IsAEnglishName() accepts the
// text. The English check runs only when the Chinese check rejects it.
BOOL CProperNoun::IsAProperNoun(
    LPWSTR lpwszChar,
    UINT   uCount)
{
    if (IsAChineseName(lpwszChar, uCount)) {
        return TRUE;
    }

    return IsAEnglishName(lpwszChar, uCount) ? TRUE : FALSE;
}
|
|||
|
|
|||
|
// Reports whether the given characters look like a Chinese personal name.
//
// lpcwszChar - characters to classify; the first one is the surname
//              candidate, the rest are scored individually.
// uCount     - number of characters in lpcwszChar.
//
// Returns TRUE when the first character is found in m_pwszSurname and the
// product of the probabilities of the remaining characters reaches
// m_dProperNameThreshold. Characters missing from m_pCharProb contribute
// FL_DEFAULT_CHAR_PROBABILITY. Returns FALSE otherwise, including for a NULL
// pointer or an empty input.
BOOL CProperNoun::IsAChineseName(
    LPCWSTR lpcwszChar,
    UINT    uCount)
{
    // Nothing to classify; also avoids reading lpcwszChar[0] of an empty input.
    if (lpcwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Search key shaped like one row of m_pwszSurname: the first character
    // followed by NUL padding. A local buffer (rather than a function-static
    // one) keeps concurrent or nested calls from sharing scratch state.
    WCHAR wszSurname[3] = { 0 };
    wszSurname[0] = lpcwszChar[0];

    // Find surname
    if (bsearch(wszSurname, m_pwszSurname, m_dwTotalSurnameNum,
                sizeof(m_pwszSurname[0]), UnicodeCompare) == NULL) {
        return FALSE;
    }

    // Calculate probability to be a proper noun
    FLOAT    flProbability = 1;
    CharProb Key;

    for (UINT i = 1; i < uCount; ++i) {
        Key.dwUnicode = lpcwszChar[i];

        const CharProb *pMatch = (const CharProb *) bsearch(&Key, m_pCharProb,
            m_dwTotalCharProbNum, sizeof(m_pCharProb[0]), CharCompare);

        if (pMatch != NULL) {
            flProbability *= pMatch->flProbability;
        } else {
            flProbability *= (FLOAT) FL_DEFAULT_CHAR_PROBABILITY;
        }
    }

    return (flProbability >= m_dProperNameThreshold) ? TRUE : FALSE;
}
|
|||
|
|
|||
|
// Reports whether the given characters match an entry of the English-name
// pair table.
//
// lpwszChar - characters to classify.
// uCount    - number of characters in lpwszChar.
//
// Returns TRUE when the pair (first character, last character) is found in
// m_pEngNameData->pEngNamePair. Returns FALSE otherwise, including when the
// English-name data has not been loaded or the input is NULL or empty.
BOOL CProperNoun::IsAEnglishName(
    LPCWSTR lpwszChar,
    UINT    uCount)
{
    // m_pEngNameData is set to NULL by the constructor and the code that
    // would load it in InitData() is commented out, so it must be checked
    // before use. uCount == 0 would make the index below wrap around.
    if (m_pEngNameData == NULL || lpwszChar == NULL || uCount == 0) {
        return FALSE;
    }

    // Search key; EngNameCompare reads only these two fields. A local (rather
    // than function-static) key keeps calls from sharing scratch state.
    EngName Key;
    Key.wPrevUnicode = lpwszChar[0];
    Key.wNextUnicode = lpwszChar[uCount - 1];

    // The array being searched is the pair table, so its element count is
    // dwTotalEngNamePairNum; dwTotalEngUnicodeNum counts the entries of the
    // separate pwUnicode array.
    const void *pMatch = bsearch(&Key, m_pEngNameData->pEngNamePair,
        m_pEngNameData->dwTotalEngNamePairNum, sizeof(EngName), EngNameCompare);

    return (pMatch != NULL) ? TRUE : FALSE;
}
|
|||
|
|
|||
|
// Surname table searched by IsAChineseName(). Each row has room for two
// WCHARs plus the terminator. InitData() sorts the rows with UnicodeCompare,
// so their order here does not have to match the search order.
//
// NOTE(review): the literals below contain "<EFBFBD>" markers (the UTF-8
// bytes of U+FFFD, the replacement character) and stray Latin/Cyrillic
// letters, which suggests the original character encoding was lost when this
// text was extracted. They are kept exactly as found - restore them from a
// copy of the file in its original encoding before building.
WCHAR CProperNoun::m_pwszSurname[][3] = {
    L"<EFBFBD>B",
    L"<EFBFBD>R",
    L"<EFBFBD>_",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>C",
    L"<EFBFBD>K",
    L"<EFBFBD>T",
    L"<EFBFBD>]",
    L"<EFBFBD>q",
    L"<EFBFBD>v",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>V",
    L"<EFBFBD>w",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>E",
    L"<EFBFBD>d",
    L"<EFBFBD>f",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>H",
    L"<EFBFBD>L",
    L"<EFBFBD>f",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>P",
    L"<EFBFBD>s",
    L"<EFBFBD>u",
    L"<EFBFBD>x",
    L"<EFBFBD>}",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>L",
    L"<EFBFBD>Z",
    L"<EFBFBD>k",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>J",
    L"<EFBFBD>\\",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>I",
    L"<EFBFBD>R",
    L"<EFBFBD>_",
    L"<EFBFBD>d",
    L"<EFBFBD>h",
    L"<EFBFBD>q",
    L"<EFBFBD>x",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>J",
    L"<EFBFBD>S",
    L"<EFBFBD>]",
    L"<EFBFBD>p",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>L",
    L"<EFBFBD>V",
    L"<EFBFBD>]",
    L"<EFBFBD>c",
    L"<EFBFBD>u",
    L"<EFBFBD>}",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>Z",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>K",
    L"<EFBFBD>q",
    L"<EFBFBD>|",
    L"<EFBFBD>}",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>O",
    L"<EFBFBD>Z",
    L"<EFBFBD>d",
    L"<EFBFBD>h",
    L"<EFBFBD>i",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>\\",
    L"<EFBFBD>s",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>^",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>J",
    L"<EFBFBD>q",
    L"<EFBFBD>{",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>O",
    L"<EFBFBD>P",
    L"<EFBFBD>R",
    L"<EFBFBD>d",
    L"<EFBFBD>k",
    L"<EFBFBD>s",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>q",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>Q",
    L"<EFBFBD>l",
    L"<EFBFBD>p",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>a",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>p",
    L"<EFBFBD>u",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>B",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>G",
    L"<EFBFBD>H",
    L"<EFBFBD>|",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>P",
    L"<EFBFBD>c",
    L"<EFBFBD>p",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>F",
    L"<EFBFBD>N",
    L"<EFBFBD>R",
    L"<EFBFBD>d",
    L"<EFBFBD>j",
    L"<EFBFBD>s",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>t",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"£",
    L"²",
    L"¿",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>C",
    L"<EFBFBD>Q",
    L"<EFBFBD>e",
    L"ù",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>Y",
    L"<EFBFBD>u",
    L"ĩ",
    L"Ī",
    L"Ĭ",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>U",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD><EFBFBD>",
    L"<EFBFBD>e",
    L"<EFBFBD>s",
    L"м",
    L"<EFBFBD>\\",
    L"<EFBFBD>k"
};
|
|||
|
|
|||
|
// Number of rows in m_pwszSurname: the size of the whole table divided by
// the size of one row.
DWORD CProperNoun::m_dwTotalSurnameNum = sizeof(m_pwszSurname) / sizeof(*m_pwszSurname);
|