windows-nt/Source/XPSP1/NT/inetsrv/intlwb/kor/baseapi.cpp

1538 lines
43 KiB
C++
Raw Normal View History

2020-09-26 03:20:57 -05:00
// =========================================================================
// Copyright (C) 1997 - 1998, Microsoft Corporation. All Rights Reserved.
//
// File Name : BASEAPI.CPP
// Function : NLP BASE ENGINE API Definition
// =========================================================================
#include <string.h>
#include <malloc.h>
#include <sys\stat.h>
#include "basecore.hpp"
#include "basecode.hpp"
#include "basedef.hpp"
#include "basegbl.hpp"
#include "MainDict.h"
extern int Compose_RIEUL_Irregular (char *, char *);
extern int Compose_HIEUH_Irregular (char *, char *);
extern int Compose_PIEUP_Irregular (char *, char *);
extern int Compose_TIEUT_Irregular (char *, char *);
extern int Compose_SIOS_Irregular (char *, char *);
extern BOOL Compose_YEO_Irregular (char *, char *);
extern BOOL Compose_REO_REU_Irregular (char *, char *);
extern BOOL Compose_GEORA_Irregular (char *, char *);
extern BOOL Compose_Regular (char *, char *);
extern void SetSilHeosa (int, WORD *);
#include "stemkor.h"
// by dhyu -- 1996. 1
// One row of the contracted-particle table: a contracted spelling paired
// with a second, related spelling (presumably the uncontracted form -- the
// second field is not read anywhere in this file, TODO confirm).
typedef struct
{
LPCSTR contract;        // contracted spelling; NULL marks the end of the table
LPCSTR noconstract;     // counterpart spelling (field name kept as originally spelled)
} contract_tossi;
// Table of contracted particles, terminated by a { NULL, NULL } row.
// StemmerCompose walks it until `contract` is NULL and compares each
// `contract` string against the particle it has just generated.
// The strings are raw double-byte sequences (presumably Wansung-encoded
// Hangul -- TODO confirm the exact characters against the code page).
contract_tossi ContractTossi [] =
{
{ "\xa4\xa4", "\xB4\xC2"},
{ "\xA4\xA9", "\xB8\xA6"},
{ "\xA4\xA4\xC4\xBF\xB3\xE7", "\xB4\xC2\xC4\xBF\xB3\xE7"},
{ NULL, NULL}
};
/*
char ChangableFirstStem [][2] =
{
{__K_D_D, __V_m}, // ssangtikeut, eu
{
}
*/
inline
BOOL isHANGEUL(char cCh1,char cCh2)
{
    // Decide whether the byte pair (cCh1, cCh2) falls into one of the
    // lead/trail byte ranges this module accepts as a Hangul character.
    // Returns TRUE for an accepted pair, FALSE otherwise.
    const unsigned char lead  = (unsigned char) cCh1;
    const unsigned char trail = (unsigned char) cCh2;

    if (trail >= 0xa1)
    {
        // Trail byte in the upper range: three lead-byte ranges qualify.
        if (lead >= 0xb0 && lead <= 0xc8)
            return TRUE;
        if (lead >= 0x81 && lead <= 0xa0)
            return TRUE;
        // (lead bytes 0xca..0xfe were deliberately left out by the original author)
        if (lead == 0xa4)
            return TRUE;
        return FALSE;
    }

    // Trail byte below 0xa1: only the 0x41-0x5a, 0x61-0x7a and 0x81-0xa0
    // trail ranges qualify, and only with a lead byte in 0x81..0xc5.
    const BOOL trailInExtRange =
        (trail >= 0x41 && trail <= 0x5a) ||
        (trail >= 0x61 && trail <= 0x7a) ||
        (trail >= 0x81 && trail <= 0xa0);

    if (trailInExtRange && lead >= 0x81 && lead <= 0xc5)
        return TRUE;

    return FALSE;
}
// Creates a stemmer session.
//   hStm : receives the new session handle on success; left untouched on failure.
// Returns NULL (0) on success, FAIL if the handle pointer is NULL or the
// session block cannot be allocated or locked.
WINSRC StemmerInit(HSTM *hStm) // Stemmer Engine session Handle
{
    if (hStm == NULL)
        return FAIL;

    // GHND = movable + zero-initialised, so every STMI field starts at 0.
    HGLOBAL hgbl = GlobalAlloc(GHND, sizeof(STMI));
    if (hgbl == NULL)
        return FAIL;

    STMI *pstmi = (STMI*)GlobalLock(hgbl);
    if (pstmi == NULL)
    {
        // Do not leak the block, and do not hand a dead handle to the caller.
        GlobalFree(hgbl);
        return FAIL;
    }

    pstmi->Option = 0x00000000;
    GlobalUnlock(hgbl);

    // Publish the handle only once the session is fully initialised.
    *hStm = (HSTM) hgbl;
    return NULL; // normal operation
}
// Stores `Option` in the session identified by hStm.
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the
// session handle cannot be locked (a message box is shown in that case).
WINSRC StemmerSetOption (HSTM hStm, UINT Option)
{
    HGLOBAL hSession = (HGLOBAL) hStm;
    STMI *pSession = (STMI *) GlobalLock (hSession);

    if (pSession != NULL)
    {
        pSession->Option = Option;
        GlobalUnlock (hSession);
        return NULL;
    }

    // Lock failed: report it and bail out.
    MessageBox (NULL, "StemmerSetOption", "Fail", MB_OK);
    GlobalUnlock (hSession);
    return srcModuleError | srcInvalidID;
}
// Copies the session's current option word into *Option.
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the
// session handle cannot be locked (*Option is not written in that case).
WINSRC StemmerGetOption (HSTM hStm, UINT *Option)
{
    HGLOBAL hSession = (HGLOBAL) hStm;
    STMI *pSession = (STMI *) GlobalLock (hSession);

    if (pSession != NULL)
    {
        *Option = pSession->Option;
        GlobalUnlock (hSession);
        return NULL;
    }

    GlobalUnlock (hSession);
    return srcModuleError | srcInvalidID;
}
// Opens the main dictionary located at lpspathMain.
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the
// session handle cannot be locked, and srcIOErrorMdr | srcInvalidMdr when
// the path is empty or OpenMainDict reports failure.
// NOTE(review): the session is locked but none of its fields are written
// here; in particular pstmi->bMdr is not set by this function -- confirm
// where it is expected to be set.
WINSRC StemmerOpenMdr(HSTM sid, char *lpspathMain) // Dictionary File path
{
    HGLOBAL hSession = (HGLOBAL) sid;
    STMI *pSession = (STMI *) GlobalLock (hSession);

    if (pSession == NULL)
    {
        GlobalUnlock (hSession);
        return srcModuleError | srcInvalidID;
    }

    // An empty path is rejected without touching the dictionary.
    BOOL opened = FALSE;
    if (lstrlen (lpspathMain) != 0)
        opened = OpenMainDict (lpspathMain) ? TRUE : FALSE;

    GlobalUnlock (hSession);

    if (!opened)
        return srcIOErrorMdr | srcInvalidMdr;
    return NULL; // normal operation
}
// Closes the main dictionary if the session's bMdr flag is set.
// Returns NULL (0) on success, FAIL when the session handle cannot be locked.
WINSRC StemmerCloseMdr(HSTM sid)
{
    HGLOBAL hSession = (HGLOBAL) sid;
    STMI *pSession = (STMI *) GlobalLock (hSession);

    if (pSession == NULL)
    {
        return FAIL;
    }

    if (pSession->bMdr)
    {
        CloseMainDict ();
    }

    GlobalUnlock (hSession);
    return NULL; // normal operation
}
// Wide-character front end for StemmerDecompose.
//   hStm  : session handle.
//   iword : NUL-terminated wide input word.
//   lpSob : in  - wordlist buffer and its capacity (sch);
//           out - num, len and the converted word list.
// The input is converted to the UWANSUNG code page, decomposed, and every
// resulting record is converted back.  Each output record is laid out as
//   <wide string incl. NUL> <one WCHAR slot holding the 2-byte tag> <NUL>
// and lpSob->len is the number of WCHARs written.
// Returns the code of StemmerDecompose, or srcModuleError on conversion or
// allocation failure.  lpSob->wordlist is only filled when the result is NULL (0).
WINSRC StemmerDecomposeW (HSTM hStm,
                          LPCWSTR iword,
                          LPWDOB lpSob)
{
    LPSTR MultiByteIword;
    DOB sob;
    int index = 0;      // write position in lpSob->wordlist, in WCHARs

    int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) iword, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return srcModuleError;      // size query failed: nothing sensible to convert

    MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if ( MultiByteIword == NULL ) {
        return srcModuleError;
    }
    len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) iword, -1, MultiByteIword, len, NULL, NULL);

    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if ( sob.wordlist == NULL ) {
        LocalFree(MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;
    // StemmerDecompose can return before touching these (e.g. when the
    // session cannot be locked), so give them defined values.
    sob.num = 0;
    sob.len = 0;

    SRC src = StemmerDecompose(hStm, MultiByteIword, &sob);
    lpSob->num = sob.num;

    if (src == NULL)
    {
        int index2 = 0;     // read position in sob.wordlist, in bytes
        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int mblen = lstrlen (tmpstr);

            // Convert straight into the caller's buffer; len counts the NUL.
            len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, lpSob->wordlist + index, len);

            // The 2-byte tag that follows the narrow string fills one WCHAR slot.
            memcpy (lpSob->wordlist + index + len, tmpstr + mblen + 1, 2);
            lpSob->wordlist [index + len + 1] = L'\0';

            index  += (len + 2);
            index2 += (mblen + 4);      // string + NUL + 2-byte tag + NUL
        }
    }
    lpSob->len = (WORD)index;

    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);
    return src;
}
// Copies the next analysis held in rList into lpSob.
// rList->next points at text of the form "m1+m2+...\t": morphemes are
// separated by '+' and one analysis ends at '\t'.  Each morpheme is written
// to lpSob->wordlist as
//   <morpheme bytes> NUL <2-byte value from SetSilHeosa> NUL
// lpSob->num counts the morphemes written and lpSob->len the bytes used.
// On success rList->next / rList->num are advanced past the analysis.
// Returns NULL (0) on success, srcNoMoreResult when every entry has been
// consumed (lpSob is not touched), or srcOOM | srcExcessBuffer when the
// output buffer (lpSob->sch bytes) cannot hold the next morpheme record.
SRC GetOneResult (RLIST *rList, LPDOB lpSob)
{
    WORD value;
    int count = 0;
    // Declared outside the loop because both are needed after it.
    UINT i;
    UINT index = 0;

    if (rList->num >= rList->max)
        return srcNoMoreResult;

    lpSob->len = 0;
    lpSob->num = 0;
    for (i = rList->num; i < rList->max; i++)
    {
        // Measure the morpheme that starts at `index`.
        count = 0;
        while (rList->next [index+count] != '+' && rList->next [index+count] != '\t')
            count++;

        // A record needs count bytes plus 4 trailing bytes (NUL, 2-byte
        // value, NUL); checking only `count` let the trailer run past the
        // end of the buffer.
        if (lpSob->len + count + 4 > lpSob->sch)
            return srcOOM | srcExcessBuffer;

        memcpy (lpSob->wordlist+lpSob->len, rList->next+index, count);
        lpSob->num++;
        lpSob->len += (WORD)count;
        lpSob->wordlist [lpSob->len++] = '\0';

        SetSilHeosa(rList->vbuf [i], &value);
        memcpy (lpSob->wordlist + lpSob->len, &value, 2);
        lpSob->wordlist [lpSob->len+2] = '\0';
        lpSob->len += 3;

        if (rList->next[index+count] == '\t')
            break;                      // end of this analysis
        index += (count + 1);           // skip the '+' separator
    }

    rList->next += (index+count+1);
    rList->num = i+1;
    return NULL;
}
// Decomposes a double-byte word into its first candidate analysis.
//   hstm  : session handle; its Option bits select the analysers to run.
//   iword : NUL-terminated input word.
//   psob  : in  - wordlist buffer and capacity (sch);
//           out - num, len and the first analysis (see GetOneResult).
// All analyses are cached in the session (rList); further ones are fetched
// with StemmerDecomposeMore.
// Returns NULL (0) on success; srcInvalid when the word is 45 bytes or
// longer, contains a non-Hangul byte pair, or has no analysis (the input is
// then echoed back with num = 1); srcModuleError | srcInvalidID when the
// session cannot be locked; otherwise whatever GetOneResult reports.
WINSRC StemmerDecompose(HSTM hstm,
                        LPCSTR iword, // input word
                        LPDOB psob) // the number of candidates
{
    int len = lstrlen ((char *) iword);

    // Reject over-long words and anything that is not purely Hangul.
    BOOL analyzable = (len < 45);
    for (int pos = 0; analyzable && pos < len; pos += 2)
        if (!isHANGEUL (iword [pos], iword [pos+1]))
            analyzable = FALSE;
    if (!analyzable)
    {
        psob->num = 1;
        lstrcpy ((LPSTR) psob->wordlist, (LPSTR) iword);
        psob->len = (WORD)len;
        return srcInvalid;
    }

    STMI *pstmi;
    HGLOBAL hgbl = (HGLOBAL) hstm;
    pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    BaseEngine BaseCheck;
    char lrgsz [400];           // scratch output of one analyser
    memset (pstmi->rList.lrgsz, NULLCHAR, 400);
    lstrcpy (pstmi->rList.iword, iword);
    pstmi->rList.max = 0;

    // Cleared as soon as the ALONE or NOUN analyser succeeds; the suffix
    // analyser only runs while it is still set.
    BOOL affixFlag = TRUE;

    if (pstmi->Option & SO_ALONE)
    {
        int num = BaseCheck.NLP_BASE_ALONE (iword, lrgsz);
        if (num > 0)
        {
            affixFlag = FALSE;
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }
    if (pstmi->Option & SO_NOUNPHRASE)
    {
        int num = BaseCheck.NLP_BASE_NOUN (iword, lrgsz);
        if (num > 0)
        {
            affixFlag = FALSE;
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }
    if (pstmi->Option & SO_PREDICATE)
    {
        int num = BaseCheck.NLP_BASE_VERB (iword, lrgsz);
        if (num > 0)
        {
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }
    // Compound analysis is a fallback: only tried when nothing was found yet.
    if ((pstmi->Option & SO_COMPOUND) && pstmi->rList.max == 0)
    {
        int num = BaseCheck.NLP_BASE_COMPOUND (iword, lrgsz);
        if (num > 0)
        {
            lstrcpy (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
            pstmi->rList.max = num;
        }
    }
    if (affixFlag && (pstmi->Option & SO_SUFFIX))
    {
        int num = BaseCheck.NLP_BASE_AFFIX (iword, lrgsz);
        if (num > 0)
        {
            lstrcat (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
            pstmi->rList.max += num;
        }
    }

    // Rewind the cache and hand out the first analysis.
    pstmi->rList.num = 0;
    pstmi->rList.next = pstmi->rList.lrgsz;
    SRC src = GetOneResult (&(pstmi->rList), psob);
    if (src == srcNoMoreResult)
    {
        // No analysis at all: echo the input exactly like the early
        // rejections above.  GetOneResult leaves num/len untouched on this
        // path, so they must be filled in here.
        src = srcInvalid;
        psob->num = 1;
        lstrcpy (psob->wordlist, iword);
        psob->len = (WORD)len;
    }
    GlobalUnlock(hgbl);
    return src;
}
// Wide-character front end for StemmerDecomposeMore: fetches the next
// cached analysis of lpWord and converts it to wide characters.
//   lpSob : in  - wordlist buffer and its capacity (sch);
//           out - num, len and the converted word list.
// Each output record is laid out as
//   <wide string incl. NUL> <one WCHAR slot holding the 2-byte tag> <NUL>
// and lpSob->len is the number of WCHARs written.
// Returns the code of StemmerDecomposeMore, or srcModuleError on conversion
// or allocation failure.
WINSRC StemmerDecomposeMoreW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob)
{
    LPSTR MultiByteIword;
    DOB sob;

    int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return srcModuleError;      // size query failed

    MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if ( MultiByteIword == NULL ) {
        return srcModuleError;
    }
    len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, len, NULL, NULL);

    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if ( sob.wordlist == NULL ) {
        LocalFree(MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;
    // The narrow call returns without writing num/len on its error paths
    // (lock failure, word mismatch, no more results); start from defined values.
    sob.num = 0;
    sob.len = 0;

    SRC src = StemmerDecomposeMore(hStm, MultiByteIword, &sob);
    lpSob->num = sob.num;

    int index = 0;      // write position in lpSob->wordlist, in WCHARs
    if (src == NULL)
    {
        int index2 = 0; // read position in sob.wordlist, in bytes
        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int mblen = lstrlen (tmpstr);

            // Convert straight into the caller's buffer; len counts the NUL.
            len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, lpSob->wordlist + index, len);

            // The 2-byte tag that follows the narrow string fills one WCHAR slot.
            memcpy (lpSob->wordlist + index + len, tmpstr + mblen + 1, 2);
            lpSob->wordlist [index + len + 1] = L'\0';

            index  += (len + 2);
            index2 += (mblen + 4);      // string + NUL + 2-byte tag + NUL
        }
    }
    lpSob->len = (WORD)index;

    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);
    return src;
}
// Returns the next cached analysis for lpWord.
// The word must be the one most recently stored in the session's result
// list (rList.iword); otherwise srcModuleError is returned.
// Returns NULL (0) with lpSob filled on success, srcNoMoreResult (or another
// GetOneResult code) when appropriate, and srcModuleError | srcInvalidID
// when the session cannot be locked.
WINSRC StemmerDecomposeMore (HSTM hStm, LPCSTR lpWord, LPDOB lpSob)
{
    HGLOBAL hgbl = (HGLOBAL) hStm;
    STMI *pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    SRC src;
    if (lstrcmp (pstmi->rList.iword, lpWord))
    {
        // The cache belongs to a different word.
        src = srcModuleError;
    }
    else
    {
        src = GetOneResult (&(pstmi->rList), lpSob);
    }

    // Single unlock for both outcomes: the mismatch path used to return
    // with the session still locked.
    GlobalUnlock(hgbl);
    return src;
}
// Enumerates every analysis of a wide-character word through a callback.
//   hStm         : session handle.
//   lpWord       : NUL-terminated wide input word.
//   lpSob        : in  - wordlist buffer and capacity (sch);
//                  out - refilled with one analysis before each callback.
//   lpfnCallBack : invoked once per analysis with lpSob.
// The noun, stand-alone and verb analysers are always run; the compound
// analyser is used only when none of them produced a result.
// Returns NULL (0) after enumeration; srcInvalid when the word is too long
// or is not made of Hangul syllables (the input is echoed back with
// num = 1); srcModuleError (| srcInvalidID) on lock or allocation failure.
WINSRC StemmerEnumDecomposeW (HSTM hStm, LPCWSTR lpWord, LPWDOB lpSob, LPFNDECOMPOSEW lpfnCallBack)
{
    LPSTR MultiByteIword;
    DOB sob;

    // lpWord is wide, so its length must be taken with wcslen, and the
    // 45-byte limit of the narrow API applies to the converted form.
    int wlen = (int) wcslen (lpWord);
    int len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, NULL, 0, NULL, NULL);
    BOOL analyzable = (len > 0 && len - 1 < 45);

    // Every character must be a Hangul syllable (U+AC00..U+D7A3).  The test
    // used to be inverted and rejected exactly the words it should accept.
    // NOTE(review): the narrow API also accepts lead byte 0xA4 pairs, which
    // this range does not cover -- confirm whether those should pass here.
    for (int k = 0; analyzable && k < wlen; k++)
        if (!(0xabff < lpWord [k] && lpWord [k] < 0xd7a4))
            analyzable = FALSE;

    if (!analyzable)
    {
        lpSob->num = 1;
        wcscpy (lpSob->wordlist, lpWord);
        lpSob->len = (WORD)wlen;
        return srcInvalid;
    }

    STMI *pstmi;
    HGLOBAL hgbl = (HGLOBAL) hStm;
    pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    BaseEngine BaseCheck;

    MultiByteIword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if ( MultiByteIword == NULL ) {
        GlobalUnlock(hgbl);
        return srcModuleError;
    }
    len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpWord, -1, MultiByteIword, len, NULL, NULL);

    sob.wordlist = (LPSTR) LocalAlloc (LPTR, sizeof (char) * lpSob->sch);
    if ( sob.wordlist == NULL ) {
        GlobalUnlock(hgbl);
        LocalFree(MultiByteIword);
        return srcModuleError;
    }
    sob.sch = lpSob->sch;

    char lrgsz [400];           // scratch output of one analyser
    memset (pstmi->rList.lrgsz, NULLCHAR, 400);
    lstrcpy (pstmi->rList.iword, MultiByteIword);
    pstmi->rList.max = 0;

    int num = BaseCheck.NLP_BASE_NOUN (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcpy (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
        pstmi->rList.max = num;
    }
    num = BaseCheck.NLP_BASE_ALONE (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }
    num = BaseCheck.NLP_BASE_VERB (MultiByteIword, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }
    // The compound result REPLACES the list, so it may only be used when the
    // list is empty.  Testing the verb count alone discarded noun and
    // stand-alone results whenever the verb analyser found nothing.
    if (pstmi->rList.max == 0)
    {
        num = BaseCheck.NLP_BASE_COMPOUND (MultiByteIword, lrgsz);
        if (num > 0)
        {
            lstrcpy (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
            pstmi->rList.max = num;
        }
    }

    pstmi->rList.num = 0;
    pstmi->rList.next = pstmi->rList.lrgsz;
    while (GetOneResult (&(pstmi->rList), &sob) == NULL)
    {
        int index = 0;      // write position in lpSob->wordlist, in WCHARs
        int index2 = 0;     // read position in sob.wordlist, in bytes
        for (int j = 0; j < sob.num; j++)
        {
            char *tmpstr = sob.wordlist + index2;
            int mblen = lstrlen (tmpstr);

            // Convert straight into the caller's buffer; len counts the NUL.
            len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, NULL, 0);
            MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, tmpstr, -1, lpSob->wordlist + index, len);

            // The 2-byte tag that follows the narrow string fills one WCHAR slot.
            memcpy (lpSob->wordlist + index + len, tmpstr + mblen + 1, 2);
            lpSob->wordlist [index + len + 1] = L'\0';

            index  += (len + 2);
            index2 += (mblen + 4);      // string + NUL + 2-byte tag + NUL
        }
        lpSob->len = (WORD)index;
        lpSob->num = sob.num;
        lpfnCallBack (lpSob);
    }

    GlobalUnlock(hgbl);
    LocalFree (MultiByteIword);
    LocalFree (sob.wordlist);
    return NULL;
}
// Enumerates every analysis of a double-byte word through a callback.
//   hStm         : session handle.
//   lpWord       : NUL-terminated input word.
//   lpSob        : in  - wordlist buffer and capacity (sch);
//                  out - refilled with one analysis before each callback.
//   lpfnCallBack : invoked once per analysis with lpSob.
// The noun, stand-alone and verb analysers are always run; the compound
// analyser is used only when none of them produced a result.
// Returns NULL (0) after enumeration; srcInvalid when the word is 45 bytes
// or longer or contains a non-Hangul byte pair (the input is echoed back
// with num = 1); srcModuleError | srcInvalidID when the session cannot be locked.
WINSRC StemmerEnumDecompose (HSTM hStm, LPCSTR lpWord, LPDOB lpSob, LPFNDECOMPOSE lpfnCallBack)
{
    int len = lstrlen ((char *) lpWord);

    // Reject over-long words and anything that is not purely Hangul.
    BOOL analyzable = (len < 45);
    for (int pos = 0; analyzable && pos < len; pos += 2)
        if (!isHANGEUL (lpWord [pos], lpWord [pos+1]))
            analyzable = FALSE;
    if (!analyzable)
    {
        lpSob->num = 1;
        lstrcpy ((LPSTR) lpSob->wordlist, (LPSTR) lpWord);
        lpSob->len = (WORD)len;
        return srcInvalid;
    }

    STMI *pstmi;
    HGLOBAL hgbl = (HGLOBAL) hStm;
    pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    BaseEngine BaseCheck;
    char lrgsz [400];           // scratch output of one analyser
    memset (pstmi->rList.lrgsz, NULLCHAR, 400);
    lstrcpy (pstmi->rList.iword, lpWord);

    int num = BaseCheck.NLP_BASE_NOUN (lpWord, lrgsz);
    pstmi->rList.max = 0;
    if (num > 0)
    {
        lstrcpy (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
        pstmi->rList.max = num;
    }
    num = BaseCheck.NLP_BASE_ALONE (lpWord, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }
    num = BaseCheck.NLP_BASE_VERB (lpWord, lrgsz);
    if (num > 0)
    {
        lstrcat (pstmi->rList.lrgsz, lrgsz);
        for (int i = 0; i < num; i++)
            pstmi->rList.vbuf [pstmi->rList.max + i] = BaseCheck.vbuf [i];
        pstmi->rList.max += num;
    }
    // The compound result REPLACES the list, so it may only be used when the
    // list is empty.  Testing the verb count alone discarded noun and
    // stand-alone results whenever the verb analyser found nothing.
    if (pstmi->rList.max == 0)
    {
        num = BaseCheck.NLP_BASE_COMPOUND (lpWord, lrgsz);
        if (num > 0)
        {
            lstrcpy (pstmi->rList.lrgsz, lrgsz);
            for (int i = 0; i < num; i++)
                pstmi->rList.vbuf [i] = BaseCheck.vbuf [i];
            pstmi->rList.max = num;
        }
    }

    // Rewind the cache and report each analysis in turn.
    pstmi->rList.num = 0;
    pstmi->rList.next = pstmi->rList.lrgsz;
    while (GetOneResult (&(pstmi->rList), lpSob) == NULL)
        lpfnCallBack (lpSob);

    GlobalUnlock(hgbl);
    return NULL;
}
// Wide-character front end for StemmerCompose.
//   hstm  : session handle.
//   sib   : wide stem (silsa), wide ending/particle (heosa) and part of speech.
//   rword : receives the composed wide word; the caller must supply a buffer
//           large enough for the result (its size is not passed in).
// Returns the code of StemmerCompose, or srcModuleError when a temporary
// buffer cannot be allocated.
WINSRC StemmerComposeW (HSTM hstm, WCIB sib, LPWSTR rword)
{
    CIB tmpsib;
    LPSTR MultiByteRword;

    // Two bytes per wide character is the worst case for a double-byte code page.
    int len = (int) ((wcslen (sib.silsa) + 1) * 2);
    tmpsib.silsa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if ( tmpsib.silsa == NULL ) {
        return srcModuleError;
    }
    // Use the same code page as every other conversion in this module;
    // CP_ACP produced a wrongly encoded stem on systems whose ANSI code page differs.
    len = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) sib.silsa, -1, tmpsib.silsa, len, NULL, NULL);

    int len2 = (int) ((wcslen (sib.heosa) + 1) * 2);
    tmpsib.heosa = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2);
    if ( tmpsib.heosa == NULL ) {
        LocalFree(tmpsib.silsa);
        return srcModuleError;
    }
    len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, (LPCWSTR) sib.heosa, -1, tmpsib.heosa, len2, NULL, NULL);

    // len + len2 covers stem + ending + terminators.  Composition can insert
    // a linking syllable in front of the ending, so leave a few spare bytes.
    MultiByteRword = (LPSTR) LocalAlloc (LPTR, sizeof (char) * (len + len2 + 4));
    if ( MultiByteRword == NULL ) {
        LocalFree(tmpsib.silsa);
        LocalFree(tmpsib.heosa);
        return srcModuleError;
    }

    tmpsib.pos = sib.pos;
    SRC src = StemmerCompose (hstm, tmpsib, MultiByteRword);

    // Convert whatever StemmerCompose produced back to wide characters.
    len = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, NULL, 0);
    MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteRword, -1, (LPWSTR) rword, len);

    LocalFree (tmpsib.silsa);
    LocalFree (tmpsib.heosa);
    LocalFree (MultiByteRword);
    return src;
}
// Adjusts a particle (tossi) in place so that it agrees with the last
// letter of the noun stem it is attached to.
//   stem   : stem in internal letter codes; only stem[0] is examined and it
//            is treated as the LAST letter of the stem (the caller passes a
//            reversed string).
//   ending : particle in internal letter codes, rewritten in place.  Some
//            rules insert two letters at the front, so the buffer must have
//            room for strlen(ending) + 3 bytes.
//   action : flag byte; bits 0x80 and 0x40 form the "CV" class tested below.
// Returns VALID, MORECHECK or INVALID as set by the individual rules (the
// only caller visible in this file ignores the return value).
// NOTE(review): "stem[0] >= __V_k" is used as "stem ends in a vowel";
// presumably all vowel codes sort at or above __V_k -- confirm.
int CVCheckNP(char *stem, char *ending, BYTE action)
// Check vowel harmony for NOUN + Tossi. If the last letter of stem is RIEUR, that should seriously be considered.
{
// Length including the terminator, so memmove carries the NUL along.
int len = strlen (ending) + 1;
if ((action & 0x80) && (action & 0x40)) // CV = 11
return VALID;
if (!(action & 0x80) && (action & 0x40)) { // CV = 01
// CV = 01: the particle is accepted unchanged after the stems tested below;
// otherwise it is rewritten by the first matching rule.
if (stem[0] >= __V_k)
return VALID;
if (stem[0] == __K_R && ending[0] == __K_R && ending[1] == __V_h)
// Tossi is "RO"(CV=01) and the last letter of stem is RIEUR.
return VALID;
if (ending[0] == __K_S && ending[1] == __V_j) {
// "SEO" --> "E SEO"
memmove (ending+2, ending, len);
ending [0] = __K_I;
ending [1] = __V_p;
return MORECHECK;
}
if (ending[0] == __K_N && ending[1] == __V_m && ending[2] == __K_N) {
// "NEUN" --> "EUN"
// No return here: control continues through the remaining rules.
ending [0] = __K_I;
}
if (ending[0] == __K_G && ending[1] == __V_k) {
// "GA" --> "I"
ending[0] = __K_I;
ending[1] = __V_l;
return MORECHECK;
}
if (ending[0] == __K_I && ending[1] == __V_hk) {
// "WA" --> "GWA"
ending [0] = __K_G;
return MORECHECK;
}
if (ending [0] == __K_R) {
if (ending[1] == __V_m && ending[2] == __K_R) {
// "REUL" --> "EUL"
ending [0] = __K_I;
return INVALID;
}
if (ending[1] == __V_h) {
// "RO" --> "EU RO"
memmove (ending+2, ending, len);
ending [0] = __K_I;
ending [1] = __V_m;
return MORECHECK;
}
// add "I" to the first part of ending
memmove (ending+2, ending, len);
ending [0] = __K_I;
ending [1] = __V_l;
return MORECHECK;
}
if ((ending [0] == __K_N) ||
(ending [0] == __K_S && ending [1] == __V_l) || // "SI"
(ending [0] == __K_I && ending [1] == __V_u) || // "YEO"
(ending[0] == __K_I && ending[1] == __V_i && ending[2] == __K_M // "YA MAL RO" --> "I YA MAL RO"
&& ending[3] == __V_k && ending[4] == __K_R && ending[5] == __K_R && ending[6] == __V_h))
{
// Add "I" to the first part of ending
memmove (ending+2, ending, len);
ending [0] = __K_I;
ending [1] = __V_l;
return MORECHECK;
}
return MORECHECK;
}
// CV==10
// Reached for every other flag combination (the comment above labels it CV = 10).
if (stem[0] >= __V_k) {
if (ending [0] == __K_G) {
// "GWA" --> "WA"
ending [0] = __K_I;
return MORECHECK;
}
if (ending[1] == __V_l) {
// len == 3 means the particle is exactly two letters long.
if (len == 3) {
// "I" --> "GA"
ending [0] = __K_G;
ending [1] = __V_k;
return MORECHECK;
}
else {
// remove "I"
memmove (ending, ending+2, len-2);
return INVALID;
}
}
if (ending[1] == __V_k)
{
// Second letter __V_k is replaced by __V_i.
ending [1] = __V_i;
return MORECHECK;
}
if (ending[2] == __K_N) {
// "EUN" --> "NEUN"
ending [0] = __K_N;
return MORECHECK;
}
// len == 4 means the particle is exactly three letters long.
if (len == 4) {
// "EUL" --> "REUL"
ending [0] = __K_R;
return MORECHECK;
}
else {
// Remove "EU"
memmove (ending, ending+2, len-2);
return INVALID;
}
}
// Stem ends in RIEUR and the particle starts with "EU RO": drop the "EU".
if (stem[0] == __K_R && ending[0] == __K_I && ending[1] == __V_m
&& ending[2] == __K_R && ending[3] == __V_h) {
// Remove "EU"
memmove (ending, ending+2, len-2);
return INVALID;
}
return VALID;
}
// Composes a surface word from a stem (sib.silsa) and an ending or particle
// (sib.heosa), according to the part of speech in sib.pos.
//   hstm  : session handle.
//   sib   : stem, ending/particle and part-of-speech word.
//   rword : receives the composed word; the caller must supply a buffer
//           large enough for stem + ending plus one inserted syllable.
// Returns NULL (0) on success.  When either input contains a non-Hangul
// byte pair, the two strings are simply concatenated and NULL is returned.
// Returns srcComposeError (rword = stem + ending) when the particle is not
// recognised, an irregular-verb rule reports COMPOSE_ERROR, or the part of
// speech is not handled; srcModuleError (| srcInvalidID) on allocation or
// lock failure.
WINSRC StemmerCompose (HSTM hstm, CIB sib, LPSTR rword)
{
    STMI *pstmi;
    HGLOBAL hgbl = (HGLOBAL) hstm;
    int ret, i;
    BYTE action;
    SRC src = NULL;
    CODECONVERT conv;
    // All work buffers start as NULL so the shared cleanup can free them
    // unconditionally (LocalFree accepts NULL).
    char *incode = NULL;
    char *inheosa = NULL;
    LPSTR tmptossi = NULL;
    char *inending = NULL;
    char *inrword = NULL;
    int silsaLen, heosaLen;

    pstmi = (STMI *)GlobalLock(hgbl);
    if (pstmi == NULL)
        return srcModuleError | srcInvalidID;   // nothing was locked, so nothing to unlock

    // Default result is the bare stem; non-Hangul input is just concatenated.
    lstrcpy (rword, (char *)sib.silsa);
    for (i = 0; sib.silsa [i] != 0; i += 2)
        if (!isHANGEUL (sib.silsa [i], sib.silsa [i+1]))
        {
            lstrcat (rword, sib.heosa);
            GlobalUnlock (hgbl);        // this early return used to leak the lock
            return NULL;
        }
    for (i = 0; sib.heosa [i] != 0; i +=2)
        if (!isHANGEUL (sib.heosa [i], sib.heosa [i+1]))
        {
            lstrcat (rword, sib.heosa);
            GlobalUnlock (hgbl);        // this early return used to leak the lock
            return NULL;
        }

    silsaLen = lstrlen (sib.silsa);
    heosaLen = lstrlen (sib.heosa);

    incode   = (char *) LocalAlloc (LPTR, sizeof (char) * (silsaLen*3+1 + heosaLen*3+7));
    inheosa  = (char *) LocalAlloc (LPTR, sizeof (char) * (heosaLen*3+7));
    // tmptossi receives a copy of heosa and later the adjusted particle, which
    // CVCheckNP may have lengthened by one syllable; heosaLen*2 alone is too
    // small for short particles (and is zero bytes for an empty one).
    tmptossi = (LPSTR)  LocalAlloc (LPTR, sizeof (char) * (heosaLen*2 + 3));
    inending = (char *) LocalAlloc (LPTR, sizeof (char) * (heosaLen*3+7));
    inrword  = (char *) LocalAlloc (LPTR, sizeof (char) * (silsaLen*3+heosaLen*3+6));
    if (incode == NULL || inheosa == NULL || tmptossi == NULL || inending == NULL || inrword == NULL)
    {
        src = srcModuleError;
        goto Cleanup;
    }

    conv.HAN2INS (sib.silsa, incode, codeWanSeong);
    conv.HAN2INR (sib.heosa, inheosa, codeWanSeong);

    switch (sib.pos & 0x0f00)
    {
        case POS_NOUN :
        case POS_PRONOUN :
        case POS_NUMBER :
            lstrcpy (tmptossi, sib.heosa);
            if (FindHeosaWord (inheosa, _TOSSI, &action) & FINAL)
            {
                conv.ReverseIN (inheosa, inending);
                conv.ReverseIN (incode, inrword);
                // Adjusts inending in place; the status code is not needed here.
                CVCheckNP (inrword, inending, action);
                conv.INS2HAN (inending, tmptossi, codeWanSeong);
                // we should check contraction tossi, for example, Nieun, Rieul
                for (i = 0; ContractTossi [i].contract != NULL; i++)
                    if (lstrcmp (ContractTossi [i].contract, tmptossi)==0)
                        conv.HAN2INS ((char *)tmptossi, inending, codeWanSeong);
                lstrcat (incode, inending);
                conv.INS2HAN(incode, (char *)rword, codeWanSeong);
                // src stays NULL; incode is released by the shared cleanup
                // (it used to be leaked on this path).
            }
            else
            {
                // Unknown particle: fall back to plain concatenation.
                lstrcat (rword, tmptossi);
                src = srcComposeError;
            }
            break;
        case POS_VERB :
        case POS_ADJECTIVE :
        case POS_AUXVERB :
        case POS_AUXADJ :
            // For predicates the stem is converted with HAN2INR and the ending with HAN2INS.
            conv.HAN2INS ((char *)sib.heosa, inending, codeWanSeong);
            conv.HAN2INR ((char *)sib.silsa, incode, codeWanSeong);
            // Try each irregular conjugation in turn; the first rule that
            // does not answer NOT_COMPOSED decides the outcome.
            if ((ret = Compose_RIEUL_Irregular (incode, inending)) != NOT_COMPOSED)
                goto ErrorCheck;
            if ((ret = Compose_HIEUH_Irregular (incode, inending)) != NOT_COMPOSED)
                goto ErrorCheck;
            if ((ret = Compose_PIEUP_Irregular (incode, inending)) != NOT_COMPOSED)
                goto ErrorCheck;
            if ((ret = Compose_TIEUT_Irregular (incode, inending)) != NOT_COMPOSED)
                goto ErrorCheck;
            if ((ret = Compose_SIOS_Irregular (incode, inending)) != NOT_COMPOSED)
                goto ErrorCheck;
            if (Compose_YEO_Irregular (incode, inending))
                goto Quit;
            if (Compose_REO_REU_Irregular (incode, inending))
                goto Quit;
            if (Compose_GEORA_Irregular (incode, inending))
                goto Quit;
            Compose_Regular (incode, inending);
ErrorCheck :
            if (ret == COMPOSE_ERROR)
            {
                lstrcat (rword, sib.heosa);
                src = srcComposeError;
                break;
            }
Quit:
            conv.ReverseIN (incode, inrword);
            lstrcat (inrword, inending);
            conv.INS2HAN (inrword, (char *)rword, codeWanSeong);
            break;
        default :
            // Part of speech not handled: plain concatenation, reported as an error.
            lstrcat (rword, sib.heosa);
            src = srcComposeError;
            break;
    }

Cleanup:
    LocalFree (incode);
    LocalFree (inheosa);
    LocalFree (tmptossi);
    LocalFree (inending);
    LocalFree (inrword);
    GlobalUnlock (hgbl);
    return src;
}
// Destroys a stemmer session and releases its memory block.
// Returns NULL (0) on success, srcModuleError | srcInvalidID when the
// handle cannot be locked (the block is not freed in that case).
WINSRC StemmerTerminate(HSTM hstm)
{
    HGLOBAL hSession = (HGLOBAL) hstm;

    // Locking is used purely as a validity probe for the handle.
    const BOOL usable = ((STMI *) GlobalLock (hSession) != NULL);
    GlobalUnlock (hSession);

    if (!usable)
    {
        return srcModuleError | srcInvalidID;
    }

    GlobalFree (hSession);
    return NULL; //normal operation
}
// Placeholder: performs no work and always reports success (NULL / 0).
// Both arguments are ignored.
WINSRC StemmerOpenUdr (HSTM stmi, LPCSTR lpPathUdr)
{
    (void) stmi;
    (void) lpPathUdr;
    return NULL;
}
// Placeholder: performs no work and always reports success (NULL / 0).
// The session argument is ignored.
WINSRC StemmerCloseUdr (HSTM stmi)
{
    (void) stmi;
    return NULL;
}
// Wide-character front end for StemmerCompare.
//   lpStr1, lpStr2        : the two wide words to compare.
//   lpStem                : receives the common stem.
//   lpEnding1, lpEnding2  : receive the ending of each word.
//   pos                   : receives the part-of-speech word.
// The three output buffers must be supplied by the caller and be large
// enough for the results (their sizes are not passed in).  When the narrow
// comparison fails without writing its outputs, empty strings are returned.
// Returns the code of StemmerCompare, or srcModuleError when a temporary
// buffer cannot be allocated.
WINSRC StemmerCompareW (HSTM hstm, LPCWSTR lpStr1, LPCWSTR lpStr2, LPWSTR lpStem, LPWSTR lpEnding1, LPWSTR lpEnding2, WORD *pos)
{
    LPSTR MultiByteStr1, MultiByteStr2, MultiByteStem, MultiByteEnding1, MultiByteEnding2;

    int len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, NULL, 0, NULL, NULL);
    MultiByteStr1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len1);
    if (MultiByteStr1 == NULL ) {
        return srcModuleError;
    }
    len1 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr1, -1, MultiByteStr1, len1, NULL, NULL);

    int len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, NULL, 0, NULL, NULL);
    MultiByteStr2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len2);
    if (MultiByteStr2 == NULL ) {
        LocalFree(MultiByteStr1);
        return srcModuleError;
    }
    len2 = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr2, -1, MultiByteStr2, len2, NULL, NULL);

    // Work buffers are sized after the longer input and zero-filled (LPTR),
    // so they hold empty strings if StemmerCompare never writes them.
    int len = len1 > len2 ? len1 : len2;
    MultiByteStem = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteStem == NULL ) {
        LocalFree(MultiByteStr1);
        LocalFree(MultiByteStr2);
        return srcModuleError;
    }
    MultiByteEnding1 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteEnding1 == NULL ) {
        LocalFree(MultiByteStr1);
        LocalFree(MultiByteStr2);
        LocalFree(MultiByteStem);
        return srcModuleError;
    }
    MultiByteEnding2 = (LPSTR) LocalAlloc (LPTR, sizeof (char) * len);
    if (MultiByteEnding2 == NULL ) {
        LocalFree(MultiByteStr1);
        LocalFree(MultiByteStr2);
        LocalFree(MultiByteStem);
        LocalFree(MultiByteEnding1);
        return srcModuleError;
    }

    SRC src = StemmerCompare(hstm, MultiByteStr1, MultiByteStr2, MultiByteStem, MultiByteEnding1, MultiByteEnding2, pos);

    // sizeof on an LPWSTR parameter is the size of a pointer, not of the
    // caller's buffer, which truncated or failed the conversion for anything
    // but very short strings.  Ask for the required length instead.
    int cch = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteStem, -1, NULL, 0);
    MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteStem, -1, lpStem, cch);
    cch = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding1, -1, NULL, 0);
    MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding1, -1, lpEnding1, cch);
    cch = MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding2, -1, NULL, 0);
    MultiByteToWideChar(UWANSUNG_CODE_PAGE, 0, MultiByteEnding2, -1, lpEnding2, cch);

    LocalFree (MultiByteStr1);
    LocalFree (MultiByteStr2);
    LocalFree (MultiByteStem);
    LocalFree (MultiByteEnding1);
    LocalFree (MultiByteEnding2);
    return src;
}
WINSRC StemmerCompare (HSTM hstm, LPCSTR lpStr1, LPCSTR lpStr2, LPSTR lpStem, LPSTR lpEnding1, LPSTR lpEnding2, WORD *pos)
{
// First, check the chosung of two strings
// if they are different, we may not use stemming.
CODECONVERT conv;
char inheosa1 [80], inheosa2 [80];
BYTE action;
char *incodeStr1 = new char [lstrlen (lpStr1) * 4 + 1];
char *incodeStr2 = new char [lstrlen (lpStr2) * 4 + 1];
conv.HAN2INS ((char *)lpStr1, incodeStr1, codeWanSeong);
conv.HAN2INS ((char *)lpStr2, incodeStr2, codeWanSeong);
if (incodeStr1 [0] != incodeStr2 [0])
return srcInvalid;
if (incodeStr1 [1] != incodeStr2 [1])
{
return srcInvalid;
}
delete incodeStr1;
delete incodeStr2;
STMI *pstmi;
HGLOBAL hgbl = (HGLOBAL) hstm;
pstmi = (STMI *)GlobalLock(hgbl);
if (pstmi == NULL)
{
GlobalUnlock(hgbl);
return srcModuleError | srcInvalidID;
}
BaseEngine BaseCheck;
char stem1[10][100], stem2[10][100], ending1[10][100], ending2[10][100], lrgsz [400];
int num1, num2, count;
WORD winfo [10];
if ((pstmi->Option & SO_NOUNPHRASE) && (pstmi->Option & (SO_NP_NOUN | SO_NP_PRONOUN | SO_NP_NUMBER | SO_NP_DEPENDENT)))
{
int num = BaseCheck.NLP_BASE_NOUN (lpStr1, lrgsz);
BOOL first = TRUE;
for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
{
count = 0;
while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t')
count++;
if (first)
{
memcpy (stem1 [l], lrgsz+index, count);
stem1 [l][count] = '\0';
winfo [l] = BaseCheck.vbuf [i];
first = FALSE;
}
else
{
memcpy (ending1 [l]+index2, lrgsz+index, count);
index2 += count;
}
if (lrgsz[index+count] == '\t')
{
ending1 [l][index2] = '\0';
l++;
first = TRUE;
index2 = 0;
}
index += (count + 1);
}
num1 = l;
num = BaseCheck.NLP_BASE_NOUN (lpStr2, lrgsz);
for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
{
count = 0;
while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t')
count++;
if (first)
{
memcpy (stem2 [l], lrgsz+index, count);
stem2 [l][count] = '\0';
first = FALSE;
}
else
{
memcpy (ending2 [l]+index2, lrgsz+index, count);
index2 += count;
}
if (lrgsz[index+count] == '\t')
{
ending2 [l][index2] = '\0';
l++;
first = TRUE;
index2 = 0;
}
index += (count + 1);
}
num2 = l;
int j;
for (i = 0; i < num1; i++)
{
for (j = 0; j < num2; j++)
if (lstrcmp (stem1[i], stem2 [j]) == 0)
break;
if (j != num2)
break;
}
if (i != num1)
{
lstrcpy (lpStem, stem1 [i]);
lstrcpy (lpEnding1, ending1 [i]);
lstrcpy (lpEnding2, ending2 [j]);
*pos = winfo [i];
GlobalUnlock (hgbl);
return NULL;
}
}
if (pstmi->Option & (SO_PREDICATE | SO_AUXILIARY))
{
int num = BaseCheck.NLP_BASE_VERB (lpStr1, lrgsz);
BOOL first = TRUE;
for (int i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
{
count = 0;
while (lrgsz [index+count] != '+' && lrgsz[index+count] != '\t')
count++;
if (first)
{
memcpy (stem1 [l], lrgsz+index, count);
stem1 [l][count] = '\0';
winfo [l] = BaseCheck.vbuf [i];
first = FALSE;
}
else
{
memcpy (ending1 [l]+index2, lrgsz+index, count);
index2 += count;
}
if (lrgsz[index+count] == '\t')
{
ending1 [l][index2] = '\0';
l++;
first = TRUE;
index2 = 0;
}
index += (count + 1);
}
num1 = l;
num = BaseCheck.NLP_BASE_VERB (lpStr2, lrgsz);
for (i = 0, index = 0, l = 0, index2 = 0; i < num; i++)
{
count = 0;
while (lrgsz [index+count] != '+' && lrgsz [index+count] != '\t')
count++;
if (first)
{
memcpy (stem2 [l], lrgsz+index, count);
stem2 [l][count] = '\0';
first = FALSE;
}
else
{
memcpy (ending2 [l]+index2, lrgsz+index, count);
index2 += count;
}
if (lrgsz[index+count] == '\t')
{
ending2 [l][index2] = '\0';
l++;
first = TRUE;
index2 = 0;
}
index += (count + 1);
}
num2 = l;
int j;
for (i = 0; i < num1; i++)
{
for (j = 0; j < num2; j++)
if (lstrcmp (stem1[i], stem2 [j]) == 0)
break;
if (j != num2)
break;
}
if (i != num1)
{
lstrcpy (lpStem, stem1 [i]);
lstrcpy (lpEnding1, ending1 [i]);
lstrcpy (lpEnding2, ending2 [j]);
*pos = winfo [i];
GlobalUnlock (hgbl);
return NULL;
}
}
// for proper noun, for example, name
if (pstmi->Option & SO_NP_PROPER)
{
int len1 = lstrlen(lpStr1);
int len2 = lstrlen(lpStr2);
int shortlen = len1 > len2 ? len2 : len1;
if (strncmp (lpStr1, lpStr2, shortlen) == 0)
{
lstrcpy (lpStem, lpStr1);
lpStem [shortlen] = '\0';
char index [1];
index[0] = 'm';
CODECONVERT Conv;
BOOL res1 = TRUE, res2= TRUE;
lstrcpy (lpEnding1, lpStr1 + shortlen);
lstrcpy (lpEnding2, lpStr2 + shortlen);
if (lstrlen (lpEnding1))
{
Conv.HAN2INS ((char *)lpEnding1, inheosa1, codeWanSeong);
if (!(FindHeosaWord(inheosa1, _TOSSI, &action) & FINAL))
res1 = FALSE;
}
if (lstrlen (lpEnding2))
{
Conv.HAN2INS ((char *)lpEnding2, inheosa2, codeWanSeong);
if (!(FindHeosaWord(inheosa2, _TOSSI, &action) & FINAL))
res2 = FALSE;
}
if (res1 && res2)
{
*pos = POS_NOUN | PROPER_NOUN;
GlobalUnlock (hgbl);
return NULL;
}
}
}
GlobalUnlock (hgbl);
return srcInvalid;
}
// Wide-character front end for StemmerIsEnding: converts lpStr to the
// UWANSUNG code page and forwards the query.
// Returns the code of StemmerIsEnding, or srcModuleError when the temporary
// buffer cannot be allocated.
WINSRC StemmerIsEndingW (HSTM hstm, LPCWSTR lpStr, UINT flag, BOOL *found)
{
    // First pass asks for the size of the converted string.
    int cbNeeded = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, NULL, 0, NULL, NULL);

    LPSTR narrow = (LPSTR) LocalAlloc (LPTR, cbNeeded);
    if (narrow == NULL)
    {
        return srcModuleError;
    }

    // Second pass performs the conversion.
    cbNeeded = WideCharToMultiByte (UWANSUNG_CODE_PAGE, 0, lpStr, -1, narrow, cbNeeded, NULL, NULL);

    SRC result = StemmerIsEnding (hstm, narrow, flag, found);

    LocalFree (narrow);
    return result;
}
// Tests whether lpStr is a known particle (tossi) and/or verbal ending.
//   flag  : IS_TOSSI, IS_ENDING, or IS_TOSSI | IS_ENDING.
//   found : set to TRUE when the string is recognised, FALSE otherwise
//           (only written once the lookup actually runs).
// Returns NULL (0) after a lookup, srcModuleError for any other flag value
// or on allocation failure, srcModuleError | srcInvalidID when the session
// cannot be locked.
WINSRC StemmerIsEnding (HSTM hstm, LPCSTR lpStr, UINT flag, BOOL *found)
{
    BOOL wantTossi, wantEnding;

    if (flag == IS_TOSSI)
    {
        wantTossi = TRUE;
        wantEnding = FALSE;
    }
    else if (flag == IS_ENDING)
    {
        wantTossi = FALSE;
        wantEnding = TRUE;
    }
    else if (flag == (IS_TOSSI | IS_ENDING))
    {
        wantTossi = TRUE;
        wantEnding = TRUE;
    }
    else
    {
        return srcModuleError;
    }

    HGLOBAL hSession = (HGLOBAL) hstm;
    STMI *pSession = (STMI *) GlobalLock (hSession);
    if (pSession == NULL)
    {
        GlobalUnlock (hSession);
        return srcModuleError | srcInvalidID;
    }

    BYTE action;
    char *internal = (char *) LocalAlloc (LPTR, lstrlen (lpStr) * 4 + 1);
    if (internal == NULL)
    {
        GlobalUnlock (hSession);
        return srcModuleError;
    }

    CODECONVERT Conv;
    Conv.HAN2INR ((char *) lpStr, internal, codeWanSeong);

    *found = FALSE;

    // A particle hit settles the question, so the ending lookup is skipped.
    if (wantTossi && (FindHeosaWord (internal, _TOSSI, &action) & FINAL))
    {
        *found = TRUE;
        wantEnding = FALSE;
    }

    // The ending lookup requires the result to be exactly FINAL.
    if (wantEnding && FindHeosaWord (internal, _ENDING, &action) == FINAL)
    {
        *found = TRUE;
    }

    LocalFree (internal);
    GlobalUnlock (hSession);
    return NULL;
}
/*
BOOL WINAPI DllMain (HINSTANCE hDLL, DWORD dwReason, LPVOID lpReserved){
extern char TempJumpNum [], TempSujaNum [], TempBaseNum [], TempNumNoun [], TempSuffixOut [];
extern char bTemp [], TempETC [], TempDap [];
extern LenDict JumpNum;
extern LenDict SujaNum;
extern LenDict BaseNum;
extern LenDict NumNoun;
extern LenDict Suffix;
extern LenDict B_Dict;
extern LenDict T_Dict;
extern LenDict Dap;
switch(dwReason) {
case DLL_PROCESS_ATTACH :
JumpNum.InitLenDict(TempJumpNum, 5, 5);
SujaNum.InitLenDict(TempSujaNum, 8, 27);
BaseNum.InitLenDict(TempBaseNum, 5, 3);
NumNoun.InitLenDict(TempNumNoun, 8, 32);
Suffix.InitLenDict(TempSuffixOut, 8, 8);
B_Dict.InitLenDict(bTemp, 5, 1);
T_Dict.InitLenDict(TempETC, 10, 7);
Dap.InitLenDict(TempDap, 5, 1);
break ;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
break;
case DLL_PROCESS_DETACH :
break ;
} //switch
return TRUE ;
}
*/