999 lines
31 KiB
C
999 lines
31 KiB
C
|
/*************************************************************************
|
||
|
* *
|
||
|
* BREAKER.C *
|
||
|
* *
|
||
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
||
|
* All Rights reserved. *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Module Intent *
|
||
|
* Word breaker module *
|
||
|
* This module provides word-breaking routines applicable to the ANSI *
|
||
|
* character-set. This means American English. *
|
||
|
* Note that ANSI does not mean ASCII. *
|
||
|
* *
|
||
|
* WARNING: Tab setting is 4 for this file *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Current Owner: BinhN *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Released by Development: (date) *
|
||
|
* *
|
||
|
*************************************************************************/
|
||
|
#include <verstamp.h>
|
||
|
SETVERSIONSTAMP(MVBK);
|
||
|
|
||
|
#include <mvopsys.h>
|
||
|
|
||
|
#include <iterror.h>
|
||
|
#include <mvsearch.h>
|
||
|
#include "common.h"
|
||
|
|
||
|
/* Macros to access structure's members */
|
||
|
|
||
|
/* CP_CLASS(p): low 8 bits of the Class member of the character-map entry
 * that p points to.
 * CP_NORMC(p): Norm member of the character-map entry that p points to.
 *
 * The argument is parenthesized before the cast so that an expression
 * argument (for example "base + offset") is converted as a whole rather
 * than having the cast bind to its first operand only.
 */
#define CP_CLASS(p) (((LPCMAP)(p))->Class & 0xff)
#define CP_NORMC(p) (((LPCMAP)(p))->Norm)
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* INTERNAL PRIVATE FUNCTIONS
|
||
|
* All of them should be declared near
|
||
|
*************************************************************************/
|
||
|
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS, WORD);
|
||
|
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
|
||
|
LPCMAP lpCharPropTab, LPB lpbLigatureTab, WORD wcLigature);
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* SINGLE TO DOUBLE-WIDTH KATAKANA MAPPING ARRAY
|
||
|
*
|
||
|
*************************************************************************/
|
||
|
|
||
|
// Single-width to double-width Katakana mapping array.
//
// Entry i holds the two bytes { lead, trail } of the double-width
// character that replaces the single-width byte (161 + i); the table
// therefore covers input bytes 161 through 223 (63 entries).
//
static const int mtable[][2] = {
    /* 161-168 */
    {129, 66}, {129,117}, {129,118}, {129, 65},
    {129, 69}, {131,146}, {131, 64}, {131, 66},
    /* 169-176 */
    {131, 68}, {131, 70}, {131, 72}, {131,131},
    {131,133}, {131,135}, {131, 98}, {129, 91},
    /* 177-184 */
    {131, 65}, {131, 67}, {131, 69}, {131, 71},
    {131, 73}, {131, 74}, {131, 76}, {131, 78},
    /* 185-192 */
    {131, 80}, {131, 82}, {131, 84}, {131, 86},
    {131, 88}, {131, 90}, {131, 92}, {131, 94},
    /* 193-200 */
    {131, 96}, {131, 99}, {131,101}, {131,103},
    {131,105}, {131,106}, {131,107}, {131,108},
    /* 201-208 */
    {131,109}, {131,110}, {131,113}, {131,116},
    {131,119}, {131,122}, {131,125}, {131,126},
    /* 209-216 */
    {131,128}, {131,129}, {131,130}, {131,132},
    {131,134}, {131,136}, {131,137}, {131,138},
    /* 217-223 */
    {131,139}, {131,140}, {131,141}, {131,143},
    {131,147}, {129, 74}, {129, 75}
};
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API INDEX RETRIEVAL
|
||
|
*
|
||
|
* @func LPIBI FAR PASCAL | BreakerInitiate |
|
||
|
* Allocates a breaker parameter block. This parameter block keeps
|
||
|
* track of the breaker's "global" variables.
|
||
|
*
|
||
|
* @rdesc NULL if the call fails (ie. no more memory)
|
||
|
* a pointer to the block if it succeeds.
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void)
{
    _LPIBI lpibi;
    HANDLE hibi;

    /* Allocate a zero-filled block: every field that is not explicitly
     * set below starts out as zero.
     */
    hibi = GlobalAlloc(GMEM_MOVEABLE | GMEM_ZEROINIT, sizeof(IBI));
    if (hibi == NULL)
        return NULL;

    /* The block stays locked for its whole lifetime; BreakerFree()
     * unlocks and frees it through the handle saved in the block.
     */
    lpibi = (_LPIBI)GlobalLock(hibi);
    if (lpibi == NULL) {
        /* Locking failed: release the handle rather than leak it, and
         * report the failure the same way as an allocation failure.
         */
        GlobalFree(hibi);
        return NULL;
    }

    lpibi->hibi = hibi;
    return lpibi;
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API INDEX RETRIEVAL
|
||
|
*
|
||
|
* @func void FAR PASCAL | BreakerFree |
|
||
|
* Frees a word-breaker parameter block.
|
||
|
*
|
||
|
* @parm LPIBI | lpibi |
|
||
|
* Pointer to the InternalBreakInfo Structure containing all the
|
||
|
* informations about states
|
||
|
*************************************************************************/
|
||
|
PUBLIC void EXPORT_API FAR PASCAL BreakerFree(_LPIBI lpibi)
{
    /* A NULL block is silently ignored. */
    if (lpibi != NULL) {
        /* Copy the handle out first: it lives inside the block that is
         * about to be unlocked and freed.
         */
        HANDLE hBlock = lpibi->hibi;

        GlobalUnlock(hBlock);
        GlobalFree(hBlock);
    }
}
|
||
|
|
||
|
// - - - - - - - - -
|
||
|
|
||
|
// Break words out from a block of standard text characters.
|
||
|
//
|
||
|
// This routine is incredibly important. Any change in the performance
|
||
|
// of this function will have immediate and obvious influence upon the
|
||
|
// performance of the indexing system as a whole. Consequently, the
|
||
|
// function should be very fast.
|
||
|
//
|
||
|
// This function uses a simple state machine to try to achieve the
|
||
|
// necessary speed. It's in a different loop depending upon what kind
|
||
|
// of characters it's trying to find, and it uses "goto" statements to
|
||
|
// shift back and forth between "states".
|
||
|
//
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL INDEX
|
||
|
*
|
||
|
* @func ERR | FBreakWords |
|
||
|
* This function breaks a string into a sequence of words.
|
||
|
*
|
||
|
* @parm LPBRK_PARMS | lpBrkParms |
|
||
|
* Pointer to structure containing all the parameters needed for
|
||
|
* the breaker. They include:
|
||
|
* 1/ Pointer to the InternalBreakInfo
|
||
|
* 2/ Pointer to input buffer containing the word stream
|
||
|
* 3/ Size of the input buffer
|
||
|
* 4/ Offset in the source text of the first byte of the input buffer
|
||
|
* 5/ Pointer to user's parameter block for the user's function
|
||
|
* 6/ User's function to call with words. The format of the call should
|
||
|
* be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord, LCB lcb,
|
||
|
* LPV lpvUser)
|
||
|
* The function should return S_OK if succeeded
|
||
|
* The function can be NULL
|
||
|
* 7/ Pointer to stop word table. This table contains stop words specific
|
||
|
* to this breaker. If this is non-null, then the function
|
||
|
* will flag errors for stop word present in the query
|
||
|
* 8/ Pointer to character table. If NULL, then the default built-in
|
||
|
* character table will be used
|
||
|
*
|
||
|
* @rdesc
|
||
|
* The function returns S_OK if succeeded. The failure's causes
|
||
|
* are:
|
||
|
* @flag E_WORDTOOLONG | Word too long
|
||
|
* @flag errors | returned by the lpfnfOutWord
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS lpBrkParms)
{
    ERR errBreak;

    /* Break only: the stem flag is passed as FALSE. */
    errBreak = WordBreakStem(lpBrkParms, FALSE);
    return errBreak;
}
|
||
|
|
||
|
#if 0
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL INDEX
|
||
|
*
|
||
|
* @func ERR | FBreakAndStemWords |
|
||
|
* This function breaks a string into a sequence of words and
|
||
|
* stems each resulting word
|
||
|
*
|
||
|
* @parm LPBRK_PARMS | lpBrkParms |
|
||
|
* Pointer to structure containing all the parameters needed for
|
||
|
* the breaker. They include:
|
||
|
* 1/ Pointer to the InternalBreakInfo
|
||
|
* 2/ Pointer to input buffer containing the word stream
|
||
|
* 3/ Size of the input bufer
|
||
|
* 4/ Offset in the source text of the first byte of the input buffer
|
||
|
* 5/ Pointer to user's parameter block for the user's function
|
||
|
* 6/ User's function to call with words. The format of the call should
|
||
|
* be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord, LCB lcb,
|
||
|
* LPV lpvUser)
|
||
|
* The function should return S_OK if succeeded
|
||
|
* The function can be NULL
|
||
|
* 7/ Pointer to stop word table. This table contains stop words specific
|
||
|
* to this breaker. If this is non-null, then the function
|
||
|
* will flag errors for stop word present in the query
|
||
|
* 8/ Pointer to character table. If NULL, then the default built-in
|
||
|
* character table will be used
|
||
|
*
|
||
|
* @rdesc
|
||
|
* The function returns S_OK if succeeded. The failure's causes
|
||
|
* are:
|
||
|
* @flag E_WORDTOOLONG | Word too long
|
||
|
* @flag Other errors | returned by the lpfnfOutWord
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC ERR EXPORT_API FAR PASCAL FBreakAndStemWords(LPBRK_PARMS lpBrkParms)
{
    ERR errBreak;

    /* Break and stem: the stem flag is passed as TRUE. */
    errBreak = WordBreakStem(lpBrkParms, TRUE);
    return errBreak;
}
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* Returns the CHARTABVER constant. */
PUBLIC ERR EXPORT_API FAR PASCAL BreakerVersion(void)
{
    ERR verCharTab = CHARTABVER;

    return verCharTab;
}
|
||
|
|
||
|
// This exists only to enable MVJK to link statically.
|
||
|
// We must have the same function names for the static build.
|
||
|
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms)
{
    /* Stub: the argument is ignored and the call always fails. */
    (void)lpBrkParms;

    return E_NOTSUPPORTED;
}
|
||
|
|
||
|
// This exists only to enable MVJK to link statically.
|
||
|
// We must have the same function names for the static build.
|
||
|
PUBLIC ERR FAR PASCAL FSelectWord(LPCSTR pBuffer, DWORD dwCount,
    DWORD dwOffset, LPDWORD pStart, LPDWORD pEnd)
{
    /* Stub: every argument is ignored and the call always fails. */
    (void)pBuffer;
    (void)dwCount;
    (void)dwOffset;
    (void)pStart;
    (void)pEnd;

    return E_NOTSUPPORTED;
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc INTERNAL
|
||
|
*
|
||
|
* @func ERR | WordBreakStem |
|
||
|
* This function breaks a string into a sequence of words and
|
||
|
* stems each resulting word
|
||
|
*
|
||
|
* @parm BYTE | fStem |
|
||
|
* If set, stem the word
|
||
|
*
|
||
|
* @rdesc
|
||
|
* The function returns S_OK if succeeded. The failure's causes
|
||
|
* are:
|
||
|
* @flag E_WORDTOOLONG | Word too long
|
||
|
* @flag Other errors | returned by the lpfnfOutWord
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem)
{
    register LPB lpbRawWord;    // Write cursor in the RawWord buffer
    register LPB lpbNormWord;   // Write cursor in the NormWord buffer
    LPCMAP  lpCharPropTab;      // Property entry of the current character
    LPB     lpbInBuffer;        // Read cursor in the input buffer
    LPB     lpbRawWordLimit;    // Limit of the RawWord buffer
    BYTE    bCurChar;           // Current character
    BYTE    fScan = TRUE;       // Cleared when called with no input (flush)
    ERR     fRet;
    LPB     lpbLigature = NULL;
    WORD    wcLigature = 0;
    LPCHARTAB lpCharTab;
    LPB     astNormWord;        // Length-prefixed normalized word
    LPB     astRawWord;         // Length-prefixed raw word
    BYTE    fAcceptWildCard;

    /* Breaker parameters broken out of lpBrkParms */

    _LPIBI  lpibi;
    LPB     lpbInBuf;
    CB      cbInBufSize;
    LCB     lcbInBufOffset = 0;
    LPV     lpvUser;
    FWORDCB lpfnfOutWord;
    _LPSIPB lpsipb;
    LPCMAP  lpCMap = NULL;

    /* Stemming is not compiled into this module; the flag is accepted
     * for interface compatibility and otherwise ignored.
     */
    (void)fStem;

    /*
     *  Initialize variables
     */

    if (lpBrkParms == NULL ||
        (lpibi = lpBrkParms->lpInternalBreakInfo) == NULL)
        return E_INVALIDARG;

    astNormWord = (LPB)lpibi->astNormWord;
    astRawWord = (LPB)lpibi->astRawWord;

    lpbInBuf = lpBrkParms->lpbBuf;
    lpvUser = lpBrkParms->lpvUser;
    lpfnfOutWord = lpBrkParms->lpfnOutWord;
    lpsipb = lpBrkParms->lpStopInfoBlock;
    fAcceptWildCard = (BYTE)(lpBrkParms->fFlags & ACCEPT_WILDCARD);

    /* Both buffers carry a 2-byte length prefix; characters start at [2].
     * NOTE(review): only the raw buffer is limit-checked below. A ligature
     * adds two normalized bytes per raw byte, so this assumes astNormWord
     * is large enough for that expansion - confirm against the IBI
     * declaration.
     */
    lpbRawWordLimit = (LPB)&astRawWord[CB_MAX_WORD_LEN];
    lpbRawWord = (LPB)&astRawWord[2];
    lpbNormWord = (LPB)&astNormWord[2];

    /*
     *  Restore to the proper state.  This is in place to handle
     *  words that cross block boundaries, and to deal with explicit
     *  buffer-flush commands (a NULL input buffer).
     */
    lpbInBuffer = lpbInBuf;
    if (lpbInBuffer != NULL) {

        cbInBufSize = lpBrkParms->cbBufCount;
        lcbInBufOffset = lpBrkParms->lcbBufOffset;

        lpCharTab = lpBrkParms->lpCharTab;
        if (lpCharTab == NULL)
            return (E_INVALIDARG);

        lpCMap = (LPCMAP)(lpCharTab->lpCMapTab);
        lpbLigature = lpCharTab->lpLigature;
        wcLigature = lpCharTab->wcLigature;

        /* Every scanning state indexes the map, so it must exist. */
        if (lpCMap == NULL)
            return (E_INVALIDARG);

        /* All states except the white-space state resume a partially
         * collected word: put the cursors back at the end of what was
         * saved when the previous buffer ran out.
         */
        if (lpibi->state != SCAN_WHITE_STATE) {
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord) + 2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord) + 2];
        }

        switch (lpibi->state) {
            case SCAN_WHITE_STATE:
                goto ScanWhite;     // Running through white space.
            case SCAN_WORD_STATE:
                goto ScanWord;      // Found one 'a'..'z', collecting.
            case SCAN_NUM_STATE:
                goto ScanNumber;    // Found one '0'..'9', collecting.
            case SCAN_SEP_STATE:
                goto ScanSeparator; // Pending "possible" digit separator.
            case SCAN_LEADBYTE_STATE:
                goto ScanLeadByte;
            case SCAN_SBKANA_STATE:
                goto ScanSbKana;
            default:
                break;              // Unknown state: scan white space.
        }
    }
    else {
        cbInBufSize = fScan = 0;
        switch (lpibi->state) {
            case SCAN_WHITE_STATE:
                return S_OK;        // Still stuck in white space.
            case SCAN_WORD_STATE:
                goto FlushWord;     // Flush a word.
            case SCAN_NUM_STATE:
            case SCAN_SEP_STATE:
                // The saved lengths still include any trailing
                // punctuation (and, in the separator state, the pending
                // separator, which is counted in cbRawPunctLen).  Nothing
                // more is coming, so strip them before flushing, exactly
                // as FlushNumber does when it is reached while scanning.
                *(LPW)astRawWord = (WORD)(GETWORD(astRawWord) -
                    lpibi->cbRawPunctLen);
                *(LPW)astNormWord = (WORD)(GETWORD(astNormWord) -
                    lpibi->cbNormPunctLen);
                goto FlushNumber;   // Flush a number.
            case SCAN_LEADBYTE_STATE:
                goto ScanLeadByte;
            case SCAN_SBKANA_STATE:
                goto ScanSbKana;
            default:
                break;
        }
    }

    //
    //      W H I T E - S P A C E   S T A T E
    //
    //  While in this state the code is hunting through white-space,
    //  searching for an alpha character or a digit character.  If
    //  it finds one, it initializes the word and goes to either the
    //  word-collection state or the number-collection state.
    //
ScanWhite:
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character's class.
        //
        switch (CP_CLASS(&lpCMap[*lpbInBuffer])) {
            case CLASS_WILDCARD:
                if (fAcceptWildCard == FALSE)
                    continue;
                /* fallthrough */
            case CLASS_TYPE:        // Found the 1st byte of the special string
            case CLASS_CHAR:        // Found a non-normalized char
            case CLASS_NORM:        // Found a normalized character
            case CLASS_LIGATURE:    // Found a ligature

                // Record where the word starts and jump to the
                // word-collection state.
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanWord;

            case CLASS_DIGIT:       // Found a digit.
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpibi->cbNormPunctLen = lpibi->cbRawPunctLen = 0;
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanNumber;

            case CLASS_LEADBYTE:
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                *(LPW)astNormWord = *(LPW)astRawWord = 0;
                goto ScanLeadByte;

            case CLASS_SBKANA:
                lpibi->lcb = (DWORD)(lcbInBufOffset +
                    (lpbInBuffer - lpbInBuf));
                *(LPW)astNormWord = *(LPW)astRawWord = 0;
                lpbRawWord = (LPB)&astRawWord[2];
                lpbNormWord = (LPB)&astNormWord[2];
                goto ScanSbKana;

            default:
                break;              // Anything else is skipped.
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.
    //
    lpibi->state = SCAN_WHITE_STATE;
    return S_OK;

ScanWord:
    //
    //      W O R D   S T A T E
    //
    //  While in this state the code is attempting to append alpha
    //  and digit characters to the alpha character it's already
    //  found.  Apostrophes are stripped.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
            case CLASS_NORM:
            case CLASS_DIGIT:
            case CLASS_CHAR:
                //
                //  Found a normalized character or a digit.
                //  Append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
                break;

            case CLASS_LIGATURE:
                //
                //  Found a ligature character.  Normalize
                //  it and append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpbNormWord += LigatureMap(bCurChar, lpbNormWord,
                    lpCMap, lpbLigature, wcLigature);
                break;

            case CLASS_STRIP:
                //
                //  Found an apostrophe or somesuch.  Leave it out of
                //  the normalized word, but keep it in the raw word
                //  since it counts as part of the un-normalized
                //  word's length.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                break;

            case CLASS_TYPE:
                /* Set the flag to remind us to get the second byte. */
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                lpibi->fGotType = TRUE;
                *lpbRawWord++ = *lpbNormWord++ = bCurChar;
                break;

            case CLASS_WILDCARD:
                //
                //  Found a wildcard character.
                //  Append it to the output buffer if we accept wildcards;
                //  otherwise treat it like any other terminator.
                //
                if (fAcceptWildCard) {
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = bCurChar;
                    *lpbNormWord++ = bCurChar;
                    break;
                }
                /* fallthrough */

            default:
                if (lpibi->fGotType == TRUE) {
                    lpibi->fGotType = FALSE;

                    /* Found the 2nd byte of a special type.
                       Append it to the output buffer. */
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = *lpbNormWord++ = bCurChar;
                    break;
                }
                //
                //  Found something weird, or I have been ordered
                //  to flush the output buffer.  Flush the output
                //  buffer and go back to the "grooting through
                //  white space" state (#0).
                //
FlushWord:
                if (fScan) {
                    /* Recalculate the length only if scanning; on a
                     * flush the saved lengths are already correct.
                     */
                    *(LPW)astRawWord = (WORD)(lpbRawWord -
                        (LPB)&astRawWord[2]);
                    *(LPW)astNormWord = (WORD)(lpbNormWord -
                        (LPB)&astNormWord[2]);
                }

                /* Check for stop word if required */
                if (lpsipb) {
                    if (lpsipb->lpfnStopListLookup(lpsipb,
                        astNormWord) == S_OK) {
                        goto ScanWhite;     // Ignore stop words
                    }
                }

                /* Execute user's function */
                if (lpfnfOutWord != NULL &&
                    (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
                        lpibi->lcb, lpvUser)) != S_OK)
                    return fRet;
                goto ScanWhite;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.  If they
    //  just want me to flush, I come back to the "flush a
    //  word" state (#1f), since at this time I already have
    //  a valid word, since I got an alpha-char in state #0,
    //  and may have gotten more since.
    //
    lpibi->state = SCAN_WORD_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

ScanLeadByte:
    if (!cbInBufSize) {
        // no character - we may have lost a DBC
        //
        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    if (!GETWORD(astNormWord)) {
        // process lead byte; it is stored in both the raw and the
        // normalized word so the raw length matches the raw content.
        //
        *(LPW)astNormWord = *(LPW)astRawWord = 1;
        astRawWord[2] = astNormWord[2] = *lpbInBuffer++;
        --cbInBufSize;
    }

    if (!cbInBufSize) {
        // no more characters - set up state so we come back to get
        // the trail byte.
        //
        lpibi->state = SCAN_LEADBYTE_STATE;
        return S_OK;
    }

    // process trail byte
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 2;
    astRawWord[3] = astNormWord[3] = *lpbInBuffer++;
    --cbInBufSize;

    // flush the DBC
    //
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    if (!cbInBufSize) {
        // no more characters - we have already flushed our DBC so we
        // will just set the state back to scanning for white space.
        //
        lpibi->state = SCAN_WHITE_STATE;
        return S_OK;
    }

    // all done - go back to scanning white space.
    //
    goto ScanWhite;

ScanSbKana:
    if (!cbInBufSize) {
        // Buffer is empty.  Flush the buffer if we are holding a character.
        //
        if (GETWORD(astNormWord)) {
            if (lpfnfOutWord != NULL &&
                (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
                    lpibi->lcb, lpvUser)) != S_OK)
                return fRet;
        }

        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    // Note: The basic algorithm (including the mapping table) used here to
    // convert half-width Katakana characters to full-width Katakana appears
    // in the book "Understanding Japanese Information Processing" published
    // by O'Reilly & Associates.

    // If the RawWord buffer is empty then we will process this as a first
    // character (we are not looking for a diacritic mark).
    //
    if (!GETWORD(astRawWord)) {
        // Verify that we have a half-width Katakana character.  This check
        // is a good safeguard against erroneous information in a user
        // defined charmap.
        //
        if (*lpbInBuffer >= 161 && *lpbInBuffer <= 223) {
            // We have a half-width Katakana character.  Now compute the
            // equivalent full-width character via the mapping table:
            // [2] receives the lead byte, [3] the trail byte.
            //
            astNormWord[2] = (BYTE)(mtable[*lpbInBuffer - 161][0]);
            astNormWord[3] = (BYTE)(mtable[*lpbInBuffer - 161][1]);
            *(LPW)astNormWord = 2;
        }
        else {
            // This is an error condition.  For some reason the charmap has
            // *lpbInBuffer tagged as CLASS_SBKANA when in fact it's not
            // a single byte Katakana character.  This is probably the
            // result of an improperly formed user defined charmap.
            //
            // Since there's no way to determine the real class of this
            // character we will send it to the bit bucket.
            //
            lpbInBuffer++;
            cbInBufSize--;
            *(LPW)astNormWord = *(LPW)astRawWord = 0;
            lpibi->state = SCAN_WHITE_STATE;
            goto ScanWhite;
        }
        *(LPW)astRawWord = 1;           // one character processed so far
        astRawWord[2] = *lpbInBuffer;   // original character, needed below
        lpbInBuffer++;
        cbInBufSize--;
    }

    // Check if we have more characters in the buffer.
    //
    if (!cbInBufSize) {
        // Return because the buffer is empty; come back here to look
        // for a diacritic mark.
        //
        lpibi->state = SCAN_SBKANA_STATE;
        return S_OK;
    }

    // check if the second character is the nigori mark.
    //
    if (*lpbInBuffer == 222) {
        // see if we have a half-width katakana that can be modified by
        // nigori.  The original character was saved in astRawWord[2]
        // ([0] and [1] hold the length prefix).
        //
        if ((astRawWord[2] >= 182 && astRawWord[2] <= 196) ||
            (astRawWord[2] >= 202 && astRawWord[2] <= 206) ||
            (astRawWord[2] == 179)) {
            // transform kana into kana with nigori: the adjustment is
            // applied to the trail byte, astNormWord[3].
            //
            if ((astNormWord[3] >= 74 && astNormWord[3] <= 103) ||
                (astNormWord[3] >= 110 && astNormWord[3] <= 122))
                astNormWord[3]++;
            else if (astNormWord[2] == 131 && astNormWord[3] == 69)
                astNormWord[3] = 148;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }

    // check if the following character is the maru mark
    //
    else if (*lpbInBuffer == 223) {
        // see if we have a half-width katakana that can be modified by maru.
        //
        if (astRawWord[2] >= 202 && astRawWord[2] <= 206) {
            // transform kana into kana with maru
            //
            if (astNormWord[3] >= 110 && astNormWord[3] <= 122)
                astNormWord[3] += 2;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }

    // Note: If the character at *lpbInBuffer wasn't a diacritic mark, then
    //       it will be processed when ScanWhite is re-entered.
    //
    // Another note: The above code only combines diacritic marks with
    //       single-width Katakana characters that can be modified
    //       by these marks (not all can).  If we happen to encounter
    //       a situation where the diacritic can't be combined
    //       into the character, we let the character continue
    //       back to ScanWhite where it will be re-sent to
    //       ScanSbKana, however this time it will be a first
    //       character and be converted into its stand-alone
    //       full-width equivalent (maru and nigori have full-width
    //       character equivalents that contain just the mark).

    // flush the buffer
    //
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    // reset word lengths and return to scanning for white space.
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 0;
    lpibi->state = SCAN_WHITE_STATE;

    // Return if buffer is empty
    //
    if (!cbInBufSize)
        return S_OK;

    // all done - go back to scanning white space.
    //
    goto ScanWhite;

ScanNumber:
    //
    //      N U M B E R   S T A T E
    //
    //  While in this state the code is attempting to append alpha
    //  and digit characters to the digit character it's already
    //  found.  This state is more complex than the word grabbing
    //  state, because it deals with slashes and hyphens in a weird
    //  way.  They're allowed in a number unless they appear at the
    //  end.  Extra variables (the punctuation lengths kept in lpibi)
    //  have to account for these conditions.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
            case CLASS_DIGIT:
            case CLASS_NORM:
            case CLASS_CHAR:
                //
                //  Found a normalized character or a digit.
                //  Append it to the output buffer; any punctuation
                //  seen so far is no longer trailing.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
                lpibi->cbRawPunctLen = 0;
                lpibi->cbNormPunctLen = 0;
                break;

            case CLASS_LIGATURE:
                //
                //  Found a ligature character.  Normalize
                //  it and append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpbNormWord += LigatureMap(bCurChar, lpbNormWord,
                    lpCMap, lpbLigature, wcLigature);
                lpibi->cbRawPunctLen = 0;
                lpibi->cbNormPunctLen = 0;
                break;

            case CLASS_NKEEP:
                //
                //  Found a hyphen or a slash.  These are kept
                //  as part of the number unless they appear at
                //  the end of the number.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                lpibi->cbNormPunctLen++;
                break;

            case CLASS_NSTRIP:
                //
                //  Found a comma or somesuch.  Leave it out of the
                //  normalized number, but keep it in the raw number
                //  since it counts as part of the un-normalized
                //  number's length.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                break;

            case CLASS_CONTEXTNSTRIP:
                //
                //  Found special character used for number separator.
                //  This may be a space in French, ie. 100 000.  The
                //  problem here is that we must differentiate it from a
                //  regular word separator.  In the meantime, keep it in
                //  the raw number only and let the separator state
                //  decide based on the character that follows.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                lpibi->cbRawPunctLen++;
                cbInBufSize--;
                lpbInBuffer++;
                goto ScanSeparator;     // Found a "possible" separator

            case CLASS_WILDCARD:
                //
                //  Found a wildcard character.
                //  Append it to the output buffer if we accept wildcards;
                //  otherwise treat it like any other terminator.
                //
                if (fAcceptWildCard) {
                    if (lpbRawWord >= lpbRawWordLimit)
                        return (E_WORDTOOLONG);
                    *lpbRawWord++ = bCurChar;
                    *lpbNormWord++ = bCurChar;
                    break;
                }
                /* fallthrough */

            default:
                //
                //  Found something weird, or I have been ordered
                //  to flush the output buffer.  Flush the output
                //  buffer and go back to the "grooting through
                //  white space" state (#0).
                //
                //  This is a little more complicated than the
                //  analogous routine for dealing with words.
                //  This has to deal with words that have some
                //  number of trailing punctuation characters.
                //  These need to be stripped from the word, and
                //  the un-normalized word length value needs to
                //  be adjusted as well.
                //
FlushNumber:
                if (fScan) {
                    /* Recalculate the length only if scanning; on a
                     * flush the caller of this label has already
                     * adjusted the saved lengths.
                     */
                    *(LPW)astRawWord = (WORD)(lpbRawWord -
                        (LPB)&astRawWord[2] -
                        lpibi->cbRawPunctLen);
                    *(LPW)astNormWord = (WORD)(lpbNormWord -
                        (LPB)&astNormWord[2] -
                        lpibi->cbNormPunctLen);
                }

                /* Check for stop word if required */
                if (lpsipb) {
                    if (lpsipb->lpfnStopListLookup(lpsipb,
                        astNormWord) == S_OK) {
                        goto ScanWhite;     // Ignore stop words
                    }
                }

                /* Execute user's function */
                if (lpfnfOutWord != NULL &&
                    (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
                        lpibi->lcb, lpvUser)) != S_OK)
                    return fRet;
                goto ScanWhite;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.  If they
    //  just want me to flush, I come back to the "flush a
    //  number" state (#2f), since at this time I already have
    //  a valid word, since I got a digit-char in state #0,
    //  and may have gotten more since.  The saved lengths include
    //  trailing punctuation so the cursors can be restored exactly.
    //
    lpibi->state = SCAN_NUM_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

ScanSeparator:
    //      S E P A R A T O R   S T A T E
    //
    //  This state deals with special character used to separate digits
    //  of numbers.  Example:
    //      100 000     ' ' is used to separate the digits in French(??)
    //  In some sense, comma belongs to this class, when we
    //  deal with US numbers.  Because of compatibility with Liljoe, they
    //  are set to be CLASS_NSTRIP.  The rule to distinguish a digit
    //  separator from a regular word separator is: if a digit follows,
    //  then this is a digit separator, else it is a regular word
    //  separator.
    //
    if (cbInBufSize) {
        //
        //  Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        if (CP_CLASS(lpCharPropTab) == CLASS_DIGIT) {

            /* The following character is a digit, so this must be a
             * digit separator.  Continue to get the number. */

            goto ScanNumber;
        }
        else {
            /* Back out the change since this is a word separator */

            lpbRawWord--;
            *(LPW)astRawWord = (WORD)(lpbRawWord -
                (LPB)&astRawWord[2]);
            lpibi->cbRawPunctLen--;
            goto FlushNumber;
        }
    }
    //
    //  If I run out of data, set things up so I'll come back
    //  to this state if the user provides more data.
    //
    lpibi->state = SCAN_SEP_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;
}
|
||
|
|
||
|
/*
 * Writes the normalized form of character c at lpbNormWord.
 *
 * The ligature table is walked as wcLigature consecutive 3-byte entries:
 * byte 0 is the ligature character, bytes 1 and 2 are the two characters
 * it expands to.  On a match both expansion characters are normalized
 * through lpCMap and written, and 2 is returned.  When no entry matches,
 * c itself is normalized and written, and 1 is returned.
 */
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
    LPCMAP lpCMap, LPB lpbLigatureTab, WORD wcLigature)
{
    LPB  lpbEntry = lpbLigatureTab;     /* Current 3-byte table entry */
    WORD wLeft = wcLigature;            /* Entries still to examine */

    while (wLeft != 0) {
        if (lpbEntry[0] == c) {
            lpbNormWord[0] = CP_NORMC(&lpCMap[lpbEntry[1]]);
            lpbNormWord[1] = CP_NORMC(&lpCMap[lpbEntry[2]]);
            return 2;
        }
        lpbEntry += 3;
        wLeft--;
    }

    /* Not a ligature */
    lpbNormWord[0] = CP_NORMC(&lpCMap[c]);
    return 1;
}
|
||
|
|