windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/breakers/breaker.c

999 lines
31 KiB
C
Raw Permalink Normal View History

2020-09-26 03:20:57 -05:00
/*************************************************************************
* *
* BREAKER.C *
* *
* Copyright (C) Microsoft Corporation 1990-1994 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Module Intent *
* Word breaker module *
* This module provides word-breaking routines applicable to the ANSI *
* character-set. This means American English. *
* Note that ANSI does not mean ASCII. *
* *
* WARNING: Tab setting is 4 for this file *
* *
**************************************************************************
* *
* Current Owner: BinhN *
* *
**************************************************************************
* *
* Released by Development: (date) *
* *
*************************************************************************/
#include <verstamp.h>
SETVERSIONSTAMP(MVBK);
#include <mvopsys.h>
#include <iterror.h>
#include <mvsearch.h>
#include "common.h"
/* Macros to access structure's members.
 * Both take a pointer to a character-map entry. The argument is fully
 * parenthesized so that any pointer expression (not just a plain
 * identifier or &array[i]) can be passed safely.
 *   CP_CLASS(p) - low byte of the entry's Class field
 *   CP_NORMC(p) - the entry's Norm field
 */
#define CP_CLASS(p) (((LPCMAP)(p))->Class & 0xff)
#define CP_NORMC(p) (((LPCMAP)(p))->Norm)
/*************************************************************************
*
* INTERNAL PRIVATE FUNCTIONS
* All of them should be declared near
*************************************************************************/
// Word-breaking state machine; called by FBreakWords. The WORD argument
// is the "stem" flag.
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS, WORD);
// Writes the normalized form of character c at lpbNormWord and returns the
// number of bytes written (2 when c is found in the ligature table,
// otherwise 1).
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
LPCMAP lpCharPropTab, LPB lpbLigatureTab, WORD wcLigature);
/*************************************************************************
*
* SINGLE TO DOUBLE-WIDTH KATAKANA MAPPING ARRAY
*
*************************************************************************/
// Single-Width to Double-Width Mapping Array
//
// Each row holds the two bytes of the double-width replacement for one
// single-width Katakana byte. WordBreakStem indexes this table with
// (byte - 161) for input bytes in the range 161..223, so the table must
// keep exactly 63 rows, in this order.
// NOTE(review): the byte pairs look like Shift-JIS codes - confirm.
//
static const int mtable[][2]={
{129,66},{129,117},{129,118},{129,65},{129,69},{131,146},{131,64},
{131,66},{131,68},{131,70},{131,72},{131,131},{131,133},{131,135},
{131,98},{129,91},{131,65},{131,67},{131,69},{131,71},{131,73},
{131,74},{131,76},{131,78},{131,80},{131,82},{131,84},{131,86},
{131,88},{131,90},{131,92},{131,94},{131,96},{131,99},{131,101},
{131,103},{131,105},{131,106},{131,107},{131,108},{131,109},
{131,110},{131,113},{131,116},{131,119},{131,122},{131,125},
{131,126},{131,128},{131,129},{131,130},{131,132},{131,134},
{131,136},{131,137},{131,138},{131,139},{131,140},{131,141},
{131,143},{131,147},{129,74},{129,75} };
/*************************************************************************
* @doc API INDEX RETRIEVAL
*
* @func LPIBI FAR PASCAL | BreakerInitiate |
* Allocates a breaker parameter block. This parameter block keeps
* track of the breaker's "global" variables.
*
* @rdesc NULL if the call fails (ie. no more memory)
* a pointer to the block if it succeeds.
*************************************************************************/
PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void)
{
    _LPIBI lpibi;
    HANDLE hibi;

    //
    // The block is allocated zero-filled, so every breaker variable that
    // is not explicitly set below starts out as zero.
    //
    hibi = GlobalAlloc(GMEM_MOVEABLE | GMEM_ZEROINIT, sizeof(IBI));
    if (hibi == NULL)
        return NULL;

    //
    // GlobalLock can fail; release the handle instead of dereferencing
    // a NULL pointer (and leaking the allocation).
    //
    lpibi = (_LPIBI)GlobalLock(hibi);
    if (lpibi == NULL) {
        GlobalFree(hibi);
        return NULL;
    }

    // Remember the handle so BreakerFree can unlock and free the block.
    lpibi->hibi = hibi;
    return lpibi;
}
/*************************************************************************
* @doc API INDEX RETRIEVAL
*
* @func void FAR PASCAL | BreakerFree |
* Frees a word-breaker parameter block.
*
* @parm LPIBI | lpibi |
* Pointer to the InternalBreakInfo Structure containing all the
* informations about states
*************************************************************************/
PUBLIC void EXPORT_API FAR PASCAL BreakerFree(_LPIBI lpibi)
{
    /* A NULL parameter block is silently ignored. */
    if (lpibi != NULL) {
        /* Fetch the handle first: the block itself lives in the memory
         * that is about to be unlocked and released. */
        HANDLE hBlock = lpibi->hibi;

        GlobalUnlock(hBlock);
        GlobalFree(hBlock);
    }
}
// - - - - - - - - -
// Break words out from a block of standard text characters.
//
// This routine is incredibly important. Any change in the performance
// of this function will have immediate and obvious influence upon the
// performance of the indexing system as a whole. Consequently, the
// function should be very fast.
//
// This function uses a simple state machine to try to achieve the
// necessary speed. It's in a different loop depending upon what kind
// of characters it's trying to find, and it uses "goto" statements to
// shift back and forth between "states".
//
/*************************************************************************
* @doc API RETRIEVAL INDEX
*
* @func ERR | FBreakWords |
* This function break a string into a sequence of words.
*
* @parm LPBRK_PARMS | lpBrkParms |
* Pointer to structure containing all the parameters needed for
* the breaker. They include:
* 1/ Pointer to the InternalBreakInfo
* 2/ Pointer to input buffer containing the word stream
* 3/ Size of the input buffer
* 4/ Offset in the source text of the first byte of the input buffer
* 5/ Pointer to user's parameter block for the user's function
* 6/ User's function to call with words. The format of the call should
* be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord, LCB lcb,
* LPV lpvUser)
* The function should return S_OK if succeeded
* The function can be NULL
* 7/ Pointer to stop word table. This table contains stop words specific
* to this breaker. If this is non-null, then the function
* will flag errors for stop word present in the query
* 8/ Pointer to character table. If NULL, then the default built-in
* character table will be used
*
* @rdesc
* The function returns S_OK if succeeded. The failure's causes
* are:
* @flag E_WORDTOOLONG | Word too long
* @flag errors | returned by the lpfnfOutWord
*************************************************************************/
PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS lpBrkParms)
{
    /* Plain word breaking: run the shared state machine with the
     * stem flag cleared and hand its result straight back. */
    ERR errBreak = WordBreakStem(lpBrkParms, FALSE);

    return errBreak;
}
#if 0
/*************************************************************************
* @doc API RETRIEVAL INDEX
*
* @func ERR | FBreakAndStemWords |
* This function breaks a string into a sequence of words and
* stems each resulting word
*
* @parm LPBRK_PARMS | lpBrkParms |
* Pointer to structure containing all the parameters needed for
* the breaker. They include:
* 1/ Pointer to the InternalBreakInfo
* 2/ Pointer to input buffer containing the word stream
* 3/ Size of the input buffer
* 4/ Offset in the source text of the first byte of the input buffer
* 5/ Pointer to user's parameter block for the user's function
* 6/ User's function to call with words. The format of the call should
* be (*lpfnfOutWord)(BYTE *RawWord, BYTE *NormWord, LCB lcb,
* LPV lpvUser)
* The function should return S_OK if succeeded
* The function can be NULL
* 7/ Pointer to stop word table. This table contains stop words specific
* to this breaker. If this is non-null, then the function
* will flag errors for stop word present in the query
* 8/ Pointer to character table. If NULL, then the default built-in
* character table will be used
*
* @rdesc
* The function returns S_OK if succeeded. The failure's causes
* are:
* @flag E_WORDTOOLONG | Word too long
* @flag Other errors | returned by the lpfnfOutWord
*************************************************************************/
// NOTE: this definition sits inside an "#if 0" region, so it is not
// compiled. It forwards to WordBreakStem with the stem flag set to TRUE.
PUBLIC ERR EXPORT_API FAR PASCAL FBreakAndStemWords(LPBRK_PARMS lpBrkParms)
{
return (WordBreakStem(lpBrkParms, TRUE));
}
#endif
PUBLIC ERR EXPORT_API FAR PASCAL BreakerVersion (void)
{
    /* Report the CHARTABVER constant as this breaker's version. */
    ERR errVersion = CHARTABVER;

    return errVersion;
}
// This exists only to enable MVJK to link statically.
// We must have the same function names for the static build.
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms)
{
    /* Link-time stub: the argument is intentionally unused and the
     * call always reports "not supported". */
    (void)lpBrkParms;

    return E_NOTSUPPORTED;
}
// This exists only to enable MVJK to link statically.
// We must have the same function names for the static build.
PUBLIC ERR FAR PASCAL FSelectWord (LPCSTR pBuffer, DWORD dwCount,
    DWORD dwOffset, LPDWORD pStart, LPDWORD pEnd)
{
    /* Link-time stub: none of the arguments are examined and the
     * call always reports "not supported". */
    (void)pBuffer;
    (void)dwCount;
    (void)dwOffset;
    (void)pStart;
    (void)pEnd;

    return E_NOTSUPPORTED;
}
/*************************************************************************
* @doc INTERNAL
*
* @func ERR | WordBreakStem |
* This function breaks a string into a sequence of words and
* stems each resulting word
*
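* @parm LPBRK_PARMS | lpBrkParms |
* Pointer to the breaker parameter structure (see FBreakWords)
*
* @parm WORD | fStem |
* If set, stem the word. Currently ignored: the stemming code in
* this function is compiled out with #if 0.
*
* @rdesc
* The function returns S_OK if succeeded. The failure's causes
* are:
* @flag E_INVALIDARG | Missing parameter block, break info or
* character table
* @flag E_WORDTOOLONG | Word too long
* @flag Other errors | returned by the lpfnfOutWord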
*************************************************************************/
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem)
{
    /*
     * Word-breaking state machine.
     *
     * The input buffer is consumed one byte at a time. Each byte is
     * classified through the caller's character map and, depending on
     * the current state, is skipped, appended to the word being built,
     * or terminates that word. Completed words are passed to the
     * caller's callback (if any) unless the stop-word lookup accepts
     * them. Both word buffers start with a 2-byte length followed by
     * the characters.
     *
     * The state and the partial word are kept in the InternalBreakInfo
     * block so a word may span several calls. A call with a NULL input
     * buffer flushes whatever is pending.
     *
     * Returns S_OK, E_INVALIDARG (missing parameter block, break info,
     * character table or character map), E_WORDTOOLONG, or the error
     * returned by the callback.
     *
     * fStem is only referenced by the stemming code, which is compiled
     * out (#if 0).
     */
    register LPB lpbRawWord = NULL;     // Write cursor in the RawWord buffer
    register LPB lpbNormWord = NULL;    // Write cursor in the NormWord buffer
    LPCMAP  lpCharPropTab;              // Map entry of the current character
    LPB     lpbInBuffer;                // Read cursor in the input buffer
    LPB     lpbRawWordLimit = NULL;     // Limit of RawWord buffer
#if 0
    LPB     lpbNormWordLimit;           // Limit of NormWord buffer
#endif
    BYTE    bCurChar;                   // Current character.
    BYTE    fScan = TRUE;               // FALSE when flushing (no input)
    ERR     fRet;
#if 0
    BYTE    astStemmed[CB_MAX_WORD_LEN + 2]; // Temporary buffer for stemming
#endif
    LPB     lpbLigature = NULL;
    WORD    wcLigature = 0;
    LPCHARTAB lpCharTab;
    LPB     astNormWord;
    LPB     astRawWord;
    BYTE    fAcceptWildCard;

    /* Breakers parameters break out */
    _LPIBI  lpibi;
    LPB     lpbInBuf;
    CB      cbInBufSize;
    LCB     lcbInBufOffset = 0;
    LPV     lpvUser;
    FWORDCB lpfnfOutWord;
    _LPSIPB lpsipb;
    LPCMAP  lpCMap = NULL;

    /*
     * Initialize variables
     */
    if (lpBrkParms == NULL ||
        (lpibi = lpBrkParms->lpInternalBreakInfo) == NULL)
        return E_INVALIDARG;

    astNormWord = (LPB)lpibi->astNormWord;
    astRawWord = (LPB)lpibi->astRawWord;
    lpbInBuf = lpBrkParms->lpbBuf;
    lpvUser = lpBrkParms->lpvUser;
    lpfnfOutWord = lpBrkParms->lpfnOutWord;
    lpsipb = lpBrkParms->lpStopInfoBlock;
    fAcceptWildCard = (BYTE)(lpBrkParms->fFlags & ACCEPT_WILDCARD);

    /*
     * Restore to the proper state. This is in place to handle
     * words that cross block boundaries, and to deal with explicit
     * buffer-flush commands.
     */
    lpbInBuffer = lpbInBuf;
    if (lpbInBuffer != NULL) {
        cbInBufSize = lpBrkParms->cbBufCount;
        lcbInBufOffset = lpBrkParms->lcbBufOffset;

        lpCharTab = lpBrkParms->lpCharTab;
        if (lpCharTab == NULL)
            return E_INVALIDARG;
        lpCMap = (LPCMAP)(lpCharTab->lpCMapTab);
        if (lpCMap == NULL)
            return E_INVALIDARG;    // every scan loop indexes the map
        lpbLigature = lpCharTab->lpLigature;
        wcLigature = lpCharTab->wcLigature;

        lpbRawWordLimit = (LPB)&astRawWord[CB_MAX_WORD_LEN];

        // An unrecognized state falls out of the switch into ScanWhite.
        switch (lpibi->state) {
        case SCAN_WHITE_STATE:
            goto ScanWhite;     // Running through white space.

        case SCAN_WORD_STATE:
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
            goto ScanWord;      // Found one 'a'..'z', collecting.

        case SCAN_NUM_STATE:
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
            goto ScanNumber;    // Found one '0'..'9', collecting.

        case SCAN_SEP_STATE:
            // The previous buffer ended right after a possible digit
            // separator. Resume the decision with the new data instead
            // of dropping the pending number.
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
            goto ScanSeparator;

        case SCAN_LEADBYTE_STATE:
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
            goto ScanLeadByte;

        case SCAN_SBKANA_STATE:
            lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
            lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
            goto ScanSbKana;
        }
    }
    else {
        // Flush request: there is no input, and the word lengths saved
        // by the previous call are used as they are (fScan == FALSE).
        cbInBufSize = fScan = 0;

        switch (lpibi->state) {
        case SCAN_WHITE_STATE:
            return S_OK;        // Still stuck in white space.

        case SCAN_WORD_STATE:
            goto FlushWord;     // Flush a word.

        case SCAN_NUM_STATE:
        case SCAN_SEP_STATE:
            // The saved lengths still count the trailing punctuation
            // (and a pending separator). The number ends here, so strip
            // it exactly as the scanning path does before flushing.
            *(LPW)astRawWord = (WORD)(GETWORD(astRawWord) -
                lpibi->cbRawPunctLen);
            *(LPW)astNormWord = (WORD)(GETWORD(astNormWord) -
                lpibi->cbNormPunctLen);
            goto FlushNumber;   // Flush a number.

        case SCAN_LEADBYTE_STATE:
            goto ScanLeadByte;

        case SCAN_SBKANA_STATE:
            goto ScanSbKana;
        }
    }

    //
    //      W H I T E - S P A C E   S T A T E
    //
    // While in this state the code is hunting through white-space,
    // searching for an alpha character or a digit character. If
    // it finds one, it initializes the word and goes to either the
    // word-collection state or the number-collection state. The
    // character that starts the word is NOT consumed here; the target
    // state reads it again.
    //
ScanWhite:
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        // Get the character and its class.
        //
        switch (CP_CLASS(&lpCMap[*lpbInBuffer])) {
        case CLASS_WILDCARD:
            if (fAcceptWildCard == FALSE)
                continue;
            /* fall through */
        case CLASS_TYPE:        // Found the 1st byte of the special string
        case CLASS_CHAR:        // Found a non-normalized char
        case CLASS_NORM:        // Found a normalized character
        case CLASS_LIGATURE:    // Found a ligature
            // jump to the word-collection state.
            lpibi->lcb = (DWORD)(lcbInBufOffset +
                (lpbInBuffer - lpbInBuf));
            lpbRawWord = (LPB)&astRawWord[2];
            lpbNormWord = (LPB)&astNormWord[2];
            goto ScanWord;

        case CLASS_DIGIT:       // Found a digit.
            lpibi->lcb = (DWORD)(lcbInBufOffset +
                (lpbInBuffer - lpbInBuf));
            lpibi->cbNormPunctLen = lpibi->cbRawPunctLen = 0;
            lpbRawWord = (LPB)&astRawWord[2];
            lpbNormWord = (LPB)&astNormWord[2];
            goto ScanNumber;

        case CLASS_LEADBYTE:
            lpibi->lcb = (DWORD)(lcbInBufOffset +
                (lpbInBuffer - lpbInBuf));
            lpbRawWord = (LPB)&astRawWord[2];
            lpbNormWord = (LPB)&astNormWord[2];
            *(LPW)astNormWord = *(LPW)astRawWord = 0;
            goto ScanLeadByte;

        case CLASS_SBKANA:
            lpibi->lcb = (DWORD)(lcbInBufOffset +
                (lpbInBuffer - lpbInBuf));
            *(LPW)astNormWord = *(LPW)astRawWord = 0;
            lpbRawWord = (LPB)&astRawWord[2];
            lpbNormWord = (LPB)&astNormWord[2];
            goto ScanSbKana;
        }
    }
    //
    // If I run out of data, set things up so I'll come back
    // to this state if the user provides more data.
    //
    lpibi->state = SCAN_WHITE_STATE;
    return S_OK;

ScanWord:
    //
    //      W O R D   S T A T E
    //
    // While in this state the code is attempting to append alpha
    // and digit characters to the alpha character it's already
    // found. Apostrophes are stripped.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        // Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
        case CLASS_NORM:
        case CLASS_DIGIT:
        case CLASS_CHAR:
            //
            // Found a normalized character or a digit.
            // Append it to the output buffer.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            *lpbNormWord++ = CP_NORMC(lpCharPropTab);
            break;

        case CLASS_LIGATURE:
            //
            // Found a ligature character. Normalize
            // it and append it to the output buffer.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
                lpCMap, lpbLigature, wcLigature);
            break;

        case CLASS_STRIP:
            //
            // Found an apostrophe or somesuch. Ignore
            // this character, but increment the word length,
            // since it counts as part of the un-normalized
            // word's length.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            break;

        case CLASS_TYPE:
            //
            // First byte of a special type. Store it and set the
            // flag to remind us to get the second byte. The buffer
            // limit applies to this byte like to any other.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            lpibi->fGotType = TRUE;
            *lpbRawWord++ = *lpbNormWord++ = bCurChar;
            break;

        case CLASS_WILDCARD:
            //
            // Found a wildcard character
            // Append it to the output buffer if we accept wildcard
            //
            if (fAcceptWildCard) {
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = bCurChar;
                break;
            }
            /* fall through */
        default:
            if (lpibi->fGotType == TRUE) {
                //
                // Found the 2nd byte of a special type.
                // Append it to the output buffer.
                //
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                lpibi->fGotType = FALSE;
                *lpbRawWord++ = *lpbNormWord++ = bCurChar;
                break;
            }
            //
            // Found something weird: the word ends here. The
            // terminating character is left in the input so that
            // ScanWhite looks at it again.
            //
            goto FlushWord;
        }
    }
    //
    // If I run out of data, set things up so I'll come back
    // to this state if the user provides more data. If they
    // just want me to flush, I come back to the "flush a
    // word" state (#1f), since at this time I already have
    // a valid word, since I got an alpha-char in state #0,
    // and may have gotten more since.
    //
    lpibi->state = SCAN_WORD_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

    //
    // Flush the collected word and go back to the "grooting through
    // white space" state (#0). Reached from the word loop above or
    // directly from a flush request.
    //
FlushWord:
    if (fScan) {
        /* Recalculate the length only if scanning */
        *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
        *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    }

    /* Check for stop word if required */
    if (lpsipb) {
        if (lpsipb->lpfnStopListLookup(lpsipb, astNormWord) == S_OK)
            goto ScanWhite;     // Ignore stop words
    }
#if 0
    if (fStem) {
        /* Do stemming if requested */
        if (FStem(astStemmed, astNormWord) == S_OK) {
            MEMCPY(astNormWord, astStemmed, GETWORD(astStemmed)
                + sizeof(WORD));
        }
    }
#endif
    /* Execute user's function (it may be NULL) */
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;
    goto ScanWhite;

    //
    //      L E A D - B Y T E   S T A T E
    //
    // A double-byte character is emitted as a word of its own: lead
    // byte followed by trail byte.
    //
ScanLeadByte:
    if (!cbInBufSize) {
        // no character - we may have lost a DBC
        //
        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    if (!GETWORD(astNormWord)) {
        // process lead byte; the raw word gets the same byte so that
        // the callback never sees stale raw data.
        //
        *(LPW)astNormWord = *(LPW)astRawWord = 1;
        astRawWord[2] = astNormWord[2] = *lpbInBuffer++;
        --cbInBufSize;
    }

    if (!cbInBufSize) {
        // no more characters - set up state so we come back to get
        // the trail byte.
        //
        lpibi->state = SCAN_LEADBYTE_STATE;
        return S_OK;
    }

    // process trail byte
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 2;
    astRawWord[3] = astNormWord[3] = *lpbInBuffer++;
    --cbInBufSize;

    // flush the DBC
    //
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    // all done - go back to scanning white space. With an empty buffer
    // ScanWhite just records SCAN_WHITE_STATE and returns S_OK.
    //
    goto ScanWhite;

    //
    //      S I N G L E - B Y T E   K A T A K A N A   S T A T E
    //
ScanSbKana:
    if (!cbInBufSize) {
        // Buffer is empty. Flush the buffer if we are holding a character.
        //
        if (GETWORD(astNormWord)) {
            if (lpfnfOutWord != NULL &&
                (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
                    lpibi->lcb, lpvUser)) != S_OK)
                return fRet;
        }
        lpibi->state = SCAN_WHITE_STATE;
        *(LPW)astNormWord = *(LPW)astRawWord = 0;
        return S_OK;
    }

    // Note: The basic algorithm (including the mapping table) used here to
    // convert half-width Katakana characters to full-width Katakana appears
    // in the book "Understanding Japanese Information Systems" by
    // O'Reily & Associates.

    // If the RawWord buffer is empty then we will process this as a first
    // character (we are not looking for a diacritic mark).
    //
    if (!GETWORD(astRawWord)) {
        // Verify that we have a half-width Katakana character. This check
        // is a good safeguard against erroneous information in a user
        // defined charmap, and keeps the mtable index in range.
        //
        if (*lpbInBuffer >= 161 && *lpbInBuffer <= 223) {
            // We have a half-width Katakana character. Now compute the
            // equivalent full-width character via the mapping table.
            //
            astNormWord[2] = (BYTE)(mtable[*lpbInBuffer-161][0]);
            astNormWord[3] = (BYTE)(mtable[*lpbInBuffer-161][1]);
            *(LPW)astNormWord = 2;
        }
        else {
            // This is an error condition. For some reason the charmap has
            // *lpbInBuffer tagged as CLASS_SBKANA when in fact it's not
            // a single byte Katakana character. This is probably the
            // result of an improperly formed user defined charmap.
            //
            // Since there's no way to determine the real class of this
            // character we will send it to the bit bucket.
            //
            lpbInBuffer++;
            cbInBufSize--;
            *(LPW)astNormWord = *(LPW)astRawWord = 0;
            lpibi->state = SCAN_WHITE_STATE;
            goto ScanWhite;
        }

        *(LPW)astRawWord = 1;           // one character processed so far
        astRawWord[2] = *lpbInBuffer;   // the original character is needed
                                        // for the diacritic tests below
        lpbInBuffer++;
        cbInBufSize--;
    }

    // Check if we have more characters in the buffer.
    //
    if (!cbInBufSize) {
        // Return because the buffer is empty.
        //
        lpibi->state = SCAN_SBKANA_STATE;
        return S_OK;
    }

    // check if the second character is nigori mark.
    //
    if (*lpbInBuffer == 222) {
        // see if we have a half-width katakana that can be modified by
        // nigori. The original character lives in astRawWord[2]
        // (astRawWord[0..1] hold the length).
        //
        if ((astRawWord[2] >= 182 && astRawWord[2] <= 196) ||
            (astRawWord[2] >= 202 && astRawWord[2] <= 206) ||
            (astRawWord[2] == 179)) {
            // transform kana into kana with nigori: the change is made
            // to the trail byte of the full-width character.
            //
            if ((astNormWord[3] >= 74 && astNormWord[3] <= 103) ||
                (astNormWord[3] >= 110 && astNormWord[3] <= 122))
                astNormWord[3]++;
            else if (astNormWord[2] == 131 && astNormWord[3] == 69)
                astNormWord[3] = 148;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }
    // check if following character is maru mark
    //
    else if (*lpbInBuffer == 223) {
        // see if we have a half-width katakana that can be modified by
        // maru.
        //
        if ((astRawWord[2] >= 202 && astRawWord[2] <= 206)) {
            // transform kana into kana with maru
            //
            if (astNormWord[3] >= 110 && astNormWord[3] <= 122)
                astNormWord[3] += 2;

            // keep the mark in the raw word, set the word lengths and
            // advance the buffer.
            //
            astRawWord[3] = *lpbInBuffer;
            *(LPW)astNormWord = 2;
            *(LPW)astRawWord = 2;
            lpbInBuffer++;
            cbInBufSize--;
        }
    }

    // Note: If the character at *lpbInBuffer wasn't a diacritic mark, then
    //       it will be processed when ScanWhite is re-entered.
    //
    // Another note: The above code only combines diacritic marks with
    //       single-width Katakana characters that can be modified
    //       by these marks (not all can). If we happen to encounter
    //       a situation where the diacritic can't be combined
    //       into the character, we let the character continue
    //       back to ScanWhite where it will be re-sent to
    //       ScanSbKana, however this time it will be a first
    //       character and be converted into its stand-alone
    //       full-width equivalent (maru and nigori have full-width
    //       character equivalents that contain just the mark).

    // flush the buffer
    //
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;

    // reset word lengths and return to scanning for white space.
    //
    *(LPW)astNormWord = *(LPW)astRawWord = 0;
    lpibi->state = SCAN_WHITE_STATE;

    // Return if buffer is empty
    //
    if (!cbInBufSize)
        return S_OK;

    // all done - go back to scanning white space.
    //
    goto ScanWhite;

ScanNumber:
    //
    //      N U M B E R   S T A T E
    //
    // While in this state the code is attempting to append alpha
    // and digit characters to the digit character it's already
    // found. This state is more complex than the word grabbing
    // state, because it deals with slashes and hyphens in a weird
    // way. They're allowed in a number unless they appear at the
    // end. cbRawPunctLen / cbNormPunctLen count the punctuation
    // characters currently trailing the raw / normalized number.
    //
    for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
        //
        // Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        switch (CP_CLASS(lpCharPropTab)) {
        case CLASS_DIGIT:
        case CLASS_NORM:
        case CLASS_CHAR:
            //
            // Found a normalized character or a digit.
            // Append it to the output buffer.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            *lpbNormWord++ = CP_NORMC(lpCharPropTab);
            lpibi->cbRawPunctLen = 0;
            lpibi->cbNormPunctLen = 0;
            break;

        case CLASS_LIGATURE:
            //
            // Found a ligature character. Normalize
            // it and append it to the output buffer.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
                lpCMap, lpbLigature, wcLigature);
            lpibi->cbRawPunctLen = 0;
            lpibi->cbNormPunctLen = 0;
            break;

        case CLASS_NKEEP:
            //
            // Found a hyphen or a slash. These are kept
            // as part of the number unless they appear at
            // the end of the number.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            *lpbNormWord++ = bCurChar;
            lpibi->cbRawPunctLen++;
            lpibi->cbNormPunctLen++;
            break;

        case CLASS_NSTRIP:
            //
            // Found a comma or somesuch. Ignore this
            // character, but increment the word length,
            // since it counts as part of the un-normalized
            // number's length.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            lpibi->cbRawPunctLen++;
            break;

        case CLASS_CONTEXTNSTRIP:
            //
            // Found special character used for number separator. This
            // may be a space in French, ie. 100 000. The problem here
            // is that we must differentiate it from a regular word
            // separator. In the meantime, ignore this character, but
            // increment the word length. The character is consumed
            // here because we leave the loop before its increment.
            //
            if (lpbRawWord >= lpbRawWordLimit)
                return (E_WORDTOOLONG);
            *lpbRawWord++ = bCurChar;
            lpibi->cbRawPunctLen++;
            cbInBufSize--;
            lpbInBuffer++;
            goto ScanSeparator;     // Found a "possible" separator

        case CLASS_WILDCARD:
            //
            // Found a wildcard character
            // Append it to the output buffer if we accept wildcard.
            // It is a kept character, so punctuation in front of it
            // is no longer trailing.
            //
            if (fAcceptWildCard) {
                if (lpbRawWord >= lpbRawWordLimit)
                    return (E_WORDTOOLONG);
                *lpbRawWord++ = bCurChar;
                *lpbNormWord++ = bCurChar;
                lpibi->cbRawPunctLen = 0;
                lpibi->cbNormPunctLen = 0;
                break;
            }
            /* fall through */
        default:
            //
            // Found something weird: the number ends here. The
            // terminating character is left in the input so that
            // ScanWhite looks at it again.
            //
            goto FlushNumber;
        }
    }
    //
    // If I run out of data, set things up so I'll come back
    // to this state if the user provides more data. If they
    // just want me to flush, I come back to the "flush a
    // number" state (#2f), since at this time I already have
    // a valid word, since I got a digit-char in state #0,
    // and may have gotten more since. The saved lengths include any
    // trailing punctuation so that the cursors can be rebuilt.
    //
    lpibi->state = SCAN_NUM_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;

    //
    // Flush the collected number and go back to the "grooting through
    // white space" state (#0).
    //
    // This is a little more complicated than the analogous routine
    // for dealing with words. This has to deal with words that have
    // some number of trailing punctuation characters. These need to
    // be stripped from the word, and the un-normalized word length
    // value needs to be adjusted as well.
    //
FlushNumber:
    if (fScan) {
        /* Recalculate the length only if scanning */
        *(LPW)astRawWord = (WORD)(lpbRawWord -
            (LPB)&astRawWord[2] - lpibi->cbRawPunctLen);
        *(LPW)astNormWord = (WORD)(lpbNormWord -
            (LPB)&astNormWord[2] - lpibi->cbNormPunctLen);
    }

    /* Check for stop word if required */
    if (lpsipb) {
        if (lpsipb->lpfnStopListLookup(lpsipb, astNormWord) == S_OK)
            goto ScanWhite;     // Ignore stop words
    }

    /* Execute user's function (it may be NULL) */
    if (lpfnfOutWord != NULL &&
        (fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
            lpibi->lcb, lpvUser)) != S_OK)
        return fRet;
    goto ScanWhite;

ScanSeparator:
    //      S E P A R A T O R   S T A T E
    //
    // This state deals with special character used to separate digits
    // of numbers. Example:
    //     100 000    ' ' is used to separate the digits in French(??)
    // In some sense, comma belongs to this class, when we
    // deal with US numbers. Because of compatibility with Liljoe, they
    // are set to be CLASS_NSTRIP. The rule to distinguish a digit
    // separator from a regular word separator is: if a digit follows,
    // then this is a digit separator, else it is a regular word
    // separator.
    //
    if (cbInBufSize) {
        //
        // Get the character and its class.
        //
        lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
        if (CP_CLASS(lpCharPropTab) == CLASS_DIGIT) {
            /* The following character is a digit, so this must be a
             * digit separator. Continue to get the number */
            goto ScanNumber;
        }

        /* Back out the change since this is a word separator.
         * FlushNumber recomputes both lengths from the cursors. */
        lpbRawWord--;
        lpibi->cbRawPunctLen--;
        goto FlushNumber;
    }
    //
    // If I run out of data, set things up so I'll come back
    // to this state if the user provides more data.
    //
    lpibi->state = SCAN_SEP_STATE;
    *(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
    *(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
    return S_OK;
}
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
    LPCMAP lpCMap, LPB lpbLigatureTab, WORD wcLigature)
{
    /*
     * The ligature table is a sequence of wcLigature 3-byte entries:
     * the ligature character followed by the two characters it expands
     * to. On a match the normalized forms of both expansion characters
     * are written at lpbNormWord and 2 is returned; otherwise the
     * normalized form of c itself is written and 1 is returned.
     */
    LPB  lpbEntry = lpbLigatureTab;
    WORD iEntry;

    for (iEntry = 0; iEntry < wcLigature; iEntry++, lpbEntry += 3) {
        if (lpbEntry[0] != c)
            continue;

        lpbNormWord[0] = CP_NORMC(&lpCMap[lpbEntry[1]]);
        lpbNormWord[1] = CP_NORMC(&lpCMap[lpbEntry[2]]);
        return 2;
    }

    /* Not a ligature */
    lpbNormWord[0] = CP_NORMC(&lpCMap[c]);
    return 1;
}