windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/breakers/breaker.c

/*************************************************************************
*                                                                        *
*  BREAKER.C                                                             *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*   Word breaker module                                                  *
* 	This module provides word-breaking routines applicable to the ANSI   *
* 	character-set.  This means American English.                         *
* 	Note that ANSI does not mean ASCII.                                  *
*                                                                        *
*   WARNING: Tab setting is 4 for this file                              *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Released by Development:     (date)                                   *
*                                                                        *
*************************************************************************/
#include <verstamp.h>
SETVERSIONSTAMP(MVBK);

#include <mvopsys.h>

#include <iterror.h>
#include <mvsearch.h>
#include "common.h"

/*
 * Accessors for one character-map entry.
 *
 *	CP_CLASS(p)	low 8 bits of the entry's Class field
 *	CP_NORMC(p)	the entry's Norm field
 *
 * p is any pointer expression that designates the entry.  The argument is
 * parenthesized before the cast so that an expression such as
 * "base + offset" is converted as a whole instead of only its first operand.
 */

#define	CP_CLASS(p)	(((LPCMAP)(p))->Class & 0xff)
#define	CP_NORMC(p)	(((LPCMAP)(p))->Norm)

/*************************************************************************
 *
 *	                  INTERNAL PRIVATE FUNCTIONS
 *	All of them should be declared NEAR.
 *
 *	WordBreakStem	- the word-breaking state machine behind FBreakWords
 *	LigatureMap		- writes the normalized form of a ligature character
 *					  and returns the number of bytes written (1 or 2)
 *************************************************************************/
PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS, WORD);
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
	LPCMAP lpCharPropTab, LPB lpbLigatureTab, WORD wcLigature);


/*************************************************************************
 *
 *	            SINGLE TO DOUBLE-WIDTH KATAKANA MAPPING ARRAY
 *
 *	Indexed by (single-byte character code - 161); the word breaker only
 *	consults it for codes 161..223, i.e. 63 entries.  Each entry holds the
 *	two bytes of the double-width replacement: [0] is stored as the first
 *	byte of the normalized word, [1] as the second.
 *	NOTE(review): the byte pairs look like Shift-JIS lead/trail bytes -
 *	confirm against the code page the indexer actually runs with.
 *
 *************************************************************************/

static const int mtable[][2] = {
	/* 161..168 */
	{129, 66}, {129,117}, {129,118}, {129, 65},
	{129, 69}, {131,146}, {131, 64}, {131, 66},
	/* 169..176 */
	{131, 68}, {131, 70}, {131, 72}, {131,131},
	{131,133}, {131,135}, {131, 98}, {129, 91},
	/* 177..184 */
	{131, 65}, {131, 67}, {131, 69}, {131, 71},
	{131, 73}, {131, 74}, {131, 76}, {131, 78},
	/* 185..192 */
	{131, 80}, {131, 82}, {131, 84}, {131, 86},
	{131, 88}, {131, 90}, {131, 92}, {131, 94},
	/* 193..200 */
	{131, 96}, {131, 99}, {131,101}, {131,103},
	{131,105}, {131,106}, {131,107}, {131,108},
	/* 201..208 */
	{131,109}, {131,110}, {131,113}, {131,116},
	{131,119}, {131,122}, {131,125}, {131,126},
	/* 209..216 */
	{131,128}, {131,129}, {131,130}, {131,132},
	{131,134}, {131,136}, {131,137}, {131,138},
	/* 217..223 */
	{131,139}, {131,140}, {131,141}, {131,143},
	{131,147}, {129, 74}, {129, 75}
};


/*************************************************************************
 *	@doc	API INDEX RETRIEVAL
 *
 *	@func	LPIBI FAR PASCAL | BreakerInitiate |
 *		Allocates a zero-filled breaker parameter block, locks it and
 *		records the owning memory handle inside the block. This block
 *		keeps track of the breaker's "global" variables between calls.
 *
 *	@rdesc	NULL if the block cannot be allocated or locked,
 *		otherwise a pointer to the locked block. Release it with
 *		BreakerFree.
 *************************************************************************/

PUBLIC LPIBI EXPORT_API FAR PASCAL BreakerInitiate(void)
{
	_LPIBI	lpibi;
	HANDLE	hibi;

	//
	//	GMEM_ZEROINIT: every member not explicitly set below starts
	//	out as zero.
	//
	hibi = GlobalAlloc(GMEM_MOVEABLE | GMEM_ZEROINIT, sizeof(IBI));
	if (hibi == NULL)
		return NULL;

	lpibi = (_LPIBI)GlobalLock(hibi);
	if (lpibi == NULL) {
		//
		//	The lock failed: give the memory back instead of leaking
		//	the handle and dereferencing a NULL pointer.
		//
		GlobalFree(hibi);
		return NULL;
	}

	lpibi->hibi = hibi;		// remembered so BreakerFree can unlock/free
	return lpibi;
}

/*************************************************************************
 *	@doc	API INDEX RETRIEVAL
 *
 *	@func	void FAR PASCAL | BreakerFree |
 *		Unlocks and frees a word-breaker parameter block through the
 *		memory handle stored inside it.
 *
 *	@parm	LPIBI | lpibi |
 *		Pointer to the InternalBreakInfo structure to release.
 *		A NULL pointer is accepted and ignored.
 *************************************************************************/
PUBLIC void EXPORT_API FAR PASCAL BreakerFree(_LPIBI lpibi)
{
	if (lpibi != NULL) {
		/* Copy the handle out first: the block must not be read once
		 * it has been unlocked. */
		HANDLE	hBlock = lpibi->hibi;

		GlobalUnlock(hBlock);
		GlobalFree(hBlock);
	}
}

//	-	-	-	-	-	-	-	-	-

//	Break words out from a block of standard text characters.
//
//	This routine is incredibly important.  Any change in the performance
//	of this function will have immediate and obvious influence upon the
//	performance of the indexing system as a whole.  Consequently, the
//	function should be very fast.
//
//	This function uses a simple state machine to try to achieve the
//	necessary speed.  It's in a different loop depending upon what kind
//	of characters it's trying to find, and it uses "goto" statements to
//	shift back and forth between "states".
//

/*************************************************************************
 *	@doc	API RETRIEVAL INDEX
 *
 *	@func	ERR | FBreakWords |
 *		Breaks a buffer of text into a sequence of words, without
 *		stemming. Thin wrapper over WordBreakStem with fStem == FALSE.
 *
 *	@parm	LPBRK_PARMS | lpBrkParms |
 *		Parameter block for the breaker. The members read are:
 *		1/ lpInternalBreakInfo: block returned by BreakerInitiate
 *		2/ lpbBuf: input buffer; NULL asks the breaker to flush the
 *		word it is still holding
 *		3/ cbBufCount: size of the input buffer
 *		4/ lcbBufOffset: offset in the source text of the first byte
 *		of the input buffer
 *		5/ lpvUser: caller's parameter, passed through to the output
 *		function
 *		6/ lpfnOutWord: output function, called as
 *		(*lpfnOutWord)(RawWord, NormWord, lcb, lpvUser); it must
 *		return S_OK to continue. May be NULL.
 *		7/ lpStopInfoBlock: stop word table; if non-NULL, words found
 *		by its lookup function are skipped silently
 *		8/ lpCharTab: character table; required whenever lpbBuf is
 *		non-NULL
 *		9/ fFlags: ACCEPT_WILDCARD makes wildcard characters part of
 *		words
 *
 *	@rdesc
 *		S_OK if succeeded. The failure's causes are:
 *	@flag	E_INVALIDARG | NULL parameter block, NULL internal break info,
 *		or NULL character table with a non-NULL buffer
 *	@flag	E_WORDTOOLONG | Word too long
 *	@flag	errors | returned by the lpfnOutWord function
 *************************************************************************/

PUBLIC ERR EXPORT_API FAR PASCAL FBreakWords(LPBRK_PARMS lpBrkParms)
{
	ERR	errBreak;

	errBreak = WordBreakStem(lpBrkParms, FALSE);
	return errBreak;
}

#if 0
/*************************************************************************
 *	@doc	API RETRIEVAL INDEX
 *
 *	@func	ERR | FBreakAndStemWords |
 *		Compiled out. Same contract as FBreakWords, except that the
 *		breaker is asked to stem each resulting word
 *		(WordBreakStem with fStem == TRUE).
 *
 *	@parm	LPBRK_PARMS | lpBrkParms |
 *		Parameter block for the breaker; see FBreakWords for the
 *		members that are read.
 *
 *	@rdesc
 *		S_OK if succeeded. The failure's causes are:
 *	@flag	E_WORDTOOLONG | Word too long
 *	@flag	Other errors | returned by the lpfnOutWord function
 *************************************************************************/

PUBLIC ERR EXPORT_API FAR PASCAL FBreakAndStemWords(LPBRK_PARMS lpBrkParms)
{
	ERR	errBreak;

	errBreak = WordBreakStem(lpBrkParms, TRUE);
	return errBreak;
}
#endif


/* Returns the CHARTABVER constant this breaker was compiled with. */
PUBLIC ERR EXPORT_API FAR PASCAL BreakerVersion (void)
{
	ERR	errVersion = CHARTABVER;

	return errVersion;
}

// Link-time stub: present only so that MVJK can link statically, which
// requires the same function names in every breaker.  This breaker does
// no stemming; the argument is ignored and E_NOTSUPPORTED is returned.
PUBLIC ERR FAR PASCAL FBreakStems(LPBRK_PARMS lpBrkParms)
{
	ERR	errStub = E_NOTSUPPORTED;

	(void)lpBrkParms;
	return errStub;
}

// Link-time stub: present only so that MVJK can link statically, which
// requires the same function names in every breaker.  Word selection is
// not implemented here; all arguments are ignored, the output positions
// are left untouched and E_NOTSUPPORTED is returned.
PUBLIC ERR FAR PASCAL FSelectWord (LPCSTR pBuffer, DWORD dwCount,
    DWORD dwOffset, LPDWORD pStart, LPDWORD pEnd)
{
	ERR	errStub = E_NOTSUPPORTED;

	(void)pBuffer;
	(void)dwCount;
	(void)dwOffset;
	(void)pStart;
	(void)pEnd;
	return errStub;
}

/*************************************************************************
 *	@doc	INTERNAL
 *
 *	@func	ERR | WordBreakStem |
 *		Resumable state machine that breaks the caller's buffer into
 *		words, numbers, double-byte characters and single-byte Katakana
 *		and hands each of them to the caller's output function.
 *
 *		The word being collected lives in the InternalBreakInfo block
 *		(astRawWord / astNormWord, each a 2-byte length followed by the
 *		bytes), together with the current state, so a word may span
 *		several calls.  Calling with a NULL buffer flushes whatever is
 *		still being held.
 *
 *	@parm	LPBRK_PARMS | lpBrkParms |
 *		Breaker parameters: internal break info, input buffer with its
 *		size and source offset, user data, output function (may be
 *		NULL), optional stop word block, character table (required
 *		when a buffer is given) and flags (ACCEPT_WILDCARD).
 *
 *	@parm	WORD | fStem |
 *		If set, stem the word.  Currently ignored: the stemming code
 *		is compiled out.
 *
 *	@rdesc
 *		The function returns S_OK if succeeded. The failure's causes
 *		are:
 *	@flag	E_INVALIDARG | NULL parameter block or internal break info;
 *		NULL character table or character map with a non-NULL buffer
 *	@flag	E_WORDTOOLONG | Word too long
 *	@flag	Other errors | returned by the lpfnfOutWord
 *************************************************************************/

PRIVATE ERR NEAR PASCAL WordBreakStem(LPBRK_PARMS lpBrkParms, WORD fStem)
{
	register LPB lpbRawWord = NULL;	// Write cursor in the RawWord buffer
	register LPB lpbNormWord = NULL;	// Write cursor in the NormWord buffer
	LPCMAP lpCharPropTab;		// Pointer to the char property table
	LPB	lpbInBuffer;			// Buffer to groot through.
	LPB	lpbRawWordLimit = NULL;	// Limit of RawWord buffer
#if 0
	LPB	lpbNormWordLimit;		// Limit of NormWord buffer
#endif
	BYTE	bCurChar;			// Current character.
	BYTE	fScan = TRUE;		// FALSE when called only to flush
	ERR	 fRet;
#if 0
	BYTE	astStemmed[CB_MAX_WORD_LEN + 2]; // Temporary buffer for stemming
#endif
	LPB		lpbLigature = NULL;
	WORD	wcLigature = 0;
	LPCHARTAB lpCharTab;
	LPB		astNormWord;
	LPB		astRawWord;
	BYTE	fAcceptWildCard;

	/* Breakers parameters break out */

	_LPIBI lpibi;
	LPB lpbInBuf;
	CB cbInBufSize;
	LCB lcbInBufOffset;
	LPV lpvUser;
	FWORDCB lpfnfOutWord;
	_LPSIPB lpsipb;
	LPCMAP lpCMap = NULL;

	/*
	 *	Initialize variables
	 */

	if (lpBrkParms == NULL ||
		(lpibi = lpBrkParms->lpInternalBreakInfo) == NULL)
		return E_INVALIDARG;

	astNormWord = (LPB)lpibi->astNormWord;
	astRawWord = (LPB)lpibi->astRawWord;

	lpbInBuf = lpBrkParms->lpbBuf;
	lpvUser = lpBrkParms->lpvUser;
	lpfnfOutWord = lpBrkParms->lpfnOutWord;
	lpsipb = lpBrkParms->lpStopInfoBlock;

	/* Reduce the flag to 0/1 before narrowing it, so that the test does
	 * not depend on which bit ACCEPT_WILDCARD occupies. */
	fAcceptWildCard =
		(BYTE)((lpBrkParms->fFlags & ACCEPT_WILDCARD) != 0);

	/*
	 *	Restore to the proper state.  This is in place to handle
	 *	words that cross block boundaries, and to deal with explicit
	 *	buffer-flush commands.
	 */
	if ((lpbInBuffer = lpbInBuf) != NULL) {

		cbInBufSize = lpBrkParms->cbBufCount;
		lcbInBufOffset = lpBrkParms->lcbBufOffset;

		/* A character table with a character map is mandatory: every
		 * input byte is classified through it. */
		if ((lpCharTab = lpBrkParms->lpCharTab) == NULL)
			return(E_INVALIDARG);

		lpCMap = (LPCMAP)(lpCharTab->lpCMapTab);
		lpbLigature = lpCharTab->lpLigature;
		wcLigature = lpCharTab->wcLigature;

		if (lpCMap == NULL)
			return(E_INVALIDARG);

		lpbRawWordLimit = (LPB)&astRawWord[CB_MAX_WORD_LEN];

		switch (lpibi->state) {
			case SCAN_WHITE_STATE:
				goto ScanWhite;	// Running through white space.

			case SCAN_WORD_STATE:
				lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
				lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
				goto ScanWord;	// Found one 'a'..'z', collecting.

			case SCAN_NUM_STATE:
				lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
				lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
				goto ScanNumber;// Found one '0'..'9', collecting.

			case SCAN_SEP_STATE:
				//
				//	The previous buffer ended right after a possible
				//	digit separator.  Pick the number up again and let
				//	the separator logic look at the first new byte.
				//
				lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
				lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
				goto ScanSeparator;

			case SCAN_LEADBYTE_STATE:
				lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
				lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
				goto ScanLeadByte;

			case SCAN_SBKANA_STATE:
				lpbRawWord = (LPB)&astRawWord[GETWORD(astRawWord)+2];
				lpbNormWord = (LPB)&astNormWord[GETWORD(astNormWord)+2];
				goto ScanSbKana;
		}
	}
	else {
		//
		//	Flush request: no input, and the stored lengths are used
		//	as they are (the write cursors are not restored).
		//
		cbInBufSize = fScan = 0;
		switch (lpibi->state) {
			case SCAN_WHITE_STATE:
				return S_OK;	// Still stuck in white space.

			case SCAN_WORD_STATE:
				//	A pending "second byte of a special type" must not
				//	leak into the next word.
				lpibi->fGotType = FALSE;
				goto FlushWord;	// Flush a word.

			case SCAN_NUM_STATE:
			case SCAN_SEP_STATE:
				//
				//	The stored lengths still include any trailing
				//	punctuation (and, in the separator state, the
				//	tentative separator, which is counted in
				//	cbRawPunctLen).  Strip it here, as the scanning
				//	path does, because FlushNumber will not recompute
				//	the lengths when fScan is FALSE.
				//
				*(LPW)astRawWord = (WORD)(GETWORD(astRawWord) -
					lpibi->cbRawPunctLen);
				*(LPW)astNormWord = (WORD)(GETWORD(astNormWord) -
					lpibi->cbNormPunctLen);
				goto FlushNumber;	// Flush a number.

			case SCAN_LEADBYTE_STATE:
				goto ScanLeadByte;

			case SCAN_SBKANA_STATE:
				goto ScanSbKana;
		}
	}
	//
	//	W H I T E - S P A C E   S T A T E
	//
	//	While in this state the code is hunting through white-space,
	//	searching for an alpha character or a digit character.  If
	//	it finds one, it initializes the word and goes to either the
	//	word-collection state or the number-collection state.  The
	//	character itself is not consumed here; the target state
	//	processes it.
	//
ScanWhite:
	for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
		//
		//	Get the character and its class.
		//

		switch (CP_CLASS(&lpCMap[*lpbInBuffer])) {
			case CLASS_WILDCARD:
				if (fAcceptWildCard == FALSE)
					continue;
				/* fallthrough: an accepted wildcard starts a word */
			case CLASS_TYPE: // Found the 1st byte of the special string
			case CLASS_CHAR: //	Found a non-normalized char
			case CLASS_NORM: //	Found a normalized character
			case CLASS_LIGATURE: // Found a ligature

			//	jump to the word-collection state.
				lpibi->lcb = (DWORD)(lcbInBufOffset +
					(lpbInBuffer - lpbInBuf));
				lpbRawWord = (LPB)&astRawWord[2];
				lpbNormWord = (LPB)&astNormWord[2];
				goto ScanWord;

			case CLASS_DIGIT: //	Found a digit.
				lpibi->lcb = (DWORD)(lcbInBufOffset +
					(lpbInBuffer - lpbInBuf));
				lpibi->cbNormPunctLen = lpibi->cbRawPunctLen = 0;
				lpbRawWord = (LPB)&astRawWord[2];
				lpbNormWord = (LPB)&astNormWord[2];
				goto ScanNumber;

			case CLASS_LEADBYTE:
				lpibi->lcb = (DWORD)(lcbInBufOffset +
					(lpbInBuffer - lpbInBuf));
				lpbRawWord = (LPB)&astRawWord[2];
				lpbNormWord = (LPB)&astNormWord[2];
				*(LPW)astNormWord = *(LPW)astRawWord = 0;
				goto ScanLeadByte;

			case CLASS_SBKANA:
				lpibi->lcb = (DWORD)(lcbInBufOffset +
					(lpbInBuffer - lpbInBuf));
				*(LPW)astNormWord = *(LPW)astRawWord = 0;
				lpbRawWord = (LPB)&astRawWord[2];
				lpbNormWord = (LPB)&astNormWord[2];
				goto ScanSbKana;
		}
	}
	//
	//	If I run out of data, set things up so I'll come back
	//	to this state if the user provides more data.
	//
	lpibi->state = SCAN_WHITE_STATE;
	return S_OK;

ScanWord:
	//
	//	W O R D   S T A T E
	//
	//	While in this state the code is attempting to append alpha
	//	and digit characters to the alpha character it's already
	//	found.  Apostrophes are stripped.
	//
	//	NOTE(review): only the raw buffer is bounds-checked.  A
	//	ligature adds two normalized bytes per raw byte, so the
	//	normalized buffer must be large enough for that - its size is
	//	not visible here; confirm against the IBI declaration.
	//
	for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
		//
		//	Get the character and its class.
		//
		lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
		switch (CP_CLASS(lpCharPropTab)) {
			case CLASS_NORM :
			case CLASS_DIGIT :
			case CLASS_CHAR:
			//
			//	Found a normalized character or a digit.
			//	Append it to the output buffer.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				*lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
				break;

			case CLASS_LIGATURE:
			//
			//	Found an ligature character.  Normalize
			//	it and append it to the output buffer.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
					lpCMap, lpbLigature, wcLigature);
				break;

			case CLASS_STRIP:
			//
			//	Found an apostrophe or somesuch.  Ignore
			//	this character, but increment the word length,
			//	since it counts as part of the un-normalized
			//	word's length.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				break;

			case CLASS_TYPE :
				/* Set the flag to remind us to get the
					second byte.  The byte is stored like any
					other, so it is subject to the same limit.
				*/
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				lpibi->fGotType = TRUE;
				*lpbRawWord++ = *lpbNormWord++ = bCurChar;
				break;

			case CLASS_WILDCARD:
			//
			//	Found a wildcard character
			//	Append it to the output buffer if we accept wildcard
			//
				if (fAcceptWildCard) {
					if (lpbRawWord >= lpbRawWordLimit)
						return (E_WORDTOOLONG);
					*lpbRawWord++ = bCurChar;
					*lpbNormWord++ = bCurChar;
					break;
				}
				/* fallthrough: a refused wildcard ends the word */

			default:
				if (lpibi->fGotType == TRUE) {
					/* Found a the 2nd byte of a special type
						Append it to the output buffer. */

					if (lpbRawWord >= lpbRawWordLimit)
						return (E_WORDTOOLONG);
					lpibi->fGotType = FALSE;
					*lpbRawWord++ = *lpbNormWord++ = bCurChar;
					break;
				}
			//
			//	Found something weird, or I have been ordered
			//	to flush the output buffer.  Flush the output
			//	buffer and go back to the "grooting through
			//	white space" state (#0).
			//
FlushWord:
				if (fScan)
				{
				/* Recalculate the length only if scanning */
					*(LPW)astRawWord = (WORD)(lpbRawWord -
						(LPB)&astRawWord[2]);
					*(LPW)astNormWord = (WORD)(lpbNormWord -
						(LPB)&astNormWord[2]);
				}

				/* Check for stop word if required */
				if (lpsipb)
				{
					if (lpsipb->lpfnStopListLookup(lpsipb,
						astNormWord) == S_OK)
					{
						goto ScanWhite;	// Ignore stop words
					}
				}
#if 0

				if (fStem)
				{
					/* Do stemming if requested */
					if (FStem(astStemmed, astNormWord) == S_OK)
					{
						MEMCPY(astNormWord, astStemmed, GETWORD(astStemmed)
							+ sizeof(WORD));
					}
				}
#endif

				/* Execute user's function */
				if (lpfnfOutWord != NULL &&
					(fRet = (*lpfnfOutWord)(astRawWord,
					astNormWord, lpibi->lcb, lpvUser)) != S_OK)
					return fRet;
				goto ScanWhite;
		}
	}
	//
	//	If I run out of data, set things up so I'll come back
	//	to this state if the user provides more data.  If they
	//	just want me to flush, I come back to the "flush a
	//	word" state (#1f), since at this time I already have
	//	a valid word, since I got an alpha-char in state #0,
	//	and may have gotten more since.
	//
	lpibi->state = SCAN_WORD_STATE;
	*(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
	*(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
	return S_OK;


ScanLeadByte:
	//
	//	L E A D - B Y T E   S T A T E
	//
	//	A double-byte character is emitted as a word of its own.  The
	//	stored word length tells how far we got: 0 = nothing yet,
	//	1 = lead byte stored, waiting for the trail byte.
	//
	if (!cbInBufSize)
	{
		// no character - we may have lost a DBC
		//
		lpibi->state = SCAN_WHITE_STATE;
		*(LPW)astNormWord = *(LPW)astRawWord = 0;
		return S_OK;
	}

	if (!GETWORD(astNormWord))
	{
		// process lead byte; keep it in the raw word as well so the
		// raw buffer really holds the bytes its length announces.
		//
		*(LPW)astNormWord = *(LPW)astRawWord = 1;
		astRawWord[2] = astNormWord[2] = *lpbInBuffer++;
		--cbInBufSize;
	}

	if (!cbInBufSize)
	{
		// no more characters - set up state so we come back to get
		// trail byte.
		//
		lpibi->state = SCAN_LEADBYTE_STATE;
		return S_OK;
	}

	// process trail byte
	//
	*(LPW)astNormWord = *(LPW)astRawWord = 2;
	astRawWord[3] = astNormWord[3] = *lpbInBuffer++;
	--cbInBufSize;

	// flush the DBC
	//
	if (lpfnfOutWord != NULL &&
		(fRet = (*lpfnfOutWord)(astRawWord, astNormWord, lpibi->lcb,
		lpvUser)) != S_OK)
		return fRet;

	if (!cbInBufSize)
	{
		// no more characters - we have already flushed our DBC so we
		// will just set the state back to scanning for white space.
		//
		lpibi->state = SCAN_WHITE_STATE;
		return S_OK;
	}

	// all done - go back to scanning white space.
	//
	goto ScanWhite;

ScanSbKana:
	//
	//	S I N G L E - B Y T E   K A T A K A N A   S T A T E
	//
	//	A half-width Katakana character (161..223) is replaced by its
	//	full-width two-byte equivalent.  If it is immediately followed
	//	by a diacritic mark it can carry, both are folded into one
	//	full-width character.  A raw word length of 1 means the kana
	//	is stored and we are waiting to see the next byte.
	//
	if (!cbInBufSize)
	{
		// Buffer is empty.  Flush the buffer if we are holding a
		// character.
		//
		if (GETWORD(astNormWord))
		{
			if (lpfnfOutWord != NULL &&
				(fRet = (*lpfnfOutWord)(astRawWord, astNormWord,
				lpibi->lcb, lpvUser)) != S_OK)
				return fRet;
		}

		lpibi->state = SCAN_WHITE_STATE;
		*(LPW)astNormWord = *(LPW)astRawWord = 0;
		return S_OK;
	}

	// Note: The basic algorithm (including the mapping table) used here to
	// convert half-width Katakana characters to full-width Katakana appears
	// in the book "Understanding Japanese Information Systems" by
	// O'Reily & Associates.


	// If the RawWord buffer is empty then we will process this as a first
	// character (we are not looking for an diacritic mark).
	//
	if (!GETWORD(astRawWord))
	{
		// Verify that we have a half-width Katakana character.  This check
		// is a good safeguard against erroneous information in a user
		// defined charmap (and keeps the mtable index in range).
		//
		if (*lpbInBuffer >= 161 && *lpbInBuffer <= 223)
		{
			// We have a half-width Katakana character. Now compute the
			// equivalent full-width character via the mapping table.
			//
			astNormWord[2] = (BYTE)(mtable[*lpbInBuffer-161][0]);
			astNormWord[3] = (BYTE)(mtable[*lpbInBuffer-161][1]);
			*(LPW)astNormWord = 2;
		}
		else
		{
			// This is an error condition.  For some reason the charmap has
			// *lpbInBuffer tagged as CLASS_SBKANA when in fact it's not
			// a single byte Katakana character.  This is probably the
			// result of an improperly formed user defined charmap.
			//
			// Since there's no way to determine the real class of this
			// character we will send it to the bit bucket.
			//
			lpbInBuffer++;
			cbInBufSize--;
			*(LPW)astNormWord = *(LPW)astRawWord = 0;
			lpibi->state = SCAN_WHITE_STATE;
			goto ScanWhite;
		}
		*(LPW)astRawWord = 1;         // we have processed one character so far
		astRawWord[2] = *lpbInBuffer; // we will need the original character later
		lpbInBuffer++;
		cbInBufSize--;
	}

	// Check if we have more characters in the buffer.
	//
	if (!cbInBufSize)
	{
		// Return because the buffer is empty.
		//
		lpibi->state = SCAN_SBKANA_STATE;
		return S_OK;
	}

	// check if the second character is nigori mark.
	//
	if (*lpbInBuffer == 222)
	{
		// see if we have a half-width katakana that can be modified by
		// nigori.  The original kana is astRawWord[2]; bytes 0 and 1 of
		// the buffer are the length prefix.
		//
		if ((astRawWord[2] >= 182 && astRawWord[2] <= 196) ||
			(astRawWord[2] >= 202 && astRawWord[2] <= 206) ||
			(astRawWord[2] == 179))
		{
			// transform kana into kana with nigori.  The variant is
			// selected through the second (trail) byte of the full-width
			// character, astNormWord[3]; astNormWord[2] is the lead byte.
			//
			if ((astNormWord[3] >= 74 && astNormWord[3] <= 103) ||
				(astNormWord[3] >= 110 && astNormWord[3] <= 122))
				astNormWord[3]++;
			else if (astNormWord[2] == 131 && astNormWord[3] == 69)
				astNormWord[3] = 148;

			// record the mark in the raw word, set the word lengths and
			// advance the buffer.
			//
			astRawWord[3] = *lpbInBuffer;
			*(LPW)astNormWord = 2;
			*(LPW)astRawWord = 2;
			lpbInBuffer++;
			cbInBufSize--;
		}
	}

	// check if following character is maru mark
	//
	else if (*lpbInBuffer == 223)
	{
		// see if we have a half-width katakana that can be modified by
		// maru.
		//
		if ((astRawWord[2] >= 202 && astRawWord[2] <= 206))
		{
			// transform kana into kana with maru
			//
			if (astNormWord[3] >= 110 && astNormWord[3] <= 122)
				astNormWord[3] += 2;

			// record the mark in the raw word, set the word lengths and
			// advance the buffer.
			//
			astRawWord[3] = *lpbInBuffer;
			*(LPW)astNormWord = 2;
			*(LPW)astRawWord = 2;
			lpbInBuffer++;
			cbInBufSize--;
		}
	}

	// Note: If the character at *lpbInBuffer wasn't a diacritic mark, then it
	//       will be processed when ScanWhite is re-entered.
	//
	// Another note:  The above code only combines diacritic marks with
	//                single-width Katakana characters that can be modifed
	//                by these marks (not all can).  If we happen to encounter
	//                a situation where the diacritic can't be combined
	//                into the character, we let the character continue
	//                back to ScanWhite where it will be re-sent to
	//                ScanSbKana, however this time it will be a first
	//                character and be converted into its stand-alone
	//                full-width equivalent (maru and nigori have full-width
	//                character equivalents that contain just the mark).

	// flush the buffer
	//
	if (lpfnfOutWord != NULL &&
		(fRet = (*lpfnfOutWord)(astRawWord, astNormWord, lpibi->lcb,
		lpvUser)) != S_OK)
		return fRet;

	// reset word lengths and return to scanning for white space.
	//
	*(LPW)astNormWord = *(LPW)astRawWord = 0;
	lpibi->state = SCAN_WHITE_STATE;

	// Return if buffer is empty
	//
	if (!cbInBufSize)
		return S_OK;

	// all done - go back to scanning white space.
	//
	goto ScanWhite;


ScanNumber:
	//
	//	N U M B E R   S T A T E
	//
	//	While in this state the code is attempting to append alpha
	//	and digit characters to the digit character it's already
	//	found.  This state is more complex than the word grabbing
	//	state, because it deals with slashes and hyphens in a weird
	//	way.  They're allowed in a number unless they appear at the
	//	end.  Extra variables have to account for these conditions:
	//	cbRawPunctLen / cbNormPunctLen count the punctuation bytes
	//	currently at the tail of the raw / normalized word.
	//
	for (; cbInBufSize; cbInBufSize--, lpbInBuffer++) {
		//
		//	Get the character and its class.
		//
		lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
		switch (CP_CLASS(lpCharPropTab)) {
			case CLASS_DIGIT :
			case CLASS_NORM :
			case CLASS_CHAR:
			//
			//	Found a normalized character or a digit.
			//	Append it to the output buffer.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				*lpbNormWord++ = CP_NORMC(&lpCMap[bCurChar]);
				lpibi->cbRawPunctLen = 0;
				lpibi->cbNormPunctLen = 0;
				break;

			case CLASS_LIGATURE:
			//
			//	Found an ligature character.  Normalize
			//	it and append it to the output buffer.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				lpbNormWord += LigatureMap (bCurChar, lpbNormWord,
					lpCMap, lpbLigature, wcLigature);
				lpibi->cbRawPunctLen = 0;
				lpibi->cbNormPunctLen = 0;
				break;

			case CLASS_NKEEP:
			//
			//	Found a hyphen or a slash.  These are kept
			//	as part of the number unless they appear at
			//	the end of the number.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++ = bCurChar;
				*lpbNormWord++= bCurChar;
				lpibi->cbRawPunctLen++;
				lpibi->cbNormPunctLen++;
				break;

			case CLASS_NSTRIP:
			//
			//	Found a comma or somesuch.  Ignore this
			//	character, but increment the word length,
			//	since it counts as part of the un-normalized
			//	number's length.
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++= bCurChar;
				lpibi->cbRawPunctLen++;
				break;

			case CLASS_CONTEXTNSTRIP:
			//
			//	Found special character used for number separator. This
			//	may be a space in French, ie. 100 000. The problem here
			//	is that we must differentiate it from a regular word
			//	separator. In the meantime, ignore this character, but
			//	increment the word length
			//
				if (lpbRawWord >= lpbRawWordLimit)
					return (E_WORDTOOLONG);
				*lpbRawWord++= bCurChar;
				lpibi->cbRawPunctLen++;
				cbInBufSize--;
				lpbInBuffer++;
				goto ScanSeparator; // Found a "possible" separator

			case CLASS_WILDCARD:
			//
			//	Found a wildcard character
			//	Append it to the output buffer if we accept wildcard.
			//	It is part of the word, so punctuation in front of it
			//	is no longer trailing punctuation.
			//
				if (fAcceptWildCard) {
					if (lpbRawWord >= lpbRawWordLimit)
						return (E_WORDTOOLONG);
					*lpbRawWord++ = bCurChar;
					*lpbNormWord++ = bCurChar;
					lpibi->cbRawPunctLen = 0;
					lpibi->cbNormPunctLen = 0;
					break;
				}
				/* fallthrough: a refused wildcard ends the number */

			default:
			//
			//	Found something weird, or I have been ordered
			//	to flush the output buffer.  Flush the output
			//	buffer and go back to the "grooting through
			//	white space" state (#0).
			//
			//	This is a little more complicated than the
			//	analogous routine for dealing with words.
			//	This has to deal with words that have some
			//	number of trailing punctuation characters.
			//	These need to be stripped from the word, and
			//	the un-normalized word length value needs to
			//	be adjusted as well.
			//
FlushNumber:
				if (fScan)
				{
					/* Recalculate the length only if scanning */
					*(LPW)astRawWord = (WORD)(lpbRawWord -
						(LPB)&astRawWord[2] -
						lpibi->cbRawPunctLen);
					*(LPW)astNormWord = (WORD)(lpbNormWord -
						(LPB)&astNormWord[2] -
						lpibi->cbNormPunctLen);
				}

				/* Check for stop word if required */
				if (lpsipb)
				{
					if (lpsipb->lpfnStopListLookup(lpsipb,
						astNormWord) == S_OK)
					{
						goto ScanWhite;	// Ignore stop words
					}
				}

				if (lpfnfOutWord != NULL &&
					(fRet = (*lpfnfOutWord)(astRawWord,
					astNormWord, lpibi->lcb, lpvUser)) != S_OK)
					return fRet;
				goto ScanWhite;
		}
	}
	//
	//	If I run out of data, set things up so I'll come back
	//	to this state if the user provides more data.  If they
	//	just want me to flush, I come back to the "flush a
	//	number" state (#2f), since at this time I already have
	//	a valid word, since I got an digit-char in state #0,
	//	and may have gotten more since.  The lengths stored here
	//	still include trailing punctuation.
	//
	lpibi->state = SCAN_NUM_STATE;
	*(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
	*(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
	return S_OK;

ScanSeparator:
	//	S E P A R A T O R   S T A T E
	//
	//	This state deals with special character used to separate digits
	//	of numbers. Example:
	//		100 000		' ' is used to separate the digits in French(??)
	//	In some sense, comma belongs to this class, when we
	//	deal with US numbers. Because of compability with Liljoe, they
	//	are set to be CLASS_NSTRIP. The rules to distinguish between
	//	a digit separator from regular word separator is: If there is a
	//	digit thats follows, then this is a digit separator, else it is
	//	a regular word separator
	//
	if (cbInBufSize) {
		//
		//	Get the character and its class.
		//
		lpCharPropTab = &lpCMap[bCurChar = *lpbInBuffer];
		if (CP_CLASS(lpCharPropTab) == CLASS_DIGIT) {

			/* The followed character is a digit, so this must be a digit
			 * separator. Continue to get the number */

			goto ScanNumber;
		}
		else {
			/* Back out the change since this is a word separator */

			lpbRawWord--;
			*(LPW)astRawWord = (WORD)(lpbRawWord -
				(LPB)&astRawWord[2]);
			lpibi->cbRawPunctLen--;
			goto FlushNumber;
		}
	}
	//
	//	If I run out of data, set things up so I'll come back
	//	to this state if the user provides more data.  The raw length
	//	stored here includes the tentative separator.
	//
	lpibi->state = SCAN_SEP_STATE;
	*(LPW)astRawWord = (WORD)(lpbRawWord - (LPB)&astRawWord[2]);
	*(LPW)astNormWord = (WORD)(lpbNormWord - (LPB)&astNormWord[2]);
	return S_OK;
}

/*
 * LigatureMap - write the normalized form of a ligature character.
 *
 *	c				character to expand
 *	lpbNormWord		destination; receives one or two bytes
 *	lpCMap			character map used to normalize each byte
 *	lpbLigatureTab	table of 3-byte entries: the ligature character
 *					followed by the two characters it stands for
 *	wcLigature		number of entries in lpbLigatureTab
 *
 * Returns 2 when c is found in the table (both replacement characters
 * are normalized and stored), otherwise 1 (c itself is normalized and
 * stored).  The caller advances its output pointer by the return value.
 */
PRIVATE int PASCAL NEAR LigatureMap(BYTE c, LPB lpbNormWord,
	LPCMAP lpCMap, LPB lpbLigatureTab, WORD wcLigature)
{
	LPB		lpbEntry = lpbLigatureTab;
	WORD	cEntriesLeft;

	for (cEntriesLeft = wcLigature; cEntriesLeft != 0;
		cEntriesLeft--, lpbEntry += 3) {
		if (lpbEntry[0] != c)
			continue;

		lpbNormWord[0] = CP_NORMC(&lpCMap[lpbEntry[1]]);
		lpbNormWord[1] = CP_NORMC(&lpCMap[lpbEntry[2]]);
		return 2;
	}

	/* Not a ligature */
	lpbNormWord[0] = CP_NORMC(&lpCMap[c]);
	return 1;
}