windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/stem.c


/*************************************************************************
*                                                                        *
*  STEM.C                                                                *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*   This module contains the functions to strip off the suffix of a word *
*   It is based on the research paper of  Dr. Porter, pulished in        *
*             An algorithm for suffix stripping                          *
*          Program, Vol.14, no.3,pp 130-137, July 1980                   *
*                                                                        *
*  Description:                                                          *
*                                                                        *
*   The full description of the algorithm can be found in that document  *
*   Basically, the algorithm consists of:                                *
*      - Matching the suffix from a table of suffixes                    *
*      - Applies the rule that comes with the suffix                     *
*      - If the rule matches, then change the suffix to the new one      *
*                                                                        *
*  Comments:                                                             *
*                                                                        *
*     1/ There are some misconceptions about stripping the suffix        *
*     People are thinking in term of super-smart algorithm that can      *
*     strip a word to its stem. The fact is that it is not necessarily   *
*     true. For example, DIED is strippe to DI, but not DIE.             *
*                                                                        *
*     2/ The current code is SLOW, but it easy to understand in term     *
*     of implementation, since it is straigthforward from the algorithm  *
*     description. The impact on runtime is nothing. On compiled time    *
*     stemming 5,000,000 words will take less than 1 hour, which is      *
*     acceptable, since a project that large requires 1-2 days to        *
*     compile.                                                           *
*                                                                        *
*     To improve the speed (up to 2 times), we can scan the suffix       *
*     if one letter doesn't match we can jump pass all stem that have    *
*     this letter                                                        *
*   WARNING: Tab setting is 4 for this file                              *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
#include <memory.h>
#include <mvsearch.h>
#include "common.h"


#define VOWEL       0
#define CONSONANT   1
#define MIXED       2

#define	MIN_LENGTH_FOR_STEM	3

/* Rule table structure */

typedef struct RULE
{
    LPB szInitSuffix;           // Initial suffix
    LPB szNewSuffix;            // New suffix
    LPB szCondition;            // Stemming condition
    short NextTable;            // Next table to jump to
} RULE, FAR *LPRULE;

/* The conventional letter  used for the stemming condition are:
 *
 *  '1':    Measure == 1
 *  '2':    Measure > 1
 *  'd':    Double consonant at the end (*d in the document)
 *  'o':    Form cvc , and 2nd c is not W, X or Y (*o in the document)
 *  'p':    Measure > 0
 *  's':    Remove the last consonant (used with 'd')
 *  'v':    Word contains vowels (*v* in the document)
 *  '*':    Terminated with the next letter (*S in the document)
 *  '&':    AND operation
 *  '|':    OR operation
 *  '!':    NOT operation
 * The rule operation is based on a postfix notation, so "m=1 and *o*" is 
 * described as "1o&"
 */

RULE RuleTab0[] =
{
    "\4sses",       "\2ss",     NULL,   1,
    "\3ies",        "\1i",      NULL,   1,
    "\2ss",         "\2ss",     NULL,   1,
    "\1s",          "\0",       NULL,   1,
    NULL,           NULL,       NULL,   1,
};

RULE RuleTab1[] =
{
    "\3eed",        "\2ee",     "p",    3,
    "\2ed",         "\0",       "v",    2,
    "\3ing",        "\0",       "v",    2,
    NULL,           NULL,       NULL,   3,
};

RULE RuleTab2[] =
{
    "\2at",         "\3ate",    NULL,   3,
    "\2bl",         "\3ble",    NULL,   3,
    "\2iz",         "\3ize",    NULL,   3,

    /* The following szNewSuffix has a negative \377
     * (-1) length. It is to be used to reduce a
     * double consonant ending to single consonant
     */

    "\0",           "\377\0",   "*l*s|*z|!d&s", 3,
    "\0",           "\1e",      "1o&",  3,
    NULL,           NULL,       NULL,   3,
};

RULE RuleTab3[] =
{
    "\1y",          "\1i",      "v",    4,
    NULL,           NULL,       NULL,   4,
};

RULE RuleTab4[] =
{
    "\7ational",    "\3ate",    "p",    5,
    "\6tional",     "\4tion",   "p",    5,
    "\4enci",       "\4ence",   "p",    5,
    "\4anci",       "\4ance",   "p",    5,
    "\4izer",       "\3ize",    "p",    5,
    "\4abli",       "\4able",   "p",    5,
    "\4alli",       "\2al",     "p",    5,
    "\5entli",      "\3ent",    "p",    5,
    "\3eli",        "\1e",      "p",    5,
    "\5ousli",      "\3ous",    "p",    5,
    "\7ization",    "\3ize",    "p",    5,
    "\5ation",      "\3ate",    "p",    5,
    "\4ator",       "\3ate",    "p",    5,
    "\5alism",      "\2al",     "p",    5,
    "\7iveness",    "\3ive",    "p",    5,
    "\7fulness",    "\3ful",    "p",    5,
    "\7ousness",    "\3ous",    "p",    5,
    "\5aliti",      "\2al",     "p",    5,
    "\5iviti",      "\3ive",    "p",    5,
    "\6biliti",     "\3ble",    "p",    5,
    NULL,           NULL,       NULL,   5,
};

RULE RuleTab5[] =
{
    "\5icate",      "\2ic",     "p",    6,
    "\5ative",      "\0",       "p",    6,
    "\5alize",      "\2al",     "p",    6,
    "\5iciti",      "\2ic",     "p",    6,
    "\4ical",       "\2ic",     "p",    6,
    "\3ful",        "\0",       "p",    6,
    "\4ness",       "\0",       "p",    6,
    NULL,           NULL,       NULL,   6,
};

RULE RuleTab6[] =
{
    "\2al",         "\0",       "2",    7,
    "\4ance",       "\0",       "2",    7,
    "\4ence",       "\0",       "2",    7,
    "\2er",         "\0",       "p",    7,
    "\2ic",         "\0",       "2",    7,
    "\4able",       "\0",       "2",    7,
    "\4ible",       "\0",       "2",    7,
    "\3ant",        "\0",       "2",    7,
    "\5ement",      "\0",       "2",    7,
    "\4ment",       "\0",       "2",    7,
    "\3ent",        "\0",       "2",    7,
    "\3ion",        "\0",       "2*s*t|&",      7,
    "\2ou",         "\0",       "2",    7,
    "\3ism",        "\0",       "2",    7,
    "\3ate",        "\0",       "2",    7,
    "\3iti",        "\0",       "2",    7,
    "\3ous",        "\0",       "2",    7,
    "\3ive",        "\0",       "2",    7,
    "\3ize",        "\0",       "2",    7,
    NULL,           NULL,       NULL,   7,
};


RULE RuleTab7[] =
{
    "\1e",          "\0",       "2",    8,
    "\1e",          "\0",       "1o!&", 8,
    NULL,           NULL,       NULL,   8,
};

RULE RuleTab8[] =
{
    "\2ll",         "\1l",      "2",    9,
    "\0",           "\377\0",   "2*l&d&s",  9,
    NULL,           NULL,       NULL,   9,
};

char CharTypeTab[] =
{
    VOWEL,      //a
    CONSONANT,  //b
    CONSONANT,  //c
    CONSONANT,  //d
    VOWEL,      //e
    CONSONANT,  //f
    CONSONANT,  //g
    CONSONANT,  //h
    VOWEL,      //i
    CONSONANT,  //j
    CONSONANT,  //k
    CONSONANT,  //l
    CONSONANT,  //m
    CONSONANT,  //n
    VOWEL,      //o
    CONSONANT,  //p
    CONSONANT,  //q
    CONSONANT,  //r
    CONSONANT,  //s
    CONSONANT,  //t
    VOWEL,      //u
    CONSONANT,  //v
    CONSONANT,  //w
    CONSONANT,  //x
    MIXED,      //y, consonant, but may be vowel if after consonant
    CONSONANT,  //z
};

LPRULE RuleTables[] =
{
    RuleTab0,
    RuleTab1,
    RuleTab2,
    RuleTab3,
    RuleTab4,
    RuleTab5,
    RuleTab6,
    RuleTab7,
    RuleTab8,
    NULL,
};

/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *  All of them should be declared near
 *************************************************************************/
int PRIVATE PASCAL NEAR MeasureCalc (LPB, int);
int PRIVATE PASCAL NEAR ConditionMet (LPB, LPB, LPB, int);
int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
HRESULT PRIVATE PASCAL NEAR MarkType (LPB, LPB, int);


/*************************************************************************
 *
 *  @doc    API INDEX RETRIEVAL
 *
 *  @func   HRESULT PASCAL FAR | FStem |
 *      This function will strip the suffix from a word, ie, "stem" it
 *
 *  @parm   LPB | lpbStemWord |
 *      Buffer to contain the stemmed word
 *
 *  @parm   LPB | lpbWord |
 *      Word to be stemmed
 *
 *  @rdesc  S_OK if succeeded, or E_INVALIDARG if the null argument is
 *      passed
 *
 *  @comm   The word passed must have all the letters in lower case for
 *      The function to work with. WARNING: There is no checking about
 *      case, so thing can go wrong if the word contains upper case letter
 *      or non alphabetic letter.
 *
 *************************************************************************/

PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)
{
    register int wLength;   // Length of the word
    register int i;         // Scratch variable
    LPRULE lpRuleTab;       // Pointer to rule table
    LPRULE lpRule;          // Pointer to rule
    int wLengthSaved;
    int wNewSuffixLength;   // This must be signed!
    int wInitSuffixLength;
    char lpbWordType [CB_MAX_WORD_LEN];
    LPB szInitSuffix;
    LPB szNewSuffix;
    int TableIndex;         // For debugging purpose only
    int RuleIndex;          // For debugging purpose only
    LPB lpbTmp;
    
    if (lpbWord == NULL)
        return E_INVALIDARG;
    
    wLength = (*(LPW)lpbWordType = *((LPW)lpbWord));
    if (wLength >= CB_MAX_WORD_LEN)
        return(E_WORDTOOLONG);
        

    /* Copy the word over */
    MEMCPY (lpbStemWord, lpbWord, wLength + 2);

    /* Don't do any stemming for words <= 3 bytes */
    if (wLength <= MIN_LENGTH_FOR_STEM)
        return S_OK;

    /* Mark the type of each letter to be consonant or vowel */
    if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)
    {
        /* We got some non alphabetic characters. Just return */
        return S_OK;
    }

        
    /* Traverse all the tables and check for stemming conditions */

    for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab;)
    {

        /* Check for each rule */

        for (RuleIndex = 0, lpRule = lpRuleTab;
            szInitSuffix = lpRule->szInitSuffix; lpRule++, RuleIndex++)
        {

            szNewSuffix = lpRule->szNewSuffix;

            /* The casting is needed to make wNewSuffixLength signed */
            wNewSuffixLength = (char)*szNewSuffix++;

            wInitSuffixLength = (char)*szInitSuffix++;

            /* Check for condition match */

            if (wLength >= wInitSuffixLength)
            {
                
                lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;
                /* Compare the suffixes */

                for (i = wInitSuffixLength;
                    i > 0 && (*lpbTmp == *szInitSuffix);
                    i--, lpbTmp++, szInitSuffix++);

                /* Restore szInitSuffix */
                szInitSuffix = lpRule->szInitSuffix;

                if (i != 0) // String comparison fails
                    continue;

                /* Save the word length */
                wLengthSaved = wLength;

                /* Update word length since we don't include the suffix
                 * length in our computation
                 */
                wLength -= wInitSuffixLength;


                /* Now check the stemming condition */

                if (ConditionMet (lpbStemWord, lpbWordType,
                    lpRule->szCondition, wLength))
                {

                    /* Rule applies, change to the new suffix */

                    if (wNewSuffixLength > 0)
                    {

                        MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,
                            wNewSuffixLength);

                        /* Update the word type */

                        MarkType (szNewSuffix,
                            lpbWordType + wLength + 2, wNewSuffixLength);
                    }

                    /* Update the word length
                     * The check for wLength is necessary since we don't
                     * want to strip evething
                     */

                    if (wLength + wNewSuffixLength > 0)
                        *(LPW)lpbStemWord = (wLength += wNewSuffixLength);

					if (wLength <= MIN_LENGTH_FOR_STEM)
						goto Done;

                    break;
                }
                else
                {

                    /* Rule doesn't apply, Restore the word length */
                    wLength = wLengthSaved;
                }
            }
        }

        /* Go to the next table */
        lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];
    }

	Done:
    lpbStemWord[*((LPW)lpbStemWord)+2] = 0;
    return S_OK;
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   int PASCAL NEAR | MeasureCalc |
 *      Calculate the measure of a word. The measure is defined as
 *      the pair (VC), where V is the vowels, and C consonants. A word
 *      is described as [C](VC)m[V], where the first C and the last V are
 *      optional. m is the measure of the word (or part of word without
 *      the suffix). Example:
 *          architect: m = 3 (arch, it, ect)
 *          convention: m = 3 (onv, ent, ion)
 *          lie: m = 0, since the first consonant, and the last vowels
 *              don't count
 *
 *  @parm   LPB | lpbWordType |
 *      Buffer containing word type
 *
 *  @parm   int | wLength |
 *      The length of the word
 *
 *  @rdesc  Return the measure of the word
 *
 *************************************************************************/

int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)
{
    register int cMeasure;

#if 0
    /* Safety chck
     * IFdef out for speed. This is a internal function
     */
    if (lpbWordType == NULL)
        return 0;
#endif

    /* Initialize the word measure */
    cMeasure = 0;

    /* Skip the beginning consonants */

    for (;wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++);

    /* Get the vowel/consonant pairs */
    while (wLength > 0)
    {

        /* Get all the vowels */
        for (; wLength > 0 && *lpbWordType == VOWEL; wLength--, lpbWordType++);

        if (wLength > 0)
        {
            cMeasure ++;

            /* Get all the consonants */
            for (; wLength > 0 && *lpbWordType == CONSONANT;
                wLength--, lpbWordType++);
        }
    }
    return cMeasure;
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   int PASCAL NEAR | ConditionMet |
 *      This fuction check the condition to be met by a particular
 *      suffix.
 *
 *  @parm   LPB | lpbWord |
 *      Buffer contains the word to be stemmed> This is a 2-byte prefixed
 *      pascal string
 *
 *  @parm   LPB | lpbWordType |
 *      Buffer containing the type of each letter of the word. This
 *      is a parallel buffer
 *
 *  @parm   LPB | szCondition |
 *      Condtion in postfix form
 *
 *  @parm   int | wLength |
 *      Length of the word
 *
 *  @rdesc  TRUE, if the condition is met, FALSE otherwise
 *
 *************************************************************************/

int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
    LPB szCondition, int wLength)
{
    int StackIndex;
    int Stack[4];
    int wLengthSaved;
    int LastByte;
    LPB lpbTmp;
    LPB lpbTmpType;

    if (szCondition == NULL)
        return TRUE;

    /* Initialize variables
     * Note: The original codes are written for a 1-byte length preceded
     * string. The new format is 2-byte preceded string. To minimize the
     * change, lpbTmp is used, and points to the 2nd byte
     */

    StackIndex = -1;
    lpbTmp = lpbWord + 1;
    lpbTmpType = lpbWordType + 1;
    LastByte = lpbTmp[wLength];

    while (*szCondition)
    {
        switch (*szCondition)
        {
            case '*':   // *S in the document

                /* Check to see if the stem ends with the next letter */

                Stack[++StackIndex] =
                    (LastByte == *(++szCondition));
                break;

            case 'd':   // *d in the document

                /* Check to see if the stem ends with a double consonant */

                Stack[++StackIndex] = (wLength > 2 &&
                    LastByte == lpbTmp[wLength - 1] &&
                    lpbTmpType[wLength] == CONSONANT);
                break;

            case 's':   // Remove the last consonant
                if (Stack[0])
                {
                    lpbTmp[wLength] = 0;
                    wLength --;
                    *(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength;
                }
                break;

            case 'v':   // *v* in the document

                /* Check to see if the word has a vowel */

                wLengthSaved = wLength; /* Save the length */

                for (; wLength &&
                    lpbTmpType[wLength] != VOWEL; wLength--);

                Stack[++StackIndex] = wLength > 0;

                /* Restore the word length */
                wLength = wLengthSaved;
                break;

            case 'o':
                /* *o in the document, ie.
                    - The word ends with the form cvc
                    - The second c is not W, X, Y
					The +2 is for skipping the word length
                 */
                Stack[++StackIndex] = (wLength >= 3) &&
                    (lpbWordType[wLength + 1] == CONSONANT) &&
                    (lpbWordType[wLength] == VOWEL) &&
                    (lpbWordType[wLength - 1] == CONSONANT) &&
                    (LastByte != 'w' && LastByte != 'x' && LastByte != 'y');
                break;

            /* The conditions below test Measure. If they fails, then
             * the whole condition fails. ie. there is no need to test
             * any other conditions. There is no need to save the result
             * on the stack
             */

            case 'p':   // Measure > 0
                if ((Stack[++StackIndex] =
                    MeasureCalc (lpbWordType+2, wLength) > 0) == FALSE)
                    return FALSE;
                break;

            case '2':   // Measure > 1
                if ((Stack[++StackIndex] =
                    MeasureCalc (lpbWordType+2, wLength) > 1) == FALSE)
                    return FALSE;
                break;

            case '1':   // Measure == 1
                if ((Stack[++StackIndex] =
                    MeasureCalc (lpbWordType+2, wLength) == 1) == FALSE)
                    return FALSE;
                break;

            /* The next conditions are operators combination */

            case '|':
                
                /* OR the result of the top 2 stack entries */

                Stack[StackIndex-1] |= Stack[StackIndex];
                StackIndex--;
                break;

            case '&':
                
                /* AND the result of the top 2 stack entries */

                Stack[StackIndex-1] &= Stack[StackIndex];
                StackIndex--;
                break;

            case '!':
                
                /* NOT the result of the top stack entry */

                Stack[StackIndex] = !Stack[StackIndex];
                break;

            default:
                return FALSE;
        }
        szCondition++;
    }

    return Stack[0];
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL NEAR | MarkType |
 *      Marking the type of each letter of the word to be CONSONANT or
 *      VOWEL
 *
 *  @parm   LPB | lpbWord |
 *      Buffer containing the word
 *
 *  @parm   LPB | lpBufType |
 *      Buffer to contain the type of the letters
 *
 *  @parm   int | wLength |
 *      Length of the word
 *
 *************************************************************************/

HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)
{
    for (; wLength > 0; lpBufType++, lpbWord++, wLength--)
    {

        /* Consider wildcard characters to be consonnant */
        if (*lpbWord == '?' || *lpbWord == '*')
        {
            *lpBufType = CONSONANT;
            continue;
        }

        if (*lpbWord < 'a' || *lpbWord > 'z')
            return E_FAIL;

        switch (CharTypeTab [*lpbWord - 'a'])
        {
            case CONSONANT:
                *lpBufType = CONSONANT;
                break;
            case VOWEL:
                *lpBufType = VOWEL;
                break;
            case MIXED:
                if (*(lpBufType - 1) == CONSONANT)
                    *lpBufType = VOWEL;
                else
                    *lpBufType = CONSONANT;
                break;
        }
    }
    return S_OK;
}
Add source files 2020-09-26 03:20:57 -05:00
			`/*************************************************************************`
			`* *`
			`* STEM.C *`
			`* *`
			`* Copyright (C) Microsoft Corporation 1990-1994 *`
			`* All Rights reserved. *`
			`* *`
			`**************************************************************************`
			`* *`
			`* Module Intent *`
			`* This module contains the functions to strip off the suffix of a word *`
			`* It is based on the research paper of Dr. Porter, pulished in *`
			`* An algorithm for suffix stripping *`
			`* Program, Vol.14, no.3,pp 130-137, July 1980 *`
			`* *`
			`* Description: *`
			`* *`
			`* The full description of the algorithm can be found in that document *`
			`* Basically, the algorithm consists of: *`
			`* - Matching the suffix from a table of suffixes *`
			`* - Applies the rule that comes with the suffix *`
			`* - If the rule matches, then change the suffix to the new one *`
			`* *`
			`* Comments: *`
			`* *`
			`* 1/ There are some misconceptions about stripping the suffix *`
			`* People are thinking in term of super-smart algorithm that can *`
			`* strip a word to its stem. The fact is that it is not necessarily *`
			`* true. For example, DIED is strippe to DI, but not DIE. *`
			`* *`
			`* 2/ The current code is SLOW, but it easy to understand in term *`
			`* of implementation, since it is straigthforward from the algorithm *`
			`* description. The impact on runtime is nothing. On compiled time *`
			`* stemming 5,000,000 words will take less than 1 hour, which is *`
			`* acceptable, since a project that large requires 1-2 days to *`
			`* compile. *`
			`* *`
			`* To improve the speed (up to 2 times), we can scan the suffix *`
			`* if one letter doesn't match we can jump pass all stem that have *`
			`* this letter *`
			`* WARNING: Tab setting is 4 for this file *`
			`* *`
			`**************************************************************************`
			`* *`
			`* Current Owner: BinhN *`
			`* *`
			`**************************************************************************/`
			`#include <mvopsys.h>`
			`#include <memory.h>`
			`#include <mvsearch.h>`
			`#include "common.h"`


			`#define VOWEL 0`
			`#define CONSONANT 1`
			`#define MIXED 2`

			`#define MIN_LENGTH_FOR_STEM 3`

			`/* Rule table structure */`

			`typedef struct RULE`
			`{`
			`LPB szInitSuffix; // Initial suffix`
			`LPB szNewSuffix; // New suffix`
			`LPB szCondition; // Stemming condition`
			`short NextTable; // Next table to jump to`
			`} RULE, FAR *LPRULE;`

			`/* The conventional letter used for the stemming condition are:`
			`*`
			`* '1': Measure == 1`
			`* '2': Measure > 1`
			`* 'd': Double consonant at the end (*d in the document)`
			`* 'o': Form cvc , and 2nd c is not W, X or Y (*o in the document)`
			`* 'p': Measure > 0`
			`* 's': Remove the last consonant (used with 'd')`
			`* 'v': Word contains vowels (v in the document)`
			`* '': Terminated with the next letter (S in the document)`
			`* '&': AND operation`
			`* '\|': OR operation`
			`* '!': NOT operation`
			`* The rule operation is based on a postfix notation, so "m=1 and o" is`
			`* described as "1o&"`
			`*/`

			`RULE RuleTab0[] =`
			`{`
			`"\4sses", "\2ss", NULL, 1,`
			`"\3ies", "\1i", NULL, 1,`
			`"\2ss", "\2ss", NULL, 1,`
			`"\1s", "\0", NULL, 1,`
			`NULL, NULL, NULL, 1,`
			`};`

			`RULE RuleTab1[] =`
			`{`
			`"\3eed", "\2ee", "p", 3,`
			`"\2ed", "\0", "v", 2,`
			`"\3ing", "\0", "v", 2,`
			`NULL, NULL, NULL, 3,`
			`};`

			`RULE RuleTab2[] =`
			`{`
			`"\2at", "\3ate", NULL, 3,`
			`"\2bl", "\3ble", NULL, 3,`
			`"\2iz", "\3ize", NULL, 3,`

			`/* The following szNewSuffix has a negative \377`
			`* (-1) length. It is to be used to reduce a`
			`* double consonant ending to single consonant`
			`*/`

			`"\0", "\377\0", "ls\|*z\|!d&s", 3,`
			`"\0", "\1e", "1o&", 3,`
			`NULL, NULL, NULL, 3,`
			`};`

			`RULE RuleTab3[] =`
			`{`
			`"\1y", "\1i", "v", 4,`
			`NULL, NULL, NULL, 4,`
			`};`

			`RULE RuleTab4[] =`
			`{`
			`"\7ational", "\3ate", "p", 5,`
			`"\6tional", "\4tion", "p", 5,`
			`"\4enci", "\4ence", "p", 5,`
			`"\4anci", "\4ance", "p", 5,`
			`"\4izer", "\3ize", "p", 5,`
			`"\4abli", "\4able", "p", 5,`
			`"\4alli", "\2al", "p", 5,`
			`"\5entli", "\3ent", "p", 5,`
			`"\3eli", "\1e", "p", 5,`
			`"\5ousli", "\3ous", "p", 5,`
			`"\7ization", "\3ize", "p", 5,`
			`"\5ation", "\3ate", "p", 5,`
			`"\4ator", "\3ate", "p", 5,`
			`"\5alism", "\2al", "p", 5,`
			`"\7iveness", "\3ive", "p", 5,`
			`"\7fulness", "\3ful", "p", 5,`
			`"\7ousness", "\3ous", "p", 5,`
			`"\5aliti", "\2al", "p", 5,`
			`"\5iviti", "\3ive", "p", 5,`
			`"\6biliti", "\3ble", "p", 5,`
			`NULL, NULL, NULL, 5,`
			`};`

			`RULE RuleTab5[] =`
			`{`
			`"\5icate", "\2ic", "p", 6,`
			`"\5ative", "\0", "p", 6,`
			`"\5alize", "\2al", "p", 6,`
			`"\5iciti", "\2ic", "p", 6,`
			`"\4ical", "\2ic", "p", 6,`
			`"\3ful", "\0", "p", 6,`
			`"\4ness", "\0", "p", 6,`
			`NULL, NULL, NULL, 6,`
			`};`

			`RULE RuleTab6[] =`
			`{`
			`"\2al", "\0", "2", 7,`
			`"\4ance", "\0", "2", 7,`
			`"\4ence", "\0", "2", 7,`
			`"\2er", "\0", "p", 7,`
			`"\2ic", "\0", "2", 7,`
			`"\4able", "\0", "2", 7,`
			`"\4ible", "\0", "2", 7,`
			`"\3ant", "\0", "2", 7,`
			`"\5ement", "\0", "2", 7,`
			`"\4ment", "\0", "2", 7,`
			`"\3ent", "\0", "2", 7,`
			`"\3ion", "\0", "2st\|&", 7,`
			`"\2ou", "\0", "2", 7,`
			`"\3ism", "\0", "2", 7,`
			`"\3ate", "\0", "2", 7,`
			`"\3iti", "\0", "2", 7,`
			`"\3ous", "\0", "2", 7,`
			`"\3ive", "\0", "2", 7,`
			`"\3ize", "\0", "2", 7,`
			`NULL, NULL, NULL, 7,`
			`};`


			`RULE RuleTab7[] =`
			`{`
			`"\1e", "\0", "2", 8,`
			`"\1e", "\0", "1o!&", 8,`
			`NULL, NULL, NULL, 8,`
			`};`

			`RULE RuleTab8[] =`
			`{`
			`"\2ll", "\1l", "2", 9,`
			`"\0", "\377\0", "2*l&d&s", 9,`
			`NULL, NULL, NULL, 9,`
			`};`

			`char CharTypeTab[] =`
			`{`
			`VOWEL, //a`
			`CONSONANT, //b`
			`CONSONANT, //c`
			`CONSONANT, //d`
			`VOWEL, //e`
			`CONSONANT, //f`
			`CONSONANT, //g`
			`CONSONANT, //h`
			`VOWEL, //i`
			`CONSONANT, //j`
			`CONSONANT, //k`
			`CONSONANT, //l`
			`CONSONANT, //m`
			`CONSONANT, //n`
			`VOWEL, //o`
			`CONSONANT, //p`
			`CONSONANT, //q`
			`CONSONANT, //r`
			`CONSONANT, //s`
			`CONSONANT, //t`
			`VOWEL, //u`
			`CONSONANT, //v`
			`CONSONANT, //w`
			`CONSONANT, //x`
			`MIXED, //y, consonant, but may be vowel if after consonant`
			`CONSONANT, //z`
			`};`

			`LPRULE RuleTables[] =`
			`{`
			`RuleTab0,`
			`RuleTab1,`
			`RuleTab2,`
			`RuleTab3,`
			`RuleTab4,`
			`RuleTab5,`
			`RuleTab6,`
			`RuleTab7,`
			`RuleTab8,`
			`NULL,`
			`};`

			`/*************************************************************************`
			`*`
			`* INTERNAL PRIVATE FUNCTIONS`
			`* All of them should be declared near`
			`*************************************************************************/`
			`int PRIVATE PASCAL NEAR MeasureCalc (LPB, int);`
			`int PRIVATE PASCAL NEAR ConditionMet (LPB, LPB, LPB, int);`
			`int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);`
			`HRESULT PRIVATE PASCAL NEAR MarkType (LPB, LPB, int);`


			`/*************************************************************************`
			`*`
			`* @doc API INDEX RETRIEVAL`
			`*`
			`* @func HRESULT PASCAL FAR \| FStem \|`
			`* This function will strip the suffix from a word, ie, "stem" it`
			`*`
			`* @parm LPB \| lpbStemWord \|`
			`* Buffer to contain the stemmed word`
			`*`
			`* @parm LPB \| lpbWord \|`
			`* Word to be stemmed`
			`*`
			`* @rdesc S_OK if succeeded, or E_INVALIDARG if the null argument is`
			`* passed`
			`*`
			`* @comm The word passed must have all the letters in lower case for`
			`* The function to work with. WARNING: There is no checking about`
			`* case, so thing can go wrong if the word contains upper case letter`
			`* or non alphabetic letter.`
			`*`
			`*************************************************************************/`

			`PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)`
			`{`
			`register int wLength; // Length of the word`
			`register int i; // Scratch variable`
			`LPRULE lpRuleTab; // Pointer to rule table`
			`LPRULE lpRule; // Pointer to rule`
			`int wLengthSaved;`
			`int wNewSuffixLength; // This must be signed!`
			`int wInitSuffixLength;`
			`char lpbWordType [CB_MAX_WORD_LEN];`
			`LPB szInitSuffix;`
			`LPB szNewSuffix;`
			`int TableIndex; // For debugging purpose only`
			`int RuleIndex; // For debugging purpose only`
			`LPB lpbTmp;`

			`if (lpbWord == NULL)`
			`return E_INVALIDARG;`

			`wLength = ((LPW)lpbWordType = ((LPW)lpbWord));`
			`if (wLength >= CB_MAX_WORD_LEN)`
			`return(E_WORDTOOLONG);`


			`/* Copy the word over */`
			`MEMCPY (lpbStemWord, lpbWord, wLength + 2);`

			`/* Don't do any stemming for words <= 3 bytes */`
			`if (wLength <= MIN_LENGTH_FOR_STEM)`
			`return S_OK;`

			`/* Mark the type of each letter to be consonant or vowel */`
			`if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)`
			`{`
			`/* We got some non alphabetic characters. Just return */`
			`return S_OK;`
			`}`


			`/* Traverse all the tables and check for stemming conditions */`

			`for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab;)`
			`{`

			`/* Check for each rule */`

			`for (RuleIndex = 0, lpRule = lpRuleTab;`
			`szInitSuffix = lpRule->szInitSuffix; lpRule++, RuleIndex++)`
			`{`

			`szNewSuffix = lpRule->szNewSuffix;`

			`/* The casting is needed to make wNewSuffixLength signed */`
			`wNewSuffixLength = (char)*szNewSuffix++;`

			`wInitSuffixLength = (char)*szInitSuffix++;`

			`/* Check for condition match */`

			`if (wLength >= wInitSuffixLength)`
			`{`

			`lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;`
			`/* Compare the suffixes */`

			`for (i = wInitSuffixLength;`
			`i > 0 && (lpbTmp == szInitSuffix);`
			`i--, lpbTmp++, szInitSuffix++);`

			`/* Restore szInitSuffix */`
			`szInitSuffix = lpRule->szInitSuffix;`

			`if (i != 0) // String comparison fails`
			`continue;`

			`/* Save the word length */`
			`wLengthSaved = wLength;`

			`/* Update word length since we don't include the suffix`
			`* length in our computation`
			`*/`
			`wLength -= wInitSuffixLength;`


			`/* Now check the stemming condition */`

			`if (ConditionMet (lpbStemWord, lpbWordType,`
			`lpRule->szCondition, wLength))`
			`{`

			`/* Rule applies, change to the new suffix */`

			`if (wNewSuffixLength > 0)`
			`{`

			`MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,`
			`wNewSuffixLength);`

			`/* Update the word type */`

			`MarkType (szNewSuffix,`
			`lpbWordType + wLength + 2, wNewSuffixLength);`
			`}`

			`/* Update the word length`
			`* The check for wLength is necessary since we don't`
			`* want to strip evething`
			`*/`

			`if (wLength + wNewSuffixLength > 0)`
			`*(LPW)lpbStemWord = (wLength += wNewSuffixLength);`

			`if (wLength <= MIN_LENGTH_FOR_STEM)`
			`goto Done;`

			`break;`
			`}`
			`else`
			`{`

			`/* Rule doesn't apply, Restore the word length */`
			`wLength = wLengthSaved;`
			`}`
			`}`
			`}`

			`/* Go to the next table */`
			`lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];`
			`}`

			`Done:`
			`lpbStemWord[*((LPW)lpbStemWord)+2] = 0;`
			`return S_OK;`
			`}`

			`/*************************************************************************`
			`*`
			`* @doc INTERNAL`
			`*`
			`* @func int PASCAL NEAR \| MeasureCalc \|`
			`* Calculate the measure of a word. The measure is defined as`
			`* the pair (VC), where V is the vowels, and C consonants. A word`
			`* is described as [C](VC)m[V], where the first C and the last V are`
			`* optional. m is the measure of the word (or part of word without`
			`* the suffix). Example:`
			`* architect: m = 3 (arch, it, ect)`
			`* convention: m = 3 (onv, ent, ion)`
			`* lie: m = 0, since the first consonant, and the last vowels`
			`* don't count`
			`*`
			`* @parm LPB \| lpbWordType \|`
			`* Buffer containing word type`
			`*`
			`* @parm int \| wLength \|`
			`* The length of the word`
			`*`
			`* @rdesc Return the measure of the word`
			`*`
			`*************************************************************************/`

			`int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)`
			`{`
			`register int cMeasure;`

			`#if 0`
			`/* Safety chck`
			`* IFdef out for speed. This is a internal function`
			`*/`
			`if (lpbWordType == NULL)`
			`return 0;`
			`#endif`

			`/* Initialize the word measure */`
			`cMeasure = 0;`

			`/* Skip the beginning consonants */`

			`for (;wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++);`

			`/* Get the vowel/consonant pairs */`
			`while (wLength > 0)`
			`{`

			`/* Get all the vowels */`
			`for (; wLength > 0 && *lpbWordType == VOWEL; wLength--, lpbWordType++);`

			`if (wLength > 0)`
			`{`
			`cMeasure ++;`

			`/* Get all the consonants */`
			`for (; wLength > 0 && *lpbWordType == CONSONANT;`
			`wLength--, lpbWordType++);`
			`}`
			`}`
			`return cMeasure;`
			`}`

			`/*************************************************************************`
			`*`
			`* @doc INTERNAL`
			`*`
			`* @func int PASCAL NEAR \| ConditionMet \|`
			`* This fuction check the condition to be met by a particular`
			`* suffix.`
			`*`
			`* @parm LPB \| lpbWord \|`
			`* Buffer contains the word to be stemmed> This is a 2-byte prefixed`
			`* pascal string`
			`*`
			`* @parm LPB \| lpbWordType \|`
			`* Buffer containing the type of each letter of the word. This`
			`* is a parallel buffer`
			`*`
			`* @parm LPB \| szCondition \|`
			`* Condtion in postfix form`
			`*`
			`* @parm int \| wLength \|`
			`* Length of the word`
			`*`
			`* @rdesc TRUE, if the condition is met, FALSE otherwise`
			`*`
			`*************************************************************************/`

			`int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,`
			`LPB szCondition, int wLength)`
			`{`
			`int StackIndex;`
			`int Stack[4];`
			`int wLengthSaved;`
			`int LastByte;`
			`LPB lpbTmp;`
			`LPB lpbTmpType;`

			`if (szCondition == NULL)`
			`return TRUE;`

			`/* Initialize variables`
			`* Note: The original codes are written for a 1-byte length preceded`
			`* string. The new format is 2-byte preceded string. To minimize the`
			`* change, lpbTmp is used, and points to the 2nd byte`
			`*/`

			`StackIndex = -1;`
			`lpbTmp = lpbWord + 1;`
			`lpbTmpType = lpbWordType + 1;`
			`LastByte = lpbTmp[wLength];`

			`while (*szCondition)`
			`{`
			`switch (*szCondition)`
			`{`
			`case '': // S in the document`

			`/* Check to see if the stem ends with the next letter */`

			`Stack[++StackIndex] =`
			`(LastByte == *(++szCondition));`
			`break;`

			`case 'd': // *d in the document`

			`/* Check to see if the stem ends with a double consonant */`

			`Stack[++StackIndex] = (wLength > 2 &&`
			`LastByte == lpbTmp[wLength - 1] &&`
			`lpbTmpType[wLength] == CONSONANT);`
			`break;`

			`case 's': // Remove the last consonant`
			`if (Stack[0])`
			`{`
			`lpbTmp[wLength] = 0;`
			`wLength --;`
			`(LPW)lpbWordType = (LPW)lpbWord = (WORD) wLength;`
			`}`
			`break;`

			`case 'v': // v in the document`

			`/* Check to see if the word has a vowel */`

			`wLengthSaved = wLength; /* Save the length */`

			`for (; wLength &&`
			`lpbTmpType[wLength] != VOWEL; wLength--);`

			`Stack[++StackIndex] = wLength > 0;`

			`/* Restore the word length */`
			`wLength = wLengthSaved;`
			`break;`

			`case 'o':`
			`/* *o in the document, ie.`
			`- The word ends with the form cvc`
			`- The second c is not W, X, Y`
			`The +2 is for skipping the word length`
			`*/`
			`Stack[++StackIndex] = (wLength >= 3) &&`
			`(lpbWordType[wLength + 1] == CONSONANT) &&`
			`(lpbWordType[wLength] == VOWEL) &&`
			`(lpbWordType[wLength - 1] == CONSONANT) &&`
			`(LastByte != 'w' && LastByte != 'x' && LastByte != 'y');`
			`break;`

			`/* The conditions below test Measure. If they fails, then`
			`* the whole condition fails. ie. there is no need to test`
			`* any other conditions. There is no need to save the result`
			`* on the stack`
			`*/`

			`case 'p': // Measure > 0`
			`if ((Stack[++StackIndex] =`
			`MeasureCalc (lpbWordType+2, wLength) > 0) == FALSE)`
			`return FALSE;`
			`break;`

			`case '2': // Measure > 1`
			`if ((Stack[++StackIndex] =`
			`MeasureCalc (lpbWordType+2, wLength) > 1) == FALSE)`
			`return FALSE;`
			`break;`

			`case '1': // Measure == 1`
			`if ((Stack[++StackIndex] =`
			`MeasureCalc (lpbWordType+2, wLength) == 1) == FALSE)`
			`return FALSE;`
			`break;`

			`/* The next conditions are operators combination */`

			`case '\|':`

			`/* OR the result of the top 2 stack entries */`

			`Stack[StackIndex-1] \|= Stack[StackIndex];`
			`StackIndex--;`
			`break;`

			`case '&':`

			`/* AND the result of the top 2 stack entries */`

			`Stack[StackIndex-1] &= Stack[StackIndex];`
			`StackIndex--;`
			`break;`

			`case '!':`

			`/* NOT the result of the top stack entry */`

			`Stack[StackIndex] = !Stack[StackIndex];`
			`break;`

			`default:`
			`return FALSE;`
			`}`
			`szCondition++;`
			`}`

			`return Stack[0];`
			`}`

			`/*************************************************************************`
			`*`
			`* @doc INTERNAL`
			`*`
			`* @func HRESULT PASCAL NEAR \| MarkType \|`
			`* Marking the type of each letter of the word to be CONSONANT or`
			`* VOWEL`
			`*`
			`* @parm LPB \| lpbWord \|`
			`* Buffer containing the word`
			`*`
			`* @parm LPB \| lpBufType \|`
			`* Buffer to contain the type of the letters`
			`*`
			`* @parm int \| wLength \|`
			`* Length of the word`
			`*`
			`*************************************************************************/`

			`HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)`
			`{`
			`for (; wLength > 0; lpBufType++, lpbWord++, wLength--)`
			`{`

			`/* Consider wildcard characters to be consonnant */`
			`if (lpbWord == '?' \|\| lpbWord == '*')`
			`{`
			`*lpBufType = CONSONANT;`
			`continue;`
			`}`

			`if (lpbWord < 'a' \|\| lpbWord > 'z')`
			`return E_FAIL;`

			`switch (CharTypeTab [*lpbWord - 'a'])`
			`{`
			`case CONSONANT:`
			`*lpBufType = CONSONANT;`
			`break;`
			`case VOWEL:`
			`*lpBufType = VOWEL;`
			`break;`
			`case MIXED:`
			`if (*(lpBufType - 1) == CONSONANT)`
			`*lpBufType = VOWEL;`
			`else`
			`*lpBufType = CONSONANT;`
			`break;`
			`}`
			`}`
			`return S_OK;`
			`}`