windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/stem.c


/*************************************************************************
*                                                                        *
*  STEM.C                                                                *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*   This module contains the functions to strip off the suffix of a word *
*   It is based on the research paper of Dr. Porter, published in        *
*             An algorithm for suffix stripping                          *
*          Program, Vol.14, no.3,pp 130-137, July 1980                   *
*                                                                        *
*  Description:                                                          *
*                                                                        *
*   The full description of the algorithm can be found in that document  *
*   Basically, the algorithm consists of:                                *
*      - Matching the suffix from a table of suffixes                    *
*      - Applies the rule that comes with the suffix                     *
*      - If the rule matches, then change the suffix to the new one      *
*                                                                        *
*  Comments:                                                             *
*                                                                        *
*     1/ There are some misconceptions about stripping the suffix.       *
*     People tend to think in terms of a super-smart algorithm that can  *
*     strip a word to its true stem. That is not necessarily the case.   *
*     For example, DIED is stripped to DI, not to DIE.                   *
*                                                                        *
*     2/ The current code is SLOW, but it is easy to understand in       *
*     terms of implementation, since it follows the algorithm            *
*     description directly. The impact at run time is negligible. At     *
*     compile time, stemming 5,000,000 words takes less than 1 hour,     *
*     which is acceptable, since a project that large requires 1-2 days  *
*     to compile.                                                        *
*                                                                        *
*     To improve the speed (up to 2 times), we can scan the suffix;      *
*     if one letter doesn't match, we can jump past all suffixes that    *
*     have this letter.                                                  *
*   WARNING: Tab setting is 4 for this file                              *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************/
#include <mvopsys.h>
#include <memory.h>
#include <mvsearch.h>
#include "common.h"


/* Letter classes. CharTypeTab holds one of these per letter of the
 * alphabet; MarkType resolves MIXED (used for 'y') to either VOWEL or
 * CONSONANT, so the word-type buffer it fills only holds those two
 */
#define VOWEL       0
#define CONSONANT   1
#define MIXED       2

/* FStem returns words of this many letters or fewer unchanged, and stops
 * applying rules as soon as the word has shrunk to this length
 */
#define	MIN_LENGTH_FOR_STEM	3

/* Rule table structure
 *
 * One entry of a rule table. Both suffix strings start with a length
 * byte followed by the suffix letters. A table is terminated by an
 * entry whose szInitSuffix is NULL; FStem reads the NextTable of that
 * entry when none of the rules of the table applied.
 */

typedef struct RULE
{
    LPB szInitSuffix;           // Suffix to look for (length byte first), NULL at end of table
    LPB szNewSuffix;            // Replacement suffix (length byte first, \377 means -1)
    LPB szCondition;            // Postfix condition string, NULL if unconditional
    short NextTable;            // Index in RuleTables of the table to jump to
} RULE, FAR *LPRULE;

/* The letters used to encode a stemming condition are:
 *
 *  '1':    Measure == 1
 *  '2':    Measure > 1
 *  'd':    Double consonant at the end (*d in the document)
 *  'o':    Form cvc, and 2nd c is not W, X or Y (*o in the document)
 *  'p':    Measure > 0
 *  's':    Remove the last consonant (used with 'd')
 *  'v':    Word contains vowels (*v* in the document)
 *  '*':    Terminated with the next letter (*S in the document)
 *  '&':    AND operation
 *  '|':    OR operation
 *  '!':    NOT operation
 * A condition is written in postfix notation, so "m=1 and *o" is
 * encoded as "1o&"
 */

RULE RuleTab0[] =
{
    "\4sses",       "\2ss",     NULL,   1,
    "\3ies",        "\1i",      NULL,   1,
    "\2ss",         "\2ss",     NULL,   1,
    "\1s",          "\0",       NULL,   1,
    NULL,           NULL,       NULL,   1,
};

RULE RuleTab1[] =
{
    "\3eed",        "\2ee",     "p",    3,
    "\2ed",         "\0",       "v",    2,
    "\3ing",        "\0",       "v",    2,
    NULL,           NULL,       NULL,   3,
};

RULE RuleTab2[] =
{
    "\2at",         "\3ate",    NULL,   3,
    "\2bl",         "\3ble",    NULL,   3,
    "\2iz",         "\3ize",    NULL,   3,

    /* The following szNewSuffix has a negative \377
     * (-1) length. It is to be used to reduce a
     * double consonant ending to single consonant
     */

    "\0",           "\377\0",   "*l*s|*z|!d&s", 3,
    "\0",           "\1e",      "1o&",  3,
    NULL,           NULL,       NULL,   3,
};

RULE RuleTab3[] =
{
    "\1y",          "\1i",      "v",    4,
    NULL,           NULL,       NULL,   4,
};

RULE RuleTab4[] =
{
    "\7ational",    "\3ate",    "p",    5,
    "\6tional",     "\4tion",   "p",    5,
    "\4enci",       "\4ence",   "p",    5,
    "\4anci",       "\4ance",   "p",    5,
    "\4izer",       "\3ize",    "p",    5,
    "\4abli",       "\4able",   "p",    5,
    "\4alli",       "\2al",     "p",    5,
    "\5entli",      "\3ent",    "p",    5,
    "\3eli",        "\1e",      "p",    5,
    "\5ousli",      "\3ous",    "p",    5,
    "\7ization",    "\3ize",    "p",    5,
    "\5ation",      "\3ate",    "p",    5,
    "\4ator",       "\3ate",    "p",    5,
    "\5alism",      "\2al",     "p",    5,
    "\7iveness",    "\3ive",    "p",    5,
    "\7fulness",    "\3ful",    "p",    5,
    "\7ousness",    "\3ous",    "p",    5,
    "\5aliti",      "\2al",     "p",    5,
    "\5iviti",      "\3ive",    "p",    5,
    "\6biliti",     "\3ble",    "p",    5,
    NULL,           NULL,       NULL,   5,
};

RULE RuleTab5[] =
{
    "\5icate",      "\2ic",     "p",    6,
    "\5ative",      "\0",       "p",    6,
    "\5alize",      "\2al",     "p",    6,
    "\5iciti",      "\2ic",     "p",    6,
    "\4ical",       "\2ic",     "p",    6,
    "\3ful",        "\0",       "p",    6,
    "\4ness",       "\0",       "p",    6,
    NULL,           NULL,       NULL,   6,
};

RULE RuleTab6[] =
{
    "\2al",         "\0",       "2",    7,
    "\4ance",       "\0",       "2",    7,
    "\4ence",       "\0",       "2",    7,
    "\2er",         "\0",       "p",    7,
    "\2ic",         "\0",       "2",    7,
    "\4able",       "\0",       "2",    7,
    "\4ible",       "\0",       "2",    7,
    "\3ant",        "\0",       "2",    7,
    "\5ement",      "\0",       "2",    7,
    "\4ment",       "\0",       "2",    7,
    "\3ent",        "\0",       "2",    7,
    "\3ion",        "\0",       "2*s*t|&",      7,
    "\2ou",         "\0",       "2",    7,
    "\3ism",        "\0",       "2",    7,
    "\3ate",        "\0",       "2",    7,
    "\3iti",        "\0",       "2",    7,
    "\3ous",        "\0",       "2",    7,
    "\3ive",        "\0",       "2",    7,
    "\3ize",        "\0",       "2",    7,
    NULL,           NULL,       NULL,   7,
};


RULE RuleTab7[] =
{
    "\1e",          "\0",       "2",    8,
    "\1e",          "\0",       "1o!&", 8,
    NULL,           NULL,       NULL,   8,
};

RULE RuleTab8[] =
{
    "\2ll",         "\1l",      "2",    9,
    "\0",           "\377\0",   "2*l&d&s",  9,
    NULL,           NULL,       NULL,   9,
};

/* Class of each lower case letter, indexed by (letter - 'a').
 * MarkType uses it to fill the word-type buffer; 'y' is the only MIXED
 * entry and is resolved there from the type of the preceding letter
 */
char CharTypeTab[] =
{
    VOWEL,      //a
    CONSONANT,  //b
    CONSONANT,  //c
    CONSONANT,  //d
    VOWEL,      //e
    CONSONANT,  //f
    CONSONANT,  //g
    CONSONANT,  //h
    VOWEL,      //i
    CONSONANT,  //j
    CONSONANT,  //k
    CONSONANT,  //l
    CONSONANT,  //m
    CONSONANT,  //n
    VOWEL,      //o
    CONSONANT,  //p
    CONSONANT,  //q
    CONSONANT,  //r
    CONSONANT,  //s
    CONSONANT,  //t
    VOWEL,      //u
    CONSONANT,  //v
    CONSONANT,  //w
    CONSONANT,  //x
    MIXED,      //y, consonant, but may be vowel if after consonant
    CONSONANT,  //z
};

/* All the rule tables, indexed by the NextTable field of the rules.
 * FStem starts with entry 0 and stops when it reaches the NULL entry
 */
LPRULE RuleTables[] =
{
    RuleTab0,
    RuleTab1,
    RuleTab2,
    RuleTab3,
    RuleTab4,
    RuleTab5,
    RuleTab6,
    RuleTab7,
    RuleTab8,
    NULL,       // Index 9: end of stemming
};

/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *  All of them should be declared near
 *************************************************************************/
/* Measure (number of vowel-consonant pairs) of a letter-type buffer */
int PRIVATE PASCAL NEAR MeasureCalc (LPB, int);
/* Evaluate the postfix condition string of a rule against a stem */
int PRIVATE PASCAL NEAR ConditionMet (LPB, LPB, LPB, int);
/* NOTE(review): SuffixMatch is neither defined nor called in the part
 * of the file reviewed here - confirm whether this prototype is stale
 */
int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
/* Mark each letter of a word as VOWEL or CONSONANT */
HRESULT PRIVATE PASCAL NEAR MarkType (LPB, LPB, int);


/*************************************************************************
 *
 *  @doc    API INDEX RETRIEVAL
 *
 *  @func   HRESULT PASCAL FAR | FStem |
 *      This function will strip the suffix from a word, ie, "stem" it.
 *      The word is run through the rule tables listed in RuleTables,
 *      starting with the first one. In each table the rules are tried
 *      in order; the first rule whose suffix matches the end of the
 *      word and whose condition is met is applied. That rule (or the
 *      end-of-table entry when no rule applied) names the next table.
 *
 *  @parm   LPB | lpbStemWord |
 *      Buffer to contain the stemmed word. The word (2-byte length
 *      included) is copied into it and stemmed there, so it must have
 *      room for the length, the letters and one terminating 0 byte.
 *      It may be the same buffer as lpbWord
 *
 *  @parm   LPB | lpbWord |
 *      Word to be stemmed. This is a 2-byte length preceded string
 *
 *  @rdesc  S_OK if succeeded, E_INVALIDARG if a null argument is
 *      passed, or E_WORDTOOLONG if the word has CB_MAX_WORD_LEN letters
 *      or more
 *
 *  @comm   Only words made of the lower case letters 'a' to 'z' and of
 *      the wildcard characters '?' and '*' are stemmed. Words of
 *      MIN_LENGTH_FOR_STEM letters or fewer, and words containing any
 *      other character (upper case letters included), are copied
 *      unchanged and S_OK is returned. In those two cases the
 *      terminating 0 byte is not written.
 *
 *************************************************************************/

PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)
{
    register int wLength;   // Current length of the word
    register int i;         // Scratch variable
    LPRULE lpRuleTab;       // Pointer to rule table
    LPRULE lpRule;          // Pointer to rule
    int wLengthSaved;
    int wNewSuffixLength;   // This must be signed!
    int wInitSuffixLength;

    /* The letter types are stored from offset 2 on, parallel to the
     * letters of the word. The longest accepted word has
     * CB_MAX_WORD_LEN - 1 letters, so the buffer needs more than
     * CB_MAX_WORD_LEN bytes
     */
    char lpbWordType [CB_MAX_WORD_LEN + 2];
    LPB szInitSuffix;
    LPB szNewSuffix;
    int TableIndex;         // For debugging purpose only
    int RuleIndex;          // For debugging purpose only
    LPB lpbTmp;

    if (lpbWord == NULL || lpbStemWord == NULL)
        return E_INVALIDARG;

    /* The length is also copied in front of the letter types. MarkType
     * looks at the byte preceding the first letter type when the word
     * starts with a 'y'
     */
    wLength = (*(LPW)lpbWordType = *((LPW)lpbWord));
    if (wLength >= CB_MAX_WORD_LEN)
        return(E_WORDTOOLONG);

    /* Copy the word over, unless it is to be stemmed in place */
    if (lpbStemWord != lpbWord)
        MEMCPY (lpbStemWord, lpbWord, wLength + 2);

    /* Don't do any stemming for words <= 3 bytes */
    if (wLength <= MIN_LENGTH_FOR_STEM)
        return S_OK;

    /* Mark the type of each letter to be consonant or vowel */
    if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)
    {
        /* We got some non alphabetic characters. Just return */
        return S_OK;
    }

    /* Traverse all the tables and check for stemming conditions */

    for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab != NULL;)
    {

        /* Check for each rule, up to the end-of-table entry */

        for (RuleIndex = 0, lpRule = lpRuleTab;
            (szInitSuffix = lpRule->szInitSuffix) != NULL;
            lpRule++, RuleIndex++)
        {

            szNewSuffix = lpRule->szNewSuffix;

            /* The first byte of a suffix is its length. The length of
             * the new suffix may be -1 (\377), so the byte must be read
             * as a signed char; a plain char may be unsigned
             */
            wNewSuffixLength = (signed char)*szNewSuffix++;
            wInitSuffixLength = (signed char)*szInitSuffix++;

            /* The suffix can't match if it is longer than the word */
            if (wLength < wInitSuffixLength)
                continue;

            /* Compare the suffix with the end of the word */

            lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;

            for (i = wInitSuffixLength;
                i > 0 && (*lpbTmp == *szInitSuffix);
                i--, lpbTmp++, szInitSuffix++);

            if (i != 0) // String comparison fails
                continue;

            /* Save the word length */
            wLengthSaved = wLength;

            /* Update word length since we don't include the suffix
             * length in our computation
             */
            wLength -= wInitSuffixLength;

            /* Now check the stemming condition */

            if (!ConditionMet (lpbStemWord, lpbWordType,
                lpRule->szCondition, wLength))
            {
                /* Rule doesn't apply, Restore the word length and try
                 * the next rule of the table
                 */
                wLength = wLengthSaved;
                continue;
            }

            /* Rule applies, change to the new suffix */

            if (wNewSuffixLength > 0)
            {
                MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,
                    wNewSuffixLength);

                /* Update the word type */

                MarkType (szNewSuffix,
                    lpbWordType + wLength + 2, wNewSuffixLength);
            }

            /* Update the word length
             * The check for wLength is necessary since we don't
             * want to strip everything
             */

            if (wLength + wNewSuffixLength > 0)
                *(LPW)lpbStemWord = (wLength += wNewSuffixLength);

            /* The word got too short to be stemmed any further */
            if (wLength <= MIN_LENGTH_FOR_STEM)
                goto Done;

            /* Only one rule is applied per table */
            break;
        }

        /* Go to the next table. lpRule is either the rule that was
         * applied, or the end-of-table entry
         */
        lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];
    }

Done:
    /* Terminate the stemmed word with a 0 byte */
    lpbStemWord[*((LPW)lpbStemWord)+2] = 0;
    return S_OK;
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   int PASCAL NEAR | MeasureCalc |
 *      Calculate the measure of a word. A word has the form
 *      [C](VC)m[V], where C is a run of consonants, V a run of vowels,
 *      and the leading C and the trailing V are optional. The measure
 *      m is the number of (VC) pairs. Example:
 *          architect: m = 3 (arch, it, ect)
 *          convention: m = 3 (onv, ent, ion)
 *          lie: m = 0, since the leading consonant and the trailing
 *              vowels don't count
 *
 *  @parm   LPB | lpbWordType |
 *      Buffer containing the type (VOWEL or CONSONANT) of each letter.
 *      The pointer is not checked, this is an internal function
 *
 *  @parm   int | wLength |
 *      The number of letter types to look at
 *
 *  @rdesc  Return the measure of the word
 *
 *************************************************************************/

int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)
{
    register int cPairs = 0;    /* Number of VC pairs found so far */
    int iLetter = 0;            /* Index of the letter being looked at */

    /* The leading consonants don't count */
    while (iLetter < wLength && lpbWordType[iLetter] == CONSONANT)
        iLetter++;

    /* Each pass consumes one run of vowels and the run of consonants
     * following it, if any
     */
    while (iLetter < wLength)
    {
        while (iLetter < wLength && lpbWordType[iLetter] == VOWEL)
            iLetter++;

        /* Trailing vowels don't make a pair */
        if (iLetter >= wLength)
            break;

        cPairs++;

        while (iLetter < wLength && lpbWordType[iLetter] == CONSONANT)
            iLetter++;
    }

    return cPairs;
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   int PASCAL NEAR | ConditionMet |
 *      This function checks the condition to be met by a particular
 *      suffix. The condition is a string of one-letter tests, actions
 *      and operators in postfix form, evaluated on a small stack.
 *
 *  @parm   LPB | lpbWord |
 *      Buffer containing the word to be stemmed. This is a 2-byte
 *      prefixed pascal string
 *
 *  @parm   LPB | lpbWordType |
 *      Buffer containing the type of each letter of the word. This
 *      is a parallel buffer
 *
 *  @parm   LPB | szCondition |
 *      Condition in postfix form. NULL or an empty string means that
 *      there is no condition
 *
 *  @parm   int | wLength |
 *      Length of the stem, ie. of the word without the suffix
 *
 *  @rdesc  TRUE, if the condition is met, FALSE otherwise. FALSE is
 *      also returned for a malformed condition: unknown letter, '*'
 *      without a letter after it, operator without enough operands,
 *      or more pending operands than the stack can hold
 *
 *  @comm   The 's' action changes the word: it cuts the last letter of
 *      the stem and stores the new length in both buffers. It does so
 *      as soon as it is reached, which is why it comes last in the
 *      condition strings.
 *
 *************************************************************************/

int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
    LPB szCondition, int wLength)
{
    int StackIndex;         // Index of the top entry, -1 if empty
    int Stack[4];
    int fPush;              // TRUE if the current letter is a test
    int fValue;             // Result of the current test
    int wScan;
    int LastByte;
    LPB lpbTmp;
    LPB lpbTmpType;

    /* No condition: the rule always applies */
    if (szCondition == NULL || *szCondition == 0)
        return TRUE;

    /* Initialize variables
     * Note: The original codes are written for a 1-byte length preceded
     * string. The new format is 2-byte preceded string. To minimize the
     * change, lpbTmp is used, and points to the 2nd byte, so that
     * lpbTmp[n] is the n-th letter of the word
     */

    StackIndex = -1;
    lpbTmp = lpbWord + 1;
    lpbTmpType = lpbWordType + 1;
    LastByte = lpbTmp[wLength];

    while (*szCondition)
    {
        fPush = TRUE;
        fValue = FALSE;

        switch (*szCondition)
        {
            case '*':   // *S in the document

                /* Check to see if the stem ends with the next letter.
                 * The letter must be there
                 */

                if (*(++szCondition) == 0)
                    return FALSE;

                fValue = (LastByte == *szCondition);
                break;

            case 'd':   // *d in the document

                /* Check to see if the stem ends with a double consonant */

                fValue = (wLength > 2 &&
                    LastByte == lpbTmp[wLength - 1] &&
                    lpbTmpType[wLength] == CONSONANT);
                break;

            case 's':   // Remove the last consonant

                /* This is an action, nothing is pushed. It is only done
                 * if the condition evaluated so far is true
                 */

                fPush = FALSE;

                if (StackIndex >= 0 && Stack[0] && wLength > 0)
                {
                    lpbTmp[wLength] = 0;
                    wLength --;
                    *(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength;

                    /* The stem has a new last letter */
                    LastByte = lpbTmp[wLength];
                }
                break;

            case 'v':   // *v* in the document

                /* Check to see if the stem has a vowel, scanning it
                 * from the last letter to the first one
                 */

                for (wScan = wLength; wScan > 0 &&
                    lpbTmpType[wScan] != VOWEL; wScan--);

                fValue = wScan > 0;
                break;

            case 'o':
                /* *o in the document, ie.
                    - The stem ends with the form cvc
                    - The second c is not W, X, Y
                    lpbWordType is used directly here, the type of
                    the last letter is at wLength + 1
                 */
                fValue = (wLength >= 3) &&
                    (lpbWordType[wLength + 1] == CONSONANT) &&
                    (lpbWordType[wLength] == VOWEL) &&
                    (lpbWordType[wLength - 1] == CONSONANT) &&
                    (LastByte != 'w' && LastByte != 'x' && LastByte != 'y');
                break;

            /* The conditions below test Measure. If they fail, then
             * the whole condition fails, ie. there is no need to test
             * any other conditions
             */

            case 'p':   // Measure > 0
                fValue = MeasureCalc (lpbWordType+2, wLength) > 0;
                if (!fValue)
                    return FALSE;
                break;

            case '2':   // Measure > 1
                fValue = MeasureCalc (lpbWordType+2, wLength) > 1;
                if (!fValue)
                    return FALSE;
                break;

            case '1':   // Measure == 1
                fValue = MeasureCalc (lpbWordType+2, wLength) == 1;
                if (!fValue)
                    return FALSE;
                break;

            /* The next conditions are operators combination */

            case '|':

                /* OR the result of the top 2 stack entries */

                fPush = FALSE;
                if (StackIndex < 1)
                    return FALSE;

                Stack[StackIndex-1] |= Stack[StackIndex];
                StackIndex--;
                break;

            case '&':

                /* AND the result of the top 2 stack entries */

                fPush = FALSE;
                if (StackIndex < 1)
                    return FALSE;

                Stack[StackIndex-1] &= Stack[StackIndex];
                StackIndex--;
                break;

            case '!':

                /* NOT the result of the top stack entry */

                fPush = FALSE;
                if (StackIndex < 0)
                    return FALSE;

                Stack[StackIndex] = !Stack[StackIndex];
                break;

            default:
                return FALSE;
        }

        /* Save the result of a test on the stack, if there is room */

        if (fPush)
        {
            if (StackIndex + 1 >= (int)(sizeof (Stack) / sizeof (Stack[0])))
                return FALSE;

            Stack[++StackIndex] = fValue;
        }

        szCondition++;
    }

    /* The result of a well formed condition is the only entry left */
    return (StackIndex >= 0) ? Stack[0] : FALSE;
}

/*************************************************************************
 *
 *  @doc    INTERNAL
 *
 *  @func   HRESULT PASCAL NEAR | MarkType |
 *      Mark the type of each letter of the word to be CONSONANT or
 *      VOWEL. The wildcard characters '?' and '*' count as consonants.
 *      A letter that CharTypeTab lists as MIXED is a vowel when the
 *      type in front of it is CONSONANT, a consonant otherwise.
 *
 *  @parm   LPB | lpbWord |
 *      Buffer containing the letters of the word (no length in front)
 *
 *  @parm   LPB | lpBufType |
 *      Buffer to contain the type of the letters. The byte preceding
 *      it is read when the first letter is MIXED, so it must exist
 *
 *  @parm   int | wLength |
 *      Length of the word
 *
 *  @rdesc  S_OK if all the letters are marked, E_FAIL as soon as a
 *      character is neither a wildcard nor in the range 'a' to 'z'.
 *      The letters in front of that character are already marked
 *
 *************************************************************************/

HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)
{
    int iLetter;        /* Index of the letter being marked */
    int bLetter;        /* The letter itself */
    int bKind;          /* Its entry in CharTypeTab */

    for (iLetter = 0; iLetter < wLength; iLetter++)
    {
        bLetter = lpbWord[iLetter];

        if (bLetter == '?' || bLetter == '*')
        {
            /* Consider wildcard characters to be consonant */
            lpBufType[iLetter] = CONSONANT;
        }
        else if (bLetter < 'a' || bLetter > 'z')
        {
            /* Not something that can be stemmed */
            return E_FAIL;
        }
        else
        {
            bKind = CharTypeTab [bLetter - 'a'];

            if (bKind == MIXED)
            {
                /* Depends on the type of the preceding letter */
                lpBufType[iLetter] =
                    (lpBufType[iLetter - 1] == CONSONANT) ?
                    VOWEL : CONSONANT;
            }
            else if (bKind == VOWEL)
            {
                lpBufType[iLetter] = VOWEL;
            }
            else if (bKind == CONSONANT)
            {
                lpBufType[iLetter] = CONSONANT;
            }
        }
    }

    return S_OK;
}