windows-nt/Source/XPSP1/NT/enduser/stuff/itircl/fts/search/stem.c

697 lines
22 KiB
C
Raw Permalink Normal View History

2020-09-26 03:20:57 -05:00
/*************************************************************************
* *
* STEM.C *
* *
* Copyright (C) Microsoft Corporation 1990-1994 *
* All Rights reserved. *
* *
**************************************************************************
* *
* Module Intent *
* This module contains the functions to strip off the suffix of a word *
* It is based on the research paper of Dr. Porter, pulished in *
* An algorithm for suffix stripping *
* Program, Vol.14, no.3,pp 130-137, July 1980 *
* *
* Description: *
* *
* The full description of the algorithm can be found in that document *
* Basically, the algorithm consists of: *
* - Matching the suffix from a table of suffixes *
* - Applies the rule that comes with the suffix *
* - If the rule matches, then change the suffix to the new one *
* *
* Comments: *
* *
* 1/ There are some misconceptions about stripping the suffix *
* People are thinking in term of super-smart algorithm that can *
* strip a word to its stem. The fact is that it is not necessarily *
* true. For example, DIED is strippe to DI, but not DIE. *
* *
* 2/ The current code is SLOW, but it easy to understand in term *
* of implementation, since it is straigthforward from the algorithm *
* description. The impact on runtime is nothing. On compiled time *
* stemming 5,000,000 words will take less than 1 hour, which is *
* acceptable, since a project that large requires 1-2 days to *
* compile. *
* *
* To improve the speed (up to 2 times), we can scan the suffix *
* if one letter doesn't match we can jump pass all stem that have *
* this letter *
* WARNING: Tab setting is 4 for this file *
* *
**************************************************************************
* *
* Current Owner: BinhN *
* *
**************************************************************************/
#include <mvopsys.h>
#include <memory.h>
#include <mvsearch.h>
#include "common.h"
#define VOWEL 0
#define CONSONANT 1
#define MIXED 2
#define MIN_LENGTH_FOR_STEM 3
/* Rule table structure */
typedef struct RULE
{
LPB szInitSuffix; // Initial suffix
LPB szNewSuffix; // New suffix
LPB szCondition; // Stemming condition
short NextTable; // Next table to jump to
} RULE, FAR *LPRULE;
/* The conventional letter used for the stemming condition are:
*
* '1': Measure == 1
* '2': Measure > 1
* 'd': Double consonant at the end (*d in the document)
* 'o': Form cvc , and 2nd c is not W, X or Y (*o in the document)
* 'p': Measure > 0
* 's': Remove the last consonant (used with 'd')
* 'v': Word contains vowels (*v* in the document)
* '*': Terminated with the next letter (*S in the document)
* '&': AND operation
* '|': OR operation
* '!': NOT operation
* The rule operation is based on a postfix notation, so "m=1 and *o*" is
* described as "1o&"
*/
RULE RuleTab0[] =
{
"\4sses", "\2ss", NULL, 1,
"\3ies", "\1i", NULL, 1,
"\2ss", "\2ss", NULL, 1,
"\1s", "\0", NULL, 1,
NULL, NULL, NULL, 1,
};
RULE RuleTab1[] =
{
"\3eed", "\2ee", "p", 3,
"\2ed", "\0", "v", 2,
"\3ing", "\0", "v", 2,
NULL, NULL, NULL, 3,
};
RULE RuleTab2[] =
{
"\2at", "\3ate", NULL, 3,
"\2bl", "\3ble", NULL, 3,
"\2iz", "\3ize", NULL, 3,
/* The following szNewSuffix has a negative \377
* (-1) length. It is to be used to reduce a
* double consonant ending to single consonant
*/
"\0", "\377\0", "*l*s|*z|!d&s", 3,
"\0", "\1e", "1o&", 3,
NULL, NULL, NULL, 3,
};
RULE RuleTab3[] =
{
"\1y", "\1i", "v", 4,
NULL, NULL, NULL, 4,
};
RULE RuleTab4[] =
{
"\7ational", "\3ate", "p", 5,
"\6tional", "\4tion", "p", 5,
"\4enci", "\4ence", "p", 5,
"\4anci", "\4ance", "p", 5,
"\4izer", "\3ize", "p", 5,
"\4abli", "\4able", "p", 5,
"\4alli", "\2al", "p", 5,
"\5entli", "\3ent", "p", 5,
"\3eli", "\1e", "p", 5,
"\5ousli", "\3ous", "p", 5,
"\7ization", "\3ize", "p", 5,
"\5ation", "\3ate", "p", 5,
"\4ator", "\3ate", "p", 5,
"\5alism", "\2al", "p", 5,
"\7iveness", "\3ive", "p", 5,
"\7fulness", "\3ful", "p", 5,
"\7ousness", "\3ous", "p", 5,
"\5aliti", "\2al", "p", 5,
"\5iviti", "\3ive", "p", 5,
"\6biliti", "\3ble", "p", 5,
NULL, NULL, NULL, 5,
};
RULE RuleTab5[] =
{
"\5icate", "\2ic", "p", 6,
"\5ative", "\0", "p", 6,
"\5alize", "\2al", "p", 6,
"\5iciti", "\2ic", "p", 6,
"\4ical", "\2ic", "p", 6,
"\3ful", "\0", "p", 6,
"\4ness", "\0", "p", 6,
NULL, NULL, NULL, 6,
};
RULE RuleTab6[] =
{
"\2al", "\0", "2", 7,
"\4ance", "\0", "2", 7,
"\4ence", "\0", "2", 7,
"\2er", "\0", "p", 7,
"\2ic", "\0", "2", 7,
"\4able", "\0", "2", 7,
"\4ible", "\0", "2", 7,
"\3ant", "\0", "2", 7,
"\5ement", "\0", "2", 7,
"\4ment", "\0", "2", 7,
"\3ent", "\0", "2", 7,
"\3ion", "\0", "2*s*t|&", 7,
"\2ou", "\0", "2", 7,
"\3ism", "\0", "2", 7,
"\3ate", "\0", "2", 7,
"\3iti", "\0", "2", 7,
"\3ous", "\0", "2", 7,
"\3ive", "\0", "2", 7,
"\3ize", "\0", "2", 7,
NULL, NULL, NULL, 7,
};
RULE RuleTab7[] =
{
"\1e", "\0", "2", 8,
"\1e", "\0", "1o!&", 8,
NULL, NULL, NULL, 8,
};
RULE RuleTab8[] =
{
"\2ll", "\1l", "2", 9,
"\0", "\377\0", "2*l&d&s", 9,
NULL, NULL, NULL, 9,
};
char CharTypeTab[] =
{
VOWEL, //a
CONSONANT, //b
CONSONANT, //c
CONSONANT, //d
VOWEL, //e
CONSONANT, //f
CONSONANT, //g
CONSONANT, //h
VOWEL, //i
CONSONANT, //j
CONSONANT, //k
CONSONANT, //l
CONSONANT, //m
CONSONANT, //n
VOWEL, //o
CONSONANT, //p
CONSONANT, //q
CONSONANT, //r
CONSONANT, //s
CONSONANT, //t
VOWEL, //u
CONSONANT, //v
CONSONANT, //w
CONSONANT, //x
MIXED, //y, consonant, but may be vowel if after consonant
CONSONANT, //z
};
LPRULE RuleTables[] =
{
RuleTab0,
RuleTab1,
RuleTab2,
RuleTab3,
RuleTab4,
RuleTab5,
RuleTab6,
RuleTab7,
RuleTab8,
NULL,
};
/*************************************************************************
*
* INTERNAL PRIVATE FUNCTIONS
* All of them should be declared near
*************************************************************************/
int PRIVATE PASCAL NEAR MeasureCalc (LPB, int);
int PRIVATE PASCAL NEAR ConditionMet (LPB, LPB, LPB, int);
int PRIVATE PASCAL NEAR SuffixMatch (LPB lpbWord, LPB lpSuffix);
HRESULT PRIVATE PASCAL NEAR MarkType (LPB, LPB, int);
/*************************************************************************
*
* @doc API INDEX RETRIEVAL
*
* @func HRESULT PASCAL FAR | FStem |
* This function will strip the suffix from a word, ie, "stem" it
*
* @parm LPB | lpbStemWord |
* Buffer to contain the stemmed word
*
* @parm LPB | lpbWord |
* Word to be stemmed
*
* @rdesc S_OK if succeeded, or E_INVALIDARG if the null argument is
* passed
*
* @comm The word passed must have all the letters in lower case for
* The function to work with. WARNING: There is no checking about
* case, so thing can go wrong if the word contains upper case letter
* or non alphabetic letter.
*
*************************************************************************/
PUBLIC HRESULT PASCAL FAR EXPORT_API FStem (LPB lpbStemWord, LPB lpbWord)
{
register int wLength; // Length of the word
register int i; // Scratch variable
LPRULE lpRuleTab; // Pointer to rule table
LPRULE lpRule; // Pointer to rule
int wLengthSaved;
int wNewSuffixLength; // This must be signed!
int wInitSuffixLength;
char lpbWordType [CB_MAX_WORD_LEN];
LPB szInitSuffix;
LPB szNewSuffix;
int TableIndex; // For debugging purpose only
int RuleIndex; // For debugging purpose only
LPB lpbTmp;
if (lpbWord == NULL)
return E_INVALIDARG;
wLength = (*(LPW)lpbWordType = *((LPW)lpbWord));
if (wLength >= CB_MAX_WORD_LEN)
return(E_WORDTOOLONG);
/* Copy the word over */
MEMCPY (lpbStemWord, lpbWord, wLength + 2);
/* Don't do any stemming for words <= 3 bytes */
if (wLength <= MIN_LENGTH_FOR_STEM)
return S_OK;
/* Mark the type of each letter to be consonant or vowel */
if (MarkType (lpbStemWord+2, lpbWordType+2, wLength) != S_OK)
{
/* We got some non alphabetic characters. Just return */
return S_OK;
}
/* Traverse all the tables and check for stemming conditions */
for (TableIndex = 0, lpRuleTab = RuleTables[0]; lpRuleTab;)
{
/* Check for each rule */
for (RuleIndex = 0, lpRule = lpRuleTab;
szInitSuffix = lpRule->szInitSuffix; lpRule++, RuleIndex++)
{
szNewSuffix = lpRule->szNewSuffix;
/* The casting is needed to make wNewSuffixLength signed */
wNewSuffixLength = (char)*szNewSuffix++;
wInitSuffixLength = (char)*szInitSuffix++;
/* Check for condition match */
if (wLength >= wInitSuffixLength)
{
lpbTmp = lpbStemWord + wLength + 2 - wInitSuffixLength;
/* Compare the suffixes */
for (i = wInitSuffixLength;
i > 0 && (*lpbTmp == *szInitSuffix);
i--, lpbTmp++, szInitSuffix++);
/* Restore szInitSuffix */
szInitSuffix = lpRule->szInitSuffix;
if (i != 0) // String comparison fails
continue;
/* Save the word length */
wLengthSaved = wLength;
/* Update word length since we don't include the suffix
* length in our computation
*/
wLength -= wInitSuffixLength;
/* Now check the stemming condition */
if (ConditionMet (lpbStemWord, lpbWordType,
lpRule->szCondition, wLength))
{
/* Rule applies, change to the new suffix */
if (wNewSuffixLength > 0)
{
MEMCPY (&lpbStemWord[wLength+2], szNewSuffix,
wNewSuffixLength);
/* Update the word type */
MarkType (szNewSuffix,
lpbWordType + wLength + 2, wNewSuffixLength);
}
/* Update the word length
* The check for wLength is necessary since we don't
* want to strip evething
*/
if (wLength + wNewSuffixLength > 0)
*(LPW)lpbStemWord = (wLength += wNewSuffixLength);
if (wLength <= MIN_LENGTH_FOR_STEM)
goto Done;
break;
}
else
{
/* Rule doesn't apply, Restore the word length */
wLength = wLengthSaved;
}
}
}
/* Go to the next table */
lpRuleTab = RuleTables [TableIndex = lpRule->NextTable];
}
Done:
lpbStemWord[*((LPW)lpbStemWord)+2] = 0;
return S_OK;
}
/*************************************************************************
*
* @doc INTERNAL
*
* @func int PASCAL NEAR | MeasureCalc |
* Calculate the measure of a word. The measure is defined as
* the pair (VC), where V is the vowels, and C consonants. A word
* is described as [C](VC)m[V], where the first C and the last V are
* optional. m is the measure of the word (or part of word without
* the suffix). Example:
* architect: m = 3 (arch, it, ect)
* convention: m = 3 (onv, ent, ion)
* lie: m = 0, since the first consonant, and the last vowels
* don't count
*
* @parm LPB | lpbWordType |
* Buffer containing word type
*
* @parm int | wLength |
* The length of the word
*
* @rdesc Return the measure of the word
*
*************************************************************************/
int PRIVATE PASCAL NEAR MeasureCalc (LPB lpbWordType, register int wLength)
{
register int cMeasure;
#if 0
/* Safety chck
* IFdef out for speed. This is a internal function
*/
if (lpbWordType == NULL)
return 0;
#endif
/* Initialize the word measure */
cMeasure = 0;
/* Skip the beginning consonants */
for (;wLength > 0 && *lpbWordType == CONSONANT; wLength--, lpbWordType++);
/* Get the vowel/consonant pairs */
while (wLength > 0)
{
/* Get all the vowels */
for (; wLength > 0 && *lpbWordType == VOWEL; wLength--, lpbWordType++);
if (wLength > 0)
{
cMeasure ++;
/* Get all the consonants */
for (; wLength > 0 && *lpbWordType == CONSONANT;
wLength--, lpbWordType++);
}
}
return cMeasure;
}
/*************************************************************************
*
* @doc INTERNAL
*
* @func int PASCAL NEAR | ConditionMet |
* This fuction check the condition to be met by a particular
* suffix.
*
* @parm LPB | lpbWord |
* Buffer contains the word to be stemmed> This is a 2-byte prefixed
* pascal string
*
* @parm LPB | lpbWordType |
* Buffer containing the type of each letter of the word. This
* is a parallel buffer
*
* @parm LPB | szCondition |
* Condtion in postfix form
*
* @parm int | wLength |
* Length of the word
*
* @rdesc TRUE, if the condition is met, FALSE otherwise
*
*************************************************************************/
int PRIVATE PASCAL NEAR ConditionMet (LPB lpbWord, LPB lpbWordType,
LPB szCondition, int wLength)
{
int StackIndex;
int Stack[4];
int wLengthSaved;
int LastByte;
LPB lpbTmp;
LPB lpbTmpType;
if (szCondition == NULL)
return TRUE;
/* Initialize variables
* Note: The original codes are written for a 1-byte length preceded
* string. The new format is 2-byte preceded string. To minimize the
* change, lpbTmp is used, and points to the 2nd byte
*/
StackIndex = -1;
lpbTmp = lpbWord + 1;
lpbTmpType = lpbWordType + 1;
LastByte = lpbTmp[wLength];
while (*szCondition)
{
switch (*szCondition)
{
case '*': // *S in the document
/* Check to see if the stem ends with the next letter */
Stack[++StackIndex] =
(LastByte == *(++szCondition));
break;
case 'd': // *d in the document
/* Check to see if the stem ends with a double consonant */
Stack[++StackIndex] = (wLength > 2 &&
LastByte == lpbTmp[wLength - 1] &&
lpbTmpType[wLength] == CONSONANT);
break;
case 's': // Remove the last consonant
if (Stack[0])
{
lpbTmp[wLength] = 0;
wLength --;
*(LPW)lpbWordType = *(LPW)lpbWord = (WORD) wLength;
}
break;
case 'v': // *v* in the document
/* Check to see if the word has a vowel */
wLengthSaved = wLength; /* Save the length */
for (; wLength &&
lpbTmpType[wLength] != VOWEL; wLength--);
Stack[++StackIndex] = wLength > 0;
/* Restore the word length */
wLength = wLengthSaved;
break;
case 'o':
/* *o in the document, ie.
- The word ends with the form cvc
- The second c is not W, X, Y
The +2 is for skipping the word length
*/
Stack[++StackIndex] = (wLength >= 3) &&
(lpbWordType[wLength + 1] == CONSONANT) &&
(lpbWordType[wLength] == VOWEL) &&
(lpbWordType[wLength - 1] == CONSONANT) &&
(LastByte != 'w' && LastByte != 'x' && LastByte != 'y');
break;
/* The conditions below test Measure. If they fails, then
* the whole condition fails. ie. there is no need to test
* any other conditions. There is no need to save the result
* on the stack
*/
case 'p': // Measure > 0
if ((Stack[++StackIndex] =
MeasureCalc (lpbWordType+2, wLength) > 0) == FALSE)
return FALSE;
break;
case '2': // Measure > 1
if ((Stack[++StackIndex] =
MeasureCalc (lpbWordType+2, wLength) > 1) == FALSE)
return FALSE;
break;
case '1': // Measure == 1
if ((Stack[++StackIndex] =
MeasureCalc (lpbWordType+2, wLength) == 1) == FALSE)
return FALSE;
break;
/* The next conditions are operators combination */
case '|':
/* OR the result of the top 2 stack entries */
Stack[StackIndex-1] |= Stack[StackIndex];
StackIndex--;
break;
case '&':
/* AND the result of the top 2 stack entries */
Stack[StackIndex-1] &= Stack[StackIndex];
StackIndex--;
break;
case '!':
/* NOT the result of the top stack entry */
Stack[StackIndex] = !Stack[StackIndex];
break;
default:
return FALSE;
}
szCondition++;
}
return Stack[0];
}
/*************************************************************************
*
* @doc INTERNAL
*
* @func HRESULT PASCAL NEAR | MarkType |
* Marking the type of each letter of the word to be CONSONANT or
* VOWEL
*
* @parm LPB | lpbWord |
* Buffer containing the word
*
* @parm LPB | lpBufType |
* Buffer to contain the type of the letters
*
* @parm int | wLength |
* Length of the word
*
*************************************************************************/
HRESULT PRIVATE PASCAL NEAR MarkType (LPB lpbWord, LPB lpBufType, int wLength)
{
for (; wLength > 0; lpBufType++, lpbWord++, wLength--)
{
/* Consider wildcard characters to be consonnant */
if (*lpbWord == '?' || *lpbWord == '*')
{
*lpBufType = CONSONANT;
continue;
}
if (*lpbWord < 'a' || *lpbWord > 'z')
return E_FAIL;
switch (CharTypeTab [*lpbWord - 'a'])
{
case CONSONANT:
*lpBufType = CONSONANT;
break;
case VOWEL:
*lpBufType = VOWEL;
break;
case MIXED:
if (*(lpBufType - 1) == CONSONANT)
*lpBufType = VOWEL;
else
*lpBufType = CONSONANT;
break;
}
}
return S_OK;
}