904 lines
28 KiB
C
904 lines
28 KiB
C
|
/*************************************************************************
|
||
|
* *
|
||
|
* STOP.C *
|
||
|
* *
|
||
|
* Copyright (C) Microsoft Corporation 1990-1994 *
|
||
|
* All Rights reserved. *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Module Intent *
|
||
|
* Stop list indexing and retrieval *
|
||
|
* *
|
||
|
**************************************************************************
|
||
|
* *
|
||
|
* Written By : Binh Nguyen *
|
||
|
* Current Owner: Binh Nguyen *
|
||
|
* *
|
||
|
*************************************************************************/
|
||
|
#include <mvopsys.h>
|
||
|
#include <orkin.h>
|
||
|
#include <mem.h>
|
||
|
#include <memory.h>
|
||
|
#include <io.h>
|
||
|
#include <mvsearch.h>
|
||
|
#include "common.h"
|
||
|
|
||
|
#ifdef _DEBUG
|
||
|
static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/
|
||
|
#endif
|
||
|
|
||
|
#define cbSTOP_BUF ((CB)512) // Number of bytes read at a time
|
||
|
// from the stop-word file.
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* API FUNCTIONS
|
||
|
* Those functions should be exported in a .DEF file
|
||
|
*************************************************************************/
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListAddWord(LPSIPB, LST);
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListIndexLoad (HFPB, LPSIPB, LSZ);
|
||
|
PUBLIC LPSIPB EXPORT_API FAR PASCAL MVStopListInitiate(WORD, PHRESULT);
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVStopListDispose(LPSIPB);
|
||
|
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListLoad(HFPB, LPSIPB, LSZ,
|
||
|
BREAKER_FUNC, LPV);
|
||
|
PUBLIC HRESULT EXPORT_API PASCAL FAR MVStopFileBuild (HFPB, LPSIPB, LSZ);
|
||
|
PUBLIC LPCHAIN EXPORT_API FAR PASCAL MVStopListFind(_LPSIPB lpsipb, LST lstWord);
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* INTERNAL PRIVATE FUNCTIONS
|
||
|
* All of them should be declared near
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE WORD NEAR PASCAL GetHashKey (WORD, LST);
|
||
|
|
||
|
/*************************************************************************
|
||
|
*
|
||
|
* INTERNAL PUBLIC FUNCTIONS
|
||
|
* All of them should be declared far, and included in some include file
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC HRESULT FAR PASCAL FStopCallback(LST, LST, LFO, LPV);
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func LPSIPB FAR PASCAL | MVStopListInitiate |
|
||
|
* Create and initiate a stop-word information structure
|
||
|
*
|
||
|
* @parm PHRESULT | phr |
|
||
|
* Pointer to error buffer.
|
||
|
*
|
||
|
* @parm WORD | wTabSize |
|
||
|
* Table size in DWORD. The process of stop word checking will
|
||
|
* be faster with larger values of dwTabSize.
|
||
|
*
|
||
|
* @rdesc the pointer to the stop-list structure if succeeded,
|
||
|
* NULL if failed. The error buffer will contain descriptions about
|
||
|
* the cause of the failure
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC LPSIPB EXPORT_API FAR PASCAL MVStopListInitiate(WORD wTabSize,
|
||
|
PHRESULT phr)
|
||
|
{
|
||
|
_LPSIPB lpsipb;
|
||
|
|
||
|
if (wTabSize < HASH_SIZE)
|
||
|
wTabSize = HASH_SIZE;
|
||
|
|
||
|
/* Allocate a StopInfo structure */
|
||
|
if ((lpsipb = (_LPSIPB)GLOBALLOCKEDSTRUCTMEMALLOC(sizeof(SIPB) +
|
||
|
wTabSize * sizeof(LPB))) == NULL)
|
||
|
{
|
||
|
exit00:
|
||
|
SetErrCode(phr, E_OUTOFMEMORY);
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
lpsipb->HashTab = (LPCHAIN FAR *)((LPB)lpsipb + sizeof(SIPB));
|
||
|
|
||
|
/* Allocate a word block buffer */
|
||
|
if ((lpsipb->lpBlkMgr = BlockInitiate (WORDBUF_SIZE, 0, 0, 0)) == NULL)
|
||
|
{
|
||
|
GlobalLockedStructMemFree((LPV)lpsipb);
|
||
|
goto exit00;
|
||
|
}
|
||
|
|
||
|
lpsipb->wTabSize = wTabSize; /* Size of hash table */
|
||
|
lpsipb->lpfnStopListLookup = MVStopListLookup;
|
||
|
return (LPSIPB)lpsipb;
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListAddWord |
|
||
|
* Add a word to a stop list
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LST | lstWord |
|
||
|
* Pointer to 2-byte length preceded Pascal word to be added
|
||
|
* into the stop-word list
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListAddWord(_LPSIPB lpsipb, LST lstWord)
|
||
|
{
|
||
|
WORD wHash;
|
||
|
LPCHAIN lpChain;
|
||
|
WORD wByteUsed;
|
||
|
|
||
|
// Sanity check
|
||
|
if (lpsipb == NULL || lstWord == NULL)
|
||
|
return(E_INVALIDARG);
|
||
|
|
||
|
/* Look for the word. If it is already there then just
|
||
|
* return S_OK, don't add it into the list
|
||
|
*/
|
||
|
if (lpChain = MVStopListFind (lpsipb, lstWord))
|
||
|
{
|
||
|
// Don't add if already there.
|
||
|
lpChain->dwCount++;
|
||
|
return S_OK;
|
||
|
}
|
||
|
|
||
|
wByteUsed = *(LPUW)lstWord + 2;
|
||
|
|
||
|
#ifndef _32BIT
|
||
|
if (lpsipb->cbTextUsed + wByteUsed > MAX_STOPWORD_BUFSIZE) {
|
||
|
/* There are too many stop words */
|
||
|
return ERR_TOOMANYSTOPS;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
lpsipb->cbTextUsed += wByteUsed ;
|
||
|
|
||
|
/* Copy the word into the word buffer block */
|
||
|
if ((lpChain = (LPCHAIN)BlockCopy (lpsipb->lpBlkMgr, lstWord, wByteUsed,
|
||
|
sizeof(CHAIN) - 1)) == NULL)
|
||
|
return E_OUTOFMEMORY;
|
||
|
|
||
|
lpChain->dwCount = 0;
|
||
|
|
||
|
/* Compute hash key */
|
||
|
wHash = GetHashKey(lpsipb->wTabSize, lstWord);
|
||
|
|
||
|
/* Add the word to the hash table */
|
||
|
CH_NEXT(lpChain) = lpsipb->HashTab[wHash];
|
||
|
lpsipb->HashTab[wHash] = lpChain;
|
||
|
|
||
|
return S_OK; // Function worked.
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func void FAR PASCAL | MVStopListDispose |
|
||
|
* Frees memory associated with a stop list.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC void EXPORT_API FAR PASCAL MVStopListDispose (_LPSIPB lpsipb)
|
||
|
{
|
||
|
if (lpsipb == NULL)
|
||
|
return;
|
||
|
|
||
|
/* Free the word buffer */
|
||
|
BlockFree(lpsipb->lpBlkMgr);
|
||
|
|
||
|
/* Free the stop info structure */
|
||
|
GlobalLockedStructMemFree((LPV)lpsipb);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListIndexLoad |
|
||
|
* Read a stop-word list stored in the subfile/dos file.
|
||
|
*
|
||
|
* @parm HFPB | hfpb |
|
||
|
* Handle to input file. Can be mvfs subfile or separate dos file.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LPIDX | lpidx |
|
||
|
* Pointer to index structure
|
||
|
*
|
||
|
* @parm LSZ | lszWordBreaker |
|
||
|
* Word breaker to be used
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded, other errors if failed.
|
||
|
*************************************************************************/
|
||
|
/*
|
||
|
The strings are stored in the file in a sequence of pascal strings
|
||
|
*/
|
||
|
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListIndexLoad (HFPB hfpbSysFile,
|
||
|
_LPSIPB lpsipb, LSZ lszStopFile)
|
||
|
{
|
||
|
BYTE argbInBuf[CB_STOP_BUF];
|
||
|
FILEOFFSET lfo;
|
||
|
FILEOFFSET foStart;
|
||
|
HFPB hfpbSubFile;
|
||
|
BOOL fOpenedFile;
|
||
|
HRESULT fRet = S_OK;
|
||
|
WORD cbRead;
|
||
|
int fLast;
|
||
|
LPSTOP lpStopHdr;
|
||
|
LPB lpWord;
|
||
|
WORD wOffsetInBuf;
|
||
|
WORD wLen;
|
||
|
ERRB errb;
|
||
|
|
||
|
/* Sanity check */
|
||
|
if (lpsipb == NULL)
|
||
|
return SetErrCode (NULL, E_INVALIDARG);
|
||
|
|
||
|
/* Open the subfile */
|
||
|
if ((fOpenedFile =
|
||
|
FsTypeFromHfpb(hfpbSubFile = hfpbSysFile) != FS_SUBFILE) &&
|
||
|
(hfpbSubFile = FileOpen
|
||
|
(hfpbSysFile, lszStopFile, hfpbSysFile ? FS_SUBFILE : REGULAR_FILE,
|
||
|
READ, &errb)) == NULL)
|
||
|
{
|
||
|
return errb;
|
||
|
}
|
||
|
|
||
|
// If we didn't open the file, we need to find out where the file seek
|
||
|
// pointer is initially so that we only seek relative to that starting
|
||
|
// position (i.e. the caller owns the part of the file that comes before).
|
||
|
foStart = (fOpenedFile ? MakeFo(0,0) :
|
||
|
FileSeek (hfpbSubFile, MakeFo (0, 0), wFSSeekCur, &fRet));
|
||
|
|
||
|
/* Read and check the file validity */
|
||
|
if (FAILED(fRet) ||
|
||
|
(cbRead = (WORD)FileSeekRead
|
||
|
(hfpbSubFile, (LPV)(lpStopHdr = (LPSTOP)argbInBuf),
|
||
|
FoAddFo(foStart, MakeFo(0, 0)), sizeof(STOP_HDR), &fRet))
|
||
|
!= sizeof(STOP_HDR))
|
||
|
{
|
||
|
exit01:
|
||
|
// Close file only if we were the one's who opened it.
|
||
|
if (fOpenedFile)
|
||
|
(void)FileClose(hfpbSubFile); // Return value not checked
|
||
|
// because the file is open
|
||
|
// for read-only.
|
||
|
return fRet;
|
||
|
}
|
||
|
|
||
|
/* MAC codes. They will be eliminated through optimization */
|
||
|
|
||
|
lpStopHdr->FileStamp = SWAPWORD(lpStopHdr->FileStamp);
|
||
|
lpStopHdr->version = SWAPWORD(lpStopHdr->version);
|
||
|
lpStopHdr->dwFileSize = SWAPLONG(lpStopHdr->dwFileSize);
|
||
|
|
||
|
|
||
|
if (lpStopHdr->FileStamp != STOP_STAMP ||
|
||
|
lpStopHdr->version != VERCURRENT)
|
||
|
{
|
||
|
fRet = SetErrCode(&errb, E_BADVERSION);
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
/* Start at the beginning of the buffer */
|
||
|
wOffsetInBuf = 0;
|
||
|
|
||
|
for (lfo = FoAddFo(foStart, MakeFo(STOP_HDR_SIZE, 0));;)
|
||
|
{
|
||
|
LPB lpbCur;
|
||
|
WORD cbReadOurs = 0;
|
||
|
|
||
|
if ((cbRead = (WORD)FileSeekRead(hfpbSubFile,
|
||
|
lpbCur = ((LPB)argbInBuf + wOffsetInBuf), lfo,
|
||
|
CB_STOP_BUF - wOffsetInBuf, &errb)) == cbIO_ERROR)
|
||
|
{
|
||
|
SetErrCode(&errb, fRet = E_FILEREAD);
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
lfo = FoAddDw(lfo, (DWORD)cbRead);
|
||
|
|
||
|
while (cbRead - cbReadOurs++ >= sizeof(WORD))
|
||
|
{
|
||
|
if (*((WORD UNALIGNED * UNALIGNED)lpbCur) == 0)
|
||
|
{
|
||
|
FILEOFFSET foCur;
|
||
|
|
||
|
// Get our current seek position.
|
||
|
foCur = FileSeek (hfpbSubFile, MakeFo (0, 0), wFSSeekCur, &fRet);
|
||
|
|
||
|
// We already advanced cbReadOurs by one in the loop
|
||
|
// condition; advance it by one more to account for
|
||
|
// the second byte of the NULL word. Then we move
|
||
|
// the seek pointer back by the difference so that we
|
||
|
// don't leave it past the end of our data.
|
||
|
FileSeek (hfpbSubFile,
|
||
|
FoSubFo(foCur, MakeFo(cbRead - ++cbReadOurs, 0)),
|
||
|
wFSSeekSet, &fRet);
|
||
|
ITASSERT(SUCCEEDED(fRet));
|
||
|
cbRead = cbReadOurs;
|
||
|
fLast = TRUE;
|
||
|
}
|
||
|
else
|
||
|
lpbCur++;
|
||
|
}
|
||
|
|
||
|
cbRead += wOffsetInBuf; // Catch what's left from previous scan
|
||
|
wOffsetInBuf = 0;
|
||
|
|
||
|
/* Add the word into the stop word list */
|
||
|
for (lpWord = argbInBuf; cbRead > 0;)
|
||
|
{
|
||
|
|
||
|
/* If the whole word has been read in, just add it to the
|
||
|
stop list, else we have to "reconstruct" it
|
||
|
*/
|
||
|
// erinfox: we have to byte-swap on Mac
|
||
|
*(WORD UNALIGNED * UNALIGNED)lpWord = SWAPWORD(*(WORD UNALIGNED * UNALIGNED)lpWord);
|
||
|
|
||
|
wLen = *(LPUW)(lpWord) + 2;
|
||
|
if (wLen <= cbRead)
|
||
|
{
|
||
|
|
||
|
/* Everything fits */
|
||
|
if ((fRet = MVStopListAddWord(lpsipb, lpWord)) != S_OK)
|
||
|
goto exit01;
|
||
|
cbRead -= wLen;
|
||
|
lpWord += wLen; /* Move to next word */
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Copy the word to the beginning of the buffer */
|
||
|
MEMCPY(argbInBuf, lpWord, cbRead);
|
||
|
wOffsetInBuf = cbRead;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (fLast)
|
||
|
break;
|
||
|
}
|
||
|
fRet = S_OK; // Succeeded
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API INDEX RETRIEVAL
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListLoad |
|
||
|
* Read a stop-word list from an external file. The file must have
|
||
|
* only one stop word per line, or else there is potential loss
|
||
|
* of stop words.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LSZ | lszStopFile |
|
||
|
* Stop word filename. This is a simple ASCII text file
|
||
|
*
|
||
|
* @parm BREAKER_FUNC | lpfnBreakFunc |
|
||
|
* Word breaker to be used
|
||
|
*
|
||
|
* @parm PHRESULT | phr |
|
||
|
* Pointer to error buffer.
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded, other errors failed.
|
||
|
*************************************************************************/
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListLoad(HFPB hfpbIn, _LPSIPB lpsipb,
|
||
|
LSZ lszStopFile, BREAKER_FUNC lpfnBreakFunc,
|
||
|
LPCHARTAB lpCharTab)
|
||
|
{
|
||
|
BYTE argbInBuf[cbSTOP_BUF]; // IO buffer
|
||
|
HFPB hfpb; // File handle
|
||
|
BOOL fOpenedFile;
|
||
|
_LPIBI lpibi; // Pointer to internal breaker info
|
||
|
HANDLE hbi; // Handle to internal brekaer info
|
||
|
HRESULT fRet; // Returned value
|
||
|
BRK_PARMS brkParms; // Breaker parameters structure
|
||
|
LPB lpStart; // Beginning of strings to be parsed
|
||
|
LPB lpEnd; // End of strings to be parsed
|
||
|
WORD wStrLength; // Bytes in string
|
||
|
CB cbTobeRead; // Bytes to be read
|
||
|
CB cbRead; // Bytes actually read
|
||
|
int fLast; // TRUE if this is the last read
|
||
|
int fGetWord; // TRUE if we get a whole word
|
||
|
|
||
|
/* Sanity check */
|
||
|
if (lpsipb == NULL || (lszStopFile == NULL && hfpbIn == NULL)
|
||
|
|| lpfnBreakFunc == NULL)
|
||
|
return E_INVALIDARG;
|
||
|
|
||
|
if ((fOpenedFile = FsTypeFromHfpb(hfpb = hfpbIn) != FS_SUBFILE) &&
|
||
|
(hfpb = (HANDLE)FileOpen
|
||
|
(hfpbIn, lszStopFile, hfpbIn ? FS_SUBFILE : REGULAR_FILE,
|
||
|
READ, &fRet)) == 0)
|
||
|
{
|
||
|
return (fRet);
|
||
|
}
|
||
|
|
||
|
/* Allocate a breaker info block */
|
||
|
|
||
|
if ((hbi = _GLOBALALLOC(DLLGMEM_ZEROINIT, (LCB)sizeof(IBI))) == NULL)
|
||
|
{
|
||
|
return E_OUTOFMEMORY;
|
||
|
}
|
||
|
lpibi = (_LPIBI)_GLOBALLOCK(hbi);
|
||
|
|
||
|
/* Initialize variables */
|
||
|
brkParms.lcbBufOffset = 0L;
|
||
|
brkParms.lpInternalBreakInfo = lpibi;
|
||
|
brkParms.lpvUser = lpsipb;
|
||
|
brkParms.lpfnOutWord = (FWORDCB)FStopCallback;
|
||
|
brkParms.lpStopInfoBlock = NULL;
|
||
|
brkParms.lpCharTab = lpCharTab;
|
||
|
|
||
|
cbTobeRead = cbSTOP_BUF; // Read in a buffer whole
|
||
|
lpStart = lpEnd = (LPB)argbInBuf; // Start & End of string
|
||
|
fGetWord = FALSE; // We didn't get any word yet
|
||
|
wStrLength = 0;
|
||
|
|
||
|
/* The idea is to break the file into sequences of lines, and pass
|
||
|
* each line to the word breaker. The assumption made is that we
|
||
|
* should only have one word per line, since various type breakers
|
||
|
* can only handle one word a type.
|
||
|
*/
|
||
|
|
||
|
for (;;)
|
||
|
{
|
||
|
cbRead = (WORD)FileRead(hfpb, lpEnd, cbTobeRead, &fRet);
|
||
|
if (FAILED(fRet))
|
||
|
{
|
||
|
exit01:
|
||
|
/* Free breaker info block */
|
||
|
_GLOBALUNLOCK(hbi);
|
||
|
_GLOBALFREE(hbi);
|
||
|
|
||
|
/* Close the file */
|
||
|
if (fOpenedFile)
|
||
|
FileClose(hfpb);
|
||
|
return fRet;
|
||
|
}
|
||
|
else
|
||
|
fLast = (cbRead != cbTobeRead);
|
||
|
|
||
|
lpEnd = lpStart;
|
||
|
cbRead += wStrLength; // Get what left in buffer
|
||
|
wStrLength = 0;
|
||
|
|
||
|
while (cbRead != (CB)-1)
|
||
|
{
|
||
|
/* Break the buffer into lines */
|
||
|
|
||
|
if (*lpEnd == '\r' || *lpEnd == '\n' || !cbRead)
|
||
|
{
|
||
|
if (wStrLength)
|
||
|
{
|
||
|
|
||
|
/* Process the word we got */
|
||
|
brkParms.lpbBuf = lpStart;
|
||
|
brkParms.cbBufCount = wStrLength;
|
||
|
|
||
|
if ((fRet = (*lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
|
||
|
!= S_OK)
|
||
|
goto exit01;
|
||
|
|
||
|
/* Flush the breaker buffer */
|
||
|
brkParms.lpbBuf = NULL;
|
||
|
brkParms.cbBufCount = 0;
|
||
|
if ((fRet = (*lpfnBreakFunc)((LPBRK_PARMS)&brkParms))
|
||
|
!= S_OK)
|
||
|
goto exit01;
|
||
|
|
||
|
wStrLength = 0;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
/* Update the pointer to the new word */
|
||
|
if (wStrLength == 0)
|
||
|
lpStart = lpEnd;
|
||
|
wStrLength++; // Increase string's length
|
||
|
}
|
||
|
|
||
|
cbRead--;
|
||
|
lpEnd++;
|
||
|
}
|
||
|
|
||
|
|
||
|
if (fLast)
|
||
|
break;
|
||
|
|
||
|
/* Now copy the partial string to the beginning of the buffer */
|
||
|
MEMCPY(argbInBuf, lpStart, wStrLength);
|
||
|
lpEnd = (lpStart = argbInBuf) + wStrLength;
|
||
|
cbTobeRead = cbSTOP_BUF - wStrLength; // Read in a buffer whole
|
||
|
}
|
||
|
|
||
|
if (wStrLength)
|
||
|
{
|
||
|
/* Flush the breaker buffer */
|
||
|
brkParms.lpbBuf = NULL;
|
||
|
brkParms.cbBufCount = 0;
|
||
|
if ((fRet = (*lpfnBreakFunc)((LPBRK_PARMS)&brkParms)) != S_OK)
|
||
|
goto exit01;
|
||
|
}
|
||
|
fRet = S_OK; // Succeeded
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc INTERNAL
|
||
|
*
|
||
|
* @func WORD NEAR PASCAL | GetHashKey |
|
||
|
* Compute the hash key of a string. This key is used for indexing
|
||
|
* into the stop word hash table
|
||
|
*
|
||
|
* @parm LST | lstWord |
|
||
|
* Pointer to a 2-byte length preceded Pascal-type string
|
||
|
*
|
||
|
* @rdesc
|
||
|
* Return the index into the stop words hash table
|
||
|
*************************************************************************/
|
||
|
|
||
|
PRIVATE WORD NEAR PASCAL GetHashKey (WORD hashSize, LST lstWord)
|
||
|
{
|
||
|
register unsigned int wHash;
|
||
|
register unsigned int nLength;
|
||
|
|
||
|
wHash = 0;
|
||
|
nLength = *(LPUW)lstWord;
|
||
|
lstWord += sizeof(WORD);
|
||
|
for (; nLength; nLength--)
|
||
|
{
|
||
|
wHash = (wHash << 1) | (wHash >> 15);
|
||
|
wHash ^= *lstWord++;
|
||
|
}
|
||
|
wHash %= hashSize;
|
||
|
return ((WORD)wHash);
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL INDEX
|
||
|
*
|
||
|
* @func LPCHAIN FAR PASCAL | MVStopListFind |
|
||
|
* This looks for a word (lstWord) in a stop-word (lpsipb)
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word list structure
|
||
|
*
|
||
|
* @parm LST | lstWord |
|
||
|
* Pointer to string to be looked for
|
||
|
*
|
||
|
* @rdesc Pointer to the node if found, NULL otherwise
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC LPCHAIN EXPORT_API FAR PASCAL MVStopListFind(_LPSIPB lpsipb, LST lstWord)
|
||
|
{
|
||
|
WORD wHash; // Hash key
|
||
|
LPCHAIN lpChain; // Pointer to the word chain
|
||
|
|
||
|
// Sanity check
|
||
|
if (lpsipb == NULL || lstWord == NULL)
|
||
|
return(NULL);
|
||
|
|
||
|
/* Compute hash key */
|
||
|
wHash = GetHashKey(lpsipb->wTabSize, lstWord);
|
||
|
lpChain = lpsipb->HashTab[wHash];
|
||
|
|
||
|
while (lpChain)
|
||
|
{
|
||
|
if (!StringDiff2 (&CH_WORD(lpChain), lstWord))
|
||
|
return (lpChain);
|
||
|
lpChain = CH_NEXT(lpChain);
|
||
|
}
|
||
|
return (NULL);
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL INDEX
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListLookup |
|
||
|
* This looks for a word (lstWord) in a stop-word (lpsipb)
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word list structure
|
||
|
*
|
||
|
* @parm LST | lstWord |
|
||
|
* Pointer to string to be looked for
|
||
|
*
|
||
|
* @rdesc S_OK if found, E_FAIL if not, or other errors
|
||
|
*************************************************************************/
|
||
|
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListLookup(_LPSIPB lpsipb, LST lstWord)
|
||
|
{
|
||
|
WORD wHash; // Hash key
|
||
|
LPCHAIN lpChain; // Pointer to the word chain
|
||
|
|
||
|
|
||
|
// Sanity check
|
||
|
if (lpsipb == NULL || lstWord == NULL)
|
||
|
return(E_INVALIDARG);
|
||
|
|
||
|
/* Compute hash key */
|
||
|
wHash = GetHashKey(lpsipb->wTabSize, lstWord);
|
||
|
lpChain = lpsipb->HashTab[wHash];
|
||
|
|
||
|
while (lpChain)
|
||
|
{
|
||
|
if (!StringDiff2 (&CH_WORD(lpChain), lstWord))
|
||
|
return (S_OK);
|
||
|
lpChain = CH_NEXT(lpChain);
|
||
|
}
|
||
|
return (E_FAIL);
|
||
|
}
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API INDEX
|
||
|
*
|
||
|
* @func HRESULT PASCAL FAR | MVStopFileBuild |
|
||
|
* Incorporate the stop word list into the system file
|
||
|
*
|
||
|
* @parm HFPB | hpfbSysFile |
|
||
|
* If non-zero, handle to an opened system file.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LSZ | lszFilename |
|
||
|
* If hpfbSysFile is non-zero, this is the name of the stop's subfile
|
||
|
* else this is a regular DOS file
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded, E_FAIL if tehre is nothing to build
|
||
|
* or other errors
|
||
|
*************************************************************************/
|
||
|
PUBLIC HRESULT EXPORT_API PASCAL FAR MVStopFileBuild (HFPB hfpbSysFile,
|
||
|
_LPSIPB lpsipb, LSZ lszFilename)
|
||
|
{
|
||
|
HFPB hfpbStop; // Pointer to final index file info.
|
||
|
HRESULT fRet = S_OK;
|
||
|
STOP_HDR Stop_hdr;
|
||
|
HFPB hfpb = 0;
|
||
|
BOOL fCreatedFile;
|
||
|
BYTE Dummy[STOP_HDR_SIZE]; // Dummy buffer to write 0
|
||
|
int i;
|
||
|
LPCHAIN lpChain;
|
||
|
LST lstWord;
|
||
|
WORD wLen;
|
||
|
CB cbByteLeft;
|
||
|
GHANDLE hBuf;
|
||
|
LPB lpbBuf;
|
||
|
LPB lpbStart;
|
||
|
LPB lpbLimit;
|
||
|
ERRB errb;
|
||
|
FILEOFFSET fo;
|
||
|
FILEOFFSET foStart;
|
||
|
|
||
|
/* Sanity check */
|
||
|
if (lpsipb == NULL || (lszFilename == NULL && hfpbSysFile == NULL))
|
||
|
return E_INVALIDARG;
|
||
|
|
||
|
if (lpsipb->cbTextUsed == 0)
|
||
|
return E_FAIL; /* Nothing to build */
|
||
|
|
||
|
if ((fCreatedFile =
|
||
|
FsTypeFromHfpb(hfpbStop = hfpbSysFile) != FS_SUBFILE) &&
|
||
|
(hfpbStop = FileCreate(hfpbSysFile, lszFilename,
|
||
|
hfpbSysFile ? FS_SUBFILE: REGULAR_FILE, &errb)) == 0)
|
||
|
return errb;
|
||
|
|
||
|
// If we didn't open the file, we need to find out where the file seek
|
||
|
// pointer is initially so that we only seek relative to that starting
|
||
|
// position (i.e. the caller owns the part of the file that comes before).
|
||
|
foStart = (fCreatedFile ? MakeFo(0,0) :
|
||
|
FileSeek (hfpbStop, MakeFo (0, 0), wFSSeekCur, &fRet));
|
||
|
|
||
|
if (FAILED(fRet))
|
||
|
goto exit01;
|
||
|
|
||
|
/* Write out the stop file header */
|
||
|
Stop_hdr.FileStamp = STOP_STAMP;
|
||
|
Stop_hdr.version = VERCURRENT;
|
||
|
Stop_hdr.dwFileSize = lpsipb->cbTextUsed;
|
||
|
|
||
|
MEMSET(Dummy, 0, STOP_HDR_SIZE);
|
||
|
|
||
|
/* Write all zeroes to the header area, which is larger than the
|
||
|
* STOP_HDR structure.
|
||
|
*/
|
||
|
if (FileSeekWrite (hfpbStop, Dummy, FoAddFo(foStart, MakeFo (0, 0)),
|
||
|
STOP_HDR_SIZE, &errb) != STOP_HDR_SIZE)
|
||
|
{
|
||
|
fRet = errb;
|
||
|
exit01:
|
||
|
if (fCreatedFile)
|
||
|
FileClose (hfpbStop);
|
||
|
return(fRet);
|
||
|
|
||
|
}
|
||
|
|
||
|
if (FileSeekWrite (hfpbStop, &Stop_hdr, FoAddFo(foStart, MakeFo (0, 0)),
|
||
|
sizeof (STOP_HDR), &errb) != sizeof (STOP_HDR))
|
||
|
{
|
||
|
fRet = errb;
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Allocate a buffer to flush the data */
|
||
|
if ((hBuf = _GLOBALALLOC (DLLGMEM, cbByteLeft = CB_HUGE_BUF)) == NULL)
|
||
|
{
|
||
|
SetErrCode (&errb, fRet = E_OUTOFMEMORY);
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
lpbBuf = lpbStart = (LPB)_GLOBALLOCK(hBuf);
|
||
|
lpbLimit = lpbStart + CB_HUGE_BUF - CB_MAX_WORD_LEN;
|
||
|
|
||
|
/* Seek the file to the correct offset */
|
||
|
fo = FoAddFo(foStart, MakeFo (STOP_HDR_SIZE, 0));
|
||
|
|
||
|
if (!FoEquals (FileSeek (hfpbStop, fo, 0, &errb), fo))
|
||
|
{
|
||
|
fRet = E_FILESEEK;
|
||
|
exit02:
|
||
|
_GLOBALUNLOCK(hBuf);
|
||
|
_GLOBALFREE(hBuf);
|
||
|
goto exit01;
|
||
|
}
|
||
|
|
||
|
/* Write out the buffer */
|
||
|
for (i = lpsipb->wTabSize - 1; i >= 0; i--)
|
||
|
{
|
||
|
for (lpChain = lpsipb->HashTab[i]; lpChain;
|
||
|
lpChain = CH_NEXT(lpChain))
|
||
|
{
|
||
|
lstWord = &CH_WORD (lpChain);
|
||
|
MEMCPY (lpbBuf, lstWord, wLen = *(WORD FAR *)lstWord + 2);
|
||
|
lpbBuf += wLen;
|
||
|
if (lpbBuf >= lpbLimit)
|
||
|
{
|
||
|
/* No more room, just flush the buffer */
|
||
|
|
||
|
FileWrite(hfpbStop, lpbStart, (DWORD)(lpbBuf - lpbStart), &errb);
|
||
|
if ((fRet = errb) != S_OK)
|
||
|
goto exit02;
|
||
|
|
||
|
lpbBuf = lpbStart;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Flush the buffer */
|
||
|
FileWrite (hfpbStop, lpbStart, (DWORD)(lpbBuf - lpbStart), &errb);
|
||
|
|
||
|
if ((fRet = errb) == S_OK)
|
||
|
{
|
||
|
/* Write a trailing 0 word (i.e. a NULL st) to mark
|
||
|
* the end of the word list.
|
||
|
*/
|
||
|
*((WORD *)lpbStart) = 0;
|
||
|
FileWrite (hfpbStop, lpbStart, sizeof(WORD), &errb);
|
||
|
fRet = errb;
|
||
|
}
|
||
|
|
||
|
goto exit02;
|
||
|
}
|
||
|
|
||
|
/*
 *  Word callback installed by MVStopListLoad as the breaker's output
 *  function. Only the normalized word is used: it is added to the stop
 *  list passed in lpsipb. The raw word and the word offset are ignored.
 *  Returns whatever MVStopListAddWord returns.
 */
PUBLIC HRESULT FAR PASCAL FStopCallback(LST lstRawWord, LST lstNormWord,
    LFO lfoWordOffset, _LPSIPB lpsipb)
{
    HRESULT hr;

    hr = MVStopListAddWord(lpsipb, lstNormWord);
    return hr;
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListEnumWords |
|
||
|
* Enumerate the words in a stop list, getting a pointer to each.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LST* | plstWord |
|
||
|
* Indirect Pointer to 2-byte length preceded Pascal word that is
|
||
|
* the next word identified by *pdwWordInfo and *ppvWordInfo.
|
||
|
*
|
||
|
* @parm LONG* | plWordInfo |
|
||
|
* Pointer to information used to determine what the next word is
|
||
|
* in the stop word list. Passing -1 along with NULL for *ppvWordInfo
|
||
|
* means start at the beginning. On exit, this contains an appropriate
|
||
|
* value that can be passed in again to get the next word, provided
|
||
|
* that no intervening calls have been made to MVStopListAddWord.
|
||
|
*
|
||
|
* @parm LPVOID* | ppvWordInfo |
|
||
|
* Indirect pointer to information used to determine what the next word is
|
||
|
* in the stop word list. Passing NULL along with -1 for *plWordInfo
|
||
|
* means start at the beginning. On exit, this contains an appropriate
|
||
|
* value that can be passed in again to get the next word, provided
|
||
|
* that no intervening calls have been made to MVStopListAddWord.
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded
|
||
|
* @rdesc E_OUTOFRANGE if there are no more words in the stop list.
|
||
|
*************************************************************************/
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListEnumWords(_LPSIPB lpsipb,
|
||
|
LST *plstWord, LONG *plWordInfo, LPVOID *ppvWordInfo)
|
||
|
{
|
||
|
LPCHAIN lpchain = NULL;
|
||
|
LONG iHashChain;
|
||
|
|
||
|
if (lpsipb == NULL || plstWord == NULL ||
|
||
|
plWordInfo == NULL || ppvWordInfo == NULL)
|
||
|
return (SetErrReturn(E_POINTER));
|
||
|
|
||
|
iHashChain = *plWordInfo;
|
||
|
|
||
|
// If after the last call to us, we were left sitting on a hash chain
|
||
|
// element, just advance to the next one (which may be NULL).
|
||
|
if ((lpchain = (LPCHAIN) *ppvWordInfo) != NULL)
|
||
|
lpchain = CH_NEXT(lpchain);
|
||
|
|
||
|
// If we're now sitting on a NULL hash chain (initial condition or we
|
||
|
// reached the end of a previous chain), we need to find the beginning
|
||
|
// of the next chain in the hash table.
|
||
|
while (iHashChain < lpsipb->wTabSize - 1 && lpchain == NULL)
|
||
|
lpchain = lpsipb->HashTab[++iHashChain];
|
||
|
|
||
|
if (iHashChain >= lpsipb->wTabSize - 1 && lpchain == NULL)
|
||
|
return (SetErrReturn(E_OUTOFRANGE));
|
||
|
|
||
|
*plstWord = &CH_WORD(lpchain);
|
||
|
*ppvWordInfo = (LPVOID)lpchain;
|
||
|
*plWordInfo = iHashChain;
|
||
|
|
||
|
return (S_OK);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*************************************************************************
|
||
|
* @doc API RETRIEVAL
|
||
|
*
|
||
|
* @func HRESULT FAR PASCAL | MVStopListFindWordPtr |
|
||
|
* Find a word in the stop list and return a pointer to it.
|
||
|
*
|
||
|
* @parm LPSIPB | lpsipb |
|
||
|
* Pointer to stop-word information structure
|
||
|
*
|
||
|
* @parm LST | lstWord |
|
||
|
* Pointer to a 2-byte length preceded Pascal
|
||
|
* string containing the word to find.
|
||
|
*
|
||
|
* @parm LST* | plstWordInList |
|
||
|
* On exit, indirect pointer to 2-byte length preceded Pascal
|
||
|
* string for the word that was found.
|
||
|
*
|
||
|
* @rdesc S_OK if succeeded
|
||
|
* @rdesc E_NOTFOUND if the word isn't in the stop list
|
||
|
*************************************************************************/
|
||
|
PUBLIC HRESULT EXPORT_API FAR PASCAL MVStopListFindWordPtr(_LPSIPB lpsipb,
|
||
|
LST lstWord, LST *plstWordInList)
|
||
|
{
|
||
|
HRESULT hr = S_OK;
|
||
|
LPCHAIN lpchain;
|
||
|
|
||
|
if ((lpchain = MVStopListFind(lpsipb, lstWord)) != NULL)
|
||
|
*(LST UNALIGNED * UNALIGNED)plstWordInList = &CH_WORD(lpchain);
|
||
|
else
|
||
|
hr = E_NOTFOUND;
|
||
|
|
||
|
return (hr);
|
||
|
}
|