windows-nt/Source/XPSP1/NT/inetsrv/intlwb/cht2/srcs/defbrkr.cpp
2020-09-26 16:20:57 +08:00

498 lines
15 KiB
C++

#include <windows.h>
#include "IWBrKr.h"
#include "DefBrKr.h"
#define ZERO_WIDTH_SPACE 0x200B
#define MAX_Def_WordBrKr_Prcess_Len 1000
// Reports whether the process is running on an NT-family platform.
//
// Returns TRUE only when GetVersionExA succeeds and reports
// VER_PLATFORM_WIN32_NT; returns FALSE otherwise, including when the
// version query itself fails.
BOOL IsWinNT(void)
{
    OSVERSIONINFOA osVersionInfo = { 0 };
    osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);

    // Only trust dwPlatformId when the call succeeded; on failure the
    // structure contents are not meaningful.
    if (!GetVersionExA(&osVersionInfo)) {
        return FALSE;
    }
    return (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) ? TRUE : FALSE;
}
// Retrieves character-type information for a Unicode string on both
// NT-family and non-NT platforms.
//
// Parameters:
//   LocalID     - locale passed to GetStringTypeExA on the non-NT path
//                 (unused on the NT path).
//   dwInfoType  - CT_CTYPE1 / CT_CTYPE2 / CT_CTYPE3 selector.
//   lpSrcStr    - Unicode source string.
//   cchSrc      - number of characters in lpSrcStr.
//   lpCharType  - receives one WORD of type flags per source character.
//
// Returns TRUE on success, FALSE on failure.
//
// On NT the call is forwarded to GetStringTypeW. Otherwise the string is
// converted to the ANSI code page and classified with GetStringTypeExA.
// If that call fails with ERROR_INVALID_PARAMETER for CT_CTYPE1 or
// CT_CTYPE3, every character is reported as alphabetic (and non-spacing
// for CT_CTYPE3) and the function still returns TRUE.
BOOL MyGetStringTypeEx(
    LCID LocalID,
    DWORD dwInfoType,
    const WCHAR *lpSrcStr, // unicode base
    INT cchSrc,
    LPWORD lpCharType)
{
    if (IsWinNT()) {
        return GetStringTypeW(dwInfoType, lpSrcStr, cchSrc, lpCharType);
    }

    const UINT uCodePage = GetACP();
    DWORD dwANSISize = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK, lpSrcStr, cchSrc,
                                           NULL, 0, NULL, NULL);
    if (0 == dwANSISize) {
        return FALSE;
    }

    BOOL fRet = FALSE;
    LPSTR lpAnsiStr = new CHAR[dwANSISize];
    // GetStringTypeExA needs room for one WORD per *byte* of the ANSI string.
    // On DBCS code pages that can be more than cchSrc, so classify into a
    // scratch array instead of writing past the end of the caller's buffer.
    LPWORD lpAnsiType = new WORD[dwANSISize];

    if (lpAnsiStr && lpAnsiType) {
        dwANSISize = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK, lpSrcStr, cchSrc,
                                         lpAnsiStr, dwANSISize, NULL, NULL);
        fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwANSISize, lpAnsiType);
        if (fRet) {
            // Hand back the leading entries, never more than the caller
            // asked for (a negative cchSrc means "null-terminated": copy all).
            DWORD cCopy = dwANSISize;
            if (cchSrc >= 0 && static_cast<DWORD>(cchSrc) < cCopy) {
                cCopy = static_cast<DWORD>(cchSrc);
            }
            for (DWORD i = 0; i < cCopy; ++i) {
                lpCharType[i] = lpAnsiType[i];
            }
        } else if (ERROR_INVALID_PARAMETER == GetLastError()
                   && (CT_CTYPE1 == dwInfoType || CT_CTYPE3 == dwInfoType)) {
            // The last-error value is only meaningful after a failed call;
            // checking it after a success could pick up a stale code and
            // wipe out valid results.
            for (INT i = 0; i < cchSrc; ++i) {
                switch (dwInfoType) {
                case CT_CTYPE1:
                    lpCharType[i] = C1_ALPHA;
                    break;
                case CT_CTYPE3:
                    lpCharType[i] = (C3_NONSPACING | C3_ALPHA);
                    break;
                }
            }
            fRet = TRUE;
        }
    }

    delete [] lpAnsiType;
    delete [] lpAnsiStr;
    return fRet;
}
// Constructor: sets ccCompare, the chunk-length limit that BreakText and
// Tokenize compare against, to MAX_Def_WordBrKr_Prcess_Len.
CDefWordBreaker::CDefWordBreaker()
{
ccCompare = MAX_Def_WordBrKr_Prcess_Len;
}
//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::IsWordChar
//
//  Synopsis:   Decide whether character [i] of the chunk is part of a word
//              (as opposed to a word break).
//
//  Arguments:  [i]           -- index of the character to test
//              [_aCharInfo1] -- per-character flags tested against C1_*
//              [_aCharInfo3] -- per-character flags tested against C3_*
//              [pwcChunk]    -- the characters themselves
//
//  Returns:    TRUE for alphabetic, digit or non-spacing characters, for
//              an underscore, and for a non-breaking space (0xa0) that is
//              followed by a non-spacing character; FALSE otherwise.
//
//  Notes:      For a non-breaking space the entry _aCharInfo3[i+1] is
//              read, so that element must exist.
//
//  History:    22-Jul-1994   BartoszM       Created
//
//--------------------------------------------------------------------------
inline BOOL CDefWordBreaker::IsWordChar(
    int i,
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    const WCHAR* pwcChunk) const
{
    const BOOL fAlphaOrDigit = (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT)) != 0;
    if (fAlphaOrDigit || (_aCharInfo3[i] & C3_NONSPACING) != 0)
    {
        return TRUE;
    }

    switch (pwcChunk[i])
    {
    case L'_':
        return TRUE;

    case 0xa0:  // non breaking space
        // Counts as a word character only when the next character is
        // non-spacing (looking one entry ahead is okay).
        return (_aCharInfo3[i + 1] & C3_NONSPACING) ? TRUE : FALSE;

    default:
        return FALSE;
    }
}
//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::ScanChunk
//
//  Synopsis:   Classify every character of a chunk.
//
//  Arguments:  [_aCharInfo1] -- receives the CT_CTYPE1 flags
//              [_aCharInfo3] -- receives the CT_CTYPE3 flags
//              [pwcChunk]    -- characters to classify
//              [ucwc]        -- number of characters in pwcChunk
//
//  Returns:    TRUE when both classifications succeed.  The CT_CTYPE3
//              query is not attempted if the CT_CTYPE1 query fails.
//
//  History:    16-Aug-94  BartoszM     Created
//
//----------------------------------------------------------------------------
BOOL CDefWordBreaker::ScanChunk(
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    const WCHAR *pwcChunk,
    ULONG ucwc)
{
    // POSIX character typing
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1)) {
        return FALSE;
    }

    // Additional POSIX character typing
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3)) {
        return FALSE;
    }

    return TRUE;
}
/*
BOOL CDefWordBreaker::ScanChunk(
PWORD _aCharInfo1,
PWORD _aCharInfo3,
const WCHAR *pwcChunk,
ULONG ucwc)
{
//
// GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
// we pass in a null string.
//
// Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );
if (IsWinNT())
{
if (!MyGetStringTypeEx(0, // Dummy
CT_CTYPE1, // POSIX character typing
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo1 ) ) // Character info
{
return FALSE;
}
if ( !MyGetStringTypeEx(0, // Dummy
CT_CTYPE3, // Additional POSIX
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo3 ) ) // Character info 3
{
return FALSE;
}
}
else
{
//
// BUGBUG: This is all wrong -- we don't know if this is the right
// locale to use and there isn't a way to know at this point.
//
if (!MyGetStringTypeEx( GetSystemDefaultLCID(),
CT_CTYPE1, // POSIX character typing
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo1 ) ) // Character info
{
// ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
// GetLastError() ));
// Win9x just stinks. No 2 ways about it.
if ( ERROR_INVALID_PARAMETER == GetLastError() )
{
for ( unsigned i = 0; i < ucwc; i++ )
_aCharInfo1[i] = C1_ALPHA;
return TRUE;
}
return FALSE;
}
if ( !MyGetStringTypeEx(GetSystemDefaultLCID(),
CT_CTYPE3, // Additional POSIX
pwcChunk, // Source
ucwc, // Size of source
_aCharInfo3 ) ) // Character info 3
{
// ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
// GetLastError() ));
// Win9x just stinks. No 2 ways about it.
if ( ERROR_INVALID_PARAMETER == GetLastError() )
{
for ( unsigned i = 0; i < ucwc; i++ )
_aCharInfo3[i] = ( C3_NONSPACING | C3_ALPHA );
return TRUE;
}
return FALSE;
}
}
return TRUE;
} //ScanChunk
*/
//+---------------------------------------------------------------------------
//
// Member: CDefWordBreaker::BreakText
//
// Synopsis: Break input stream into words.
//
// Arguments: [pTextSource] - source of input buffers
// [pWordSink] - sink for words
// [pPhraseSink] - sink for noun phrases
//
// History: 07-June-91 t-WadeR Created
// 12-Oct-92 AmyA Added Unicode support
// 18-Nov-92 AmyA Overloaded
// 11-Apr-94 KyleP Sync with spec
// 26-Aug-94 BartoszM Fixed Unicode parsing
//
//----------------------------------------------------------------------------
SCODE CDefWordBreaker::BreakText(
TEXT_SOURCE *pTextSource,
IWordSink *pWordSink,
IPhraseSink *pPhraseSink,
DWORD dwBase)
{
LPWORD _aCharInfo1 = NULL;
LPWORD _aCharInfo3 = NULL;
if ( 0 == pTextSource )
return E_INVALIDARG;
if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
return S_OK;
if (pTextSource->iCur > pTextSource->iEnd)
{
// Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
return E_FAIL;
}
SCODE sc = S_OK;
ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()
do {
//
// Flag for first time thru loop below. This is to fix the case
// where the length of the buffer passed in is less than
// MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
// and we break out the inner loop and call
// pfnFillTextBuffer without having processed any characters,
// and so pfnFillTextBuffer returns TRUE without adding any new
// characters and this results in an infinite loop.
BOOL fFirstTime = TRUE;
while (pTextSource->iCur < pTextSource->iEnd) {
cwc = pTextSource->iEnd - pTextSource->iCur;
// Process in buckets of MAX_II_BUFER_LEN only
if (cwc >= CDefWordBreaker::ccCompare) {
cwc = CDefWordBreaker::ccCompare;
} else if ( !fFirstTime) {
break;
} else {
}
if (_aCharInfo1) {
delete [] _aCharInfo1;
_aCharInfo1 = NULL;
}
if (_aCharInfo3) {
delete [] _aCharInfo3;
_aCharInfo3 = NULL;
}
_aCharInfo1 = new WORD[cwc + 1];
_aCharInfo3 = new WORD[cwc + 1];
if (_aCharInfo1 && _aCharInfo3) {
Tokenize( pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
}
// Win4Assert( cwcProcd <= cwc );
pTextSource->iCur += cwcProcd;
fFirstTime = FALSE;
}
} while(SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));
cwc = pTextSource->iEnd - pTextSource->iCur;
// we know that the remaining text should be less than ccCompare
// Win4Assert( cwc < CDefWordBreaker::ccCompare );
if (0 != cwc) {
if (_aCharInfo1) {
delete [] _aCharInfo1;
_aCharInfo1 = NULL;
}
if (_aCharInfo3) {
delete [] _aCharInfo3;
_aCharInfo3 = NULL;
}
_aCharInfo1 = new WORD[cwc + 1];
_aCharInfo3 = new WORD[cwc + 1];
if (_aCharInfo1 && _aCharInfo1) {
Tokenize(pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
}
}
if (_aCharInfo1) {
delete [] _aCharInfo1;
_aCharInfo1 = NULL;
}
if (_aCharInfo3) {
delete [] _aCharInfo3;
_aCharInfo3 = NULL;
}
return sc;
} //BreakText
//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::Tokenize
//
//  Synopsis:   Tokenize the input buffer into words
//
//  Arguments:  [pTextSource] -- input text source; the chunk starts at
//                               awcBuffer[iCur]
//              [cwc]         -- # chars to process
//              [pWordSink]   -- sink for words
//              [cwcProcd]    -- # chars actually processed returned here
//              [_aCharInfo1] -- scratch array for CT_CTYPE1 flags
//              [_aCharInfo3] -- scratch array for CT_CTYPE3 flags
//              [dwBase]      -- offset added to every reported position
//
//  Notes:      cwcProcd is always written.  If the chunk cannot be
//              classified no words are emitted and the whole chunk is
//              reported as consumed, so a caller that advances by cwcProcd
//              still makes progress.
//
//              Words containing zero-width spaces are copied into a local
//              buffer of MAX_Def_WordBrKr_Prcess_Len characters; anything
//              beyond that capacity is dropped from the stripped copy.
//
//  History:    10-Aug-95  SitaramR     Created
//
//----------------------------------------------------------------------------
void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
                                ULONG cwc,
                                IWordSink *pWordSink,
                                ULONG& cwcProcd,
                                PWORD _aCharInfo1,
                                PWORD _aCharInfo3,
                                DWORD dwBase)
{
    const WCHAR* pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];

    if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc)) {
        // Never leave the out-parameter unset: the caller adds it to its
        // read position.
        cwcProcd = cwc;
        return;
    }

    WCHAR _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];
    const unsigned cwcBufZWS = sizeof(_awcBufZWS) / sizeof(_awcBufZWS[0]);

    BOOL fWordHasZWS = FALSE;   // Does the current word have a zero-width-space ?
    unsigned uLenZWS = 0;       // Length of a word minus embedded zero-width-spaces

    //
    // iBeginWord is the offset into the chunk of the beginning character of
    // a word.  iCur is the first *unprocessed* character.
    //
    unsigned iBeginWord = 0;
    unsigned iCur = 0;

    //
    // Pump words from the chunk to the word sink
    //
    while (iCur < cwc)
    {
        //
        // Skip whitespace, punctuation, etc.
        //
        for (; iCur < cwc; iCur++)
            if (IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
                break;

        // iCur points to a word char or is equal to cwc
        iBeginWord = iCur;
        if (iCur < cwc)
            iCur++; // we knew it pointed at word character

        //
        // Find word break. Filter may output Unicode zero-width-space, which
        // should be ignored by the wordbreaker.
        //
        fWordHasZWS = FALSE;
        for (; iCur < cwc; iCur++)
        {
            if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
            {
                if (pwcChunk[iCur] == ZERO_WIDTH_SPACE)
                    fWordHasZWS = TRUE;
                else
                    break;
            }
        }

        if (fWordHasZWS)
        {
            //
            // Copy word into _awcBufZWS after stripping zero-width-spaces.
            // The capacity check keeps an oversized cwc from running off
            // the end of the stack buffer.
            //
            uLenZWS = 0;
            for (unsigned i = iBeginWord; i < iCur; i++)
            {
                if (pwcChunk[i] != ZERO_WIDTH_SPACE && uLenZWS < cwcBufZWS)
                    _awcBufZWS[uLenZWS++] = pwcChunk[i];
            }
        }

        // iCur points to a non-word char or is equal to cwc
        if (iCur < cwc)
        {
            // store the word and its source position
            if (fWordHasZWS)
                pWordSink->PutWord( uLenZWS, _awcBufZWS,                        // stripped word
                                    iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
            else
                pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord,   // the word
                                    iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);

            iCur++;             // we knew it pointed at non-word char
            iBeginWord = iCur;  // in case we exit the loop now
        }
    } // next word

    // End of words in chunk.
    // iCur == cwc
    // iBeginWord points at beginning of word or == cwc
    if (0 == iBeginWord)
    {
        // A single word fills from beginning of this chunk
        // to the end. This is either a very long word or
        // a short word in a leftover buffer.
        // store the word and its source position
        if (fWordHasZWS)
            pWordSink->PutWord( uLenZWS, _awcBufZWS,                // stripped word
                                iCur, pTextSource->iCur + dwBase);  // its source pos.
        else
            pWordSink->PutWord( iCur, pwcChunk,                     // the word
                                iCur, pTextSource->iCur + dwBase);  // its source pos.

        //
        // Position it to not add the word twice.
        //
        iBeginWord = iCur;
    }

    //
    // If this is the last chunk from text source, then process the
    // last fragment
    //
    if (cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur)
    {
        // store the word and its source position
        if (fWordHasZWS)
            pWordSink->PutWord( uLenZWS, _awcBufZWS,                        // stripped word
                                iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
        else
            pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord,   // the word
                                iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);

        iBeginWord = iCur;
    }

    cwcProcd = iBeginWord;
}