windows-nt/Source/XPSP1/NT/inetsrv/intlwb/cht2/srcs/defbrkr.cpp

#include <windows.h>
#include "IWBrKr.h"
#include "DefBrKr.h"

#define ZERO_WIDTH_SPACE   0x200B
#define MAX_Def_WordBrKr_Prcess_Len   1000

//+-------------------------------------------------------------------------
//
//  Function:   IsWinNT
//
//  Synopsis:   Report whether the process is running on an NT-family
//              platform (dwPlatformId == VER_PLATFORM_WIN32_NT).
//
//  Returns:    TRUE only when GetVersionExA succeeds and reports the NT
//              platform id; FALSE otherwise, including when the version
//              query itself fails.
//
//--------------------------------------------------------------------------
BOOL IsWinNT(void)
{
    // Zero the whole structure so that no field is ever read uninitialized,
    // then fill in the size field that GetVersionExA requires.
    OSVERSIONINFOA  osVersionInfo = {0};
    osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);

    // dwPlatformId is only meaningful when the query succeeded.
    if (!GetVersionExA(&osVersionInfo)) {
        return FALSE;
    }

    return (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) ? TRUE : FALSE;
}

//+-------------------------------------------------------------------------
//
//  Function:   MyGetStringTypeEx
//
//  Synopsis:   Retrieve character-type information for a Unicode string.
//              On NT the string is passed straight to GetStringTypeW.
//              Otherwise it is converted to the ANSI code page (GetACP)
//              and classified with GetStringTypeExA.
//
//  Arguments:  [LocalID]    -- locale id; only used on the non-NT path
//              [dwInfoType] -- CT_CTYPE1 / CT_CTYPE3 / ... selector
//              [lpSrcStr]   -- Unicode source characters
//              [cchSrc]     -- number of characters in lpSrcStr
//              [lpCharType] -- receives one WORD of type info per character
//
//  Returns:    TRUE on success. On the non-NT path, when GetStringTypeExA
//              fails with ERROR_INVALID_PARAMETER for CT_CTYPE1 or
//              CT_CTYPE3, the first cchSrc output entries are filled with
//              C1_ALPHA or (C3_NONSPACING | C3_ALPHA) respectively and TRUE
//              is returned. FALSE in every other failure case.
//
//--------------------------------------------------------------------------
BOOL MyGetStringTypeEx(
    LCID   LocalID,
    DWORD  dwInfoType,
    const WCHAR *lpSrcStr,   // unicode base
    INT    cchSrc,
    LPWORD lpCharType)
{
    if (IsWinNT()) {
        return GetStringTypeW(dwInfoType, lpSrcStr, cchSrc, lpCharType);
    }

    // Non-NT path: only the ANSI entry point is used, so convert first.
    const UINT uCodePage = GetACP();

    // First call computes the required size of the ANSI buffer in bytes.
    DWORD dwANSISize = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK, lpSrcStr, cchSrc,
        NULL, 0, NULL, NULL);
    if (0 == dwANSISize) {
        return FALSE;
    }

    LPSTR lpAnsiStr = new CHAR[dwANSISize];
    if (NULL == lpAnsiStr) {
        return FALSE;
    }

    dwANSISize = WideCharToMultiByte(uCodePage, WC_COMPOSITECHECK, lpSrcStr, cchSrc,
        lpAnsiStr, dwANSISize, NULL, NULL);

    BOOL fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwANSISize, lpCharType);

    // Consult GetLastError only when the call actually failed: after a
    // successful call the last-error value may be stale, and acting on it
    // would overwrite valid type information with the defaults below.
    // It is read before the buffer is released so that the deallocation
    // cannot disturb it.
    if (!fRet
        && ERROR_INVALID_PARAMETER == GetLastError()
        && (CT_CTYPE1 == dwInfoType || CT_CTYPE3 == dwInfoType)) {
        // Fall back to marking every character as alphabetic.
        const WORD wDefault = (CT_CTYPE1 == dwInfoType)
            ? (WORD)C1_ALPHA
            : (WORD)(C3_NONSPACING | C3_ALPHA);
        for (INT i = 0; i < cchSrc; ++i) {
            lpCharType[i] = wDefault;
        }
        fRet = TRUE;
    }

    delete [] lpAnsiStr;
    return fRet;
}


// Constructor: sets ccCompare, the chunk-size threshold that BreakText and
// Tokenize compare chunk lengths against, to MAX_Def_WordBrKr_Prcess_Len.
CDefWordBreaker::CDefWordBreaker()
{
    ccCompare = MAX_Def_WordBrKr_Prcess_Len;
}
//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::IsWordChar
//
//  Synopsis:   Decide whether the i'th character of a chunk belongs to a
//              word (as opposed to being a word break).
//
//  Arguments:  [i]           -- index into pwcChunk and both type arrays
//              [_aCharInfo1] -- CT_CTYPE1 info, one WORD per character
//              [_aCharInfo3] -- CT_CTYPE3 info, one WORD per character
//              [pwcChunk]    -- the characters being classified
//
//  Returns:    TRUE for alphabetic, digit and non-spacing characters, for
//              the underscore, and for a non-breaking space (0xa0) that is
//              followed by a non-spacing character; FALSE otherwise.
//
//  Notes:      The non-breaking-space test reads _aCharInfo3[i + 1], so
//              the array must have at least one element past index i.
//
//  History:    22-Jul-1994  BartoszM       Created
//
//--------------------------------------------------------------------------

inline BOOL CDefWordBreaker::IsWordChar(
    int i,
    PWORD _aCharInfo1,
    PWORD _aCharInfo3,
    const WCHAR* pwcChunk) const
{
    // Letters and digits are always part of a word.
    if (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
        return TRUE;

    // So are non-spacing marks.
    if (_aCharInfo3[i] & C3_NONSPACING)
        return TRUE;

    switch (pwcChunk[i])
    {
    case L'_':
        return TRUE;

    case 0xa0:
        // A non-breaking space counts only when the character after it is
        // non-spacing (one-element look-ahead).
        return (_aCharInfo3[i + 1] & C3_NONSPACING) ? TRUE : FALSE;

    default:
        return FALSE;
    }
}

//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::ScanChunk
//
//  Synopsis:   Fill the two type arrays with CT_CTYPE1 and CT_CTYPE3
//              information for every character of the chunk.
//
//  Arguments:  [_aCharInfo1] -- receives CT_CTYPE1 info
//              [_aCharInfo3] -- receives CT_CTYPE3 info
//              [pwcChunk]    -- characters to classify
//              [ucwc]        -- number of characters in pwcChunk
//
//  Returns:    TRUE when both queries succeed. The CT_CTYPE3 query is not
//              attempted when the CT_CTYPE1 query fails.
//
//  History:    16-Aug-94  BartoszM     Created
//
//----------------------------------------------------------------------------
BOOL CDefWordBreaker::ScanChunk(
    PWORD _aCharInfo1, 
    PWORD _aCharInfo3,
    const WCHAR *pwcChunk,
    ULONG ucwc)
{
    // Basic character typing.
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1))
        return FALSE;

    // Additional character typing.
    if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3))
        return FALSE;

    return TRUE;
}

/*
BOOL CDefWordBreaker::ScanChunk(
    PWORD _aCharInfo1, 
    PWORD _aCharInfo3,
    const WCHAR *pwcChunk,
    ULONG ucwc)
{

    //
    // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
    // we pass in a null string.
    //
//  Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );

    if (IsWinNT())
    {
        if (!MyGetStringTypeEx(0,                     // Dummy
                              CT_CTYPE1,              // POSIX character typing
                              pwcChunk,               // Source
                              ucwc,                   // Size of source
                              _aCharInfo1 ) )         // Character info
        {
            return FALSE;
        }

        if ( !MyGetStringTypeEx(0,                    // Dummy
                              CT_CTYPE3,              // Additional POSIX
                              pwcChunk,               // Source
                              ucwc,                   // Size of source
                              _aCharInfo3 ) )         // Character info 3
        {
            return FALSE;
        }
    }
    else
    {
        //
        // BUGBUG: This is all wrong -- we don't know if this is the right
        //         locale to use and there isn't a way to know at this point.
        //

        if (!MyGetStringTypeEx( GetSystemDefaultLCID(),
                                CT_CTYPE1,              // POSIX character typing
                                pwcChunk,               // Source
                                ucwc,                   // Size of source
                                _aCharInfo1 ) )         // Character info
        {
//           ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
//                         GetLastError() ));

            // Win9x just stinks.  No 2 ways about it.

            if ( ERROR_INVALID_PARAMETER == GetLastError() )
            {
                for ( unsigned i = 0; i < ucwc; i++ )
                    _aCharInfo1[i] = C1_ALPHA;

                return TRUE;
            }

            return FALSE;
        }

        if ( !MyGetStringTypeEx(GetSystemDefaultLCID(),
                                CT_CTYPE3,              // Additional POSIX
                                pwcChunk,               // Source
                                ucwc,                   // Size of source
                                _aCharInfo3 ) )         // Character info 3
        {
//            ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
 //                        GetLastError() ));

            // Win9x just stinks.  No 2 ways about it.

            if ( ERROR_INVALID_PARAMETER == GetLastError() )
            {
                for ( unsigned i = 0; i < ucwc; i++ )
                    _aCharInfo3[i] = ( C3_NONSPACING | C3_ALPHA );

                return TRUE;
            }

            return FALSE;
        }
    }

    return TRUE;
} //ScanChunk
*/
//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::BreakText
//
//  Synopsis:   Break input stream into words.
//
//  Arguments:  [pTextSource] - source of input buffers
//              [pWordSink]   - sink for words
//              [pPhraseSink] - sink for noun phrases (not used here)
//              [dwBase]      - offset added to every source position that
//                              is reported to the word sink
//
//  Returns:    E_INVALIDARG   if pTextSource is null
//              E_FAIL         if pTextSource->iCur is past iEnd
//              E_OUTOFMEMORY  if a character-type array cannot be allocated
//              S_OK           otherwise (including a null pWordSink or an
//                             empty buffer, where nothing is done)
//
//  History:    07-June-91  t-WadeR     Created
//              12-Oct-92   AmyA        Added Unicode support
//              18-Nov-92   AmyA        Overloaded
//              11-Apr-94   KyleP       Sync with spec
//              26-Aug-94   BartoszM    Fixed Unicode parsing
//
//----------------------------------------------------------------------------

SCODE CDefWordBreaker::BreakText(
    TEXT_SOURCE *pTextSource,
    IWordSink   *pWordSink,
    IPhraseSink *pPhraseSink,
    DWORD       dwBase)
{
    LPWORD _aCharInfo1 = NULL;
    LPWORD _aCharInfo3 = NULL;

    if ( 0 == pTextSource )
        return E_INVALIDARG;

    if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
        return S_OK;

    if (pTextSource->iCur > pTextSource->iEnd)
    {
//        Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
        return E_FAIL;
    }

    SCODE sc = S_OK;

    ULONG cwc = 0;
    ULONG cwcProcd = 0;      // # chars actually processed by Tokenize()

    do {
        //
        // fFirstTime guards against an infinite loop: when the buffer holds
        // fewer than ccCompare characters we must still tokenize it once.
        // Otherwise we would break out of the inner loop and call
        // pfnFillTextBuffer without having consumed anything, and it could
        // keep succeeding without adding new characters.
        //
        BOOL fFirstTime = TRUE;
        while (pTextSource->iCur < pTextSource->iEnd) {
            cwc = pTextSource->iEnd - pTextSource->iCur;

            // Process in buckets of at most ccCompare characters. A shorter
            // remainder is left for after the next refill, unless nothing
            // has been processed yet in this pass.
            if (cwc >= CDefWordBreaker::ccCompare) {
                cwc = CDefWordBreaker::ccCompare;
            } else if ( !fFirstTime) {
                break;
            }

            delete [] _aCharInfo1;
            delete [] _aCharInfo3;
            _aCharInfo1 = new WORD[cwc + 1];
            _aCharInfo3 = new WORD[cwc + 1];
            if (!_aCharInfo1 || !_aCharInfo3) {
                sc = E_OUTOFMEMORY;
                break;
            }

            // IsWordChar looks one element past the last character, and
            // ScanChunk fills only cwc entries, so give the extra entry a
            // defined value.
            _aCharInfo1[cwc] = 0;
            _aCharInfo3[cwc] = 0;

            cwcProcd = 0;
            Tokenize( pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);

            // A successful Tokenize always consumes at least one character
            // of a non-empty chunk. Zero (or an out-of-range count) means
            // the chunk could not be classified, so skip it rather than
            // spin on it forever.
            if (0 == cwcProcd || cwcProcd > cwc) {
                cwcProcd = cwc;
            }

            pTextSource->iCur += cwcProcd;
            fFirstTime = FALSE;
        }
    } while (SUCCEEDED(sc) && SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));

    if (SUCCEEDED(sc)) {
        // The text source is exhausted; whatever is left is shorter than
        // ccCompare and is tokenized as the final chunk.
        cwc = pTextSource->iEnd - pTextSource->iCur;

        // Win4Assert( cwc < CDefWordBreaker::ccCompare );

        if (0 != cwc) {
            delete [] _aCharInfo1;
            delete [] _aCharInfo3;
            _aCharInfo1 = new WORD[cwc + 1];
            _aCharInfo3 = new WORD[cwc + 1];
            if (_aCharInfo1 && _aCharInfo3) {
                _aCharInfo1[cwc] = 0;
                _aCharInfo3[cwc] = 0;
                cwcProcd = 0;
                Tokenize(pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);
            } else {
                sc = E_OUTOFMEMORY;
            }
        }
    }

    delete [] _aCharInfo1;
    delete [] _aCharInfo3;

    return sc;
} //BreakText

//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::Tokenize
//
//  Synopsis:   Tokenize the input buffer into words
//
//  Arguments:  [pTextSource]  --  input text source; the chunk starts at
//                                 awcBuffer[iCur]
//              [cwc]          --  # chars to process
//              [pWordSink]    --  sink for words
//              [cwcProcd]     --  # chars actually processed returned here
//              [_aCharInfo1]  --  scratch array for CT_CTYPE1 info; must
//                                 hold at least cwc + 1 entries
//              [_aCharInfo3]  --  scratch array for CT_CTYPE3 info; must
//                                 hold at least cwc + 1 entries
//              [dwBase]       --  offset added to each reported source
//                                 position
//
//  Notes:      When the chunk cannot be classified (ScanChunk fails) no
//              words are emitted and cwcProcd is set to cwc, so that the
//              caller skips the chunk instead of retrying it.
//
//  History:    10-Aug-95   SitaramR    Created
//
//----------------------------------------------------------------------------

void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
                                ULONG cwc,
                                IWordSink *pWordSink,
                                ULONG& cwcProcd,
                                PWORD _aCharInfo1,
                                PWORD _aCharInfo3,
                                DWORD dwBase)
{
    const WCHAR* pwcChunk = NULL;
    WCHAR        _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];
    const unsigned cwcBufZWS = sizeof(_awcBufZWS) / sizeof(_awcBufZWS[0]);

    pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];

    if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc)) {
        // Always give the out-parameter a defined value; reporting the
        // chunk as consumed lets the caller make forward progress.
        cwcProcd = cwc;
        return;
    }

    BOOL fWordHasZWS = FALSE;     // Does the current word have a zero-width-space ?
    unsigned uLenZWS = 0;         // Length of a word minus embedded zero-width-spaces

    //
    // iBeginWord is the offset into _aCharInfo of the beginning character of
    // a word.  iCur is the first *unprocessed* character.
    // They are indexes into the mapped chunk.
    //

    unsigned iBeginWord = 0;
    unsigned iCur = 0;

    //
    // Pump words from mapped chunk to word sink
    //
    while (iCur < cwc)
    {
        //
        // Skip whitespace, punctuation, etc.
        //
        for (; iCur < cwc; iCur++)
            if (IsWordChar (iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
                break;

        // iCur points to a word char or is equal to cwc

        iBeginWord = iCur;
        if (iCur < cwc)
            iCur++; // we knew it pointed at word character

        //
        // Find word break. Filter may output Unicode zero-width-space, which
        // should be ignored by the wordbreaker.
        //
        fWordHasZWS = FALSE;
        for (; iCur < cwc; iCur++)
        {
            if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))
            {
                if (pwcChunk[iCur] == ZERO_WIDTH_SPACE )
                    fWordHasZWS = TRUE;
                else
                    break;
            }
        }

        if (fWordHasZWS)
        {
            //
            // Copy word into _awcBufZWS after stripping zero-width-spaces.
            // The copy is bounded by the size of the stack buffer so that a
            // chunk longer than the buffer cannot overrun it.
            //

            uLenZWS = 0;
            for ( unsigned i=iBeginWord; i<iCur && uLenZWS<cwcBufZWS; i++ )
            {
                if (pwcChunk[i] != ZERO_WIDTH_SPACE )
                    _awcBufZWS[uLenZWS++] = pwcChunk[i];
            }
        }

        // iCur points to a non-word char or is equal to cwc

        if (iCur < cwc)
        {
            // store the word and its source position
            if ( fWordHasZWS )
                pWordSink->PutWord( uLenZWS, _awcBufZWS,                       // stripped word
                                    iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
            else
                pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word
                                    iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);

            iCur++; // we knew it pointed at non-word char
            iBeginWord = iCur; // in case we exit the loop now
        }

    } // next word

//    Win4Assert( iCur == cwc );
    // End of words in chunk.
    // iCur == cwc
    // iBeginWord points at beginning of word or == cwc

    if ( 0 == iBeginWord && 0 != iCur )
    {
        // A single word fills from beginning of this chunk
        // to the end. This is either a very long word or
        // a short word in a leftover buffer.
        // (The iCur test keeps an empty chunk from emitting an empty word.)

        // store the word and its source position
        if ( fWordHasZWS )
            pWordSink->PutWord( uLenZWS, _awcBufZWS,       // stripped word
                                iCur, pTextSource->iCur + dwBase); // its source pos.
        else
            pWordSink->PutWord( iCur, pwcChunk,           // the word
                                iCur, pTextSource->iCur + dwBase); // its source pos.

        //
        // Position it to not add the word twice.
        //
        iBeginWord = iCur;
    }

    //
    // If this is the last chunk from text source, then process the
    // last fragment
    //

    if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
    {
        // store the word and its source position
        if ( fWordHasZWS )
            pWordSink->PutWord( uLenZWS, _awcBufZWS,                        // stripped word
                                iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);
        else
            pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord,  // the word
                                iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);

        iBeginWord = iCur;
    }

    // Anything from iBeginWord onwards is an unfinished word that the caller
    // presents again at the start of the next chunk.
    cwcProcd = iBeginWord;
}
Add source files 2020-09-26 03:20:57 -05:00			`#include <windows.h>`
			`#include "IWBrKr.h"`
			`#include "DefBrKr.h"`

			`#define ZERO_WIDTH_SPACE 0x200B`
			`#define MAX_Def_WordBrKr_Prcess_Len 1000`

			`BOOL IsWinNT(void)`
			`{`
			`OSVERSIONINFOA osVersionInfo;`
			`BOOL fRet = FALSE;`

			`osVersionInfo.dwOSVersionInfoSize = sizeof(osVersionInfo);`
			`GetVersionExA(&osVersionInfo);`
			`if (osVersionInfo.dwPlatformId == VER_PLATFORM_WIN32_NT) {`
			`fRet = TRUE;`
			`}`
			`return fRet;`
			`}`

			`BOOL MyGetStringTypeEx(`
			`LCID LocalID,`
			`DWORD dwInfoType,`
			`const WCHAR *lpSrcStr, // unicode base`
			`INT cchSrc,`
			`LPWORD lpCharType)`
			`{`
			`BOOL fRet = FALSE;`

			`if (IsWinNT()) {`
			`fRet = GetStringTypeW(dwInfoType, lpSrcStr, cchSrc,lpCharType);`
			`} else {`
			`DWORD dwANSISize = 0;`
			`dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,`
			`NULL, 0, NULL, NULL);`
			`if (dwANSISize) {`
			`LPSTR lpAnsiStr = NULL;`
			`lpAnsiStr = new CHAR[dwANSISize];`
			`if (lpAnsiStr) {`
			`dwANSISize = WideCharToMultiByte(GetACP(), WC_COMPOSITECHECK, lpSrcStr, cchSrc,`
			`lpAnsiStr, dwANSISize, NULL, NULL);`
			`fRet = GetStringTypeExA(LocalID, dwInfoType, lpAnsiStr, dwANSISize, lpCharType);`
			`if (ERROR_INVALID_PARAMETER == GetLastError() && (CT_CTYPE1 == dwInfoType \|\| CT_CTYPE3 == dwInfoType)) {`
			`for (INT i = 0; i < cchSrc; ++i) {`
			`switch (dwInfoType) {`
			`case CT_CTYPE1:`
			`lpCharType[i] = C1_ALPHA;`
			`break;`
			`case CT_CTYPE3:`
			`lpCharType[i] = (C3_NONSPACING \| C3_ALPHA);`
			`break;`
			`}`
			`}`
			`fRet = TRUE;`
			`}`
			`delete [] lpAnsiStr;`
			`lpAnsiStr = NULL;`
			`}`
			`}`
			`}`
			`return fRet;`
			`}`


			`CDefWordBreaker::CDefWordBreaker()`
			`{`
			`ccCompare = MAX_Def_WordBrKr_Prcess_Len;`
			`}`
			`//+-------------------------------------------------------------------------`
			`//`
			`// Method: CDefWordBreaker::IsWordChar`
			`//`
			`// Synopsis: Find whether the i'th character in the buffer _awString`
			`// is a word character (rather than word break)`
			`//`
			`// Arguments: [i] -- index into _awString`
			`//`
			`// History: 22-Jul-1994 BartoszM Created`
			`//`
			`//--------------------------------------------------------------------------`

			`inline BOOL CDefWordBreaker::IsWordChar(`
			`int i,`
			`PWORD _aCharInfo1,`
			`PWORD _aCharInfo3,`
			`const WCHAR* pwcChunk) const`
			`{`
			`if ( (_aCharInfo1[i] & (C1_ALPHA \| C1_DIGIT))`
			`\|\| (_aCharInfo3[i] & C3_NONSPACING) )`
			`{`
			`return TRUE;`
			`}`

			`WCHAR c = pwcChunk[i];`

			`if (c == L'_')`
			`return TRUE;`

			`if (c == 0xa0) // non breaking space`
			`{`
			`// followed by a non-spacing character`
			`// (looking ahead is okay)`
			`if (_aCharInfo3[i+1] & C3_NONSPACING)`
			`return TRUE;`
			`}`
			`return FALSE;`
			`}`

			`//+---------------------------------------------------------------------------`
			`//`
			`// Member: CDefWordBreaker::ScanChunk`
			`//`
			`// Synopsis: For each character find its type`
			`//`
			`//`
			`// History: 16-Aug-94 BartoszM Created`
			`//`
			`//----------------------------------------------------------------------------`
			`BOOL CDefWordBreaker::ScanChunk(`
			`PWORD _aCharInfo1,`
			`PWORD _aCharInfo3,`
			`const WCHAR *pwcChunk,`
			`ULONG ucwc)`
			`{`
			`BOOL fRet = FALSE;`

			`// POSIX character typing, Source, Size of source, Character info`
			`if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE1, pwcChunk, ucwc, _aCharInfo1)) {`
			`// Additional POSIX, Source, Size of source, Character info 3`
			`} else if (!MyGetStringTypeEx(GetSystemDefaultLCID(), CT_CTYPE3, pwcChunk, ucwc, _aCharInfo3)) { //`
			`} else {`
			`fRet = TRUE;`
			`}`
			`return fRet;`
			`}`

			`/*`
			`BOOL CDefWordBreaker::ScanChunk(`
			`PWORD _aCharInfo1,`
			`PWORD _aCharInfo3,`
			`const WCHAR *pwcChunk,`
			`ULONG ucwc)`
			`{`

			`//`
			`// GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if`
			`// we pass in a null string.`
			`//`
			`// Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );`

			`if (IsWinNT())`
			`{`
			`if (!MyGetStringTypeEx(0, // Dummy`
			`CT_CTYPE1, // POSIX character typing`
			`pwcChunk, // Source`
			`ucwc, // Size of source`
			`_aCharInfo1 ) ) // Character info`
			`{`
			`return FALSE;`
			`}`

			`if ( !MyGetStringTypeEx(0, // Dummy`
			`CT_CTYPE3, // Additional POSIX`
			`pwcChunk, // Source`
			`ucwc, // Size of source`
			`_aCharInfo3 ) ) // Character info 3`
			`{`
			`return FALSE;`
			`}`
			`}`
			`else`
			`{`
			`//`
			`// BUGBUG: This is all wrong -- we don't know if this is the right`
			`// locale to use and there isn't a way to know at this point.`
			`//`

			`if (!MyGetStringTypeEx( GetSystemDefaultLCID(),`
			`CT_CTYPE1, // POSIX character typing`
			`pwcChunk, // Source`
			`ucwc, // Size of source`
			`_aCharInfo1 ) ) // Character info`
			`{`
			`// ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",`
			`// GetLastError() ));`

			`// Win9x just stinks. No 2 ways about it.`

			`if ( ERROR_INVALID_PARAMETER == GetLastError() )`
			`{`
			`for ( unsigned i = 0; i < ucwc; i++ )`
			`_aCharInfo1[i] = C1_ALPHA;`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`

			`if ( !MyGetStringTypeEx(GetSystemDefaultLCID(),`
			`CT_CTYPE3, // Additional POSIX`
			`pwcChunk, // Source`
			`ucwc, // Size of source`
			`_aCharInfo3 ) ) // Character info 3`
			`{`
			`// ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",`
			`// GetLastError() ));`

			`// Win9x just stinks. No 2 ways about it.`

			`if ( ERROR_INVALID_PARAMETER == GetLastError() )`
			`{`
			`for ( unsigned i = 0; i < ucwc; i++ )`
			`_aCharInfo3[i] = ( C3_NONSPACING \| C3_ALPHA );`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`
			`}`

			`return TRUE;`
			`} //ScanChunk`
			`*/`
			`//+---------------------------------------------------------------------------`
			`//`
			`// Member: CDefWordBreaker::BreakText`
			`//`
			`// Synopsis: Break input stream into words.`
			`//`
			`// Arguments: [pTextSource] - source of input buffers`
			`// [pWordSink] - sink for words`
			`// [pPhraseSink] - sink for noun phrases`
			`//`
			`// History: 07-June-91 t-WadeR Created`
			`// 12-Oct-92 AmyA Added Unicode support`
			`// 18-Nov-92 AmyA Overloaded`
			`// 11-Apr-94 KyleP Sync with spec`
			`// 26-Aug-94 BartoszM Fixed Unicode parsing`
			`//`
			`//----------------------------------------------------------------------------`

			`SCODE CDefWordBreaker::BreakText(`
			`TEXT_SOURCE *pTextSource,`
			`IWordSink *pWordSink,`
			`IPhraseSink *pPhraseSink,`
			`DWORD dwBase)`
			`{`
			`LPWORD _aCharInfo1 = NULL;`
			`LPWORD _aCharInfo3 = NULL;`

			`if ( 0 == pTextSource )`
			`return E_INVALIDARG;`

			`if ( 0 == pWordSink \|\| pTextSource->iCur == pTextSource->iEnd)`
			`return S_OK;`

			`if (pTextSource->iCur > pTextSource->iEnd)`
			`{`
			`// Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );`
			`return E_FAIL;`
			`}`

			`SCODE sc = S_OK;`

			`ULONG cwc, cwcProcd; // cwcProcd is # chars actually processed by Tokenize()`

			`do {`
			`//`
			`// Flag for first time thru loop below. This is to fix the case`
			`// where the length of the buffer passed in is less than`
			`// MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN`
			`// and we break out the inner loop and call`
			`// pfnFillTextBuffer without having processed any characters,`
			`// and so pfnFillTextBuffer returns TRUE without adding any new`
			`// characters and this results in an infinite loop.`
			`BOOL fFirstTime = TRUE;`
			`while (pTextSource->iCur < pTextSource->iEnd) {`
			`cwc = pTextSource->iEnd - pTextSource->iCur;`
			`// Process in buckets of MAX_II_BUFER_LEN only`
			`if (cwc >= CDefWordBreaker::ccCompare) {`
			`cwc = CDefWordBreaker::ccCompare;`
			`} else if ( !fFirstTime) {`
			`break;`
			`} else {`
			`}`

			`if (_aCharInfo1) {`
			`delete [] _aCharInfo1;`
			`_aCharInfo1 = NULL;`
			`}`
			`if (_aCharInfo3) {`
			`delete [] _aCharInfo3;`
			`_aCharInfo3 = NULL;`
			`}`
			`_aCharInfo1 = new WORD[cwc + 1];`
			`_aCharInfo3 = new WORD[cwc + 1];`
			`if (_aCharInfo1 && _aCharInfo3) {`
			`Tokenize( pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);`
			`}`

			`// Win4Assert( cwcProcd <= cwc );`
			`pTextSource->iCur += cwcProcd;`
			`fFirstTime = FALSE;`
			`}`
			`} while(SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)));`

			`cwc = pTextSource->iEnd - pTextSource->iCur;`
			`// we know that the remaining text should be less than ccCompare`

			`// Win4Assert( cwc < CDefWordBreaker::ccCompare );`

			`if (0 != cwc) {`
			`if (_aCharInfo1) {`
			`delete [] _aCharInfo1;`
			`_aCharInfo1 = NULL;`
			`}`
			`if (_aCharInfo3) {`
			`delete [] _aCharInfo3;`
			`_aCharInfo3 = NULL;`
			`}`
			`_aCharInfo1 = new WORD[cwc + 1];`
			`_aCharInfo3 = new WORD[cwc + 1];`
			`if (_aCharInfo1 && _aCharInfo1) {`
			`Tokenize(pTextSource, cwc, pWordSink, cwcProcd, _aCharInfo1, _aCharInfo3, dwBase);`
			`}`
			`}`

			`if (_aCharInfo1) {`
			`delete [] _aCharInfo1;`
			`_aCharInfo1 = NULL;`
			`}`
			`if (_aCharInfo3) {`
			`delete [] _aCharInfo3;`
			`_aCharInfo3 = NULL;`
			`}`

			`return sc;`
			`} //BreakText`

			`//+---------------------------------------------------------------------------`
			`//`
			`// Member: CDefWordBreaker::Tokenize`
			`//`
			`// Synopsis: Tokenize the input buffer into words`
			`//`
			`// Arguments: [pTextSource] -- input text source`
			`// [cwc] -- # chars to process`
			`// [pWordSink] -- sink for words`
			`// [cwcProd] -- # chars actually processed returned here`
			`//`
			`// History: 10-Aug-95 SitaramR Created`
			`//`
			`//----------------------------------------------------------------------------`

			`void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,`
			`ULONG cwc,`
			`IWordSink *pWordSink,`
			`ULONG& cwcProcd,`
			`PWORD _aCharInfo1,`
			`PWORD _aCharInfo3,`
			`DWORD dwBase)`
			`{`
			`const WCHAR* pwcChunk = NULL;`
			`WCHAR _awcBufZWS[MAX_Def_WordBrKr_Prcess_Len];`

			`pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];`

			`if (!ScanChunk(_aCharInfo1, _aCharInfo3, pwcChunk, cwc)) {`
			`return;`
			`}`

			`BOOL fWordHasZWS = FALSE; // Does the current word have a zero-width-space ?`
			`unsigned uLenZWS; // Length of a word minus embedded zero-width-spaces`

			`//`
			`// iBeginWord is the offset into _aCharInfo of the beginning character of`
			`// a word. iCur is the first unprocessed character.`
			`// They are indexes into the mapped chunk.`
			`//`

			`unsigned iBeginWord = 0;`
			`unsigned iCur = 0;`

			`//`
			`// Pump words from mapped chunk to word sink`
			`//`
			`while (iCur < cwc)`
			`{`
			`//`
			`// Skip whitespace, punctuation, etc.`
			`//`
			`for (; iCur < cwc; iCur++)`
			`if (IsWordChar (iCur, _aCharInfo1, _aCharInfo3, pwcChunk))`
			`break;`

			`// iCur points to a word char or is equal to _cMapped`

			`iBeginWord = iCur;`
			`if (iCur < cwc)`
			`iCur++; // we knew it pointed at word character`

			`//`
			`// Find word break. Filter may output Unicode zero-width-space, which`
			`// should be ignored by the wordbreaker.`
			`//`
			`fWordHasZWS = FALSE;`
			`for (; iCur < cwc; iCur++)`
			`{`
			`if (!IsWordChar(iCur, _aCharInfo1, _aCharInfo3, pwcChunk))`
			`{`
			`if (pwcChunk[iCur] == ZERO_WIDTH_SPACE )`
			`fWordHasZWS = TRUE;`
			`else`
			`break;`
			`}`
			`}`

			`if (fWordHasZWS)`
			`{`
			`//`
			`// Copy word into _awcBufZWS after stripping zero-width-spaces`
			`//`

			`uLenZWS = 0;`
			`for ( unsigned i=iBeginWord; i<iCur; i++ )`
			`{`
			`if (pwcChunk[i] != ZERO_WIDTH_SPACE )`
			`_awcBufZWS[uLenZWS++] = pwcChunk[i];`
			`}`
			`}`

			`// iCur points to a non-word char or is equal to _cMapped`

			`if (iCur < cwc)`
			`{`
			`// store the word and its source position`
			`if ( fWordHasZWS )`
			`pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word`
			`iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);`
			`else`
			`pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word`
			`iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);`

			`iCur++; // we knew it pointed at non-word char`
			`iBeginWord = iCur; // in case we exit the loop now`
			`}`

			`} // next word`

			`// Win4Assert( iCur == _cMapped );`
			`// End of words in chunk.`
			`// iCur == _cMapped`
			`// iBeginWord points at beginning of word or == _cMapped`

			`if ( 0 == iBeginWord )`
			`{`
			`// A single word fills from beginning of this chunk`
			`// to the end. This is either a very long word or`
			`// a short word in a leftover buffer.`

			`// store the word and its source position`
			`if ( fWordHasZWS )`
			`pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word`
			`iCur, pTextSource->iCur + dwBase); // its source pos.`
			`else`
			`pWordSink->PutWord( iCur, pwcChunk, // the word`
			`iCur, pTextSource->iCur + dwBase); // its source pos.`

			`//`
			`// Position it to not add the word twice.`
			`//`
			`iBeginWord = iCur;`
			`}`

			`//`
			`// If this is the last chunk from text source, then process the`
			`// last fragment`
			`//`

			`if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )`
			`{`
			`// store the word and its source position`
			`if ( fWordHasZWS )`
			`pWordSink->PutWord( uLenZWS, _awcBufZWS, // stripped word`
			`iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);`
			`else`
			`pWordSink->PutWord( iCur - iBeginWord, pwcChunk + iBeginWord, // the word`
			`iCur - iBeginWord, pTextSource->iCur + iBeginWord + dwBase);`

			`iBeginWord = iCur;`
			`}`

			`cwcProcd = iBeginWord;`
			`}`