windows-nt/Source/XPSP1/NT/inetsrv/query/deflang/defbreak.cxx

//+---------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1991 - 1994.
//
//  File:       DefBreak.cxx
//
//  Contents:   Text Word Breaker
//
//  History:    08-May-91   t-WadeR     Created stubs, filled in ASCII code.
//              06-Jun-91   t-WadeR     Changed to use input-based pipeline
//              11-Apr-92   KyleP       Sync to spec
//
//----------------------------------------------------------------------------

#include <pch.cxx>
#pragma hdrstop

#include <DefBreak.hxx>

//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::CDefWordBreaker
//
//  Synopsis:   Constructor for the CDefWordBreaker class.
//
//  History:    07-June-91  t-WadeR     Created
//              12-Oct-92   AmyA        Added Unicode support
//
//----------------------------------------------------------------------------

CDefWordBreaker::CDefWordBreaker()
        : _cRefs(1)
{
    ciDebugOut(( DEB_ITRACE, "Creating default wordbreaker\n" ));

    // Look at IsWordChar. We don't want the last non-breaking
    // space in the chunk to be considered a word break.
    // It will be processed again (correctly) when we move to the next chunk.

    _aCharInfo3 [CDefWordBreaker::ccCompare] = C3_NONSPACING;
}

//+---------------------------------------------------------------------------
//
//  Member:     CWordBreaker::~CWordBreaker
//
//  Synopsis:   Destructor for the CWordBreaker class.
//
//----------------------------------------------------------------------------

CDefWordBreaker::~CDefWordBreaker()
{
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::QueryInterface
//
//  Synopsis:   Rebind to other interface
//
//  Arguments:  [riid]      -- IID of new interface
//              [ppvObject] -- New interface * returned here
//
//  Returns:    S_OK if bind succeeded, E_NOINTERFACE if bind failed
//
//  History:    23-Feb-1994     KyleP   Created
//
//--------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreaker::QueryInterface( REFIID riid,
                                                         void ** ppvObject)
{
    if ( 0 == ppvObject )
        return E_INVALIDARG;

    if ( IID_IWordBreaker == riid )
        *ppvObject = (IUnknown *)(IWordBreaker *)this;
    else if ( IID_IUnknown == riid )
        *ppvObject = (IUnknown *)(IPersist *)(IPersistFile *)this;
    else
    {
        *ppvObject = 0;
        return E_NOINTERFACE;
    }

    AddRef();
    return S_OK;
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::AddRef
//
//  Synopsis:   Increments refcount
//
//  History:    23-Feb-1994     KyleP   Created
//
//--------------------------------------------------------------------------

ULONG STDMETHODCALLTYPE CDefWordBreaker::AddRef()
{
    return InterlockedIncrement( &_cRefs );
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::Release
//
//  Synopsis:   Decrement refcount.  Delete if necessary.
//
//  History:    23-Feb-1994     KyleP   Created
//
//--------------------------------------------------------------------------

ULONG STDMETHODCALLTYPE CDefWordBreaker::Release()
{
    unsigned long uTmp = InterlockedDecrement( &_cRefs );

    if ( 0 == uTmp )
        delete this;

    return uTmp;
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::Init
//
//  Synopsis:   Initialize word-breaker
//
//  Arguments:  [fQuery]         -- TRUE if query-time
//              [ulMaxTokenSize] -- Maximum size token stored by caller
//              [pfLicense]      -- Set to true if use restricted
//
//  Returns:    Status code
//
//  History:    11-Apr-1994  KyleP       Created
//
//--------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreaker::Init( BOOL fQuery,
                                               ULONG ulMaxTokenSize,
                                               BOOL *pfLicense )
{
    if ( 0 == pfLicense )
        return E_INVALIDARG;

    *pfLicense = FALSE;

    return S_OK;
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreaker::IsWordChar
//
//  Synopsis:   Find whether the i'th character in the buffer _awString
//              is a word character (rather than word break)
//
//  Arguments:  [i] -- index into _awString
//
//  History:    22-Jul-1994  BartoszM       Created
//
//--------------------------------------------------------------------------

inline BOOL CDefWordBreaker::IsWordChar (int i) const
{
    if (   (_aCharInfo1[i] & (C1_ALPHA | C1_DIGIT))
        || (_aCharInfo3[i] & C3_NONSPACING)  )
    {
        return TRUE;
    }

    WCHAR c = _pwcChunk[i];

    if (c == L'_')
        return TRUE;

    if (c == 0xa0) // non breaking space
    {
        // followed by a non-spacing character
        // (looking ahead is okay)
        if (_aCharInfo3[i+1] & C3_NONSPACING)
            return TRUE;
    }
    return FALSE;
}

//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::ScanChunk
//
//  Synopsis:   For each character find its type
//
//
//  History:    16-Aug-94  BartoszM     Created
//
//----------------------------------------------------------------------------

BOOL CDefWordBreaker::ScanChunk ()
{

    //
    // GetStringTypeW is returning error 87 (ERROR_INVALID_PARAMETER) if
    // we pass in a null string.
    //
    Win4Assert( (0 != _cMapped) && (0 != _pwcChunk) );

    if ( !GetStringTypeW( CT_CTYPE1,              // POSIX character typing
                          _pwcChunk,              // Source
                          _cMapped,               // Size of source
                          _aCharInfo1 ) )         // Character info
    {
        ciDebugOut(( DEB_ERROR, "GetStringTypeW returned %d\n",
                     GetLastError() ));
        return FALSE;
    }

    if ( !GetStringTypeW( CT_CTYPE3,              // Additional POSIX
                          _pwcChunk,
                          _cMapped,               // Size of source
                          _aCharInfo3 ) )         // Character info 3
    {
        ciDebugOut(( DEB_ERROR, "GetStringTypeW CTYPE3 returned %d\n",
                     GetLastError() ));
        return FALSE;
    }
    return TRUE;
}

//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::BreakText
//
//  Synopsis:   Break input stream into words.
//
//  Arguments:  [pTextSource] - source of input buffers
//              [pWordSink] - sink for words
//              [pPhraseSink] - sink for noun phrases
//
//  History:    07-June-91  t-WadeR     Created
//              12-Oct-92   AmyA        Added Unicode support
//              18-Nov-92   AmyA        Overloaded
//              11-Apr-94   KyleP       Sync with spec
//              26-Aug-94   BartoszM    Fixed Unicode parsing
//
//----------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreaker::BreakText( TEXT_SOURCE *pTextSource,
                                                    IWordSink *pWordSink,
                                                    IPhraseSink *pPhraseSink )
{
    if ( 0 == pTextSource )
        return E_INVALIDARG;

    if ( 0 == pWordSink || pTextSource->iCur == pTextSource->iEnd)
        return S_OK;

    if (pTextSource->iCur > pTextSource->iEnd)
    {
        Win4Assert ( !"BreakText called with bad TEXT_SOURCE" );
        return E_FAIL;
    }

    SCODE sc = S_OK;

    ULONG cwc, cwcProcd;     // cwcProcd is # chars actually processed by Tokenize()
    TRY
    {
        do
        {
            //
            // Flag for first time thru loop below. This is to fix the case
            // where the length of the buffer passed in is less than
            // MAX_II_BUFFER_LEN. In this case iEnd-iCur is <= MAX_II_BUFFER_LEN
            // and we break out the inner loop and call
            // pfnFillTextBuffer without having processed any characters,
            // and so pfnFillTextBuffer returns TRUE without adding any new
            // characters and this results in an infinite loop.
            //
            BOOL fFirstTime = TRUE;

            while ( pTextSource->iCur < pTextSource->iEnd )
            {
                cwc = pTextSource->iEnd - pTextSource->iCur;

                //
                // Process in buckets of MAX_II_BUFER_LEN only
                //
                if ( cwc >= CDefWordBreaker::ccCompare )
                    cwc = CDefWordBreaker::ccCompare;
                else if ( !fFirstTime )
                    break;

                Tokenize( pTextSource, cwc, pWordSink, cwcProcd );

                Win4Assert( cwcProcd <= cwc );

                pTextSource->iCur += cwcProcd;

                fFirstTime = FALSE;
            }
        } while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );

        cwc = pTextSource->iEnd - pTextSource->iCur;

        // we know that the remaining text should be less than ccCompare
        Win4Assert( cwc < CDefWordBreaker::ccCompare );

        if ( 0 != cwc )
        {
            Tokenize( pTextSource, cwc, pWordSink, cwcProcd );
        }

    }
    CATCH (CException, e)
    {
        ciDebugOut(( DEB_ITRACE,
                     "Exception 0x%x caught when breaking text in default wordbreaker\n",
                     e.GetErrorCode() ));

        sc = GetOleError( e );
    }
    END_CATCH

    return sc;
}


//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::Tokenize
//
//  Synopsis:   Tokenize the input buffer into words
//
//  Arguments:  [pTextSource]  --  input text source
//              [cwc]          --  # chars to process
//              [pWordSink]    --  sink for words
//              [cwcProd]      --  # chars actually processed returned here
//
//  History:    10-Aug-95   SitaramR    Created
//
//----------------------------------------------------------------------------

void CDefWordBreaker::Tokenize( TEXT_SOURCE *pTextSource,
                                ULONG cwc,
                                IWordSink *pWordSink,
                                ULONG& cwcProcd )
{
    _pwcChunk = &pTextSource->awcBuffer[pTextSource->iCur];
    _cMapped = cwc;


    if ( !ScanChunk() )
        THROW( CException( E_FAIL ) );

    BOOL fWordHasZWS = FALSE;     // Does the current word have a zero-width-space ?
    unsigned uLenZWS;             // Length of a word minus embedded zero-width-spaces

    //
    // iBeginWord is the offset into _aCharInfo of the beginning character of
    // a word.  iCur is the first *unprocessed* character.
    // They are indexes into the mapped chunk.
    //

    unsigned iBeginWord = 0;
    unsigned iCur = 0;

    SCODE sc = S_OK;

    //
    // Pump words from mapped chunk to word sink
    //
    while ( iCur < _cMapped )
    {
        //
        // Skip whitespace, punctuation, etc.
        //
        for (; iCur < _cMapped; iCur++)
            if ( IsWordChar (iCur) )
                break;

        // iCur points to a word char or is equal to _cMapped

        iBeginWord = iCur;
        if (iCur < _cMapped)
            iCur++; // we knew it pointed at word character

        //
        // Find word break. Filter may output Unicode zero-width-space, which
        // should be ignored by the wordbreaker.
        //
        fWordHasZWS = FALSE;
        for (; iCur < _cMapped; iCur++)
        {
            if ( !IsWordChar (iCur) )
            {
                if ( _pwcChunk[iCur] == ZERO_WIDTH_SPACE )
                    fWordHasZWS = TRUE;
                else
                    break;
            }
        }

        if ( fWordHasZWS )
        {
            //
            // Copy word into _awcBufZWS after stripping zero-width-spaces
            //

            uLenZWS = 0;
            for ( unsigned i=iBeginWord; i<iCur; i++ )
            {
                if ( _pwcChunk[i] != ZERO_WIDTH_SPACE )
                    _awcBufZWS[uLenZWS++] = _pwcChunk[i];
            }
        }

        // iCur points to a non-word char or is equal to _cMapped

        if ( iCur < _cMapped )
        {
            // store the word and its source position
            if ( fWordHasZWS )
                sc = pWordSink->PutWord( uLenZWS,
                                         _awcBufZWS,    // stripped word
                                         iCur - iBeginWord,
                                         pTextSource->iCur + iBeginWord );
            else
                sc = pWordSink->PutWord( iCur - iBeginWord,
                                         _pwcChunk + iBeginWord, // the word
                                         iCur - iBeginWord,
                                         pTextSource->iCur + iBeginWord );

            if ( FAILED( sc ) )
                THROW( CException( sc ) );

            iCur++; // we knew it pointed at non-word char
            iBeginWord = iCur; // in case we exit the loop now
        }
    } // next word

    Win4Assert( iCur == _cMapped );

    // End of words in chunk.
    // iCur == _cMapped
    // iBeginWord points at beginning of word or == _cMapped

    if ( 0 == iBeginWord )
    {
        // A single word fills from beginning of this chunk
        // to the end. This is either a very long word or
        // a short word in a leftover buffer.

        // store the word and its source position
        if ( fWordHasZWS )
            sc = pWordSink->PutWord( uLenZWS,
                                     _awcBufZWS,       // stripped word
                                     iCur,
                                     pTextSource->iCur ); // its source pos.
        else
            sc = pWordSink->PutWord( iCur,
                                     _pwcChunk,       // the word
                                     iCur,
                                     pTextSource->iCur ); // its source pos.

        if ( FAILED( sc ) )
            THROW( CException( sc ) );

        //
        // Position it to not add the word twice.
        //

        iBeginWord = iCur;
    }

    //
    // If this is the last chunk from text source, then process the
    // last fragment
    //

    if ( cwc < CDefWordBreaker::ccCompare && iBeginWord != iCur )
    {
        // store the word and its source position
        if ( fWordHasZWS )
            sc = pWordSink->PutWord( uLenZWS,
                                     _awcBufZWS,    // stripped word
                                     iCur - iBeginWord,
                                     pTextSource->iCur + iBeginWord );
        else
            sc = pWordSink->PutWord( iCur - iBeginWord,
                                     _pwcChunk + iBeginWord,  // the word
                                     iCur - iBeginWord,
                                     pTextSource->iCur + iBeginWord );

        if ( FAILED( sc ) )
            THROW( CException( sc ) );

        iBeginWord = iCur;
    }

    cwcProcd = iBeginWord;
}


//+---------------------------------------------------------------------------
//
//  Member:     CDefWordBreaker::ComposePhrase
//
//  Synopsis:   Convert a noun and a modifier into a phrase
//
//  Arguments:  [pwcNoun] -- pointer to noun.
//              [cwcNoun] -- count of chars in pwcNoun
//              [pwcModifier] -- pointer to word modifying pwcNoun
//              [cwcModifier] -- count of chars in pwcModifier
//              [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
//
//  History:    10-Aug-95   SitaramR    Created Header
//
//----------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreaker::ComposePhrase( WCHAR const *pwcNoun,
                                                        ULONG cwcNoun,
                                                        WCHAR const *pwcModifier,
                                                        ULONG cwcModifier,
                                                        ULONG ulAttachmentType,
                                                        WCHAR *pwcPhrase,
                                                        ULONG *pcwcPhrase )
{
    //
    // Never emitted phrase in the first place.
    //

    ciDebugOut(( DEB_WARN,
                 "IWordBreaker::ComposePhrase called on default word breaker\n" ));
    return( E_FAIL );
}

//+---------------------------------------------------------------------------
//
//  Member:     CWordBreaker::GetLicenseToUse
//
//  Synopsis:   Returns a pointer to vendors license information
//
//  Arguments:  [ppwcsLicense] -- ptr to ptr to which license info is returned
//
//  History:    10-Aug-95  SitaramR     Created Header
//
//----------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreaker::GetLicenseToUse( const WCHAR **ppwcsLicense )
{
    if ( 0 == ppwcsLicense )
        return E_INVALIDARG;

    static WCHAR const * wcsCopyright = L"Copyright (c) Microsoft Corporation, 1991-1998";
    *ppwcsLicense = wcsCopyright;

    return( S_OK );
}


extern long gulcInstances;


//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::CDefWordBreakerCF
//
//  Synopsis:   Default Word Breaker class factory constructor
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

CDefWordBreakerCF::CDefWordBreakerCF( )
        : _cRefs( 1 )
{
    InterlockedIncrement( &gulcInstances );
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::~CDefWordBreakerCF
//
//  Synopsis:   Default Word Breaker class factory destructor
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

CDefWordBreakerCF::~CDefWordBreakerCF()
{
    InterlockedDecrement( &gulcInstances );
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::QueryInterface
//
//  Synopsis:   Rebind to other interface
//
//  Arguments:  [riid]      -- IID of new interface
//              [ppvObject] -- New interface * returned here
//
//  Returns:    S_OK if bind succeeded, E_NOINTERFACE if bind failed
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreakerCF::QueryInterface(
    REFIID  riid,
    void ** ppvObject )
{
    if ( IID_IClassFactory == riid )
        *ppvObject = (IUnknown *)(IClassFactory *)this;
    else if ( IID_IUnknown == riid )
        *ppvObject = (IUnknown *)this;
    else
    {
        *ppvObject = 0;
        return E_NOINTERFACE;
    }

    AddRef();
    return S_OK;
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::AddRef
//
//  Synopsis:   Increments refcount
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

ULONG STDMETHODCALLTYPE CDefWordBreakerCF::AddRef()
{
    return InterlockedIncrement( &_cRefs );
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::Release
//
//  Synopsis:   Decrement refcount.  Delete if necessary.
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

ULONG STDMETHODCALLTYPE CDefWordBreakerCF::Release()
{
    unsigned long uTmp = InterlockedDecrement( &_cRefs );

    if ( 0 == uTmp )
        delete this;

    return uTmp;
}


//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::CreateInstance
//
//  Synopsis:   Creates new CDefWordBreaker object
//
//  Arguments:  [pUnkOuter] -- 'Outer' IUnknown
//              [riid]      -- Interface to bind
//              [ppvObject] -- Interface returned here
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreakerCF::CreateInstance( IUnknown * pUnkOuter,
                                                        REFIID riid,
                                                        void  * * ppvObject )
{
    CDefWordBreaker *pIUnk = 0;
    SCODE sc = S_OK;

    TRY
    {
        pIUnk = new CDefWordBreaker();
        sc = pIUnk->QueryInterface(  riid , ppvObject );

        pIUnk->Release();  // Release extra refcount from QueryInterface
    }
    CATCH(CException, e)
    {
        Win4Assert( 0 == pIUnk );

        switch( e.GetErrorCode() )
        {
        case E_OUTOFMEMORY:
            sc = (E_OUTOFMEMORY);
            break;

        default:
            sc = (E_UNEXPECTED);
        }
    }
    END_CATCH;

    return (sc);
}

//+-------------------------------------------------------------------------
//
//  Method:     CDefWordBreakerCF::LockServer
//
//  Synopsis:   Force class factory to remain loaded
//
//  Arguments:  [fLock] -- TRUE if locking, FALSE if unlocking
//
//  Returns:    S_OK
//
//  History:    07-Feb-1995     SitaramR   Created
//
//--------------------------------------------------------------------------

SCODE STDMETHODCALLTYPE CDefWordBreakerCF::LockServer(BOOL fLock)
{
    if(fLock)
        InterlockedIncrement( &gulcInstances );
    else
        InterlockedDecrement( &gulcInstances );

    return(S_OK);
}