windows-nt/Source/XPSP1/NT/inetsrv/query/expander/qparse.cxx
2020-09-26 16:20:57 +08:00

839 lines
26 KiB
C++

//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1991 - 2000.
//
// File: QPARSE.CXX
//
// Contents: Query parser
//
// Classes: CQParse -- query parser
//
// History: 19-Sep-91 BartoszM Implemented.
//
//----------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <qparse.hxx>
#include <norm.hxx>
#include <drep.hxx>
#include <cci.hxx>
#include <pidmap.hxx>
#include <fa.hxx>
#include <compare.hxx>
#include "qkrep.hxx"
//
// Smart pointer declaration for CInternalPropertyRestriction.
// NOTE(review): presumably this macro generates the XInternalPropertyRestriction
// holder used in CQParse::Leaf -- confirm against the macro definition.
//
DECLARE_SMARTP( InternalPropertyRestriction )

//
// Property specifications the parser compares against or breaks phrases for.
//
static GUID guidQuery = DBQUERYGUID;

// Compared with the property of a VT_BOOL restriction in CQParse::Leaf.
static CFullPropSpec psUnfiltered( guidQuery, DISPID_QUERY_UNFILTERED );

static GUID guidStorage = PSGUID_STORAGE;

// Compared with the property of a regex restriction in CQParse::AddLpwstrHelper.
static CFullPropSpec psFilename( guidStorage, PID_STG_NAME );

// Property under which CQParse::AddLpwstrHelper breaks the reversed filename suffix.
static CFullPropSpec psRevName( guidQuery, DISPID_QUERY_REVNAME );
//+---------------------------------------------------------------------------
//
// Member: CQParse::CQParse, public
//
// Synopsis: Break phrases, normalize, and stem the query expression
//
// Arguments: [pidmap] -- Propid mapper
// [langList] -- Language list
//
// History: 19-Sep-91 BartoszM Created.
//
//----------------------------------------------------------------------------
CQParse::CQParse( CPidMapper & pidmap, CLangList & langList )
        : _flags( 0 ),                                    // no noise conditions seen yet
          _pidmap( pidmap ),                              // caller-owned, held by reference
          _langList( langList ),                          // caller-owned, held by reference
          _lcidSystemDefault( GetSystemDefaultLCID() )    // locale used by the Add*Helper methods
{
    //
    // All state is established by the initializer list above.
    //
}
//+---------------------------------------------------------------------------
//
// Member: CQParse::Parse, public
//
// Synopsis: Recursively parse expression
//
// Arguments: [pRst] -- Tree of query expressions
//
// Returns: Possibly modified expression
//
// History: 19-Sep-91 BartoszM Created.
// 18-Jan-92 KyleP Use restrictions
// 15-May-96 DwightKr Add check for NULL NOT restriction
//
// Notes: The return CRestriction will be different than [pRst].
// [pRst] is not touched.
//
//----------------------------------------------------------------------------
CRestriction* CQParse::Parse( CRestriction* pRst )
{
    //
    // Leaves are normalized / broken into phrases by Leaf().
    //

    if ( pRst->IsLeaf() )
        return Leaf( pRst );

    //
    // NOT node: parse the single child and wrap the result in a new NOT.
    // A child that parses to nothing cannot be negated.
    //

    if ( RTNot == pRst->Type() )
    {
        CNotRestriction * pNotSource = (CNotRestriction *)pRst;
        XRestriction xChild( Parse( pNotSource->GetChild() ) );

        if ( 0 == xChild.GetPointer() )
        {
            THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
        }

        CNotRestriction * pNotTarget = new CNotRestriction( xChild.GetPointer() );
        Win4Assert( pNotTarget->IsValid() );

        // The new NOT node now owns the parsed child.
        xChild.Acquire();

        return pNotTarget;
    }

    //
    // Every other non-leaf is a node with a list of children.  Build an
    // empty target node of the same kind and size.
    //

    CNodeRestriction * pnrstSource = pRst->CastToNode();
    XNodeRestriction   xnrstTarget;
    BOOL               fVector;

    if ( RTVector == pRst->Type() )
    {
        fVector = TRUE;
        xnrstTarget.Set( new CVectorRestriction( ((CVectorRestriction *)pRst)->RankMethod(),
                                                 pRst->CastToNode()->Count() ) );
    }
    else
    {
        fVector = FALSE;
        xnrstTarget.Set( new CNodeRestriction( pRst->Type(),
                                               pRst->CastToNode()->Count() ) );
    }

    Win4Assert( xnrstTarget->IsValid() );

    //
    // Vector nodes must be treated slightly differently than
    // AND/OR/ANDNOT nodes.  Noise words must be placeholders
    // in a vector node.
    //
    // cNonNoise is the number of children that may still turn out to be
    // noise before the whole node is rejected.  For AND / proximity nodes
    // it starts at 1, so the first noise child raises QUERY_E_ALLNOISE.
    //

    BOOL  fAndNode  = ( xnrstTarget->Type() == RTAnd || xnrstTarget->Type() == RTProximity );
    ULONG cNonNoise = ( fAndNode ? 1 : pnrstSource->Count() );

    for ( unsigned iChild = 0; iChild < pnrstSource->Count(); iChild++ )
    {
        CRestriction * pParsed = Parse( pnrstSource->GetChild( iChild ) );

        //
        // Noise phrases (null results) are dropped, *unless* this is a
        // vector node, in which case an empty restriction holds the slot.
        //

        if ( 0 == pParsed && fVector )
            pParsed = new CRestriction;

        if ( 0 == pParsed )
        {
            cNonNoise--;

            if ( 0 == cNonNoise )   // all components are noise only
            {
                THROW( CException( QUERY_E_ALLNOISE ) );
            }

            continue;
        }

        // Guard the child until the target node has taken it.
        XRestriction xParsed( pParsed );
        xnrstTarget->AddChild( pParsed );
        xParsed.Acquire();
    }

    return xnrstTarget.Acquire();
} //Parse
//+---------------------------------------------------------------------------
//
// Member: CQParse::Leaf, private
//
// Synopsis: Parse the leaf node of expression tree
//
// Arguments: [pRst] -- leaf restriction
//
// Returns: New restriction for the leaf, or 0 if nothing could be generated
//
// Requires: pRst->IsLeaf() TRUE
//
// History: 19-Sep-91 BartoszM Created.
// 18-Jan-92 KyleP Use restrictions
// 05-Nov-93 DwightKr Changed PutUnsignedValue => PutValue
//
//----------------------------------------------------------------------------
CRestriction* CQParse::Leaf ( CRestriction* pRst )
{
    Win4Assert ( pRst->IsLeaf() );

    switch( pRst->Type() )
    {
    case RTContent:
    {
        CContentRestriction * pContent = (CContentRestriction *) pRst;
        ULONG ulGenMethod = pContent->GenerateMethod();

        if ( ulGenMethod > GENERATE_METHOD_MAX_USER )
        {
            vqDebugOut(( DEB_ERROR,
                         "QParse: GenerateMethod 0x%x > GENERATE_METHOD_MAX_USER\n",
                         ulGenMethod ));
            THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
        }

        CQueryKeyRepository keyRep( ulGenMethod );

        BreakPhraseStatus status = BreakPhrase ( pContent->GetPhrase(),
                                                 pContent->GetProperty(),
                                                 pContent->GetLocale(),
                                                 ulGenMethod,
                                                 keyRep,
                                                 0,
                                                 _pidmap,
                                                 _langList );

        //
        // Anything other than OK / NOISE yields no restriction.  Only an
        // invalid property is an expected reason for that.
        //

        if ( BP_NOISE != status && BP_OK != status )
        {
            if ( BP_INVALID_PROPERTY != status )
            {
                Win4Assert( !"How did we get here?" );
            }

            return 0;
        }

        if ( BP_NOISE == status )
            _flags |= CI_NOISE_IN_PHRASE;

        //
        // A null restriction here means the key repository produced
        // nothing for the phrase; record that in _flags.
        //

        CRestriction * pPhraseRst = keyRep.AcqRst();

        if ( 0 != pPhraseRst )
            pPhraseRst->SetWeight( pRst->Weight() );
        else
            _flags |= CI_NOISE_PHRASE;

        return pPhraseRst;
    }

    case RTNatLanguage:
    {
        CNatLanguageRestriction * pNatLang = (CNatLanguageRestriction *) pRst;

        //
        // The vector key repository is passed to BreakPhrase both as the
        // key repository and as the phrase sink.
        //

        CVectorKeyRepository vecKeyRep( pNatLang->GetProperty(),
                                        pNatLang->GetLocale(),
                                        pRst->Weight(),
                                        _pidmap,
                                        _langList );

        BreakPhraseStatus status = BreakPhrase ( pNatLang->GetPhrase(),
                                                 pNatLang->GetProperty(),
                                                 pNatLang->GetLocale(),
                                                 GENERATE_METHOD_INFLECT,
                                                 vecKeyRep,
                                                 &vecKeyRep,
                                                 _pidmap,
                                                 _langList );

        if ( BP_NOISE != status && BP_OK != status )
        {
            if ( BP_INVALID_PROPERTY != status )
            {
                Win4Assert( !"How did we get here?" );
            }

            return 0;
        }

        if ( BP_NOISE == status )
            _flags |= CI_NOISE_IN_PHRASE;

        CRestriction * pVectorRst = vecKeyRep.AcqRst();

        if ( 0 != pVectorRst )
            pVectorRst->SetWeight( pRst->Weight() );
        else
            _flags |= CI_NOISE_PHRASE;

        return pVectorRst;
    }

    case RTProperty:
    {
        CPropertyRestriction * prstProp = (CPropertyRestriction *)pRst;

        //
        // Reject relational operators beyond the last one accepted here.
        //

        if ( getBaseRelop(prstProp->Relation()) > PRSomeBits )
        {
            vqDebugOut(( DEB_ERROR,
                         "QParse: Invalid comparison operator %d\n",
                         prstProp->Relation() ));
            THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
        }

        //
        // Map the property name to a pid, then to the real pid.  Either
        // step may yield pidInvalid.
        //

        PROPID pid = _pidmap.NameToPid( prstProp->GetProperty() );

        if ( pidInvalid != pid )
            pid = _pidmap.PidToRealPid( pid );

        if ( pidInvalid == pid )
        {
            vqDebugOut(( DEB_ERROR,
                         "QParse: Invalid property\n" ));
            THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
        }

        CInternalPropertyRestriction * prstIProp =
            new CInternalPropertyRestriction( prstProp->Relation(),
                                              pid,
                                              prstProp->Value() );
        Win4Assert( prstIProp->IsValid() );
        XInternalPropertyRestriction xrstIProp( prstIProp );

        //
        // String-valued property restrictions get a helper content
        // restriction.  A TRUE equality test on the 'unfiltered' property
        // is replaced outright by an unfiltered restriction.
        //

        switch( prstProp->Value().Type() )
        {
        case VT_BOOL:
            if ( prstProp->Value().GetBOOL() != FALSE &&
                 prstProp->Relation() == PREQ &&
                 prstProp->GetProperty() == psUnfiltered )
            {
                // The internal property restriction is no longer needed.
                delete xrstIProp.Acquire();

                return new CUnfilteredRestriction;
            }
            break;

        case VT_LPWSTR | VT_VECTOR:
            AddLpwstrVectorHelper( prstProp, prstIProp );
            break;

        case VT_LPWSTR:
            AddLpwstrHelper( prstProp, prstIProp );
            break;

        case VT_LPSTR:
            AddLpstrHelper( prstProp, prstIProp );
            break;

        default:
            break;
        }

        return xrstIProp.Acquire();
    }

    default:
    {
        vqDebugOut(( DEB_ERROR,
                     "QParse: Invalid restriction type %d\n",
                     pRst->Type() ));
        THROW( CException( QUERY_E_INVALIDRESTRICTION ) );
        return 0;
    }
    }
} //Leaf
//+---------------------------------------------------------------------------
//
// Member: CQParse::AddLpwstrHelper, private
//
// Synopsis: Add content helpers for VT_LPWSTR properties.
//
// Arguments: [prstProp] -- Property restriction (input)
// [prstIProp] -- Internal property restriction (output)
//
// History: 03-Oct-95 KyleP Broke out as method.
//
//----------------------------------------------------------------------------
void CQParse::AddLpwstrHelper( CPropertyRestriction * prstProp,
                               CInternalPropertyRestriction * prstIProp )
{
    //
    // For equality, we create a content restriction with GenerateMethod level 0.
    // The BreakPhrase status is not checked; whatever the key repository
    // hands back (possibly nothing) becomes the content helper.
    //

    if ( prstProp->Relation() == PREQ )
    {
        CQueryKeyRepository keyRep( GENERATE_METHOD_EXACT );

        BreakPhrase ( (WCHAR *)prstProp->Value().GetLPWSTR(),
                      prstProp->GetProperty(),
                      _lcidSystemDefault,
                      GENERATE_METHOD_EXACT,
                      keyRep,
                      0,
                      _pidmap,
                      _langList );

        prstIProp->SetContentHelper( keyRep.AcqRst() );
    }

    //
    // For regular expression, create a GenerateMethod match for any fixed prefix
    // on the string.
    //

    else if ( prstProp->Relation() == PRRE )
    {
        // Explicit type: standard C++ has no implicit 'int'.
        const unsigned MAX_PREFIX_LENGTH = 50;

        // Length of the leading run that contains no regex special character.
        size_t cwcPrefix = wcscspn( prstProp->Value().GetLPWSTR(),
                                    awcSpecialRegex );

        //
        // (Open question kept from the original author: should the
        //  threshold of 0 below be a registry parameter?)
        //

        if ( cwcPrefix > 0 )
        {
            WCHAR wcsPrefix[MAX_PREFIX_LENGTH];

            // Clamp so the prefix plus terminator fits with one slot to spare.
            if ( cwcPrefix > sizeof(wcsPrefix)/sizeof(WCHAR) - 2 )
                cwcPrefix = sizeof(wcsPrefix)/sizeof(WCHAR) - 2;

            memcpy( wcsPrefix, prstProp->Value().GetLPWSTR(), cwcPrefix*sizeof(WCHAR) );
            wcsPrefix[cwcPrefix] = 0;

            //
            // Trickery: Key repository is GENERATE_METHOD_PREFIX which turns key into ranges.
            // Phrase is broken with GENERATE_METHOD_EXACTPREFIXMATCH which does 'exact'
            // prefix matching: e.g. it uses the noise word list.  The result is
            // that we match ranges, but don't set a content helper if we hit
            // a noise word.  This is different from a user GENERATE_METHOD_PREFIX
            // which uses a very minimal noise word list (only 1 character
            // prefixes are noise).
            //

            CQueryKeyRepository keyRep( GENERATE_METHOD_PREFIX );

            if ( BP_OK == BreakPhrase ( wcsPrefix,
                                        prstProp->GetProperty(),
                                        _lcidSystemDefault,
                                        GENERATE_METHOD_EXACTPREFIXMATCH,
                                        keyRep,
                                        0,
                                        _pidmap,
                                        _langList ) )
            {
                prstIProp->SetContentHelper( keyRep.AcqRst() );
            }
        }

        //
        // If this is the filename property, then add to the content helper
        // the reversed suffix string w/o wildcards.  For *.cxx we would
        // add xxc.
        //

        if ( prstProp->GetProperty() == psFilename )
        {
            WCHAR         wcsRev[MAX_PREFIX_LENGTH];
            WCHAR const * pwcName   = prstProp->Value().GetLPWSTR();
            unsigned      cwcSuffix = 0;

            //
            // Walk backwards from the last character, copying until a
            // special character is met or the buffer is full.  A counted
            // index is used so no pointer before the start of the string
            // is ever formed (including for an empty string).
            //

            for ( size_t cwcLeft = wcslen( pwcName );
                  cwcLeft > 0 && cwcSuffix < MAX_PREFIX_LENGTH - 1;
                  cwcLeft-- )
            {
                WCHAR const wc = pwcName[cwcLeft - 1];

                if ( 0 != wcschr( awcSpecialRegexReverse, wc ) )
                    break;

                wcsRev[cwcSuffix++] = wc;
            }

            // cwcSuffix is at most MAX_PREFIX_LENGTH - 1, so this is in bounds.
            wcsRev[cwcSuffix] = 0;

            if ( cwcSuffix > 0 )
            {
                CQueryKeyRepository keyRep( GENERATE_METHOD_PREFIX );

                if ( BP_OK == BreakPhrase ( wcsRev,
                                            psRevName,
                                            _lcidSystemDefault,
                                            GENERATE_METHOD_EXACTPREFIXMATCH,
                                            keyRep,
                                            0,
                                            _pidmap,
                                            _langList ) )
                {
                    if ( 0 == prstIProp->GetContentHelper() )
                    {
                        //
                        // No prefix helper was set above; the reversed
                        // suffix is the only helper.
                        //

                        prstIProp->SetContentHelper( keyRep.AcqRst() );
                    }
                    else
                    {
                        //
                        // Combine the existing helper and the reversed
                        // suffix under an AND node.  Ownership of the
                        // existing helper is released by prstIProp only
                        // after the AND node has taken it.
                        //

                        CNodeRestriction * pNodeRst = new CNodeRestriction( RTAnd, 2 );
                        XNodeRestriction rstAnd( pNodeRst );
                        Win4Assert( rstAnd->IsValid() );

                        unsigned posOrig;
                        rstAnd->AddChild( prstIProp->GetContentHelper(), posOrig );
                        prstIProp->AcquireContentHelper();

                        XRestriction xRst( keyRep.AcqRst() );
                        unsigned pos;
                        rstAnd->AddChild( xRst.GetPointer(), pos );
                        xRst.Acquire();

                        //
                        // If the suffix produced no restriction, put the
                        // original helper back and let the AND node go.
                        //

                        if ( 0 == rstAnd->GetChild( pos ) )
                        {
                            prstIProp->SetContentHelper( rstAnd->RemoveChild( posOrig ) );
                        }
                        else
                        {
                            prstIProp->SetContentHelper( rstAnd.Acquire() );
                        }
                    }
                }
            }
        }
    }
} //AddLpwstrHelper
//+---------------------------------------------------------------------------
//
// Member: CQParse::AddLpstrHelper, private
//
// Synopsis: Add content helpers for VT_LPSTR properties.
//
// Arguments: [prstProp] -- Property restriction (input)
// [prstIProp] -- Internal property restriction (output)
//
// History: 03-Oct-95 KyleP Broke out as method.
//
//----------------------------------------------------------------------------
void CQParse::AddLpstrHelper( CPropertyRestriction * prstProp,
                              CInternalPropertyRestriction * prstIProp )
{
    //
    // For equality, we create a content restriction with GenerateMethod level 0.
    // The BreakPhrase status is not checked; whatever the key repository
    // hands back (possibly nothing) becomes the content helper.
    //

    if ( prstProp->Relation() == PREQ )
    {
        CQueryKeyRepository keyRep( GENERATE_METHOD_EXACT );

        BreakPhrase ( prstProp->Value().GetLPSTR(),
                      prstProp->GetProperty(),
                      _lcidSystemDefault,
                      GENERATE_METHOD_EXACT,
                      keyRep,
                      0,
                      _pidmap,
                      _langList );

        prstIProp->SetContentHelper( keyRep.AcqRst() );
    }

    //
    // For regular expression, create a GenerateMethod match for any fixed prefix
    // on the string.
    //

    else if ( prstProp->Relation() == PRRE )
    {
        // Explicit type: standard C++ has no implicit 'int'.
        const unsigned MAX_PREFIX_LENGTH = 50;

        // Length of the leading run that contains no regex special character.
        size_t cchPrefix = strcspn( prstProp->Value().GetLPSTR(),
                                    acSpecialRegex );

        //
        // (Open question kept from the original author: should the
        //  threshold of 0 below be a registry parameter?)
        //

        if ( cchPrefix > 0 )
        {
            char ac[MAX_PREFIX_LENGTH];

            // Clamp so the prefix plus terminator fits in the buffer.
            if ( cchPrefix > sizeof(ac) - 1 )
                cchPrefix = sizeof(ac) - 1;

            memcpy( ac, prstProp->Value().GetLPSTR(), cchPrefix );
            ac[cchPrefix] = 0;

            //
            // Trickery: Key repository is GENERATE_METHOD_PREFIX which turns key into ranges.
            // Phrase is broken with GENERATE_METHOD_EXACTPREFIXMATCH which does 'exact'
            // prefix matching: e.g. it uses the noise word list.  The result is
            // that we match ranges, but don't set a content helper if we hit
            // a noise word.  This is different from a user GENERATE_METHOD_PREFIX
            // which uses a very minimal noise word list (only 1 character
            // prefixes are noise).
            //

            CQueryKeyRepository keyRep( GENERATE_METHOD_PREFIX );

            if ( BP_OK == BreakPhrase ( ac,
                                        prstProp->GetProperty(),
                                        _lcidSystemDefault,
                                        GENERATE_METHOD_EXACTPREFIXMATCH,
                                        keyRep,
                                        0,
                                        _pidmap,
                                        _langList ) )
            {
                prstIProp->SetContentHelper( keyRep.AcqRst() );
            }
        }
    }
} //AddLpstrHelper
//+---------------------------------------------------------------------------
//
// Member: CQParse::AddLpwstrVectorHelper, private
//
// Synopsis: Add content helpers for VT_LPWSTR | VT_VECTOR properties.
//
// Arguments: [prstProp] -- Property restriction (input)
// [prstIProp] -- Internal property restriction (output)
//
// History: 03-Oct-95 KyleP Created
//
//----------------------------------------------------------------------------
void CQParse::AddLpwstrVectorHelper( CPropertyRestriction * prstProp,
                                     CInternalPropertyRestriction * prstIProp )
{
    if ( prstProp->Value().Count() == 0 )
    {
        //
        // Null vector, hence no helper restriction
        //

        return;
    }

    if ( prstProp->Relation() == PREQ || prstProp->Relation() == (PREQ | PRAll) )
    {
        //
        // Every element must match: AND together one exact-match
        // restriction per element.  Elements that yield no restriction
        // (noise) are skipped and noted in _flags.
        //

        XNodeRestriction xrstAnd( new CNodeRestriction( RTAnd, prstProp->Value().Count() ) );
        Win4Assert( xrstAnd->IsValid() );

        for ( unsigned i = 0; i < prstProp->Value().Count(); i++ )
        {
            CQueryKeyRepository keyRep( GENERATE_METHOD_EXACT );

            BreakPhrase ( (WCHAR *)prstProp->Value().GetLPWSTR( i ),
                          prstProp->GetProperty(),
                          _lcidSystemDefault,
                          GENERATE_METHOD_EXACT,
                          keyRep,
                          0,
                          _pidmap,
                          _langList );

            CRestriction * prst = keyRep.AcqRst();

            if ( 0 != prst )
            {
                // Guard the restriction until the AND node has taken it.
                XPtr<CRestriction> xRst( prst );
                xrstAnd->AddChild( prst );
                xRst.Acquire();
            }
            else
            {
                _flags |= CI_NOISE_IN_PHRASE;
            }
        }

        //
        // If there aren't any nodes (because of noise words) don't set the
        // content helper, so we can fall back on enumeration.  _flags was
        // set above so it's obvious why we had to fall back on enumeration.
        // A single node is used directly rather than wrapped in an AND.
        //

        if ( xrstAnd->Count() == 1 )
            prstIProp->SetContentHelper( xrstAnd->RemoveChild( 0 ) );
        else if ( 0 != xrstAnd->Count() )
            prstIProp->SetContentHelper( xrstAnd.Acquire() );
    }
    else if ( prstProp->Relation() == (PREQ | PRAny) )
    {
        XNodeRestriction xrstOr( new CNodeRestriction( RTOr, prstProp->Value().Count() ) );

        // Same validity check as every other node allocation in this file.
        Win4Assert( xrstOr->IsValid() );

        for ( unsigned i = 0; i < prstProp->Value().Count(); i++ )
        {
            CQueryKeyRepository keyRep( GENERATE_METHOD_EXACT );

            BreakPhrase ( (WCHAR *)prstProp->Value().GetLPWSTR( i ),
                          prstProp->GetProperty(),
                          _lcidSystemDefault,
                          GENERATE_METHOD_EXACT,
                          keyRep,
                          0,
                          _pidmap,
                          _langList );

            CRestriction * prst = keyRep.AcqRst();

            if ( 0 != prst )
            {
                // Guard the restriction until the OR node has taken it.
                XPtr<CRestriction> xRst( prst );
                xrstOr->AddChild( prst );
                xRst.Acquire();
            }
            else
                break;  // If we can't match all OR clauses, then we're in trouble.
        }

        //
        // PRAny is all-or-nothing.  A missed clause is one that CI can't resolve, which
        // means there are objects that match this query that CI won't find.
        // So the helper is only set when every element produced a clause.
        //

        if ( xrstOr->Count() == prstProp->Value().Count() )
        {
            if ( xrstOr->Count() == 1 )
                prstIProp->SetContentHelper( xrstOr->RemoveChild( 0 ) );
            else
                prstIProp->SetContentHelper( xrstOr.Acquire() );
        }
        else
        {
            _flags |= CI_NOISE_IN_PHRASE;
        }
    }
} //AddLpwstrVectorHelper
//+---------------------------------------------------------------------------
//
// Function: BreakPhrase
//
// Synopsis: Break phrase into words and noun phrases
//
// Arguments: [phrase] -- string
// [ps] -- property specification
// [lcid] -- locale of the phrase
// [GenerateMethod] -- GenerateMethod flag
// [krep] -- key repository into which words will be deposited
// [pPhraseSink] -- sink for phrases
// [pidMap] -- pid mapper used to convert property to propid
// [langList] -- language list
//
// Returns: Noise word status.
//
// History: 19-Sep-1991 BartoszM Created.
// 18-Jan-1992 KyleP Use restrictions
// 12-Feb-2000 KitmanH Added hack to fix German word breaking
// issue for prefix matching queries
//
//----------------------------------------------------------------------------
BreakPhraseStatus BreakPhrase ( WCHAR const * phrase,
                                const CFullPropSpec & ps,
                                LCID lcid,
                                ULONG GenerateMethod,
                                PKeyRepository& krep,
                                IPhraseSink *pPhraseSink,
                                CPidMapper & pidMap,
                                CLangList & langList )
{
    CDataRepository drep( krep, pPhraseSink, TRUE, GenerateMethod, pidMap, langList );

    //
    // Both the language and the property must be accepted by the data
    // repository before any text is fed to it.  The property is not
    // tried if the language is refused.
    //

    if ( !drep.PutLanguage( lcid ) || !drep.PutPropName( ps ) )
        return BP_INVALID_PROPERTY;

    ciDebugOut (( DEB_ITRACE,
                  "BreakPhrase: phrase = \"%ws\" Propid = %lu\n",
                  phrase, drep.GetPropId() ));

    // The count passed along includes the terminating null.
    drep.PutPhrase( phrase, wcslen(phrase) + 1 );

    krep.FixUp( drep );

    return drep.ContainedNoiseWords() ? BP_NOISE : BP_OK;
} //BreakPhrase
//
// DBCS version of the previous function.
//
BreakPhraseStatus BreakPhrase ( char const * phrase,
                                const CFullPropSpec & ps,
                                LCID lcid,
                                ULONG GenerateMethod,
                                PKeyRepository& krep,
                                IPhraseSink *pPhraseSink,
                                CPidMapper & pidMap,
                                CLangList & langList )
{
    CDataRepository drep( krep, pPhraseSink, TRUE, GenerateMethod, pidMap, langList );

    //
    // Both the language and the property must be accepted by the data
    // repository before any text is fed to it.  The property is not
    // tried if the language is refused.
    //

    if ( !drep.PutLanguage( lcid ) || !drep.PutPropName( ps ) )
        return BP_INVALID_PROPERTY;

    ciDebugOut (( DEB_ITRACE,
                  "BreakPhrase: phrase = \"%s\" Propid = %lu\n",
                  phrase, drep.GetPropId() ));

    // The count passed along includes the terminating null.
    drep.PutPhrase( phrase, strlen(phrase) + 1 );

    krep.FixUp( drep );

    return drep.ContainedNoiseWords() ? BP_NOISE : BP_OK;
} //BreakPhrase