//+---------------------------------------------------------------------------
//
//  Copyright (C) 1994-1998, Microsoft Corporation.
//
//  File:       SCANNER.CXX
//
//  Contents:   Implementation of CQueryScanner
//
//  History:    22-May-92   AmyA        Created.
//              23-Jun-92   MikeHew     Added weight token recognition.
//              17-May-94   t-jeffc     Added error info and reg ex support.
//
//----------------------------------------------------------------------------
|
|
#include <pch.cxx>
|
|
#pragma hdrstop
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::CQueryScanner, public
|
|
//
|
|
// Synopsis: Create a scanner from a string.
|
|
//
|
|
// Arguments: [buffer] -- the string to be scanned.
|
|
// [fLookForTextualKeywords] -- TRUE if the scanner should
|
|
// look for "and/or/not/near" in
|
|
// text form.
|
|
// [lcid] -- language for and/or/not/near detection
|
|
// [fTreatPlusAsToken] -- TRUE if the scanner should treat the
|
|
// '+' character as a token (used
|
|
// in GroupBy parsing)
|
|
//
|
|
// Notes: This string is not copied, so the scanner does not own it.
|
|
// If the string is changed outside of the scanner, it will
|
|
// affect the information that is returned.
|
|
//
|
|
// History: 30-Apr-92 AmyA Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CQueryScanner::CQueryScanner(
|
|
WCHAR const * buffer,
|
|
BOOL fLookForTextualKeywords,
|
|
LCID lcid,
|
|
BOOL fTreatPlusAsToken )
|
|
: _text( buffer ),
|
|
_pBuf( buffer ),
|
|
_pLookAhead( buffer ),
|
|
_fLookForTextualKeywords( fLookForTextualKeywords ),
|
|
_fTreatPlusAsToken( fTreatPlusAsToken ),
|
|
_lcid( lcid )
|
|
{
|
|
Accept();
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcceptWord, public
|
|
//
|
|
// Synopsis: Consumes a single word out of a phrase
|
|
//
|
|
// Requires: Should be called after AcqWord
|
|
//
|
|
// History: 15-Sep-92 BartoszM Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::AcceptWord()
|
|
{
|
|
_pLookAhead = _text;
|
|
Accept();
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcceptColumn, public
|
|
//
|
|
// Synopsis: Consumes a column name out of a phrase
|
|
//
|
|
// Requires: Should be called after AcqColumn
|
|
//
|
|
// History: 15-Sep-92 BartoszM Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::AcceptColumn()
|
|
{
|
|
AcceptWord();
|
|
}
|
|
|
|
|
|
struct SStringToken
|
|
{
|
|
WCHAR * pwcToken;
|
|
unsigned cwc;
|
|
Token token;
|
|
};
|
|
|
|
static SStringToken s_EnglishStringTokens[] =
|
|
{
|
|
{ L"AND", (sizeof L"AND" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"OR", (sizeof L"OR" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"NOT", (sizeof L"NOT" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"NEAR", (sizeof L"NEAR" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
static SStringToken s_GermanStringTokens[] =
|
|
{
|
|
{ L"UND", (sizeof L"UND" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"ODER", (sizeof L"ODER" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"NICHT", (sizeof L"NICHT" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"NAH", (sizeof L"NAH" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
static SStringToken s_FrenchStringTokens[] =
|
|
{
|
|
{ L"ET", (sizeof L"ET" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"OU", (sizeof L"OU" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"SANS", (sizeof L"SANS" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"PRES", (sizeof L"PRES" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
static SStringToken s_SpanishStringTokens[] =
|
|
{
|
|
{ L"Y", (sizeof L"Y" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"O", (sizeof L"O" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"NO", (sizeof L"NO" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"CERCA", (sizeof L"CERCA" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
static SStringToken s_DutchStringTokens[] =
|
|
{
|
|
{ L"EN", (sizeof L"EN" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"OF", (sizeof L"OF" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"NIET", (sizeof L"NIET" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"NABIJ", (sizeof L"NABIJ" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
static WCHAR aSwedishNear[] = { L'N', 0xc4, L'R', L'A', 0 };
|
|
|
|
static SStringToken s_SwedishStringTokens[] =
|
|
{
|
|
{ L"OCH", (sizeof L"OCH" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"ELLER", (sizeof L"ELLER" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"INTE", (sizeof L"INTE" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ aSwedishNear, 4, PROX_TOKEN },
|
|
};
|
|
|
|
static SStringToken s_ItalianStringTokens[] =
|
|
{
|
|
{ L"E", (sizeof L"E" / sizeof WCHAR) - 1, AND_TOKEN },
|
|
{ L"O", (sizeof L"O" / sizeof WCHAR) - 1, OR_TOKEN },
|
|
{ L"NO", (sizeof L"NO" / sizeof WCHAR) - 1, NOT_TOKEN },
|
|
{ L"VICINO", (sizeof L"VICINO" / sizeof WCHAR) - 1, PROX_TOKEN },
|
|
};
|
|
|
|
const unsigned cStringTokens = sizeof(s_EnglishStringTokens) /
|
|
sizeof(s_EnglishStringTokens[0]);
|
|
|
|
#define WORD_STR L"{}!&|~*@#()[],\t=<>\n\"^ "
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: InternalFindStringToken
|
|
//
|
|
// Synopsis: Looks for a textual token in plain text.
|
|
//
|
|
// Arguments: [pwcIn] -- string to search
|
|
// [token] -- returns the token found
|
|
// [cwc] -- returns length of token found
|
|
// [pTokens] -- token array to use
|
|
//
|
|
// Returns: Pointer to token or 0 if none was found
|
|
//
|
|
// History: 08-Feb-96 dlee created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * InternalFindStringToken(
|
|
WCHAR * pwcIn,
|
|
Token & token,
|
|
unsigned & cwc,
|
|
SStringToken * pTokens )
|
|
{
|
|
// for each of and/or/not/near
|
|
|
|
WCHAR *pwcOut = 0;
|
|
|
|
for ( unsigned i = 0; i < cStringTokens; i++ )
|
|
{
|
|
WCHAR *pwcStr = wcsstr( pwcIn, pTokens[i].pwcToken );
|
|
|
|
while ( pwcStr )
|
|
{
|
|
// found a match -- does it have white space on either side?
|
|
|
|
WCHAR wcBeyond = * (pwcStr + pTokens[i].cwc);
|
|
if ( ( ( 0 == wcBeyond ) ||
|
|
( wcschr( WORD_STR, wcBeyond ) ) ) &&
|
|
( ( pwcStr == pwcIn ) ||
|
|
( iswspace( * ( pwcStr - 1 ) ) ) ) )
|
|
{
|
|
// if the first match found or the match closest to the
|
|
// beginning of the string, use it.
|
|
|
|
if ( ( 0 == pwcOut ) ||
|
|
( pwcStr < pwcOut ) )
|
|
{
|
|
pwcOut = pwcStr;
|
|
token = pTokens[i].token;
|
|
cwc = pTokens[i].cwc;
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
pwcStr = wcsstr( pwcStr + 1, pTokens[i].pwcToken );
|
|
}
|
|
}
|
|
|
|
return pwcOut;
|
|
} //InternalFindStringToken
|
|
|
|
//
// Returns the keyword table matching the primary language of [lcid].
// German, French, Spanish, Dutch, Swedish and Italian have their own
// tables; every other language (neutral and English included) gets the
// English table.
//
SStringToken * GetStringTokenArray(
    LCID lcid )
{
    SStringToken * pTable = 0;

    switch ( PRIMARYLANGID( LANGIDFROMLCID( lcid ) ) )
    {
        case LANG_GERMAN :   pTable = s_GermanStringTokens;   break;
        case LANG_FRENCH :   pTable = s_FrenchStringTokens;   break;
        case LANG_SPANISH :  pTable = s_SpanishStringTokens;  break;
        case LANG_DUTCH :    pTable = s_DutchStringTokens;    break;
        case LANG_SWEDISH :  pTable = s_SwedishStringTokens;  break;
        case LANG_ITALIAN :  pTable = s_ItalianStringTokens;  break;

        case LANG_NEUTRAL :
        case LANG_ENGLISH :
        default :
            pTable = s_EnglishStringTokens;
            break;
    }

    Win4Assert( 0 != pTable );

    return pTable;
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: FindStringToken
|
|
//
|
|
// Synopsis: Looks for a textual token in plain text. Always tries
|
|
// English, tries a different language depending on _lcid.
|
|
//
|
|
// Arguments: [pwcIn] -- string to search
|
|
// [token] -- returns the token found
|
|
// [cwc] -- returns length of token found
|
|
//
|
|
// Returns: Pointer to token or 0 if none was found
|
|
//
|
|
// History: 08-Feb-96 dlee created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::FindStringToken(
|
|
WCHAR * pwcIn,
|
|
Token & token,
|
|
unsigned & cwc )
|
|
{
|
|
SStringToken * pTokens = GetStringTokenArray( _lcid );
|
|
|
|
WCHAR * pwcToken = InternalFindStringToken( pwcIn, token, cwc, pTokens );
|
|
|
|
// if the search above wasn't in English, try English too.
|
|
|
|
if ( pTokens != s_EnglishStringTokens )
|
|
{
|
|
unsigned cwcEnglish;
|
|
Token tokenEnglish;
|
|
WCHAR * pwcEnglish = InternalFindStringToken( pwcIn,
|
|
tokenEnglish,
|
|
cwcEnglish,
|
|
s_EnglishStringTokens );
|
|
|
|
// If there is no language-specific match or the English match
|
|
// occurs before the language-specific match, use the English
|
|
// match.
|
|
|
|
if ( ( 0 != pwcEnglish ) &&
|
|
( ( 0 == pwcToken ) || ( pwcEnglish < pwcToken ) ) )
|
|
{
|
|
pwcToken = pwcEnglish;
|
|
token = tokenEnglish;
|
|
cwc = cwcEnglish;
|
|
}
|
|
}
|
|
|
|
return pwcToken;
|
|
} //FindStringToken
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::Accept, public
|
|
//
|
|
// Synopsis: Determines what the next token is. Will advance _pLookAhead
|
|
// over the next token and white space.
|
|
//
|
|
// Notes: There are five different types of TEXT_TOKENS, Phrase, Path,
|
|
// Number, Column and Command. Since the length of the token
|
|
// depends on which token it is, _pLookAhead is forwarded to the
|
|
// end of the longest, and _text is used to parse the token in the
|
|
// various Acq and Get methods.
|
|
//
|
|
// History: 30-Apr-92 AmyA Created
|
|
// 19-May-92 AmyA Added Guid hack
|
|
// 23-Jun-92 MikeHew Added weight token recognition.
|
|
// 26-May-94 t-jeffc Added more tokens; rearranged to
|
|
// support parsing errors
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::Accept()
|
|
{
|
|
EatWhiteSpace();
|
|
|
|
_text = _pLookAhead;
|
|
|
|
switch ( *_pLookAhead )
|
|
{
|
|
case '&':
|
|
_pLookAhead++;
|
|
_token = AND_TOKEN;
|
|
break;
|
|
|
|
case '*':
|
|
_pLookAhead++;
|
|
|
|
if ( *_pLookAhead == '*' )
|
|
{
|
|
_token = FUZ2_TOKEN;
|
|
_pLookAhead++;
|
|
}
|
|
else
|
|
_token = FUZZY_TOKEN;
|
|
break;
|
|
|
|
case '=':
|
|
_pLookAhead++;
|
|
_token = EQUAL_TOKEN;
|
|
break;
|
|
|
|
case '<':
|
|
_pLookAhead++;
|
|
if ( *_pLookAhead == '=' )
|
|
{
|
|
_token = LESS_EQUAL_TOKEN;
|
|
_pLookAhead++;
|
|
}
|
|
else
|
|
_token = LESS_TOKEN;
|
|
break;
|
|
|
|
case '>':
|
|
_pLookAhead++;
|
|
if ( *_pLookAhead == '=' )
|
|
{
|
|
_token = GREATER_EQUAL_TOKEN;
|
|
_pLookAhead++;
|
|
}
|
|
else
|
|
_token = GREATER_TOKEN;
|
|
break;
|
|
|
|
case '!':
|
|
_pLookAhead++;
|
|
if ( *_pLookAhead == '=' )
|
|
{
|
|
_token = NOT_EQUAL_TOKEN;
|
|
_pLookAhead++;
|
|
}
|
|
else
|
|
{
|
|
_token = NOT_TOKEN;
|
|
}
|
|
break;
|
|
|
|
case '|':
|
|
_pLookAhead++;
|
|
_token = OR_TOKEN;
|
|
break;
|
|
|
|
case '~':
|
|
_pLookAhead++;
|
|
_token = PROX_TOKEN;
|
|
break;
|
|
|
|
case '@':
|
|
_pLookAhead++;
|
|
_token = PROP_TOKEN;
|
|
break;
|
|
|
|
case '#':
|
|
_pLookAhead++;
|
|
_token = PROP_REGEX_TOKEN;
|
|
break;
|
|
|
|
case '(':
|
|
_pLookAhead++;
|
|
_token = OPEN_TOKEN;
|
|
break;
|
|
|
|
case ')':
|
|
_pLookAhead++;
|
|
_token = CLOSE_TOKEN;
|
|
break;
|
|
|
|
case '[':
|
|
_pLookAhead++;
|
|
_token = W_OPEN_TOKEN;
|
|
break;
|
|
|
|
case ']':
|
|
_pLookAhead++;
|
|
_token = W_CLOSE_TOKEN;
|
|
break;
|
|
|
|
case ',':
|
|
_pLookAhead++;
|
|
_token = COMMA_TOKEN;
|
|
break;
|
|
|
|
case '\0':
|
|
case 0x1A: // CTRL-Z
|
|
_token = EOS_TOKEN;
|
|
break;
|
|
|
|
case '"':
|
|
_pLookAhead++;
|
|
_token = QUOTES_TOKEN;
|
|
break;
|
|
|
|
case '$':
|
|
_pLookAhead++;
|
|
_token = PROP_NATLANG_TOKEN;
|
|
break;
|
|
|
|
case '{':
|
|
_pLookAhead++;
|
|
_token = C_OPEN_TOKEN;
|
|
break;
|
|
|
|
case '}':
|
|
_pLookAhead++;
|
|
_token = C_CLOSE_TOKEN;
|
|
break;
|
|
|
|
case '^':
|
|
{
|
|
WCHAR wc = *(_pLookAhead + 1);
|
|
|
|
BOOL fOk = TRUE;
|
|
|
|
if (L'a' == wc) // all bits
|
|
_token = ALLOF_TOKEN;
|
|
else if (L's' == wc) // some bits
|
|
_token = SOMEOF_TOKEN;
|
|
else
|
|
fOk = FALSE;
|
|
|
|
if (fOk)
|
|
{
|
|
_pLookAhead += 2;
|
|
break;
|
|
}
|
|
}
|
|
// FALL THROUGH
|
|
|
|
case '+':
|
|
if (*_pLookAhead == L'+' && _fTreatPlusAsToken)
|
|
{
|
|
_pLookAhead++;
|
|
_token = PLUS_TOKEN;
|
|
break;
|
|
}
|
|
// FALL THROUGH
|
|
|
|
default:
|
|
{
|
|
// forwards pwcEnd over anything that could be in a phrase,
|
|
// which is the most inclusive of the TEXT_TOKENs.
|
|
// (except, for regex's and phrases in quotes - but they're
|
|
// handled separately)
|
|
|
|
WCHAR const *pwcEnd = _text + wcscspn( _text, PHRASE_STR );
|
|
|
|
if ( _fLookForTextualKeywords )
|
|
{
|
|
unsigned cwc = (unsigned) ( pwcEnd - _text );
|
|
cwc = __min( cwc, MAX_PATH * 2 );
|
|
|
|
// if a textual keyword is beyond 500 chars in the string,
|
|
// blow it off -- the workaround is to use the '&|~' version.
|
|
|
|
WCHAR awcBuf[ 1 + MAX_PATH * 2 ];
|
|
RtlCopyMemory( awcBuf, _text, cwc * sizeof WCHAR );
|
|
awcBuf[ cwc ] = 0;
|
|
|
|
ULONG cwcOut = LCMapString( _lcid,
|
|
LCMAP_UPPERCASE,
|
|
awcBuf,
|
|
cwc,
|
|
awcBuf,
|
|
cwc );
|
|
if ( cwcOut != cwc )
|
|
THROW( CException() );
|
|
|
|
Token token;
|
|
unsigned cwcToken;
|
|
WCHAR *pwcTok = FindStringToken( awcBuf, token, cwcToken );
|
|
|
|
if ( 0 != pwcTok )
|
|
{
|
|
// a textual token exists in the string
|
|
|
|
if ( pwcTok == awcBuf )
|
|
{
|
|
// textual token at the start of the string
|
|
|
|
_token = token;
|
|
_pLookAhead = _text + cwcToken;
|
|
}
|
|
else
|
|
{
|
|
// textual token in the middle of the string, stop the
|
|
// current token at that point and get it next time
|
|
// Accept() is called.
|
|
|
|
_pLookAhead = _text + ( pwcTok - awcBuf );
|
|
_token = TEXT_TOKEN;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_pLookAhead = pwcEnd;
|
|
_token = TEXT_TOKEN;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
_pLookAhead = pwcEnd;
|
|
_token = TEXT_TOKEN;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AllocReturnString, private inline
|
|
//
|
|
// Synopsis: Copies all of the relevant characters of the string that
|
|
// _text is pointing to and returns the new string.
|
|
//
|
|
// History: 17 Apr 97 AlanW Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
inline WCHAR * CQueryScanner::AllocReturnString( int cch )
|
|
{
|
|
WCHAR * newBuf = new WCHAR [ cch + 1 ];
|
|
RtlCopyMemory ( newBuf, _text, cch * sizeof(WCHAR));
|
|
newBuf[cch] = L'\0';
|
|
|
|
_text += cch;
|
|
while ( iswspace(*_text) )
|
|
_text++;
|
|
|
|
return newBuf;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqPath, public
|
|
//
|
|
// Synopsis: Copies all of the relevant characters of the string that
|
|
// _text is pointing to and returns the new string. Will
|
|
// return 0 if _text is at end of whole TEXT_TOKEN.
|
|
//
|
|
// Notes: Since the string is copied, the caller of this function is
|
|
// responsible for freeing the memory occupied by the string.
|
|
// This method can be called several times before calling
|
|
// Accept(), so many paths can be acquired if they exist in the
|
|
// scanner.
|
|
//
|
|
// History: 30-Apr-92 AmyA Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqPath()
|
|
{
|
|
if ( IsEndOfTextToken() )
|
|
return 0;
|
|
|
|
// how many characters follow _text that are not in CMND_STR?
|
|
|
|
int count = wcscspn( _text, CMND_STR );
|
|
|
|
return AllocReturnString( count );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqWord, public
|
|
//
|
|
// Synopsis: Copies the word that _text is pointing to and returns the
|
|
// new string. Positions _text after the word and whitespace.
|
|
// Returns 0 if at the end of a TEXT_TOKEN.
|
|
//
|
|
// History: 29-Jun-92 MikeHew Created.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqWord()
|
|
{
|
|
if ( IsEndOfTextToken() )
|
|
return 0;
|
|
|
|
WCHAR const * pEnd = _text;
|
|
|
|
while ( !iswspace(*pEnd) && pEnd < _pLookAhead )
|
|
pEnd++;
|
|
|
|
unsigned count = CiPtrToUint( pEnd - _text );
|
|
|
|
return AllocReturnString( count );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqColumn, public
|
|
//
|
|
// Synopsis: Copies a column name and returns the new string. A column
|
|
// name is either a single word, or a quoted string.
|
|
// Positions _text after the word and whitespace.
|
|
//
|
|
// Returns: WCHAR* pointer to column name. 0 if no column name found.
|
|
//
|
|
// History: 17 Apr 97 AlanW Created.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqColumn()
|
|
{
|
|
if ( QUOTES_TOKEN == _token)
|
|
{
|
|
Accept();
|
|
WCHAR * pwszOut = AcqPhraseInQuotes();
|
|
_text = _pLookAhead;
|
|
return pwszOut;
|
|
}
|
|
|
|
if ( IsEndOfTextToken() )
|
|
return 0;
|
|
|
|
int count = wcscspn( _text, COLUMN_STR );
|
|
return AllocReturnString( count );
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqPhrase, public
|
|
//
|
|
// Synopsis: Copies all of the relevant characters of the string that
|
|
// _text is pointing to and returns the new string.
|
|
// Returns 0 if at the end of a text token.
|
|
//
|
|
// Notes: Since the string is copied, the caller of this function is
|
|
// responsible for freeing the memory occupied by the string.
|
|
// The difference between this function and AcqPath is that this
|
|
// should only be called once before calling Accept().
|
|
//
|
|
// History: 30-Apr-92 AmyA Created
|
|
// 09-May-96 DwightKr Strip trailing white space
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqPhrase()
|
|
{
|
|
if( IsEndOfTextToken() )
|
|
return 0;
|
|
|
|
//
|
|
// Strip trailing white-space from the end of the phrase. _pLookAhead
|
|
// points to the first character of the NEXT phrase.
|
|
//
|
|
WCHAR const * pEnd = _pLookAhead - 1;
|
|
while ( (pEnd > _text) && iswspace(*pEnd) )
|
|
{
|
|
pEnd--;
|
|
}
|
|
|
|
unsigned count = CiPtrToUint( pEnd - _text ) + 1;
|
|
|
|
WCHAR * newBuf = new WCHAR [ count + 1 ];
|
|
RtlCopyMemory( newBuf, _text, count * sizeof( WCHAR ) );
|
|
newBuf[count] = 0;
|
|
|
|
return newBuf;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqRegEx, public
|
|
//
|
|
// Synopsis: Copies all of the relevant characters of the string that
|
|
// _text is pointing to and returns the new string. Matches
|
|
// the longest string possible - the only restriction is that
|
|
// the regex can not contain any of the characters in REGEX_STR
|
|
// outside of <> braces (which may be nested).
|
|
// Returns 0 if the regex is empty.
|
|
//
|
|
// Notes: Since the string is copied, the caller of this function is
|
|
// responsible for freeing the memory occupied by the string.
|
|
// Because some regex characters are duplicated in the query
|
|
// language, _pLookAhead is ignored (and actually reset) in
|
|
// this operation. Like AcqPhrase(), this should be called only
|
|
// once before Accept().
|
|
//
|
|
// History: 10-May-94 t-jeffc Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqRegEx()
|
|
{
|
|
WCHAR const * pEnd = _text;
|
|
BOOL fDone = FALSE;
|
|
BOOL fQuoted = FALSE;
|
|
|
|
if ( *pEnd == L'"' )
|
|
{
|
|
fQuoted = TRUE;
|
|
pEnd++;
|
|
}
|
|
|
|
// scan the string - stop at \0 or if any REGEX_STR characters are
|
|
// found outside of braces
|
|
//
|
|
for( ;; )
|
|
{
|
|
switch( *pEnd )
|
|
{
|
|
case '\0':
|
|
if ( fQuoted )
|
|
THROW( CException( QPARSE_E_UNEXPECTED_EOS ) );
|
|
|
|
fDone = TRUE;
|
|
break;
|
|
|
|
case ' ':
|
|
if ( !fQuoted )
|
|
fDone = TRUE;
|
|
break;
|
|
|
|
case ')':
|
|
if ( !fQuoted )
|
|
{
|
|
if ( ( pEnd != _text ) &&
|
|
( '|' != (*(pEnd-1)) ) )
|
|
fDone = TRUE;
|
|
}
|
|
break;
|
|
|
|
case '"':
|
|
if ( fQuoted )
|
|
{
|
|
pEnd++;
|
|
fDone = TRUE;
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
|
|
} // switch( *pEnd )
|
|
|
|
if( fDone ) break;
|
|
|
|
pEnd++;
|
|
}
|
|
|
|
if( _text == pEnd )
|
|
return 0;
|
|
|
|
// set _pLookAhead
|
|
_pLookAhead = pEnd;
|
|
|
|
// copy the string
|
|
unsigned count = CiPtrToUint( _pLookAhead - _text );
|
|
|
|
if ( fQuoted )
|
|
{
|
|
Win4Assert( count >= 2 );
|
|
count -= 2;
|
|
}
|
|
|
|
WCHAR * newBuf = new WCHAR[ count + 1 ];
|
|
|
|
RtlCopyMemory( newBuf, _text + (fQuoted ? 1 : 0), count * sizeof( WCHAR ) );
|
|
newBuf[ count ] = 0;
|
|
|
|
return newBuf;
|
|
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqPhraseInQuotes, public
|
|
//
|
|
// Synopsis: Copies all characters until a matching " is found, or until
|
|
// the end of string. Embedded quotes are escaped with a quote:
|
|
// "Bill ""the man"" Gates"
|
|
//
|
|
// Notes: Since the string is copied, the caller of this function is
|
|
// responsible for freeing the memory occupied by the string.
|
|
//
|
|
// History: 18-Jan-95 SitaramR Created
|
|
// 3-Jul-96 dlee added embedded quotes
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqPhraseInQuotes()
|
|
{
|
|
WCHAR const * pEnd = _text;
|
|
|
|
do
|
|
{
|
|
if ( 0 == *pEnd )
|
|
break;
|
|
|
|
if ( L'"' == *pEnd )
|
|
{
|
|
if ( L'"' == *(pEnd+1) )
|
|
pEnd++;
|
|
else
|
|
break;
|
|
}
|
|
|
|
pEnd++;
|
|
} while ( TRUE );
|
|
|
|
unsigned count = CiPtrToUint( pEnd - _text );
|
|
|
|
WCHAR * newBuf = new WCHAR [ count + 1 ];
|
|
WCHAR * pwcNewBuf = newBuf;
|
|
WCHAR const * pStart = _text;
|
|
|
|
// copy the string, but remove the extra quote characters
|
|
|
|
while ( pStart < pEnd )
|
|
{
|
|
*pwcNewBuf++ = *pStart++;
|
|
if ( L'"' == *pStart )
|
|
pStart++;
|
|
}
|
|
|
|
*pwcNewBuf = 0;
|
|
|
|
if ( *pEnd == L'"' )
|
|
_pLookAhead = pEnd + 1;
|
|
else
|
|
_pLookAhead = pEnd;
|
|
|
|
return newBuf;
|
|
}
|
|
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetNumber, public
|
|
//
|
|
// Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
|
|
// If not, puts the ULONG from the scanner into number and
|
|
// returns TRUE.
|
|
//
|
|
// Arguments: [number] -- the ULONG which will be changed and passed back
|
|
// out as the ULONG from the scanner.
|
|
// [fAtEnd] -- returns TRUE if at the end of the scanned string
|
|
//
|
|
// Notes: May be called several times in a loop before Accept() is
|
|
// called.
|
|
//
|
|
// History: 11-May-92 AmyA Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
BOOL CQueryScanner::GetNumber( ULONG & number, BOOL & fAtEnd )
|
|
{
|
|
if ( IsEndOfTextToken() || !iswdigit(*_text) || (*_text == L'-') )
|
|
return FALSE;
|
|
|
|
// is this a hex number?
|
|
|
|
ULONG base = 10;
|
|
|
|
if (_text[0] == L'0' && (_text[1] == L'x' || _text[1] == L'X'))
|
|
{
|
|
_text += 2;
|
|
base = 16;
|
|
}
|
|
|
|
const WCHAR * pwcStart = _text;
|
|
|
|
number = wcstoul( _text, (WCHAR **)(&_text), base );
|
|
|
|
// looks like a real number?
|
|
|
|
if ( ( pwcStart == _text ) ||
|
|
( L'.' == *_text ) )
|
|
return FALSE;
|
|
|
|
while ( iswspace(*_text) )
|
|
_text++;
|
|
|
|
fAtEnd = ( 0 == *_text );
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetNumber, public
|
|
//
|
|
// Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
|
|
// If not, puts the LONG from the scanner into number and
|
|
// returns TRUE.
|
|
//
|
|
// Arguments: [number] -- the LONG which will be changed and passed back
|
|
// out as the LONG from the scanner.
|
|
// [fAtEnd] -- returns TRUE if at the end of the scanned string
|
|
//
|
|
// Notes: May be called several times in a loop before Accept() is
|
|
// called.
|
|
//
|
|
// History: 96-Jan-15 DwightKr Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
BOOL CQueryScanner::GetNumber( LONG & number, BOOL & fAtEnd )
|
|
{
|
|
WCHAR *text = (WCHAR *) _text;
|
|
|
|
BOOL IsNegative = FALSE;
|
|
|
|
ULONG ulMax = (ULONG) LONG_MAX;
|
|
|
|
if ( L'-' == _text[0] )
|
|
{
|
|
IsNegative = TRUE;
|
|
|
|
ulMax++; // can represent 1 more negative than positive.
|
|
|
|
_text++;
|
|
}
|
|
|
|
ULONG ulNumber;
|
|
if ( !GetNumber( ulNumber, fAtEnd ) )
|
|
{
|
|
_text = text;
|
|
return FALSE;
|
|
}
|
|
|
|
// Signed number overflow/underflow
|
|
|
|
if ( ulNumber > ulMax )
|
|
{
|
|
_text = text;
|
|
return FALSE;
|
|
}
|
|
|
|
if ( IsNegative )
|
|
{
|
|
if ( ulMax == ulNumber )
|
|
number = LONG_MIN;
|
|
else
|
|
number = - (LONG) ulNumber;
|
|
}
|
|
else
|
|
{
|
|
number = (LONG) ulNumber;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetNumber, public
|
|
//
|
|
// Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
|
|
// If not, puts the ULONG from the scanner into number and
|
|
// returns TRUE.
|
|
//
|
|
// Arguments: [number] -- the ULONG which will be changed and passed back
|
|
// out as the ULONG from the scanner.
|
|
// [fAtEnd] -- returns TRUE if at the end of the scanned string
|
|
//
|
|
// Notes: May be called several times in a loop before Accept() is
|
|
// called.
|
|
//
|
|
// History: 27-Feb-96 dlee Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
BOOL CQueryScanner::GetNumber( unsigned _int64 & number, BOOL & fAtEnd )
|
|
{
|
|
if ( IsEndOfTextToken() || !iswdigit(*_text) || (*_text == L'-') )
|
|
return FALSE;
|
|
|
|
// is this a hex number?
|
|
|
|
ULONG base = 10;
|
|
|
|
if (_text[0] == L'0' && (_text[1] == L'x' || _text[1] == L'X'))
|
|
{
|
|
_text += 2;
|
|
base = 16;
|
|
}
|
|
|
|
const WCHAR * pwcStart = _text;
|
|
|
|
number = _wcstoui64( _text, (WCHAR **)(&_text), base );
|
|
|
|
// looks like a real number?
|
|
|
|
if ( ( pwcStart == _text ) ||
|
|
( L'.' == *_text ) )
|
|
return FALSE;
|
|
|
|
while ( iswspace(*_text) )
|
|
_text++;
|
|
|
|
fAtEnd = ( 0 == *_text );
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetNumber, public
|
|
//
|
|
// Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
|
|
// If not, puts the LONG from the scanner into number and
|
|
// returns TRUE.
|
|
//
|
|
// Arguments: [number] -- the LONG which will be changed and passed back
|
|
// out as the LONG from the scanner.
|
|
// [fAtEnd] -- returns TRUE if at the end of the scanned string
|
|
//
|
|
// Notes: May be called several times in a loop before Accept() is
|
|
// called.
|
|
//
|
|
// History: 27-Feb-96 dlee Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
BOOL CQueryScanner::GetNumber( _int64 & number, BOOL & fAtEnd )
|
|
{
|
|
WCHAR *text = (WCHAR *) _text;
|
|
|
|
BOOL IsNegative = FALSE;
|
|
|
|
unsigned _int64 ullMax = (unsigned _int64) _I64_MAX;
|
|
|
|
if ( L'-' == _text[0] )
|
|
{
|
|
IsNegative = TRUE;
|
|
|
|
ullMax++; // can represent 1 more negative than positive.
|
|
|
|
_text++;
|
|
}
|
|
|
|
unsigned _int64 ullNumber;
|
|
if ( !GetNumber( ullNumber, fAtEnd ) )
|
|
{
|
|
_text = text;
|
|
return FALSE;
|
|
}
|
|
|
|
// Signed number overflow/underflow
|
|
|
|
if ( ullNumber > ullMax )
|
|
{
|
|
_text = text;
|
|
return FALSE;
|
|
}
|
|
|
|
if ( IsNegative )
|
|
{
|
|
if ( ullMax == ullNumber )
|
|
number = _I64_MIN;
|
|
else
|
|
number = -((_int64) ullNumber);
|
|
}
|
|
else
|
|
{
|
|
number = (_int64) ullNumber;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetNumber, public
|
|
//
|
|
// Synopsis: If _text is at the end of the TEXT_TOKEN, returns FALSE.
|
|
// If not, puts the LONG from the scanner into number and
|
|
// returns TRUE.
|
|
//
|
|
// Arguments: [number] -- the double which will be changed and passed back
|
|
// out as the double from the scanner.
|
|
//
|
|
// Notes: May be called several times in a loop before Accept() is
|
|
// called.
|
|
//
|
|
// History: 96-Jan-15 DwightKr Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
BOOL CQueryScanner::GetNumber( double & number )
|
|
{
|
|
if ( IsEndOfTextToken() || !iswdigit(*_text) )
|
|
return FALSE;
|
|
|
|
if ( swscanf( _text, L"%lf", &number ) != 1 )
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
while ( iswspace(*_text) != 0 )
|
|
_text++;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::GetCommandChar, public
|
|
//
|
|
// Synopsis: Returns the command character pointed to by _text and advances
|
|
// _text. If the command can't be uniquely determined by the
|
|
// first character, each subsequent call will return the next
|
|
// character in the word. After the command has been determined,
|
|
// AcceptCommand() should be called and then operand parsing may begin.
|
|
//
|
|
// History: 14-May-92 AmyA Created
|
|
// 16-May-94 t-jeffc Returns one character at a time to
|
|
// support more commands
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR CQueryScanner::GetCommandChar()
|
|
{
|
|
if( IsEndOfTextToken() )
|
|
return 0;
|
|
|
|
WCHAR chCommand = _text[0];
|
|
|
|
_text++;
|
|
|
|
return towlower( chCommand );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcceptCommand, public
|
|
//
|
|
// Synopsis: Advances _text past any characters in the command.
|
|
// Used when enough command characters have been
|
|
// read to uniquely determine the command and begin parsing
|
|
// the operands.
|
|
//
|
|
// History: 16-May-94 t-jeffc Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::AcceptCommand()
|
|
{
|
|
int cChars = wcscspn( _text, CMND_STR ); // how many characters follow
|
|
// _text that are not in CMND_STR
|
|
|
|
_text += cChars;
|
|
|
|
_pLookAhead = _text;
|
|
|
|
Accept();
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::ResetBuffer, public
|
|
//
|
|
// Synopsis: Puts a new string into _pBuf and resets _pLookAhead
|
|
// accordingly.
|
|
//
|
|
// Arguments: [buffer] -- the new string for _pBuf
|
|
//
|
|
// History: 05-May-92 AmyA Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::ResetBuffer( WCHAR const * buffer )
|
|
{
|
|
_pBuf = buffer;
|
|
_pLookAhead = _pBuf;
|
|
Accept();
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::EatWhiteSpace, private
|
|
//
|
|
// Synopsis: Advances _pLookAhead past any white space in the string.
|
|
//
|
|
// History: 29-Apr-92 AmyA Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
void CQueryScanner::EatWhiteSpace()
|
|
{
|
|
while ( iswspace(*_pLookAhead) != 0 )
|
|
_pLookAhead++;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::IsEndOfTextToken, private
|
|
//
|
|
// Synopsis: Returns TRUE if the current token is not a TEXT_TOKEN or
|
|
// if the string starting at _text to _pLookAhead contains
|
|
// nothing but whitespace.
|
|
//
|
|
// History: 27-May-94 t-jeffc Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
BOOL CQueryScanner::IsEndOfTextToken()
|
|
{
|
|
if( _token == TEXT_TOKEN && _text < _pLookAhead )
|
|
return FALSE;
|
|
else
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CQueryScanner::AcqLine, public
|
|
//
|
|
// Synopsis: Copies all of the remaining characters on the line;
|
|
// return 0 if _text is at end of whole TEXT_TOKEN.
|
|
//
|
|
// Arguments: [fParseQuotes] -- if TRUE, initial and final quotes are removed
|
|
//
|
|
// Notes: Since the string is copied, the caller of this function is
|
|
// responsible for freeing the memory occupied by the string.
|
|
// This method can be called several times before calling
|
|
// Accept(), so many paths can be acquired if they exist in the
|
|
// scanner.
|
|
//
|
|
// History: 96-Jan-03 DwightKr Created
|
|
// 96-Feb-26 DwightKr Allow lines to be quoted
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
WCHAR * CQueryScanner::AcqLine( BOOL fParseQuotes )
|
|
{
|
|
if ( *_text == L'\0' )
|
|
return 0;
|
|
|
|
unsigned cwcBuffer = wcslen(_text);
|
|
|
|
//
|
|
// If there are \r, \n, or other white space at the end of the string,
|
|
// strip it off
|
|
//
|
|
|
|
while ( cwcBuffer > 0 && _text[cwcBuffer-1] <= L' ' )
|
|
cwcBuffer--;
|
|
|
|
if ( fParseQuotes )
|
|
{
|
|
//
|
|
// If there is a pair of quotes delimiting this line, strip them off
|
|
//
|
|
|
|
if ( (L'"' == _text[0]) && (cwcBuffer > 1) )
|
|
{
|
|
if ( L'"' == _text[cwcBuffer-1] )
|
|
cwcBuffer--;
|
|
|
|
_text++;
|
|
cwcBuffer--;
|
|
}
|
|
}
|
|
|
|
WCHAR *pText = new WCHAR [ cwcBuffer + 1 ];
|
|
RtlCopyMemory( pText, _text, cwcBuffer * sizeof(WCHAR) );
|
|
pText[cwcBuffer] = 0;
|
|
|
|
_pLookAhead = _text + cwcBuffer - 1;
|
|
|
|
return pText;
|
|
} //AcqLine
|
|
|