windows-nt/Source/XPSP1/NT/inetsrv/query/qutil/triplish/parser.l

1579 lines
61 KiB
Plaintext
Raw Normal View History

2020-09-26 03:20:57 -05:00
%{
//+---------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1997 - 2000.
//
// File: parser.l
//
// Contents: Lex rules for parser
//
// Notes: Written for flex version 2.5.4
//
// History: 10-01-97 emilyb created
//
//----------------------------------------------------------------------------
class CValueParser;
#include "yybase.hxx"
#include "parser.h"
#include "parsepl.h"
#include "flexcpp.h"
// TOKEN: leave the current lexer action, returning the value-less token id
// tknNum. The macro already ends in ';', so "TOKEN (x);" expands to a return
// statement followed by an empty statement.
#define TOKEN(tknNum) return (tknNum);
// STRING_VALUE: if the matched text (yytext) is not empty or blank-only
// (see IsTokenEmpty), leave the current lexer action returning the token id
// produced by CreateTknValue, which also stores a copy of the text in yylval.
// NOTE: because the macro expands to a conditional return, statements that
// follow it inside an action only execute when the token was empty.
#define STRING_VALUE(tknNum, fLong, fQuote) \
{ \
if (!IsTokenEmpty()) \
return CreateTknValue(yylval, tknNum, fLong, fQuote); \
}
/*
** Make Lex read from a block of data
** buffer is the character buffer,
** result is a variable to store the number of chars read
** ms is the size of the buffer
*/
#undef YY_INPUT
#define YY_INPUT(b, r, ms) (r = yybufferinput(b, ms))
DECLARE_INFOLEVEL(yacc)
//+---------------------------------------------------------------------------
//
// Function: YYLEXER::IsTokenEmpty
//
// Synopsis: Reports whether the current token text (yytext) carries no
//           content, i.e. it is zero length or made up solely of spaces
//           and tabs.
//
// Arguments: None.
//
// Returns: TRUE if every character of yytext is a space or a tab (or
//          there are no characters at all), FALSE otherwise.
//
// History: 08-APR-98 KrishnaN created
//
//----------------------------------------------------------------------------
BOOL YYLEXER::IsTokenEmpty()
{
    LPWSTR pwszToken = yytext;
    Win4Assert(pwszToken);

    // The first character that is neither a space nor a tab settles it.
    for (LPWSTR pwc = pwszToken; 0 != *pwc; pwc++)
    {
        const bool fBlank = (L' ' == *pwc) || (L'\t' == *pwc);
        if (!fBlank)
            return FALSE;
    }

    return TRUE;
}
//+---------------------------------------------------------------------------
//
// Function: YYLEXER::IsNotOperator
//
// Synopsis: Reports whether the current token text (yytext), after any
//           leading spaces/tabs, starts with the word "not" (any case)
//           immediately followed by one of the mode specifier characters
//           '@', '#' or '$'.
//
// Arguments: None.
//
// Returns: TRUE if such a "not" prefix is present, FALSE otherwise.
//
// Notes: NOTE(review): leading blanks are skipped here, but the lexer
//        rules that call this function use yyless(3), which keeps the
//        first three characters of yytext counted from its very start.
//        Confirm that yytext can never begin with a blank at those call
//        sites.
//
// History: 08-DEC-98 KrishnaN created
//
//----------------------------------------------------------------------------
BOOL YYLEXER::IsNotOperator()
{
    LPWSTR pwszToken = yytext;
    Win4Assert(pwszToken);

    // Step over leading spaces and tabs (the terminator is neither, so the
    // scan stops there at the latest).
    LPWSTR pwc = pwszToken;
    while (L' ' == *pwc || L'\t' == *pwc)
        pwc++;

    const int cLeading = static_cast<int>(pwc - pwszToken);

    // "not" plus the mode specifier needs four characters after the blanks.
    if (yyleng < cLeading + 4)
        return FALSE;

    if (L'n' != pwc[0] && L'N' != pwc[0])
        return FALSE;

    if (L'o' != pwc[1] && L'O' != pwc[1])
        return FALSE;

    if (L't' != pwc[2] && L'T' != pwc[2])
        return FALSE;

    // The character right after "not" must be a mode specifier.
    switch (pwc[3])
    {
    case L'@':
    case L'#':
    case L'$':
        return TRUE;

    default:
        return FALSE;
    }
}
//+---------------------------------------------------------------------------
//
// Function: YYLEXER::CreateTknValue
//
// Synopsis: Allocs a WCHAR string which is passed to the YACC value stack.
//
// Arguments: [ppStg] -- set to pointer to alloc'd memory
// [tknNum] -- token id
// [fLong] -- true if token is in longhand version
// [fQuote] -- true if token is quoted
//
// Returns: Updated token id
//
// History: 10-01-97 emilyb created
//
//----------------------------------------------------------------------------
short YYLEXER::CreateTknValue(YYSTYPE *ppStg, short tknNum, BOOL fLong, BOOL fQuote )
{
HRESULT hr = S_OK;
short retTkn = tknNum;
LPWSTR pwsz = yytext;
if (!fQuote)
{
// If we see a double quote, consider the string quoted.
while (L' ' == *pwsz)
pwsz++;
if (*pwsz == L'"')
{
// strip trailing blanks and check if we see a trailing "
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz && L' ' == *pLast )
{
*pLast = L'\0';
pLast--;
}
if (*pLast == L'"' && pLast > pwsz )
fQuote = TRUE;
}
}
// start parsing from the beginning of the string
pwsz = yytext;
if (_PHRASEORREGEX == tknNum)
{
// A quoted string is always a phrase.
if (fQuote)
retTkn = _PHRASE;
else
retTkn = DetermineTokenType();
}
switch (retTkn)
{
case _PHRASE:
{
LPWSTR pLast;
pLast = pwsz + wcslen(pwsz) - 1;
// if long version, find the phrase
if (fLong)
{
pwsz = pwsz + wcslen(L"{phrase}");
pLast = pLast - wcslen(L"{/phrase}"+1);
Win4Assert(*pLast == L'{');
*pLast = L'\0';
}
// strip leading and trailing blanks
while (L' ' == *pwsz)
pwsz++;
pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz && L' ' == *pLast )
{
*pLast = L'\0';
pLast--;
}
// NOTE: Don't strip double quotes here, they will be stripped later
yaccDebugOut((DEB_ITRACE, "Phrase %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _PROPNAME:
{
LPWSTR pLast;
if (fLong) // looks like: { prop name = "prop name" }
{
// find =
while (L'=' != *pwsz)
pwsz++;
pwsz++;
pLast = pwsz + wcslen(pwsz) - 1;
Win4Assert( *pLast == L'}');
*pLast-- = L'\0';
}
else
{
// Strip @ or # or $ token
Win4Assert(*pwsz == L'@' || *pwsz == L'#' || *pwsz == L'$');
pwsz = pwsz + 1;
}
// strip leading and trailing blanks
while (L' ' == *pwsz)
pwsz++;
pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz && L' ' == *pLast )
{
*pLast--= L'\0';
}
if (fQuote)
{
pwsz++;
*pLast = L'\0';
}
yaccDebugOut((DEB_ITRACE, "Propname %ws in %ws format and %ws\n",
pwsz, fLong ? L"Long" : L"Short", fQuote ? L"quoted" : L"unquoted"));
}
break;
case _FREETEXT:
{
LPWSTR pLast;
// if long version, find the FREETEXT
if (fLong)
{
pwsz = pwsz + wcslen(L"{freetext}");
pLast = pwsz + wcslen(pwsz) - 1;
pLast = pLast - wcslen(L"{/freetext}")+1;
Win4Assert(*pLast == L'{');
*pLast = L'\0';
}
// strip leading and trailing blanks
while (L' ' == *pwsz)
pwsz++;
pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz && L' ' == *pLast )
{
*pLast = L'\0';
pLast--;
}
if (fQuote)
{
Win4Assert(pLast >= pwsz+1);
// Strip quotes
pwsz = pwsz + 1;
*pLast = L'\0';
}
yaccDebugOut((DEB_ITRACE, "Freetext %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _REGEX:
{
LPWSTR pLast;
// if long version, find the regex
if (fLong)
{
pwsz = pwsz + wcslen(L"{regex}");
pLast = pwsz + wcslen(pwsz);
pLast = pLast - wcslen(L"{/regex}");
Win4Assert(*pLast == L'{');
*pLast = L'\0';
}
// strip leading blanks
while (L' ' == *pwsz)
pwsz++;
// If the first char is =, ignore it. We only ignore the first
// = character. This is backward compatible with Triplish1
if (L'=' == *pwsz)
pwsz++;
// strip leading and trailing blanks
while (L' ' == *pwsz)
pwsz++;
pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz && L' ' == *pLast )
{
*pLast = L'\0';
pLast--;
}
// After we strip a leading =, we might have a quoted phrase
// Check only if fQuote is false.
// We don't want to deal with an unpaired double quote.
if (!fQuote && *pwsz == L'"' && *pLast == L'"' && pLast > pwsz )
fQuote = TRUE;
if (fQuote)
{
Win4Assert(pLast >= pwsz+1);
// Strip quotes
pwsz = pwsz + 1;
*pLast = L'\0';
}
yaccDebugOut((DEB_ITRACE, "RegEx %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _WEIGHT:
{
Assert (fLong);
Assert(!fQuote);
if (fLong) // looks like: {weight value = number }
{
// find =
while (L'=' != *pwsz)
pwsz++;
pwsz++;
// step past leading blanks
while (L' ' == *pwsz)
pwsz++;
// remove trailing } and blanks
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
Win4Assert(*pLast == L'}');
*(pLast--) = L'\0';
while (pLast >= pwsz && L' ' == *pLast )
{
*(pLast--) = L'\0';
}
}
}
break;
case _NEARDIST:
{
Assert (fLong);
Assert(!fQuote);
if (fLong) // looks like: dist = number
{
// find =
while (L'=' != *pwsz)
pwsz++;
pwsz++;
// step past leading blanks
while (L' ' == *pwsz)
pwsz++;
}
yaccDebugOut((DEB_ITRACE, "NearDist string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _NEARUNIT:
{
Assert (fLong);
Assert(!fQuote);
if (fLong) // looks like: unit = blah
{
// find =
while (L'=' != *pwsz)
pwsz++;
pwsz++;
// step past leading blanks
while (L' ' == *pwsz)
pwsz++;
}
yaccDebugOut((DEB_ITRACE, "NearUnit string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _VECTORELEMENT:
{
// strip leading and trailing blanks
while (L' ' == *pwsz)
pwsz++;
LPWSTR pTemp = pwsz + wcslen(pwsz) - 1;
if (fLong) // strip trailing ;
{
Win4Assert(L';' == *pTemp);
*pTemp--='\0';
}
while (L' ' == *pTemp && pTemp > pwsz)
*pTemp-- = L'\0';
if (fQuote)
{
// Strip quotes
pwsz = pwsz + 1;
pwsz[wcslen(pwsz)-1] = L'\0';
}
yaccDebugOut((DEB_ITRACE, "VectorElem %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
case _VEMETHOD:
{
Assert (fLong);
LPWSTR pTemp;
if (fLong) // looks like: {vector rankmethod= blah}
{
// find =
while (L'=' != *pwsz)
pwsz++;
pwsz++;
// strip trailing }
pTemp = pwsz + wcslen(pwsz) - 1;
Win4Assert(L'}' == *pTemp);
*pTemp-- = L'\0';
}
// strip leading and trailing blanks and quotes
while (L' ' == *pwsz)
pwsz++;
pTemp = pwsz + wcslen(pwsz) - 1;
while (L' ' == *pTemp && pTemp > pwsz)
*pTemp-- = L'\0';
if (fQuote)
{
// Strip quotes
pwsz = pwsz + 1;
pwsz[wcslen(pwsz)-1] = L'\0';
}
yaccDebugOut((DEB_ITRACE, "VectorMethod %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
}
break;
}
int len = wcslen(pwsz);
XPtrST<WCHAR> xwszRet(new WCHAR[len + 1]);
_allocations.Add(xwszRet.GetPointer(), _allocations.Count());
RtlCopyMemory(xwszRet.GetPointer(), pwsz, (len+1) * sizeof(WCHAR));
(*ppStg).pwszChar = xwszRet.Acquire();
return retTkn;
}
//+---------------------------------------------------------------------------
//
// Function: YYLEXER::DetermineTokenType
//
// Synopsis: Determines if we have a regular expression or a regular string.
// A regular expression is a string that contains atleast one of
// *, ?, or | characters.
//
// Returns: Token id
//
// History: Jun-05-98 KrishnaN created
//
//----------------------------------------------------------------------------
short YYLEXER::DetermineTokenType()
{
LPWSTR pwsz = yytext;
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
while (pLast >= pwsz)
{
if (L'|' == *pwsz || L'*' == *pwsz || L'?' == *pwsz)
return _REGEX;
pwsz++;
}
// None of the regular expression defining characters have been found
return _PHRASE;
}
//
//
// RULES
//
// Notes: Any characters which are not matched, cause yylexer to throw.
// We can also throw if E_OUTOFMEMORY.
// Tokens which need to return more than 1 value (e.g. {near})
// use start states to return each piece of the value. The start
// states also emit a "token end" token so that the parser can
// check that they are syntactically complete.
// Lex matches to the longest match in the rules. If 2 matches
// are the same, it matches to the 1st match.
%}
%x innear
%x shortgen
%x shortregex
%x mayberegex
%x implicitphrase
%x infreefreetext
%x invector
white [ \t\n\f\r]+
begin_freetext \{[fF][rR][eE][eE][tT][eE][xX][tT]\}[ ]*
end_freetext [ ]*\{\/[fF][rR][eE][eE][tT][eE][xX][tT]\}
begin_phrase \{[pP][hH][rR][aA][sS][eE]\}[ ]*
end_phrase [ ]*\{\/[pP][hH][rR][aA][sS][eE]\}
prop [pP][rR][oO][pP]
propname {prop}[ ]+[nN][aA][mM][eE][ ]*
contains [cC][oO][nN][tT][aA][iI][nN][sS]
and [aA][nN][dD]
or [oO][rR]
not [nN][oO][tT]
near [nN][eE][aA][rR]
vector [vV][eE][cC][tT][oO][rR]
vecmethod {vector}[ ]+[rR][aA][nN][kK][mM][eE][tT][hH][oO][dD][ ]*
ve [vV][eE]
weight [wW][eE][iI][gG][hH][tT][ ]+[vV][aA][lL][uU][eE][ ]*
coerce [cC][oO][eE][rR][cC][eE]
generate [gG][eE][nN][eE][rR][aA][tT][eE]
genmethod {generate}[ ]+[mM][eE][tT][hH][oO][dD][ ]*
begin_regex \{[rR][eE][gG][eE][xX]\}[ ]*
end_regex [ ]*\{\/[rR][eE][gG][eE][xX]\}
dist [dD][iI][sS][tT][ ]*
unit [uU][nN][iI][tT][ ]*
word [wW][oO][rR][dD]
sent [sS][eE][nN][tT]
par [pP][aA][rR]
chap [cC][hH][aA][pP]
%%
{white} { /* do nothing */ }
\( { fContinueImplicitPhrase = FALSE;
fContinueRegex = FALSE;
fContinueMaybeRegex = FALSE;
TOKEN (_OPEN);
}
\) {
fContinueImplicitPhrase = FALSE;
fContinueRegex = FALSE;
fContinueMaybeRegex = FALSE;
TOKEN (_CLOSE);
}
%{// ************
// PROPNAME
// ************ %}
%{ // If something was treated as a phrase in Tripolish 1, it should
// be treated as such even now. That applies here. For e.g. @propname
// caused the following text to be treated as a phrase. The same should
// apply to {prop name = propname}
//
%}
%{// shorthand, quoted %}
@\"[^"]+\" {
// treat value as a phrase
BEGIN implicitphrase;
STRING_VALUE(_PROPNAME, FALSE, TRUE);
}
%{// shorthand, not quoted %}
@[^" <>=!&|~\^]+ {
// treat value as a phrase
BEGIN implicitphrase;
STRING_VALUE(_PROPNAME, FALSE, FALSE);
}
%{// shorthand, quoted %}
$\"[^"]+\" {
// treat value as freetext
BEGIN infreefreetext;
STRING_VALUE(_PROPNAME, FALSE, TRUE);
}
%{// shorthand, not quoted %}
$[^" <>=!&|~\^]+ {
// treat value as freetext
BEGIN infreefreetext;
STRING_VALUE(_PROPNAME, FALSE, FALSE);
}
%{// longhand, quoted %}
\{{propname}=[ ]*\"[^"]*\"[ ]*\} {
// treat value as a phrase
BEGIN implicitphrase;
STRING_VALUE(_PROPNAME, TRUE, TRUE);
}
%{// longhand, not quoted %}
\{{propname}=[ ]*[^"} ][^}]*\} {
// treat value as a phrase
BEGIN implicitphrase;
STRING_VALUE(_PROPNAME, TRUE, FALSE);
}
%{// closing token %}
\{\/{prop}\} { TOKEN (_PROPEND); }
%{// *********
// OPERATORS
// ********* %}
{contains}[ ]+ { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_CONTAINS);
}
{and}[ ]+ { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_AND);
}
{and}\{ {
yyless(yyleng-1);
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_AND);
}
{or}[ ]+ { if (fContinueImplicitPhrase)
{
yaccDebugOut(( DEB_ITRACE, "fContinueImplicitPhrase\n" ));
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
yaccDebugOut(( DEB_ITRACE, "fContinueRegex\n" ));
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
yaccDebugOut(( DEB_ITRACE, "fContinueMaybeRegex\n" ));
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
yaccDebugOut(( DEB_ITRACE, "OR TOKEN found !!!\n" ));
TOKEN (_OR); }
{or}\{ {
yyless(yyleng-1);
if (fContinueImplicitPhrase)
{
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueImplicitPhrase\n" ));
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueRegex\n" ));
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueMaybeRegex\n" ));
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
yaccDebugOut(( DEB_ITRACE, "OR{ TOKEN found !!!\n" ));
TOKEN (_OR); }
{not}[ ]+ { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_NOT);}
{not}\{ {
yyless(yyleng-1);
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_NOT);}
& { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_AND);}
\| { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_OR);}
! { if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
else if (fContinueRegex)
{
BEGIN shortregex;
fContinueRegex = FALSE;
}
else if (fContinueMaybeRegex)
{
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_NOT);}
{near}[ ]+ { yaccDebugOut(( DEB_ITRACE, "near[ ]+ _NEAR token, begin implicitphrase\n" ));
BEGIN implicitphrase;
TOKEN (_NEAR);}
{near}\{ { yaccDebugOut(( DEB_ITRACE, "near{ _NEAR token, begin implicitphrase\n" ));
yyless(yyleng-1);
BEGIN implicitphrase;
TOKEN (_NEAR);}
~ { BEGIN implicitphrase;
TOKEN (_NEAR);}
\< { TOKEN (_LT);}
\> { TOKEN (_GT);}
\<\= { TOKEN (_LTE);}
\>\= { TOKEN (_GTE);}
\= { if (fContinueMaybeRegex)
{
// We are not sure if we are going to find a
// regular expression or a phrase.
BEGIN mayberegex;
fContinueMaybeRegex = FALSE;
}
TOKEN (_EQ);
}
\!\= { TOKEN (_NE); }
\^a { TOKEN (_ALLOF); }
\^s { TOKEN (_SOMEOF); }
\<[ ]*\^s |
\^s[ ]*\< { TOKEN (_LTSOME); }
\>[ ]*\^s |
\^s[ ]*\> { TOKEN (_GTSOME); }
\<\=[ ]*\^s |
\^s[ ]*\<\= { TOKEN (_LTESOME); }
\>\=[ ]*\^s |
\^s[ ]*\>\= { TOKEN (_GTESOME); }
\=[ ]*\^s |
\^s[ ]*\= { TOKEN (_EQSOME); }
\!\=[ ]*\^s |
\^s[ ]*\!\= { TOKEN (_NESOME); }
\^s[ ]*\^a { TOKEN (_ALLOFSOME); }
\^s[ ]*\^s { TOKEN (_SOMEOFSOME); }
%{// less-than combined with the all-of modifier, accepted in either order %}
\<[ ]*\^a |
\^a[ ]*\< { TOKEN (_LTALL); }
\>[ ]*\^a |
\^a[ ]*\> { TOKEN (_GTALL); }
\<\=[ ]*\^a |
\^a[ ]*\<\= { TOKEN (_LTEALL); }
\>\=[ ]*\^a |
\^a[ ]*\>\= { TOKEN (_GTEALL); }
\=[ ]*\^a |
\^a[ ]*\= { TOKEN (_EQALL); }
\!\=[ ]*\^a |
\^a[ ]*\!\= { TOKEN (_NEALL); }
\^a[ ]*\^a { TOKEN (_ALLOFALL); }
\^a[ ]*\^s { TOKEN (_SOMEOFALL); }
%{// *************
// VECTOR SPACE TOKENS
// ************* %}
\{{vecmethod}=[ ]*\"[^"]*\"[ ]*\} { STRING_VALUE(_VEMETHOD, TRUE, TRUE); }
\{{vecmethod}=[^}]*\} { STRING_VALUE(_VEMETHOD, TRUE, FALSE); }
\{{ve}\} {
// makes more sense to enter phrase mode
// rather than freetext mode.
fContinueImplicitPhrase = TRUE;
BEGIN implicitphrase;
TOKEN (_VE);
}
\{\/{vector}\} { TOKEN (_VECTOR_END); }
%{// *************
// longhand NEAR
// ************* %}
%{// must return both unit and distance, so use start state to pull them out, and
// return _NEAR_END so parser knows we hit the closing }
%}
\{{near}[ ] { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR token, begin innear\n" ));
BEGIN innear; }
\{{near}\{ { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR{ token, begin innear\n" ));
yyless(yyleng-1);
BEGIN innear; }
%{// ************
// WEIGHT
// ************ %}
\{{weight}=[ ]*(0|1|0\.[0-9]*|1\.[0]*|\.[0-9]+)[ ]*\} {
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
yaccDebugOut(( DEB_ITRACE, "_WEIGHT TOKEN FOUND!!\n" ));
STRING_VALUE(_WEIGHT,TRUE,FALSE);
}
\{{coerce}\} {
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
TOKEN (_COERCE); }
%{// ****************
// longhand GENERATE
// **************** %}
\{{genmethod}=[" ]*prefix[" ]*\} {
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
yaccDebugOut((DEB_ITRACE, "Prefix recognized.\n"));
TOKEN(_GENPREFIX);
}
\{{genmethod}=[" ]*inflect[" ]*\} {
if (fContinueImplicitPhrase)
{
BEGIN implicitphrase;
fContinueImplicitPhrase = FALSE;
}
yaccDebugOut((DEB_ITRACE, "Inflect recognized.\n"));
TOKEN(_GENINFLECT);
}
\{\/{generate}\} { TOKEN (_GENNORMAL); }
%{// ****************
// longhand REGEX
// **************** %}
{begin_regex}\"[^"]*\"{end_regex} { STRING_VALUE(_REGEX,TRUE,TRUE);}
{begin_regex}[^{]*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
{begin_regex}([^{]*\|[()\[{}\],*?+][^{]*)*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
%{// ****************
// shorthand REGEX
// **************** %}
%{// shorthand, quoted %}
#\"[^"]+\" {
// Get into short form of reg expression
BEGIN shortregex;
STRING_VALUE(_PROPNAME, FALSE, TRUE);
}
%{// shorthand, not quoted %}
#[^" <>=!&|~\^]+ {
// Get into short form of reg expression
BEGIN shortregex;
STRING_VALUE(_PROPNAME, FALSE, FALSE);
}
%{// ***************
// longhand PHRASE
// *************** %}
%{// quoted, with trailing * or ** %}
{begin_phrase}\"[^"]*\"{end_phrase}\* {
// trailing * has to be for inflection -
// process it in shortgen on next pass.
// Grab phrase now.
yyless(yyleng-1);
BEGIN shortgen;
STRING_VALUE(_PHRASE,TRUE,TRUE);
}
%{// quoted, without trailing * or ** %}
{begin_phrase}\"[^"]*\"{end_phrase} {
// no trailing * -- phrase only
STRING_VALUE(_PHRASE,TRUE,TRUE);
}
%{// unquoted, with trailing * or ** %}
{begin_phrase}[^{]*{end_phrase}\* {
// trailing * has to be for inflection -
// process it in shortgen on next pass.
// Grab phrase now.
yyless(yyleng-1);
BEGIN shortgen;
STRING_VALUE(_PHRASE,TRUE,FALSE);
}
%{// unquoted, without trailing * or ** %}
{begin_phrase}[^{]*{end_phrase} {
// no trailing * -- phrase only
STRING_VALUE(_PHRASE,TRUE,FALSE);
}
%{// *************
// shorthand PHRASE
// ************* %}
%{// with trailing * or ** %}
\"[^"]*\"\* {
// trailing * has to be for inflection -
// process it in shortgen on next pass.
// Grab phrase now.
yyless(yyleng-1);
BEGIN shortgen;
STRING_VALUE(_PHRASE, FALSE, TRUE);
}
%{ // without trailing * or ** %}
\"[^"]*\" {
// no trailing * -- phrase only
STRING_VALUE(_PHRASE, FALSE, TRUE);
}
%{// *****************
// longhand FREETEXT
// ***************** %}
%{// quoted, with trailing * or ** %}
{begin_freetext}\"[^"]*\"{end_freetext}\* {
// trailing * has to be for inflection -
// process it in shortgen on next pass.
// Grab freetext now.
yyless(yyleng-1);
BEGIN shortgen;
STRING_VALUE(_FREETEXT,TRUE,TRUE);
}
%{// quoted, without trailing * or ** %}
{begin_freetext}\"[^"]*\"{end_freetext} {
// no trailing * -- freetext only
STRING_VALUE(_FREETEXT,TRUE,TRUE);
}
%{// unquoted, with trailing * or ** %}
{begin_freetext}[^{]*{end_freetext}\* {
// trailing * has to be for inflection -
// process it in shortgen on next pass.
// Grab freetext now.
yyless(yyleng-1);
BEGIN shortgen;
STRING_VALUE(_FREETEXT,TRUE,FALSE);
}
%{// unquoted, without trailing * or ** %}
{begin_freetext}[^{]*{end_freetext} {
// no trailing * -- freetext only
STRING_VALUE(_FREETEXT,TRUE,FALSE);
}
%{// ******************
// shorthand FREETEXT
// ****************** %}
[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*[ ] {
// For backward compatibility, we want to special
// case and recognize the "not" operator when it
// is immediately followed by a mode specifier character
// (@, $, #). For e.g. "not@size > 2" should be treated
// as if we have a "not" operator followed by "@size > 2".
// Without this special case, "not@size > 2" gets recognized
// as free text.
if (IsNotOperator())
{
yyless(3);
BEGIN INITIAL;
TOKEN(_NOT);
}
yaccDebugOut(( DEB_ITRACE, "fTreatFreetextAsPhrase is %d\n", fTreatFreetextAsPhrase ));
if (fTreatFreetextAsPhrase)
BEGIN implicitphrase;
else
BEGIN infreefreetext;
fTreatFreetextAsPhrase = FALSE;
yymore();
}
[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]* {
    // IsNotOperator is used here for the same reason as the
    // use above, except that this rule covers situations where
    // we have no spaces in the query. E.g. "not@size>2".
    // This should be equivalent to
    // "not@size > 2", which in turn should be equivalent to
    // "not @size > 2"
    if (IsNotOperator())
    {
        yyless(3);
        BEGIN INITIAL;
        TOKEN(_NOT);
    }
    // STRING_VALUE returns from this action for every non-empty token,
    // so the one-shot flag has to be consumed before the macro is used.
    // Resetting it afterwards would leave it set for all later tokens.
    BOOL fAsPhrase = fTreatFreetextAsPhrase;
    fTreatFreetextAsPhrase = FALSE;
    if (fAsPhrase)
    {
        STRING_VALUE(_PHRASE,FALSE,FALSE);
    }
    else
    {
        STRING_VALUE(_FREETEXT,FALSE,FALSE);
    }
}
%{// *************
// VECTOR VALUES
// ************* %}
%{// quoted multi-value vector - has ; separator. Singlets caught in parser %}
\([ ]*\"[^"]*\"[ ]*; { BEGIN invector; yyless(1);}
%{// unquoted multi-value vector - has ; separator. Singlets caught in parser %}
\([^(;)]+; { BEGIN invector; yyless(1);}
%{//
// INNEAR: longhand NEAR processing
//
%}
<innear>{white} {}
<innear>, {}
%{// dist and unit keywords use the case-insensitive definitions, like the unit values and all other keywords %}
<innear>{dist}=[ ]*[0-9]+ { STRING_VALUE(_NEARDIST,TRUE,FALSE);}
<innear>{unit}=[ ]*{word} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>{unit}=[ ]*{sent} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>{unit}=[ ]*{par} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>{unit}=[ ]*{chap} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
<innear>\} { BEGIN implicitphrase; TOKEN (_NEAR_END);}
%{//
// INVECTOR: multi value vector processing
//
%}
<invector>{white} {}
<invector>; {}
<invector>\"[^"]*\" { STRING_VALUE(_VECTORELEMENT, FALSE, TRUE);}
<invector>[^ ";)][^;)]*; { STRING_VALUE(_VECTORELEMENT, TRUE, FALSE);}
<invector>[^ ";)][^;)]*\) {
// Need to emit _VECTORELEMENT and _VE_END -- so backup 1
// so we can emit _VE_END on next pass
yyless(yyleng-1);
STRING_VALUE(_VECTORELEMENT, FALSE, FALSE);
}
<invector>\) { BEGIN INITIAL; TOKEN (_VE_END); }
%{//
// INFREEFREETEXT: shorthand FREETEXT processing
//
// NOTE: and, or, near need to be localized %}
<infreefreetext>[ ]+ { yymore(); }
<infreefreetext>{and}[ ] {
yyless(yyleng-4);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>{and}\{ {
yyless(yyleng-4);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>{or}[ ] {
yyless(yyleng-3);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>{or}\{ {
yyless(yyleng-3);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>{near}[ ] {
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}[ ]\n" ));
yyless(yyleng-5);
fTreatFreetextAsPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<infreefreetext>{near}\{ {
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}{\n" ));
yyless(yyleng-5);
fTreatFreetextAsPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<infreefreetext>\{{near}[ ] {
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}\n" ));
yyless(yyleng-6);
fTreatFreetextAsPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<infreefreetext>\{{near}\{ {
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}{\n" ));
yyless(yyleng-6);
fTreatFreetextAsPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<infreefreetext>& {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>\| {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>~ {
yyless(yyleng-1);
fTreatFreetextAsPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<infreefreetext>\( {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>\) {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>\{ {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>\"[^"]+\" {
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
<infreefreetext>[^~&|{}()" ]+[ ] { yymore(); }
<infreefreetext>[^~&|{}()" ]+ {
BEGIN INITIAL;
STRING_VALUE(_FREETEXT,FALSE,FALSE);
}
%{//
// SHORTGEN: * or ** processing
//
// can only get here by backing up over *,
// so we will always find a match %}
<shortgen>\*\* {
BEGIN INITIAL;
TOKEN(_SHGENINFLECT);
}
<shortgen>\* {
BEGIN INITIAL;
TOKEN(_SHGENPREFIX);
}
%{//
// SHORTREGEX: #propname processing
//
// can only get here when #"propname" or #propname
// (quoted or unquoted) version is detected.
// NOTE: and, or need to be localized
// NOTE: It doesn't make sense to have the near operator following
// a regular expression. A regex is Boolean and doesn't evaluate
// to a position value.
//
//
%}
<shortregex>[ ]+ { yymore(); }
<shortregex>= {
// ignore equal operators...
BEGIN shortregex;
}
<shortregex>\"[^"]*\" { STRING_VALUE(_REGEX, FALSE, TRUE);}
<shortregex>{and}[ ] {
fContinueRegex = TRUE;
yyless(yyleng-4);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>{or}[ ] {
fContinueRegex = TRUE;
yyless(yyleng-3);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>{not}[ ] {
yyless(yyleng-4);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a regex. Back off and let the lexer takes its
// normal course.
fContinueRegex = TRUE;
BEGIN INITIAL;
}
<shortregex>& {
fContinueRegex = TRUE;
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>\| {
fContinueRegex = TRUE;
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>! {
yyless(yyleng-1);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a phrase. Back off and let the lexer takes its
// normal course.
fContinueRegex = TRUE;
BEGIN INITIAL;
}
%{
// When we find an operator we should treat it as one.
// So backup and get out if you see one.
// Normally '^' is treated as part of an operator (e.g. ^a), but it also
// has a special meaning in regular expression syntax. So we will have to
// let it through when it is part of a regular expression. As an alternative,
// we can allow '^' in regular expression in a limited manner (i.e. only the use
// in square brackets to exclude the set of chars "[^abc]" where abc are excluded).
// This alternative will let the common case use of '^' in a regular expression
// while allowing it to be treated as part of an operator when it doesn't
// occur immediately after a '['.
// We are implementing the alternative here because our regex capability
// only allows for the "[^" construct.
%}
<shortregex>[\^<>@$#] {
yyless(yyleng-1);
fContinueRegex = FALSE;
BEGIN INITIAL;
}
<shortregex>\( {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>\) {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>\{ {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+[ ] { yymore(); }
<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+ {
fContinueRegex = TRUE;
BEGIN INITIAL;
STRING_VALUE(_REGEX,FALSE,FALSE);
}
<mayberegex>{and}[ ] {
yyless(yyleng-4);
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>{or}[ ] {
yyless(yyleng-3);
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>{not}[ ] {
yyless(yyleng-4);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a regex. Back off and let the lexer takes its
// normal course.
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
}
<mayberegex>& {
fContinueMaybeRegex = TRUE;
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>\| {
fContinueMaybeRegex = TRUE;
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>! {
yyless(yyleng-1);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a phrase. Back off and let the lexer takes its
// normal course.
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
}
<mayberegex>\( {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>\) {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>\{ {
yyless(yyleng-1);
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
<mayberegex>[ ]+ { yymore(); }
<mayberegex>\"[^"]*\" { STRING_VALUE(_PHRASE, FALSE, TRUE);}
<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+[ ] { yymore(); }
<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+ {
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
%{
// When we find an operator at the start of a phrase,
// we should treat it as one. So backup and get out if you see one.
%}
<mayberegex>[\^<>@$#] {
yyless(yyleng-1);
fContinueMaybeRegex = FALSE;
BEGIN INITIAL;
}
%{//
// IMPLICITPHRASE: Where phrase is implied.
//
// can only get here when @propname or {prop name = propname} is detected.
// NOTE: and, or, not need to be localized when time permits.
//
// NTRAID#DB-NTBUG9-84571-2000/07/31-dlee Indexing Service tripolish2 query expressions misinterpreted as strings
// if expression has trailing blanks, we'll emit a string value
%}
<implicitphrase>\"[^"]*\" {
fContinueImplicitPhrase = FALSE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE, FALSE, TRUE);
}
<implicitphrase>[ ]+ { yymore(); }
<implicitphrase>{and}[ ] {
yyless(yyleng-4);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>{or}[ ] {
yyless(yyleng-3);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>{near}[ ] {
yyless(yyleng-5);
// We want to treat the following token as a phrase
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>{near}\{ {
yyless(yyleng-5);
// We want to treat the following token as a phrase
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>{not}[ ] {
yyless(yyleng-4);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a phrase. Back off and let the lexer takes its
// normal course.
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
}
<implicitphrase>& {
yyless(yyleng-1);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>~ {
yyless(yyleng-1);
// We want to treat the following token as a phrase
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>! {
yyless(yyleng-1);
// The only valid way to get here is to
// have had seen "and" before. Don't recognize
// a phrase. Back off and let the lexer takes its
// normal course.
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
}
<implicitphrase>\| {
yyless(yyleng-1);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>\( {
yyless(yyleng-1);
fContinueImplicitPhrase = FALSE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>\) {
yyless(yyleng-1);
fContinueImplicitPhrase = FALSE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>\{ {
yyless(yyleng-1);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
<implicitphrase>{contains}[ ] {
yyless(yyleng-9);
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}
%{
// When we find an operator at the start of an implicit phrase,
// we should treat it as one. So backup and get out if you see one.
%}
<implicitphrase>[\^<>@$#] {
yyless(yyleng-1);
fContinueImplicitPhrase = FALSE;
BEGIN INITIAL;
}
%{
// Triplish2 uses = to indicate that whatever appears after it may
// be using wildcards. Implement that here.
%}
<implicitphrase>= {
yyless(yyleng-1);
fContinueMaybeRegex = TRUE;
BEGIN INITIAL;
}
<implicitphrase>[^~&|{}()\^<>=!@$# ]+[ ] { yymore(); }
<implicitphrase>[^~&|{}()\^<>=!@$# ]+ {
fContinueImplicitPhrase = TRUE;
BEGIN INITIAL;
STRING_VALUE(_PHRASE,FALSE,FALSE);
}