1579 lines
61 KiB
Plaintext
1579 lines
61 KiB
Plaintext
%{
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Microsoft Windows
|
|
// Copyright (C) Microsoft Corporation, 1997 - 2000.
|
|
//
|
|
// File: parser.l
|
|
//
|
|
// Contents: Lex rules for parser
|
|
//
|
|
// Notes: Written for flex version 2.5.4
|
|
//
|
|
// History: 10-01-97 emilyb created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
class CValueParser;
|
|
|
|
#include "yybase.hxx"
|
|
#include "parser.h"
|
|
#include "parsepl.h"
|
|
#include "flexcpp.h"
|
|
|
|
// TOKEN: return the given token id from the current lexer action.
#define TOKEN(tknNum) return (tknNum);
|
|
// STRING_VALUE: if the current token text is not empty (see IsTokenEmpty),
// return the token id produced by CreateTknValue, which also stores the
// token's string value through yylval. If the token is empty, nothing is
// returned and control falls through to whatever follows the macro in the
// action.
#define STRING_VALUE(tknNum, fLong, fQuote) \
|
|
{ \
|
|
if (!IsTokenEmpty()) \
|
|
return CreateTknValue(yylval, tknNum, fLong, fQuote); \
|
|
}
|
|
|
|
/*
|
|
** Make Lex read from a block of data
|
|
** buffer is the character buffer,
|
|
** result is a variable to store the number of chars read
|
|
** ms is the size of the buffer
|
|
*/
|
|
#undef YY_INPUT
|
|
#define YY_INPUT(b, r, ms) (r = yybufferinput(b, ms))
|
|
|
|
DECLARE_INFOLEVEL(yacc)
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: YYLEXER::IsTokenEmpty
|
|
//
|
|
// Synopsis: Determines if a token is empty. An empty token only has
|
|
// whitespace or has nothing in it.
|
|
//
|
|
// Arguments: None.
|
|
//
|
|
// Returns: Boolean value.
|
|
//
|
|
// History: 08-APR-98 KrishnaN created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
BOOL YYLEXER::IsTokenEmpty()
|
|
{
|
|
LPWSTR pwsz = yytext;
|
|
|
|
Win4Assert(pwsz);
|
|
|
|
while (*pwsz != 0)
|
|
{
|
|
if (*pwsz != L' ' && *pwsz != L'\t')
|
|
return FALSE;
|
|
pwsz++;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: YYLEXER::IsNotOperator
|
|
//
|
|
// Synopsis: Determines if we have a not operator.
|
|
//
|
|
// Arguments: None.
|
|
//
|
|
// Returns: Boolean value.
|
|
//
|
|
// History: 08-DEC-98 KrishnaN created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
BOOL YYLEXER::IsNotOperator()
|
|
{
|
|
LPWSTR pwsz = yytext;
|
|
|
|
Win4Assert(pwsz);
|
|
|
|
// skip past leading spaces
|
|
int i = 0;
|
|
while (*pwsz != 0 && (*pwsz == L' ' || *pwsz == L'\t'))
|
|
{
|
|
pwsz++;
|
|
i++;
|
|
}
|
|
|
|
// If we don't have at least four chars to consider, we don't have a
|
|
// not operator.
|
|
if (yyleng < i+4)
|
|
return FALSE;
|
|
|
|
if ( (*pwsz == L'n' || *pwsz == L'N') &&
|
|
(*(pwsz+1) == L'o' || *(pwsz+1) == L'O') &&
|
|
(*(pwsz+2) == L't' || *(pwsz+2) == L'T') &&
|
|
(*(pwsz+3) == L'@' || *(pwsz+3) == L'#' || *(pwsz+3) == L'$')
|
|
)
|
|
return TRUE;
|
|
else
|
|
return FALSE;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: YYLEXER::CreateTknValue
|
|
//
|
|
// Synopsis: Allocs a WCHAR string which is passed to the YACC value stack.
|
|
//
|
|
// Arguments: [ppStg] -- set to pointer to alloc'd memory
|
|
// [tknNum] -- token id
|
|
// [fLong] -- true if token is in longhand version
|
|
// [fQuote] -- true if token is quoted
|
|
//
|
|
// Returns: Updated token id
|
|
//
|
|
// History: 10-01-97 emilyb created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
short YYLEXER::CreateTknValue(YYSTYPE *ppStg, short tknNum, BOOL fLong, BOOL fQuote )
|
|
{
|
|
HRESULT hr = S_OK;
|
|
short retTkn = tknNum;
|
|
LPWSTR pwsz = yytext;
|
|
|
|
if (!fQuote)
|
|
{
|
|
// If we see a double quote, consider the string quoted.
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
if (*pwsz == L'"')
|
|
{
|
|
// strip trailing blanks and check if we see a trailing "
|
|
|
|
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*pLast = L'\0';
|
|
pLast--;
|
|
}
|
|
|
|
if (*pLast == L'"' && pLast > pwsz )
|
|
fQuote = TRUE;
|
|
}
|
|
}
|
|
|
|
// start parsing from the beginning of the string
|
|
pwsz = yytext;
|
|
|
|
if (_PHRASEORREGEX == tknNum)
|
|
{
|
|
// A quoted string is always a phrase.
|
|
if (fQuote)
|
|
retTkn = _PHRASE;
|
|
else
|
|
retTkn = DetermineTokenType();
|
|
}
|
|
|
|
switch (retTkn)
|
|
{
|
|
case _PHRASE:
|
|
|
|
{
|
|
LPWSTR pLast;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
// if long version, find the phrase
|
|
if (fLong)
|
|
{
|
|
pwsz = pwsz + wcslen(L"{phrase}");
|
|
pLast = pLast - wcslen(L"{/phrase}"+1);
|
|
Win4Assert(*pLast == L'{');
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
// strip leading and trailing blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*pLast = L'\0';
|
|
pLast--;
|
|
}
|
|
// NOTE: Don't strip double quotes here, they will be stripped later
|
|
|
|
yaccDebugOut((DEB_ITRACE, "Phrase %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
case _PROPNAME:
|
|
|
|
{
|
|
LPWSTR pLast;
|
|
|
|
if (fLong) // looks like: { prop name = "prop name" }
|
|
{
|
|
// find =
|
|
while (L'=' != *pwsz)
|
|
pwsz++;
|
|
pwsz++;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
Win4Assert( *pLast == L'}');
|
|
*pLast-- = L'\0';
|
|
}
|
|
else
|
|
{
|
|
// Strip @ or # or $ token
|
|
Win4Assert(*pwsz == L'@' || *pwsz == L'#' || *pwsz == L'$');
|
|
pwsz = pwsz + 1;
|
|
}
|
|
|
|
// strip leading and trailing blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*pLast--= L'\0';
|
|
}
|
|
|
|
if (fQuote)
|
|
{
|
|
pwsz++;
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "Propname %ws in %ws format and %ws\n",
|
|
pwsz, fLong ? L"Long" : L"Short", fQuote ? L"quoted" : L"unquoted"));
|
|
}
|
|
break;
|
|
case _FREETEXT:
|
|
{
|
|
LPWSTR pLast;
|
|
|
|
// if long version, find the FREETEXT
|
|
if (fLong)
|
|
{
|
|
pwsz = pwsz + wcslen(L"{freetext}");
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
pLast = pLast - wcslen(L"{/freetext}")+1;
|
|
Win4Assert(*pLast == L'{');
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
// strip leading and trailing blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*pLast = L'\0';
|
|
pLast--;
|
|
}
|
|
|
|
if (fQuote)
|
|
{
|
|
Win4Assert(pLast >= pwsz+1);
|
|
// Strip quotes
|
|
pwsz = pwsz + 1;
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "Freetext %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
|
|
case _REGEX:
|
|
{
|
|
LPWSTR pLast;
|
|
|
|
// if long version, find the regex
|
|
if (fLong)
|
|
{
|
|
pwsz = pwsz + wcslen(L"{regex}");
|
|
pLast = pwsz + wcslen(pwsz);
|
|
pLast = pLast - wcslen(L"{/regex}");
|
|
Win4Assert(*pLast == L'{');
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
// strip leading blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
// If the first char is =, ignore it. We only ignore the first
|
|
// = character. This is backward compatible with Triplish1
|
|
if (L'=' == *pwsz)
|
|
pwsz++;
|
|
|
|
// strip leading and trailing blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*pLast = L'\0';
|
|
pLast--;
|
|
}
|
|
|
|
// After we strip a leading =, we might have a quoted phrase
|
|
// Check only if fQuote is false.
|
|
// We don't want to deal with an unpaired double quote.
|
|
if (!fQuote && *pwsz == L'"' && *pLast == L'"' && pLast > pwsz )
|
|
fQuote = TRUE;
|
|
|
|
if (fQuote)
|
|
{
|
|
Win4Assert(pLast >= pwsz+1);
|
|
// Strip quotes
|
|
pwsz = pwsz + 1;
|
|
*pLast = L'\0';
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "RegEx %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
|
|
case _WEIGHT:
|
|
{
|
|
Assert (fLong);
|
|
Assert(!fQuote);
|
|
if (fLong) // looks like: {weight value = number }
|
|
{
|
|
// find =
|
|
while (L'=' != *pwsz)
|
|
pwsz++;
|
|
pwsz++;
|
|
|
|
// step past leading blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
// remove trailing } and blanks
|
|
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
|
|
Win4Assert(*pLast == L'}');
|
|
*(pLast--) = L'\0';
|
|
|
|
while (pLast >= pwsz && L' ' == *pLast )
|
|
{
|
|
*(pLast--) = L'\0';
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case _NEARDIST:
|
|
{
|
|
Assert (fLong);
|
|
Assert(!fQuote);
|
|
if (fLong) // looks like: dist = number
|
|
{
|
|
// find =
|
|
while (L'=' != *pwsz)
|
|
pwsz++;
|
|
pwsz++;
|
|
|
|
// step past leading blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "NearDist string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
case _NEARUNIT:
|
|
{
|
|
Assert (fLong);
|
|
Assert(!fQuote);
|
|
if (fLong) // looks like: unit = blah
|
|
{
|
|
// find =
|
|
while (L'=' != *pwsz)
|
|
pwsz++;
|
|
pwsz++;
|
|
|
|
// step past leading blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "NearUnit string: %ws in %s format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
case _VECTORELEMENT:
|
|
{
|
|
// strip leading and trailing blanks
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
LPWSTR pTemp = pwsz + wcslen(pwsz) - 1;
|
|
|
|
if (fLong) // strip trailing ;
|
|
{
|
|
Win4Assert(L';' == *pTemp);
|
|
*pTemp--='\0';
|
|
}
|
|
|
|
while (L' ' == *pTemp && pTemp > pwsz)
|
|
*pTemp-- = L'\0';
|
|
|
|
if (fQuote)
|
|
{
|
|
// Strip quotes
|
|
pwsz = pwsz + 1;
|
|
pwsz[wcslen(pwsz)-1] = L'\0';
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "VectorElem %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
case _VEMETHOD:
|
|
{
|
|
Assert (fLong);
|
|
|
|
LPWSTR pTemp;
|
|
|
|
if (fLong) // looks like: {vector rankmethod= blah}
|
|
{
|
|
// find =
|
|
while (L'=' != *pwsz)
|
|
pwsz++;
|
|
pwsz++;
|
|
|
|
// strip trailing }
|
|
pTemp = pwsz + wcslen(pwsz) - 1;
|
|
Win4Assert(L'}' == *pTemp);
|
|
*pTemp-- = L'\0';
|
|
|
|
}
|
|
|
|
// strip leading and trailing blanks and quotes
|
|
while (L' ' == *pwsz)
|
|
pwsz++;
|
|
|
|
pTemp = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (L' ' == *pTemp && pTemp > pwsz)
|
|
*pTemp-- = L'\0';
|
|
|
|
if (fQuote)
|
|
{
|
|
// Strip quotes
|
|
pwsz = pwsz + 1;
|
|
pwsz[wcslen(pwsz)-1] = L'\0';
|
|
}
|
|
|
|
yaccDebugOut((DEB_ITRACE, "VectorMethod %ws in %ws format\n", pwsz, fLong ? L"Long" : L"Short"));
|
|
}
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
int len = wcslen(pwsz);
|
|
XPtrST<WCHAR> xwszRet(new WCHAR[len + 1]);
|
|
|
|
_allocations.Add(xwszRet.GetPointer(), _allocations.Count());
|
|
|
|
RtlCopyMemory(xwszRet.GetPointer(), pwsz, (len+1) * sizeof(WCHAR));
|
|
(*ppStg).pwszChar = xwszRet.Acquire();
|
|
|
|
return retTkn;
|
|
}
|
|
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Function: YYLEXER::DetermineTokenType
|
|
//
|
|
// Synopsis: Determines if we have a regular expression or a regular string.
|
|
// A regular expression is a string that contains atleast one of
|
|
// *, ?, or | characters.
|
|
//
|
|
// Returns: Token id
|
|
//
|
|
// History: Jun-05-98 KrishnaN created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
short YYLEXER::DetermineTokenType()
|
|
{
|
|
LPWSTR pwsz = yytext;
|
|
|
|
LPWSTR pLast = pwsz + wcslen(pwsz) - 1;
|
|
|
|
while (pLast >= pwsz)
|
|
{
|
|
if (L'|' == *pwsz || L'*' == *pwsz || L'?' == *pwsz)
|
|
return _REGEX;
|
|
|
|
pwsz++;
|
|
}
|
|
|
|
// None of the regular expression defining characters have been found
|
|
return _PHRASE;
|
|
}
|
|
|
|
|
|
//
|
|
//
|
|
// RULES
|
|
//
|
|
// Notes: Any characters which are not matched, cause yylexer to throw.
|
|
// We can also throw if E_OUTOFMEMORY.
|
|
// Tokens which need to return more than 1 value (e.g. {near})
|
|
// use start states to return each piece of the value. The start
|
|
// states also emit a "token end" token so that the parser can
|
|
// check that they are syntactically complete.
|
|
// Lex matches to the longest match in the rules. If 2 matches
|
|
// are the same, it matches to the 1st match.
|
|
%}
|
|
|
|
%x innear
|
|
%x shortgen
|
|
%x shortregex
|
|
%x mayberegex
|
|
%x implicitphrase
|
|
%x infreefreetext
|
|
%x invector
|
|
|
|
white [ \t\n\f\r]+
|
|
|
|
begin_freetext \{[fF][rR][eE][eE][tT][eE][xX][tT]\}[ ]*
|
|
end_freetext [ ]*\{\/[fF][rR][eE][eE][tT][eE][xX][tT]\}
|
|
begin_phrase \{[pP][hH][rR][aA][sS][eE]\}[ ]*
|
|
end_phrase [ ]*\{\/[pP][hH][rR][aA][sS][eE]\}
|
|
prop [pP][rR][oO][pP]
|
|
propname {prop}[ ]+[nN][aA][mM][eE][ ]*
|
|
contains [cC][oO][nN][tT][aA][iI][nN][sS]
|
|
and [aA][nN][dD]
|
|
or [oO][rR]
|
|
not [nN][oO][tT]
|
|
near [nN][eE][aA][rR]
|
|
vector [vV][eE][cC][tT][oO][rR]
|
|
vecmethod {vector}[ ]+[rR][aA][nN][kK][mM][eE][tT][hH][oO][dD][ ]*
|
|
ve [vV][eE]
|
|
weight [wW][eE][iI][gG][hH][tT][ ]+[vV][aA][lL][uU][eE][ ]*
|
|
coerce [cC][oO][eE][rR][cC][eE]
|
|
generate [gG][eE][nN][eE][rR][aA][tT][eE]
|
|
genmethod {generate}[ ]+[mM][eE][tT][hH][oO][dD][ ]*
|
|
begin_regex \{[rR][eE][gG][eE][xX]\}[ ]*
|
|
end_regex [ ]*\{\/[rR][eE][gG][eE][xX]\}
|
|
dist [dD][iI][sS][tT][ ]*
|
|
unit [uU][nN][iI][tT][ ]*
|
|
word [wW][oO][rR][dD]
|
|
sent [sS][eE][nN][tT]
|
|
par [pP][aA][rR]
|
|
chap [cC][hH][aA][pP]
|
|
|
|
%%
|
|
|
|
{white} { /* do nothing */ }
|
|
|
|
\( { fContinueImplicitPhrase = FALSE;
|
|
fContinueRegex = FALSE;
|
|
fContinueMaybeRegex = FALSE;
|
|
TOKEN (_OPEN);
|
|
}
|
|
\) {
|
|
fContinueImplicitPhrase = FALSE;
|
|
fContinueRegex = FALSE;
|
|
fContinueMaybeRegex = FALSE;
|
|
TOKEN (_CLOSE);
|
|
}
|
|
|
|
%{// ************
|
|
// PROPNAME
|
|
// ************ %}
|
|
|
|
%{ // If something was treated as a phrase in Tripolish 1, it should
|
|
// be treated as such even now. That applies here. For e.g. @propname
|
|
// caused the following text to be treated as a phrase. The same should
|
|
// apply to {prop name = propname}
|
|
//
|
|
%}
|
|
|
|
%{// shorthand, quoted %}
|
|
@\"[^"]+\" {
|
|
// treat value as a phrase
|
|
BEGIN implicitphrase;
|
|
STRING_VALUE(_PROPNAME, FALSE, TRUE);
|
|
}
|
|
%{// shorthand, not quoted %}
|
|
@[^" <>=!&|~\^]+ {
|
|
// treat value as a phrase
|
|
BEGIN implicitphrase;
|
|
STRING_VALUE(_PROPNAME, FALSE, FALSE);
|
|
}
|
|
|
|
%{// shorthand, quoted %}
|
|
$\"[^"]+\" {
|
|
// treat value as freetext
|
|
BEGIN infreefreetext;
|
|
STRING_VALUE(_PROPNAME, FALSE, TRUE);
|
|
}
|
|
%{// shorthand, not quoted %}
|
|
$[^" <>=!&|~\^]+ {
|
|
// treat value as freetext
|
|
BEGIN infreefreetext;
|
|
STRING_VALUE(_PROPNAME, FALSE, FALSE);
|
|
}
|
|
|
|
%{// longhand, quoted %}
|
|
\{{propname}=[ ]*\"[^"]*\"[ ]*\} {
|
|
// treat value as a phrase
|
|
BEGIN implicitphrase;
|
|
STRING_VALUE(_PROPNAME, TRUE, TRUE);
|
|
}
|
|
%{// longhand, not quoted %}
|
|
\{{propname}=[ ]*[^"} ][^}]*\} {
|
|
// treat value as a phrase
|
|
BEGIN implicitphrase;
|
|
STRING_VALUE(_PROPNAME, TRUE, FALSE);
|
|
}
|
|
%{// closing token %}
|
|
\{\/{prop}\} { TOKEN (_PROPEND); }
|
|
|
|
%{// *********
|
|
// OPERATORS
|
|
// ********* %}
|
|
|
|
{contains}[ ]+ { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_CONTAINS);
|
|
}
|
|
{and}[ ]+ { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_AND);
|
|
}
|
|
{and}\{ {
|
|
yyless(yyleng-1);
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_AND);
|
|
}
|
|
{or}[ ]+ { if (fContinueImplicitPhrase)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "fContinueImplicitPhrase\n" ));
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "fContinueRegex\n" ));
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "fContinueMaybeRegex\n" ));
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
yaccDebugOut(( DEB_ITRACE, "OR TOKEN found !!!\n" ));
|
|
TOKEN (_OR); }
|
|
{or}\{ {
|
|
yyless(yyleng-1);
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueImplicitPhrase\n" ));
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueRegex\n" ));
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
yaccDebugOut(( DEB_ITRACE, "OR{ fContinueMaybeRegex\n" ));
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
yaccDebugOut(( DEB_ITRACE, "OR{ TOKEN found !!!\n" ));
|
|
TOKEN (_OR); }
|
|
{not}[ ]+ { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_NOT);}
|
|
{not}\{ {
|
|
yyless(yyleng-1);
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_NOT);}
|
|
& { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_AND);}
|
|
\| { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_OR);}
|
|
! { if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
else if (fContinueRegex)
|
|
{
|
|
BEGIN shortregex;
|
|
fContinueRegex = FALSE;
|
|
}
|
|
else if (fContinueMaybeRegex)
|
|
{
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_NOT);}
|
|
{near}[ ]+ { yaccDebugOut(( DEB_ITRACE, "near[ ]+ _NEAR token, begin implicitphrase\n" ));
|
|
BEGIN implicitphrase;
|
|
TOKEN (_NEAR);}
|
|
{near}\{ { yaccDebugOut(( DEB_ITRACE, "near{ _NEAR token, begin implicitphrase\n" ));
|
|
yyless(yyleng-1);
|
|
BEGIN implicitphrase;
|
|
TOKEN (_NEAR);}
|
|
~ { BEGIN implicitphrase;
|
|
TOKEN (_NEAR);}
|
|
\< { TOKEN (_LT);}
|
|
\> { TOKEN (_GT);}
|
|
\<\= { TOKEN (_LTE);}
|
|
\>\= { TOKEN (_GTE);}
|
|
\= { if (fContinueMaybeRegex)
|
|
{
|
|
// We are not sure if we are going to find a
|
|
// regular expression or a phrase.
|
|
|
|
BEGIN mayberegex;
|
|
fContinueMaybeRegex = FALSE;
|
|
}
|
|
TOKEN (_EQ);
|
|
}
|
|
\!\= { TOKEN (_NE); }
|
|
\^a { TOKEN (_ALLOF); }
|
|
\^s { TOKEN (_SOMEOF); }
|
|
\<[ ]*\^s |
|
|
\^s[ ]*\< { TOKEN (_LTSOME); }
|
|
\>[ ]*\^s |
|
|
\^s[ ]*\> { TOKEN (_GTSOME); }
|
|
\<\=[ ]*\^s |
|
|
\^s[ ]*\<\= { TOKEN (_LTESOME); }
|
|
\>\=[ ]*\^s |
|
|
\^s[ ]*\>\= { TOKEN (_GTESOME); }
|
|
\=[ ]*\^s |
|
|
\^s[ ]*\= { TOKEN (_EQSOME); }
|
|
\!\=[ ]*\^s |
|
|
\^s[ ]*\!\= { TOKEN (_NESOME); }
|
|
|
|
\^s[ ]*\^a { TOKEN (_ALLOFSOME); }
|
|
\^s[ ]*\^s { TOKEN (_SOMEOFSOME); }
|
|
%{// "<" combined with ^a, in either order. The first alternative starts with
  // "<", like the corresponding ^s rule and the other ^a rules. %}
\<[ ]*\^a               |
\^a[ ]*\<               { TOKEN (_LTALL); }
|
|
\>[ ]*\^a |
|
|
\^a[ ]*\> { TOKEN (_GTALL); }
|
|
\<\=[ ]*\^a |
|
|
\^a[ ]*\<\= { TOKEN (_LTEALL); }
|
|
\>\=[ ]*\^a |
|
|
\^a[ ]*\>\= { TOKEN (_GTEALL); }
|
|
\=[ ]*\^a |
|
|
\^a[ ]*\= { TOKEN (_EQALL); }
|
|
\!\=[ ]*\^a |
|
|
\^a[ ]*\!\= { TOKEN (_NEALL); }
|
|
\^a[ ]*\^a { TOKEN (_ALLOFALL); }
|
|
\^a[ ]*\^s { TOKEN (_SOMEOFALL); }
|
|
|
|
%{// *************
|
|
// VECTOR SPACE TOKENS
|
|
// ************* %}
|
|
\{{vecmethod}=[ ]*\"[^"]*\"[ ]*\} { STRING_VALUE(_VEMETHOD, TRUE, TRUE); }
|
|
\{{vecmethod}=[^}]*\} { STRING_VALUE(_VEMETHOD, TRUE, FALSE); }
|
|
\{{ve}\} {
|
|
// makes more sense to enter phrase mode
|
|
// rather than freetext mode.
|
|
fContinueImplicitPhrase = TRUE;
|
|
BEGIN implicitphrase;
|
|
TOKEN (_VE);
|
|
}
|
|
|
|
\{\/{vector}\} { TOKEN (_VECTOR_END); }
|
|
|
|
%{// *************
|
|
// longhand NEAR
|
|
// ************* %}
|
|
|
|
%{// must return both unit and distance, so use start state to pull them out, and
|
|
// return _NEAR_END so parser knows we hit the closing }
|
|
%}
|
|
\{{near}[ ] { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR token, begin innear\n" ));
|
|
BEGIN innear; }
|
|
\{{near}\{ { yaccDebugOut(( DEB_ITRACE, "Longhand _NEAR{ token, begin innear\n" ));
|
|
yyless(yyleng-1);
|
|
BEGIN innear; }
|
|
|
|
%{// ************
|
|
// WEIGHT
|
|
// ************ %}
|
|
\{{weight}=[ ]*(0|1|0\.[0-9]*|1\.[0]*|\.[0-9]+)[ ]*\} {
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
|
|
yaccDebugOut(( DEB_ITRACE, "_WEIGHT TOKEN FOUND!!\n" ));
|
|
STRING_VALUE(_WEIGHT,TRUE,FALSE);
|
|
}
|
|
|
|
\{{coerce}\} {
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
TOKEN (_COERCE); }
|
|
|
|
%{// ****************
|
|
// longhand GENERATE
|
|
// **************** %}
|
|
|
|
\{{genmethod}=[" ]*prefix[" ]*\} {
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
yaccDebugOut((DEB_ITRACE, "Prefix recognized.\n"));
|
|
TOKEN(_GENPREFIX);
|
|
}
|
|
\{{genmethod}=[" ]*inflect[" ]*\} {
|
|
if (fContinueImplicitPhrase)
|
|
{
|
|
BEGIN implicitphrase;
|
|
fContinueImplicitPhrase = FALSE;
|
|
}
|
|
yaccDebugOut((DEB_ITRACE, "Inflect recognized.\n"));
|
|
TOKEN(_GENINFLECT);
|
|
}
|
|
\{\/{generate}\} { TOKEN (_GENNORMAL); }
|
|
|
|
%{// ****************
|
|
// longhand REGEX
|
|
// **************** %}
|
|
|
|
{begin_regex}\"[^"]*\"{end_regex} { STRING_VALUE(_REGEX,TRUE,TRUE);}
|
|
{begin_regex}[^{]*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
|
|
{begin_regex}([^{]*\|[()\[{}\],*?+][^{]*)*{end_regex} { STRING_VALUE(_REGEX,TRUE,FALSE);}
|
|
|
|
|
|
%{// ****************
|
|
// shorthand REGEX
|
|
// **************** %}
|
|
|
|
%{// shorthand, quoted %}
|
|
#\"[^"]+\" {
|
|
// Get into short form of reg expression
|
|
BEGIN shortregex;
|
|
STRING_VALUE(_PROPNAME, FALSE, TRUE);
|
|
}
|
|
%{// shorthand, not quoted %}
|
|
#[^" <>=!&|~\^]+ {
|
|
// Get into short form of reg expression
|
|
BEGIN shortregex;
|
|
STRING_VALUE(_PROPNAME, FALSE, FALSE);
|
|
}
|
|
|
|
|
|
%{// ***************
|
|
// longhand PHRASE
|
|
// *************** %}
|
|
|
|
%{// quoted, with trailing * or ** %}
|
|
{begin_phrase}\"[^"]*\"{end_phrase}\* {
|
|
// trailing * has to be for inflection -
|
|
// process it in shortgen on next pass.
|
|
// Grab phrase now.
|
|
yyless(yyleng-1);
|
|
BEGIN shortgen;
|
|
STRING_VALUE(_PHRASE,TRUE,TRUE);
|
|
}
|
|
%{// quoted, without trailing * or ** %}
|
|
{begin_phrase}\"[^"]*\"{end_phrase} {
|
|
// no trailing * -- phrase only
|
|
STRING_VALUE(_PHRASE,TRUE,TRUE);
|
|
}
|
|
%{// unquoted, with trailing * or ** %}
|
|
{begin_phrase}[^{]*{end_phrase}\* {
|
|
// trailing * has to be for inflection -
|
|
// process it in shortgen on next pass.
|
|
// Grab phrase now.
|
|
yyless(yyleng-1);
|
|
BEGIN shortgen;
|
|
STRING_VALUE(_PHRASE,TRUE,FALSE);
|
|
}
|
|
%{// unquoted, without trailing * or ** %}
|
|
{begin_phrase}[^{]*{end_phrase} {
|
|
// no trailing * -- phrase only
|
|
STRING_VALUE(_PHRASE,TRUE,FALSE);
|
|
}
|
|
%{// *************
|
|
// shorthand PHRASE
|
|
// ************* %}
|
|
|
|
%{// with trailing * or ** %}
|
|
\"[^"]*\"\* {
|
|
// trailing * has to be for inflection -
|
|
// process it in shortgen on next pass.
|
|
// Grab phrase now.
|
|
yyless(yyleng-1);
|
|
BEGIN shortgen;
|
|
STRING_VALUE(_PHRASE, FALSE, TRUE);
|
|
}
|
|
%{ // without trailing * or ** %}
|
|
\"[^"]*\" {
|
|
// no trailing * -- phrase only
|
|
STRING_VALUE(_PHRASE, FALSE, TRUE);
|
|
}
|
|
|
|
%{// *****************
|
|
// longhand FREETEXT
|
|
// ***************** %}
|
|
|
|
%{// quoted, with trailing * or ** %}
|
|
{begin_freetext}\"[^"]*\"{end_freetext}\* {
|
|
// trailing * has to be for inflection -
|
|
// process it in shortgen on next pass.
|
|
// Grab freetext now.
|
|
yyless(yyleng-1);
|
|
BEGIN shortgen;
|
|
STRING_VALUE(_FREETEXT,TRUE,TRUE);
|
|
}
|
|
%{// quoted, without trailing * or ** %}
|
|
{begin_freetext}\"[^"]*\"{end_freetext} {
|
|
// no trailing * -- freetext only
|
|
STRING_VALUE(_FREETEXT,TRUE,TRUE);
|
|
}
|
|
%{// unquoted, with trailing * or ** %}
|
|
{begin_freetext}[^{]*{end_freetext}\* {
|
|
// trailing * has to be for inflection -
|
|
// process it in shortgen on next pass.
|
|
// Grab freetext now.
|
|
yyless(yyleng-1);
|
|
BEGIN shortgen;
|
|
STRING_VALUE(_FREETEXT,TRUE,FALSE);
|
|
}
|
|
%{// unquoted, without trailing * or ** %}
|
|
{begin_freetext}[^{]*{end_freetext} {
|
|
// no trailing * -- freetext only
|
|
STRING_VALUE(_FREETEXT,TRUE,FALSE);
|
|
}
|
|
%{// ******************
|
|
// shorthand FREETEXT
|
|
// ****************** %}
|
|
|
|
[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*[ ] {
|
|
// For backward compatibility, we want to special
|
|
// case and recognize the "not" operator when it
|
|
// is immediately followed by a mode specifier character
|
|
// (@, $, #). For e.g. "not@size > 2" should be treated
|
|
// as if we have a "not" operator followed by "@size > 2".
|
|
// Without this special case, "not@size > 2" gets recognized
|
|
// as free text.
|
|
|
|
if (IsNotOperator())
|
|
{
|
|
yyless(3);
|
|
BEGIN INITIAL;
|
|
TOKEN(_NOT);
|
|
}
|
|
|
|
yaccDebugOut(( DEB_ITRACE, "fTreatFreetextAsPhrase is %d\n", fTreatFreetextAsPhrase ));
|
|
if (fTreatFreetextAsPhrase)
|
|
BEGIN implicitphrase;
|
|
else
|
|
BEGIN infreefreetext;
|
|
|
|
fTreatFreetextAsPhrase = FALSE;
|
|
yymore();
|
|
}
|
|
[^#$@~&|<>=!\^*"()\{ ][^&~|{) ]*    {
                    // IsNotOperator is used here for the same reason as in
                    // the rule for free text followed by a blank, except
                    // that this rule covers situations where we have no
                    // spaces in the query. E.g. "not@size>2".
                    // This should be equivalent to
                    // "not@size > 2", which in turn should be equivalent to
                    // "not @size > 2"

                    if (IsNotOperator())
                    {
                        // Keep only "not" and rescan the rest.
                        yyless(3);
                        BEGIN INITIAL;
                        TOKEN(_NOT);
                    }

                    // STRING_VALUE returns from the action whenever the
                    // token is not empty, so the flag has to be read and
                    // cleared before the macro is used. Clearing it after
                    // the macro would never run for a non-empty token.
                    BOOL fAsPhrase = fTreatFreetextAsPhrase;
                    fTreatFreetextAsPhrase = FALSE;

                    if (fAsPhrase)
                    {
                        STRING_VALUE(_PHRASE,FALSE,FALSE);
                    }
                    else
                    {
                        STRING_VALUE(_FREETEXT,FALSE,FALSE);
                    }
                  }
|
|
|
|
|
|
%{// *************
|
|
// VECTOR VALUES
|
|
// ************* %}
|
|
|
|
%{// quoted multi-value vector - has ; separator. Singlets caught in parser %}
|
|
\([ ]*\"[^"]*\"[ ]*; { BEGIN invector; yyless(1);}
|
|
%{// unquoted multi-value vector - has ; separator. Singlets caught in parser %}
|
|
\([^(;)]+; { BEGIN invector; yyless(1);}
|
|
|
|
|
|
%{//
|
|
// INNEAR: longhand NEAR processing
|
|
//
|
|
%}
|
|
<innear>{white} {}
|
|
<innear>, {}
|
|
<innear>dist[ ]*=[ ]*[0-9]+ { STRING_VALUE(_NEARDIST,TRUE,FALSE);}
|
|
<innear>unit[ ]*=[ ]*{word} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
|
|
<innear>unit[ ]*=[ ]*{sent} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
|
|
<innear>unit[ ]*=[ ]*{par} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
|
|
<innear>unit[ ]*=[ ]*{chap} { STRING_VALUE(_NEARUNIT,TRUE,FALSE);}
|
|
<innear>\} { BEGIN implicitphrase; TOKEN (_NEAR_END);}
|
|
|
|
%{//
|
|
// INVECTOR: multi value vector processing
|
|
//
|
|
%}
|
|
<invector>{white} {}
|
|
<invector>; {}
|
|
<invector>\"[^"]*\" { STRING_VALUE(_VECTORELEMENT, FALSE, TRUE);}
|
|
<invector>[^ ";)][^;)]*; { STRING_VALUE(_VECTORELEMENT, TRUE, FALSE);}
|
|
<invector>[^ ";)][^;)]*\) {
|
|
// Need to emit _VECTORELEMENT and _VE_END -- so backup 1
|
|
// so we can emit _VE_END on next pass
|
|
yyless(yyleng-1);
|
|
STRING_VALUE(_VECTORELEMENT, FALSE, FALSE);
|
|
}
|
|
<invector>\) { BEGIN INITIAL; TOKEN (_VE_END); }
|
|
|
|
%{//
|
|
// INFREEFREETEXT: shorthand FREETEXT processing
|
|
//
|
|
// NOTE: and, or, near need to be localized %}
|
|
<infreefreetext>[ ]+ { yymore(); }
|
|
<infreefreetext>{and}[ ] {
|
|
yyless(yyleng-4);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>{and}\{ {
|
|
yyless(yyleng-4);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>{or}[ ] {
|
|
yyless(yyleng-3);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>{or}\{ {
|
|
yyless(yyleng-3);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>{near}[ ] {
|
|
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}[ ]\n" ));
|
|
yyless(yyleng-5);
|
|
fTreatFreetextAsPhrase = TRUE;
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_PHRASE,FALSE,FALSE);
|
|
}
|
|
|
|
<infreefreetext>{near}\{ {
|
|
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{near}{\n" ));
|
|
yyless(yyleng-5);
|
|
fTreatFreetextAsPhrase = TRUE;
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_PHRASE,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\{{near}[ ] {
|
|
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}\n" ));
|
|
yyless(yyleng-6);
|
|
fTreatFreetextAsPhrase = TRUE;
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_PHRASE,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\{{near}\{ {
|
|
yaccDebugOut(( DEB_ITRACE, "{infreefreetext}{{near}{\n" ));
|
|
yyless(yyleng-6);
|
|
fTreatFreetextAsPhrase = TRUE;
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_PHRASE,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>& {
|
|
yyless(yyleng-1);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\| {
|
|
yyless(yyleng-1);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>~ {
|
|
yyless(yyleng-1);
|
|
fTreatFreetextAsPhrase = TRUE;
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_PHRASE,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\( {
|
|
yyless(yyleng-1);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\) {
|
|
yyless(yyleng-1);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\{ {
|
|
yyless(yyleng-1);
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>\"[^"]+\" {
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
<infreefreetext>[^~&|{}()" ]+[ ] { yymore(); }
|
|
<infreefreetext>[^~&|{}()" ]+ {
|
|
BEGIN INITIAL;
|
|
STRING_VALUE(_FREETEXT,FALSE,FALSE);
|
|
}
|
|
|
|
%{//
|
|
// SHORTGEN: * or ** processing
|
|
//
|
|
// can only get here by backing up over *,
|
|
// so we will always find a match %}
|
|
<shortgen>\*\* {
|
|
BEGIN INITIAL;
|
|
TOKEN(_SHGENINFLECT);
|
|
}
|
|
<shortgen>\* {
|
|
BEGIN INITIAL;
|
|
TOKEN(_SHGENPREFIX);
|
|
}
|
|
|
|
%{//
|
|
// SHORTREGEX: #propname processing
|
|
//
|
|
// can only get here when #"propname" or #propname
|
|
// (quoted or unquoted) version is detected.
|
|
// NOTE: and, or need to be localized
|
|
// NOTE: It doesn't make sense to have the near operator following
|
|
// a regular expression. A regex is Boolean and doesn't evaluate
|
|
// to a position value.
|
|
//
|
|
//
|
|
|
|
%}
|
|
<shortregex>[ ]+ { yymore(); }
|
|
<shortregex>= {
    // Equal operators are ignored: no token is returned and the scanner
    // stays in the shortregex state (the BEGIN re-selects the state it is
    // already in).
    BEGIN shortregex;
}
|
|
<shortregex>\"[^"]*\" {
    // A double-quoted run, possibly empty between the quotes, is emitted
    // as a regex token with the quote argument set.  The start state is
    // not changed by this rule.
    STRING_VALUE(_REGEX, FALSE, TRUE);
}
|
|
<shortregex>{and}[ ] {
    // NOTE(review): presumably tells the scanner to resume regex scanning
    // after the operator -- confirm where fContinueRegex is consumed.
    fContinueRegex = TRUE;
    // Push the and-operator and its trailing blank back (assumes the and
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
<shortregex>{or}[ ] {
    // NOTE(review): presumably tells the scanner to resume regex scanning
    // after the operator -- confirm where fContinueRegex is consumed.
    fContinueRegex = TRUE;
    // Push the or-operator and its trailing blank back (assumes the or
    // definition matches exactly two characters -- confirm).
    yyless(yyleng-3);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
|
|
<shortregex>{not}[ ] {
    // Push the not-operator and its trailing blank back (assumes the not
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a regex: no token is returned.  Back off
    // and let the lexer take its normal course.
    fContinueRegex = TRUE;
    BEGIN INITIAL;
}
|
|
<shortregex>& {
    // NOTE(review): presumably tells the scanner to resume regex scanning
    // after the operator -- confirm where fContinueRegex is consumed.
    fContinueRegex = TRUE;
    // Push the ampersand back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
<shortregex>\| {
    // NOTE(review): presumably tells the scanner to resume regex scanning
    // after the operator -- confirm where fContinueRegex is consumed.
    fContinueRegex = TRUE;
    // Push the vertical bar back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
|
|
<shortregex>! {
    // Push the exclamation mark back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a token here.  Back off and let the lexer
    // take its normal course.
    fContinueRegex = TRUE;
    BEGIN INITIAL;
}
|
|
%{
// When we find an operator we should treat it as one.
// So back up and get out if you see one.
// Normally '^' is treated as part of an operator (e.g. ^a), but it also
// has a special meaning in regular expression syntax. So we will have to
// let it through when it is part of a regular expression. As an alternative,
// we can allow '^' in regular expressions in a limited manner (i.e. only the use
// in square brackets to exclude the set of chars "[^abc]" where abc are excluded).
// This alternative permits the common-case use of '^' in a regular expression
// while allowing it to be treated as part of an operator when it doesn't
// occur immediately after a '['.
// We are implementing the alternative here because our regex capability
// only allows for the "[^" construct.
%}
|
|
<shortregex>[\^<>@$#] {
    // One of the operator characters in the bracket set above: push it
    // back, clear the continue flag and return to INITIAL so that it is
    // scanned as an operator.  No token is returned from this rule.
    yyless(yyleng-1);
    fContinueRegex = FALSE;
    BEGIN INITIAL;
}
|
|
<shortregex>\( {
    // An opening parenthesis ends the regex text.  Push it back for
    // INITIAL and emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
<shortregex>\) {
    // A closing parenthesis ends the regex text.  Push it back for
    // INITIAL and emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
<shortregex>\{ {
    // An opening brace ends the regex text.  Push it back for INITIAL
    // and emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+[ ] {
    // Regex text followed by one blank: keep it and let the next match
    // be appended.
    yymore();
}
|
|
<shortregex>(([^~&|{}()\^<>!@$#= ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$#= ])*)+ {
    // Regex text that is not followed by a blank completes the token.
    // NOTE(review): fContinueRegex presumably lets a following operand be
    // scanned as a regex again -- confirm where the flag is consumed.
    fContinueRegex = TRUE;
    BEGIN INITIAL;
    // Emit everything accumulated so far, if it is not blank.
    STRING_VALUE(_REGEX,FALSE,FALSE);
}
|
|
|
|
|
|
<mayberegex>{and}[ ] {
    // Push the and-operator and its trailing blank back (assumes the and
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    // NOTE(review): presumably resumes maybe-regex scanning after the
    // operator -- confirm where fContinueMaybeRegex is consumed.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
<mayberegex>{or}[ ] {
    // Push the or-operator and its trailing blank back (assumes the or
    // definition matches exactly two characters -- confirm).
    yyless(yyleng-3);
    // NOTE(review): presumably resumes maybe-regex scanning after the
    // operator -- confirm where fContinueMaybeRegex is consumed.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
|
|
<mayberegex>{not}[ ] {
    // Push the not-operator and its trailing blank back (assumes the not
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a regex: no token is returned.  Back off
    // and let the lexer take its normal course.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
}
|
|
<mayberegex>& {
    // NOTE(review): presumably resumes maybe-regex scanning after the
    // operator -- confirm where fContinueMaybeRegex is consumed.
    fContinueMaybeRegex = TRUE;
    // Push the ampersand back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
<mayberegex>\| {
    // NOTE(review): presumably resumes maybe-regex scanning after the
    // operator -- confirm where fContinueMaybeRegex is consumed.
    fContinueMaybeRegex = TRUE;
    // Push the vertical bar back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
|
|
<mayberegex>! {
    // Push the exclamation mark back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a token here.  Back off and let the lexer
    // take its normal course.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
}
|
|
<mayberegex>\( {
    // An opening parenthesis ends the text.  Push it back for INITIAL
    // and emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
<mayberegex>\) {
    // A closing parenthesis ends the text.  Push it back for INITIAL
    // and emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
<mayberegex>\{ {
    // An opening brace ends the text.  Push it back for INITIAL and
    // emit the preceding text if it is not blank.
    yyless(yyleng-1);
    BEGIN INITIAL;
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
<mayberegex>[ ]+ {
    // Blanks are kept; the next match is appended to them.
    yymore();
}
|
|
<mayberegex>\"[^"]*\" {
    // A double-quoted run, possibly empty between the quotes, is emitted
    // as a phrase token with the quote argument set.  The start state is
    // not changed by this rule.
    STRING_VALUE(_PHRASE, FALSE, TRUE);
}
|
|
<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+[ ] {
    // Text followed by one blank: keep it and let the next match be
    // appended.
    yymore();
}
|
|
<mayberegex>(([^~&|{}()\^<>!@$# ])*(\|[()\[{}\],*?+])*(\|\[\^)*([^~&|{}()\^<>!@$# ])*)+ {
    // Text that is not followed by a blank completes the token.
    // NOTE(review): fContinueMaybeRegex presumably lets a following
    // operand be scanned the same way -- confirm where it is consumed.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
    // Emit everything accumulated so far, if it is not blank.
    STRING_VALUE(_PHRASEORREGEX,FALSE,FALSE);
}
|
|
%{
// When we find an operator at the start of a phrase,
// we should treat it as one. So back up and get out if you see one.
%}
|
|
<mayberegex>[\^<>@$#] {
    // One of the operator characters in the bracket set above: push it
    // back, clear the continue flag and return to INITIAL so that it is
    // scanned as an operator.  No token is returned from this rule.
    yyless(yyleng-1);
    fContinueMaybeRegex = FALSE;
    BEGIN INITIAL;
}
|
|
|
|
|
|
%{//
// IMPLICITPHRASE: Where phrase is implied.
//
// can only get here when @propname or {prop name = propname} is detected.
// NOTE: and, or, not need to be localized when time permits.
//
// NTRAID#DB-NTBUG9-84571-2000/07/31-dlee Indexing Service tripolish2 query expressions misinterpreted as strings
// if expression has trailing blanks, we'll emit a string value
%}
|
|
<implicitphrase>\"[^"]*\" {
    // A double-quoted run, possibly empty between the quotes, is a
    // complete phrase.  Clear the continue flag, return to INITIAL and
    // emit the token with the quote argument set.
    fContinueImplicitPhrase = FALSE;
    BEGIN INITIAL;
    STRING_VALUE(_PHRASE, FALSE, TRUE);
}
|
|
<implicitphrase>[ ]+ {
    // Blanks are kept; the next match is appended to them.
    yymore();
}
|
|
<implicitphrase>{and}[ ] {
    // Push the and-operator and its trailing blank back (assumes the and
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    // NOTE(review): presumably resumes implicit-phrase scanning after the
    // operator -- confirm where fContinueImplicitPhrase is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>{or}[ ] {
    // Push the or-operator and its trailing blank back (assumes the or
    // definition matches exactly two characters -- confirm).
    yyless(yyleng-3);
    // NOTE(review): presumably resumes implicit-phrase scanning after the
    // operator -- confirm where fContinueImplicitPhrase is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>{near}[ ] {
    // Push the near-operator and its trailing blank back (assumes the
    // near definition matches exactly four characters -- confirm).
    yyless(yyleng-5);
    // We want to treat the following token as a phrase
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>{near}\{ {
    // Push the near-operator and the opening brace that follows it back
    // (assumes the near definition matches exactly four characters --
    // confirm).
    yyless(yyleng-5);
    // We want to treat the following token as a phrase
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>{not}[ ] {
    // Push the not-operator and its trailing blank back (assumes the not
    // definition matches exactly three characters -- confirm).
    yyless(yyleng-4);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a phrase: no token is returned.  Back off
    // and let the lexer take its normal course.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
}
|
|
<implicitphrase>& {
    // Push the ampersand back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // NOTE(review): presumably resumes implicit-phrase scanning after the
    // operator -- confirm where fContinueImplicitPhrase is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>~ {
    // Push the tilde back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // We want to treat the following token as a phrase
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
|
|
<implicitphrase>! {
    // Push the exclamation mark back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // The only valid way to get here is to have seen an and-operator
    // before.  Do not recognize a phrase: no token is returned.  Back off
    // and let the lexer take its normal course.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
}
|
|
<implicitphrase>\| {
    // Push the vertical bar back so it is rescanned in INITIAL.
    yyless(yyleng-1);
    // NOTE(review): presumably resumes implicit-phrase scanning after the
    // operator -- confirm where fContinueImplicitPhrase is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>\( {
    // An opening parenthesis ends the phrase.  Push it back for INITIAL,
    // clear the continue flag and emit the preceding text if not blank.
    yyless(yyleng-1);
    fContinueImplicitPhrase = FALSE;
    BEGIN INITIAL;
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>\) {
    // A closing parenthesis ends the phrase.  Push it back for INITIAL,
    // clear the continue flag and emit the preceding text if not blank.
    yyless(yyleng-1);
    fContinueImplicitPhrase = FALSE;
    BEGIN INITIAL;
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>\{ {
    // An opening brace ends the phrase.  Push it back for INITIAL.
    // Unlike the parenthesis rules, the continue flag is set here.
    yyless(yyleng-1);
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the brace, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
<implicitphrase>{contains}[ ] {
    // Push the contains-operator and its trailing blank back (assumes
    // the contains definition matches exactly eight characters --
    // confirm).
    yyless(yyleng-9);
    // NOTE(review): presumably resumes implicit-phrase scanning after the
    // operator -- confirm where fContinueImplicitPhrase is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit the text that preceded the operator, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|
|
%{
// When we find an operator at the start of an implicit phrase,
// we should treat it as one. So back up and get out if you see one.
%}
|
|
<implicitphrase>[\^<>@$#] {
    // One of the operator characters in the bracket set above: push it
    // back, clear the continue flag and return to INITIAL so that it is
    // scanned as an operator.  No token is returned from this rule.
    yyless(yyleng-1);
    fContinueImplicitPhrase = FALSE;
    BEGIN INITIAL;
}
|
|
%{
// Tripolish2 uses = to indicate that whatever appears after it may
// be using wildcards. Implement that here.
%}
|
|
<implicitphrase>= {
    // Push the equals sign back so it is rescanned in INITIAL.  No token
    // is returned from this rule.
    yyless(yyleng-1);
    // NOTE(review): presumably makes the operand after the equals sign
    // scan in the mayberegex state -- confirm where the flag is consumed.
    fContinueMaybeRegex = TRUE;
    BEGIN INITIAL;
}
|
|
<implicitphrase>[^~&|{}()\^<>=!@$# ]+[ ] {
    // A word followed by one blank: keep it and let the next match be
    // appended.
    yymore();
}
|
|
<implicitphrase>[^~&|{}()\^<>=!@$# ]+ {
    // A word that is not followed by a blank completes the phrase.
    // NOTE(review): fContinueImplicitPhrase presumably lets a following
    // operand be scanned as a phrase again -- confirm where it is consumed.
    fContinueImplicitPhrase = TRUE;
    BEGIN INITIAL;
    // Emit everything accumulated so far, if it is not blank.
    STRING_VALUE(_PHRASE,FALSE,FALSE);
}
|