/*++

Copyright (c) 2000  Microsoft Corporation

Module Name:

    adllexer.cpp

Abstract:

    Implementation of the lexer for the ADL language

Author:

    t-eugenz - August 2000

Environment:

    User mode only.

Revision History:

    Created - August 2000

--*/

#include "adl.h"

//
// Constant values outside WCHAR range, for special characters.
//
// Every code is greater than 65535, so it cannot be mistaken for an
// ordinary 16-bit character value once a raw input character has been
// translated through the lexer's special-character map.
//

#define CHAR_COMMA          65538
#define CHAR_QUOTE          65539
#define CHAR_SEMICOLON      65540
#define CHAR_OPENPAREN      65541
#define CHAR_CLOSEPAREN     65542
#define CHAR_NULL           65543
#define CHAR_NEWLINE        65544
#define CHAR_RETURN         65545
#define CHAR_TAB            65546
#define CHAR_SPACE          65547
#define CHAR_AT             65548
#define CHAR_SLASH          65549
#define CHAR_PERIOD         65550

//
// States of the lexer DFA
//

#define STATE_WHITESPACE    0   // skipping whitespace before a token
#define STATE_BEGIN         1   // looking at the first character of a token
#define STATE_IDENT         2   // accumulating an unquoted identifier
#define STATE_QUOTE         3   // accumulating the body of a quoted string
#define STATE_DONE          4   // token complete, leave the DFA loop

//
// If the character is found in the special character map, use the special
// symbol (>65535), otherwise use the regular character value.
//
//   CHAR    - the character to translate
//   MAP     - the special character map (anything with a map-style find())
//   ITER    - scratch iterator; overwritten with the result of MAP.find(CHAR)
//   ITEREND - iterator compared against to detect "not found" (MAP.end())
//
// NOTE: CHAR is evaluated a second time when it is not found in MAP, so it
// must not be an expression with side effects.
//

#define RESOLVE_CHAR(CHAR, MAP, ITER, ITEREND) \
    ((((ITER) = (MAP).find((CHAR)) ) == (ITEREND) ) ? (CHAR) : (*(ITER)).second)


AdlLexer::AdlLexer(IN const WCHAR *input,
                   IN OUT AdlStatement *adlStat,
                   IN const PADL_LANGUAGE_SPEC pLang)
/*++

Routine Description:

    Constructor for the AdlLexer. Initializes the mapping for finding special
    characters, the mapping from keyword strings to token types, and the
    initial position / token count.

Arguments:

    input   -   The input string. Only the pointer is stored.

    adlStat -   The AdlStatement instance, for token garbage collection

    pLang   -   The ADL language description. Supplies the special
                characters (CH_*) and the keyword strings (SZ_TK_*).
                It is dereferenced here without a NULL check.

Return Value:

    none

--*/

{

    _input = input;
    _pLang = pLang;
    _adlStat = adlStat;

    _position = 0;
    _tokCount = 0;


    //
    // Special character mapping: language-specific character -> CHAR_* code
    //

    _mapCharCode[_pLang->CH_NULL] = CHAR_NULL;
    _mapCharCode[_pLang->CH_SPACE] = CHAR_SPACE;
    _mapCharCode[_pLang->CH_TAB] = CHAR_TAB;
    _mapCharCode[_pLang->CH_NEWLINE] = CHAR_NEWLINE;
    _mapCharCode[_pLang->CH_RETURN] = CHAR_RETURN;
    _mapCharCode[_pLang->CH_QUOTE] = CHAR_QUOTE;
    _mapCharCode[_pLang->CH_COMMA] = CHAR_COMMA;
    _mapCharCode[_pLang->CH_SEMICOLON] = CHAR_SEMICOLON;
    _mapCharCode[_pLang->CH_OPENPAREN] = CHAR_OPENPAREN;
    _mapCharCode[_pLang->CH_CLOSEPAREN] = CHAR_CLOSEPAREN;
    _mapCharCode[_pLang->CH_AT] = CHAR_AT;
    _mapCharCode[_pLang->CH_SLASH] = CHAR_SLASH;
    _mapCharCode[_pLang->CH_PERIOD] = CHAR_PERIOD;

    //
    // Only find end of map once. This is cached after the map has been
    // populated and is the ITEREND argument used with RESOLVE_CHAR.
    //

    _iterEnd = _mapCharCode.end();


    //
    // Place all special tokens into a map, for O(log n) string searches
    //

    _mapStringToken[_pLang->SZ_TK_AND] = TK_AND;
    _mapStringToken[_pLang->SZ_TK_EXCEPT] = TK_EXCEPT;
    _mapStringToken[_pLang->SZ_TK_ON] = TK_ON;
    _mapStringToken[_pLang->SZ_TK_ALLOWED] = TK_ALLOWED;
    _mapStringToken[_pLang->SZ_TK_AS] = TK_AS;
    _mapStringToken[_pLang->SZ_TK_THIS_OBJECT] = TK_THIS_OBJECT;
    _mapStringToken[_pLang->SZ_TK_CONTAINERS] = TK_CONTAINERS;
    _mapStringToken[_pLang->SZ_TK_OBJECTS] = TK_OBJECTS;
    _mapStringToken[_pLang->SZ_TK_CONTAINERS_OBJECTS] = TK_CONTAINERS_OBJECTS;
    _mapStringToken[_pLang->SZ_TK_NO_PROPAGATE] = TK_NO_PROPAGATE;

}


static DWORD AdlSingleCharTokenType(DWORD dwInput)
/*++

Routine Description:

    Maps the CHAR_* code of a single-character token to its token type.

Arguments:

    dwInput -   A character code as produced by RESOLVE_CHAR

Return Value:

    DWORD   -   The matching TK_* value, or TK_ERROR if dwInput is not one
                of the single-character tokens

--*/
{
    switch( dwInput )
    {
    case CHAR_COMMA:
        return TK_COMMA;

    case CHAR_OPENPAREN:
        return TK_OPENPAREN;

    case CHAR_CLOSEPAREN:
        return TK_CLOSEPAREN;

    case CHAR_SEMICOLON:
        return TK_SEMICOLON;

    case CHAR_AT:
        return TK_AT;

    case CHAR_SLASH:
        return TK_SLASH;

    case CHAR_PERIOD:
        return TK_PERIOD;

    default:
        return TK_ERROR;
    }
}


DWORD AdlLexer::NextToken(OUT AdlToken **value)
/*++

Routine Description:

    This retrieves the next token from the input string. This is basically a
    DFA which begins in the WHITESPACE state, and runs until it reaches
    the DONE state, at which point it returns a token.

    The very first call does not read any input: it returns the grammar
    type from the language description and leaves *value untouched.

Arguments:

    value   -   Pointer to a new token containing the string value
                is stored in *value. The token is also handed to the
                AdlStatement via AddToken() and recorded via
                SetErrorToken().

Return Value:

    DWORD   -   The token type, as #define'd by YACC in tokens.h
                0 is returned when the end of the input is reached
                (unless the empty string is itself a registered keyword).

Throws:

    AdlStatement::ERROR_UNTERMINATED_STRING if the end of input, a tab, a
    newline or a return character is found inside a quoted string.

--*/
{

    //
    // Initial DFA state
    //

    DWORD state = STATE_WHITESPACE;

    DWORD tokType = TK_ERROR;

    wstring curToken;

    DWORD dwInput;

    DWORD dwTokStart = 0;

    //
    // First token should be the grammar type
    //

    if( _tokCount == 0 )
    {
        _tokCount++;
        return _pLang->dwLanguageType;
    }

    while( state != STATE_DONE )
    {
        //
        // Classify the character at the current position. States that
        // consume a character only advance _position; the next character
        // is classified here on the following pass through the loop.
        //

        dwInput = RESOLVE_CHAR(_input[_position],
                               _mapCharCode,
                               _iter,
                               _iterEnd);

        switch( state )
        {

        case STATE_WHITESPACE:

            switch( dwInput )
            {

            case CHAR_NULL:
                //
                // End of input: report token type 0
                //
                tokType = 0;
                state = STATE_DONE;
                break;

            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_SPACE:
            case CHAR_TAB:
                //
                // Skip whitespace
                //
                _position++;
                break;

            default:
                state = STATE_BEGIN;
                break;
            }

            break;

        case STATE_BEGIN:

            dwTokStart = _position;

            tokType = TK_ERROR;

            switch( dwInput )
            {
            case CHAR_NULL:
                state = STATE_DONE;
                break;

            case CHAR_COMMA:
            case CHAR_OPENPAREN:
            case CHAR_CLOSEPAREN:
            case CHAR_SEMICOLON:
            case CHAR_AT:
            case CHAR_SLASH:
            case CHAR_PERIOD:

                //
                // Same action for all special single-char tokens
                //

                tokType = AdlSingleCharTokenType(dwInput);
                curToken.append( &(_input[_position]), 1 );
                _position++;
                state = STATE_DONE;
                break;

            case CHAR_QUOTE:

                //
                // Opening quote is consumed but not stored in the token
                //

                _position++;
                state = STATE_QUOTE;
                tokType = TK_IDENT;
                break;

            default:
                state = STATE_IDENT;
                tokType = TK_IDENT;
                break;
            }

            break;

        case STATE_IDENT:

            switch( dwInput )
            {
            case CHAR_NULL:
            case CHAR_COMMA:
            case CHAR_OPENPAREN:
            case CHAR_CLOSEPAREN:
            case CHAR_SEMICOLON:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
            case CHAR_TAB:
            case CHAR_SPACE:
            case CHAR_AT:
            case CHAR_SLASH:
            case CHAR_PERIOD:
            case CHAR_QUOTE:

                //
                // Any special character ends the identifier. It is not
                // consumed, so the next call starts on it.
                //

                state = STATE_DONE;
                break;

            default:
                curToken.append( &(_input[_position]), 1 );
                _position++;
                break;
            }

            break;

        case STATE_QUOTE:

            switch( dwInput )
            {
            case CHAR_NULL:
            case CHAR_TAB:
            case CHAR_NEWLINE:
            case CHAR_RETURN:
                throw AdlStatement::ERROR_UNTERMINATED_STRING;

            case CHAR_QUOTE:

                //
                // Closing quote is consumed but not stored in the token
                //

                _position++;
                state = STATE_DONE;
                break;

            default:
                curToken.append( &(_input[_position]), 1 );
                _position++;
                break;
            }

            break;

        default:

            //
            // Should never get here, well-defined states. Leave the loop
            // anyway so that a build without asserts cannot spin forever.
            //

            assert(FALSE);
            state = STATE_DONE;
            break;
        }
    }

    //
    // Done state was reached
    // Export the string and column/row info in YACC-form here
    //
    // NOTE(review): when the end of input is hit with _position == 0 the
    // end offset below is computed as 0 - 1; confirm AdlToken tolerates it.
    //

    AdlToken *outVal;

    outVal = new AdlToken(curToken.c_str(), dwTokStart, _position - 1);

    _adlStat->AddToken(outVal);

    //
    // Check if the string is a special token, case-insensitive
    //
    // NOTE(review): this lookup is applied to every token, including the
    // contents of quoted strings, so a quoted keyword is still returned
    // as that keyword -- confirm this is intended.
    //

    if( _mapStringToken.find(outVal->GetValue()) != _mapStringToken.end() )
    {
        tokType = _mapStringToken[outVal->GetValue()];
    }


    *value = outVal;

    //
    // Set this token to be the error token. This way, if the string is
    // not accepted by the parser, we know at which token the parser failed
    // If another error occurs later, this value will be overwritten
    //

    _adlStat->SetErrorToken(outVal);

    _tokCount++;

    return tokType;
}
