// Copyright (c) 1999 Microsoft Corporation. All rights reserved. // // Declaration of Lexer. // //#define LIMITEDVBSCRIPT_LOGLEXER // §§ #include "stdinc.h" #include "enginc.h" #include "englex.h" #include "limits" #ifdef LIMITEDVBSCRIPT_LOGLEXER #include "englog.h" #endif ////////////////////////////////////////////////////////////////////// // Unicode/ASCII character classification inline bool iswasciialpha(WCHAR c) { return (c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'); } inline bool iswasciidigit(WCHAR c) { return c >= L'0' && c <= L'9'; } inline bool iswasciialnum(WCHAR c) { return iswasciialpha(c) || iswasciidigit(c); } inline WCHAR towasciilower(WCHAR c) { return (c >= L'A' && c <= L'Z') ? c + (L'a' - L'A') : c; } ////////////////////////////////////////////////////////////////////// // token tables const TokenKeysym g_TokenKeysyms[] = { { L'(', TOKEN_lparen }, { L')', TOKEN_rparen }, { L',', TOKEN_comma }, { L'-', TOKEN_op_minus }, { L'^', TOKEN_op_pow }, { L'*', TOKEN_op_mult }, { L'\\', TOKEN_op_div }, { L'+', TOKEN_op_plus }, { L'<', TOKEN_op_lt }, { L'>', TOKEN_op_gt }, { L'=', TOKEN_op_eq }, { L'\0', TOKEN_eof } }; const TokenKeyword g_TokenKeywords[] = { { L"sub", TOKEN_sub }, { L"dim", TOKEN_dim }, { L"if", TOKEN_if }, { L"then", TOKEN_then }, { L"end", TOKEN_end }, { L"elseif", TOKEN_elseif }, { L"else", TOKEN_else }, { L"set", TOKEN_set }, { L"call", TOKEN_call }, { L"not", TOKEN_op_not }, { L"mod", TOKEN_op_mod }, { L"is", TOKEN_is }, { L"and", TOKEN_and }, { L"or", TOKEN_or }, { NULL, TOKEN_eof } }; ////////////////////////////////////////////////////////////////////// // helper functions bool CheckOperatorType(Token t, bool fAcceptParens, bool fAcceptUnary, bool fAcceptBinary, bool fAcceptOverloadedAssignmentTokens) { switch (t) { case TOKEN_set: case TOKEN_sub: return fAcceptOverloadedAssignmentTokens; case TOKEN_lparen: case TOKEN_rparen: return fAcceptParens; case TOKEN_op_minus: return fAcceptUnary || fAcceptBinary; case TOKEN_op_not: return fAcceptUnary; case TOKEN_op_pow: case TOKEN_op_mult: case TOKEN_op_div: case TOKEN_op_mod: case TOKEN_op_plus: case TOKEN_op_lt: case TOKEN_op_leq: case TOKEN_op_gt: case TOKEN_op_geq: case TOKEN_op_eq: case TOKEN_op_neq: case TOKEN_is: case TOKEN_and: case TOKEN_or: return fAcceptBinary; } return false; } ////////////////////////////////////////////////////////////////////// // Lexer Lexer::Lexer(const WCHAR *pwszSource) : m_p(pwszSource), m_pNext(NULL), m_iLine(1), m_iColumn(1), m_t(TOKEN_sub) { this->Scan(); } void Lexer::Next() { assert(m_t != TOKEN_eof); if (m_pNext) { m_iColumn += (int)(m_pNext - m_p); m_p = m_pNext; m_pNext = NULL; } else { ++m_p; ++m_iColumn; } } void Lexer::Scan() { m_szStr[0] = L'\0'; m_iNum = 0; bool fLineBreak = m_t == TOKEN_linebreak; for (;;) { if (fLineBreak) { // line breaks tokens are reported on the line/column that they occur so this isn't isn't adjusted until the next pass ++m_iLine; m_iColumn = 1; } ScanMain(); if (!fLineBreak || m_t != TOKEN_linebreak) break; Next(); } #ifdef LIMITEDVBSCRIPT_LOGLEXER LogToken(*this); #endif } void Lexer::ScanMain() { for (;; this->Next()) { switch (*m_p) { case L'\0': // end of script m_t = TOKEN_eof; return; case L'\'': // comment till end of line for (; *m_p && *m_p != L'\n'; ++m_p) {} --m_p; // put one char back so the next loop can process it break; case L'\t': case L' ': // ignore horizontal white space break; case L'\r': // ignore carriage returns --m_iColumn; // in fact, they don't even count as characters break; case L'\n': // line break m_t = TOKEN_linebreak; return; default: if (*m_p == L'\"') { // string literal m_pNext = m_p + 1; char *pszDest = m_szStr; const char *pszMax = m_szStr + g_iMaxBuffer - 1; do { if (!iswascii(*m_pNext)) { this->Next(); // this will update the current position to the offending character -- indicating the correct column of the error this->err(LEXERR_NonAsciiCharacterInStringLiteral); return; } if (*m_pNext == L'\n' || *m_pNext == L'\r') { this->err(LEXERR_StringLiteralUnterminated); return; } if (*m_pNext == L'\"') { if (*++m_pNext != L'\"') break; // found terminating quote // There were two quotes, the escape sequence for a single quote. The first was skipped and we're all ready to append the second. } *pszDest++ = *m_pNext++; // we know this works because the character is ascii and those codes correspond to the same numbers in Unicode } while (pszDest <= pszMax); if (pszDest > pszMax) { this->err(LEXERR_StringLiteralTooLong); } else { *pszDest = L'\0'; m_t = TOKEN_stringliteral; } return; } if (iswasciidigit(*m_p)) { // numeric literal // Cant find a _wtoi like function that handles overflow so do the conversion myself. // §§ Look at runtime version to be sure these aren't constantly recomputed const int iMaxChop = std::numeric_limits::max() / 10; // if number gets bigger than this and there's another digit then we're going to overflow const WCHAR wchMaxLast = std::numeric_limits::max() % 10 + L'0'; // if number equals iMaxChop and the next digit is bigger than this then we're going to overflow m_pNext = m_p; m_iNum = 0; do { m_iNum *= 10; m_iNum += *m_pNext++ - L'0'; } while (iswasciidigit(*m_pNext) && (m_iNum < iMaxChop || (m_iNum == iMaxChop && *m_pNext <= wchMaxLast))); if (iswasciidigit(*m_pNext)) this->err(LEXERR_NumericLiteralTooLarge); else m_t = TOKEN_numericliteral; return; } if (!iswasciialpha(*m_p) && !(*m_p == L'_')) { // look for a token in the table of symbols for (int i = 0; g_TokenKeysyms[i].c; ++i) { if (*m_p == g_TokenKeysyms[i].c) { // we have a match m_t = g_TokenKeysyms[i].t; // check for the two-character symbols (>=, <=, <>) if (m_t == TOKEN_op_lt) { WCHAR wchNext = *(m_p + 1); if (wchNext == L'=') { m_t = TOKEN_op_leq; m_pNext = m_p + 2; } else if (wchNext == L'>') { m_t = TOKEN_op_neq; m_pNext = m_p + 2; } } else if (m_t == TOKEN_op_gt) { if (*(m_p + 1) == L'=') { m_t = TOKEN_op_geq; m_pNext = m_p + 2; } } return; } } // the symbol was not recognized this->err(LEXERR_InvalidCharacter); return; } // look for a token in the table of keywords for (int i = 0; g_TokenKeywords[i].s; ++i) { const WCHAR *pwchToken = g_TokenKeywords[i].s; const WCHAR *pwchSource = m_p; while (*pwchToken && *pwchSource && towasciilower(*pwchToken) == towasciilower(*pwchSource)) { ++pwchToken; ++pwchSource; } if (!*pwchToken && !iswasciialnum(*pwchSource)) { // made it to the end of Token and source word m_t = g_TokenKeywords[i].t; m_pNext = pwchSource; return; } } // must be an identifier for (m_pNext = m_p + 1; iswasciialnum(*m_pNext) || *m_pNext == L'_'; ++m_pNext) {} if (m_pNext - m_p > g_iMaxBuffer - 1) { this->err(LEXERR_IdentifierTooLong); return; } char *psz = m_szStr; for (const WCHAR *pwsz = m_p; pwsz < m_pNext; ++psz, ++pwsz) { *psz = *pwsz; } *psz = '\0'; if (*m_pNext == L'.') { ++m_pNext; m_t = TOKEN_identifierdot; } else { m_t = TOKEN_identifier; } return; } } } void Lexer::err(LexErr iErr) { static const char *s_rgpszErrorText[] = { "Unexpected error!", // shouldn't ever get this error "Invalid character", "Identifier too long", "String too long", "Unterminated string constant", "Number too large" }; assert(ARRAY_SIZE(s_rgpszErrorText) == LEXERR_Max); if (iErr <= 0 || iErr >= LEXERR_Max) { assert(false); iErr = LEXERR_NoError; } m_t = TOKEN_eof; m_iNum = iErr; // copy error into the buffer const char *psz = s_rgpszErrorText[iErr]; const char *pszMax = m_szStr + g_iMaxBuffer - 1; for (char *pszDest = m_szStr; pszDest < pszMax && *psz; *pszDest++ = *psz++) {} assert(!*psz); // since this function is used with hard-coded strings we shouldn't ever get one too long *pszDest = '\0'; }