609 lines
16 KiB
C++
609 lines
16 KiB
C++
//+---------------------------------------------------------------------------
|
|
//
|
|
// File: syntax.h
|
|
//
|
|
// Contents: syntax modules for regexpr
|
|
//
|
|
// Classes: perl_syntax, posix_syntax
|
|
//
|
|
// History: 3-29-00 ericne Created
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
#pragma once
|
|
|
|
#include <string>
|
|
#include <cwchar>
|
|
#include <iterator>
|
|
|
|
#ifndef ARRAYSIZE
|
|
#define ARRAYSIZE(x) (sizeof(x)/sizeof(*(x)))
|
|
#endif
|
|
|
|
#ifndef UCHAR_MAX
|
|
#define UCHAR_MAX 0xff
|
|
#endif
|
|
|
|
#ifndef WCHAR_MAX
|
|
#define WCHAR_MAX ((wchar_t)-1)
|
|
#endif
|
|
|
|
template<>
|
|
struct std::iterator_traits< const char * >
|
|
{ // get traits from iterator _Iter
|
|
typedef random_access_iterator_tag iterator_category;
|
|
typedef char value_type;
|
|
typedef ptrdiff_t difference_type;
|
|
typedef difference_type distance_type; // retained
|
|
typedef char * pointer;
|
|
typedef char & reference;
|
|
};
|
|
|
|
template<>
|
|
struct std::iterator_traits< const wchar_t * >
|
|
{ // get traits from iterator _Iter
|
|
typedef random_access_iterator_tag iterator_category;
|
|
typedef wchar_t value_type;
|
|
typedef ptrdiff_t difference_type;
|
|
typedef difference_type distance_type; // retained
|
|
typedef wchar_t * pointer;
|
|
typedef wchar_t & reference;
|
|
};
|
|
|
|
namespace regex
|
|
{
|
|
|
|
//
|
|
// The following are the tokens that can be emitted by the syntax module.
|
|
// Don't reorder this list!!!
|
|
//
|
|
enum TOKEN
|
|
{
|
|
NO_TOKEN = 0,
|
|
|
|
// REGULAR TOKENS
|
|
BEGIN_GROUP,
|
|
END_GROUP,
|
|
ALTERNATION,
|
|
BEGIN_LINE,
|
|
END_LINE,
|
|
BEGIN_CHARSET,
|
|
MATCH_ANY,
|
|
ESCAPE,
|
|
|
|
// QUANTIFICATION TOKENS
|
|
ONE_OR_MORE,
|
|
ZERO_OR_MORE,
|
|
ZERO_OR_ONE,
|
|
ONE_OR_MORE_MIN,
|
|
ZERO_OR_MORE_MIN,
|
|
ZERO_OR_ONE_MIN,
|
|
BEGIN_RANGE,
|
|
RANGE_SEPARATOR,
|
|
END_RANGE,
|
|
END_RANGE_MIN,
|
|
|
|
// ESCAPE SEQUENCES
|
|
ESC_DIGIT,
|
|
ESC_NOT_DIGIT,
|
|
ESC_SPACE,
|
|
ESC_NOT_SPACE,
|
|
ESC_WORD,
|
|
ESC_NOT_WORD,
|
|
ESC_BEGIN_STRING,
|
|
ESC_END_STRING,
|
|
ESC_END_STRING_z,
|
|
ESC_WORD_BOUNDARY,
|
|
ESC_NOT_WORD_BOUNDARY,
|
|
ESC_WORD_START,
|
|
ESC_WORD_STOP,
|
|
ESC_QUOTE_META_ON,
|
|
ESC_QUOTE_META_OFF,
|
|
|
|
// SUBSTITUTION TOKENS
|
|
SUBST_BACKREF,
|
|
SUBST_PREMATCH,
|
|
SUBST_POSTMATCH,
|
|
SUBST_MATCH,
|
|
SUBST_ESCAPE,
|
|
SUBST_QUOTE_META_ON,
|
|
SUBST_UPPER_ON,
|
|
SUBST_UPPER_NEXT,
|
|
SUBST_LOWER_ON,
|
|
SUBST_LOWER_NEXT,
|
|
SUBST_ALL_OFF,
|
|
|
|
// CHARSET TOKENS
|
|
CHARSET_NEGATE,
|
|
CHARSET_ESCAPE,
|
|
CHARSET_RANGE,
|
|
CHARSET_BACKSPACE,
|
|
CHARSET_END,
|
|
CHARSET_ALNUM,
|
|
CHARSET_ALPHA,
|
|
CHARSET_BLANK,
|
|
CHARSET_CNTRL,
|
|
CHARSET_DIGIT,
|
|
CHARSET_GRAPH,
|
|
CHARSET_LOWER,
|
|
CHARSET_PRINT,
|
|
CHARSET_PUNCT,
|
|
CHARSET_SPACE,
|
|
CHARSET_UPPER,
|
|
CHARSET_XDIGIT,
|
|
|
|
// EXTENSION TOKENS
|
|
EXT_NOBACKREF,
|
|
EXT_POS_LOOKAHEAD,
|
|
EXT_NEG_LOOKAHEAD,
|
|
EXT_POS_LOOKBEHIND,
|
|
EXT_NEG_LOOKBEHIND,
|
|
EXT_INDEPENDENT,
|
|
EXT_UNKNOWN
|
|
};
|
|
|
|
struct posix_charset_type
|
|
{
|
|
const char * const szcharset;
|
|
const size_t cchars;
|
|
|
|
posix_charset_type( const char * const sz, const size_t c )
|
|
: szcharset(sz), cchars(c) {}
|
|
};
|
|
|
|
extern const posix_charset_type g_rgposix_charsets[];
|
|
extern const size_t g_cposix_charsets;
|
|
|
|
template< typename const_iterator >
|
|
bool is_posix_charset( const_iterator icur, const_iterator iend, const char * szcharset )
|
|
{
|
|
for( ; icur != iend && '\0' != *szcharset; ++icur, ++szcharset )
|
|
{
|
|
if( *icur != *szcharset )
|
|
return false;
|
|
}
|
|
return '\0' == *szcharset;
|
|
}
|
|
|
|
//
|
|
// The perl_syntax class encapsulates the Perl 5 regular expression syntax. It is
|
|
// used as a template parameter to basic_rpattern. To customize regex syntax, create
|
|
// your own syntax class and use it as a template parameter instead.
|
|
//
|
|
|
|
class perl_syntax_base
|
|
{
|
|
protected:
|
|
static TOKEN s_rgreg[ UCHAR_MAX + 1 ];
|
|
static TOKEN s_rgescape[ UCHAR_MAX + 1 ];
|
|
|
|
struct init_perl_syntax;
|
|
friend struct init_perl_syntax;
|
|
static struct init_perl_syntax
|
|
{
|
|
init_perl_syntax();
|
|
} s_init_perl_syntax;
|
|
|
|
static inline TOKEN look_up( char ch, TOKEN rg[] ) { return rg[ (unsigned char)ch ]; }
|
|
static inline TOKEN look_up( wchar_t ch, TOKEN rg[] ) { return UCHAR_MAX < ch ? NO_TOKEN : rg[ (unsigned char)ch ]; }
|
|
};
|
|
|
|
template< typename CH >
|
|
class perl_syntax : protected perl_syntax_base
|
|
{
|
|
public:
|
|
typedef std::basic_string<CH>::iterator iterator;
|
|
typedef std::basic_string<CH>::const_iterator const_iterator;
|
|
typedef CH char_type;
|
|
|
|
private:
|
|
static bool min_quant( iterator & icur, const_iterator iend )
|
|
{
|
|
return ( (const_iterator)++icur != iend && CH('?') == *icur ? (++icur,true) : false );
|
|
}
|
|
|
|
public:
|
|
static TOKEN reg_token( iterator & icur, const_iterator iend )
|
|
{
|
|
assert( (const_iterator)icur != iend );
|
|
TOKEN tok = look_up( *icur, s_rgreg );
|
|
if( tok )
|
|
++icur;
|
|
if( ESCAPE == tok && (const_iterator)icur != iend )
|
|
{
|
|
tok = look_up( *icur, s_rgescape );
|
|
if( tok )
|
|
++icur;
|
|
else
|
|
tok = ESCAPE;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN quant_token( iterator & icur, const_iterator iend )
|
|
{
|
|
assert( (const_iterator)icur != iend );
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case CH('*'):
|
|
tok = min_quant( icur, iend ) ? ZERO_OR_MORE_MIN : ZERO_OR_MORE;
|
|
break;
|
|
case CH('+'):
|
|
tok = min_quant( icur, iend ) ? ONE_OR_MORE_MIN : ONE_OR_MORE;
|
|
break;
|
|
case CH('?'):
|
|
tok = min_quant( icur, iend ) ? ZERO_OR_ONE_MIN : ZERO_OR_ONE;
|
|
break;
|
|
case CH('}'):
|
|
tok = min_quant( icur, iend ) ? END_RANGE_MIN : END_RANGE;
|
|
break;
|
|
case CH('{'):
|
|
tok = BEGIN_RANGE;
|
|
++icur;
|
|
break;
|
|
case CH(','):
|
|
tok = RANGE_SEPARATOR;
|
|
++icur;
|
|
break;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN charset_token( iterator & icur, const_iterator iend )
|
|
{
|
|
assert( (const_iterator)icur != iend );
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case CH('-'):
|
|
tok = CHARSET_RANGE;
|
|
++icur;
|
|
break;
|
|
case CH('^'):
|
|
tok = CHARSET_NEGATE;
|
|
++icur;
|
|
break;
|
|
case CH(']'):
|
|
tok = CHARSET_END;
|
|
++icur;
|
|
break;
|
|
case CH('\\'):
|
|
tok = CHARSET_ESCAPE;
|
|
if( (const_iterator)++icur == iend )
|
|
break;
|
|
switch( *icur )
|
|
{
|
|
case CH('b'):
|
|
tok = CHARSET_BACKSPACE;
|
|
++icur;
|
|
break;
|
|
case CH('d'):
|
|
tok = ESC_DIGIT;
|
|
++icur;
|
|
break;
|
|
case CH('D'):
|
|
tok = ESC_NOT_DIGIT;
|
|
++icur;
|
|
break;
|
|
case CH('s'):
|
|
tok = ESC_SPACE;
|
|
++icur;
|
|
break;
|
|
case CH('S'):
|
|
tok = ESC_NOT_SPACE;
|
|
++icur;
|
|
break;
|
|
case CH('w'):
|
|
tok = ESC_WORD;
|
|
++icur;
|
|
break;
|
|
case CH('W'):
|
|
tok = ESC_NOT_WORD;
|
|
++icur;
|
|
break;
|
|
}
|
|
break;
|
|
case CH('['):
|
|
for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
|
|
{
|
|
if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
|
|
{
|
|
tok = TOKEN(CHARSET_ALNUM + i);
|
|
std::advance( icur, g_rgposix_charsets[i].cchars );
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN subst_token( iterator & icur, const_iterator iend )
|
|
{
|
|
assert( (const_iterator)icur != iend );
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case CH('\\'):
|
|
tok = SUBST_ESCAPE;
|
|
if( (const_iterator)++icur != iend )
|
|
switch( *icur )
|
|
{
|
|
case CH('Q'):
|
|
tok = SUBST_QUOTE_META_ON;
|
|
++icur;
|
|
break;
|
|
case CH('U'):
|
|
tok = SUBST_UPPER_ON;
|
|
++icur;
|
|
break;
|
|
case CH('u'):
|
|
tok = SUBST_UPPER_NEXT;
|
|
++icur;
|
|
break;
|
|
case CH('L'):
|
|
tok = SUBST_LOWER_ON;
|
|
++icur;
|
|
break;
|
|
case CH('l'):
|
|
tok = SUBST_LOWER_NEXT;
|
|
++icur;
|
|
break;
|
|
case CH('E'):
|
|
tok = SUBST_ALL_OFF;
|
|
++icur;
|
|
break;
|
|
}
|
|
break;
|
|
case CH('$'):
|
|
tok = SUBST_BACKREF;
|
|
if( (const_iterator)++icur != iend )
|
|
switch( *icur )
|
|
{
|
|
case CH('&'):
|
|
tok = SUBST_MATCH;
|
|
++icur;
|
|
break;
|
|
case CH('`'):
|
|
tok = SUBST_PREMATCH;
|
|
++icur;
|
|
break;
|
|
case CH('\''):
|
|
tok = SUBST_POSTMATCH;
|
|
++icur;
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
|
|
{
|
|
assert( (const_iterator)icur != iend );
|
|
bool finclude;
|
|
TOKEN tok = NO_TOKEN;
|
|
if( CH('?') == *icur )
|
|
{
|
|
tok = EXT_UNKNOWN;
|
|
if( (const_iterator)++icur != iend )
|
|
{
|
|
switch( *icur )
|
|
{
|
|
case CH(':'):
|
|
tok = EXT_NOBACKREF;
|
|
++icur;
|
|
break;
|
|
case CH('='):
|
|
tok = EXT_POS_LOOKAHEAD;
|
|
++icur;
|
|
break;
|
|
case CH('!'):
|
|
tok = EXT_NEG_LOOKAHEAD;
|
|
++icur;
|
|
break;
|
|
case CH('<'):
|
|
if( (const_iterator)++icur == iend )
|
|
break;
|
|
switch( *icur )
|
|
{
|
|
case CH('='):
|
|
tok = EXT_POS_LOOKBEHIND;
|
|
++icur;
|
|
break;
|
|
case CH('!'):
|
|
tok = EXT_NEG_LOOKBEHIND;
|
|
++icur;
|
|
break;
|
|
}
|
|
break;
|
|
case CH('>'):
|
|
tok = EXT_INDEPENDENT;
|
|
++icur;
|
|
break;
|
|
default:
|
|
finclude = true;
|
|
do
|
|
{
|
|
if( CH(':') == *icur )
|
|
{
|
|
tok = EXT_NOBACKREF;
|
|
++icur;
|
|
break;
|
|
}
|
|
if( CH(')') == *icur )
|
|
{
|
|
tok = EXT_NOBACKREF;
|
|
break;
|
|
}
|
|
if( CH('-') == *icur && finclude )
|
|
finclude = false;
|
|
else if( CH('i') == *icur )
|
|
flags = finclude ? ( flags | NOCASE ) : ( flags & ~NOCASE );
|
|
else if( CH('m') == *icur )
|
|
flags = finclude ? ( flags | MULTILINE ) : ( flags & ~MULTILINE );
|
|
else if( CH('s') == *icur )
|
|
flags = finclude ? ( flags | SINGLELINE ) : ( flags & ~SINGLELINE );
|
|
else
|
|
break;
|
|
} while( (const_iterator)++icur != iend );
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return tok;
|
|
}
|
|
};
|
|
|
|
//
|
|
// Implements the basic POSIX regular expression syntax
|
|
//
|
|
template< typename CH >
|
|
class posix_syntax
|
|
{
|
|
public:
|
|
typedef std::basic_string<CH>::iterator iterator;
|
|
typedef std::basic_string<CH>::const_iterator const_iterator;
|
|
typedef CH char_type;
|
|
|
|
static TOKEN reg_token( iterator & icur, const_iterator iend )
|
|
{
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case '.':
|
|
tok = MATCH_ANY;
|
|
++icur;
|
|
break;
|
|
case '^':
|
|
tok = BEGIN_LINE;
|
|
++icur;
|
|
break;
|
|
case '$':
|
|
tok = END_LINE;
|
|
++icur;
|
|
break;
|
|
case '[':
|
|
tok = BEGIN_CHARSET;
|
|
++icur;
|
|
break;
|
|
case '\\':
|
|
tok = ESCAPE;
|
|
++icur;
|
|
if( (const_iterator)icur != iend )
|
|
{
|
|
switch( *icur )
|
|
{
|
|
case '(':
|
|
tok = BEGIN_GROUP;
|
|
++icur;
|
|
break;
|
|
case ')':
|
|
tok = END_GROUP;
|
|
++icur;
|
|
break;
|
|
case '|':
|
|
tok = ALTERNATION;
|
|
++icur;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN quant_token( iterator & icur, const_iterator iend )
|
|
{
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case '*':
|
|
tok = ZERO_OR_MORE;
|
|
++icur;
|
|
break;
|
|
case ',':
|
|
tok = RANGE_SEPARATOR;
|
|
++icur;
|
|
break;
|
|
case '\\':
|
|
++icur;
|
|
if( (const_iterator)icur != iend )
|
|
{
|
|
switch( *icur )
|
|
{
|
|
case '?':
|
|
tok = ZERO_OR_ONE;
|
|
++icur;
|
|
break;
|
|
case '+':
|
|
tok = ONE_OR_MORE;
|
|
++icur;
|
|
break;
|
|
case '{':
|
|
tok = BEGIN_RANGE;
|
|
++icur;
|
|
break;
|
|
case '}':
|
|
tok = END_RANGE;
|
|
++icur;
|
|
break;
|
|
default:
|
|
--icur;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
--icur;
|
|
}
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN charset_token( iterator & icur, const_iterator iend )
|
|
{
|
|
TOKEN tok = NO_TOKEN;
|
|
switch( *icur )
|
|
{
|
|
case '^':
|
|
tok = CHARSET_NEGATE;
|
|
++icur;
|
|
break;
|
|
case '-':
|
|
tok = CHARSET_RANGE;
|
|
++icur;
|
|
break;
|
|
case ']':
|
|
tok = CHARSET_END;
|
|
++icur;
|
|
break;
|
|
case '[':
|
|
for( size_t i=0; !tok && i < g_cposix_charsets; ++i )
|
|
{
|
|
if( is_posix_charset<const_iterator>( icur, iend, g_rgposix_charsets[i].szcharset ) )
|
|
{
|
|
tok = TOKEN(CHARSET_ALNUM + i);
|
|
std::advance( icur, g_rgposix_charsets[i].cchars );
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN subst_token( iterator & icur, const_iterator iend )
|
|
{
|
|
TOKEN tok = NO_TOKEN;
|
|
if( '\\' == *icur )
|
|
{
|
|
tok = SUBST_ESCAPE;
|
|
++icur;
|
|
if( (const_iterator)icur != iend && '0' <= *icur && '9' >= *icur )
|
|
{
|
|
tok = SUBST_BACKREF;
|
|
}
|
|
}
|
|
return tok;
|
|
}
|
|
static TOKEN ext_token( iterator & icur, const_iterator iend, unsigned & flags )
|
|
{
|
|
return NO_TOKEN;
|
|
}
|
|
};
|
|
|
|
} // namespace regex
|