1019 lines
33 KiB
C++
1019 lines
33 KiB
C++
//+---------------------------------------------------------------------------
|
|
//
|
|
// File: basic_regexpr.hxx
|
|
//
|
|
// Contents: classes for regular expression pattern matching a-la perl
|
|
//
|
|
// Classes: basic_rpattern, basic_regexpr
|
|
//
|
|
// Functions: basic_regexpr::match
|
|
// basic_regexpr::substitute
|
|
// basic_regexpr::cbackrefs
|
|
// basic_regexpr::backref
|
|
// basic_regexpr::all_backrefs
|
|
// basic_regexpr::backref_str
|
|
//
|
|
// Coupling:
|
|
//
|
|
// History: 12-11-1998 ericne Created
|
|
// 01-05-2001 ericne Removed dependency on VC's choice
|
|
// of STL iterator types.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
#pragma once
|
|
|
|
// C4786 identifier was truncated to '255' characters in the debug information
|
|
#pragma warning( disable : 4290 4786 )
|
|
|
|
#ifdef _MT
|
|
#include <windows.h> // for CRITICAL_SECTION
|
|
#endif
|
|
|
|
#include <string>
|
|
#include <stdexcept>
|
|
#include <vector>
|
|
#include <list>
|
|
#include <map>
|
|
#include <iostream>
|
|
#include <tchar.h>
|
|
#include <new.h> // for _set_new_handler
|
|
#include <crtdbg.h>
|
|
#include "syntax.h"
|
|
|
|
namespace regex
|
|
{
|
|
|
|
// Called when an allocation fails
|
|
// New-handler used while patterns are being built: turns an allocation
// failure into a C++ exception.  The argument is not inspected.  The
// function never returns normally; it always throws std::bad_alloc.
inline int __cdecl my_new_handler( size_t /*unused*/ )
{
    throw std::bad_alloc();
}
|
|
|
|
// For pushing and popping the new handler
|
|
class push_new_handler
|
|
{
|
|
_PNH m_pnh;
|
|
public:
|
|
push_new_handler( _PNH pnh )
|
|
{
|
|
m_pnh = _set_new_handler( pnh );
|
|
}
|
|
~push_new_handler()
|
|
{
|
|
(void)_set_new_handler( m_pnh );
|
|
}
|
|
};
|
|
|
|
// Exception type reported by this header's throw-specifications for
// pattern errors.  It is a std::runtime_error, so the message passed to the
// constructor is available through what().
class bad_regexpr : public std::runtime_error
{
public:

    // what_arg - text returned by what().
    // (The parameter was formerly named _S; identifiers that start with an
    // underscore followed by a capital letter are reserved and _S is a macro
    // in some C libraries.)
    explicit bad_regexpr( const std::string & what_arg )
        : std::runtime_error( what_arg )
    {
    }

    // throw() keeps the exception specification as strict as the one on
    // std::runtime_error's destructor.
    virtual ~bad_regexpr() throw()
    {
    }
};
|
|
|
|
//
|
|
// Flags to control how matching occurs
|
|
//
|
|
// Bit flags that customise matching and substitution.  Each enumerator
// occupies its own bit so that flags can be OR-ed together.
enum REGEX_FLAGS
{
    NOCASE        = 1 << 0, // 0x0001: ignore case
    GLOBAL        = 1 << 1, // 0x0002: match everywhere in the string
    MULTILINE     = 1 << 2, // 0x0004: ^ and $ can match internal line breaks
    SINGLELINE    = 1 << 3, // 0x0008: . can match newline character
    RIGHTMOST     = 1 << 4, // 0x0010: start matching at the right of the string
    NOBACKREFS    = 1 << 5, // 0x0020: only meaningful when used with GLOBAL and substitute
    FIRSTBACKREFS = 1 << 6, // 0x0040: only meaningful when used with GLOBAL
    ALLBACKREFS   = 1 << 7, // 0x0080: only meaningful when used with GLOBAL
    CSTRINGS      = 1 << 8, // 0x0100: optimize pattern for use with null-terminated strings
    NORMALIZE     = 1 << 9  // 0x0200: preprocess patterns: "\\n" => "\n", etc.
};
|
|
|
|
// Forward declarations
|
|
template< typename CI > struct match_param;
|
|
template< typename CI > class match_group;
|
|
template< typename CI > class match_wrapper;
|
|
template< typename CI > class match_charset;
|
|
template< typename CI > class basic_rpattern_base;
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: width_type
|
|
//
|
|
// Description: represents the width of a sub-expression
|
|
//
|
|
// Methods: width_add - add two widths
|
|
// width_mult - multiply two widths
|
|
// width_type - ctor
|
|
// width_type - ctor
|
|
// operator= - assign a width
|
|
// operator== - are widths equal
|
|
// operator!= - are widths not equal
|
|
// operator+ - add two widths
|
|
// operator* - multiply two widths
|
|
//
|
|
// Members: m_min - smallest number of characters a sub-expr can span
|
|
// m_max - largest number of characters a sub-expr can span
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
// Minimum/maximum number of characters a sub-expression can span.
// The value size_t(-1) is a sentinel that width_add and width_mult propagate
// unchanged; it is the default for m_max.
struct width_type
{
    size_t m_min; // smallest number of characters a sub-expr can span
    size_t m_max; // largest number of characters a sub-expr can span

    // Sum of two widths.  Returns the sentinel if either operand is the
    // sentinel, and also when the true sum does not fit in size_t (the
    // original code silently wrapped around in that case).
    static size_t width_add( size_t a, size_t b )
    {
        const size_t sentinel = static_cast<size_t>( -1 );
        if( sentinel == a || sentinel == b )
            return sentinel;
        // a + b wraps exactly when b exceeds the head-room left above a
        return ( b > sentinel - a ) ? sentinel : a + b;
    }

    // Product of two widths.  Returns the sentinel if either operand is the
    // sentinel (checked before anything else, as before), and also when the
    // true product does not fit in size_t instead of wrapping around.
    static size_t width_mult( size_t a, size_t b )
    {
        const size_t sentinel = static_cast<size_t>( -1 );
        if( sentinel == a || sentinel == b )
            return sentinel;
        if( 0 == a || 0 == b )
            return 0;
        // a * b wraps exactly when b is larger than floor(max / a)
        return ( b > sentinel / a ) ? sentinel : a * b;
    }

    width_type( size_t _min = 0, size_t _max = static_cast<size_t>( -1 ) )
        : m_min( _min ), m_max( _max )
    {
    }

    width_type( const width_type & that )
        : m_min( that.m_min ), m_max( that.m_max )
    {
    }

    width_type & operator=( const width_type & that )
    {
        m_min = that.m_min;
        m_max = that.m_max;
        return *this;
    }

    bool operator==( const width_type & that ) const
    {
        return ( m_min == that.m_min && m_max == that.m_max );
    }

    bool operator!=( const width_type & that ) const
    {
        return ( m_min != that.m_min || m_max != that.m_max );
    }

    // Member-wise width_add of the two minimums and the two maximums.
    width_type operator+( const width_type & that ) const
    {
        return width_type( width_add( m_min, that.m_min ), width_add( m_max, that.m_max ) );
    }

    // Member-wise width_mult of the two minimums and the two maximums.
    width_type operator*( const width_type & that ) const
    {
        return width_type( width_mult( m_min, that.m_min ), width_mult( m_max, that.m_max ) );
    }
};
|
|
|
|
// worst_width:  minimum 0, maximum size_t(-1).
// uninit_width: both members size_t(-1); basic_rpattern_base starts out with
//               this value as its pattern width.
const width_type worst_width( 0, static_cast<size_t>( -1 ) );
const width_type uninit_width( static_cast<size_t>( -1 ), static_cast<size_t>( -1 ) );
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: sub_expr
|
|
//
|
|
// Description: patterns are "compiled" into a directed graph of sub_expr
|
|
// structs. Matching is accomplished by traversing this graph.
|
|
//
|
|
// Methods: sub_expr - construct a sub_expr
|
|
// _match_this - does this sub_expr match at the given location
|
|
// _width_this - what is the width of this sub_expr
|
|
// ~sub_expr - virt dtor so cleanup happens correctly
|
|
// _delete - delete this node in the graph and all nodes linked
|
|
// next - pointer to the next node in the graph
|
|
// next - pointer to the next node in the graph
|
|
// match_next - match the rest of the graph
|
|
// domatch - match_this and match_next
|
|
// is_assertion - true if this sub_expr is a zero-width assertion
|
|
// get_width - find the width of the graph at this sub_expr
|
|
//
|
|
// Members: m_pnext - pointer to the next node in the graph
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
template< typename CI >
|
|
class sub_expr
|
|
{
|
|
sub_expr * m_pnext;
|
|
|
|
protected:
|
|
|
|
// Only derived classes and basic_rpattern can instantiate sub_expr's
|
|
sub_expr( )
|
|
: m_pnext(NULL)
|
|
{
|
|
}
|
|
|
|
// match this object only
|
|
virtual bool _match_this( match_param<CI> & param, CI & icur ) const throw()
|
|
{
|
|
return true;
|
|
}
|
|
|
|
virtual width_type _width_this() throw() = 0;
|
|
|
|
public:
|
|
|
|
typedef typename std::iterator_traits<CI>::value_type char_type;
|
|
|
|
friend class match_wrapper<CI>; // wrappers can access _match_this method
|
|
|
|
virtual ~sub_expr() {}
|
|
|
|
virtual void _delete()
|
|
{
|
|
if( m_pnext )
|
|
m_pnext->_delete();
|
|
delete this;
|
|
}
|
|
|
|
inline const sub_expr *const next() const { return m_pnext; }
|
|
inline sub_expr * & next() { return m_pnext; }
|
|
|
|
// Match all subsequent objects
|
|
inline bool match_next( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
return NULL == m_pnext || m_pnext->domatch( param, icur );
|
|
}
|
|
|
|
// Match this object and all subsequent objects
|
|
// If domatch returns false, it must not change any internal state
|
|
virtual bool domatch( match_param<CI> & param, CI icur ) const throw()
|
|
{
|
|
return ( _match_this(param,icur) && match_next(param,icur) );
|
|
}
|
|
|
|
virtual bool is_assertion() const throw()
|
|
{
|
|
return false;
|
|
}
|
|
|
|
width_type get_width() throw()
|
|
{
|
|
width_type this_width = _width_this();
|
|
|
|
if( NULL == m_pnext )
|
|
return this_width;
|
|
|
|
width_type that_width = m_pnext->get_width();
|
|
|
|
return ( this_width + that_width );
|
|
}
|
|
};
|
|
|
|
template< typename CI >
|
|
void delete_sub_expr( sub_expr<CI> * psub )
|
|
{
|
|
if( psub )
|
|
psub->_delete();
|
|
}
|
|
|
|
template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
|
|
class create_charset_helper
|
|
{
|
|
public:
|
|
typedef std::iterator_traits<CI>::value_type char_type;
|
|
|
|
static sub_expr<CI> * create_charset_aux(
|
|
std::basic_string<char_type> & str,
|
|
std::basic_string<char_type>::iterator & icur,
|
|
unsigned flags );
|
|
};
|
|
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: auto_sub_ptr
|
|
//
|
|
// Description: Class for automatically cleaning up the structure associated
|
|
// with a parsed pattern
|
|
//
|
|
// Methods: auto_sub_ptr - private copy ctor - not used
|
|
// operator= - private assign operator - not used
|
|
// operator T* - private implicit cast operator - not used
|
|
// auto_sub_ptr - ctor
|
|
// ~auto_sub_ptr - dtor, frees ptr
|
|
// free_ptr - explicitly free pointer
|
|
// release - relinquish ownership of ptr
|
|
// operator= - take ownership of ptr
|
|
// get - return ptr
|
|
// get - return ptr
|
|
// operator-> - method call through ptr
|
|
// operator-> - method call through ptr
|
|
//
|
|
// Members: m_psub - sub_expr pointer
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
// Owning pointer for a sub_expr chain.  The chain is released with
// delete_sub_expr when the owner is destroyed, when free_ptr() is called, or
// when a different pointer is assigned.
template< typename T >
class auto_sub_ptr
{
    T * m_psub; // owned chain, or NULL

    // Copying is disabled.  The two members are declared but intentionally
    // never defined (the old empty copy-constructor body left m_psub
    // uninitialized).
    auto_sub_ptr( const auto_sub_ptr<T> & );
    auto_sub_ptr & operator=( const auto_sub_ptr<T> & );

    // Kept private so the object never converts to a raw pointer implicitly.
    operator T*() const { return m_psub; }

public:

    auto_sub_ptr( T * psub = NULL )
        : m_psub( psub )
    {
    }

    ~auto_sub_ptr()
    {
        free_ptr();
    }

    // Deallocate the owned chain now.  The pointer is cleared first, so a
    // later free_ptr() - including the one made by the destructor - is a
    // no-op instead of a second delete of the same chain.
    void free_ptr()
    {
        T * pdoomed = m_psub;
        m_psub = NULL;
        delete_sub_expr( pdoomed );
    }

    // Relinquish ownership without deallocating; the caller now owns the
    // returned pointer.
    T * release()
    {
        T * psub = m_psub;
        m_psub = NULL;
        return psub;
    }

    // Take ownership of psub, freeing whatever was owned before.  Assigning
    // the pointer that is already held is a no-op (it used to free the chain
    // and keep the dangling pointer).
    auto_sub_ptr<T> & operator=( T * psub )
    {
        if( psub != m_psub )
        {
            T * pold = m_psub;
            m_psub = psub;
            delete_sub_expr( pold );
        }
        return *this;
    }

    inline const T*const get() const { return m_psub; }

    // Reference to the stored pointer itself; writing through it replaces
    // the owned pointer without freeing the old one.
    inline T* & get() { return m_psub; }

    inline const T*const operator->() const { return m_psub; }
    inline T* operator->() { return m_psub; }
};
|
|
|
|
// A back-reference: the pair (first, second) of iterators delimiting the
// matched text, plus one word of internal book-keeping.  A tag whose first
// or second equals CI(0) converts to false.
template< typename CI >
struct backref_tag : public std::pair<CI,CI>
{
    backref_tag( CI i1 = CI(0), CI i2 = CI(0) )
        : std::pair<CI,CI>( i1, i2 ), reserved( 0 )
    {
    }

    // True only when both ends differ from CI(0).
    // first/second live in a dependent base class, so they must be reached
    // through this-> to be found by standard (two-phase) name lookup.
    operator bool() const throw()
    {
        return this->first != CI(0) && this->second != CI(0);
    }

    bool operator!() const throw()
    {
        return ! operator bool();
    }

    size_t reserved; // used for internal book-keeping
};
|
|
|
|
template< typename CH >
|
|
backref_tag< const CH * > _static_match_helper(
|
|
const CH * szstr,
|
|
const basic_rpattern_base< const CH * > & pat,
|
|
std::vector< backref_tag< const CH * > > * prgbackrefs ) throw();
|
|
|
|
template< typename CH >
|
|
size_t _static_count_helper(
|
|
const CH * szstr,
|
|
const basic_rpattern_base< const CH * > & pat ) throw();
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: basic_regexpr
|
|
//
|
|
// Description: string class that allows regular expression pattern matching
|
|
//
|
|
// Methods: basic_regexpr - ctor
|
|
// match - static method for matching C-style strings
|
|
// match - non-static method for matching C++-style strings
|
|
// count - static method for counting matches in C-style strings
|
|
// count - non-static method for counting matches in C++-style strings
|
|
// substitute - perform substitutions in C++-style strings
|
|
// cbackrefs - return the count of internally stored back-references
|
|
// rstart - offset to start of n-th backref
|
|
// rlength - length of n-th backref
|
|
// backref - return the n-th backref
|
|
// all_backrefs - return a vector of all saved backrefs
|
|
// backref_str - return the string to which the backreferences refer
|
|
//
|
|
// Members: m_rgbackrefs - vector of backrefs
|
|
// m_backref_str - temp string buffer
|
|
// m_pbackref_str - pointer to the string containing the string to which
|
|
// the backreferences refer (either *this or m_backref_str)
|
|
//
|
|
// Typedefs: backref_type -
|
|
// backref_vector -
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
template< typename CH, typename TR = std::char_traits<CH>, typename AL = std::allocator<CH> >
|
|
class basic_regexpr : public std::basic_string<CH,TR,AL>
|
|
{
|
|
public:
|
|
|
|
basic_regexpr( const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
basic_regexpr( const CH * p,
|
|
const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( p, a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
basic_regexpr( const CH * p, size_type n,
|
|
const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( p, n, a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
basic_regexpr( const std::basic_string<CH,TR,AL> & s, size_type pos = 0, size_type n = npos,
|
|
const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( s, pos, n, a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
basic_regexpr( size_type n, CH ch,
|
|
const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( n, ch, a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
basic_regexpr( const_iterator begin, const_iterator end,
|
|
const allocator_type & a = allocator_type() )
|
|
: std::basic_string<CH,TR,AL>( begin, end, a ), m_pbackref_str( & m_backref_str ) {}
|
|
|
|
// actually stores iterators into *m_pbackref_str:
|
|
typedef backref_tag<const_iterator> backref_type;
|
|
typedef std::vector< backref_type > backref_vector;
|
|
|
|
// stores pointers into the null-terminated C-stype string
|
|
typedef backref_tag< const CH * > backref_type_c;
|
|
typedef std::vector< backref_type_c > backref_vector_c;
|
|
|
|
// returns $0, the first backref
|
|
static backref_type_c match( const CH * szstr,
|
|
const basic_rpattern_base< const CH * > & pat,
|
|
backref_vector_c * prgbackrefs = NULL ) throw()
|
|
{
|
|
return _static_match_helper<CH>( szstr, pat, prgbackrefs );
|
|
}
|
|
|
|
// returns $0, the first backref
|
|
backref_type match( const basic_rpattern_base< const_iterator > & pat,
|
|
size_type pos = 0,
|
|
size_type len = npos ) const throw();
|
|
|
|
static size_t count( const CH * szstr,
|
|
const basic_rpattern_base< const CH * > & pat ) throw()
|
|
{
|
|
return _static_count_helper<CH>( szstr, pat );
|
|
}
|
|
|
|
size_t count( const basic_rpattern_base< const_iterator > & pat,
|
|
size_type pos = 0,
|
|
size_type len = npos ) const throw();
|
|
|
|
size_t substitute( const basic_rpattern_base< const_iterator > & pat,
|
|
size_type pos = 0,
|
|
size_type len = npos ) throw(std::bad_alloc);
|
|
|
|
size_t cbackrefs() const throw()
|
|
{
|
|
return m_rgbackrefs.size();
|
|
}
|
|
|
|
size_type rstart( size_t cbackref = 0 ) const throw(std::out_of_range)
|
|
{
|
|
return std::distance( m_pbackref_str->begin(), m_rgbackrefs.at( cbackref ).first );
|
|
}
|
|
|
|
size_type rlength( size_t cbackref = 0 ) const throw(std::out_of_range)
|
|
{
|
|
return std::distance( m_rgbackrefs.at( cbackref ).first, m_rgbackrefs.at( cbackref ).second );
|
|
}
|
|
|
|
backref_type backref( size_t cbackref ) const throw(std::out_of_range)
|
|
{
|
|
return m_rgbackrefs.at( cbackref );
|
|
}
|
|
|
|
const backref_vector & all_backrefs() const throw()
|
|
{
|
|
return m_rgbackrefs;
|
|
}
|
|
|
|
const std::basic_string<CH,TR,AL> & backref_str() const throw()
|
|
{
|
|
return *m_pbackref_str;
|
|
}
|
|
|
|
protected:
|
|
|
|
// save information about the backrefs
|
|
// mutable because these can change in the "const" match() method.
|
|
mutable backref_vector m_rgbackrefs;
|
|
mutable std::basic_string<CH,TR,AL> m_backref_str;
|
|
mutable const std::basic_string<CH,TR,AL> * m_pbackref_str;
|
|
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: match_param
|
|
//
|
|
// Description: Struct that contains the state of the matching operation.
|
|
// Passed by reference to all domatch and _match_this routines.
|
|
//
|
|
// Methods: match_param - ctor
|
|
// match_param - ctor
|
|
//
|
|
// Members: ibegin - start of the string
|
|
// istart - start of this iteration
|
|
// istop - end of the string
|
|
// prgbackrefs - pointer to backref array
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
template< typename CI >
|
|
struct match_param
|
|
{
|
|
CI ibegin;
|
|
CI istart;
|
|
CI istop;
|
|
std::vector< backref_tag< CI > > * prgbackrefs;
|
|
|
|
match_param( CI _istart,
|
|
CI _istop,
|
|
std::vector< backref_tag< CI > > * _prgbackrefs )
|
|
: ibegin(_istart),
|
|
istart(_istart),
|
|
istop(_istop),
|
|
prgbackrefs(_prgbackrefs)
|
|
{
|
|
}
|
|
match_param( CI _ibegin,
|
|
CI _istart,
|
|
CI _istop,
|
|
std::vector< backref_tag< CI > > * _prgbackrefs )
|
|
: ibegin(_ibegin),
|
|
istart(_istart),
|
|
istop(_istop),
|
|
prgbackrefs(_prgbackrefs)
|
|
{
|
|
}
|
|
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: subst_node
|
|
//
|
|
// Description: Substitution strings are parsed into an array of these
|
|
// structures in order to speed up subst operations.
|
|
//
|
|
// Members: stype - type of this struct
|
|
// subst_string - do a string substitution
|
|
// subst_backref - do a backref substitution
|
|
// op - execute an operation
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
struct subst_node
|
|
{
|
|
enum subst_type { SUBST_STRING, SUBST_BACKREF, SUBST_OP };
|
|
enum { PREMATCH = -1, POSTMATCH = -2 };
|
|
enum op_type { UPPER_ON = SUBST_UPPER_ON,
|
|
UPPER_NEXT = SUBST_UPPER_NEXT,
|
|
LOWER_ON = SUBST_LOWER_ON,
|
|
LOWER_NEXT = SUBST_LOWER_NEXT,
|
|
ALL_OFF = SUBST_ALL_OFF };
|
|
subst_type stype;
|
|
union
|
|
{
|
|
struct
|
|
{
|
|
size_t rstart;
|
|
size_t rlength;
|
|
} subst_string;
|
|
size_t subst_backref;
|
|
op_type op;
|
|
};
|
|
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: basic_rpattern_base
|
|
//
|
|
// Description:
|
|
//
|
|
// Methods: basic_rpattern_base - ctor
|
|
// flags - get the state of the flags
|
|
// uses_backrefs - true if the backrefs are referenced
|
|
// get_first_subexpression - return ptr to first sub_expr struct
|
|
// get_width - get min/max nbr chars this pattern can match
|
|
// loops - if false, we only need to try to match at 1st position
|
|
// cgroups - number of visible groups
|
|
// _cgroups_total - total number of groups, including hidden (?:) groups
|
|
// get_pat - get string representing the pattern
|
|
// get_subst - get string representing the substitution string
|
|
// get_subst_list - get the list of subst nodes
|
|
// _normalize_string - perform character escaping
|
|
// _reset - reinitialize the pattern
|
|
//
|
|
// Members: m_fuses_backrefs -
|
|
// m_floop -
|
|
// m_cgroups -
|
|
// m_cgroups_visible -
|
|
// m_flags -
|
|
// m_nwidth -
|
|
// m_pat -
|
|
// m_subst -
|
|
// m_subst_list -
|
|
// m_pfirst -
|
|
//
|
|
// Typedefs: char_type -
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
template< typename CI >
|
|
class basic_rpattern_base
|
|
{
|
|
public:
|
|
typedef std::iterator_traits<CI>::value_type char_type;
|
|
|
|
basic_rpattern_base( unsigned flags = 0,
|
|
const std::basic_string<char_type> & pat = std::basic_string<char_type>(),
|
|
const std::basic_string<char_type> & subst = std::basic_string<char_type>() ) throw()
|
|
: m_fuses_backrefs( false ),
|
|
m_floop( true ),
|
|
m_cgroups( 0 ),
|
|
m_cgroups_visible( 0 ),
|
|
m_flags( flags ),
|
|
m_nwidth( uninit_width ),
|
|
m_pat( pat ),
|
|
m_subst( subst ),
|
|
m_pfirst( NULL )
|
|
{
|
|
}
|
|
|
|
unsigned flags() const throw()
|
|
{
|
|
return m_flags;
|
|
}
|
|
|
|
bool uses_backrefs() const throw()
|
|
{
|
|
return m_fuses_backrefs;
|
|
}
|
|
|
|
const sub_expr<CI> * get_first_subexpression() const throw()
|
|
{
|
|
return m_pfirst.get();
|
|
}
|
|
|
|
width_type get_width() const throw()
|
|
{
|
|
return m_nwidth;
|
|
}
|
|
|
|
bool loops() const throw()
|
|
{
|
|
return m_floop;
|
|
}
|
|
|
|
size_t cgroups() const throw()
|
|
{
|
|
return m_cgroups_visible;
|
|
}
|
|
|
|
size_t _cgroups_total() const throw()
|
|
{
|
|
return m_cgroups;
|
|
}
|
|
|
|
const std::basic_string<char_type> & get_pat() const throw()
|
|
{
|
|
return m_pat;
|
|
}
|
|
|
|
const std::basic_string<char_type> & get_subst() const throw()
|
|
{
|
|
return m_subst;
|
|
}
|
|
|
|
const std::list<subst_node> & get_subst_list() const throw()
|
|
{
|
|
return m_subst_list;
|
|
}
|
|
|
|
protected:
|
|
|
|
void _normalize_string( std::basic_string<char_type> & str );
|
|
|
|
void _reset()
|
|
{
|
|
m_fuses_backrefs = false;
|
|
m_flags = 0;
|
|
}
|
|
|
|
bool m_fuses_backrefs; // true if the substitution uses backrefs
|
|
bool m_floop; // false if m_pfirst->domatch only needs to be called once
|
|
size_t m_cgroups; // number of groups (always at least one)
|
|
size_t m_cgroups_visible; // number of visible groups
|
|
unsigned m_flags; // flags used to customize search/replace
|
|
width_type m_nwidth; // width of the pattern
|
|
|
|
std::basic_string<char_type> m_pat; // contains the unparsed pattern
|
|
std::basic_string<char_type> m_subst; // contains the unparsed substitution
|
|
|
|
std::list<subst_node> m_subst_list; // used to speed up substitution
|
|
auto_sub_ptr<sub_expr<CI> > m_pfirst; // first subexpression in pattern
|
|
};
|
|
|
|
// --------------------------------------------------------------------------
|
|
//
|
|
// Class: basic_rpattern
|
|
//
|
|
// Description:
|
|
//
|
|
// Methods: basic_rpattern - ctor
|
|
// basic_rpattern -
|
|
// basic_rpattern -
|
|
// init - for (re)initializing a pattern
|
|
// init -
|
|
// set_substitution - set the substitution string
|
|
// set_flags - set the flags
|
|
// register_intrinsic_charset - bind an escape sequence to a user-def'd charset
|
|
// purge_intrinsic_charsets - delete all user-def'd charsets
|
|
// _get_next_group_nbr - return a monotonically increasing id
|
|
// _find_next_group - parse the next group of the pattern
|
|
// _find_next - parse the next sub_expr of the pattern
|
|
// _find_atom - parse the next atom of the pattern
|
|
// _quantify - quantify the sub_expr
|
|
// _common_init - perform some common initialization tasks
|
|
// _parse_subst - parse the substitution string
|
|
// _add_subst_backref - add a backref node to the subst list
|
|
// _reset - reinitialize the pattern
|
|
//
|
|
// Members: s_charset_map - for maintaining user-defined charsets
|
|
// m_invisible_groups - list of hidden groups to be numbered last
|
|
//
|
|
// Typedefs: syntax_type -
|
|
//
|
|
// History: 8/14/2000 - ericne - Created
|
|
//
|
|
// --------------------------------------------------------------------------
|
|
template< typename CI, typename SY = perl_syntax<std::iterator_traits<CI>::value_type> >
|
|
class basic_rpattern : public basic_rpattern_base<CI>
|
|
{
|
|
public:
|
|
|
|
friend class match_charset<CI>;
|
|
|
|
typedef SY syntax_type;
|
|
|
|
basic_rpattern() throw();
|
|
|
|
basic_rpattern( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
basic_rpattern( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
void init( const std::basic_string<char_type> & pat, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
void init( const std::basic_string<char_type> & pat, const std::basic_string<char_type> & subst, unsigned flags=0 ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
void set_substitution( const std::basic_string<char_type> & subst ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
void set_flags( unsigned flags ) throw(bad_regexpr,std::bad_alloc);
|
|
|
|
class charset_map
|
|
{
|
|
struct charsets
|
|
{
|
|
sub_expr<CI> * rgpcharsets[2];
|
|
std::basic_string<char_type> str_charset;
|
|
|
|
charsets() throw()
|
|
{
|
|
memset( rgpcharsets, 0, sizeof( rgpcharsets ) );
|
|
}
|
|
~charsets() throw()
|
|
{
|
|
clean();
|
|
}
|
|
void clean() throw()
|
|
{
|
|
for( int i=0; i < (sizeof(rgpcharsets)/sizeof(*rgpcharsets)); ++i )
|
|
delete_sub_expr( rgpcharsets[i] );
|
|
}
|
|
match_charset<CI> * get_charset( unsigned flags ) throw(bad_regexpr,std::bad_alloc)
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
// Since these charsets are only used while creating other charsets,
|
|
// all flags besides NOCASE can safely be ignored here.
|
|
bool index = ( NOCASE == ( NOCASE & flags ) );
|
|
if( NULL == rgpcharsets[ index ] )
|
|
{
|
|
std::basic_string<char_type>::iterator istart = str_charset.begin();
|
|
rgpcharsets[ index ] = create_charset_helper<CI,SY>::create_charset_aux( str_charset, ++istart, flags );
|
|
}
|
|
return (match_charset<CI>*) rgpcharsets[ index ];
|
|
}
|
|
};
|
|
|
|
typedef std::map<char_type,charsets> map_type;
|
|
std::auto_ptr<map_type> m_pmap;
|
|
|
|
public:
|
|
|
|
void put( char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
|
|
{
|
|
// These characters cannot be bound to a user-defined intrinsic character set
|
|
static const char_type rgIllegal[] =
|
|
{
|
|
'0','1','2','3','4','5','6','7','8','9','A','Z','z','Q',
|
|
'b','B','d','D','f','n','r','s','S','t','v','w','W','E'
|
|
};
|
|
|
|
// So operator new throws bad_alloc on failure.
|
|
push_new_handler pnh( &my_new_handler );
|
|
|
|
if( std::char_traits<char_type>::find( rgIllegal, ARRAYSIZE( rgIllegal ), ch ) )
|
|
throw bad_regexpr( "illegal character specified for intrinsic character set." );
|
|
|
|
if( NULL == m_pmap.get() )
|
|
m_pmap = auto_ptr<map_type>( new map_type );
|
|
|
|
// creates an empty entry if one does not already exist
|
|
charsets & chrsts = (*m_pmap)[ch];
|
|
chrsts.clean();
|
|
chrsts.str_charset = str;
|
|
|
|
// Try compiling the character set once to make sure it is properly formed:
|
|
(void) chrsts.get_charset( 0 );
|
|
}
|
|
|
|
match_charset<CI> * get( char_type ch, unsigned flags ) throw()
|
|
{
|
|
match_charset<CI> * pRet = NULL;
|
|
if( NULL != m_pmap.get() )
|
|
{
|
|
try
|
|
{
|
|
push_new_handler pnh( &my_new_handler );
|
|
map_type::iterator iter = m_pmap->find( ch );
|
|
if( iter != m_pmap->end() )
|
|
pRet = iter->second.get_charset( flags );
|
|
}
|
|
catch(...) {}
|
|
}
|
|
|
|
return pRet;
|
|
}
|
|
|
|
void purge() throw()
|
|
{
|
|
if( NULL != m_pmap.get() )
|
|
delete m_pmap.release();
|
|
}
|
|
};
|
|
|
|
static void register_intrinsic_charset(
|
|
char_type ch, const std::basic_string<char_type> & str ) throw(bad_regexpr,std::bad_alloc)
|
|
{
|
|
s_charset_map.put( ch, str );
|
|
}
|
|
|
|
static void purge_intrinsic_charsets() throw()
|
|
{
|
|
s_charset_map.purge();
|
|
}
|
|
|
|
protected:
|
|
|
|
static charset_map s_charset_map;
|
|
|
|
size_t _get_next_group_nbr()
|
|
{
|
|
return m_cgroups++;
|
|
}
|
|
|
|
match_group<CI> * _find_next_group( std::basic_string<char_type>::iterator & ipat,
|
|
unsigned & flags,
|
|
std::vector<match_group<CI>*> & rggroups );
|
|
|
|
bool _find_next( std::basic_string<char_type>::iterator & ipat,
|
|
match_group<CI> * pgroup, unsigned & flags,
|
|
std::vector<match_group<CI>*> & rggroups );
|
|
|
|
void _find_atom( std::basic_string<char_type>::iterator & ipat,
|
|
match_group<CI> * pgroup, unsigned flags );
|
|
|
|
void _quantify( auto_sub_ptr<sub_expr<CI> > & pnew,
|
|
match_group<CI> * pnew_group,
|
|
std::basic_string<char_type>::iterator & ipat );
|
|
|
|
void _common_init( unsigned flags );
|
|
|
|
void _parse_subst();
|
|
|
|
void _add_subst_backref( subst_node & snode, size_t nbackref, size_t rstart );
|
|
|
|
void _reset();
|
|
|
|
std::list<match_group<CI>*> m_invisible_groups; // groups w/o backrefs
|
|
|
|
};
|
|
|
|
inline std::ostream & operator<<( std::ostream & sout,
|
|
const basic_regexpr<char>::backref_type & br )
|
|
{
|
|
for( std::string::const_iterator ithis = br.first; ithis != br.second; ++ithis )
|
|
sout.put( *ithis );
|
|
return sout;
|
|
}
|
|
|
|
inline std::wostream & operator<<( std::wostream & sout,
|
|
const basic_regexpr<wchar_t>::backref_type & br )
|
|
{
|
|
for( std::wstring::const_iterator ithis = br.first; ithis != br.second; ++ithis )
|
|
sout.put( *ithis > UCHAR_MAX ? L'?' : *ithis );
|
|
return sout;
|
|
}
|
|
|
|
typedef basic_regexpr<TCHAR> regexpr;
|
|
typedef std::basic_string<TCHAR> tstring;
|
|
|
|
typedef basic_rpattern<const TCHAR *,perl_syntax<TCHAR> > perl_rpattern_c;
|
|
typedef basic_rpattern<const TCHAR *,posix_syntax<TCHAR> > posix_rpattern_c;
|
|
typedef basic_rpattern<tstring::const_iterator,perl_syntax<TCHAR> > perl_rpattern;
|
|
typedef basic_rpattern<tstring::const_iterator,posix_syntax<TCHAR> > posix_rpattern;
|
|
|
|
typedef perl_rpattern rpattern; // matches against std::string
|
|
typedef perl_rpattern_c rpattern_c; // matches against null-terminated, c-style strings
|
|
|
|
#ifdef _MT
|
|
|
|
//
|
|
// Define some classes and macros for creating function-local
|
|
// static const rpatterns in a thread-safe way
|
|
//
|
|
|
|
// Helper for STATIC_RPATTERN_EX: when this object is destroyed it runs the
// destructor of the referenced pattern, but only if the referenced flag is
// true at that moment.  Both the flag and the pattern are held by reference.
template< typename PAT >
class rpattern_destroyer
{
    const bool & m_fConstructed; // read in the destructor, not in the constructor
    const PAT  & m_refPat;       // object whose destructor is run
public:
    rpattern_destroyer( const bool & fConstructed, const PAT & refPat )
        : m_fConstructed( fConstructed ), m_refPat( refPat )
    {
    }

    ~rpattern_destroyer()
    {
        // Explicit destructor call: the storage is not released here.  This
        // replaces _Destroy(), an undocumented helper of the MSVC standard
        // library that other library implementations do not provide.
        if( m_fConstructed )
            m_refPat.~PAT();
    }
};
|
|
|
|
class CRegExCritSect : private CRITICAL_SECTION
|
|
{
|
|
public:
|
|
CRegExCritSect() { InitializeCriticalSection(this); }
|
|
~CRegExCritSect() { DeleteCriticalSection(this); }
|
|
void Enter() { EnterCriticalSection(this); }
|
|
void Leave() { LeaveCriticalSection(this); }
|
|
};
|
|
|
|
extern CRegExCritSect g_objRegExCritSect;
|
|
|
|
class CRegExLock
|
|
{
|
|
public:
|
|
CRegExLock() { g_objRegExCritSect.Enter(); }
|
|
~CRegExLock() { g_objRegExCritSect.Leave(); }
|
|
};
|
|
|
|
/* STATIC_RPATTERN_EX( type, var, params )
 *
 * Declares, inside a function, a static const reference `var` to an object of
 * `type` that is constructed from `params` the first time control passes
 * through, under regex::CRegExLock.
 *
 *   s_rgb_<var>  raw storage the object is placement-constructed into
 *   s_f_<var>    set to true once construction has finished
 *   s_des_<var>  regex::rpattern_destroyer that runs the object's destructor
 *                when function-local statics are destroyed
 *
 * Changes relative to the original text:
 *   - sizeof( type ) is parenthesized, as the language requires for a type
 *     operand;
 *   - s_des_<var> is now created inside the locked region, right after the
 *     object has been constructed.  It used to be an ordinary function-local
 *     static initialised before the lock was taken; on compilers whose local
 *     static initialisation is not thread-safe two threads could both
 *     register it, which would destroy the pattern twice at exit.
 *
 * NOTE(review): s_rgb_<var> is a plain unsigned char array, so its alignment
 * for `type` is not guaranteed by the language - confirm on the target.
 */
#define STATIC_RPATTERN_EX( type, var, params ) \
    static unsigned char s_rgb_##var[ sizeof( type ) ]; \
    static bool s_f_##var = false; \
    static const type & var = *reinterpret_cast<type*>( s_rgb_##var ); \
    if( ! s_f_##var ) \
    { \
        regex::CRegExLock objLock; \
        if( ! s_f_##var ) \
        { \
            new( s_rgb_##var ) type params; \
            s_f_##var = true; \
            static const regex::rpattern_destroyer<type> s_des_##var( s_f_##var, var ); \
        } \
    }
|
|
|
|
#else
|
|
|
|
/* Single-threaded build: an ordinary static const object named `var`, of
 * type `type`, constructed from the parenthesized argument list `params`. */
#define STATIC_RPATTERN_EX( type, var, params ) \
    static const type var params;
|
|
|
|
#endif
|
|
|
|
/* Shorthand for STATIC_RPATTERN_EX with the type fixed to regex::rpattern. */
#define STATIC_RPATTERN( var, params ) \
    STATIC_RPATTERN_EX( regex::rpattern, var, params )
|
|
|
|
} // namespace regex
|
|
|