windows-nt/Source/XPSP1/NT/shell/ext/mlang/validate.cpp

/*----------------------------------------------------------------------------
	%%File: validate.c
	%%Unit: fechmap
	%%Contact: jpick

	"Rolling" state machines that allow interactive verification of
    DBCS and EUC files.  Currently, separate tables are stored for
    each encoding so that the state machines can be run in parallel
    (i.e., multiple parse streams).

	These routines are used by auto-detection and if caller wants
    conversion routines to return errors on invalid characters.

    Following is a description of the structure of the DBCS and EUC
    encodings handled by this module.  This information is taken from
    CJK.INF (maintained by Ken Lunde, author of _Understanding Japanese
    Information Processing_).  This information governs the structure
    of the class and validation state tables used in this module.

    Big5
      Two-byte Standard Characters         Encoding Ranges
          first byte range                     0xA1-0xFE
          second byte ranges                   0x40-0x7E, 0xA1-0xFE
      One-byte Characters                  Encoding Range
          ASCII                                0x21-0x7E

    GBK
      Two-byte Standard Characters         Encoding Ranges
          first byte range                     0x81-0xFE
          second byte ranges                   0x40-0x7E and 0x80-0xFE
      One-byte Characters                  Encoding Range
          ASCII                                0x21-0x7E

    HZ (information from HZ spec Fung F. Lee (lee@umunhum.stanford.edu))
	  One-byte characters					Encoding Ranges
      	first GB byte range						0x21-0x77
      	second GB byte range					0x21-0x7E
      	ASCII									0x21-0x7E
      Mode switching						Encoding sequence
		escape sequence from ASCII to GB		0x7E followed by 0x7B ("~{")
		escape sequence from GB to ASCII		0x7E followed by 0x7D ("~}")
		line continuation marker 				0x7E followed by 0x0A
        (Note: ASCII mode is the default mode)

    Shift-Jis
      Two-byte Standard Characters         Encoding Ranges
          first byte ranges                    0x81-0x9F, 0xE0-0xEF
          second byte ranges                   0x40-0x7E, 0x80-0xFC
      Two-byte User-defined Characters     Encoding Ranges
          first byte range                     0xF0-0xFC
          second byte ranges                   0x40-0x7E, 0x80-0xFC
      One-byte Characters                  Encoding Range
          Half-width katakana                  0xA1-0xDF
          ASCII/JIS-Roman                      0x21-0x7E

    Wansung
      Two-byte Standard Characters         Encoding Ranges
          first byte range                     0x81-0xFE
          second byte ranges                   0x40-0x7E and 0x80-0xFE
      One-byte Characters                  Encoding Range
          ASCII                                0x21-0x7E

    EUC-Cn
      Code set 0 (ASCII or GB 1988-89):        0x21-0x7E
      Code set 1 (GB 2312-80):                 0xA1A1-0xFEFE
      Code set 2:                              unused
      Code set 3:                              unused

    EUC-Jp
      Code set 0 (ASCII or JIS X 0201-1976 Roman):  0x21-0x7E
      Code set 1 (JIS X 0208):                 0xA1A1-0xFEFE
      Code set 2 (half-width katakana):        0x8EA1-0x8EDF
      Code set 3 (JIS X 0212-1990):            0x8FA1A1-0x8FFEFE

    EUC-Kr
      Code set 0 (ASCII or KS C 5636-1993):    0x21-0x7E
      Code set 1 (KS C 5601-1992):             0xA1A1-0xFEFE
      Code set 2:                              unused
      Code set 3:                              unused

    EUC-Tw
      Code set 0 (ASCII):                      0x21-0x7E
      Code set 1 (CNS 11643-1992 Plane 1):     0xA1A1-0xFEFE
      Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE
      Code set 3:                              unused

	UTF-7 (information from the RFC2152 by D.Goldsmith)
	  One-byte characters					Encoding Ranges
      	Direct and Optionally direct			0x21-0x2A, 0x2C-0x5B,
      											0x5D-0x60, 0x7B-0x7D
      											0x09, 0x0A, 0x0D, 0x20
      	Modified Base64							0x2B, 0x2F-0x39, 0x41-0x5A, 0x61-0x7A
      Mode switching
      	escape sequence from D/O to M. Base64 	0x2B
      	escape sequence from M. Base64 to D/O 	0x2D (or any control character)

 ----------------------------------------------------------------------------*/

#include <stdio.h>
#include <stddef.h>

#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"


/*----------------------------------------------------------------------------
	Common Defs for all Sequence Validation
----------------------------------------------------------------------------*/

// Characters are broken down into ranges -- the smallest ranges that
// are treated as important by either EUC or DBCS (all flavors).  In
// some cases, the smallest range is a single character.  It saves
// some space to avoid having two class tables (even though more states
// are added to the state machines), so both encodings share these
// tokens.

// Common Tokens
//
// Each token names one byte class.  The token values double as the column
// index into every [state][nTokens] next-state table in this file, so the
// numbering here and the column order of those tables must stay in step.
//
#define ollow       0		// "other" legal low ascii character (tab, cr, 0x1a and space in the class table)
#define x000a       1		// 0x0a ("\n")
#define x212a       2		// characters in range 0x21-0x2a
#define x002b       3		// 0x2b	("+")
#define x002c       4		// 0x2c	(",")
#define x002d       5		// 0x2d	("-")
#define x002e       6       // 0x2e (".")
#define x2f39       7		// characters in range 0x2f-0x39
#define x3a3f       8		// characters in range 0x3a-0x3f
#define x0040       9       // 0x40 ("@")
#define x415a       10		// characters in range 0x41-0x5a
#define x005b       11		// 0x5b ("[")
#define x005c       12		// 0x5c ("\")
#define x5d60       13		// characters in range 0x5d-0x60
#define x6177       14      // characters in range 0x61-0x77
#define	x787a       15		// characters in range 0x78-0x7a
#define x007b       16		// 0x7b ("{")
#define x007c       17		// 0x7c ("|")
#define x007d       18		// 0x7d ("}")
#define x007e       19		// 0x7e ("~")
#define x007f       20		// 0x7f (DEL)
#define x0080       21		// 0x80
#define x818d       22		// characters in range 0x81-0x8d
#define x008e       23		// 0x8e
#define x008f       24		// 0x8f
#define x909f       25		// characters in range 0x90-0x9f
#define x00a0       26		// 0xa0
#define xa1b0       27		// characters in range 0xa1-0xb0
#define xb1df       28		// characters in range 0xb1-0xdf
#define xe0ef       29		// characters in range 0xe0-0xef
#define xf0fc       30		// characters in range 0xf0-0xfc
#define xfdfe       31		// characters in range 0xfd-0xfe

#define ateof       32		// end-of-file
#define other       33		// character not covered by above tokens

#define nTokens     34      // number of tokens == column count of every next-state table

// Class table
//
// One token per byte value (256 entries, laid out 16 per row; the row label
// is the high nibble, the column label the low nibble).  Presumably indexed
// by the unsigned byte being validated -- confirm against the lookup code.
// "ateof" never appears here: it is not a byte value.
//
static char _rgchCharClass[256] =
//         0      1      2      3      4      5      6      7      8      9      a      b      c      d      e      f
    {
//  0      nul    soh    stx    etx    eot    enq    ack    bel    bs     tab    lf     vt     np     cr     so     si		0
           other, other, other, other, other, other, other, other, other, ollow, x000a, other, other, ollow, other, other,

//  1      dle    dc1    dc2    dc3    dc4    nak    syn    etb    can    em     eof    esc    fs     gs     rs     us		1
           other, other, other, other, other, other, other, other, other, other, ollow, other, other, other, other, other,

//  2      sp     !      "      #      $      %      &      '      (      )      *      +      ,      -      .      /		2
           ollow, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x002b, x002c, x002d, x002e, x2f39,

//  3      0      1      2      3      4      5      6      7      8      9      :      ;      <      =      >      ?		3
           x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f,

//  4      @      A      B      C      D      E      F      G      H      I      J      K      L      M      N      O		4
           x0040, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a,

//  5      P      Q      R      S      T      U      V      W      X      Y      Z      [      \      ]      ^      _		5
           x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x005b, x005c, x5d60, x5d60, x5d60,

//  6      `      a      b      c      d      e      f      g      h      i      j      k      l      m      n      o		6
           x5d60, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177,

//  7      p      q      r      s      t      u      v      w      x      y      z      {      |      }      ~      del		7
           x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x787a, x787a, x787a, x007b, x007c, x007d, x007e, x007f,

//	8                                                                                                                       8
           x0080, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x008e, x008f,

//	9                                                                                                                       9
           x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f,

//	a                                                                                                                       a
           x00a0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0,

//	b                                                                                                                       b
           xa1b0, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,

//	c                                                                                                                       c
           xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,

//	d                                                                                                                       d
           xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,

//	e                                                                                                                       e
           xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef,

//	f                                                                                                                       f
           xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xfdfe, xfdfe, other,

//         0      1      2      3      4      5      6      7      8      9      a      b      c      d      e      f
};


// Common States -- All SM's use these
//
// ACC: a complete, legal sequence has been seen.
// ERR: the input is not legal for this encoding.
//
#define ACC         0x4e
#define ERR         0x7f

// Other States -- All SM's use some of these, not all use all
//
// STn is row n of a next-state table.  STnc is the same row with the
// "counting" bit (0x40) set.
//
#define ST0         0x00
#define ST0c        0x40
#define ST1         0x01
#define ST1c        0x41
#define ST2         0x02
#define ST2c        0x42
#define ST3         0x03
#define ST3c        0x43
#define ST4         0x04
#define ST4c        0x44

// Each state can have a corresponding counting state, i.e. a state with
// the same transitions but during which we look for special sequences.
//
// NOTE(review): ERR (0x7f) also has bit 0x40 set, so FTstCounting(ERR) is
// true just as it is for ACC -- confirm callers test for ERR first.
//
#define FTstCounting(tst)                   (((tst) & 0x40) != 0)   // If the state is counting (including ACC)
#define TstNotCountingFromTst(tst)          ((tst) & 0x3f)          // Obtain the real state from the counting

/*----------------------------------------------------------------------------
	DBCS character sequence validation
----------------------------------------------------------------------------*/

// Shift-JIS next-state table.  One row per state, one column per token (same
// order as the token #defines; read each column heading top to bottom).
// Presumably looked up as [state][token] -- confirm against the driver code.
//
#define nSJisStates		2
static signed char _rgchSJisNextState[nSJisStates][nTokens] =
{
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//


// DBCS State 0 -- start (look for legal single byte or lead byte)
//   0x81-0x9f and 0xe0-0xfc are lead bytes; 0xa1-0xdf (half-width katakana)
//   and 0x00-0x7e tokens are accepted as single bytes.
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ERR,  ACC,  ACC,  ST1,  ST1,  ERR,  ACC,  ERR,

// DBCS State 1 -- saw lead byte, need legal trail byte
//   trail bytes are 0x40-0x7e and 0x80-0xfc
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,

};

// Big5 next-state table.  One row per state, one column per token (same
// order as the token #defines; read each column heading top to bottom).
//
// NOTE(review): state 0 treats every byte in 0x81-0xfe as a lead byte, which
// is wider than the 0xa1-0xfe range quoted in the file header -- presumably
// deliberate (extensions); confirm.
//
#define nBig5States		2
static signed char _rgchBig5NextState[nBig5States][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// DBCS State 0 -- start (look for legal single byte or lead byte)
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ACC,  ERR,

// DBCS State 1 -- saw lead byte, need legal trail byte
//   trail bytes are 0x40-0x7e and 0xa1-0xfe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,

};

// Next-state table shared by GBK and Wansung (the file header gives both
// the same lead/trail byte ranges).  One row per state, one column per
// token (same order as the token #defines).
//
#define nGbkWanStates		2
static signed char _rgchGbkWanNextState[nGbkWanStates][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// DBCS State 0 -- start (look for legal single byte or lead byte)
//   lead bytes are 0x81-0xfe
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ST1,  ACC,  ERR,

// DBCS State 1 -- saw lead byte, need legal trail byte
//   trail bytes are 0x40-0x7e and 0x80-0xfe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,


};


/*----------------------------------------------------------------------------
	EUC character sequence validation
----------------------------------------------------------------------------*/

// EUC-Jp next-state table.  One row per state, one column per token (same
// order as the token #defines).  "a1fe" in the state comments is shorthand
// for any byte in 0xa1-0xfe.
//
#define nEucJpStates		4
static signed char _rgchEucJpNextState[nEucJpStates][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// EUC State 0 -- start
//   0x8e selects code set 2 (ST2), 0x8f code set 3 (ST3), 0xa1-0xfe code set 1 (ST1)
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,  ST2,  ST3,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ACC,  ERR,

// EUC State 1 -- saw a1fe, need one more a1fe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,

// EUC State 2 -- saw 8e, need a1df
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,

// EUC State 3 -- saw 8f, need 2 a1fe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ERR,  ERR,

};

// Next-state table shared by EUC-Kr and EUC-Cn (both use only code sets 0
// and 1 per the file header).  One row per state, one column per token
// (same order as the token #defines).
//
#define nEucKrCnStates		2
static signed char _rgchEucKrCnNextState[nEucKrCnStates][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// EUC State 0 -- start
//   0x8e and 0x8f are errors here: code sets 2 and 3 are unused
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ACC,  ERR,

// EUC State 1 -- saw a1fe, need one more a1fe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,

};

// EUC-Tw next-state table.  One row per state, one column per token (same
// order as the token #defines).  Code set 2 is a four-byte sequence:
// 0x8e, a plane byte in 0xa1-0xb0, then two bytes in 0xa1-0xfe.
//
#define nEucTwStates		4
static signed char _rgchEucTwNextState[nEucTwStates][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// EUC State 0 -- start
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,  ST2,  ERR,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ACC,  ERR,

// EUC State 1 -- saw a1fe, need one more a1fe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,

// EUC State 2 -- saw 8e, need a1b0
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ST3,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

// EUC State 3 -- saw 8e, a1b0; need 2 a1fe
    ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ST1,  ST1,  ST1,  ST1,  ST1,  ERR,  ERR,

};

/*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
	HZ character sequence validation
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
// Currently some of the rules for HZ encoding outlined above are a bit loosened up.
// (e.g. the range for the first GB byte is expanded) The rules were adjusted based on real data.

// HZ next-state table.  One row per state, one column per token (same order
// as the token #defines).  Every byte with the high bit set is an error in
// every state.  The "c" (counting) variants mark the states entered on "~"
// and on "~{".
//
#define nHzStates		5
static signed char _rgchHzNextState[nHzStates][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// HZ State 0 -- ASCII
//   "~" starts an escape sequence (ST1c)
    ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ST1c, ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ERR,

// HZ State 1 -- saw "~", looking for "{" to make transition to GB mode
//   "~" followed by "\n" or by another "~" is accepted; anything else is an error
    ERR,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ST2c, ERR,  ERR,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

// HZ State 2 -- just saw "{", expecting GB byte
    ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ERR,  ERR,  ERR,  ST4c, ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

// HZ State 3 -- expecting GB byte
//   end-of-file is an error here: the stream must return to ASCII mode first
    ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST4c, ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

// HZ State 4 -- saw "~", looking for "}" to make transition to ASCII mode
//   any other 7-bit byte drops back to state 3 (GB mode)
    ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ST3,  ACC,  ST3,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

};

/*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
	UTF-7 character sequence validation
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/

// UTF-7 next-state table.  One row per state, one column per token (same
// order as the token #defines).  Every byte with the high bit set is an
// error in every state.
//
#define nUtf7States		3
static signed char _rgchUtf7NextState[nUtf7States][nTokens] =
{
//
//   o     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     x     a     o
//   l     0     2     0     0     0     0     2     3     0     4     0     0     5     6     7     0     0     0     0     0     0     8     0     0     9     0     a     b     e     f     f     t     t
//   l     0     1     0     0     0     0     f     a     0     1     0     0     d     1     8     0     0     0     0     0     0     1     0     0     0     0     1     1     0     0     d     e     h
//   o     0     2     2     2     2     2     3     3     4     5     5     5     6     7     7     7     7     7     7     7     8     8     8     8     9     a     b     d     e     f     f     o     e
//   w     a     a     b     c     d     e     9     f     0     a     b     c     0     7     a     b     c     d     e     f     0     d     e     f     f     0     0     f     f     c     e     f     r
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
//

// UTF7 State 0 -- Direct/optionally direct ASCII mode, state transition can happen on "+"
//   "\" (0x5c), "~" (0x7e) and DEL are errors in this mode
    ACC,  ACC,  ACC,  ST1c, ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ACC,  ACC,  ACC,  ACC,  ACC,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ERR,

// UTF7 State 1 -- Expecting first character from Modified Base64 alphabet
//   "+-" is accepted immediately; end-of-file right after "+" is an error
    ERR,  ERR,  ERR,  ST2,  ERR,  ACC,  ERR,  ST2,  ERR,  ERR,  ST2,  ERR,  ERR,  ERR,  ST2,  ST2,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,

// UTF7 State 2 -- Modified Base64 alphabet mode, can be exited with "-" or any control character.
//   per this row the accepted exits are "-", the ollow and "\n" tokens, DEL and end-of-file
    ACC,  ACC,  ERR,  ST2,  ERR,  ACC,  ERR,  ST2,  ERR,  ERR,  ST2,  ERR,  ERR,  ERR,  ST2,  ST2,  ERR,  ERR,  ERR,  ERR,  ACC,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ACC,  ERR,
};

/*----------------------------------------------------------------------------
	UTF-8 character sequence validation
----------------------------------------------------------------------------*/

// Number of trail bytes still owed by the UTF-8 sequence in progress
// (0 means we are not inside a multi-byte sequence).  This single counter
// is the whole state of the UTF-8 validator.
static int _nUtf8Tb = 0;

#define BIT7(a)           ((a) & 0x80)
#define BIT6(a)           ((a) & 0x40)

/* N  U T F  8 */
/*----------------------------------------------------------------------------
	%%Function: NUtf8
	%%Contact: jpick

	UTF-8 doesn't require a state table for validation, just a count
	of the number of expected trail bytes.  See utf8lex.c for an
	explanation of this code.

	uch   -- next input byte.
	fEoi  -- non-zero when the caller has reached end-of-input.

	Returns
	   1  -- uch completed a character (a single ASCII byte, or the last
	         trail byte of a multi-byte sequence),
	   0  -- uch is illegal at this point (or input ended mid-sequence),
	  -1  -- uch was consumed as part of a sequence that is not yet
	         complete.

	_nUtf8Tb is left untouched when 0 is returned; presumably the caller
	resets the validator after an error -- confirm.
----------------------------------------------------------------------------*/
static int __inline NUtf8(UCHAR uch, BOOL fEoi)
{
	int cLeadBits;

	// BIT7(uch) == 0 implies single ASCII byte.
	// BIT6(uch) == 0 implies one of n trail bytes.
	// Otherwise, lead byte, with number of bits set
	//   up to first 0 equal to the total number bytes
	//   in the sequence.
	//
	// REVIEW: _nUtf8Tb *is* really the state of this
	//   validator -- use nState in structure?
	//
	if (fEoi && (_nUtf8Tb != 0))
		{
		return 0;				// unexpected end-of-input
		}

	// Note: when fEoi is set and no sequence is pending, uch is still
	// classified below exactly like any other byte.
	//
	if (BIT7(uch) == 0)
		{
		return (_nUtf8Tb != 0) ? 0 : 1;		// 0: unexpected single byte
		}

	if (BIT6(uch) == 0)
		{
		if (_nUtf8Tb == 0)		// unexpected trail byte
			return 0;
		if ((--_nUtf8Tb) == 0)	// that was the last trail byte
			return 1;
		return -1;
		}

	if (_nUtf8Tb != 0)			// unexpected lead byte
		return 0;

	// Count the leading 1 bits of the lead byte; that is the total
	// length of the sequence, lead byte included.
	//
	cLeadBits = 0;
	while (BIT7(uch) != 0)
		{
		uch <<= 1;
		cLeadBits++;
		}

	// 0xFE and 0xFF (seven or eight leading 1 bits) are never legal in
	// UTF-8, not even in the original six-byte form, so reject them
	// rather than waiting for six or seven trail bytes.
	//
	if (cLeadBits > 6)
		return 0;

	_nUtf8Tb = cLeadBits - 1;	// don't count lead byte
	return -1;
}


/*----------------------------------------------------------------------------
	Character Mapping Defs
----------------------------------------------------------------------------*/

// If caller wants us to check characters as part of validation
//
// Signature of a character check routine; it takes the encoding id
// (ICET, declared elsewhere) and returns a BOOL verdict.
//
typedef BOOL (*PFNCHECKCHAR)(ICET icetIn);

// Capacity, in bytes, of the character buffer in a CC.
#define cchMaxBuff		5

// Per-encoding character checker.  The static instances in this file
// initialize only the first three members, in declaration order, so the
// member order here must not change.
typedef struct _cc
{
	int nCp;						// code page
	int cchBuff;					// fill count of character buffer
	PFNCHECKCHAR pfnCheckChar;		// character check routine (0 when the CC is used only as a buffer)
	char rgchBuff[cchMaxBuff];		// character buffer
} CC;

// Character validation prototypes
//
static BOOL _FDbcsCheckChar(ICET icetIn);


// DBCS character checker structures
//
// One CC per DBCS encoding: {code page, buffer fill count (starts
// empty), check routine}.  rgchBuff is left to default static
// initialization.  All four share _FDbcsCheckChar as their checker.
//

// Big5
static CC _ccBig5 =
{
	nCpTaiwan,
	0,
	_FDbcsCheckChar,
};

// Gbk
static CC _ccGbk =
{
	nCpChina,
	0,
	_FDbcsCheckChar,
};

// ShiftJis
static CC _ccSJis =
{
	nCpJapan,
	0,
	_FDbcsCheckChar,
};

// Wansung
static CC _ccWan =
{
	nCpKorea,
	0,
	_FDbcsCheckChar,
};


// Character checker structures just used as buffers.
//
// These have no code page and no check routine (both 0); they exist
// so that the encoding has somewhere to buffer bytes, which
// ValidateInit() requires before it will enable common-character
// counting for an encoding.
//

// Euc-Jp
static CC _ccEucJp =
{
	0,
	0,
	0,
};

// Hz
static CC _ccHz =
{
	0,
	0,
	0,
};

// Utf7
static CC _ccUtf7 =
{
	0,
	0,
	0,
};

/*----------------------------------------------------------------------------
	Character Occurrence Counters
----------------------------------------------------------------------------*/

// If calling app wants us to track occurrences of common character
// sequences during validation (used only by auto-detection, so far).
//

// One common character sequence and its running hit count.
// _CountChars() compares each completed character, packed into a
// WCHAR by WchFromUchUch(), against the rgwch entries (presumably
// first byte in the high half, second byte in the low half -- confirm
// against the WchFromUchUch definition).
typedef struct _coce
{
	int   cHits;			// times the whole sequence has been seen
	short cwch;				// number of WCHARs used in rgwch
	WCHAR rgwch[2];			// the sequence itself
} COCE;

// Sequence counter for one encoding: the table of sequences plus the
// state of the match currently in progress.
typedef struct _coc
{
	BOOL  fMatching;		// partway through matching a sequence?
	short nCoceCurr;		// index in rgcoce of the sequence being matched
	short nCoceIndex;		// index in that sequence's rgwch to compare next
	int   ccoce;			// number of entries in rgcoce
	COCE *rgcoce;			// table of sequences
} COC;

// Big5
//
// Common two-character sequences.  Each entry is {cHits, cwch, rgwch};
// the trailing comment gives the romanized reading.
//
static COCE _rgcoceBig5[] =
{
	{0, 2, {(WCHAR)0xa7da, (WCHAR)0xadcc},},			// "wo men"
	{0, 2, {(WCHAR)0xa8e4, (WCHAR)0xb9ea},},			// "qi shi"
	{0, 2, {(WCHAR)0xa65d, (WCHAR)0xacb0},},			// "yin wei"
	{0, 2, {(WCHAR)0xb8ea, (WCHAR)0xb054},},			// "zi xun"
	{0, 2, {(WCHAR)0xb971, (WCHAR)0xb8a3},},			// "dian nao"
	{0, 2, {(WCHAR)0xbaf4, (WCHAR)0xb8f4},},			// "wang lu"
	{0, 2, {(WCHAR)0xbd75, (WCHAR)0xa457},},			// "xian shang"
	{0, 2, {(WCHAR)0xc577, (WCHAR)0xaaef},},			// "huan ying"
	{0, 2, {(WCHAR)0xa477, (WCHAR)0xb867},},			// "yi jing"
};

// Match state and table descriptor for the Big5 sequences above.
static COC _cocBig5 =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceBig5) / sizeof(_rgcoceBig5[0]), 		// ccoce
	_rgcoceBig5,										// rgcoce
};

// Euc-Cn
//
// Common two-character sequences.  Each entry is {cHits, cwch, rgwch};
// the trailing comment gives the romanized reading.  The values are
// the same as in the GBK table.
//
static COCE _rgcoceEucCn[] =
{
	{0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},},			// "ta men"
	{0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},},			// "wo men"
	{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},},			// "yin ci"
	{0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},},			// "shen mo"
	{0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},},			// "ru guo"
	{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},},			// "yin wei"
	{0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},},			// "suo yi"
	{0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},},			// "huan ying"
	{0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},},			// "wang luo"
	{0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},},			// "xin xi"
	{0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},},			// "ji guan"
};

// Match state and table descriptor for the Euc-Cn sequences above.
static COC _cocEucCn =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceEucCn) / sizeof(_rgcoceEucCn[0]), 	// ccoce
	_rgcoceEucCn,										// rgcoce
};

// Euc-Kr
//
// Each entry is {cHits, cwch, rgwch}.  Entries come in pairs sharing
// the same first WCHAR: the second WCHAR is either a single-byte
// value (0x0020 space, 0x002e period) or a double-byte value
// (presumably the double-byte form of the same punctuation -- TODO
// confirm).  The values are the same as in the Wansung table.
//
static COCE _rgcoceEucKr[] =
{
	{0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
	{0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
	{0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
};

// Match state and table descriptor for the Euc-Kr sequences above.
static COC _cocEucKr =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceEucKr) / sizeof(_rgcoceEucKr[0]), 	// ccoce
	_rgcoceEucKr,										// rgcoce
};

// EUC-Jp
//
// Common two-character sequences.  Each entry is {cHits, cwch, rgwch};
// the trailing comment gives the romanized reading ("." marks a
// sentence-ending period).
//
static COCE _rgcoceEucJp[] =
{
	{0, 2, {(WCHAR)0xa4c7, (WCHAR)0xa4b9},},			// "de su"
	{0, 2, {(WCHAR)0xa4c0, (WCHAR)0xa1a3},},			// "da ."
	{0, 2, {(WCHAR)0xa4a4, (WCHAR)0xa4eb},},			// "i ru"
	{0, 2, {(WCHAR)0xa4de, (WCHAR)0xa4b9},},			// "ma su"
	{0, 2, {(WCHAR)0xa4b7, (WCHAR)0xa4bf},},			// "shi ta"
	{0, 2, {(WCHAR)0xa4b9, (WCHAR)0xa4eb},},			// "su ru"
	{0, 2, {(WCHAR)0xa4bf, (WCHAR)0xa1a3},},			// "ta ."
	{0, 2, {(WCHAR)0xa4eb, (WCHAR)0xa1a3},},			// "ru ."
};

// Match state and table descriptor for the EUC-Jp sequences above.
static COC _cocEucJp =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceEucJp) / sizeof(_rgcoceEucJp[0]), 	// ccoce
	_rgcoceEucJp,										// rgcoce
};

// GBK
//
// Common two-character sequences.  Each entry is {cHits, cwch, rgwch};
// the trailing comment gives the romanized reading.  The values are
// the same as in the Euc-Cn table.
//
static COCE _rgcoceGbk[] =
{
	{0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},},			// "ta men"
	{0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},},			// "wo men"
	{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},},			// "yin ci"
	{0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},},			// "shen mo"
	{0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},},			// "ru guo"
	{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},},			// "yin wei"
	{0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},},			// "suo yi"
	{0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},},			// "huan ying"
	{0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},},			// "wang luo"
	{0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},},			// "xin xi"
	{0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},},			// "ji guan"
};

// Match state and table descriptor for the GBK sequences above.
static COC _cocGbk =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceGbk) / sizeof(_rgcoceGbk[0]), 		// ccoce
	_rgcoceGbk,											// rgcoce
};

// Shift-JIS
//
// Common two-character sequences.  Each entry is {cHits, cwch, rgwch};
// the trailing comment gives the romanized reading ("." marks a
// sentence-ending period).  Same readings, in the same order, as the
// EUC-Jp table.
//
static COCE _rgcoceSJis[] =
{
	{0, 2, {(WCHAR)0x82c5, (WCHAR)0x82b7},},			// "de su"
	{0, 2, {(WCHAR)0x82be, (WCHAR)0x8142},},			// "da ."
	{0, 2, {(WCHAR)0x82a2, (WCHAR)0x82e9},},			// "i ru"
	{0, 2, {(WCHAR)0x82dc, (WCHAR)0x82b7},},			// "ma su"
	{0, 2, {(WCHAR)0x82b5, (WCHAR)0x82bd},},			// "shi ta"
	{0, 2, {(WCHAR)0x82b7, (WCHAR)0x82e9},},			// "su ru"
	{0, 2, {(WCHAR)0x82bd, (WCHAR)0x8142},},			// "ta ."
	{0, 2, {(WCHAR)0x82e9, (WCHAR)0x8142},},			// "ru ."
};

// Match state and table descriptor for the Shift-JIS sequences above.
static COC _cocSJis =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceSJis) / sizeof(_rgcoceSJis[0]), 		// ccoce
	_rgcoceSJis,										// rgcoce
};

// Wansung
//
// REVIEW: bug (1/2 this table is being ignored)
//
// Each entry is {cHits, cwch, rgwch}.  Entries come in pairs sharing
// the same first WCHAR: the second WCHAR is either a single-byte
// value (0x0020 space, 0x002e period) or a double-byte value
// (presumably the double-byte form of the same punctuation -- TODO
// confirm).  The values are the same as in the Euc-Kr table.
//
static COCE _rgcoceWan[] =
{
	{0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
	{0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
	{0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
	{0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
	{0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
};

// Match state and table descriptor for the Wansung sequences above.
static COC _cocWan =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceWan) / sizeof(_rgcoceWan[0]), 		// ccoce
	_rgcoceWan,											// rgcoce
};

// Hz
//
// The two HZ mode-switch escape sequences, stored one byte per WCHAR.
// Each entry is {cHits, cwch, rgwch}.  Both entries start with the
// same WCHAR (0x007e, '~').
//
static COCE _rgcoceHz[] =
{
	{0, 2, {(WCHAR)0x007e, (WCHAR)0x007b},},			// ~{
	{0, 2, {(WCHAR)0x007e, (WCHAR)0x007d},},            //  ~}
};

// Match state and table descriptor for the Hz sequences above.
static COC _cocHz =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceHz) / sizeof(_rgcoceHz[0]), 		    // ccoce
	_rgcoceHz,											// rgcoce
};

// Utf7
//
// A single sequence, stored one byte per WCHAR: '+' followed by '-'.
// The entry is {cHits, cwch, rgwch}.
//
static COCE _rgcoceUtf7[] =
{
	{0, 2, {(WCHAR)0x002b, (WCHAR)0x002d},},			// +-
};

// Match state and table descriptor for the Utf7 sequence above.
static COC _cocUtf7 =
{
	fFalse,												// fMatching
	0,													// nCoceCurr
	0,													// nCoceIndex
	sizeof(_rgcoceUtf7) / sizeof(_rgcoceUtf7[0]), 		// ccoce
	_rgcoceUtf7,										// rgcoce
};

// Character counter prototype.  Called by NValidateUch() after a
// character has been buffered, when grfCountCommonChars is set for
// the encoding.
//
static void _CountChars(ICET icetIn);


/*----------------------------------------------------------------------------
	Main Definitions
----------------------------------------------------------------------------*/

// Structure to keep state, state machine and other associated
// information for a given character set "parse stream."
//
typedef struct _vr
{
	BOOL  fInUse;							// validation supported for this encoding?
	DWORD dwFlags;							// parse options stored by ValidateInit()
	int   nState;							// current state (ST0 at start; ERR is sticky until reset)
	CC   *ccCheck;							// character buffer/checker, or 0 if none
	signed char (*rgchNextState)[nTokens];	// transition table, indexed [state][token]; 0 if none
} VR;

// Array of validation records.  We allow multiple, active parse
// streams for auto-detect -- this way, it can concurrently keep
// a parse stream for each encoding type, without needing to read
// its input multiple times.
//
// Indexed by ICET; each row is {fInUse, dwFlags, nState, ccCheck,
// rgchNextState} and the trailing comment names the ICET value the
// row is meant for (the row order presumably has to match the ICET
// enumeration, which is declared elsewhere).  Rows with fInUse ==
// fFalse are not validated: NValidateUch() accepts every byte for
// them.  icetUtf8 has no state table because NValidateUch() hands
// UTF-8 to NUtf8() instead.
//
static VR _mpicetvr[icetCount] =
{
	{fTrue,  0, ST0, 0,         _rgchEucKrCnNextState,},		// icetEucCn
	{fTrue,  0, ST0, &_ccEucJp, _rgchEucJpNextState,},			// icetEucJp
	{fTrue,  0, ST0, 0,         _rgchEucKrCnNextState,},		// icetEucKr
	{fTrue,  0, ST0, 0,         _rgchEucTwNextState,},			// icetEucTw
	{fFalse, 0, ST0, 0,         0,},							// icetIso2022Cn
	{fFalse, 0, ST0, 0,         0,},							// icetIso2022Jp
	{fFalse, 0, ST0, 0,         0,},							// icetIso2022Kr
	{fFalse, 0, ST0, 0,         0,},							// icetIso2022Tw
	{fTrue,  0, ST0, &_ccBig5,  _rgchBig5NextState,},			// icetBig5
	{fTrue,  0, ST0, &_ccGbk,   _rgchGbkWanNextState,},			// icetGbk
	{fTrue,  0, ST0, &_ccHz,    _rgchHzNextState,},             // icetHz
	{fTrue,  0, ST0, &_ccSJis,  _rgchSJisNextState,},			// icetShiftJis
	{fTrue,  0, ST0, &_ccWan,   _rgchGbkWanNextState,},			// icetWansung
	{fTrue,  0, ST0, &_ccUtf7,  _rgchUtf7NextState,},           // icetUtf7
	{fTrue,  0, ST0, 0,        0,},								// icetUtf8
};

// Array of character sequence counters, one per encoding type.
//
// Indexed by ICET, in the same row order as _mpicetvr; 0 means no
// common-sequence table exists for that encoding.
//
// NOTE(review): ValidateInit() only enables counting when the
// encoding's _mpicetvr row also has a ccCheck buffer.  The icetEucCn
// and icetEucKr rows there have ccCheck == 0, so the counters listed
// for them here are never enabled -- confirm whether that is intended.
//
static COC *_mpicetlpcoc[icetCount] =
{
	&_cocEucCn,			// icetEucCn
	&_cocEucJp,			// icetEucJp
	&_cocEucKr,			// icetEucKr
	0,					// icetEucTw
	0,					// icetIso2022Cn
	0,					// icetIso2022Jp
	0,					// icetIso2022Kr
	0,					// icetIso2022Tw
	&_cocBig5,			// icetBig5
	&_cocGbk,			// icetGbk
	&_cocHz,            // icetHz
	&_cocSJis,			// icetShiftJis
	&_cocWan,			// icetWansung
	&_cocUtf7,          // icetUtf7
	0,					// icetUtf8
};


/* V A L I D A T E  I N I T */
/*----------------------------------------------------------------------------
	%%Function: ValidateInit
	%%Contact: jpick

	Prepare the parse stream of one character set for a new run: put
	its state machine back in ST0 (the start state) and remember the
	caller's parsing options, after dropping any option the encoding
	cannot honor.

	icetIn  - encoding whose parse stream is initialized.
	dwFlags - requested options (grfCountCommonChars,
	          grfValidateCharMapping).
----------------------------------------------------------------------------*/
void ValidateInit(ICET icetIn, DWORD dwFlags)
{
	// Common-sequence counting needs both a sequence table and a
	// character buffer for the encoding.  Without either one, quietly
	// drop the request; with both, start all hit counts from zero and
	// forget any match that was in progress.
	//
	if ((dwFlags & grfCountCommonChars) != 0)
		{
		if (!_mpicetlpcoc[icetIn] || !_mpicetvr[icetIn].ccCheck)
			{
			dwFlags &= ~grfCountCommonChars;
			}
		else
			{
			int iCoce = 0;
			while (iCoce < _mpicetlpcoc[icetIn]->ccoce)
				{
				_mpicetlpcoc[icetIn]->rgcoce[iCoce].cHits = 0;
				iCoce++;
				}
			_mpicetlpcoc[icetIn]->fMatching = fFalse;
			}
		}

	// Encodings we do not validate have no further state to set up
	// (their stored flags are left untouched).
	//
	if (!_mpicetvr[icetIn].fInUse)
		return;

	_mpicetvr[icetIn].nState = ST0;

	// Character mapping validation needs a character checker with a
	// code page the system knows about.  When a checker exists, its
	// buffer starts out empty.
	//
	if (!_mpicetvr[icetIn].ccCheck)
		{
		dwFlags &= ~grfValidateCharMapping;
		}
	else
		{
		_mpicetvr[icetIn].ccCheck->cchBuff = 0;

		if ((dwFlags & grfValidateCharMapping) != 0 &&
				!IsValidCodePage(_mpicetvr[icetIn].ccCheck->nCp))
			{
			dwFlags &= ~grfValidateCharMapping;
			}
		}

	_mpicetvr[icetIn].dwFlags = dwFlags;

	// UTF-8 keeps its state in the trail byte counter, not in nState.
	//
	if (icetIn == icetUtf8)
		_nUtf8Tb = 0;
}


/* V A L I D A T E  I N I T  A L L */
/*----------------------------------------------------------------------------
	%%Function: ValidateInitAll
	%%Contact: jpick

	Run ValidateInit() with the same options for every character set
	whose validation record is in use.  Character sets that are not
	validated are skipped.
----------------------------------------------------------------------------*/
void ValidateInitAll(DWORD dwFlags)
{
	int icet;

	for (icet = 0; icet < icetCount; icet++)
		{
		if (_mpicetvr[icet].fInUse)
			ValidateInit((ICET)icet, dwFlags);
		}
}


/* V A L I D A T E  R E S E T */
/*----------------------------------------------------------------------------
	%%Function: ValidateReset
	%%Contact: jpick

	Put the parse stream of one character set back at its starting
	point (state ST0, empty character buffer, zeroed hit counts)
	while keeping the options stored by ValidateInit().
----------------------------------------------------------------------------*/
void ValidateReset(ICET icetIn)
{
	// The count-common-chars flag only survives ValidateInit() when
	// the counter structures exist, so they can be used unchecked.
	//
	if ((_mpicetvr[icetIn].dwFlags & grfCountCommonChars) != 0)
		{
		int iCoce = 0;
		while (iCoce < _mpicetlpcoc[icetIn]->ccoce)
			{
			_mpicetlpcoc[icetIn]->rgcoce[iCoce].cHits = 0;
			iCoce++;
			}
		_mpicetlpcoc[icetIn]->fMatching = fFalse;
		}

	// Everything below only applies to encodings we validate.
	//
	if (_mpicetvr[icetIn].fInUse)
		{
		_mpicetvr[icetIn].nState = ST0;

		if (_mpicetvr[icetIn].ccCheck)
			_mpicetvr[icetIn].ccCheck->cchBuff = 0;

		// UTF-8 keeps its state in the trail byte counter.
		//
		if (icetIn == icetUtf8)
			_nUtf8Tb = 0;
		}
}


/* V A L I D A T E  R E S E T  A L L */
/*----------------------------------------------------------------------------
	%%Function: ValidateResetAll
	%%Contact: jpick

	Reset the state machines for all character sets (set their states to
	ST0 (the start state)).
----------------------------------------------------------------------------*/
void ValidateResetAll(void)
{
	int i;

	for (i=0 ; i < icetCount; i++)
		{
		if (!_mpicetvr[i].fInUse)
			continue;
		ValidateReset((ICET)i);
		}
}


/* N  V A L I D A T E  U C H */
/*----------------------------------------------------------------------------
	%%Function: NValidateUch
	%%Contact: jpick

	Single step parser, takes one transition through the state table
	for the given character set.  Current state is kept for each
	character set's parse stream.

	icetIn - character set whose parse stream is advanced.
	uch    - next input byte (ignored when fEoi is set; zero bytes
	         that are not end-of-input are skipped).
	fEoi   - nonzero when the input has ended.

	Routine returns -1 if it does not reach a final state on this
	transition; 0 if transitioned to ERR(or) and 1 if transitioned
	to ACC(ept).  A stream that is already in ERR keeps returning 0
	until it is reset.  Character sets that are not validated always
	return 1.

	If final state is ACC(ept), machine reset to ST0 (start state).
	(i.e., there's no need to manually reset on ACC(ept)).

	Routine is also a convenient collection point for certain
	statistics (currently only the counting of occurrences of common
	character sequences (defined for character sets, above)).
----------------------------------------------------------------------------*/
int NValidateUch(ICET icetIn, UCHAR uch, BOOL fEoi)
{
	int nToken;
	int rc;

	// If not validating this icet, nothing to do (so say
	// we accept the character).
	//
	if (!_mpicetvr[icetIn].fInUse)
		return 1;
	if (_mpicetvr[icetIn].nState == ERR)
		return 0;

	// Ignore all zeros in the detection file.
	//
	if (!uch && !fEoi)
		goto _LRet;

	// Hack -- want to validate UTF-8, but don't need a state
	// table to do so.  Treat as special case here and return.
	//
	if (icetIn == icetUtf8)
		{
		if ((rc = NUtf8(uch, fEoi)) == 0)
			_mpicetvr[icetIn].nState = ERR;
		return rc;
		}

	// Classify the character...
	//
	nToken = fEoi ? ateof : _rgchCharClass[uch];

	// First obtain a real number for a state based on the counting state...
	// Then do the transition...
	//
	_mpicetvr[icetIn].nState = (_mpicetvr[icetIn].rgchNextState)[TstNotCountingFromTst(_mpicetvr[icetIn].nState)][nToken];

	// If we're in an error state or have seen end-of-input, return.
	//
	if ((_mpicetvr[icetIn].nState == ERR) || (nToken == ateof))
		goto _LRet;

	// Are we to do character mapping validation?  (If this flag
	// is set, we're guaranteed to have a character checker
	// structure).  How about character occurrence counting?
	// (This also guarantees us a character checker structure).
	//
	if (!(_mpicetvr[icetIn].dwFlags & grfValidateCharMapping) &&
			!(_mpicetvr[icetIn].dwFlags & grfCountCommonChars))
		{
		goto _LRet;
		}

	// Buffer the current character (if it's Utf7 or Hz, buffer only
	// if we are in the counting state).  The present tables should
	// never produce more bytes than the buffer holds, but the fill
	// count is driven by input, so never write past the end of the
	// buffer: a byte that does not fit is not stored.  The state
	// table stays the sole judge of whether the input is well formed.
	//
	if (FTstCounting(_mpicetvr[icetIn].nState) || (icetIn != icetHz && icetIn != icetUtf7))
		{
		if (_mpicetvr[icetIn].ccCheck->cchBuff < (int)sizeof(_mpicetvr[icetIn].ccCheck->rgchBuff))
			_mpicetvr[icetIn].ccCheck->rgchBuff[_mpicetvr[icetIn].ccCheck->cchBuff++] = uch;
		}

	// Return if we are not in the counting state
	//
	if (!(FTstCounting(_mpicetvr[icetIn].nState)))
		goto _LRet;

	// Call the character checker, if we have one.
	//
	if (_mpicetvr[icetIn].dwFlags & grfValidateCharMapping)
		{
		if (_mpicetvr[icetIn].ccCheck->pfnCheckChar && !(_mpicetvr[icetIn].ccCheck->pfnCheckChar)(icetIn))
			{
			_mpicetvr[icetIn].nState = ERR;
			goto _LRet;
			}
		}

	// If we're counting common characters, do so now.
	//
	if (_mpicetvr[icetIn].dwFlags & grfCountCommonChars)
		_CountChars(icetIn);

	// Reset the character checker/counter buffer.
	//
	_mpicetvr[icetIn].ccCheck->cchBuff = 0;

_LRet:

	// Return the appropriate code.
	//
	switch (_mpicetvr[icetIn].nState)
		{
		case ERR:
			return 0;
		case ACC:
			_mpicetvr[icetIn].nState = ST0;			// Reset
			return 1;
		default:
			return -1;								// need more data
		}
}


/* F  V A L I D A T E  C H A R  C O U N T */
/*----------------------------------------------------------------------------
	%%Function: FValidateCharCount
	%%Contact: jpick

	Report how many common character sequences have been matched for
	the given character set.

	icetIn   - character set to report on.
	lpcMatch - receives the total number of hits; left untouched when
	           the routine returns fFalse.

	Returns fFalse when no count is being kept for the character set
	(no sequence table, character set not validated, or counting not
	requested by the caller); otherwise stores the total and returns
	fTrue.

	(Hits are tracked per sequence in case they are ever weighted
	differently; only the sum is reported here.)
----------------------------------------------------------------------------*/
BOOL FValidateCharCount(ICET icetIn, int *lpcMatch)
{
	COC *lpcocCurr = _mpicetlpcoc[icetIn];
	VR *lpvrCurr = &_mpicetvr[icetIn];
	int iCoce;
	int cTotal = 0;

	// Nothing to report unless a table exists, the character set is
	// validated and counting was switched on.
	//
	if (!lpcocCurr)
		return fFalse;
	if (!lpvrCurr->fInUse)
		return fFalse;
	if (!(lpvrCurr->dwFlags & grfCountCommonChars))
		return fFalse;

	// Add up the hits of every sequence.
	//
	for (iCoce = 0; iCoce < lpcocCurr->ccoce; iCoce++)
		cTotal += lpcocCurr->rgcoce[iCoce].cHits;

	*lpcMatch = cTotal;
	return fTrue;
}


/* _  C O U N T  C H A R S */
/*----------------------------------------------------------------------------
	%%Function: _CountChars
	%%Contact: jpick

	We've just completed a legal character for the given character
	set (its bytes are in the character buffer).  Match it against
	the set of special character sequences for the character set, if
	we have them.  Update match counts and current match indices
	(since sequences can span multiple legal characters) as needed.

	Several sequences may start with the same character.  The lead
	character selects the first such sequence; when a later character
	arrives, that sequence and every following one sharing the part
	matched so far are tried, so none of them is shadowed by the
	first.

	A character that completes a sequence is not reused as the lead
	of the next one (matches do not overlap).
----------------------------------------------------------------------------*/
static void _CountChars(ICET icetIn)
{
	WCHAR wch;
	int i;
	int j;
	int nCurr;
	int nIndex;

	// Anything to do?
	//
	if (!_mpicetlpcoc[icetIn] || !_mpicetvr[icetIn].ccCheck)
		return;

	// Build the WCHAR from the last one or two buffered bytes.
	//
	switch (_mpicetvr[icetIn].ccCheck->cchBuff)
		{
		case 1:
			wch = WchFromUchUch(0, _mpicetvr[icetIn].ccCheck->rgchBuff[0]);
			break;
		case 2:
			wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[0],
								_mpicetvr[icetIn].ccCheck->rgchBuff[1]);
			break;
		case 3:
			wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[1],
								_mpicetvr[icetIn].ccCheck->rgchBuff[2]);
			break;
		case 4:
			wch = WchFromUchUch(_mpicetvr[icetIn].ccCheck->rgchBuff[2],
								_mpicetvr[icetIn].ccCheck->rgchBuff[3]);
			break;
		default:
			return;
		}

	// Are we currently working on matching a sequence?  If so, look
	// for a sequence that continues with this character.  Candidates
	// are the current sequence and any later one that agrees with it
	// on everything matched so far (earlier ones cannot qualify, or
	// they would have been selected instead).
	//
	if (_mpicetlpcoc[icetIn]->fMatching)
		{
		nCurr = _mpicetlpcoc[icetIn]->nCoceCurr;
		nIndex = _mpicetlpcoc[icetIn]->nCoceIndex;

		for (i = nCurr; i < _mpicetlpcoc[icetIn]->ccoce; i++)
			{
			// Too short to have a character at this position, or a
			// different character there?
			//
			if (_mpicetlpcoc[icetIn]->rgcoce[i].cwch <= nIndex)
				continue;
			if (_mpicetlpcoc[icetIn]->rgcoce[i].rgwch[nIndex] != wch)
				continue;

			// Must share what has been matched already.
			//
			for (j = 0; j < nIndex; j++)
				{
				if (_mpicetlpcoc[icetIn]->rgcoce[i].rgwch[j] !=
						_mpicetlpcoc[icetIn]->rgcoce[nCurr].rgwch[j])
					break;
				}
			if (j < nIndex)
				continue;

			_mpicetlpcoc[icetIn]->nCoceCurr = (short)i;
			_mpicetlpcoc[icetIn]->nCoceIndex = (short)(nIndex + 1);

			// Did we just match the entire sequence?  If so, increment
			// the hit count and reset.
			//
			if (nIndex + 1 >= _mpicetlpcoc[icetIn]->rgcoce[i].cwch)
				{
				++_mpicetlpcoc[icetIn]->rgcoce[i].cHits;
				_mpicetlpcoc[icetIn]->fMatching = fFalse;
				}

			// All done.
			//
			return;
			}
		}

	// If we need to start matching again (either because we're not
	// currently in a sequence or because a 2nd or later character
	// didn't match), try the current character as a lead character.
	//
	// REVIEW: wrong for sequences longer than 2 wchars.
	//
	for (i = 0; i < _mpicetlpcoc[icetIn]->ccoce; i++)
		{
		if (wch == _mpicetlpcoc[icetIn]->rgcoce[i].rgwch[0])
			break;
		}

	// Any luck?
	//
	if (i >= _mpicetlpcoc[icetIn]->ccoce)
		{
		_mpicetlpcoc[icetIn]->fMatching = fFalse;
		return;
		}

	// Store the matching state.
	//
	_mpicetlpcoc[icetIn]->fMatching = fTrue;
	_mpicetlpcoc[icetIn]->nCoceCurr = (short)i;
	_mpicetlpcoc[icetIn]->nCoceIndex = 1;			// where to look next
}


/* _  D B C S  C H E C K  C H A R */
/*----------------------------------------------------------------------------
	%%Function: _FDbcsCheckChar
	%%Contact: jpick

	Character validator for DBCS formats.  Converts the buffered
	multi-byte sequence to Unicode using the code page of the
	character set and rejects it only when the system reports that
	no Unicode translation exists.  (No conversion back is done.)

	Returns fFalse for an untranslatable character; fTrue otherwise,
	including for one byte characters (not checked at all) and for
	conversions that fail for any other reason.

	REVIEW:  Slow, slow, slow -- is conversion *to* Unicode a
	sufficient test, or would a round-trip check gain anything?
----------------------------------------------------------------------------*/
static WCHAR _rgwBuff[10];		// receives the converted character
static UCHAR _rgchBuff[30];		// not referenced by this routine

static BOOL _FDbcsCheckChar(ICET icetIn)
{
	int cwchOut;

	// skip 1 byte characters, mostly uninteresting (Shift-Jis ??).
	//
	if (_mpicetvr[icetIn].ccCheck->cchBuff == 1)
		return fTrue;

	cwchOut = MultiByteToWideChar(_mpicetvr[icetIn].ccCheck->nCp,
								  MB_ERR_INVALID_CHARS,
								  _mpicetvr[icetIn].ccCheck->rgchBuff,
								  _mpicetvr[icetIn].ccCheck->cchBuff,
								  _rgwBuff, 10);

	// Only a definite "no translation" verdict fails the character;
	// the last error is consulted only when the conversion failed.
	//
	if (cwchOut == 0 && GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
		return fFalse;

	return fTrue;  // probably not always right
}