1325 lines
53 KiB
C++
1325 lines
53 KiB
C++
/*----------------------------------------------------------------------------
|
|
%%File: validate.c
|
|
%%Unit: fechmap
|
|
%%Contact: jpick
|
|
|
|
"Rolling" state machines that allow interactive verification of
|
|
DBCS and EUC files. Currently, separate tables are stored for
|
|
each encoding so that the state machines can be run in parallel
|
|
(i.e., multiple parse streams).
|
|
|
|
These routines are used by auto-detection and if caller wants
|
|
conversion routines to return errors on invalid characters.
|
|
|
|
Following is a description of the structure of the DBCS and EUC
|
|
encodings handled by this module. This information is taken from
|
|
CJK.INF (maintained by Ken Lunde, author of _Understanding Japanese
|
|
Information Processing_). This information governs the structure
|
|
of the class and validation state tables used in this module.
|
|
|
|
Big5
|
|
Two-byte Standard Characters Encoding Ranges
|
|
first byte range 0xA1-0xFE
|
|
second byte ranges 0x40-0x7E, 0xA1-0xFE
|
|
One-byte Characters Encoding Range
|
|
ASCII 0x21-0x7E
|
|
|
|
GBK
|
|
Two-byte Standard Characters Encoding Ranges
|
|
first byte range 0x81-0xFE
|
|
second byte ranges 0x40-0x7E and 0x80-0xFE
|
|
One-byte Characters Encoding Range
|
|
ASCII 0x21-0x7E
|
|
|
|
HZ (information from HZ spec Fung F. Lee (lee@umunhum.stanford.edu))
|
|
One-byte characters Encoding Ranges
|
|
first GB byte range 0x21-0x77
|
|
second GB byte range 0x21-0x7E
|
|
ASCII 0x21-0x7E
|
|
Mode switching Encoding sequence
|
|
escape sequence from ASCII to GB 0x7E followed by 0x7B ("~{")
|
|
escape sequence from GB to ASCII 0x7E followed by 0x7D ("~}")
|
|
line continuation marker 0x7E followed by 0x0A
|
|
(Note: ASCII mode is the default mode)
|
|
|
|
Shift-Jis
|
|
Two-byte Standard Characters Encoding Ranges
|
|
first byte ranges 0x81-0x9F, 0xE0-0xEF
|
|
second byte ranges 0x40-0x7E, 0x80-0xFC
|
|
Two-byte User-defined Characters Encoding Ranges
|
|
first byte range 0xF0-0xFC
|
|
second byte ranges 0x40-0x7E, 0x80-0xFC
|
|
One-byte Characters Encoding Range
|
|
Half-width katakana 0xA1-0xDF
|
|
ASCII/JIS-Roman 0x21-0x7E
|
|
|
|
Wansung
|
|
Two-byte Standard Characters Encoding Ranges
|
|
first byte range 0x81-0xFE
|
|
second byte ranges 0x40-0x7E and 0x80-0xFE
|
|
One-byte Characters Encoding Range
|
|
ASCII 0x21-0x7E
|
|
|
|
EUC-Cn
|
|
Code set 0 (ASCII or GB 1988-89): 0x21-0x7E
|
|
Code set 1 (GB 2312-80): 0xA1A1-0xFEFE
|
|
Code set 2: unused
|
|
Code set 3: unused
|
|
|
|
EUC-Jp
|
|
Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E
|
|
Code set 1 (JIS X 0208): 0xA1A1-0xFEFE
|
|
Code set 2 (half-width katakana): 0x8EA1-0x8EDF
|
|
Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE
|
|
|
|
EUC-Kr
|
|
Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E
|
|
Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE
|
|
Code set 2: unused
|
|
Code set 3: unused
|
|
|
|
EUC-Tw
|
|
Code set 0 (ASCII): 0x21-0x7E
|
|
Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE
|
|
Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE
|
|
Code set 3: unused
|
|
|
|
UTF-7 (information from the RFC2152 by D.Goldsmith)
|
|
One-byte characters Encoding Ranges
|
|
Direct and Optionally direct 0x21-0x2A, 0x2C-0x5B,
|
|
0x5D-0x60, 0x7B-0x7D
|
|
0x09, 0x0A, 0x0D, 0x20
|
|
Modified Base64 0x2B, 0x2F-0x39, 0x41-0x5A, 0x61-0x7A
|
|
Mode switching
|
|
escape sequence from D/O to M. Base64 0x2B
|
|
escape sequence from M. Base64 to D/O 0x2D (or any control character)
|
|
|
|
----------------------------------------------------------------------------*/
|
|
|
|
#include <stdio.h>
|
|
#include <stddef.h>
|
|
|
|
#include "private.h"
|
|
#include "fechmap_.h"
|
|
#include "lexint_.h"
|
|
|
|
|
|
/*----------------------------------------------------------------------------
|
|
Common Defs for all Sequence Validation
|
|
----------------------------------------------------------------------------*/
|
|
|
|
// Characters are broken down into ranges -- the smallest ranges that
|
|
// are treated as important by either EUC or DBCS (all flavors). In
|
|
// some cases, the smallest range is a single character. It saves
|
|
// some space to avoid having two class tables (even though more states
|
|
// are added to the state machines), so both encodings share these
|
|
// tokens.
|
|
|
|
// Common Tokens
|
|
//
|
|
#define ollow 0 // "other" legal low ascii character
|
|
#define x000a 1 // 0x0a ("\n")
|
|
#define x212a 2 // characters in range 0x21-0x2a
|
|
#define x002b 3 // 0x2b ("+")
|
|
#define x002c 4 // 0x2c (",")
|
|
#define x002d 5 // 0x2d ("-")
|
|
#define x002e 6 // 0x2e (".")
|
|
#define x2f39 7 // characters in range 0x2f-0x39
|
|
#define x3a3f 8 // characters in range 0x3a-0x3f
|
|
#define x0040 9 // 0x40
|
|
#define x415a 10 // characters in range 0x41-0x5a
|
|
#define x005b 11 // 0x5b ("[")
|
|
#define x005c 12 // 0x5c ("\")
|
|
#define x5d60 13 // characters in range 0x5d-0x60
|
|
#define x6177 14 // characters in range 0x61-0x77
|
|
#define x787a 15 // characters in range 0x78-0x7a
|
|
#define x007b 16 // 0x7b ("{")
|
|
#define x007c 17 // 0x7c ("|")
|
|
#define x007d 18 // 0x7d ("}")
|
|
#define x007e 19 // 0x7e ("~")
|
|
#define x007f 20 // 0x7f (DEL)
|
|
#define x0080 21 // 0x80
|
|
#define x818d 22 // characters in range 0x81-0x8d
|
|
#define x008e 23 // 0x8e
|
|
#define x008f 24 // 0x8f
|
|
#define x909f 25 // characters in range 0x90-0x9f
|
|
#define x00a0 26 // 0xa0
|
|
#define xa1b0 27 // characters in range 0xa1-0xb0
|
|
#define xb1df 28 // characters in range 0xb1-0xdf
|
|
#define xe0ef 29 // characters in range 0xe0-0xef
|
|
#define xf0fc 30 // characters in range 0xf0-0xfc
|
|
#define xfdfe 31 // characters in range 0xfd-0xfe
|
|
|
|
#define ateof 32 // end-of-file
|
|
#define other 33 // character not covered by above tokens
|
|
|
|
#define nTokens 34 //
|
|
|
|
// Class table
|
|
//
|
|
static char _rgchCharClass[256] =
|
|
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
|
{
|
|
// 0 nul soh stx etx eot enq ack bel bs tab lf vt np cr so si 0
|
|
other, other, other, other, other, other, other, other, other, ollow, x000a, other, other, ollow, other, other,
|
|
|
|
// 1 dle dc1 dc2 dc3 dc4 nak syn etb can em eof esc fs gs rs us 1
|
|
other, other, other, other, other, other, other, other, other, other, ollow, other, other, other, other, other,
|
|
|
|
// 2 sp ! " # $ % & ' ( ) * + , - . / 2
|
|
ollow, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x212a, x002b, x002c, x002d, x002e, x2f39,
|
|
|
|
// 3 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 3
|
|
x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x2f39, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f, x3a3f,
|
|
|
|
// 4 @ A B C D E F G H I J K L M N O 4
|
|
x0040, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a,
|
|
|
|
// 5 P Q R S T U V W X Y Z [ \ ] ^ _ 5
|
|
x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x415a, x005b, x005c, x5d60, x5d60, x5d60,
|
|
|
|
// 6 ` a b c d e f g h i j k l m n o 6
|
|
x5d60, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177,
|
|
|
|
// 7 p q r s t u v w x y z { | } ~ del 7
|
|
x6177, x6177, x6177, x6177, x6177, x6177, x6177, x6177, x787a, x787a, x787a, x007b, x007c, x007d, x007e, x007f,
|
|
|
|
// 8 8
|
|
x0080, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x818d, x008e, x008f,
|
|
|
|
// 9 9
|
|
x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f, x909f,
|
|
|
|
// a a
|
|
x00a0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0, xa1b0,
|
|
|
|
// b b
|
|
xa1b0, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
|
|
|
|
// c c
|
|
xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
|
|
|
|
// d d
|
|
xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df, xb1df,
|
|
|
|
// e e
|
|
xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef, xe0ef,
|
|
|
|
// f f
|
|
xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xf0fc, xfdfe, xfdfe, other,
|
|
|
|
// 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
|
};
|
|
|
|
|
|
// Common States -- All SM's use these
|
|
//
|
|
#define ACC 0x4e
|
|
#define ERR 0x7f
|
|
|
|
// Other States -- All SM's use some of these, not all use all
|
|
//
|
|
#define ST0 0x00
|
|
#define ST0c 0x40
|
|
#define ST1 0x01
|
|
#define ST1c 0x41
|
|
#define ST2 0x02
|
|
#define ST2c 0x42
|
|
#define ST3 0x03
|
|
#define ST3c 0x43
|
|
#define ST4 0x04
|
|
#define ST4c 0x44
|
|
|
|
// Each state can have a corresponding counting state, i.e. a state
|
|
// with the same transitions but during which we look for special sequences.
|
|
//
|
|
#define FTstCounting(tst) (((tst) & 0x40) != 0) // True when bit 0x40 of the state is set; note that both ACC (0x4e) and ERR (0x7f) have this bit set
|
|
#define TstNotCountingFromTst(tst) ((tst) & 0x3f) // Obtain the real state from the counting
|
|
|
|
/*----------------------------------------------------------------------------
|
|
DBCS character sequence validation
|
|
----------------------------------------------------------------------------*/
|
|
|
|
#define nSJisStates 2
|
|
static signed char _rgchSJisNextState[nSJisStates][nTokens] =
|
|
{
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 e a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
|
|
// DBCS State 0 -- start (look for legal single byte or lead byte)
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ERR, ACC, ACC, ST1, ST1, ERR, ACC, ERR,
|
|
|
|
// DBCS State 1 -- saw lead byte, need legal trail byte
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR,
|
|
|
|
};
|
|
|
|
#define nBig5States 2
|
|
static signed char _rgchBig5NextState[nBig5States][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// DBCS State 0 -- start (look for legal single byte or lead byte)
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
|
|
|
|
// DBCS State 1 -- saw lead byte, need legal trail byte
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
|
|
|
|
};
|
|
|
|
#define nGbkWanStates 2
|
|
static signed char _rgchGbkWanNextState[nGbkWanStates][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// DBCS State 0 -- start (look for legal single byte or lead byte)
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
|
|
|
|
// DBCS State 1 -- saw lead byte, need legal trail byte
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
|
|
|
|
|
|
};
|
|
|
|
|
|
/*----------------------------------------------------------------------------
|
|
EUC character sequence validation
|
|
----------------------------------------------------------------------------*/
|
|
|
|
#define nEucJpStates 4
|
|
static signed char _rgchEucJpNextState[nEucJpStates][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// EUC State 0 -- start
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ST2, ST3, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
|
|
|
|
// EUC State 1 -- saw a1fe, need one more a1fe
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
|
|
|
|
// EUC State 2 -- saw 8e, need a1df
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// EUC State 3 -- saw 8f, need 2 a1fe
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ERR, ERR,
|
|
|
|
};
|
|
|
|
#define nEucKrCnStates 2
|
|
static signed char _rgchEucKrCnNextState[nEucKrCnStates][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// EUC State 0 -- start
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
|
|
|
|
// EUC State 1 -- saw a1fe, need one more a1fe
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
|
|
|
|
};
|
|
|
|
#define nEucTwStates 4
|
|
static signed char _rgchEucTwNextState[nEucTwStates][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// EUC State 0 -- start
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ST2, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ACC, ERR,
|
|
|
|
// EUC State 1 -- saw a1fe, need one more a1fe
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ACC, ACC, ACC, ACC, ERR, ERR,
|
|
|
|
// EUC State 2 -- saw 8e, need a1b0
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST3, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// EUC State 3 -- saw 8e, a1b0; need 2 a1fe
|
|
ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST1, ST1, ST1, ST1, ST1, ERR, ERR,
|
|
|
|
};
|
|
|
|
/*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
HZ character sequence validation
|
|
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
// Currently some of the rules for HZ encoding outlined above are a bit loosened up.
|
|
// (e.g. the range for the first GB byte is expanded) The rules were adjusted based on real data.
|
|
|
|
#define nHzStates 5
|
|
static signed char _rgchHzNextState[nHzStates][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// HZ State 0 -- ASCII
|
|
ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ST1c, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
|
|
|
|
// HZ State 1 -- saw "~," looking for "{" to make transition to GB mode
|
|
ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ST2c, ERR, ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// HZ State 2 -- just saw "{," expecting GB byte
|
|
ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ERR, ERR, ERR, ST4c, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// HZ State 3 -- expecting GB byte
|
|
ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST4c, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// HZ State 4 -- saw "~," looking for "}" to make transition to ASCII mode
|
|
ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ST3, ACC, ST3, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
};
|
|
|
|
/*-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
UTF-7 character sequence validation
|
|
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------*/
|
|
|
|
#define nUtf7States 3
|
|
static signed char _rgchUtf7NextState[nUtf7States][nTokens] =
|
|
{
|
|
//
|
|
// o x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x a o
|
|
// l 0 2 0 0 0 0 2 3 0 4 0 0 5 6 7 0 0 0 0 0 0 8 0 0 9 0 a b e f f t t
|
|
// l 0 1 0 0 0 0 f a 0 1 0 0 d 1 8 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 d e h
|
|
// o 0 2 2 2 2 2 3 3 4 5 5 5 6 7 7 7 7 7 7 7 8 8 8 8 9 a b d e f f o e
|
|
// w a a b c d e 9 f 0 a b c 0 7 a b c d e f 0 d e f f 0 0 f f c e f r
|
|
//-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
//
|
|
|
|
// UTF7 State 0 -- Direct/optionally direct ASCII mode, state transition can happen on "+"
|
|
ACC, ACC, ACC, ST1c, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ACC, ACC, ACC, ACC, ACC, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
|
|
|
|
// UTF7 State 1 -- Expecting first character from Modified Base64 alphabet
|
|
ERR, ERR, ERR, ST2, ERR, ACC, ERR, ST2, ERR, ERR, ST2, ERR, ERR, ERR, ST2, ST2, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR,
|
|
|
|
// UTF7 State 2 -- Modified Base64 alphabet mode, can be exited with "-" or any control character.
|
|
ACC, ACC, ERR, ST2, ERR, ACC, ERR, ST2, ERR, ERR, ST2, ERR, ERR, ERR, ST2, ST2, ERR, ERR, ERR, ERR, ACC, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ACC, ERR,
|
|
};
|
|
|
|
/*----------------------------------------------------------------------------
|
|
UTF-8 character sequence validation
|
|
----------------------------------------------------------------------------*/
|
|
|
|
static int _nUtf8Tb = 0;
|
|
|
|
#define BIT7(a) ((a) & 0x80)
|
|
#define BIT6(a) ((a) & 0x40)
|
|
|
|
/* N U T F 8 */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: NUtf8
|
|
%%Contact: jpick
|
|
|
|
UTF-8 doesn't require a state table for validation, just a count
|
|
of the number of expected trail bytes. See utf8lex.c for an
|
|
explanation of this code.
|
|
----------------------------------------------------------------------------*/
|
|
static int __inline NUtf8(UCHAR uch, BOOL fEoi)
{
    // Incremental UTF-8 structure check, one byte per call.
    //
    // Parameters:
    //   uch  -- the next input byte.
    //   fEoi -- non-zero when the caller is signalling end-of-input.
    //
    // Returns:
    //    1 -- uch completes a character (a single ASCII byte, or the
    //         last expected trail byte of a multi-byte sequence).
    //    0 -- error: input ended or an ASCII/lead byte arrived while
    //         trail bytes were still expected, a trail byte arrived
    //         when none was expected, or the lead byte is one that
    //         can never appear in UTF-8 (0xFE, 0xFF).
    //   -1 -- a sequence is in progress; more trail bytes are needed.
    //
    // How the byte is classified:
    //   BIT7(uch) == 0 implies single ASCII byte.
    //   BIT6(uch) == 0 implies one of n trail bytes.
    //   Otherwise, lead byte, with number of bits set
    //   up to first 0 equal to the total number bytes
    //   in the sequence.
    //
    // REVIEW: _nUtf8Tb *is* really the state of this
    // validator -- use nState in structure?
    //
    // NOTE(review): _nUtf8Tb is left untouched on the error returns
    // below; presumably a caller resets it before reuse -- confirm.
    //
    if (fEoi && (_nUtf8Tb != 0))
        {
        return 0;                   // unexpected end-of-input
        }
    else if (BIT7(uch) == 0)
        {
        if (_nUtf8Tb != 0)          // unexpected single byte
            return 0;
        return 1;
        }
    else if (BIT6(uch) == 0)
        {
        if (_nUtf8Tb == 0)          // unexpected trail byte
            return 0;
        if ((--_nUtf8Tb) == 0)
            return 1;
        }
    else
        {
        if (_nUtf8Tb != 0)          // unexpected lead byte
            return 0;

        // 0xFE and 0xFF never occur in UTF-8.  Without this check the
        // bit-counting loop below would treat them as lead bytes of
        // 7- and 8-byte sequences and accept the following trail bytes.
        if (uch >= 0xFE)
            return 0;

        // Count the leading 1-bits: that is the total sequence length.
        while (BIT7(uch) != 0)
            {
            uch <<= 1;
            _nUtf8Tb++;
            }
        _nUtf8Tb--;                 // don't count lead byte
        }

    return -1;
}
|
|
|
|
|
|
/*----------------------------------------------------------------------------
|
|
Character Mapping Defs
|
|
----------------------------------------------------------------------------*/
|
|
|
|
// If caller wants us to check characters as part of validation
|
|
//
|
|
typedef BOOL (*PFNCHECKCHAR)(ICET icetIn);
|
|
|
|
#define cchMaxBuff 5
|
|
typedef struct _cc
|
|
{
|
|
int nCp; // code page
|
|
int cchBuff; // fill count of character buffer
|
|
PFNCHECKCHAR pfnCheckChar; // character check routine
|
|
char rgchBuff[cchMaxBuff]; // character buffer
|
|
} CC;
|
|
|
|
// Character validation prototypes
|
|
//
|
|
static BOOL _FDbcsCheckChar(ICET icetIn);
|
|
|
|
|
|
// DBCS character checker structures
|
|
//
|
|
|
|
// Big5
|
|
static CC _ccBig5 =
|
|
{
|
|
nCpTaiwan,
|
|
0,
|
|
_FDbcsCheckChar,
|
|
};
|
|
|
|
// Gbk
|
|
static CC _ccGbk =
|
|
{
|
|
nCpChina,
|
|
0,
|
|
_FDbcsCheckChar,
|
|
};
|
|
|
|
// ShiftJis
|
|
static CC _ccSJis =
|
|
{
|
|
nCpJapan,
|
|
0,
|
|
_FDbcsCheckChar,
|
|
};
|
|
|
|
// Wansung
|
|
static CC _ccWan =
|
|
{
|
|
nCpKorea,
|
|
0,
|
|
_FDbcsCheckChar,
|
|
};
|
|
|
|
|
|
// Character checker structures just used as buffers.
|
|
//
|
|
|
|
// Euc-Jp
|
|
static CC _ccEucJp =
|
|
{
|
|
0,
|
|
0,
|
|
0,
|
|
};
|
|
|
|
// Hz
|
|
static CC _ccHz =
|
|
{
|
|
0,
|
|
0,
|
|
0,
|
|
};
|
|
|
|
// Utf7
|
|
static CC _ccUtf7 =
|
|
{
|
|
0,
|
|
0,
|
|
0,
|
|
};
|
|
|
|
/*----------------------------------------------------------------------------
|
|
Character Occurrence Counters
|
|
----------------------------------------------------------------------------*/
|
|
|
|
// If calling app wants us to track occurrences of common character
|
|
// sequences during validation (used only by auto-detection, so far).
|
|
//
|
|
|
|
typedef struct _coce
|
|
{
|
|
int cHits;
|
|
short cwch;
|
|
WCHAR rgwch[2];
|
|
} COCE;
|
|
|
|
typedef struct _coc
|
|
{
|
|
BOOL fMatching;
|
|
short nCoceCurr;
|
|
short nCoceIndex;
|
|
int ccoce;
|
|
COCE *rgcoce;
|
|
} COC;
|
|
|
|
// Big5
|
|
//
|
|
// Big5 character pairs whose occurrences are counted during validation.
// Each entry is {cHits, cwch, rgwch}: hit counter (starts at 0), number
// of characters in the sequence, and the characters themselves.  The
// trailing comment gives the pinyin reading of the pair.
static COCE _rgcoceBig5[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xa7da, (WCHAR)0xadcc},}, // "wo men"
|
|
{0, 2, {(WCHAR)0xa8e4, (WCHAR)0xb9ea},}, // "qi shi"
|
|
{0, 2, {(WCHAR)0xa65d, (WCHAR)0xacb0},}, // "yin wei"
|
|
{0, 2, {(WCHAR)0xb8ea, (WCHAR)0xb054},}, // "zi xun"
|
|
{0, 2, {(WCHAR)0xb971, (WCHAR)0xb8a3},}, // "dian nao"
|
|
{0, 2, {(WCHAR)0xbaf4, (WCHAR)0xb8f4},}, // "wang lu"
|
|
{0, 2, {(WCHAR)0xbd75, (WCHAR)0xa457},}, // "xian shang"
|
|
{0, 2, {(WCHAR)0xc577, (WCHAR)0xaaef},}, // "huan ying"
|
|
{0, 2, {(WCHAR)0xa477, (WCHAR)0xb867},}, // "yi jing"
|
|
};
|
|
|
|
static COC _cocBig5 =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceBig5) / sizeof(_rgcoceBig5[0]), // ccoce
|
|
_rgcoceBig5, // rgcoce
|
|
};
|
|
|
|
// Euc-Cn
|
|
//
|
|
// EUC-CN character pairs whose occurrences are counted during validation.
// Each entry is {cHits, cwch, rgwch}: hit counter (starts at 0), number
// of characters in the sequence, and the characters themselves.  The
// trailing comment gives the pinyin reading of the pair.
static COCE _rgcoceEucCn[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},}, // "ta men"
|
|
{0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},}, // "wo men"
|
|
{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},}, // "yin ci"
|
|
{0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},}, // "shen mo"
|
|
{0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},}, // "ru guo"
|
|
{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},}, // "yin wei"
|
|
{0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},}, // "suo yi"
|
|
{0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},}, // "huan ying"
|
|
{0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},}, // "wang luo"
|
|
{0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},}, // "xin xi"
|
|
{0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},}, // "ji suan"
|
|
};
|
|
|
|
static COC _cocEucCn =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceEucCn) / sizeof(_rgcoceEucCn[0]), // ccoce
|
|
_rgcoceEucCn, // rgcoce
|
|
};
|
|
|
|
// Euc-Kr
|
|
//
|
|
static COCE _rgcoceEucKr[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
|
|
{0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
|
|
{0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
|
|
};
|
|
|
|
static COC _cocEucKr =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceEucKr) / sizeof(_rgcoceEucKr[0]), // ccoce
|
|
_rgcoceEucKr, // rgcoce
|
|
};
|
|
|
|
// EUC-Jp
|
|
//
|
|
static COCE _rgcoceEucJp[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xa4c7, (WCHAR)0xa4b9},}, // "de su"
|
|
{0, 2, {(WCHAR)0xa4c0, (WCHAR)0xa1a3},}, // "da ."
|
|
{0, 2, {(WCHAR)0xa4a4, (WCHAR)0xa4eb},}, // "i ru"
|
|
{0, 2, {(WCHAR)0xa4de, (WCHAR)0xa4b9},}, // "ma su"
|
|
{0, 2, {(WCHAR)0xa4b7, (WCHAR)0xa4bf},}, // "shi ta"
|
|
{0, 2, {(WCHAR)0xa4b9, (WCHAR)0xa4eb},}, // "su ru"
|
|
{0, 2, {(WCHAR)0xa4bf, (WCHAR)0xa1a3},}, // "ta ."
|
|
{0, 2, {(WCHAR)0xa4eb, (WCHAR)0xa1a3},}, // "ru ."
|
|
};
|
|
|
|
static COC _cocEucJp =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceEucJp) / sizeof(_rgcoceEucJp[0]), // ccoce
|
|
_rgcoceEucJp, // rgcoce
|
|
};
|
|
|
|
// GBK
|
|
//
|
|
// GBK character pairs whose occurrences are counted during validation.
// Each entry is {cHits, cwch, rgwch}: hit counter (starts at 0), number
// of characters in the sequence, and the characters themselves.  The
// trailing comment gives the pinyin reading of the pair.
static COCE _rgcoceGbk[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xcbfb, (WCHAR)0xc3c7},}, // "ta men"
|
|
{0, 2, {(WCHAR)0xced2, (WCHAR)0xc3c7},}, // "wo men"
|
|
{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xb4cb},}, // "yin ci"
|
|
{0, 2, {(WCHAR)0xcab2, (WCHAR)0xc3b4},}, // "shen mo"
|
|
{0, 2, {(WCHAR)0xc8e7, (WCHAR)0xb9fb},}, // "ru guo"
|
|
{0, 2, {(WCHAR)0xd2f2, (WCHAR)0xceaa},}, // "yin wei"
|
|
{0, 2, {(WCHAR)0xcbf9, (WCHAR)0xd2d4},}, // "suo yi"
|
|
{0, 2, {(WCHAR)0xbbb6, (WCHAR)0xd3ad},}, // "huan ying"
|
|
{0, 2, {(WCHAR)0xcdf8, (WCHAR)0xc2e7},}, // "wang luo"
|
|
{0, 2, {(WCHAR)0xd0c5, (WCHAR)0xcfa2},}, // "xin xi"
|
|
{0, 2, {(WCHAR)0xbcc6, (WCHAR)0xcbe3},}, // "ji suan"
|
|
};
|
|
|
|
static COC _cocGbk =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceGbk) / sizeof(_rgcoceGbk[0]), // ccoce
|
|
_rgcoceGbk, // rgcoce
|
|
};
|
|
|
|
// Shift-JIS
|
|
//
|
|
static COCE _rgcoceSJis[] =
|
|
{
|
|
{0, 2, {(WCHAR)0x82c5, (WCHAR)0x82b7},}, // "de su"
|
|
{0, 2, {(WCHAR)0x82be, (WCHAR)0x8142},}, // "da ."
|
|
{0, 2, {(WCHAR)0x82a2, (WCHAR)0x82e9},}, // "i ru"
|
|
{0, 2, {(WCHAR)0x82dc, (WCHAR)0x82b7},}, // "ma su"
|
|
{0, 2, {(WCHAR)0x82b5, (WCHAR)0x82bd},}, // "shi ta"
|
|
{0, 2, {(WCHAR)0x82b7, (WCHAR)0x82e9},}, // "su ru"
|
|
{0, 2, {(WCHAR)0x82bd, (WCHAR)0x8142},}, // "ta ."
|
|
{0, 2, {(WCHAR)0x82e9, (WCHAR)0x8142},}, // "ru ."
|
|
};
|
|
|
|
static COC _cocSJis =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceSJis) / sizeof(_rgcoceSJis[0]), // ccoce
|
|
_rgcoceSJis, // rgcoce
|
|
};
|
|
|
|
// Wansung
|
|
//
|
|
// REVIEW: bug (1/2 this table is being ignored)
|
|
//
|
|
static COCE _rgcoceWan[] =
|
|
{
|
|
{0, 2, {(WCHAR)0xb0a1, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb0a1, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xb4c2, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb4c2, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xb4d9, (WCHAR)0x002e},},
|
|
{0, 2, {(WCHAR)0xb4d9, (WCHAR)0xa3ae},},
|
|
{0, 2, {(WCHAR)0xb8a6, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xb8a6, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0ba, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0ba, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0bb, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0bb, (WCHAR)0xa1a1},},
|
|
{0, 2, {(WCHAR)0xc0cc, (WCHAR)0x0020},},
|
|
{0, 2, {(WCHAR)0xc0cc, (WCHAR)0xa1a1},},
|
|
};
|
|
|
|
static COC _cocWan =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceWan) / sizeof(_rgcoceWan[0]), // ccoce
|
|
_rgcoceWan, // rgcoce
|
|
};
|
|
|
|
// Hz
|
|
//
|
|
static COCE _rgcoceHz[] =
|
|
{
|
|
{0, 2, {(WCHAR)0x007e, (WCHAR)0x007b},}, // ~{
|
|
{0, 2, {(WCHAR)0x007e, (WCHAR)0x007d},}, // ~}
|
|
};
|
|
|
|
static COC _cocHz =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceHz) / sizeof(_rgcoceHz[0]), // ccoce
|
|
_rgcoceHz, // rgcoce
|
|
};
|
|
|
|
// Utf7
|
|
//
|
|
static COCE _rgcoceUtf7[] =
|
|
{
|
|
{0, 2, {(WCHAR)0x002b, (WCHAR)0x002d},}, // +-
|
|
};
|
|
|
|
static COC _cocUtf7 =
|
|
{
|
|
fFalse, // fMatching
|
|
0, // nCoceCurr
|
|
0, // nCoceIndex
|
|
sizeof(_rgcoceUtf7) / sizeof(_rgcoceUtf7[0]), // ccoce
|
|
_rgcoceUtf7, // rgcoce
|
|
};
|
|
|
|
// Character counter prototype.
|
|
//
|
|
// Forward declaration: NValidateUch() calls this routine before its
// definition appears further down in the file.
static void _CountChars(ICET icetIn);
|
|
|
|
|
|
/*----------------------------------------------------------------------------
|
|
Main Definitions
|
|
----------------------------------------------------------------------------*/
|
|
|
|
// Structure to keep state, state machine and other associated
|
|
// information for a given character set "parse stream."
|
|
//
|
|
typedef struct _vr
|
|
{
|
|
BOOL fInUse;
|
|
DWORD dwFlags;
|
|
int nState;
|
|
CC *ccCheck;
|
|
signed char (*rgchNextState)[nTokens];
|
|
} VR;
|
|
|
|
// Array of validation records. We allow multiple, active parse
|
|
// streams for auto-detect -- this way, it can concurrently keep
|
|
// a parse stream for each encoding type, without needing to read
|
|
// its input multiple times.
|
|
//
|
|
static VR _mpicetvr[icetCount] =
|
|
{
|
|
{fTrue, 0, ST0, 0, _rgchEucKrCnNextState,}, // icetEucCn
|
|
{fTrue, 0, ST0, &_ccEucJp, _rgchEucJpNextState,}, // icetEucJp
|
|
{fTrue, 0, ST0, 0, _rgchEucKrCnNextState,}, // icetEucKr
|
|
{fTrue, 0, ST0, 0, _rgchEucTwNextState,}, // icetEucTw
|
|
{fFalse, 0, ST0, 0, 0,}, // icetIso2022Cn
|
|
{fFalse, 0, ST0, 0, 0,}, // icetIso2022Jp
|
|
{fFalse, 0, ST0, 0, 0,}, // icetIso2022Kr
|
|
{fFalse, 0, ST0, 0, 0,}, // icetIso2022Tw
|
|
{fTrue, 0, ST0, &_ccBig5, _rgchBig5NextState,}, // icetBig5
|
|
{fTrue, 0, ST0, &_ccGbk, _rgchGbkWanNextState,}, // icetGbk
|
|
{fTrue, 0, ST0, &_ccHz, _rgchHzNextState,}, // icetHz
|
|
{fTrue, 0, ST0, &_ccSJis, _rgchSJisNextState,}, // icetShiftJis
|
|
{fTrue, 0, ST0, &_ccWan, _rgchGbkWanNextState,}, // icetWansung
|
|
{fTrue, 0, ST0, &_ccUtf7, _rgchUtf7NextState,}, // icetUtf7
|
|
{fTrue, 0, ST0, 0, 0,}, // icetUtf8
|
|
};
|
|
|
|
// Array of character sequence counters, one per encoding type.
|
|
//
|
|
static COC *_mpicetlpcoc[icetCount] =
|
|
{
|
|
&_cocEucCn, // icetEucCn
|
|
&_cocEucJp, // icetEucJp
|
|
&_cocEucKr, // icetEucKr
|
|
0, // icetEucTw
|
|
0, // icetIso2022Cn
|
|
0, // icetIso2022Jp
|
|
0, // icetIso2022Kr
|
|
0, // icetIso2022Tw
|
|
&_cocBig5, // icetBig5
|
|
&_cocGbk, // icetGbk
|
|
&_cocHz, // icetHz
|
|
&_cocSJis, // icetShiftJis
|
|
&_cocWan, // icetWansung
|
|
&_cocUtf7, // icetUtf7
|
|
0, // icetUtf8
|
|
};
|
|
|
|
|
|
/* V A L I D A T E I N I T */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: ValidateInit
|
|
%%Contact: jpick
|
|
|
|
Initialize the state machine for the given character set (set its
|
|
state to ST0 (the start state) and store its parsing options).
|
|
----------------------------------------------------------------------------*/
|
|
void ValidateInit(ICET icetIn, DWORD dwFlags)
{
    VR  *lpvr  = &_mpicetvr[icetIn];
    COC *lpcoc = _mpicetlpcoc[icetIn];

    // Counting common sequences needs two things: a table of sequences
    // for this encoding and a character checker whose buffer holds the
    // bytes of the current character.  If either is missing, silently
    // drop the request; otherwise start all hit counts from zero.
    //
    if (dwFlags & grfCountCommonChars)
    {
        if (!lpcoc || !lpvr->ccCheck)
        {
            dwFlags &= ~grfCountCommonChars;
        }
        else
        {
            int icoce;

            for (icoce = 0; icoce < lpcoc->ccoce; icoce++)
                lpcoc->rgcoce[icoce].cHits = 0;
            lpcoc->fMatching = fFalse;
        }
    }

    // Encodings without validation support need nothing further (note
    // that their stored flags are left untouched).
    //
    if (!lpvr->fInUse)
        return;

    lpvr->nState = ST0;

    // Character mapping validation depends on the character checker;
    // when there is one, empty its buffer, and when there isn't, turn
    // the option off.
    //
    if (lpvr->ccCheck)
        lpvr->ccCheck->cchBuff = 0;
    else
        dwFlags &= ~grfValidateCharMapping;

    // The option also depends on the checker's code page being valid.
    // (The flag can only still be set here if ccCheck is non-null.)
    //
    if ((dwFlags & grfValidateCharMapping) && !IsValidCodePage(lpvr->ccCheck->nCp))
        dwFlags &= ~grfValidateCharMapping;

    // Remember the options that survived.
    //
    lpvr->dwFlags = dwFlags;

    // UTF-8 keeps its own counter outside the state tables.
    //
    if (icetIn == icetUtf8)
        _nUtf8Tb = 0;
}
|
|
|
|
|
|
/* V A L I D A T E I N I T A L L */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: ValidateInitAll
|
|
%%Contact: jpick
|
|
|
|
Initialize the state machines for all character sets (set their
|
|
states to ST0 (the start state) and store their parsing options).
|
|
----------------------------------------------------------------------------*/
|
|
void ValidateInitAll(DWORD dwFlags)
{
    int icet;

    // Walk every encoding slot, initializing only those for which
    // validation is supported.
    //
    for (icet = 0; icet < icetCount; icet++)
    {
        if (_mpicetvr[icet].fInUse)
            ValidateInit((ICET)icet, dwFlags);
    }
}
|
|
|
|
|
|
/* V A L I D A T E R E S E T */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: ValidateReset
|
|
%%Contact: jpick
|
|
|
|
Reset the state machine for the given character set (set its state
|
|
to ST0 (the start state)).
|
|
----------------------------------------------------------------------------*/
|
|
void ValidateReset(ICET icetIn)
{
    VR *lpvr = &_mpicetvr[icetIn];

    // Zero the sequence hit counts if counting is enabled for this
    // encoding.  ValidateInit() only leaves the flag set when the
    // counter structure exists, so the pointer needs no check here.
    //
    if (lpvr->dwFlags & grfCountCommonChars)
    {
        COC *lpcoc = _mpicetlpcoc[icetIn];
        int icoce;

        for (icoce = 0; icoce < lpcoc->ccoce; icoce++)
            lpcoc->rgcoce[icoce].cHits = 0;
        lpcoc->fMatching = fFalse;
    }

    // Nothing more to do for encodings without validation support.
    //
    if (!lpvr->fInUse)
        return;

    // Back to the start state, with an empty character buffer.
    //
    lpvr->nState = ST0;

    if (lpvr->ccCheck)
        lpvr->ccCheck->cchBuff = 0;

    // UTF-8 keeps its own counter outside the state tables.
    //
    if (icetIn == icetUtf8)
        _nUtf8Tb = 0;
}
|
|
|
|
|
|
/* V A L I D A T E R E S E T A L L */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: ValidateResetAll
|
|
%%Contact: jpick
|
|
|
|
Reset the state machines for all character sets (set their states to
|
|
ST0 (the start state)).
|
|
----------------------------------------------------------------------------*/
|
|
void ValidateResetAll(void)
|
|
{
|
|
int i;
|
|
|
|
for (i=0 ; i < icetCount; i++)
|
|
{
|
|
if (!_mpicetvr[i].fInUse)
|
|
continue;
|
|
ValidateReset((ICET)i);
|
|
}
|
|
}
|
|
|
|
|
|
/* N V A L I D A T E U C H */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: NValidateUch
|
|
%%Contact: jpick
|
|
|
|
Single step parser, takes one transition through the state table
|
|
for the given character set. Current state is kept for each
|
|
character set's parse stream.
|
|
|
|
Routine returns -1 if it does not reach a final state on this
|
|
transition; 0 if transitioned to ERR(or) and 1 if transitioned
|
|
to ACC(ept).
|
|
|
|
If final state is ACC(ept), machine reset to ST0 (start state).
|
|
(i.e., there's no need to manually reset on ACC(ept)).
|
|
|
|
Routine is also a convenient collection point for certain
|
|
statistics (currently only the counting of occurrences of common
|
|
character sequences (defined for character sets, above)).
|
|
----------------------------------------------------------------------------*/
|
|
int NValidateUch(ICET icetIn, UCHAR uch, BOOL fEoi)
{
    VR *lpvr = &_mpicetvr[icetIn];
    CC *lpcc = lpvr->ccCheck;       // may be null; only used once a flag guarantees it
    int nToken;
    int nPrevState;
    int rc = -1;

    // Encodings we don't validate accept everything; a stream that
    // has already failed stays failed until it is reset.
    //
    if (!lpvr->fInUse)
        return 1;
    if (lpvr->nState == ERR)
        return 0;

    // Zero bytes in the input are skipped (unless this is the
    // end-of-input call); just report the current state.
    //
    if (!uch && !fEoi)
        goto _LRet;

    // UTF-8 is validated by its own routine rather than a state table.
    // Record a failure so that later calls keep returning 0.
    //
    if (icetIn == icetUtf8)
    {
        rc = NUtf8(uch, fEoi);
        if (rc == 0)
            lpvr->nState = ERR;
        return rc;
    }

    // Turn the byte (or end-of-input) into a token.
    //
    nPrevState = lpvr->nState;
    nToken = fEoi ? ateof : _rgchCharClass[uch];

    // Strip the counting marker from the current state to get a real
    // table row, then take the transition.
    //
    lpvr->nState = (lpvr->rgchNextState)[TstNotCountingFromTst(lpvr->nState)][nToken];

#if 0
    if (lpvr->nState == ERR)
        printf("Character 0x%.2x; Going from state %.2x to state %.2x\n", uch, nPrevState, lpvr->nState);
#endif

    // Done if the transition failed or the input has ended.
    //
    if ((lpvr->nState == ERR) || (nToken == ateof))
        goto _LRet;

    // Everything below is for character mapping validation and
    // sequence counting.  Either flag being set guarantees that the
    // character checker structure exists.
    //
    if (!(lpvr->dwFlags & (grfValidateCharMapping | grfCountCommonChars)))
        goto _LRet;

    // Buffer the byte.  No bounds check: the state tables are trusted
    // never to deliver more than the buffer holds.  For Hz and Utf7,
    // bytes are buffered only while in a counting state.
    //
    if (FTstCounting(lpvr->nState) || (icetIn != icetHz && icetIn != icetUtf7))
        lpcc->rgchBuff[lpcc->cchBuff++] = uch;

    // Outside a counting state there is nothing to check or count yet.
    //
    if (!(FTstCounting(lpvr->nState)))
        goto _LRet;

    // Run the character checker, if requested and available; a
    // rejected character is an error.
    //
    if ((lpvr->dwFlags & grfValidateCharMapping) &&
        lpcc->pfnCheckChar && !(lpcc->pfnCheckChar)(icetIn))
    {
        lpvr->nState = ERR;
        goto _LRet;
    }

    // Count common sequences, if requested.
    //
    if (lpvr->dwFlags & grfCountCommonChars)
        _CountChars(icetIn);

    // The buffered character has been consumed.
    //
    lpcc->cchBuff = 0;

_LRet:

    // Translate the state into the return code; accepting rewinds the
    // machine to the start state.
    //
    if (lpvr->nState == ERR)
        return 0;

    if (lpvr->nState == ACC)
    {
        lpvr->nState = ST0;
        return 1;
    }

    return -1;                      // need more data
}
|
|
|
|
|
|
/* F V A L I D A T E C H A R C O U N T */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: FValidateCharCount
|
|
%%Contact: jpick
|
|
|
|
Return the number of matched special character sequences for the
|
|
given character set. If we're not keeping track of these sequences
|
|
for the character set, either because we don't have the necessary
|
|
static data or because the flag wasn't set by the calling routine,
|
|
return fFalse. Otherwise, return the count in *lpcMatch and return
|
|
fTrue;
|
|
|
|
(We track the counts separately for each sequence, just in case
|
|
we want to weight them differently in the future. Return the
|
|
total, here).
|
|
----------------------------------------------------------------------------*/
|
|
BOOL FValidateCharCount(ICET icetIn, int *lpcMatch)
{
    COC *lpcoc = _mpicetlpcoc[icetIn];
    VR *lpvr = &_mpicetvr[icetIn];
    int cTotal = 0;
    int icoce;

    // No table, no validation, or counting not requested: nothing to
    // report (*lpcMatch is left untouched).
    //
    if (!lpcoc)
        return fFalse;
    if (!lpvr->fInUse)
        return fFalse;
    if (!(lpvr->dwFlags & grfCountCommonChars))
        return fFalse;

    // Sum the per-sequence hit counts.
    //
    for (icoce = 0; icoce < lpcoc->ccoce; icoce++)
        cTotal += lpcoc->rgcoce[icoce].cHits;

    *lpcMatch = cTotal;
    return fTrue;
}
|
|
|
|
|
|
/* _ C O U N T C H A R S */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: _CountChars
|
|
%%Contact: jpick
|
|
|
|
We've just completed a legal character for the given character
|
|
set. Match it against the set of special character sequences for
|
|
the character set, if we have them. Update match counts and
|
|
current match indices (since sequences can span multiple legal
|
|
characters) as needed.
|
|
----------------------------------------------------------------------------*/
|
|
static void _CountChars(ICET icetIn)
{
    COC *lpcoc = _mpicetlpcoc[icetIn];
    CC *lpcc = _mpicetvr[icetIn].ccCheck;
    WCHAR wch;
    int i;
    int iFound;

    // Anything to do?  We need both the sequence table and the buffer
    // holding the bytes of the character just completed.
    //
    if (!lpcoc || !lpcc)
        return;

    // Build the WCHAR from the buffered bytes.  A single byte goes in
    // the low half; for longer characters only the last two bytes are
    // used.  Other lengths are not counted.
    //
    switch (lpcc->cchBuff)
    {
        case 1:
            wch = WchFromUchUch(0, lpcc->rgchBuff[0]);
            break;
        case 2:
            wch = WchFromUchUch(lpcc->rgchBuff[0], lpcc->rgchBuff[1]);
            break;
        case 3:
            wch = WchFromUchUch(lpcc->rgchBuff[1], lpcc->rgchBuff[2]);
            break;
        case 4:
            wch = WchFromUchUch(lpcc->rgchBuff[2], lpcc->rgchBuff[3]);
            break;
        default:
            return;
    }

    // Are we currently working on matching a sequence?
    //
    if (lpcoc->fMatching)
    {
        int iCurr = lpcoc->nCoceCurr;
        int iPos = lpcoc->nCoceIndex;

        iFound = -1;

        if (wch == lpcoc->rgcoce[iCurr].rgwch[iPos])
        {
            // The current candidate continues to match.
            //
            iFound = iCurr;
        }
        else
        {
            // The candidate was picked as the first entry with a matching
            // lead, but other entries may share everything matched so far
            // and differ only at this position (e.g. the same lead followed
            // by a different second character).  Give those a chance too;
            // otherwise they could never be counted.
            //
            for (i = 0; (iFound < 0) && (i < lpcoc->ccoce); i++)
            {
                int j;

                if ((i == iCurr) ||
                    (lpcoc->rgcoce[i].cwch <= iPos) ||
                    (wch != lpcoc->rgcoce[i].rgwch[iPos]))
                {
                    continue;
                }

                // Same prefix as what we've matched so far?
                //
                for (j = 0; (j < iPos) && (lpcoc->rgcoce[i].rgwch[j] == lpcoc->rgcoce[iCurr].rgwch[j]); j++)
                    ;
                if (j == iPos)
                    iFound = i;
            }
        }

        if (iFound >= 0)
        {
            lpcoc->nCoceCurr = iFound;

            // Did we just match the entire sequence?  If so, increment
            // the hit count and reset.
            //
            if (++lpcoc->nCoceIndex >= lpcoc->rgcoce[iFound].cwch)
            {
                ++lpcoc->rgcoce[iFound].cHits;
                lpcoc->fMatching = fFalse;
            }

            // All done.
            //
            return;
        }
    }

    // If we need to start matching again (either because we're not
    // currently in a sequence or because a 2nd or later character
    // didn't match), try the current character as a lead character.
    //
    // REVIEW: wrong for sequences longer than 2 wchars (a failed
    // partial match is not re-examined from its second character).
    //
    iFound = -1;
    for (i = 0; (iFound < 0) && (i < lpcoc->ccoce); i++)
    {
        if (wch == lpcoc->rgcoce[i].rgwch[0])
            iFound = i;
    }

    // Any luck?
    //
    if (iFound < 0)
    {
        lpcoc->fMatching = fFalse;
        return;
    }

    // Store the matching state.
    //
    lpcoc->fMatching = fTrue;
    lpcoc->nCoceCurr = iFound;
    lpcoc->nCoceIndex = 1;      // where to look next
}
|
|
|
|
|
|
/* _ F D B C S C H E C K C H A R */
|
|
/*----------------------------------------------------------------------------
|
|
%%Function: _FDbcsCheckChar
|
|
%%Contact: jpick
|
|
|
|
Character validator for DBCS formats. Attempts to round-trip a
|
|
legal multi-byte sequence to ensure that it's valid for the given
|
|
character set.
|
|
|
|
REVIEW: Slow, slow, slow -- do we really gain anything from the
|
|
round-trip check, or is conversion *to* Unicode a sufficient test?
|
|
----------------------------------------------------------------------------*/
|
|
static WCHAR _rgwBuff[10];
|
|
static UCHAR _rgchBuff[30];
|
|
|
|
static BOOL _FDbcsCheckChar(ICET icetIn)
|
|
{
|
|
int cCvt;
|
|
|
|
// skip 1 byte characters, mostly uninteresting (Shift-Jis ??).
|
|
//
|
|
if (_mpicetvr[icetIn].ccCheck->cchBuff == 1)
|
|
return fTrue;
|
|
|
|
if (!(cCvt = MultiByteToWideChar(_mpicetvr[icetIn].ccCheck->nCp,
|
|
MB_ERR_INVALID_CHARS,
|
|
_mpicetvr[icetIn].ccCheck->rgchBuff,
|
|
_mpicetvr[icetIn].ccCheck->cchBuff,
|
|
_rgwBuff, 10)))
|
|
{
|
|
if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
|
|
return fFalse;
|
|
}
|
|
|
|
return fTrue; // probably not always right
|
|
}
|