windows-nt/Source/XPSP1/NT/shell/ext/mlang/jislex.cpp

/*----------------------------------------------------------------------------
	%%File: jislex.c
	%%Unit: fechmap
	%%Contact: jpick

	Simple converter for decoding a subset of possible ISO-2022-7 encoded
	files (ISO-2022).  Data is translated to and from Unicode.  Converter
	operates according to user options.

	Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.

	Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
	are as yet no conversion tables for these.
----------------------------------------------------------------------------*/

#include <stdio.h>
#include <stddef.h>

#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"


// State table for reading ISO-2022-7 encoded text
//
// Lexer recognizes the following designator sequences, used
// to select a one or two byte character set:
//
//    <esc> $ @				-- JIS C 6226-1978	(synonym of <esc> $ ( @)
//    <esc> $ A				-- GB 2312-80		(synonym of <esc> $ ( A)
//    <esc> $ B				-- JIS X 0208-1983	(synonym of <esc> $ ( B)
//
//    <esc> $ ( @			-- JIS C 6226-1978
//    <esc> $ ( A			-- GB 2312-80
//    <esc> $ ( B			-- JIS X 0208-1983
//    <esc> $ ( C			-- KS C 5601-1992
//    <esc> $ ( D			-- JIS X 0212-1990
//    <esc> $ ( E			-- ??? (ISO-IR-165:1992) ???
//    <esc> $ ( G			-- CNS 11643-1992 Plane 1
//    <esc> $ ( H			-- CNS 11643-1992 Plane 2
//    <esc> $ ( I			-- CNS 11643-1992 Plane 3
//    <esc> $ ( J			-- CNS 11643-1992 Plane 4
//    <esc> $ ( K			-- CNS 11643-1992 Plane 5
//    <esc> $ ( L			-- CNS 11643-1992 Plane 6
//    <esc> $ ( M			-- CNS 11643-1992 Plane 7
//
//    <esc> $ ) C			-- KSC 5601-1987 (Implies ISO-2022-KR ??)
//
//    <esc> & @ <esc> $ B	-- JIS X 0208-1990
//
//    <esc> ( B 			-- Ascii
//    <esc> ( H 			-- Deprecated variant of JIS-Roman
//    <esc> ( I 			-- Half-Width Katakana
//    <esc> ( J 			-- JIS-Roman
//    <esc> ( T 			-- GB 1988-89 Roman
//
// Lexer recognizes the following shift sequences, used to allow
// interpretation of a given byte or bytes:
//
//    <si>					-- locking shift, interpret bytes as G0
//    <so>					-- locking shift, interpret bytes as G1
//    <esc> n				-- locking shift, interpret bytes as G2
//    <esc> o				-- locking shift, interpret bytes as G3
//    <esc> N				-- single shift, interpret bytes as G2
//    <esc> O				-- single shift, interpret bytes as G3
//
// REVIEW (jpick): don't currently need the final four shift
//   sequences.  If we support ISO-2022-CN, we'll need to use
//   G2 and G3 and potentially, then, the last four shifts.
//

/*----------------------------------------------------------------------------
	Character Classification Table
----------------------------------------------------------------------------*/

// Tokens
//
// Each input byte (or read condition) is classified into one of these
// token values.  A token is used as the first index into the state
// table _rgchNextState[nTokens][nStates], so the values must stay
// dense in the range 0 .. nTokens-1.
//
// The single-character tokens (dlr, at, amp, opr, cpr, tkA .. tkT) are
// the bytes that can appear inside the designator sequences recognized
// by the lexer.  Every other 7-bit byte is plain text.
//
#define	txt			(JTK) 0		// ordinary 7-bit text
#define	ext			(JTK) 1		// extended characters that are legal under certain circumstances
#define	esc			(JTK) 2		// <esc> (0x1b)
#define	si			(JTK) 3		// <si>  (0x0f)
#define	so			(JTK) 4		// <so>  (0x0e)
#define	dlr			(JTK) 5		// '$'
#define	at			(JTK) 6		// '@'
#define	amp			(JTK) 7		// '&'
#define	opr			(JTK) 8		// '('
#define	cpr			(JTK) 9		// ')'
#define	tkA			(JTK) 10
#define	tkB			(JTK) 11
#define	tkC			(JTK) 12
#define	tkD			(JTK) 13
#define	tkE			(JTK) 14
#define	tkG			(JTK) 15
#define	tkH			(JTK) 16
#define	tkI			(JTK) 17
#define	tkJ			(JTK) 18
#define	tkK			(JTK) 19
#define	tkL			(JTK) 20
#define	tkM			(JTK) 21
#define	tkT			(JTK) 22
#define	unk			(JTK) 23	// Unexpected character
#define	eof			(JTK) 24	// end-of-file
#define	err			(JTK) 25	// read error

// Number of token values above (first dimension of the state table).
#define nTokens		26

// Lookup table for ISO-2022-7 encoded files
//
// Maps a byte value (0x00 .. 0xff) to its token.  Each initializer row
// covers sixteen consecutive byte values; the comment above a row names
// the characters and the digit at the far right is the high nibble.
//
//   0x00 .. 0x7f   txt, except for <so>, <si>, <esc> and the characters
//                  that take part in designator sequences ($ & ( ) @ A B
//                  C D E G H I J K L M T)
//   0x80 .. 0xa0   unk
//   0xa1 .. 0xdf   ext
//   0xe0 .. 0xff   unk
//
// Note that "eof" in the comment for row 1 labels byte 0x1a; that byte is
// classified as txt, not as the eof token.
//
static JTK _rgjtkCharClass[256] =
//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
    {
//  nul  soh  stx  etx  eot  enq  ack  bel  bs   tab  lf   vt   np   cr   so   si		0
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so,  si,

//  dle  dc1  dc2  dc3  dc4  nak  syn  etb  can  em   eof  esc  fs   gs   rs   us		1
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,

//  sp   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /		2
    txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,

//  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?		3
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O		4
    at,  tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,

//  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _		5
    txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o		6
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    del		7
    txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//																		                8
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//																		                9
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//																		                a
    unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//																		                b
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//																		                c
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//																		                d
    ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//																		                e
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//																		                f
    unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//  0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
};


/*----------------------------------------------------------------------------
	State Table
----------------------------------------------------------------------------*/

// Final states have the high bit (grfFinal) set.  Final states that
// represent a recognized character set escape sequence also encode an
// identifier for that character set: the state value with the high bit
// masked off (see _NEscTypeFromState and the cs* constants below).
//
// Table State
//
typedef unsigned char TST;

// Final State Mask, Related
//
// grfFinal marks a state as final; _NEscTypeFromState strips that mark
// and yields the remaining seven bits as an int.
//
#define grfFinal 							(TST) 0x80
#define _NEscTypeFromState(nState)			(int) ((nState) & 0x7f)

// ASCII Escape Sequence (Final State)
#define ASC		(TST) (grfFinal | 0x00)		// Ascii

// Japanese Escape Sequences (Final States)
#define JS0		(TST) (grfFinal | 0x01)		// JIS-Roman
#define JS1		(TST) (grfFinal | 0x02)		// Half-Width Katakana
#define JS2 	(TST) (grfFinal | 0x03)		// JIS C 6226-1978
#define JS3		(TST) (grfFinal | 0x04)		// JIS X 0208-1983
#define JS4		(TST) (grfFinal | 0x05)		// JIS X 0208-1990
#define JS5		(TST) (grfFinal | 0x06)		// JIS X 0212-1990

// Chinese (PRC) Escape Sequences (Final States)
#define CS0		(TST) (grfFinal | 0x07)		// GB 1988-89 Roman
#define CS1		(TST) (grfFinal | 0x08)		// GB 2312-80

// Chinese (Taiwan) Escape Sequences (Final States)
#define TS0		(TST) (grfFinal | 0x09)		// CNS 11643-1992 Plane 1
#define TS1		(TST) (grfFinal | 0x0a)		// CNS 11643-1992 Plane 2
#define TS2		(TST) (grfFinal | 0x0b)		// CNS 11643-1992 Plane 3
#define TS3		(TST) (grfFinal | 0x0c)		// CNS 11643-1992 Plane 4
#define TS4		(TST) (grfFinal | 0x0d)		// CNS 11643-1992 Plane 5
#define TS5		(TST) (grfFinal | 0x0e)		// CNS 11643-1992 Plane 6
#define TS6		(TST) (grfFinal | 0x0f)		// CNS 11643-1992 Plane 7

// Korean Escape Sequences (Final State)
#define KS0		(TST) (grfFinal | 0x10)		// KS C 5601-1992

// Document "Signal" for ISO-2022-KR (Doc needs special processing)
#define KSD		(TST) (grfFinal | 0x11)		// ISO-2022-KR Document Signal

// Number of unique *character set* escape sequences
// (ASC through KSD above: identifiers 0x00 .. 0x11).
//
#define cCsEsc	18

// Special States (not escape sequence) (Final States)
//
// These are numbered from cCsEsc + 1 upward, so the identifier equal to
// cCsEsc itself is not assigned to any state.
//
#define TXT		(TST) (grfFinal | (cCsEsc + 1))		// Process Text
#define EXT		(TST) (grfFinal | (cCsEsc + 2))		// Process (Possibly Illegal) Extended Chars
#define FIN		(TST) (grfFinal | (cCsEsc + 3))		// Finish
#define EOI		(TST) (grfFinal | (cCsEsc + 4))		// Unexpected End-Of-Input
#define UNK		(TST) (grfFinal | (cCsEsc + 5))		// Unknown State (Unexpected Character)
#define ERR		(TST) (grfFinal | (cCsEsc + 6))		// Read Error

// Shift Sequences (do not specify character set) (Final States)
//
#define LSO		(TST) (grfFinal | (cCsEsc + 7))		// Locking shift out (g1 into GL)
#define LSI		(TST) (grfFinal | (cCsEsc + 8))		// Locking shift in (g0 into GL)

// For convenience, also define constants for the sets
// that the states represent.  Each cs* value is the corresponding
// final state with grfFinal masked off; csNIL is the "no set" marker.
//
#define csNIL		(-1)							// Invalid Designator
#define csASC		(_NEscTypeFromState(ASC))		// Ascii
#define csJS0		(_NEscTypeFromState(JS0))		// JIS-Roman
#define csJS1		(_NEscTypeFromState(JS1))		// Half-Width Katakana
#define csJS2		(_NEscTypeFromState(JS2))		// JIS C 6226-1978
#define csJS3		(_NEscTypeFromState(JS3))		// JIS X 0208-1983
#define csJS4		(_NEscTypeFromState(JS4))		// JIS X 0208-1990
#define csJS5		(_NEscTypeFromState(JS5))		// JIS X 0212-1990
#define csCS0		(_NEscTypeFromState(CS0))		// GB 1988-89 Roman
#define csCS1		(_NEscTypeFromState(CS1))		// GB 2312-80
#define csTS0		(_NEscTypeFromState(TS0))		// CNS 11643-1992 Plane 1
#define csTS1		(_NEscTypeFromState(TS1))		// CNS 11643-1992 Plane 2
#define csTS2		(_NEscTypeFromState(TS2))		// CNS 11643-1992 Plane 3
#define csTS3		(_NEscTypeFromState(TS3))		// CNS 11643-1992 Plane 4
#define csTS4		(_NEscTypeFromState(TS4))		// CNS 11643-1992 Plane 5
#define csTS5		(_NEscTypeFromState(TS5))		// CNS 11643-1992 Plane 6
#define csTS6		(_NEscTypeFromState(TS6))		// CNS 11643-1992 Plane 7
#define csKS0		(_NEscTypeFromState(KS0))		// KS C 5601-1992 (into G0)
#define csKSD		(_NEscTypeFromState(KSD))		// KS C 5601-1992 (into G1)

// Table States (Intermediate States)
//
// None of these has grfFinal set.  An intermediate state is used as the
// second index into the state table, so the values must stay dense in
// the range 0 .. nStates-1.
//
#define ST0		(TST)  0
#define ST1		(TST)  1
#define ST2		(TST)  2
#define ST3		(TST)  3
#define ST4		(TST)  4
#define ST5		(TST)  5
#define ST6		(TST)  6
#define ST7		(TST)  7
#define ST8		(TST)  8
#define ST9		(TST)  9

// Number of "real" (table) states
//
#define nStates		10

// Non-zero when the state has the final-state bit set.
#define	IsFinal(state)	((state) & grfFinal)


// State	Have Seen				Looking For
// ----------------------------------------------------------
// ST0		-- Start State --		<ESC> Text
// ST1		<ESC>					$ & (
// ST2		<ESC> $					( ) @ A B   (**)
// ST3		<ESC> $ (				@ A B C D E G H I J K L M
// ST4		<ESC> $ )				C
// ST5		<ESC> &					@
// ST6		<ESC> & @				<ESC>
// ST7		<ESC> & @ <ESC>			$
// ST8		<ESC> & @ <ESC> $		B
// ST9		<ESC> (					B H I J T
//
// (**)  "<ESC> $ ID" is a synonym of "<ESC> $ ( ID" for ID=(@, A, B)
//
// Because of the large number of tokens, this table is
// inverted (tokens x states).
//
static signed char _rgchNextState[nTokens][nStates] =
{
//
//           S     S     S     S     S     S     S     S     S     S
//           T     T     T     T     T     T     T     T     T     T
//           0     1     2     3     4     5     6     7     8     9
//--------------------------------------------------------------------
//
/* txt */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* ext */  EXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* esc */  ST1,  UNK,  UNK,  UNK,  UNK,  UNK,  ST7,  UNK,  UNK,  UNK,
/* si  */  LSI,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* so  */  LSO,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* $   */  TXT,  ST2,  UNK,  UNK,  UNK,  UNK,  UNK,  ST8,  UNK,  UNK,
/* @   */  TXT,  UNK,  JS2,  JS2,  UNK,  ST6,  UNK,  UNK,  UNK,  UNK,
/* &   */  TXT,  ST5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* (   */  TXT,  ST9,  ST3,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* )   */  TXT,  UNK,  ST4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* A   */  TXT,  UNK,  CS1,  CS1,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* B   */  TXT,  UNK,  JS3,  JS3,  UNK,  UNK,  UNK,  UNK,  JS4,  ASC,
/* C   */  TXT,  UNK,  UNK,  KS0,  KSD,  UNK,  UNK,  UNK,  UNK,  UNK,
/* D   */  TXT,  UNK,  UNK,  JS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* E   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* G   */  TXT,  UNK,  UNK,  TS0,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* H   */  TXT,  UNK,  UNK,  TS1,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,
/* I   */  TXT,  UNK,  UNK,  TS2,  UNK,  UNK,  UNK,  UNK,  UNK,  JS1,
/* J   */  TXT,  UNK,  UNK,  TS3,  UNK,  UNK,  UNK,  UNK,  UNK,  JS0,
/* K   */  TXT,  UNK,  UNK,  TS4,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* L   */  TXT,  UNK,  UNK,  TS5,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* M   */  TXT,  UNK,  UNK,  TS6,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* T   */  TXT,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  CS0,
/* unk */  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,  UNK,
/* eof */  FIN,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,  EOI,
/* err */  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,  ERR,
};


// Also for ISO-2022 out.  Build arrays of possible character
// sets for each type of input character set.  Character sets
// should appear in order of hit probability (e.g., in 2022-Jp
// JS3 is the most common set).  Mark the end of array with -1
// (the same value as csNIL).  Only non-ascii sets are stored.
//
// Only the Japanese list has entries; the other three hold
// nothing but the terminator.
//
// China (icetIso2022Cn)
static int _rgceCn[] = { -1, };

// Japan (icetIso2022Jp)
static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, };

// Korea (icetIso2022Kr)
static int _rgceKr[] = { -1, };

// Taiwan (icetIso2022Tw)
static int _rgceTw[] = { -1, };

// Per-encoding pointer to the candidate list above; 0 for the
// encodings that have no list.
// NOTE(review): the entries are positional, so their order presumably
// has to match the ICET enumeration declared elsewhere -- confirm
// against that declaration before adding or reordering entries.
static int *_mpicetrgce[icetCount] =
	{
	0,				// icetEucCn
	0,				// icetEucJp
	0,				// icetEucKr
	0,				// icetEucTw
	_rgceCn,		// icetIso2022Cn
	_rgceJp,		// icetIso2022Jp
	_rgceKr,		// icetIso2022Kr
	_rgceTw,		// icetIso2022Tw
	0,				// icetBig5
	0,				// icetGbk
	0,				// icetShiftJis
	0,				// icetWansung
	0,				// icetUtf8
	};

/* _ J T K  G E T  N E X T */
/*----------------------------------------------------------------------------
	%%Function: _JtkGetNext
	%%Contact: jpick

	Read one byte from pstmIn into *puch and classify it.

	Returns:
		err		-- the Read call failed (negative HRESULT)
		eof		-- the Read call did not fail but delivered no byte
		other	-- the _rgjtkCharClass entry for the byte now in *puch

	*puch is only meaningful when neither err nor eof is returned.

	IStream::Read may report end-of-stream either as S_OK with a zero
	byte count or as S_FALSE (fewer bytes than requested).  Both are
	success codes, so only failing HRESULTs are treated as read errors;
	end-of-stream is recognized from the byte count alone.
----------------------------------------------------------------------------*/
static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch)
{
	// Start from zero so the count is well defined even if the stream
	// implementation does not touch it.
	//
	ULONG cbRead = 0;
	HRESULT hr;

	hr = pstmIn->Read(puch, 1, &cbRead);

	// A negative HRESULT is a failure (this is the FAILED(hr) test).
	//
	if (hr < 0)
		return err;

	// Success code but nothing delivered: end of stream.
	//
	if (cbRead == 0)
		return eof;

	return _rgjtkCharClass[*puch];
}

/* C C E  R E A D  E S C  S E Q */
/*----------------------------------------------------------------------------
	%%Function: CceReadEscSeq
	%%Contact: jpick

	The stream is positioned at an escape sequence.  Feed its bytes through
	the state table, starting in ST0, until a final state is reached, then
	report which ISO-2022 flavor that final state implies.

	Returns:
		cceSuccess			-- a character set designator was recognized;
							   *lpicet receives the matching icetIso2022*
		cceMayBeAscii		-- Ascii designator, shift, text, extended
							   character or end of input in the start
							   state: not enough to choose a flavor
		cceRead				-- the stream reported a read error
		cceUnknownInput		-- unexpected character or end of input
							   inside a sequence
		cceInvalidParameter	-- null argument (DEBUG builds only)

	*lpicet is written only when cceSuccess is returned.
----------------------------------------------------------------------------*/
CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet)
{
	UCHAR uchIn;
	JTK jtkIn;
	TST tstNow;
	CCE cceResult;
#ifdef DEBUG
	TST tstBefore;

	// Argument validation is compiled into DEBUG builds only.
	//
	if (!pstmIn || !lpicet)
		return cceInvalidParameter;
#endif

	// Run the state machine from the start state until it stops.
	//
	tstNow = ST0;

	for (;;)
		{
		// Get the next character and classify it.
		//
		jtkIn = _JtkGetNext(pstmIn, &uchIn);

#ifdef DEBUG
		// Remember where we came from; useful in the debugger only.
		//
		tstBefore = tstNow;
#endif
		// The table is inverted: token first, then current state.
		//
		tstNow = _rgchNextState[jtkIn][tstNow];

		if (IsFinal(tstNow))
			break;
		}

	// Translate the final state into a result code and, for the
	// character set designators, an encoding flavor.
	//
	switch (tstNow)
		{
		case JS0:			// JIS-Roman
		case JS1:			// Half-Width Katakana
		case JS2:			// JIS C 6226-1978
		case JS3:			// JIS X 0208-1983
		case JS4:			// JIS X 0208-1990
		case JS5:			// JIS X 0212-1990
			*lpicet = icetIso2022Jp;
			cceResult = cceSuccess;
			break;

		case CS0:			// GB 1988-89 Roman
		case CS1:			// GB 2312-80
			*lpicet = icetIso2022Cn;
			cceResult = cceSuccess;
			break;

		case TS0:			// CNS 11643-1992 Plane 1
		case TS1:			// CNS 11643-1992 Plane 2
		case TS2:			// CNS 11643-1992 Plane 3
		case TS3:			// CNS 11643-1992 Plane 4
		case TS4:			// CNS 11643-1992 Plane 5
		case TS5:			// CNS 11643-1992 Plane 6
		case TS6:			// CNS 11643-1992 Plane 7
			*lpicet = icetIso2022Tw;
			cceResult = cceSuccess;
			break;

		case KS0:			// KS C 5601-1992
		case KSD:			// ISO-2022-KR Document Signal
			*lpicet = icetIso2022Kr;
			cceResult = cceSuccess;
			break;

		case ASC:			// Ascii
		case LSO:			// Locking shift out
		case LSI:			// Locking shift in
		case TXT:			// Plain text
		case EXT:			// Extended character
		case FIN:			// End of input in the start state
			// Nothing here pins down a particular flavor.
			cceResult = cceMayBeAscii;
			break;

		case ERR:			// Read error
			cceResult = cceRead;
			break;

		default:			// UNK, EOI
			cceResult = cceUnknownInput;
			break;
		}

	return cceResult;
}