/*----------------------------------------------------------------------------
    %%File: jislex.c
    %%Unit: fechmap
    %%Contact: jpick

    Simple converter for decoding a subset of possible ISO-2022-7 encoded
    files (ISO-2022).  Data is translated to and from Unicode.  Converter
    operates according to user options.

    Module currently handles ISO-2022-JP (and JIS) and ISO-2022-KR.

    Converter is set up to handle ISO-2022-TW and ISO-2022-CN, but there
    are as yet no conversion tables for these.
----------------------------------------------------------------------------*/

#include    // NOTE(review): header name is missing in this copy of the file -- restore it
#include    // NOTE(review): header name is missing in this copy of the file -- restore it

#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"


// State table for reading ISO-2022-7 encoded text
//
// NOTE(review): the <ESC>, <SI> and <SO> markers below were absent from
// this copy of the comment; they are restored to agree with the state
// table (_rgchNextState) and the LSI/LSO state definitions in this file.
//
// Lexer recognizes the following designator sequences, used
// to select a one or two byte character set:
//
//  <ESC> $ @ -- JIS C 6226-1978        (synonym of <ESC> $ ( @)
//  <ESC> $ A -- GB 2312-80             (synonym of <ESC> $ ( A)
//  <ESC> $ B -- JIS X 0208-1983        (synonym of <ESC> $ ( B)
//
//  <ESC> $ ( @ -- JIS C 6226-1978
//  <ESC> $ ( A -- GB 2312-80
//  <ESC> $ ( B -- JIS X 0208-1983
//  <ESC> $ ( C -- KS C 5601-1992
//  <ESC> $ ( D -- JIS X 0212-1990
//  <ESC> $ ( E -- ??? (ISO-IR-165:1992) ???
//                 (no final state exists for this one; the state table
//                 sends it to the "unknown" state)
//  <ESC> $ ( G -- CNS 11643-1992 Plane 1
//  <ESC> $ ( H -- CNS 11643-1992 Plane 2
//  <ESC> $ ( I -- CNS 11643-1992 Plane 3
//  <ESC> $ ( J -- CNS 11643-1992 Plane 4
//  <ESC> $ ( K -- CNS 11643-1992 Plane 5
//  <ESC> $ ( L -- CNS 11643-1992 Plane 6
//  <ESC> $ ( M -- CNS 11643-1992 Plane 7
//
//  <ESC> $ ) C -- KSC 5601-1987        (Implies ISO-2022-KR ??)
//
//  <ESC> & @ <ESC> $ B -- JIS X 0208-1990
//
//  <ESC> ( B -- Ascii
//  <ESC> ( H -- Deprecated variant of JIS-Roman
//  <ESC> ( I -- Half-Width Katakana
//  <ESC> ( J -- JIS-Roman
//  <ESC> ( T -- GB 1988-89 Roman
//
// Lexer recognizes the following shift sequences, used to allow
// interpretation of a given byte or bytes:
//
//  <SI>    -- locking shift, interpret bytes as G0
//  <SO>    -- locking shift, interpret bytes as G1
//  <ESC> n -- locking shift, interpret bytes as G2
//  <ESC> o -- locking shift, interpret bytes as G3
//  <ESC> N -- single shift, interpret bytes as G2
//  <ESC> O -- single shift, interpret bytes as G3
//
// REVIEW (jpick): don't currently need the final four shift
//   sequences.  If we support ISO-2022-CN, we'll need to use
//   G2 and G3 and potentially, then, the last four shifts.
//   (Accordingly, the state table has no transitions for them.)
//

/*----------------------------------------------------------------------------
    Character Classification Table
----------------------------------------------------------------------------*/

// Tokens
//
// Every input byte is reduced to one of these token classes (through
// _rgjtkCharClass below); the token is the row index into the state
// transition table.  eof and err are never produced by the byte table;
// they are returned by the reader for end-of-input and read failure.
//
#define txt     (JTK) 0
#define ext     (JTK) 1     // extended characters that are legal under certain circumstances
#define esc     (JTK) 2
#define si      (JTK) 3
#define so      (JTK) 4
#define dlr     (JTK) 5
#define at      (JTK) 6
#define amp     (JTK) 7
#define opr     (JTK) 8
#define cpr     (JTK) 9
#define tkA     (JTK) 10
#define tkB     (JTK) 11
#define tkC     (JTK) 12
#define tkD     (JTK) 13
#define tkE     (JTK) 14
#define tkG     (JTK) 15
#define tkH     (JTK) 16
#define tkI     (JTK) 17
#define tkJ     (JTK) 18
#define tkK     (JTK) 19
#define tkL     (JTK) 20
#define tkM     (JTK) 21
#define tkT     (JTK) 22
#define unk     (JTK) 23    // Unexpected character
#define eof     (JTK) 24    // end-of-file
#define err     (JTK) 25    // read error

#define nTokens     26

// Lookup table for ISO-2022-7 encoded files
//
// Indexed by the raw byte value (row = high nibble, column = low nibble).
// Only the bytes that can appear inside a recognized escape or shift
// sequence get a dedicated token; every other 7-bit byte is txt.
// Of the 8-bit bytes, 0xa1-0xdf are ext and the rest are unk.
//
static JTK _rgjtkCharClass[256] =
//        0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
    {
//       nul  soh  stx  etx  eot  enq  ack  bel  bs   tab  lf   vt   np   cr   so   si        0
         txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, so,  si,

//       dle  dc1  dc2  dc3  dc4  nak  syn  etb  can  em   eof  esc  fs   gs   rs   us        1
         txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, esc, txt, txt, txt, txt,

//       sp   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /         2
         txt, txt, txt, txt, dlr, txt, amp, txt, opr, cpr, txt, txt, txt, txt, txt, txt,

//       0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?         3
         txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//       @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O         4
         at,  tkA, tkB, tkC, tkD, tkE, txt, tkG, tkH, tkI, tkJ, tkK, tkL, tkM, txt, txt,

//       P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _         5
         txt, txt, txt, txt, tkT, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//       `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o         6
         txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//       p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~    del       7
         txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt, txt,

//                                                                                            8
         unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//                                                                                            9
         unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//                                                                                            a
         unk, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//                                                                                            b
         ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//                                                                                            c
         ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//                                                                                            d
         ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext, ext,

//                                                                                            e
         unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//                                                                                            f
         unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk, unk,

//        0    1    2    3    4    5    6    7    8    9    a    b    c    d    e    f
    };


/*----------------------------------------------------------------------------
    State Table
----------------------------------------------------------------------------*/

// Final states have the high-bit set.  States that represent the reading
// of a valid character escape sequence also encode the character set
// "name" (moniker??) -- the state with the high bit masked off.
//
// Table State
//
// One byte per state: bit 7 marks a final (stopping) state, the low
// seven bits carry either the character-set id (escape-sequence states)
// or a small ordinal (special and shift states).
//
typedef unsigned char TST;

// Final State Mask, Related
//
#define grfFinal    (TST) 0x80
#define _NEscTypeFromState(nState)  (int) ((nState) & 0x7f)    // strip the final bit

// ASCII Escape Sequence (Final State)
#define ASC     (TST) (grfFinal | 0x00)     // Ascii

// Japanese Escape Sequences (Final States)
#define JS0     (TST) (grfFinal | 0x01)     // JIS-Roman
#define JS1     (TST) (grfFinal | 0x02)     // Half-Width Katakana
#define JS2     (TST) (grfFinal | 0x03)     // JIS C 6226-1978
#define JS3     (TST) (grfFinal | 0x04)     // JIS X 0208-1983
#define JS4     (TST) (grfFinal | 0x05)     // JIS X 0208-1990
#define JS5     (TST) (grfFinal | 0x06)     // JIS X 0212-1990

// Chinese (PRC) Escape Sequences (Final States)
#define CS0     (TST) (grfFinal | 0x07)     // GB 1988-89 Roman
#define CS1     (TST) (grfFinal | 0x08)     // GB 2312-80

// Chinese (Taiwan) Escape Sequences (Final States)
#define TS0     (TST) (grfFinal | 0x09)     // CNS 11643-1992 Plane 1
#define TS1     (TST) (grfFinal | 0x0a)     // CNS 11643-1992 Plane 2
#define TS2     (TST) (grfFinal | 0x0b)     // CNS 11643-1992 Plane 3
#define TS3     (TST) (grfFinal | 0x0c)     // CNS 11643-1992 Plane 4
#define TS4     (TST) (grfFinal | 0x0d)     // CNS 11643-1992 Plane 5
#define TS5     (TST) (grfFinal | 0x0e)     // CNS 11643-1992 Plane 6
#define TS6     (TST) (grfFinal | 0x0f)     // CNS 11643-1992 Plane 7

// Korean Escape Sequences (Final State)
#define KS0     (TST) (grfFinal | 0x10)     // KS C 5601-1992

// Document "Signal" for ISO-2022-KR (Doc needs special processing)
#define KSD     (TST) (grfFinal | 0x11)     // ISO-2022-KR Document Signal

// Number of unique *character set* escape sequences
// (ids 0x00 through 0x11 above).
//
#define cCsEsc      18

// Special States (not escape sequence) (Final States)
//
// These are numbered from cCsEsc + 1, so they can never collide with a
// character-set id; the value cCsEsc itself is left unused.
//
#define TXT     (TST) (grfFinal | (cCsEsc + 1))     // Process Text
#define EXT     (TST) (grfFinal | (cCsEsc + 2))     // Process (Possibly Illegal) Extended Chars
#define FIN     (TST) (grfFinal | (cCsEsc + 3))     // Finish
#define EOI     (TST) (grfFinal | (cCsEsc + 4))     // Unexpected End-Of-Input
#define UNK     (TST) (grfFinal | (cCsEsc + 5))     // Unknown State (Unexpected Character)
#define ERR     (TST) (grfFinal | (cCsEsc + 6))     // Read Error

// Shift Sequences (do not specify character set) (Final States)
//
#define LSO     (TST) (grfFinal | (cCsEsc + 7))     // Locking shift out (g1 into GL)
#define LSI     (TST) (grfFinal | (cCsEsc + 8))     // Locking shift in (g0 into GL)

// For convenience, also define constants for the sets
// that the states represent.  Each is the matching final state
// with the final bit removed.
//
#define csNIL   (-1)                            // Invalid Designator
#define csASC   (_NEscTypeFromState(ASC))       // Ascii
#define csJS0   (_NEscTypeFromState(JS0))       // JIS-Roman
#define csJS1   (_NEscTypeFromState(JS1))       // Half-Width Katakana
#define csJS2   (_NEscTypeFromState(JS2))       // JIS C 6226-1978
#define csJS3   (_NEscTypeFromState(JS3))       // JIS X 0208-1983
#define csJS4   (_NEscTypeFromState(JS4))       // JIS X 0208-1990
#define csJS5   (_NEscTypeFromState(JS5))       // JIS X 0212-1990
#define csCS0   (_NEscTypeFromState(CS0))       // GB 1988-89 Roman
#define csCS1   (_NEscTypeFromState(CS1))       // GB 2312-80
#define csTS0   (_NEscTypeFromState(TS0))       // CNS 11643-1992 Plane 1
#define csTS1   (_NEscTypeFromState(TS1))       // CNS 11643-1992 Plane 2
#define csTS2   (_NEscTypeFromState(TS2))       // CNS 11643-1992 Plane 3
#define csTS3   (_NEscTypeFromState(TS3))       // CNS 11643-1992 Plane 4
#define csTS4   (_NEscTypeFromState(TS4))       // CNS 11643-1992 Plane 5
#define csTS5   (_NEscTypeFromState(TS5))       // CNS 11643-1992 Plane 6
#define csTS6   (_NEscTypeFromState(TS6))       // CNS 11643-1992 Plane 7
#define csKS0   (_NEscTypeFromState(KS0))       // KS C 5601-1992 (into G0)
#define csKSD   (_NEscTypeFromState(KSD))       // KS C 5601-1992 (into G1)

// Table States (Intermediate States)
// These double as the column index into the transition table.
#define ST0     (TST) 0
#define ST1     (TST) 1
#define ST2     (TST) 2
#define ST3     (TST) 3
#define ST4     (TST) 4
#define ST5     (TST) 5
#define ST6     (TST) 6
#define ST7     (TST) 7
#define ST8     (TST) 8
#define ST9     (TST) 9

// Number of "real" (table) states
//
#define nStates     10

// Non-zero when the state has the final bit set.
#define IsFinal(state)  ((state) & grfFinal)


// State    Have Seen                   Looking For
// ----------------------------------------------------------
//  ST0     -- Start State --           <ESC> Text
//  ST1     <ESC>                       $ & (
//  ST2     <ESC> $                     ( )
@ A B (**) // ST3 $ ( @ A B C D E G H I J K L M // ST4 $ ) C // ST5 & @ // ST6 & @ // ST7 & @ $ // ST8 & @ $ B // ST9 ( B H I J T // // (**) " $ ID" is a synonym of " $ ( ID" for ID=(@, A, B) // // Because of the large number of tokens, this table is // inverted (tokens x states). // static signed char _rgchNextState[nTokens][nStates] = { // // S S S S S S S S S S // T T T T T T T T T T // 0 1 2 3 4 5 6 7 8 9 //-------------------------------------------------------------------- // /* txt */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* ext */ EXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* esc */ ST1, UNK, UNK, UNK, UNK, UNK, ST7, UNK, UNK, UNK, /* si */ LSI, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* so */ LSO, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* $ */ TXT, ST2, UNK, UNK, UNK, UNK, UNK, ST8, UNK, UNK, /* @ */ TXT, UNK, JS2, JS2, UNK, ST6, UNK, UNK, UNK, UNK, /* & */ TXT, ST5, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* ( */ TXT, ST9, ST3, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* ) */ TXT, UNK, ST4, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* A */ TXT, UNK, CS1, CS1, UNK, UNK, UNK, UNK, UNK, UNK, /* B */ TXT, UNK, JS3, JS3, UNK, UNK, UNK, UNK, JS4, ASC, /* C */ TXT, UNK, UNK, KS0, KSD, UNK, UNK, UNK, UNK, UNK, /* D */ TXT, UNK, UNK, JS5, UNK, UNK, UNK, UNK, UNK, UNK, /* E */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* G */ TXT, UNK, UNK, TS0, UNK, UNK, UNK, UNK, UNK, UNK, /* H */ TXT, UNK, UNK, TS1, UNK, UNK, UNK, UNK, UNK, JS0, /* I */ TXT, UNK, UNK, TS2, UNK, UNK, UNK, UNK, UNK, JS1, /* J */ TXT, UNK, UNK, TS3, UNK, UNK, UNK, UNK, UNK, JS0, /* K */ TXT, UNK, UNK, TS4, UNK, UNK, UNK, UNK, UNK, UNK, /* L */ TXT, UNK, UNK, TS5, UNK, UNK, UNK, UNK, UNK, UNK, /* M */ TXT, UNK, UNK, TS6, UNK, UNK, UNK, UNK, UNK, UNK, /* T */ TXT, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, CS0, /* unk */ UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, UNK, /* eof */ FIN, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI, EOI, /* err */ ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, 
ERR, ERR, }; // Also for ISO-2022 out. Build arrays of possible character // sets for each type of input character set. Character sets // should appear in order of hit probability (e.g., in 2022-Jp // JS3 is the most common set). Mark the end of array with -1. // (Only store these for non-ascii sets). // // // China (icetIso2022Cn) static int _rgceCn[] = { -1, }; // Japan (icetIso2022Jp) static int _rgceJp[] = { csJS3, csJS1, csJS5, -1, }; // Korea (icetIso2022Kr) static int _rgceKr[] = { -1, }; // Taiwan (icetIso2022Tw) static int _rgceTw[] = { -1, }; static int *_mpicetrgce[icetCount] = { 0, // icetEucCn 0, // icetEucJp 0, // icetEucKr 0, // icetEucTw _rgceCn, // icetIso2022Cn _rgceJp, // icetIso2022Jp _rgceKr, // icetIso2022Kr _rgceTw, // icetIso2022Tw 0, // icetBig5 0, // icetGbk 0, // icetShiftJis 0, // icetWansung 0, // icetUtf8 }; /* _ J T K G E T N E X T */ /*---------------------------------------------------------------------------- %%Function: _JtkGetNext %%Contact: jpick Get the next character and classify it. Return the token. ----------------------------------------------------------------------------*/ static JTK __inline _JtkGetNext(IStream *pstmIn, PUCHAR puch) { ULONG rc; HRESULT hr; hr = pstmIn->Read(puch, 1, &rc); if (hr != S_OK ) return err; else if (rc == 0) return eof; else return _rgjtkCharClass[*puch]; } /* C C E R E A D E S C S E Q */ /*---------------------------------------------------------------------------- %%Function: CceReadEscSeq %%Contact: jpick Read pointer is positioned at an escape sequence, figure out which escape sequence it is. ----------------------------------------------------------------------------*/ CCE CceReadEscSeq(IStream *pstmIn, ICET *lpicet) { UCHAR uch; TST tstCurr; JTK jtk; CCE cceRet; #ifdef DEBUG TST tstPrev; #endif // Sanity checks ... // #ifdef DEBUG if (!pstmIn || !lpicet) return cceInvalidParameter; #endif tstCurr = ST0; while (1) { // Find the next stopping state. 
// do { // Get the next character and clasify it. // jtk = _JtkGetNext(pstmIn, &uch); #ifdef DEBUG // Save the previous state for debugging purposes, only. // tstPrev = tstCurr; #endif // Transition -- note that order is different than // "normal" transition tables. // tstCurr = _rgchNextState[jtk][tstCurr]; } while (!IsFinal(tstCurr)); switch (tstCurr) { case JS0: // JIS-Roman case JS1: // Half-Width Katakana case JS2: // JIS C 6226-1978 case JS3: // JIS X 0208-1983 case JS4: // JIS X 0208-1990 case JS5: // JIS X 0212-1990 *lpicet = icetIso2022Jp; cceRet = cceSuccess; goto _LRet; case CS0: // GB 1988-89 Roman case CS1: // GB 2312-80 *lpicet = icetIso2022Cn; cceRet = cceSuccess; goto _LRet; case TS0: // CNS 11643-1992 Plane 1 case TS1: // CNS 11643-1992 Plane 2 case TS2: // CNS 11643-1992 Plane 3 case TS3: // CNS 11643-1992 Plane 4 case TS4: // CNS 11643-1992 Plane 5 case TS5: // CNS 11643-1992 Plane 6 case TS6: // CNS 11643-1992 Plane 7 *lpicet = icetIso2022Tw; cceRet = cceSuccess; goto _LRet; case KS0: // KS C 5601-1992 case KSD: // ISO-2022-KR Document Signal *lpicet = icetIso2022Kr; cceRet = cceSuccess; goto _LRet; case ASC: // Ascii case LSO: case LSI: case TXT: case EXT: case FIN: // Insufficient information to choose a flavor ... cceRet = cceMayBeAscii; goto _LRet; case ERR: cceRet = cceRead; goto _LRet; default: // UNK, EOI cceRet = cceUnknownInput; goto _LRet; } } _LRet: return cceRet; }