// windows-nt/Source/XPSP1/NT/shell/ext/mlang/fechauto.cpp

/*----------------------------------------------------------------------------
	%%File: fechauto.cpp
	%%Unit: fechmap
	%%Contact: jpick

	Module that attempts to auto-detect encoding for a given stream.
----------------------------------------------------------------------------*/

#include <stdio.h>
#include <stddef.h>

#include "private.h"
#include "fechmap_.h"
#include "lexint_.h"

// Code marked by these #defines will be deleted eventually ...
// (It prints out useful information and statistics about how
// auto-detect is doing and what it's finding in the input).
//
#define JPDEBUG			0
#define JPDEBUG2		0
#define JPDEBUG3		0

#define NEED_NAMES		0

#if JPDEBUG || JPDEBUG2 || JPDEBUG3
#undef NEED_NAMES
#define NEED_NAMES		1
#endif

#if NEED_NAMES
// Printable names for the encodings, compiled in only when one of the
// JPDEBUG* switches is on (see NEED_NAMES, above).  The debug printf's
// index this table with an ICET value, so the entries are presumably in
// the same order as the ICET enum declaration (declared outside this
// file) -- TODO confirm if the enum ever changes.
//
static char *rgszIcetNames[icetCount] =
{
	"icetEucCn",
	"icetEucJp",
	"icetEucKr",
	"icetEucTw",
	"icetIso2022Cn",
	"icetIso2022Jp",
	"icetIso2022Kr",
	"icetIso2022Tw",
	"icetBig5",
	"icetGbk",
	"icetHz",
	"icetShiftJis",
	"icetWansung",
	"icetUtf7",
	"icetUtf8",
};
#endif

// Characters we care about
//
#define chSo		(UCHAR) 0x0e
#define chSi		(UCHAR) 0x0f
#define chEsc		(UCHAR) 0x1b

// Minimum Sample Size
//
#define	cchMinSample		64

// High-ASCII character threshold.  If this routine is unable
// to absolutely determine the encoding of this file, it will
// need to guess.  Files that are ASCII, but contain high-ASCII
// characters (e.g., a file with some Cyrillic characters) may
// confuse us.  If the number of high-ASCII characters falls
// below this threshold, return the encoding we guessed but 
// also return a special rc that says the file "might be ASCII."
//
// 5%, for now.
//
// 40%, for now, of the high-ascii characters must be in high-
// ascii pairs.  (Pulled down because of Big5 and the other
// DBCS encodings that can have trail bytes in the low range).
//
#define nHighCharThreshold		 5		// %
#define nHighPairThreshold		40		// %

// Used by CceDetermineInputTypeReturnAll() to determine whether any icet has
// high enough count to rule out all other icets.
//
#define CchCountThreshold(icet)	(((icet) == icetHz || (icet) == icetUtf7) ? 5 : 10)


// Tokens
//
// Stop tokens (negative) imply special handling and will cause
// the processing loop to stop (eof, err, si, so and esc are
// stop tokens).
//
#define xmn			  0
#define esc			(-1)
#define  so			(-2)
#define  si			(-3)
#define eof			(-4)
#define err			(-5)

#define _FStopToken(tk)		((tk) < 0)


// Masks used in _CBitsOnFromUlong()
//
#define lMaskBitCount1	(LONG) 0x55555555
#define lMaskBitCount2	(LONG) 0x33333333
#define lMaskBitCount3	(LONG) 0x0F0F0F0F
#define lMaskBitCount4	(LONG) 0x00FF00FF
#define lMaskBitCount5	(LONG) 0x0000FFFF

/* _  C  B I T S  O N  F R O M  U L O N G */
/*----------------------------------------------------------------------------
	%%Function: _CBitsOnFromUlong
	%%Contact: jpick

	Returns the number of bits that are set in ulBits.

	Works by repeated pairwise summation: neighbouring 1-bit fields are
	added into 2-bit fields, those into 4-bit fields, and so on, until
	a single field holds the total.  (Adapted from code in convio.c.)
----------------------------------------------------------------------------*/
int __inline _CBitsOnFromUlong(ULONG ulBits)
{
	ULONG cOn = ulBits;

	// At every step the upper half of each field pair is shifted down
	// and added onto the lower half.
	//
	cOn = ((cOn & ~lMaskBitCount1) >> 1) + (cOn & lMaskBitCount1);
	cOn = ((cOn & ~lMaskBitCount2) >> 2) + (cOn & lMaskBitCount2);
	cOn = ((cOn & ~lMaskBitCount3) >> 4) + (cOn & lMaskBitCount3);
	cOn = ((cOn & ~lMaskBitCount4) >> 8) + (cOn & lMaskBitCount4);
	cOn = ((cOn & ~lMaskBitCount5) >> 16) + (cOn & lMaskBitCount5);

	return (int)cOn;
}
	
// Masks for the encodings
//
#define grfEucCn        (ULONG) 0x0001
#define grfEucJp        (ULONG) 0x0002
#define grfEucKr        (ULONG) 0x0004
#define grfEucTw        (ULONG) 0x0008
#define grfIso2022Cn    (ULONG) 0x0010
#define grfIso2022Jp    (ULONG) 0x0020
#define grfIso2022Kr    (ULONG) 0x0040
#define grfIso2022Tw    (ULONG) 0x0080
#define grfBig5         (ULONG) 0x0100
#define grfGbk          (ULONG) 0x0200
#define grfHz           (ULONG) 0x0400 
#define grfShiftJis     (ULONG) 0x0800
#define grfWansung      (ULONG) 0x1000
#define grfUtf7         (ULONG) 0x2000 	
#define grfUtf8         (ULONG) 0x4000

// grfAll assumes that the tests for Euc-Kr fall within those
// for Wansung (as far as I can tell from reading, Euc-Kr is a
// strict subset of Wansung).  The same for Euc-Cn and Gbk.  No
// need to test for both the subset and the whole.
//
#define grfAll              (ULONG) 0x7FFA
#define grfAllButIso2022    (ULONG) 0x7F0A
#define cAll                13				// == number bits set in grfAll
#define cAllButIso2022      9				// == number bits set in grfAllButIso2022

// Array that maps an encoding to its mask.
//
// Indexed by ICET value: the detection loops walk i from 0 to icetCount-1,
// test _mpicetgrf[i] against the active-candidate bitmask and pass (ICET)i
// to the validators, so entry i must be the grf* bit for encoding i.  The
// entry order therefore has to track the ICET enum (declared outside this
// file) -- presumably it does; verify if the enum is ever reordered.
//
static ULONG _mpicetgrf[icetCount] =
{
	grfEucCn,
	grfEucJp,
	grfEucKr,
	grfEucTw,
	grfIso2022Cn,
	grfIso2022Jp,
	grfIso2022Kr,
	grfIso2022Tw,
	grfBig5,
	grfGbk,
	grfHz,
	grfShiftJis,
	grfWansung,
	grfUtf7,
	grfUtf8,
};

// Prototypes
//
static int  _NGetNextUch(IStream *pstmIn, unsigned char *c, BOOL *lpfIsHigh);
static ICET _IcetFromIcetMask(ULONG ulMask);
static ICET _IcetDefaultFromIcetMask(ULONG ulMask);
static CCE  _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref);
static CCE  _CceReadEscSeq(IStream *pstmIn, int nPrefCp, ICET *lpicet, BOOL *lpfGuess);


/* C C E  D E T E R M I N E  I N P U T  T Y P E */
/*----------------------------------------------------------------------------
	%%Function: CceDetermineInputType
	%%Contact: jpick

	Attempt to determine the appropriate ICET type for the given
	stream.  Data is read one byte at a time from pstmIn.

	Every byte is handed to the validator of each encoding that is still
	a candidate; an encoding whose validator returns 0 is dropped.  The
	scan stops on a stop token (esc/so/si, eof, read error), when no
	candidate is left, or when exactly one is left and more than
	cchMinSample bytes have been seen.

	Returns:
		cceSuccess         *lpicet / *lpfGuess are set.
		cceMayBeAscii      *lpicet / *lpfGuess are set, but the high-char
		                   statistics suggest the input may be plain ASCII.
		cceRead            the stream reported a read error, or the stop
		                   byte could not be put back with Seek().
		cceUnknownInput    every candidate encoding was eliminated.
		cceAmbiguousInput  several candidates remain and the caller did
		                   not pass grfDetectResolveAmbiguity.
		(other)            whatever _CceReadEscSeq() returns when an
		                   escape / shift character was found.

	*lpicet and *lpfGuess are written only for cceSuccess / cceMayBeAscii.
----------------------------------------------------------------------------*/
CCE CceDetermineInputType(
    IStream   *pstmIn,           // input stream
	DWORD     dwFlags,			// configuration flags
	EFam      efPref,			// optional: preferred encoding family
	int       nPrefCp,			// optional: preferred code page
	ICET     *lpicet,			// set to detected encoding
	BOOL     *lpfGuess			// set to fTrue if function "guessed"
)
{
	unsigned char uch;
	int nToken;
	CCE cceRet;
	BOOL fGuess;
	ICET icet;
	int cIcetActive;
	ULONG grfIcetActive;	// Bitarray tracks which encodings are still active candidates.
	ICET icetSeq;
	int i, nCount, nCountCurr;
	DWORD dwValFlags;
	BOOL fIsHigh;
	int cchHigh = 0;
	int cchHighPairs = 0;
	int cchTotal = 0;
	BOOL fLastHigh = fFalse;
	
#if JPDEBUG3
	ULONG grfIcetNoCommonChars;
#endif

#if JPDEBUG
	printf("flags: %d\n", dwFlags);
#endif
	
	// Initialize parsers
	//
	dwValFlags = grfCountCommonChars;
	if (dwFlags & grfDetectUseCharMapping)
		dwValFlags |= grfValidateCharMapping;
	ValidateInitAll(dwValFlags);
	
	// Initialize locals -- be optimistic.  The ISO-2022 flavors are
	// not tracked here; they are identified by their escape sequences.
	//
	cceRet = cceSuccess;
	fGuess = fFalse;
	grfIcetActive = grfAllButIso2022;
	cIcetActive = cAllButIso2022;
	
#if JPDEBUG3
	grfIcetNoCommonChars = grfAllButIso2022;
#endif
	
	while (fTrue)
		{
		nToken = _NGetNextUch(pstmIn, &uch, &fIsHigh);
		if (_FStopToken(nToken))
			break;
			
		// Update (admittedly dumb) statistics -- really counts high
		// ascii characters in runs (not really pairs).  But threshold
		// constants (defined, above) were determined by calculating
		// exactly these numbers for ~25 files, so it should be ok (?).
		//
		++cchTotal;
		if (fIsHigh)
			{
			++cchHigh;
			if (fLastHigh)
				++cchHighPairs;
			}
		fLastHigh = fIsHigh;
			
		// Feed the byte to every encoding that is still active; a
		// validator result of 0 eliminates that encoding.
		//
		for (i = 0; i < icetCount; i++)
			{
			if (!(grfIcetActive & _mpicetgrf[i]) || (NValidateUch((ICET)i, uch, fFalse) != 0))
				continue;
				
			grfIcetActive &= ~_mpicetgrf[i];
			--cIcetActive;
#if JPDEBUG
			printf("Log:  Lost %s at offset 0x%.4x (%d), char 0x%.2x\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1), uch);
#endif
			}
			
#if JPDEBUG3
		for (i = 0; i < icetCount; i++)
			{
			if (!(grfIcetActive & _mpicetgrf[i]) || !(grfIcetNoCommonChars & _mpicetgrf[i]))
				continue;
				
			// Cast matches the non-debug call sites; an int does not
			// convert implicitly to an enum in C++.
			//
			if (!FValidateCharCount((ICET)i, &nCount) || (nCount == 0))
				continue;
				
			grfIcetNoCommonChars &= ~_mpicetgrf[i];
			printf("Log:  Found first common seq for %s at offset 0x%.4x (%d)\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1));
			}
#endif
			
		if ((cIcetActive == 0) || ((cIcetActive == 1) && (cchTotal > cchMinSample)))
			break;
		}
		
	// Figure out why we exited the loop.
	//
	if (nToken == err)
		{
		cceRet = cceRead;
		goto _LRet;
		}
		
	// Process escapes separately.  Interpret the escape sequence
	// to determine for real which ISO7 flavor we have found.
	//
	if ((nToken == esc) || (nToken == so) || (nToken == si))
		{
		LARGE_INTEGER   li;
		HRESULT hr;

		// Back up one byte so that the escape-sequence reader sees the
		// character that stopped the loop.
		//
		LISet32(li, -1);
		hr = pstmIn->Seek(li, STREAM_SEEK_CUR, NULL);
		if (FAILED(hr))
			{
			// The byte could not be put back, so the escape parser
			// would start in the middle of the sequence.  Report it
			// as a stream failure rather than parse garbage.
			//
			cceRet = cceRead;
			goto _LRet;
			}

		cceRet = _CceReadEscSeq(pstmIn, nPrefCp, &icet, &fGuess);
#if JPDEBUG
		if (cceRet == cceSuccess)
			printf("Log:  Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);
#endif
		// ISO is a special case -- no need to check statistics.
		//
		goto _LRet;
		}
		
#if JPDEBUG2
	printf("Counts:  %d total chars, %d high chars, %d high pairs\n", cchTotal, cchHigh, cchHighPairs); 
#endif
			
	// If the token was eof, and we're not ignoring eof, transition
	// the remaining active sets on eof.
	//
	if ((nToken == eof) && !(dwFlags & grfDetectIgnoreEof))
		{
		for (i = 0; i < icetCount; i++)
			{
			if (!(grfIcetActive & _mpicetgrf[i]) || (NValidateUch((ICET)i, 0, fTrue) != 0))
				continue;
#if JPDEBUG
			printf("Log:  Lost %s at EOF\n", rgszIcetNames[i]);
#endif
			grfIcetActive &= ~_mpicetgrf[i];
			--cIcetActive;
			}
		}
		
	Assert(cIcetActive >= 0);	// better *not* be less than 0

	// See how we've narrowed our field of choices and set the 
	// return status accordingly.
	//
	if (cIcetActive <= 0)
		{
#if JPDEBUG
		printf("Log:  Bailed out entirely at offset 0x%.4x (%d)\n", cchTotal, cchTotal);
#endif
		cceRet = cceUnknownInput;
		goto _LRet;
		}
	else if (cIcetActive == 1)
		{
		icet = _IcetFromIcetMask(grfIcetActive);
#if JPDEBUG
		printf("Log:  Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);
#endif
		// If we matched an encoding type and also found matching 
		// common character runs, skip statistics (see comment,
		// below).
		//
		if (FValidateCharCount(icet, &nCount) && (nCount > 0))
			{
#if JPDEBUG3
			printf("Log:  %d common sequences for %s\n", nCount, rgszIcetNames[icet]);
#endif
			goto _LRet;
			}
		else
			{
			goto _LStats;
			}
		}
		
	// Did we learn anything from counting characters?  Pick the
	// active encoding with the highest (non-zero) common-run count;
	// on a tie the lowest-numbered encoding wins.
	//
	icetSeq = (ICET)-1;
	nCountCurr = 0;
	for (i = 0; i < icetCount; i++)
		{
		if (!(grfIcetActive & _mpicetgrf[i]) || !FValidateCharCount((ICET)i, &nCount))
			continue;
			
		if (nCount > nCountCurr)
			{
			icetSeq = (ICET)i;
			nCountCurr = nCount;
			}
			
#if JPDEBUG3
		printf("Log:  %d common sequences for %s\n", nCount, rgszIcetNames[i]);
#endif
		}
			
	// Any luck?  If so, return.  Don't bother checking statistics.
	// We just proved that we found at least one common run of 
	// characters in this input.  The odds against this for just a
	// plain ASCII file with some high characters seem pretty high.
	// Ignore the statistics and just return the encoding type we
	// found.
	//
	if (icetSeq != -1)
		{
		icet = icetSeq;
		goto _LRet;
		}
		
#if JPDEBUG
	printf("Log:  Active Icet Mask 0x%.8x, %d left\n", grfIcetActive, cIcetActive);
	printf("Log:  Icet's left -- ");
	for (i = 0; i < icetCount; i++)
		{
		if (grfIcetActive & _mpicetgrf[i])
            printf("%s, ", rgszIcetNames[i]);
		}
    printf("\n");
#endif

	// If caller did not want us to try to guess at the encoding
	// in the absence of definitive data, bail out.
	//
	if (!(dwFlags & grfDetectResolveAmbiguity))
		{
		cceRet = cceAmbiguousInput;
		goto _LRet;
		}
		
	// We're guessing -- note it.
	//
	fGuess = fTrue;
		
	// More than one active encoding.  Attempt to resolve ambiguity.
	// On failure leave through the common exit; it writes nothing to
	// the out parameters for a non-success code.
	//
	cceRet = _CceResolveAmbiguity(grfIcetActive, &icet, nPrefCp, efPref);
	if (cceRet != cceSuccess)
		goto _LRet;
		
_LStats:
		
	// Adjust the return code based on the "statistics" we gathered,
	// above.  (cchHigh > 0 implies cchTotal > 0, so neither division
	// can be by zero.)
	//
	if (cchHigh > 0)
		{
		if ((cchTotal < cchMinSample) ||
			(((cchHigh * 100) / cchTotal) < nHighCharThreshold) ||
			(((cchHighPairs * 100) / cchHigh) < nHighPairThreshold))
			{
			cceRet = cceMayBeAscii;
			}
		}
	else
		{
		cceRet = cceMayBeAscii;		// no high-ascii characters?  definitely maybe!
		}

#if JPDEBUG2
	if (cchHigh > 0)
		{
		int nPercent1 = ((cchHigh * 100) / cchTotal);
		int nPercent2 = ((cchHighPairs * 100) / cchHigh);
		printf("Ratios -- high/total: %d%%, runs/high: %d%%\n", nPercent1, nPercent2);
		}
#endif
		
_LRet:

	// Set the return variables, if successful.
	//
	if ((cceRet == cceSuccess) || (cceRet == cceMayBeAscii))
		{
		*lpicet = icet;
		*lpfGuess = fGuess;
		}
		
#if JPDEBUG
		if (cceRet == cceSuccess)
			{
			printf("Log:  Returning %s, fGuess = %s\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));
			}
		else if (cceRet == cceMayBeAscii)
			{
			printf("Log:  Returning %s, fGuess = %s, may-be-ASCII\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));
			}
#endif
		
	return cceRet;
}


/* _ N  G E T  N E X T  U C H */
/*----------------------------------------------------------------------------
	%%Function: _NGetNextUch
	%%Contact: jpick

	Get the next character from the input stream.  Classify the character.

	Returns one of the token values defined above:
		eof   no byte was delivered by the stream
		err   a byte was delivered but Read() did not return S_OK
		esc / so / si   the byte is ESC, SO or SI
		xmn   any other byte

	*c and *lpfIsHigh are written only when a byte was read (i.e. not
	for eof / err); *lpfIsHigh is set when the byte is >= 0x80.
----------------------------------------------------------------------------*/
static int _NGetNextUch(IStream *pstmIn, unsigned char *c, BOOL *lpfIsHigh)
{
	// Start from "nothing read" so the checks below never look at an
	// indeterminate value if Read() fails without storing a byte count.
	//
	ULONG cbRead = 0;
	unsigned char uch = 0;
	HRESULT hr;

	hr = pstmIn->Read(&uch, 1, &cbRead);

	// A read that delivers no byte is treated as end of input whatever
	// hr says; this ordering is the long-standing behavior of the routine.
	//
	if (cbRead == 0)
		return eof;
	else if (hr != S_OK)
		return err;

	*lpfIsHigh = (uch >= 0x80);
	*c = uch;

	switch (uch)
		{
		case chEsc:
			return esc;
		case chSo:
			return so;
		case chSi:
			return si;
		default:
			return xmn;
		}
}


// Masks for _CceResolveAmbiguity() -- only externally supported character
// sets are used in ambiguity resolution.  Don't include Euc-Tw here.
//
#define grfJapan			(ULONG) (grfShiftJis | grfEucJp)
#define grfChina			(ULONG) (grfEucCn | grfGbk)
#define grfKorea			(ULONG) (grfEucKr | grfWansung)
#define grfTaiwan			(ULONG) (grfBig5)
#define grfDbcs				(ULONG) (grfShiftJis | grfGbk | grfWansung | grfBig5)
#define grfEuc				(ULONG) (grfEucJp | grfEucKr | grfEucCn)


/* _ I C E T  F R O M  I C E T  M A S K */
/*----------------------------------------------------------------------------
	%%Function: _IcetFromIcetMask
	%%Contact: jpick

	Translates a mask holding exactly one grf* encoding bit into the
	matching ICET value.  Any other mask (no bit, several bits, unknown
	bit) is not expected; icetShiftJis is returned for it because a
	bogus value cannot be returned.
----------------------------------------------------------------------------*/
static ICET _IcetFromIcetMask(ULONG ulMask)
{
	// Fallback for masks that match none of the cases below.
	//
	ICET icetFound = icetShiftJis;

	switch (ulMask)
	{
	case grfEucCn:		icetFound = icetEucCn;		break;
	case grfEucJp:		icetFound = icetEucJp;		break;
	case grfEucKr:		icetFound = icetEucKr;		break;
	case grfEucTw:		icetFound = icetEucTw;		break;
	case grfIso2022Cn:	icetFound = icetIso2022Cn;	break;
	case grfIso2022Jp:	icetFound = icetIso2022Jp;	break;
	case grfIso2022Kr:	icetFound = icetIso2022Kr;	break;
	case grfIso2022Tw:	icetFound = icetIso2022Tw;	break;
	case grfBig5:		icetFound = icetBig5;		break;
	case grfGbk:		icetFound = icetGbk;		break;
	case grfHz:			icetFound = icetHz;			break;
	case grfShiftJis:	icetFound = icetShiftJis;	break;
	case grfWansung:	icetFound = icetWansung;	break;
	case grfUtf7:		icetFound = icetUtf7;		break;
	case grfUtf8:		icetFound = icetUtf8;		break;
	default:
		// Should never get here ...
		break;
	}

	return icetFound;
}

/* _ I C E T  D E F A U L T  F R O M  I C E T  M A S K */
/*----------------------------------------------------------------------------
	%%Function: _IcetDefaultFromIcetMask
	%%Contact: jpick

	Picks a default encoding from a mask that may still hold several
	candidates.  If any DBCS encoding bit is present the DBCS member of
	the first matching region is returned, otherwise the EUC member.
	Regions are tried in the order Japan, China, Taiwan, Korea.  When
	nothing matches, icetShiftJis is returned.
----------------------------------------------------------------------------*/
static ICET _IcetDefaultFromIcetMask(ULONG ulMask)
{
	// Non-zero when at least one DBCS encoding is among the candidates;
	// DBCS then takes priority over EUC for whichever region matches.
	//
	const int fAnyDbcs = ((ulMask & grfDbcs) != 0);

	if (ulMask & grfJapan)
		return fAnyDbcs ? icetShiftJis : icetEucJp;

	if (ulMask & grfChina)
		return fAnyDbcs ? icetGbk : icetEucCn;

	// The only Taiwan bit is Big5, which is itself a DBCS bit, so this
	// test can succeed only when fAnyDbcs is set.
	//
	if (ulMask & grfTaiwan)
		return icetBig5;

	if (ulMask & grfKorea)
		return fAnyDbcs ? icetWansung : icetEucKr;

	// Nothing recognizable in the mask -- still have to pick something.
	//
	return icetShiftJis;
}

/* _ U L  I C E T  M A S K  F R O M  C P  E F */
/*----------------------------------------------------------------------------
	%%Function: _UlIcetMaskFromCpEf
	%%Contact: jpick

	Builds the mask of encodings compatible with a code page and an
	encoding family.  Starts from grfAll and narrows it by the region
	mask for nCp and by the family mask for ef; a code page or family
	that is not recognized applies no restriction.
----------------------------------------------------------------------------*/
static ULONG _UlIcetMaskFromCpEf(int nCp, EFam ef)
{
	// "No restriction" until a recognized code page / family says otherwise.
	//
	ULONG grfRegion = grfAll;
	ULONG grfFamily = grfAll;

	switch (nCp)
	{
	case nCpJapan:	grfRegion = grfJapan;	break;
	case nCpChina:	grfRegion = grfChina;	break;
	case nCpKorea:	grfRegion = grfKorea;	break;
	case nCpTaiwan:	grfRegion = grfTaiwan;	break;
	default:
		break;
	}

	switch (ef)
	{
	case efDbcs:	grfFamily = grfDbcs;	break;
	case efEuc:		grfFamily = grfEuc;		break;
	default:
		break;
	}

	// grfAll stays in the intersection: it leaves out some bits (Euc-Cn,
	// Euc-Kr) that the region and family masks contain.
	//
	return (grfAll & grfRegion & grfFamily);
}


/* _ C C E  R E S O L V E  A M B I G U I T Y */
/*----------------------------------------------------------------------------
	%%Function: _CceResolveAmbiguity
	%%Contact: jpick

	Chooses one encoding out of the candidates left in grfIcet (the
	encodings still possible after examining the input sample).

	1. Intersect with the user preferences (nPrefCp / efPref).  If that
	   leaves exactly one encoding, return it; if it leaves several,
	   carry on with that narrowed set.
	2. If the system code page (g_uACP) is a FE code page and UTF-8 is
	   not among the original candidates, intersect with the encodings
	   of the system code page and return the result if it is unique.
	3. Otherwise return UTF-8 if it was an original candidate, else the
	   default encoding for the (possibly narrowed) candidate set.

	Always stores a value in *lpicet and returns cceSuccess.
----------------------------------------------------------------------------*/
static CCE _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref)
{
	const int fUtf8Candidate = ((grfIcet & grfUtf8) != 0);
	ULONG grfCandidates = grfIcet;
	ULONG grfNarrowed;

	// Step 1 -- do the user's preferences make any difference?
	//
	grfNarrowed = grfCandidates & _UlIcetMaskFromCpEf(nPrefCp, efPref);
	if (grfNarrowed)
		{
		if (_CBitsOnFromUlong(grfNarrowed) == 1)
			{
			*lpicet = _IcetFromIcetMask(grfNarrowed);
			return cceSuccess;
			}

		// Several left: keep the narrowed set for the steps below.
		//
		grfCandidates = grfNarrowed;
		}

	// Step 2 -- ask the system code page, but only when it is a FE code
	// page and UTF-8 was not a legal reading of the input.
	//
	if (FIsFeCp(g_uACP) && !fUtf8Candidate)
		{
		grfNarrowed = grfCandidates & _UlIcetMaskFromCpEf(g_uACP, (EFam) 0);
		if (grfNarrowed && (_CBitsOnFromUlong(grfNarrowed) == 1))
			{
			*lpicet = _IcetFromIcetMask(grfNarrowed);
			return cceSuccess;
			}
		}

	// Step 3 -- special case: pick UTF-8 if it's legal and the prefs
	// don't help us; otherwise fall back to the default for the set.
	//
	if (fUtf8Candidate)
		*lpicet = icetUtf8;
	else
		*lpicet = _IcetDefaultFromIcetMask(grfCandidates);

	return cceSuccess;
}


/* _ C C E  R E A D  E S C  S E Q */
/*----------------------------------------------------------------------------
	%%Function: _CceReadEscSeq
	%%Contact: jpick

	We've read (and put back) an escape character.  Call the ISO-2022
	escape sequence converter to have it map the escape sequence to the
	appropriate character set.  We may be looking at the escape sequence
	for ASCII, so be prepared to read ahead to the next one.

	Loop: call CceReadEscSeq(); as long as it answers cceMayBeAscii,
	skip forward to the next esc/so/si byte, put that byte back and try
	again.

	Returns:
		the CceReadEscSeq() result, when it is anything but cceMayBeAscii;
		cceRead     on a stream read error, or when the stop byte could
		            not be put back with Seek();
		cceSuccess  when end of input is reached after only non-flavor-
		            specific sequences.  *lpicet is then chosen from
		            nPrefCp (icetIso2022Kr for nCpKorea, icetIso2022Jp
		            otherwise) and *lpfGuess is set to fTrue.
----------------------------------------------------------------------------*/
static CCE _CceReadEscSeq(
    IStream   *pstmIn,           // input stream
	int       nPrefCp,
	ICET     *lpicet,
	BOOL     *lpfGuess
)
{
	unsigned char uch;
	CCE cceRet;
	int nToken;
	BOOL fDummy;
	LARGE_INTEGER li;
	
	do
		{
		cceRet = CceReadEscSeq(pstmIn, lpicet); 
		
		// Only "may be ASCII" asks us to look further; success and
		// every other code are final.
		//
		if (cceRet != cceMayBeAscii)
			break;
		
		// Skip ahead to the next stop token.
		//
		while (fTrue)
			{
			nToken = _NGetNextUch(pstmIn, &uch, &fDummy);
			if (_FStopToken(nToken))
				break;
			}
			
		// Why did we stop?
		//
		if (nToken == err)
			{
			cceRet = cceRead;
			break;
			}
		else if (nToken == eof)
			{
			// Means this is legal ISO-2022 input, but we've seen nothing
			// but non-flavor-specific escape sequences (e.g., only ASCII
			// or shift sequences).  Choose the encoding type based on
			// preferences (only pick from those currently supported
			// externally).
			//
			switch (nPrefCp)
				{
				case nCpKorea:
					*lpicet = icetIso2022Kr;
					break;
				case nCpJapan:
				default:						// Right ??? (gotta pick something ...)
					*lpicet = icetIso2022Jp;
					break;
				}
			*lpfGuess = fTrue;					// not *really* guessing, but ... (???)
			cceRet = cceSuccess;
			break;
			}
			
		Assert((nToken == esc) || (nToken == so) || (nToken == si));

		// Put the stop byte back for CceReadEscSeq() to process.  If the
		// stream cannot be rewound the next parse would start in the
		// middle of the sequence, so give up with a stream error instead.
		//
		LISet32(li, -1);
		if (FAILED(pstmIn->Seek(li, STREAM_SEEK_CUR, NULL)))
			{
			cceRet = cceRead;
			break;
			}
			
		} while (fTrue);
	
	return cceRet;
}
Add source files 2020-09-26 03:20:57 -05:00			`/*----------------------------------------------------------------------------`
			`%%File: fechauto.c`
			`%%Unit: fechmap`
			`%%Contact: jpick`

			`Module that attempts to auto-detect encoding for a given stream.`
			`----------------------------------------------------------------------------*/`

			`#include <stdio.h>`
			`#include <stddef.h>`

			`#include "private.h"`
			`#include "fechmap_.h"`
			`#include "lexint_.h"`

			`// Code marked by these #defines will be deleted eventually ...`
			`// (It prints out useful information and statistics about how`
			`// auto-detect is doing and what it's finding in the input).`
			`//`
			`#define JPDEBUG 0`
			`#define JPDEBUG2 0`
			`#define JPDEBUG3 0`

			`#define NEED_NAMES 0`

			`#if JPDEBUG \|\| JPDEBUG2 \|\| JPDEBUG3`
			`#undef NEED_NAMES`
			`#define NEED_NAMES 1`
			`#endif`

			`#if NEED_NAMES`
			`static char *rgszIcetNames[icetCount] =`
			`{`
			`"icetEucCn",`
			`"icetEucJp",`
			`"icetEucKr",`
			`"icetEucTw",`
			`"icetIso2022Cn",`
			`"icetIso2022Jp",`
			`"icetIso2022Kr",`
			`"icetIso2022Tw",`
			`"icetBig5",`
			`"icetGbk",`
			`"icetHz",`
			`"icetShiftJis",`
			`"icetWansung",`
			`"icetUtf7",`
			`"icetUtf8",`
			`};`
			`#endif`

			`// Characters we care about`
			`//`
			`#define chSo (UCHAR) 0x0e`
			`#define chSi (UCHAR) 0x0f`
			`#define chEsc (UCHAR) 0x1b`

			`// Minimum Sample Size`
			`//`
			`#define cchMinSample 64`

			`// High-ASCII character threshold. If this routine is unable`
			`// to absolutely determine the encoding of this file, it will`
			`// need to guess. Files that are ASCII, but contain high-ASCII`
			`// characters (e.g., a file with some Cyrillic characters) may`
			`// confuse us. If the number of high-ASCII characters falls`
			`// below this threshold, return the encoding we guessed but`
			`// also return a special rc that says the file "might be ASCII."`
			`//`
			`// 5%, for now.`
			`//`
			`// 40%, for now, of the high-ascii characters must be in high-`
			`// ascii pairs. (Pulled down because of Big5 and the other`
			`// DBCS encodings that can have trail bytes in the low range).`
			`//`
			`#define nHighCharThreshold 5 // %`
			`#define nHighPairThreshold 40 // %`

			`// Used by CceDetermineInputTypeReturnAll() to determine whether any icet has`
			`// high enough count to rule out all other icets.`
			`//`
			`#define CchCountThreshold(icet) (((icet) == icetHz \|\| (icet) == icetUtf7) ? 5 : 10)`



			`// Tokens`
			`//`
			`// Stop tokens (negative) imply special handling and will cause`
			`// the processing loop to stop (eof, err, si, so and esc are`
			`// stop tokens).`
			`//`
			`#define xmn 0`
			`#define esc (-1)`
			`#define so (-2)`
			`#define si (-3)`
			`#define eof (-4)`
			`#define err (-5)`

			`#define _FStopToken(tk) ((tk) < 0)`


			`// Masks used in _CBitsOnFromUlong()`
			`//`
			`#define lMaskBitCount1 (LONG) 0x55555555`
			`#define lMaskBitCount2 (LONG) 0x33333333`
			`#define lMaskBitCount3 (LONG) 0x0F0F0F0F`
			`#define lMaskBitCount4 (LONG) 0x00FF00FF`
			`#define lMaskBitCount5 (LONG) 0x0000FFFF`

			`/* _ C B I T S O N F R O M U L O N G */`
			`/*----------------------------------------------------------------------------`
			`%%Function: _CBitsOnFromUlong`
			`%%Contact: jpick`

			`(adapted from code in convio.c)`
			`----------------------------------------------------------------------------*/`
			`int __inline _CBitsOnFromUlong(ULONG ulBits)`
			`{`
			`ulBits = (ulBits & lMaskBitCount1) + ((ulBits & ~lMaskBitCount1) >> 1);`
			`ulBits = (ulBits & lMaskBitCount2) + ((ulBits & ~lMaskBitCount2) >> 2);`
			`ulBits = (ulBits & lMaskBitCount3) + ((ulBits & ~lMaskBitCount3) >> 4);`
			`ulBits = (ulBits & lMaskBitCount4) + ((ulBits & ~lMaskBitCount4) >> 8);`
			`ulBits = (ulBits & lMaskBitCount5) + ((ulBits & ~lMaskBitCount5) >> 16);`

			`return (int)ulBits;`
			`}`

			`// Masks for the encodings`
			`//`
			`#define grfEucCn (ULONG) 0x0001`
			`#define grfEucJp (ULONG) 0x0002`
			`#define grfEucKr (ULONG) 0x0004`
			`#define grfEucTw (ULONG) 0x0008`
			`#define grfIso2022Cn (ULONG) 0x0010`
			`#define grfIso2022Jp (ULONG) 0x0020`
			`#define grfIso2022Kr (ULONG) 0x0040`
			`#define grfIso2022Tw (ULONG) 0x0080`
			`#define grfBig5 (ULONG) 0x0100`
			`#define grfGbk (ULONG) 0x0200`
			`#define grfHz (ULONG) 0x0400`
			`#define grfShiftJis (ULONG) 0x0800`
			`#define grfWansung (ULONG) 0x1000`
			`#define grfUtf7 (ULONG) 0x2000`
			`#define grfUtf8 (ULONG) 0x4000`

			`// grfAll assumes that the tests for Euc-Kr fall within those`
			`// for Wansung (as far as I can tell from reading, Euc-Kr is a`
			`// strict subset of Wansung). The same for Euc-Cn and Gbk. No`
			`// need to test for both the subset and the whole.`
			`//`
			`#define grfAll (ULONG) 0x7FFA`
			`#define grfAllButIso2022 (ULONG) 0x7F0A`
			`#define cAll 13 // == number bits set in grfAll`
			`#define cAllButIso2022 9 // == number bits set in grfAllButIso2022`

			`// Array that maps an encoding to its mask`
			`//`
			`static ULONG _mpicetgrf[icetCount] =`
			`{`
			`grfEucCn,`
			`grfEucJp,`
			`grfEucKr,`
			`grfEucTw,`
			`grfIso2022Cn,`
			`grfIso2022Jp,`
			`grfIso2022Kr,`
			`grfIso2022Tw,`
			`grfBig5,`
			`grfGbk,`
			`grfHz,`
			`grfShiftJis,`
			`grfWansung,`
			`grfUtf7,`
			`grfUtf8,`
			`};`

			`// Prototypes`
			`//`
			`static int _NGetNextUch(IStream pstmIn, unsigned char c, BOOL *lpfIsHigh);`
			`static ICET _IcetFromIcetMask(ULONG ulMask);`
			`static ICET _IcetDefaultFromIcetMask(ULONG ulMask);`
			`static CCE _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref);`
			`static CCE _CceReadEscSeq(IStream pstmIn, int nPrefCp, ICET lpicet, BOOL *lpfGuess);`


			`/* C C E D E T E R M I N E I N P U T T Y P E */`
			`/*----------------------------------------------------------------------------`
			`%%Function: CceDetermineInputType`
			`%%Contact: jpick`

			`Attempt to determine the appropriate ICET type for the given`
			`stream. Caller-supplied get/unget routines used for data access.`
			`----------------------------------------------------------------------------*/`
			`CCE CceDetermineInputType(`
			`IStream *pstmIn, // input stream`
			`DWORD dwFlags, // configuration flags`
			`EFam efPref, // optional: preferred encoding family`
			`int nPrefCp, // optional: preferred code page`
			`ICET *lpicet, // set to detected encoding`
			`BOOL *lpfGuess // set to fTrue if function "guessed"`
			`)`
			`{`
			`unsigned char uch;`
			`int nToken;`
			`CCE cceRet;`
			`BOOL fGuess;`
			`ICET icet;`
			`int cIcetActive;`
			`ULONG grfIcetActive; // Bitarray tracks which encodings are still active candidates.`
			`ICET icetSeq;`
			`int i, nCount, nCountCurr;`
			`DWORD dwValFlags;`
			`BOOL fIsHigh;`
			`int cchHigh = 0;`
			`int cchHighPairs = 0;`
			`int cchTotal = 0;`
			`BOOL fLastHigh = fFalse;`

			`#if JPDEBUG3`
			`ULONG grfIcetNoCommonChars;`
			`#endif`

			`#if JPDEBUG`
			`printf("flags: %d\n", dwFlags);`
			`#endif`

			`// Initialize parsers`
			`//`
			`dwValFlags = grfCountCommonChars;`
			`if (dwFlags & grfDetectUseCharMapping)`
			`dwValFlags \|= grfValidateCharMapping;`
			`ValidateInitAll(dwValFlags);`

			`// Initialize locals -- be optimistic`
			`//`
			`cceRet = cceSuccess;`
			`fGuess = fFalse;`
			`grfIcetActive = grfAllButIso2022;`
			`cIcetActive = cAllButIso2022;`

			`#if JPDEBUG3`
			`grfIcetNoCommonChars = grfAllButIso2022;`
			`#endif`

			`while (fTrue)`
			`{`
			`nToken = _NGetNextUch(pstmIn, &uch, &fIsHigh);`
			`if (_FStopToken(nToken))`
			`break;`

			`// Update (admittedly dumb) statistics -- really counts high`
			`// ascii characters in runs (not really pairs). But threshold`
			`// constants (defined, above) were determined by calculating`
			`// exactly these numbers for ~25 files, so it should be ok (?).`
			`//`
			`++cchTotal;`
			`if (fIsHigh)`
			`{`
			`++cchHigh;`
			`if (fLastHigh)`
			`++cchHighPairs;`
			`}`
			`fLastHigh = fIsHigh;`

			`for (i = 0; i < icetCount; i++)`
			`{`
			`if (!(grfIcetActive & _mpicetgrf[i]) \|\| (NValidateUch((ICET)i, uch, fFalse) != 0))`
			`continue;`

			`grfIcetActive &= ~_mpicetgrf[i];`
			`--cIcetActive;`
			`#if JPDEBUG`
			`printf("Log: Lost %s at offset 0x%.4x (%d), char 0x%.2x\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1), uch);`
			`#endif`
			`}`

			`#if JPDEBUG3`
			`for (i = 0; i < icetCount; i++)`
			`{`
			`if (!(grfIcetActive & _mpicetgrf[i]) \|\| !(grfIcetNoCommonChars & _mpicetgrf[i]))`
			`continue;`

			`if (!FValidateCharCount(i, &nCount) \|\| (nCount == 0))`
			`continue;`

			`grfIcetNoCommonChars &= ~_mpicetgrf[i];`
			`printf("Log: Found first common seq for %s at offset 0x%.4x (%d)\n", rgszIcetNames[i], (cchTotal-1), (cchTotal-1));`
			`}`
			`#endif`

			`if ((cIcetActive == 0) \|\| ((cIcetActive == 1) && (cchTotal > cchMinSample)))`
			`break;`
			`}`

			`// Figure out why we exited the loop.`
			`//`
			`if (nToken == err)`
			`{`
			`cceRet = cceRead;`
			`goto _LRet;`
			`}`

			`// Process escapes separately. Interpret the escape sequence`
			`// to determine for real which ISO7 flavor we have found.`
			`//`
			`if ((nToken == esc) \|\| (nToken == so) \|\| (nToken == si))`
			`{`
			`LARGE_INTEGER li;`
			`HRESULT hr;`

			`LISet32(li, -1 );`
			`hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);`

			`// if (!pfnUnget(uch, lpvPrivate))`
			`// {`
			`// cceRet = cceUnget;`
			`// goto _LRet;`
			`// }`
			`cceRet = _CceReadEscSeq(pstmIn, nPrefCp, &icet, &fGuess);`
			`#if JPDEBUG`
			`if (cceRet == cceSuccess)`
			`printf("Log: Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);`
			`#endif`
			`// ISO is a special case -- no need to check statistics.`
			`//`
			`goto _LRet;`
			`}`

			`#if JPDEBUG2`
			`printf("Counts: %d total chars, %d high chars, %d high pairs\n", cchTotal, cchHigh, cchHighPairs);`
			`#endif`

			`// If the token was eof, and we're not ignoring eof, transition`
			`// the remaining active sets on eof.`
			`//`
			`if ((nToken == eof) && !(dwFlags & grfDetectIgnoreEof))`
			`{`
			`for (i = 0; i < icetCount; i++)`
			`{`
			`if (!(grfIcetActive & _mpicetgrf[i]) \|\| (NValidateUch((ICET)i, 0, fTrue) != 0))`
			`continue;`
			`#if JPDEBUG`
			`printf("Log: Lost %s at EOF\n", rgszIcetNames[i]);`
			`#endif`
			`grfIcetActive &= ~_mpicetgrf[i];`
			`--cIcetActive;`
			`}`
			`}`

			`Assert(cIcetActive >= 0); // better not be less than 0`

			`// See how we've narrowed our field of choices and set the`
			`// return status accordingly.`
			`//`
			`if (cIcetActive <= 0)`
			`{`
			`#if JPDEBUG`
			`printf("Log: Bailed out entirely at offset 0x%.4x (%d)\n", cchTotal, cchTotal);`
			`#endif`
			`cceRet = cceUnknownInput;`
			`goto _LRet;`
			`}`
			`else if (cIcetActive == 1)`
			`{`
			`icet = _IcetFromIcetMask(grfIcetActive);`
			`#if JPDEBUG`
			`printf("Log: Found encoding %s at offset 0x%.4x (%d)\n", rgszIcetNames[icet], cchTotal, cchTotal);`
			`#endif`
			`// If we matched an encoding type and also found matching`
			`// common character runs, skip statistics (see comment,`
			`// below).`
			`//`
			`if (FValidateCharCount(icet, &nCount) && (nCount > 0))`
			`{`
			`#if JPDEBUG3`
			`printf("Log: %d common sequences for %s\n", nCount, rgszIcetNames[icet]);`
			`#endif`
			`goto _LRet;`
			`}`
			`else`
			`{`
			`goto _LStats;`
			`}`
			`}`

			`// Did we learn anything from counting characters?`
			`//`
			`icetSeq = (ICET)-1;`
			`nCountCurr = 0;`
			`for (i = 0; i < icetCount; i++)`
			`{`
			`if (!(grfIcetActive & _mpicetgrf[i]) \|\| !FValidateCharCount((ICET)i, &nCount))`
			`continue;`

			`if (nCount > nCountCurr)`
			`{`
			`icetSeq = (ICET)i;`
			`nCountCurr = nCount;`
			`}`

			`#if JPDEBUG3`
			`printf("Log: %d common sequences for %s\n", nCount, rgszIcetNames[i]);`
			`#endif`
			`}`

			`// Any luck? If so, return. Don't bother checking statistics.`
			`// We just proved that we found at least one common run of`
			`// characters in this input. The odds against this for just a`
			`// plain ASCII file with some high characters seem pretty high.`
			`// Ignore the statistics and just return the encoding type we`
			`// found.`
			`//`
			`if (icetSeq != -1)`
			`{`
			`icet = icetSeq;`
			`goto _LRet;`
			`}`

			`#if JPDEBUG`
			`printf("Log: Active Icet Mask 0x%.8x, %d left\n", grfIcetActive, cIcetActive);`
			`printf("Log: Icet's left -- ");`
			`for (i = 0; i < icetCount; i++)`
			`{`
			`if (grfIcetActive & _mpicetgrf[i])`
			`printf("%s, ", rgszIcetNames[i]);`
			`}`
			`printf("\n");`
			`#endif`

			`// If caller did not want us to try to guess at the encoding`
			`// in the absence of definitive data, bail out.`
			`//`
			`if (!(dwFlags & grfDetectResolveAmbiguity))`
			`{`
			`cceRet = cceAmbiguousInput;`
			`goto _LRet;`
			`}`

			`// We're guessing -- note it.`
			`//`
			`fGuess = fTrue;`

			`// More than one active encoding. Attempt to resolve ambiguity.`
			`//`
			`cceRet = _CceResolveAmbiguity(grfIcetActive, &icet, nPrefCp, efPref);`
			`if (cceRet != cceSuccess)`
			`return cceRet;`

			`_LStats:`

			`// Adjust the return code based on the "statistics" we gathered,`
			`// above.`
			`//`
			`if (cchHigh > 0)`
			`{`
			`if ((cchTotal < cchMinSample) \|\|`
			`(((cchHigh * 100) / cchTotal) < nHighCharThreshold) \|\|`
			`(((cchHighPairs * 100) / cchHigh) < nHighPairThreshold))`
			`{`
			`cceRet = cceMayBeAscii;`
			`}`
			`}`
			`else`
			`{`
			`cceRet = cceMayBeAscii; // no high-ascii characters? definitely maybe!`
			`}`

			`#if JPDEBUG2`
			`if (cchHigh > 0)`
			`{`
			`int nPercent1 = ((cchHigh * 100) / cchTotal);`
			`int nPercent2 = ((cchHighPairs * 100) / cchHigh);`
			`printf("Ratios -- high/total: %d%%, runs/high: %d%%\n", nPercent1, nPercent2);`
			`}`
			`#endif`

			`_LRet:`

			`// Set the return variables, if successful.`
			`//`
			`if ((cceRet == cceSuccess) \|\| (cceRet == cceMayBeAscii))`
			`{`
			`*lpicet = icet;`
			`*lpfGuess = fGuess;`
			`}`

			`#if JPDEBUG`
			`if (cceRet == cceSuccess)`
			`{`
			`printf("Log: Returning %s, fGuess = %s\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));`
			`}`
			`else if (cceRet == cceMayBeAscii)`
			`{`
			`printf("Log: Returning %s, fGuess = %s, may-be-ASCII\n", rgszIcetNames[icet], (fGuess ? "fTrue" : "fFalse"));`
			`}`
			`#endif`

			`return cceRet;`
			`}`


			`/* _ N G E T N E X T U C H */`
			`/*----------------------------------------------------------------------------`
			`%%Function: _NGetNextUch`
			`%%Contact: jpick`

			`Get the next character from the input stream. Classify the character.`
			`----------------------------------------------------------------------------*/`
			`static int _NGetNextUch(IStream pstmIn, unsigned char c, BOOL *lpfIsHigh)`
			`{`
			`ULONG rc;`
			`unsigned char uch;`
			`HRESULT hr;`

			`hr = pstmIn->Read(&uch, 1, &rc);`

			`if (rc == 0)`
			`return eof;`
			`else if (hr != S_OK )`
			`return err;`

			`*lpfIsHigh = (uch >= 0x80);`
			`*c = uch;`

			`switch (uch)`
			`{`
			`case chEsc:`
			`return esc;`
			`case chSo:`
			`return so;`
			`case chSi:`
			`return si;`
			`default:`
			`return xmn;`
			`}`
			`}`


			// Masks for _CceResolveAmbiguity() -- only externally supported character
			// sets are used in ambiguity resolution.  Don't include Euc-Tw here.
			//
			// Each replacement list is wrapped in its own outer parentheses so the
			// mask is a single primary expression wherever it is expanded (for
			// example as the operand of sizeof).
			//
			#define grfJapan	((ULONG) (grfShiftJis | grfEucJp))
			#define grfChina	((ULONG) (grfEucCn | grfGbk))
			#define grfKorea	((ULONG) (grfEucKr | grfWansung))
			#define grfTaiwan	((ULONG) (grfBig5))
			#define grfDbcs		((ULONG) (grfShiftJis | grfGbk | grfWansung | grfBig5))
			#define grfEuc		((ULONG) (grfEucJp | grfEucKr | grfEucCn))


			/* _ I C E T F R O M I C E T M A S K */
			/*----------------------------------------------------------------------------
				%%Function: _IcetFromIcetMask
				%%Contact: jpick

				Translate a mask that is exactly equal to one of the single-encoding
				masks into the matching ICET value.  Any other mask value (which
				callers are not expected to pass) yields icetShiftJis, because a
				bogus value cannot be returned.
			----------------------------------------------------------------------------*/
			static ICET _IcetFromIcetMask(ULONG ulMask)
			{
				// One row per encoding: its mask and the ICET it stands for.
				static const struct
				{
					ULONG	grf;
					ICET	icet;
				} rgMaskToIcet[] =
				{
					{ grfEucCn,		icetEucCn		},
					{ grfEucJp,		icetEucJp		},
					{ grfEucKr,		icetEucKr		},
					{ grfEucTw,		icetEucTw		},
					{ grfIso2022Cn,	icetIso2022Cn	},
					{ grfIso2022Jp,	icetIso2022Jp	},
					{ grfIso2022Kr,	icetIso2022Kr	},
					{ grfIso2022Tw,	icetIso2022Tw	},
					{ grfBig5,		icetBig5		},
					{ grfGbk,		icetGbk			},
					{ grfHz,		icetHz			},
					{ grfShiftJis,	icetShiftJis	},
					{ grfWansung,	icetWansung		},
					{ grfUtf7,		icetUtf7		},
					{ grfUtf8,		icetUtf8		},
				};
				const int cRows = (int) (sizeof(rgMaskToIcet) / sizeof(rgMaskToIcet[0]));
				int iRow;

				for (iRow = 0; iRow < cRows; iRow++)
				{
					if (rgMaskToIcet[iRow].grf == ulMask)
						return rgMaskToIcet[iRow].icet;
				}

				// Should never get here ... but a bogus value can't be returned,
				// so fall back to Shift-JIS.
				//
				return icetShiftJis;
			}

			/* _ I C E T D E F A U L T F R O M I C E T M A S K */
			/*----------------------------------------------------------------------------
				%%Function: _IcetDefaultFromIcetMask
				%%Contact: jpick

				Pick a default encoding from a mask that may still contain several
				candidates.

				Priorities: any DBCS candidate beats any EUC candidate; within a
				family the order is Japan, China, Taiwan, Korea (EUC has no Taiwan
				entry here).  If the mask contains neither a DBCS nor one of the
				listed EUC encodings, icetShiftJis is returned.

				The DBCS branch looks only at the DBCS bits that are actually set.
				The country masks also contain the EUC bits, so testing them
				against the whole mask could select a DBCS encoding that is not
				present (e.g. Big5 + EUC-JP would have produced Shift-JIS).
			----------------------------------------------------------------------------*/
			static ICET _IcetDefaultFromIcetMask(ULONG ulMask)
			{
				// DBCS candidates that are still present in the mask.
				ULONG grfDbcsLeft = ulMask & grfDbcs;

				if (grfDbcsLeft)
				{
					if (grfDbcsLeft & grfJapan)
						return icetShiftJis;
					if (grfDbcsLeft & grfChina)
						return icetGbk;
					if (grfDbcsLeft & grfTaiwan)
						return icetBig5;
					if (grfDbcsLeft & grfKorea)
						return icetWansung;
				}
				else	// EUC -- no DBCS bits are set, so the country masks can only
						// match their EUC members here.
				{
					if (ulMask & grfJapan)
						return icetEucJp;
					if (ulMask & grfChina)
						return icetEucCn;
					if (ulMask & grfKorea)
						return icetEucKr;		// may be able to return icetWansung, here
				}

				// Nothing matched (mask holds none of the encodings above).
				return icetShiftJis;	// ???
			}

			/* _ U L I C E T M A S K F R O M C P E F */
			/*----------------------------------------------------------------------------
				%%Function: _UlIcetMaskFromCpEf
				%%Contact: jpick

				Build the mask of encodings compatible with a code page and an
				encoding family.  Starts from grfAll, narrows it to the country
				mask when nCp is one of the four known code pages, then narrows it
				to the family mask when ef is efDbcs or efEuc.  Unrecognized values
				of either argument leave the mask untouched.
			----------------------------------------------------------------------------*/
			static ULONG _UlIcetMaskFromCpEf(int nCp, EFam ef)
			{
				ULONG grfMask = grfAll;

				// Narrow by code page, if it is one we know.
				if (nCp == nCpJapan)
					grfMask &= grfJapan;
				else if (nCp == nCpChina)
					grfMask &= grfChina;
				else if (nCp == nCpKorea)
					grfMask &= grfKorea;
				else if (nCp == nCpTaiwan)
					grfMask &= grfTaiwan;

				// Narrow by encoding family, if one was given.
				if (ef == efDbcs)
					grfMask &= grfDbcs;
				else if (ef == efEuc)
					grfMask &= grfEuc;

				return grfMask;
			}


			`/* _ C C E R E S O L V E A M B I G U I T Y */`
			`/*----------------------------------------------------------------------------`
			`%%Function: _CceResolveAmbiguity`
			`%%Contact: jpick`

			`Attempt to resolve ambiguous input encoding based on user`
			`preferences, if set, and system code page. grfIcet contains a`
			`bitmask representing the encodings that are still possible after`
			`examining the input sample.`
			`----------------------------------------------------------------------------*/`
			`static CCE _CceResolveAmbiguity(ULONG grfIcet, ICET *lpicet, int nPrefCp, EFam efPref)`
			`{`
			`ULONG grfIcetOrig = grfIcet;`
			`ULONG grfPref;`
			`ULONG grfSys;`
			`ULONG grfResult;`
			`int cIcet;`

			`// Build "list" of encodings based on user-prefs.`
			`//`
			`grfPref = _UlIcetMaskFromCpEf(nPrefCp, efPref);`

			`// See if the user's preferences make any difference.`
			`//`
			`grfResult = grfIcet & grfPref;`

			`if (grfResult)`
			`{`
			`cIcet = _CBitsOnFromUlong(grfResult);`
			`if (cIcet == 1)`
			`{`
			`*lpicet = _IcetFromIcetMask(grfResult);`
			`return cceSuccess;`
			`}`
			`else`
			`grfIcet = grfResult; // see comment, below`
			`}`

			`// Now look to the system code page for help. Look at`
			`// the set of encodings as modified by the user`
			`// preferences (??? do we want to do this ???).`
			`//`
			`if (!FIsFeCp(g_uACP) \|\| (grfIcetOrig & grfUtf8))`
			`goto _LDefault;`

			`// Build "list" of encodings based on system cp.`
			`//`
			`grfSys = _UlIcetMaskFromCpEf(g_uACP, (EFam) 0);`

			`// See if the system cp makes any difference.`
			`//`
			`grfResult = grfIcet & grfSys;`

			`if (grfResult)`
			`{`
			`cIcet = _CBitsOnFromUlong(grfResult);`
			`if (cIcet == 1)`
			`{`
			`*lpicet = _IcetFromIcetMask(grfResult);`
			`return cceSuccess;`
			`}`
			`}`

			`_LDefault:`

			`// Special case -- pick UTF-8 if it's legal and the prefs`
			`// don't help us.`
			`//`
			`*lpicet =`
			`(grfIcetOrig & grfUtf8) ? icetUtf8 : _IcetDefaultFromIcetMask(grfIcet);`
			`return cceSuccess;`
			`}`


			`/* _ C C E R E A D E S C S E Q */`
			`/*----------------------------------------------------------------------------`
			`%%Function: _CceReadEscSeq`
			`%%Contact: jpick`

			`We've read (and put back) an escape character. Call the ISO-2022`
			`escape sequence converter to have it map the escape sequence to the`
			`appropriate character set. We may be looking at the escape sequence`
			`for ASCII, so be prepared to read ahead to the next one.`
			`----------------------------------------------------------------------------*/`
			`static CCE _CceReadEscSeq(`
			`IStream *pstmIn, // input stream`
			`int nPrefCp,`
			`ICET *lpicet,`
			`BOOL *lpfGuess`
			`)`
			`{`
			`unsigned char uch;`
			`CCE cceRet;`
			`int nToken;`
			`BOOL fDummy;`

			`do`
			`{`
			`cceRet = CceReadEscSeq(pstmIn, lpicet);`

			`if ((cceRet == cceSuccess) \|\| (cceRet != cceMayBeAscii))`
			`break;`

			`while (fTrue)`
			`{`
			`nToken = _NGetNextUch(pstmIn, &uch, &fDummy);`
			`if (_FStopToken(nToken))`
			`break;`
			`}`

			`// Why did we stop?`
			`//`
			`if (nToken == err)`
			`{`
			`cceRet = cceRead;`
			`break;`
			`}`
			`else if (nToken == eof)`
			`{`
			`// Means this is legal ISO-2022 input, but we've seen nothing`
			`// but non-flavor-specific escape sequences (e.g., only ASCII`
			`// or shift sequences). Choose the encoding type based on`
			`// preferences (only pick from those currently supported`
			`// externally).`
			`//`
			`switch (nPrefCp)`
			`{`
			`case nCpKorea:`
			`*lpicet = icetIso2022Kr;`
			`break;`
			`case nCpJapan:`
			`default: // Right ??? (gotta pick something ...)`
			`*lpicet = icetIso2022Jp;`
			`break;`
			`}`
			`lpfGuess = fTrue; // not really* guessing, but ... (???)`
			`cceRet = cceSuccess;`
			`break;`
			`}`

			`Assert((nToken == esc) \|\| (nToken == so) \|\| (nToken == si));`
			`{`
			`LARGE_INTEGER li;`
			`HRESULT hr;`

			`LISet32(li, -1 );`

			`hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL);`
			`}`
			`// Put it back for CceReadEscSeq() to process.`
			`//`
			`// if (!pfnUnget(uch, lpvPrivate))`
			`// {`
			`// cceRet = cceUnget;`
			`// break;`
			`// }`

			`} while (fTrue);`

			`return cceRet;`
			`}`