// windows-nt/Source/XPSP1/NT/shell/ext/mlang/detcjpn.cpp

// =================================================================================
// Internet Character Set Detection: For Japanese
// =================================================================================

#include "private.h"
#include "detcbase.h"
#include "detcjpn.h"
#include "fechrcnv.h"
#include "codepage.h"

// Constructs a detector with every score at zero and every per-encoding
// state machine in its idle state.
//
// nCp - the caller's code page. It is remembered as the preferred code page,
//       which GetDetectedCodeSet() falls back to when EUC and Shift-JIS tie.
CIncdJapanese::CIncdJapanese(DWORD nCp)
{
    // Japanese auto-select carries no preference of its own, so it is
    // biased to Shift-JIS, matching the historical behavior.
    if (nCp == CP_JP_AUTO)
        m_nPreferredCp = CP_JPN_SJ;
    else
        m_nPreferredCp = nCp;

    // No escape sequence in progress, no pending lead byte in any encoding.
    m_nISOMode = NONE;
    m_nJISMode = REGULAR;
    m_nEucMode = REGULAR;
    m_fDoubleByteSJis = FALSE;

    // Nothing has been scored yet.
    m_nScoreSJis = 0;
    m_nScoreEuc = 0;
    m_nScoreJis = 0;
}

// Feeds one byte to the ISO-2022 escape-sequence recognizer.
//
// Progress through a sequence is kept in m_nISOMode:
//   NONE        - not inside an escape sequence
//   ISO_ESC     - ESC seen
//   ISO_ESC_IN  - ESC '$' seen
//   ISO_ESC_OUT - ESC '(' seen
//
// Recognized sequences and their effect on m_nJISMode:
//   ESC '$' 'B' / ESC '$' '@'  -> DOUBLEBYTE
//   ESC '(' 'B' / ESC '(' 'J'  -> REGULAR
//   ESC '(' 'I'                -> KATAKANA
//
// Returns TRUE only for the byte that completes one of the sequences above
// (i.e. the JIS mode has just been switched); FALSE for every other byte,
// including the leading bytes of a sequence.
BOOL CIncdJapanese::CheckISOChar(UCHAR tc)
{
    // An ESC always begins a new escape sequence, whatever state we are in.
    // Previously an ESC arriving in the middle of a sequence reset the state
    // to NONE and was thrown away, so the sequence it started (for example
    // the second ESC in "ESC ESC $ B") was never recognized.
    if ( tc == ESC )
    {
        m_nISOMode = ISO_ESC ;
        return FALSE ;
    }

    switch (m_nISOMode) {
    case NONE:
        // Ordinary byte outside of any escape sequence.
        break;
    case ISO_ESC:
        if ( tc == ISO2022_IN_CHAR )            // '$'
            m_nISOMode = ISO_ESC_IN ;
        else if ( tc == ISO2022_OUT_CHAR )      // '('
            m_nISOMode = ISO_ESC_OUT ;
        else
            m_nISOMode = NONE ;                 // not a sequence we track
        break;
    case ISO_ESC_IN:    // esc '$'
        // The sequence ends here whether or not the final byte is known.
        m_nISOMode = NONE ;
        if ( tc == ISO2022_IN_JP_CHAR1 ||       // 'B'
                tc == ISO2022_IN_JP_CHAR2 )     // '@'
        {
            m_nJISMode = DOUBLEBYTE ;
            return TRUE ;
        }
        break;
    case ISO_ESC_OUT:   // esc '('
        // The sequence ends here whether or not the final byte is known.
        m_nISOMode = NONE ;
        if ( tc == ISO2022_OUT_JP_CHAR1 ||      // 'B'
                tc == ISO2022_OUT_JP_CHAR2 )    // 'J'
        {
            m_nJISMode = REGULAR ;
            return TRUE ;
        }
        else if ( tc == ISO2022_OUT_JP_CHAR3 )  // 'I'
        {
            m_nJISMode = KATAKANA;
            return TRUE ;
        }
        break;
    }
    return FALSE;
}

// Scores one input byte against the three candidate encodings (JIS,
// EUC-J and Shift-JIS), updating the per-encoding scores and state.
//
// tc - the next byte of the text being examined.
//
// Every path through this function returns FALSE.
BOOL CIncdJapanese::DetectChar(UCHAR tc)
{
    // ---- JIS -----------------------------------------------------------
    // A byte that completes a JIS escape sequence only switches the JIS
    // mode; it is not scored for any encoding.
    if (CheckISOChar(tc))
        return FALSE;

    // While JIS is shifted into double-byte or katakana mode, every byte is
    // credited to JIS and the other encodings are not consulted.
    if (m_nJISMode == DOUBLEBYTE || m_nJISMode == KATAKANA) {
        m_nScoreJis += SCORE_MAJOR;
        return FALSE;
    }

    // In regular mode only 7-bit bytes are credited to JIS.
    if (m_nJISMode == REGULAR && tc < 0x80)
        m_nScoreJis += SCORE_MAJOR;

    // ---- EUC-J ---------------------------------------------------------
    // 0xa1..0xfe serves as both the lead and the trail range of a
    // double-byte character.
    const BOOL fEucDoubleByteRange = (tc >= 0xa1 && tc <= 0xfe);

    if (m_nEucMode == REGULAR) {
        if (fEucDoubleByteRange) {
            m_nEucMode = DOUBLEBYTE;        // lead byte, wait for the trail
        } else if (tc == 0x8e) {
            m_nEucMode = KATAKANA;          // prefix of a single-byte katakana
        } else if (tc < 0x80) {
            m_nScoreEuc += SCORE_MAJOR;     // plain 7-bit byte
        }
    } else if (m_nEucMode == DOUBLEBYTE) {
        // Second byte of a pair: credit both bytes if it is a valid trail,
        // then go back to regular mode either way.
        if (fEucDoubleByteRange)
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
    } else if (m_nEucMode == KATAKANA) {
        // Byte following 0x8e: credit the pair if it lies in the katakana
        // range, then go back to regular mode either way.
        if (tc >= 0xa1 && tc <= 0xdf)
            m_nScoreEuc += SCORE_MAJOR * 2;
        m_nEucMode = REGULAR;
    }

    // ---- Shift-JIS -----------------------------------------------------
    if (m_fDoubleByteSJis) {
        // Second byte of a pair: credit both bytes if it lies in the trail
        // byte range (0x40..0xfc except 0x7f); the pair is finished either way.
        if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f)
            m_nScoreSJis += SCORE_MAJOR * 2;
        m_fDoubleByteSJis = FALSE;
    } else {
        const BOOL fSJisLead = (tc >= 0x81 && tc <= 0x9f) ||
                               (tc >= 0xe0 && tc <= 0xfc);
        if (fSJisLead) {
            m_fDoubleByteSJis = TRUE;       // lead byte, wait for the trail
        } else if (tc <= 0x7e || (tc >= 0xa1 && tc <= 0xdf)) {
            m_nScoreSJis += SCORE_MAJOR;    // single-byte character
        }
    }

    return FALSE;
}

// Picks a code page from the scores accumulated by DetectChar().
//
// Returns CP_ISO_2022_JP, CP_EUC_JP, CP_JPN_SJ or the preferred code page
// given to the constructor, or 0 when the result is ambiguous.
int CIncdJapanese::GetDetectedCodeSet()
{
    // First settle EUC against Shift-JIS.
    int nTopScore = m_nScoreSJis;
    int nWinner;

    if (m_nScoreEuc < nTopScore) {
        nWinner = CP_JPN_SJ;
    } else if (m_nScoreEuc > nTopScore) {
        nTopScore = m_nScoreEuc;
        nWinner = CP_EUC_JP;
    } else {
        // Tie between EUC and Shift-JIS.
        if (m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR) {
            // Enough text has been scored: decide for EUC.
            nWinner = CP_EUC_JP;
        } else {
            // Too little text to tell the two apart (for example when only
            // a window title is given in the Shell HyperText view): use the
            // preferred code page, which is Shift-JIS under auto-select.
            nWinner = m_nPreferredCp;
        }
    }

    // Then compare JIS against the better of the two.
    // An equal score means every byte was 7-bit; that may be plain ANSI
    // data, so the result is reported as ambiguous.
    if (m_nScoreJis == nTopScore)
        return 0;

    if (m_nScoreJis > nTopScore)
        return CP_ISO_2022_JP;

    return nWinner;
}
Add source files 2020-09-26 03:20:57 -05:00			`// =================================================================================`
			`// Internet Character Set Detection: For Japanese`
			`// =================================================================================`

			`#include "private.h"`
			`#include "detcbase.h"`
			`#include "detcjpn.h"`
			`#include "fechrcnv.h"`
			`#include "codepage.h"`

			`CIncdJapanese::CIncdJapanese(DWORD nCp)`
			`{`
			`m_nScoreJis = 0;`
			`m_nScoreEuc = 0;`
			`m_nScoreSJis = 0;`

			`m_nISOMode = NONE ;`
			`m_nJISMode = REGULAR;`
			`m_nEucMode = REGULAR;`
			`m_fDoubleByteSJis = FALSE;`
			`// If Jpn autoselect, we'll bias to Shift-Jis like we did before`
			`m_nPreferredCp = (nCp == CP_JP_AUTO)? CP_JPN_SJ : nCp;`
			`}`

			`BOOL CIncdJapanese::CheckISOChar(UCHAR tc)`
			`{`
			`switch (m_nISOMode) {`
			`case NONE:`
			`if ( tc == ESC )`
			`m_nISOMode = ISO_ESC ;`
			`break;`
			`case ISO_ESC:`
			`if ( tc == ISO2022_IN_CHAR ) // '$'`
			`m_nISOMode = ISO_ESC_IN ;`
			`else if ( tc == ISO2022_OUT_CHAR )`
			`m_nISOMode = ISO_ESC_OUT ; // '('`
			`else`
			`m_nISOMode = NONE ;`
			`break;`
			`case ISO_ESC_IN: // esc '$'`
			`m_nISOMode = NONE ;`
			`if ( tc == ISO2022_IN_JP_CHAR1 \|\| // 'B'`
			`tc == ISO2022_IN_JP_CHAR2 ) // '@'`
			`{`
			`m_nJISMode = DOUBLEBYTE ;`
			`return TRUE ;`
			`}`
			`break;`
			`case ISO_ESC_OUT: // esc '('`
			`m_nISOMode = NONE ;`
			`if ( tc == ISO2022_OUT_JP_CHAR1 \|\| // 'B'`
			`tc == ISO2022_OUT_JP_CHAR2 ) // 'J'`
			`{`
			`m_nJISMode = REGULAR ;`
			`return TRUE ;`
			`}`
			`else if ( tc == ISO2022_OUT_JP_CHAR3 ) // 'I'`
			`{`
			`m_nJISMode = KATAKANA;`
			`return TRUE ;`
			`}`
			`break;`
			`}`
			`return FALSE;`
			`}`

			`BOOL CIncdJapanese::DetectChar(UCHAR tc)`
			`{`
			`// JIS`
			`if ( CheckISOChar(tc) )`
			`return FALSE; // JIS mode change, don't need to check other type`

			`switch (m_nJISMode) {`
			`case REGULAR:`
			`if (tc < 0x80)`
			`m_nScoreJis += SCORE_MAJOR;`
			`break;`
			`case DOUBLEBYTE:`
			`case KATAKANA:`
			`m_nScoreJis += SCORE_MAJOR;`
			`return FALSE; // In JIS mode for sure, don't need to check other type`
			`}`

			`// EUC-J`
			`switch (m_nEucMode) {`
			`case REGULAR:`
			`if (tc >= 0xa1 && tc <= 0xfe) // Double Byte`
			`m_nEucMode = DOUBLEBYTE;`
			`else if (tc == 0x8e) // Single Byte Katakana`
			`m_nEucMode = KATAKANA;`
			`else if (tc < 0x80)`
			`m_nScoreEuc += SCORE_MAJOR;`
			`break;`
			`case DOUBLEBYTE:`
			`if (tc >= 0xa1 && tc <= 0xfe)`
			`m_nScoreEuc += SCORE_MAJOR * 2;`
			`m_nEucMode = REGULAR;`
			`break;`
			`case KATAKANA:`
			`if (tc >= 0xa1 && tc <= 0xdf) // Katakana range`
			`m_nScoreEuc += SCORE_MAJOR * 2;`
			`m_nEucMode = REGULAR;`
			`break;`
			`}`

			`// Shift-JIS`
			`if (!m_fDoubleByteSJis) {`
			`if ((tc >= 0x81 && tc <= 0x9f) \|\| (tc >= 0xe0 && tc <= 0xfc)) // Double Byte`
			`m_fDoubleByteSJis = TRUE;`
			`else if (tc <= 0x7e \|\| (tc >= 0xa1 && tc <= 0xdf))`
			`m_nScoreSJis += SCORE_MAJOR;`
			`} else {`
			`if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f) // Trail Byte range`
			`m_nScoreSJis += SCORE_MAJOR * 2;`
			`m_fDoubleByteSJis = FALSE;`
			`}`

			`return FALSE;`
			`}`

			`int CIncdJapanese::GetDetectedCodeSet()`
			`{`
			`int nMaxScore = m_nScoreSJis;`
			`int nCodeSet = CP_JPN_SJ;`

			`if (m_nScoreEuc > nMaxScore) {`
			`nMaxScore = m_nScoreEuc;`
			`nCodeSet = CP_EUC_JP ; // EUC`
			`} else if (m_nScoreEuc == nMaxScore) {`
			`if (m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR)`
			`// If the given string is not long enough, we should rather choose SJIS`
			`// This helps fix the bug when we are just given Window Title`
			`// at Shell HyperText view.`
			`nCodeSet = CP_EUC_JP ; // EUC`
			`else`
			`// If we can't distinguish between EUC and Shift-Jis, we use the preferred one`
			`nCodeSet = m_nPreferredCp;`
			`}`

			`// JIS`
			`if (m_nScoreJis > nMaxScore)`
			`nCodeSet = CP_ISO_2022_JP ;`
			`// Even score means all 7bits chars`
			`// in this case, it maybe just pure ANSI data, we return it is ambiguous.`
			`else if (m_nScoreJis == nMaxScore)`
			`nCodeSet = 0 ;`

			`return nCodeSet;`
			`}`