windows-nt/Source/XPSP1/NT/shell/ext/mlang/utf8obj.cpp

// ============================================================================
// Internet Character Set Conversion: Input from UTF-8
// ============================================================================

#include "private.h"
#include "fechrcnv.h"
#include "utf8obj.h"

/******************************************************************************
**************************   C O N S T R U C T O R   **************************
******************************************************************************/

// Forwards the code page and code set to the base converter, then puts the
// decoder members into their initial values through Reset().
CInccUTF8In::CInccUTF8In(UINT uCodePage, int nCodeSet)
    : CINetCodeConverter(uCodePage, nCodeSet)
{
    Reset();
}

/******************************************************************************
*******************************   R E S E T   *********************************
******************************************************************************/

// Returns the decoder to its initial state: the conversion and clean-up
// handlers point at ConvMain / CleanUpMain and no partially collected
// sequence is pending.
void CInccUTF8In::Reset()
{
    // A pointer to member must be formed with the explicit &Class::Member
    // syntax; the bare member name is only accepted as a compiler extension.
    m_pfnConv = &CInccUTF8In::ConvMain;
    m_pfnCleanUp = &CInccUTF8In::CleanUpMain;

    m_nByteFollow = 0;              // no trail bytes expected
    m_tcUnicode = 0;                // accumulator for 2- and 3-byte sequences
    m_tcSurrogateUnicode = 0;       // accumulator for 4-byte sequences
    m_nBytesUsed = 0;               // no bytes of an unfinished sequence held
    m_fSurrogatesPairs = FALSE;     // not inside a 4-byte sequence
}

/******************************************************************************
*************************   C O N V E R T   C H A R   *************************
******************************************************************************/

// Feeds one source byte to the currently selected conversion handler and maps
// its BOOL result onto an HRESULT (S_OK on success, E_FAIL otherwise).
// cchSrc is not consulted by this implementation.
HRESULT CInccUTF8In::ConvertChar(UCHAR tc, int cchSrc)
{
    return (this->*m_pfnConv)(tc) ? S_OK : E_FAIL;
}

/******************************************************************************
*****************************   C L E A N   U P   *****************************
******************************************************************************/

// Invokes the currently selected clean-up handler and passes its result back
// to the caller unchanged.
BOOL CInccUTF8In::CleanUp()
{
    const BOOL fCleaned = (this->*m_pfnCleanUp)();
    return fCleaned;
}

/******************************************************************************
****************************   C O N V   M A I N   ****************************
******************************************************************************/

// Consumes one UTF-8 byte. Once a complete sequence has been collected, the
// resulting UTF-16 code unit(s) are written through Output(), low byte first.
//
// State carried between calls:
//   m_nByteFollow        - trail bytes still expected for the current sequence
//   m_nBytesUsed         - bytes of an unfinished sequence consumed so far
//                          (this is what GetUnconvertBytes() reports)
//   m_tcUnicode          - accumulator for 2- and 3-byte sequences
//   m_tcSurrogateUnicode - accumulator for 4-byte sequences
//   m_fSurrogatesPairs   - TRUE while a 4-byte sequence is being collected
//
// Malformed input handling:
//   - a stray trail byte or an invalid lead byte (0xF8..0xFF) is dropped;
//   - an ASCII byte inside an unfinished sequence abandons that sequence and
//     is then emitted normally;
//   - a lead byte inside an unfinished sequence emits a blank and is dropped;
//   - a 4-byte sequence whose value is not in U+10000..U+10FFFF emits a blank.
//
// Returns the result of the last Output() call made for this byte, or TRUE
// when the byte produced no output.
BOOL CInccUTF8In::ConvMain(UCHAR tc)
{
    BOOL fDone = TRUE;

    if( ( 0x80 & tc ) == 0 )                    // BIT7 == 0 ASCII
    {
        // An ASCII byte can never continue a multi-byte sequence. Drop any
        // unfinished sequence so its state does not swallow later bytes or
        // cause the next valid lead byte to be rejected.
        m_nByteFollow = 0;
        m_fSurrogatesPairs = FALSE;

        Output(tc);
        fDone = Output(0);
        m_nBytesUsed = 0;
    }
    else if( (0x40 & tc) == 0 )                 // BIT6 == 0 a trail byte
    {
        if( m_nByteFollow )
        {
            if (m_fSurrogatesPairs)
            {
                m_nByteFollow--;
                m_tcSurrogateUnicode <<= 6;             // make room for trail byte
                m_tcSurrogateUnicode |= ( 0x3F & tc );  // add the low 6 bits of the trail byte

                if( m_nByteFollow == 0)                 // end of sequence
                {
                    if (m_tcSurrogateUnicode >= 0x10000 && m_tcSurrogateUnicode <= 0x10FFFF)
                    {
                        // High surrogate, low byte first.
                        m_tcUnicode = (WCHAR)(((m_tcSurrogateUnicode - 0x10000) >> 10) + HIGHT_SURROGATE_START);
                        tc = (UCHAR)m_tcUnicode ;
                        if ( fDone = Output(tc) )
                        {
                            tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
                            fDone = Output(tc);
                        }

                        // Low surrogate, low byte first.
                        m_tcUnicode = (WCHAR)((m_tcSurrogateUnicode - 0x10000)%0x400 + LOW_SURROGATE_START);
                        tc = (UCHAR)m_tcUnicode ;
                        if ( fDone = Output(tc) )
                        {
                            tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
                            fDone = Output(tc);
                        }
                    }
                    else
                    {
                        // Overlong form or value beyond U+10FFFF: subtracting
                        // 0x10000 would wrap or overflow the surrogate range,
                        // so substitute a blank as the other error path does.
                        Output(' ');
                        fDone = Output(0);
                    }
                    m_fSurrogatesPairs = FALSE;
                    m_nBytesUsed = 0 ;
                }
                else
                    m_nBytesUsed++ ;
            }
            else
            {
                m_nByteFollow--;
                m_tcUnicode <<= 6;                  // make room for trail byte
                m_tcUnicode |= ( 0x3F & tc );       // add the low 6 bits of the trail byte

                if( m_nByteFollow == 0)             // end of sequence
                {
                    tc = (UCHAR)m_tcUnicode ;
                    if ( fDone = Output(tc) )
                    {
                        tc = (UCHAR) ( m_tcUnicode >> 8 ) ;
                        fDone = Output(tc);
                    }
                    m_nBytesUsed = 0 ;
                }
                else
                    m_nBytesUsed++ ;
            }
        }
        else                                    // error - ignore and reset
        {
            m_nBytesUsed = 0 ;
            m_nByteFollow = 0 ;
        }
    }
    else                                        // a lead byte
    {
        if( m_nByteFollow > 0 )                 // error, previous sequence not finished
        {
            m_nByteFollow = 0;
            // Also leave 4-byte mode; otherwise the next 2- or 3-byte
            // sequence would be decoded through the surrogate path using a
            // stale accumulator.
            m_fSurrogatesPairs = FALSE;
            Output(' ');
            fDone = Output(0);
            m_nBytesUsed = 0 ;
        }
        else                                    // calculate # bytes in the sequence
        {
            while( (0x80 & tc) != 0)            // count leading 1 bits, shifting them out
            {
                tc <<= 1;
                m_nByteFollow++;
            }

            if (m_nByteFollow > 4)
            {
                // 0xF8..0xFF never start a UTF-8 sequence. Drop the byte; any
                // trail bytes that follow are discarded by the stray-trail
                // branch above.
                m_nByteFollow = 0;
                m_nBytesUsed = 0;
                m_fSurrogatesPairs = FALSE;
            }
            else
            {
                if (m_nByteFollow == 4)
                {
                    m_fSurrogatesPairs = TRUE;
                    m_tcSurrogateUnicode = tc >> m_nByteFollow;
                }
                else
                {
                    m_fSurrogatesPairs = FALSE;
                    m_tcUnicode = ( tc >> m_nByteFollow ) ;
                }
                m_nBytesUsed = 1 ;               // the lead byte itself, for every sequence length
                m_nByteFollow--;                 // # trail bytes to follow
            }
        }
    }

    return fDone;
}

/******************************************************************************
************************   C L E A N   U P   M A I N   ************************
******************************************************************************/

// Clean-up handler installed by Reset(). It performs no work and always
// reports success.
BOOL CInccUTF8In::CleanUpMain()
{
    const BOOL fDone = TRUE;
    return fDone;
}

// Reports m_nBytesUsed, the number of bytes held for an unfinished sequence,
// capped at 3.
int CInccUTF8In::GetUnconvertBytes()
{
    if (m_nBytesUsed >= 4)
        return 3;

    return m_nBytesUsed;
}

// UTF-8 has no escape-sequence mode to report, so the mode is always 0.
DWORD CInccUTF8In::GetConvertMode()
{
    const DWORD dwNoMode = 0;
    return dwNoMode;
}

// UTF-8 has no escape-sequence mode, so the mode argument is not used; the
// call simply re-initializes the decoder through Reset().
void CInccUTF8In::SetConvertMode(DWORD mode)
{
    Reset();
}

// ============================================================================
// Internet Character Set Conversion: Output to UTF-8
// ============================================================================

/******************************************************************************
**************************   C O N S T R U C T O R   **************************
******************************************************************************/

// Forwards the code page and code set to the base converter, then clears the
// encoder state through Reset().
CInccUTF8Out::CInccUTF8Out(UINT uCodePage, int nCodeSet)
    : CINetCodeConverter(uCodePage, nCodeSet)
{
    Reset();
}

/******************************************************************************
*******************************   R E S E T   *********************************
******************************************************************************/

// Clears the encoder state: no high surrogate is pending and the next input
// byte is treated as the first (low) byte of a 16-bit unit.
void CInccUTF8Out::Reset()
{
    m_wchSurrogateHigh = 0;
    m_fDoubleByte = FALSE;
}

HRESULT CInccUTF8Out::ConvertChar(UCHAR tc, int cchSrc)
{
    BOOL fDone = TRUE;
    WORD uc ;
    UCHAR UTF8[4] ;

    if (m_fDoubleByte )
    {
        uc = (	(WORD) tc << 8 | m_tcLeadByte  ) ;

        if (uc >= HIGHT_SURROGATE_START && uc <= HIGHT_SURROGATE_END && cchSrc >= sizeof(WCHAR))
        {
            if (m_wchSurrogateHigh)
            {
                UTF8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 );              // 4 bits in first byte
                UTF8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f );    // 6 bits in second
                UTF8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh);              // 6 bits in third
                Output(UTF8[0]);
                Output(UTF8[1]);
                fDone = Output(UTF8[2]);
            }
            m_wchSurrogateHigh = uc;
            m_fDoubleByte = FALSE ;
            goto CONVERT_DONE;
        }

        if (m_wchSurrogateHigh)
        {
            if (uc >= LOW_SURROGATE_START && uc <= LOW_SURROGATE_END)       // We find a surrogate pairs
            {

                DWORD dwSurrogateChar = ((m_wchSurrogateHigh-0xD800) << 10) + uc - 0xDC00 + 0x10000;
                UTF8[0] = 0xF0 | (unsigned char)( dwSurrogateChar >> 18 );                 // 3 bits in first byte
                UTF8[1] = 0x80 | (unsigned char)( ( dwSurrogateChar >> 12 ) & 0x3f );      // 6 bits in second
                UTF8[2] = 0x80 | (unsigned char)( ( dwSurrogateChar >> 6 ) & 0x3f );       // 6 bits in third
                UTF8[3] = 0x80 | (unsigned char)( 0x3f & dwSurrogateChar);                 // 6 bits in forth
                Output(UTF8[0]);
                Output(UTF8[1]);
                Output(UTF8[2]);
                fDone = Output(UTF8[3]);                
                m_fDoubleByte = FALSE ;
                m_wchSurrogateHigh = 0;
                goto CONVERT_DONE;
            }
            else                                                            // Not a surrogate pairs, error
            {
                UTF8[0] = 0xe0 | ( m_wchSurrogateHigh >> 12 );              // 4 bits in first byte
                UTF8[1] = 0x80 | ( ( m_wchSurrogateHigh >> 6 ) & 0x3f );    // 6 bits in second
                UTF8[2] = 0x80 | ( 0x3f & m_wchSurrogateHigh);              // 6 bits in third
                Output(UTF8[0]);
                Output(UTF8[1]);
                fDone = Output(UTF8[2]);
                m_wchSurrogateHigh = 0;
            }
        }


        if( ( uc & 0xff80 ) == 0 ) // ASCII
        {
            UTF8[0] = (UCHAR) uc;
            fDone = Output(UTF8[0]);
        }
        else if( ( uc & 0xf800 ) == 0 ) 			// UTF8_2_MAX 2-byte sequence if < 07ff (11 bits)
        {
            UTF8[0] = 0xC0 | (uc >> 6);             // 5 bits in first byte
            UTF8[1] = 0x80 | ( 0x3f & uc);       // 6 bits in second
            Output(UTF8[0]);
            fDone = Output(UTF8[1]);
        }
        else                                             // 3-byte sequence
        {
            UTF8[0] = 0xe0 | ( uc >> 12 );                // 4 bits in first byte
            UTF8[1] = 0x80 | ( ( uc >> 6 ) & 0x3f );      // 6 bits in second
            UTF8[2] = 0x80 | ( 0x3f & uc);                // 6 bits in third
            Output(UTF8[0]);
            Output(UTF8[1]);
            fDone = Output(UTF8[2]);
        }
        m_fDoubleByte = FALSE ;
    }
    else
    {
        m_tcLeadByte = tc ;
        m_fDoubleByte = TRUE ;
    }

CONVERT_DONE:
    if (fDone)
        return S_OK;
    else
        return E_FAIL;
}

/******************************************************************************
*****************************   C L E A N   U P   *****************************
******************************************************************************/

// Performs no work and always reports success.
BOOL CInccUTF8Out::CleanUp()
{
    return TRUE;
}

// Reports 1 while the first byte of a 16-bit unit is being held
// (m_fDoubleByte set), otherwise 0.
int CInccUTF8Out::GetUnconvertBytes()
{
    if (m_fDoubleByte)
        return 1;

    return 0;
}

// UTF-8 has no escape-sequence mode to report, so the mode is always 0.
DWORD CInccUTF8Out::GetConvertMode()
{
    const DWORD dwNoMode = 0;
    return dwNoMode;
}

// UTF-8 has no escape-sequence mode, so the mode argument is not used; the
// call simply clears the encoder state through Reset().
void CInccUTF8Out::SetConvertMode(DWORD mode)
{
    Reset();
}
Add source files 2020-09-26 03:20:57 -05:00			`// ============================================================================`
			`// Internet Character Set Conversion: Input from UTF-8`
			`// ============================================================================`

			`#include "private.h"`
			`#include "fechrcnv.h"`
			`#include "utf8obj.h"`

			`/******************************************************************************`
			`************************ C O N S T R U C T O R ************************`
			`******************************************************************************/`

			`CInccUTF8In::CInccUTF8In(UINT uCodePage, int nCodeSet) : CINetCodeConverter(uCodePage, nCodeSet)`
			`{`
			`Reset(); // initialization`
			`return ;`
			`}`

			`/******************************************************************************`
			`***************************** R E S E T *******************************`
			`******************************************************************************/`

			`void CInccUTF8In::Reset()`
			`{`
			`m_pfnConv = ConvMain;`
			`m_pfnCleanUp = CleanUpMain;`
			`m_nByteFollow = 0 ;`
			`m_tcUnicode = 0 ;`
			`m_tcSurrogateUnicode = 0 ;`
			`m_nBytesUsed = 0 ;`
			`m_fSurrogatesPairs = FALSE;`
			`return ;`
			`}`

			`/******************************************************************************`
			`*********************** C O N V E R T C H A R ***********************`
			`******************************************************************************/`

			`HRESULT CInccUTF8In::ConvertChar(UCHAR tc, int cchSrc)`
			`{`
			`BOOL fDone = (this->*m_pfnConv)(tc);`
			`if (fDone)`
			`return S_OK;`
			`else`
			`return E_FAIL;`
			`}`

			`/******************************************************************************`
			`*************************** C L E A N U P ***************************`
			`******************************************************************************/`

			`BOOL CInccUTF8In::CleanUp()`
			`{`
			`return (this->*m_pfnCleanUp)();`
			`}`

			`/******************************************************************************`
			`************************** C O N V M A I N **************************`
			`******************************************************************************/`

			`BOOL CInccUTF8In::ConvMain(UCHAR tc)`
			`{`
			`BOOL fDone = TRUE;`

			`if( ( 0x80 & tc ) == 0 ) // BIT7 == 0 ASCII`
			`{`
			`Output(tc);`
			`fDone = Output(0);`
			`m_nBytesUsed = 0 ;`
			`}`
			`else if( (0x40 & tc) == 0 ) // BIT6 == 0 a trail byte`
			`{`
			`if( m_nByteFollow )`
			`{`
			`if (m_fSurrogatesPairs)`
			`{`
			`m_nByteFollow--;`
			`m_tcSurrogateUnicode <<= 6; // Make room for trail byte`
			`m_tcSurrogateUnicode \|= ( 0x3F & tc ); // LOWER_6BIT add trail byte value`

			`if( m_nByteFollow == 0) // End of sequence, advance output ptr`
			`{`
			`m_tcUnicode = (WCHAR)(((m_tcSurrogateUnicode - 0x10000) >> 10) + HIGHT_SURROGATE_START);`
			`tc = (UCHAR)m_tcUnicode ;`
			`if ( fDone = Output(tc) )`
			`{`
			`tc = (UCHAR) ( m_tcUnicode >> 8 ) ;`
			`fDone = Output(tc);`
			`}`
			`m_tcUnicode = (WCHAR)((m_tcSurrogateUnicode - 0x10000)%0x400 + LOW_SURROGATE_START);`
			`tc = (UCHAR)m_tcUnicode ;`
			`if ( fDone = Output(tc) )`
			`{`
			`tc = (UCHAR) ( m_tcUnicode >> 8 ) ;`
			`fDone = Output(tc);`
			`}`
			`m_fSurrogatesPairs = 0;`
			`m_nBytesUsed = 0 ;`
			`}`
			`else`
			`m_nBytesUsed++ ;`
			`}`
			`else`
			`{`
			`m_nByteFollow--;`
			`m_tcUnicode <<= 6; // make room for trail byte`
			`m_tcUnicode \|= ( 0x3F & tc ); // LOWER_6BIT add trail byte value`

			`if( m_nByteFollow == 0) // end of sequence, advance output ptr`
			`{`
			`tc = (UCHAR)m_tcUnicode ;`
			`if ( fDone = Output(tc) )`
			`{`
			`tc = (UCHAR) ( m_tcUnicode >> 8 ) ;`
			`fDone = Output(tc);`
			`}`
			`m_nBytesUsed = 0 ;`
			`}`
			`else`
			`m_nBytesUsed++ ;`
			`}`
			`}`
			`else // error - ignor and rest`
			`{`
			`m_nBytesUsed = 0 ;`
			`m_nByteFollow = 0 ;`
			`}`
			`}`
			`else // a lead byte`
			`{`
			`if( m_nByteFollow > 0 ) // error, previous sequence not finished`
			`{`
			`m_nByteFollow = 0;`
			`Output(' ');`
			`fDone = Output(0);`
			`m_nBytesUsed = 0 ;`
			`}`
			`else // calculate # bytes to follow`
			`{`
			`while( (0x80 & tc) != 0) // BIT7 until first 0 encountered from left to right`
			`{`
			`tc <<= 1;`
			`m_nByteFollow++;`
			`}`

			`if (m_nByteFollow == 4)`
			`{`
			`m_fSurrogatesPairs = TRUE;`
			`m_tcSurrogateUnicode = tc >> m_nByteFollow;`

			`}`
			`else`
			`{`
			`m_tcUnicode = ( tc >> m_nByteFollow ) ;`
			`m_nBytesUsed = 1 ; // # bytes used`
			`}`
			`m_nByteFollow--; // # bytes to follow`
			`}`
			`}`

			`return fDone;`
			`}`

			`/******************************************************************************`
			`********************** C L E A N U P M A I N **********************`
			`******************************************************************************/`

			`BOOL CInccUTF8In::CleanUpMain()`
			`{`
			`return TRUE;`
			`}`

			`int CInccUTF8In::GetUnconvertBytes()`
			`{`
			`return m_nBytesUsed < 4 ? m_nBytesUsed : 3 ;`
			`}`

			`DWORD CInccUTF8In::GetConvertMode()`
			`{`
			`// UTF8 does not use mode esc sequence`
			`return 0 ;`
			`}`

			`void CInccUTF8In::SetConvertMode(DWORD mode)`
			`{`
			`Reset(); // initialization`
			`// UTF8 does not use mode esc sequence`
			`return ;`
			`}`

			`// ============================================================================`
			`// Internet Character Set Conversion: Output to UTF-8`
			`// ============================================================================`

			`/******************************************************************************`
			`************************ C O N S T R U C T O R ************************`
			`******************************************************************************/`

			`CInccUTF8Out::CInccUTF8Out(UINT uCodePage, int nCodeSet) : CINetCodeConverter(uCodePage, nCodeSet)`
			`{`
			`Reset(); // initialization`
			`return ;`
			`}`

			`/******************************************************************************`
			`***************************** R E S E T *******************************`
			`******************************************************************************/`

			`void CInccUTF8Out::Reset()`
			`{`
			`m_fDoubleByte = FALSE;`
			`m_wchSurrogateHigh = 0;`
			`return ;`
			`}`

			`HRESULT CInccUTF8Out::ConvertChar(UCHAR tc, int cchSrc)`
			`{`
			`BOOL fDone = TRUE;`
			`WORD uc ;`
			`UCHAR UTF8[4] ;`

			`if (m_fDoubleByte )`
			`{`
			`uc = ( (WORD) tc << 8 \| m_tcLeadByte ) ;`

			`if (uc >= HIGHT_SURROGATE_START && uc <= HIGHT_SURROGATE_END && cchSrc >= sizeof(WCHAR))`
			`{`
			`if (m_wchSurrogateHigh)`
			`{`
			`UTF8[0] = 0xe0 \| ( m_wchSurrogateHigh >> 12 ); // 4 bits in first byte`
			`UTF8[1] = 0x80 \| ( ( m_wchSurrogateHigh >> 6 ) & 0x3f ); // 6 bits in second`
			`UTF8[2] = 0x80 \| ( 0x3f & m_wchSurrogateHigh); // 6 bits in third`
			`Output(UTF8[0]);`
			`Output(UTF8[1]);`
			`fDone = Output(UTF8[2]);`
			`}`
			`m_wchSurrogateHigh = uc;`
			`m_fDoubleByte = FALSE ;`
			`goto CONVERT_DONE;`
			`}`

			`if (m_wchSurrogateHigh)`
			`{`
			`if (uc >= LOW_SURROGATE_START && uc <= LOW_SURROGATE_END) // We find a surrogate pairs`
			`{`

			`DWORD dwSurrogateChar = ((m_wchSurrogateHigh-0xD800) << 10) + uc - 0xDC00 + 0x10000;`
			`UTF8[0] = 0xF0 \| (unsigned char)( dwSurrogateChar >> 18 ); // 3 bits in first byte`
			`UTF8[1] = 0x80 \| (unsigned char)( ( dwSurrogateChar >> 12 ) & 0x3f ); // 6 bits in second`
			`UTF8[2] = 0x80 \| (unsigned char)( ( dwSurrogateChar >> 6 ) & 0x3f ); // 6 bits in third`
			`UTF8[3] = 0x80 \| (unsigned char)( 0x3f & dwSurrogateChar); // 6 bits in forth`
			`Output(UTF8[0]);`
			`Output(UTF8[1]);`
			`Output(UTF8[2]);`
			`fDone = Output(UTF8[3]);`
			`m_fDoubleByte = FALSE ;`
			`m_wchSurrogateHigh = 0;`
			`goto CONVERT_DONE;`
			`}`
			`else // Not a surrogate pairs, error`
			`{`
			`UTF8[0] = 0xe0 \| ( m_wchSurrogateHigh >> 12 ); // 4 bits in first byte`
			`UTF8[1] = 0x80 \| ( ( m_wchSurrogateHigh >> 6 ) & 0x3f ); // 6 bits in second`
			`UTF8[2] = 0x80 \| ( 0x3f & m_wchSurrogateHigh); // 6 bits in third`
			`Output(UTF8[0]);`
			`Output(UTF8[1]);`
			`fDone = Output(UTF8[2]);`
			`m_wchSurrogateHigh = 0;`
			`}`
			`}`


			`if( ( uc & 0xff80 ) == 0 ) // ASCII`
			`{`
			`UTF8[0] = (UCHAR) uc;`
			`fDone = Output(UTF8[0]);`
			`}`
			`else if( ( uc & 0xf800 ) == 0 ) // UTF8_2_MAX 2-byte sequence if < 07ff (11 bits)`
			`{`
			`UTF8[0] = 0xC0 \| (uc >> 6); // 5 bits in first byte`
			`UTF8[1] = 0x80 \| ( 0x3f & uc); // 6 bits in second`
			`Output(UTF8[0]);`
			`fDone = Output(UTF8[1]);`
			`}`
			`else // 3-byte sequence`
			`{`
			`UTF8[0] = 0xe0 \| ( uc >> 12 ); // 4 bits in first byte`
			`UTF8[1] = 0x80 \| ( ( uc >> 6 ) & 0x3f ); // 6 bits in second`
			`UTF8[2] = 0x80 \| ( 0x3f & uc); // 6 bits in third`
			`Output(UTF8[0]);`
			`Output(UTF8[1]);`
			`fDone = Output(UTF8[2]);`
			`}`
			`m_fDoubleByte = FALSE ;`
			`}`
			`else`
			`{`
			`m_tcLeadByte = tc ;`
			`m_fDoubleByte = TRUE ;`
			`}`

			`CONVERT_DONE:`
			`if (fDone)`
			`return S_OK;`
			`else`
			`return E_FAIL;`
			`}`

			`/******************************************************************************`
			`*************************** C L E A N U P ***************************`
			`******************************************************************************/`

			`BOOL CInccUTF8Out::CleanUp()`
			`{`
			`BOOL fDone = TRUE;`

			`return fDone;`
			`}`

			`int CInccUTF8Out::GetUnconvertBytes()`
			`{`
			`return m_fDoubleByte ? 1 : 0 ;`
			`}`

			`DWORD CInccUTF8Out::GetConvertMode()`
			`{`
			`// UTF8 does not use mode esc sequence`
			`return 0 ;`
			`}`

			`void CInccUTF8Out::SetConvertMode(DWORD mode)`
			`{`
			`Reset(); // initialization`
			`// UTF8 does not use mode esc sequence`
			`return ;`
			`}`