windows-nt/Source/XPSP1/NT/ds/dns/dnslib/utf8.c

/*++

Copyright (c) 1997-2001 Microsoft Corporation

Module Name:

    utf8.c

Abstract:

    Domain Name System (DNS) Library

    UTF8 to\from unicode and ANSI conversions

    The UTF8\unicode routines are similar to the generic ones floating
    around the NT group, but a heck of a lot cleaner and more robust,
    including catching the invalid UTF8 string case on the utf8 to unicode
    conversion.

    The UTF8\ANSI routines are optimized for the 99% case where all the
    characters are <128 and no conversions is actually required.

Author:

    Jim Gilroy (jamesg)     March 1997

Revision History:

--*/


#include "local.h"


//
//  Macros to simplify UTF8 conversions
//

//  Marker bits of UTF8 lead and trail bytes.
//  The converters OR these with the payload bits of the character.

#define UTF8_1ST_OF_2     0xc0      //  110x xxxx
#define UTF8_1ST_OF_3     0xe0      //  1110 xxxx
#define UTF8_1ST_OF_4     0xf0      //  1111 0xxx
#define UTF8_TRAIL        0x80      //  10xx xxxx

#define UTF8_2_MAX        0x07ff    //  max unicode character representable
                                    //  in two byte UTF8

//  Single bit tests;  each yields non-zero if the bit is set in ch.

#define BIT7(ch)        ((ch) & 0x80)
#define BIT6(ch)        ((ch) & 0x40)
#define BIT5(ch)        ((ch) & 0x20)
#define BIT4(ch)        ((ch) & 0x10)
#define BIT3(ch)        ((ch) & 0x08)

//  Payload extraction masks.

#define LOW6BITS(ch)    ((ch) & 0x3f)
#define LOW5BITS(ch)    ((ch) & 0x1f)
#define LOW4BITS(ch)    ((ch) & 0x0f)

//  Upper byte of a 16-bit character, left in place (not shifted down).
//  Not used by the routines visible in this file.

#define HIGHBYTE(wch)   ((wch) & 0xff00)

//
//  Surrogate pair support
//  Two unicode characters may be linked to form a surrogate pair:
//  a high surrogate (0xd800-0xdbff) followed by a low surrogate
//  (0xdc00-0xdfff).
//  A pair travels in UTF8 as a single four byte sequence, NOT as two
//  three byte sequences (six bytes), so the converters must combine
//  the pair into one value before encoding and split it again when
//  decoding.
//

#define HIGH_SURROGATE_START  0xd800
#define HIGH_SURROGATE_END    0xdbff
#define LOW_SURROGATE_START   0xdc00
#define LOW_SURROGATE_END     0xdfff


//
//  Max "normal conversion", make space for MAX_PATH,
//  this covers all valid DNS names and strings.
//  Used as the byte size of the on-stack temporary buffer in
//  Dns_Utf8ToOrFromAnsi().
//

#define TEMP_BUFFER_LENGTH  (2*MAX_PATH)


DNS_STATUS
_fastcall
Dns_ValidateUtf8Byte(
    IN      BYTE            chUtf8,
    IN OUT  PDWORD          pdwTrailCount
    )
/*++

Routine Description:

    Check one byte of a UTF8 stream against the decoding state kept
    by the caller.

    The state is the number of trail bytes still owed to the current
    multi-byte character.  Start a string with the count at zero and
    pass each byte in turn.

Arguments:

    chUtf8          -- byte to check

    pdwTrailCount   -- addr of outstanding trail byte count;
        on success with a lead byte it is set to the number of trail
        bytes that must follow, with a trail byte it is decremented;
        it is NOT written for an ASCII byte or on failure

Return Value:

    ERROR_SUCCESS -- if byte is valid UTF8 given the trail count
    ERROR_INVALID_DATA -- if invalid

--*/
{
    DWORD   pending = *pdwTrailCount;
    DWORD   updated;

    DNSDBG( TRACE, ( "Dns_ValidateUtf8Byte()\n" ));

    //
    //  ASCII (0xxxxxxx)
    //      - legal only between characters
    //      - caller's count is left untouched
    //

    if ( (UCHAR)chUtf8 < 0x80 )
    {
        return( pending ? ERROR_INVALID_DATA : ERROR_SUCCESS );
    }

    //
    //  trail byte (10xxxxxx)
    //      - legal only while trail bytes are owed
    //

    if ( BIT6(chUtf8) == 0 )
    {
        if ( pending == 0 )
        {
            return( ERROR_INVALID_DATA );
        }
        updated = pending - 1;
    }

    //
    //  lead byte
    //      - illegal while previous character is unfinished
    //      - otherwise number of leading one bits gives trail count
    //

    else if ( pending != 0 )
    {
        return( ERROR_INVALID_DATA );
    }
    else if ( BIT5(chUtf8) == 0 )
    {
        updated = 1;        //  110xxxxx -- two byte character
    }
    else if ( BIT4(chUtf8) == 0 )
    {
        updated = 2;        //  1110xxxx -- three byte character
    }
    else if ( BIT3(chUtf8) == 0 )
    {
        updated = 3;        //  11110xxx -- four bytes (surrogate pair)
    }
    else
    {
        return( ERROR_INVALID_DATA );
    }

    //  hand new trail count back to caller

    *pdwTrailCount = updated;
    return( ERROR_SUCCESS );
}


//
//  UTF8 to unicode conversions
//
//  For some reason UTF8 is not supported in Win9x.
//  AND the implementation itself is not careful about
//  validating UTF8.
//

DWORD
_fastcall
Dns_UnicodeToUtf8(
    IN      PWCHAR          pwUnicode,
    IN      DWORD           cchUnicode,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert unicode characters to UTF8.

    If a result buffer is given, it must have room for the converted
    bytes AND a terminating NULL;  on success the result is always
    NULL terminated.

    If pchResult is NULL, nothing is written and no buffer check is
    made;  the routine only computes the required length.

    A high surrogate immediately followed by a low surrogate is written
    as one four byte sequence.  An unpaired surrogate is written as an
    ordinary three byte character.

Arguments:

    pwUnicode   -- ptr to start of unicode buffer

    cchUnicode  -- length of unicode buffer in characters

    pchResult   -- ptr to start of result buffer for UTF8 chars;
                    may be NULL to just compute length

    cchResult   -- length of result buffer in bytes

Return Value:

    Count of UTF8 bytes in result (not including terminating NULL),
    if successful.
    0 on error.  GetLastError() is ERROR_INSUFFICIENT_BUFFER.
    Note, zero is also returned for zero length input;  last error is
    not set in that case.

--*/
{
    WCHAR   wch;                // current unicode character being converted
    DWORD   lengthUtf8 = 0;     // length of UTF8 result string
    WORD    lowSurrogate;
    DWORD   surrogateDword;


    DNSDBG( TRACE, (
        "Dns_UnicodeToUtf8( %.*S )\n",
        cchUnicode,
        pwUnicode ));

    //
    //  loop converting unicode chars until run out or error
    //
    //  each case bumps the length first, then checks it against the
    //  buffer with >= so that a byte is always left for the NULL
    //

    while ( cchUnicode-- )
    {
        wch = *pwUnicode++;

        //
        //  ASCII character (7 bits or less) -- converts to directly
        //

        if ( wch < 0x80 )
        {
            lengthUtf8++;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = (CHAR)wch;
            }
            continue;
        }

        //
        //  wide character less than 0x07ff (11bits) converts to two bytes
        //      - upper 5 bits in first byte
        //      - lower 6 bits in second byte
        //

        else if ( wch <= UTF8_2_MAX )
        {
            lengthUtf8 += 2;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = UTF8_1ST_OF_2 | wch >> 6;
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( (UCHAR)wch );
            }
            continue;
        }

        //
        //  surrogate pair
        //      - if have high surrogate followed by low surrogate then
        //          process as surrogate pair
        //      - otherwise treat character as ordinary unicode "three-byte"
        //          character, by falling through to below
        //
        //  next character is only read when cchUnicode says one remains
        //

        else if ( wch >= HIGH_SURROGATE_START &&
                  wch <= HIGH_SURROGATE_END &&
                  cchUnicode &&
                  (lowSurrogate = *pwUnicode) &&
                  lowSurrogate >= LOW_SURROGATE_START &&
                  lowSurrogate <= LOW_SURROGATE_END )
        {
            //  have a surrogate pair
            //      - suck up next unicode character (low surrogate of pair)
            //      - make full DWORD surrogate pair
            //      - then lay out four UTF8 bytes
            //          1st of four, then three trail bytes
            //              11110xxx
            //              10xxxxxx
            //              10xxxxxx
            //              10xxxxxx

            DNSDBG( TRACE, (
                "Have surrogate pair %hx : %hx\n",
                wch,
                lowSurrogate ));

            pwUnicode++;
            cchUnicode--;
            lengthUtf8 += 4;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                surrogateDword = (((wch-0xD800) << 10) + (lowSurrogate - 0xDC00) + 0x10000);

                *pchResult++ = UTF8_1ST_OF_4 | (UCHAR) (surrogateDword >> 18);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword >> 12);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword >> 6);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword);

                //  pchResult is now one past the last byte written,
                //  so the four bytes just written are at -4 through -1

                DNSDBG( TRACE, (
                    "Converted surrogate -- DWORD = %08x\n"
                    "\tconverted %x %x %x %x\n",
                    surrogateDword,
                    (UCHAR) *(pchResult-4),
                    (UCHAR) *(pchResult-3),
                    (UCHAR) *(pchResult-2),
                    (UCHAR) *(pchResult-1) ));
            }
        }

        //
        //  wide character (non-zero in top 5 bits) converts to three bytes
        //      - top 4 bits in first byte
        //      - middle 6 bits in second byte
        //      - low 6 bits in third byte
        //

        else
        {
            lengthUtf8 += 3;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = UTF8_1ST_OF_3 | (wch >> 12);
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( wch >> 6 );
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( wch );
            }
        }
    }

    //
    //  NULL terminate buffer
    //  return UTF8 byte count
    //

    if ( pchResult && lengthUtf8 < cchResult )
    {
        *pchResult = 0;
    }
    return( lengthUtf8 );

OutOfBuffer:

    SetLastError( ERROR_INSUFFICIENT_BUFFER );
    return( 0 );
}


DWORD
_fastcall
Dns_Utf8ToUnicode(
    IN      PCHAR           pchUtf8,
    IN      DWORD           cchUtf8,
    OUT     PWCHAR          pwResult,
    IN      DWORD           cwResult
    )
/*++

Routine Description:

    Convert UTF8 characters to unicode.

    If a result buffer is given, it must have room for the converted
    characters AND a terminating NULL;  on success the result is always
    NULL terminated.

    If pwResult is NULL, nothing is written and no buffer check is
    made;  the routine only validates the input and computes the
    required length.

    Four byte UTF8 sequences are written as a surrogate pair (two
    unicode characters).

    Input is rejected as invalid when
        - a trail byte appears with no lead byte outstanding
        - a lead byte or ASCII byte appears inside an unfinished
            multi-byte character
        - a lead byte has five or more leading one bits
        - a four byte sequence decodes to a value outside
            0x10000 - 0x10ffff, which has no surrogate pair form
        - the input ends inside a multi-byte character

Arguments:

    pchUtf8     -- ptr to start of UTF8 buffer

    cchUtf8     -- length of UTF8 buffer in bytes

    pwResult    -- ptr to start of result buffer for unicode chars;
                    may be NULL to just compute length

    cwResult    -- length of result buffer in WCHAR

Return Value:

    Count of unicode characters in result (not including terminating
    NULL), if successful.
    0 on error.  GetLastError() has error code
        ERROR_INSUFFICIENT_BUFFER -- result buffer too small
        ERROR_INVALID_DATA -- input is not valid UTF8
    Note, zero is also returned for zero length input;  last error is
    not set in that case.

--*/
{
    CHAR    ch;                     // current UTF8 character
    WCHAR   wch = 0;                // current unicode character
    DWORD   trailCount = 0;         // count of UTF8 trail bytes to follow
    DWORD   lengthUnicode = 0;      // length of unicode result string
    BOOL    bsurrogatePair = FALSE; // building four byte character
    DWORD   surrogateDword = 0;     // value of four byte character


    //
    //  loop converting UTF8 chars until run out or error
    //

    while ( cchUtf8-- )
    {
        ch = *pchUtf8++;

        //
        //  ASCII character -- just copy
        //      - not legal in the middle of a multi-byte character;
        //      same rule as Dns_ValidateUtf8Byte()
        //

        if ( BIT7(ch) == 0 )
        {
            if ( trailCount != 0 )
            {
                goto InvalidUtf8;
            }
            lengthUnicode++;
            if ( pwResult )
            {
                if ( lengthUnicode >= cwResult )
                {
                    goto OutOfBuffer;
                }
                *pwResult++ = (WCHAR)ch;
            }
            continue;
        }

        //
        //  UTF8 trail byte
        //      - if not expected, error
        //      - otherwise shift unicode character 6 bits and
        //          copy in lower six bits of UTF8
        //      - if last UTF8 byte, copy result to unicode string
        //

        else if ( BIT6(ch) == 0 )
        {
            if ( trailCount == 0 )
            {
                goto InvalidUtf8;
            }

            if ( !bsurrogatePair )
            {
                wch <<= 6;
                wch |= LOW6BITS( ch );

                if ( --trailCount == 0 )
                {
                    lengthUnicode++;
                    if ( pwResult )
                    {
                        if ( lengthUnicode >= cwResult )
                        {
                            goto OutOfBuffer;
                        }
                        *pwResult++ = wch;
                    }
                }
                continue;
            }

            //  surrogate pair
            //      - same as above EXCEPT build two unicode chars
            //      from surrogateDword

            else
            {
                surrogateDword <<= 6;
                surrogateDword |= LOW6BITS( ch );

                if ( --trailCount == 0 )
                {
                    //  only 0x10000 - 0x10ffff maps onto a surrogate pair
                    //      - below that the subtraction underneath wraps
                    //      - above that the high half leaves the
                    //      high surrogate range
                    //  check is made even when only computing length

                    if ( surrogateDword < 0x10000 ||
                         surrogateDword > 0x10ffff )
                    {
                        goto InvalidUtf8;
                    }

                    lengthUnicode += 2;

                    if ( pwResult )
                    {
                        if ( lengthUnicode >= cwResult )
                        {
                            goto OutOfBuffer;
                        }
                        surrogateDword -= 0x10000;
                        *pwResult++ = (WCHAR) ((surrogateDword >> 10) + HIGH_SURROGATE_START);
                        *pwResult++ = (WCHAR) ((surrogateDword & 0x3ff) + LOW_SURROGATE_START);
                    }
                    bsurrogatePair = FALSE;
                }
            }

        }

        //
        //  UTF8 lead byte
        //      - if currently in extension, error
        //      - otherwise start new character with payload bits
        //      of the lead byte
        //

        else
        {
            if ( trailCount != 0 )
            {
                goto InvalidUtf8;
            }

            //  first of two byte character (110xxxxx)

            if ( BIT5(ch) == 0 )
            {
                trailCount = 1;
                wch = LOW5BITS(ch);
                continue;
            }

            //  first of three byte character (1110xxxx)

            else if ( BIT4(ch) == 0 )
            {
                trailCount = 2;
                wch = LOW4BITS(ch);
                continue;
            }

            //  first of four byte surrogate pair (11110xxx)
            //      - bit 3 is known to be zero here, so the four bit
            //      mask keeps just the three payload bits

            else if ( BIT3(ch) == 0 )
            {
                trailCount = 3;
                surrogateDword = LOW4BITS(ch);
                bsurrogatePair = TRUE;
            }

            else
            {
                goto InvalidUtf8;
            }
        }
    }

    //  catch if hit end in the middle of UTF8 multi-byte character

    if ( trailCount )
    {
        goto InvalidUtf8;
    }

    //
    //  NULL terminate buffer
    //  return the number of Unicode characters written.
    //

    if ( pwResult  &&  lengthUnicode < cwResult )
    {
        *pwResult = 0;
    }
    return( lengthUnicode );

OutOfBuffer:

    SetLastError( ERROR_INSUFFICIENT_BUFFER );
    return( 0 );

InvalidUtf8:

    SetLastError( ERROR_INVALID_DATA );
    return( 0 );
}


//
//  UTF8 \ ANSI conversions
//

DWORD
Dns_Utf8ToOrFromAnsi(
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult,
    IN      PCHAR           pchIn,
    IN      DWORD           cchIn,
    IN      DNS_CHARSET     InCharSet,
    IN      DNS_CHARSET     OutCharSet
    )
/*++

Routine Description:

    Convert UTF8 characters to ANSI or vice versa.

    A string that is entirely ASCII is identical in both character sets
    and is simply copied.  Any other string is converted to unicode in
    a temporary buffer, then from unicode to the result character set.

    Note:  this function appears to call string functions (string.c)
        which call back to it.  However, this calls those functions
        ONLY for conversions to\from unicode which do NOT call back
        to these functions.  Ultimately need to check if LCMapString
        can handle these issues.

Arguments:

    pchResult   -- ptr to start of result buffer;
                    may be NULL to just compute required length

    cchResult   -- length of result buffer

    pchIn       -- ptr to start of input string

    cchIn       -- length of input string;
                    if zero, input is taken to be NULL terminated and
                    length is calculated

    InCharSet   -- char set of input string (DnsCharSetAnsi or DnsCharSetUtf8)

    OutCharSet  -- char set for result string (DnsCharSetUtf8 or DnsCharSetAnsi)

Return Value:

    Count of bytes in result (including terminating NULL).
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD       unicodeLength;
    DWORD       resultLength;
    CHAR        tempBuffer[ TEMP_BUFFER_LENGTH ];
    PCHAR       ptemp = tempBuffer;
    DNS_STATUS  status;

    //
    //  validate charsets
    //

    ASSERT( InCharSet != OutCharSet );
    ASSERT( InCharSet == DnsCharSetAnsi || InCharSet == DnsCharSetUtf8 );
    ASSERT( OutCharSet == DnsCharSetAnsi || OutCharSet == DnsCharSetUtf8 );

    //
    //  if length not given, calculate
    //

    if ( cchIn == 0 )
    {
        cchIn = strlen( pchIn );
    }

    //
    //  trace after the length is known, and print the input with a
    //  precision (%.*s) so that a counted string which is not NULL
    //  terminated is not read past its length
    //

    DNSDBG( TRACE, (
        "Dns_Utf8ToOrFromAnsi()\n"
        "\tbuffer       = %p\n"
        "\tbuf length   = %d\n"
        "\tpchString    = %p (%.*s)\n"
        "\tcchString    = %d\n"
        "\tCharSetIn    = %d\n"
        "\tCharSetOut   = %d\n",
        pchResult,
        cchResult,
        pchIn,
        cchIn, pchIn,
        cchIn,
        InCharSet,
        OutCharSet ));

    //
    //  string completely ASCII
    //      - simple memcopy suffices
    //      - note result must have terminating NULL
    //

    if ( Dns_IsStringAsciiEx(
                pchIn,
                cchIn ) )
    {
        if ( !pchResult )
        {
            return( cchIn + 1 );
        }

        if ( cchResult <= cchIn )
        {
            status = ERROR_INSUFFICIENT_BUFFER;
            goto Failed;
        }
        memcpy(
            pchResult,
            pchIn,
            cchIn );

        pchResult[ cchIn ] = 0;

        return( cchIn+1 );
    }

    //
    //  non-ASCII
    //      - convert to unicode, then to result character set
    //
    //  DCR_PERF:  LCMapStringA() might be able to handle all this
    //          haven't figured out how yet
    //

    unicodeLength = Dns_GetBufferLengthForStringCopy(
                        pchIn,
                        cchIn,
                        InCharSet,
                        DnsCharSetUnicode
                        );

    if ( unicodeLength > TEMP_BUFFER_LENGTH )
    {
        //  can't use stack buffer, must allocate
        //      - on failure ptemp is NULL, which the cleanup
        //      at Failed must tolerate

        ptemp = Dns_StringCopyAllocate(
                    pchIn,
                    cchIn,
                    InCharSet,
                    DnsCharSetUnicode
                    );
        if ( !ptemp )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }
    }
    else
    {
        //  zero length means input could not be sized for conversion

        if ( unicodeLength == 0 )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }

        //  copy into temporary buffer

        resultLength = Dns_StringCopy(
                        ptemp,
                        NULL,       // adequate buffer length
                        pchIn,
                        cchIn,
                        InCharSet,
                        DnsCharSetUnicode
                        );
        if ( !resultLength )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }
        ASSERT( resultLength == unicodeLength );
    }

    //
    //  conversion to result char set
    //      - if have result buffer, convert into it
    //      - should have at least ONE two byte character
    //          otherwise should have taken fast path above
    //

    if ( pchResult )
    {
        resultLength = Dns_StringCopy(
                            pchResult,
                            & cchResult,        // result buffer length
                            ptemp,
                            0,
                            DnsCharSetUnicode,
                            OutCharSet
                            );
        if ( resultLength == 0 )
        {
            status = ERROR_INSUFFICIENT_BUFFER;
            goto Failed;
        }
        ASSERT( resultLength <= cchResult );
        ASSERT( pchResult[resultLength-1] == 0 );
        ASSERT( resultLength >= unicodeLength/2 );
    }

    //      - no result buffer, just determine required length

    else
    {
        resultLength = Dns_GetBufferLengthForStringCopy(
                            ptemp,
                            0,
                            DnsCharSetUnicode,
                            OutCharSet
                            );
        ASSERT( resultLength >= unicodeLength/2 );
    }

    //
    //  cleanup temporary unicode string, if it was allocated
    //

    if ( ptemp != tempBuffer )
    {
        FREE_HEAP( ptemp );
    }

    return( resultLength );


Failed:

    //
    //  cleanup, then set error
    //      - ptemp is NULL if the allocation itself failed;
    //      nothing to free in that case
    //      - last error is set after the free so that cleanup
    //      can not disturb the code the caller sees
    //

    if ( ptemp && ptemp != tempBuffer )
    {
        FREE_HEAP( ptemp );
    }

    SetLastError( status );

    return( 0 );
}


DWORD
Dns_AnsiToUtf8(
    IN      PCHAR           pchAnsi,
    IN      DWORD           cchAnsi,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert ANSI string to UTF8.

    Simple cover for Dns_Utf8ToOrFromAnsi() with the character sets
    fixed to ANSI in, UTF8 out.

Arguments:

    pchAnsi     -- ptr to start of ansi buffer

    cchAnsi     -- length of ansi buffer

    pchResult   -- ptr to start of result buffer for UTF8 chars

    cchResult   -- length of result buffer

Return Value:

    Value returned by Dns_Utf8ToOrFromAnsi():
    Count of bytes in result (including terminating NULL).
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD   length;

    length = Dns_Utf8ToOrFromAnsi(
                pchResult,
                cchResult,
                pchAnsi,
                cchAnsi,
                DnsCharSetAnsi,     // from ANSI
                DnsCharSetUtf8      // to UTF8
                );

    return( length );
}


DWORD
Dns_Utf8ToAnsi(
    IN      PCHAR           pchUtf8,
    IN      DWORD           cchUtf8,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert UTF8 string to ANSI.

    Simple cover for Dns_Utf8ToOrFromAnsi() with the character sets
    fixed to UTF8 in, ANSI out.

Arguments:

    pchUtf8     -- ptr to start of UTF8 buffer

    cchUtf8     -- length of UTF8 buffer

    pchResult   -- ptr to start of result buffer for ansi chars

    cchResult   -- length of result buffer

Return Value:

    Value returned by Dns_Utf8ToOrFromAnsi():
    Count of bytes in result (including terminating NULL).
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD   length;

    length = Dns_Utf8ToOrFromAnsi(
                pchResult,
                cchResult,
                pchUtf8,
                cchUtf8,
                DnsCharSetUtf8,     // from UTF8
                DnsCharSetAnsi      // to ANSI
                );

    return( length );
}


BOOL
_fastcall
Dns_IsStringAscii(
    IN      LPSTR           pszString
    )
/*++

Routine Description:

    Determine whether a NULL terminated string is pure ASCII.

    A pure ASCII string is byte-for-byte the same in ANSI and in UTF8,
    so this answers both
        - is ANSI string already in UTF8
        and
        - is UTF8 string already in ANSI

    Lets callers skip conversion in the 99% case where the string
    is plain ASCII.

Arguments:

    pszString -- NULL terminated ANSI or UTF8 string to check

Return Value:

    TRUE if every character before the terminating NULL is < 128.
    FALSE if any character is >= 128.

--*/
{
    //
    //  walk string to terminator, bail on first byte with high bit set
    //

    for ( ; *pszString != 0; pszString++ )
    {
        if ( (UCHAR) *pszString >= 0x80 )
        {
            return( FALSE );
        }
    }

    return( TRUE );
}


BOOL
_fastcall
Dns_IsStringAsciiEx(
    IN      PCHAR           pchString,
    IN      DWORD           cchString
    )
/*++

Routine Description:

    Determine whether a counted ANSI (or UTF8) string is pure ASCII.

    A pure ASCII string is byte-for-byte the same in ANSI and in UTF8,
    so this answers both
        - is ANSI string already in UTF8
        and
        - is UTF8 string already in ANSI

    Lets callers skip conversion in the 99% case where the string
    is plain ASCII.

Arguments:

    pchString   -- ptr to start of string buffer

    cchString   -- length of string buffer;  exactly this many bytes
                    are examined, a NULL byte does not stop the check

Return Value:

    TRUE if all cchString characters are < 128  (including the case
        of zero length).
    FALSE if any character is >= 128.

--*/
{
    DWORD   index;

    //
    //  check each byte in buffer, bail on first with high bit set
    //

    for ( index = 0; index < cchString; index++ )
    {
        if ( (UCHAR) pchString[ index ] >= 0x80 )
        {
            return( FALSE );
        }
    }

    return( TRUE );
}


BOOL
_fastcall
Dns_IsWideStringAscii(
    IN      PWCHAR          pwszString
    )
/*++

Routine Description:

    Determine whether a NULL terminated unicode string is pure ASCII,
    that is, every character is < 128.

    Strings without extended characters need NOT be downcased
    on the wire.  This allows us to optimize for the 99% case
    where just passing ASCII strings.

Arguments:

    pwszString -- ptr to NULL terminated unicode string

Return Value:

    TRUE if every character before the terminating NULL is < 128.
    FALSE if any character is >= 128.

--*/
{
    //
    //  walk string to terminator, bail on first non-ASCII character
    //

    for ( ; *pwszString != 0; pwszString++ )
    {
        if ( (USHORT) *pwszString >= 0x80 )
        {
            return( FALSE );
        }
    }

    return( TRUE );
}

//
//  End utf8.c
//