windows-nt/Source/XPSP1/NT/ds/dns/dnslib/utf8.c

1064 lines
23 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
/*++
Copyright (c) 1997-2001 Microsoft Corporation
Module Name:
utf8.c
Abstract:
Domain Name System (DNS) Library
UTF8 to\from unicode and ANSI conversions
The UTF8\unicode routines are similar to the generic ones floating
around the NT group, but a heck of a lot cleaner and more robust,
including catching the invalid UTF8 string case on the utf8 to unicode
conversion.
The UTF8\ANSI routines are optimized for the 99% case where all the
characters are <128 and no conversion is actually required.
Author:
Jim Gilroy (jamesg) March 1997
Revision History:
--*/
#include "local.h"
//
// Macros to simplify UTF8 conversions
//
// Tag bits for UTF8 lead and trail bytes; these are OR'ed with the
// payload bits when a byte is built.
//
#define UTF8_1ST_OF_2 0xc0 // 110x xxxx
#define UTF8_1ST_OF_3 0xe0 // 1110 xxxx
#define UTF8_1ST_OF_4 0xf0 // 1111 0xxx
#define UTF8_TRAIL 0x80 // 10xx xxxx
#define UTF8_2_MAX 0x07ff // max unicode character representable
// in two byte UTF8
// Single-bit tests: non-zero when the given bit of ch is set.
#define BIT7(ch) ((ch) & 0x80)
#define BIT6(ch) ((ch) & 0x40)
#define BIT5(ch) ((ch) & 0x20)
#define BIT4(ch) ((ch) & 0x10)
#define BIT3(ch) ((ch) & 0x08)
// Payload masks: keep only the low 6 / 5 / 4 bits of ch.
#define LOW6BITS(ch) ((ch) & 0x3f)
#define LOW5BITS(ch) ((ch) & 0x1f)
#define LOW4BITS(ch) ((ch) & 0x0f)
// Keeps the upper eight bits of a 16-bit value in place (not shifted down).
#define HIGHBYTE(wch) ((wch) & 0xff00)
//
// Surrogate pair support
// Two unicode characters may be linked to form a surrogate pair.
// A surrogate pair travels in UTF8 as a single four byte sequence,
// not as two three byte sequences (six bytes), so the conversion
// routines must combine and split the pair explicitly.
//
#define HIGH_SURROGATE_START 0xd800
#define HIGH_SURROGATE_END 0xdbff
#define LOW_SURROGATE_START 0xdc00
#define LOW_SURROGATE_END 0xdfff
//
// Max "normal conversion", make space for MAX_PATH,
// this covers all valid DNS names and strings.
// Used as the size, in CHARs, of the on-stack temporary buffer.
//
#define TEMP_BUFFER_LENGTH (2*MAX_PATH)
DNS_STATUS
_fastcall
Dns_ValidateUtf8Byte(
    IN      BYTE            chUtf8,
    IN OUT  PDWORD          pdwTrailCount
    )
/*++

Routine Description:

    Checks one byte of a UTF8 stream against the running trail-byte state.

    The caller feeds the bytes of a string one at a time, carrying the
    trail count from call to call.

Arguments:

    chUtf8 -- byte to validate

    pdwTrailCount -- addr of count of trail bytes still expected;
        on success for a lead or trail byte it is updated to the new
        count;  it is left untouched for ASCII bytes and on failure

Return Value:

    ERROR_SUCCESS -- if byte is valid UTF8 given the trail count
    ERROR_INVALID_DATA -- if invalid

--*/
{
    DWORD   pending = *pdwTrailCount;

    DNSDBG( TRACE, ( "Dns_ValidateUtf8Byte()\n" ));

    //
    //  ASCII byte (0xxxxxxx)
    //      - valid only when no multi-byte character is in progress
    //

    if ( BIT7(chUtf8) == 0 )
    {
        return( pending == 0 ? ERROR_SUCCESS : ERROR_INVALID_DATA );
    }

    //
    //  trail byte (10xxxxxx)
    //      - valid only inside a multi-byte character
    //      - consumes one expected trail byte
    //

    if ( BIT6(chUtf8) == 0 )
    {
        if ( pending == 0 )
        {
            return( ERROR_INVALID_DATA );
        }
        *pdwTrailCount = pending - 1;
        return( ERROR_SUCCESS );
    }

    //
    //  lead byte (11xxxxxx)
    //      - may NOT start while a multi-byte character is in progress
    //      - position of first zero bit gives the number of trail bytes
    //

    if ( pending != 0 )
    {
        return( ERROR_INVALID_DATA );
    }

    if ( BIT5(chUtf8) == 0 )
    {
        pending = 1;        // 110xxxxx -- two byte character
    }
    else if ( BIT4(chUtf8) == 0 )
    {
        pending = 2;        // 1110xxxx -- three byte character
    }
    else if ( BIT3(chUtf8) == 0 )
    {
        pending = 3;        // 11110xxx -- four byte (surrogate) character
    }
    else
    {
        return( ERROR_INVALID_DATA );
    }

    *pdwTrailCount = pending;
    return( ERROR_SUCCESS );
}
//
// UTF8 to unicode conversions
//
// For some reason UTF8 is not supported in Win9x.
// AND the implementation itself is not careful about
// validating UTF8.
//
DWORD
_fastcall
Dns_UnicodeToUtf8(
    IN      PWCHAR          pwUnicode,
    IN      DWORD           cchUnicode,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert unicode characters to UTF8.

    Result is NULL terminated.  The conversion only succeeds if the
    result buffer has room for the converted bytes AND the terminator.

Arguments:

    pwUnicode   -- ptr to start of unicode buffer

    cchUnicode  -- length of unicode buffer, in wide characters

    pchResult   -- ptr to start of result buffer for UTF8 chars;
                    may be NULL, in which case nothing is written and
                    only the required length is computed

    cchResult   -- length of result buffer, in bytes

Return Value:

    Count of UTF8 bytes in result (NOT including the terminating NULL),
    if successful.
    0 if result buffer is too small;  GetLastError() is then
    ERROR_INSUFFICIENT_BUFFER.
    Note that empty input also returns 0, without setting last error.

--*/
{
    WCHAR   wch;                // current unicode character being converted
    DWORD   lengthUtf8 = 0;     // length of UTF8 result string
    WORD    lowSurrogate;
    DWORD   surrogateDword;

    DNSDBG( TRACE, (
        "Dns_UnicodeToUtf8( %.*S )\n",
        cchUnicode,
        pwUnicode ));

    //
    //  loop converting unicode chars until run out or error
    //
    //  in every case the length is bumped first and then compared with
    //  ">=" so that one byte always remains for the terminating NULL
    //

    while ( cchUnicode-- )
    {
        wch = *pwUnicode++;

        //
        //  ASCII character (7 bits or less) -- converts directly
        //

        if ( wch < 0x80 )
        {
            lengthUtf8++;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = (CHAR)wch;
            }
            continue;
        }

        //
        //  wide character up to 0x07ff (11 bits) converts to two bytes
        //      - upper 5 bits in first byte
        //      - lower 6 bits in second byte
        //

        else if ( wch <= UTF8_2_MAX )
        {
            lengthUtf8 += 2;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = UTF8_1ST_OF_2 | wch >> 6;
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( (UCHAR)wch );
            }
            continue;
        }

        //
        //  surrogate pair
        //      - if have high surrogate followed by low surrogate then
        //          process as surrogate pair
        //      - otherwise treat character as ordinary unicode "three-byte"
        //          character, by falling through to below
        //

        else if ( wch >= HIGH_SURROGATE_START &&
                  wch <= HIGH_SURROGATE_END &&
                  cchUnicode &&
                  (lowSurrogate = *pwUnicode) &&
                  lowSurrogate >= LOW_SURROGATE_START &&
                  lowSurrogate <= LOW_SURROGATE_END )
        {
            //  have a surrogate pair
            //      - suck up next unicode character (low surrogate of pair)
            //      - make full DWORD surrogate pair
            //      - then lay out four UTF8 bytes
            //          1st of four, then three trail bytes
            //              11110xxx
            //              10xxxxxx
            //              10xxxxxx
            //              10xxxxxx

            DNSDBG( TRACE, (
                "Have surrogate pair %hx : %hx\n",
                wch,
                lowSurrogate ));

            pwUnicode++;
            cchUnicode--;
            lengthUtf8 += 4;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                surrogateDword = (((wch-0xD800) << 10) + (lowSurrogate - 0xDC00) + 0x10000);

                *pchResult++ = UTF8_1ST_OF_4 | (UCHAR) (surrogateDword >> 18);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword >> 12);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword >> 6);
                *pchResult++ = UTF8_TRAIL    | (UCHAR) LOW6BITS(surrogateDword);

                //  pchResult now points one past the fourth byte, so the
                //  bytes just written are at offsets -4 through -1

                DNSDBG( TRACE, (
                    "Converted surrogate -- DWORD = %08x\n"
                    "\tconverted %x %x %x %x\n",
                    surrogateDword,
                    (UCHAR) *(pchResult-4),
                    (UCHAR) *(pchResult-3),
                    (UCHAR) *(pchResult-2),
                    (UCHAR) *(pchResult-1) ));
            }
        }

        //
        //  wide character (non-zero in top 5 bits) converts to three bytes
        //      - top 4 bits in first byte
        //      - middle 6 bits in second byte
        //      - low 6 bits in third byte
        //
        //  an unpaired surrogate also lands here and is written as an
        //  ordinary three byte character
        //

        else
        {
            lengthUtf8 += 3;

            if ( pchResult )
            {
                if ( lengthUtf8 >= cchResult )
                {
                    goto OutOfBuffer;
                }
                *pchResult++ = UTF8_1ST_OF_3 | (wch >> 12);
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( wch >> 6 );
                *pchResult++ = UTF8_TRAIL    | LOW6BITS( wch );
            }
        }
    }

    //
    //  NULL terminate buffer
    //  return UTF8 byte count
    //

    if ( pchResult && lengthUtf8 < cchResult )
    {
        *pchResult = 0;
    }
    return( lengthUtf8 );

OutOfBuffer:

    SetLastError( ERROR_INSUFFICIENT_BUFFER );
    return( 0 );
}
DWORD
_fastcall
Dns_Utf8ToUnicode(
    IN      PCHAR           pchUtf8,
    IN      DWORD           cchUtf8,
    OUT     PWCHAR          pwResult,
    IN      DWORD           cwResult
    )
/*++

Routine Description:

    Convert UTF8 characters to unicode.

    Result is NULL terminated.  The conversion only succeeds if the
    result buffer has room for the converted characters AND the
    terminator.

    Validation performed:
        - trail byte with no multi-byte character in progress
        - ASCII or lead byte while trail bytes are still expected
        - lead byte of form 11111xxx
        - input ending in the middle of a multi-byte character
        - four byte sequence encoding a value outside 0x10000 - 0x10ffff,
          which can not be expressed as a surrogate pair
    Overlong two and three byte encodings are NOT rejected.

Arguments:

    pchUtf8     -- ptr to start of UTF8 buffer

    cchUtf8     -- length of UTF8 buffer, in bytes

    pwResult    -- ptr to start of result buffer for unicode chars;
                    may be NULL, in which case nothing is written and
                    only the required length is computed

    cwResult    -- length of result buffer in WCHAR

Return Value:

    Count of unicode characters in result (NOT including the terminating
    NULL), if successful.
    0 on error.  GetLastError() has error code:
        ERROR_INSUFFICIENT_BUFFER -- result buffer too small
        ERROR_INVALID_DATA -- input is not valid UTF8
    Note that empty input also returns 0, without setting last error.

--*/
{
    CHAR    ch;                     // current UTF8 character
    WCHAR   wch = 0;                // current unicode character
    DWORD   trailCount = 0;         // count of UTF8 trail bytes to follow
    DWORD   lengthUnicode = 0;      // length of unicode result string
    BOOL    bsurrogatePair = FALSE;
    DWORD   surrogateDword = 0;

    //
    //  loop converting UTF8 chars until run out or error
    //

    while ( cchUtf8-- )
    {
        ch = *pchUtf8++;

        //
        //  ASCII character -- just copy
        //      - but NOT valid in the middle of a multi-byte character;
        //      without this check a broken sequence such as (C3 41 A9)
        //      would be accepted and decoded out of order;  this matches
        //      the ASCII rule in Dns_ValidateUtf8Byte()
        //

        if ( BIT7(ch) == 0 )
        {
            if ( trailCount != 0 )
            {
                goto InvalidUtf8;
            }
            lengthUnicode++;

            if ( pwResult )
            {
                if ( lengthUnicode >= cwResult )
                {
                    goto OutOfBuffer;
                }
                *pwResult++ = (WCHAR)ch;
            }
            continue;
        }

        //
        //  UTF8 trail byte
        //      - if not expected, error
        //      - otherwise shift unicode character 6 bits and
        //          copy in lower six bits of UTF8
        //      - if last UTF8 byte, copy result to unicode string
        //

        else if ( BIT6(ch) == 0 )
        {
            if ( trailCount == 0 )
            {
                goto InvalidUtf8;
            }

            if ( !bsurrogatePair )
            {
                wch <<= 6;
                wch |= LOW6BITS( ch );

                if ( --trailCount == 0 )
                {
                    lengthUnicode++;

                    if ( pwResult )
                    {
                        if ( lengthUnicode >= cwResult )
                        {
                            goto OutOfBuffer;
                        }
                        *pwResult++ = wch;
                    }
                }
                continue;
            }

            //  surrogate pair
            //      - same as above EXCEPT build two unicode chars
            //      from surrogateDword

            else
            {
                surrogateDword <<= 6;
                surrogateDword |= LOW6BITS( ch );

                if ( --trailCount == 0 )
                {
                    //  only 0x10000 - 0x10ffff maps onto a surrogate pair;
                    //  below that the subtraction underflows, above it the
                    //  "high" half leaves the high surrogate range
                    //  checked whether or not a result buffer was given, so
                    //  length-only calls agree with real conversions

                    if ( surrogateDword < 0x10000 ||
                         surrogateDword > 0x10ffff )
                    {
                        goto InvalidUtf8;
                    }
                    lengthUnicode += 2;

                    if ( pwResult )
                    {
                        if ( lengthUnicode >= cwResult )
                        {
                            goto OutOfBuffer;
                        }
                        surrogateDword -= 0x10000;
                        *pwResult++ = (WCHAR) ((surrogateDword >> 10) + HIGH_SURROGATE_START);
                        *pwResult++ = (WCHAR) ((surrogateDword & 0x3ff) + LOW_SURROGATE_START);
                    }
                    bsurrogatePair = FALSE;
                }
            }
        }

        //
        //  UTF8 lead byte
        //      - if currently in extension, error
        //

        else
        {
            if ( trailCount != 0 )
            {
                goto InvalidUtf8;
            }

            //  first of two byte character (110xxxxx)

            if ( BIT5(ch) == 0 )
            {
                trailCount = 1;
                wch = LOW5BITS(ch);
                continue;
            }

            //  first of three byte character (1110xxxx)

            else if ( BIT4(ch) == 0 )
            {
                trailCount = 2;
                wch = LOW4BITS(ch);
                continue;
            }

            //  first of four byte surrogate pair (11110xxx)
            //      - bit 3 is known to be zero here, so the four bit
            //      mask keeps just the three payload bits

            else if ( BIT3(ch) == 0 )
            {
                trailCount = 3;
                surrogateDword = LOW4BITS(ch);
                bsurrogatePair = TRUE;
            }
            else
            {
                goto InvalidUtf8;
            }
        }
    }

    //  catch if hit end in the middle of UTF8 multi-byte character

    if ( trailCount )
    {
        goto InvalidUtf8;
    }

    //
    //  NULL terminate buffer
    //  return the number of Unicode characters written.
    //

    if ( pwResult && lengthUnicode < cwResult )
    {
        *pwResult = 0;
    }
    return( lengthUnicode );

OutOfBuffer:

    SetLastError( ERROR_INSUFFICIENT_BUFFER );
    return( 0 );

InvalidUtf8:

    SetLastError( ERROR_INVALID_DATA );
    return( 0 );
}
//
// UTF8 \ ANSI conversions
//
DWORD
Dns_Utf8ToOrFromAnsi(
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult,
    IN      PCHAR           pchIn,
    IN      DWORD           cchIn,
    IN      DNS_CHARSET     InCharSet,
    IN      DNS_CHARSET     OutCharSet
    )
/*++

Routine Description:

    Convert UTF8 characters to ANSI or vice versa.

    A pure ASCII string is identical in both character sets and is
    simply copied.  Anything else is converted to unicode in a
    temporary buffer and from there to the result character set.

    Note:  this function appears to call string functions (string.c)
    which call back to it.  However, this calls those functions
    ONLY for conversions to\from unicode which do NOT call back
    to these functions.  Ultimately need to check if LCMapString
    can handle these issues.

Arguments:

    pchResult   -- ptr to start of result buffer;  may be NULL, in which
                    case only the required result length is returned

    cchResult   -- length of result buffer

    pchIn       -- ptr to start of input string

    cchIn       -- length of input string;  if zero the string must be
                    NULL terminated and its length is calculated

    InCharSet   -- char set of input string (DnsCharSetAnsi or DnsCharSetUtf8)

    OutCharSet  -- char set for result string (DnsCharSetUtf8 or DnsCharSetAnsi)

Return Value:

    Count of bytes in result (including terminating NULL).
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD       unicodeLength;
    DWORD       resultLength;
    CHAR        tempBuffer[ TEMP_BUFFER_LENGTH ];
    PCHAR       ptemp = tempBuffer;
    DNS_STATUS  status;

    //
    //  if length not given, calculate
    //
    //  done before the trace below so that the trace can bound the
    //  string it prints;  the trace therefore shows the resolved length
    //

    if ( cchIn == 0 )
    {
        cchIn = strlen( pchIn );
    }

    //  "%.*s" limits the print to cchIn bytes;  the input is a counted
    //  buffer and need not be NULL terminated

    DNSDBG( TRACE, (
        "Dns_Utf8ToOrFromAnsi()\n"
        "\tbuffer       = %p\n"
        "\tbuf length   = %d\n"
        "\tpchString    = %p (%.*s)\n"
        "\tcchString    = %d\n"
        "\tCharSetIn    = %d\n"
        "\tCharSetOut   = %d\n",
        pchResult,
        cchResult,
        pchIn,
        cchIn, pchIn,
        cchIn,
        InCharSet,
        OutCharSet ));

    //
    //  validate charsets
    //

    ASSERT( InCharSet != OutCharSet );
    ASSERT( InCharSet == DnsCharSetAnsi || InCharSet == DnsCharSetUtf8 );
    ASSERT( OutCharSet == DnsCharSetAnsi || OutCharSet == DnsCharSetUtf8 );

    //
    //  string completely ASCII
    //      - simple memcopy suffices
    //      - note result must have terminating NULL
    //

    if ( Dns_IsStringAsciiEx(
            pchIn,
            cchIn ) )
    {
        if ( !pchResult )
        {
            return( cchIn + 1 );
        }

        if ( cchResult <= cchIn )
        {
            status = ERROR_INSUFFICIENT_BUFFER;
            goto Failed;
        }
        memcpy(
            pchResult,
            pchIn,
            cchIn );

        pchResult[ cchIn ] = 0;

        return( cchIn+1 );
    }

    //
    //  non-ASCII
    //      - convert to unicode, then to result character set
    //
    //  DCR_PERF:  LCMapStringA() might be able to handle all this
    //              haven't figured out how yet
    //

    unicodeLength = Dns_GetBufferLengthForStringCopy(
                        pchIn,
                        cchIn,
                        InCharSet,
                        DnsCharSetUnicode
                        );

    if ( unicodeLength > TEMP_BUFFER_LENGTH )
    {
        //  can't use stack buffer, must allocate
        //  on failure ptemp is NULL, which the cleanup code must tolerate

        ptemp = Dns_StringCopyAllocate(
                    pchIn,
                    cchIn,
                    InCharSet,
                    DnsCharSetUnicode
                    );
        if ( !ptemp )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }
    }
    else
    {
        if ( unicodeLength == 0 )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }

        //  copy into temporary buffer

        resultLength = Dns_StringCopy(
                        ptemp,
                        NULL,           // adequate buffer length
                        pchIn,
                        cchIn,
                        InCharSet,
                        DnsCharSetUnicode
                        );
        if ( !resultLength )
        {
            status = ERROR_INVALID_DATA;
            goto Failed;
        }
        ASSERT( resultLength == unicodeLength );
    }

    //
    //  conversion to result char set
    //      - if have result buffer, convert into it
    //      - otherwise just determine the required length
    //      - should have at least ONE two byte character
    //          otherwise should have taken fast path above
    //

    if ( pchResult )
    {
        resultLength = Dns_StringCopy(
                            pchResult,
                            & cchResult,        // result buffer length
                            ptemp,
                            0,
                            DnsCharSetUnicode,
                            OutCharSet
                            );
        if ( resultLength == 0 )
        {
            status = ERROR_INSUFFICIENT_BUFFER;
            goto Failed;
        }
        ASSERT( resultLength <= cchResult );
        ASSERT( pchResult[resultLength-1] == 0 );
        ASSERT( resultLength >= unicodeLength/2 );
    }
    else
    {
        resultLength = Dns_GetBufferLengthForStringCopy(
                            ptemp,
                            0,
                            DnsCharSetUnicode,
                            OutCharSet
                            );
        ASSERT( resultLength >= unicodeLength/2 );
    }

    //
    //  cleanup temporary unicode buffer, if it was allocated
    //

    if ( ptemp && ptemp != tempBuffer )
    {
        FREE_HEAP( ptemp );
    }
    return( resultLength );

Failed:

    //
    //  cleanup first, set last error last
    //      - ptemp is NULL if the allocation itself failed
    //      - freeing before SetLastError() so that the free can not
    //          disturb the error code handed back to the caller
    //

    if ( ptemp && ptemp != tempBuffer )
    {
        FREE_HEAP( ptemp );
    }
    SetLastError( status );
    return( 0 );
}
DWORD
Dns_AnsiToUtf8(
    IN      PCHAR           pchAnsi,
    IN      DWORD           cchAnsi,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert ANSI characters to UTF8.

    Thin wrapper:  hands the arguments to Dns_Utf8ToOrFromAnsi() with
    the character sets fixed as ANSI in, UTF8 out.

Arguments:

    pchAnsi     -- ptr to start of ansi buffer

    cchAnsi     -- length of ansi buffer

    pchResult   -- ptr to start of result buffer for UTF8 chars

    cchResult   -- length of result buffer

Return Value:

    Whatever Dns_Utf8ToOrFromAnsi() returns:
    count of bytes in result (including terminating NULL), if successful.
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD   utf8Length;

    utf8Length = Dns_Utf8ToOrFromAnsi(
                    pchResult,          // result buffer
                    cchResult,
                    pchAnsi,            // in string
                    cchAnsi,
                    DnsCharSetAnsi,     // ANSI in
                    DnsCharSetUtf8      // UTF8 out
                    );

    return( utf8Length );
}
DWORD
Dns_Utf8ToAnsi(
    IN      PCHAR           pchUtf8,
    IN      DWORD           cchUtf8,
    OUT     PCHAR           pchResult,
    IN      DWORD           cchResult
    )
/*++

Routine Description:

    Convert UTF8 characters to ANSI.

    Thin wrapper:  hands the arguments to Dns_Utf8ToOrFromAnsi() with
    the character sets fixed as UTF8 in, ANSI out.

Arguments:

    pchUtf8     -- ptr to start of UTF8 buffer

    cchUtf8     -- length of UTF8 buffer

    pchResult   -- ptr to start of result buffer for ansi chars

    cchResult   -- length of result buffer

Return Value:

    Whatever Dns_Utf8ToOrFromAnsi() returns:
    count of bytes in result (including terminating NULL), if successful.
    0 on error.  GetLastError() has error code.

--*/
{
    DWORD   ansiLength;

    ansiLength = Dns_Utf8ToOrFromAnsi(
                    pchResult,          // result buffer
                    cchResult,
                    pchUtf8,            // in string
                    cchUtf8,
                    DnsCharSetUtf8,     // UTF8 in
                    DnsCharSetAnsi      // ANSI out
                    );

    return( ansiLength );
}
BOOL
_fastcall
Dns_IsStringAscii(
    IN      LPSTR           pszString
    )
/*++

Routine Description:

    Check if NULL terminated string is pure ASCII.

    A pure ASCII string reads the same as ANSI and as UTF8, so
        - an ANSI string that passes is already valid UTF8
        - a UTF8 string that passes is already valid ANSI
    which lets callers skip conversion in the common all-ASCII case.

Arguments:

    pszString -- NULL terminated ANSI or UTF8 string to check

Return Value:

    TRUE if string is all ASCII (characters all < 128)
    FALSE if non-ASCII characters.

--*/
{
    LPSTR   pscan;

    //
    //  walk to terminating NULL, bail on first byte with high bit set
    //

    for ( pscan = pszString;  *pscan != 0;  pscan++ )
    {
        if ( (UCHAR) *pscan >= 0x80 )
        {
            return( FALSE );
        }
    }
    return( TRUE );
}
BOOL
_fastcall
Dns_IsStringAsciiEx(
    IN      PCHAR           pchString,
    IN      DWORD           cchString
    )
/*++

Routine Description:

    Check if counted ANSI (or UTF8) string is pure ASCII.

    A pure ASCII string reads the same as ANSI and as UTF8, so
        - an ANSI string that passes is already valid UTF8
        - a UTF8 string that passes is already valid ANSI
    which lets callers skip conversion in the common all-ASCII case.

    Exactly cchString bytes are examined;  a zero byte does NOT stop
    the scan.

Arguments:

    pchString -- ptr to start of string buffer

    cchString -- length of string buffer, in bytes

Return Value:

    TRUE if string is all ASCII (characters all < 128)
    FALSE if non-ASCII characters.

--*/
{
    DWORD   index;

    //
    //  check each byte in turn, bail on first byte with high bit set
    //

    for ( index = 0;  index < cchString;  index++ )
    {
        if ( (UCHAR) pchString[ index ] >= 0x80 )
        {
            return( FALSE );
        }
    }
    return( TRUE );
}
BOOL
_fastcall
Dns_IsWideStringAscii(
    IN      PWCHAR          pwszString
    )
/*++

Routine Description:

    Check if NULL terminated unicode string is pure ASCII.

    This means all characters < 128.

    Strings without extended characters need NOT be downcased
    on the wire.  This allows us to optimize for the 99% case
    where just passing ASCII strings.

Arguments:

    pwszString -- ptr to NULL terminated unicode string

Return Value:

    TRUE if string is all ASCII (characters all < 128)
    FALSE if non-ASCII characters.

--*/
{
    PWCHAR  pscan;

    //
    //  walk to terminating NULL, bail on first character of 128 or above
    //

    for ( pscan = pwszString;  *pscan != 0;  pscan++ )
    {
        if ( (USHORT) *pscan >= 0x80 )
        {
            return( FALSE );
        }
    }
    return( TRUE );
}
//
// End utf8.c
//