1064 lines
23 KiB
C
1064 lines
23 KiB
C
|
/*++
|
|||
|
|
|||
|
Copyright (c) 1997-2001 Microsoft Corporation
|
|||
|
|
|||
|
Module Name:
|
|||
|
|
|||
|
utf8.c
|
|||
|
|
|||
|
Abstract:
|
|||
|
|
|||
|
Domain Name System (DNS) Library
|
|||
|
|
|||
|
UTF8 to\from unicode and ANSI conversions
|
|||
|
|
|||
|
The UTF8\unicode routines are similar to the generic ones floating
|
|||
|
around the NT group, but a heck of a lot cleaner and more robust,
|
|||
|
including catching the invalid UTF8 string case on the utf8 to unicode
|
|||
|
conversion.
|
|||
|
|
|||
|
The UTF8\ANSI routines are optimized for the 99% case where all the
|
|||
|
characters are <128 and no conversion is actually required.
|
|||
|
|
|||
|
Author:
|
|||
|
|
|||
|
Jim Gilroy (jamesg) March 1997
|
|||
|
|
|||
|
Revision History:
|
|||
|
|
|||
|
--*/
|
|||
|
|
|||
|
|
|||
|
#include "local.h"
|
|||
|
|
|||
|
|
|||
|
//
|
|||
|
// Macros to simplify UTF8 conversions
|
|||
|
//
|
|||
|
|
|||
|
//  UTF8 lead and trail byte markers

#define UTF8_1ST_OF_2           0xc0    // lead of two byte sequence      110x xxxx
#define UTF8_1ST_OF_3           0xe0    // lead of three byte sequence    1110 xxxx
#define UTF8_1ST_OF_4           0xf0    // lead of four byte sequence     1111 0xxx
#define UTF8_TRAIL              0x80    // trail byte                     10xx xxxx

//  largest unicode character representable in two byte UTF8

#define UTF8_2_MAX              0x07ff

//  single bit tests;  non-zero when the bit is set

#define BIT7(ch)                ((ch) & 0x80)
#define BIT6(ch)                ((ch) & 0x40)
#define BIT5(ch)                ((ch) & 0x20)
#define BIT4(ch)                ((ch) & 0x10)
#define BIT3(ch)                ((ch) & 0x08)

//  payload bit masks

#define LOW6BITS(ch)            ((ch) & 0x3f)
#define LOW5BITS(ch)            ((ch) & 0x1f)
#define LOW4BITS(ch)            ((ch) & 0x0f)

//  masks (does not shift) the upper eight bits of a 16-bit value

#define HIGHBYTE(wch)           ((wch) & 0xff00)

//
//  Surrogate pair support
//
//  Two unicode characters (a high surrogate followed by a low
//  surrogate) may be linked to form a surrogate pair.
//  A pair travels in UTF8 as a single four byte sequence,
//  NOT as two three byte sequences (six bytes).
//

#define HIGH_SURROGATE_START    0xd800
#define HIGH_SURROGATE_END      0xdbff
#define LOW_SURROGATE_START     0xdc00
#define LOW_SURROGATE_END       0xdfff


//
//  Max "normal conversion", make space for MAX_PATH,
//  this covers all valid DNS names and strings.
//

#define TEMP_BUFFER_LENGTH      (2*MAX_PATH)
|
|||
|
|
|||
|
|
|||
|
|
|||
|
DNS_STATUS
|
|||
|
_fastcall
|
|||
|
Dns_ValidateUtf8Byte(
|
|||
|
IN BYTE chUtf8,
|
|||
|
IN OUT PDWORD pdwTrailCount
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Verifies that byte is valid UTF8 byte.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
ERROR_SUCCESS -- if valid UTF8 given trail count
|
|||
|
ERROR_INVALID_DATA -- if invalid
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
DWORD trailCount = *pdwTrailCount;
|
|||
|
|
|||
|
DNSDBG( TRACE, ( "Dns_ValidateUtf8Byte()\n" ));
|
|||
|
|
|||
|
//
|
|||
|
// if ASCII byte, only requirement is no trail count
|
|||
|
//
|
|||
|
|
|||
|
if ( (UCHAR)chUtf8 < 0x80 )
|
|||
|
{
|
|||
|
if ( trailCount == 0 )
|
|||
|
{
|
|||
|
return( ERROR_SUCCESS );
|
|||
|
}
|
|||
|
return( ERROR_INVALID_DATA );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// trail byte
|
|||
|
// - must be in multi-byte set
|
|||
|
//
|
|||
|
|
|||
|
if ( BIT6(chUtf8) == 0 )
|
|||
|
{
|
|||
|
if ( trailCount == 0 )
|
|||
|
{
|
|||
|
return( ERROR_INVALID_DATA );
|
|||
|
}
|
|||
|
--trailCount;
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// multi-byte lead byte
|
|||
|
// - must NOT be in existing multi-byte set
|
|||
|
// - verify valid lead byte
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
if ( trailCount != 0 )
|
|||
|
{
|
|||
|
return( ERROR_INVALID_DATA );
|
|||
|
}
|
|||
|
|
|||
|
// first of two bytes (110xxxxx)
|
|||
|
|
|||
|
if ( BIT5(chUtf8) == 0 )
|
|||
|
{
|
|||
|
trailCount = 1;
|
|||
|
}
|
|||
|
|
|||
|
// first of three bytes (1110xxxx)
|
|||
|
|
|||
|
else if ( BIT4(chUtf8) == 0 )
|
|||
|
{
|
|||
|
trailCount = 2;
|
|||
|
}
|
|||
|
|
|||
|
// first of four bytes (surrogate character) (11110xxx)
|
|||
|
|
|||
|
else if ( BIT3(chUtf8) == 0 )
|
|||
|
{
|
|||
|
trailCount = 3;
|
|||
|
}
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
return( ERROR_INVALID_DATA );
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// reset caller's trail count
|
|||
|
|
|||
|
*pdwTrailCount = trailCount;
|
|||
|
return( ERROR_SUCCESS );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
//
|
|||
|
// UTF8 to unicode conversions
|
|||
|
//
|
|||
|
// For some reason UTF8 is not supported in Win9x.
|
|||
|
// AND the implementation itself is not careful about
|
|||
|
// validating UTF8.
|
|||
|
//
|
|||
|
|
|||
|
DWORD
|
|||
|
_fastcall
|
|||
|
Dns_UnicodeToUtf8(
|
|||
|
IN PWCHAR pwUnicode,
|
|||
|
IN DWORD cchUnicode,
|
|||
|
OUT PCHAR pchResult,
|
|||
|
IN DWORD cchResult
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Convert unicode characters to UTF8.
|
|||
|
|
|||
|
Result is NULL terminated if sufficient space in result
|
|||
|
buffer is available.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pwUnicode -- ptr to start of unicode buffer
|
|||
|
|
|||
|
cchUnicode -- length of unicode buffer
|
|||
|
|
|||
|
pchResult -- ptr to start of result buffer for UTF8 chars
|
|||
|
|
|||
|
cchResult -- length of result buffer
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
Count of UTF8 characters in result, if successful.
|
|||
|
0 on error. GetLastError() has error code.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
WCHAR wch; // current unicode character being converted
|
|||
|
DWORD lengthUtf8 = 0; // length of UTF8 result string
|
|||
|
WORD lowSurrogate;
|
|||
|
DWORD surrogateDword;
|
|||
|
|
|||
|
|
|||
|
DNSDBG( TRACE, (
|
|||
|
"Dns_UnicodeToUtf8( %.*S )\n",
|
|||
|
cchUnicode,
|
|||
|
pwUnicode ));
|
|||
|
|
|||
|
//
|
|||
|
// loop converting unicode chars until run out or error
|
|||
|
//
|
|||
|
|
|||
|
while ( cchUnicode-- )
|
|||
|
{
|
|||
|
wch = *pwUnicode++;
|
|||
|
|
|||
|
//
|
|||
|
// ASCII character (7 bits or less) -- converts to directly
|
|||
|
//
|
|||
|
|
|||
|
if ( wch < 0x80 )
|
|||
|
{
|
|||
|
lengthUtf8++;
|
|||
|
|
|||
|
if ( pchResult )
|
|||
|
{
|
|||
|
if ( lengthUtf8 >= cchResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
*pchResult++ = (CHAR)wch;
|
|||
|
}
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// wide character less than 0x07ff (11bits) converts to two bytes
|
|||
|
// - upper 5 bits in first byte
|
|||
|
// - lower 6 bits in secondar byte
|
|||
|
//
|
|||
|
|
|||
|
else if ( wch <= UTF8_2_MAX )
|
|||
|
{
|
|||
|
lengthUtf8 += 2;
|
|||
|
|
|||
|
if ( pchResult )
|
|||
|
{
|
|||
|
if ( lengthUtf8 >= cchResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
*pchResult++ = UTF8_1ST_OF_2 | wch >> 6;
|
|||
|
*pchResult++ = UTF8_TRAIL | LOW6BITS( (UCHAR)wch );
|
|||
|
}
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// surrogate pair
|
|||
|
// - if have high surrogate followed by low surrogate then
|
|||
|
// process as surrogate pair
|
|||
|
// - otherwise treat character as ordinary unicode "three-byte"
|
|||
|
// character, by falling through to below
|
|||
|
//
|
|||
|
|
|||
|
else if ( wch >= HIGH_SURROGATE_START &&
|
|||
|
wch <= HIGH_SURROGATE_END &&
|
|||
|
cchUnicode &&
|
|||
|
(lowSurrogate = *pwUnicode) &&
|
|||
|
lowSurrogate >= LOW_SURROGATE_START &&
|
|||
|
lowSurrogate <= LOW_SURROGATE_END )
|
|||
|
{
|
|||
|
// have a surrogate pair
|
|||
|
// - suck up next unicode character (low surrogate of pair)
|
|||
|
// - make full DWORD surrogate pair
|
|||
|
// - then lay out four UTF8 bytes
|
|||
|
// 1st of four, then three trail bytes
|
|||
|
// 0x1111xxxx
|
|||
|
// 0x10xxxxxx
|
|||
|
// 0x10xxxxxx
|
|||
|
// 0x10xxxxxx
|
|||
|
|
|||
|
DNSDBG( TRACE, (
|
|||
|
"Have surrogate pair %hx : %hx\n",
|
|||
|
wch,
|
|||
|
lowSurrogate ));
|
|||
|
|
|||
|
pwUnicode++;
|
|||
|
cchUnicode--;
|
|||
|
lengthUtf8 += 4;
|
|||
|
|
|||
|
if ( pchResult )
|
|||
|
{
|
|||
|
if ( lengthUtf8 >= cchResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
surrogateDword = (((wch-0xD800) << 10) + (lowSurrogate - 0xDC00) + 0x10000);
|
|||
|
|
|||
|
*pchResult++ = UTF8_1ST_OF_4 | (UCHAR) (surrogateDword >> 18);
|
|||
|
*pchResult++ = UTF8_TRAIL | (UCHAR) LOW6BITS(surrogateDword >> 12);
|
|||
|
*pchResult++ = UTF8_TRAIL | (UCHAR) LOW6BITS(surrogateDword >> 6);
|
|||
|
*pchResult++ = UTF8_TRAIL | (UCHAR) LOW6BITS(surrogateDword);
|
|||
|
|
|||
|
DNSDBG( TRACE, (
|
|||
|
"Converted surrogate -- DWORD = %08x\n"
|
|||
|
"\tconverted %x %x %x %x\n",
|
|||
|
surrogateDword,
|
|||
|
(UCHAR) *(pchResult-3),
|
|||
|
(UCHAR) *(pchResult-2),
|
|||
|
(UCHAR) *(pchResult-1),
|
|||
|
(UCHAR) *pchResult ));
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// wide character (non-zero in top 5 bits) converts to three bytes
|
|||
|
// - top 4 bits in first byte
|
|||
|
// - middle 6 bits in second byte
|
|||
|
// - low 6 bits in third byte
|
|||
|
//
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
lengthUtf8 += 3;
|
|||
|
|
|||
|
if ( pchResult )
|
|||
|
{
|
|||
|
if ( lengthUtf8 >= cchResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
*pchResult++ = UTF8_1ST_OF_3 | (wch >> 12);
|
|||
|
*pchResult++ = UTF8_TRAIL | LOW6BITS( wch >> 6 );
|
|||
|
*pchResult++ = UTF8_TRAIL | LOW6BITS( wch );
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// NULL terminate buffer
|
|||
|
// return UTF8 character count
|
|||
|
//
|
|||
|
|
|||
|
if ( pchResult && lengthUtf8 < cchResult )
|
|||
|
{
|
|||
|
*pchResult = 0;
|
|||
|
}
|
|||
|
return( lengthUtf8 );
|
|||
|
|
|||
|
OutOfBuffer:
|
|||
|
|
|||
|
SetLastError( ERROR_INSUFFICIENT_BUFFER );
|
|||
|
return( 0 );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
DWORD
|
|||
|
_fastcall
|
|||
|
Dns_Utf8ToUnicode(
|
|||
|
IN PCHAR pchUtf8,
|
|||
|
IN DWORD cchUtf8,
|
|||
|
OUT PWCHAR pwResult,
|
|||
|
IN DWORD cwResult
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Convert UTF8 characters to unicode.
|
|||
|
|
|||
|
Result is NULL terminated if sufficient space in result
|
|||
|
buffer is available.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pwResult -- ptr to start of result buffer for unicode chars
|
|||
|
|
|||
|
cwResult -- length of result buffer in WCHAR
|
|||
|
|
|||
|
pwUtf8 -- ptr to start of UTF8 buffer
|
|||
|
|
|||
|
cchUtf8 -- length of UTF8 buffer
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
Count of unicode characters in result, if successful.
|
|||
|
0 on error. GetLastError() has error code.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
CHAR ch; // current UTF8 character
|
|||
|
WCHAR wch; // current unicode character
|
|||
|
DWORD trailCount = 0; // count of UTF8 trail bytes to follow
|
|||
|
DWORD lengthUnicode = 0; // length of unicode result string
|
|||
|
BOOL bsurrogatePair = FALSE;
|
|||
|
DWORD surrogateDword;
|
|||
|
|
|||
|
|
|||
|
//
|
|||
|
// loop converting UTF8 chars until run out or error
|
|||
|
//
|
|||
|
|
|||
|
while ( cchUtf8-- )
|
|||
|
{
|
|||
|
ch = *pchUtf8++;
|
|||
|
|
|||
|
//
|
|||
|
// ASCII character -- just copy
|
|||
|
//
|
|||
|
|
|||
|
if ( BIT7(ch) == 0 )
|
|||
|
{
|
|||
|
lengthUnicode++;
|
|||
|
if ( pwResult )
|
|||
|
{
|
|||
|
if ( lengthUnicode >= cwResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
*pwResult++ = (WCHAR)ch;
|
|||
|
}
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// UTF8 trail byte
|
|||
|
// - if not expected, error
|
|||
|
// - otherwise shift unicode character 6 bits and
|
|||
|
// copy in lower six bits of UTF8
|
|||
|
// - if last UTF8 byte, copy result to unicode string
|
|||
|
//
|
|||
|
|
|||
|
else if ( BIT6(ch) == 0 )
|
|||
|
{
|
|||
|
if ( trailCount == 0 )
|
|||
|
{
|
|||
|
goto InvalidUtf8;
|
|||
|
}
|
|||
|
|
|||
|
if ( !bsurrogatePair )
|
|||
|
{
|
|||
|
wch <<= 6;
|
|||
|
wch |= LOW6BITS( ch );
|
|||
|
|
|||
|
if ( --trailCount == 0 )
|
|||
|
{
|
|||
|
lengthUnicode++;
|
|||
|
if ( pwResult )
|
|||
|
{
|
|||
|
if ( lengthUnicode >= cwResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
*pwResult++ = wch;
|
|||
|
}
|
|||
|
}
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
// surrogate pair
|
|||
|
// - same as above EXCEPT build two unicode chars
|
|||
|
// from surrogateDword
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
surrogateDword <<= 6;
|
|||
|
surrogateDword |= LOW6BITS( ch );
|
|||
|
|
|||
|
if ( --trailCount == 0 )
|
|||
|
{
|
|||
|
lengthUnicode += 2;
|
|||
|
|
|||
|
if ( pwResult )
|
|||
|
{
|
|||
|
if ( lengthUnicode >= cwResult )
|
|||
|
{
|
|||
|
goto OutOfBuffer;
|
|||
|
}
|
|||
|
surrogateDword -= 0x10000;
|
|||
|
*pwResult++ = (WCHAR) ((surrogateDword >> 10) + HIGH_SURROGATE_START);
|
|||
|
*pwResult++ = (WCHAR) ((surrogateDword & 0x3ff) + LOW_SURROGATE_START);
|
|||
|
}
|
|||
|
bsurrogatePair = FALSE;
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// UTF8 lead byte
|
|||
|
// - if currently in extension, error
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
if ( trailCount != 0 )
|
|||
|
{
|
|||
|
goto InvalidUtf8;
|
|||
|
}
|
|||
|
|
|||
|
// first of two byte character (110xxxxx)
|
|||
|
|
|||
|
if ( BIT5(ch) == 0 )
|
|||
|
{
|
|||
|
trailCount = 1;
|
|||
|
wch = LOW5BITS(ch);
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
// first of three byte character (1110xxxx)
|
|||
|
|
|||
|
else if ( BIT4(ch) == 0 )
|
|||
|
{
|
|||
|
trailCount = 2;
|
|||
|
wch = LOW4BITS(ch);
|
|||
|
continue;
|
|||
|
}
|
|||
|
|
|||
|
// first of four byte surrogate pair (11110xxx)
|
|||
|
|
|||
|
else if ( BIT3(ch) == 0 )
|
|||
|
{
|
|||
|
trailCount = 3;
|
|||
|
surrogateDword = LOW4BITS(ch);
|
|||
|
bsurrogatePair = TRUE;
|
|||
|
}
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
goto InvalidUtf8;
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// catch if hit end in the middle of UTF8 multi-byte character
|
|||
|
|
|||
|
if ( trailCount )
|
|||
|
{
|
|||
|
goto InvalidUtf8;
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// NULL terminate buffer
|
|||
|
// return the number of Unicode characters written.
|
|||
|
//
|
|||
|
|
|||
|
if ( pwResult && lengthUnicode < cwResult )
|
|||
|
{
|
|||
|
*pwResult = 0;
|
|||
|
}
|
|||
|
return( lengthUnicode );
|
|||
|
|
|||
|
OutOfBuffer:
|
|||
|
|
|||
|
SetLastError( ERROR_INSUFFICIENT_BUFFER );
|
|||
|
return( 0 );
|
|||
|
|
|||
|
InvalidUtf8:
|
|||
|
|
|||
|
SetLastError( ERROR_INVALID_DATA );
|
|||
|
return( 0 );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
//
|
|||
|
// UTF8 \ ANSI conversions
|
|||
|
//
|
|||
|
|
|||
|
DWORD
|
|||
|
Dns_Utf8ToOrFromAnsi(
|
|||
|
OUT PCHAR pchResult,
|
|||
|
IN DWORD cchResult,
|
|||
|
IN PCHAR pchIn,
|
|||
|
IN DWORD cchIn,
|
|||
|
IN DNS_CHARSET InCharSet,
|
|||
|
IN DNS_CHARSET OutCharSet
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Convert UTF8 characters to ANSI or vice versa.
|
|||
|
|
|||
|
Note: this function appears to call string functions (string.c)
|
|||
|
which call back to it. However, this calls those functions
|
|||
|
ONLY for conversions to\from unicode which do NOT call back
|
|||
|
to these functions. Ultimately need to check if LCMapString
|
|||
|
can handle these issues.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pchResult -- ptr to start of result buffer for ansi chars
|
|||
|
|
|||
|
cchResult -- length of result buffer
|
|||
|
|
|||
|
pchIn -- ptr to start of input string
|
|||
|
|
|||
|
cchIn -- length of input string
|
|||
|
|
|||
|
InCharSet -- char set of input string (DnsCharSetAnsi or DnsCharSetUtf8)
|
|||
|
|
|||
|
OutCharSet -- char set for result string (DnsCharSetUtf8 or DnsCharSetAnsi)
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
Count of bytes in result (including terminating NULL).
|
|||
|
0 on error. GetLastError() has error code.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
DWORD unicodeLength;
|
|||
|
DWORD resultLength;
|
|||
|
CHAR tempBuffer[ TEMP_BUFFER_LENGTH ];
|
|||
|
PCHAR ptemp = tempBuffer;
|
|||
|
DNS_STATUS status;
|
|||
|
|
|||
|
DNSDBG( TRACE, (
|
|||
|
"Dns_Utf8ToOrFromAnsi()\n"
|
|||
|
"\tbuffer = %p\n"
|
|||
|
"\tbuf length = %d\n"
|
|||
|
"\tpchString = %p (%*s)\n"
|
|||
|
"\tcchString = %d\n"
|
|||
|
"\tCharSetIn = %d\n"
|
|||
|
"\tCharSetOut = %d\n",
|
|||
|
pchResult,
|
|||
|
cchResult,
|
|||
|
pchIn,
|
|||
|
cchIn, pchIn,
|
|||
|
cchIn,
|
|||
|
InCharSet,
|
|||
|
OutCharSet ));
|
|||
|
|
|||
|
//
|
|||
|
// validate charsets
|
|||
|
//
|
|||
|
|
|||
|
ASSERT( InCharSet != OutCharSet );
|
|||
|
ASSERT( InCharSet == DnsCharSetAnsi || InCharSet == DnsCharSetUtf8 );
|
|||
|
ASSERT( OutCharSet == DnsCharSetAnsi || OutCharSet == DnsCharSetUtf8 );
|
|||
|
|
|||
|
//
|
|||
|
// if length not given, calculate
|
|||
|
//
|
|||
|
|
|||
|
if ( cchIn == 0 )
|
|||
|
{
|
|||
|
cchIn = strlen( pchIn );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// string completely ASCII
|
|||
|
// - simple memcopy suffices
|
|||
|
// - note result must have terminating NULL
|
|||
|
//
|
|||
|
|
|||
|
if ( Dns_IsStringAsciiEx(
|
|||
|
pchIn,
|
|||
|
cchIn ) )
|
|||
|
{
|
|||
|
if ( !pchResult )
|
|||
|
{
|
|||
|
return( cchIn + 1 );
|
|||
|
}
|
|||
|
|
|||
|
if ( cchResult <= cchIn )
|
|||
|
{
|
|||
|
status = ERROR_INSUFFICIENT_BUFFER;
|
|||
|
goto Failed;
|
|||
|
}
|
|||
|
memcpy(
|
|||
|
pchResult,
|
|||
|
pchIn,
|
|||
|
cchIn );
|
|||
|
|
|||
|
pchResult[ cchIn ] = 0;
|
|||
|
|
|||
|
return( cchIn+1 );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// non-ASCII
|
|||
|
// - convert to unicode, then to result character set
|
|||
|
//
|
|||
|
// DCR_PERF: LCMapStringA() might be able to handle all this
|
|||
|
// haven't figured out how yet
|
|||
|
//
|
|||
|
|
|||
|
unicodeLength = Dns_GetBufferLengthForStringCopy(
|
|||
|
pchIn,
|
|||
|
cchIn,
|
|||
|
InCharSet,
|
|||
|
DnsCharSetUnicode
|
|||
|
);
|
|||
|
|
|||
|
if ( unicodeLength > TEMP_BUFFER_LENGTH )
|
|||
|
{
|
|||
|
// can't use static buffer, must allocate
|
|||
|
|
|||
|
ptemp = Dns_StringCopyAllocate(
|
|||
|
pchIn,
|
|||
|
cchIn,
|
|||
|
InCharSet,
|
|||
|
DnsCharSetUnicode
|
|||
|
);
|
|||
|
if ( !ptemp )
|
|||
|
{
|
|||
|
status = ERROR_INVALID_DATA;
|
|||
|
goto Failed;
|
|||
|
}
|
|||
|
}
|
|||
|
else
|
|||
|
{
|
|||
|
if ( unicodeLength == 0 )
|
|||
|
{
|
|||
|
status = ERROR_INVALID_DATA;
|
|||
|
goto Failed;
|
|||
|
}
|
|||
|
|
|||
|
// copy into temporary buffer
|
|||
|
|
|||
|
resultLength = Dns_StringCopy(
|
|||
|
ptemp,
|
|||
|
NULL, // adequate buffer length
|
|||
|
pchIn,
|
|||
|
cchIn,
|
|||
|
InCharSet,
|
|||
|
DnsCharSetUnicode
|
|||
|
);
|
|||
|
if ( !resultLength )
|
|||
|
{
|
|||
|
status = ERROR_INVALID_DATA;
|
|||
|
goto Failed;
|
|||
|
}
|
|||
|
ASSERT( resultLength == unicodeLength );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// conversion to result char set
|
|||
|
// - if have result buffer, convert into it
|
|||
|
// - should have at least ONE two byte character
|
|||
|
// otherwise should have taken fast path above
|
|||
|
//
|
|||
|
|
|||
|
if ( pchResult )
|
|||
|
{
|
|||
|
resultLength = Dns_StringCopy(
|
|||
|
pchResult,
|
|||
|
& cchResult, // result buffer length
|
|||
|
ptemp,
|
|||
|
0,
|
|||
|
DnsCharSetUnicode,
|
|||
|
OutCharSet
|
|||
|
);
|
|||
|
if ( resultLength == 0 )
|
|||
|
{
|
|||
|
status = ERROR_INSUFFICIENT_BUFFER;
|
|||
|
goto Failed;
|
|||
|
}
|
|||
|
ASSERT( resultLength <= cchResult );
|
|||
|
ASSERT( pchResult[resultLength-1] == 0 );
|
|||
|
ASSERT( resultLength >= unicodeLength/2 );
|
|||
|
}
|
|||
|
|
|||
|
else
|
|||
|
{
|
|||
|
resultLength = Dns_GetBufferLengthForStringCopy(
|
|||
|
ptemp,
|
|||
|
0,
|
|||
|
DnsCharSetUnicode,
|
|||
|
OutCharSet
|
|||
|
);
|
|||
|
ASSERT( resultLength >= unicodeLength/2 );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// final mapping from unicode to result character set
|
|||
|
//
|
|||
|
|
|||
|
if ( ptemp != tempBuffer )
|
|||
|
{
|
|||
|
FREE_HEAP( ptemp );
|
|||
|
}
|
|||
|
|
|||
|
return( resultLength );
|
|||
|
|
|||
|
|
|||
|
Failed:
|
|||
|
|
|||
|
SetLastError( status );
|
|||
|
|
|||
|
if ( ptemp != tempBuffer )
|
|||
|
{
|
|||
|
FREE_HEAP( ptemp );
|
|||
|
}
|
|||
|
|
|||
|
return( 0 );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
DWORD
|
|||
|
Dns_AnsiToUtf8(
|
|||
|
IN PCHAR pchAnsi,
|
|||
|
IN DWORD cchAnsi,
|
|||
|
OUT PCHAR pchResult,
|
|||
|
IN DWORD cchResult
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Convert ANSI characters to UTF8.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pchAnsi -- ptr to start of ansi buffer
|
|||
|
|
|||
|
cchAnsi -- length of ansi buffer
|
|||
|
|
|||
|
pchResult -- ptr to start of result buffer for UTF8 chars
|
|||
|
|
|||
|
cchResult -- length of result buffer
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
Count of UTF8 characters in result, if successful.
|
|||
|
0 on error. GetLastError() has error code.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
return Dns_Utf8ToOrFromAnsi(
|
|||
|
pchResult, // result buffer
|
|||
|
cchResult,
|
|||
|
pchAnsi, // in string
|
|||
|
cchAnsi,
|
|||
|
DnsCharSetAnsi, // ANSI in
|
|||
|
DnsCharSetUtf8 // UTF8 out
|
|||
|
);
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
DWORD
|
|||
|
Dns_Utf8ToAnsi(
|
|||
|
IN PCHAR pchUtf8,
|
|||
|
IN DWORD cchUtf8,
|
|||
|
OUT PCHAR pchResult,
|
|||
|
IN DWORD cchResult
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Convert UTF8 characters to ANSI.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pchResult -- ptr to start of result buffer for ansi chars
|
|||
|
|
|||
|
cchResult -- length of result buffer
|
|||
|
|
|||
|
pwUtf8 -- ptr to start of UTF8 buffer
|
|||
|
|
|||
|
cchUtf8 -- length of UTF8 buffer
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
Count of ansi characters in result, if successful.
|
|||
|
0 on error. GetLastError() has error code.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
return Dns_Utf8ToOrFromAnsi(
|
|||
|
pchResult, // result buffer
|
|||
|
cchResult,
|
|||
|
pchUtf8, // in string
|
|||
|
cchUtf8,
|
|||
|
DnsCharSetUtf8, // UTF8 in
|
|||
|
DnsCharSetAnsi // ANSI out
|
|||
|
);
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
BOOL
|
|||
|
_fastcall
|
|||
|
Dns_IsStringAscii(
|
|||
|
IN LPSTR pszString
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Check if string is ASCII.
|
|||
|
|
|||
|
This is equivalent to saying
|
|||
|
- is ANSI string already in UTF8
|
|||
|
or
|
|||
|
- is UTF8 string already in ANSI
|
|||
|
|
|||
|
This allows you to optimize for the 99% case where just
|
|||
|
passing ASCII strings.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pszString -- ANSI or UTF8 string to check for ASCIIhood
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
TRUE if string is all ASCII (characters all < 128)
|
|||
|
FALSE if non-ASCII characters.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
register UCHAR ch;
|
|||
|
|
|||
|
//
|
|||
|
// loop through until hit non-ASCII character
|
|||
|
//
|
|||
|
|
|||
|
while ( ch = (UCHAR) *pszString++ )
|
|||
|
{
|
|||
|
if ( ch < 0x80 )
|
|||
|
{
|
|||
|
continue;
|
|||
|
}
|
|||
|
return( FALSE );
|
|||
|
}
|
|||
|
|
|||
|
return( TRUE );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
BOOL
|
|||
|
_fastcall
|
|||
|
Dns_IsStringAsciiEx(
|
|||
|
IN PCHAR pchString,
|
|||
|
IN DWORD cchString
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Check if ANSI (or UTF8) string is ASCII.
|
|||
|
|
|||
|
This is equivalent to saying
|
|||
|
- is ANSI string already in UTF8
|
|||
|
or
|
|||
|
- is UTF8 string already in ANSI
|
|||
|
|
|||
|
This allows you to optimize for the 99% case where just
|
|||
|
passing ASCII strings.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pchString -- ptr to start of ansi buffer
|
|||
|
|
|||
|
cchString -- length of ansi buffer
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
TRUE if string is all ASCII (characters all < 128)
|
|||
|
FALSE if non-ASCII characters.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
//
|
|||
|
// loop through until hit non-ASCII character
|
|||
|
//
|
|||
|
|
|||
|
while ( cchString-- )
|
|||
|
{
|
|||
|
if ( (UCHAR)*pchString++ < 0x80 )
|
|||
|
{
|
|||
|
continue;
|
|||
|
}
|
|||
|
return( FALSE );
|
|||
|
}
|
|||
|
|
|||
|
return( TRUE );
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
BOOL
|
|||
|
_fastcall
|
|||
|
Dns_IsWideStringAscii(
|
|||
|
IN PWCHAR pwszString
|
|||
|
)
|
|||
|
/*++
|
|||
|
|
|||
|
Routine Description:
|
|||
|
|
|||
|
Check if unicode string is ASCII.
|
|||
|
This means all characters < 128.
|
|||
|
|
|||
|
Strings without extended characters need NOT be downcased
|
|||
|
on the wire. This allows us to optimize for the 99% case
|
|||
|
where just passing ASCII strings.
|
|||
|
|
|||
|
Arguments:
|
|||
|
|
|||
|
pwszString -- ptr to unicode string
|
|||
|
|
|||
|
Return Value:
|
|||
|
|
|||
|
TRUE if string is all ASCII (characters all < 128)
|
|||
|
FALSE if non-ASCII characters.
|
|||
|
|
|||
|
--*/
|
|||
|
{
|
|||
|
register USHORT ch;
|
|||
|
|
|||
|
//
|
|||
|
// loop through until hit non-ASCII character
|
|||
|
//
|
|||
|
|
|||
|
while ( ch = (USHORT) *pwszString++ )
|
|||
|
{
|
|||
|
if ( ch < 0x80 )
|
|||
|
{
|
|||
|
continue;
|
|||
|
}
|
|||
|
return( FALSE );
|
|||
|
}
|
|||
|
|
|||
|
return( TRUE );
|
|||
|
}
|
|||
|
|
|||
|
//
|
|||
|
// End utf8.c
|
|||
|
//
|