692 lines
20 KiB
C++
692 lines
20 KiB
C++
//+-------------------------------------------------------------------------
|
|
//
|
|
// Microsoft Windows
|
|
//
|
|
// Copyright (C) Microsoft Corporation, 1995 - 1999
|
|
//
|
|
// File: utf8.cpp
|
|
//
|
|
// Contents: WideChar to/from UTF8 APIs
|
|
//
|
|
// Functions: WideCharToUTF8
|
|
// UTF8ToWideChar
|
|
//
|
|
// History: 19-Feb-97 philh created
|
|
// 28-Aug-99 philh added surrogate support. Copied from
|
|
// nt\private\windows\winnls\utf.c or
|
|
// \\rastaman\ntwin\src\winnls\utf.c.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
#include "global.hxx"
|
|
#include <dbgdef.h>
|
|
#include "utf8.h"
|
|
|
|
#if 1
|
|
|
|
// NEW SURROGATE VERSION
|
|
|
|
//
|
|
// Constant Declarations.
|
|
//
|
|
|
|
#define ASCII 0x007f
|
|
|
|
#define SHIFT_IN '+' // beginning of a shift sequence
|
|
#define SHIFT_OUT '-' // end of a shift sequence
|
|
|
|
#define UTF8_2_MAX 0x07ff // max UTF8 2-byte sequence (32 * 64 = 2048)
|
|
#define UTF8_1ST_OF_2 0xc0 // 110x xxxx
|
|
#define UTF8_1ST_OF_3 0xe0 // 1110 xxxx
|
|
#define UTF8_1ST_OF_4 0xf0 // 1111 xxxx
|
|
#define UTF8_TRAIL 0x80 // 10xx xxxx
|
|
|
|
#define HIGHER_6_BIT(u) ((u) >> 12)
|
|
#define MIDDLE_6_BIT(u) (((u) & 0x0fc0) >> 6)
|
|
#define LOWER_6_BIT(u) ((u) & 0x003f)
|
|
|
|
#define BIT7(a) ((a) & 0x80)
|
|
#define BIT6(a) ((a) & 0x40)
|
|
|
|
#define HIGH_SURROGATE_START 0xd800
|
|
#define HIGH_SURROGATE_END 0xdbff
|
|
#define LOW_SURROGATE_START 0xdc00
|
|
#define LOW_SURROGATE_END 0xdfff
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// UTF8ToUnicode
|
|
//
|
|
// Maps a UTF-8 character string to its wide character string counterpart.
|
|
//
|
|
// 02-06-96 JulieB Created.
|
|
// 03-20-99 SamerA Surrogate support.
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
int
|
|
WINAPI
|
|
UTF8ToWideChar(
|
|
LPCSTR lpSrcStr,
|
|
int cchSrc,
|
|
LPWSTR lpDestStr,
|
|
int cchDest)
|
|
{
|
|
int nTB = 0; // # trail bytes to follow
|
|
int cchWC = 0; // # of Unicode code points generated
|
|
LPCSTR pUTF8 = lpSrcStr;
|
|
DWORD dwSurrogateChar = 0; // Full surrogate char
|
|
BOOL bSurrogatePair = FALSE; // Indicate we'r collecting a surrogate pair
|
|
char UTF8;
|
|
|
|
// BEGIN ADDED CHECKS
|
|
if (cchDest < 0)
|
|
goto InvalidParameter;
|
|
|
|
if (cchSrc < 0)
|
|
cchSrc = strlen(lpSrcStr) + 1;
|
|
// END ADDED CHECKS
|
|
|
|
while ((cchSrc--) && ((cchDest == 0) || (cchWC < cchDest)))
|
|
{
|
|
//
|
|
// See if there are any trail bytes.
|
|
//
|
|
if (BIT7(*pUTF8) == 0)
|
|
{
|
|
|
|
// BEGIN FIX
|
|
if (nTB != 0)
|
|
goto InvalidParameter;
|
|
// END FIX
|
|
|
|
//
|
|
// Found ASCII.
|
|
//
|
|
if (cchDest)
|
|
{
|
|
lpDestStr[cchWC] = (WCHAR)*pUTF8;
|
|
}
|
|
bSurrogatePair = FALSE;
|
|
cchWC++;
|
|
}
|
|
else if (BIT6(*pUTF8) == 0)
|
|
{
|
|
//
|
|
// Found a trail byte.
|
|
// Note : Ignore the trail byte if there was no lead byte.
|
|
//
|
|
if (nTB != 0)
|
|
{
|
|
//
|
|
// Decrement the trail byte counter.
|
|
//
|
|
nTB--;
|
|
|
|
if (bSurrogatePair)
|
|
{
|
|
dwSurrogateChar <<= 6;
|
|
dwSurrogateChar |= LOWER_6_BIT(*pUTF8);
|
|
|
|
if (nTB == 0)
|
|
{
|
|
if (cchDest)
|
|
{
|
|
if ((cchWC + 1) < cchDest)
|
|
{
|
|
lpDestStr[cchWC] = (WCHAR)
|
|
(((dwSurrogateChar - 0x10000) >> 10) + HIGH_SURROGATE_START);
|
|
|
|
lpDestStr[cchWC+1] = (WCHAR)
|
|
((dwSurrogateChar - 0x10000)%0x400 + LOW_SURROGATE_START);
|
|
}
|
|
// BEGIN FIX
|
|
else
|
|
{
|
|
SetLastError(ERROR_INSUFFICIENT_BUFFER);
|
|
return (0);
|
|
}
|
|
// END FIX
|
|
}
|
|
|
|
cchWC += 2;
|
|
bSurrogatePair = FALSE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Make room for the trail byte and add the trail byte
|
|
// value.
|
|
//
|
|
if (cchDest)
|
|
{
|
|
lpDestStr[cchWC] <<= 6;
|
|
lpDestStr[cchWC] |= LOWER_6_BIT(*pUTF8);
|
|
}
|
|
|
|
if (nTB == 0)
|
|
{
|
|
//
|
|
// End of sequence. Advance the output counter.
|
|
//
|
|
cchWC++;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// error - not expecting a trail byte
|
|
// BEGIN FIX
|
|
// bSurrogatePair = FALSE;
|
|
|
|
goto InvalidParameter;
|
|
// END FIX
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Found a lead byte.
|
|
//
|
|
if (nTB > 0)
|
|
{
|
|
//
|
|
// Error - previous sequence not finished.
|
|
//
|
|
// BEGIN FIX
|
|
// nTB = 0;
|
|
// bSurrogatePair = FALSE;
|
|
// cchWC++;
|
|
|
|
goto InvalidParameter;
|
|
// END FIX
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Calculate the number of bytes to follow.
|
|
// Look for the first 0 from left to right.
|
|
//
|
|
UTF8 = *pUTF8;
|
|
while (BIT7(UTF8) != 0)
|
|
{
|
|
UTF8 <<= 1;
|
|
nTB++;
|
|
}
|
|
|
|
//
|
|
// If this is a surrogate unicode pair
|
|
//
|
|
if (nTB == 4)
|
|
{
|
|
dwSurrogateChar = UTF8 >> nTB;
|
|
bSurrogatePair = TRUE;
|
|
}
|
|
// BEGIN FIX
|
|
else if (nTB >= 5)
|
|
{
|
|
goto InvalidParameter;
|
|
}
|
|
// END FIX
|
|
|
|
//
|
|
// Store the value from the first byte and decrement
|
|
// the number of bytes to follow.
|
|
//
|
|
if (cchDest)
|
|
{
|
|
lpDestStr[cchWC] = (WCHAR) (UTF8 >> nTB);
|
|
}
|
|
nTB--;
|
|
}
|
|
}
|
|
|
|
pUTF8++;
|
|
}
|
|
|
|
// BEGIN FIX
|
|
if (nTB != 0)
|
|
goto InvalidParameter;
|
|
// END FIX
|
|
|
|
//
|
|
// Make sure the destination buffer was large enough.
|
|
//
|
|
if (cchDest && (cchSrc >= 0))
|
|
{
|
|
SetLastError(ERROR_INSUFFICIENT_BUFFER);
|
|
return (0);
|
|
}
|
|
|
|
//
|
|
// Return the number of Unicode characters written.
|
|
//
|
|
return (cchWC);
|
|
|
|
// BEGIN FIX
|
|
InvalidParameter:
|
|
SetLastError(ERROR_INVALID_PARAMETER);
|
|
return (0);
|
|
// END FIX
|
|
}
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// UnicodeToUTF8
|
|
//
|
|
// Maps a Unicode character string to its UTF-8 string counterpart.
|
|
//
|
|
// 02-06-96 JulieB Created.
|
|
// 03-20-99 SamerA Surrogate support.
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
int
|
|
WINAPI
|
|
WideCharToUTF8(
|
|
LPCWSTR lpSrcStr,
|
|
int cchSrc,
|
|
LPSTR lpDestStr,
|
|
int cchDest)
|
|
{
|
|
LPCWSTR lpWC = lpSrcStr;
|
|
int cchU8 = 0; // # of UTF8 chars generated
|
|
DWORD dwSurrogateChar;
|
|
WCHAR wchHighSurrogate = 0;
|
|
BOOL bHandled;
|
|
|
|
// BEGIN ADDED CHECKS
|
|
if (cchDest < 0)
|
|
{
|
|
SetLastError(ERROR_INVALID_PARAMETER);
|
|
return (0);
|
|
}
|
|
|
|
if (cchSrc < 0)
|
|
cchSrc = wcslen(lpSrcStr) + 1;
|
|
// END ADDED CHECKS
|
|
|
|
while ((cchSrc--) && ((cchDest == 0) || (cchU8 < cchDest)))
|
|
{
|
|
bHandled = FALSE;
|
|
|
|
//
|
|
// Check if high surrogate is available
|
|
//
|
|
if ((*lpWC >= HIGH_SURROGATE_START) && (*lpWC <= HIGH_SURROGATE_END))
|
|
{
|
|
if (cchDest)
|
|
{
|
|
// Another high surrogate, then treat the 1st as normal
|
|
// Unicode character.
|
|
if (wchHighSurrogate)
|
|
{
|
|
if ((cchU8 + 2) < cchDest)
|
|
{
|
|
lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
|
|
}
|
|
else
|
|
{
|
|
// not enough buffer
|
|
cchSrc++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
cchU8 += 3;
|
|
}
|
|
wchHighSurrogate = *lpWC;
|
|
bHandled = TRUE;
|
|
}
|
|
|
|
if (!bHandled && wchHighSurrogate)
|
|
{
|
|
if ((*lpWC >= LOW_SURROGATE_START) && (*lpWC <= LOW_SURROGATE_END))
|
|
{
|
|
// wheee, valid surrogate pairs
|
|
|
|
if (cchDest)
|
|
{
|
|
if ((cchU8 + 3) < cchDest)
|
|
{
|
|
dwSurrogateChar = (((wchHighSurrogate-0xD800) << 10) + (*lpWC - 0xDC00) + 0x10000);
|
|
|
|
lpDestStr[cchU8++] = (UTF8_1ST_OF_4 |
|
|
(unsigned char)(dwSurrogateChar >> 18)); // 3 bits from 1st byte
|
|
|
|
lpDestStr[cchU8++] = (UTF8_TRAIL |
|
|
(unsigned char)((dwSurrogateChar >> 12) & 0x3f)); // 6 bits from 2nd byte
|
|
|
|
lpDestStr[cchU8++] = (UTF8_TRAIL |
|
|
(unsigned char)((dwSurrogateChar >> 6) & 0x3f)); // 6 bits from 3rd byte
|
|
|
|
lpDestStr[cchU8++] = (UTF8_TRAIL |
|
|
(unsigned char)(0x3f & dwSurrogateChar)); // 6 bits from 4th byte
|
|
}
|
|
else
|
|
{
|
|
// not enough buffer
|
|
cchSrc++;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// we already counted 3 previously (in high surrogate)
|
|
cchU8 += 1;
|
|
}
|
|
|
|
bHandled = TRUE;
|
|
}
|
|
else
|
|
{
|
|
// Bad Surrogate pair : ERROR
|
|
// Just process wchHighSurrogate , and the code below will
|
|
// process the current code point
|
|
if (cchDest)
|
|
{
|
|
if ((cchU8 + 2) < cchDest)
|
|
{
|
|
lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
|
|
}
|
|
else
|
|
{
|
|
// not enough buffer
|
|
cchSrc++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
wchHighSurrogate = 0;
|
|
}
|
|
|
|
if (!bHandled)
|
|
{
|
|
if (*lpWC <= ASCII)
|
|
{
|
|
//
|
|
// Found ASCII.
|
|
//
|
|
if (cchDest)
|
|
{
|
|
lpDestStr[cchU8] = (char)*lpWC;
|
|
}
|
|
cchU8++;
|
|
}
|
|
else if (*lpWC <= UTF8_2_MAX)
|
|
{
|
|
//
|
|
// Found 2 byte sequence if < 0x07ff (11 bits).
|
|
//
|
|
if (cchDest)
|
|
{
|
|
if ((cchU8 + 1) < cchDest)
|
|
{
|
|
//
|
|
// Use upper 5 bits in first byte.
|
|
// Use lower 6 bits in second byte.
|
|
//
|
|
lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_2 | (*lpWC >> 6));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(*lpWC));
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Error - buffer too small.
|
|
//
|
|
cchSrc++;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
cchU8 += 2;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Found 3 byte sequence.
|
|
//
|
|
if (cchDest)
|
|
{
|
|
if ((cchU8 + 2) < cchDest)
|
|
{
|
|
//
|
|
// Use upper 4 bits in first byte.
|
|
// Use middle 6 bits in second byte.
|
|
// Use lower 6 bits in third byte.
|
|
//
|
|
lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(*lpWC));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(*lpWC));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(*lpWC));
|
|
}
|
|
else
|
|
{
|
|
//
|
|
// Error - buffer too small.
|
|
//
|
|
cchSrc++;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
cchU8 += 3;
|
|
}
|
|
}
|
|
}
|
|
|
|
lpWC++;
|
|
}
|
|
|
|
//
|
|
// If the last character was a high surrogate, then handle it as a normal
|
|
// unicode character.
|
|
//
|
|
if ((cchSrc < 0) && (wchHighSurrogate != 0))
|
|
{
|
|
if (cchDest)
|
|
{
|
|
if ((cchU8 + 2) < cchDest)
|
|
{
|
|
lpDestStr[cchU8++] = (char) (UTF8_1ST_OF_3 | HIGHER_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | MIDDLE_6_BIT(wchHighSurrogate));
|
|
lpDestStr[cchU8++] = (char) (UTF8_TRAIL | LOWER_6_BIT(wchHighSurrogate));
|
|
}
|
|
else
|
|
{
|
|
cchSrc++;
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// Make sure the destination buffer was large enough.
|
|
//
|
|
if (cchDest && (cchSrc >= 0))
|
|
{
|
|
SetLastError(ERROR_INSUFFICIENT_BUFFER);
|
|
return (0);
|
|
}
|
|
|
|
//
|
|
// Return the number of UTF-8 characters written.
|
|
//
|
|
return (cchU8);
|
|
}
|
|
|
|
#else
|
|
|
|
// OLD IMPLEMENTATION NOT SUPPORTING SURROGATE PAIRS
|
|
|
|
//+-------------------------------------------------------------------------
|
|
// Maps a wide-character (Unicode) string to a new UTF-8 encoded character
|
|
// string.
|
|
//
|
|
// The wide characters are mapped as follows:
|
|
//
|
|
// Start End Bits UTF-8 Characters
|
|
// ------ ------ ---- --------------------------------
|
|
// 0x0000 0x007F 7 0x0xxxxxxx
|
|
// 0x0080 0x07FF 11 0x110xxxxx 0x10xxxxxx
|
|
// 0x0800 0xFFFF 16 0x1110xxxx 0x10xxxxxx 0x10xxxxxx
|
|
//
|
|
// The parameter and return value semantics are the same as for the
|
|
// Win32 API, WideCharToMultiByte.
|
|
//
|
|
// Note, starting with NT 4.0, WideCharToMultiByte supports CP_UTF8. CP_UTF8
|
|
// isn't supported on Win95.
|
|
//--------------------------------------------------------------------------
|
|
int
|
|
WINAPI
|
|
WideCharToUTF8(
|
|
IN LPCWSTR lpWideCharStr,
|
|
IN int cchWideChar,
|
|
OUT LPSTR lpUTF8Str,
|
|
IN int cchUTF8
|
|
)
|
|
{
|
|
int cchRemainUTF8;
|
|
|
|
if (cchUTF8 < 0)
|
|
goto InvalidParameter;
|
|
cchRemainUTF8 = cchUTF8;
|
|
|
|
if (cchWideChar < 0)
|
|
cchWideChar = wcslen(lpWideCharStr) + 1;
|
|
|
|
while (cchWideChar--) {
|
|
WCHAR wch = *lpWideCharStr++;
|
|
if (wch <= 0x7F) {
|
|
// 7 bits
|
|
cchRemainUTF8 -= 1;
|
|
if (cchRemainUTF8 >= 0)
|
|
*lpUTF8Str++ = (char) wch;
|
|
} else if (wch <= 0x7FF) {
|
|
// 11 bits
|
|
cchRemainUTF8 -= 2;
|
|
if (cchRemainUTF8 >= 0) {
|
|
*lpUTF8Str++ = (char) (0xC0 | ((wch >> 6) & 0x1F));
|
|
*lpUTF8Str++ = (char) (0x80 | (wch & 0x3F));
|
|
}
|
|
} else {
|
|
// 16 bits
|
|
cchRemainUTF8 -= 3;
|
|
if (cchRemainUTF8 >= 0) {
|
|
*lpUTF8Str++ = (char) (0xE0 | ((wch >> 12) & 0x0F));
|
|
*lpUTF8Str++ = (char) (0x80 | ((wch >> 6) & 0x3F));
|
|
*lpUTF8Str++ = (char) (0x80 | (wch & 0x3F));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (cchRemainUTF8 >= 0)
|
|
cchUTF8 = cchUTF8 - cchRemainUTF8;
|
|
else if (cchUTF8 == 0)
|
|
cchUTF8 = -cchRemainUTF8;
|
|
else {
|
|
cchUTF8 = 0;
|
|
SetLastError(ERROR_INSUFFICIENT_BUFFER);
|
|
}
|
|
return cchUTF8;
|
|
|
|
InvalidParameter:
|
|
SetLastError(ERROR_INVALID_PARAMETER);
|
|
return 0;
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
// Maps a UTF-8 encoded character string to a new wide-character (Unicode)
|
|
// string.
|
|
//
|
|
// See CertWideCharToUTF8 for how the UTF-8 characters are mapped to wide
|
|
// characters.
|
|
//
|
|
// The parameter and return value semantics are the same as for the
|
|
// Win32 API, MultiByteToWideChar.
|
|
//
|
|
// If the UTF-8 characters don't contain the expected high order bits,
|
|
// ERROR_INVALID_PARAMETER is set and 0 is returned.
|
|
//
|
|
// Note, starting with NT 4.0, MultiByteToWideChar supports CP_UTF8. CP_UTF8
|
|
// isn't supported on Win95.
|
|
//--------------------------------------------------------------------------
|
|
int
|
|
WINAPI
|
|
UTF8ToWideChar(
|
|
IN LPCSTR lpUTF8Str,
|
|
IN int cchUTF8,
|
|
OUT LPWSTR lpWideCharStr,
|
|
IN int cchWideChar
|
|
)
|
|
{
|
|
int cchRemainWideChar;
|
|
|
|
if (cchWideChar < 0)
|
|
goto InvalidParameter;
|
|
cchRemainWideChar = cchWideChar;
|
|
|
|
if (cchUTF8 < 0)
|
|
cchUTF8 = strlen(lpUTF8Str) + 1;
|
|
|
|
while (cchUTF8--) {
|
|
char ch = *lpUTF8Str++;
|
|
WCHAR wch;
|
|
if (0 == (ch & 0x80))
|
|
// 7 bits, 1 byte
|
|
wch = (WCHAR) ch;
|
|
else if (0xC0 == (ch & 0xE0)) {
|
|
// 11 bits, 2 bytes
|
|
char ch2;
|
|
|
|
if (--cchUTF8 < 0)
|
|
goto InvalidParameter;
|
|
ch2 = *lpUTF8Str++;
|
|
if (0x80 != (ch2 & 0xC0))
|
|
goto InvalidParameter;
|
|
wch = (((WCHAR) ch & 0x1F) << 6) | ((WCHAR) ch2 & 0x3F);
|
|
} else if (0xE0 == (ch & 0xF0)) {
|
|
// 16 bits, 3 bytes
|
|
char ch2;
|
|
char ch3;
|
|
cchUTF8 -= 2;
|
|
if (cchUTF8 < 0)
|
|
goto InvalidParameter;
|
|
ch2 = *lpUTF8Str++;
|
|
ch3 = *lpUTF8Str++;
|
|
if (0x80 != (ch2 & 0xC0) || 0x80 != (ch3 & 0xC0))
|
|
goto InvalidParameter;
|
|
wch = (((WCHAR) ch & 0x0F) << 12) | (((WCHAR) ch2 & 0x3F) << 6) |
|
|
((WCHAR) ch3 & 0x3F);
|
|
} else
|
|
goto InvalidParameter;
|
|
|
|
if (--cchRemainWideChar >= 0)
|
|
*lpWideCharStr++ = wch;
|
|
}
|
|
|
|
if (cchRemainWideChar >= 0)
|
|
cchWideChar = cchWideChar - cchRemainWideChar;
|
|
else if (cchWideChar == 0)
|
|
cchWideChar = -cchRemainWideChar;
|
|
else {
|
|
cchWideChar = 0;
|
|
SetLastError(ERROR_INSUFFICIENT_BUFFER);
|
|
}
|
|
return cchWideChar;
|
|
|
|
InvalidParameter:
|
|
SetLastError(ERROR_INVALID_PARAMETER);
|
|
return 0;
|
|
}
|
|
|
|
#endif
|