// 217 lines, 4 KiB, C++
|
#include "precomp.h"
|
||
|
#include "utf8str.h"
|
||
|
|
||
|
//
// Destructor: releases whichever buffer this object allocated itself, as
// recorded in m_eAlloc.  A buffer whose ALLOC_* bit is not set is left alone.
//
// The buffers created in this file (EncodeUTF8 / DecodeUTF8) come from
// array new (new CHAR[] / new WCHAR[]), so they must be released with
// array delete; scalar delete on them is undefined behaviour.
//
CUTF8String::~CUTF8String()
{
    if (ALLOC_UNICODE & m_eAlloc)
    {
        delete [] m_pwszUnicode;
    }
    else if (ALLOC_UTF8 & m_eAlloc)
    {
        delete [] m_pszUTF8;
    }
}
|
||
|
|
||
|
//
// Conversion to a wide-character string.
//
// If the wide form already exists it is returned directly.  Otherwise, when
// a UTF-8 form is present, DecodeUTF8() is run first to try to build the
// wide form.  The result is whatever m_pwszUnicode holds afterwards, which
// may still be NULL.
//
CUTF8String::operator LPWSTR()
{
    if (NULL != m_pwszUnicode)
    {
        return m_pwszUnicode;
    }

    if (NULL != m_pszUTF8)
    {
        DecodeUTF8();
    }

    return m_pwszUnicode;
}
|
||
|
|
||
|
//
// Conversion to a UTF-8 string.
//
// If the UTF-8 form already exists it is returned directly.  Otherwise, when
// a wide-character form is present, EncodeUTF8() is run first to try to
// build the UTF-8 form.  The result is whatever m_pszUTF8 holds afterwards,
// which may still be NULL.
//
CUTF8String::operator LPSTR()
{
    if (NULL != m_pszUTF8)
    {
        return m_pszUTF8;
    }

    if (NULL != m_pwszUnicode)
    {
        EncodeUTF8();
    }

    return m_pszUTF8;
}
|
||
|
|
||
|
//
// EncodeUTF8
//
// Builds m_pszUTF8 from m_pwszUnicode.  Works in two passes: the first adds
// up the number of bytes required (terminator included), the second writes
// the encoded bytes into a freshly allocated buffer.
//
// Every WCHAR is encoded on its own:
//      below 0x80      -> 1 byte
//      below 0x800     -> 2 bytes
//      anything else   -> 3 bytes
//
// On return m_hr is S_OK, or E_OUTOFMEMORY if the buffer allocation
// returned NULL.
//
VOID CUTF8String::EncodeUTF8()
{
    DebugEntry(CUTF8String::EncodeUTF8);

    m_hr = S_OK;
    ASSERT(NULL != m_pwszUnicode);

    // Sizing pass.  Start at one so the terminator always has room.
    int cbNeeded = 1;
    for (LPWSTR pwszScan = m_pwszUnicode; L'\0' != *pwszScan; pwszScan++)
    {
        WCHAR wchScan = *pwszScan;
        cbNeeded += (wchScan < 0x80) ? 1 : ((wchScan < 0x800) ? 2 : 3);
    }

    ASSERT(NULL == m_pszUTF8);
    m_pszUTF8 = new CHAR[cbNeeded];
    if (NULL == m_pszUTF8)
    {
        m_hr = E_OUTOFMEMORY;
    }
    else
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UTF8;

        // Encoding pass.  The loop runs until the buffer is full; the last
        // character it consumes is the wide terminator, which is emitted as
        // a single zero byte.
        LPSTR  pszEnd = m_pszUTF8 + cbNeeded;
        LPSTR  pszOut = m_pszUTF8;
        LPWSTR pwszIn = m_pwszUnicode;

        while (pszOut < pszEnd)
        {
            WCHAR wchCur = *pwszIn++;

#ifdef DEBUG
            // The terminator has to land in the final byte of the buffer.
            if (L'\0' == wchCur)
            {
                ASSERT(pszOut == pszEnd - 1);
            }
#endif // DEBUG

            if (wchCur < 0x80)
            {
                // 0xxxxxxx
                *pszOut++ = (CHAR) wchCur;
            }
            else if (wchCur < 0x800)
            {
                // 110xxxxx 10xxxxxx
                *pszOut++ = (CHAR) (0xC0 | (wchCur >> 6));
                *pszOut++ = (CHAR) (0x80 | (wchCur & 0x3F));
            }
            else
            {
                // 1110xxxx 10xxxxxx 10xxxxxx
                *pszOut++ = (CHAR) (0xE0 | (wchCur >> 12));
                *pszOut++ = (CHAR) (0x80 | ((wchCur >> 6) & 0x3F));
                *pszOut++ = (CHAR) (0x80 | (wchCur & 0x3F));
            }
        }

        m_hr = S_OK;
    }

    DebugExitVOID(CUTF8String::EncodeUTF8);
}
|
||
|
|
||
|
//
// Returns how many consecutive 1 bits sit at the top of bLead (0 through 8).
// For a UTF-8 lead byte this is the byte length the sequence announces;
// 0 means a plain one-byte (7-bit) character.
//
static int Utf8CountLeadBits(unsigned char bLead)
{
    int cBits = 0;
    while (bLead & 0x80)
    {
        cBits++;
        bLead = (unsigned char) (bLead << 1);
    }
    return cBits;
}

//
// Returns how many of the first cbMax bytes at psz can be consumed before
// reaching the NUL terminator.  Used so that a truncated multi-byte sequence
// can never carry the decoder past the end of the string.
//
static int Utf8BytesBeforeNul(const char* psz, int cbMax)
{
    int cb = 0;
    while ((cb < cbMax) && ('\0' != psz[cb]))
    {
        cb++;
    }
    return cb;
}

//
// DecodeUTF8
//
// Builds m_pwszUnicode from m_pszUTF8.  Works in two passes: the first
// counts how many WCHARs are required (terminator included), the second
// decodes into a freshly allocated buffer.
//
// Handling by announced sequence length (leading 1 bits of the lead byte):
//      0       one-byte character, copied as is
//      2, 3    decoded; continuation bytes are checked for the 10xxxxxx form
//      4 - 6   not really supported: the leading bytes are skipped and the
//              last three bytes are decoded as if they were a 3-byte sequence
//      other   invalid lead byte, replaced by L'?'
//
// On return m_hr is:
//      S_OK            decoding completed
//      E_FAIL          a bad or truncated sequence was found; the output is
//                      NUL-terminated at the point of failure
//      E_OUTOFMEMORY   the buffer allocation returned NULL
//
// Both passes step through the input with Utf8BytesBeforeNul so that a
// sequence cut short by the terminator never makes them read past the end
// of the string.
//
VOID CUTF8String::DecodeUTF8()
{
    DebugEntry(CUTF8String::DecodeUTF8);

    m_hr = S_OK;
    ASSERT(NULL != m_pszUTF8);
    int cchUnicode = 1; // always include a NULL terminator

    LPSTR psz = m_pszUTF8;

    // First determine the destination size (cchUnicode)
    while ('\0' != *psz)
    {
        int cbChar = Utf8CountLeadBits((BYTE) *psz);
        ASSERT(cbChar < 8);
        cbChar = max(1, cbChar);

        // *psz is not NUL here, so this always advances by at least one byte
        // and never steps over the terminator.
        psz += Utf8BytesBeforeNul(psz, cbChar);
        cchUnicode++;
    }

    ASSERT(NULL == m_pwszUnicode);
    m_pwszUnicode = new WCHAR[cchUnicode];
    if (NULL != m_pwszUnicode)
    {
        ASSERT(ALLOC_NONE == m_eAlloc);
        m_eAlloc = ALLOC_UNICODE;

        // Start decoding here:
        LPWSTR pwszStop = m_pwszUnicode + cchUnicode;
        LPWSTR pwszDst = m_pwszUnicode;
        psz = m_pszUTF8;
        while (pwszDst < pwszStop)
        {
            int cbChar = Utf8CountLeadBits((BYTE) *psz);
            ASSERT(cbChar < 8);

            BOOL fValid = TRUE;
            WCHAR wch = L'\0';

            if ((cbChar >= 2) && (cbChar <= 6) &&
                (Utf8BytesBeforeNul(psz, cbChar) < cbChar))
            {
                // The string ends before the sequence is complete.  Report
                // it as a bad sequence without touching the missing bytes.
                fValid = FALSE;
            }
            else
            {
                switch (cbChar)
                {
                    case 6: psz++;      // FALLTHROUGH  // we don't handle
                    case 5: psz++;      // FALLTHROUGH  // UCS-4; skip first
                    case 4: psz++;      // FALLTHROUGH  // three bytes
                    case 3:
                        wch = WCHAR(*psz++ & 0x0f) << 12;   // 0x0800 - 0xffff
                        fValid = fValid && ((*psz & 0xc0) == 0x80);
                        // FALLTHROUGH
                    case 2:
                        wch |= WCHAR(*psz++ & 0x3f) << 6;   // 0x0080 - 0x07ff
                        fValid = fValid && ((*psz & 0xc0) == 0x80);
                        wch |= WCHAR(*psz++ & 0x3f);
                        break;

                    case 0:
                        wch = WCHAR(*psz++);                // 0x0000 - 0x007f
                        break;

                    default:
                        ERROR_OUT(("CUTF8String::DecodeUTF8 found invalid UTF-8 lead byte"));
                        wch = L'?';
                        // Skip the same bytes the sizing pass skipped,
                        // stopping at the terminator.
                        psz += Utf8BytesBeforeNul(psz, cbChar);
                        break;
                }
            }

            if (FALSE == fValid)
            {
                ERROR_OUT(("CUTF8String::DecodeUTF8 found bad UTF-8 sequence"));
                *pwszDst = L'\0';
                m_hr = E_FAIL;
                break;
            }
#ifdef DEBUG
            cchUnicode--;
#endif // DEBUG

            *pwszDst++ = wch;
        }

        // Every counted character must have been produced, unless decoding
        // stopped early on a bad sequence.
        ASSERT((0 == cchUnicode) || (S_OK != m_hr));
    }
    else
    {
        m_hr = E_OUTOFMEMORY;
    }

    DebugExitVOID(CUTF8String::DecodeUTF8);
}
|
||
|
|