819 lines
25 KiB
C++
819 lines
25 KiB
C++
/*
|
|
* @(#)CharEncoder.cxx 1.0 6/10/97
|
|
*
|
|
* Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
|
|
*/
|
|
#include "stdinc.h"
|
|
#include "core.hxx"
|
|
#pragma hdrstop
|
|
|
|
#include "charencoder.hxx"
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
#include <shlwapip.h> // IsCharSpace
|
|
#ifdef UNIX
|
|
#include <lendian.hpp>
|
|
#endif
|
|
|
|
#ifdef UNIX
|
|
// Not needed under UNIX
|
|
#else
|
|
#ifndef _WIN64
|
|
#include <w95wraps.h>
|
|
#endif // _WIN64
|
|
#endif /* UNIX */
|
|
#endif
|
|
|
|
//
|
|
// Delegate other charsets to mlang
|
|
//
|
|
const EncodingEntry CharEncoder::charsetInfo [] =
|
|
{
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
{ CP_1250, _T("WINDOWS-1250"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_1251, _T("WINDOWS-1251"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_1252, _T("WINDOWS-1252"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_1253, _T("WINDOWS-1253"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_1254, _T("WINDOWS-1254"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_1257, _T("WINDOWS-1257"), 1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
|
|
{ CP_UCS_4, _T("UCS-4"), 4, wideCharFromUcs4Bigendian, wideCharToUcs4Bigendian },
|
|
{ CP_UCS_2, _T("ISO-10646-UCS-2"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
|
|
{ CP_UCS_2, _T("UNICODE-2-0-UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
|
|
{ CP_UCS_2, _T("UTF-16"), 2, wideCharFromUcs2Bigendian, wideCharToUcs2Bigendian },
|
|
{ CP_UTF_8, _T("UNICODE-1-1-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
|
|
{ CP_UTF_8, _T("UNICODE-2-0-UTF-8"), 3, wideCharFromUtf8, wideCharToUtf8 },
|
|
#endif
|
|
{ CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
|
|
{ CP_UCS_2, L"UCS-2", 2, wideCharFromUcs2Bigendian },
|
|
{ CP_UTF_8, L"UTF-8", 3, wideCharFromUtf8 },
|
|
};
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
// MLang interface pointer shared by all CharEncoder calls; starts out NULL and
// is handed to CreateMultiLanguage() by _EnsureMultiLanguage().
IMultiLanguage * CharEncoder::pMultiLanguage = NULL;
|
|
#endif
|
|
|
|
/**
 * Allocates an Encoding and gives it a private, NUL-terminated copy of the
 * first <code> len </code> characters of <code> s </code>.
 *
 * endian is stored in littleendian and mark in byteOrderMark.
 * Returns NULL when either allocation yields NULL.
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    Encoding * result = NEW (Encoding());
    if (result != NULL)
    {
        result->charset = NEW (WCHAR[len + 1]);
        if (result->charset != NULL)
        {
            // Copy exactly len characters, then terminate explicitly so the
            // name is a valid C string even if s was not terminated at len.
            ::memcpy(result->charset, s, sizeof(WCHAR) * len);
            result->charset[len] = 0;
            result->littleendian = endian;
            result->byteOrderMark = mark;
            return result;
        }

        // The name buffer could not be allocated; discard the half-built object.
        delete result;
    }
    return NULL;
}
|
|
|
|
/**
 * Releases the charset name buffer, if one is attached.
 */
Encoding::~Encoding()
{
    if (charset == NULL)
        return;

    delete [] charset;
}
|
|
|
|
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
|
|
{
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
CPINFO cpinfo;
|
|
|
|
#endif
|
|
|
|
|
|
for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
|
|
{
|
|
//if (StrCmpI(charset, charsetInfo[i].charset) == 0)
|
|
if (::FusionpCompareStrings(charset, ::wcslen(charset), charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset), true) == 0)
|
|
{
|
|
//
|
|
// test whether we can handle it locally or not
|
|
// BUGBUG(HACK) the index number may change if we change charsetInfo
|
|
//
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
if (i > 5 || GetCPInfo(charsetInfo[i].codepage, &cpinfo))
|
|
#endif
|
|
{
|
|
*pcodepage = charsetInfo[i].codepage;
|
|
*mCharSize = charsetInfo[i].maxCharSize;
|
|
return i;
|
|
}
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
#endif
|
|
} // end of if
|
|
}// end of for
|
|
// xiaoyu: It is assumed that an error would return if neither UTF-8 nor UCS-2
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
//
|
|
// delegate to MLANG then
|
|
//
|
|
MIMECSETINFO mimeCharsetInfo;
|
|
HRESULT hr;
|
|
|
|
hr = _EnsureMultiLanguage();
|
|
if (hr == S_OK)
|
|
{
|
|
hr = pMultiLanguage->GetCharsetInfo((WCHAR*)charset, &mimeCharsetInfo);
|
|
if (hr == S_OK)
|
|
{
|
|
*pcodepage = mimeCharsetInfo.uiInternetEncoding;
|
|
if (GetCPInfo(*pcodepage, &cpinfo))
|
|
*mCharSize = cpinfo.MaxCharSize;
|
|
else // if we don't know the max size, assume a large size
|
|
*mCharSize = 4;
|
|
return -1;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
return -2;
|
|
}
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
// Defined outside this file.
extern HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk);

/**
 * Passes the shared pMultiLanguage slot to CreateMultiLanguage() and returns
 * its result unchanged.
 * NOTE(review): whether CreateMultiLanguage() reuses an already-created
 * object is not visible from here - confirm before relying on it.
 */
HRESULT CharEncoder::_EnsureMultiLanguage()
{
    const HRESULT hrCreate = CreateMultiLanguage(&pMultiLanguage);
    return hrCreate;
}
|
|
#endif
|
|
|
|
/**
|
|
* get information about a code page identified by <code> encoding </code>
|
|
*/
|
|
HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte, UINT * mCharSize)
|
|
{
|
|
HRESULT hr = S_OK;
|
|
|
|
int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
|
|
if (i >= 0) // in our short list
|
|
{
|
|
switch (*pcodepage)
|
|
{
|
|
case CP_UCS_2:
|
|
if (encoding->littleendian)
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
|
|
else
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
|
|
break;
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
case CP_UCS_4:
|
|
if (encoding->littleendian)
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs4Littleendian;
|
|
else
|
|
*pfnWideCharFromMultiByte = wideCharFromUcs4Bigendian;
|
|
break;
|
|
#endif
|
|
default:
|
|
*pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
|
|
break;
|
|
}
|
|
}
|
|
// xiaoyu : we do not deal this case
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
else if (i == -1) // delegate to MLANG
|
|
{
|
|
hr = pMultiLanguage->IsConvertible(*pcodepage, CP_UCS_2);
|
|
if (S_OK == hr)
|
|
*pfnWideCharFromMultiByte = wideCharFromMultiByteMlang;
|
|
}
|
|
#endif
|
|
else // invalid encoding
|
|
{
|
|
hr = E_FAIL;
|
|
}
|
|
return hr;
|
|
}
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* get information about a code page identified by <code> encoding </code>
|
|
*/
|
|
HRESULT CharEncoder::getWideCharToMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage, WideCharToMultiByteFunc ** pfnWideCharToMultiByte, UINT * mCharSize)
|
|
{
|
|
HRESULT hr = S_OK;
|
|
|
|
int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
|
|
if (i >= 0) // in our short list
|
|
{
|
|
switch (*pcodepage)
|
|
{
|
|
case CP_UCS_2:
|
|
if (encoding->littleendian)
|
|
*pfnWideCharToMultiByte = wideCharToUcs2Littleendian;
|
|
else
|
|
*pfnWideCharToMultiByte = wideCharToUcs2Bigendian;
|
|
break;
|
|
case CP_UCS_4:
|
|
if (encoding->littleendian)
|
|
*pfnWideCharToMultiByte = wideCharToUcs4Littleendian;
|
|
else
|
|
*pfnWideCharToMultiByte = wideCharToUcs4Bigendian;
|
|
break;
|
|
default:
|
|
*pfnWideCharToMultiByte = charsetInfo[i].pfnWideCharToMultiByte;
|
|
break;
|
|
}
|
|
}
|
|
else if (i == -1) // delegate to MLANG
|
|
{
|
|
hr = pMultiLanguage->IsConvertible(CP_UCS_2, *pcodepage);
|
|
if (hr == S_OK)
|
|
*pfnWideCharToMultiByte = wideCharToMultiByteMlang;
|
|
else
|
|
hr = E_FAIL;
|
|
}
|
|
else
|
|
{
|
|
hr = E_FAIL;
|
|
}
|
|
|
|
return hr;
|
|
}
|
|
#endif
|
|
|
|
/**
|
|
* Scans rawbuffer and translates UTF8 characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
|
|
UNUSED(pdwMode);
|
|
UNUSED(codepage);
|
|
#if 0
|
|
// Just for the record - I tried this and measured it and it's twice as
|
|
// slow as our hand-crafted code.
|
|
|
|
// Back up if end of buffer is the second or third byte of a multi-byte
|
|
// encoding since MultiByteToWideChar cannot handle this case. These second
|
|
// and third bytes are easy to identify - they always start with the bit
|
|
// pattern 0x10xxxxxx.
|
|
|
|
UINT remaining = 0;
|
|
UINT count;
|
|
int endpos = (int)*cb;
|
|
|
|
while (endpos > 0 && (bytebuffer[endpos-1] & 0xc0) == 0x80)
|
|
{
|
|
endpos--;
|
|
remaining++;
|
|
}
|
|
if (endpos > 0)
|
|
{
|
|
count = MultiByteToWideChar(CP_UTF8, 0, bytebuffer, endpos, buffer, *cch);
|
|
if (count == 0)
|
|
{
|
|
return HRESULT_FROM_WIN32(GetLastError());
|
|
}
|
|
}
|
|
#else
|
|
UINT remaining = *cb;
|
|
UINT count = 0;
|
|
UINT max = *cch;
|
|
ULONG ucs4;
|
|
|
|
// UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
|
|
//
|
|
// Unicode value 1st byte 2nd byte 3rd byte 4th byte
|
|
// 000000000xxxxxxx 0xxxxxxx
|
|
// 00000yyyyyxxxxxx 110yyyyy 10xxxxxx
|
|
// zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
|
|
// 110110wwwwzzzzyy+ 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
|
|
// 110111yyyyxxxxxx, where uuuuu = wwww + 1
|
|
WCHAR c;
|
|
bool valid = true;
|
|
|
|
while (remaining > 0 && count < max)
|
|
{
|
|
// This is an optimization for straight runs of 7-bit ascii
|
|
// inside the UTF-8 data.
|
|
c = *bytebuffer;
|
|
if (c & 0x80) // check 8th-bit and get out of here
|
|
break; // so we can do proper UTF-8 decoding.
|
|
*buffer++ = c;
|
|
bytebuffer++;
|
|
count++;
|
|
remaining--;
|
|
}
|
|
|
|
while (remaining > 0 && count < max)
|
|
{
|
|
UINT bytes = 0;
|
|
for (c = *bytebuffer; c & 0x80; c <<= 1)
|
|
bytes++;
|
|
|
|
if (bytes == 0)
|
|
bytes = 1;
|
|
|
|
if (remaining < bytes)
|
|
{
|
|
break;
|
|
}
|
|
|
|
c = 0;
|
|
switch ( bytes )
|
|
{
|
|
case 6: bytebuffer++; // We do not handle ucs4 chars
|
|
case 5: bytebuffer++; // except those on plane 1
|
|
valid = false;
|
|
// fall through
|
|
case 4:
|
|
// Do we have enough buffer?
|
|
if (count >= max - 1)
|
|
goto Cleanup;
|
|
|
|
// surrogate pairs
|
|
ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
ucs4 |= ULONG(*bytebuffer++ & 0x3f);
|
|
|
|
// For non-BMP code values of ISO/IEC 10646,
|
|
// only those in plane 1 are valid xml characters
|
|
if (ucs4 > 0x10ffff)
|
|
valid = false;
|
|
|
|
if (valid)
|
|
{
|
|
// first ucs2 char
|
|
*buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
|
|
count++;
|
|
// second ucs2 char
|
|
c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
|
|
}
|
|
break;
|
|
|
|
case 3: c = WCHAR(*bytebuffer++ & 0x0f) << 12; // 0x0800 - 0xffff
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
// fall through
|
|
case 2: c |= WCHAR(*bytebuffer++ & 0x3f) << 6; // 0x0080 - 0x07ff
|
|
if ((*bytebuffer & 0xc0) != 0x80)
|
|
valid = false;
|
|
c |= WCHAR(*bytebuffer++ & 0x3f);
|
|
break;
|
|
|
|
case 1:
|
|
c = WCHAR(*bytebuffer++); // 0x0000 - 0x007f
|
|
break;
|
|
|
|
default:
|
|
valid = false; // not a valid UTF-8 character.
|
|
break;
|
|
}
|
|
|
|
// If the multibyte sequence was illegal, store a FFFF character code.
|
|
// The Unicode spec says this value may be used as a signal like this.
|
|
// This will be detected later by the parser and an error generated.
|
|
// We don't throw an exception here because the parser would not yet know
|
|
// the line and character where the error occurred and couldn't produce a
|
|
// detailed error message.
|
|
|
|
if (! valid)
|
|
{
|
|
c = 0xffff;
|
|
valid = true;
|
|
}
|
|
|
|
*buffer++ = c;
|
|
count++;
|
|
remaining -= bytes;
|
|
}
|
|
#endif
|
|
|
|
Cleanup:
|
|
// tell caller that there are bytes remaining in the buffer to
|
|
// be processed next time around when we have more data.
|
|
*cb -= remaining;
|
|
*cch = count;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/**
|
|
* Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UNUSED(codepage);
|
|
UNUSED(pdwMode);
|
|
|
|
UINT num = *cb >> 1;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
*buffer++ = ((*bytebuffer) << 8) | (*(bytebuffer + 1));
|
|
bytebuffer += 2;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 1;
|
|
return S_OK;
|
|
}
|
|
|
|
|
|
/**
|
|
* Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UNUSED(codepage);
|
|
UNUSED(pdwMode);
|
|
|
|
UINT num = *cb / 2; // Ucs2 is two byte unicode.
|
|
if (num > *cch)
|
|
num = *cch;
|
|
|
|
|
|
#ifndef UNIX
|
|
// Optimization for windows platform where little endian maps directly to WCHAR.
|
|
// (This increases overall parser performance by 5% for large unicode files !!)
|
|
::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
|
|
#else
|
|
for (UINT i = num; i > 0 ; i--)
|
|
{
|
|
// we want the letter 'a' to be 0x0000006a.
|
|
*buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
|
|
bytebuffer += 2;
|
|
}
|
|
#endif
|
|
*cch = num;
|
|
*cb = num * 2;
|
|
return S_OK;
|
|
}
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans bytebuffer and translates UCS4 big endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UINT num = *cb >> 2;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
#ifndef UNIX
|
|
if (*bytebuffer != 0 || *(bytebuffer + 1) != 0)
|
|
{
|
|
return XML_E_INVALID_UNICODE;
|
|
}
|
|
*buffer++ = (*(bytebuffer + 2) << 8) | (*(bytebuffer + 3));
|
|
#else
|
|
*buffer++ = ((*bytebuffer)<<24) | (*(bytebuffer+1)<<16) | (*(bytebuffer+2)<<8) | (*(bytebuffer+3));
|
|
#endif
|
|
bytebuffer += 4;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 2;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans bytebuffer and translates UCS4 little endian characters into UNICODE characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
UINT num = *cb >> 2; // Ucs4 is two byte unicode.
|
|
if (num > *cch)
|
|
num = *cch;
|
|
for (UINT i = num; i > 0 ; i--)
|
|
{
|
|
#ifndef UNIX
|
|
*buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
|
|
if (*(bytebuffer + 2) != 0 || *(bytebuffer + 3) != 0)
|
|
{
|
|
return XML_E_INVALID_UNICODE;
|
|
}
|
|
#else
|
|
*buffer++ = (*(bytebuffer+3)<<24) | (*(bytebuffer+2)<<16) | (*(bytebuffer+1)<<8) | (*bytebuffer);
|
|
#endif
|
|
bytebuffer += 4;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 2;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans bytebuffer and translates characters of charSet identified by
|
|
* <code> codepage </code> into UNICODE characters,
|
|
* using Win32 function MultiByteToWideChar() for encoding
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
HRESULT hr = S_OK;
|
|
*cch = ::MultiByteToWideChar(codepage, MB_PRECOMPOSED,
|
|
(char*)bytebuffer, *cb,
|
|
buffer, *cch);
|
|
if (*cch == 0)
|
|
hr = GetLastError();
|
|
return hr;
|
|
}
|
|
#endif
|
|
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans bytebuffer and translates multibyte characters into UNICODE characters,
|
|
* using Mlang for encoding
|
|
*/
|
|
HRESULT CharEncoder::wideCharFromMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
|
|
UINT * cb, WCHAR * buffer, UINT * cch)
|
|
{
|
|
HRESULT hr;
|
|
checkhr2(_EnsureMultiLanguage());
|
|
checkhr2(pMultiLanguage->ConvertStringToUnicode(pdwMode, codepage,
|
|
(char*)bytebuffer, cb,
|
|
buffer, cch ));
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into Ucs2 big endian characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharToUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
UINT num = (*cb) >> 1;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
// BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
|
|
// Currently we just throw away the high WORD - but I don't know how else
|
|
// to do it, since UCS2 is 2-byte unicode by definition.
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
*bytebuffer++ = (*buffer) >> 8;
|
|
*bytebuffer++ = (*buffer++) & 0xFF;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 1;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into Ucs2 little endian characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharToUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
UINT num = (*cb) >> 1;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
|
|
// BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
|
|
// Currently we just throw away the high WORD - but I don't know how else
|
|
// to do it, since UCS2 is 2-byte unicode by definition.
|
|
#ifndef UNIX
|
|
// Optimization for windows platform where little endian maps directly to WCHAR.
|
|
// (This increases overall parser performance by 5% for large unicode files !!)
|
|
::memcpy(bytebuffer, buffer, num * sizeof(WCHAR));
|
|
#else
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
*bytebuffer++ = (*buffer) & 0xFF;
|
|
*bytebuffer++ = (*buffer++) >> 8;
|
|
}
|
|
#endif
|
|
*cch = num;
|
|
*cb = num << 1;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into Ucs4 big endian characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharToUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
UINT num = (*cb) >> 2;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
#ifndef UNIX
|
|
*bytebuffer++ = 0;
|
|
*bytebuffer++ = 0;
|
|
*bytebuffer++ = (*buffer) >> 8;
|
|
*bytebuffer++ = (*buffer) & 0xFF;
|
|
#else
|
|
*bytebuffer++ = (*buffer) >> 24;
|
|
*bytebuffer++ = ((*buffer) >> 16) & 0xFF;
|
|
*bytebuffer++ = ((*buffer) >> 8) & 0xFF;
|
|
*bytebuffer++ = (*buffer) & 0xFF;
|
|
#endif
|
|
buffer++;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 2;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into Ucs4 little endian characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
UINT num = (*cb) >> 2;
|
|
if (num > *cch)
|
|
num = *cch;
|
|
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
#ifndef UNIX
|
|
*bytebuffer++ = (*buffer) & 0xFF;
|
|
*bytebuffer++ = (*buffer) >> 8;
|
|
*bytebuffer++ = 0;
|
|
*bytebuffer++ = 0;
|
|
#else
|
|
*bytebuffer++ = (*buffer) & 0xFF;
|
|
*bytebuffer++ = ((*buffer) >> 8) & 0xFF;
|
|
*bytebuffer++ = ((*buffer) >> 16) & 0xFF;
|
|
*bytebuffer++ = (*buffer) >> 24;
|
|
#endif
|
|
buffer++;
|
|
}
|
|
*cch = num;
|
|
*cb = num << 2;
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into UTF8 characters
|
|
*/
|
|
HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
UINT count = 0, num = *cch, m1 = *cb, m2 = m1 - 1, m3 = m2 - 1, m4 = m3 - 1;
|
|
DWORD dw1;
|
|
bool surrogate = false;
|
|
|
|
for (UINT i = num; i > 0; i--)
|
|
{
|
|
#ifdef UNIX
|
|
// Solaris a WCHAR is 4 bytes (DWORD)
|
|
DWORD dw = 0;
|
|
DWORD dwTemp[4];
|
|
BYTE* pByte = (BYTE*)buffer;
|
|
dwTemp[3] = (DWORD)pByte[0];
|
|
dwTemp[2] = (DWORD)pByte[1];
|
|
dwTemp[1] = (DWORD)pByte[2];
|
|
dwTemp[0] = (DWORD)pByte[3];
|
|
dw = dwTemp[0]+(dwTemp[1]<<8)+(dwTemp[2]<<16)+(dwTemp[3]<<24);
|
|
#else
|
|
DWORD dw = *buffer;
|
|
#endif
|
|
|
|
if (surrogate) // is it the second char of a surrogate pair?
|
|
{
|
|
if (dw >= 0xdc00 && dw <= 0xdfff)
|
|
{
|
|
// four bytes 0x11110xxx 0x10xxxxxx 0x10xxxxxx 0x10xxxxxx
|
|
if (count < m4)
|
|
count += 4;
|
|
else
|
|
break;
|
|
ULONG ucs4 = (dw1 - 0xd800) * 0x400 + (dw - 0xdc00) + 0x10000;
|
|
*bytebuffer++ = (byte)(( ucs4 >> 18) | 0xF0);
|
|
*bytebuffer++ = (byte)((( ucs4 >> 12) & 0x3F) | 0x80);
|
|
*bytebuffer++ = (byte)((( ucs4 >> 6) & 0x3F) | 0x80);
|
|
*bytebuffer++ = (byte)(( ucs4 & 0x3F) | 0x80);
|
|
surrogate = false;
|
|
buffer++;
|
|
continue;
|
|
}
|
|
else // Then dw1 must be a three byte character
|
|
{
|
|
if (count < m3)
|
|
count += 3;
|
|
else
|
|
break;
|
|
*bytebuffer++ = (byte)(( dw1 >> 12) | 0xE0);
|
|
*bytebuffer++ = (byte)((( dw1 >> 6) & 0x3F) | 0x80);
|
|
*bytebuffer++ = (byte)(( dw1 & 0x3F) | 0x80);
|
|
}
|
|
surrogate = false;
|
|
}
|
|
|
|
if (dw < 0x80) // one byte, 0xxxxxxx
|
|
{
|
|
if (count < m1)
|
|
count++;
|
|
else
|
|
break;
|
|
*bytebuffer++ = (byte)dw;
|
|
}
|
|
else if ( dw < 0x800) // two WORDS, 110xxxxx 10xxxxxx
|
|
{
|
|
if (count < m2)
|
|
count += 2;
|
|
else
|
|
break;
|
|
*bytebuffer++ = (byte)((dw >> 6) | 0xC0);
|
|
*bytebuffer++ = (byte)((dw & 0x3F) | 0x80);
|
|
}
|
|
else if (dw >= 0xd800 && dw <= 0xdbff) // Assume that it is the first char of surrogate pair
|
|
{
|
|
if (i == 1) // last wchar in buffer
|
|
break;
|
|
dw1 = dw;
|
|
surrogate = true;
|
|
}
|
|
else // three bytes, 1110xxxx 10xxxxxx 10xxxxxx
|
|
{
|
|
if (count < m3)
|
|
count += 3;
|
|
else
|
|
break;
|
|
*bytebuffer++ = (byte)(( dw >> 12) | 0xE0);
|
|
*bytebuffer++ = (byte)((( dw >> 6) & 0x3F) | 0x80);
|
|
*bytebuffer++ = (byte)(( dw & 0x3F) | 0x80);
|
|
}
|
|
buffer++;
|
|
}
|
|
|
|
*cch = surrogate ? num - i - 1 : num - i;
|
|
*cb = count;
|
|
|
|
return S_OK;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into characters identified
|
|
* by <code> codepage </>, using Win32 function WideCharToMultiByte for encoding
|
|
*/
|
|
HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
HRESULT hr = S_OK;
|
|
BOOL fBadChar = false;
|
|
*cb = ::WideCharToMultiByte(codepage, NULL, buffer, *cch, (char*)bytebuffer, *cb, NULL, &fBadChar);
|
|
if (*cb == 0)
|
|
hr = ::GetLastError();
|
|
else if (fBadChar)
|
|
// BUGBUG: how do we inform the caller which character failed?
|
|
hr = S_FALSE;
|
|
return hr;
|
|
}
|
|
#endif
|
|
|
|
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
|
|
/**
|
|
* Scans buffer and translates Unicode characters into characters of charSet
|
|
* identified by <code> codepage </code>, using Mlang for encoding
|
|
*/
|
|
HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
|
|
UINT *cch, BYTE* bytebuffer, UINT * cb)
|
|
{
|
|
HRESULT hr;
|
|
checkhr2(_EnsureMultiLanguage());
|
|
checkhr2(pMultiLanguage->ConvertStringFromUnicode(pdwMode, codepage,
|
|
buffer, cch, (char*)bytebuffer, cb ));
|
|
return S_OK;
|
|
}
|
|
#endif
|