/*
 * @(#)CharEncoder.cxx 1.0 6/10/97
 *
 * Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved.
 *
 */
#include "stdinc.h"
#include "core.hxx"
#pragma hdrstop

#include "charencoder.hxx"

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
// NOTE(review): the header names on the three #include lines below were lost
// from this copy of the file (only "#include" survived).  They are restored
// here on a best-effort basis - confirm against the build before enabling
// FUSION_USE_OLD_XML_PARSER_SOURCE.
#include <shlwapip.h>   // IsCharSpace
#ifdef UNIX
#include <lendian.hpp>
#endif

#ifdef UNIX
// Not needed under UNIX
#else
#ifndef _WIN64
#include <w95wraps.h>
#endif // _WIN64
#endif /* UNIX */
#endif

//
// Table of the charsets this file can decode/encode itself.
// Delegate other charsets to mlang.
//
// getCharsetInfo() scans this table from the LAST entry to the first and,
// in the old-parser build, treats the first six rows (index 0..5) specially,
// so do not reorder rows without updating that function.
//
// NOTE(review): the last three rows supply only four initializers; if
// EncodingEntry has a fifth (wide-char-to-multi-byte) member in the old-parser
// build it is left zero-initialized for these rows - confirm this is intended.
//
const EncodingEntry CharEncoder::charsetInfo [] =
{
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    { CP_1250,  _T("WINDOWS-1250"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1251,  _T("WINDOWS-1251"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1252,  _T("WINDOWS-1252"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1253,  _T("WINDOWS-1253"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1254,  _T("WINDOWS-1254"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_1257,  _T("WINDOWS-1257"),       1, wideCharFromMultiByteWin32, wideCharToMultiByteWin32 },
    { CP_UCS_4, _T("UCS-4"),              4, wideCharFromUcs4Bigendian,  wideCharToUcs4Bigendian },
    { CP_UCS_2, _T("ISO-10646-UCS-2"),    2, wideCharFromUcs2Bigendian,  wideCharToUcs2Bigendian },
    { CP_UCS_2, _T("UNICODE-2-0-UTF-16"), 2, wideCharFromUcs2Bigendian,  wideCharToUcs2Bigendian },
    { CP_UCS_2, _T("UTF-16"),             2, wideCharFromUcs2Bigendian,  wideCharToUcs2Bigendian },
    { CP_UTF_8, _T("UNICODE-1-1-UTF-8"),  3, wideCharFromUtf8,           wideCharToUtf8 },
    { CP_UTF_8, _T("UNICODE-2-0-UTF-8"),  3, wideCharFromUtf8,           wideCharToUtf8 },
#endif
    { CP_UCS_2, L"UTF-16", 2, wideCharFromUcs2Bigendian },
    { CP_UCS_2, L"UCS-2",  2, wideCharFromUcs2Bigendian },
    { CP_UTF_8, L"UTF-8",  3, wideCharFromUtf8 },
};

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
IMultiLanguage * CharEncoder::pMultiLanguage = NULL;
#endif

/**
 * Allocates an Encoding and fills it with a private, NUL-terminated copy of
 * the first <code>len</code> characters of <code>s</code>.
 *
 * @param s      charset name; need not be NUL-terminated
 * @param len    number of WCHARs to copy from s
 * @param endian stored in Encoding::littleendian
 * @param mark   stored in Encoding::byteOrderMark
 * @return the new Encoding, or NULL if either allocation fails
 */
Encoding * Encoding::newEncoding(const WCHAR * s, ULONG len, bool endian, bool mark)
{
    //Encoding * e = new Encoding();
    Encoding * e = NEW (Encoding());
    if (e == NULL)
        return NULL;

    e->charset = NEW (WCHAR[len + 1]);
    if (e->charset == NULL)
    {
        delete e;
        return NULL;
    }

    ::memcpy(e->charset, s, sizeof(WCHAR) * len);
    e->charset[len] = 0; // guarantee NULL termination.
    e->littleendian = endian;
    e->byteOrderMark = mark;
    return e;
}

/**
 * Releases the charset name buffer allocated by newEncoding().
 */
Encoding::~Encoding()
{
    if (charset != NULL)
    {
        delete [] charset;
    }
}

/**
 * Looks up a charset name in charsetInfo.
 *
 * @param charset    NUL-terminated charset name to look up
 * @param pcodepage  receives the code page on success
 * @param mCharSize  receives the maximum character size on success
 * @return the index into charsetInfo (>= 0) when the charset is handled
 *         locally, -1 when it was resolved through MLANG (old-parser build
 *         only), or -2 when the charset is not recognized.
 */
int CharEncoder::getCharsetInfo(const WCHAR * charset, CODEPAGE * pcodepage, UINT * mCharSize)
{
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    CPINFO cpinfo;
#endif
    // The length of the requested name does not change across iterations.
    const size_t cchCharset = ::wcslen(charset);

    for (int i = LENGTH(charsetInfo) - 1; i >= 0; i--)
    {
        //if (StrCmpI(charset, charsetInfo[i].charset) == 0)
        // NOTE(review): the trailing 'true' is presumably "case-insensitive"
        // (this call replaced StrCmpI) - confirm against FusionpCompareStrings.
        if (::FusionpCompareStrings(charset, cchCharset,
                                    charsetInfo[i].charset, ::wcslen(charsetInfo[i].charset),
                                    true) == 0)
        {
            //
            // test whether we can handle it locally or not
            // BUGBUG(HACK) the index number may change if we change charsetInfo
            //
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
            // Rows 0..5 are the WINDOWS-125x code pages; they are only usable
            // when GetCPInfo succeeds for that code page.
            if (i > 5 || GetCPInfo(charsetInfo[i].codepage, &cpinfo))
#endif
            {
                *pcodepage = charsetInfo[i].codepage;
                *mCharSize = charsetInfo[i].maxCharSize;
                return i;
            }
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
            else
            {
                break;
            }
#endif
        } // end of if
    } // end of for

    // xiaoyu: It is assumed that an error would return if neither UTF-8 nor UCS-2
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    //
    // delegate to MLANG then
    //
    MIMECSETINFO mimeCharsetInfo;
    HRESULT hr;

    hr = _EnsureMultiLanguage();
    if (hr == S_OK)
    {
        hr = pMultiLanguage->GetCharsetInfo((WCHAR*)charset, &mimeCharsetInfo);
        if (hr == S_OK)
        {
            *pcodepage = mimeCharsetInfo.uiInternetEncoding;
            if (GetCPInfo(*pcodepage, &cpinfo))
                *mCharSize = cpinfo.MaxCharSize;
            else // if we don't know the max size, assume a large size
                *mCharSize = 4;
            return -1;
        }
    }
#endif
    return -2;
}

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
extern HRESULT CreateMultiLanguage(IMultiLanguage ** ppUnk);

/**
 * Obtains the IMultiLanguage instance into pMultiLanguage by calling
 * CreateMultiLanguage (defined elsewhere).
 */
HRESULT CharEncoder::_EnsureMultiLanguage()
{
    return CreateMultiLanguage(&pMultiLanguage);
}
#endif

/**
 * get information about a code page identified by <code> encoding </code>
 *
 * On S_OK, *pcodepage, *mCharSize and *pfnWideCharFromMultiByte describe how
 * to decode bytes of that encoding into WCHARs.  For UCS-2 (and UCS-4 in the
 * old-parser build) the decoder is chosen from encoding->littleendian rather
 * than from the table.  Returns E_FAIL when the charset is not recognized.
 */
HRESULT CharEncoder::getWideCharFromMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage,
                                                  WideCharFromMultiByteFunc ** pfnWideCharFromMultiByte,
                                                  UINT * mCharSize)
{
    HRESULT hr = S_OK;

    int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
    if (i >= 0) // in our short list
    {
        switch (*pcodepage)
        {
        case CP_UCS_2:
            if (encoding->littleendian)
                *pfnWideCharFromMultiByte = wideCharFromUcs2Littleendian;
            else
                *pfnWideCharFromMultiByte = wideCharFromUcs2Bigendian;
            break;
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
        case CP_UCS_4:
            if (encoding->littleendian)
                *pfnWideCharFromMultiByte = wideCharFromUcs4Littleendian;
            else
                *pfnWideCharFromMultiByte = wideCharFromUcs4Bigendian;
            break;
#endif
        default:
            *pfnWideCharFromMultiByte = charsetInfo[i].pfnWideCharFromMultiByte;
            break;
        }
    }
    // xiaoyu : we do not deal this case
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    else if (i == -1) // delegate to MLANG
    {
        hr = pMultiLanguage->IsConvertible(*pcodepage, CP_UCS_2);
        if (S_OK == hr)
            *pfnWideCharFromMultiByte = wideCharFromMultiByteMlang;
        else if (SUCCEEDED(hr))
            // A non-S_OK success code means "not convertible"; the function
            // pointer was not set, so do not report success to the caller
            // (same policy as getWideCharToMultiByteInfo).
            hr = E_FAIL;
    }
#endif
    else // invalid encoding
    {
        hr = E_FAIL;
    }
    return hr;
}

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * get information about a code page identified by <code> encoding </code>
 *
 * Counterpart of getWideCharFromMultiByteInfo for the encoding direction:
 * on S_OK, *pfnWideCharToMultiByte converts WCHARs into bytes of the
 * requested encoding.  Returns E_FAIL when the charset is not recognized or
 * MLANG cannot convert to it.
 */
HRESULT CharEncoder::getWideCharToMultiByteInfo(Encoding * encoding, CODEPAGE * pcodepage,
                                                WideCharToMultiByteFunc ** pfnWideCharToMultiByte,
                                                UINT * mCharSize)
{
    HRESULT hr = S_OK;

    int i = getCharsetInfo(encoding->charset, pcodepage, mCharSize);
    if (i >= 0) // in our short list
    {
        switch (*pcodepage)
        {
        case CP_UCS_2:
            if (encoding->littleendian)
                *pfnWideCharToMultiByte = wideCharToUcs2Littleendian;
            else
                *pfnWideCharToMultiByte = wideCharToUcs2Bigendian;
            break;
        case CP_UCS_4:
            if (encoding->littleendian)
                *pfnWideCharToMultiByte = wideCharToUcs4Littleendian;
            else
                *pfnWideCharToMultiByte = wideCharToUcs4Bigendian;
            break;
        default:
            *pfnWideCharToMultiByte = charsetInfo[i].pfnWideCharToMultiByte;
            break;
        }
    }
    else if (i == -1) // delegate to MLANG
    {
        hr = pMultiLanguage->IsConvertible(CP_UCS_2, *pcodepage);
        if (hr == S_OK)
            *pfnWideCharToMultiByte = wideCharToMultiByteMlang;
        else
            hr = E_FAIL;
    }
    else
    {
        hr = E_FAIL;
    }
    return hr;
}
#endif

/**
 * Scans rawbuffer and translates UTF8 characters into UNICODE characters
 *
 * @param pdwMode    unused
 * @param codepage   unused
 * @param bytebuffer UTF-8 input
 * @param cb         in: number of input bytes available;
 *                   out: number of input bytes consumed
 * @param buffer     output WCHAR buffer
 * @param cch        in: capacity of buffer in WCHARs;
 *                   out: number of WCHARs written
 * @return always S_OK; malformed sequences are reported in-band as 0xFFFF
 *
 * A multi-byte sequence that is cut off by the end of the input is left
 * unconsumed so the caller can present it again once more data is available.
 */
HRESULT CharEncoder::wideCharFromUtf8(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                      UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(pdwMode);
    UNUSED(codepage);
#if 0
    // Just for the record - I tried this and measured it and it's twice as
    // slow as our hand-crafted code.

    // Back up if end of buffer is the second or third byte of a multi-byte
    // encoding since MultiByteToWideChar cannot handle this case. These second
    // and third bytes are easy to identify - they always start with the bit
    // pattern 10xxxxxx.
    UINT remaining = 0;
    UINT count;
    int endpos = (int)*cb;
    while (endpos > 0 && (bytebuffer[endpos-1] & 0xc0) == 0x80)
    {
        endpos--;
        remaining++;
    }
    if (endpos > 0)
    {
        count = MultiByteToWideChar(CP_UTF8, 0, bytebuffer, endpos, buffer, *cch);
        if (count == 0)
        {
            return HRESULT_FROM_WIN32(GetLastError());
        }
    }
#else
    UINT remaining = *cb;
    UINT count = 0;
    UINT max = *cch;
    ULONG ucs4;

    // UTF-8 multi-byte encoding. See Appendix A.2 of the Unicode book for more info.
    //
    // Unicode value     1st byte    2nd byte    3rd byte    4th byte
    // 000000000xxxxxxx  0xxxxxxx
    // 00000yyyyyxxxxxx  110yyyyy    10xxxxxx
    // zzzzyyyyyyxxxxxx  1110zzzz    10yyyyyy    10xxxxxx
    // 110110wwwwzzzzyy+ 11110uuu    10uuzzzz    10yyyyyy    10xxxxxx
    // 110111yyyyxxxxxx, where uuuuu = wwww + 1
    WCHAR c;
    bool valid = true;

    while (remaining > 0 && count < max)
    {
        // This is an optimization for straight runs of 7-bit ascii
        // inside the UTF-8 data.
        c = *bytebuffer;
        if (c & 0x80)   // check 8th-bit and get out of here
            break;      // so we can do proper UTF-8 decoding.
        *buffer++ = c;
        bytebuffer++;
        count++;
        remaining--;
    }

    while (remaining > 0 && count < max)
    {
        // Count the leading 1 bits of the lead byte: that is the length of
        // the sequence (0 leading ones means a plain 7-bit character).
        UINT bytes = 0;
        for (c = *bytebuffer; c & 0x80; c <<= 1)
            bytes++;

        if (bytes == 0)
        {
            bytes = 1;
        }
        else if (bytes == 1 || bytes > 6)
        {
            // 10xxxxxx is a continuation byte with no lead byte, and
            // 0xFE / 0xFF never appear in UTF-8.  Consume exactly this one
            // byte and report it as invalid, so that bytebuffer and
            // 'remaining' stay in step.
            bytes = 1;
            valid = false;
        }

        if (remaining < bytes)
        {
            // Sequence is truncated by the end of the input; leave it for
            // the next call.
            break;
        }

        c = 0;
        switch ( bytes )
        {
        case 6: bytebuffer++;    // We do not handle ucs4 chars
        case 5: bytebuffer++;    // except those that fit in a surrogate pair
            valid = false;
            // fall through
        case 4:
            // Do we have enough buffer?  (a surrogate pair needs two WCHARs)
            if (count >= max - 1)
                goto Cleanup;

            // surrogate pairs
            ucs4 = ULONG(*bytebuffer++ & 0x07) << 18;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 12;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f) << 6;
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            ucs4 |= ULONG(*bytebuffer++ & 0x3f);

            // Only U+10000..U+10FFFF can be written as a surrogate pair.
            // Anything above is outside Unicode; anything below is an
            // overlong encoding and would make (ucs4 - 0x10000) wrap around.
            if (ucs4 < 0x10000 || ucs4 > 0x10ffff)
                valid = false;

            if (valid)
            {
                // first ucs2 char
                *buffer++ = static_cast<WCHAR>((ucs4 - 0x10000) / 0x400 + 0xd800);
                count++;
                // second ucs2 char
                c = static_cast<WCHAR>((ucs4 - 0x10000) % 0x400 + 0xdc00);
            }
            break;

        case 3:
            c  = WCHAR(*bytebuffer++ & 0x0f) << 12;    // 0x0800 - 0xffff
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            // fall through
        case 2:
            c |= WCHAR(*bytebuffer++ & 0x3f) << 6;     // 0x0080 - 0x07ff
            if ((*bytebuffer & 0xc0) != 0x80)
                valid = false;
            c |= WCHAR(*bytebuffer++ & 0x3f);
            // NOTE(review): overlong 2- and 3-byte forms (e.g. C0 80) are
            // still accepted here - decide whether they should be rejected.
            break;

        case 1:
            c = WCHAR(*bytebuffer++);                  // 0x0000 - 0x007f
            break;

        default:
            valid = false; // not a valid UTF-8 character.
            break;
        }

        // If the multibyte sequence was illegal, store a FFFF character code.
        // The Unicode spec says this value may be used as a signal like this.
        // This will be detected later by the parser and an error generated.
        // We don't throw an exception here because the parser would not yet know
        // the line and character where the error occurred and couldn't produce a
        // detailed error message.
        if (! valid)
        {
            c = 0xffff;
            valid = true;
        }

        *buffer++ = c;
        count++;
        remaining -= bytes;
    }
#endif

Cleanup:
    // tell caller that there are bytes remaining in the buffer to
    // be processed next time around when we have more data.
    *cb -= remaining;
    *cch = count;
    return S_OK;
}

/**
 * Scans bytebuffer and translates UCS2 big endian characters into UNICODE characters
 *
 * On return *cch is the number of WCHARs written and *cb the number of bytes
 * consumed (always even; a trailing odd byte is left unconsumed).
 */
HRESULT CharEncoder::wideCharFromUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                               UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    UINT num = *cb >> 1;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
        *buffer++ = static_cast<WCHAR>((bytebuffer[0] << 8) | bytebuffer[1]);
        bytebuffer += 2;
    }

    *cch = num;
    *cb = num << 1;
    return S_OK;
}

/**
 * Scans bytebuffer and translates UCS2 little endian characters into UNICODE characters
 *
 * On return *cch is the number of WCHARs written and *cb the number of bytes
 * consumed (always even; a trailing odd byte is left unconsumed).
 */
HRESULT CharEncoder::wideCharFromUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                                  UINT * cb, WCHAR * buffer, UINT * cch)
{
    UNUSED(codepage);
    UNUSED(pdwMode);

    UINT num = *cb / 2; // Ucs2 is two byte unicode.
    if (num > *cch)
        num = *cch;

#ifndef UNIX
    // Optimization for windows platform where little endian maps directly to WCHAR.
    // (This increases overall parser performance by 5% for large unicode files !!)
    ::memcpy(buffer, bytebuffer, num * sizeof(WCHAR));
#else
    for (UINT i = num; i > 0 ; i--)
    {
        // we want the letter 'a' to be 0x00000061.
        *buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
        bytebuffer += 2;
    }
#endif

    *cch = num;
    *cb = num * 2;
    return S_OK;
}

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans bytebuffer and translates UCS4 big endian characters into UNICODE characters
 *
 * On non-UNIX builds a value that does not fit in 16 bits yields
 * XML_E_INVALID_UNICODE; in that case *cb and *cch are not updated.
 */
HRESULT CharEncoder::wideCharFromUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                               UINT * cb, WCHAR * buffer, UINT * cch)
{
    UINT num = *cb >> 2;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
#ifndef UNIX
        if (*bytebuffer != 0 || *(bytebuffer + 1) != 0)
        {
            return XML_E_INVALID_UNICODE;
        }
        *buffer++ = (*(bytebuffer + 2) << 8) | (*(bytebuffer + 3));
#else
        *buffer++ = ((*bytebuffer)<<24) | (*(bytebuffer+1)<<16) | (*(bytebuffer+2)<<8) | (*(bytebuffer+3));
#endif
        bytebuffer += 4;
    }

    *cch = num;
    *cb = num << 2;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans bytebuffer and translates UCS4 little endian characters into UNICODE characters
 *
 * On non-UNIX builds a value that does not fit in 16 bits yields
 * XML_E_INVALID_UNICODE; in that case *cb and *cch are not updated.
 */
HRESULT CharEncoder::wideCharFromUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                                  UINT * cb, WCHAR * buffer, UINT * cch)
{
    UINT num = *cb >> 2; // Ucs4 is four byte unicode.
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0 ; i--)
    {
#ifndef UNIX
        // Validate before storing, as the big endian variant does, so that an
        // out-of-range value is never written to the output.
        if (*(bytebuffer + 2) != 0 || *(bytebuffer + 3) != 0)
        {
            return XML_E_INVALID_UNICODE;
        }
        *buffer++ = (*(bytebuffer+1)<<8) | (*bytebuffer);
#else
        *buffer++ = (*(bytebuffer+3)<<24) | (*(bytebuffer+2)<<16) | (*(bytebuffer+1)<<8) | (*bytebuffer);
#endif
        bytebuffer += 4;
    }

    *cch = num;
    *cb = num << 2;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans bytebuffer and translates characters of charSet identified by
 * <code> codepage </code> into UNICODE characters,
 * using Win32 function MultiByteToWideChar() for encoding
 *
 * On return *cch is the value returned by MultiByteToWideChar.  When that is
 * zero the last Win32 error is returned as a failure HRESULT.
 */
HRESULT CharEncoder::wideCharFromMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                                UINT * cb, WCHAR * buffer, UINT * cch)
{
    HRESULT hr = S_OK;

    *cch = ::MultiByteToWideChar(codepage, MB_PRECOMPOSED, (char*)bytebuffer, *cb, buffer, *cch);
    if (*cch == 0)
    {
        // GetLastError() yields a Win32 error code, not an HRESULT; returned
        // raw it would look like a success code to SUCCEEDED()/FAILED().
        hr = HRESULT_FROM_WIN32(::GetLastError());
    }
    return hr;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans bytebuffer and translates multibyte characters into UNICODE characters,
 * using Mlang for encoding
 *
 * NOTE(review): checkhr2 is a project macro not visible here; it presumably
 * assigns to the local 'hr' and returns on failure - confirm.
 */
HRESULT CharEncoder::wideCharFromMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, BYTE* bytebuffer,
                                                UINT * cb, WCHAR * buffer, UINT * cch)
{
    HRESULT hr;
    checkhr2(_EnsureMultiLanguage());
    checkhr2(pMultiLanguage->ConvertStringToUnicode(pdwMode, codepage,
                                                    (char*)bytebuffer, cb,
                                                    buffer, cch ));
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into Ucs2 big endian characters
 *
 * On return *cch is the number of WCHARs consumed and *cb the number of bytes
 * written.
 */
HRESULT CharEncoder::wideCharToUcs2Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                             UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT num = (*cb) >> 1;
    if (num > *cch)
        num = *cch;

    // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
    // Currently we just throw away the high WORD - but I don't know how else
    // to do it, since UCS2 is 2-byte unicode by definition.
    for (UINT i = num; i > 0; i--)
    {
        *bytebuffer++ = (*buffer) >> 8;
        *bytebuffer++ = (*buffer++) & 0xFF;
    }

    *cch = num;
    *cb = num << 1;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into Ucs2 little endian characters
 *
 * On return *cch is the number of WCHARs consumed and *cb the number of bytes
 * written.
 */
HRESULT CharEncoder::wideCharToUcs2Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                                UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT num = (*cb) >> 1;
    if (num > *cch)
        num = *cch;

    // BUGBUG - what do we do about Unix where WCHAR is 4 bytes ?
    // Currently we just throw away the high WORD - but I don't know how else
    // to do it, since UCS2 is 2-byte unicode by definition.
#ifndef UNIX
    // Optimization for windows platform where little endian maps directly to WCHAR.
    // (This increases overall parser performance by 5% for large unicode files !!)
    ::memcpy(bytebuffer, buffer, num * sizeof(WCHAR));
#else
    for (UINT i = num; i > 0; i--)
    {
        *bytebuffer++ = (*buffer) & 0xFF;
        *bytebuffer++ = (*buffer++) >> 8;
    }
#endif

    *cch = num;
    *cb = num << 1;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into Ucs4 big endian characters
 *
 * On return *cch is the number of WCHARs consumed and *cb the number of bytes
 * written.
 */
HRESULT CharEncoder::wideCharToUcs4Bigendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                             UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT num = (*cb) >> 2;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
#ifndef UNIX
        *bytebuffer++ = 0;
        *bytebuffer++ = 0;
        *bytebuffer++ = (*buffer) >> 8;
        *bytebuffer++ = (*buffer) & 0xFF;
#else
        *bytebuffer++ = (*buffer) >> 24;
        *bytebuffer++ = ((*buffer) >> 16) & 0xFF;
        *bytebuffer++ = ((*buffer) >> 8) & 0xFF;
        *bytebuffer++ = (*buffer) & 0xFF;
#endif
        buffer++;
    }

    *cch = num;
    *cb = num << 2;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into Ucs4 little endian characters
 *
 * On return *cch is the number of WCHARs consumed and *cb the number of bytes
 * written.
 */
HRESULT CharEncoder::wideCharToUcs4Littleendian(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                                UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    UINT num = (*cb) >> 2;
    if (num > *cch)
        num = *cch;

    for (UINT i = num; i > 0; i--)
    {
#ifndef UNIX
        *bytebuffer++ = (*buffer) & 0xFF;
        *bytebuffer++ = (*buffer) >> 8;
        *bytebuffer++ = 0;
        *bytebuffer++ = 0;
#else
        *bytebuffer++ = (*buffer) & 0xFF;
        *bytebuffer++ = ((*buffer) >> 8) & 0xFF;
        *bytebuffer++ = ((*buffer) >> 16) & 0xFF;
        *bytebuffer++ = (*buffer) >> 24;
#endif
        buffer++;
    }

    *cch = num;
    *cb = num << 2;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into UTF8 characters
 *
 * @param buffer     input WCHARs
 * @param cch        in: number of input WCHARs; out: number consumed
 * @param bytebuffer output byte buffer
 * @param cb         in: capacity of bytebuffer in bytes; out: bytes written
 * @return always S_OK
 *
 * Conversion stops early when the next character would not fit in the output,
 * or when the last input WCHAR is the first half of a surrogate pair (it is
 * left unconsumed so the caller can supply the second half next time).
 */
HRESULT CharEncoder::wideCharToUtf8(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                    UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    const UINT num = *cch;
    const UINT capacity = *cb;
    UINT count = 0;          // bytes written so far; never exceeds capacity
    DWORD dw1 = 0;           // pending first half of a surrogate pair
    bool surrogate = false;  // true while dw1 is pending
    UINT i;                  // declared here because it is read after the loop

    // Space checks are written as (capacity - count < needed).  count never
    // exceeds capacity, so the subtraction cannot wrap - unlike precomputed
    // "capacity - 3" style limits, which wrap when capacity is below 3.
    for (i = num; i > 0; i--)
    {
#ifdef UNIX
        // Solaris a WCHAR is 4 bytes (DWORD)
        DWORD dw = 0;
        DWORD dwTemp[4];
        BYTE* pByte = (BYTE*)buffer;
        dwTemp[3] = (DWORD)pByte[0];
        dwTemp[2] = (DWORD)pByte[1];
        dwTemp[1] = (DWORD)pByte[2];
        dwTemp[0] = (DWORD)pByte[3];
        dw = dwTemp[0]+(dwTemp[1]<<8)+(dwTemp[2]<<16)+(dwTemp[3]<<24);
#else
        DWORD dw = *buffer;
#endif

        if (surrogate) // is it the second char of a surrogate pair?
        {
            if (dw >= 0xdc00 && dw <= 0xdfff)
            {
                // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (capacity - count < 4)
                    break;
                count += 4;

                ULONG ucs4 = (dw1 - 0xd800) * 0x400 + (dw - 0xdc00) + 0x10000;
                *bytebuffer++ = static_cast<BYTE>(( ucs4 >> 18) | 0xF0);
                *bytebuffer++ = static_cast<BYTE>((( ucs4 >> 12) & 0x3F) | 0x80);
                *bytebuffer++ = static_cast<BYTE>((( ucs4 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = static_cast<BYTE>(( ucs4 & 0x3F) | 0x80);
                surrogate = false;
                buffer++;
                continue;
            }
            else // Then dw1 must be a three byte character
            {
                if (capacity - count < 3)
                    break;
                count += 3;

                *bytebuffer++ = static_cast<BYTE>(( dw1 >> 12) | 0xE0);
                *bytebuffer++ = static_cast<BYTE>((( dw1 >> 6) & 0x3F) | 0x80);
                *bytebuffer++ = static_cast<BYTE>(( dw1 & 0x3F) | 0x80);
            }
            surrogate = false;
        }

        if (dw < 0x80) // one byte, 0xxxxxxx
        {
            if (capacity - count < 1)
                break;
            count++;

            *bytebuffer++ = static_cast<BYTE>(dw);
        }
        else if ( dw < 0x800) // two bytes, 110xxxxx 10xxxxxx
        {
            if (capacity - count < 2)
                break;
            count += 2;

            *bytebuffer++ = static_cast<BYTE>((dw >> 6) | 0xC0);
            *bytebuffer++ = static_cast<BYTE>((dw & 0x3F) | 0x80);
        }
        else if (dw >= 0xd800 && dw <= 0xdbff) // Assume that it is the first char of surrogate pair
        {
            if (i == 1) // last wchar in buffer
                break;
            dw1 = dw;
            surrogate = true;
        }
        else // three bytes, 1110xxxx 10xxxxxx 10xxxxxx
        {
            if (capacity - count < 3)
                break;
            count += 3;

            *bytebuffer++ = static_cast<BYTE>(( dw >> 12) | 0xE0);
            *bytebuffer++ = static_cast<BYTE>((( dw >> 6) & 0x3F) | 0x80);
            *bytebuffer++ = static_cast<BYTE>(( dw & 0x3F) | 0x80);
        }
        buffer++;
    }

    // If we stopped with the first half of a pair still pending, that WCHAR
    // has not been written out, so do not report it as consumed.
    *cch = surrogate ? num - i - 1 : num - i;
    *cb = count;
    return S_OK;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into characters identified
 * by <code> codepage </code>, using Win32 function WideCharToMultiByte for encoding
 *
 * On return *cb is the value returned by WideCharToMultiByte.  When that is
 * zero the last Win32 error is returned as a failure HRESULT; S_FALSE is
 * returned when the API reports that a default character was substituted.
 */
HRESULT CharEncoder::wideCharToMultiByteWin32(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                              UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr = S_OK;
    BOOL fBadChar = FALSE;

    *cb = ::WideCharToMultiByte(codepage, 0, buffer, *cch, (char*)bytebuffer, *cb, NULL, &fBadChar);
    if (*cb == 0)
    {
        // GetLastError() yields a Win32 error code, not an HRESULT; returned
        // raw it would look like a success code to SUCCEEDED()/FAILED().
        hr = HRESULT_FROM_WIN32(::GetLastError());
    }
    else if (fBadChar)
    {
        // BUGBUG: how do we inform the caller which character failed?
        hr = S_FALSE;
    }
    return hr;
}
#endif

#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
/**
 * Scans buffer and translates Unicode characters into characters of charSet
 * identified by <code> codepage </code>, using Mlang for encoding
 *
 * NOTE(review): checkhr2 is a project macro not visible here; it presumably
 * assigns to the local 'hr' and returns on failure - confirm.
 */
HRESULT CharEncoder::wideCharToMultiByteMlang(DWORD* pdwMode, CODEPAGE codepage, WCHAR * buffer,
                                              UINT *cch, BYTE* bytebuffer, UINT * cb)
{
    HRESULT hr;
    checkhr2(_EnsureMultiLanguage());
    checkhr2(pMultiLanguage->ConvertStringFromUnicode(pdwMode, codepage,
                                                      buffer, cch, (char*)bytebuffer, cb ));
    return S_OK;
}
#endif