#include "private.h" #include "detcbase.h" #include "codepage.h" #include "detcjpn.h" #include "detckrn.h" #include "fechrcnv.h" #include "ichrcnv.h" #include "cpdetect.h" #define CONV_UU 12 #define CONV_UUW 10 #define CONV_UUWI 9 #define CONV_UW 6 #define CONV_UWI 5 #define CONV_WI 3 #define MAX_CHAR_SIZE 4 #define MAPUSERDEF(x) (((x) == 50000) ? 1252 : (x)) #define CONVERT_IS_VALIDCODEPAGE(x) (((x) == CP_USER_DEFINED) ? TRUE: IsValidCodePage(x)) #define CONV_CHK_NLS 0x00000001 struct ENCODINGINFO { DWORD dwEncoding; DWORD dwCodePage; BYTE bTypeUUIW; CP_STATE nCP_State ; // whether this is a valid windows codepage ? DWORD dwFlags; // give us more flexibilities to handle different encodings differently }; static WCHAR UniocdeSignature = { 0xFFFE } ; /* Bit 4 (16) - Unicode <-> Internet Encoding Bit 3 (8) - UTF8, UTF7 Bit 2 (4) - Unicode Bit 1 (2) - Windows CodePage Bit 0 (1) - Internet Encoding P.S. if bit 4 is set, it means it should convert between Unicode and Internet Encoding directly, no intermediate step - Windows CodePage */ // these codepages including Unicode need special convertor static struct ENCODINGINFO aEncodingInfo[] = { { CP_JPN_SJ, 932, 0x02, INVALID_CP, 0 }, // W-Japanese Shift JIS { CP_CHN_GB, 936, 0x02, INVALID_CP, 0 }, // W-Simplified Chinese { CP_KOR_5601, 949, 0x02, INVALID_CP, 0 }, // W-Krean Unified Hangul { CP_TWN, 950, 0x02, INVALID_CP, 0 }, // W-Traditional Chinese { CP_UCS_2, 0, 0x04, INVALID_CP, 0 }, // U-Unicode { CP_UCS_2_BE, 0, 0x04, INVALID_CP, 0 }, // U-Unicode Big Endian { CP_1252, 1252, 0x02, INVALID_CP, 0 }, // W-Latin 1 { CP_20127, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // US ASCII { CP_ISO_8859_1, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1 { CP_ISO_8859_15, 1252, 0x11, INVALID_CP, CONV_CHK_NLS }, // I-ISO 8859-1 Latin 1 { CP_AUTO, 1252, 0x01, INVALID_CP, 0 }, // General auto detect { CP_ISO_2022_JP, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP No Halfwidth Katakana { CP_ISO_2022_JP_ESC, 932, 0x01, 
INVALID_CP, 0 }, // I-ISO 2022-JP w/esc Halfwidth Katakana { CP_ISO_2022_JP_SIO, 932, 0x01, INVALID_CP, 0 }, // I-ISO 2022-JP w/sio Halfwidth Katakana { CP_ISO_2022_KR, 949, 0x01, INVALID_CP, 0 }, // I-ISO 2022-KR { CP_ISO_2022_TW, 950, 0x01, INVALID_CP, 0 }, // I-ISO 2022-TW { CP_ISO_2022_CH, 936, 0x01, INVALID_CP, 0 }, // I-ISO 2022-CH { CP_JP_AUTO, 932, 0x01, INVALID_CP, 0 }, // JP auto detect { CP_CHS_AUTO, 936, 0x01, INVALID_CP, 0 }, // Simplified Chinese auto detect { CP_KR_AUTO, 949, 0x01, INVALID_CP, 0 }, // KR auto detect { CP_CHT_AUTO, 950, 0x01, INVALID_CP, 0 }, // Traditional Chinese auto detect { CP_CYRILLIC_AUTO, 1251, 0x01, INVALID_CP, 0 }, // Cyrillic auto detect { CP_GREEK_AUTO, 1253, 0x01, INVALID_CP, 0 }, // Greek auto detect { CP_ARABIC_AUTO, 1256, 0x01, INVALID_CP, 0 }, // Arabic auto detect { CP_EUC_JP, 932, 0x01, INVALID_CP, 0 }, // EUC Japanese { CP_EUC_CH, 936, 0x01, INVALID_CP, 0 }, // EUC Chinese { CP_EUC_KR, 949, 0x01, INVALID_CP, 0 }, // EUC Korean { CP_EUC_TW, 950, 0x01, INVALID_CP, 0 }, // EUC Taiwanese { CP_CHN_HZ, 936, 0x01, INVALID_CP, 0 }, // Simplify Chinese HZ-GB { CP_UTF_7, 0, 0x08, INVALID_CP, 0 }, // U-UTF7 { CP_UTF_8, 0, 0x08, INVALID_CP, 0 }, // U-UTF8 }; // HTML name entity table for Latin-1 Supplement - from 0x00A0-0x00FF #define NAME_ENTITY_OFFSET 0x00A0 #define NAME_ENTITY_MAX 0x00FF #define NAME_ENTITY_ENTRY 96 static CHAR *g_lpstrNameEntity[NAME_ENTITY_ENTRY] = { " ", // " " -- no-break space = non-breaking space, "¡", // "¡" -- inverted exclamation mark, U+00A1 ISOnum --> "¢", // "¢" -- cent sign, U+00A2 ISOnum --> "£", // "£" -- pound sign, U+00A3 ISOnum --> "¤", // "¤" -- currency sign, U+00A4 ISOnum --> "¥", // "¥" -- yen sign = yuan sign, U+00A5 ISOnum --> "¦", // "¦" -- broken bar = broken vertical bar, "§", // "§" -- section sign, U+00A7 ISOnum --> "¨", // "¨" -- diaeresis = spacing diaeresis, "©", // "©" -- copyright sign, U+00A9 ISOnum --> "ª", // "ª" -- feminine ordinal indicator, U+00AA ISOnum --> "«", // 
"«" -- left-pointing double angle quotation mark "¬", // "¬" -- not sign = discretionary hyphen, "­", // "­" -- soft hyphen = discretionary hyphen, "®", // "®" -- registered sign = registered trade mark sign, "¯", // "¯" -- macron = spacing macron = overline "°", // "°" -- degree sign, U+00B0 ISOnum --> "±", // "±" -- plus-minus sign = plus-or-minus sign, "²", // "²" -- superscript two = superscript digit two "³", // "³" -- superscript three = superscript digit three "´", // "´" -- acute accent = spacing acute, "µ", // "µ" -- micro sign, U+00B5 ISOnum --> "¶", // "¶" -- pilcrow sign = paragraph sign, "·", // "·" -- middle dot = Georgian comma "¸", // "¸" -- cedilla = spacing cedilla, U+00B8 ISOdia --> "¹", // "¹" -- superscript one = superscript digit one, "º", // "º" -- masculine ordinal indicator, "»", // "»" -- right-pointing double angle quotation mark "¼", // "¼" -- vulgar fraction one quarter "½", // "½" -- vulgar fraction one half "¾", // "¾" -- vulgar fraction three quarters "¿", // "¿" -- inverted question mark "À", // "À" -- latin capital letter A with grave "Á", // "Á" -- latin capital letter A with acute, "Â", // "Â" -- latin capital letter A with circumflex, "Ã", // "Ã" -- latin capital letter A with tilde, "Ä", // "Ä" -- latin capital letter A with diaeresis, "Å", // "Å" -- latin capital letter A with ring above "Æ", // "Æ" -- latin capital letter AE "Ç", // "Ç" -- latin capital letter C with cedilla, "È", // "È" -- latin capital letter E with grave, "É", // "É" -- latin capital letter E with acute, "Ê", // "Ê" -- latin capital letter E with circumflex, "Ë", // "Ë" -- latin capital letter E with diaeresis, "Ì", // "Ì" -- latin capital letter I with grave, "Í", // "Í" -- latin capital letter I with acute, "Î", // "Î" -- latin capital letter I with circumflex, "Ï", // "Ï" -- latin capital letter I with diaeresis, "Ð", // "Ð" -- latin capital letter ETH, U+00D0 ISOlat1 --> "Ñ", // "Ñ" -- latin capital letter N with tilde, "Ò", // "Ò" -- latin capital 
letter O with grave, "Ó", // "Ó" -- latin capital letter O with acute, "Ô", // "Ô" -- latin capital letter O with circumflex, "Õ", // "Õ" -- latin capital letter O with tilde, "Ö", // "Ö" -- latin capital letter O with diaeresis, "×", // "×" -- multiplication sign, U+00D7 ISOnum --> "Ø", // "Ø" -- latin capital letter O with stroke "Ù", // "Ù" -- latin capital letter U with grave, "Ú", // "Ú" -- latin capital letter U with acute, "Û", // "Û" -- latin capital letter U with circumflex, "Ü", // "Ü" -- latin capital letter U with diaeresis, "Ý", // "Ý" -- latin capital letter Y with acute, "Þ", // "Þ" -- latin capital letter THORN, "ß", // "ß" -- latin small letter sharp s = ess-zed, "à", // "à" -- latin small letter a with grave "á", // "á" -- latin small letter a with acute, "â", // "â" -- latin small letter a with circumflex, "ã", // "ã" -- latin small letter a with tilde, "ä", // "ä" -- latin small letter a with diaeresis, "å", // "å" -- latin small letter a with ring above "æ", // "æ" -- latin small letter ae "ç", // "ç" -- latin small letter c with cedilla, "è", // "è" -- latin small letter e with grave, "é", // "é" -- latin small letter e with acute, "ê", // "ê" -- latin small letter e with circumflex, "ë", // "ë" -- latin small letter e with diaeresis, "ì", // "ì" -- latin small letter i with grave, "í", // "í" -- latin small letter i with acute, "î", // "î" -- latin small letter i with circumflex, "ï", // "ï" -- latin small letter i with diaeresis, "ð", // "ð" -- latin small letter eth, U+00F0 ISOlat1 --> "ñ", // "ñ" -- latin small letter n with tilde, "ò", // "ò" -- latin small letter o with grave, "ó", // "ó" -- latin small letter o with acute, "ô", // "ô" -- latin small letter o with circumflex, "õ", // "õ" -- latin small letter o with tilde, "ö", // "ö" -- latin small letter o with diaeresis, "÷", // "÷" -- division sign, U+00F7 ISOnum --> "ø", // "ø" -- latin small letter o with stroke, "ù", // "ù" -- latin small letter u with grave, "ú", // "ú" -- latin 
small letter u with acute, "û", // "û" -- latin small letter u with circumflex, "ü", // "ü" -- latin small letter u with diaeresis, "ý", // "ý" -- latin small letter y with acute, "þ", // "þ" -- latin small letter thorn with, "ÿ", // "ÿ" -- latin small letter y with diaeresis, }; #ifdef MORE_NAME_ENTITY // in case we decide to do more name entity latter // Additional HTML 4.0 name entity table for CP 1252 extension character set #define CP1252EXT_BASE (UINT)0x0080 #define CP1252EXT_MAX (UINT)0x009F #define NONUNI 0xFFFF #define UNDEFCHAR "???????" #define CP1252EXT_NCR_SIZE 7 struct NAME_ENTITY_EXT { UWORD uwUniCode; LPCTSTR lpszNameEntity; }; static struct NAME_ENTITY_EXT aNameEntityExt[] = { // UniCode NCR_Enty Name_Enty CP1252Ext Comment { 0x20AC, "€" }, // "€" }, // € #EURO SIGN // { NONUNI, UNDEFCHAR }, // "&;" }, //  #UNDEFINED { 0x201A, "‚" }, // "‚" }, // ‚ #SINGLE LOW-9 QUOTATION MARK { 0x0192, "ƒ" }, // "ƒ" }, // ƒ #LATIN SMALL LETTER F WITH HOOK { 0x201E, "„" }, // "„" }, // „ #DOUBLE LOW-9 QUOTATION MARK { 0x2026, "…" }, // "…" }, // … #HORIZONTAL ELLIPSIS { 0x2020, "†" }, // "†" }, // † #DAGGER { 0x2021, "‡" }, // "‡" }, // ‡ #DOUBLE DAGGER { 0x02C6, "ˆ" }, // "ˆ" }, // ˆ #MODIFIER LETTER CIRCUMFLEX ACCENT { 0x2030, "‰" }, // "‰" }, // ‰ #PER MILLE SIGN { 0x0160, "Š" }, // "Š" }, // Š #LATIN CAPITAL LETTER S WITH CARON { 0x2039, "‹" }, // "‹" }, // ‹ #SINGLE LEFT-POINTING ANGLE QUOTATION MARK { 0x0152, "Œ" }, // "Œ" }, // Œ #LATIN CAPITAL LIGATURE OE // { NONUNI, UNDEFCHAR }, // "&;" }, //  #UNDEFINED { 0x017D, "Ž" }, // "&;" }, // Ž #LATIN CAPITAL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0*** // { NONUNI, UNDEFCHAR }, // "&;" }, //  #UNDEFINED // { NONUNI, UNDEFCHAR }, // "&;" }, //  #UNDEFINED { 0x2018, "‘" }, // "‘" }, // ‘ #LEFT SINGLE QUOTATION MARK { 0x2019, "’" }, // "’" }, // ’ #RIGHT SINGLE QUOTATION MARK { 0x201C, "“" }, // "“" }, // “ #LEFT DOUBLE QUOTATION MARK { 0x201D, "”" }, // "”" }, // ” #RIGHT DOUBLE QUOTATION MARK 
{ 0x2022, "•" }, // "•" }, // • #BULLET { 0x2013, "–" }, // "–" }, // – #EN DASH { 0x2014, "—" }, // "—" }, // — #EM DASH { 0x20DC, "˜" }, // "˜" }, // ˜ #SMALL TILDE { 0x2122, "™" }, // "™" }, // ™ #TRADE MARK SIGN { 0x0161, "š" }, // "š" }, // š #LATIN SMALL LETTER S WITH CARON { 0x203A, "›" }, // "›" }, // › #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK { 0x0153, "œ" }, // "œ" }, // œ #LATIN SMALL LIGATURE OE // { NONUNI, UNDEFCHAR }, // "&;" }, //  #UNDEFINED { 0x017E, "ž" }, // "&;" }, // ž #LATIN SMALL LETTER Z WITH CARON, ***no name entity defined in HTML 4.0*** { 0x0178, "Ÿ" }, // "Ÿ" }, // Ÿ #LATIN CAPITAL LETTER Y WITH DIAERESIS }; #endif HRESULT WINAPI DoConvertINetString(LPDWORD lpdwMode, BOOL fInbound, UINT uCodePage, int nCodeSet, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDestStr, int cchDest, LPINT lpnSize); /****************************************************************************** ***************************** U T I L I T I E S *************************** ******************************************************************************/ void DataByteSwap(LPSTR DataBuf, int len ) { int i ; UCHAR tmpData ; if ( len ) for ( i = 0 ; i < len-1 ; i+=2 ) { tmpData = DataBuf[i] ; DataBuf[i] = DataBuf[i+1] ; DataBuf[i+1] = tmpData ; } return ; } void CheckUnicodeDataType(DWORD dwDstEncoding, LPSTR DataBuf, int len ) { if ( DataBuf && len ) { if ( dwDstEncoding == CP_UCS_2_BE ) DataByteSwap(DataBuf,len); } return ; } /****************************************************************************** ****************** C O N V E R T I N E T S T R I N G ****************** ******************************************************************************/ HRESULT CICharConverter::UnicodeToMultiByteEncoding(DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize, i ; BOOL UseDefChar = FALSE ; LPSTR lpDefFallBack = NULL ; UCHAR DefaultCharBuff[3]; // possible DBCS + null HRESULT hr = 
E_FAIL; int _nDstSize = *lpnDstSize; if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 ) { if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) ) { MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize ) ; lpSrcStr = _lpUnicodeStr ; } else { hr = E_OUTOFMEMORY; goto EXIT; } } CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize); nBuffSize = *lpnSrcSize / sizeof(WCHAR); // We force to use MLang NO_BEST_FIT_CHAR check on ISCII encoding since system don't accept default chars if (IS_NLS_DLL_CP(dwDstEncoding) && (dwFlag & MLCONVCHARF_USEDEFCHAR)) dwFlag |= MLCONVCHARF_NOBESTFITCHARS; if ( lpFallBack && ( dwFlag & MLCONVCHARF_USEDEFCHAR )) { // only take SBCS, no DBCS character if ( 1 == WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0, (LPCWSTR)lpFallBack, 1, (LPSTR)DefaultCharBuff, sizeof(DefaultCharBuff), NULL, NULL )) lpDefFallBack = (LPSTR) DefaultCharBuff; } if(!(*lpnDstSize = WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0, (LPCWSTR)lpSrcStr, nBuffSize, lpDstStr, *lpnDstSize, IS_NLS_DLL_CP(dwDstEncoding)? NULL:(LPCSTR)lpDefFallBack, IS_NLS_DLL_CP(dwDstEncoding)? 
NULL:&UseDefChar))) { hr = E_FAIL; goto EXIT; } if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = nBuffSize * sizeof(WCHAR); if (*lpnDstSize) { if (dwFlag & ( MLCONVCHARF_NCR_ENTITIZE | MLCONVCHARF_NAME_ENTITIZE | MLCONVCHARF_NOBESTFITCHARS )) { char *lpDstStrTmp = lpDstStr; WCHAR *lpwStrTmp = NULL; WCHAR *lpwStrTmpSave = NULL; char *lpDstStrTmp2 = NULL; char *lpDstStrTmp2Save = NULL; int cCount, ConvCount = 0, nCount = 0; WCHAR *lpwSrcStrTmp = (WCHAR *)lpSrcStr; int *lpBCharOffset = NULL; int *lpBCharOffsetSave = NULL; if (!(lpwStrTmpSave = lpwStrTmp = (WCHAR *)LocalAlloc(LPTR, *lpnSrcSize))) { hr = E_OUTOFMEMORY; goto ENTITIZE_DONE; } // Make sure we have real converted buffer to check BEST_FIT_CHAR and DEFAULT_CHAR if (!_nDstSize) { lpDstStrTmp2Save = lpDstStrTmp2 = (char *)LocalAlloc(LPTR, *lpnDstSize); if (lpDstStrTmp2) { WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0, (LPCWSTR)lpSrcStr, nBuffSize, lpDstStrTmp2, *lpnDstSize, NULL, NULL ); } else { hr = E_OUTOFMEMORY; goto ENTITIZE_DONE; } } if (nBuffSize == MultiByteToWideChar(MAPUSERDEF(dwDstEncoding), 0, _nDstSize? lpDstStr : lpDstStrTmp2, *lpnDstSize, lpwStrTmp, _nSrcSize)) { // Pre scan to get number of best fit chars. 
for (i=0; i= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp <= NAME_ENTITY_MAX )) { fDoNEnty = TRUE; lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET]; } // check if character is in the additional name entity table for CP 1252 extension if (!fDoNEnty) { for (int idx = 0; idx < ARRAYSIZE(aNameEntityExt); idx++) if (*lpwSrcStrTmp == aNameEntityExt[idx].uwUniCode) { fDoNEnty = TRUE; lpszNEnty = aNameEntityExt[idx].lpszNameEntity; break; } } if (fDoNEnty) { cCount = lstrlenA(lpszNEnty); if (_nDstSize) { CopyMemory(lpDstStrTmp, lpszNEnty, cCount); lpDstStrTmp += cCount ; } ConvCount += cCount; fConverted = TRUE; } #else // check if character is in the Latin-1 Supplement range if ((*lpwSrcStrTmp >= NAME_ENTITY_OFFSET) && (*lpwSrcStrTmp < ARRAYSIZE(g_lpstrNameEntity)+NAME_ENTITY_OFFSET)) { LPCTSTR lpszNEnty = NULL; if (!(lpszNEnty = g_lpstrNameEntity[(*lpwSrcStrTmp) - NAME_ENTITY_OFFSET])) { #ifdef DEBUG AssertMsg((BOOL)FALSE, "Name entity table broken"); #endif hr = E_FAIL; goto ENTITIZE_DONE; } cCount = lstrlenA(lpszNEnty); if (_nDstSize) { CopyMemory(lpDstStrTmp, lpszNEnty, cCount); lpDstStrTmp += cCount ; } ConvCount += cCount; fConverted = TRUE; } #endif } // check if NCR requested if ((!fConverted) && (dwFlag & MLCONVCHARF_NCR_ENTITIZE)) { if ((nCount-i >= 2) && (*lpwSrcStrTmp >= 0xD800 && *lpwSrcStrTmp <= 0xDBFF) && (*(lpwSrcStrTmp+1) >= 0xDC00 && *(lpwSrcStrTmp+1) <= 0xDFFF)) bIsSurrogatePair = TRUE; else bIsSurrogatePair = FALSE; if (_nDstSize) { lpDstStrTmp[0] = '&' ; lpDstStrTmp[1] = '#' ; lpDstStrTmp += 2 ; // If it is a Unicode surrogates pair, we convert it to real Unicode value if (bIsSurrogatePair) { DWORD dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000; _ultoa( dwUnicode, (char*)lpDstStrTmp, 10); } else _ultoa( *lpwSrcStrTmp, (char*)lpDstStrTmp, 10); cCount = lstrlenA(lpDstStrTmp); lpDstStrTmp += cCount; ConvCount += cCount; *(lpDstStrTmp++) = ';' ; } else { char szTmpString[10]; if (bIsSurrogatePair) { DWORD 
dwUnicode = ((*lpwSrcStrTmp - 0xD800) << 10) + *(lpwSrcStrTmp+1) - 0xDC00 + 0x10000; _ultoa( dwUnicode, szTmpString, 10); } else _ultoa( *lpwSrcStrTmp, szTmpString, 10); ConvCount += lstrlenA(szTmpString); } fConverted = TRUE; ConvCount += 3; } // handle MLCONVCHARF_USEDEFCHAR here - less priority and default method if (!fConverted) { if (_nDstSize) { *lpDstStrTmp = lpDefFallBack ? *lpDefFallBack : '?'; lpDstStrTmp++; } ConvCount++; if (!UseDefChar) UseDefChar = TRUE; } lpBCharOffset++; lpwSrcStrTmp++; // Skip next character if it is a Unicode surrogates pair if (bIsSurrogatePair) { lpBCharOffset++; lpwSrcStrTmp++; i++; } } lpBCharOffset -= nCount ; } int nRemain = (*lpnSrcSize - (int)((char*)lpwSrcStrTmp - (char *)lpSrcStr))/sizeof(WCHAR); ConvCount += WideCharToMultiByte(MAPUSERDEF(dwDstEncoding), 0, (LPCWSTR)lpwSrcStrTmp, nRemain, lpDstStrTmp, _nDstSize? _nDstSize-ConvCount : 0, NULL, NULL ); *lpnDstSize = ConvCount ; hr = S_OK; } else { hr = E_FAIL; } ENTITIZE_DONE: if (lpwStrTmpSave) LocalFree(lpwStrTmpSave); if (lpDstStrTmp2Save) LocalFree(lpDstStrTmp2Save); if (lpBCharOffsetSave) LocalFree(lpBCharOffsetSave); } else { hr = S_OK; } if (S_OK == hr && UseDefChar) hr = S_FALSE; } else { hr = E_FAIL; } EXIT: return hr; } HRESULT CICharConverter::UTF78ToUnicode(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize) { HRESULT hr ; hr = DoConvertINetString(lpdwMode, TRUE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = *lpnSrcSize ; CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize); return hr ; } HRESULT CICharConverter::UnicodeToUTF78(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize) { HRESULT hr ; if ( _dwUnicodeEncoding == CP_UCS_2_BE && _cvt_count == 0 ) { if ( _lpUnicodeStr = (LPSTR)LocalAlloc(LPTR, *lpnSrcSize ) ) { MoveMemory(_lpUnicodeStr, lpSrcStr, *lpnSrcSize 
) ; lpSrcStr = _lpUnicodeStr ; } else return E_OUTOFMEMORY ; } CheckUnicodeDataType(_dwUnicodeEncoding, (LPSTR) lpSrcStr, *lpnSrcSize); hr = DoConvertINetString(lpdwMode, FALSE, CP_UCS_2, _dwUTFEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = *lpnSrcSize ; return hr ; } HRESULT CICharConverter::UnicodeToWindowsCodePage(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr ; hr = UnicodeToMultiByteEncoding(_dwWinCodePage,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack); return hr ; } HRESULT CICharConverter::UnicodeToInternetEncoding(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr ; hr = UnicodeToMultiByteEncoding(_dwInternetEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize,dwFlag,lpFallBack); return hr ; } HRESULT CICharConverter::InternetEncodingToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize) { int cch = 0 ; int cb = *lpnSrcSize; if ( !_cvt_count ) { // If we have a multibyte character encoding, we are at risk of splitting // some characters at the read boundary. We must Make sure we have a // discrete number of characters first. 
UINT uMax = MAX_CHAR_SIZE ; cb++; // pre-increment do { cch = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), MB_ERR_INVALID_CHARS | MB_PRECOMPOSED, lpSrcStr, --cb, NULL, 0 ); --uMax; } while (!cch && uMax && cb); } if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string cb = *lpnSrcSize ; // restore orginal value *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0, lpSrcStr, cb, (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) ); *lpnDstSize = *lpnDstSize * sizeof(WCHAR); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = cb ; CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize); if (*lpnDstSize==0 && (cb || cb != *lpnSrcSize)) return E_FAIL ; else return S_OK ; } HRESULT CICharConverter::WindowsCodePageToUnicode(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize) { int cch = 0 ; int cb = *lpnSrcSize; if ( !_cvt_count ) { UINT uMax = MAX_CHAR_SIZE ; cb++; // pre-increment do { cch = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), MB_ERR_INVALID_CHARS | MB_PRECOMPOSED, lpSrcStr, --cb, NULL, 0 ); --uMax; } while (!cch && uMax && cb); } if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string cb = *lpnSrcSize ; // restore orginal value *lpnDstSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), 0, lpSrcStr, cb, (LPWSTR)lpDstStr, *lpnDstSize/sizeof(WCHAR) ); *lpnDstSize = *lpnDstSize * sizeof(WCHAR); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = cb ; CheckUnicodeDataType(_dwUnicodeEncoding, lpDstStr, *lpnDstSize); // Whistler Bug#360429, // Web page could have a splitting DBCS character at the very end of the page, // To work around it, we allow one byte of dangling DBCS character. 
if (*lpnDstSize==0 && (cb || (cb != *lpnSrcSize && ++cb != *lpnSrcSize))) return E_FAIL ; else return S_OK ; } HRESULT CICharConverter::WindowsCodePageToInternetEncoding(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr ; // check if the conversion should go through Unicode indirectly if ( _dwConvertType & 0x10 ) hr = WindowsCodePageToInternetEncodingWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); else { hr = DoConvertINetString(lpdwMode, FALSE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = *lpnSrcSize ; } return hr ; } HRESULT CICharConverter::InternetEncodingToWindowsCodePage(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr ; // check if the conversion should go through Unicode indirectly if ( _dwConvertType & 0x10 ) hr = InternetEncodingToWindowsCodePageWrap(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); else { hr = DoConvertINetString(lpdwMode, TRUE, _dwWinCodePage, _dwInternetEncoding, lpSrcStr, lpnSrcSize, lpDstStr, *lpnDstSize, lpnDstSize); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = *lpnSrcSize ; } return hr ; } HRESULT CICharConverter::WindowsCodePageToInternetEncodingWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize = 0 ; int cb = *lpnSrcSize; UINT uMax = MAX_CHAR_SIZE ; BOOL UseDefChar = FALSE ; HRESULT hr = S_OK; if ( !_cvt_count ) { cb++; // pre-increment do { nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwWinCodePage), MB_ERR_INVALID_CHARS | MB_PRECOMPOSED, lpSrcStr, --cb, NULL, 0 ); --uMax; } while (!nBuffSize && uMax && cb); } if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the 
end of the string cb = *lpnSrcSize ; // restore orginal value if (!nBuffSize) // in case there are illeage characters nBuffSize = cb ; if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR, (nBuffSize * sizeof(WCHAR)))) { nBuffSize = MultiByteToWideChar(MAPUSERDEF(_dwWinCodePage), 0, lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize ); int iSrcSizeTmp = nBuffSize * sizeof(WCHAR); hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwInternetEncoding), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp, lpDstStr, lpnDstSize, dwFlag, lpFallBack); // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwInternetEncoding), 0, // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar ); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = cb ; } else hr = E_FAIL; if (hr == S_OK) { if (*lpnDstSize==0 && cb) hr = E_FAIL ; else { if ( UseDefChar ) return S_FALSE ; else return S_OK ; } } return hr; } HRESULT CICharConverter::InternetEncodingToWindowsCodePageWrap(LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize = 0 ; int cb = *lpnSrcSize; UINT uMax = MAX_CHAR_SIZE ; BOOL UseDefChar = FALSE ; HRESULT hr = S_OK; if ( !_cvt_count ) { cb++; // pre-increment do { nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), MB_ERR_INVALID_CHARS | MB_PRECOMPOSED, lpSrcStr, --cb, NULL, 0 ); --uMax; } while (!nBuffSize && uMax && cb); } if ( cb == (*lpnSrcSize - MAX_CHAR_SIZE +1 )) // if conversion problem isn't at the end of the string cb = *lpnSrcSize ; // restore orginal value if (!nBuffSize) // in case there are illeage characters nBuffSize = cb ; if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize * sizeof (WCHAR) )) { nBuffSize = MultiByteToWideChar( MAPUSERDEF(_dwInternetEncoding), 0, lpSrcStr, cb, (LPWSTR)_lpInterm1Str, nBuffSize ); int iSrcSizeTmp = nBuffSize * sizeof(WCHAR); hr = UnicodeToMultiByteEncoding(MAPUSERDEF(_dwWinCodePage), (LPCSTR)_lpInterm1Str, &iSrcSizeTmp, lpDstStr, 
lpnDstSize, dwFlag, lpFallBack); // *lpnDstSize = WideCharToMultiByte( MAPUSERDEF(_dwWinCodePage), 0, // (LPCWSTR)_lpInterm1Str, nBuffSize, lpDstStr, *lpnDstSize, NULL, &UseDefChar ); if ( !_cvt_count ) // save SrcSize if it is the first time conversion _nSrcSize = cb ; } else hr = E_FAIL; if (hr == S_OK) { if (*lpnDstSize==0 && cb) hr = E_FAIL ; else { if ( UseDefChar ) return S_FALSE ; else return S_OK ; } } return hr; } HRESULT CICharConverter::ConvertIWUU(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { int nBuffSize = 0 ; HRESULT hr = S_OK ; HRESULT hrWarnings = S_OK ; // InternetEncodingToWindowsCodePage if ( _dwConvertType % 2 && _dwConvertType < 21 ) /* start from Internet Encoding */ { if ( _dwConvertType == 5 || _dwConvertType == 9 ) /* use interm buffer */ { hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack); if ( _lpInterm1Str = (LPSTR) LocalAlloc(LPTR,nBuffSize) ) { hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } else hr = InternetEncodingToWindowsCodePage(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack); _cvt_count ++ ; } if ( hr != S_OK ) hrWarnings = hr ; // WindowsCodePageToUnicode or InternetEncodingToUnicode if ( _dwConvertType == 21 || _dwConvertType == 25 ) { if ( _dwConvertType == 21 ) hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); else // _dwConvertType == 25 { hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = InternetEncodingToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } _cvt_count ++ ; } else if ( _dwConvertType >= 4 && _dwConvertType <= 10 ) { if 
( _dwConvertType > 8 ) { nBuffSize = 0 ; hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if ( _cvt_count ) { if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize); lpSrcStr = _lpInterm2Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } else { if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); lpSrcStr = _lpInterm1Str ; *lpnSrcSize = nBuffSize ; } else goto fail ; } } else hr = WindowsCodePageToUnicode(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); _cvt_count ++ ; } if ( hr != S_OK ) hrWarnings = hr ; // UnicodeToUTF78 if ( _dwConvertType & 0x08 ) #ifndef UNIX hr = UnicodeToUTF78(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize); #else { /* we now hack the lpSrcStr to be the same as 2 byte Unicode so mlang * lowlevel code can work right. */ LPWSTR lpwSrcStr = (LPWSTR)lpSrcStr; INT tmpSize = *lpnSrcSize/sizeof(WCHAR); UCHAR *pTmp = new UCHAR[(tmpSize+1)*2]; if(pTmp) { for(int i = 0; i < tmpSize; i++) { pTmp[i*2] = *lpwSrcStr++; pTmp[i*2+1] = 0x00; } pTmp[i*2] = pTmp[i*2+1] = 0x00; tmpSize *= 2; hr = UnicodeToUTF78(lpdwMode, (LPCSTR)pTmp, &tmpSize, lpDstStr, lpnDstSize); } else hr = E_FAIL; delete [] pTmp; } #endif /* UNIX */ return ( hr == S_OK ? 
// Run the conversion chain in the direction
//     UTF-7/8 -> Unicode -> Windows code page -> Internet encoding
// executing only the steps selected by _dwConvertType.  Per the flag comment
// in this file: bit 3 (8) = UTF7/8, bit 2 (4) = Unicode, bit 1 (2) = Windows
// code page, bit 0 (1) = Internet encoding, bit 4 (16) = direct
// Unicode <-> Internet encoding.
//
//   lpdwMode    - conversion mode/state, forwarded to the UTF and
//                 Windows->Internet steps
//   lpSrcStr    - source bytes
//   lpnSrcSize  - in/out; NOTE: overwritten with the interim buffer size
//                 whenever a step writes into an interim buffer
//   lpDstStr    - final destination buffer
//   lpnDstSize  - in/out destination size, passed to the step that writes
//                 into lpDstStr
//   dwFlag      - conversion flags forwarded to the Unicode->multibyte steps
//   lpFallBack  - fallback character forwarded to the same steps
//
// Returns the last step's HRESULT when it is not S_OK; otherwise the most
// recent non-S_OK result recorded from an earlier step (hrWarnings), or S_OK.
// Returns E_FAIL when an interim buffer cannot be allocated.
//
// Interim buffers are stored in _lpInterm1Str / _lpInterm2Str and are not
// freed in this function (presumably released by the owning object - the
// cleanup code is not visible here).  _cvt_count is incremented after each of
// the first two stages that runs.
HRESULT CICharConverter::ConvertUUWI(LPDWORD lpdwMode, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    int nBuffSize = 0 ;
    HRESULT hr = S_OK ;
    HRESULT hrWarnings = S_OK ;

    // UTF78ToUnicode
    if ( _dwConvertType & 0x08 )
    {
        if ( _dwConvertType == 12 ) /* convert UTF78 -> Unicode only */
            hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize);
        else /* use interm buffer, type = 10 or 9 */
        {
            // First call passes a NULL destination with nBuffSize == 0 to obtain
            // the size for the interim buffer; its hr is overwritten by the
            // second call below.
            hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize);
            if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
            {
                hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize);
                // The interim buffer becomes the source of the next step.
                lpSrcStr = _lpInterm1Str ;
                *lpnSrcSize = nBuffSize ;
            }
            else
                goto fail ;
        }
        _cvt_count ++ ;
    }

    // Remember a non-S_OK result from this stage; later stages overwrite hr.
    if ( hr != S_OK )
        hrWarnings = hr ;

    // UnicodeToWindowsCodePage or UnicodeToInternetEncoding
    if ( _dwConvertType == 21 || _dwConvertType == 25 )
    {
        // Direct Unicode -> Internet encoding, written straight to lpDstStr.
        hr = UnicodeToInternetEncoding(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
        _cvt_count ++ ;
    }
    else if ( _dwConvertType >= 4 && _dwConvertType <= 10 )
    {
        if ( _dwConvertType % 2 ) /* use interm buffer */
        {
            // Odd type: an Internet-encoding step follows, so the Windows code
            // page text goes to an interim buffer.  Size query first.
            nBuffSize = 0 ;
            hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, NULL, &nBuffSize, dwFlag, lpFallBack);
            if ( _cvt_count )
            {
                // An earlier step already ran (and may be using _lpInterm1Str as
                // the current source), so use the second interim buffer.
                if ( _lpInterm2Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
                {
                    hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm2Str, &nBuffSize, dwFlag, lpFallBack);
                    lpSrcStr = _lpInterm2Str ;
                    *lpnSrcSize = nBuffSize ;
                }
                else
                    goto fail ;
            }
            else
            {
                if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) )
                {
                    hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize, dwFlag, lpFallBack);
                    lpSrcStr = _lpInterm1Str ;
                    *lpnSrcSize = nBuffSize ;
                }
                else
                    goto fail ;
            }
        }
        else
            // Even type: the Windows code page is the final target.
            hr = UnicodeToWindowsCodePage(lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);
        _cvt_count ++ ;
    }

    // Replaces any warning recorded from the first stage.
    if ( hr != S_OK )
        hrWarnings = hr ;

    // WindowsCodePageToInternetEncoding
    if ( _dwConvertType % 2 && _dwConvertType < 21 )
        hr = WindowsCodePageToInternetEncoding(lpdwMode, lpSrcStr, lpnSrcSize, lpDstStr, lpnDstSize, dwFlag, lpFallBack);

    // A failure/warning of the last step wins; otherwise surface the earlier one.
    return ( hr == S_OK ? hrWarnings : hr ) ;

fail :
    // Reached only when an interim buffer could not be allocated.
    return E_FAIL ;
}
Bit 4 (16) - Unicode <-> Internet Encoding Bit 3 (8) - UTF8, UTF7 Bit 2 (4) - Unicode Bit 1 (2) - Windows CodePage Bit 0 (1) - Internet Encoding 12, 6, 3 (19) - one step convert 10, 5 (21) - two steps convert 9 (25) - three steps convert */ int GetWindowsEncodingIndex(DWORD dwEncoding) { int nr = sizeof (aEncodingInfo) / sizeof(ENCODINGINFO) ; int i, half = nr / 2, index = -1 ; if (aEncodingInfo[half].dwEncoding > dwEncoding ) { for ( i = 0 ; i < half ; i++ ) if (aEncodingInfo[i].dwEncoding == dwEncoding ) index = i ; } else if (aEncodingInfo[half].dwEncoding < dwEncoding ) { for ( i = half + 1 ; i < nr ; i++ ) if (aEncodingInfo[i].dwEncoding == dwEncoding ) index = i ; } else index = half ; if (index>=0) // found { if ( aEncodingInfo[index].nCP_State != VALID_CP && aEncodingInfo[index].dwCodePage ) { if ( aEncodingInfo[index].dwCodePage == 50000 || IsValidCodePage(aEncodingInfo[index].dwCodePage ) ) // 50000 means user defined aEncodingInfo[index].nCP_State = VALID_CP ; else aEncodingInfo[index].nCP_State = INVALID_CP ; if ((aEncodingInfo[index].nCP_State == VALID_CP) && (aEncodingInfo[index].dwFlags & CONV_CHK_NLS) && !IsValidCodePage(aEncodingInfo[index].dwEncoding)) aEncodingInfo[index].nCP_State = INVALID_CP ; } } return index ; } HRESULT CICharConverter::ConvertSetup(DWORD * pdwSrcEncoding, DWORD dwDstEncoding) { DWORD SrcFlag = 0, DstFlag = 0 ; int index, unknown = 0 ; // IE bug 109708 - WEIWU 5/11/00 // Always consider US-ASCII as a valid source encoding for conversion /* if (*pdwSrcEncoding == CP_20127 && !IsValidCodePage(CP_20127)) *pdwSrcEncoding = CP_1252; */ /* check source & destination encoding type */ index = GetWindowsEncodingIndex(*pdwSrcEncoding); if ( index >=0 ) { SrcFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ; if ( aEncodingInfo[index].dwCodePage ) { _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ; if (aEncodingInfo[index].nCP_State == INVALID_CP ) goto fail ; } if ( SrcFlag & 0x08 ) _dwUTFEncoding = *pdwSrcEncoding ; if ( SrcFlag 
& 0x01 ) _dwInternetEncoding = *pdwSrcEncoding ; if ( SrcFlag & 0x04 ) _dwUnicodeEncoding = *pdwSrcEncoding ; } // assume it is a unknown Window Codepage else { if ( !CONVERT_IS_VALIDCODEPAGE(*pdwSrcEncoding)) goto fail ; SrcFlag = 0x02 ; _dwWinCodePage = *pdwSrcEncoding ; unknown ++ ; } index = GetWindowsEncodingIndex(dwDstEncoding); if ( index >=0 ) { // check if two codepages are compatiable if ( _dwWinCodePage && aEncodingInfo[index].dwCodePage ) { if (_dwWinCodePage != (DWORD) aEncodingInfo[index].dwCodePage ) goto fail ; } DstFlag = (DWORD) aEncodingInfo[index].bTypeUUIW ; if ( aEncodingInfo[index].dwCodePage ) { _dwWinCodePage = (DWORD) aEncodingInfo[index].dwCodePage ; if (aEncodingInfo[index].nCP_State == INVALID_CP ) goto fail ; } if ( DstFlag & 0x08 ) { if (_dwUTFEncoding) _dwUTFEncoding2 = dwDstEncoding ; else _dwUTFEncoding = dwDstEncoding ; } if ( DstFlag & 0x01 ) _dwInternetEncoding = dwDstEncoding ; if ( DstFlag & 0x04 ) _dwUnicodeEncoding = dwDstEncoding ; } // 1) First time unknown, assume it is a unknown Window Codepage // the conversion become UTF78 <-> Unicode <-> Window Codepage // 2) Second time unknown, assume it is a unknown Internet Encoding // the conversion become Windows Codepage <-> Unicode <-> Internet Encoding else { if ( !CONVERT_IS_VALIDCODEPAGE(dwDstEncoding)) goto fail ; if ( unknown == 0 ) { if ( _dwWinCodePage ) { if (_dwWinCodePage != dwDstEncoding ) goto fail ; } DstFlag = 0x02 ; _dwWinCodePage = dwDstEncoding ; } else { DstFlag = 0x11 ; _dwInternetEncoding = dwDstEncoding ; } } if ( !SrcFlag | !DstFlag ) goto fail ; if ( SrcFlag == DstFlag && *pdwSrcEncoding != dwDstEncoding && ( 4 != SrcFlag ) && ( 8 != SrcFlag )) goto fail ; _dwConvertType = SrcFlag | DstFlag ; _bConvertDirt = ( SrcFlag & 0x0f ) > ( DstFlag & 0x0f ) ; // if code convertor has been allocated, deallocate it if (_hcins) { delete _hcins ; _hcins = NULL ; } return S_OK ; fail : return S_FALSE ; } HRESULT CICharConverter::DoCodeConvert(LPDWORD lpdwMode, LPCSTR 
lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr = S_OK ; if ( 4 == _dwConvertType ) // CP_UCS_2 <-> CP_UCS_2_BE { if (!lpDstStr) { _nSrcSize = *lpnDstSize = *lpnSrcSize ; } else { int nSize = min(*lpnDstSize,*lpnSrcSize); _nSrcSize = *lpnSrcSize ; if ( lpDstStr && nSize > 0 ) { MoveMemory(lpDstStr, lpSrcStr, nSize ); DataByteSwap(lpDstStr, nSize ); _nSrcSize = nSize ; *lpnDstSize = nSize ; } } } else if ( 8 == _dwConvertType) // UTF7 <-> UTF8 { if (_dwUTFEncoding == _dwUTFEncoding2) { _nSrcSize = *lpnDstSize = min(*lpnDstSize,*lpnSrcSize); if (*lpnDstSize > 0) MoveMemory(lpDstStr, lpSrcStr, *lpnDstSize); } else { int nBuffSize = 0; // Always succeeds hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, NULL, &nBuffSize); if (_lpInterm1Str) LocalFree(_lpInterm1Str); if ( _lpInterm1Str= (LPSTR)LocalAlloc(LPTR, nBuffSize) ) { DWORD dwTmpEncoding = _dwUTFEncoding; int nTmpSrcSize; hr = UTF78ToUnicode(lpdwMode, lpSrcStr, lpnSrcSize, _lpInterm1Str, &nBuffSize); _dwUTFEncoding = _dwUTFEncoding2 ; nTmpSrcSize = _nSrcSize; // We don't need to create another dwMode since only UTF7 conversion needs it hr = UnicodeToUTF78(lpdwMode, _lpInterm1Str, &nBuffSize, lpDstStr, lpnDstSize); _nSrcSize = nTmpSrcSize; _dwUTFEncoding = dwTmpEncoding ; } else hr = E_OUTOFMEMORY; } } else if ( _bConvertDirt ) hr = ConvertUUWI(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack); else hr = ConvertIWUU(lpdwMode, lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, dwFlag, lpFallBack); return hr ; } BOOL CICharConverter::ConvertCleanUp() { if (_lpInterm1Str) { LocalFree(_lpInterm1Str); _lpInterm1Str = NULL ; } if (_lpInterm2Str) { LocalFree(_lpInterm2Str); _lpInterm2Str = NULL ; } if (_lpUnicodeStr) { LocalFree(_lpUnicodeStr); _lpUnicodeStr = NULL ; } _cvt_count = 0 ; _nSrcSize = 0 ; return TRUE ; } CICharConverter::CICharConverter() { _lpInterm1Str = NULL ; _lpInterm2Str = NULL ; _lpUnicodeStr = NULL ; _hcins = NULL ; _cvt_count = 
0 ; _dwWinCodePage = 0; _dwInternetEncoding = 0; _dwUTFEncoding = 0; _dwUTFEncoding2 = 0; _dwUnicodeEncoding = 0; _dwConvertType = 0; _nSrcSize = 0 ; _hcins_dst = 0 ; return ; } CICharConverter::CICharConverter(DWORD dwFlag, WCHAR *lpFallBack) { _lpInterm1Str = NULL ; _lpInterm2Str = NULL ; _lpUnicodeStr = NULL ; _hcins = NULL ; _cvt_count = 0 ; _dwWinCodePage = 0; _dwInternetEncoding = 0; _dwUTFEncoding = 0; _dwUTFEncoding2 = 0; _dwUnicodeEncoding = 0; _dwConvertType = 0; _nSrcSize = 0 ; _hcins_dst = 0 ; _dwFlag = dwFlag; _lpFallBack = lpFallBack; return ; } CICharConverter::~CICharConverter() { if (_lpInterm1Str) { LocalFree(_lpInterm1Str); _lpInterm1Str = NULL ; } if (_lpInterm2Str) { LocalFree(_lpInterm2Str); _lpInterm2Str = NULL ; } if (_lpUnicodeStr) { LocalFree(_lpUnicodeStr); _lpUnicodeStr = NULL ; } if (_hcins) { delete _hcins ; _hcins = NULL ; } } CICharConverter::CICharConverter(DWORD dwSrcEncoding, DWORD dwDstEncoding) { _lpInterm1Str = NULL ; _lpInterm2Str = NULL ; _lpUnicodeStr = NULL ; _hcins = NULL ; _cvt_count = 0 ; _dwWinCodePage = 0; _dwInternetEncoding = 0; _dwUTFEncoding = 0; _dwUTFEncoding2 = 0; _dwUnicodeEncoding = 0; _dwConvertType = 0; _nSrcSize = 0 ; _hcins_dst = 0 ; ConvertSetup(&dwSrcEncoding,dwDstEncoding); return ; } HRESULT WINAPI IsConvertINetStringAvailable(DWORD dwSrcEncoding, DWORD dwDstEncoding) { HRESULT hr; CICharConverter * INetConvert = new CICharConverter ; if (!INetConvert) return E_OUTOFMEMORY; hr = INetConvert->ConvertSetup(&dwSrcEncoding, dwDstEncoding); delete INetConvert; return hr ; } #define DETECTION_BUFFER_NUM 3 // In CP_AUTO and detection result is UTF7 case, private converter might use high word of *lpdwMode to store internal data, but we need // to use it to notify Trident the detection result, currently, we bias to returning correct detection result. // This is currently by design. 
// If we get a chance to re-prototype the conversion object, we can resolve this issue

// ConvertINetStringEx
//
// Converts a buffer from dwSrcEncoding to dwDstEncoding.
//
//   lpdwMode    - optional conversion mode; its high word is read as an
//                 already-detected code page for the auto-detect encodings
//   dwSrcEncoding / dwDstEncoding - source and destination encodings
//   lpSrcStr    - source buffer
//   lpnSrcSize  - optional source size in bytes; NULL or -1 means the
//                 source is zero terminated and is measured here
//   lpDstStr    - destination buffer, or NULL to ask for the required size
//   lpnDstSize  - optional destination capacity
//   dwFlag, lpFallBack - handed to the converter object
//
// Returns S_OK immediately (with *lpnDstSize = 0) when there is nothing to
// convert, and E_OUTOFMEMORY when the converter cannot be allocated.
//
// NOTE(review): the end of this function is missing from this copy of the
// file (see the note near the end), so how the results are written back to
// the caller cannot be documented from here.
HRESULT WINAPI ConvertINetStringEx(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize, DWORD dwFlag, WCHAR *lpFallBack)
{
    CICharConverter * INetConvert;
    int nSrcSize;
    int nDstSize;
    DWORD dwMode = 0 ;
    // dwDetectResult
    // CP_UNDEFINED :Fail to detect
    // 0 :Not an auto-detect scenario
    // Others :Detected encoding (stored in the high word)
    DWORD dwDetectResult = CP_UNDEFINED;
    HRESULT hr ;

    if(lpnSrcSize)
    {
        nSrcSize = *lpnSrcSize;
    }
    else
        nSrcSize = -1;

    if ( lpSrcStr && nSrcSize == -1 ) // Get length of lpSrcStr if not given, assuming lpSrcStr is a zero terminated string.
    {
        if ( dwSrcEncoding == CP_UCS_2 )
            nSrcSize = (lstrlenW((WCHAR*)lpSrcStr) << 1) ;   // characters -> bytes
        else
            nSrcSize = lstrlenA(lpSrcStr) ;
    }

    // If there is nothing to be converted, we return S_OK;
    if (!nSrcSize || !lpSrcStr)
    {
        if (lpnDstSize)
            *lpnDstSize = 0;
        return S_OK;
    }

    INetConvert = new CICharConverter(dwFlag, lpFallBack) ;
    if (!INetConvert)
        return E_OUTOFMEMORY;

    // ASSERT(CP_AUTO != dwDstEncoding);

    // if null is specified as the dst buffer we'll get the size of the required buffer.
    if(!lpDstStr)
        nDstSize = 0;
    else if (lpnDstSize)
        nDstSize = *lpnDstSize;
    else
        nDstSize = 0;

    if (lpdwMode)
        dwMode = *lpdwMode ;

    // In the real world, clients use 28591 as 1252 and 28599 as 1254.
    // To correctly convert those extended characters to Unicode,
    // we internally replace them with 1252 / 1254
    if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE)
    {
        if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252))
            dwSrcEncoding = CP_1252;

        if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254))
            dwSrcEncoding = CP_1254;
    }

    if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1))
    {
        dwSrcEncoding = CP_1252;
    }

    if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9))
    {
        dwSrcEncoding = CP_1254;
    }

    //
    // Auto Detection for Japan
    // Japanese users often tag their data incorrectly, so, if MLCONVCHARF_DETECTJPN is specified,
    // we'll do extra detection for Shift-JIS and EUC
    //
    if ( dwSrcEncoding == CP_JP_AUTO ||
        ((dwFlag & MLCONVCHARF_DETECTJPN) &&
        (dwSrcEncoding == CP_JPN_SJ || dwSrcEncoding == CP_EUC_JP))) // Auto Detection for Japan
    {
        CIncdJapanese DetectJapan(dwSrcEncoding);
        UINT uiCodePage ;

        // A non-zero high word of the mode is used as the source encoding
        // without running detection again.
        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);
            // if dwSrcEncoding is zero it means there is an ambiguity; we don't return
            // the detected codepage to the caller, instead we default its codepage internally
            // to SJIS
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_JPN_SJ;
        }
    }
    // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0
    // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way
    // we can fix that mistake.
    else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 || dwSrcEncoding == CP_EUC_KR )
    {
        CIncdKorean DetectKorean;
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            dwSrcEncoding = DetectKorean.DetectStringA(lpSrcStr, nSrcSize);
            if (dwSrcEncoding)
            {
                dwDetectResult = dwSrcEncoding << 16 ;
            }
            else
                dwSrcEncoding = CP_KOR_5601;   // ambiguous: fall back to KS C 5601
        }
    }
    else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages
    {
        // detection looks at no more than DETECTION_MAX_LEN bytes
        int _nSrcSize = DETECTION_MAX_LEN < nSrcSize ? DETECTION_MAX_LEN : nSrcSize;
        int nScores = DETECTION_BUFFER_NUM;
        DetectEncodingInfo Encoding[DETECTION_BUFFER_NUM];
        UINT uiCodePage ;

        uiCodePage = ( dwMode >> 16 ) & 0xffff ;
        if ( uiCodePage )
        {
            dwSrcEncoding = uiCodePage ;
            dwDetectResult = 0;
        }
        else
        {
            // start from g_uACP and replace it only by a detected, valid code page
            dwSrcEncoding = g_uACP;
            if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, CP_AUTO, (char *)lpSrcStr, &_nSrcSize, &Encoding[0], &nScores))
            {
                MIMECPINFO cpInfo;

                // a US-ASCII detection result is replaced by the default chosen above
                if (Encoding[0].nCodePage == CP_20127)
                    Encoding[0].nCodePage = dwSrcEncoding;

                if (NULL != g_pMimeDatabase)
                {
                    if (SUCCEEDED(g_pMimeDatabase->GetCodePageInfo(Encoding[0].nCodePage, 0x409, &cpInfo)) &&
                        (cpInfo.dwFlags & MIMECONTF_VALID))
                    {
                        dwSrcEncoding = Encoding[0].nCodePage;
                        dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }

            // If we failed in general detection and the system locale is Jpn, we try harder
            // with our Japanese detection engine
            if (dwSrcEncoding == CP_JPN_SJ && dwDetectResult == CP_UNDEFINED)
            {
                CIncdJapanese DetectJapan;
                DWORD dwSrcEncodingJpn = DetectJapan.DetectStringA(lpSrcStr, nSrcSize);

                if (dwSrcEncodingJpn)
                {
                    // We only change the conversion encoding without returning this result to the browser
                    // if it is in the middle of detection; this is to prevent other encodings being mis-detected as Jpn encodings.
                    dwSrcEncoding = dwSrcEncodingJpn;

                    // Set search range for end tag as 10 bytes
                    if (nSrcSize >= 10)
                    {
                        char szTmpStr[11] = {0};
                        char *lpTmpStr = szTmpStr;

                        MLStrCpyN(szTmpStr, (char *)&lpSrcStr[nSrcSize-10], 10);

                        //ToLower
                        // NOTE(review): only 'A'..'W' are lower-cased here;
                        // 'X'..'Z' are left alone - confirm this is intended.
                        while(*lpTmpStr)
                        {
                            if (*lpTmpStr >= 'A' && *lpTmpStr <= 'W')
                                *lpTmpStr += 0x20;
                            lpTmpStr++;
                        }

                        // If end of page, return this result
                        // NOTE(review): the search string is empty in this copy
                        // of the file; presumably an end tag literal was lost -
                        // verify against the original source.
                        if (MLStrStr(szTmpStr, ""))
                            dwDetectResult = dwSrcEncoding << 16 ;
                    }
                }
            }
            //aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding;
        }
    }
    else
    {
        // Not an auto-detect scenario
        dwDetectResult = 0;
    }

    if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding )))
    {
        if ( dwSrcEncoding != dwDstEncoding )
        {
            // if high word of dwMode is CP_UTF_7, it must be detection result, don't pass it to UTF7 converter
            if ( dwSrcEncoding == CP_UTF_7 && (dwMode >> 16) == CP_UTF_7)
                dwMode &= 0xFFFF;
            // ASSERT(!((IS_ENCODED_ENCODING(dwSrcEncoding) || IS_ENCODED_ENCODING(dwDstEncoding)) && (NULL == lpdwMode)));
            hr = INetConvert->DoCodeConvert(&dwMode, lpSrcStr, &nSrcSize, lpDstStr, &nDstSize, dwFlag, lpFallBack);

            // return the number of bytes processed for the source.
            if (lpnSrcSize)
                *lpnSrcSize = INetConvert->_nSrcSize ;

            INetConvert->ConvertCleanUp();
        }
        else
        {
            // source and destination encodings are identical
            int nSize, i ;
            hr = S_OK ;
            BOOL bLeadByte = FALSE ;

            // only check for windows codepage
            if ( INetConvert->_dwConvertType == 02 && lpSrcStr )
            {
                // NOTE(review): the text is truncated from the loop condition
                // below up to the "4byte conversion" remark: the rest of
                // ConvertINetStringEx and the beginning of the following
                // function (it uses nByteCountSize and lpnWideCharCount, so
                // presumably ConvertINetMultiByteToUnicodeEx) are missing.
                // Restore this region from the original source before building.
                for ( i=0; i 4byte conversion */
            LPSTR pTmp = (LPSTR) lpDstStr;
            LPWSTR pw4 = NULL;

            if(pTmp) /* allocate only if we have a lpDstStr */
                pw4 = new WCHAR[nByteCountSize/2];
            if(pw4)
            {
                int i = 0;
                LPWSTR pw4Tmp = pw4;

                // widen every other byte of the output into a temporary array ...
                for(; i < nByteCountSize/2; i++)
                    *pw4Tmp++ = (UCHAR)pTmp[i*2];
                pw4Tmp = pw4;
                // ... and copy the widened characters back over the destination
                for(i = 0; i < nByteCountSize/2; i++)
                    *lpDstStr++ = *pw4Tmp++;
            }
            if(!pw4 && pTmp) /* if lpDstStr and allocate fails bail out */
                hr = E_FAIL;
            delete [] pw4;
        }
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif
    // NOTE(review): lpnWideCharCount is dereferenced without a NULL check here.
    *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);

    return hr ;
}

// ConvertINetUnicodeToMultiByteEx
//
// Converts a Unicode (CP_UCS_2) string to dwEncoding by calling
// ConvertINetStringEx with the character count turned into a byte count.
//
//   lpnWideCharCount  - optional; in: number of WCHARs, or -1 (or NULL) for
//                       a zero terminated string; out: the byte count left
//                       by ConvertINetStringEx divided by sizeof(WCHAR)
//   lpnMultiCharCount - passed through as the destination size
//
// Returns the HRESULT of ConvertINetStringEx.
HRESULT WINAPI ConvertINetUnicodeToMultiByteEx(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount, DWORD dwFlag, WCHAR *lpFallBack)
{
    HRESULT hr ;
    int nByteCountSize=-1;

    if(lpnWideCharCount && *lpnWideCharCount != -1)
        nByteCountSize = *lpnWideCharCount * sizeof(WCHAR);

    hr = ConvertINetStringEx(lpdwMode,CP_UCS_2, dwEncoding, (LPCSTR) lpSrcStr, &nByteCountSize, lpDstStr, lpnMultiCharCount, dwFlag, lpFallBack);

#ifdef UNIX
    if(dwEncoding == 1200 || dwEncoding == 65000 || dwEncoding == 65001)
    {
        nByteCountSize *= 2; // Expand twice as we have 4 byte wchars.
    }
#endif /* UNIX */

    if (lpnWideCharCount)
        *lpnWideCharCount = nByteCountSize / sizeof(WCHAR);

    return hr ;
}

// ConvertINetString
//
// Same as ConvertINetStringEx with no flags and no fallback string.
HRESULT WINAPI ConvertINetString(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, LPCSTR lpSrcStr, LPINT lpnSrcSize, LPSTR lpDstStr, LPINT lpnDstSize)
{
    HRESULT hr ;

    hr = ConvertINetStringEx(lpdwMode,dwSrcEncoding,dwDstEncoding,lpSrcStr,lpnSrcSize,lpDstStr,lpnDstSize, 0, NULL);

    return hr ;
}

// ConvertINetUnicodeToMultiByte
//
// Same as ConvertINetUnicodeToMultiByteEx without a fallback string; bit
// 0x00008000 of *lpdwMode selects MLCONVCHARF_ENTITIZE.
HRESULT WINAPI ConvertINetUnicodeToMultiByte(LPDWORD lpdwMode, DWORD dwEncoding, LPCWSTR lpSrcStr, LPINT lpnWideCharCount, LPSTR lpDstStr, LPINT lpnMultiCharCount)
{
    HRESULT hr ;
    DWORD dwFlag = 0 ;

    if ( lpdwMode )
        dwFlag |= ( *lpdwMode & 0x00008000 ) ?
MLCONVCHARF_ENTITIZE : 0 ; hr = ConvertINetUnicodeToMultiByteEx(lpdwMode,dwEncoding,lpSrcStr,lpnWideCharCount,lpDstStr,lpnMultiCharCount,dwFlag,NULL); return hr ; } HRESULT WINAPI ConvertINetMultiByteToUnicode(LPDWORD lpdwMode, DWORD dwEncoding, LPCSTR lpSrcStr, LPINT lpnMultiCharCount, LPWSTR lpDstStr, LPINT lpnWideCharCount) { HRESULT hr ; hr = ConvertINetMultiByteToUnicodeEx(lpdwMode,dwEncoding,lpSrcStr,lpnMultiCharCount,lpDstStr,lpnWideCharCount, 0, NULL); return hr ; } #define STR_BUFFER_SIZE 2048 HRESULT _ConvertINetStringInIStream(CICharConverter * INetConvert, LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack) { DWORD dwMode, dwModeTemp ; HRESULT hr= S_OK, hrWarnings=S_OK; LPSTR lpstrIn = NULL, lpstrOut = NULL; ULONG nSrcSize, nSrcUsed, nSrcLeft, nDstSize, _nDstSize, nOutBuffSize ; if (lpdwMode) dwMode = *lpdwMode ; // allocate a temp input buffer - 2K in size if ( (lpstrIn = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE )) == NULL ) { hrWarnings = E_OUTOFMEMORY ; goto exit; } if ( (lpstrOut = (LPSTR) LocalAlloc(LPTR, STR_BUFFER_SIZE * 2 )) == NULL ) { hrWarnings = E_OUTOFMEMORY ; goto exit; } nOutBuffSize = STR_BUFFER_SIZE * 2 ; nSrcLeft = 0 ; // In real world, clients uses 28591 as 1252, 28599 as 1254, // To correctly convert those extended characters to Unicode, // We internally replace it with 1252 if (dwDstEncoding == CP_UCS_2 || dwDstEncoding == CP_UCS_2_BE) { if ((dwSrcEncoding == CP_ISO_8859_1) && _IsValidCodePage(CP_1252)) dwSrcEncoding = CP_1252; if ((dwSrcEncoding == CP_ISO_8859_9) && _IsValidCodePage(CP_1254)) dwSrcEncoding = CP_1254; } if ((dwDstEncoding == CP_1252) && (dwSrcEncoding == CP_ISO_8859_1)) { dwSrcEncoding = CP_1252; } if ((dwDstEncoding == CP_1254) && (dwSrcEncoding == CP_ISO_8859_9)) { dwSrcEncoding = CP_1254; } if ( dwSrcEncoding == CP_JP_AUTO ) // Auto Detection for Japan { CIncdJapanese DetectJapan; UINT uiCodePage ; LARGE_INTEGER li; uiCodePage = ( dwMode 
>> 16 ) & 0xffff ; if ( uiCodePage ) dwSrcEncoding = uiCodePage ; else { LISet32(li, 0); hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize); if (S_OK != hr) hrWarnings = hr; hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL); if (S_OK != hr) hrWarnings = hr; dwSrcEncoding = DetectJapan.DetectStringA(lpstrIn, nSrcSize); // if dwSrcEncoding is zero means there is an ambiguity, we don't return // the detected codepage to caller, instead we defaut its codepage internally // to SJIS if (dwSrcEncoding) { dwMode &= 0x0000ffff ; dwMode |= dwSrcEncoding << 16 ; } else dwSrcEncoding = CP_JPN_SJ; } } // bug #43190, we auto-detect again for euc-kr page because IMN ver 1.0 // mislabel an ISO-KR page as a ks_c_5601-1987 page. This is the only way // we can fix that mistake. else if ( dwSrcEncoding == CP_KR_AUTO || dwSrcEncoding == CP_KOR_5601 || dwSrcEncoding == CP_EUC_KR ) { CIncdKorean DetectKorean; UINT uiCodePage ; LARGE_INTEGER li; uiCodePage = ( dwMode >> 16 ) & 0xffff ; if ( uiCodePage ) dwSrcEncoding = uiCodePage ; else { LISet32(li, 0); hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE, &nSrcSize); if (S_OK != hr) hrWarnings = hr; hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL); if (S_OK != hr) hrWarnings = hr; dwSrcEncoding = DetectKorean.DetectStringA(lpstrIn, nSrcSize); if (dwSrcEncoding) { dwMode &= 0x0000ffff ; dwMode |= dwSrcEncoding << 16 ; } else dwSrcEncoding = CP_KOR_5601; } } else if ( dwSrcEncoding == CP_AUTO ) // General Auto Detection for all code pages { INT nScores = 1; DWORD dwSrcEncoding ; DetectEncodingInfo Encoding; UINT uiCodePage ; LARGE_INTEGER li; uiCodePage = ( dwMode >> 16 ) & 0xffff ; if ( uiCodePage ) dwSrcEncoding = uiCodePage ; else { LISet32(li, 0); hr = pstmIn->Read(lpstrIn, STR_BUFFER_SIZE , &nSrcSize); if (S_OK != hr) hrWarnings = hr; hr = pstmIn->Seek(li,STREAM_SEEK_SET, NULL); if (S_OK != hr) hrWarnings = hr; if (DETECTION_MAX_LEN < nSrcSize) nSrcSize = DETECTION_MAX_LEN; if ( S_OK == _DetectInputCodepage(MLDETECTCP_HTML, 1252, lpstrIn, (int 
*)&nSrcSize, &Encoding, &nScores)) { dwSrcEncoding = Encoding.nCodePage; dwMode &= 0x0000ffff ; dwMode |= dwSrcEncoding << 16 ; } else { dwSrcEncoding = CP_ACP; } aEncodingInfo[GetWindowsEncodingIndex(CP_AUTO)].dwCodePage = dwSrcEncoding; } } if ( S_OK == ( hr = INetConvert->ConvertSetup(&dwSrcEncoding,dwDstEncoding ))) { // Loop for ever while(1) { // Read a buffer hr = pstmIn->Read(&lpstrIn[nSrcLeft], STR_BUFFER_SIZE-nSrcLeft, &nSrcSize); if (S_OK != hr) hrWarnings = hr; // Done if (0 == nSrcSize) break; nSrcSize += nSrcLeft ; nSrcUsed = nSrcSize ; dwModeTemp = dwMode ; nDstSize = 0 ; // get the size of output buffer hr = INetConvert->DoCodeConvert(&dwModeTemp, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, NULL, (LPINT)&nDstSize, dwFlag, lpFallBack); if (S_OK != hr) hrWarnings = hr; // Reallocate output buffer if so if ( nDstSize > nOutBuffSize ) { LPSTR psz = (LPSTR) LocalReAlloc(lpstrOut, nDstSize, LMEM_ZEROINIT|LMEM_MOVEABLE); if (psz == NULL) { hrWarnings = E_OUTOFMEMORY ; goto exit; } lpstrOut = psz; nOutBuffSize = nDstSize ; } _nDstSize = nDstSize; // Due to multi_stage conversion, this is the actual size is used nSrcUsed = INetConvert->_nSrcSize ; nSrcLeft = nSrcSize - nSrcUsed ; #if 0 // restore Src size nSrcUsed = nSrcSize ; #endif // do conversion hr = INetConvert->DoCodeConvert(&dwMode, (LPCSTR)lpstrIn, (LPINT)&nSrcUsed, lpstrOut, (LPINT)&_nDstSize, dwFlag, lpFallBack); if (S_OK != hr) hrWarnings = hr; // Write It hr = pstmOut->Write(lpstrOut, nDstSize, &nDstSize); if (S_OK != hr) hrWarnings = hr; if (nSrcLeft ) MoveMemory(lpstrIn, &lpstrIn[nSrcSize-nSrcLeft],nSrcLeft); INetConvert->ConvertCleanUp(); } } if (nSrcLeft ) { LARGE_INTEGER li; LISet32(li, -(LONG)nSrcLeft ); hr = pstmIn->Seek(li,STREAM_SEEK_CUR, NULL); } if (lpdwMode) *lpdwMode = dwMode ; exit : if (lpstrIn) LocalFree(lpstrIn); if (lpstrOut) LocalFree(lpstrOut); // Done return (hr == S_OK) ? 
hrWarnings : hr; } HRESULT WINAPI ConvertINetStringInIStream(LPDWORD lpdwMode, DWORD dwSrcEncoding, DWORD dwDstEncoding, IStream *pstmIn, IStream *pstmOut, DWORD dwFlag, WCHAR *lpFallBack) { HRESULT hr; CICharConverter * INetConvert = new CICharConverter(dwFlag, lpFallBack) ; if (!INetConvert) return E_OUTOFMEMORY; hr = _ConvertINetStringInIStream(INetConvert,lpdwMode,dwSrcEncoding,dwDstEncoding,pstmIn,pstmOut,dwFlag,lpFallBack); delete INetConvert; return hr ; }