windows-nt/Source/XPSP1/NT/shell/ext/mlang/cpdetect.cpp
2020-09-26 16:20:57 +08:00

1020 lines
30 KiB
C++

#include "private.h"
#include "detcbase.h"
#include "codepage.h"
#include "detcjpn.h"
#include "detckrn.h"
#include "fechrcnv.h"
#include "msencode.h"
#include "lcdetect.h"
#include "cpdetect.h"
CCpMRU *g_pCpMRU = NULL;
// Get data from registry and construct cache
HRESULT CCpMRU::Init(void)
{
BOOL bRegKeyReady = TRUE;
HRESULT hr = S_OK;
HKEY hkey;
_pCpMRU = NULL;
// HKCR\\Software\\Microsoft\internet explorer\\international\\CpMRU
if (ERROR_SUCCESS != RegOpenKeyEx(HKEY_CURRENT_USER,
REGSTR_PATH_CPMRU,
0, KEY_READ|KEY_SET_VALUE, &hkey))
{
DWORD dwAction = 0;
if (ERROR_SUCCESS != RegCreateKeyEx(HKEY_CURRENT_USER,
REGSTR_PATH_CPMRU,
0, NULL, REG_OPTION_NON_VOLATILE, KEY_ALL_ACCESS, NULL, &hkey, &dwAction))
{
bRegKeyReady = FALSE;
dwCpMRUEnable = 0;
hr = E_FAIL;
}
}
if (bRegKeyReady)
{
DWORD dwType = REG_DWORD;
DWORD dwSize = sizeof(DWORD);
BOOL bUseDefault = FALSE;
if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, &dwType, (LPBYTE)&dwCpMRUEnable, &dwSize))
{
dwCpMRUEnable = 1;
RegSetValueEx(hkey, REG_KEY_CPMRU_ENABLE, 0, REG_DWORD, (LPBYTE)&dwCpMRUEnable, sizeof(dwCpMRUEnable));
}
// If fail to open registry data or find unreasonable cache parameters, use default settings
if ((ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_NUM, 0, &dwType, (LPBYTE)&dwCpMRUNum, &dwSize)) ||
(ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, &dwType, (LPBYTE)&dwCpMRUInitHits, &dwSize)) ||
(ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, &dwType, (LPBYTE)&dwCpMRUFactor, &dwSize)) ||
(dwCpMRUNum > MAX_CPMRU_NUM) || !dwCpMRUFactor || !dwCpMRUInitHits)
{
dwCpMRUNum = DEFAULT_CPMRU_NUM;
dwCpMRUInitHits = DEFAULT_CPMRU_INIT_HITS;
dwCpMRUFactor = DEFAULT_CPMRU_FACTOR;
bUseDefault = TRUE;
// Store default value in registry
RegSetValueEx(hkey, REG_KEY_CPMRU_NUM, 0, REG_DWORD, (LPBYTE)&dwCpMRUNum, sizeof(dwCpMRUNum));
RegSetValueEx(hkey, REG_KEY_CPMRU_INIT_HITS, 0, REG_DWORD, (LPBYTE)&dwCpMRUInitHits, sizeof(dwCpMRUInitHits));
RegSetValueEx(hkey, REG_KEY_CPMRU_PERCENTAGE_FACTOR, 0, REG_DWORD, (LPBYTE)&dwCpMRUFactor, sizeof(dwCpMRUFactor));
}
dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;
if (!dwSize || NULL == (_pCpMRU = (PCODEPAGE_MRU)LocalAlloc(LPTR, dwSize)))
{
hr = E_FAIL;
dwCpMRUEnable = 0;
}
if (_pCpMRU && !bUseDefault)
{
dwType = REG_BINARY;
if (ERROR_SUCCESS != RegQueryValueEx(hkey, REG_KEY_CPMRU, 0, &dwType, (LPBYTE)_pCpMRU, &dwSize))
{
ZeroMemory(_pCpMRU,sizeof(CODEPAGE_MRU)*dwCpMRUNum);
}
}
RegCloseKey(hkey);
}
return hr;
}
// Update registry's cache value
//
// Writes the in-memory MRU back to the registry when it was modified since
// Init(), then releases the cache.  The cache buffer is released on every
// path: the registry write is best-effort and must not decide whether the
// LocalAlloc'ed memory is freed.
// NOTE(review): relies on _pCpMRU being NULL or a LocalAlloc pointer here,
// i.e. on Init() (or the constructor) having initialized it -- confirm.
CCpMRU::~CCpMRU(void)
{
    if (_pCpMRU)
    {
        if (bCpUpdated)
        {
            HKEY hkey;

            if (RegOpenKeyEx(HKEY_CURRENT_USER,
                             REGSTR_PATH_CPMRU,
                             0, KEY_READ|KEY_SET_VALUE, &hkey) == ERROR_SUCCESS)
            {
                DWORD dwType = REG_BINARY;
                DWORD dwSize = sizeof(CODEPAGE_MRU)*dwCpMRUNum;

                RegSetValueEx(hkey, REG_KEY_CPMRU, 0, dwType, (LPBYTE)_pCpMRU, dwSize);
                RegCloseKey(hkey);
            }
        }

        LocalFree(_pCpMRU);
        _pCpMRU = NULL;
    }

    bCpUpdated = FALSE;
}
// Copy the leading (most frequently hit) MRU entries into a caller buffer.
//   pCpMRU   - [out] buffer with room for *puiCpNum entries; zeroed first
//   puiCpNum - [in]  capacity of pCpMRU, in entries
//              [out] number of entries copied (S_OK) or 0 (S_FALSE)
// Returns:
//   E_INVALIDARG - NULL argument or zero capacity
//   E_FAIL       - no cache, or not even the first entry reaches the
//                  1/dwCpMRUFactor share of all hits
//   S_FALSE      - cache disabled, or fewer than dwCpMRUInitHits hits so far
//   S_OK         - entries copied
// Hit arithmetic is done in 64 bits: counters may approach 0xFFFFFFF0, so
// both the running total and hits*factor can exceed a DWORD.
HRESULT CCpMRU::GetCpMRU(PCODEPAGE_MRU pCpMRU, UINT *puiCpNum)
{
    ULONGLONG ullTotalHits = 0;
    UINT      i;
    HRESULT   hr = E_FAIL;

    if (!pCpMRU || !puiCpNum || !(*puiCpNum))
        return E_INVALIDARG;

    if (!_pCpMRU)
        return hr;

    if (!dwCpMRUEnable || !dwCpMRUInitHits)
    {
        *puiCpNum = 0;
        return S_FALSE;
    }

    ZeroMemory(pCpMRU, sizeof(CODEPAGE_MRU)*(*puiCpNum));

    // Get total hits count; the first entry without hits ends the used part
    for (i=0; i<dwCpMRUNum; i++)
    {
        if (_pCpMRU[i].dwHistoryHits)
            ullTotalHits += _pCpMRU[i].dwHistoryHits;
        else
            break;
    }

    // Not enough hits count to determine the result, keep collecting
    if (ullTotalHits < dwCpMRUInitHits)
    {
        *puiCpNum = 0;
        return S_FALSE;
    }

    for (i=0; i<dwCpMRUNum && i<*puiCpNum; i++)
    {
        // Stop at the first entry whose share of all hits is below
        // 1/dwCpMRUFactor (hits*factor/total < 1  <=>  hits*factor < total)
        if ((ULONGLONG)_pCpMRU[i].dwHistoryHits * dwCpMRUFactor < ullTotalHits)
            break;
    }

    if (i != 0)
    {
        CopyMemory(pCpMRU, _pCpMRU, sizeof(CODEPAGE_MRU)*(i));
        *puiCpNum = i;
        hr = S_OK;
    }

    return hr;
}
// Update code page MRU
//
// Records one more hit for dwEncoding.  The auto-detect pseudo code pages
// are never recorded.  Entries are kept ordered by descending hit count;
// when the list is full and the encoding is new, the last entry is replaced.
void CCpMRU::UpdateCPMRU(DWORD dwEncoding)
{
    UINT iSlot;
    UINT iDest;
    UINT iSmallest;
    UINT k;

    if (!_pCpMRU)
        return;

    if (CP_AUTO == dwEncoding || CP_JP_AUTO == dwEncoding || CP_KR_AUTO == dwEncoding)
        return;

    // Remember that the registry copy is now stale
    bCpUpdated = TRUE;

    // Locate the encoding, or the first unused slot (dwEncoding == 0)
    iSlot = 0;
    while (iSlot < dwCpMRUNum &&
           _pCpMRU[iSlot].dwEncoding &&
           _pCpMRU[iSlot].dwEncoding != dwEncoding)
    {
        iSlot++;
    }

    if (iSlot == dwCpMRUNum)
    {
        // Not found and no free slot: the last entry gives way
        _pCpMRU[dwCpMRUNum-1].dwEncoding = dwEncoding;
        _pCpMRU[dwCpMRUNum-1].dwHistoryHits = 1;
    }
    else
    {
        _pCpMRU[iSlot].dwHistoryHits++;

        if (!_pCpMRU[iSlot].dwEncoding)
        {
            // First hit for a new encoding: claim the free slot
            _pCpMRU[iSlot].dwEncoding = dwEncoding;
        }
        else
        {
            // Known encoding: walk towards the front past every entry that
            // now has strictly fewer hits
            iDest = iSlot;
            while (iDest > 0 &&
                   _pCpMRU[iDest-1].dwHistoryHits < _pCpMRU[iSlot].dwHistoryHits)
            {
                iDest--;
            }

            if (iDest < iSlot)
            {
                // Shift the overtaken entries down one place and drop the
                // promoted entry into the gap
                CODEPAGE_MRU promoted = _pCpMRU[iSlot];
                MoveMemory(&_pCpMRU[iDest+1], &_pCpMRU[iDest], (iSlot-iDest)*sizeof(CODEPAGE_MRU));
                _pCpMRU[iDest].dwEncoding = promoted.dwEncoding;
                _pCpMRU[iDest].dwHistoryHits = promoted.dwHistoryHits;
            }
        }
    }

    // Top counter about to overflow?  Scale the history down.
    if (_pCpMRU[0].dwHistoryHits > 0xFFFFFFF0)
    {
        // Last entry with more than one hit.  The walk always stops:
        // entry 0 is known to be huge from the test above.
        iSmallest = dwCpMRUNum-1;
        while (_pCpMRU[iSmallest].dwHistoryHits <= 1)
            iSmallest--;

        // Divide entries 0..iSmallest by that entry's count, stopping early
        // at an entry without hits.  Entries behind iSmallest are left alone,
        // so one-hit encodings keep their single hit.
        const DWORD dwDivisor = _pCpMRU[iSmallest].dwHistoryHits;
        for (k = 0; k <= iSmallest && _pCpMRU[k].dwHistoryHits; k++)
        {
            _pCpMRU[k].dwHistoryHits /= dwDivisor;
        }
    }
}
// Test whether pIn starts with an HTML entity.
//   pIn - text to examine (need not be NUL terminated)
//   nIn - number of readable bytes at pIn
// Returns the length of the entity including the leading '&' and the
// closing ';', or 0 when pIn does not start with one.
// An entity is '&', then either '#' followed by IS_DIGITA characters (NCR
// entity) or IS_CHARA characters (name entity), then ';', all within the
// first MAX_ENTITY_LENTH bytes.
UINT CheckEntity(LPSTR pIn, UINT nIn)
{
    UINT uiSearchRange;
    UINT i;

    // Nothing readable, or not an entity start
    if (!pIn || !nIn || *pIn != '&')
        return 0;

    uiSearchRange = (nIn > MAX_ENTITY_LENTH)? MAX_ENTITY_LENTH:nIn;

    // Look for the terminating ';'
    for (i=0; i<uiSearchRange; i++)
    {
        if (pIn[i] == ';')
            break;
    }

    if (i >= uiSearchRange)
        return 0;

    // pIn[0] is '&', so the ';' sits at index >= 1 and pIn[1] is readable
    uiSearchRange = i+1;

    if (pIn[1] == '#')
    {
        // NCR Entity
        for (i=2; i<uiSearchRange-1; i++)
        {
            if (!IS_DIGITA(pIn[i]))
                return 0;
        }
    }
    else
    {
        // Name Entity
        for (i=1; i<uiSearchRange-1; i++)
        {
            if (!IS_CHARA(pIn[i]))
                return 0;
        }
    }

    return uiSearchRange;
}
void RemoveHtmlTags (LPSTR pIn, UINT *pnBytes)
//
// Strip HTML markup from the buffer at pIn, in place, and squeeze every run
// of removed material (tags, noise characters, entities) into one blank.
// *pnBytes holds the input length on entry and the resulting length on exit.
//
// Name entities and NCR entities are removed as well.
{
    UINT  cbLeft = *pnBytes;        // unread input bytes
    UINT  cbOut = 0;                // bytes written so far
    LPSTR pDst = pIn;               // write cursor, never ahead of pIn
    BOOL  fPendingSpace = FALSE;    // something was dropped since the last copy

    while (cbLeft > 0)
    {
        if (*pIn == '<' && cbLeft > 1)
        {
            // Throw away everything up to the end of this tag.  This is
            // deliberately approximate: the goal is to keep mostly content
            // text, not to parse HTML.
            ++pIn;
            --cbLeft;

            // Decide which terminator closes the construct; any other tag
            // simply ends at the next '>'
            LPCSTR pszClose = ">";
            DWORD  cchClose = 1;

            if (cbLeft > 1 && *pIn == '%')
            {
                pszClose = "%>";            // <% ... %>
                cchClose = 2;
            }
            else if (cbLeft > 3 && *pIn == '!' && !LowAsciiStrCmpNIA(pIn, "!--", 3))
            {
                pszClose = "-->";           // <!-- ... -->
                cchClose = 3;
            }
            else if (cbLeft > 5 && !LowAsciiStrCmpNIA(pIn, "style", 5))
            {
                pszClose = "</style>";      // <style ...> ... </style>
                cchClose = 8;
            }
            else if (cbLeft > 6 && !LowAsciiStrCmpNIA(pIn, "script", 6))
            {
                pszClose = "</script>";     // <script ...> ... </script>
                cchClose = 9;
            }
            else if (cbLeft > 3 && !LowAsciiStrCmpNIA(pIn, "xml", 3))
            {
                pszClose = "</xml>";        // <xml ...> ... </xml>
                cchClose = 6;
            }

            // Advance to a case-insensitive match of the terminator
            for (;;)
            {
                // Run quickly to the terminator's first character.  That
                // character is never a letter, so an exact compare suffices.
                while (cbLeft > 0 && *pIn != *pszClose)
                {
                    ++pIn;
                    --cbLeft;
                }

                if (cbLeft == 0)
                    break;                  // input exhausted inside the tag

                if (cbLeft > cchClose && !LowAsciiStrCmpNIA(pIn, pszClose, cchClose))
                {
                    pIn += cchClose;
                    cbLeft -= cchClose;
                    fPendingSpace = TRUE;
                    break;
                }

                // False start: step over this character and keep looking
                ++pIn;
                --cbLeft;
            }
            // pIn is now just past the terminator, or at the end of input
        }
        else
        {
            UINT cbEntity = 0;

            if (IsNoise (*pIn))
            {
                // Swallow the whole run of noise; only remember that it was there
                fPendingSpace = TRUE;
                while (cbLeft > 0 && IsNoise (*pIn))
                {
                    ++pIn;
                    --cbLeft;
                }
            }
            else if ((cbEntity = CheckEntity(pIn, cbLeft)) != 0)
            {
                // Entities are dropped the same way noise is
                fPendingSpace = TRUE;
                pIn += cbEntity;
                cbLeft -= cbEntity;
            }
            else
            {
                // Ordinary character: emit one blank for whatever was
                // dropped before it, then the character itself
                if (fPendingSpace)
                {
                    *pDst++ = ' ';
                    ++cbOut;
                    fPendingSpace = FALSE;
                }
                *pDst++ = *pIn++;
                --cbLeft;
                ++cbOut;
            }
        }
    }

    *pnBytes = cbOut;
}
// Signature byte tables used by PatchCodePage(): when text detected as
// srcEncoding contains any byte listed in the table, destEncoding is reported
// instead.
// Keep every table in ascending order -- a binary search over a table is only
// correct when it is sorted.  (szKoi8ru used to list 0xAD and 0xAE after 0xB7,
// which made 0xAD, 0xAE and 0xB7 unreachable for a binary search.)
static unsigned char szKoi8ru[] = {0xA4, 0xA6, 0xA7, 0xAD, 0xAE, 0xB4, 0xB6, 0xB7, 0xBD, 0xBE};
static unsigned char sz28592[] = {0xA1, 0xA6, /*0xAB,*/ 0xAC, 0xB1, 0xB5, 0xB6, 0xB9, /*0xBB, 0xE1*/}; // Need to fine tune this data
// Columns: source encoding, replacement encoding, table length, table
const CPPATCH CpData[] =
{
    {CP_KOI8R, CP_KOI8RU, ARRAYSIZE(szKoi8ru), szKoi8ru},
    {CP_1250, CP_ISO_8859_2, ARRAYSIZE(sz28592), sz28592},
};
// Distinguish similar western encodings
//
//   uiEncoding - encoding reported by detection
//   pStr       - the text that was detected
//   nSize      - number of bytes at pStr
// For every CpData row whose srcEncoding equals uiEncoding, the text is
// searched for a byte above HIGHEST_ASCII that appears in the row's signature
// table; on the first such byte the row's destEncoding is returned.
// Otherwise uiEncoding is returned unchanged.
//
// The signature tables are tiny, so each one is scanned linearly; the result
// therefore does not depend on the table being sorted.  The text is indexed
// from its start for every row rather than advancing a shared pointer, so a
// second matching row could never read past the end of the buffer.
UINT PatchCodePage(UINT uiEncoding, unsigned char *pStr, int nSize)
{
    int iPatch, i, k;

    if (!pStr || nSize <= 0)
        return uiEncoding;

    for (iPatch = 0; iPatch < (int)ARRAYSIZE(CpData); iPatch++)
    {
        if (uiEncoding != CpData[iPatch].srcEncoding)
            continue;

        for (i = 0; i < nSize; i++)
        {
            // Only extended characters can tell the encodings apart
            if (pStr[i] <= HIGHEST_ASCII)
                continue;

            for (k = 0; k < (int)CpData[iPatch].nSize; k++)
            {
                if (pStr[i] == CpData[iPatch].pszUch[k])
                    return CpData[iPatch].destEncoding;
            }
        }
    }

    return uiEncoding;
}
#if 0
// Compiled out.  Reports whether the text contains any of the bytes listed
// in szKOIRU.
const unsigned char szKOIRU[] = {0xA4, 0xA6, 0xA7, 0xB4, 0xB6, 0xB7, 0xAD, 0xAE, 0xBD, 0xBE};
BOOL _IsKOI8RU(unsigned char *pStr, int nSize)
{
    int iByte, iSig;

    // Skip parameter check since this is internal
    for (iByte = 0; iByte < nSize; iByte++)
    {
        const unsigned char ch = pStr[iByte];

        // Cheap range test against the first and last table entries before
        // looking at the table itself
        if (ch < szKOIRU[0] || ch > szKOIRU[ARRAYSIZE(szKOIRU)-1])
            continue;

        for (iSig = 0; iSig < ARRAYSIZE(szKOIRU); iSig++)
        {
            if (ch == szKOIRU[iSig])
                return TRUE;
        }
    }

    return FALSE;
}
#endif
// Detect the code page(s) of a block of text.
//
//   dwFlag            - MLDETECTCP_HTML: detection runs on a private copy of
//                       the text with HTML markup removed
//   uiPrefWinCodepage - preferred code page used to re-rank / override weak
//                       results; CP_AUTO derives it from g_pCpMRU; 0 means
//                       no preference
//   pSrcStr           - text to examine
//   pcSrcSize         - [in] number of bytes at pSrcStr (only read here)
//   lpEncoding        - [out] array receiving the candidates, best first
//   pnScores          - [in]  capacity of lpEncoding, in elements
//                       [out] number of candidates returned
//
// The result is built in stages: a Unicode byte-order-mark check, LCDetect
// (only for texts of at least MIN_TEXT_SIZE bytes), MSEncode
// (CceDetectInputCode), a few code page specific corrections, the preferred
// code page logic and finally a pure-ASCII check.
//
// Returns E_INVALIDARG for missing/empty arguments, E_OUTOFMEMORY when a
// working buffer or stream cannot be created, E_FAIL when nothing could be
// detected (including text that is empty after HTML stripping), S_FALSE for
// a low-confidence result and S_OK otherwise.
HRESULT WINAPI _DetectInputCodepage(DWORD dwFlag, DWORD uiPrefWinCodepage, CHAR *pSrcStr, INT *pcSrcSize, DetectEncodingInfo *lpEncoding, INT *pnScores)
{
    HRESULT hr = S_OK;
    IStream *pstmTmp = NULL;
    BOOL bGuess = FALSE;
    BOOL bLCDetectSucceed = FALSE;
    int nBufSize = 0;
    CHAR *_pSrcStr = pSrcStr;
    UINT nSrcSize;
    int i;
    BOOL bMayBeAscii = FALSE;

    // Check parameters.  Both counts are validated before they are used:
    // a negative value would otherwise turn into a huge unsigned size.
    if (!pSrcStr || !pcSrcSize || (*pcSrcSize <= 0) ||
        !lpEncoding || !pnScores || (*pnScores <= 0))
        return E_INVALIDARG;

    nBufSize = *pnScores;       // capacity of the caller's array
    nSrcSize = *pcSrcSize;

    // Zero out return buffer
    ZeroMemory(lpEncoding, sizeof(DetectEncodingInfo)*(*pnScores));

    // Simple Unicode detection
    if (nSrcSize >= sizeof(WCHAR))
    {
        UINT uiCp = 0;

        if (*((WCHAR *)pSrcStr) == 0xFEFF)      // Unicode
            uiCp = CP_UCS_2;
        else if (*((WCHAR *)pSrcStr) == 0xFFFE) // Unicode Big Endian
            uiCp = CP_UCS_2_BE;

        if (uiCp)
        {
            *pnScores = 1;
            lpEncoding[0].nCodePage = uiCp;
            lpEncoding[0].nConfidence = 100;
            lpEncoding[0].nDocPercent = 100;
            lpEncoding[0].nLangID = -1;
            return S_OK;
        }
    }

    // HTML: take off HTML 'decoration'
    if (dwFlag & MLDETECTCP_HTML)
    {
        // Dup buffer for HTML parser
        if (NULL == (_pSrcStr = (char *)LocalAlloc(LPTR, nSrcSize)))
            return E_OUTOFMEMORY;
        CopyMemory(_pSrcStr, pSrcStr, nSrcSize);
        RemoveHtmlTags (_pSrcStr, &nSrcSize);
    }

    // if blank page/file...
    if (!nSrcSize)
    {
        // Only HTML stripping can empty the text, so the private copy
        // allocated above has to be released before leaving.
        if (dwFlag & MLDETECTCP_HTML)
            LocalFree(_pSrcStr);
        return E_FAIL;
    }

    if (nSrcSize >= MIN_TEXT_SIZE)
    {
        // Initialize LCDetect
        if (NULL == g_pLCDetect)
        {
            EnterCriticalSection(&g_cs);
            if (NULL == g_pLCDetect)
            {
                LCDetect *pLC = new LCDetect ((HMODULE)g_hInst);
                if (pLC)
                {
                    if (pLC->LoadState() == NO_ERROR)
                        g_pLCDetect = pLC;
                    else
                    {
                        delete pLC;
                    }
                }
            }
            LeaveCriticalSection(&g_cs);
        }

        if (g_pLCDetect)
        {
            LCD_Detect(_pSrcStr, nSrcSize, (PLCDScore)lpEncoding, pnScores, NULL);
            if (*pnScores)
            {
                hr = S_OK;
                bLCDetectSucceed = TRUE;
            }
        }
    }

    if (!bLCDetectSucceed)
    {
        *pnScores = 0;
        hr = E_FAIL;
    }

    unsigned int uiCodepage = 0;
    LARGE_INTEGER li = {0,0};
    ULARGE_INTEGER uli = {0,0};

    // MSEncode works on a stream: copy the text into a temporary one
    if (S_OK == CreateStreamOnHGlobal(NULL, TRUE, &pstmTmp))
    {
        ULONG cb = (ULONG) nSrcSize ;
        if (S_OK == pstmTmp->Write(_pSrcStr,cb,&cb))
        {
            uli.LowPart = cb ;
            if (S_OK != pstmTmp->SetSize(uli))
            {
                hr = E_OUTOFMEMORY;
                goto DETECT_DONE;
            }
        }
        else
        {
            goto DETECT_DONE;
        }
    }
    else
    {
        hr = E_OUTOFMEMORY;
        goto DETECT_DONE;
    }

    pstmTmp->Seek(li,STREAM_SEEK_SET, NULL);

    switch (CceDetectInputCode(pstmTmp, grfDetectResolveAmbiguity|grfDetectUseCharMapping|grfDetectIgnoreEof, (EFam) 0, 0, &uiCodepage, &bGuess))
    {
        case cceSuccess:
            if (*pnScores)
            {
                // LCDETECT never detects wrong on Arabic and Russian, don't consider it as DBCS in this case
                // because MSEncode might misdetect Arabic and Russian as Japanese
                // Same goes for Korean JOHAB, MSENCODE doesn't support it at all
                if (((lpEncoding[0].nLangID == LANG_ARABIC )|| (lpEncoding[0].nLangID == LANG_RUSSIAN) || (lpEncoding[0].nCodePage == CP_KOR_JOHAB)) &&
                    (lpEncoding[0].nConfidence >= MIN_ACCEPTABLE_CONFIDENCE)
                    && (lpEncoding[0].nDocPercent >= MIN_DOCPERCENT) && !bGuess)
                    bGuess = TRUE;

                for (i=0;i<*pnScores;i++)
                {
                    if (lpEncoding[i].nCodePage == uiCodepage)
                    {
                        if ((i != 0) && !bGuess)
                        {
                            DetectEncodingInfo TmpEncoding;
                            // Re-arrange language list for MSEncode result
                            MoveMemory(&TmpEncoding, &lpEncoding[0], sizeof(DetectEncodingInfo));
                            MoveMemory(&lpEncoding[0], &lpEncoding[i], sizeof(DetectEncodingInfo));
                            MoveMemory(&lpEncoding[i], &TmpEncoding, sizeof(DetectEncodingInfo));
                        }
                        // Boost confidence for double hits
                        lpEncoding[0].nDocPercent = 100;
                        if (lpEncoding[0].nConfidence < 100)
                            lpEncoding[0].nConfidence = 100;
                        break;
                    }
                }

                // MSEncode result is not among the LCDetect results
                if (i == *pnScores)
                {
                    if (bGuess)
                    {
                        // A guess is only appended, and only if there is room
                        if (nBufSize > *pnScores)
                        {
                            lpEncoding[*pnScores].nCodePage = uiCodepage;
                            lpEncoding[*pnScores].nConfidence = MIN_CONFIDENCE;
                            lpEncoding[*pnScores].nDocPercent = MIN_DOCPERCENT;
                            lpEncoding[*pnScores].nLangID = -1;
                            (*pnScores)++;
                        }
                    }
                    else
                    {
                        // A sure result goes to the front; when the array is
                        // full the last candidate is dropped to make room
                        if (nBufSize > *pnScores)
                        {
                            MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores));
                            (*pnScores)++;
                        }
                        else
                        {
                            MoveMemory(lpEncoding+1, lpEncoding, sizeof(DetectEncodingInfo) * (*pnScores-1));
                        }
                        lpEncoding[0].nCodePage = uiCodepage;
                        lpEncoding[0].nConfidence = 100;
                        lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
                        lpEncoding[0].nLangID = -1;
                    }
                }
            }
            else
            {
                lpEncoding[0].nCodePage = uiCodepage;
                if (bGuess)
                    lpEncoding[0].nConfidence = MIN_CONFIDENCE;
                else
                    lpEncoding[0].nConfidence = 100;
                lpEncoding[0].nDocPercent = MIN_DOCPERCENT;
                lpEncoding[0].nLangID = -1;
                (*pnScores)++;
            }

            //hr = (g_pLCDetect || (nSrcSize < MIN_TEXT_SIZE)) ? S_OK : S_FALSE;
            hr = (!g_pLCDetect || (bGuess && !bLCDetectSucceed )) ? S_FALSE : S_OK;
            break;

        // Currently MSEncode doesn't provide any useful information in 'cceAmbiguousInput' case.
        // We may update our code here if Office team enhance MSEncode for ambiguous input later.
        case cceAmbiguousInput:
            break;

        case cceMayBeAscii:
            bMayBeAscii = TRUE;
            if (!(*pnScores))
            {
                lpEncoding[0].nCodePage = uiCodepage;
                lpEncoding[0].nConfidence = MIN_CONFIDENCE;
                lpEncoding[0].nDocPercent = -1;
                lpEncoding[0].nLangID = -1;
                (*pnScores)++;
            }
            else
            {
                for (i=0;i<*pnScores;i++)
                {
                    if (lpEncoding[i].nCodePage == uiCodepage)
                    {
                        break;
                    }
                }

                if (i == *pnScores)
                {
                    if(nBufSize > *pnScores) // Append MSEncode result to the language list
                    {
                        lpEncoding[i].nCodePage = uiCodepage;
                        lpEncoding[i].nConfidence = -1;
                        lpEncoding[i].nDocPercent = -1;
                        lpEncoding[i].nLangID = -1;
                        (*pnScores)++;
                    }
                }
            }
            hr = bLCDetectSucceed ? S_OK : S_FALSE;
            break;

        // MSEncode failed
        default:
            break;
    }

    // Code page specific corrections
    for (i=0; i<*pnScores; i++)
    {
        switch (lpEncoding[i].nCodePage) {

            case 850:
                if ((*pnScores>1) && (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
                {
                    // Remove 850 from detection result if there is other detection results
                    (*pnScores)--;
                    if (i < *pnScores)
                        MoveMemory(&lpEncoding[i], &lpEncoding[i+1], (*pnScores-i)*sizeof(DetectEncodingInfo));
                    ZeroMemory(&lpEncoding[*pnScores], sizeof(DetectEncodingInfo));
                }
                else
                {
                    // Replace it with 1252 if it is the only result we get
                    lpEncoding[0].nCodePage = CP_1252;
                    lpEncoding[0].nConfidence =
                    lpEncoding[0].nDocPercent = 100;
                    lpEncoding[0].nLangID = LANG_ENGLISH;
                }
                break;

            case CP_1250:
            case CP_KOI8R:
                lpEncoding[i].nCodePage = PatchCodePage(lpEncoding[i].nCodePage, (unsigned char *)_pSrcStr, nSrcSize);
                break;

            default:
                break;
        }
    }

    // If not a high confidence CP_1254 (Windows Turkish),
    // we'll check if there're better detection results, and swap results if needed
    if ((lpEncoding[0].nCodePage == CP_1254) &&
        (*pnScores>1) &&
        ((lpEncoding[0].nDocPercent < 90) || (lpEncoding[1].nCodePage == CP_CHN_GB) ||
         (lpEncoding[1].nCodePage == CP_TWN) || (lpEncoding[1].nCodePage == CP_JPN_SJ) || (lpEncoding[1].nCodePage == CP_KOR_5601)))
    {
        MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo)*(*pnScores-1));
        lpEncoding[*pnScores-1].nCodePage = CP_1254;
        lpEncoding[*pnScores-1].nLangID = LANG_TURKISH;
    }

    // 852 and 1258 text only have one sure detection result
    if (((lpEncoding[0].nCodePage == CP_852) || (lpEncoding[0].nCodePage == CP_1258)) &&
        (*pnScores>1) &&
        (lpEncoding[1].nConfidence >= MIN_CONFIDENCE))
    {
        DetectEncodingInfo tmpDetect = {0};
        MoveMemory(&tmpDetect, &lpEncoding[0], sizeof(DetectEncodingInfo));
        MoveMemory(&lpEncoding[0], &lpEncoding[1], sizeof(DetectEncodingInfo));
        MoveMemory(&lpEncoding[1], &tmpDetect, sizeof(DetectEncodingInfo));
    }

// Considering guessed value from MSENCODE is pretty accurate, we don't change S_OK to S_FALSE
#if 0
    if ((S_OK == hr) && !bLCDetectSucceed && bGuess)
    {
        hr = S_FALSE;
    }
#endif

    if (uiPrefWinCodepage && *pnScores)
    {
        if (uiPrefWinCodepage == CP_AUTO && g_pCpMRU && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))
        {
            UINT uiCpNum = CP_AUTO_MRU_NUM;
            CODEPAGE_MRU CpMRU[CP_AUTO_MRU_NUM];

            if (S_OK == g_pCpMRU->GetCpMRU(CpMRU, &uiCpNum))
            {
                // Prefer the best-ranked detection result that is also in the MRU
                for (i = 0; i<*pnScores; i++)
                {
                    for (UINT j = 0; j < uiCpNum; j++)
                    {
                        if (lpEncoding[i].nCodePage == CpMRU[j].dwEncoding)
                        {
                            uiPrefWinCodepage = CpMRU[j].dwEncoding;
                            break;
                        }
                    }
                    if (uiPrefWinCodepage != CP_AUTO)
                        break;
                }

                // If detection result is not in MRU
                if (uiPrefWinCodepage == CP_AUTO)
                {
                    // Don't take Unicode as preferred encoding if it is not in detection results for following reasons
                    // 1. Unicode is usually tagged with charset or Unicode BOM
                    // 2. Currently, we don't support Unicode detection in all detection engines
                    if (CpMRU[0].dwEncoding != CP_UCS_2 && CpMRU[0].dwEncoding != CP_UCS_2_BE)
                        uiPrefWinCodepage = CpMRU[0].dwEncoding;
                }
            }
        }

        // End preferred CP check if we can't get a valid one
        if (uiPrefWinCodepage == CP_AUTO)
            goto PREFERCPCHECK_DONE;

        for (i = 1; i<*pnScores; i++)
        {
            if (uiPrefWinCodepage == lpEncoding[i].nCodePage)
            {
                DetectEncodingInfo TmpEncoding;
                // Re-arrange language list for preferred codepage
                TmpEncoding = lpEncoding[i];
                MoveMemory(&lpEncoding[1], &lpEncoding[0], sizeof(DetectEncodingInfo)*i);
                lpEncoding[0] = TmpEncoding;
                break;
            }
        }

        // The preferred code page replaces a weak or unreliable top result
        if ((uiPrefWinCodepage != lpEncoding[0].nCodePage) &&
            ((bMayBeAscii && (lpEncoding[0].nConfidence <= MIN_CONFIDENCE)) ||
             (hr != S_OK && nSrcSize >= MIN_TEXT_SIZE) ||
             (nSrcSize < MIN_TEXT_SIZE && !IS_ENCODED_ENCODING(lpEncoding[0].nCodePage))))
        {
            lpEncoding[0].nCodePage = uiPrefWinCodepage;
            lpEncoding[0].nConfidence = -1;
            lpEncoding[0].nDocPercent = -1;
            lpEncoding[0].nLangID = -1;
            *pnScores = 1;
        }
    }

PREFERCPCHECK_DONE:

    // Assume LCDETECT won't misdetect 1252 for files over MIN_TEXT_SIZE
    // and MSENCODE can handle encoded text even they're below MIN_TEXT_SIZE
    if (((nSrcSize < MIN_TEXT_SIZE) && (bMayBeAscii || E_FAIL == hr)) ||
        (lpEncoding[0].nCodePage == CP_1252) ||
        (lpEncoding[0].nCodePage == CP_UTF_8))
    {
        UINT j;

        // Is there any byte above HIGHEST_ASCII at all?
        for (j=0; j < nSrcSize; j++)
            if (*((LPBYTE)(_pSrcStr+j)) > HIGHEST_ASCII)
                break;

        if (j == nSrcSize)
        {
            // Pure ASCII text
            if (lpEncoding[0].nCodePage == CP_1252)
            {
                lpEncoding[0].nCodePage = CP_20127;
            }
            else
            {
                *pnScores = 1;
                lpEncoding[0].nCodePage = CP_20127;
                lpEncoding[0].nConfidence =
                lpEncoding[0].nDocPercent = 100;
                lpEncoding[0].nLangID = LANG_ENGLISH;
                hr = S_OK;
            }
        }
    }

    // UTF-8 doesn't really have distinctive signatures,
    // if text amount is small, we won't return low confidence UTF-8 detection result.
    if (hr == S_FALSE && IS_ENCODED_ENCODING(lpEncoding[0].nCodePage) &&
        !((nSrcSize < MIN_TEXT_SIZE) && (lpEncoding[0].nCodePage == CP_UTF_8)))
        hr = S_OK;

DETECT_DONE:

    // Release the HTML-stripped private copy and the temporary stream
    if ((dwFlag & MLDETECTCP_HTML) && _pSrcStr)
        LocalFree(_pSrcStr);

    if (pstmTmp)
    {
        pstmTmp->Release();
    }

    return hr ;
}
// Stream flavour of _DetectInputCodepage: the stream is read from its start
// into a temporary buffer, which is then handed to _DetectInputCodepage.
// Only the low 32 bits of the stream size are used.
// Returns E_INVALIDARG for a NULL or empty stream.  Otherwise the detection
// result is returned, except that an S_OK detection reports the most recent
// non-S_OK status of the stream operations (or E_OUTOFMEMORY) instead.
HRESULT WINAPI _DetectCodepageInIStream(DWORD dwFlag, DWORD uiPrefWinCodepage, IStream *pstmIn, DetectEncodingInfo *lpEncoding, INT *pnScores)
{
    if (NULL == pstmIn)
        return E_INVALIDARG;

    LARGE_INTEGER  liZero = { 0, 0 };
    ULARGE_INTEGER uliEnd = { 0, 0 };
    HRESULT        hrWarn = S_OK;       // latest non-S_OK stream status

    // Seeking to the end yields the stream size
    HRESULT hrStep = pstmIn->Seek(liZero, STREAM_SEEK_END, &uliEnd);
    if (S_OK != hrStep)
        hrWarn = hrStep;

    if (0 == uliEnd.LowPart && 0 == uliEnd.HighPart)
        return E_INVALIDARG;

    ULONG cbStream = uliEnd.LowPart;

    // Temporary input buffer
    LPSTR pszBuf = (LPSTR) LocalAlloc(LPTR, cbStream);
    if (NULL == pszBuf)
        return (S_OK == hrStep) ? E_OUTOFMEMORY : hrStep;

    // Back to the beginning, then pull the contents in
    hrStep = pstmIn->Seek(liZero, STREAM_SEEK_SET, NULL);
    if (S_OK != hrStep)
        hrWarn = hrStep;

    hrStep = pstmIn->Read(pszBuf, cbStream, &cbStream);
    if (S_OK != hrStep)
        hrWarn = hrStep;

    INT cbUsed = (INT) cbStream;
    hrStep = _DetectInputCodepage(dwFlag, uiPrefWinCodepage, pszBuf, &cbUsed, lpEncoding, pnScores);

    LocalFree(pszBuf);

    return (S_OK == hrStep) ? hrWarn : hrStep;
}