451 lines
8.4 KiB
C
451 lines
8.4 KiB
C
/*
 * XML support functions
 *
 * Copyright (C) 2000 Microsoft Corporation
 */
|
|
|
|
#include "precomp.h"
|
|
|
|
|
|
/*
 * FIsXmlWhitespaceW
 *
 * Tells whether a wide character is one of the four characters this file
 * treats as XML white space: space, tab (0x9), line feed (0xA) or carriage
 * return (0xD).
 *
 * wch - character to classify
 *
 * Returns non-zero for one of those four characters, zero otherwise.
 */
BOOL FIsXmlWhitespaceW(WCHAR wch)
{
    // Horizontal white space
    BOOL fBlank = (wch == L' ') || (wch == L'\x9');

    // Line terminators
    BOOL fLineBreak = (wch == L'\xA') || (wch == L'\xD');

    return(fBlank || fLineBreak);
}
|
|
|
|
|
|
BOOL FIsXmlWhitespaceA(char ch)
|
|
{
|
|
return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
|
|
}
|
|
|
|
|
|
/*
 * FIsXmlA
 *
 * Tells whether a narrow-character buffer starts with the five characters
 * "<?xml" (case sensitive).
 *
 * rgch - start of the buffer; need not be NUL terminated
 * cch  - number of chars available at rgch
 *
 * Returns TRUE when the buffer holds at least five chars and they are
 * exactly "<?xml"; FALSE otherwise.
 */
BOOL FIsXmlA(LPCSTR rgch, UINT cch)
{
    // Length of the "<?xml" prefix in chars
    const UINT cchPrefix = 5;

    if (cch < cchPrefix)
    {
        // Too short to hold the prefix.  Comparing anyway would read past
        // the end of the caller's buffer.

        return(FALSE);
    }

    if (memcmp(rgch, "<?xml", cchPrefix) != 0)
    {
        // Not XML

        return(FALSE);
    }

    return(TRUE);
}
|
|
|
|
|
|
/*
 * FIsXmlW
 *
 * Tells whether a wide-character buffer starts with the five characters
 * L"<?xml" (case sensitive).
 *
 * rgwch - start of the buffer; need not be NUL terminated
 * cch   - number of WCHARs available at rgwch
 *
 * Returns TRUE when the buffer holds at least five WCHARs and they are
 * exactly L"<?xml"; FALSE otherwise.
 */
BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
{
    // Length of the L"<?xml" prefix in WCHARs
    const UINT cchPrefix = 5;

    if (cch < cchPrefix)
    {
        // Too short to hold the prefix.  Comparing anyway would read past
        // the end of the caller's buffer.

        return(FALSE);
    }

    // memcmp counts bytes, so scale the character count by the WCHAR size

    if (memcmp(rgwch, L"<?xml", cchPrefix * sizeof(WCHAR)) != 0)
    {
        // Not XML

        return(FALSE);
    }

    return(TRUE);
}
|
|
|
|
|
|
/*
 * FDetectXmlEncodingA
 *
 * Scans the start of a narrow-character buffer for an XML declaration of
 * the form
 *
 *     <?xml ... encoding='name' ... ?>
 *
 * and, when an encoding name is found, maps it to a code page.
 *
 * rgch - start of the buffer; need not be NUL terminated
 * cch  - number of chars available at rgch
 * pcp  - handed to FLookupCodepageNameA, which is expected to store the
 *        code page there.  It may have been written even when this
 *        function returns FALSE (e.g. a UTF-16 name is looked up and then
 *        rejected), so only rely on it after a TRUE return.
 *
 * Returns the result of FValidateCodepage for the named code page.
 * Returns FALSE when
 *   - the buffer is shorter than 20 chars,
 *   - it does not start with "<?xml" followed by white space,
 *   - no well formed encoding attribute is found before "?>" or within
 *     the first 4096 chars,
 *   - the encoding name is not recognized, or
 *   - the encoding name maps to CP_UTF16 or CP_UTF16BE.
 */
BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
{
    LPCSTR pchMax;
    LPCSTR pch;
    char chQuote;

    // XML files encoded in UTF-16 are required to have a BOM which if present
    // would already have been detected.  This means that if this file is XML
    // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
    // encoding of some form.  We check for ASCII compatible encodings only
    // which includes everything we probably care about but excludes EBCDIC.

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlA(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // "<?xml" must be followed by white space; otherwise this is some
    // other processing instruction whose target merely starts with "xml".

    if (!FIsXmlWhitespaceA(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    chQuote = '\0';

    for (;;)
    {
        LPCSTR pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (chQuote != '\0')
        {
            // We are within a quoted string.  Skip everything until the
            // matching closing quote.  The other kind of quote is ordinary
            // content here and must not restart quote tracking.

            if (*pch == chQuote)
            {
                chQuote = '\0';
            }

            pch++;
            continue;
        }

        if (FIsXmlWhitespaceA(*pch) || (*pch == '='))
        {
            // Separators between attribute names and values

            pch++;
            continue;
        }

        if ((*pch == '\'') || (*pch == '"'))
        {
            // Start of a quoted value that does not belong to "encoding"

            chQuote = *pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML (no room left for the closing "?>")

            break;
        }

        if ((pch[0] == '?') && (pch[1] == '>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            break;
        }

        // Collect an attribute name: everything up to '=', '?' or white space

        pchToken = pch;

        while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  White space and '=' were handled above,
            // so this can only be a '?' that is not followed by '>'.  Treat
            // the declaration as malformed; continuing from here would
            // revisit the same character forever.

            break;
        }

        if ((pch - pchToken) != 8)
        {
            // Name can't be "encoding"

            continue;
        }

        if (memcmp(pchToken, "encoding", 8) != 0)
        {
            continue;
        }

        // Found "encoding".  Expect [white space] '=' [white space] quote.

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != '='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML (closing quote is missing)

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Not XML (encoding name is empty)

            break;
        }

        if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}
|
|
|
|
|
|
/*
 * FDetectXmlEncodingW
 *
 * Wide-character counterpart of FDetectXmlEncodingA.  Scans the start of a
 * WCHAR buffer for an XML declaration of the form
 *
 *     <?xml ... encoding='name' ... ?>
 *
 * and, when an encoding name is found, maps it to a code page.
 *
 * rgch - start of the buffer; need not be NUL terminated
 * cch  - number of WCHARs available at rgch
 * pcp  - handed to FLookupCodepageNameW, which is expected to store the
 *        code page there.  Only rely on it after a TRUE return.
 *
 * Returns the result of FValidateCodepage for the named code page.
 * Returns FALSE when
 *   - the buffer is shorter than 20 WCHARs,
 *   - it does not start with L"<?xml" followed by white space,
 *   - no well formed encoding attribute is found before "?>" or within
 *     the first 4096 WCHARs, or
 *   - the encoding name is not recognized.
 *
 * Unlike the narrow variant, a name that maps to CP_UTF16 or CP_UTF16BE is
 * not rejected here.
 */
BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
{
    const WCHAR *pchMax;
    const WCHAR *pch;
    WCHAR chQuote;

    // The text has already been converted to WCHARs by the time it gets
    // here, so only the declaration syntax is examined.
    // NOTE(review): which encodings the caller may have decoded from is not
    // visible in this function - confirm against the caller.

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlW(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // "<?xml" must be followed by white space; otherwise this is some
    // other processing instruction whose target merely starts with "xml".

    if (!FIsXmlWhitespaceW(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    chQuote = L'\0';

    for (;;)
    {
        const WCHAR *pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (chQuote != L'\0')
        {
            // We are within a quoted string.  Skip everything until the
            // matching closing quote.  The other kind of quote is ordinary
            // content here and must not restart quote tracking.

            if (*pch == chQuote)
            {
                chQuote = L'\0';
            }

            pch++;
            continue;
        }

        if (FIsXmlWhitespaceW(*pch) || (*pch == L'='))
        {
            // Separators between attribute names and values

            pch++;
            continue;
        }

        if ((*pch == L'\'') || (*pch == L'"'))
        {
            // Start of a quoted value that does not belong to "encoding"

            chQuote = *pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML (no room left for the closing "?>")

            break;
        }

        if ((pch[0] == L'?') && (pch[1] == L'>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            break;
        }

        // Collect an attribute name: everything up to '=', '?' or white space

        pchToken = pch;

        while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  White space and '=' were handled above,
            // so this can only be a '?' that is not followed by '>'.  Treat
            // the declaration as malformed; continuing from here would
            // revisit the same character forever.

            break;
        }

        if ((pch - pchToken) != 8)
        {
            // Name can't be "encoding"

            continue;
        }

        // memcmp counts bytes, so the 8 characters must be scaled by the
        // WCHAR size; comparing just 8 bytes would only check "enco".

        if (memcmp(pchToken, L"encoding", 8 * sizeof(WCHAR)) != 0)
        {
            continue;
        }

        // Found "encoding".  Expect [white space] '=' [white space] quote.

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != L'='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML (closing quote is missing)

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Not XML (encoding name is empty)

            break;
        }

        if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        // CP_UTF16 and CP_UTF16BE are deliberately not filtered out here;
        // the narrow variant rejects them because its input is known to be
        // MBCS, which is not the case for wide text.

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}
|