windows-nt/Source/XPSP1/NT/shell/osshell/accesory/newpad/npxml.c
2020-09-26 16:20:57 +08:00

451 lines
8.4 KiB
C

/*
* XML support functions
* Copyright (C) 2000 Microsoft Corporation
*/
#include "precomp.h"
BOOL FIsXmlWhitespaceW(WCHAR wch)
{
return((wch == L' ') || (wch == L'\x9') || (wch == L'\xA') || (wch == L'\xD'));
}
BOOL FIsXmlWhitespaceA(char ch)
{
return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
}
BOOL FIsXmlA(LPCSTR rgch, UINT cch)
{
if (memcmp(rgch, "<?xml", 5) != 0)
{
// Not XML
return(FALSE);
}
return(TRUE);
UNREFERENCED_PARAMETER( cch );
}
BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
{
if (memcmp(rgwch, L"<?xml", 5 * sizeof(WCHAR)) != 0)
{
// Not XML
return(FALSE);
}
return(TRUE);
UNREFERENCED_PARAMETER( cch );
}
BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
{
LPCSTR pchMax;
LPCSTR pch;
char chQuote;
// XML files encoded in UTF-16 are required to have a BOM which if present
// would already have been detected. This means that if this file is XML
// it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
// encoding of some form. We check for ASCII compatible encodings only
// which includes everything we probably care about but excludes EBCDIC.
// Check for file begining with <?xml ... encoding='...' ... ?>
if (cch < 20)
{
// File is too small
return(FALSE);
}
if (!FIsXmlA(rgch, cch))
{
// Not XML
return(FALSE);
}
// Don't scan more than 4K looking for encoding even if it is valid XML
cch = __min(cch, 4096);
pchMax = rgch + cch;
pch = rgch + 5;
if (!FIsXmlWhitespaceA(*pch))
{
// Not XML
return(FALSE);
}
pch++;
chQuote = '\0';
for (;;)
{
LPCSTR pchToken;
if (pch == pchMax)
{
// Not XML
break;
}
if (FIsXmlWhitespaceA(*pch))
{
pch++;
continue;
}
if (*pch == '=')
{
pch++;
continue;
}
if ((*pch == '\'') || (*pch == '"'))
{
if (*pch == chQuote)
{
chQuote = '\0';
}
else
{
chQuote = *pch;
}
pch++;
continue;
}
if (chQuote != '\0')
{
// We are within a quoted string. Skip everything until closing quote.
pch++;
continue;
}
if ((pch + 2) > pchMax)
{
// Not XML
break;
}
if ((pch[0] == '?') && (pch[1] == '>'))
{
// This looks like XML. At this point if we don't find an encoding
// specification we could assume UTF-8. We don't because there are
// malformed XML documents and assuming UTF-8 might affect Notepad
// compatibility. This may be fine but we put it off for now.
// *pcp = CP_UTF8;
// return(TRUE);
break;
}
pchToken = pch;
while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
{
pch++;
}
if (pch != (pchToken + 8))
{
continue;
}
if (memcmp(pchToken, "encoding", 8) != 0)
{
continue;
}
while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
{
pch++;
}
if ((pch == pchMax) || (*pch++ != '='))
{
// Not XML
break;
}
while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
{
pch++;
}
if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
{
// Not XML
break;
}
chQuote = *pch++;
pchToken = pch;
while ((pch < pchMax) && (*pch != chQuote))
{
pch++;
}
if (pch == pchMax)
{
// Not XML
break;
}
// We have an XML encoding declaration from pchToken to (pch - 1)
if (pch == pchToken)
{
// Not XML
break;
}
if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
{
// Encoding is not recognized
break;
}
if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
{
// These are bogus since we know the file is MBCS
break;
}
return(FValidateCodepage(hwndNP, *pcp));
}
return(FALSE);
}
BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
{
const WCHAR *pchMax;
const WCHAR *pch;
WCHAR chQuote;
// XML files encoded in UTF-16 are required to have a BOM which if present
// would already have been detected. This means that if this file is XML
// it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
// encoding of some form. We check for ASCII compatible encodings only
// which includes everything we probably care about but excludes EBCDIC.
// Check for file begining with <?xml ... encoding='...' ... ?>
if (cch < 20)
{
// File is too small
return(FALSE);
}
if (!FIsXmlW(rgch, cch))
{
// Not XML
return(FALSE);
}
// Don't scan more than 4K looking for encoding even if it is valid XML
cch = __min(cch, 4096);
pchMax = rgch + cch;
pch = rgch + 5;
if (!FIsXmlWhitespaceW(*pch))
{
// Not XML
return(FALSE);
}
pch++;
chQuote = L'\0';
for (;;)
{
const WCHAR *pchToken;
if (pch == pchMax)
{
// Not XML
break;
}
if (FIsXmlWhitespaceW(*pch))
{
pch++;
continue;
}
if (*pch == L'=')
{
pch++;
continue;
}
if ((*pch == L'\'') || (*pch == L'"'))
{
if (*pch == chQuote)
{
chQuote = L'\0';
}
else
{
chQuote = *pch;
}
pch++;
continue;
}
if (chQuote != L'\0')
{
// We are within a quoted string. Skip everything until closing quote.
pch++;
continue;
}
if ((pch + 2) > pchMax)
{
// Not XML
break;
}
if ((pch[0] == L'?') && (pch[1] == L'>'))
{
// This looks like XML. At this point if we don't find an encoding
// specification we could assume UTF-8. We don't because there are
// malformed XML documents and assuming UTF-8 might affect Notepad
// compatibility. This may be fine but we put it off for now.
// *pcp = CP_UTF8;
// return(TRUE);
break;
}
pchToken = pch;
while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
{
pch++;
}
if (pch != (pchToken + 8))
{
continue;
}
if (memcmp(pchToken, L"encoding", 8) != 0)
{
continue;
}
while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
{
pch++;
}
if ((pch == pchMax) || (*pch++ != L'='))
{
// Not XML
break;
}
while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
{
pch++;
}
if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
{
// Not XML
break;
}
chQuote = *pch++;
pchToken = pch;
while ((pch < pchMax) && (*pch != chQuote))
{
pch++;
}
if (pch == pchMax)
{
// Not XML
break;
}
// We have an XML encoding declaration from pchToken to (pch - 1)
if (pch == pchToken)
{
// Not XML
break;
}
if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
{
// Encoding is not recognized
break;
}
#if 0
if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
{
// These are bogus since we know the file is MBCS
break;
}
#endif
return(FValidateCodepage(hwndNP, *pcp));
}
return(FALSE);
}