/*
 * XML support functions
 * Copyright (C) 2000 Microsoft Corporation
 */

#include "precomp.h"

// NOTE(review): this file arrived with its line breaks collapsed and with the
// text between the "<?xml" literal in FIsXmlA and the start of the
// FDetectXmlEncodingA body stripped out.  The remainder of FIsXmlA, all of
// FIsXmlW, and the signature/locals of FDetectXmlEncodingA below were rebuilt
// from their call sites and from the parallel W variant -- confirm them
// against the pristine source before merging.


// Returns TRUE when wch is one of the four characters treated as XML
// whitespace here: space, tab (0x9), line feed (0xA), carriage return (0xD).

BOOL FIsXmlWhitespaceW(WCHAR wch)
{
    return((wch == L' ') || (wch == L'\x9') || (wch == L'\xA') || (wch == L'\xD'));
}


// Narrow-character form of FIsXmlWhitespaceW.  The char goes through BYTE
// before being widened (presumably so that bytes >= 0x80 are not sign
// extended into unrelated WCHAR values).

BOOL FIsXmlWhitespaceA(char ch)
{
    return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
}


// Returns TRUE when the buffer starts with the five characters "<?xml".
//
//   rgch   buffer to inspect (not required to be NUL terminated)
//   cch    number of chars available in rgch
//
// NOTE(review): only the opening "if (memcmp(rgch, " of this function
// survived; the literal, the comparison length and the length guard are
// reconstructed.  Callers in this file test the whitespace at rgch[5]
// themselves.

BOOL FIsXmlA(LPCSTR rgch, UINT cch)
{
    if (cch < 5)
    {
        // Too short to hold the prefix

        return(FALSE);
    }

    if (memcmp(rgch, "<?xml", 5) != 0)
    {
        return(FALSE);
    }

    return(TRUE);
}


// Wide-character form of FIsXmlA.
//
// NOTE(review): this definition was entirely missing from the received text
// and is reconstructed from its use in FDetectXmlEncodingW.

BOOL FIsXmlW(LPCWSTR rgch, UINT cch)
{
    if (cch < 5)
    {
        // Too short to hold the prefix

        return(FALSE);
    }

    // memcmp counts bytes, so scale the character count by sizeof(WCHAR)

    if (memcmp(rgch, L"<?xml", 5 * sizeof(WCHAR)) != 0)
    {
        return(FALSE);
    }

    return(TRUE);
}


// Scans the XML declaration at the start of a single-byte (MBCS) buffer for
// an encoding='...' attribute and maps the name to a code page.
//
//   rgch   start of the file contents
//   cch    number of chars available in rgch
//   pcp    receives the code page; written by FLookupCodepageNameA, so it
//          may have been modified even when FALSE is returned
//
// Returns the result of FValidateCodepage(hwndNP, *pcp) when an encoding name
// is found and recognized, and FALSE in every other case (buffer shorter than
// 20 chars, no "<?xml" + whitespace prefix, no encoding attribute within the
// first 4K, unrecognized name, or a UTF-16 code page).
//
// NOTE(review): the signature and the three local declarations were missing
// from the received text and mirror FDetectXmlEncodingW.

BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
{
    LPCSTR pchMax;
    LPCSTR pch;
    char chQuote;

    // Check for file beginning with an XML declaration ("<?xml" ... "?>")

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlA(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    if (!FIsXmlWhitespaceA(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    chQuote = '\0';

    for (;;)
    {
        LPCSTR pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (FIsXmlWhitespaceA(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == '=')
        {
            pch++;
            continue;
        }

        if ((*pch == '\'') || (*pch == '"'))
        {
            if (*pch == chQuote)
            {
                chQuote = '\0';
            }

            else
            {
                chQuote = *pch;
            }

            pch++;
            continue;
        }

        if (chQuote != '\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML

            break;
        }

        if ((pch[0] == '?') && (pch[1] == '>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            // *pcp = CP_UTF8;
            // return(TRUE);

            break;
        }

        // Collect an attribute name: runs up to '=', '?', whitespace or the
        // end of the scan window.

        pchToken = pch;

        while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // The only way to get an empty token here is a '?' that is not
            // followed by '>'.  Nothing above would ever advance past it, so
            // continuing would loop forever.  Treat it as not XML.

            break;
        }

        if (pch != (pchToken + 8))
        {
            continue;
        }

        if (memcmp(pchToken, "encoding", 8) != 0)
        {
            continue;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != '='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Not XML

            break;
        }

        if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}


// Wide-character form of FDetectXmlEncodingA: scans the XML declaration at
// the start of a WCHAR buffer for an encoding='...' attribute and maps the
// name to a code page.
//
//   rgch   start of the file contents
//   cch    number of WCHARs available in rgch
//   pcp    receives the code page; written by FLookupCodepageNameW, so it
//          may have been modified even when FALSE is returned
//
// Returns the result of FValidateCodepage(hwndNP, *pcp) when an encoding name
// is found and recognized, and FALSE otherwise.  Unlike the A variant, the
// UTF-16 code pages are not rejected here (that check is compiled out).

BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
{
    const WCHAR *pchMax;
    const WCHAR *pch;
    WCHAR chQuote;

    // NOTE(review): the next paragraph reads like it was carried over from
    // the MBCS variant; this function scans WCHAR text.  Confirm and reword.

    // XML files encoded in UTF-16 are required to have a BOM which if present
    // would already have been detected.  This means that if this file is XML
    // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
    // encoding of some form.  We check for ASCII compatible encodings only
    // which includes everything we probably care about but excludes EBCDIC.

    // Check for file beginning with an XML declaration ("<?xml" ... "?>")

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlW(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    if (!FIsXmlWhitespaceW(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    chQuote = L'\0';

    for (;;)
    {
        const WCHAR *pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (FIsXmlWhitespaceW(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == L'=')
        {
            pch++;
            continue;
        }

        if ((*pch == L'\'') || (*pch == L'"'))
        {
            if (*pch == chQuote)
            {
                chQuote = L'\0';
            }

            else
            {
                chQuote = *pch;
            }

            pch++;
            continue;
        }

        if (chQuote != L'\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML

            break;
        }

        if ((pch[0] == L'?') && (pch[1] == L'>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            // *pcp = CP_UTF8;
            // return(TRUE);

            break;
        }

        // Collect an attribute name: runs up to '=', '?', whitespace or the
        // end of the scan window.

        pchToken = pch;

        while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // The only way to get an empty token here is a '?' that is not
            // followed by '>'.  Nothing above would ever advance past it, so
            // continuing would loop forever.  Treat it as not XML.

            break;
        }

        if (pch != (pchToken + 8))
        {
            continue;
        }

        // memcmp counts bytes: compare all 8 WCHARs, not just the first 8
        // bytes (which would only cover "enco").

        if (memcmp(pchToken, L"encoding", 8 * sizeof(WCHAR)) != 0)
        {
            continue;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != L'='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Not XML

            break;
        }

        if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

#if 0
        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }
#endif

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}