windows-nt/Source/XPSP1/NT/shell/osshell/accesory/newpad/npxml.c

/*
 * XML support functions
 *  Copyright (C) 2000 Microsoft Corporation
 */

#include "precomp.h"


//
// FIsXmlWhitespaceW
//
// Returns TRUE when wch is one of the four characters this file treats as
// XML whitespace: space, tab (0x9), line feed (0xA) or carriage return (0xD).
// Returns FALSE for every other character.
//
BOOL FIsXmlWhitespaceW(WCHAR wch)
{
    switch (wch)
    {
        case L' ' :
        case L'\x9' :
        case L'\xA' :
        case L'\xD' :
            return(TRUE);

        default :
            break;
    }

    return(FALSE);
}


BOOL FIsXmlWhitespaceA(char ch)
{
    return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
}


//
// FIsXmlA
//
// Returns TRUE when the buffer begins with the five bytes "<?xml".
//
//   rgch   Start of the buffer to test (not required to be NUL terminated)
//   cch    Number of chars available at rgch
//
// A buffer shorter than the signature can never match, so it is rejected
// up front instead of letting memcmp read past the end of the data.
//
BOOL FIsXmlA(LPCSTR rgch, UINT cch)
{
    static const char szXmlDecl[] = "<?xml";
    const UINT cchXmlDecl = (UINT) (sizeof(szXmlDecl) - 1);

    if (cch < cchXmlDecl)
    {
        // Too short to hold the signature

        return(FALSE);
    }

    if (memcmp(rgch, szXmlDecl, cchXmlDecl) != 0)
    {
        // Not XML

        return(FALSE);
    }

    return(TRUE);
}


//
// FIsXmlW
//
// Returns TRUE when the buffer begins with the five wide characters
// L"<?xml".
//
//   rgwch  Start of the buffer to test (not required to be NUL terminated)
//   cch    Number of WCHARs available at rgwch
//
// A buffer shorter than the signature can never match, so it is rejected
// up front instead of letting memcmp read past the end of the data.
//
BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
{
    static const WCHAR wszXmlDecl[] = L"<?xml";
    const UINT cchXmlDecl = (UINT) (sizeof(wszXmlDecl) / sizeof(wszXmlDecl[0])) - 1;

    if (cch < cchXmlDecl)
    {
        // Too short to hold the signature

        return(FALSE);
    }

    // memcmp counts bytes, not characters

    if (memcmp(rgwch, wszXmlDecl, cchXmlDecl * sizeof(WCHAR)) != 0)
    {
        // Not XML

        return(FALSE);
    }

    return(TRUE);
}


//
// FDetectXmlEncodingA
//
// Scans the start of a byte buffer for an XML declaration of the form
// <?xml ... encoding='...' ... ?> and maps the declared encoding name to a
// code page.
//
//   rgch   Start of the file data
//   cch    Number of chars available at rgch
//   pcp    Passed to FLookupCodepageNameA to receive the code page for the
//          declared encoding name
//
// Returns FALSE when the buffer is shorter than 20 chars, does not start
// with "<?xml" followed by whitespace, has no usable encoding attribute
// within the first 4096 chars, names an encoding FLookupCodepageNameA does
// not recognize, or names UTF-16.  Otherwise returns the result of
// FValidateCodepage(hwndNP, *pcp).
//
// NOTE(review): *pcp may already have been written when FALSE is returned
// after a successful lookup (UTF-16 rejection or failed validation) --
// confirm callers ignore *pcp on failure.
//
BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
{
    LPCSTR pchMax;
    LPCSTR pch;
    char chQuote;

    // XML files encoded in UTF-16 are required to have a BOM which if present
    // would already have been detected.  This means that if this file is XML
    // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
    // encoding of some form.  We check for ASCII compatible encodings only
    // which includes everything we probably care about but excludes EBCDIC.

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlA(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // "<?xml" must be followed by whitespace (this rejects e.g. "<?xmlfoo").
    // The read is in bounds because cch >= 20.

    if (!FIsXmlWhitespaceA(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    // chQuote is the quote character of the value we are currently inside,
    // or '\0' when we are not inside a quoted value.

    chQuote = '\0';

    for (;;)
    {
        LPCSTR pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (FIsXmlWhitespaceA(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == '=')
        {
            pch++;
            continue;
        }

        if ((*pch == '\'') || (*pch == '"'))
        {
            if (chQuote == '\0')
            {
                // Opening quote

                chQuote = *pch;
            }

            else if (*pch == chQuote)
            {
                // Matching closing quote

                chQuote = '\0';
            }

            // Otherwise this is the other quote character inside a quoted
            // value.  It is literal text and must not change the quote state.

            pch++;
            continue;
        }

        if (chQuote != '\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML

            break;
        }

        if ((pch[0] == '?') && (pch[1] == '>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            break;
        }

        // Collect an attribute name: everything up to '=', '?' or whitespace

        pchToken = pch;

        while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  Whitespace, '=' and quotes were handled
            // above so this can only be a '?' that does not start "?>".
            // That is malformed, and looping again without advancing would
            // never terminate.

            // Not XML

            break;
        }

        if (pch != (pchToken + 8))
        {
            // Wrong length to be "encoding"

            continue;
        }

        if (memcmp(pchToken, "encoding", 8) != 0)
        {
            continue;
        }

        // Found the encoding attribute.  Expect [ws] '=' [ws] quote name quote

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != '='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Empty encoding name.  Not XML

            break;
        }

        if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}


//
// FDetectXmlEncodingW
//
// Wide character variant of FDetectXmlEncodingA.  Scans the start of a
// WCHAR buffer for an XML declaration of the form
// <?xml ... encoding='...' ... ?> and maps the declared encoding name to a
// code page.
//
//   rgch   Start of the file data as wide characters
//   cch    Number of WCHARs available at rgch
//   pcp    Passed to FLookupCodepageNameW to receive the code page for the
//          declared encoding name
//
// Returns FALSE when the buffer is shorter than 20 WCHARs, does not start
// with L"<?xml" followed by whitespace, has no usable encoding attribute
// within the first 4096 WCHARs, or names an encoding FLookupCodepageNameW
// does not recognize.  Otherwise returns the result of
// FValidateCodepage(hwndNP, *pcp).
//
// NOTE(review): *pcp may already have been written when FALSE is returned
// after a successful lookup (failed validation) -- confirm callers ignore
// *pcp on failure.
//
BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
{
    const WCHAR *pchMax;
    const WCHAR *pch;
    WCHAR chQuote;

    // Unlike the A variant the data here is already wide characters, so the
    // rejection of UTF-16 encoding names is compiled out below.

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlW(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // L"<?xml" must be followed by whitespace (this rejects e.g. "<?xmlfoo").
    // The read is in bounds because cch >= 20.

    if (!FIsXmlWhitespaceW(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    // chQuote is the quote character of the value we are currently inside,
    // or L'\0' when we are not inside a quoted value.

    chQuote = L'\0';

    for (;;)
    {
        const WCHAR *pchToken;

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        if (FIsXmlWhitespaceW(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == L'=')
        {
            pch++;
            continue;
        }

        if ((*pch == L'\'') || (*pch == L'"'))
        {
            if (chQuote == L'\0')
            {
                // Opening quote

                chQuote = *pch;
            }

            else if (*pch == chQuote)
            {
                // Matching closing quote

                chQuote = L'\0';
            }

            // Otherwise this is the other quote character inside a quoted
            // value.  It is literal text and must not change the quote state.

            pch++;
            continue;
        }

        if (chQuote != L'\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        if ((pch + 2) > pchMax)
        {
            // Not XML

            break;
        }

        if ((pch[0] == L'?') && (pch[1] == L'>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            break;
        }

        // Collect an attribute name: everything up to '=', '?' or whitespace

        pchToken = pch;

        while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  Whitespace, '=' and quotes were handled
            // above so this can only be a '?' that does not start "?>".
            // That is malformed, and looping again without advancing would
            // never terminate.

            // Not XML

            break;
        }

        if (pch != (pchToken + 8))
        {
            // Wrong length to be "encoding"

            continue;
        }

        // memcmp counts bytes, so all 8 WCHARs need 8 * sizeof(WCHAR)

        if (memcmp(pchToken, L"encoding", 8 * sizeof(WCHAR)) != 0)
        {
            continue;
        }

        // Found the encoding attribute.  Expect [ws] '=' [ws] quote name quote

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != L'='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Empty encoding name.  Not XML

            break;
        }

        if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        // NOTE(review): the UTF-16 rejection used by the A variant is
        // deliberately disabled here -- presumably because this buffer is
        // already UTF-16; confirm before removing the dead block.

#if 0
        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }
#endif

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}
Add source files 2020-09-26 03:20:57 -05:00			`/*`
			`* XML support functions`
			`* Copyright (C) 2000 Microsoft Corporation`
			`*/`

			`#include "precomp.h"`


			`BOOL FIsXmlWhitespaceW(WCHAR wch)`
			`{`
			`return((wch == L' ') \|\| (wch == L'\x9') \|\| (wch == L'\xA') \|\| (wch == L'\xD'));`
			`}`


			`BOOL FIsXmlWhitespaceA(char ch)`
			`{`
			`return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));`
			`}`


			`BOOL FIsXmlA(LPCSTR rgch, UINT cch)`
			`{`
			`if (memcmp(rgch, "<?xml", 5) != 0)`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`return(TRUE);`

			`UNREFERENCED_PARAMETER( cch );`
			`}`


			`BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)`
			`{`
			`if (memcmp(rgwch, L"<?xml", 5 * sizeof(WCHAR)) != 0)`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`return(TRUE);`

			`UNREFERENCED_PARAMETER( cch );`
			`}`


			`BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)`
			`{`
			`LPCSTR pchMax;`
			`LPCSTR pch;`
			`char chQuote;`

			`// XML files encoded in UTF-16 are required to have a BOM which if present`
			`// would already have been detected. This means that if this file is XML`
			`// it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS`
			`// encoding of some form. We check for ASCII compatible encodings only`
			`// which includes everything we probably care about but excludes EBCDIC.`

			`// Check for file begining with <?xml ... encoding='...' ... ?>`

			`if (cch < 20)`
			`{`
			`// File is too small`

			`return(FALSE);`
			`}`

			`if (!FIsXmlA(rgch, cch))`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`// Don't scan more than 4K looking for encoding even if it is valid XML`

			`cch = __min(cch, 4096);`

			`pchMax = rgch + cch;`
			`pch = rgch + 5;`

			`if (!FIsXmlWhitespaceA(*pch))`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`pch++;`

			`chQuote = '\0';`

			`for (;;)`
			`{`
			`LPCSTR pchToken;`

			`if (pch == pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if (FIsXmlWhitespaceA(*pch))`
			`{`
			`pch++;`
			`continue;`
			`}`

			`if (*pch == '=')`
			`{`
			`pch++;`
			`continue;`
			`}`

			`if ((pch == '\'') \|\| (pch == '"'))`
			`{`
			`if (*pch == chQuote)`
			`{`
			`chQuote = '\0';`
			`}`

			`else`
			`{`
			`chQuote = *pch;`
			`}`

			`pch++;`
			`continue;`
			`}`

			`if (chQuote != '\0')`
			`{`
			`// We are within a quoted string. Skip everything until closing quote.`

			`pch++;`
			`continue;`
			`}`

			`if ((pch + 2) > pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if ((pch[0] == '?') && (pch[1] == '>'))`
			`{`
			`// This looks like XML. At this point if we don't find an encoding`
			`// specification we could assume UTF-8. We don't because there are`
			`// malformed XML documents and assuming UTF-8 might affect Notepad`
			`// compatibility. This may be fine but we put it off for now.`

			`// *pcp = CP_UTF8;`
			`// return(TRUE);`

			`break;`
			`}`

			`pchToken = pch;`

			`while ((pch < pchMax) && (pch != '=') && (pch != '?') && !FIsXmlWhitespaceA(*pch))`
			`{`
			`pch++;`
			`}`

			`if (pch != (pchToken + 8))`
			`{`
			`continue;`
			`}`

			`if (memcmp(pchToken, "encoding", 8) != 0)`
			`{`
			`continue;`
			`}`

			`while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))`
			`{`
			`pch++;`
			`}`

			`if ((pch == pchMax) \|\| (*pch++ != '='))`
			`{`
			`// Not XML`

			`break;`
			`}`

			`while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))`
			`{`
			`pch++;`
			`}`

			`if ((pch == pchMax) \|\| ((pch != '\'') && (pch != '"')))`
			`{`
			`// Not XML`

			`break;`
			`}`

			`chQuote = *pch++;`

			`pchToken = pch;`

			`while ((pch < pchMax) && (*pch != chQuote))`
			`{`
			`pch++;`
			`}`

			`if (pch == pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`// We have an XML encoding declaration from pchToken to (pch - 1)`

			`if (pch == pchToken)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))`
			`{`
			`// Encoding is not recognized`

			`break;`
			`}`

			`if ((pcp == CP_UTF16) \|\| (pcp == CP_UTF16BE))`
			`{`
			`// These are bogus since we know the file is MBCS`

			`break;`
			`}`

			`return(FValidateCodepage(hwndNP, *pcp));`
			`}`

			`return(FALSE);`
			`}`


			`BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)`
			`{`
			`const WCHAR *pchMax;`
			`const WCHAR *pch;`
			`WCHAR chQuote;`

			`// XML files encoded in UTF-16 are required to have a BOM which if present`
			`// would already have been detected. This means that if this file is XML`
			`// it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS`
			`// encoding of some form. We check for ASCII compatible encodings only`
			`// which includes everything we probably care about but excludes EBCDIC.`

			`// Check for file begining with <?xml ... encoding='...' ... ?>`

			`if (cch < 20)`
			`{`
			`// File is too small`

			`return(FALSE);`
			`}`

			`if (!FIsXmlW(rgch, cch))`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`// Don't scan more than 4K looking for encoding even if it is valid XML`

			`cch = __min(cch, 4096);`

			`pchMax = rgch + cch;`
			`pch = rgch + 5;`

			`if (!FIsXmlWhitespaceW(*pch))`
			`{`
			`// Not XML`

			`return(FALSE);`
			`}`

			`pch++;`

			`chQuote = L'\0';`

			`for (;;)`
			`{`
			`const WCHAR *pchToken;`

			`if (pch == pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if (FIsXmlWhitespaceW(*pch))`
			`{`
			`pch++;`
			`continue;`
			`}`

			`if (*pch == L'=')`
			`{`
			`pch++;`
			`continue;`
			`}`

			`if ((pch == L'\'') \|\| (pch == L'"'))`
			`{`
			`if (*pch == chQuote)`
			`{`
			`chQuote = L'\0';`
			`}`

			`else`
			`{`
			`chQuote = *pch;`
			`}`

			`pch++;`
			`continue;`
			`}`

			`if (chQuote != L'\0')`
			`{`
			`// We are within a quoted string. Skip everything until closing quote.`

			`pch++;`
			`continue;`
			`}`

			`if ((pch + 2) > pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if ((pch[0] == L'?') && (pch[1] == L'>'))`
			`{`
			`// This looks like XML. At this point if we don't find an encoding`
			`// specification we could assume UTF-8. We don't because there are`
			`// malformed XML documents and assuming UTF-8 might affect Notepad`
			`// compatibility. This may be fine but we put it off for now.`

			`// *pcp = CP_UTF8;`
			`// return(TRUE);`

			`break;`
			`}`

			`pchToken = pch;`

			`while ((pch < pchMax) && (pch != L'=') && (pch != L'?') && !FIsXmlWhitespaceW(*pch))`
			`{`
			`pch++;`
			`}`

			`if (pch != (pchToken + 8))`
			`{`
			`continue;`
			`}`

			`if (memcmp(pchToken, L"encoding", 8) != 0)`
			`{`
			`continue;`
			`}`

			`while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))`
			`{`
			`pch++;`
			`}`

			`if ((pch == pchMax) \|\| (*pch++ != L'='))`
			`{`
			`// Not XML`

			`break;`
			`}`

			`while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))`
			`{`
			`pch++;`
			`}`

			`if ((pch == pchMax) \|\| ((pch != L'\'') && (pch != L'"')))`
			`{`
			`// Not XML`

			`break;`
			`}`

			`chQuote = *pch++;`

			`pchToken = pch;`

			`while ((pch < pchMax) && (*pch != chQuote))`
			`{`
			`pch++;`
			`}`

			`if (pch == pchMax)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`// We have an XML encoding declaration from pchToken to (pch - 1)`

			`if (pch == pchToken)`
			`{`
			`// Not XML`

			`break;`
			`}`

			`if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))`
			`{`
			`// Encoding is not recognized`

			`break;`
			`}`

			`#if 0`
			`if ((pcp == CP_UTF16) \|\| (pcp == CP_UTF16BE))`
			`{`
			`// These are bogus since we know the file is MBCS`

			`break;`
			`}`
			`#endif`

			`return(FValidateCodepage(hwndNP, *pcp));`
			`}`

			`return(FALSE);`
			`}`