windows-nt/Source/XPSP1/NT/shell/osshell/accesory/newpad/npxml.c

/*
 * XML support functions
 *  Copyright (C) 2000 Microsoft Corporation
 */

#include "precomp.h"


/*
 * FIsXmlWhitespaceW
 *
 * Returns TRUE when wch is one of the four characters this file treats as
 * XML white space (space, tab, line feed, carriage return), FALSE otherwise.
 */
BOOL FIsXmlWhitespaceW(WCHAR wch)
{
    switch (wch)
    {
        case 0x0020 :           // Space
        case 0x0009 :           // Horizontal tab
        case 0x000A :           // Line feed
        case 0x000D :           // Carriage return
            return(TRUE);

        default :
            break;
    }

    return(FALSE);
}


BOOL FIsXmlWhitespaceA(char ch)
{
    return(FIsXmlWhitespaceW((WCHAR) (BYTE) ch));
}


/*
 * FIsXmlA
 *
 * Reports whether a byte buffer begins with the five characters "<?xml".
 *
 *  rgch - start of the buffer (need not be NUL terminated)
 *  cch  - number of bytes available at rgch
 *
 * Returns TRUE when the buffer holds at least five bytes and they are
 * exactly "<?xml" (case sensitive), FALSE otherwise.
 */
BOOL FIsXmlA(LPCSTR rgch, UINT cch)
{
    if (cch < 5)
    {
        // Too short to hold "<?xml".  Comparing anyway would read past the
        // end of the caller's buffer.

        return(FALSE);
    }

    return(memcmp(rgch, "<?xml", 5) == 0);
}


/*
 * FIsXmlW
 *
 * Reports whether a wide character buffer begins with the five characters
 * L"<?xml".
 *
 *  rgwch - start of the buffer (need not be NUL terminated)
 *  cch   - number of WCHARs available at rgwch
 *
 * Returns TRUE when the buffer holds at least five characters and they are
 * exactly L"<?xml" (case sensitive), FALSE otherwise.
 */
BOOL FIsXmlW(LPCWSTR rgwch, UINT cch)
{
    if (cch < 5)
    {
        // Too short to hold L"<?xml".  Comparing anyway would read past the
        // end of the caller's buffer.

        return(FALSE);
    }

    // memcmp counts bytes, so scale the character count by sizeof(WCHAR)

    return(memcmp(rgwch, L"<?xml", 5 * sizeof(WCHAR)) == 0);
}


/*
 * FDetectXmlEncodingA
 *
 * Looks for an XML declaration of the form
 *
 *      <?xml ... encoding='name' ... ?>
 *
 * at the very start of a byte buffer and, when one is found, translates the
 * declared encoding name into a code page.
 *
 *  rgch - start of the buffer (need not be NUL terminated)
 *  cch  - number of bytes available at rgch
 *  pcp  - handed to FLookupCodepageNameA to receive the code page
 *
 * Returns the result of FValidateCodepage for the declared code page.
 * Returns FALSE without calling it when
 *      - the buffer is shorter than 20 bytes,
 *      - it does not start with "<?xml" followed by white space,
 *      - no well formed encoding='...' is found before "?>" or before the
 *        end of the first 4K of the buffer,
 *      - FLookupCodepageNameA does not recognize the name, or
 *      - the name maps to CP_UTF16 or CP_UTF16BE.
 *
 * *pcp may have been modified even when FALSE is returned.
 */
BOOL FDetectXmlEncodingA(LPCSTR rgch, UINT cch, UINT *pcp)
{
    LPCSTR pchMax;
    LPCSTR pch;
    char chQuote;

    // XML files encoded in UTF-16 are required to have a BOM which if present
    // would already have been detected.  This means that if this file is XML
    // it either is encoded in UCS-4 or UTF-32 which isn't supported or an MBCS
    // encoding of some form.  We check for ASCII compatible encodings only
    // which includes everything we probably care about but excludes EBCDIC.

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlA(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // "<?xml" must be followed by white space.  This rejects other
    // processing instruction targets that merely start with "xml".

    if (!FIsXmlWhitespaceA(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    // chQuote is the delimiter of the quoted value we are currently inside
    // or '\0' when we are outside of any quoted value.

    chQuote = '\0';

    for (;;)
    {
        LPCSTR pchToken;

        if (pch == pchMax)
        {
            // Ran out of data before the end of the declaration.  Not XML

            break;
        }

        if (FIsXmlWhitespaceA(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == '=')
        {
            pch++;
            continue;
        }

        if ((*pch == '\'') || (*pch == '"'))
        {
            if (chQuote == '\0')
            {
                // Opening quote

                chQuote = *pch;
            }

            else if (*pch == chQuote)
            {
                // Matching closing quote

                chQuote = '\0';
            }

            // Otherwise this is the other kind of quote inside a quoted
            // value.  It is ordinary content and must not change the
            // delimiter we are waiting for.

            pch++;
            continue;
        }

        if (chQuote != '\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        // Compare remaining lengths rather than forming (pch + 2) which could
        // point beyond one past the end of the buffer.

        if ((pchMax - pch) < 2)
        {
            // Not XML

            break;
        }

        if ((pch[0] == '?') && (pch[1] == '>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            // *pcp = CP_UTF8;
            // return(TRUE);

            break;
        }

        // Collect a name: everything up to '=', '?', white space or the end
        // of the scan window.

        pchToken = pch;

        while ((pch < pchMax) && (*pch != '=') && (*pch != '?') && !FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  White space and '=' were handled above so
            // this can only be a '?' that does not begin "?>".  That is not
            // valid in a declaration and continuing from here would never
            // advance, so give up.  Not XML

            break;
        }

        if (pch != (pchToken + 8))
        {
             // Name is not 8 characters long so it can't be "encoding"

             continue;
        }

        if (memcmp(pchToken, "encoding", 8) != 0)
        {
             continue;
        }

        // Found "encoding".  Expect optional white space, '=', optional
        // white space and then a quoted value.

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != '='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceA(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != '\'') && (*pch != '"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // No closing quote.  Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Empty encoding name.  Not XML

            break;
        }

        if (!FLookupCodepageNameA((LPCSTR) pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}


/*
 * FDetectXmlEncodingW
 *
 * Looks for an XML declaration of the form
 *
 *      <?xml ... encoding='name' ... ?>
 *
 * at the very start of a wide character buffer and, when one is found,
 * translates the declared encoding name into a code page.
 *
 *  rgch - start of the buffer (need not be NUL terminated)
 *  cch  - number of WCHARs available at rgch
 *  pcp  - handed to FLookupCodepageNameW to receive the code page
 *
 * Returns the result of FValidateCodepage for the declared code page.
 * Returns FALSE without calling it when
 *      - the buffer is shorter than 20 characters,
 *      - it does not start with L"<?xml" followed by white space,
 *      - no well formed encoding='...' is found before "?>" or before the
 *        end of the first 4K characters of the buffer, or
 *      - FLookupCodepageNameW does not recognize the name.
 *
 * *pcp may have been modified even when FALSE is returned.
 */
BOOL FDetectXmlEncodingW(LPCWSTR rgch, UINT cch, UINT *pcp)
{
    const WCHAR *pchMax;
    const WCHAR *pch;
    WCHAR chQuote;

    // NOTE(review): the comment that used to be here was copied from the ANSI
    // version and described MBCS input.  This function scans WCHAR text and,
    // unlike the ANSI version, does not reject a UTF-16 declaration (that
    // check is compiled out below).

    // Check for file beginning with <?xml ... encoding='...' ... ?>

    if (cch < 20)
    {
        // File is too small

        return(FALSE);
    }

    if (!FIsXmlW(rgch, cch))
    {
        // Not XML

        return(FALSE);
    }

    // Don't scan more than 4K looking for encoding even if it is valid XML

    cch = __min(cch, 4096);

    pchMax = rgch + cch;
    pch = rgch + 5;

    // L"<?xml" must be followed by white space.  This rejects other
    // processing instruction targets that merely start with "xml".

    if (!FIsXmlWhitespaceW(*pch))
    {
        // Not XML

        return(FALSE);
    }

    pch++;

    // chQuote is the delimiter of the quoted value we are currently inside
    // or L'\0' when we are outside of any quoted value.

    chQuote = L'\0';

    for (;;)
    {
        const WCHAR *pchToken;

        if (pch == pchMax)
        {
            // Ran out of data before the end of the declaration.  Not XML

            break;
        }

        if (FIsXmlWhitespaceW(*pch))
        {
            pch++;
            continue;
        }

        if (*pch == L'=')
        {
            pch++;
            continue;
        }

        if ((*pch == L'\'') || (*pch == L'"'))
        {
            if (chQuote == L'\0')
            {
                // Opening quote

                chQuote = *pch;
            }

            else if (*pch == chQuote)
            {
                // Matching closing quote

                chQuote = L'\0';
            }

            // Otherwise this is the other kind of quote inside a quoted
            // value.  It is ordinary content and must not change the
            // delimiter we are waiting for.

            pch++;
            continue;
        }

        if (chQuote != L'\0')
        {
            // We are within a quoted string.  Skip everything until closing quote.

            pch++;
            continue;
        }

        // Compare remaining lengths rather than forming (pch + 2) which could
        // point beyond one past the end of the buffer.

        if ((pchMax - pch) < 2)
        {
            // Not XML

            break;
        }

        if ((pch[0] == L'?') && (pch[1] == L'>'))
        {
            // This looks like XML.  At this point if we don't find an encoding
            // specification we could assume UTF-8.  We don't because there are
            // malformed XML documents and assuming UTF-8 might affect Notepad
            // compatibility.  This may be fine but we put it off for now.

            // *pcp = CP_UTF8;
            // return(TRUE);

            break;
        }

        // Collect a name: everything up to '=', '?', white space or the end
        // of the scan window.

        pchToken = pch;

        while ((pch < pchMax) && (*pch != L'=') && (*pch != L'?') && !FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if (pch == pchToken)
        {
            // Nothing was consumed.  White space and '=' were handled above so
            // this can only be a '?' that does not begin "?>".  That is not
            // valid in a declaration and continuing from here would never
            // advance, so give up.  Not XML

            break;
        }

        if (pch != (pchToken + 8))
        {
             // Name is not 8 characters long so it can't be "encoding"

             continue;
        }

        // memcmp counts bytes, so scale by sizeof(WCHAR) to compare all eight
        // characters and not just the first four.

        if (memcmp(pchToken, L"encoding", 8 * sizeof(WCHAR)) != 0)
        {
             continue;
        }

        // Found "encoding".  Expect optional white space, '=', optional
        // white space and then a quoted value.

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || (*pch++ != L'='))
        {
            // Not XML

            break;
        }

        while ((pch < pchMax) && FIsXmlWhitespaceW(*pch))
        {
            pch++;
        }

        if ((pch == pchMax) || ((*pch != L'\'') && (*pch != L'"')))
        {
            // Not XML

            break;
        }

        chQuote = *pch++;

        pchToken = pch;

        while ((pch < pchMax) && (*pch != chQuote))
        {
            pch++;
        }

        if (pch == pchMax)
        {
            // No closing quote.  Not XML

            break;
        }

        // We have an XML encoding declaration from pchToken to (pch - 1)

        if (pch == pchToken)
        {
            // Empty encoding name.  Not XML

            break;
        }

        if (!FLookupCodepageNameW(pchToken, (UINT) (pch - pchToken), pcp))
        {
            // Encoding is not recognized

            break;
        }

#if 0
        if ((*pcp == CP_UTF16) || (*pcp == CP_UTF16BE))
        {
            // These are bogus since we know the file is MBCS

            break;
        }
#endif

        return(FValidateCodepage(hwndNP, *pcp));
    }

    return(FALSE);
}