windows-nt/Source/XPSP1/NT/base/win32/fusion/xmlparser/encodingstream.cxx

/*
 * @(#)EncodingStream.cxx 1.0 6/10/97
 *
* Copyright (c) 1997 - 1999 Microsoft Corporation. All rights reserved. *
 */
#include "stdinc.h"
#include "core.hxx"
#include "xmlhelper.hxx"
#include "encodingstream.hxx"
#pragma hdrstop

// Smallest capacity, in bytes, that AppendData requests for the raw input buffer.
const int EncodingStream::BUFFERSIZE = static_cast<int>(4096 * sizeof(WCHAR));
//////////////////////////////////////////////////////////////////////////////////
// Wraps pStream and puts every piece of decoder / buffer state into its
// "nothing read yet, encoding unknown" configuration. No buffer is allocated
// here; buf stays NULL until prepareForInput() is asked for space.
EncodingStream::EncodingStream(IStream * pStream): stream(pStream), encoding(NULL), buf(NULL)
{
#ifdef FUSION_USE_OLD_XML_PARSER_SOURCE
    // These objects are sometimes handed out to external clients.
    ::IncrementComponents();

    pfnWideCharToMultiByte = NULL;
#endif

    // Decoder: no conversion routine and no code page until autoDetect() runs.
    pfnWideCharFromMultiByte = NULL;
    codepage = CP_UNDEFINED;
    _dwMode = 0;
    _fUTF8BOM = false;
    // (_fTextXML and _fSetCharset are not used by this parser.)

    // Raw byte buffer bookkeeping.
    bufsize = 0;
    btotal = 0;
    bnext = 0;
    startAt = 0;

    // Stream status.
    _fReadStream = true;
    _fEOF = false;
    lastBuffer = false;
}
//////////////////////////////////////////////////////////////////////////////////
/**
 * Creates an EncodingStream for input on top of pStream.
 * Nothing is read from pStream here; the character encoding is guessed
 * later by autoDetect(), which Read() and AppendData() call while no
 * conversion routine has been chosen yet.
 *
 * Returns the new stream with AddRef() already called once on it, or NULL
 * when the allocation fails.
 */
IStream * EncodingStream::newEncodingStream(IStream * pStream)
{
    EncodingStream * pNewStream = NEW (EncodingStream(pStream));

    if (pNewStream != NULL)
    {
        pNewStream->AddRef(); // xwu@@ : check this addRef()!

        pNewStream->isInput = true;
        pNewStream->buf = NULL;
    }

    return pNewStream;
}
//////////////////////////////////////////////////////////////////////////////////
// Frees the raw byte buffer and the Encoding object owned by this stream,
// then drops the reference to the wrapped stream.
EncodingStream::~EncodingStream()
{
    if (buf != NULL)
    {
        delete [] buf;
    }

    if (encoding != NULL)
    {
        delete encoding;
    }

    stream = NULL; // smart pointer
}
//////////////////////////////////////////////////////////////////////////////////
/**
 * Gathers raw bytes (from the wrapped stream, or from data queued earlier by
 * AppendData), converts them to Unicode and stores the WCHARs in pv.
 *
 * pv      : caller's output buffer; receives WCHARs.
 * cb      : capacity of pv in BYTES. It is divided by sizeof(WCHAR); the
 *           quotient is used both as the maximum number of characters to
 *           return and as the number of raw bytes to gather.
 * pcbRead : optional; receives the number of BYTES stored in pv. It is set
 *           to 0 on every early return.
 *
 * Returns:
 *   S_OK                       characters were produced, or EOF had already
 *                              been reached on an earlier call
 *   E_PENDING                  no character could be produced yet
 *   S_FALSE                    nothing is buffered (or the encoding is still
 *                              undetermined) and the last buffer was appended
 *   XML_E_INCOMPLETE_ENCODING  the decoder consumed no bytes although no new
 *                              input arrived
 *   anything else              propagated from the stream, prepareForInput,
 *                              autoDetect or the conversion routine
 */
HRESULT STDMETHODCALLTYPE EncodingStream::Read(void * pv, ULONG cb, ULONG * pcbRead)
{
    HRESULT hr;

    ULONG num = 0;      // raw bytes obtained from the stream by this call

    if (pcbRead != NULL)
        *pcbRead = 0;

    if (btotal == 0 && _fEOF)          // we already hit EOF - so return right away.
        return S_OK;

    // Calculate how many UNICODE chars we are allowed to return,
    // xiaoyu : which is the same as the number of BYTES read from the file
    cb /= sizeof(WCHAR);
    checkhr2(prepareForInput(cb));     // compacts the buffer; bnext is 0 afterwards

    if (stream && _fReadStream)
    {
        // btotal = number of bytes already in start of buffer.
        if (cb > btotal)
        {
            hr = stream->Read(buf + btotal, cb - btotal, &num);

            // Let's show what we've seen in the debugger so that we can diagnose bad manifests
            // more easily.  mgrier 12/28/2000

            if (::FusionpDbgWouldPrintAtFilterLevel(FUSION_DBG_LEVEL_XMLSTREAM))
            {
                ::FusionpDbgPrintEx(
                    FUSION_DBG_LEVEL_XMLSTREAM,
                    "SXS.DLL: Read %lu bytes from XML stream; HRESULT returned = 0x%08lx\n", num, hr);

                if (num > 0)
                {
                    ::FusionpDbgPrintBlob(
                        FUSION_DBG_LEVEL_XMLSTREAM,
                        buf + btotal,
                        num,
                        L"   ");
                }
            }

            if (hr == E_PENDING && num > 0)
            {
                // in which case we ignore the error, and continue on !!.
                // BUGBUG - this may be a problem.since we are changing the
                // return code returned from the stream.  This may mean we
                // should not ever hand out this stream outside of MSXML.
                hr = 0;
            }
            if (FAILED(hr))
            {
                return hr;
            }
            if (btotal == 0 && num == 0)
            {
                // Nothing buffered and nothing new: remember EOF so that the
                // next call returns immediately.
                _fEOF = true;
                return hr;
            }
        }
        else
        {
            // Enough bytes are already buffered; do not touch the stream.
            hr = S_OK;
        }
    }
    else if (btotal == 0)
    {
        // Not reading from a stream and nothing has been queued.
        return (lastBuffer) ? S_FALSE : E_PENDING;
    }

    btotal += num;
    UINT b = btotal, utotal = cb;   // b: bytes offered to the decoder, utotal: max chars out

    if (b > cb)
    {
        // If we have more bytes in our buffer than the caller has
        // room for, then only return the number of bytes the caller
        // asked for -- otherwise pfnWideCharFromMultiByte will write
        // off the end of the caller's buffer.
        b = cb;
    }
    if (pfnWideCharFromMultiByte == NULL) // first read() call
    {
        checkhr2(autoDetect());
        if (pfnWideCharFromMultiByte == NULL) // failed to fully determine encoding
            return (lastBuffer) ? S_FALSE : E_PENDING;

        // autoDetect() left bnext at the size of the byte order mark (if any),
        // which must not be handed to the decoder. b is unsigned: when the
        // caller's capacity is smaller than the mark, a plain subtraction
        // would wrap around and make the decoder read far beyond buf, so
        // clamp at zero instead.
        b = (b > bnext) ? (b - bnext) : 0;
        startAt -= bnext;
    }
    hr = (this->pfnWideCharFromMultiByte)(&_dwMode, codepage, buf + bnext, &b, (WCHAR *)pv, &utotal);
    if (hr != S_OK)
        return hr;
    if (b == 0 && num == 0 && (stream || lastBuffer))
    {
        // stream says we're at the end, but pfnWideCharFromMultiByte
        // disagrees !!
        ::FusionpDbgPrintEx(
            FUSION_DBG_LEVEL_ERROR,
            "SXS.DLL: XML Parser found incomplete encoding\n");

        return XML_E_INCOMPLETE_ENCODING;
    }
    bnext += b;     // b now holds the number of bytes the decoder consumed
    if (pcbRead != NULL)
        *pcbRead = utotal*sizeof(WCHAR);
    return (utotal == 0) ? E_PENDING : S_OK;
}
//////////////////////////////////////////////////////////////////////////////////
/**
 * Inspects the first bytes in buf to choose the input encoding:
 *   - FE FF / FF FE          : UCS-2 (big / little endian), mark is 2 bytes
 *   - the same mark twice    : treated as UCS-4, which this parser rejects
 *   - EF BB BF               : UTF-8 with byte order mark, mark is 3 bytes
 *   - anything else          : UTF-8 without byte order mark
 *
 * xiaoyu : since only UCS-2 and UTF-8 are support, we do not deal with others...
 *
 * On success bnext is set to the length of the byte order mark so the caller
 * can skip it, and codepage / pfnWideCharFromMultiByte / maxCharSize are
 * filled in from the chosen Encoding.
 *
 * Returns:
 *   S_OK                   encoding chosen, OR not enough bytes buffered yet
 *                          (pfnWideCharFromMultiByte is still NULL in that case)
 *   XML_E_INVALIDENCODING  input looks like UCS-4
 *   E_OUTOFMEMORY          the Encoding object could not be created
 *   anything else          propagated from getWideCharFromMultiByteInfo
 */
HRESULT EncodingStream::autoDetect()
{
    // wait until we have enough to be sure.
    if (btotal < 2)
        return S_OK;

    // First two bytes combined big-endian, so FF FE on the wire reads 0xFFFE.
    unsigned int guess = (((unsigned char)buf[0]) << 8) + ((unsigned char)buf[1]);
    HRESULT hr;

    if (guess == 0xFEFF || guess == 0xFFFE) // BOM found
    {
        // wait until we have enough to be sure.
        if (btotal < 4)
            return S_OK;

        unsigned int guess1 = (((unsigned char)buf[2]) << 8) + ((unsigned char)buf[3]);
        if (guess == guess1)
        {
            // FUSION_XML_PARSER does not support UCS4
            return XML_E_INVALIDENCODING;
        }
        else
        {
            if (!encoding)
            {
                static const WCHAR* wchUCS2 = L"UCS-2";
                encoding = Encoding::newEncoding(wchUCS2, 5, (0xFFFE == guess), true);
            }
            bnext = 2;
        }

        if (NULL == encoding)
            return E_OUTOFMEMORY;
        encoding->littleendian =  (0xFFFE == guess);
    }
    else
    {
        if (!encoding)
        {
            encoding = Encoding::newEncoding(); // default encoding : UTF-8
            if (NULL == encoding)
                return E_OUTOFMEMORY;
        }

        // In some system, such as win2k, there is BOM 0xEF BB BF for UTF8
        bool fHasUTF8BOM = false;
        if (guess == 0xEFBB)
        {
            // Need the third byte before deciding.
            if (btotal < 3)
                return S_OK;

            fHasUTF8BOM = (buf[2] == 0xBF);
        }

        if (fHasUTF8BOM)
        {
            _fUTF8BOM = true;
            bnext = 3;
        }
        else
        {
            // EF BB followed by anything other than BF is ordinary data, not
            // a byte order mark: leave bnext alone so those bytes reach the
            // decoder instead of being silently dropped.
            encoding->byteOrderMark = false;
        }
    }

    checkhr2(CharEncoder::getWideCharFromMultiByteInfo(encoding, &codepage, &pfnWideCharFromMultiByte, &maxCharSize));
    return S_OK;
}
/////////////////////////////////////////////////////////////////////////////////////////
/**
 * Changes the character encoding used to decode the input stream.
 *
 * This method takes ownership of newEncoding: it is either installed as the
 * stream's encoding or deleted before returning.
 *
 * Returns:
 *         S_OK:         succeeded (or the code page is unchanged); no re-read needed
 *         S_FALSE:      succeeded, caller must re-read from <code> newPosition </code>
 *         E_INVALIDARG: newPosition is outside the bytes consumed so far, or
 *                       no conversion information exists for newEncoding
 *         E_FAIL:       the switch would enter or leave UCS-2, or would leave
 *                       UTF-8 although a UTF-8 byte order mark was seen
 * Notice:
 *         This method only works for input stream, newPosition starts with 1
 */
HRESULT EncodingStream::switchEncodingAt(Encoding * newEncoding, int newPosition)
{
    // xwu: the fusion xml parser does not use Charset, so the encoding named
    // in the document is never ignored in favour of an external one.

    UINT requestedCodepage = 0;
    UINT requestedCharSize = 0;
    WideCharFromMultiByteFunc * pfnRequested = NULL;

    // Offset of the switch point relative to the start of the current buffer.
    const int offset = newPosition - startAt;

    bool fRejected = true;
    HRESULT hrRejected = S_OK;

    if (offset < 0 || offset > (int)bnext)
    {
        // out of range
        hrRejected = E_INVALIDARG;
    }
    else if (CharEncoder::getWideCharFromMultiByteInfo(newEncoding, &requestedCodepage, &pfnRequested, &requestedCharSize) != S_OK)
    {
        // no usable charset information for the requested encoding
        hrRejected = E_INVALIDARG;
    }
    else if (codepage == requestedCodepage)
    {
        // Already decoding with this code page; nothing to switch.
        hrRejected = S_OK;
    }
    else
    {
        // We can neither switch into nor out of UCS-2 (UCS-4 is not handled
        // at all: the fusion xml parser only supports UTF-8 and UCS-2).
        // Also, if a UTF-8 BOM was presented, we cannot switch away from UTF-8.
        const bool fCrossesUcs2 = ((codepage == CP_UCS_2) != (requestedCodepage == CP_UCS_2));
        const bool fLeavesMarkedUtf8 = (_fUTF8BOM && codepage == CP_UTF_8 && requestedCodepage != CP_UTF_8);

        if (fCrossesUcs2 || fLeavesMarkedUtf8)
        {
            hrRejected = E_FAIL;
        }
        else
        {
            fRejected = false;
        }
    }

    if (fRejected)
    {
        delete newEncoding;
        return hrRejected;
    }

    // Ok, then, let's make the switch.
    delete encoding;
    encoding = newEncoding;
    codepage = requestedCodepage;
    maxCharSize = requestedCharSize;
    pfnWideCharFromMultiByte = pfnRequested;

    // Because the XML declaration is encoded in UTF-8,
    // Mapping input characters to wide characters is one-to-one mapping
    if ((int)bnext == offset)
    {
        return S_OK;
    }

    // Rewind so the bytes after the switch point get decoded again.
    bnext = offset;
    return S_FALSE;
}

//////////////////////////////////////////////////////////////////////////////////
// Discards the bytes that have already been consumed (the first bnext bytes
// of buf), moves the remaining ones to the front of the buffer, and makes
// sure the buffer can hold at least minlen BYTES.
//
// On success: btotal is the number of unconsumed bytes, startAt has advanced
// by the number of discarded bytes, and bnext is 0.
// On E_OUTOFMEMORY no member is modified, so the call can be retried.
HRESULT EncodingStream::prepareForInput(ULONG minlen)
{
    Assert(btotal >= bnext);

    if (bufsize < minlen)
    {
        // Allocate before touching any state: if this fails, btotal and
        // bnext must still describe the old buffer, otherwise the next call
        // would subtract bnext a second time.
        BYTE* newbuf = NEW (BYTE[minlen]);
        if (newbuf == NULL) {
            return E_OUTOFMEMORY;
        }

        btotal -= bnext;

        if (buf){
            // Carry the unconsumed bytes over to the start of the new buffer.
            ::memcpy(newbuf, buf+bnext, btotal);
            delete[] buf;
        }

        buf = newbuf;
        bufsize = minlen;
    }
    else
    {
        btotal -= bnext;

        if (bnext > 0 && btotal > 0)
        {
            // Shift remaining bytes down to beginning of buffer.
            ::memmove(buf, buf + bnext, btotal);
        }
    }

    startAt += bnext;
    bnext = 0;
    return S_OK;
}
//////////////////////////////////////////////////////////////////////////////////
// Queues `length` raw bytes from `buffer` behind the bytes that have not been
// consumed yet. The data is treated as a plain BYTE sequence (not WCHARs) and
// is copied verbatim. fLastBuffer records whether more data will follow.
// While no conversion routine has been chosen, autoDetect() is run on the
// accumulated bytes.
HRESULT EncodingStream::AppendData( const BYTE* buffer, ULONG length, BOOL fLastBuffer)
{
    Assert(btotal >= bnext);
    HRESULT hr;

    lastBuffer = fLastBuffer ? true : false;

    // Room for what is still pending plus the new data, so we don't lose
    // anything -- but never ask for less than BUFFERSIZE.
    ULONG cbNeeded = (btotal - bnext) + length;
    if (cbNeeded < BUFFERSIZE)
    {
        cbNeeded = BUFFERSIZE;
    }
    checkhr2(prepareForInput(cbNeeded)); // guarantee enough space in the array

    if (buffer != NULL && length > 0)
    {
        // Copy raw data into new buffer.
        ::memcpy(buf + btotal, buffer, length);
        btotal += length;
    }

    if (pfnWideCharFromMultiByte == NULL) // encoding not determined yet
    {
        checkhr2(autoDetect());
    }

    return hr;
}
//////////////////////////////////////////////////////////////////////////////////
// Reads from the wrapped stream in 4096-byte requests, appending to buf, until
// a read returns something other than S_OK or delivers no bytes.
// Returns S_FALSE when the end of the stream has been reached (now or on an
// earlier call); otherwise the HRESULT of the last stream read or of a failed
// prepareForInput().
HRESULT EncodingStream::BufferData()
{
    HRESULT hr = S_OK;

    // 0 is used just for shift down (so bnext=0).
    checkhr2(prepareForInput(0));

    if (_fEOF)          // already hit the end of the stream.
    {
        return S_FALSE;
    }

    const DWORD cbChunk = 4096;
    DWORD cbReceived = 1;   // non-zero so the loop is entered

    while (hr == S_OK && cbReceived > 0)
    {
        // if we cannot fit another buffer full, then re-allocate.
        DWORD cbCapacity = bufsize;
        if (btotal + cbChunk > bufsize)
        {
            cbCapacity = bufsize + cbChunk;
        }
        checkhr2(prepareForInput(cbCapacity)); // make space available.

        cbReceived = 0;
        hr = stream->Read(buf + btotal, cbChunk, &cbReceived);
        btotal += cbReceived;
    }

    if (cbReceived == 0 && SUCCEEDED(hr))
    {
        _fEOF = true;
        hr = S_FALSE; // return S_FALSE when at eof.
    }

    return hr;
}