/*++

   Copyright    (c)    2000    Microsoft Corporation

   Module Name :
     ulparse.cxx

   Abstract:
     Rip some useful UL code
     
   Author:
     (RIPPED from UL driver code (HenrySa, PaulMcd)

   Environment:
     Win32 - User Mode

   Project:
     ULW3.DLL
--*/

#include "precomp.hxx"

typedef enum _URL_PART
{
    Scheme,
    HostName,
    AbsPath,
    QueryString

} URL_PART;

#define IS_UTF8_TRAILBYTE(ch)      (((ch) & 0xc0) == 0x80)

NTSTATUS
Unescape(
    IN  PUCHAR pChar,
    OUT PUCHAR pOutChar
    )

{
    UCHAR Result, Digit;

    if (pChar[0] != '%' ||
        SAFEIsXDigit(pChar[1]) == FALSE ||
        SAFEIsXDigit(pChar[2]) == FALSE)
    {
        return STATUS_OBJECT_PATH_SYNTAX_BAD;
    }

    //
    // HexToChar() inlined
    //

    // uppercase #1
    //
    if (isalpha(pChar[1]))
        Digit = (UCHAR) toupper(pChar[1]);
    else
        Digit = pChar[1];

    Result = ((Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0')) << 4;

    // uppercase #2
    //
    if (isalpha(pChar[2]))
        Digit = (UCHAR) toupper(pChar[2]);
    else
        Digit = pChar[2];

    Result |= (Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0');

    *pOutChar = Result;

    return STATUS_SUCCESS;

}   // Unescape


NTSTATUS
PopChar(
    IN URL_PART UrlPart,
    IN PUCHAR pChar,
    OUT WCHAR * pUnicodeChar,
    OUT PULONG pCharToSkip
    )
{
    NTSTATUS Status;
    WCHAR   UnicodeChar;
    UCHAR   Char;
    UCHAR   Trail1;
    UCHAR   Trail2;
    ULONG   CharToSkip;

    //
    // need to unescape ?
    //
    // can't decode the query string.  that would be lossy decodeing
    // as '=' and '&' characters might be encoded, but have meaning
    // to the usermode parser.
    //

    if (UrlPart != QueryString && pChar[0] == '%')
    {
        Status = Unescape(pChar, &Char);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;
        CharToSkip = 3;
    }
    else
    {
        Char = pChar[0];
        CharToSkip = 1;
    }

    //
    // convert to unicode, checking for utf8 .
    //
    // 3 byte runs are the largest we can have.  16 bits in UCS-2 =
    // 3 bytes of (4+4,2+6,2+6) where it's code + char.
    // for a total of 6+6+4 char bits = 16 bits.
    //

    //
    // NOTE: we'll only bother to decode utf if it was escaped
    // thus the (CharToSkip == 3)
    //
    if ((CharToSkip == 3) && ((Char & 0xf0) == 0xe0))
    {
        // 3 byte run
        //

        // Unescape the next 2 trail bytes
        //

        Status = Unescape(pChar+CharToSkip, &Trail1);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        Status = Unescape(pChar+CharToSkip, &Trail2);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        if (IS_UTF8_TRAILBYTE(Trail1) == FALSE ||
            IS_UTF8_TRAILBYTE(Trail2) == FALSE)
        {
            // bad utf!
            //
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        // handle three byte case
        // 1110xxxx 10xxxxxx 10xxxxxx

        UnicodeChar = (USHORT) (((Char & 0x0f) << 12) |
                                ((Trail1 & 0x3f) << 6) |
                                (Trail2 & 0x3f));

    }
    else if ((CharToSkip == 3) && ((Char & 0xe0) == 0xc0))
    {
        // 2 byte run
        //

        // Unescape the next 1 trail byte
        //

        Status = Unescape(pChar+CharToSkip, &Trail1);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        if (IS_UTF8_TRAILBYTE(Trail1) == FALSE)
        {
            // bad utf!
            //
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        // handle two byte case
        // 110xxxxx 10xxxxxx

        UnicodeChar = (USHORT) (((Char & 0x1f) << 6) |
                                (Trail1 & 0x3f));

    }

    // now this can either be unescaped high-bit (bad)
    // or escaped high-bit.  (also bad)
    //
    // thus not checking CharToSkip
    //

    else if ((Char & 0x80) == 0x80)
    {
        // high bit set !  bad utf!
        //
        Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
        goto end;

    }
    //
    // Normal character (again either escaped or unescaped)
    //
    else
    {
        //
        // Simple conversion to unicode, it's 7-bit ascii.
        //

        UnicodeChar = (USHORT)Char;
    }

    //
    // turn backslashes into forward slashes
    //

    if (UrlPart != QueryString && UnicodeChar == L'\\')
    {
        UnicodeChar = L'/';
    }
    else if (UnicodeChar == 0)
    {
        //
        // we pop'd a NULL.  bad!
        //
        Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
        goto end;
    }

    *pCharToSkip  = CharToSkip;
    *pUnicodeChar = UnicodeChar;

    Status = STATUS_SUCCESS;

end:
    return Status;

}   // PopChar

//
//  Private constants.
//

#define ACTION_NOTHING              0x00000000
#define ACTION_EMIT_CH              0x00010000
#define ACTION_EMIT_DOT_CH          0x00020000
#define ACTION_EMIT_DOT_DOT_CH      0x00030000
#define ACTION_BACKUP               0x00040000
#define ACTION_MASK                 0xFFFF0000

//
// Private globals
//

//
// this table says what to do based on the current state and the current
// character
//
ULONG  pActionTable[16] =
{
    //
    // state 0 = fresh, seen nothing exciting yet
    //
    ACTION_EMIT_CH,         // other = emit it                      state = 0
    ACTION_EMIT_CH,         // "."   = emit it                      state = 0
    ACTION_NOTHING,         // EOS   = normal finish                state = 4
    ACTION_EMIT_CH,         // "/"   = we saw the "/", emit it      state = 1

    //
    // state 1 = we saw a "/" !
    //
    ACTION_EMIT_CH,         // other = emit it,                     state = 0
    ACTION_NOTHING,         // "."   = eat it,                      state = 2
    ACTION_NOTHING,         // EOS   = normal finish                state = 4
    ACTION_NOTHING,         // "/"   = extra slash, eat it,         state = 1

    //
    // state 2 = we saw a "/" and ate a "." !
    //
    ACTION_EMIT_DOT_CH,     // other = emit the dot we ate.         state = 0
    ACTION_NOTHING,         // "."   = eat it, a ..                 state = 3
    ACTION_NOTHING,         // EOS   = normal finish                state = 4
    ACTION_NOTHING,         // "/"   = we ate a "/./", swallow it   state = 1

    //
    // state 3 = we saw a "/" and ate a ".." !
    //
    ACTION_EMIT_DOT_DOT_CH, // other = emit the "..".               state = 0
    ACTION_EMIT_DOT_DOT_CH, // "."   = 3 dots, emit the ".."        state = 0
    ACTION_BACKUP,          // EOS   = we have a "/..\0", backup!   state = 4
    ACTION_BACKUP           // "/"   = we have a "/../", backup!    state = 1
};

//
// this table says which newstate to be in given the current state and the
// character we saw
//
ULONG  pNextStateTable[16] =
{
    // state 0
    0 ,             // other
    0 ,             // "."
    4 ,             // EOS
    1 ,             // "\"

    //  state 1
    0 ,              // other
    2 ,             // "."
    4 ,             // EOS
    1 ,             // "\"

    // state 2
    0 ,             // other
    3 ,             // "."
    4 ,             // EOS
    1 ,             // "\"

    // state 3
    0 ,             // other
    0 ,             // "."
    4 ,             // EOS
    1               // "\"
};

//
// this says how to index into pNextStateTable given our current state.
//
// since max states = 4, we calculate the index by multiplying with 4.
//
#define IndexFromState( st)   ( (st) * 4)


/***************************************************************************++

Routine Description:


    Unescape
    Convert backslash to forward slash
    Remove double slashes (empty directiories names) - e.g. // or \\
    Handle /./
    Handle /../
    Convert to unicode

Arguments:

Return Value:

    HRESULT 


--***************************************************************************/
HRESULT
UlCleanAndCopyUrl(
    IN      PUCHAR      pSource,
    IN      ULONG       SourceLength,
    OUT     PULONG      pBytesCopied,
    OUT     PWSTR       pDestination,
    OUT     PWSTR *     ppQueryString OPTIONAL
    )
{
    NTSTATUS Status;
    PWSTR   pDest;
    PUCHAR  pChar;
    ULONG   CharToSkip;
    UCHAR   Char;
    ULONG   BytesCopied;
    PWSTR   pQueryString;
    ULONG   StateIndex;
    WCHAR   UnicodeChar;
    BOOLEAN MakeCanonical;
    URL_PART UrlPart = AbsPath;

//
// a cool local helper macro
//

#define EMIT_CHAR(ch)                                   \
    do {                                                \
        pDest[0] = (ch);                                \
        pDest += 1;                                     \
        BytesCopied += 2;                               \
    } while (0)


    pDest = pDestination;
    pQueryString = NULL;
    BytesCopied = 0;

    pChar = pSource;
    CharToSkip = 0;

    StateIndex = 0;

    MakeCanonical = (UrlPart == AbsPath) ? TRUE : FALSE;

    while (SourceLength > 0)
    {
        //
        // advance !  it's at the top of the loop to enable ANSI_NULL to
        // come through ONCE
        //

        pChar += CharToSkip;
        SourceLength -= CharToSkip;

        //
        // well?  have we hit the end?
        //

        if (SourceLength == 0)
        {
            UnicodeChar = UNICODE_NULL;
        }
        else
        {
            //
            // Nope.  Peek briefly to see if we hit the query string
            //

            if (UrlPart == AbsPath && pChar[0] == '?')
            {
                DBG_ASSERT(pQueryString == NULL);

                //
                // remember it's location
                //

                pQueryString = pDest;

                //
                // let it fall through ONCE to the canonical
                // in order to handle a trailing "/.." like
                // "http://foobar:80/foo/bar/..?v=1&v2"
                //

                UnicodeChar = L'?';
                CharToSkip = 1;

                //
                // now we are cleaning the query string
                //

                UrlPart = QueryString;
            }
            else
            {
                //
                // grab the next char
                //

                Status = PopChar(UrlPart, pChar, &UnicodeChar, &CharToSkip);
                if (NT_SUCCESS(Status) == FALSE)
                    goto end;
            }
        }

        if (MakeCanonical)
        {
            //
            // now use the state machine to make it canonical .
            //

            //
            // from the old value of StateIndex, figure out our new base StateIndex
            //
            StateIndex = IndexFromState(pNextStateTable[StateIndex]);

            //
            // did we just hit the query string?  this will only happen once
            // that we take this branch after hitting it, as we stop
            // processing after hitting it.
            //

            if (UrlPart == QueryString)
            {
                //
                // treat this just like we hit a NULL, EOS.
                //

                StateIndex += 2;
            }
            else
            {
                //
                // otherwise based the new state off of the char we
                // just popped.
                //

                switch (UnicodeChar)
                {
                case UNICODE_NULL:      StateIndex += 2;    break;
                case L'.':              StateIndex += 1;    break;
                case L'/':              StateIndex += 3;    break;
                default:                StateIndex += 0;    break;
                }
            }

        }
        else
        {
            StateIndex = (UnicodeChar == UNICODE_NULL) ? 2 : 0;
        }

        //
        //  Perform the action associated with the state.
        //

        switch (pActionTable[StateIndex])
        {
        case ACTION_EMIT_DOT_DOT_CH:

            EMIT_CHAR(L'.');

            // fall through

        case ACTION_EMIT_DOT_CH:

            EMIT_CHAR(L'.');

            // fall through

        case ACTION_EMIT_CH:

            EMIT_CHAR(UnicodeChar);

            // fall through

        case ACTION_NOTHING:
            break;

        case ACTION_BACKUP:

            //
            // pDest currently points 1 past the last '/'.  backup over it and
            // find the preceding '/', set pDest to 1 past that one.
            //

            //
            // backup to the '/'
            //

            pDest       -= 1;
            BytesCopied -= 2;

            DBG_ASSERT(pDest[0] == L'/');

            //
            // are we at the start of the string?  that's bad, can't go back!
            //

            if (pDest == pDestination)
            {
                DBG_ASSERT(BytesCopied == 0);
                Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
                goto end;
            }

            //
            // back up over the '/'
            //

            pDest       -= 1;
            BytesCopied -= 2;

            DBG_ASSERT(pDest > pDestination);

            //
            // now find the previous slash
            //

            while (pDest > pDestination && pDest[0] != L'/')
            {
                pDest       -= 1;
                BytesCopied -= 2;
            }

            //
            // we already have a slash, so don't have to store 1.
            //

            DBG_ASSERT(pDest[0] == L'/');

            //
            // simply skip it, as if we had emitted it just now
            //

            pDest       += 1;
            BytesCopied += 2;

            break;

        default:
            DBG_ASSERT(!"w3core!UlpCleanAndCopyUrl: Invalid action code in state table!");
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        //
        // Just hit the query string ?
        //

        if (MakeCanonical && UrlPart == QueryString)
        {
            //
            // Stop canonical processing
            //

            MakeCanonical = FALSE;

            //
            // Need to emit the '?', it wasn't emitted above
            //

            DBG_ASSERT(pActionTable[StateIndex] != ACTION_EMIT_CH);

            EMIT_CHAR(L'?');

        }

    }

    //
    // terminate the string, it hasn't been done in the loop
    //

    DBG_ASSERT((pDest-1)[0] != UNICODE_NULL);

    pDest[0] = UNICODE_NULL;

    *pBytesCopied = BytesCopied;
    if (ppQueryString != NULL)
    {
        *ppQueryString = pQueryString;
    }

    Status = STATUS_SUCCESS;


end:
    return HRESULT_FROM_WIN32( RtlNtStatusToDosError( Status ) );

} // UlCleanAndCopyUrl