windows-nt/Source/XPSP1/NT/inetsrv/iis/iisrearc/iisplus/ulw3/ulparse.cxx
2020-09-26 16:20:57 +08:00

655 lines
14 KiB
C++

/*++
Copyright (c) 2000 Microsoft Corporation
Module Name :
ulparse.cxx
Abstract:
Rip some useful UL code
Author:
(RIPPED from UL driver code (HenrySa, PaulMcd)
Environment:
Win32 - User Mode
Project:
ULW3.DLL
--*/
#include "precomp.hxx"
typedef enum _URL_PART
{
Scheme,
HostName,
AbsPath,
QueryString
} URL_PART;
#define IS_UTF8_TRAILBYTE(ch) (((ch) & 0xc0) == 0x80)
NTSTATUS
Unescape(
IN PUCHAR pChar,
OUT PUCHAR pOutChar
)
{
UCHAR Result, Digit;
if (pChar[0] != '%' ||
SAFEIsXDigit(pChar[1]) == FALSE ||
SAFEIsXDigit(pChar[2]) == FALSE)
{
return STATUS_OBJECT_PATH_SYNTAX_BAD;
}
//
// HexToChar() inlined
//
// uppercase #1
//
if (isalpha(pChar[1]))
Digit = (UCHAR) toupper(pChar[1]);
else
Digit = pChar[1];
Result = ((Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0')) << 4;
// uppercase #2
//
if (isalpha(pChar[2]))
Digit = (UCHAR) toupper(pChar[2]);
else
Digit = pChar[2];
Result |= (Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0');
*pOutChar = Result;
return STATUS_SUCCESS;
} // Unescape
NTSTATUS
PopChar(
IN URL_PART UrlPart,
IN PUCHAR pChar,
OUT WCHAR * pUnicodeChar,
OUT PULONG pCharToSkip
)
{
NTSTATUS Status;
WCHAR UnicodeChar;
UCHAR Char;
UCHAR Trail1;
UCHAR Trail2;
ULONG CharToSkip;
//
// need to unescape ?
//
// can't decode the query string. that would be lossy decodeing
// as '=' and '&' characters might be encoded, but have meaning
// to the usermode parser.
//
if (UrlPart != QueryString && pChar[0] == '%')
{
Status = Unescape(pChar, &Char);
if (NT_SUCCESS(Status) == FALSE)
goto end;
CharToSkip = 3;
}
else
{
Char = pChar[0];
CharToSkip = 1;
}
//
// convert to unicode, checking for utf8 .
//
// 3 byte runs are the largest we can have. 16 bits in UCS-2 =
// 3 bytes of (4+4,2+6,2+6) where it's code + char.
// for a total of 6+6+4 char bits = 16 bits.
//
//
// NOTE: we'll only bother to decode utf if it was escaped
// thus the (CharToSkip == 3)
//
if ((CharToSkip == 3) && ((Char & 0xf0) == 0xe0))
{
// 3 byte run
//
// Unescape the next 2 trail bytes
//
Status = Unescape(pChar+CharToSkip, &Trail1);
if (NT_SUCCESS(Status) == FALSE)
goto end;
CharToSkip += 3; // %xx
Status = Unescape(pChar+CharToSkip, &Trail2);
if (NT_SUCCESS(Status) == FALSE)
goto end;
CharToSkip += 3; // %xx
if (IS_UTF8_TRAILBYTE(Trail1) == FALSE ||
IS_UTF8_TRAILBYTE(Trail2) == FALSE)
{
// bad utf!
//
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
// handle three byte case
// 1110xxxx 10xxxxxx 10xxxxxx
UnicodeChar = (USHORT) (((Char & 0x0f) << 12) |
((Trail1 & 0x3f) << 6) |
(Trail2 & 0x3f));
}
else if ((CharToSkip == 3) && ((Char & 0xe0) == 0xc0))
{
// 2 byte run
//
// Unescape the next 1 trail byte
//
Status = Unescape(pChar+CharToSkip, &Trail1);
if (NT_SUCCESS(Status) == FALSE)
goto end;
CharToSkip += 3; // %xx
if (IS_UTF8_TRAILBYTE(Trail1) == FALSE)
{
// bad utf!
//
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
// handle two byte case
// 110xxxxx 10xxxxxx
UnicodeChar = (USHORT) (((Char & 0x1f) << 6) |
(Trail1 & 0x3f));
}
// now this can either be unescaped high-bit (bad)
// or escaped high-bit. (also bad)
//
// thus not checking CharToSkip
//
else if ((Char & 0x80) == 0x80)
{
// high bit set ! bad utf!
//
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
//
// Normal character (again either escaped or unescaped)
//
else
{
//
// Simple conversion to unicode, it's 7-bit ascii.
//
UnicodeChar = (USHORT)Char;
}
//
// turn backslashes into forward slashes
//
if (UrlPart != QueryString && UnicodeChar == L'\\')
{
UnicodeChar = L'/';
}
else if (UnicodeChar == 0)
{
//
// we pop'd a NULL. bad!
//
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
*pCharToSkip = CharToSkip;
*pUnicodeChar = UnicodeChar;
Status = STATUS_SUCCESS;
end:
return Status;
} // PopChar
//
// Private constants.
//
#define ACTION_NOTHING 0x00000000
#define ACTION_EMIT_CH 0x00010000
#define ACTION_EMIT_DOT_CH 0x00020000
#define ACTION_EMIT_DOT_DOT_CH 0x00030000
#define ACTION_BACKUP 0x00040000
#define ACTION_MASK 0xFFFF0000
//
// Private globals
//
//
// this table says what to do based on the current state and the current
// character
//
ULONG pActionTable[16] =
{
//
// state 0 = fresh, seen nothing exciting yet
//
ACTION_EMIT_CH, // other = emit it state = 0
ACTION_EMIT_CH, // "." = emit it state = 0
ACTION_NOTHING, // EOS = normal finish state = 4
ACTION_EMIT_CH, // "/" = we saw the "/", emit it state = 1
//
// state 1 = we saw a "/" !
//
ACTION_EMIT_CH, // other = emit it, state = 0
ACTION_NOTHING, // "." = eat it, state = 2
ACTION_NOTHING, // EOS = normal finish state = 4
ACTION_NOTHING, // "/" = extra slash, eat it, state = 1
//
// state 2 = we saw a "/" and ate a "." !
//
ACTION_EMIT_DOT_CH, // other = emit the dot we ate. state = 0
ACTION_NOTHING, // "." = eat it, a .. state = 3
ACTION_NOTHING, // EOS = normal finish state = 4
ACTION_NOTHING, // "/" = we ate a "/./", swallow it state = 1
//
// state 3 = we saw a "/" and ate a ".." !
//
ACTION_EMIT_DOT_DOT_CH, // other = emit the "..". state = 0
ACTION_EMIT_DOT_DOT_CH, // "." = 3 dots, emit the ".." state = 0
ACTION_BACKUP, // EOS = we have a "/..\0", backup! state = 4
ACTION_BACKUP // "/" = we have a "/../", backup! state = 1
};
//
// this table says which newstate to be in given the current state and the
// character we saw
//
ULONG pNextStateTable[16] =
{
// state 0
0 , // other
0 , // "."
4 , // EOS
1 , // "\"
// state 1
0 , // other
2 , // "."
4 , // EOS
1 , // "\"
// state 2
0 , // other
3 , // "."
4 , // EOS
1 , // "\"
// state 3
0 , // other
0 , // "."
4 , // EOS
1 // "\"
};
//
// this says how to index into pNextStateTable given our current state.
//
// since max states = 4, we calculate the index by multiplying with 4.
//
#define IndexFromState( st) ( (st) * 4)
/***************************************************************************++
Routine Description:
Unescape
Convert backslash to forward slash
Remove double slashes (empty directiories names) - e.g. // or \\
Handle /./
Handle /../
Convert to unicode
Arguments:
Return Value:
HRESULT
--***************************************************************************/
HRESULT
UlCleanAndCopyUrl(
IN PUCHAR pSource,
IN ULONG SourceLength,
OUT PULONG pBytesCopied,
OUT PWSTR pDestination,
OUT PWSTR * ppQueryString OPTIONAL
)
{
NTSTATUS Status;
PWSTR pDest;
PUCHAR pChar;
ULONG CharToSkip;
UCHAR Char;
ULONG BytesCopied;
PWSTR pQueryString;
ULONG StateIndex;
WCHAR UnicodeChar;
BOOLEAN MakeCanonical;
URL_PART UrlPart = AbsPath;
//
// a cool local helper macro
//
#define EMIT_CHAR(ch) \
do { \
pDest[0] = (ch); \
pDest += 1; \
BytesCopied += 2; \
} while (0)
pDest = pDestination;
pQueryString = NULL;
BytesCopied = 0;
pChar = pSource;
CharToSkip = 0;
StateIndex = 0;
MakeCanonical = (UrlPart == AbsPath) ? TRUE : FALSE;
while (SourceLength > 0)
{
//
// advance ! it's at the top of the loop to enable ANSI_NULL to
// come through ONCE
//
pChar += CharToSkip;
SourceLength -= CharToSkip;
//
// well? have we hit the end?
//
if (SourceLength == 0)
{
UnicodeChar = UNICODE_NULL;
}
else
{
//
// Nope. Peek briefly to see if we hit the query string
//
if (UrlPart == AbsPath && pChar[0] == '?')
{
DBG_ASSERT(pQueryString == NULL);
//
// remember it's location
//
pQueryString = pDest;
//
// let it fall through ONCE to the canonical
// in order to handle a trailing "/.." like
// "http://foobar:80/foo/bar/..?v=1&v2"
//
UnicodeChar = L'?';
CharToSkip = 1;
//
// now we are cleaning the query string
//
UrlPart = QueryString;
}
else
{
//
// grab the next char
//
Status = PopChar(UrlPart, pChar, &UnicodeChar, &CharToSkip);
if (NT_SUCCESS(Status) == FALSE)
goto end;
}
}
if (MakeCanonical)
{
//
// now use the state machine to make it canonical .
//
//
// from the old value of StateIndex, figure out our new base StateIndex
//
StateIndex = IndexFromState(pNextStateTable[StateIndex]);
//
// did we just hit the query string? this will only happen once
// that we take this branch after hitting it, as we stop
// processing after hitting it.
//
if (UrlPart == QueryString)
{
//
// treat this just like we hit a NULL, EOS.
//
StateIndex += 2;
}
else
{
//
// otherwise based the new state off of the char we
// just popped.
//
switch (UnicodeChar)
{
case UNICODE_NULL: StateIndex += 2; break;
case L'.': StateIndex += 1; break;
case L'/': StateIndex += 3; break;
default: StateIndex += 0; break;
}
}
}
else
{
StateIndex = (UnicodeChar == UNICODE_NULL) ? 2 : 0;
}
//
// Perform the action associated with the state.
//
switch (pActionTable[StateIndex])
{
case ACTION_EMIT_DOT_DOT_CH:
EMIT_CHAR(L'.');
// fall through
case ACTION_EMIT_DOT_CH:
EMIT_CHAR(L'.');
// fall through
case ACTION_EMIT_CH:
EMIT_CHAR(UnicodeChar);
// fall through
case ACTION_NOTHING:
break;
case ACTION_BACKUP:
//
// pDest currently points 1 past the last '/'. backup over it and
// find the preceding '/', set pDest to 1 past that one.
//
//
// backup to the '/'
//
pDest -= 1;
BytesCopied -= 2;
DBG_ASSERT(pDest[0] == L'/');
//
// are we at the start of the string? that's bad, can't go back!
//
if (pDest == pDestination)
{
DBG_ASSERT(BytesCopied == 0);
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
//
// back up over the '/'
//
pDest -= 1;
BytesCopied -= 2;
DBG_ASSERT(pDest > pDestination);
//
// now find the previous slash
//
while (pDest > pDestination && pDest[0] != L'/')
{
pDest -= 1;
BytesCopied -= 2;
}
//
// we already have a slash, so don't have to store 1.
//
DBG_ASSERT(pDest[0] == L'/');
//
// simply skip it, as if we had emitted it just now
//
pDest += 1;
BytesCopied += 2;
break;
default:
DBG_ASSERT(!"w3core!UlpCleanAndCopyUrl: Invalid action code in state table!");
Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
goto end;
}
//
// Just hit the query string ?
//
if (MakeCanonical && UrlPart == QueryString)
{
//
// Stop canonical processing
//
MakeCanonical = FALSE;
//
// Need to emit the '?', it wasn't emitted above
//
DBG_ASSERT(pActionTable[StateIndex] != ACTION_EMIT_CH);
EMIT_CHAR(L'?');
}
}
//
// terminate the string, it hasn't been done in the loop
//
DBG_ASSERT((pDest-1)[0] != UNICODE_NULL);
pDest[0] = UNICODE_NULL;
*pBytesCopied = BytesCopied;
if (ppQueryString != NULL)
{
*ppQueryString = pQueryString;
}
Status = STATUS_SUCCESS;
end:
return HRESULT_FROM_WIN32( RtlNtStatusToDosError( Status ) );
} // UlCleanAndCopyUrl