/*++

   Copyright    (c)    2000    Microsoft Corporation

   Module  Name :
     ulparse.cxx

   Abstract:
     Rip some useful UL code

   Author:
     (RIPPED from UL driver code (HenrySa, PaulMcd)

   Environment:
     Win32 - User Mode

   Project:
     ULW3.DLL
--*/

#include "precomp.hxx"

//
// Which part of the URL is currently being cleaned. Only AbsPath and
// QueryString are used by the code in this file.
//

typedef enum _URL_PART
{
    Scheme,
    HostName,
    AbsPath,
    QueryString

} URL_PART;

//
// TRUE if ch has the form 10xxxxxx, i.e. is a UTF-8 continuation byte.
//

#define IS_UTF8_TRAILBYTE(ch)      (((ch) & 0xc0) == 0x80)

/***************************************************************************++

Routine Description:

    Decodes a single "%xx" escape sequence into the byte it represents.

Arguments:

    pChar     - points at the '%'. The caller must guarantee that
                pChar[0..2] are readable; no length is passed in.

    pOutChar  - receives the decoded byte on success.

Return Value:

    STATUS_SUCCESS, or STATUS_OBJECT_PATH_SYNTAX_BAD if pChar does not
    point at '%' followed by two hex digits.

--***************************************************************************/
NTSTATUS
Unescape(
    IN  PUCHAR pChar,
    OUT PUCHAR pOutChar
    )
{
    UCHAR Result, Digit;

    if (pChar[0] != '%' ||
        SAFEIsXDigit(pChar[1]) == FALSE ||
        SAFEIsXDigit(pChar[2]) == FALSE)
    {
        return STATUS_OBJECT_PATH_SYNTAX_BAD;
    }

    //
    // HexToChar() inlined
    //

    // uppercase #1
    //
    if (isalpha(pChar[1]))
        Digit = (UCHAR) toupper(pChar[1]);
    else
        Digit = pChar[1];

    // high nibble
    //
    Result = ((Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0')) << 4;

    // uppercase #2
    //
    if (isalpha(pChar[2]))
        Digit = (UCHAR) toupper(pChar[2]);
    else
        Digit = pChar[2];

    // low nibble
    //
    Result |= (Digit >= 'A') ? (Digit - 'A' + 0xA) : (Digit - '0');

    *pOutChar = Result;

    return STATUS_SUCCESS;

}   // Unescape

/***************************************************************************++

Routine Description:

    Length-aware worker behind PopChar(). Pops one logical character off
    the front of the URL: unescapes "%xx" (outside the query string),
    decodes escaped 2- and 3-byte UTF-8 runs, and maps '\' to '/'
    (outside the query string).

    Unlike a raw Unescape() walk, this routine never reads at or beyond
    pChar[BytesRemaining], and on success *pCharToSkip is always
    <= BytesRemaining.

Arguments:

    UrlPart         - which part of the URL pChar lies in.

    pChar           - current position in the source URL.

    BytesRemaining  - number of readable bytes starting at pChar.

    pUnicodeChar    - receives the decoded character on success.

    pCharToSkip     - receives the number of source bytes consumed
                      (1, 3, 6 or 9) on success.

Return Value:

    STATUS_SUCCESS, or STATUS_OBJECT_PATH_SYNTAX_BAD for a malformed or
    truncated escape, invalid or overlong UTF-8, a raw or escaped
    high-bit byte that is not part of a valid run, or a decoded NULL.
    The OUT parameters are only written on success.

--***************************************************************************/
static
NTSTATUS
UlpPopCharBounded(
    IN  URL_PART    UrlPart,
    IN  PUCHAR      pChar,
    IN  ULONG       BytesRemaining,
    OUT WCHAR *     pUnicodeChar,
    OUT PULONG      pCharToSkip
    )
{
    NTSTATUS    Status;
    WCHAR       UnicodeChar;
    UCHAR       Char;
    UCHAR       Trail1;
    UCHAR       Trail2;
    ULONG       CharToSkip;

    //
    // nothing to pop ?
    //

    if (BytesRemaining == 0)
    {
        Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
        goto end;
    }

    //
    // need to unescape ?
    //
    // can't decode the query string. that would be lossy decoding
    // as '=' and '&' characters might be encoded, but have meaning
    // to the usermode parser.
    //

    if (UrlPart != QueryString && pChar[0] == '%')
    {
        //
        // a "%xx" needs 3 bytes; a truncated escape at the very end of
        // the buffer must not make us read past it.
        //

        if (BytesRemaining < 3)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        Status = Unescape(pChar, &Char);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip = 3;
    }
    else
    {
        Char = pChar[0];
        CharToSkip = 1;
    }

    //
    // convert to unicode, checking for utf8 .
    //
    // 3 byte runs are the largest we can have.  16 bits in UCS-2 =
    // 3 bytes of (4+4,2+6,2+6) where it's code + char.
    // for a total of 6+6+4 char bits = 16 bits.
    //

    //
    // NOTE: we'll only bother to decode utf if it was escaped
    // thus the (CharToSkip == 3)
    //

    if ((CharToSkip == 3) && ((Char & 0xf0) == 0xe0))
    {
        // 3 byte run
        //

        // Unescape the next 2 trail bytes, making sure each "%xx" is
        // actually inside the buffer first.
        //

        if ((BytesRemaining - CharToSkip) < 3)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        Status = Unescape(pChar+CharToSkip, &Trail1);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        if ((BytesRemaining - CharToSkip) < 3)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        Status = Unescape(pChar+CharToSkip, &Trail2);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        if (IS_UTF8_TRAILBYTE(Trail1) == FALSE ||
            IS_UTF8_TRAILBYTE(Trail2) == FALSE)
        {
            // bad utf!
            //
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        // handle three byte case
        // 1110xxxx 10xxxxxx 10xxxxxx

        UnicodeChar = (USHORT) (((Char & 0x0f) << 12) |
                                ((Trail1 & 0x3f) << 6) |
                                (Trail2 & 0x3f));

        //
        // anything below 0x0800 fits in a shorter form, so this is an
        // overlong encoding (e.g. %e0%80%af for '/'). not legal utf8.
        //

        if (UnicodeChar < 0x0800)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }
    }
    else if ((CharToSkip == 3) && ((Char & 0xe0) == 0xc0))
    {
        // 2 byte run
        //

        // Unescape the next 1 trail byte, if it is inside the buffer.
        //

        if ((BytesRemaining - CharToSkip) < 3)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        Status = Unescape(pChar+CharToSkip, &Trail1);
        if (NT_SUCCESS(Status) == FALSE)
            goto end;

        CharToSkip += 3; // %xx

        if (IS_UTF8_TRAILBYTE(Trail1) == FALSE)
        {
            // bad utf!
            //
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        // handle two byte case
        // 110xxxxx 10xxxxxx

        UnicodeChar = (USHORT) (((Char & 0x1f) << 6) |
                                (Trail1 & 0x3f));

        //
        // anything below 0x0080 is 7-bit ascii wearing a 2 byte
        // costume (e.g. %c0%af for '/'). overlong, not legal utf8.
        //

        if (UnicodeChar < 0x0080)
        {
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }
    }

    // now this can either be unescaped high-bit (bad)
    // or escaped high-bit.  (also bad)
    //
    // thus not checking CharToSkip
    //

    else if ((Char & 0x80) == 0x80)
    {
        // high bit set !  bad utf!
        //
        Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
        goto end;
    }

    //
    // Normal character (again either escaped or unescaped)
    //

    else
    {
        //
        // Simple conversion to unicode, it's 7-bit ascii.
        //

        UnicodeChar = (USHORT)Char;
    }

    //
    // turn backslashes into forward slashes
    //

    if (UrlPart != QueryString && UnicodeChar == L'\\')
    {
        UnicodeChar = L'/';
    }
    else if (UnicodeChar == 0)
    {
        //
        // we pop'd a NULL.  bad!
        //
        Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
        goto end;
    }

    *pCharToSkip  = CharToSkip;
    *pUnicodeChar = UnicodeChar;

    Status = STATUS_SUCCESS;

end:
    return Status;

}   // UlpPopCharBounded

/***************************************************************************++

Routine Description:

    Pops one logical character off the front of the URL. See
    UlpPopCharBounded() for the decoding rules.

    This entry point carries no length, so the caller must guarantee
    that every "%xx" escape that starts at pChar (up to 9 bytes for an
    escaped 3-byte UTF-8 run) is readable.

Arguments:

    UrlPart       - which part of the URL pChar lies in.

    pChar         - current position in the source URL.

    pUnicodeChar  - receives the decoded character on success.

    pCharToSkip   - receives the number of source bytes consumed.

Return Value:

    STATUS_SUCCESS or STATUS_OBJECT_PATH_SYNTAX_BAD.

--***************************************************************************/
NTSTATUS
PopChar(
    IN  URL_PART    UrlPart,
    IN  PUCHAR      pChar,
    OUT WCHAR *     pUnicodeChar,
    OUT PULONG      pCharToSkip
    )
{
    //
    // no length available here: treat the buffer as unbounded, which is
    // the contract this routine has always had.
    //

    return UlpPopCharBounded(
                UrlPart,
                pChar,
                (ULONG)-1,
                pUnicodeChar,
                pCharToSkip
                );

}   // PopChar


//
// Private constants.
//

#define ACTION_NOTHING              0x00000000
#define ACTION_EMIT_CH              0x00010000
#define ACTION_EMIT_DOT_CH          0x00020000
#define ACTION_EMIT_DOT_DOT_CH      0x00030000
#define ACTION_BACKUP               0x00040000
#define ACTION_MASK                 0xFFFF0000

//
// Private globals
//

//
// this table says what to do based on the current state and the current
// character. it is indexed by (state * 4) + character class, where the
// character class is 0 = other, 1 = ".", 2 = EOS, 3 = "/".
//
ULONG  pActionTable[16] =
{
    //
    // state 0 = fresh, seen nothing exciting yet
    //
    ACTION_EMIT_CH,             // other = emit it                      state = 0
    ACTION_EMIT_CH,             // "."   = emit it                      state = 0
    ACTION_NOTHING,             // EOS   = normal finish                state = 4
    ACTION_EMIT_CH,             // "/"   = we saw the "/", emit it      state = 1

    //
    // state 1 = we saw a "/" !
    //
    ACTION_EMIT_CH,             // other = emit it,                     state = 0
    ACTION_NOTHING,             // "."   = eat it,                      state = 2
    ACTION_NOTHING,             // EOS   = normal finish                state = 4
    ACTION_NOTHING,             // "/"   = extra slash, eat it,         state = 1

    //
    // state 2 = we saw a "/" and ate a "." !
    //
    ACTION_EMIT_DOT_CH,         // other = emit the dot we ate.         state = 0
    ACTION_NOTHING,             // "."   = eat it, a ..                 state = 3
    ACTION_NOTHING,             // EOS   = normal finish                state = 4
    ACTION_NOTHING,             // "/"   = we ate a "/./", swallow it   state = 1

    //
    // state 3 = we saw a "/" and ate a ".." !
    //
    ACTION_EMIT_DOT_DOT_CH,     // other = emit the "..".               state = 0
    ACTION_EMIT_DOT_DOT_CH,     // "."   = 3 dots, emit the ".."        state = 0
    ACTION_BACKUP,              // EOS   = we have a "/..\0", backup!   state = 4
    ACTION_BACKUP               // "/"   = we have a "/../", backup!    state = 1
};

//
// this table says which newstate to be in given the current state and the
// character we saw. same indexing as pActionTable.
//
ULONG  pNextStateTable[16] =
{
    // state 0
    0 ,             // other
    0 ,             // "."
    4 ,             // EOS
    1 ,             // "/"

    //  state 1
    0 ,             // other
    2 ,             // "."
    4 ,             // EOS
    1 ,             // "/"

    // state 2
    0 ,             // other
    3 ,             // "."
    4 ,             // EOS
    1 ,             // "/"

    // state 3
    0 ,             // other
    0 ,             // "."
    4 ,             // EOS
    1               // "/"
};

//
// this says how to index into pNextStateTable given our current state.
//
// since max states = 4, we calculate the index by multiplying with 4.
//
#define IndexFromState( st)   ( (st) * 4)


/***************************************************************************++

Routine Description:

    Cleans a raw URL (abs-path plus optional query string) and copies it
    to pDestination as a NULL-terminated wide string:

        Unescape
        Convert backslash to forward slash
        Remove double slashes (empty directory names) - e.g. // or \\
        Handle /./
        Handle /../
        Convert to unicode

    Canonicalization stops at the first '?'. The query string is copied
    without unescaping; high-bit and NULL bytes are still rejected there.

Arguments:

    pSource         - the raw URL bytes. Need not be NULL terminated.

    SourceLength    - number of bytes at pSource.

    pBytesCopied    - on success, receives the number of BYTES written
                      to pDestination, not counting the terminating NULL.

    pDestination    - receives the cleaned URL. No bounds checking is
                      done on this buffer. Each source byte produces at
                      most one output WCHAR, so (SourceLength + 1) WCHARs
                      is always sufficient.

    ppQueryString   - optional. On success, receives a pointer to the
                      '?' inside pDestination, or NULL if the URL had no
                      query string.

Return Value:

    HRESULT - the NTSTATUS mapped through RtlNtStatusToDosError() and
    HRESULT_FROM_WIN32(). The OUT parameters are only written on success.

--***************************************************************************/
HRESULT
UlCleanAndCopyUrl(
    IN      PUCHAR      pSource,
    IN      ULONG       SourceLength,
    OUT     PULONG      pBytesCopied,
    OUT     PWSTR       pDestination,
    OUT     PWSTR *     ppQueryString OPTIONAL
    )
{
    NTSTATUS    Status;
    PWSTR       pDest;
    PUCHAR      pChar;
    ULONG       CharToSkip;
    ULONG       BytesCopied;
    PWSTR       pQueryString;
    ULONG       StateIndex;
    WCHAR       UnicodeChar;
    BOOLEAN     MakeCanonical;
    URL_PART    UrlPart = AbsPath;

//
// a cool local helper macro
//

#define EMIT_CHAR(ch)                                   \
    do {                                                \
        pDest[0] = (ch);                                \
        pDest += 1;                                     \
        BytesCopied += 2;                               \
    } while (0)


    pDest = pDestination;
    pQueryString = NULL;
    BytesCopied = 0;

    pChar = pSource;
    CharToSkip = 0;

    StateIndex = 0;

    MakeCanonical = (UrlPart == AbsPath) ? TRUE : FALSE;

    while (SourceLength > 0)
    {
        //
        // advance !  it's at the top of the loop to enable ANSI_NULL to
        // come through ONCE
        //
        // CharToSkip never exceeds SourceLength (the bounded pop below
        // guarantees it), so this subtraction cannot wrap.
        //

        pChar += CharToSkip;
        SourceLength -= CharToSkip;

        //
        // well?  have we hit the end?
        //

        if (SourceLength == 0)
        {
            UnicodeChar = UNICODE_NULL;
        }
        else
        {
            //
            // Nope.  Peek briefly to see if we hit the query string
            //

            if (UrlPart == AbsPath && pChar[0] == '?')
            {
                DBG_ASSERT(pQueryString == NULL);

                //
                // let it fall through ONCE to the canonical
                // in order to handle a trailing "/.." like
                // "http://foobar:80/foo/bar/..?v=1&v2"
                //
                // NOTE: the query string location is NOT remembered
                // here. the state machine below may still back pDest up
                // over a "/..", so it is captured where the '?' is
                // actually emitted.
                //

                UnicodeChar = L'?';
                CharToSkip = 1;

                //
                // now we are cleaning the query string
                //

                UrlPart = QueryString;
            }
            else
            {
                //
                // grab the next char, never looking past the end of the
                // source buffer
                //

                Status = UlpPopCharBounded(
                                UrlPart,
                                pChar,
                                SourceLength,
                                &UnicodeChar,
                                &CharToSkip
                                );

                if (NT_SUCCESS(Status) == FALSE)
                    goto end;
            }
        }

        if (MakeCanonical)
        {
            //
            // now use the state machine to make it canonical .
            //

            //
            // from the old value of StateIndex, figure out our new base StateIndex
            //
            StateIndex = IndexFromState(pNextStateTable[StateIndex]);

            //
            // did we just hit the query string?  this will only happen once
            // that we take this branch after hitting it, as we stop
            // processing after hitting it.
            //

            if (UrlPart == QueryString)
            {
                //
                // treat this just like we hit a NULL, EOS.
                //

                StateIndex += 2;
            }
            else
            {
                //
                // otherwise base the new state off of the char we
                // just popped.
                //

                switch (UnicodeChar)
                {
                case UNICODE_NULL:      StateIndex += 2;    break;
                case L'.':              StateIndex += 1;    break;
                case L'/':              StateIndex += 3;    break;
                default:                StateIndex += 0;    break;
                }
            }
        }
        else
        {
            //
            // not canonicalizing: either emit the char or finish on EOS
            //

            StateIndex = (UnicodeChar == UNICODE_NULL) ? 2 : 0;
        }

        //
        //  Perform the action associated with the state.
        //

        switch (pActionTable[StateIndex])
        {
        case ACTION_EMIT_DOT_DOT_CH:

            EMIT_CHAR(L'.');

            // fall through

        case ACTION_EMIT_DOT_CH:

            EMIT_CHAR(L'.');

            // fall through

        case ACTION_EMIT_CH:

            EMIT_CHAR(UnicodeChar);

            // fall through

        case ACTION_NOTHING:
            break;

        case ACTION_BACKUP:

            //
            // pDest currently points 1 past the last '/'.  backup over it and
            // find the preceding '/', set pDest to 1 past that one.
            //

            //
            // backup to the '/'
            //

            pDest       -= 1;
            BytesCopied -= 2;

            DBG_ASSERT(pDest[0] == L'/');

            //
            // are we at the start of the string?  that's bad, can't go back!
            //

            if (pDest == pDestination)
            {
                DBG_ASSERT(BytesCopied == 0);

                Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
                goto end;
            }

            //
            // back up over the '/'
            //

            pDest       -= 1;
            BytesCopied -= 2;

            DBG_ASSERT(pDest > pDestination);

            //
            // now find the previous slash
            //

            while (pDest > pDestination && pDest[0] != L'/')
            {
                pDest       -= 1;
                BytesCopied -= 2;
            }

            //
            // we already have a slash, so don't have to store 1.
            //

            DBG_ASSERT(pDest[0] == L'/');

            //
            // simply skip it, as if we had emitted it just now
            //

            pDest       += 1;
            BytesCopied += 2;

            break;

        default:
            DBG_ASSERT(!"w3core!UlpCleanAndCopyUrl: Invalid action code in state table!");
            Status = STATUS_OBJECT_PATH_SYNTAX_BAD;
            goto end;
        }

        //
        // Just hit the query string ?
        //

        if (MakeCanonical && UrlPart == QueryString)
        {
            //
            // Stop canonical processing
            //

            MakeCanonical = FALSE;

            //
            // Need to emit the '?', it wasn't emitted above
            //

            DBG_ASSERT(pActionTable[StateIndex] != ACTION_EMIT_CH);

            //
            // remember its location. this is done here, after any
            // ACTION_BACKUP has moved pDest, so the pointer really
            // addresses the '?' we are about to write.
            //

            pQueryString = pDest;

            EMIT_CHAR(L'?');
        }
    }

    //
    // terminate the string, it hasn't been done in the loop
    //
    // (nothing precedes pDest when the source was empty, so don't peek
    // at pDest[-1] in that case)
    //

    DBG_ASSERT(pDest == pDestination || (pDest-1)[0] != UNICODE_NULL);

    pDest[0] = UNICODE_NULL;
    *pBytesCopied = BytesCopied;

    if (ppQueryString != NULL)
    {
        *ppQueryString = pQueryString;
    }

    Status = STATUS_SUCCESS;

end:
    return HRESULT_FROM_WIN32( RtlNtStatusToDosError( Status ) );

}   // UlCleanAndCopyUrl