windows-nt/Source/XPSP1/NT/shell/tools/cleaninf/parse.cpp
2020-09-26 16:20:57 +08:00

625 lines
16 KiB
C++

#include "priv.h"
// Character classification helpers.  The argument is parenthesized so that
// an expression argument (for example a conditional expression) is compared
// as a whole.  Note that IS_WHITESPACE evaluates its argument twice.
#define IS_WHITESPACE(ch) (' ' == (ch) || '\t' == (ch))
#define IS_NEWLINE(ch) ('\n' == (ch))
// Flags for _ReadChar
#define RCF_NEXTLINE 0x0001 // skip to next line
#define RCF_NEXTNWS 0x0002 // skip to next non-whitespace
#define RCF_SKIPTRAILING 0x0004 // skip trailing whitespace
// Constructor.  Performs no initialization of its own; Parse() assigns the
// stream pointers, the whitespace flag and the buffer indices before it
// dispatches to one of the parsers.
CParseFile::CParseFile()
{
}
/*-------------------------------------------------------------------------
Purpose: Run one source stream through the cleaner and write the result.

         pfileSrc   stream the input is read from
         pfileDest  stream the cleaned output is written to
         dwFlags    PFF_* flags.  PFF_WHITESPACE turns on whitespace
                    stripping.  PFF_HTML, PFF_HTC and PFF_JS select the
                    parser and are tested in that order; when none of
                    them is set the input is parsed as a .inf file.
*/
void CParseFile::Parse(FILE * pfileSrc, FILE * pfileDest, DWORD dwFlags)
{
    // Bind the streams and reset the per-file read/write state.
    _pfileSrc = pfileSrc;
    _pfileDest = pfileDest;
    _bSkipWhitespace = BOOLIFY(dwFlags & PFF_WHITESPACE);
    _ch = 0;
    _ichWrite = 0;
    _cchRead = 0;
    _ichRead = 0;

    // Pick the parser for this file type.
    if (0 == (dwFlags & (PFF_HTML | PFF_HTC | PFF_JS)))
    {
        _ParseInf();
    }
    else if (dwFlags & PFF_HTML)
    {
        _ParseHtml();
    }
    else if (dwFlags & PFF_HTC)
    {
        _ParseHtc();
    }
    else
    {
        _ParseJS();
    }

    // Push out whatever is still sitting in the write buffer.
    _FlushWriteBuffer();
}
/*-------------------------------------------------------------------------
Purpose: Advance to the next character of the input and store it in _ch.

         dwFlags is a combination of RCF_* flags:
           RCF_NEXTLINE  drop the rest of the buffered line and continue
                         with the first character of the next one
           RCF_NEXTNWS   keep advancing while the character is a space
                         or a tab; _bFirstChar is restored to the value
                         it had on entry

         _bFirstChar is set TRUE when the returned character is the
         first one of a freshly read buffer, FALSE otherwise.

Returns: the new value of _ch; CHAR_EOF once fgets yields no more data.
*/
char CParseFile::_ReadChar(DWORD dwFlags)
{
    const BOOL bFirstCharOnEntry = _bFirstChar;

    for (;;)
    {
        ++_ichRead;
        _bFirstChar = FALSE;

        // Refill when a new line was requested or the buffer is used up.
        if ((dwFlags & RCF_NEXTLINE) || _ichRead >= _cchRead)
        {
            _ichRead = 0;
            if (fgets(_szReadBuf, SIZECHARS(_szReadBuf), _pfileSrc))
            {
                _cchRead = strlen(_szReadBuf);
                _bFirstChar = TRUE;
            }
            else
            {
                // Nothing more to read; the test below reports CHAR_EOF.
                _cchRead = 0;
            }
        }

        if (_ichRead < _cchRead)
        {
            _ch = _szReadBuf[_ichRead];
        }
        else
        {
            _ch = CHAR_EOF;
        }

        // Stop unless the caller asked for the next non-whitespace
        // character and this one is still a space or tab.
        if (!(dwFlags & RCF_NEXTNWS) || !IS_WHITESPACE(_ch))
        {
            break;
        }
    }

    if (dwFlags & RCF_NEXTNWS)
    {
        // Skipping whitespace must not disturb the "first character" state.
        _bFirstChar = bFirstCharOnEntry;
    }

    return _ch;
}
/*-------------------------------------------------------------------------
Purpose: Peek at the character ichAhead positions past the current read
         position.  Neither _ch nor the read position is changed.

Returns: the character, or 0 when that position lies beyond the data
         currently held in the read buffer.
*/
char CParseFile::_SniffChar(int ichAhead)
{
    char chPeek = 0;

    if (_ichRead + ichAhead < _cchRead)
    {
        chPeek = _szReadBuf[_ichRead + ichAhead];
    }

    return chPeek;
}
/*-------------------------------------------------------------------------
Purpose: Write the character to the file
*/
void CParseFile::_WriteChar(char ch)
{
_szWriteBuf[_ichWrite++] = ch;
_szWriteBuf[_ichWrite] = 0;
if ('\n' == ch || SIZECHARS(_szWriteBuf)-1 == _ichWrite)
{
fputs(_szWriteBuf, _pfileDest);
_ichWrite = 0;
}
}
/*-------------------------------------------------------------------------
Purpose: Write any characters still held in the write buffer to the
         destination file and mark the buffer as empty.
*/
void CParseFile::_FlushWriteBuffer(void)
{
    // Nothing buffered, nothing to do.
    if (!(_ichWrite > 0))
    {
        return;
    }

    fputs(_szWriteBuf, _pfileDest);
    _ichWrite = 0;
}
/*-------------------------------------------------------------------------
Purpose: Parse a .inf file.  Lines whose first character is ';' are
         dropped; leading whitespace and blank lines are dropped when
         whitespace stripping is enabled.  Everything else is copied.
*/
void CParseFile::_ParseInf(void)
{
    _ReadChar(0);
    while (CHAR_EOF != _ch)
    {
        if (_bFirstChar && ';' == _ch)
        {
            // Comment line; discard it entirely.
            _ReadChar(RCF_NEXTLINE);
        }
        else if (_bFirstChar && _SkipWhitespace())
        {
            // Leading whitespace was consumed; re-examine the new _ch.
        }
        else
        {
            // Ordinary character; copy it through.
            _WriteChar(_ch);
            _ReadChar(0);
        }
    }
}
/*-------------------------------------------------------------------------
Purpose: Write the current character and the rest of the tag. Assumes
         _ch is the beginning of the tag ('<').  On return _ch is the
         closing '>' (already written) or CHAR_EOF.

         Some parts of the tag are compacted if _bSkipWhitespace is TRUE:
         a run of whitespace outside quotes is handed to _SkipWhitespace,
         which writes a single space in its place, and newlines are
         treated the same way.  Anything in quotes (single or double) is
         left alone.

         A quote character only opens or closes a quoted run of its own
         kind: an apostrophe inside a double-quoted value (title="it's")
         is ordinary text, and likewise a double quote inside a
         single-quoted value.
*/
void CParseFile::_WriteTag(void)
{
    BOOL bSingleQuotes = FALSE;
    BOOL bDblQuotes = FALSE;

    // The end of the tag is the next '>' that is not in single or double-quotes.
    while (CHAR_EOF != _ch)
    {
        if ('\'' == _ch)
        {
            // Ignore apostrophes that are part of a double-quoted value
            if (!bDblQuotes)
                bSingleQuotes ^= TRUE;
        }
        else if ('"' == _ch)
        {
            // Ignore double quotes that are part of a single-quoted value
            if (!bSingleQuotes)
                bDblQuotes ^= TRUE;
        }

        if (!bSingleQuotes && !bDblQuotes)
        {
            // _SkipWhitespace returns TRUE if it skips any whitespace,
            // which means we've read some more input, which means we should
            // go to the top of the loop and check for EOF and quotes.
            if (_bSkipWhitespace && _SkipWhitespace(TRUE))
                continue;

            // End of tag?
            if ('>' == _ch)
            {
                // Yes
                _WriteChar(_ch);
                break;
            }
        }

        _WriteChar(_ch);
        _ReadChar(0);
    }
}
/*-------------------------------------------------------------------------
Purpose: Discard an HTML comment without writing any of it.  Assumes _ch
         is the beginning of the tag ('<').  The comment ends at the next
         "-->" that lies entirely within one buffered line.  On return
         _ch is the closing '>' (or CHAR_EOF if the input ran out); the
         caller is expected to read past it.
*/
void CParseFile::_SkipCommentTag(void)
{
    for ( ; CHAR_EOF != _ch; _ReadChar(0))
    {
        if ('-' != _ch)
        {
            continue;
        }

        // Is this the first '-' of the terminating "-->"?
        if ('-' == _SniffChar(1) && '>' == _SniffChar(2))
        {
            _ReadChar(0);   // step onto the second '-'
            _ReadChar(0);   // step onto the '>'
            break;
        }
    }
}
/*-------------------------------------------------------------------------
Purpose: Skip whitespace starting at the current character.  Does nothing
         unless whitespace stripping (_bSkipWhitespace) is enabled.

         If _ch is a space or tab, the run of spaces/tabs is skipped.
         If _ch is a newline, that newline, any immediately following
         empty lines, and the leading spaces/tabs of the next line are
         skipped.

         When bPreserveOneSpace is TRUE and something was skipped, a
         single space is written in place of what was removed.

Returns: TRUE if anything was skipped (a new character has been read).
*/
BOOL CParseFile::_SkipWhitespace(BOOL bPreserveOneSpace)
{
    if (!_bSkipWhitespace)
    {
        return FALSE;
    }

    BOOL bSkipped = TRUE;

    if (IS_WHITESPACE(_ch))
    {
        // Run of spaces/tabs within the line
        _ReadChar(RCF_NEXTNWS);
    }
    else if (IS_NEWLINE(_ch))
    {
        // Drop this newline along with any empty lines right after it...
        do
        {
            _ReadChar(RCF_NEXTLINE);
        } while (IS_NEWLINE(_ch));

        // ...and the indentation of the line we ended up on.
        if (IS_WHITESPACE(_ch))
        {
            _ReadChar(RCF_NEXTNWS);
        }
    }
    else
    {
        bSkipped = FALSE;
    }

    // Leave one space behind if the caller needs a separator.
    if (bSkipped && bPreserveOneSpace)
    {
        _WriteChar(' ');
    }

    return bSkipped;
}
/*-------------------------------------------------------------------------
Purpose: Recognize a C or C++ style comment boundary at the current
         character and step over it.

         pcNestedComment points at the caller's count of open C-style
         comments.  It is incremented when a comment opens and
         decremented when one closes.  The caller is responsible for
         not writing characters while the count is non-zero.

         A C++ comment is only recognized outside a C comment, so that
         text such as a URL inside a C comment cannot hide the comment's
         terminator.  A terminator seen while no C comment is open is
         treated as ordinary text, so the count never goes negative.

Returns: TRUE if a comment boundary was encountered (a new character
         has been read).
*/
BOOL CParseFile::_SkipComment(int * pcNestedComment)
{
    BOOL bRet = FALSE;

    if ('/' == _ch)
    {
        // Is this a C++ comment?
        if ('/' == _SniffChar(1))
        {
            // Only when we are not already inside a C comment
            if (0 == *pcNestedComment)
            {
                // Yes; skip it to end of line.  The comment swallows the
                // line's newline, so put one back unless the whole line
                // was a comment and whitespace is being stripped.
                if (!_bFirstChar || !_bSkipWhitespace)
                    _WriteChar('\n');
                _ReadChar(RCF_NEXTLINE);
                bRet = TRUE;
            }
        }
        // Is this a C comment?
        else if ('*' == _SniffChar(1))
        {
            // Yes; skip to respective '*/'
            _ReadChar(0);       // skip '/'
            _ReadChar(0);       // skip '*'
            (*pcNestedComment)++;
            bRet = TRUE;
        }
    }
    else if ('*' == _ch)
    {
        // Is this the end of a C comment that is actually open?
        if ('/' == _SniffChar(1) && *pcNestedComment > 0)
        {
            // Yes
            _ReadChar(0);       // skip '*'
            _ReadChar(0);       // skip '/'
            (*pcNestedComment)--;

            // Prevent writing an unnecessary '\n'
            _bFirstChar = TRUE;
            bRet = TRUE;
        }
    }

    return bRet;
}
/*-------------------------------------------------------------------------
Purpose: Parse the inner text of a STYLE tag.  Comments are removed and
         newline characters are never copied to the output.  Parsing
         stops with _ch on the '<' of the closing STYLE tag, or at the
         end of the input.
*/
void CParseFile::_ParseInnerStyle(void)
{
    int cOpenComments = 0;

    _ReadChar(0);
    while (CHAR_EOF != _ch)
    {
        if (_bFirstChar && _SkipWhitespace())
        {
            // Leading whitespace was consumed; re-examine the new _ch.
        }
        else if ('<' == _ch && _IsTagEqual("/STYLE"))
        {
            // Reached the closing tag; the caller writes it.
            break;
        }
        else if (!_SkipComment(&cOpenComments))
        {
            // Copy the character unless it is inside a comment or is a
            // newline.
            if (!(0 != cOpenComments || IS_NEWLINE(_ch)))
            {
                _WriteChar(_ch);
            }
            _ReadChar(0);
        }
    }
}
// Fold an ASCII lowercase letter to uppercase; any other character is
// returned unchanged.
static char ParseFile_AsciiUpper(char ch)
{
    return ('a' <= ch && ch <= 'z') ? static_cast<char>(ch - ('a' - 'A')) : ch;
}

/*-------------------------------------------------------------------------
Purpose: Returns TRUE if the given tagname matches the currently parsed
         token.  Assumes _ch is the '<' that opens the tag; the name is
         compared against the characters that follow it in the read
         buffer.

         The comparison ignores ASCII case, since HTML tag names are
         not case-sensitive.  The name must be followed by a space, tab,
         carriage return, newline, '>' or '<' to count as a match, so
         that a longer name with the same prefix is not accepted.

         NOTE(review): a comment opener that is immediately followed by
         text, with no whitespace in between, is still not reported as
         a match for "!--" -- confirm whether that is intended.
*/
BOOL CParseFile::_IsTagEqual(LPSTR pszTag)
{
    int ich = 1;

    while (*pszTag)
    {
        if (ParseFile_AsciiUpper(_SniffChar(ich++)) != ParseFile_AsciiUpper(*pszTag++))
            return FALSE;
    }

    // We should verify we've come to the end of the tagName
    const char chEnd = _SniffChar(ich);
    return (' ' == chEnd || '\t' == chEnd ||
            '\r' == chEnd || '\n' == chEnd ||
            '>' == chEnd || '<' == chEnd);
}
/*-------------------------------------------------------------------------
Purpose: Returns TRUE if the current tag is an end tag, i.e. the
         character right after the current one is '/'.
*/
BOOL CParseFile::_IsEndTag(void)
{
    const char chNext = _SniffChar(1);
    return ('/' == chNext);
}
/*-------------------------------------------------------------------------
Purpose: Parse a .htm or .hta file.
*/
void CParseFile::_ParseHtml(void)
{
BOOL bFollowingTag = FALSE;
BOOL bFollowingEndTag = FALSE;
_ReadChar(0);
while (CHAR_EOF != _ch)
{
// Anytime we read another char, we should go to the top of the loop
// to check for EOF and skip leading whitespace if it's a new line.
//
// Note that _SkipWhitespace returns TRUE if it has skipped something,
// which also involves reading a new char.
if (_bFirstChar && _SkipWhitespace())
continue;
// Is this a tag?
if ('<' == _ch)
{
// Yes; looks like it
// Since we've found a new tag, no need to remember if we just saw
// an end tag. That only matters for text content following an end
// tag. For example, given "<SPAN>foo</SPAN> bar", we need to
// preserve a space before the word "bar".
bFollowingEndTag = FALSE;
if (_IsTagEqual("!--"))
{
// Comment; skip it
_SkipCommentTag();
}
else if (_IsTagEqual("SCRIPT"))
{
// Parse the script
_WriteTag(); // write the <SCRIPT> tag
// FEATURE (scotth): we always assume javascript, maybe we should support something else
_ParseJS();
_WriteTag(); // write the </SCRIPT> tag
}
else if (_IsTagEqual("STYLE"))
{
_WriteTag(); // write the <STYLE> tag
_ParseInnerStyle();
_WriteTag(); // write the </STYLE> tag
}
else
{
// Check for end tag ("</") before calling _WriteTag
bFollowingEndTag = _IsEndTag();
// Any other tag: write the tag and go to the next one
_WriteTag();
}
bFollowingTag = TRUE;
_ReadChar(0);
continue;
}
if (bFollowingTag && _bSkipWhitespace)
{
// We can't entirely skip whitespace following tags such as </SPAN>
// or </A>, but we can at least collapse it down to a single space.
BOOL bPreserveOneSpace = bFollowingEndTag;
bFollowingEndTag = FALSE;
bFollowingTag = FALSE;
if (_SkipWhitespace(bPreserveOneSpace))
continue;
}
_WriteChar(_ch);
_ReadChar(0);
}
}
/*-------------------------------------------------------------------------
Purpose: Parse a .js file.
*/
void CParseFile::_ParseJS(void)
{
BOOL bDblQuotes = FALSE;
BOOL bSingleQuotes = FALSE;
int cNestedComment = 0;
_ReadChar(0);
while (CHAR_EOF != _ch)
{
// Are we in a comment?
if (0 == cNestedComment)
{
// No; (we only pay attention to strings when they're not in comments)
if ('\'' == _ch)
bSingleQuotes ^= TRUE;
else if ('"' == _ch)
bDblQuotes ^= TRUE;
if (_bSkipWhitespace && !bDblQuotes && !bSingleQuotes)
{
if (IS_WHITESPACE(_ch))
{
// Skip whitespace
if (!_bFirstChar)
_WriteChar(' ');
_ReadChar(RCF_NEXTNWS);
continue;
}
else if (IS_NEWLINE(_ch))
{
// Since javascript doesn't require a ';' at the end of a statement,
// we should at least replace the newline with a space so tokens don't
// get appended accidentally.
// The javascript engine has a line-length limit. So don't replace
// a newline with a space.
if (!_bFirstChar)
_WriteChar('\n');
_ReadChar(RCF_NEXTLINE);
continue;
}
}
// Are we in a string?
if (!bDblQuotes && !bSingleQuotes)
{
// No; look for the terminating SCRIPT tag
if ('<' == _ch)
{
if (_IsTagEqual("/SCRIPT"))
{
// We've reached the end of the script block
break;
}
}
}
}
// Are we in a string?
if (!bDblQuotes && !bSingleQuotes)
{
// No; look for comments...
if (_SkipComment(&cNestedComment))
continue;
}
if (0 == cNestedComment)
_WriteChar(_ch);
_ReadChar(0);
}
}
/*-------------------------------------------------------------------------
Purpose: Parse a .htc file.
*/
void CParseFile::_ParseHtc(void)
{
BOOL bFollowingTag = FALSE;
int cNestedComment = 0;
_ReadChar(0);
while (CHAR_EOF != _ch)
{
if (_bFirstChar && _SkipWhitespace())
continue;
// Is this a tag?
if ('<' == _ch)
{
// Yes; is it a script tag?
if (_IsTagEqual("SCRIPT"))
{
// Yes; parse the script
_WriteTag(); // write the <SCRIPT> tag
// FEATURE (scotth): we always assume javascript
_ParseJS();
_WriteTag(); // write the </SCRIPT> tag
_ReadChar(0);
bFollowingTag = TRUE;
continue;
}
else
{
_WriteTag();
_ReadChar(0);
bFollowingTag = TRUE;
continue;
}
}
// Look for comments outside the SCRIPT block...
if (_SkipComment(&cNestedComment))
continue;
if (bFollowingTag && _bSkipWhitespace)
{
bFollowingTag = FALSE;
if (_SkipWhitespace())
continue;
}
if (0 == cNestedComment)
_WriteChar(_ch);
_ReadChar(0);
}
}