625 lines
16 KiB
C++
625 lines
16 KiB
C++
|
|
#include "priv.h"
|
|
|
|
|
|
|
|
// Character classification helpers.
//
// The argument is wrapped in parentheses so that an expression argument
// (for example IS_WHITESPACE(ch & 0x7f)) is compared as a whole instead of
// being torn apart by operator precedence.  IS_WHITESPACE evaluates its
// argument twice, so do not pass an expression with side effects.
#define IS_WHITESPACE(ch)   (' ' == (ch) || '\t' == (ch))

#define IS_NEWLINE(ch)      ('\n' == (ch))
|
|
|
|
|
|
//
// Option bits for the dwFlags argument of CParseFile::_ReadChar
//

// Skip to the next line
#define RCF_NEXTLINE        0x0001

// Skip to the next non-whitespace character
#define RCF_NEXTNWS         0x0002

// Skip trailing whitespace
#define RCF_SKIPTRAILING    0x0004
|
|
|
|
|
|
// constructor
|
|
CParseFile::CParseFile()
|
|
{
|
|
}
|
|
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse the given file according to the provided flags.
|
|
*/
|
|
void CParseFile::Parse(FILE * pfileSrc, FILE * pfileDest, DWORD dwFlags)
|
|
{
|
|
_bSkipWhitespace = BOOLIFY(dwFlags & PFF_WHITESPACE);
|
|
|
|
_pfileSrc = pfileSrc;
|
|
_pfileDest = pfileDest;
|
|
_ichRead = 0;
|
|
_cchRead = 0;
|
|
|
|
_ichWrite = 0;
|
|
|
|
_ch = 0;
|
|
|
|
if (dwFlags & PFF_HTML)
|
|
_ParseHtml();
|
|
else if (dwFlags & PFF_HTC)
|
|
_ParseHtc();
|
|
else if (dwFlags & PFF_JS)
|
|
_ParseJS();
|
|
else
|
|
_ParseInf();
|
|
|
|
_FlushWriteBuffer();
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Read the next character in the file. Sets _ch.
|
|
|
|
*/
|
|
char CParseFile::_ReadChar(DWORD dwFlags)
|
|
{
|
|
BOOL bFirstCharSav = _bFirstChar;
|
|
|
|
do
|
|
{
|
|
_ichRead++;
|
|
_bFirstChar = FALSE;
|
|
|
|
// Are we past the buffer, or do we skip to next line?
|
|
if (_ichRead >= _cchRead || dwFlags & RCF_NEXTLINE)
|
|
{
|
|
// Yes; read in more
|
|
if (fgets(_szReadBuf, SIZECHARS(_szReadBuf), _pfileSrc))
|
|
{
|
|
_ichRead = 0;
|
|
_cchRead = strlen(_szReadBuf);
|
|
_bFirstChar = TRUE;
|
|
}
|
|
else
|
|
{
|
|
_ichRead = 0;
|
|
_cchRead = 0;
|
|
}
|
|
}
|
|
|
|
if (_ichRead < _cchRead)
|
|
_ch = _szReadBuf[_ichRead];
|
|
else
|
|
_ch = CHAR_EOF;
|
|
} while ((dwFlags & RCF_NEXTNWS) && IS_WHITESPACE(_ch));
|
|
|
|
// Are we supposed to skip to the next non-whitespace?
|
|
if (dwFlags & RCF_NEXTNWS)
|
|
{
|
|
// Yes; then retain the "first character" state
|
|
_bFirstChar = bFirstCharSav;
|
|
}
|
|
|
|
return _ch;
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Read ahead to the next character in the buffer and return its
|
|
value, but don't set _ch or increment the read pointer.
|
|
*/
|
|
char CParseFile::_SniffChar(int ichAhead)
|
|
{
|
|
if (_ichRead + ichAhead < _cchRead)
|
|
return _szReadBuf[_ichRead + ichAhead];
|
|
|
|
return 0;
|
|
}
|
|
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Write the character to the file
|
|
*/
|
|
void CParseFile::_WriteChar(char ch)
|
|
{
|
|
_szWriteBuf[_ichWrite++] = ch;
|
|
_szWriteBuf[_ichWrite] = 0;
|
|
|
|
if ('\n' == ch || SIZECHARS(_szWriteBuf)-1 == _ichWrite)
|
|
{
|
|
fputs(_szWriteBuf, _pfileDest);
|
|
_ichWrite = 0;
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Flushes the write buffer to the file
|
|
*/
|
|
void CParseFile::_FlushWriteBuffer(void)
|
|
{
|
|
if (_ichWrite > 0)
|
|
{
|
|
fputs(_szWriteBuf, _pfileDest);
|
|
_ichWrite = 0;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse a .inf file.
|
|
*/
|
|
void CParseFile::_ParseInf(void)
|
|
{
|
|
_ReadChar(0);
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
if (_bFirstChar)
|
|
{
|
|
// Is this a comment?
|
|
if (';' == _ch)
|
|
{
|
|
// Yes; skip to next line
|
|
_ReadChar(RCF_NEXTLINE);
|
|
continue;
|
|
}
|
|
|
|
if (_SkipWhitespace())
|
|
continue;
|
|
}
|
|
|
|
_WriteChar(_ch);
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Write the current character and the rest of the tag. Assumes
|
|
_ch is the beginning of the tag ('<').
|
|
|
|
There are some parts of the tag which may be compacted if _bSkipWhitespace
|
|
is TRUE. The general rule is only one space is required between attributes,
|
|
and newlines are converted to spaces if necessary. Anything in quotes
|
|
(single or double) are left alone.
|
|
*/
|
|
void CParseFile::_WriteTag(void)
|
|
{
|
|
BOOL bSingleQuotes = FALSE;
|
|
BOOL bDblQuotes = FALSE;
|
|
|
|
// The end of the tag is the next '>' that is not in single or double-quotes.
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
if ('\'' == _ch)
|
|
bSingleQuotes ^= TRUE;
|
|
else if ('"' == _ch)
|
|
bDblQuotes ^= TRUE;
|
|
|
|
if (!bSingleQuotes && !bDblQuotes)
|
|
{
|
|
// _SkipWhitespace returns TRUE if it skips any whitespace,
|
|
// which means we've read some more input, which means we should
|
|
// go to the top of the loop and check for EOF and quotes.
|
|
if (_bSkipWhitespace && _SkipWhitespace(TRUE))
|
|
continue;
|
|
|
|
// End of tag?
|
|
if ('>' == _ch)
|
|
{
|
|
// Yes
|
|
_WriteChar(_ch);
|
|
break;
|
|
}
|
|
}
|
|
|
|
_WriteChar(_ch);
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Skip the current comment tag. Assumes _ch is the beginning of
|
|
the tag ('<').
|
|
*/
|
|
void CParseFile::_SkipCommentTag(void)
|
|
{
|
|
// The end of the tag is the next '-->'
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
// Is the end of the comment coming up?
|
|
if ('-' == _ch && _SniffChar(1) == '-' && _SniffChar(2) == '>')
|
|
{
|
|
// Yes
|
|
_ReadChar(0); // skip '-'
|
|
_ReadChar(0); // skip '>'
|
|
break;
|
|
}
|
|
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Skip leading whitespace.
|
|
|
|
Returns TRUE if anything was skipped
|
|
*/
|
|
BOOL CParseFile::_SkipWhitespace(BOOL bPreserveOneSpace)
|
|
{
|
|
BOOL bRet = FALSE;
|
|
|
|
if (_bSkipWhitespace)
|
|
{
|
|
if (IS_WHITESPACE(_ch))
|
|
{
|
|
// Skip leading whitespace in line
|
|
_ReadChar(RCF_NEXTNWS);
|
|
bRet = TRUE;
|
|
}
|
|
else if (IS_NEWLINE(_ch))
|
|
{
|
|
// Move to the next line
|
|
_ReadChar(RCF_NEXTLINE);
|
|
|
|
// Skip leading whitespace on the next line, but don't write
|
|
// another space char (we'll do that here if necessary) and
|
|
// ignore the return value since we've already skipped some
|
|
// whitespace here (return TRUE).
|
|
_SkipWhitespace(FALSE);
|
|
|
|
bRet = TRUE;
|
|
}
|
|
// Write a single space char if we skipped something and the caller
|
|
// asked us to preserve a space.
|
|
if (bRet && bPreserveOneSpace)
|
|
_WriteChar(' ');
|
|
}
|
|
return bRet;
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Skip a C or C++ style comment
|
|
|
|
Returns TRUE if a comment boundary was encountered.
|
|
*/
|
|
BOOL CParseFile::_SkipComment(int * pcNestedComment)
|
|
{
|
|
BOOL bRet = FALSE;
|
|
|
|
if ('/' == _ch)
|
|
{
|
|
// Is this a C++ comment?
|
|
if ('/' == _SniffChar(1))
|
|
{
|
|
// Yes; skip it to end of line
|
|
if (!_bFirstChar || !_bSkipWhitespace)
|
|
_WriteChar('\n');
|
|
|
|
_ReadChar(RCF_NEXTLINE);
|
|
bRet = TRUE;
|
|
}
|
|
// Is this a C comment?
|
|
else if ('*' == _SniffChar(1))
|
|
{
|
|
// Yes; skip to respective '*/'
|
|
_ReadChar(0); // skip '/'
|
|
_ReadChar(0); // skip '*'
|
|
(*pcNestedComment)++;
|
|
bRet = TRUE;
|
|
}
|
|
}
|
|
else if ('*' == _ch)
|
|
{
|
|
// Is this the end of a C comment?
|
|
if ('/' == _SniffChar(1))
|
|
{
|
|
// Yes
|
|
_ReadChar(0); // skip '*'
|
|
_ReadChar(0); // skip '/'
|
|
(*pcNestedComment)--;
|
|
|
|
// Prevent writing an unnecessary '\n'
|
|
_bFirstChar = TRUE;
|
|
bRet = TRUE;
|
|
}
|
|
}
|
|
return bRet;
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse the innertext of the STYLE tag, remove any comments
|
|
*/
|
|
void CParseFile::_ParseInnerStyle(void)
|
|
{
|
|
int cNestedComment = 0;
|
|
|
|
// The end of the tag is the next '</STYLE>'
|
|
|
|
_ReadChar(0);
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
if (_bFirstChar && _SkipWhitespace())
|
|
continue;
|
|
|
|
// Is the end of the styletag section coming up?
|
|
if ('<' == _ch && _IsTagEqual("/STYLE"))
|
|
{
|
|
// Yes
|
|
break;
|
|
}
|
|
|
|
if (_SkipComment(&cNestedComment))
|
|
continue;
|
|
|
|
if (0 == cNestedComment && !IS_NEWLINE(_ch))
|
|
_WriteChar(_ch);
|
|
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Returns TRUE if the given tagname matches the currently parsed token
|
|
*/
|
|
BOOL CParseFile::_IsTagEqual(LPSTR pszTag)
|
|
{
|
|
int ich = 1;
|
|
|
|
while (*pszTag)
|
|
{
|
|
if (_SniffChar(ich++) != *pszTag++)
|
|
return FALSE;
|
|
}
|
|
|
|
// We should verify we've come to the end of the tagName
|
|
char chEnd = _SniffChar(ich);
|
|
|
|
return (' ' == chEnd || '>' == chEnd || '<' == chEnd);
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Returns TRUE if the current tag is an end tag
|
|
*/
|
|
BOOL CParseFile::_IsEndTag(void)
|
|
{
|
|
return (_SniffChar(1) == '/');
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse a .htm or .hta file.
|
|
*/
|
|
void CParseFile::_ParseHtml(void)
|
|
{
|
|
BOOL bFollowingTag = FALSE;
|
|
BOOL bFollowingEndTag = FALSE;
|
|
|
|
_ReadChar(0);
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
// Anytime we read another char, we should go to the top of the loop
|
|
// to check for EOF and skip leading whitespace if it's a new line.
|
|
//
|
|
// Note that _SkipWhitespace returns TRUE if it has skipped something,
|
|
// which also involves reading a new char.
|
|
|
|
if (_bFirstChar && _SkipWhitespace())
|
|
continue;
|
|
|
|
// Is this a tag?
|
|
if ('<' == _ch)
|
|
{
|
|
// Yes; looks like it
|
|
|
|
// Since we've found a new tag, no need to remember if we just saw
|
|
// an end tag. That only matters for text content following an end
|
|
// tag. For example, given "<SPAN>foo</SPAN> bar", we need to
|
|
// preserve a space before the word "bar".
|
|
bFollowingEndTag = FALSE;
|
|
|
|
if (_IsTagEqual("!--"))
|
|
{
|
|
// Comment; skip it
|
|
_SkipCommentTag();
|
|
}
|
|
else if (_IsTagEqual("SCRIPT"))
|
|
{
|
|
// Parse the script
|
|
_WriteTag(); // write the <SCRIPT> tag
|
|
|
|
// FEATURE (scotth): we always assume javascript, maybe we should support something else
|
|
_ParseJS();
|
|
|
|
_WriteTag(); // write the </SCRIPT> tag
|
|
}
|
|
else if (_IsTagEqual("STYLE"))
|
|
{
|
|
_WriteTag(); // write the <STYLE> tag
|
|
_ParseInnerStyle();
|
|
_WriteTag(); // write the </STYLE> tag
|
|
}
|
|
else
|
|
{
|
|
// Check for end tag ("</") before calling _WriteTag
|
|
bFollowingEndTag = _IsEndTag();
|
|
|
|
// Any other tag: write the tag and go to the next one
|
|
_WriteTag();
|
|
}
|
|
|
|
bFollowingTag = TRUE;
|
|
_ReadChar(0);
|
|
continue;
|
|
}
|
|
|
|
if (bFollowingTag && _bSkipWhitespace)
|
|
{
|
|
// We can't entirely skip whitespace following tags such as </SPAN>
|
|
// or </A>, but we can at least collapse it down to a single space.
|
|
BOOL bPreserveOneSpace = bFollowingEndTag;
|
|
|
|
bFollowingEndTag = FALSE;
|
|
bFollowingTag = FALSE;
|
|
|
|
if (_SkipWhitespace(bPreserveOneSpace))
|
|
continue;
|
|
}
|
|
|
|
_WriteChar(_ch);
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse a .js file.
|
|
*/
|
|
void CParseFile::_ParseJS(void)
|
|
{
|
|
BOOL bDblQuotes = FALSE;
|
|
BOOL bSingleQuotes = FALSE;
|
|
int cNestedComment = 0;
|
|
|
|
_ReadChar(0);
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
// Are we in a comment?
|
|
if (0 == cNestedComment)
|
|
{
|
|
// No; (we only pay attention to strings when they're not in comments)
|
|
if ('\'' == _ch)
|
|
bSingleQuotes ^= TRUE;
|
|
else if ('"' == _ch)
|
|
bDblQuotes ^= TRUE;
|
|
|
|
if (_bSkipWhitespace && !bDblQuotes && !bSingleQuotes)
|
|
{
|
|
if (IS_WHITESPACE(_ch))
|
|
{
|
|
// Skip whitespace
|
|
if (!_bFirstChar)
|
|
_WriteChar(' ');
|
|
|
|
_ReadChar(RCF_NEXTNWS);
|
|
continue;
|
|
}
|
|
else if (IS_NEWLINE(_ch))
|
|
{
|
|
// Since javascript doesn't require a ';' at the end of a statement,
|
|
// we should at least replace the newline with a space so tokens don't
|
|
// get appended accidentally.
|
|
|
|
// The javascript engine has a line-length limit. So don't replace
|
|
// a newline with a space.
|
|
if (!_bFirstChar)
|
|
_WriteChar('\n');
|
|
|
|
_ReadChar(RCF_NEXTLINE);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Are we in a string?
|
|
if (!bDblQuotes && !bSingleQuotes)
|
|
{
|
|
// No; look for the terminating SCRIPT tag
|
|
if ('<' == _ch)
|
|
{
|
|
if (_IsTagEqual("/SCRIPT"))
|
|
{
|
|
// We've reached the end of the script block
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Are we in a string?
|
|
if (!bDblQuotes && !bSingleQuotes)
|
|
{
|
|
// No; look for comments...
|
|
if (_SkipComment(&cNestedComment))
|
|
continue;
|
|
}
|
|
|
|
if (0 == cNestedComment)
|
|
_WriteChar(_ch);
|
|
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
/*-------------------------------------------------------------------------
|
|
Purpose: Parse a .htc file.
|
|
*/
|
|
void CParseFile::_ParseHtc(void)
|
|
{
|
|
BOOL bFollowingTag = FALSE;
|
|
int cNestedComment = 0;
|
|
|
|
_ReadChar(0);
|
|
|
|
while (CHAR_EOF != _ch)
|
|
{
|
|
if (_bFirstChar && _SkipWhitespace())
|
|
continue;
|
|
|
|
// Is this a tag?
|
|
if ('<' == _ch)
|
|
{
|
|
// Yes; is it a script tag?
|
|
if (_IsTagEqual("SCRIPT"))
|
|
{
|
|
// Yes; parse the script
|
|
_WriteTag(); // write the <SCRIPT> tag
|
|
|
|
// FEATURE (scotth): we always assume javascript
|
|
_ParseJS();
|
|
|
|
_WriteTag(); // write the </SCRIPT> tag
|
|
|
|
_ReadChar(0);
|
|
bFollowingTag = TRUE;
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
_WriteTag();
|
|
_ReadChar(0);
|
|
bFollowingTag = TRUE;
|
|
continue;
|
|
|
|
}
|
|
}
|
|
|
|
// Look for comments outside the SCRIPT block...
|
|
if (_SkipComment(&cNestedComment))
|
|
continue;
|
|
|
|
if (bFollowingTag && _bSkipWhitespace)
|
|
{
|
|
bFollowingTag = FALSE;
|
|
|
|
if (_SkipWhitespace())
|
|
continue;
|
|
}
|
|
|
|
if (0 == cNestedComment)
|
|
_WriteChar(_ch);
|
|
|
|
_ReadChar(0);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|