windows-nt/Source/XPSP1/NT/windows/richedit/re41/rtflex.cpp

1034 lines
24 KiB
C++
Raw Normal View History

2020-09-26 03:20:57 -05:00
/*
* @doc INTERNAL
*
* @module RTFLEX.CPP - RichEdit RTF reader lexical analyzer |
*
* This file contains the implementation of the lexical analyzer part of
* the RTF reader.
*
* Authors: <nl>
* Original RichEdit 1.0 RTF converter: Anthony Francisco <nl>
* Conversion to C++ and RichEdit 2.0: Murray Sargent <nl>
*
* @devnote
* All sz's in the RTF*.? files refer to a LPSTRs, not LPWSTRs, unless
* noted as a szUnicode.
*
* Copyright (c) 1995-2000, Microsoft Corporation. All rights reserved.
*/
#include "_common.h"
#include "_rtfread.h"
#include "hash.h"
#include "tokens.cpp"
ASSERTDATA
// Array used by character classification macros to speed classification
// of chars residing in two or more discontiguous ranges, e.g., alphanumeric
// or hex. The alphabetics used in RTF control words are lower-case ASCII.
// *** DO NOT DBCS rgbCharClass[] ***
#define fCS fCT + fSP
#define fSB fBL + fSP
#define fHD fHX + fDG
#define fHU fHX + fUC
#define fHL fHX + fLC
const BYTE rgbCharClass[256] =
{
fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCS,fCS,fCS,fCS,fCS,fCT,fCT,
fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT,
fSB,fPN,fPN,fPN,fPN,fPN,fPN,fPN, fPN,fPN,fPN,fPN,fPN,fPN,fPN,fPN,
fHD,fHD,fHD,fHD,fHD,fHD,fHD,fHD, fHD,fHD,fPN,fPN,fPN,fPN,fPN,fPN,
fPN,fHU,fHU,fHU,fHU,fHU,fHU,fUC, fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC,
fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC, fUC,fUC,fUC,fPN,fPN,fPN,fPN,fPN,
fPN,fHL,fHL,fHL,fHL,fHL,fHL,fLC, fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC,
fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC, fLC,fLC,fLC,fPN,fPN,fPN,fPN,fPN,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// Specifies the number of bytes we can safely "UngetChar"
// before possibly underflowing the buffer.
const int cbBackupMax = 4;
// Bug2298 - I found an RTF writer which emits uppercase RTF keywords,
// so I had to change IsLCAscii to IsAlphaChar for use in scanning
// for RTF keywords.
inline BOOL IsAlphaChar(BYTE b)
{
return IN_RANGE('a', b | 0x20, 'z');
}
/*
* IsRTF(pstr, cb)
*
* @func
* Return FALSE if cb < 7 or pstr is NULL or if pstr doesn't start
* with "{\rtf"N or "{\urtf"N, where N is an ASCII number. cb gives
* the minimum length of pstr unless pstr is NULL-terminated, in which
* case the null terminator marks the end of the string.
*
* @rdesc
* TRUE if pstr points at a valid start of RTF data
*/
BOOL IsRTF(
char *pstr, //@parm String to check
LONG cb) //@parm Min byte count if string isn't null terminated
{
if(!pstr || cb < 7 || *pstr++ != '{' || *pstr++ != '\\')
return FALSE; // Quick out for most common cases
if(*pstr == 'u') // Bypass u of possible urtf
pstr++;
return !CompareMemory("rtf", pstr, 3) && IsASCIIDigit((BYTE)pstr[3]);
}
/*
* CRTFRead::InitLex()
*
* @mfunc
* Initialize the lexical analyzer. Reset the variables. if reading in
* from resource file, sort the keyword list (). Uses global hinstRE
* from the RichEdit to find out where its resources are. Note: in
* RichEdit 2.0, currently the resource option is not supported.
*
* @rdesc
* TRUE If lexical analyzer was initialized
*/
BOOL CRTFRead::InitLex()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::InitLex");
AssertSz(cKeywords == i_TokenIndexMax,
"Keyword index enumeration is incompatible with rgKeyword[]");
Assert(!_szText && !_pchRTFBuffer);
// Allocate our buffers with an extra byte for szText so that hex
// conversion doesn't have to worry about running off the end if the
// first char is NULL
if ((_szText = (BYTE *)PvAlloc(cachTextMax + 1, GMEM_ZEROINIT)) &&
(_pchRTFBuffer = (BYTE *)PvAlloc(cachBufferMost, GMEM_ZEROINIT)))
{
return TRUE; // Signal that lexer is initialized
}
_ped->GetCallMgr()->SetOutOfMemory();
_ecParseError = ecLexInitFailed;
return FALSE;
}
/*
* CRTFRead::DeinitLex()
*
* @mfunc
* Shut down lexical analyzer
*/
void CRTFRead::DeinitLex()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::DeinitLex");
#ifdef KEYWORD_RESOURCE
if (hglbKeywords)
{
FreeResource(hglbKeywords);
hglbKeywords = NULL;
rgKeyword = NULL;
}
#endif
FreePv(_szText);
FreePv(_pchRTFBuffer);
}
/*
* CRTFRead::GetChar()
*
* @mfunc
* Get next char, filling buffer as needed
*
* @rdesc
* BYTE nonzero char value if success; else 0
*/
BYTE CRTFRead::GetChar()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetChar");
if (_pchRTFCurrent == _pchRTFEnd && !FillBuffer())
{
_ecParseError = ecUnexpectedEOF;
return 0;
}
return *_pchRTFCurrent++;
}
/*
* CRTFRead::GetCharEx()
*
* @mfunc
* Get next char including escaped chars of form \'xx
*
* @rdesc
* BYTE nonzero char value if success; else 0
*/
BYTE CRTFRead::GetCharEx()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetCharEx");
BYTE ach;
do
ach = GetChar();
while (ach == CR || ach == LF); // Ignore CRLFs
if(ach == BSLASH)
{
if(GetChar() == '\'')
{
// Convert hex to char and store result in _token
if(TokenGetHex() != tokenError)
return (BYTE)_token;
_ecParseError = ecUnexpectedChar;
}
UngetChar();
}
return ach;
}
/*
* CRTFRead::FillBuffer()
*
* @mfunc
* Fill RTF buffer & return != 0 if successful
*
* @rdesc
* LONG # chars read
*
* @comm
* This routine doesn't bother copying anything down if
* pchRTFCurrent <lt> pchRTFEnd so anything not read yet is lost.
* The only exception to this is that it always copies down the
* last two bytes read so that UngetChar() will work. ReadData()
* actually counts on this behavior, so if you change it, change
* ReadData() accordingly.
*/
LONG CRTFRead::FillBuffer()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::FillBuffer");
LONG cchRead;
if (!_pchRTFCurrent)
{
// No data yet, nothing for backup
// Leave cbBackupMax NULL chars so backup
// area of buffer doesn't contain garbage.
for(int i = 0; i < cbBackupMax; i++)
{
_pchRTFBuffer[i] = 0;
}
}
else
{
Assert(_pchRTFCurrent == _pchRTFEnd);
// Copy most recently read chars in case
// we need to back up
int cbBackup = min((UINT) cbBackupMax, DiffPtrs(_pchRTFCurrent, &_pchRTFBuffer[cbBackupMax]));
int i;
for(i = -1; i >= -cbBackup; i--)
_pchRTFBuffer[cbBackupMax + i] = _pchRTFCurrent[i];
if(cbBackup < cbBackupMax)
{
// NULL before the first valid character in the backup buffer
_pchRTFBuffer[cbBackupMax + i] = 0;
}
}
_pchRTFCurrent = &_pchRTFBuffer[cbBackupMax];
// Fill buffer with as much as we can take given our starting offset
_pes->dwError = _pes->pfnCallback(_pes->dwCookie,
_pchRTFCurrent,
cachBufferMost - cbBackupMax,
&cchRead);
if (_pes->dwError)
{
TRACEERRSZSC("RTFLEX: GetChar()", _pes->dwError);
_ecParseError = ecGeneralFailure;
return 0;
}
_pchRTFEnd = &_pchRTFBuffer[cbBackupMax + cchRead]; // Point the end
#if defined(DEBUG)
if(_hfileCapture)
{
DWORD cbLeftToWrite = cchRead;
DWORD cbWritten = 0;
BYTE *pbToWrite = (BYTE *)_pchRTFCurrent;
while(WriteFile(_hfileCapture,
pbToWrite,
cbLeftToWrite,
&cbWritten,
NULL) &&
(pbToWrite += cbWritten,
(cbLeftToWrite -= cbWritten)));
}
#endif
return cchRead;
}
/*
* CRTFRead::UngetChar()
*
* @mfunc
* Bump our file pointer back one char
*
* @rdesc
* BOOL TRUE on success
*
* @comm
* You can safely UngetChar _at most_ cbBackupMax times without
* error.
*/
BOOL CRTFRead::UngetChar()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
if (_pchRTFCurrent == _pchRTFBuffer || !_pchRTFCurrent)
{
Assert(0);
_ecParseError = ecUnGetCharFailed;
return FALSE;
}
--_pchRTFCurrent;
return TRUE;
}
/*
* CRTFRead::UngetChar(cch)
*
* @mfunc
* Bump our file pointer back 'cch' chars
*
* @rdesc
* BOOL TRUE on success
*
* @comm
* You can safely UngetChar _at most_ cbBackupMax times without
* error.
*/
BOOL CRTFRead::UngetChar(
UINT cch) //@parm cch to put back in buffer
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
AssertSz(cch <= cbBackupMax, "CRTFRead::UngetChar(): Number of UngetChar's "
"exceeds size of backup buffer.");
while(cch-- > 0)
{
if(!UngetChar())
return FALSE;
}
return TRUE;
}
/*
* CRTFRead::GetHex()
*
* @mfunc
* Get next char if hex and return hex value
* If not hex, leave char in buffer and return 255
*
* @rdesc
* BYTE hex value of GetChar() if hex; else 255
*/
BYTE CRTFRead::GetHex()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHex");
BYTE ch = GetChar();
if(IsXDigit(ch))
return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
if(ch)
UngetChar();
return 255;
}
/*
* CRTFRead::GetHexSkipCRLF()
*
* @mfunc
* Get next char if hex and return hex value
* If not hex, leave char in buffer and return 255
*
* @rdesc
* BYTE hex value of GetChar() if hex; else 255
*
* @devnote
* Keep this in sync with GetHex above.
*/
BYTE CRTFRead::GetHexSkipCRLF()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHexSkipCRLF");
BYTE ch = GetChar();
// Skip \r \n
while(ch == CR || ch == LF)
ch = GetChar();
// Rest is same as CRTFRead::GetHex()
if(IsXDigit(ch))
return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
if(ch)
UngetChar();
return 255;
}
/*
* CRTFRead::TokenGetHex()
*
* @mfunc
* Get an 8 bit character saved as a 2 hex digit value
*
* @rdesc
* TOKEN value of hex number read in
*/
TOKEN CRTFRead::TokenGetHex()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetHex");
_token = tokenError; // Default error
BYTE bChar0 = GetHex(); // Get hexadigit
if(bChar0 < 16) // It's valid
{
BYTE bChar1 = GetHex(); // Get next hexadigit
if(bChar1 < 16) // It's valid too
_token = (WORD)(bChar0 << 4 | bChar1);
else
UngetChar(); // Invalid: put back 1st hexadigit
}
return _token;
}
/*
* CRTFRead::SkipToEndOfGroup()
*
* @mfunc
* Skip to end of current group
*
* @rdesc
* EC An error code
*/
EC CRTFRead::SkipToEndOfGroup()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::SkipToEndOfGroup");
INT nDepth = 1;
BYTE ach;
while(TRUE)
{
ach = GetChar();
switch(ach)
{
case BSLASH:
{
BYTE achNext = GetChar();
// EOF: goto done; else ignore NULLs
if(!achNext && _ecParseError == ecUnexpectedEOF)
goto done;
if(achNext == 'b' && UngetChar() &&
TokenGetKeyword() == tokenBinaryData)
{
// We've encountered the \binN tag in the RTF we want
// to skip. _iParam contains N from \binN once the
// tag is parsed by TokenGetKeyword()
SkipBinaryData(_iParam);
}
break;
}
case LBRACE:
nDepth++;
break;
case RBRACE:
if (--nDepth <= 0)
goto done;
break;
case 0:
if(_ecParseError == ecUnexpectedEOF)
goto done;
default:
// Detect Lead bytes here.
int cTrailBytes = GetTrailBytesCount(ach, _nCodePage);
if (cTrailBytes)
{
for (int i = 0; i < cTrailBytes; i++)
{
ach = GetChar();
if(ach == 0 && _ecParseError == ecUnexpectedEOF)
goto done;
}
}
break;
}
}
Assert(!_ecParseError);
_ecParseError = ecUnexpectedEOF;
done:
return _ecParseError;
}
/*
* CRTFRead::TokenFindKeyword(szKeyword, prgKeyword, cKeyword)
*
* @mfunc
* Find keyword <p szKeyword> and return its token value
*
* @rdesc
* TOKEN token number of keyword
*/
TOKEN CRTFRead::TokenFindKeyword(
BYTE * szKeyword, //@parm Keyword to find
const KEYWORD *prgKeyword, //@parm Keyword array to use
LONG cKeyword) //@parm Count of keywords
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenFindKeyword");
INT iMax;
INT iMid;
INT iMin;
INT nComp;
BYTE * pchCandidate;
BYTE * pchKeyword;
const KEYWORD * pk;
AssertSz(szKeyword[0],
"CRTFRead::TokenFindKeyword: null keyword");
_iKeyword = 0;
#ifdef RTF_HASHCACHE
if ( _rtfHashInited )
{
// Hash is 23% faster than the following binary search on finds
// and 55% faster on misses: For 97 words stored in a 257 cache.
// Performance numbers will change when the total stored goes up.
pk = HashKeyword_Fetch ( (CHAR *) szKeyword );
}
else
#endif
{
iMin = 0;
iMax = cKeyword - 1;
pk = NULL;
do
{
iMid = (iMin + iMax) / 2;
pchCandidate = (BYTE *)prgKeyword[iMid].szKeyword;
pchKeyword = szKeyword;
while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20)) // Be sure to match
&& *pchKeyword) // terminating 0's
{
pchKeyword++;
pchCandidate++;
}
if (nComp < 0)
iMax = iMid - 1;
else if (nComp)
iMin = iMid + 1;
else
{
pk = &prgKeyword[iMid];
_iKeyword = iMid; // Save keyword index
break;
}
} while (iMin <= iMax);
}
if(pk)
{
_token = pk->token;
// Log the RTF keyword scan to aid in tracking RTF tag coverage
// TODO: Implement RTF tag logging for the Mac and WinCE
#if defined(DEBUG) && !defined(NOFULLDEBUG)
if(_prtflg)
{
#ifdef RTF_HASCACHE
_prtflg->AddAt(szKeyword);
#else
_prtflg->AddAt((size_t)iMid);
#endif
}
#endif
}
else
{ // No match: place to take
_token = tokenUnknownKeyword; // care of unrecognized RTF
if(_fNotifyLowFiRTF)
{
iMin = 0; // Use binary search as above
iMax = crgszUnrecognizedRTF - 1;
do
{
iMid = (iMin + iMax) / 2;
pchCandidate = (BYTE *)rgszUnrecognizedRTF[iMid];
pchKeyword = szKeyword;
while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20))
&& *pchKeyword)
{
pchKeyword++;
pchCandidate++;
}
if (nComp < 0)
iMax = iMid - 1;
else if (nComp && *pchCandidate)
iMin = iMid + 1;
else // Found keyword
{
_iKeyword = -iMid - 1;
CheckNotifyLowFiRTF();
break;
}
} while (iMin <= iMax);
}
}
return _token;
}
/*
* CRTFRead::CheckNotifyLowFiRTF()
*
* @mfunc
* If LowFi RTF notifications are enabled, send notification for the
* keyword with index _iKeyword to client and turn off the notifications
* for the rest of this read.
*/
void CRTFRead::CheckNotifyLowFiRTF(
BOOL fEnable)
{
if(_fNotifyLowFiRTF && (_fBody || fEnable))
{
char *pach = _iKeyword >= 0
? (char *)rgKeyword[_iKeyword].szKeyword
: (char *)rgszUnrecognizedRTF[-_iKeyword - 1];
_ped->HandleLowFiRTF(pach);
_fNotifyLowFiRTF = FALSE;
}
}
/*
* CRTFRead::TokenGetKeyword()
*
* @mfunc
* Collect a keyword and its parameter. Return token's keyword
*
* @rdesc
* TOKEN token number of keyword
*
* @comm
* Most RTF control words (keywords) consist of a span of lower-case
* ASCII letters possibly followed by a span of decimal digits. Other
* control words consist of a single character that isn't LC ASCII. No
* control words contain upper-case characters.
*/
TOKEN CRTFRead::TokenGetKeyword()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetKeyword");
BYTE ach = GetChar();
BYTE *pach;
BYTE szKeyword[cachKeywordMax];
BYTE *pachEnd = szKeyword + cachKeywordMax - 1;
if(!IsAlphaChar(ach)) // Not alpha, i.e.,
{ // single char
if (ach == '\'') // Most common case needs
{ // special treatment
// Convert hex to char and store result in _token
if(TokenGetHex() == tokenError)
{
_ecParseError = ecUnexpectedChar;
goto TokenError;
}
if((_token == CR || _token == LF) && FInDocTextDest())
{
// Add raw CR or LF in the byte stream as a \par
return tokenEndParagraph;
}
}
else
{
// Check for other known symbols
const BYTE *pachSym = szSymbolKeywords;
while(ach != *pachSym && *pachSym)
pachSym++;
if(*pachSym) // Found one
{
_token = tokenSymbol[pachSym - szSymbolKeywords];
if(_token > 0x7F) // Token or larger Unicode
return _token; // value
}
else if (!ach) // No more input chars
goto TokenError;
else // Code for unrecognized RTF
_token = ach; // We'll just insert it for now
}
_token = TokenGetText((BYTE)_token);
return _token;
}
szKeyword[0] = ach; // Collect keyword that starts
pach = szKeyword + 1; // with Alpha
while (IsAlphaChar(ach = GetChar()))
{
if (pach < pachEnd)
*pach++ = ach;
}
*pach = '\0'; // Terminate keyword
GetParam(ach); // Get keyword N in _iParam
if (!_ecParseError) // Find and return keyword
return TokenFindKeyword(szKeyword, rgKeyword, cKeywords);
TokenError:
TRACEERRSZSC("TokenGetKeyword()", _ecParseError);
return _token = tokenError;
}
/*
* CRTFRead::GetParam(ach)
*
* @mfunc
* Get any numeric parameter following a keyword, storing the result
* in _iParam and setting _fParam = TRUE iff a number is found.
*/
void CRTFRead::GetParam(
char ach) // @parm First char of 8-bit text string
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetText");
_fParam = FALSE; // Clear parameter
_iParam = 0;
if(IsDigit(ach) || ach == '-') // Collect parameter
{
BOOL fNegativeParam = TRUE;
_fParam = TRUE;
if(ach != '-')
{
_iParam = ach - '0'; // Get parameter value
fNegativeParam = FALSE;
}
while (IsDigit(ach = GetChar()))
_iParam = _iParam*10 + ach - '0';
if (fNegativeParam)
_iParam = -_iParam;
}
if(ach != ' ')
UngetChar(); // If not ' ', unget char
}
/*
* CRTFRead::TokenGetText(ach)
*
* @mfunc
* Collect a string of text starting with the char <p ach> and treat as a
* single token. The string ends when a LBRACE, RBRACE, or single '\\' is found.
*
* @devnote
* We peek past the '\\' for \\'xx, which we decode and keep on going;
* else we return in a state where the next character is the '\\'.
*
* @rdesc
* TOKEN Token number of next token (tokenText or tokenError)
*/
TOKEN CRTFRead::TokenGetText(
BYTE ach) // @parm First char of 8-bit text string
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetText");
BYTE * pach = _szText;
SHORT cachText = 0;
LONG CodePage = _pstateStackTop->nCodePage;
BOOL fAllASCII = TRUE;
int cTrailBytesNeeded = 0;
_token = tokenError; // Default error
// FUTURE(BradO): This 'goto' into a while loop is pretty weak.
// Restructure this 'while' loop such that the 'goto' is removed.
// Add character passed into routine
goto add;
// If cTrailBytesNeeded is non-zero, we need to get all the trail bytes. Otherwise,
// a string end in the middle of a DBC or UTF-8 will cause bad display/print problem
// - 5 to allow extra space for up to 4 bytes for UTF-8 and Null char
while (cachText < cachTextMax - 5 || cTrailBytesNeeded)
{
ach = GetChar();
switch (ach)
{
case BSLASH:
{
// FUTURE(BradO): This code looks ALOT like TokenGetKeyword.
// We should combine the two into a common routine.
BYTE achNext;
// Get char after BSLASH
achNext = GetChar();
if(!achNext)
goto error;
if(achNext == '\'') // Handle most frequent
{ // case here
if(TokenGetHex() == tokenError)
{
if(cTrailBytesNeeded)
{
// The trail-byte must be a raw BSLASH.
// Unget the single-quote.
if(!UngetChar())
goto error;
// fall through to add BSLASH
}
else
{
_ecParseError = ecUnexpectedChar;
goto error;
}
}
else
{
ach = (BYTE)_token;
if (cTrailBytesNeeded == 0 && (ach == CR || ach == LF) &&
FInDocTextDest())
{
// Here, we have a raw CR or LF in document text.
// Unget the whole lot of characters and bail out.
// TokenGetKeyword will convert this CR or LF into
// a \par.
if(!UngetChar(4))
goto error;
goto done;
}
}
goto add;
}
// Check next byte against list of RTF symbol
// NOTE:- we need to check for RTF symbol even if we
// are expecting a trail byte. According to the rtf spec,
// we cannot just take this backslash as trail byte.
// HWC 9/97
const BYTE *pachSymbol = szSymbolKeywords;
while(achNext != *pachSymbol && *pachSymbol)
pachSymbol++;
TOKEN tokenTmp;
if (*pachSymbol &&
(tokenTmp = tokenSymbol[pachSymbol - szSymbolKeywords])
<= 0x7F)
{
ach = (BYTE)tokenTmp;
goto add;
}
// In either of the last two cases below, we will want
// to unget the byte following the BSLASH
if(!UngetChar())
goto error;
if(cTrailBytesNeeded && !IsAlphaChar(achNext))
{
// In this situation, either this BSLASH begins the next
// RTF keyword or it is a raw BSLASH which is the trail
// byte for a DBCS character.
// I think a fair assumption here is that if an alphanum
// follows the BSLASH, that the BSLASH begins the next
// RTF keyword.
// add the raw BSLASH
goto add;
}
// Here, my guess is that the BSLASH begins the next RTF
// keyword, so unget the BSLASH
if(!UngetChar())
goto error;
goto done;
}
case LBRACE: // End of text string
case RBRACE:
if(cTrailBytesNeeded)
{
// Previous char was a lead-byte of a DBCS pair or UTF-8, which
// makes this char a raw trail-byte.
goto add;
}
if(!UngetChar()) // Unget delimeter
goto error;
goto done;
case LF: // Throw away noise chars
case CR:
break;
case 0:
if(_ecParseError == ecUnexpectedEOF)
goto done;
ach = ' '; // Replace NULL by blank
default: // Collect chars
add:
*pach++ = ach;
++cachText;
if(ach > 0x7F)
fAllASCII = FALSE;
// Check if we are expecting more trail bytes
if (cTrailBytesNeeded)
cTrailBytesNeeded--;
else
cTrailBytesNeeded = GetTrailBytesCount(ach, CodePage);
Assert(cTrailBytesNeeded >= 0);
}
}
done:
_token = (WORD)(fAllASCII ? tokenASCIIText : tokenText);
*pach = '\0'; // Terminate token string
error:
return _token;
}
/*
* CRTFRead::TokenGetToken()
*
* @mfunc
* This function reads in next token from input stream
*
* @rdesc
* TOKEN token number of next token
*/
TOKEN CRTFRead::TokenGetToken()
{
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetToken");
BYTE ach;
_tokenLast = _token; // Used by \* destinations and FE
_token = tokenEOF; // Default end-of-file
SkipNoise:
ach = GetChar();
switch (ach)
{
case CR:
case LF:
goto SkipNoise;
case LBRACE:
_token = tokenStartGroup;
break;
case RBRACE:
_token = tokenEndGroup;
break;
case BSLASH:
_token = TokenGetKeyword();
break;
case 0:
if(_ecParseError == ecUnexpectedEOF)
break;
ach = ' '; // Replace NULL by blank
// Fall thru to default
default:
if( !_pstateStackTop )
{
TRACEWARNSZ("Unexpected token in rtf file");
Assert(_token == tokenEOF);
if (_ped->Get10Mode())
_ecParseError = ecUnexpectedToken; // Signal bad file
}
else if (_pstateStackTop->sDest == destObjectData ||
_pstateStackTop->sDest == destPicture )
// not text but data
{
_token = (WORD)(tokenObjectDataValue + _pstateStackTop->sDest
- destObjectData);
UngetChar();
}
else
_token = TokenGetText(ach);
}
return _token;
}
#define FINDOCTEXTDEST ((1 << destRTF) | \
(1 << destField) | \
(1 << destFieldResult) | (1 << destFieldInstruction) | \
(1 << destParaNumText) | (1 << destParaNumbering) | \
(1 << destNULL))
/*
* CRTFRead::FInDocTextDest()
*
* @mfunc
* Returns a BOOL indicating if the current destination is one in which
* we would encounter document text.
*
* @rdesc
* BOOL indicates the current destination may contain document text.
*/
BOOL CRTFRead::FInDocTextDest() const
{
AssertSz(_pstateStackTop->sDest < destMAX,
"CRTFRead::FInDocTextDest(): New destination encountered - update enum in _rtfread.h");
return (FINDOCTEXTDEST & (1 << _pstateStackTop->sDest)) != 0;
}