1034 lines
24 KiB
C++
1034 lines
24 KiB
C++
|
/*
|
||
|
* @doc INTERNAL
|
||
|
*
|
||
|
* @module RTFLEX.CPP - RichEdit RTF reader lexical analyzer |
|
||
|
*
|
||
|
* This file contains the implementation of the lexical analyzer part of
|
||
|
* the RTF reader.
|
||
|
*
|
||
|
* Authors: <nl>
|
||
|
* Original RichEdit 1.0 RTF converter: Anthony Francisco <nl>
|
||
|
* Conversion to C++ and RichEdit 2.0: Murray Sargent <nl>
|
||
|
*
|
||
|
* @devnote
|
||
|
* All sz's in the RTF*.? files refer to a LPSTRs, not LPWSTRs, unless
|
||
|
* noted as a szUnicode.
|
||
|
*
|
||
|
* Copyright (c) 1995-2000, Microsoft Corporation. All rights reserved.
|
||
|
*/
|
||
|
|
||
|
#include "_common.h"
|
||
|
#include "_rtfread.h"
|
||
|
#include "hash.h"
|
||
|
#include "tokens.cpp"
|
||
|
|
||
|
ASSERTDATA
|
||
|
|
||
|
// Array used by character classification macros to speed classification
|
||
|
// of chars residing in two or more discontiguous ranges, e.g., alphanumeric
|
||
|
// or hex. The alphabetics used in RTF control words are lower-case ASCII.
|
||
|
// *** DO NOT DBCS rgbCharClass[] ***
|
||
|
|
||
|
#define fCS fCT + fSP
|
||
|
#define fSB fBL + fSP
|
||
|
#define fHD fHX + fDG
|
||
|
#define fHU fHX + fUC
|
||
|
#define fHL fHX + fLC
|
||
|
|
||
|
const BYTE rgbCharClass[256] =
|
||
|
{
|
||
|
fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCS,fCS,fCS,fCS,fCS,fCT,fCT,
|
||
|
fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT, fCT,fCT,fCT,fCT,fCT,fCT,fCT,fCT,
|
||
|
fSB,fPN,fPN,fPN,fPN,fPN,fPN,fPN, fPN,fPN,fPN,fPN,fPN,fPN,fPN,fPN,
|
||
|
fHD,fHD,fHD,fHD,fHD,fHD,fHD,fHD, fHD,fHD,fPN,fPN,fPN,fPN,fPN,fPN,
|
||
|
|
||
|
fPN,fHU,fHU,fHU,fHU,fHU,fHU,fUC, fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC,
|
||
|
fUC,fUC,fUC,fUC,fUC,fUC,fUC,fUC, fUC,fUC,fUC,fPN,fPN,fPN,fPN,fPN,
|
||
|
fPN,fHL,fHL,fHL,fHL,fHL,fHL,fLC, fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC,
|
||
|
fLC,fLC,fLC,fLC,fLC,fLC,fLC,fLC, fLC,fLC,fLC,fPN,fPN,fPN,fPN,fPN,
|
||
|
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||
|
};
|
||
|
|
||
|
// Specifies the number of bytes we can safely "UngetChar"
|
||
|
// before possibly underflowing the buffer.
|
||
|
const int cbBackupMax = 4;
|
||
|
|
||
|
// Bug2298 - I found an RTF writer which emits uppercase RTF keywords,
|
||
|
// so I had to change IsLCAscii to IsAlphaChar for use in scanning
|
||
|
// for RTF keywords.
|
||
|
inline BOOL IsAlphaChar(BYTE b)
|
||
|
{
|
||
|
return IN_RANGE('a', b | 0x20, 'z');
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* IsRTF(pstr, cb)
|
||
|
*
|
||
|
* @func
|
||
|
* Return FALSE if cb < 7 or pstr is NULL or if pstr doesn't start
|
||
|
* with "{\rtf"N or "{\urtf"N, where N is an ASCII number. cb gives
|
||
|
* the minimum length of pstr unless pstr is NULL-terminated, in which
|
||
|
* case the null terminator marks the end of the string.
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TRUE if pstr points at a valid start of RTF data
|
||
|
*/
|
||
|
BOOL IsRTF(
|
||
|
char *pstr, //@parm String to check
|
||
|
LONG cb) //@parm Min byte count if string isn't null terminated
|
||
|
{
|
||
|
if(!pstr || cb < 7 || *pstr++ != '{' || *pstr++ != '\\')
|
||
|
return FALSE; // Quick out for most common cases
|
||
|
|
||
|
if(*pstr == 'u') // Bypass u of possible urtf
|
||
|
pstr++;
|
||
|
|
||
|
return !CompareMemory("rtf", pstr, 3) && IsASCIIDigit((BYTE)pstr[3]);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::InitLex()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Initialize the lexical analyzer. Reset the variables. if reading in
|
||
|
* from resource file, sort the keyword list (). Uses global hinstRE
|
||
|
* from the RichEdit to find out where its resources are. Note: in
|
||
|
* RichEdit 2.0, currently the resource option is not supported.
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TRUE If lexical analyzer was initialized
|
||
|
*/
|
||
|
BOOL CRTFRead::InitLex()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::InitLex");
|
||
|
|
||
|
AssertSz(cKeywords == i_TokenIndexMax,
|
||
|
"Keyword index enumeration is incompatible with rgKeyword[]");
|
||
|
Assert(!_szText && !_pchRTFBuffer);
|
||
|
|
||
|
// Allocate our buffers with an extra byte for szText so that hex
|
||
|
// conversion doesn't have to worry about running off the end if the
|
||
|
// first char is NULL
|
||
|
if ((_szText = (BYTE *)PvAlloc(cachTextMax + 1, GMEM_ZEROINIT)) &&
|
||
|
(_pchRTFBuffer = (BYTE *)PvAlloc(cachBufferMost, GMEM_ZEROINIT)))
|
||
|
{
|
||
|
return TRUE; // Signal that lexer is initialized
|
||
|
}
|
||
|
|
||
|
_ped->GetCallMgr()->SetOutOfMemory();
|
||
|
_ecParseError = ecLexInitFailed;
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::DeinitLex()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Shut down lexical analyzer
|
||
|
*/
|
||
|
void CRTFRead::DeinitLex()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::DeinitLex");
|
||
|
|
||
|
#ifdef KEYWORD_RESOURCE
|
||
|
if (hglbKeywords)
|
||
|
{
|
||
|
FreeResource(hglbKeywords);
|
||
|
hglbKeywords = NULL;
|
||
|
rgKeyword = NULL;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
FreePv(_szText);
|
||
|
FreePv(_pchRTFBuffer);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::GetChar()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get next char, filling buffer as needed
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BYTE nonzero char value if success; else 0
|
||
|
*/
|
||
|
BYTE CRTFRead::GetChar()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetChar");
|
||
|
|
||
|
if (_pchRTFCurrent == _pchRTFEnd && !FillBuffer())
|
||
|
{
|
||
|
_ecParseError = ecUnexpectedEOF;
|
||
|
return 0;
|
||
|
}
|
||
|
return *_pchRTFCurrent++;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::GetCharEx()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get next char including escaped chars of form \'xx
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BYTE nonzero char value if success; else 0
|
||
|
*/
|
||
|
BYTE CRTFRead::GetCharEx()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetCharEx");
|
||
|
|
||
|
BYTE ach;
|
||
|
|
||
|
do
|
||
|
ach = GetChar();
|
||
|
while (ach == CR || ach == LF); // Ignore CRLFs
|
||
|
|
||
|
if(ach == BSLASH)
|
||
|
{
|
||
|
if(GetChar() == '\'')
|
||
|
{
|
||
|
// Convert hex to char and store result in _token
|
||
|
if(TokenGetHex() != tokenError)
|
||
|
return (BYTE)_token;
|
||
|
_ecParseError = ecUnexpectedChar;
|
||
|
}
|
||
|
UngetChar();
|
||
|
}
|
||
|
return ach;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::FillBuffer()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Fill RTF buffer & return != 0 if successful
|
||
|
*
|
||
|
* @rdesc
|
||
|
* LONG # chars read
|
||
|
*
|
||
|
* @comm
|
||
|
* This routine doesn't bother copying anything down if
|
||
|
* pchRTFCurrent <lt> pchRTFEnd so anything not read yet is lost.
|
||
|
* The only exception to this is that it always copies down the
|
||
|
* last two bytes read so that UngetChar() will work. ReadData()
|
||
|
* actually counts on this behavior, so if you change it, change
|
||
|
* ReadData() accordingly.
|
||
|
*/
|
||
|
LONG CRTFRead::FillBuffer()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::FillBuffer");
|
||
|
|
||
|
LONG cchRead;
|
||
|
|
||
|
if (!_pchRTFCurrent)
|
||
|
{
|
||
|
// No data yet, nothing for backup
|
||
|
// Leave cbBackupMax NULL chars so backup
|
||
|
// area of buffer doesn't contain garbage.
|
||
|
|
||
|
for(int i = 0; i < cbBackupMax; i++)
|
||
|
{
|
||
|
_pchRTFBuffer[i] = 0;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
Assert(_pchRTFCurrent == _pchRTFEnd);
|
||
|
|
||
|
// Copy most recently read chars in case
|
||
|
// we need to back up
|
||
|
|
||
|
int cbBackup = min((UINT) cbBackupMax, DiffPtrs(_pchRTFCurrent, &_pchRTFBuffer[cbBackupMax]));
|
||
|
int i;
|
||
|
|
||
|
for(i = -1; i >= -cbBackup; i--)
|
||
|
_pchRTFBuffer[cbBackupMax + i] = _pchRTFCurrent[i];
|
||
|
|
||
|
if(cbBackup < cbBackupMax)
|
||
|
{
|
||
|
// NULL before the first valid character in the backup buffer
|
||
|
_pchRTFBuffer[cbBackupMax + i] = 0;
|
||
|
}
|
||
|
}
|
||
|
_pchRTFCurrent = &_pchRTFBuffer[cbBackupMax];
|
||
|
|
||
|
// Fill buffer with as much as we can take given our starting offset
|
||
|
_pes->dwError = _pes->pfnCallback(_pes->dwCookie,
|
||
|
_pchRTFCurrent,
|
||
|
cachBufferMost - cbBackupMax,
|
||
|
&cchRead);
|
||
|
if (_pes->dwError)
|
||
|
{
|
||
|
TRACEERRSZSC("RTFLEX: GetChar()", _pes->dwError);
|
||
|
_ecParseError = ecGeneralFailure;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
_pchRTFEnd = &_pchRTFBuffer[cbBackupMax + cchRead]; // Point the end
|
||
|
|
||
|
#if defined(DEBUG)
|
||
|
if(_hfileCapture)
|
||
|
{
|
||
|
DWORD cbLeftToWrite = cchRead;
|
||
|
DWORD cbWritten = 0;
|
||
|
BYTE *pbToWrite = (BYTE *)_pchRTFCurrent;
|
||
|
|
||
|
while(WriteFile(_hfileCapture,
|
||
|
pbToWrite,
|
||
|
cbLeftToWrite,
|
||
|
&cbWritten,
|
||
|
NULL) &&
|
||
|
(pbToWrite += cbWritten,
|
||
|
(cbLeftToWrite -= cbWritten)));
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
return cchRead;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::UngetChar()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Bump our file pointer back one char
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BOOL TRUE on success
|
||
|
*
|
||
|
* @comm
|
||
|
* You can safely UngetChar _at most_ cbBackupMax times without
|
||
|
* error.
|
||
|
*/
|
||
|
BOOL CRTFRead::UngetChar()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
|
||
|
|
||
|
if (_pchRTFCurrent == _pchRTFBuffer || !_pchRTFCurrent)
|
||
|
{
|
||
|
Assert(0);
|
||
|
_ecParseError = ecUnGetCharFailed;
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
--_pchRTFCurrent;
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::UngetChar(cch)
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Bump our file pointer back 'cch' chars
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BOOL TRUE on success
|
||
|
*
|
||
|
* @comm
|
||
|
* You can safely UngetChar _at most_ cbBackupMax times without
|
||
|
* error.
|
||
|
*/
|
||
|
BOOL CRTFRead::UngetChar(
|
||
|
UINT cch) //@parm cch to put back in buffer
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::UngetChar");
|
||
|
|
||
|
AssertSz(cch <= cbBackupMax, "CRTFRead::UngetChar(): Number of UngetChar's "
|
||
|
"exceeds size of backup buffer.");
|
||
|
while(cch-- > 0)
|
||
|
{
|
||
|
if(!UngetChar())
|
||
|
return FALSE;
|
||
|
}
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::GetHex()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get next char if hex and return hex value
|
||
|
* If not hex, leave char in buffer and return 255
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BYTE hex value of GetChar() if hex; else 255
|
||
|
*/
|
||
|
BYTE CRTFRead::GetHex()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHex");
|
||
|
|
||
|
BYTE ch = GetChar();
|
||
|
|
||
|
if(IsXDigit(ch))
|
||
|
return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
|
||
|
if(ch)
|
||
|
UngetChar();
|
||
|
return 255;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::GetHexSkipCRLF()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get next char if hex and return hex value
|
||
|
* If not hex, leave char in buffer and return 255
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BYTE hex value of GetChar() if hex; else 255
|
||
|
*
|
||
|
* @devnote
|
||
|
* Keep this in sync with GetHex above.
|
||
|
*/
|
||
|
BYTE CRTFRead::GetHexSkipCRLF()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetHexSkipCRLF");
|
||
|
|
||
|
BYTE ch = GetChar();
|
||
|
|
||
|
// Skip \r \n
|
||
|
while(ch == CR || ch == LF)
|
||
|
ch = GetChar();
|
||
|
|
||
|
// Rest is same as CRTFRead::GetHex()
|
||
|
if(IsXDigit(ch))
|
||
|
return (BYTE)(ch <= '9' ? ch - '0' : (ch & 0x4f) - 'A' + 10);
|
||
|
if(ch)
|
||
|
UngetChar();
|
||
|
return 255;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::TokenGetHex()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get an 8 bit character saved as a 2 hex digit value
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TOKEN value of hex number read in
|
||
|
*/
|
||
|
TOKEN CRTFRead::TokenGetHex()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetHex");
|
||
|
|
||
|
_token = tokenError; // Default error
|
||
|
|
||
|
BYTE bChar0 = GetHex(); // Get hexadigit
|
||
|
if(bChar0 < 16) // It's valid
|
||
|
{
|
||
|
BYTE bChar1 = GetHex(); // Get next hexadigit
|
||
|
if(bChar1 < 16) // It's valid too
|
||
|
_token = (WORD)(bChar0 << 4 | bChar1);
|
||
|
else
|
||
|
UngetChar(); // Invalid: put back 1st hexadigit
|
||
|
}
|
||
|
return _token;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::SkipToEndOfGroup()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Skip to end of current group
|
||
|
*
|
||
|
* @rdesc
|
||
|
* EC An error code
|
||
|
*/
|
||
|
EC CRTFRead::SkipToEndOfGroup()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::SkipToEndOfGroup");
|
||
|
|
||
|
INT nDepth = 1;
|
||
|
BYTE ach;
|
||
|
|
||
|
while(TRUE)
|
||
|
{
|
||
|
ach = GetChar();
|
||
|
switch(ach)
|
||
|
{
|
||
|
case BSLASH:
|
||
|
{
|
||
|
BYTE achNext = GetChar();
|
||
|
|
||
|
// EOF: goto done; else ignore NULLs
|
||
|
if(!achNext && _ecParseError == ecUnexpectedEOF)
|
||
|
goto done;
|
||
|
|
||
|
if(achNext == 'b' && UngetChar() &&
|
||
|
TokenGetKeyword() == tokenBinaryData)
|
||
|
{
|
||
|
// We've encountered the \binN tag in the RTF we want
|
||
|
// to skip. _iParam contains N from \binN once the
|
||
|
// tag is parsed by TokenGetKeyword()
|
||
|
SkipBinaryData(_iParam);
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case LBRACE:
|
||
|
nDepth++;
|
||
|
break;
|
||
|
|
||
|
case RBRACE:
|
||
|
if (--nDepth <= 0)
|
||
|
goto done;
|
||
|
break;
|
||
|
|
||
|
case 0:
|
||
|
if(_ecParseError == ecUnexpectedEOF)
|
||
|
goto done;
|
||
|
|
||
|
default:
|
||
|
// Detect Lead bytes here.
|
||
|
int cTrailBytes = GetTrailBytesCount(ach, _nCodePage);
|
||
|
if (cTrailBytes)
|
||
|
{
|
||
|
for (int i = 0; i < cTrailBytes; i++)
|
||
|
{
|
||
|
ach = GetChar();
|
||
|
if(ach == 0 && _ecParseError == ecUnexpectedEOF)
|
||
|
goto done;
|
||
|
}
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
Assert(!_ecParseError);
|
||
|
_ecParseError = ecUnexpectedEOF;
|
||
|
|
||
|
done:
|
||
|
return _ecParseError;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::TokenFindKeyword(szKeyword, prgKeyword, cKeyword)
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Find keyword <p szKeyword> and return its token value
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TOKEN token number of keyword
|
||
|
*/
|
||
|
TOKEN CRTFRead::TokenFindKeyword(
|
||
|
BYTE * szKeyword, //@parm Keyword to find
|
||
|
const KEYWORD *prgKeyword, //@parm Keyword array to use
|
||
|
LONG cKeyword) //@parm Count of keywords
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenFindKeyword");
|
||
|
|
||
|
INT iMax;
|
||
|
INT iMid;
|
||
|
INT iMin;
|
||
|
INT nComp;
|
||
|
BYTE * pchCandidate;
|
||
|
BYTE * pchKeyword;
|
||
|
const KEYWORD * pk;
|
||
|
|
||
|
AssertSz(szKeyword[0],
|
||
|
"CRTFRead::TokenFindKeyword: null keyword");
|
||
|
|
||
|
_iKeyword = 0;
|
||
|
#ifdef RTF_HASHCACHE
|
||
|
if ( _rtfHashInited )
|
||
|
{
|
||
|
// Hash is 23% faster than the following binary search on finds
|
||
|
// and 55% faster on misses: For 97 words stored in a 257 cache.
|
||
|
// Performance numbers will change when the total stored goes up.
|
||
|
pk = HashKeyword_Fetch ( (CHAR *) szKeyword );
|
||
|
}
|
||
|
else
|
||
|
#endif
|
||
|
{
|
||
|
iMin = 0;
|
||
|
iMax = cKeyword - 1;
|
||
|
pk = NULL;
|
||
|
do
|
||
|
{
|
||
|
iMid = (iMin + iMax) / 2;
|
||
|
pchCandidate = (BYTE *)prgKeyword[iMid].szKeyword;
|
||
|
pchKeyword = szKeyword;
|
||
|
while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20)) // Be sure to match
|
||
|
&& *pchKeyword) // terminating 0's
|
||
|
{
|
||
|
pchKeyword++;
|
||
|
pchCandidate++;
|
||
|
}
|
||
|
if (nComp < 0)
|
||
|
iMax = iMid - 1;
|
||
|
else if (nComp)
|
||
|
iMin = iMid + 1;
|
||
|
else
|
||
|
{
|
||
|
pk = &prgKeyword[iMid];
|
||
|
_iKeyword = iMid; // Save keyword index
|
||
|
break;
|
||
|
}
|
||
|
} while (iMin <= iMax);
|
||
|
}
|
||
|
|
||
|
if(pk)
|
||
|
{
|
||
|
_token = pk->token;
|
||
|
|
||
|
// Log the RTF keyword scan to aid in tracking RTF tag coverage
|
||
|
// TODO: Implement RTF tag logging for the Mac and WinCE
|
||
|
#if defined(DEBUG) && !defined(NOFULLDEBUG)
|
||
|
if(_prtflg)
|
||
|
{
|
||
|
#ifdef RTF_HASCACHE
|
||
|
_prtflg->AddAt(szKeyword);
|
||
|
#else
|
||
|
_prtflg->AddAt((size_t)iMid);
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
else
|
||
|
{ // No match: place to take
|
||
|
_token = tokenUnknownKeyword; // care of unrecognized RTF
|
||
|
if(_fNotifyLowFiRTF)
|
||
|
{
|
||
|
iMin = 0; // Use binary search as above
|
||
|
iMax = crgszUnrecognizedRTF - 1;
|
||
|
do
|
||
|
{
|
||
|
iMid = (iMin + iMax) / 2;
|
||
|
pchCandidate = (BYTE *)rgszUnrecognizedRTF[iMid];
|
||
|
pchKeyword = szKeyword;
|
||
|
while (!(nComp = (*pchKeyword | 0x20) - (*pchCandidate | 0x20))
|
||
|
&& *pchKeyword)
|
||
|
{
|
||
|
pchKeyword++;
|
||
|
pchCandidate++;
|
||
|
}
|
||
|
if (nComp < 0)
|
||
|
iMax = iMid - 1;
|
||
|
else if (nComp && *pchCandidate)
|
||
|
iMin = iMid + 1;
|
||
|
else // Found keyword
|
||
|
{
|
||
|
_iKeyword = -iMid - 1;
|
||
|
CheckNotifyLowFiRTF();
|
||
|
break;
|
||
|
}
|
||
|
} while (iMin <= iMax);
|
||
|
}
|
||
|
}
|
||
|
return _token;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::CheckNotifyLowFiRTF()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* If LowFi RTF notifications are enabled, send notification for the
|
||
|
* keyword with index _iKeyword to client and turn off the notifications
|
||
|
* for the rest of this read.
|
||
|
*/
|
||
|
void CRTFRead::CheckNotifyLowFiRTF(
|
||
|
BOOL fEnable)
|
||
|
{
|
||
|
if(_fNotifyLowFiRTF && (_fBody || fEnable))
|
||
|
{
|
||
|
char *pach = _iKeyword >= 0
|
||
|
? (char *)rgKeyword[_iKeyword].szKeyword
|
||
|
: (char *)rgszUnrecognizedRTF[-_iKeyword - 1];
|
||
|
_ped->HandleLowFiRTF(pach);
|
||
|
_fNotifyLowFiRTF = FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::TokenGetKeyword()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Collect a keyword and its parameter. Return token's keyword
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TOKEN token number of keyword
|
||
|
*
|
||
|
* @comm
|
||
|
* Most RTF control words (keywords) consist of a span of lower-case
|
||
|
* ASCII letters possibly followed by a span of decimal digits. Other
|
||
|
* control words consist of a single character that isn't LC ASCII. No
|
||
|
* control words contain upper-case characters.
|
||
|
*/
|
||
|
TOKEN CRTFRead::TokenGetKeyword()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetKeyword");
|
||
|
|
||
|
BYTE ach = GetChar();
|
||
|
BYTE *pach;
|
||
|
BYTE szKeyword[cachKeywordMax];
|
||
|
BYTE *pachEnd = szKeyword + cachKeywordMax - 1;
|
||
|
|
||
|
if(!IsAlphaChar(ach)) // Not alpha, i.e.,
|
||
|
{ // single char
|
||
|
if (ach == '\'') // Most common case needs
|
||
|
{ // special treatment
|
||
|
// Convert hex to char and store result in _token
|
||
|
if(TokenGetHex() == tokenError)
|
||
|
{
|
||
|
_ecParseError = ecUnexpectedChar;
|
||
|
goto TokenError;
|
||
|
}
|
||
|
if((_token == CR || _token == LF) && FInDocTextDest())
|
||
|
{
|
||
|
// Add raw CR or LF in the byte stream as a \par
|
||
|
return tokenEndParagraph;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// Check for other known symbols
|
||
|
const BYTE *pachSym = szSymbolKeywords;
|
||
|
|
||
|
while(ach != *pachSym && *pachSym)
|
||
|
pachSym++;
|
||
|
if(*pachSym) // Found one
|
||
|
{
|
||
|
_token = tokenSymbol[pachSym - szSymbolKeywords];
|
||
|
if(_token > 0x7F) // Token or larger Unicode
|
||
|
return _token; // value
|
||
|
}
|
||
|
else if (!ach) // No more input chars
|
||
|
goto TokenError;
|
||
|
else // Code for unrecognized RTF
|
||
|
_token = ach; // We'll just insert it for now
|
||
|
}
|
||
|
_token = TokenGetText((BYTE)_token);
|
||
|
return _token;
|
||
|
}
|
||
|
|
||
|
szKeyword[0] = ach; // Collect keyword that starts
|
||
|
pach = szKeyword + 1; // with Alpha
|
||
|
while (IsAlphaChar(ach = GetChar()))
|
||
|
{
|
||
|
if (pach < pachEnd)
|
||
|
*pach++ = ach;
|
||
|
}
|
||
|
*pach = '\0'; // Terminate keyword
|
||
|
GetParam(ach); // Get keyword N in _iParam
|
||
|
if (!_ecParseError) // Find and return keyword
|
||
|
return TokenFindKeyword(szKeyword, rgKeyword, cKeywords);
|
||
|
|
||
|
TokenError:
|
||
|
TRACEERRSZSC("TokenGetKeyword()", _ecParseError);
|
||
|
return _token = tokenError;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::GetParam(ach)
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Get any numeric parameter following a keyword, storing the result
|
||
|
* in _iParam and setting _fParam = TRUE iff a number is found.
|
||
|
*/
|
||
|
void CRTFRead::GetParam(
|
||
|
char ach) // @parm First char of 8-bit text string
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::GetText");
|
||
|
_fParam = FALSE; // Clear parameter
|
||
|
_iParam = 0;
|
||
|
|
||
|
if(IsDigit(ach) || ach == '-') // Collect parameter
|
||
|
{
|
||
|
BOOL fNegativeParam = TRUE;
|
||
|
|
||
|
_fParam = TRUE;
|
||
|
if(ach != '-')
|
||
|
{
|
||
|
_iParam = ach - '0'; // Get parameter value
|
||
|
fNegativeParam = FALSE;
|
||
|
}
|
||
|
|
||
|
while (IsDigit(ach = GetChar()))
|
||
|
_iParam = _iParam*10 + ach - '0';
|
||
|
|
||
|
if (fNegativeParam)
|
||
|
_iParam = -_iParam;
|
||
|
}
|
||
|
if(ach != ' ')
|
||
|
UngetChar(); // If not ' ', unget char
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::TokenGetText(ach)
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Collect a string of text starting with the char <p ach> and treat as a
|
||
|
* single token. The string ends when a LBRACE, RBRACE, or single '\\' is found.
|
||
|
*
|
||
|
* @devnote
|
||
|
* We peek past the '\\' for \\'xx, which we decode and keep on going;
|
||
|
* else we return in a state where the next character is the '\\'.
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TOKEN Token number of next token (tokenText or tokenError)
|
||
|
*/
|
||
|
TOKEN CRTFRead::TokenGetText(
|
||
|
BYTE ach) // @parm First char of 8-bit text string
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetText");
|
||
|
|
||
|
BYTE * pach = _szText;
|
||
|
SHORT cachText = 0;
|
||
|
LONG CodePage = _pstateStackTop->nCodePage;
|
||
|
BOOL fAllASCII = TRUE;
|
||
|
int cTrailBytesNeeded = 0;
|
||
|
|
||
|
_token = tokenError; // Default error
|
||
|
|
||
|
// FUTURE(BradO): This 'goto' into a while loop is pretty weak.
|
||
|
// Restructure this 'while' loop such that the 'goto' is removed.
|
||
|
|
||
|
// Add character passed into routine
|
||
|
goto add;
|
||
|
|
||
|
// If cTrailBytesNeeded is non-zero, we need to get all the trail bytes. Otherwise,
|
||
|
// a string end in the middle of a DBC or UTF-8 will cause bad display/print problem
|
||
|
// - 5 to allow extra space for up to 4 bytes for UTF-8 and Null char
|
||
|
while (cachText < cachTextMax - 5 || cTrailBytesNeeded)
|
||
|
{
|
||
|
ach = GetChar();
|
||
|
switch (ach)
|
||
|
{
|
||
|
case BSLASH:
|
||
|
{
|
||
|
// FUTURE(BradO): This code looks ALOT like TokenGetKeyword.
|
||
|
// We should combine the two into a common routine.
|
||
|
|
||
|
BYTE achNext;
|
||
|
|
||
|
// Get char after BSLASH
|
||
|
achNext = GetChar();
|
||
|
if(!achNext)
|
||
|
goto error;
|
||
|
|
||
|
if(achNext == '\'') // Handle most frequent
|
||
|
{ // case here
|
||
|
if(TokenGetHex() == tokenError)
|
||
|
{
|
||
|
if(cTrailBytesNeeded)
|
||
|
{
|
||
|
// The trail-byte must be a raw BSLASH.
|
||
|
// Unget the single-quote.
|
||
|
|
||
|
if(!UngetChar())
|
||
|
goto error;
|
||
|
// fall through to add BSLASH
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
_ecParseError = ecUnexpectedChar;
|
||
|
goto error;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
ach = (BYTE)_token;
|
||
|
if (cTrailBytesNeeded == 0 && (ach == CR || ach == LF) &&
|
||
|
FInDocTextDest())
|
||
|
{
|
||
|
// Here, we have a raw CR or LF in document text.
|
||
|
// Unget the whole lot of characters and bail out.
|
||
|
// TokenGetKeyword will convert this CR or LF into
|
||
|
// a \par.
|
||
|
|
||
|
if(!UngetChar(4))
|
||
|
goto error;
|
||
|
goto done;
|
||
|
}
|
||
|
}
|
||
|
goto add;
|
||
|
}
|
||
|
|
||
|
// Check next byte against list of RTF symbol
|
||
|
// NOTE:- we need to check for RTF symbol even if we
|
||
|
// are expecting a trail byte. According to the rtf spec,
|
||
|
// we cannot just take this backslash as trail byte.
|
||
|
// HWC 9/97
|
||
|
|
||
|
const BYTE *pachSymbol = szSymbolKeywords;
|
||
|
while(achNext != *pachSymbol && *pachSymbol)
|
||
|
pachSymbol++;
|
||
|
|
||
|
TOKEN tokenTmp;
|
||
|
|
||
|
if (*pachSymbol &&
|
||
|
(tokenTmp = tokenSymbol[pachSymbol - szSymbolKeywords])
|
||
|
<= 0x7F)
|
||
|
{
|
||
|
ach = (BYTE)tokenTmp;
|
||
|
goto add;
|
||
|
}
|
||
|
|
||
|
// In either of the last two cases below, we will want
|
||
|
// to unget the byte following the BSLASH
|
||
|
if(!UngetChar())
|
||
|
goto error;
|
||
|
|
||
|
if(cTrailBytesNeeded && !IsAlphaChar(achNext))
|
||
|
{
|
||
|
// In this situation, either this BSLASH begins the next
|
||
|
// RTF keyword or it is a raw BSLASH which is the trail
|
||
|
// byte for a DBCS character.
|
||
|
|
||
|
// I think a fair assumption here is that if an alphanum
|
||
|
// follows the BSLASH, that the BSLASH begins the next
|
||
|
// RTF keyword.
|
||
|
|
||
|
// add the raw BSLASH
|
||
|
goto add;
|
||
|
}
|
||
|
|
||
|
// Here, my guess is that the BSLASH begins the next RTF
|
||
|
// keyword, so unget the BSLASH
|
||
|
if(!UngetChar())
|
||
|
goto error;
|
||
|
|
||
|
goto done;
|
||
|
}
|
||
|
|
||
|
case LBRACE: // End of text string
|
||
|
case RBRACE:
|
||
|
if(cTrailBytesNeeded)
|
||
|
{
|
||
|
// Previous char was a lead-byte of a DBCS pair or UTF-8, which
|
||
|
// makes this char a raw trail-byte.
|
||
|
goto add;
|
||
|
}
|
||
|
|
||
|
if(!UngetChar()) // Unget delimeter
|
||
|
goto error;
|
||
|
goto done;
|
||
|
|
||
|
case LF: // Throw away noise chars
|
||
|
case CR:
|
||
|
break;
|
||
|
|
||
|
case 0:
|
||
|
if(_ecParseError == ecUnexpectedEOF)
|
||
|
goto done;
|
||
|
ach = ' '; // Replace NULL by blank
|
||
|
|
||
|
default: // Collect chars
|
||
|
add:
|
||
|
*pach++ = ach;
|
||
|
++cachText;
|
||
|
if(ach > 0x7F)
|
||
|
fAllASCII = FALSE;
|
||
|
|
||
|
// Check if we are expecting more trail bytes
|
||
|
if (cTrailBytesNeeded)
|
||
|
cTrailBytesNeeded--;
|
||
|
else
|
||
|
cTrailBytesNeeded = GetTrailBytesCount(ach, CodePage);
|
||
|
Assert(cTrailBytesNeeded >= 0);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
done:
|
||
|
_token = (WORD)(fAllASCII ? tokenASCIIText : tokenText);
|
||
|
*pach = '\0'; // Terminate token string
|
||
|
|
||
|
error:
|
||
|
return _token;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* CRTFRead::TokenGetToken()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* This function reads in next token from input stream
|
||
|
*
|
||
|
* @rdesc
|
||
|
* TOKEN token number of next token
|
||
|
*/
|
||
|
TOKEN CRTFRead::TokenGetToken()
|
||
|
{
|
||
|
TRACEBEGIN(TRCSUBSYSRTFR, TRCSCOPEINTERN, "CRTFRead::TokenGetToken");
|
||
|
|
||
|
BYTE ach;
|
||
|
|
||
|
_tokenLast = _token; // Used by \* destinations and FE
|
||
|
_token = tokenEOF; // Default end-of-file
|
||
|
|
||
|
SkipNoise:
|
||
|
ach = GetChar();
|
||
|
switch (ach)
|
||
|
{
|
||
|
case CR:
|
||
|
case LF:
|
||
|
goto SkipNoise;
|
||
|
|
||
|
case LBRACE:
|
||
|
_token = tokenStartGroup;
|
||
|
break;
|
||
|
|
||
|
case RBRACE:
|
||
|
_token = tokenEndGroup;
|
||
|
break;
|
||
|
|
||
|
case BSLASH:
|
||
|
_token = TokenGetKeyword();
|
||
|
break;
|
||
|
|
||
|
case 0:
|
||
|
if(_ecParseError == ecUnexpectedEOF)
|
||
|
break;
|
||
|
ach = ' '; // Replace NULL by blank
|
||
|
// Fall thru to default
|
||
|
default:
|
||
|
if( !_pstateStackTop )
|
||
|
{
|
||
|
TRACEWARNSZ("Unexpected token in rtf file");
|
||
|
Assert(_token == tokenEOF);
|
||
|
if (_ped->Get10Mode())
|
||
|
_ecParseError = ecUnexpectedToken; // Signal bad file
|
||
|
}
|
||
|
else if (_pstateStackTop->sDest == destObjectData ||
|
||
|
_pstateStackTop->sDest == destPicture )
|
||
|
// not text but data
|
||
|
{
|
||
|
_token = (WORD)(tokenObjectDataValue + _pstateStackTop->sDest
|
||
|
- destObjectData);
|
||
|
UngetChar();
|
||
|
}
|
||
|
else
|
||
|
_token = TokenGetText(ach);
|
||
|
}
|
||
|
return _token;
|
||
|
}
|
||
|
|
||
|
#define FINDOCTEXTDEST ((1 << destRTF) | \
|
||
|
(1 << destField) | \
|
||
|
(1 << destFieldResult) | (1 << destFieldInstruction) | \
|
||
|
(1 << destParaNumText) | (1 << destParaNumbering) | \
|
||
|
(1 << destNULL))
|
||
|
/*
|
||
|
* CRTFRead::FInDocTextDest()
|
||
|
*
|
||
|
* @mfunc
|
||
|
* Returns a BOOL indicating if the current destination is one in which
|
||
|
* we would encounter document text.
|
||
|
*
|
||
|
* @rdesc
|
||
|
* BOOL indicates the current destination may contain document text.
|
||
|
*/
|
||
|
BOOL CRTFRead::FInDocTextDest() const
|
||
|
{
|
||
|
AssertSz(_pstateStackTop->sDest < destMAX,
|
||
|
"CRTFRead::FInDocTextDest(): New destination encountered - update enum in _rtfread.h");
|
||
|
|
||
|
return (FINDOCTEXTDEST & (1 << _pstateStackTop->sDest)) != 0;
|
||
|
}
|