windows-nt/Source/XPSP1/NT/inetsrv/query/apps/srch/brdoc.cxx

754 lines
18 KiB
C++
Raw Permalink Normal View History

2020-09-26 03:20:57 -05:00
//+-------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1992 - 2000.
//
// File: brdoc.cxx
//
// Contents: The Document part of the browser
//
//--------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
// TheSearch is a textual alias for an identifier named pSearch; Document::Init
// uses it to refer to its ISearchQueryHits argument.
#define TheSearch pSearch
// U+2029 PARAGRAPH SEPARATOR. ReadFile writes it into the text buffer at
// paragraph/chapter chunk breaks and EatPara treats it as a paragraph end.
const int UNICODE_PARAGRAPH_SEPARATOR=0x2029;
// Property set GUID that ReadFile compares against each chunk's attribute
// when selecting the contents property (PID_STG_CONTENTS).
const GUID guidStorage = PSGUID_STORAGE;
//+-------------------------------------------------------------------------
//
//  Member:     Position::Compare, public
//
//  Synopsis:   Order two positions, first by paragraph and then by the
//              beginning offset within the paragraph.
//
//  Arguments:  [pos] -- position to compare this one against
//
//  Returns:    The paragraph difference (this - pos) when it is non-zero,
//              otherwise the difference of the beginning offsets.
//
//--------------------------------------------------------------------------
int Position::Compare( const Position& pos ) const
{
    const int paraDelta = _para - pos.Para();

    // Different paragraphs decide the ordering on their own.
    if ( 0 != paraDelta )
        return paraDelta;

    // Same paragraph: fall back to the starting offset.
    return _begOff - pos.BegOff();
}
//+-------------------------------------------------------------------------
//
//  Member:     Hit::Hit, public
//
//  Synopsis:   Create a hit holding a private copy of an array of positions
//
//  Arguments:  [aPos] -- positions to copy
//              [cPos] -- number of elements in aPos
//
//  Notes:      The copy is a raw memcpy of cPos Position objects into an
//              array obtained from new Position[cPos].
//
//--------------------------------------------------------------------------
Hit::Hit( const Position * aPos, unsigned cPos )
    : _cPos( cPos )
{
    Position * const aCopy = new Position[ cPos ];

    // Bitwise copy of the caller's positions into our own storage.
    memcpy( aCopy, aPos, cPos * sizeof( Position ) );

    _aPos = aCopy;
}
//+-------------------------------------------------------------------------
//
//  Member:     Hit::~Hit, public
//
//  Synopsis:   Release the position array owned by the hit
//
//--------------------------------------------------------------------------
Hit::~Hit()
{
    // _aPos is allocated with new Position[cPos] in the constructor, so it
    // has to be released with the array form of delete.
    delete [] _aPos;
}
//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPositionCount, public
//
//  Synopsis:   Number of positions in the hit the iterator currently
//              refers to
//
//  Returns:    The hit's position count, or zero when the iterator is past
//              the last hit or the current hit slot is empty.
//
//--------------------------------------------------------------------------
int HitIter::GetPositionCount() const
{
    // Iterator is beyond the document's hits.
    if ( _iHit >= _pDoc->_cHit )
        return 0;

    // Slot exists but holds no hit.
    if ( !_pDoc->_aHit[_iHit] )
        return 0;

    return _pDoc->_aHit[_iHit]->Count();
}
//+-------------------------------------------------------------------------
//
//  Member:     HitIter::GetPosition, public
//
//  Synopsis:   Return, by value, one position of the current hit
//
//  Arguments:  [i] -- index of the position within the current hit
//
//  Returns:    The i-th position of the current hit, or a default
//              constructed Position when the iterator is past the last hit
//              or the current hit slot is empty.
//
//--------------------------------------------------------------------------
Position HitIter::GetPosition ( int i ) const
{
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
    {
        // No current hit: hand back a default position.
        Position posDefault;
        return posDefault;
    }

    return _pDoc->_aHit[_iHit]->GetPos( i );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Initialize document with filename
//
//  Arguments:  [filename] -- null-terminated path; a private copy is made
//              [rank]     -- rank value stored with the document
//              [fDelete]  -- if TRUE, Free() deletes the file from disk
//
//  Notes:      Only the name, rank and delete flag are recorded here; the
//              text buffer, paragraph tables and hits are built by Init().
//
//--------------------------------------------------------------------------
Document::Document(WCHAR const* filename, LONG rank, BOOL fDelete)
    : _filename( 0 ),
      _rank( rank ),
      _buffer( 0 ),
      _bufLen( 0 ),
      _bufEnd( 0 ),
      _pFilter( 0 ),
      _aParaOffset( 0 ),
      _isInit( FALSE ),
      _cHit( 0 ),
      _aParaLine( 0 ),
      _maxParaLen( 0 ),
      _cPara( 0 ),
      _chunkCount( 0 ),
      _fDelete( fDelete )
{
    // Character count including the terminating null.
    const size_t cwcName = wcslen( filename ) + 1;

    _filename = new WCHAR[ cwcName ];
    memcpy( _filename, filename, cwcName * sizeof( WCHAR ) );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::Document, public
//
//  Synopsis:   Initialize an empty document (no filename, nothing loaded)
//
//  Notes:      Every member the filename-taking constructor sets is set
//              here as well, including _rank, so no member is left with an
//              indeterminate value.
//
//--------------------------------------------------------------------------
Document::Document()
    : _filename( 0 ),
      _rank( 0 ),
      _buffer( 0 ),
      _bufLen( 0 ),
      _bufEnd( 0 ),
      _pFilter( 0 ),
      _aParaOffset( 0 ),
      _isInit( FALSE ),
      _cHit( 0 ),
      _aParaLine( 0 ),
      _maxParaLen( 0 ),
      _cPara( 0 ),
      _chunkCount( 0 ),
      _fDelete( FALSE )
{}
//+-------------------------------------------------------------------------
//
//  Member:     Document::~Document, public
//
//  Synopsis:   Destroy the document; all cleanup is delegated to Free()
//
//--------------------------------------------------------------------------
Document::~Document()
{
    this->Free();
}
//+-------------------------------------------------------------------------
//
// Member: Document::Free, public
//
// Synopsis: Free document storage
//
//--------------------------------------------------------------------------
void Document::Free()
{
if ( 0 != _filename )
{
if ( _fDelete )
DeleteFile( _filename );
delete [] _filename;
}
if (!_isInit)
return;
for ( unsigned i = 0; i < _cHit; i++ )
{
delete _aHit[i];
_aHit[i] = 0;
}
// _aHit is embedded
delete []_aParaOffset;
_aParaOffset = 0;
if (_aParaLine)
{
for (int i = 0; i < _cPara; i++)
{
while (_aParaLine[i].next != 0)
{
ParaLine* p = _aParaLine[i].next;
_aParaLine[i].next = _aParaLine[i].next->next;
delete p;
}
}
delete _aParaLine;
}
delete _buffer;
_buffer = 0;
_bufEnd = 0;
_cHit = 0;
_isInit = FALSE;
} //Free
//+-------------------------------------------------------------------------
//
//  Member:     Document::Init, public
//
//  Synopsis:   Read-in file, fill array of hits
//
//  Arguments:  [pSearch] -- hit enumerator; initialized here with the
//                           document's filter and then asked for every hit
//
//  Returns:    S_OK if the document ended up initialized, otherwise the
//              failure code recorded while loading it.
//
//  Notes:      Steps: allocate the text buffer, bind to a filter, read the
//              text, split it into paragraphs and lines, re-initialize the
//              filter, and hand it to the search object.  A search
//              failure of QUERY_E_ALLNOISE is tolerated: the file is still
//              shown, just without hits.
//
//              Hits are only enumerated when initialization succeeded;
//              after a failure the paragraph tables RegionToPos relies on
//              may not exist.
//
//--------------------------------------------------------------------------
SCODE Document::Init(ISearchQueryHits *pSearch)
{
    BOOL noHits = FALSE;
    SCODE sc = S_OK;

    TRY
    {
        AllocBuffer( _filename );
        BindToFilter( _filename );

        ULONG ulFlags;
        sc = _pFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                             IFILTER_INIT_CANON_HYPHENS |
                             IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                             0, 0, &ulFlags );
        if (FAILED (sc))
            THROW (CException(sc));

        ReadFile();
        BreakParas();

        if (Paras() != 0)
        {
            BreakLines();

#if 0
            // some filters don't behave correctly if you just re-init them,
            // so release the filter and re-open it.
            _pFilter->Release();
            _pFilter = 0;
            BindToFilter();
#endif

            // Rewind the filter so the search object can walk the chunks
            // again from the start.
            sc = _pFilter->Init ( IFILTER_INIT_CANON_PARAGRAPHS |
                                  IFILTER_INIT_CANON_HYPHENS |
                                  IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                                  0, 0, &ulFlags );

            // Treat a failed re-init like the first one instead of
            // silently overwriting the status below.
            if (FAILED (sc))
                THROW (CException(sc));

            sc = TheSearch->Init( _pFilter, ulFlags );
            if (FAILED (sc))
            {
                if ( QUERY_E_ALLNOISE != sc )
                    THROW (CException(sc));

                // we can still show the file
                sc = S_OK;
                noHits = TRUE;
            }

            // SUCCESS
            _isInit = TRUE;
        }
    }
    CATCH ( CException, e )
    {
        _isInit = FALSE;
        sc = e.GetErrorCode();
    }
    END_CATCH;

    if ( noHits )
    {
        // Nothing to highlight; the document is usable if it has any text.
        _cHit = 0;
        _isInit = (_bufEnd - _buffer) != 0;
    }
    else if ( _isInit )
    {
        //
        // pull up all the hits
        //
        // NOTE(review): _aHit is an embedded array whose capacity is not
        // visible in this file; nothing here bounds _cHit against it --
        // confirm against the class declaration.
        //
        ULONG count;
        FILTERREGION* aRegion;

        // Separate status so the load result in 'sc' is not disturbed.
        SCODE scHit = TheSearch->NextHitOffset ( &count, &aRegion );
        while ( S_OK == scHit )
        {
            XCoMem<FILTERREGION> xRegion( aRegion );

            // Translate every region of this hit into a paragraph position.
            CDynArrayInPlace<Position> aPos( count );
            for (unsigned i = 0; i < count; i++)
                aPos [i] = RegionToPos ( aRegion [i] );
            xRegion.Free();

            // Hold the new hit in a smart pointer until it is stored.
            XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );
            _aHit[_cHit] = xHit.Get();
            _cHit++;
            xHit.Acquire();

            scHit = TheSearch->NextHitOffset ( &count, &aRegion );
        }
    }

    // The filter is only needed while loading.
    if ( _pFilter )
    {
        _pFilter->Release();
        _pFilter = 0;
    }

    return _isInit ? S_OK : sc;
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::RegionToPos
//
//  Synopsis:   Translate a filter region (chunk id + offset within the
//              chunk) into a paragraph-relative Position
//
//  Arguments:  [region] -- region reported for a hit
//
//  Returns:    Position( paragraph, offset in paragraph, region extent ),
//              or a default constructed Position when the region's chunk
//              is not in the chunk table or its offset is not below any
//              recorded paragraph offset.
//
//  Notes:      The chunk and paragraph indexes found by the previous call
//              are remembered in function-level statics and tried first.
//              Being statics, they are shared by every Document object;
//              both are validated before use and reset to 0 when stale.
//
//--------------------------------------------------------------------------
Position Document::RegionToPos ( FILTERREGION& region )
{
    static int s_paraGuess = 0;
    static int s_chunkGuess = 0;
    static Position s_posNone;

    //
    // Step 1: find the chunk the region belongs to.
    //
    const BOOL fChunkGuessOk =
        s_chunkGuess < _chunkCount &&
        _chunk[s_chunkGuess].ChunkId() == region.idChunk;

    if ( !fChunkGuessOk )
    {
        // Chunk ids are stored in increasing order; stop at the first one
        // that is not below the id we want.
        for ( s_chunkGuess = 0; s_chunkGuess < _chunkCount; s_chunkGuess++ )
        {
            if ( _chunk[s_chunkGuess].ChunkId() >= region.idChunk )
                break;
        }

        if ( s_chunkGuess >= _chunkCount ||
             _chunk[s_chunkGuess].ChunkId() != region.idChunk )
        {
            return s_posNone;
        }
    }

    Win4Assert ( s_chunkGuess < _chunkCount );
    Win4Assert ( _chunk[s_chunkGuess].ChunkId() == region.idChunk );

    //
    // Step 2: region start as an offset into the text buffer.
    //
    const ULONG offset = _chunk[s_chunkGuess].Offset() + region.cwcStart;

    //
    // Step 3: find the paragraph containing that offset.
    //
    if ( s_paraGuess >= _cPara || _aParaOffset[s_paraGuess] > offset )
        s_paraGuess = 0;

    Win4Assert ( _aParaOffset[s_paraGuess] <= offset );

    // _aParaOffset[_cPara] is valid!
    while ( s_paraGuess <= _cPara )
    {
        if ( _aParaOffset[s_paraGuess] > offset )
        {
            // The paragraph just before the first one starting beyond the
            // offset is the one that contains it.
            Win4Assert ( s_paraGuess > 0 );
            --s_paraGuess;
            return Position ( s_paraGuess,
                              offset - _aParaOffset[s_paraGuess],
                              region.cwcExtent );
        }
        ++s_paraGuess;
    }

    return s_posNone;
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::AllocBuffer, public
//
//  Synopsis:   Allocate buffer for file text
//
//  Arguments:  [pwcPath] -- path of the file whose size determines the
//                           buffer size
//
//  Notes:      Sets _bufLen to 2.25 times the file size (plus one) and
//              allocates _bufLen + 1 WCHARs; the extra last element is set
//              to 0 and acts as a terminator.
//
//              Throws CException if the file cannot be opened, if its size
//              cannot be obtained, or if the file is too large for the
//              buffer size computation to stay within 32 bits.
//
//--------------------------------------------------------------------------
void Document::AllocBuffer ( WCHAR const * pwcPath )
{
    //
    // We should keep allocating buffers on demand,
    // but for this simple demo we'll just get the
    // file size up front and do a single buffer
    // allocation of 2.25 times the size (to accommodate
    // Unicode expansion). THIS IS JUST A DEMO!
    //
    HANDLE hFile = CreateFile ( pwcPath,
                                GENERIC_READ,
                                FILE_SHARE_READ,
                                0, // security
                                OPEN_EXISTING,
                                FILE_ATTRIBUTE_NORMAL,
                                0 ); // template
    if ( INVALID_HANDLE_VALUE == hFile )
        THROW( CException() );

    const DWORD cbFile = GetFileSize( hFile, 0 );

    // Pick up the error code before CloseHandle has a chance to change it.
    const DWORD dwSizeErr = ( INVALID_FILE_SIZE == cbFile ) ? GetLastError()
                                                            : NO_ERROR;
    CloseHandle ( hFile );

    if ( INVALID_FILE_SIZE == cbFile && NO_ERROR != dwSizeErr )
        THROW( CException( HRESULT_FROM_WIN32( dwSizeErr ) ) );

    // Largest file size for which both the element count
    // (2n + n/4 + 2 <= 9n/4 + 2) and the byte count (twice that) still fit
    // in 32 bits.  This also rejects a size of 0xFFFFFFFF.
    const DWORD cbMaxFile = ( 0x7FFFFFFF - 2 ) / 9 * 4;
    if ( cbFile > cbMaxFile )
        THROW( CException( E_OUTOFMEMORY ) );

    // Unicode from ASCII, twice and then some
    _bufLen = 2 * cbFile + cbFile / 4 + 1;

    _buffer = new WCHAR [_bufLen + 1];
    _buffer[ _bufLen ] = 0;
}
// Signature of the "LoadTextFilter" entry point that MyLoadTextFilter
// resolves from query.dll: takes a file path and returns an IFilter.
typedef HRESULT (__stdcall * PFnLoadTextFilter)( WCHAR const * pwcPath,
IFilter ** ppIFilter );
// Cached address of LoadTextFilter; 0 until MyLoadTextFilter has
// successfully looked it up.
PFnLoadTextFilter g_pLoadTextFilter = 0;
//+-------------------------------------------------------------------------
//
//  Function:   MyLoadTextFilter
//
//  Synopsis:   Call LoadTextFilter from query.dll, resolving and caching
//              its address on first use
//
//  Arguments:  [pwc]      -- path of the file to filter
//              [ppFilter] -- receives the filter
//
//  Returns:    Whatever LoadTextFilter returns, or a failure code if
//              query.dll is not loaded in this process or does not export
//              the entry point.  The lookup failure path never returns a
//              success code, so callers can rely on *ppFilter only after
//              SUCCEEDED().
//
//--------------------------------------------------------------------------
SCODE MyLoadTextFilter( WCHAR const *pwc, IFilter **ppFilter )
{
    if ( 0 == g_pLoadTextFilter )
    {
        // GetModuleHandle only finds modules that are already loaded; do
        // not hand a null module to GetProcAddress.
        HMODULE hQuery = GetModuleHandle( L"query.dll" );
        if ( 0 != hQuery )
        {
            g_pLoadTextFilter = (PFnLoadTextFilter)
                GetProcAddress( hQuery, "LoadTextFilter" );
        }

        if ( 0 == g_pLoadTextFilter )
        {
            // Read the error once; a last-error of 0 would otherwise map
            // to S_OK and report success without a filter.
            const DWORD dwErr = GetLastError();
            const SCODE scLookup = HRESULT_FROM_WIN32( dwErr );
            return FAILED( scLookup ) ? scLookup : E_FAIL;
        }
    }

    return g_pLoadTextFilter( pwc, ppFilter );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::BindToFilter, public
//
//  Synopsis:   Bind to appropriate filter for the document
//
//  Arguments:  [pwcPath] -- path of the file to filter
//
//  Notes:      LoadIFilter is tried first; if it fails, the text filter
//              obtained through MyLoadTextFilter is used instead.  Throws
//              CException with the text filter's error if both fail.
//              On success _pFilter holds the filter.
//
//--------------------------------------------------------------------------
void Document::BindToFilter( WCHAR const * pwcPath )
{
    // Preferred route: the filter registered for this file.
    const SCODE scBind = LoadIFilter( pwcPath, 0, (void **)&_pFilter );
    if ( SUCCEEDED( scBind ) )
        return;

    // Fallback: the plain text filter.
    const SCODE scText = MyLoadTextFilter( pwcPath, &_pFilter );
    if ( FAILED( scText ) )
        THROW( CException( scText ) );
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::ReadFile, public
//
//  Synopsis:   Read file into buffer using the filter
//
//  Notes:      Walks the filter's chunks and copies the text of every
//              contents chunk (guidStorage / PID_STG_CONTENTS) into
//              _buffer.  For each such chunk the chunk id and the buffer
//              offset where its text starts are recorded in _chunk.
//              A chunk break is represented by one separator character
//              written ahead of the chunk's text: a space for word and
//              sentence breaks, U+2029 for paragraph and chapter breaks.
//
//              On return _bufEnd points just past the text read and
//              _chunkCount is the number of recorded chunks.  Text that
//              does not fit into the _bufLen characters of the buffer is
//              dropped.
//
//--------------------------------------------------------------------------
void Document::ReadFile ()
{
    ULONG lenSoFar = 0;
    int cChunk = 0;
    STAT_CHUNK statChunk;

    SCODE sc = _pFilter->GetChunk ( &statChunk );

    // what about all these glueing flags?
    // Take them into account at some point
    // to test more complicated chunking

    // Unavailable links/embeddings are skipped rather than ending the scan.
    while (SUCCEEDED(sc)
           || FILTER_E_LINK_UNAVAILABLE == sc
           || FILTER_E_EMBEDDING_UNAVAILABLE == sc )
    {
        if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) )
        {
            // read the contents only
            if ( statChunk.attribute.guidPropSet == guidStorage &&
                 statChunk.attribute.psProperty.ulKind == PRSPEC_PROPID &&
                 statChunk.attribute.psProperty.propid == PID_STG_CONTENTS )
            {
                // Only write the separator while there is room for it.
                // Writing it into a full buffer would push lenSoFar past
                // _bufLen, and the "remaining space" computed below would
                // then wrap around to a huge value.
                if ( lenSoFar < _bufLen )
                {
                    switch( statChunk.breakType )
                    {
                    case CHUNK_EOW:
                    case CHUNK_EOS:
                        _buffer[lenSoFar++] = L' ';
                        break;

                    case CHUNK_EOP:
                    case CHUNK_EOC:
                        _buffer[lenSoFar++] = UNICODE_PARAGRAPH_SEPARATOR;
                        break;

                    default:
                        // CHUNK_NO_BREAK (or anything else): no separator
                        break;
                    }
                }

                // NOTE(review): _chunk is an embedded array whose capacity
                // is not visible in this file; cChunk is not bounded
                // against it here -- confirm against the class declaration.
                _chunk [cChunk].SetChunkId (statChunk.idChunk);
                Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId () );
                _chunk [cChunk].SetOffset (lenSoFar);
                cChunk++;

                // Pull the chunk's text until the filter reports a failure
                // code (e.g. no more text) or the buffer is full.
                do
                {
                    ULONG lenThis = _bufLen - lenSoFar;
                    if (lenThis == 0)
                        break;

                    sc = _pFilter->GetText( &lenThis, _buffer+lenSoFar );

                    // The buffer may be filled with zeroes. Nice filter.
                    if ( SUCCEEDED(sc) && 0 != lenThis )
                    {
                        // Count only up to the first null character.
                        lenThis = __min( lenThis,
                                         wcslen( _buffer + lenSoFar ) );
                        lenSoFar += lenThis;
                    }
                }
                while (SUCCEEDED(sc));
            }
        } // if SUCCEEDED( sc )

        // next chunk, please
        sc = _pFilter->GetChunk ( &statChunk );
    }

    _bufEnd = _buffer + lenSoFar;
    Win4Assert( lenSoFar <= _bufLen );
    _chunkCount = cChunk;
}
//+-------------------------------------------------------------------------
//
// Member: Document::BreakParas, public
//
// Synopsis: Break document into paragraphs separated by line feeds
//
//--------------------------------------------------------------------------
#define PARAS 25
void Document::BreakParas()
{
int maxParas = PARAS;
_aParaOffset = new unsigned [ maxParas ];
WCHAR * pCur = _buffer;
_cPara = 0;
_maxParaLen = 0;
do
{
if ( _cPara == maxParas )
{
// grow array
unsigned * tmp = new unsigned [maxParas * 2];
for ( int n = 0; n < maxParas; n++ )
tmp[n] = _aParaOffset[n];
delete []_aParaOffset;
_aParaOffset = tmp;
maxParas *= 2;
}
_aParaOffset [_cPara] = (UINT)(pCur - _buffer);
pCur = EatPara(pCur);
_cPara++;
} while ( pCur < _bufEnd );
// store end of buffer offset as _aParaOffset[_cPara]
if ( _cPara == maxParas )
{
// grow array
unsigned * tmp = new unsigned [maxParas + 1];
for ( int n = 0; n < maxParas; n++ )
tmp[n] = _aParaOffset[n];
delete []_aParaOffset;
_aParaOffset = tmp;
maxParas += 1;
}
_aParaOffset [_cPara] = (UINT)(pCur - _buffer - 1);
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::EatPara, private
//
//  Synopsis:   Skip till the line feed
//
//  Arguments:  [pCur] -- start of the paragraph within the text buffer
//
//  Returns:    Pointer just past the paragraph terminator.  A "\r\n" pair
//              counts as one terminator.  The pointer is advanced by one
//              even when the scan stopped at _bufEnd without finding a
//              terminator, so the result may be _bufEnd + 1.
//
//  Notes:      A paragraph ends at '\n', '\r', a null character or U+2029.
//              _maxParaLen is raised to this paragraph's length (without
//              terminator) if that is longer than any seen so far.
//
//--------------------------------------------------------------------------
WCHAR * Document::EatPara( WCHAR * pCur )
{
    WCHAR * const pStart = pCur;

    // search for newline or null
    for ( ; pCur < _bufEnd; pCur++ )
    {
        const int ch = *pCur;
        if ( ch == L'\n' ||
             ch == L'\r' ||
             ch == L'\0' ||
             ch == UNICODE_PARAGRAPH_SEPARATOR )
        {
            break;
        }
    }

    const int cwcPara = (int)(pCur - pStart);

    // eat newline and/or carriage return
    pCur++;
    if ( pCur < _bufEnd && pCur[-1] == L'\r' && pCur[0] == L'\n' )
        pCur++;

    if ( cwcPara > _maxParaLen )
        _maxParaLen = cwcPara;

    return pCur;
}
//+-------------------------------------------------------------------------
//
//  Function:   BreakLine
//
//  Synopsis:   Decide how many characters of a text run go on one line
//
//  Arguments:  [buf]    -- text to break
//              [cwcBuf] -- number of characters available in buf
//              [cwcMax] -- preferred maximum line length (must be > 0)
//
//  Returns:    cwcBuf if the text fits in cwcMax characters.  Otherwise
//              the length up to and including the last space or tab found
//              within the first cwcMax characters.  If there is none (a
//              single word longer than the line), the index of the first
//              space or tab at or after cwcMax, or cwcBuf if the word runs
//              to the end of the text.
//
//--------------------------------------------------------------------------
int BreakLine ( WCHAR* buf, int cwcBuf, int cwcMax )
{
    if ( cwcBuf <= cwcMax )
        return cwcBuf;

    Win4Assert ( cwcMax > 0 );

    int cwcLine;

    // look backwards for whitespace
    for ( cwcLine = cwcMax; cwcLine >= 1; cwcLine-- )
    {
        const WCHAR wc = buf[cwcLine - 1];
        if ( L' ' == wc || L'\t' == wc )
            return cwcLine;
    }

    // a single word larger than screen width
    // try scanning forward
    for ( cwcLine = cwcMax; cwcLine < cwcBuf; cwcLine++ )
    {
        const WCHAR wc = buf[cwcLine];
        if ( L' ' == wc || L'\t' == wc )
            break;
    }

    return cwcLine;
}
const int MAX_LINE_LEN = 110;
void Document::BreakLines()
{
_aParaLine = new ParaLine [_cPara];
for (int i = 0; i < _cPara; i++)
{
int cwcLeft = _aParaOffset[i+1] - _aParaOffset[i];
if (cwcLeft < MAX_LINE_LEN)
_aParaLine[i].offEnd = cwcLeft;
else
{
ParaLine* pParaLine = &_aParaLine[i];
WCHAR* buf = _buffer + _aParaOffset[i];
int cwcOffset = 0;
for (;;)
{
int cwcLine = BreakLine ( buf + cwcOffset, cwcLeft, MAX_LINE_LEN );
cwcOffset += cwcLine;
pParaLine->offEnd = cwcOffset;
cwcLeft -= cwcLine;
if (cwcLeft == 0)
break;
pParaLine->next = new ParaLine;
pParaLine = pParaLine->next;
};
}
}
}
//+-------------------------------------------------------------------------
//
//  Member:     Document::GetLine, public
//
//  Arguments:  [nPara] -- paragraph number
//              [off]   -- offset within paragraph
//              [cwc]   -- in/out chars to copy / copied
//              [buf]   -- target buffer
//
//  Synopsis:   Copy text from paragraph to buffer
//
//  Returns:    FALSE (cwc and buf untouched) if nPara is not a valid
//              paragraph number or off is negative; TRUE otherwise.
//
//  Notes:      At most the characters between off and the start of the
//              next paragraph are copied.  If off lies at or beyond that
//              point, or cwc is not positive on entry, nothing is copied
//              and cwc is set to 0.  The copied text is not null
//              terminated.
//
//--------------------------------------------------------------------------
BOOL Document::GetLine(int nPara, int off, int& cwc, WCHAR* buf)
{
    Win4Assert (_buffer != 0);

    if ( nPara < 0 || nPara >= _cPara || off < 0 )
        return FALSE;

    const WCHAR * pText = _buffer + _aParaOffset[nPara] + off;

    // _aParaOffset [_cPara] is the offset of the end of buffer
    int cwcPara = _aParaOffset[nPara+1] - (_aParaOffset[nPara] + off);

    cwc = __min ( cwc, cwcPara );

    // A negative count must never reach memcpy, where it would be
    // converted to an enormous unsigned length.
    if ( cwc <= 0 )
    {
        cwc = 0;
        return TRUE;
    }

    memcpy ( buf, pText, cwc * sizeof(WCHAR));
    return TRUE;
}
//+-------------------------------------------------------------------------
//
// Member: Document::GetWord, public
//
// Synopsis:
// Copy the string into buffer
//
//--------------------------------------------------------------------------
void Document::GetWord(int nPara, int offSrc, int cwcSrc, WCHAR* buf)
{
Win4Assert (_buffer != 0);
Win4Assert ( nPara < _cPara );
WCHAR * p = _buffer + _aParaOffset[nPara];
Win4Assert ( p + offSrc + cwcSrc <= _bufEnd );
memcpy ( buf, p + offSrc, cwcSrc * sizeof(WCHAR));
}