windows-nt/Source/XPSP1/NT/inetsrv/query/apps/webhits/cdoc.cxx
2020-09-26 16:20:57 +08:00

791 lines
22 KiB
C++

//+-------------------------------------------------------------------------
//
// Microsoft Windows
// Copyright (C) Microsoft Corporation, 1992 - 2000.
//
// File: cdoc.cxx
//
// Contents: a radically stripped down version of the document class
// that gets rid of the notion of paragraph and maintains only
// information relative to the stream
//
//--------------------------------------------------------------------------
#include <pch.cxx>
#pragma hdrstop
#include <cidebug.hxx>
#include <dynstack.hxx>
#include <cimbmgr.hxx>
#include <propspec.hxx>
#include <vquery.hxx>
#include <pageman.hxx>
#include <dblink.hxx>
#include <imprsnat.hxx>
#include <queryexp.hxx>
#include "whmsg.h"
#include "webdbg.hxx"
#include "cdoc.hxx"
//+-------------------------------------------------------------------------
//
// Function: ComparePositions
//
// Arguments: const void* pPos1 - pointer to first position
// const void* pPos2 - pointer to second position
//
// Synopsis: Comparison function used by qsort to sort positions array
//
//--------------------------------------------------------------------------
int _cdecl ComparePositions(
    const void* pPos1,
    const void* pPos2 )
{
    Position* pLeft  = (Position*) pPos1;
    Position* pRight = (Position*) pPos2;

    Win4Assert( 0 != pLeft && 0 != pRight );

    //
    // Only the begin offset takes part in the ordering; two positions that
    // start at the same offset compare equal.
    //

    if ( pLeft->GetBegOffset() < pRight->GetBegOffset() )
        return -1;

    if ( pLeft->GetBegOffset() == pRight->GetBegOffset() )
        return 0;

    return 1;
}
//
// Sort the positions of this hit in place, using ComparePositions as the
// qsort ordering callback.
//
void Hit::Sort()
{
    qsort( _aPos, _cPos, sizeof( Position ), ComparePositions );
}
//+-------------------------------------------------------------------------
//
// Member: Hit::Hit, public
//
// Arguments: [aPos] - array of positions
// [cPos] - number of Positions in [aPos]
//
// Synopsis: Create hit from an array of positions
//
//--------------------------------------------------------------------------
Hit::Hit( const Position * aPos, unsigned cPos )
    : _cPos( cPos )
{
    // A non-empty hit must come with a source array to copy from.
    Win4Assert( 0 != aPos || 0 == cPos );

    // The hit keeps its own copy of the positions; released in ~Hit.
    _aPos = new Position[cPos];

    //
    // memcpy requires valid pointers even when asked to copy zero bytes,
    // and an empty hit may arrive with a null source array, so only copy
    // when there is something to copy.
    //

    if ( 0 != cPos )
        memcpy( _aPos, aPos, sizeof(Position) * cPos );
}
// Release the position array that the constructor allocated with new[].
Hit::~Hit()
{
delete[] _aPos;
}
//+-------------------------------------------------------------------------
//
// Member: HitIter::GetPositionCount, public
//
// Synopsis: return number of positions or zero
//
//--------------------------------------------------------------------------
int HitIter::GetPositionCount() const
{
    // Iterator is past the last hit, or the current slot is empty.
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
        return 0;

    return _pDoc->_aHit[_iHit]->GetPositionCount();
}
//+-------------------------------------------------------------------------
//
// Member: HitIter::GetPosition, public
//
// Synopsis: return position by value
//
//--------------------------------------------------------------------------
Position HitIter::GetPosition ( int i ) const
{
    if ( _iHit >= _pDoc->_cHit || !_pDoc->_aHit[_iHit] )
    {
        // No current hit: hand back a default-constructed position.
        Position posEmpty;
        return posEmpty;
    }

    return _pDoc->_aHit[_iHit]->GetPos( i );
}
//+-------------------------------------------------------------------------
//
// Member: CDocument::CDocument, public constructor
//
// Arguments: [filename] - the name of the file to hit highlight
// [rank] - the rank of document in the hierarchy - NOT USED
// [rSearch] - ISearch object
// [cmsReadTimeout] - timeout for the initial file read
// [lockSingleThreadedFilter] - lock used for all single
// threaded filters
// [propertyList] - properties to be emitted
// [ulDisplayScript] - setting for displaying scripts
//
// Synopsis: Stream the file in chunk by chunk, scan it for hits,
// and record those positions in the stream matching the restriction.
//
//--------------------------------------------------------------------------
CDocument::CDocument(
    WCHAR * filename,
    ULONG rank,
    ISearchQueryHits & rSearch,
    DWORD cmsReadTimeout,
    CReleasableLock & lockSingleThreadedFilter,
    CEmptyPropertyList & propertyList,
    ULONG ulDisplayScript )
: _filename( filename ),
  _rank( rank ),
  _bufEnd( 0 ),
  _iChunkHint( 0 ),
  _cHit( 0 ),
  _rSearch( rSearch ),
  _cmsReadTimeout( cmsReadTimeout ),
  _lockSingleThreadedFilter( lockSingleThreadedFilter )
{
    BOOL noHits = FALSE;

    //
    // cut away anything after the non-drive colon
    // like in c:\wzmail\foo.fld:12.wzm
    //
    // The drive-colon test only looks at _filename[1] when the name is
    // non-empty; for an empty string index 1 is past the terminator.
    //
    // NOTE(review): the truncation writes a null into the string that
    // _filename refers to; if that is the caller's buffer, the caller's
    // string is modified in place - confirm.
    //

    WCHAR* pChar = _filename;

    if ( 0 != _filename[0] && _filename[1] == L':' )
        pChar += 2;

    while (*pChar != 0 && *pChar != L':')
        pChar++;

    if(*pChar == L':')
        *pChar = 0;

    //
    // allocate a buffer to hold the file
    //

    AllocBuffer();

    //
    // attach to IFilter
    //

    BOOL fKnownFilter = BindToFilter();

    // Check if this file's extension has a script mapping (if necessary)

    BOOL fHasScriptMap = FALSE;

    if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) ||
         ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) &&
           ( !fKnownFilter ) ) )
    {
        WCHAR *pwcExt = wcsrchr( _filename, L'.' );

        webDebugOut(( DEB_ITRACE, "extension: '%ws'\n", pwcExt ));

        if ( 0 != pwcExt )
        {
            //
            // .asp files include .inc files.  .inc files don't have a script
            // map but they contain script.  I'm not aware of a good way to
            // enumerate all possible include file extensions for asp.
            //

            if ( !_wcsicmp( pwcExt, L".inc" ) )
                fHasScriptMap = TRUE;
            else
            {
                //
                // Must be system to read the metabase
                //

                CImpersonateSystem system;

                CMetaDataMgr mdMgr( TRUE, W3VRoot );
                fHasScriptMap = mdMgr.ExtensionHasScriptMap( pwcExt );
            }
        }
    }

    webDebugOut(( DEB_ITRACE,
                  "fHasScriptMap %d, fKnownFilter %d, ulDisplayScript %d\n",
                  fHasScriptMap, fKnownFilter, ulDisplayScript ));

    //
    // Refuse to display a file that has a script mapping when the display
    // setting forbids it.
    //

    if ( fHasScriptMap )
    {
        if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) ||
             ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) &&
               ( !fKnownFilter ) ) )
        {
            THROW( CException( MSG_WEBHITS_PATH_INVALID ) );
        }
    }

    //
    // Initialize IFilter.  Pass the list of properties to be emitted, since
    // some other properties may have sensitive information (eg passwords in
    // vbscript code in .asp files).
    //

    // First count how many properties exist.

    ULONG cProps = propertyList.GetCount();

    // Copy the properties

    CDbColumns aSpecs( cProps );
    CDbColId prop;

    // Fill aSpecs with placeholder entries; they are overwritten below.
    for ( unsigned iProp = 0; iProp < cProps; iProp++ )
        aSpecs.Add( prop, iProp );

    typedef CPropEntry * PCPropEntry;
    XArray<PCPropEntry> xapPropEntries(cProps);

    SCODE sc = propertyList.GetAllEntries(xapPropEntries.GetPointer(), cProps);
    Win4Assert(S_OK == sc);

    if (FAILED (sc))
        THROW (CException(sc));

    PCPropEntry *apPropEntries = xapPropEntries.GetPointer();

    for (ULONG i = 0; i < cProps; i++)
    {
        CDbColId * pcol = (CDbColId *) &aSpecs.Get( i );

        *pcol = apPropEntries[i]->PropSpec();

        // An invalid column after the assignment is reported as
        // out of memory.
        if ( !pcol->IsValid())
            THROW (CException(E_OUTOFMEMORY));
    }

    webDebugOut(( DEB_ITRACE, "%d properties being processed\n", cProps ));

    ULONG ulFlags;

    sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                         IFILTER_INIT_CANON_HYPHENS |
                         IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                         cProps,
                         (FULLPROPSPEC *) aSpecs.GetColumnsArray(),
                         &ulFlags );

    if (FAILED (sc))
        THROW (CException(sc));

    //
    // pull the contents of the file into the buffer
    //

    ReadFile();

    // Some broken filters don't work right if you Init() them twice, so
    // throw away the IFilter, and get it again.

    _xFilter.Free();
    BindToFilter();

    sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                         IFILTER_INIT_CANON_HYPHENS |
                         IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                         cProps,
                         (FULLPROPSPEC *) aSpecs.GetColumnsArray(),
                         &ulFlags );

    if (FAILED (sc))
        THROW (CException(sc));

    //
    // attach to ISearchQueryHits, which will find the hits
    //

    sc = _rSearch.Init( _xFilter.GetPointer(), ulFlags );

    if (FAILED (sc))
    {
        if ( QUERY_E_INVALIDRESTRICTION != sc )
            THROW (CException(sc));

        // we can still show the file

        noHits = TRUE;
    }

    //
    // pull up all the hits
    //

    TRY
    {
        if (!noHits)
        {
            ULONG count;
            FILTERREGION* aRegion;

            SCODE sc = _rSearch.NextHitOffset( &count, &aRegion );

            while ( S_OK == sc )
            {
                // xRegion owns the region array returned for this hit.
                XCoMem<FILTERREGION> xRegion( aRegion );

                webDebugOut(( DEB_ITRACE,
                              "CDOCUMENT: next hit: count %d, chunk %d offset %d, ext %d\n",
                              count,
                              aRegion[0].idChunk,
                              aRegion[0].cwcStart,
                              aRegion[0].cwcExtent ));

                CDynArrayInPlace<Position> aPos( count );

                //
                // get the positions in the hit
                //

                for (unsigned i = 0; i < count; i++)
                {
                    aPos[i] = RegionToPos( aRegion [i] );

                    webDebugOut(( DEB_ITRACE,
                                  " region %d, start %d, length %d\n",
                                  i,
                                  aPos[i].GetBegOffset(),
                                  aPos[i].GetLength() ));
                }

                xRegion.Free();

                //
                // Hold the new Hit in xHit until it has been stored in
                // _aHit, then hand ownership over to the array.
                //

                XPtr<Hit> xHit( new Hit( aPos.GetPointer(), count ) );

                _aHit[_cHit] = xHit.GetPointer();
                _cHit++;
                xHit.Acquire();

                sc = _rSearch.NextHitOffset( &count, &aRegion );
            }

            if ( FAILED( sc ) )
                THROW( CException( sc ) );
        }
    }
    CATCH( CException, e )
    {
        // Delete the hits gathered so far before passing the error on.
        FreeHits();
        RETHROW();
    }
    END_CATCH;

    // done with the filter

    _xFilter.Free();

    // BindToFilter may have requested the single-threaded filter lock.
    if ( _lockSingleThreadedFilter.IsHeld() )
        _lockSingleThreadedFilter.Release();
} //CDocument
//+-------------------------------------------------------------------------
//
// Member: CDocument::~CDocument, public
//
// Synopsis: Free CDocument
//
//--------------------------------------------------------------------------
CDocument::~CDocument()
{
// Delete every Hit still referenced from _aHit.
FreeHits();
} //~CDocument
//+-------------------------------------------------------------------------
//
// Member: CDocument::FreeHits, public
//
// Synopsis: Free CDocument storage
//
//--------------------------------------------------------------------------
void CDocument::FreeHits()
{
    //
    // Delete each Hit referenced from _aHit and null out its slot, then
    // reset the hit count so that a second call finds nothing to free.
    //

    unsigned iHit = 0;

    while ( iHit < _cHit )
    {
        delete _aHit[iHit];
        _aHit[iHit] = 0;
        iHit++;
    }

    _cHit = 0;
} //FreeHits
//+-------------------------------------------------------------------------
//
// Member: CDocument::RegionToPos, public
//
// Synopsis: Convert a FILTERREGION to a position
//
//--------------------------------------------------------------------------
Position CDocument::RegionToPos(
    FILTERREGION& region )
{
    //
    // Reuse the chunk index left behind by the previous call when it still
    // names the chunk this region belongs to.
    //

    BOOL fHintMatches = _iChunkHint < _chunkCount &&
                        _chunk[_iChunkHint].ChunkId() == region.idChunk;

    if ( !fHintMatches )
    {
        //
        // Linear search from the start of the chunk table for the first
        // entry whose id is not below the wanted one.  In profile runs
        // this has never shown up as a problem.  Fix if this changes.
        //

        for ( _iChunkHint = 0; _iChunkHint < _chunkCount; _iChunkHint++ )
        {
            if ( _chunk[_iChunkHint].ChunkId() >= region.idChunk )
                break;
        }

        //
        // The region refers to a chunk that is not in the table: return
        // a default-constructed position.
        //

        if ( _iChunkHint >= _chunkCount ||
             _chunk[_iChunkHint].ChunkId() != region.idChunk )
        {
            return Position();
        }
    }

    //
    // _iChunkHint is now the index of the matching chunk
    //

    Win4Assert ( _iChunkHint < _chunkCount );
    Win4Assert ( _chunk[_iChunkHint].ChunkId() == region.idChunk );

    //
    // Linear offset of the position from the beginning of the
    // stream/buffer: chunk start plus offset within the chunk.
    //

    ULONG offset = _chunk[_iChunkHint].Offset() + region.cwcStart;

    return Position( offset, region.cwcExtent );
} //RegionToPos
//+-------------------------------------------------------------------------
//
// Member: CDocument::AllocBuffer, public
//
// Synopsis: Allocate buffer for file text
//
//--------------------------------------------------------------------------
void CDocument::AllocBuffer()
{
HANDLE hFile = CreateFile( _filename,
GENERIC_READ,
FILE_SHARE_READ,
0, // security
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL,
0 ); // template
if ( INVALID_HANDLE_VALUE == hFile )
THROW( CException() );
ULONG cbBuf = GetFileSize( hFile, 0 );
CloseHandle( hFile );
// Allow extra room for custom properties to be emitted from the
// filter, plus the conversion to unicode
_xBuffer.Init( cbBuf + cbBuf / 2 );
} //AllocBuffer
//+-------------------------------------------------------------------------
//
// Member: CDocument::BindToFilter, public
//
// Synopsis: Bind to appropriate filter for the CDocument
//
// Returns: TRUE if an appropriate filter was found
// FALSE if defaulted to the text filter
//
//--------------------------------------------------------------------------
BOOL CDocument::BindToFilter()
{
    //
    // First attempt: load the filter free threaded.
    //

    SCODE scLoad = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), FALSE );

    if ( S_FALSE == scLoad )
    {
        //
        // S_FALSE is treated as "the filter is not thread safe".  Take the
        // single-threaded filter lock unless it is already held
        // (CDocument's constructor calls BindToFilter twice).  No check is
        // made that this particular filter is in use -- only that some
        // non-thread-safe filter is.
        //

        if ( !_lockSingleThreadedFilter.IsHeld() )
            _lockSingleThreadedFilter.Request();

        // Second attempt: load the filter as single-threaded.

        scLoad = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), TRUE );
    }

    if ( SUCCEEDED( scLoad ) )
        return TRUE;

    //
    // No filter could be loaded for the file: default to the text filter.
    //

    scLoad = LoadTextFilter( _filename, _xFilter.GetPPointer() );

    if ( FAILED( scLoad ) )
        THROW( CException( scLoad ) );

    return FALSE;
} //BindToFilter
//+-------------------------------------------------------------------------
//
// Function: GetThreadTime
//
// Synopsis: Gets the current total cpu usage for the thread
//
//--------------------------------------------------------------------------
LONGLONG GetThreadTime()
{
    FILETIME ftCreation, ftExit;

    // Zero-initialized so nothing indeterminate is ever read.
    LONGLONG llUser = 0;
    LONGLONG llKernel = 0;

    // The casts below rely on FILETIME and LONGLONG having the same size.
    Win4Assert( sizeof(LONGLONG) == sizeof(FILETIME) );

    BOOL fOk = GetThreadTimes( GetCurrentThread(),
                               &ftCreation,              // Creation time
                               &ftExit,                  // Exit time
                               (FILETIME *) &llUser,     // user mode time
                               (FILETIME *) &llKernel ); // kernel mode time

    //
    // If the times could not be retrieved, report no cpu usage rather
    // than a value built from whatever the call left behind.
    //

    if ( !fOk )
        return 0;

    return llKernel + llUser;
} //GetThreadTime
//+-------------------------------------------------------------------------
//
// Member: CDocument::ReadFile, public
//
// Synopsis: Read file into buffer using the filter
//
//--------------------------------------------------------------------------
void CDocument::ReadFile()
{
// get the maximum cpu time in 100s of nano seconds.
LONGLONG llLimitCpuTime = _cmsReadTimeout * 1000 * 10000;
llLimitCpuTime += GetThreadTime();
ULONG cwcSoFar = 0;
int cChunk = 0;
BOOL fSeenProp = FALSE;
STAT_CHUNK statChunk;
SCODE sc = _xFilter->GetChunk ( &statChunk );
//
// Take them into account at some point
// to test more complicated chunking
//
//
// keep getting chunks of the file, placing them in the buffer,
// and setting the chunk offset markers that will be used to
// interpolate the buffer
//
while ( SUCCEEDED(sc)
|| FILTER_E_LINK_UNAVAILABLE == sc
|| FILTER_E_EMBEDDING_UNAVAILABLE == sc
|| FILTER_E_NO_TEXT == sc )
{
//
// Eliminate all chunks with idChunkSource 0 right here - these
// cannot be hit highlighted.
// Also eliminate all CHUNK_VALUE chunks.
//
if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) && (0 != statChunk.idChunkSource) )
{
//
// set markers
//
Win4Assert ( cChunk == 0 || statChunk.idChunk >
_chunk [cChunk - 1].ChunkId() );
//
// If there was an end of sentence or paragraph or chapter, we
// should introduce an appropriate spacing character.
//
if ( statChunk.breakType != CHUNK_NO_BREAK &&
cwcSoFar < _xBuffer.Count() )
{
switch (statChunk.breakType)
{
case CHUNK_EOW:
case CHUNK_EOS:
_xBuffer[cwcSoFar++] = L' '; // introduce a space character
break;
case CHUNK_EOP:
case CHUNK_EOC:
_xBuffer[cwcSoFar++] = UNICODE_PARAGRAPH_SEPARATOR;
break;
}
}
//
// The Offset into the stream depends on whether this is an
// 'original' chunk or not
//
CCiPropSpec* pProp = (CCiPropSpec*) &statChunk.attribute;
webDebugOut(( DEB_ITRACE,
"Chunk %d, Source %d, Contents %d, start %d, cwc %d\n",
statChunk.idChunk,
statChunk.idChunkSource,
pProp->IsContents(),
statChunk.cwcStartSource,
statChunk.cwcLenSource ));
if ( (statChunk.idChunk == statChunk.idChunkSource) &&
pProp->IsContents() )
{
_chunk[cChunk].SetChunkId( statChunk.idChunk );
_chunk[cChunk].SetOffset( cwcSoFar );
cChunk++;
#if 0
}
else if ( statChunk.idChunk != statChunk.idChunkSource )
{
_chunk [cChunk].SetChunkId (statChunk.idChunk);
//
// we have to first find the offset of the source chunk
//
for (int i=cChunk-1;i>=0;i--)
{
if (_chunk[i].ChunkId() == statChunk.idChunkSource)
{
_chunk[cChunk].SetOffset(_chunk[i].Offset()+statChunk.cwcStartSource);
break;
}
}
cChunk++;
}
//
// if the chunk is a contents chunk and idChunkSrc = idChunk,
// then pull it in
//
if ( (statChunk.idChunk == statChunk.idChunkSource) &&
pProp->IsContents() )
{
#endif
webDebugOut(( DEB_ITRACE, "CDOC: markers: chunk %d offset %d\n",
_chunk[cChunk-1].ChunkId(),
_chunk[cChunk-1].Offset() ));
//
// push the text into memory
//
do
{
ULONG cwcThis = _xBuffer.Count() - cwcSoFar;
if ( 0 == cwcThis )
break;
sc = _xFilter->GetText( &cwcThis,
_xBuffer.GetPointer() + cwcSoFar );
if (SUCCEEDED(sc))
{
cwcSoFar += cwcThis;
}
}
while (SUCCEEDED(sc));
}
} // If SUCCEEDED( sc )
if ( GetThreadTime() > llLimitCpuTime )
{
webDebugOut(( DEB_ERROR, "Webhits took too long. Timeout\n" ));
THROW( CException( MSG_WEBHITS_TIMEOUT ) );
}
//
// next chunk, please
//
sc = _xFilter->GetChunk ( &statChunk );
}
_bufEnd = _xBuffer.GetPointer() + cwcSoFar;
_chunkCount = cChunk;
} //ReadFile
//+-------------------------------------------------------------------------
//
// Member: CDocument::GetWritablePointerToOffset, public
//
// Arguments: [offset] - the offset in the stream that we want a pointer to
//
// Synopsis: Return a writable pointer to a specific offset in the buffer.
// A negative offset yields the start of the buffer; an offset at
// or beyond the end of the text yields _bufEnd.
//
//--------------------------------------------------------------------------
WCHAR* CDocument::GetWritablePointerToOffset(
    long offset )
{
    WCHAR * pwcStart = _xBuffer.GetPointer();

    if ( offset < 0 )
        return pwcStart;

    //
    // Compare the offset against the text length instead of building
    // pwcStart + offset first: for a large offset that sum can lie
    // outside the buffer, or wrap around, and then compare below
    // _bufEnd.
    //

    if ( offset < _bufEnd - pwcStart )
        return pwcStart + offset;

    return _bufEnd;
} //GetWritablePointerToOffset
//+-------------------------------------------------------------------------
//
// Member: CDocument::GetPointerToOffset, public
//
// Arguments: [offset] - the offset in the stream that we want a pointer to
//
// Synopsis: Return a constant pointer to a specific offset in the buffer
//
//--------------------------------------------------------------------------
const WCHAR* CDocument::GetPointerToOffset(long offset)
{
    // Same lookup as GetWritablePointerToOffset, exposed as read-only.
    const WCHAR * pwcText = GetWritablePointerToOffset( offset );

    return pwcText;
} //GetPointerToOffset