//+------------------------------------------------------------------------- // // Microsoft Windows // Copyright (C) Microsoft Corporation, 1992 - 2000. // // File: cdoc.cxx // // Contents: a radically stripped down version of the document class // that gets rid of the notion of paragragph and maintains only // information relative to the stream // //-------------------------------------------------------------------------- #include #pragma hdrstop #include #include #include #include #include #include #include #include #include #include "whmsg.h" #include "webdbg.hxx" #include "cdoc.hxx" //+------------------------------------------------------------------------- // // Function: ComparePositions // // Arguments: const void* pPos1 - pointer to first position // const void* pPos2 - pointer to second position // // Synopsis: Comparison function used by qsort to sort positions array // //-------------------------------------------------------------------------- int _cdecl ComparePositions( const void* pPos1, const void* pPos2 ) { Position* pp1= (Position*) pPos1; Position* pp2= (Position*) pPos2; Win4Assert(0 != pp1 && 0 !=pp2); if (pp1->GetBegOffset() == pp2->GetBegOffset()) return 0; else if (pp1->GetBegOffset() < pp2->GetBegOffset()) return -1; else return 1; } void Hit::Sort() { qsort( _aPos, _cPos, sizeof(Position), &ComparePositions ); } //+------------------------------------------------------------------------- // // Member: Hit::Hit, public // // Arguments: [aPos] - array of positions // [cPos] - number of Positions in [aPos] // // Synopsis: Create hit from an array of positions // //-------------------------------------------------------------------------- Hit::Hit( const Position * aPos, unsigned cPos ) : _cPos(cPos) { _aPos = new Position[cPos]; memcpy( _aPos, aPos, sizeof(Position) * cPos ); } Hit::~Hit() { delete[] _aPos; } //+------------------------------------------------------------------------- // // Member: HitIter::GetPositionCount, public // // 
Synopsis: return number of positions or zero // //-------------------------------------------------------------------------- int HitIter::GetPositionCount() const { if (_iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit]) return _pDoc->_aHit[_iHit]->GetPositionCount(); return 0; } //+------------------------------------------------------------------------- // // Member: HitIter::GetPosition, public // // Synopsis: return position by value // //-------------------------------------------------------------------------- Position HitIter::GetPosition ( int i ) const { if ( _iHit < _pDoc->_cHit && _pDoc->_aHit[_iHit] ) return _pDoc->_aHit[_iHit]->GetPos(i); else { Position pos; return( pos ); } } //+------------------------------------------------------------------------- // // Member: CDocument::CDocument, public constructor // // Arguments: [filename] - the name of the file to hit highlight // [rank] - the rank of document in the hierarchy - NOT USED // [rSearch] - ISearch object // [cmsReadTimeout] - timeout for the initial file read // [lockSingleThreadedFilter] - lock used for all single // threaded filters // [propertyList] - properties to be emitted // [ulDisplayScript] - setting for displaying scripts // // Synopsis: Stream the file in chunk by chunk, scan it for hits, // and record those positions in the stream matching the restricition. 
//
//  Notes:      Failures are reported by throwing CException.  A file whose
//              extension has a script mapping is rejected with
//              MSG_WEBHITS_PATH_INVALID when [ulDisplayScript] does not
//              permit showing it.  If _rSearch.Init fails with
//              QUERY_E_INVALIDRESTRICTION the document is still built, but
//              without hits.
//
//--------------------------------------------------------------------------

CDocument::CDocument(
    WCHAR * filename,
    ULONG rank,
    ISearchQueryHits & rSearch,
    DWORD cmsReadTimeout,
    CReleasableLock & lockSingleThreadedFilter,
    CEmptyPropertyList & propertyList,
    ULONG ulDisplayScript )
: _filename( filename ),
  _rank( rank ),
  _bufEnd( 0 ),
  _iChunkHint( 0 ),
  _cHit( 0 ),
  _rSearch( rSearch ),
  _cmsReadTimeout( cmsReadTimeout ),
  _lockSingleThreadedFilter( lockSingleThreadedFilter )
{
    BOOL noHits = FALSE;

    //
    //  cut away anything after the non-drive colon
    //  like in c:\wzmail\foo.fld:12.wzm
    //
    //  The terminator is written through _filename, i.e. the string is
    //  truncated in place.
    //  NOTE(review): _filename[1] is read unconditionally -- presumably
    //  callers never pass an empty string; confirm.
    //

    WCHAR* pChar = _filename;
    if ( _filename[1] == L':')
        pChar += 2;
    while (*pChar != 0 && *pChar != L':')
        pChar++;
    if(*pChar == L':')
        *pChar = 0;

    //
    //  allocate a buffer to hold the file
    //

    AllocBuffer();

    //
    //  attach to IFilter
    //

    BOOL fKnownFilter = BindToFilter();

    // Check if this file's extension has a script mapping (if necessary).
    // The lookup is only done for the display-script settings under which
    // a script-mapped file must be rejected below.

    BOOL fHasScriptMap = FALSE;

    if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) ||
         ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) &&
           ( !fKnownFilter ) ) )
    {
        WCHAR *pwcExt = wcsrchr( _filename, L'.' );
        webDebugOut(( DEB_ITRACE, "extension: '%ws'\n", pwcExt ));

        if ( 0 != pwcExt )
        {
            //
            // .asp files include .inc files.  .inc files don't have a script
            // map but they contain script.  I'm not aware of a good way to
            // enumerate all possible include file extensions for asp.
            //

            if ( !_wcsicmp( pwcExt, L".inc" ) )
                fHasScriptMap = TRUE;
            else
            {
                //
                //  Must be system to read the metabase
                //

                CImpersonateSystem system;

                CMetaDataMgr mdMgr( TRUE, W3VRoot );
                fHasScriptMap = mdMgr.ExtensionHasScriptMap( pwcExt );
            }
        }
    }

    webDebugOut(( DEB_ITRACE,
                  "fHasScriptMap %d, fKnownFilter %d, ulDisplayScript %d\n",
                  fHasScriptMap, fKnownFilter, ulDisplayScript ));

    // fHasScriptMap can only be TRUE when the condition below already held
    // above, so the inner test repeats the same check.

    if ( fHasScriptMap )
    {
        if ( ( DISPLAY_SCRIPT_NONE == ulDisplayScript ) ||
             ( ( DISPLAY_SCRIPT_KNOWN_FILTER == ulDisplayScript ) &&
               ( !fKnownFilter ) ) )
        {
            THROW( CException( MSG_WEBHITS_PATH_INVALID ) );
        }
    }

    //
    //  Initialize IFilter.  Pass the list of properties to be emitted, since
    //  some other properties may have sensitive information (eg passwords in
    //  vbscript code in .asp files).
    //

    // First count how many properties exist.

    ULONG cProps = propertyList.GetCount();

    // Copy the properties: aSpecs is first filled with cProps copies of a
    // default CDbColId, then each slot is overwritten further down.

    CDbColumns aSpecs( cProps );
    CDbColId prop;

    for ( unsigned iProp = 0; iProp < cProps; iProp++ )
        aSpecs.Add( prop, iProp );

    // NOTE(review): XArray, XCoMem, CDynArrayInPlace and XPtr are used
    // without template arguments in this copy of the file; the arguments
    // appear to have been lost -- TODO restore from the original source.

    typedef CPropEntry * PCPropEntry;
    XArray xapPropEntries(cProps);

    SCODE sc = propertyList.GetAllEntries(xapPropEntries.GetPointer(), cProps);
    Win4Assert(S_OK == sc);
    if (FAILED (sc))
        THROW (CException(sc));

    PCPropEntry *apPropEntries = xapPropEntries.GetPointer();

    for (ULONG i = 0; i < cProps; i++)
    {
        CDbColId * pcol = (CDbColId *) &aSpecs.Get( i );

        *pcol = apPropEntries[i]->PropSpec();
        if ( !pcol->IsValid())
            THROW (CException(E_OUTOFMEMORY));
    }

    webDebugOut(( DEB_ITRACE, "%d properties being processed\n", cProps ));

    ULONG ulFlags;
    sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                         IFILTER_INIT_CANON_HYPHENS |
                         IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                         cProps,
                         (FULLPROPSPEC *) aSpecs.GetColumnsArray(),
                         &ulFlags );
    if (FAILED (sc))
        THROW (CException(sc));

    //
    //  pull the contents of the file into the buffer
    //

    ReadFile();

    // Some broken filters don't work right if you Init() them twice, so
    // throw away the IFilter, and get it again.

    _xFilter.Free();
    BindToFilter();

    sc = _xFilter->Init( IFILTER_INIT_CANON_PARAGRAPHS |
                         IFILTER_INIT_CANON_HYPHENS |
                         IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,
                         cProps,
                         (FULLPROPSPEC *) aSpecs.GetColumnsArray(),
                         &ulFlags );
    if (FAILED (sc))
        THROW (CException(sc));

    //
    //  attach to ISearchQueryHits, which will find the hits
    //

    sc = _rSearch.Init( _xFilter.GetPointer(), ulFlags );
    if (FAILED (sc))
    {
        if ( QUERY_E_INVALIDRESTRICTION != sc )
            THROW (CException(sc));

        // we can still show the file
        noHits = TRUE;
    }

    //
    //  pull up all the hits
    //

    TRY
    {
        if (!noHits)
        {
            ULONG count;
            FILTERREGION* aRegion;

            // this sc shadows the outer one for the rest of the TRY block
            SCODE sc = _rSearch.NextHitOffset( &count, &aRegion );

            // only S_OK continues the loop; any other success code ends it
            // quietly, a failure code is thrown after the loop
            while ( S_OK == sc )
            {
                XCoMem xRegion( aRegion );

                webDebugOut(( DEB_ITRACE,
                              "CDOCUMENT: next hit: count %d, chunk %d offset %d, ext %d\n",
                              count,
                              aRegion[0].idChunk,
                              aRegion[0].cwcStart,
                              aRegion[0].cwcExtent ));

                CDynArrayInPlace aPos( count );

                //
                //  get the positions in the hit
                //

                for (unsigned i = 0; i < count; i++)
                {
                    aPos[i] = RegionToPos( aRegion [i] );
                    webDebugOut(( DEB_ITRACE,
                                  " region %d, start %d, length %d\n",
                                  i,
                                  aPos[i].GetBegOffset(),
                                  aPos[i].GetLength() ));
                }

                xRegion.Free();

                // The new Hit is held by xHit until it has been stored in
                // _aHit.  NOTE(review): Acquire() presumably makes xHit give
                // up ownership so that FreeHits() is the only owner; confirm.

                XPtr xHit( new Hit( aPos.GetPointer(), count ) );

                _aHit[_cHit] = xHit.GetPointer();
                _cHit++;
                xHit.Acquire();

                sc = _rSearch.NextHitOffset( &count, &aRegion );
            }

            if ( FAILED( sc ) )
                THROW( CException( sc ) );
        }
    }
    CATCH( CException, e )
    {
        // drop the hits collected so far, then pass the exception on
        FreeHits();
        RETHROW();
    }
    END_CATCH;

    // done with the filter

    _xFilter.Free();

    // BindToFilter may have requested the single-threaded-filter lock

    if ( _lockSingleThreadedFilter.IsHeld() )
        _lockSingleThreadedFilter.Release();
} //CDocument

//+-------------------------------------------------------------------------
//
//  Member:     CDocument::~CDocument, public
//
//  Synopsis:   Free CDocument; deletes the recorded hits via FreeHits
//
//--------------------------------------------------------------------------

CDocument::~CDocument()
{
    FreeHits();
} //~CDocument

//+-------------------------------------------------------------------------
//
//  Member:     CDocument::FreeHits, public
//
//  Synopsis:   Free CDocument storage
//
//-------------------------------------------------------------------------- void CDocument::FreeHits() { // // walk through _aHit, deleting each Positions array that the // cells are pointing to // for ( unsigned i = 0; i < _cHit; i++ ) { delete _aHit[i]; _aHit[i] = 0; } _cHit = 0; } //Free //+------------------------------------------------------------------------- // // Member: CDocument::RegionToPos, public // // Synopsis: Convert a FILTERREGION to a position // //-------------------------------------------------------------------------- Position CDocument::RegionToPos( FILTERREGION& region ) { // // Use a linear search here. In profile runs this has never shown // up as a problem. Fix if this changes. // ULONG offset = ULONG (-1); // // check whether we're not trying to access an illegal chunk // if (_iChunkHint >= _chunkCount || _chunk[_iChunkHint].ChunkId() != region.idChunk ) { _iChunkHint = 0; while ( _iChunkHint < _chunkCount && _chunk[_iChunkHint].ChunkId() < region.idChunk ) { _iChunkHint++; } if (_iChunkHint >= _chunkCount || _chunk[_iChunkHint].ChunkId() != region.idChunk) { return Position(); } } // // _iChunkHint now contains the index of the appropriate chunk in the // chunk array // Win4Assert ( _iChunkHint < _chunkCount ); Win4Assert ( _chunk[_iChunkHint].ChunkId() == region.idChunk ); // // offset now stores the linear offset of the position from the // beginning of the stream/buffer // offset = _chunk[_iChunkHint].Offset() + region.cwcStart; return Position (offset,region.cwcExtent ); } //RegionToPos //+------------------------------------------------------------------------- // // Member: CDocument::AllocBuffer, public // // Synopsis: Allocate buffer for file text // //-------------------------------------------------------------------------- void CDocument::AllocBuffer() { HANDLE hFile = CreateFile( _filename, GENERIC_READ, FILE_SHARE_READ, 0, // security OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0 ); // template if ( INVALID_HANDLE_VALUE == 
hFile ) THROW( CException() ); ULONG cbBuf = GetFileSize( hFile, 0 ); CloseHandle( hFile ); // Allow extra room for custom properties to be emitted from the // filter, plus the conversion to unicode _xBuffer.Init( cbBuf + cbBuf / 2 ); } //AllocBuffer //+------------------------------------------------------------------------- // // Member: CDocument::BindToFilter, public // // Synopsis: Bind to appropriate filter for the CDocument // // Returns: TRUE if an appropriate filter was found // FALSE if defaulted to the text filter // //-------------------------------------------------------------------------- BOOL CDocument::BindToFilter() { // // Bind to the filter interface -- try free threaded first. If the // filter isn't thread-safe, grab the lock and get the filter. // SCODE sc = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), FALSE ); // Is the filter not thread safe? If so, get the lock to protect // the filter. No checking is done to see that this particular // filter is in use -- just that some non-thread-safe filter is in use. 
if ( S_FALSE == sc ) { // If the lock isn't held yet, get it (BindToFilter is called // twice by CDocument's constructor, so check IsHeld()) if ( !_lockSingleThreadedFilter.IsHeld() ) _lockSingleThreadedFilter.Request(); // retry to load the filter as single-threaded sc = LoadBHIFilter( _filename, 0, _xFilter.GetQIPointer(), TRUE ); } BOOL fFoundFilter = TRUE; if ( FAILED(sc) ) { sc = LoadTextFilter( _filename, _xFilter.GetPPointer() ); if (FAILED(sc)) THROW (CException(sc)); fFoundFilter = FALSE; } return fFoundFilter; } //BindToFilter //+------------------------------------------------------------------------- // // Function: GetThreadTime // // Synopsis: Gets the current total cpu usage for the thread // //-------------------------------------------------------------------------- LONGLONG GetThreadTime() { FILETIME ftDummy1, ftDummy2; LONGLONG llUser, llKernel; Win4Assert( sizeof(LONGLONG) == sizeof(FILETIME) ); GetThreadTimes( GetCurrentThread(), &ftDummy1, // Creation time &ftDummy2, // Exit time (FILETIME *) &llUser, // user mode time (FILETIME *) &llKernel ); // kernel mode tiem return llKernel + llUser; } //GetThreadTime //+------------------------------------------------------------------------- // // Member: CDocument::ReadFile, public // // Synopsis: Read file into buffer using the filter // //-------------------------------------------------------------------------- void CDocument::ReadFile() { // get the maximum cpu time in 100s of nano seconds. 
LONGLONG llLimitCpuTime = _cmsReadTimeout * 1000 * 10000; llLimitCpuTime += GetThreadTime(); ULONG cwcSoFar = 0; int cChunk = 0; BOOL fSeenProp = FALSE; STAT_CHUNK statChunk; SCODE sc = _xFilter->GetChunk ( &statChunk ); // // Take them into account at some point // to test more complicated chunking // // // keep getting chunks of the file, placing them in the buffer, // and setting the chunk offset markers that will be used to // interpolate the buffer // while ( SUCCEEDED(sc) || FILTER_E_LINK_UNAVAILABLE == sc || FILTER_E_EMBEDDING_UNAVAILABLE == sc || FILTER_E_NO_TEXT == sc ) { // // Eliminate all chunks with idChunkSource 0 right here - these // cannot be hit highlighted. // Also eliminate all CHUNK_VALUE chunks. // if ( SUCCEEDED( sc ) && (statChunk.flags & CHUNK_TEXT) && (0 != statChunk.idChunkSource) ) { // // set markers // Win4Assert ( cChunk == 0 || statChunk.idChunk > _chunk [cChunk - 1].ChunkId() ); // // If there was an end of sentence or paragraph or chapter, we // should introduce an appropriate spacing character. 
// if ( statChunk.breakType != CHUNK_NO_BREAK && cwcSoFar < _xBuffer.Count() ) { switch (statChunk.breakType) { case CHUNK_EOW: case CHUNK_EOS: _xBuffer[cwcSoFar++] = L' '; // introduce a space character break; case CHUNK_EOP: case CHUNK_EOC: _xBuffer[cwcSoFar++] = UNICODE_PARAGRAPH_SEPARATOR; break; } } // // The Offset into the stream depends on whether this is an // 'original' chunk or not // CCiPropSpec* pProp = (CCiPropSpec*) &statChunk.attribute; webDebugOut(( DEB_ITRACE, "Chunk %d, Source %d, Contents %d, start %d, cwc %d\n", statChunk.idChunk, statChunk.idChunkSource, pProp->IsContents(), statChunk.cwcStartSource, statChunk.cwcLenSource )); if ( (statChunk.idChunk == statChunk.idChunkSource) && pProp->IsContents() ) { _chunk[cChunk].SetChunkId( statChunk.idChunk ); _chunk[cChunk].SetOffset( cwcSoFar ); cChunk++; #if 0 } else if ( statChunk.idChunk != statChunk.idChunkSource ) { _chunk [cChunk].SetChunkId (statChunk.idChunk); // // we have to first find the offset of the source chunk // for (int i=cChunk-1;i>=0;i--) { if (_chunk[i].ChunkId() == statChunk.idChunkSource) { _chunk[cChunk].SetOffset(_chunk[i].Offset()+statChunk.cwcStartSource); break; } } cChunk++; } // // if the chunk is a contents chunk and idChunkSrc = idChunk, // then pull it in // if ( (statChunk.idChunk == statChunk.idChunkSource) && pProp->IsContents() ) { #endif webDebugOut(( DEB_ITRACE, "CDOC: markers: chunk %d offset %d\n", _chunk[cChunk-1].ChunkId(), _chunk[cChunk-1].Offset() )); // // push the text into memory // do { ULONG cwcThis = _xBuffer.Count() - cwcSoFar; if ( 0 == cwcThis ) break; sc = _xFilter->GetText( &cwcThis, _xBuffer.GetPointer() + cwcSoFar ); if (SUCCEEDED(sc)) { cwcSoFar += cwcThis; } } while (SUCCEEDED(sc)); } } // If SUCCEEDED( sc ) if ( GetThreadTime() > llLimitCpuTime ) { webDebugOut(( DEB_ERROR, "Webhits took too long. 
Timeout\n" )); THROW( CException( MSG_WEBHITS_TIMEOUT ) ); } // // next chunk, please // sc = _xFilter->GetChunk ( &statChunk ); } _bufEnd = _xBuffer.GetPointer() + cwcSoFar; _chunkCount = cChunk; } //ReadFile WCHAR* CDocument::GetWritablePointerToOffset( long offset ) { if (offset >= 0) { if (_xBuffer.GetPointer() + offset < _bufEnd) return _xBuffer.GetPointer() + offset; else return _bufEnd; } else { return _xBuffer.GetPointer(); } } //GetWritablePointerToOffset //+------------------------------------------------------------------------- // // Member: CDocument::GetPointerToOffset, public // // Arguments: [offset] - the offset in the stream that we want a pointer to // // Synopsis: Return a constant pointer to a specific offset in the buffer // //-------------------------------------------------------------------------- const WCHAR* CDocument::GetPointerToOffset(long offset) { return (const WCHAR *) GetWritablePointerToOffset(offset); } //GetPointerToOffset