windows-nt/Source/XPSP1/NT/inetsrv/query/bigtable/colhash.cxx

//+-------------------------------------------------------------------------
//
//  Microsoft Windows
//  Copyright (C) Microsoft Corporation, 1994 - 2000.
//
//  File:       colhash.cxx
//
//  Contents:   Hash table compressions for large tables.
//
//  Classes:    CCompressedColHash
//
//  Functions:  GuidHash - Hash function for GUIDs
//
//  History:    13 Apr 1994     AlanW    Created
//
//--------------------------------------------------------------------------

#include "pch.cxx"
#pragma hdrstop

#include <objcur.hxx>
#include <tblvarnt.hxx>

#include "tabledbg.hxx"
#include "colcompr.hxx"


// Upper bound on the number of hash buckets.  _NextHashSize clamps its
// result to this value, and the bucket array is no longer enlarged once
// _cHashEntries has reached it.
const USHORT MAX_HASH_TABLE_SIZE = 32767;     // Maximum hash table size

//+-------------------------------------------------------------------------
//
//  Function:   GuidHash, public
//
//  Synopsis:   Hash a GUID value for use in a hash table.
//
//  Arguments:  [pbData] - pointer to the value to be hashed.
//              [cbData] - should be sizeof (GUID), unused
//
//  Returns:    ULONG - Hash value for the input GUID
//
//  Notes:      The hash function just xors a few selected fields out
//              of the GUID structure.  It is intended to work well for
//              both generated GUIDs (from UuidCreate) and administratively
//              assigned GUIDs like OLE IIDs and CLSIDs.
//
//--------------------------------------------------------------------------

ULONG GuidHash(
    BYTE *pbData,
    USHORT cbData
) {
    //  cbData is not consulted; the input is always treated as a GUID,
    //  which may sit at an arbitrary (unaligned) address.
    UNALIGNED GUID *pGuid = (GUID *)pbData;

    //  Start from the first 32 bits and fold in three bytes of Data4,
    //  each shifted to a different byte position.
    ULONG ulHash = pGuid->Data1;
    ulHash ^= (pGuid->Data4[0]<<16);
    ulHash ^= (pGuid->Data4[6]<<8);
    ulHash ^= (pGuid->Data4[7]);

    return ulHash;
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::DefaultHash, public static
//
//  Synopsis:   Generic hash function
//
//  Arguments:  [pbData] - pointer to the value to be hashed.
//              [cbData] - size of pbData
//
//  Returns:    ULONG - Hash value for the input data
//
//--------------------------------------------------------------------------

//static
ULONG CCompressedColHash::DefaultHash(
    BYTE *pbData,
    USHORT cbData
) {
    //  The length seeds the hash; each byte is then folded in with a
    //  shift-left-by-one and xor.
    ULONG ulHash = cbData;

    for (BYTE *pbEnd = pbData + cbData; pbData != pbEnd; pbData++)
        ulHash = (ulHash<<1) ^ *pbData;

    return ulHash;
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::CCompressedColHash, public
//
//  Synopsis:   Constructor for a hash compressed column.
//
//  Arguments:  [vtData] - type of each data item
//              [cbDataWidth] - size of each data item
//              [pfnHashFunction] - pointer to hash function
//
//  Returns:    Nothing.  No memory is allocated here; the hash table is
//              allocated on the first call to _AddData.
//
//  Notes:
//
//--------------------------------------------------------------------------


CCompressedColHash::CCompressedColHash(
    VARTYPE     vtData,
    USHORT      cbDataWidth,
    PFNHASH     pfnHashFunction) :
        CCompressedCol(
            vtData,                     // DataType
            sizeof (HASHKEY),           // _cbKeyWidth
            CCompressedCol::FixedHash   // _CompressionType
        ),

        _cbDataWidth(cbDataWidth),
        _pfnHash(pfnHashFunction),
        _pHashTable(NULL), _cHashEntries(0),
        _pDataItems(NULL), _cDataItems(0),
        _fGrowthInProgress(FALSE),
        _pData(NULL), _cbData(0),
        _ulMemCounter(0)
{
    //
    //  The table itself is allocated lazily by _GrowHashTable.  Give the
    //  chain-length statistic a defined value now rather than relying on
    //  that first allocation to set it.  Done in the body so the order of
    //  the member initializers above is left untouched.
    //
    _maxChain = 0;
}


CCompressedColHash::~CCompressedColHash( )
{
    //  The hash buckets and the data items share a single allocation
    //  (_pData), so one deallocation releases both.
    if (NULL != _pData)
    {
        TblPageDealloc(_pData, _ulMemCounter);
        _cbData = 0;
        _pData = NULL;
    }

    //  Everything charged to the memory counter should now be released.
    Win4Assert(0 == _ulMemCounter);
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::AddData, public
//
//  Synopsis:   Add a data entry to the hash table if it is not
//              already there.
//
//  Arguments:  [pVarnt] - pointer to data item
//              [pKey] - pointer to lookup key value
//              [reIndicator] - returns an indicator variable for
//                      problems
//
//  Returns:    pKey is filled in with the index of the data item in
//              the data array.  reIndicator is filled with an indication
//              of problems.
//
//  Notes:
//
//--------------------------------------------------------------------------

VOID    CCompressedColHash::AddData(
    PROPVARIANT const * const pVarnt,
    ULONG* pKey,
    GetValueResult& reIndicator
) {
    //
    //  An empty value is never stored in the table; it is reported with
    //  key 0 (keys of stored items start at 1).
    //
    if (VT_EMPTY == pVarnt->vt) {
        *pKey = 0;
        reIndicator = GVRSuccess;
        return;
    }

    Win4Assert(pVarnt->vt == DataType);

    CTableVariant *pTblVar = (CTableVariant *)pVarnt;
    const USHORT cbValue = (USHORT) pTblVar->VarDataSize();

    //  Only fixed-width values of exactly _cbDataWidth bytes are expected.
    Win4Assert(cbValue && cbValue == _cbDataWidth);

    //
    //  Locate the value's bytes: the variant carries the data pointer
    //  either in its first word or in its second word.
    //
    BYTE *pbValue;
    if (!pTblVar->VariantPointerInFirstWord( )) {
        Win4Assert(pTblVar->VariantPointerInSecondWord( ));
        pbValue = (BYTE *) pTblVar->blob.pBlobData;
    } else {
        pbValue = (BYTE *) pTblVar->pszVal;
    }

    _AddData(pbValue, cbValue, pKey);
    reIndicator = GVRSuccess;
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::_AddData, protected
//
//  Synopsis:   Helper for the public AddData method.  Adds
//              a data entry to the hash table (if it does not already
//              exist).
//
//  Arguments:  [pbData] - pointer to data item
//              [cbDataSize] - size of data item
//              [pKey] - pointer to lookup key value
//
//  Returns:    pKey is filled in with the index of the data item in
//              the data array.
//
//  Notes:
//
//--------------------------------------------------------------------------

VOID    CCompressedColHash::_AddData(
    BYTE *pbData,
    USHORT cbDataSize,
    ULONG* pKey
) {
    Win4Assert(cbDataSize == _cbDataWidth);

    //  First use: allocate the initial hash buckets and data area.
    if (NULL == _pData)
        _GrowHashTable();

    const ULONG iBucket = _pfnHash(pbData, cbDataSize) % _cHashEntries;

    //
    //  Walk the bucket's chain.  pLink always points at the HASHKEY that
    //  names the next item: first the bucket slot, then the link field at
    //  the front of each item visited.  A value of 0 terminates the chain.
    //
    HASHKEY* pLink = &_pHashTable[iBucket];
    USHORT cLinks = 0;

    for ( ; 0 != *pLink; cLinks++) {
        HASHKEY* pItem = _IndexHashkey( *pLink );

        if (0 == memcmp((BYTE *) (pItem + 1), pbData, cbDataSize)) {
            //
            //  The value is already in the table.  Return its index.
            //
            *pKey = *pLink;
            return;
        }
        pLink = pItem;
    }

    //  Remember the longest chain seen; _GrowHashTable uses it to decide
    //  whether the buckets need to be enlarged.
    if (_maxChain < cLinks)
        _maxChain = cLinks;

    //
    //  The value is new.  Work out where its item would be stored and
    //  whether the table must be grown first.
    //
    const size_t cbItem = sizeof (HASHKEY) + _cbDataWidth;
    HASHKEY* pNewItem = (HASHKEY *) ((BYTE *)_pDataItems + _cDataItems * cbItem);

    const BOOL fNoRoom =
        ((BYTE*)pNewItem + cbItem - (BYTE *)_pData) > (int) _cbData;

    const BOOL fChainsTooLong =
        _cDataItems > (ULONG) ( _cHashEntries * 3 ) &&
        _cHashEntries < MAX_HASH_TABLE_SIZE &&
        !_fGrowthInProgress;

    if (fNoRoom || fChainsTooLong) {
        //
        //  Growing may move the buffer and may rehash every item, so
        //  nothing found by the lookup above can be trusted afterwards.
        //  Grow, then start over.
        //
        _GrowHashTable();
        _AddData(pbData, cbDataSize, pKey);
        return;
    }

    //
    //  Append the item: a HASHKEY chain link (0, since the new item is
    //  the chain's tail) followed by the fixed-size value.  The item's
    //  key is its 1-based position in the data area.
    //
    *pLink = ++_cDataItems;
    *pKey = *pLink;
    Win4Assert(_cDataItems != 0);               // check for overflow

    *pNewItem = 0;
    RtlCopyMemory((BYTE *)(pNewItem + 1), pbData, _cbDataWidth);
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::_Rehash, protected
//
//  Synopsis:   Helper function for the _GrowHashTable method.
//              reinserts an existing item into the hash table.
//
//  Arguments:  [kData] - 1-based index of the data item in the table
//              [pbData] - pointer to the data item's value
//
//  Returns:    Nothing
//
//  Notes:
//
//--------------------------------------------------------------------------

VOID    CCompressedColHash::_Rehash(
    HASHKEY kData,
    BYTE *pbData
) {
    Win4Assert(_pData != NULL && kData > 0 && kData <= _cDataItems);

    const ULONG iBucket = _pfnHash(pbData, _cbDataWidth) % _cHashEntries;

    //
    //  Find the end of the bucket's chain, counting the items on it.
    //  No comparison is made against the items already on the chain.
    //
    HASHKEY* pLink = &_pHashTable[iBucket];
    USHORT cLinks = 0;

    for ( ; *pLink != 0; pLink = _IndexHashkey( *pLink ))
        cLinks++;

    if (_maxChain < cLinks)
        _maxChain = cLinks;

    //
    //  Hook the item onto the end of the chain and mark it as the new
    //  tail.  The item itself stays where it is in the data area.
    //
    HASHKEY* pItem = _IndexHashkey( kData );

    *pLink = kData;
    *pItem = 0;

    //  The value must immediately follow the item's link field.
    Win4Assert((BYTE*)(pItem + 1) == pbData);
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::GetData, public
//
//  Synopsis:   Retrieve a value from the hash table.
//
//  Arguments:  [pVarnt] - pointer to variant in which to return the data
//              [PreferredType] - Preferred data type
//              [ulKey] - the lookup key value
//              [PropId] - (unused) property id being retrieved.
//
//  Returns:    pVarnt is filled with the result of the lookup.
//
//  Notes:      The PreferredType expresses the caller's preference only.
//              This method is free to return whatever type is most
//              convenient.
//
//              The returned data does not conform to any alignment
//              restrictions on the data.
//
//--------------------------------------------------------------------------

GetValueResult  CCompressedColHash::GetData(
    PROPVARIANT * pVarnt,
    VARTYPE PreferredType,
    ULONG ulKey,
    PROPID PropId
) {
    Win4Assert(PreferredType == DataType && ulKey >= 1 && ulKey <= _cDataItems);

    //
    //  Keys are 1-based item indexes.  Anything outside 1.._cDataItems
    //  (including 0) has no stored value.
    //
    if (ulKey < 1 || ulKey > _cDataItems) {
        pVarnt->vt = VT_EMPTY;
        return GVRNotAvailable;
    }

    //  Set the type first; the pointer-placement queries below are made
    //  on the variant after its type has been filled in.
    pVarnt->vt = DataType;

    //
    //  Each item is a HASHKEY chain link followed by the value; step over
    //  the preceding items and then over this item's link.
    //
    const size_t cbItem = sizeof (HASHKEY) + _cbDataWidth;
    BYTE *pbValue = (BYTE *)_pDataItems + (ulKey-1) * cbItem + sizeof (HASHKEY);

    //
    //  Return a pointer directly into the table's data area; the value
    //  is not copied.
    //
    CTableVariant *pTblVar = (CTableVariant *)pVarnt;
    if (pTblVar->VariantPointerInFirstWord( )) {
        pTblVar->pszVal = (CHAR*)pbValue;
    } else {
        Win4Assert(pTblVar->VariantPointerInSecondWord( ));
        pTblVar->blob.pBlobData = pbValue;
    }

    return GVRSuccess;
}

void    CCompressedColHash::FreeVariant(PROPVARIANT * pvarnt)
{
    //  Intentionally empty: GetData returns pointers into the table's own
    //  data area, so there is nothing here to release.
}


//+-------------------------------------------------------------------------
//
//  Method:     CCompressedColHash::_GrowHashTable, protected
//
//  Synopsis:   Grow the space allocated to the hash table and data
//              items.
//
//  Arguments:  - none -
//
//  Returns:    Nothing
//
//  Notes:      Also called to allocate the initial data area.
//
//              The number of hash buckets starts out at a low
//              number, then is increased as the amount of data
//              grows.  Data items must be rehashed when this occurs.
//              Since items are identified by their offset in the
//              data array, this must not change while rehashing.
//
//--------------------------------------------------------------------------

// Number of hash buckets used when the table is first allocated
// (_GrowHashTable called while _pData is still NULL).
const unsigned MIN_HASH_TABLE_SIZE = 11;        // Minimum hash table size


//
//  Compute the next hash bucket count: repeatedly apply 2n+1 to the
//  current count until it is at least the number of items, then clamp
//  the result to MAX_HASH_TABLE_SIZE.
//
//  [cItems] - number of data items the table must accommodate
//  [cHash]  - current number of hash buckets
//
inline USHORT CCompressedColHash::_NextHashSize(
    HASHKEY cItems,
    USHORT cHash
) {
    //
    //  Do the doubling in a ULONG: in a USHORT, 2n+1 wraps once n reaches
    //  32768, which could keep the loop from ever reaching a large item
    //  count.  Stopping at the maximum is equivalent because the result
    //  is clamped to it anyway.
    //
    ULONG cNext = cHash;

    do {
        cNext = cNext*2 + 1;
    } while (cNext < cItems && cNext < MAX_HASH_TABLE_SIZE);

    return  (cNext < MAX_HASH_TABLE_SIZE) ? (USHORT) cNext : MAX_HASH_TABLE_SIZE;
}


VOID CCompressedColHash::_GrowHashTable( void )
{
    ULONG cbSize;
    int fRehash = FALSE;

    //
    //  Number of hash buckets in effect once this call completes.  It
    //  starts out as the current count so that it is well defined, and
    //  the size computation below is right, when only the data area grows.
    //
    USHORT cNewHashEntries = _cHashEntries;

    Win4Assert(!_fGrowthInProgress &&
             "Recursive call to CCompressedColHash::_GrowHashTable");

    _fGrowthInProgress = TRUE;
    if (_pData == NULL) {
        //  Initial allocation: start with the minimum number of buckets.
        cNewHashEntries = MIN_HASH_TABLE_SIZE;
    } else if (_cHashEntries < MAX_HASH_TABLE_SIZE &&
               (_cDataItems > (ULONG) _cHashEntries*2 ||
                (_cDataItems > _cHashEntries && _maxChain > 3))) {
        //
        //  More than two items per bucket on average, or more items than
        //  buckets together with an observed chain longer than 3: enlarge
        //  the bucket array and rehash all of the items.
        //
        cNewHashEntries = _NextHashSize(_cDataItems, _cHashEntries);
        fRehash = TRUE;
        tbDebugOut((DEB_ITRACE, "Growing hash table, old,new sizes = %d,%d\n",
                                        _cHashEntries, cNewHashEntries));
    }

    //
    //  Compute the required size of the hash table and data.  The bucket
    //  count used must be the one the new buffer will hold: the data items
    //  are placed directly after the buckets, so sizing for the old count
    //  while enlarging the bucket array could push the items past the end
    //  of the allocation.  Room for four additional items is included.
    //
    cbSize = cNewHashEntries * sizeof(HASHKEY);
    cbSize += (_cDataItems + 4) * (_cbDataWidth + sizeof (HASHKEY));
    cbSize = TblPageGrowSize(cbSize, TRUE);
    Win4Assert(cbSize > _cbData || (fRehash && cbSize == _cbData));

    BYTE *pbNewData;

    if (_pData && cbSize < TBL_PAGE_MAX_SEGMENT_SIZE) {
        pbNewData = (BYTE *)
            TblPageRealloc(_pData, _ulMemCounter, cbSize, 0);
    } else {
        pbNewData =
            (BYTE *)TblPageAlloc(cbSize, _ulMemCounter, TBL_SIG_COMPRESSED);
    }

    tbDebugOut((DEB_ITRACE, "New hash table at = %x\n", pbNewData));

    if (_pData != NULL && !fRehash) {
        //
        //  Only the data area grows.  The layout is unchanged, so if the
        //  buffer moved the old contents can be copied over verbatim.
        //
        if (_pData != pbNewData) {
            RtlCopyMemory(pbNewData, _pData, _cbData);
            TblPageDealloc(_pData, _ulMemCounter, _cbData);
            _pData = pbNewData;
        }
        _cbData = cbSize;
        _pHashTable = (HASHKEY *) _pData;
        _pDataItems = (BYTE *) (_pHashTable + _cHashEntries);
    } else {
        //
        //  Initial allocation, or the bucket array is being enlarged.
        //  Remember the old layout before switching to the new one.
        //
        BYTE *pOldDataItems = _pDataItems;
        VOID *pOldData = _pData;
        ULONG cbOldSize = _cbData;

        _pData = pbNewData;
        _cbData = cbSize;
        _pHashTable = (HASHKEY *)_pData;
        _cHashEntries = cNewHashEntries;
        _pDataItems = (BYTE *) (_pHashTable + _cHashEntries);

        //
        //  Slide the existing items to their place after the larger
        //  bucket array.  Source and destination may overlap when the
        //  buffer did not move, hence RtlMoveMemory.  The items keep
        //  their order, so their keys remain valid.
        //
        if (pOldData != NULL)
            RtlMoveMemory(_pDataItems,
                          pOldDataItems,
                          _cDataItems * (sizeof (HASHKEY) + _cbDataWidth));
        RtlZeroMemory(_pHashTable, cNewHashEntries * sizeof (HASHKEY));
        _maxChain = 0;

        //
        //  Now re-add all old data items to the hash table.
        //
        pOldDataItems = _pDataItems;
        for (HASHKEY i=1; i<=_cDataItems; i++) {
            pOldDataItems += sizeof (HASHKEY);  // skip hash chain
            _Rehash(i, pOldDataItems);
            pOldDataItems += _cbDataWidth;      // skip data item
        }
        if (pOldData != NULL && pOldData != _pData)
            TblPageDealloc(pOldData, _ulMemCounter, cbOldSize);
    }

    _fGrowthInProgress = FALSE;
    return;
}

//+---------------------------------------------------------------------------
//
//  Function:   _ClearAll
//
//  Synopsis:   Method clears all the data in the "fixed width" part of the
//              memory buffer.
//
//  Arguments:  (none)
//
//  History:    12-16-94   srikants   Created
//
//  Notes:
//
//----------------------------------------------------------------------------

void CCompressedColHash::_ClearAll()
{
    RtlZeroMemory(_pHashTable, _cHashEntries * sizeof (HASHKEY));
    RtlZeroMemory(_pDataItems, _cDataItems * _cbDataWidth );
    _cDataItems = 0;
}