/*++

© 1998 Seagate Software, Inc.  All rights reserved

Module Name:

    Wsbhash.cpp

Abstract:

    Some functions for hashing text strings and creating DB keys from
    file path names.

    NOTE: Since no one needed this code by the time I got it done,
    it hasn't been tested!

Author:

    Ron White   [ronw]   25-Apr-1997

Revision History:

--*/

#include "stdafx.h"

// This pseudorandom permutation table (used by the SimpleHash function below)
// is taken from the article referenced in the comments for that function.
// It has 256 entries because SimpleHash indexes it with (hash ^ byte), which
// is always in the range 0..255.  The table is only ever read, so it is
// declared const to keep it from being modified by accident.
static const UCHAR perm_table[] = {
      1,  87,  49,  12, 176, 178, 102, 166, 121, 193,   6,  84, 249, 230,  44, 163,
     14, 197, 213, 181, 161,  85, 218,  80,  64, 239,  24, 226, 236, 142,  38, 200,
    110, 177, 104, 103, 141, 253, 255,  50,  77, 101,  81,  18,  45,  96,  31, 222,
     25, 107, 190,  70,  86, 237, 240,  34,  72, 242,  20, 214, 244, 227, 149, 235,
     97, 234,  57,  22,  60, 250,  82, 175, 208,   5, 127, 199, 111,  62, 135, 248,
    174, 169, 211,  58,  66, 154, 106, 195, 245, 171,  17, 187, 182, 179,   0, 243,
    132,  56, 148,  75, 128, 133, 158, 100, 130, 126,  91,  13, 153, 246, 216, 219,
    119,  68, 223,  78,  83,  88, 201,  99, 122,  11,  92,  32, 136, 114,  52,  10,
    138,  30,  48, 183, 156,  35,  61,  26, 143,  74, 251,  94, 129, 162,  63, 152,
    170,   7, 115, 167, 241, 206,   3, 150,  55,  59, 151, 220,  90,  53,  23, 131,
    125, 173,  15, 238,  79,  95,  89,  16, 105, 137, 225, 224, 217, 160,  37, 123,
    118,  73,   2, 157,  46, 116,   9, 145, 134, 228, 207, 212, 202, 215,  69, 229,
     27, 188,  67, 124, 168, 252,  42,   4,  29, 108,  21, 247,  19, 205,  39, 203,
    233,  40, 186, 147, 198, 192, 155,  33, 164, 191,  98, 204, 165, 180, 117,  76,
    140,  36, 210, 172,  41,  54, 159,   8, 185, 232, 113, 196, 231,  47, 146, 120,
     51,  65,  28, 144, 254, 221,  93, 189, 194, 139, 112,  43,  71, 109, 184, 209
};

// Local functions
static HRESULT ProgressiveHash(WCHAR* pWstring, ULONG nChars, UCHAR* pKey,
                               ULONG keySize, ULONG* pKeyCount);
static UCHAR SimpleHash(UCHAR* pString, ULONG count);


// ProgressiveHash - hash a wide-character string into a byte key of a given
// maximum size.
The string is limited to 32K characters (64K bytes) and the // key size must be at least 16. // // The algorithm starts out merely XORing the two bytes of each character into a // single byte in the key. If it must use the last 15 bytes of the key, it begins // using the SimpleHash function to hash progressively larger (doubling) chuncks // of the string into a single byte. // // This method is used to try and preserve as much information about short strings // as possible; to preserve, to some extent, the sort order of strings; and to // compress long strings into a reasonably sized key. It is assumed (perhaps // incorrectly) that many of the characters will be ANSI characters an so the // XOR of the bytes in the initial part of the string won't lose any information. static HRESULT ProgressiveHash(WCHAR* pWstring, ULONG nChars, UCHAR* pKey, ULONG keySize, ULONG* pKeyCount) { HRESULT hr = S_OK; try { ULONG chunk; // Current chunk size ULONG headSize; ULONG keyIndex = 0; // Current index into the key UCHAR* pBytes; // Byte pointer into the string ULONG remains; // Bytes remaining in the string // Check arguments WsbAffirm(NULL != pWstring, E_POINTER); WsbAffirm(NULL != pKey, E_POINTER); remains = nChars * 2; WsbAffirm(65536 >= remains, E_INVALIDARG); WsbAffirm(15 < keySize, E_INVALIDARG); // Do the non-progressive part pBytes = (UCHAR*)pWstring; headSize = keySize - 15; while (remains > 0 && keyIndex < headSize) { pKey[keyIndex++] = (UCHAR) ( *pBytes ^ *(pBytes + 1) ); pBytes += 2; remains -= 2; } // Do the progressive part chunk = 4; while (remains > 0) { if (chunk > remains) { chunk = remains; } pKey[keyIndex++] = SimpleHash(pBytes, chunk); pBytes += chunk; remains -= chunk; chunk *= 2; } if (NULL != pKeyCount) { *pKeyCount = keyIndex; } } WsbCatch(hr); return(hr); } // SimpleHash - hash a string of bytes into a single byte. 
// // This algorithm and the permutation table come from the article "Fast Hashing // of Variable-Length Text Strings" in the June 1990 (33, 6) issue of Communications // of the ACM (CACM). // NOTE: For a hash value larger than one byte, the article suggests hashing the // original string with this function to get one byte, adding 1 (mod 256) to the // first byte of the string and hashing the new string with this function to get // the second byte, etc. static UCHAR SimpleHash(UCHAR* pString, ULONG count) { int h = 0; for (ULONG i = 0; i < count; i++) { h = perm_table[h ^ pString[i]]; } return((UCHAR)h); } // SquashFilepath - compress a file path name into a (possibly) shorter key. // // This function splits the key into a path part (about 3/4 of the initial // bytes of the key) and a file name part (the rest of the key). For each // part it uses the ProgressiveHash function to compress the substring. // This function attempts to preserve enough information in the key that keys // will be sorted in approximately the same order as the original path names // and it is unlikely (though not impossible) that two different paths would // result in the same key. Both of these are dependent on the size of the key. // A reasonable size is probably 128 bytes, which gives 96 bytes for the path // and 32 bytes for the file name. A key size of 64 or less will fail because // the file name part will be too small for the Progressive Hash function. 
//
// Parameters:
//   pWstring - NUL-terminated path name to compress.
//   pKey     - Buffer that receives the key; all keySize bytes are written
//              on success.
//   keySize  - Size of pKey in bytes; must be greater than 60.
//
// Returns:
//   S_OK on success.  The checks below report E_POINTER for a NULL pWstring
//   or pKey, and E_INVALIDARG for a keySize that is too small or for a path
//   or file name longer than 32768 characters (the ProgressiveHash limit).
//   A failure from ProgressiveHash is passed on through WsbAffirmHr.  On
//   failure the key may have been partially written.
//   NOTE(review): WsbAffirm/WsbAffirmHr/WsbCatch are project macros not
//   visible here; presumably they throw the code and WsbCatch stores it in hr.

HRESULT SquashFilepath(WCHAR* pWstring, UCHAR* pKey, ULONG keySize)
{
    HRESULT hr = S_OK;

    try {
        ULONG   keyIndex;
        size_t  length;         // Substring length at full width
        ULONG   nChars;         // Same length once known to fit in a ULONG
        ULONG   nameKeyCount;   // Key bytes produced for the file name
        WCHAR*  pFilename;
        ULONG   pathKeySize;

        // Check arguments
        WsbAffirm(NULL != pWstring, E_POINTER);
        WsbAffirm(NULL != pKey, E_POINTER);
        WsbAffirm(60 < keySize, E_INVALIDARG);

        // Split the string at the last backslash
        pFilename = wcsrchr(pWstring, WCHAR('\\'));
        if (NULL == pFilename) {
            length = 0;
            pFilename = pWstring;
        } else {
            length = (size_t)(pFilename - pWstring);
            pFilename++;
        }

        // Range-check before narrowing to ULONG, so that an over-long path
        // cannot wrap to a small value and be silently accepted.
        WsbAffirm(32768 >= length, E_INVALIDARG);
        nChars = (ULONG)length;
        pathKeySize = (keySize / 4) * 3;

        // Compress the path
        if (0 < nChars) {
            WsbAffirmHr(ProgressiveHash(pWstring, nChars, pKey, pathKeySize,
                    &keyIndex));
        } else {
            keyIndex = 0;
        }

        // Fill the rest of the path part of the key with zeros
        for ( ; keyIndex < pathKeySize; keyIndex++) {
            pKey[keyIndex] = 0;
        }

        // Compress the file name (same range check as for the path)
        length = wcslen(pFilename);
        WsbAffirm(32768 >= length, E_INVALIDARG);
        nChars = (ULONG)length;
        if (0 < nChars) {
            // keyIndex equals pathKeySize here, so the file name part
            // starts at pKey + pathKeySize.
            WsbAffirmHr(ProgressiveHash(pFilename, nChars, pKey + pathKeySize,
                    keySize - pathKeySize, &nameKeyCount));
            keyIndex = pathKeySize + nameKeyCount;
        }

        // Fill the rest of the file name part of the key with zeros
        for ( ; keyIndex < keySize; keyIndex++) {
            pKey[keyIndex] = 0;
        }
    } WsbCatch(hr);

    return(hr);
}