/******************************************************************************* * StringBlob.h * *--------------* * Description: * This is the header file for the CStringBlob class used internally by SAPI. * * Copyright 1998-2000 Microsoft Corporation All Rights Reserved. * *******************************************************************************/ #ifndef _STRINGBLOB_H_ #define _STRINGBLOB_H_ 1 #ifndef SPDebug_h #include #endif #include template class CStringBlobT { XCHAR * m_pData; // List of words, end-to-end ULONG m_cchAllocated; // Size of m_pData ULONG * m_aichWords; // Word index => offset in m_pData [1] is index of start of second word ULONG m_cwords; // Number of words ULONG m_cwordsAllocated; // Size of m_aichWords ULONG * m_aulBuckets; // Hash table containing indices of words or 0 for empty buckets ULONG m_cBuckets; // Number of buckets in hash table public: CStringBlobT() { m_pData = NULL; m_cchAllocated = 0; m_aichWords = NULL; m_cwords = 0; m_cwordsAllocated = 0; m_aulBuckets = NULL; m_cBuckets = 0; } ~CStringBlobT() { Clear(); } void Detach(XCHAR **ppszWordList, ULONG *pulSize) { *ppszWordList = NULL; if (m_pData) { ULONG cchDesired = StringSize(); ULONG cbSize = SerializeSize(); // byte count, ULONG multiple *ppszWordList = (XCHAR*)::CoTaskMemRealloc(m_pData, cbSize); if (*ppszWordList == NULL) { *ppszWordList = m_pData; cbSize = m_cchAllocated * sizeof(XCHAR); } m_pData = NULL; Clear(); if (pulSize) { *pulSize = cbSize; } } } void Clear() { if (m_pData) { ::CoTaskMemFree(m_pData); m_pData = NULL; } m_cchAllocated = 0; free(m_aichWords); m_aichWords = NULL; m_cwordsAllocated = 0; m_cwords = 0; free(m_aulBuckets); m_aulBuckets = NULL; m_cBuckets = 0; } HRESULT InitFrom(const XCHAR * pszStringArray, ULONG cch) { SPDBG_ASSERT(m_pData == NULL); if (cch) { ULONG cbSize = (cch * sizeof(XCHAR) + 3) & ~3; m_pData = (XCHAR *)::CoTaskMemAlloc(cbSize); if (m_pData == NULL) return E_OUTOFMEMORY; m_cchAllocated = cch; SPDBG_ASSERT(pszStringArray[0] == 0); // First string is always empty. // First pass to copy data and count strings. const XCHAR * pszPastEnd = pszStringArray + cch; const XCHAR * psz = pszStringArray; XCHAR * pszOut = m_pData; ULONG cwords = 0; while (psz < pszPastEnd) { if ((*pszOut++ = *psz++) == 0) ++cwords; } m_aichWords = (ULONG *) malloc(sizeof(ULONG) * cwords); if (m_aichWords == NULL) return E_OUTOFMEMORY; m_cwordsAllocated = cwords; m_cwords = cwords - 1; // Doesn't count leading 0 HRESULT hr = SetHashSize(cwords * 2 + 1); if (FAILED(hr)) return hr; // Second pass to fill in indices and hash table. psz = pszStringArray + 1; const WCHAR * pszWordStart = psz; ULONG ulID = 1; m_aichWords[0] = 1; while (psz < pszPastEnd) { if (*(psz++) == 0) { SPDBG_ASSERT(ulID < m_cwordsAllocated); m_aichWords[ulID] = (ULONG)(psz - pszStringArray); // can't have more than 4 million chars! m_aulBuckets[FindIndex(pszWordStart)] = ulID; pszWordStart = psz; ++ulID; } } } return S_OK; } ULONG HashKey(const XCHAR * pszString, ULONG * pcchIncNull = NULL) { ULONG hash = 0; ULONG cchIncNull = 1; // one for the NULL for (const XCHAR * pch = pszString; *pch; ++pch, ++cchIncNull) hash = hash * 65599 + *pch; if (pcchIncNull) *pcchIncNull = cchIncNull; return hash; } // find index for string -- returns 0 if not found ULONG FindIndex(const XCHAR * psz) { SPDBG_ASSERT(psz); ULONG cchIncNull; ULONG start = HashKey(psz, &cchIncNull) % m_cBuckets; ULONG index = start; do { // Not in table; return index where it should be placed. if (m_aulBuckets[index] == 0) return index; // Compare length and if it matches compare full string. if (m_aichWords[m_aulBuckets[index]] - m_aichWords[m_aulBuckets[index] - 1] == cchIncNull && IsEqual(m_aichWords[m_aulBuckets[index] - 1], psz)) { // Found this word already in the table. return index; } if (++index >= m_cBuckets) index -= m_cBuckets; } while (index != start); SPDBG_ASSERT(m_cwords == m_cBuckets); // Shouldn't ever get here return (ULONG) -1; } // Returns ID; use IndexFromId to recover string offset ULONG Find(const XCHAR * psz) { if (psz == NULL || m_cwords == 0) return 0; // Should always succeed in finding a bucket, since hash table is >2x larger than # of elements. ULONG ibucket = FindIndex(psz); return m_aulBuckets[ibucket]; // May be 0 if not in table } ULONG primeNext(ULONG val) { if (val < 2) val = 2; /* the smallest prime number */ for (;;) { /* Is val a prime number? */ ULONG maxFactor = (ULONG) sqrt ((double) val); /* Is i a factor of val? */ for (ULONG i = 2; i <= maxFactor; i++) if (val % i == 0) break; if (i > maxFactor) return (val); val++; } } HRESULT SetHashSize(ULONG cbuckets) { if (cbuckets > m_cBuckets) { ULONG * oldtable = m_aulBuckets; ULONG oldentry = m_cBuckets; ULONG prime = primeNext(cbuckets); // Alloc new table. m_aulBuckets = (ULONG *) malloc(prime * sizeof(ULONG)); if (m_aulBuckets == NULL) { m_aulBuckets = oldtable; return E_OUTOFMEMORY; } for (ULONG i=0; i < prime; i++) { m_aulBuckets[i] = 0; } m_cBuckets = prime; for (i = 0; i < oldentry; i++) { if (oldtable[i] != 0) { ULONG ibucket = FindIndex(m_pData + m_aichWords[oldtable[i] - 1]); m_aulBuckets[ibucket] = oldtable[i]; } } free(oldtable); } return S_OK; } // // The ID for a NULL string is always 0, the ID for subsequent strings is the // index of the string + 1; // HRESULT Add(const XCHAR * psz, ULONG * pichOffset, ULONG *pulID = NULL) { ULONG ID = 0; if (psz) { // Grow if we're more than half full. if (m_cwords * 2 >= m_cBuckets) { HRESULT hr = SetHashSize(m_cwords * 3 + 17); if (FAILED(hr)) return hr; } // Find out where this element should end up in hash table. ULONG ibucket = FindIndex(psz); if (m_aulBuckets[ibucket] == 0) { // Not found in hash table. Append it to the end. // Grow ID=>index mapping array if necessary. if (m_cwords + 1 >= m_cwordsAllocated) // 1 extra for init. zero { void * pvNew = realloc(m_aichWords, sizeof(*m_aichWords) * (m_cwords + 100)); if (pvNew == NULL) return E_OUTOFMEMORY; m_aichWords = (ULONG *)pvNew; m_cwordsAllocated = m_cwords + 100; m_aichWords[0] = 1; } // Grow string storage if necessary. ULONG cchIncNull = xcslen(psz); if (m_aichWords[m_cwords] + cchIncNull > m_cchAllocated) { ULONG cbDesired = ((m_cchAllocated + cchIncNull) * sizeof(XCHAR) + 0x2003) & ~3; void * pvNew = ::CoTaskMemRealloc(m_pData, cbDesired); if (pvNew == NULL) { return E_OUTOFMEMORY; } m_pData = (XCHAR *)pvNew; m_pData[0] = 0; m_cchAllocated = cbDesired / sizeof(XCHAR); } memcpy(m_pData + m_aichWords[m_cwords], psz, cchIncNull * sizeof(XCHAR)); ++m_cwords; m_aichWords[m_cwords] = m_aichWords[m_cwords - 1] + cchIncNull; // Fill in hash table entry with index of string. m_aulBuckets[ibucket] = m_cwords; ID = m_cwords; } else { // It was already there. ID = m_aulBuckets[ibucket]; } } *pichOffset = ID ? m_aichWords[ID - 1] : 0; if (pulID) { *pulID = ID; } return S_OK; } const ULONG GetNumItems() const { return m_cwords; } const XCHAR * String(ULONG ichOffset) const { return ichOffset ? m_pData + ichOffset : NULL; } static int xcscmp(const WCHAR * p0, const WCHAR * p1) { return wcscmp(p0, p1); } static int xcscmp(const char * p0, const char * p1) { return strcmp(p0, p1); } static int xcslen(const WCHAR * p) { return wcslen(p) + 1; } static int xcslen(const char * p) { return strlen(p) + 1; } BOOL IsEqual(ULONG ichOffset, const XCHAR * psz) { if (ichOffset) { return (psz ? (xcscmp(m_pData + ichOffset, psz) == 0) : FALSE); } else { return (psz == NULL); } } ULONG StringSize(void) const { return m_cwords ? m_aichWords[m_cwords] : 0; } ULONG IndexFromId(ULONG ulID) const { SPDBG_ASSERT(ulID <= m_cwords); if (ulID > 0) { return m_aichWords[ulID - 1]; } return 0; } const XCHAR * Item(ULONG ulID) const { SPDBG_ASSERT(ulID <= m_cwords); if ((ulID < 1) || m_pData == NULL) { return NULL; } return m_pData + IndexFromId(ulID); } ULONG SerializeSize() const { return (StringSize() * sizeof(XCHAR) + 3) & ~3; } const XCHAR * SerializeData() { ULONG cchWrite = StringSize(); if (cchWrite) { const ULONG cb = cchWrite * sizeof(XCHAR); if (cb % 4) // We know there's room since data is always DWORD aligned by { memset(m_pData + cchWrite, 0xcc, 4 - (cb & 3)); // Junk data so make sure it's not null } } return m_pData; } }; typedef class CStringBlobT CStringBlob; typedef class CStringBlobT CStringBlobW; typedef class CStringBlobT CStringBlobA; #endif // _STRINGBLOB_H_