/************************************************************************* * * * COLLECT.C * * * * Copyright (C) Microsoft Corporation 1990-1994 * * All Rights reserved. * * * ************************************************************************** * * * Module Intent * * * * This module is the first stage in the index building process. The * * primary function of stage 1 is to collect and sort all of the words * * to be indexed. Before processing can begin, the user must call * * IndexInitiate to initialize the indexing variables (IPB). Words are * * added via a call to IndexAddWord and are stored in a Balanced Tree * * until an OOM condition occurs. The tree is dumped and reset to * * receive further words. * * * ************************************************************************** * * * Current Owner: BinhN * * * **************************************************************************/ #include #include #include #include #include #include #include "common.h" #include "index.h" #ifdef _DEBUG static BYTE NEAR s_aszModule[] = __FILE__; /* Used by error return functions.*/ #endif #define MAX_OCCDATA 5 #define ISBUFFER_SIZE 0xFFFC // Size of OUTPUT buffers for collect2.c // The output is DWORD aligned // And the buffer *MUST* BE a multiple of 4 // Min size: size of largest index word #define MIN_REQUIRED_MEM 0x400000 // 4-meg minimum /************************************************************************* * * INTERNAL PUBLIC FUNCTIONS * * All of them should be declared far, unless we know they belong to * the same segment. 
They should be included in some include file * *************************************************************************/ PUBLIC VOID FAR PASCAL FreeISI (LPIPB); PUBLIC void FAR PASCAL FreeEsi (LPIPB); /************************************************************************* * * INTERNAL PRIVATE FUNCTIONS * *************************************************************************/ PRIVATE PBTNODE NEAR PASCAL AddNode (_LPIPB, LST, LPOCC, PHRESULT); PRIVATE HRESULT NEAR PASCAL AddTopic (_LPIPB, PSTRDATA, LPOCC); PRIVATE void NEAR PASCAL AddOccurrence (PTOPICDATA, POCCDATA, int); PRIVATE HRESULT NEAR PASCAL WriteBuffer (_LPIPB, LPB); PRIVATE HRESULT NEAR PASCAL TraverseWrite (_LPIPB, PBTNODE, int); PRIVATE void NEAR PASCAL BalanceTree (LPISI, PBTNODE); PRIVATE void NEAR PASCAL LeftRotate (LPISI, PBTNODE); PRIVATE void NEAR PASCAL RightRotate (LPISI, PBTNODE); PRIVATE HRESULT PASCAL NEAR IndexBlockAllocate (LPIPB lpipb, LONG lMemSize); PRIVATE void NEAR PASCAL VerifyTree (PBTNODE pRoot); /************************************************************************* * * PUBLIC API FUNCTIONS * * All of them should be declared far and included in some .DEF file * *************************************************************************/ PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO pIndexInfo, PHRESULT phr); PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose (LPIPB); PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexAddWord (LPIPB, LST, LPOCC); PUBLIC LPDWORD EXPORT_API PASCAL FAR TotalIndexedWord (LPIPB); /************************************************************************* * * INTERNAL PUBLIC FUNCTIONS * * All of them should be declared far and included in some .h file * *************************************************************************/ PUBLIC HRESULT FAR PASCAL SortFlushISI (_LPIPB); PUBLIC int FAR PASCAL CompareOccurrence (LPDW, LPDW, int); PUBLIC int FAR PASCAL StrCmp2BytePascal (LPB, LPB); PUBLIC HRESULT FAR PASCAL FlushTree (_LPIPB); 
/************************************************************************* * * @doc API EXTERNAL INDEXING * * @func LPIPB FAR PASCAL | MVIndexInitiate | * The function allocates a index parameter block. The block is used * in all places during indexing. This function must be called * prior to any other indexing funtion. * * @parm PINDEXINFO | pIndexInfo | * Pointer to the index information data * * @parm PHRESULT | phr | * Pointer to error buffer. * * @rdesc Pointer to the block, or NULL if error. The error buffer * contains the description of the error * *************************************************************************/ PUBLIC LPIPB EXPORT_API FAR PASCAL MVIndexInitiate(PINDEXINFO pIndexInfo, PHRESULT phr) { _LPIPB lpipb; // Pointer to index paramet block HRESULT fRet; // foNil should, of course, be nil // In this case foNil is only used by incremental update ITASSERT(0 == foNil.dwOffset && 0 == foNil.dwHigh); if (pIndexInfo == NULL) { SetErrCode (phr, E_INVALIDARG); return(NULL); } // Allocate the block. All the fields are initialized to 0 if ((lpipb = GlobalLockedStructMemAlloc (sizeof (IPB))) == NULL) { SetErrCode (phr, E_OUTOFMEMORY); return (NULL); } // Initialize "idxf", make sure that "occf" has "OCCF_TOPICID" set. 
lpipb->idxf = (WORD)(pIndexInfo->Idxf); lpipb->occf = (WORD)(pIndexInfo->Occf | OCCF_TOPICID); // Initialize some fields lpipb->dwLastIndexedTopic = (DWORD)-1; // Set the number of occurrence fields in the occurrence block if (pIndexInfo->Occf & OCCF_COUNT) lpipb->ucNumOccDataFields++; if (pIndexInfo->Occf & OCCF_OFFSET) lpipb->ucNumOccDataFields++; // Clear sort file handle lpipb->dwUniqueWord = 0; lpipb->esi.lpesbRoot = NULL; // Allocate all the necessary memory block if ((lpipb->dwMemAllowed = pIndexInfo->dwMemSize) < MIN_REQUIRED_MEM) lpipb->dwMemAllowed = MIN_REQUIRED_MEM; if ((fRet = IndexBlockAllocate (lpipb, lpipb->dwMemAllowed)) != S_OK) { SetErrCode (phr, fRet); GlobalLockedStructMemFree (lpipb); return (NULL); } if (pIndexInfo->dwBlockSize <= BTREE_NODE_SIZE) lpipb->BTreeData.Header.dwBlockSize = BTREE_NODE_SIZE; else lpipb->BTreeData.Header.dwBlockSize = pIndexInfo->dwBlockSize; lpipb->BTreeData.Header.dwCodePageID = pIndexInfo->dwCodePageID; lpipb->BTreeData.Header.lcid = pIndexInfo->lcid; lpipb->BTreeData.Header.dwBreakerInstID = pIndexInfo->dwBreakerInstID; // Set the callback key lpipb->dwKey = CALLBACKKEY; return (lpipb); } /************************************************************************* * * @doc API EXTERNAL INDEXING * * @func void FAR PASCAL | MVIndexDispose | * Release all memory associated with the index parameter block. * Must be called after indexing is complete. 
* * @parm _LPIPB | lpipb | * Pointer to index parameter block * *************************************************************************/ PUBLIC void EXPORT_API FAR PASCAL MVIndexDispose(_LPIPB lpipb) { // Sanity check if (lpipb == NULL) return; // Free all memory associated with internal sort FreeISI(lpipb); // Free all memory associated with external sort FreeEsi(lpipb); GlobalLockedStructMemFree (lpipb); } /************************************************************************* * * @doc PRIVATE INDEXING * * @func VOID PASCAL NEAR | FreeISI | * Free all blocks, and temporary file associated with the internal * sort * * @parm _LPIPB | lpipb | * Pointer to index parameter block * *************************************************************************/ PUBLIC VOID PASCAL NEAR FreeISI (_LPIPB lpipb) { // Release temporary file buffer FreeHandle (lpipb->isi.hSortBuffer); lpipb->isi.hSortBuffer = NULL; if (lpipb->isi.hfpb) { FileClose (lpipb->isi.hfpb); lpipb->isi.hfpb = NULL; } if (lpipb->pDataBlock) { BlockFree (lpipb->pDataBlock); lpipb->pDataBlock = NULL; } if (lpipb->BTNodeBlock.pBlockMgr) { BlockFree (lpipb->BTNodeBlock.pBlockMgr); lpipb->BTNodeBlock.pBlockMgr = NULL; lpipb->BTNodeBlock.pFreeList = NULL; // Free list of Btnode } if (lpipb->TopicBlock.pBlockMgr) { BlockFree (lpipb->TopicBlock.pBlockMgr); lpipb->TopicBlock.pBlockMgr = NULL; lpipb->TopicBlock.pFreeList = NULL; // Free list of topic node } if (lpipb->OccBlock.pBlockMgr) { BlockFree (lpipb->OccBlock.pBlockMgr); lpipb->OccBlock.pBlockMgr = NULL; lpipb->OccBlock.pFreeList = NULL; // Free list of occurrence nodes } } /************************************************************************* * @doc API EXTERNAL INDEXING * * @func HRESULT FAR PASCAL | MVIndexAddWord | * This function will add a word into the index. * * @parm LPIPB | lpipb | * Index parameter block being operated on * * @parm LST | lstWord | * Word being indexed. 
(Pascal style with 2-byte header) * * @parm LPOCC | lpocc | * Occurence data associated with this word. It is assumed that the * occurrence block contains NO UNINITIALIZED DATA, ie. non-used * fields must be set to 0 * * @rdesc S_OK, if successful, else other error * * @comm * The data are copied into the buffer managed by the block manager * and arranged as a Red/Black tree to speed sorting. *************************************************************************/ static OCC NullOcc = { 0 }; PUBLIC HRESULT EXPORT_API FAR PASCAL MVIndexAddWord (_LPIPB lpipb, LST lstWord, LPOCC lpOcc) { // Local replacement variables ERRB errb; // Pointer to error variable LPISI pIsi; // Internal Sort Information PBTNODE pRoot; // Root of the Balanced Tree // Working variables PBTNODE pNode; // Used to traverse the tree to find // to find the insertion point PBTNODE FAR *ppNode; // Used to add children to the tree int result; // String compare results int wLen; // Word length LST lstStart; // Saved starting position #ifdef _DEBUG char Buffer[200]; #endif #ifdef _DEBUGREDBLACK int iLeft = 0; int iRight = 0; #endif // Various flags int fCompareField; // Sanity check if (lpipb == NULL) return(E_INVALIDARG); // Handle null case if (lstWord == NULL) return(S_OK); fCompareField = lpipb->occf & OCCF_FIELDID; pIsi = &lpipb->isi; // Internal Sort Information pRoot = pIsi->pBalanceTree; // Root of the Balanced Tree // Working variables ppNode = NULL; // Used to add children to the tree lstStart = lstWord; // Saved starting position if (lpOcc == NULL) lpOcc = &NullOcc; // Get statistics lpipb->dwIndexedWord++; // Count unique TopicId's if (lpipb->dwLastIndexedTopic != lpOcc->dwTopicID) { lpipb->lcTopics++; lpipb->dwLastIndexedTopic = lpOcc->dwTopicID; } if (lpOcc->dwTopicID > lpipb->dwMaxTopicId) { lpipb->dwMaxTopicId = lpOcc->dwTopicID; } wLen = GETWORD((LPUW)(lstStart = lstWord)); // Save statistical information about the total length of all words if (wLen > 2) lpipb->dwTotal3bWordLen += 
wLen; else lpipb->dwTotal2bWordLen += wLen; lstWord += sizeof(WORD); #ifdef _DEBUG if (wLen >= 200) { strncpy (Buffer, lstWord, 198); Buffer[199] = 0; } else { strncpy (Buffer, lstWord, wLen); Buffer[wLen] = 0; } // if (STRICMP (Buffer, "erin") == 0) // _asm int 3; #endif // Call the user callback every once in a while if (!(lpipb->dwIndexedWord % 65536L) && (lpipb->CallbackInfo.dwFlags & ERRFLAG_STATUS)) { PFCALLBACK_MSG pCallbackInfo = &lpipb->CallbackInfo; CALLBACKINFO Info; HRESULT err; Info.dwPhase = 1; Info.dwIndex = lpipb->dwIndexedWord; err = (*pCallbackInfo->MessageFunc) (ERRFLAG_STATUS, pCallbackInfo->pUserData, &Info); if (S_OK != err) return (err); } SubmitWord: // Is this the first word in the tree? if (pRoot == NULL) { if ((pRoot = AddNode (lpipb, lstStart, lpOcc, &errb)) == NULL) return (SetErrCode (NULL, E_OUTOFMEMORY)); // Adjust tree data pRoot->color = BLACK; pRoot->pParent = NULL; pIsi->pBalanceTree = pRoot; // Set statistical info lpipb->dwByteCount = GETWORD ((LPUW)pRoot->StringData.pText); lpipb->dwMaxFieldId = pRoot->StringData.dwField; return (S_OK); } // Set traversal node to root node pNode = pRoot; for (; ; ) // Traverse the tree forever { int len; // Used for string compare block LPB lpbWord1, lpbWord2; // Used for string compare block PSTRDATA pString; /********************************************** * This section of code does a string compare **********************************************/ lpbWord1 = lstWord; pString = &pNode->StringData; lpbWord2 = pString->pText; // Get the minimum length if ((result = wLen - GETWORD ((LPUW)lpbWord2)) > 0) len = GETWORD ((LPUW)lpbWord2); else len = wLen; // Skip the lengths lpbWord2 += sizeof (WORD); // Start compare byte per byte for (; len > 0; len--, lpbWord1++, lpbWord2++) { if (*lpbWord1 != *lpbWord2) break; } if (len != 0) result = *lpbWord1 - *lpbWord2; /********************************** * COMPARE FIELDID AND WORD LENGTH **********************************/ if (result == 0) { // If the 
WordLength and FieldId are the same as the current // nodes' then we update the current record if (fCompareField) result = lpOcc->dwFieldId - pString->dwField; if (result == 0) result = lpOcc->wWordLen - (WORD)pString->dwWordLength; if (result == 0) { if (AddTopic (lpipb, pString, lpOcc) == S_OK) return (S_OK); // Add failed. Flush the tree to disk & resubmit word if ((result = FlushTree(lpipb)) == S_OK) { pRoot = pIsi->pBalanceTree; goto SubmitWord; } return (SetErrCode (NULL, (HRESULT)result)); } // Fall through in case result is non-zero } // Descend tree or add new node if (result < 0) { if (pNode->pLeft != NULL) { pNode = pNode->pLeft; #ifdef _DEBUGREDBLACK iLeft++; #endif continue; } else ppNode = &pNode->pLeft; } else { if (pNode->pRight != NULL) { pNode = pNode->pRight; #ifdef _DEBUGREDBLACK iRight++; #endif continue; } else ppNode = &pNode->pRight; } #ifdef _DEBUGREDBLACK _DPF3("Added node '%s' at left %d, right %d\n", Buffer, iLeft, iRight); #endif // Add the new node to the tree *ppNode = AddNode (lpipb, lstStart, lpOcc, &errb); // If node is NULL we will flush the tree and resubmit the word if (*ppNode == NULL) { if ((result = FlushTree(lpipb)) != S_OK) return (result); pRoot = pIsi->pBalanceTree; ppNode = NULL; goto SubmitWord; } (*ppNode)->pParent = pNode; // This is the only place that the nodes get balanced BalanceTree (pIsi, *ppNode); #ifdef _DEBUGREDBLACK VerifyTree (pIsi->pBalanceTree); #endif return (S_OK); } } /************************************************************************* * @doc API EXTERNAL INDEXING * * @func LPDWORD PASCAL FAR | TotalIndexedWord | * Return the total number of words indexed (for statistical purpose * only) * * @parm LPIPB | lpipb | * Pointer to index parameter block * * @rdesc Return pointer to the total number of words indexed *************************************************************************/ PUBLIC LPDWORD PASCAL FAR TotalIndexedWord(_LPIPB lpipb) { return (&lpipb->dwUniqueWord); } 
/************************************************************************* * * @doc PRIVATE INDEXING * * @func void NEAR PASCAL | FreeEsi | * Gets rid of all external-sort blocks attached to an IPB. * These blocks are formed into a single linked list * Also closes the file associated with the external sort. * * @parm _LPIPB | lpipb | * Pointer to index parameter block where all the info is stored * *************************************************************************/ PUBLIC VOID FAR PASCAL FreeEsi(_LPIPB lpipb) { LPESB lpesb; /* Linked-list walk pointer. */ LPESB lpesbNext; /* Next ESB in chain. */ LPESI lpesi; /* Pointer to external sort info struct */ /* Get pointer to the ESI block */ lpesi = &lpipb->esi; for (lpesb = lpesi->lpesbRoot; lpesb != NULL; lpesb = lpesbNext) { /* Get pointer to the next block */ lpesbNext = lpesb->lpesbNext; if (lpesb->hMem) { _GLOBALUNLOCK(lpesb->hMem); _GLOBALFREE(lpesb->hMem); } /* Free the block */ GlobalLockedStructMemFree (lpesb); } lpesi->lpesbRoot = NULL; /* No more chain. */ lpesi->cesb = 0; /* Everyone freed */ // Delete the internal sorting result file if ((lpipb->idxf & KEEP_TEMP_FILE) == 0) FileUnlink (NULL, lpipb->isi.aszTempName, REGULAR_FILE); } /************************************************************************* * * @doc PRIVATE INDEXING * * @func PBTNODE NEAR PASCAL | AddNode | * Inserts a new node into the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm LST | lpb | * Word being indexed. * * @parm LPOCC | lpOcc | * Pointer to occurrence data * * @parm PHRESULT | phr | * Pointer to error structure * * @rdesc Pointer to the newly created node * * @comm * The nodes parent pointer must be set externally. 
* *************************************************************************/ PBTNODE NEAR PASCAL AddNode (_LPIPB lpipb, LST lpbWord, LPOCC lpOcc, PHRESULT phr) { // Local replacement variables LPV pDataBlock = lpipb->pDataBlock; // Pointer to Block Manager int occf = lpipb->occf; // Working variables PBTNODE pNode; // This will point to the new node PSTRDATA pString; // Pointer to string block PTOPICDATA pTopic; // Pointer to topic block POCCDATA pOcc; LPDW lpDw; // Create space for new node & topic & occ & copy the string #if 0 if ((pNode = (PBTNODE)GetBlockNode (&lpipb->BTNodeBlock)) == NULL || (pTopic = (PTOPICDATA)GetBlockNode (&lpipb->TopicBlock)) == NULL || #else if ((pNode = (PBTNODE)BlockGetBlock(pDataBlock, sizeof(BTNODE))) == NULL || (pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL || #endif (pNode->StringData.pText = (LPB)BlockCopy (lpipb->pDataBlock, lpbWord, GETWORD((LPUW)lpbWord) + sizeof (SHORT), 0)) == NULL) { return (NULL); } pString = &pNode->StringData; /* Initialize all the fields */ // Node Information. 
Parent field is set outside of this function pNode->pLeft = pNode->pRight = NULL; pNode->color = RED; /* Set the string fields */ pString->pTopic = pString->pLastTopic = pTopic; pString->dwTopicCount = 1; // It doesn't hurt to copy the data even if we don't use it // It also saves a compare just to set it pString->dwField = lpOcc->dwFieldId; pString->dwWordLength = lpOcc->wWordLen; // Set the topic fields data pTopic->pNext = NULL; pTopic->dwTopicId = lpOcc->dwTopicID; if (occf & (OCCF_COUNT | OCCF_OFFSET)) { #if 1 if ((pOcc = (POCCDATA)BlockGetBlock (pDataBlock, sizeof(OCCDATA) * lpipb->ucNumOccDataFields)) == NULL) return(NULL); #else if ((pOcc = (POCCDATA)GetBlockNode (&lpipb->OccBlock)) == NULL ) return(NULL); #endif // Set the occ fields pOcc->pNext = NULL; // Generate occ data block lpDw = pOcc->OccData; if (occf & OCCF_COUNT) *lpDw++ = lpOcc->dwCount; if (occf & OCCF_OFFSET) *lpDw = lpOcc->dwOffset; pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; } // Set Statistical information if (lpipb->dwMaxWLen < GETWORD ((LPUW)pString->pText)) lpipb->dwMaxWLen = GETWORD ((LPUW)pString->pText); if (lpipb->dwMaxFieldId < pString->dwField) lpipb->dwMaxFieldId = pString->dwField; lpipb->dwUniqueWord++; lpipb->dwByteCount += GETWORD ((LPUW)pString->pText); return (pNode); } /************************************************************************* * * @doc PRIVATE INDEXING * * @func int FAR PASCAL | CompareOccurrence | * Compares two Occurrence data pointers starting from the first * element and continuing until the elements are unequal. 
* * @parm LPB | lpStr1 | * Pointer to the first Occurence to compare * * @parm LPB | pOcc2 | * Pointer to the second Occurence to compare * * @parm int | max | * The number of occurrence fields to compare * * @rdesc * negative value : If pOcc1 is less than pOcc2 * 0 : if pOcc1 is equal to pOcc2 * positive value : If pOcc1 is greater than pOcc2 * * @comm * The use of switch statment is for speed since this function is * called for so many times *************************************************************************/ int FAR PASCAL CompareOccurrence (LPDW pOcc1, LPDW pOcc2, int max) { int result; switch (max) { case 5: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 4: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 3: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 2: if (result = (int)(*pOcc1 - *pOcc2)) return (result); pOcc1++; pOcc2++; case 1: return ((int)(*pOcc1 - *pOcc2)); default: // This can only an error, since we knows that max // can never be > 5 return (0); } } /************************************************************************* * * @doc INTERNAL INDEXING * * @func HRESULT | AddTopic | * Inserts a new topic into a nodes topic list or a new occurrence * if a topic with the same TopicId already exists. 
* * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PSTRDATA | pString | * Pointer to node structure * * @parm LPOCC | lpOcc | * Pointer occurrence data * * @rdesc S_OK, or errors if failed * *************************************************************************/ HRESULT NEAR PASCAL AddTopic (_LPIPB lpipb, PSTRDATA pString, LPOCC lpOcc) { // Local replacement variables LPV pDataBlock = lpipb->pDataBlock; int occf = lpipb->occf; DWORD dwNewTopicId = lpOcc->dwTopicID; POCCDATA pOcc; // Working variables // int topicCount; // Iterates through current topics PTOPICDATA pTopic, pPrevTopic; LPDW lpDw; int fResult; /* Set up a new occurrence block */ if (occf & (OCCF_COUNT | OCCF_OFFSET)) { if ((pOcc = (POCCDATA)BlockGetBlock (pDataBlock, sizeof(OCCDATA) * lpipb->ucNumOccDataFields)) == NULL) return (E_OUTOFMEMORY); lpDw = pOcc->OccData; if (occf & OCCF_COUNT) *lpDw++ = lpOcc->dwCount; if (occf & OCCF_OFFSET) *lpDw = lpOcc->dwOffset; pOcc->pNext = NULL; } else pOcc = NULL; // Check from last point of insertion pTopic = pString->pLastTopic; if (pTopic->dwTopicId == dwNewTopicId) { append_occ_info: // Match. We don't have to do anything. That's is the majority // of the case. Just add the occdata to the end if (pOcc) { pTopic->pLastOccData->pNext = pOcc; pTopic->pLastOccData = pOcc; pTopic->dwOccCount++; } goto Update; } if (pTopic->dwTopicId < dwNewTopicId) { // kevynct: scan ahead to insertion point. Usually with sorted lists // this won't be far at all. pPrevTopic = pTopic; if (pTopic->pNext) { for (; (fResult = pTopic->dwTopicId - dwNewTopicId) < 0 && pTopic->pNext; pPrevTopic = pTopic, pTopic = pTopic->pNext) ; // empty loop! 
if (fResult == 0) { pString->pLastTopic = pTopic; goto append_occ_info; } } if ((pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL) return (E_OUTOFMEMORY); // Set the topic fields data if (pOcc) { pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; } pTopic->dwTopicId = dwNewTopicId; insert_middle_or_end: // Add to middle or end of list pTopic->pNext = pPrevTopic->pNext; pPrevTopic->pNext = pTopic; pString->dwTopicCount++; pString->pLastTopic = pTopic; goto Update; } // It means that topics are not inserted // in order. It can only happen if somebody is using the // indexer for some special, non-topic related index build // Move to the right node pPrevTopic = NULL; for (pTopic = pString->pTopic; (fResult = pTopic->dwTopicId - dwNewTopicId) < 0 && pTopic->pNext; pPrevTopic = pTopic, pTopic = pTopic->pNext); if (fResult == 0) { // Match. Just add the occdata to the end if (pOcc) { pTopic->pLastOccData->pNext = pOcc; pTopic->pLastOccData = pOcc; pTopic->dwOccCount++; } } else { // A new topic node is needed if ((pTopic = (PTOPICDATA)BlockGetBlock (pDataBlock, sizeof(TOPICDATA))) == NULL) return (E_OUTOFMEMORY); // Set the topic fields data if (pOcc) { pTopic->pLastOccData = pTopic->pOccData = pOcc; pTopic->dwOccCount = 1; } else { pTopic->pLastOccData = pTopic->pOccData = NULL; pTopic->dwOccCount = 0; } pTopic->dwTopicId = dwNewTopicId; // Add to the beginning if empty if (pPrevTopic == NULL) { // Add to the beginning pTopic->pNext = pString->pTopic; pString->pTopic = pTopic; pString->dwTopicCount++; pString->pLastTopic = pTopic; goto Update; } goto insert_middle_or_end; } Update: // Update statistical information if (lpipb->dwMaxWCount < lpOcc->dwCount) lpipb->dwMaxWCount = lpOcc->dwCount; if (lpipb->dwMaxOffset < lpOcc->dwOffset) lpipb->dwMaxOffset = lpOcc->dwOffset; return S_OK; } 
/************************************************************************* * * @doc INTERNAL INDEXING * * @func int | StrCmp2BytePascal | * Compares two Pascal style strings against eachother. * The strings must have a 2 byte length field, *NOT* 1 byte. * * @parm LPB | lpStr1 | * Pointer to string one * * @parm LPB | lpStr2 | * Pointer to string two * * @rdesc * negative value : If pOcc1 is less than pOcc2 * 0 : if pOcc1 is equal to pOcc2 * positive value : If pOcc1 is greater than pOcc2 * *************************************************************************/ int FAR PASCAL StrCmp2BytePascal(LPB lpStr1, LPB lpStr2) { int fRet; int register len; // Get the minimum length if ((fRet = GETWORD ((LPUW)lpStr1) - GETWORD ((LPUW)lpStr2)) > 0) len = GETWORD ((LPUW)lpStr2); else len = GETWORD ((LPUW)lpStr1); // Skip the lengths */ lpStr1 += sizeof (SHORT); lpStr2 += sizeof (SHORT); // Start compare byte per byte */ for (; len > 0; len--, lpStr1++, lpStr2++) { if (*lpStr1 != *lpStr2) break; } if (len == 0) return (fRet); return (*lpStr1 - *lpStr2); } /************************************************************************* * * @doc INTERNAL INDEXING * * @func HRESULT | FlushTree | * Flushes the tree to disk. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @rdesc S_OK, or errors if failed * * @comm * This function holds the output file open until the tree has been * completely written to disk. The physical offset of the written * data is stored in the ESB blocks so that the word can be merged * in the next index phase. 
* *************************************************************************/ PUBLIC HRESULT FAR PASCAL FlushTree(_LPIPB lpipb) { // Local replacement variables LPISI pIsi = &lpipb->isi; LPESI pEsi = &lpipb->esi; PBTNODE pBalanceTree = pIsi->pBalanceTree; ERRB errb; PHRESULT phr = &errb; // Local working variables LPESB pNewEsb; HRESULT fRet; // Make sure that the tree actually has nodes if (pBalanceTree == NULL) return (S_OK); // Open output file & clear working variables if (pIsi->hfpb == NULL) { // Allocate output buffer if ((pIsi->hSortBuffer = _GLOBALALLOC (DLLGMEM_ZEROINIT, ISBUFFER_SIZE)) == NULL) return (E_OUTOFMEMORY); pIsi->pSortBuffer = (LPB)_GLOBALLOCK (pIsi->hSortBuffer); // Get temp filename & open file GETTEMPFILENAME ((char)0, (LPB)"iso", (WORD)0, pIsi->aszTempName); if ((pIsi->hfpb = FileOpen (NULL, pIsi->aszTempName, REGULAR_FILE, READ_WRITE, phr)) == NULL) return (*phr); pIsi->dwRecLength = 0; pEsi->cesb = 0; } // Allocate new ESB structure & set starting values if ((pNewEsb = GlobalLockedStructMemAlloc (sizeof (ESB))) == NULL) return (E_OUTOFMEMORY); // Remember the starting offset pNewEsb->lfo = pIsi->lfo; // Reset the current insertion point pIsi->pCurPtr = pIsi->pSortBuffer; // Actually ouput the tree data if ((fRet = TraverseWrite(lpipb, pBalanceTree, 0)) != S_OK) return (fRet); // Flush remaining buffer to disk if ((fRet = WriteBuffer(lpipb, pIsi->pCurPtr)) != S_OK) return(fRet); // Set the ESB maximum record length pNewEsb->dwEsbSize = pIsi->dwMaxEsbRecSize; pIsi->dwMaxEsbRecSize = 0; // Store end offset in list pNewEsb->lfoMax = pIsi->lfo; // Update the fileoffset pIsi->lfo = pNewEsb->lfoMax; if (pEsi->lpesbRoot == NULL) pNewEsb->lpesbNext = NULL; else pNewEsb->lpesbNext = pEsi->lpesbRoot; pEsi->lpesbRoot = pNewEsb; pEsi->cesb++; // Reset tree heap & root node BlockReset (lpipb->pDataBlock); BlockReset (lpipb->BTNodeBlock.pBlockMgr); lpipb->BTNodeBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->BTNodeBlock.pBlockMgr); BlockReset 
(lpipb->TopicBlock.pBlockMgr); lpipb->TopicBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->TopicBlock.pBlockMgr); BlockReset (lpipb->OccBlock.pBlockMgr); lpipb->OccBlock.pFreeList = (PLIST)BlockGetLinkedList(lpipb->OccBlock.pBlockMgr); pIsi->pBalanceTree = NULL; return (S_OK); } /************************************************************************* * * @doc INTERNAL INDEXING * * @func HRESULT | WriteBuffer | * Physically writes buffer to disk. This will write from the beginning * of the sort buffer to pStartRec. It then copies whatever left * in the sort buffer back to the beginning of it * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm LPB | copyEnd | * Pointer to the end of the next block of data to copy * * @rdesc S_OK or errors *************************************************************************/ HRESULT NEAR PASCAL WriteBuffer (_LPIPB lpipb, LPB copyEnd) { // Local replacement variables LPISI pIsi = &lpipb->isi; LPB pSortBuffer; ERRB errb; PHRESULT phr = &errb; DWORD cbWritten; // Number of bytes to write to disk (bytes) DWORD cbCopied; // Size of extra data to move to buffers front LPB copyStart; pSortBuffer = (LPB)pIsi->pSortBuffer; // Find what should be left in the buffer. // copyStart will pointer to the beginning of data to be recopied, ie. 
// the beginning of a record // - if pIsi->pStartRec == -1 : there is no beginning of record // so we have nothing to recopy // - if pIsi->pStartRec == pSortBuffer, again the whole thing is // to be written out, and there is nothing to recopy if ((copyStart = pIsi->pStartRec) == (LPB)-1 || copyStart == pSortBuffer) copyStart = copyEnd; if ((cbWritten = (DWORD)(copyStart - pSortBuffer)) == 0) return(S_OK); // Nothing to copy cbCopied = (DWORD)(copyEnd - copyStart); // Update backpatching data if (pIsi->pStartRec == pSortBuffer) { pIsi->pStartRec = (LPB)-1; // The pointer is invalid pIsi->lfoRecBackPatch = pIsi->lfo; // Remember the place for backpatch } // Write the buffer to disk if (cbWritten != (DWORD) FileWrite(pIsi->hfpb, pSortBuffer, cbWritten, phr)) { return (*phr); } pIsi->lfo = FoAddDw (pIsi->lfo, cbWritten); // Only copy if extra data exists if (cbCopied) { MEMMOVE(pSortBuffer, copyStart, cbCopied); if (pIsi->pStartRec == copyStart) pIsi->pStartRec = pSortBuffer; } // Reset pStartRec & pCurPtr pIsi->pCurPtr = pSortBuffer + cbCopied; return S_OK; } /************************************************************************* * * @doc INTERNAL INDEXING * * @func HRESULT | TraverseWrite | * Copies the node data to the output buffer. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * Node to copy to buffer * * @parm LPB | pBuffer | * Buffer to copy node into * * @parm int | Level | * Current tree level (starting with 1) * * @rdesc S_OK, or errors if failed * * @comm * This is currently a recursive routine. It should probably be * changed to be non-recursive to save on speed at run-time. 
* *************************************************************************/ HRESULT NEAR PASCAL TraverseWrite (_LPIPB lpipb, PBTNODE node, int Level) { // Local replacement pointers PSTRDATA pString = &node->StringData; LPISI pIsi = &lpipb->isi; // Internal sort information LPB pText = pString->pText; // The word string POCCDATA pOccData; WORD ucNumOccDataFields = lpipb->ucNumOccDataFields; PTOPICDATA pTopic = pString->pTopic; ERRB errb; PHRESULT phr = &errb; // Working variables DWORD topicLoop, occLoop; // Loop counters WORD wLength; // DWORD aligned length of string BYTE filledBuffer = 0; // Count if record fills entire buffer LPB pBaseBuffer; // Start of entire buffer LPB pCurPtr; LPB pMaxPtr; HRESULT fRet; // Keep track of how deep the tree is if (Level > pIsi->DeepLevel) pIsi->DeepLevel = (BYTE) Level; #ifdef _DEBUG if (Level >= 65) { // This would be a HUGE tree! return SetErrCode (phr, E_ASSERT); } #endif // Traverse the left sub tree if (node->pLeft != NULL) { if ((fRet = TraverseWrite(lpipb, node->pLeft, Level + 1)) != S_OK) return(fRet); } /* Initialize */ pBaseBuffer = (LPB)pIsi->pSortBuffer; pMaxPtr = pBaseBuffer + ISBUFFER_SIZE - sizeof(DWORD); // Leave some room pCurPtr = pIsi->pCurPtr; // Get starting point // Reset the record length field pIsi->dwRecLength = 0; // Get the Pascal string length wLength = GETWORD ((LPUW)pText) + sizeof (SHORT); //wLength = (wLength + 3) & (~3); // Check for minimum room if (pMaxPtr <= pCurPtr + wLength + // String length sizeof (DWORD) + // Record length sizeof (DWORD) + // FieldId sizeof (WORD) + // Word length sizeof (DWORD) ) // TopicCount { if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point } // Remember record length position to be backpatched pIsi->pStartRec = pCurPtr; pCurPtr += sizeof (DWORD); MEMCPY(pCurPtr, pText, wLength); pCurPtr += wLength; // Add aligned offset // Copy the Word Length only if flag is set if (lpipb->occf & OCCF_LENGTH) 
pCurPtr += CbBytePack (pCurPtr, pString->dwWordLength); // Copy FieldId only if flag is set if (lpipb->occf & OCCF_FIELDID) pCurPtr += CbBytePack (pCurPtr, pString->dwField); // Topic Count if (lpipb->occf & OCCF_TOPICID) pCurPtr += CbBytePack (pCurPtr, pString->dwTopicCount); else pString->dwTopicCount = 0; // Add in all topics for (topicLoop = pString->dwTopicCount; topicLoop > 0; --topicLoop) { // Check buffer overflow if (pMaxPtr <= pCurPtr + sizeof (DWORD) // TopicId + sizeof (DWORD)) // Occurrence count { pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point } pCurPtr += CbBytePack (pCurPtr, pTopic->dwTopicId); if (occLoop = pTopic->dwOccCount) { pCurPtr += CbBytePack (pCurPtr, pTopic->dwOccCount); pOccData = pTopic->pOccData; // Add in all occurrence data for (occLoop = pTopic->dwOccCount; occLoop > 0; --occLoop) { LPDW lpDw; // Check buffer overflow if (pMaxPtr <= pCurPtr + MAX_OCCDATA * sizeof (DWORD)) { pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); if ((fRet = WriteBuffer (lpipb, pCurPtr)) != S_OK) return fRet; pCurPtr = pIsi->pCurPtr; // Reset insertion point } lpDw = (LPDW)pOccData->OccData; switch (ucNumOccDataFields) { case 5: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 4: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 3: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 2: pCurPtr += CbBytePack (pCurPtr, *lpDw++); case 1: pCurPtr += CbBytePack (pCurPtr, *lpDw++); } pOccData = pOccData->pNext; } } pTopic = pTopic->pNext; } // Update the record length pIsi->dwRecLength += (DWORD)(pCurPtr - pIsi->pCurPtr); // Keep track of the maximum record size for merging. // - 1 for the current ESB. 
This helps speeding up the merging sequence // since we don't have to worry about a record being split if (pIsi->dwRecLength > pIsi->dwMaxEsbRecSize) pIsi->dwMaxEsbRecSize = pIsi->dwRecLength; // Set record length if (pIsi->pStartRec != (LPB)-1) { // Everything is still in memory *(LPUL)pIsi->pStartRec = pIsi->dwRecLength; } else { // We have to do backpatching if (sizeof (DWORD) != FileSeekWrite (pIsi->hfpb, &pIsi->dwRecLength, pIsi->lfoRecBackPatch, sizeof (DWORD), phr)) return *phr; FileSeek (lpipb->isi.hfpb, pIsi->lfo, 0, phr); } // Update the current insertion point, and prepare for the next record pIsi->pStartRec = pIsi->pCurPtr = pCurPtr; if (node->pRight != NULL) return TraverseWrite(lpipb, node->pRight, Level + 1); return(S_OK); } /************************************************************************* * * @doc INTERNAL INDEXING * * @func VOID NEAR PASCAL | BalanceTree | * Balances the tree using a Red/Black algorithm. * * @parm LPISI | pIsi | * Pointer to Internal sort data * * @parm PBTNODE | node | * Pointer to the node that was just inserted * * @comm * This routine must be called after EVERY new node is inserted in * the tree to maintain proper balance. 
* A Red/Black tree must maintain the following conditions: * Every node is colored either red or black * Every leaf node must be black * If a node is red, then both of its children must be black * Every path from the root to a leaf must contain the same * number of black nodes * *************************************************************************/ void NEAR PASCAL BalanceTree(LPISI pIsi, PBTNODE node) { PBTNODE y; PBTNODE pParentNode; node->color = RED; while (node != pIsi->pBalanceTree && node->pParent->color == RED) { pParentNode = node->pParent; if (pParentNode != NULL && pParentNode->pParent != NULL && pParentNode == pParentNode->pParent->pLeft) { y = pParentNode->pParent->pRight; if (y != NULL && y->color == RED) { pParentNode->color = BLACK; y->color = BLACK; pParentNode->pParent->color = RED; node = pParentNode->pParent; pParentNode = node->pParent; } else { if (node == pParentNode->pRight) { node = pParentNode; // LeftRotate change the parent node LeftRotate(pIsi, node); pParentNode = node->pParent; } pParentNode->color = BLACK; pParentNode->pParent->color = RED; // RightRotate change the parent node RightRotate(pIsi, pParentNode); pParentNode = node->pParent; } } else { if (pParentNode != NULL && pParentNode->pParent != NULL) y = pParentNode->pParent->pLeft; else y = NULL; if (y != NULL && y->color == RED) { pParentNode->color = BLACK; y->color = BLACK; pParentNode->pParent->color = RED; node = pParentNode->pParent; pParentNode = node->pParent; } else { if (node == pParentNode->pLeft) { // RightRotate change the parent node RightRotate(pIsi, node); node->color = BLACK; node = node->pRight; pParentNode = node->pParent; } pParentNode->color = BLACK; pParentNode->pParent->color = RED; // LeftRotste change the parent node LeftRotate(pIsi, pParentNode->pParent); pParentNode = node->pParent; } } } pIsi->pBalanceTree->color = BLACK; } /************************************************************************* * * @doc INTERNAL INDEXING * * @func VOID NEAR 
PASCAL | LeftRotate | * Rotates two nodes in the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * The X node to process (see notes) * * @comm * * X Y * / \ / \ * a Y ---> X c * / \ / \ * b c a b *************************************************************************/ void NEAR PASCAL LeftRotate(LPISI pIsi, PBTNODE node) { PBTNODE y = node->pRight; node->pRight = y->pLeft; if (y->pLeft != NULL) y->pLeft->pParent = node; y->pParent = node->pParent; if (y->pParent == NULL) pIsi->pBalanceTree = y; else { if (node == node->pParent->pLeft) node->pParent->pLeft = y; else node->pParent->pRight = y; } y->pLeft = node; node->pParent = y; } /************************************************************************* * * @doc INTERNAL INDEXING * * @func VOID NEAR PASCAL | RightRotate | * Rotates two nodes in the tree. * * @parm _LPIPB | lpipb | * Pointer to index parameter block * * @parm PBTNODE | node | * The X node to process (see notes) * * @comm * * Y X * / \ / \ * X c ---> a Y * / \ / \ * a b b c *************************************************************************/ void NEAR PASCAL RightRotate(LPISI pIsi, PBTNODE node) { PBTNODE y = node->pParent; y->pLeft = node->pRight; if (y->pLeft != NULL) y->pLeft->pParent = y; node->pParent = y->pParent; if (node->pParent == NULL) pIsi->pBalanceTree = node; else { if (y == node->pParent->pLeft) node->pParent->pLeft = node; else node->pParent->pRight = node; } node->pRight = y; y->pParent = node; } /************************************************************************ * @doc PRIVATE * @func HRESULT PASCAL NEAR | IndexBlockAllocate | * Set the memory allocation based on the memory of the machine * @parm DWORD | dwmemSize | * Memory allocated for the indexer * @rdesc S_OK, or E_OUTOFMEMORY ************************************************************************/ PRIVATE HRESULT PASCAL NEAR IndexBlockAllocate (_LPIPB lpipb, LONG lMemSize) { if ((lpipb->pDataBlock = BlockInitiate 
(MAX_BLOCK_SIZE, 0, (WORD)(lMemSize/MAX_BLOCK_SIZE), USE_VIRTUAL_MEMORY)) == NULL) return(E_OUTOFMEMORY); return(S_OK); } #ifdef _DEBUGREDBLACK /* * @comm * This routine must be called after EVERY new node is inserted in * the tree to maintain proper balance. * A Red/Black tree must maintain the following conditions: * Every node is colored either red or black * Every leaf node must be black * If a node is red, then both of its children must be black * Every path from the root to a leaf must contain the same * number of black nodes */ void PreOrdTrav (PBTNODE pNode, int iLevel, char cChildType) { if (pNode == NULL) { OutputDebugString ("*\n"); return; } _DPF4 ("Chl: %c Col: %c Lev: %d\n", cChildType, pNode->color == RED ? 'R' : 'B', iLevel); iLevel++; PreOrdTrav (pNode->pLeft, iLevel, 'L'); PreOrdTrav (pNode->pRight, iLevel, 'R'); } void NEAR PASCAL VerifyTree (PBTNODE pRoot) { PreOrdTrav (pRoot, 0, 'R'); OutputDebugString ("End Tree\n"); } #endif /* _DEBUG */