/*++

Copyright (c) 1996-1999  Microsoft Corporation

Module Name:

    member.c

Abstract:

    Cluster membership management routines for the Node Manager.

Author:

    Mike Massa (mikemas) 12-Mar-1996

Revision History:

--*/


#include "nmp.h"
// NOTE(review): the original file has a second #include directive at this
// point whose header name is missing from the text under review (it appears
// to have been lost in extraction). Restore it from the authoritative copy
// of the file; it is not guessed at here.


//
// Data
//
BOOLEAN     NmpMembershipCleanupOk = FALSE;
BITSET      NmpUpNodeSet = 0;
LIST_ENTRY  NmpLeaderChangeWaitList = {NULL, NULL};


//
// Routines
//
VOID
NmpMarkNodeUp(
    CL_NODE_ID  NodeId
    )
/*++

Routine Description:

    Adds a node to the set of nodes currently considered up
    (NmpUpNodeSet).

Arguments:

    NodeId - ID of the node to add to the up node set.

Return Value:

    None.

Notes:

    Called with the NmpLock held.

--*/
{
    BitsetAdd(NmpUpNodeSet, NodeId);

    return;
}


VOID
NmpNodeUpEventHandler(
    IN PNM_NODE   Node
    )
/*++

Routine Description:

    Handles a node up indication. The node is added to the up node
    set. If the node is a remote node in the joining state, the join
    state is checked with assertions and the joiner is flagged as up
    (NmpJoinerUp).

Arguments:

    Node - The node that came up.

Return Value:

    None.

Notes:

    Called with the NmpLock held.

--*/
{
    NmpMarkNodeUp(Node->NodeId);

    //
    // Don't declare the local node to be up. The join code will
    // take care of this.
    //
    if ((Node != NmLocalNode) && (Node->State == ClusterNodeJoining)) {
        ClRtlLogPrint(LOG_UNUSUAL,
            "[NMJOIN] Joining node %1!u! is now participating in the cluster membership.\n",
            Node->NodeId
            );

        CL_ASSERT(NmpJoinerNodeId == Node->NodeId);
        CL_ASSERT(Node->State == ClusterNodeJoining);
        CL_ASSERT(NmpJoinTimer == 0);
        CL_ASSERT(NmpJoinAbortPending == FALSE);
        CL_ASSERT(NmpJoinerUp == FALSE);

        NmpJoinerUp = TRUE;
    }

    return;

}  // NmpNodeUpEventHandler


VOID
NmpNodeDownEventHandler(
    IN PNM_NODE   Node
    )
/*++

Routine Description:

    Handles a single node down indication by forwarding it to the
    multi-node handler with a set containing only this node.

Arguments:

    Node - The node that went down.

Return Value:

    None. The status returned by NmpMultiNodeDownEventHandler is
    discarded.

Notes:

    NmpNodeChange calls this routine while holding the NmpLock, and
    NmpMultiNodeDownEventHandler acquires the NmpLock itself.
    NOTE(review): this presumably relies on the NmpLock being
    recursively acquirable - confirm.

--*/
{
    NmpMultiNodeDownEventHandler( BitsetFromUnit(Node->NodeId) );
}


DWORD
NmpMultiNodeDownEventHandler(
    IN BITSET DownedNodeSet
    )
/*++

Routine Description:

    Handles the failure of one or more nodes. Under the NmpLock it:

        - removes the downed nodes from NmpUpNodeSet,
        - aborts or flags an in-progress join affected by the failure,
        - elects a new leader if the leader went down and wakes all
          threads queued on NmpLeaderChangeWaitList,
        - marks each downed node ClusterNodeDown and takes its
          cluster network communication offline,
        - posts the node down events and performs per-node RPC and
          network connectivity cleanup,
        - schedules a network state recomputation if the local node
          became the leader.

Arguments:

    DownedNodeSet - Set of IDs of the nodes that went down.

Return Value:

    ERROR_SUCCESS always.

--*/
{
    CL_NODE_ID                    i;
    PNM_NODE                      node;
    DWORD                         status;
    BOOLEAN                       iAmNewLeader = FALSE;
    PNM_LEADER_CHANGE_WAIT_ENTRY  waitEntry;
    PLIST_ENTRY                   listEntry;


    ClRtlLogPrint(LOG_NOISE, "[NM] Down node set: %1!04X!.\n", DownedNodeSet);

    NmpAcquireLock();

    //
    // Compute the new up node set
    //
    BitsetSubtract(NmpUpNodeSet, DownedNodeSet);

    ClRtlLogPrint(LOG_NOISE, "[NM] New up node set: %1!04X!.\n", NmpUpNodeSet);

    //
    // Check for failure of a joining node.
    //
    if (NmpJoinerNodeId != ClusterInvalidNodeId) {

        if (NmpJoinerNodeId == NmLocalNodeId) {
            //
            // The joining node is the local node. Halt.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Aborting join because of change in membership.\n"
                );
            CsInconsistencyHalt(ERROR_CLUSTER_JOIN_ABORTED);
        }
        else if ( (BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
                  ||
                  ( (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) &&
                    (!BitsetIsMember(NmpJoinerNodeId, NmpUpNodeSet))
                  )
                )
        {
            //
            // The joining node is down or the sponsor is down and the joiner
            // is not yet an active member. Cleanup the join state. If the
            // sponsor is down and the joiner is an active member, we will
            // clean up when we detect that the joiner has perished.
            //
            // The second test must be made against the up node set. Testing
            // the joiner against DownedNodeSet again would reduce the whole
            // condition to "joiner down or sponsor down" and would force an
            // active joiner down below.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Aborting join of node %1!u! sponsored by node %2!u!\n",
                NmpJoinerNodeId,
                NmpSponsorNodeId
                );

            //
            // Reset joiner state if sponsor died
            //
            if (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) {
                node = NmpIdArray[NmpJoinerNodeId];

                //
                // A joiner ID should always map to a node object. Guard
                // the dereference the same way NmpNodeChange does.
                //
                CL_ASSERT(node != NULL);

                if (node != NULL) {
                    node->State = ClusterNodeDown;
                }

                // [GorN 4/3/2000]
                // Without a node down, cluadmin won't refresh the state.
                // If this code is to be changed to emit CLUSTER_NODE_CHANGE_EVENT or
                // some other event, NmpUpdateJoinAbort has to be changed as well,
                // so that we will have the same join cleanup behavior
                BitsetAdd(DownedNodeSet, NmpJoinerNodeId);
            }

            NmpJoinerNodeId = ClusterInvalidNodeId;
            NmpSponsorNodeId = ClusterInvalidNodeId;
            NmpJoinTimer = 0;
            NmpJoinAbortPending = FALSE;
            NmpJoinSequence = 0;
            NmpJoinerUp = FALSE;
            NmpJoinerOutOfSynch = FALSE;
        }
        else {
            //
            // Mark that the joiner is out of synch with the cluster
            // state. The sponsor will eventually abort the join.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Joiner node %1!u! is now out of synch with the cluster state.\n",
                NmpJoinerNodeId
                );
            NmpJoinerOutOfSynch = TRUE;
        }
    }

    //
    // Check if the leader node went down
    //
    if (BitsetIsMember(NmpLeaderNodeId, DownedNodeSet)) {
        BOOL  isEventSet;

        //
        // Elect a new leader - active node with the smallest ID.
        //
        for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
            if (BitsetIsMember(i, NmpUpNodeSet)) {
                NmpLeaderNodeId = i;
                break;
            }
        }

        //
        // The loop only runs off the end if no node is left in the up
        // node set.
        //
        CL_ASSERT(i <= NmMaxNodeId);

        if (NmpLeaderNodeId == NmLocalNodeId) {
            //
            // The local node is the new leader.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NM] This node is the new leader.\n"
                );

            iAmNewLeader = TRUE;
        }
        else {
            ClRtlLogPrint(LOG_NOISE,
                "[NM] Node %1!u! is the new leader.\n",
                NmpLeaderNodeId
                );
        }

        //
        // Wake up any threads waiting for an RPC call to the leader to
        // complete.
        //
        while (!IsListEmpty(&NmpLeaderChangeWaitList)) {
            listEntry = RemoveHeadList(&NmpLeaderChangeWaitList);

            //
            // NULL out the entry's links to indicate that it has been
            // dequeued. The users of the notification feature depend
            // on this action.
            //
            listEntry->Flink = NULL;
            listEntry->Blink = NULL;

            //
            // Wake up the waiting thread.
            //
            waitEntry = (PNM_LEADER_CHANGE_WAIT_ENTRY) listEntry;
            isEventSet = SetEvent(waitEntry->LeaderChangeEvent);
            CL_ASSERT(isEventSet != 0);
        }
    }

    //
    // First recovery pass - clean up node states and disable communication
    //
    for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
        node = NmpIdArray[i];

        if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
            node->State = ClusterNodeDown;

            status = ClusnetOfflineNodeComm(
                         NmClusnetHandle,
                         node->NodeId
                         );

            CL_ASSERT(
                (status == ERROR_SUCCESS) ||
                (status == ERROR_CLUSTER_NODE_ALREADY_DOWN)
                );
        }
    }

    //
    // Inform the rest of the service that these nodes are gone
    //
    ClusterEventEx(
        CLUSTER_EVENT_NODE_DOWN_EX,
        EP_CONTEXT_VALID,
        ULongToPtr(DownedNodeSet)
        );

    //
    // Second recovery pass - clean up network states and issue old-style
    // node down events
    //
    for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
        node = NmpIdArray[i];

        if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
            //
            // Issue an individual node down event.
            //
            ClusterEvent(CLUSTER_EVENT_NODE_DOWN, node);

            //
            // Now do Intracluster RPC cleanup...
            //
            NmpTerminateRpcsToNode(node->NodeId);

            //
            // Update the network and interface information.
            //
            NmpUpdateNetworkConnectivityForDownNode(node);

            //
            // Log an event. Only the current leader logs it.
            //
            if (NmpLeaderNodeId == NmLocalNodeId) {
                LPCWSTR nodeName = OmObjectName(node);

                CsLogEvent1(
                    LOG_UNUSUAL,
                    NM_EVENT_NODE_DOWN,
                    nodeName
                    );
            }
        }
    }

    //
    // If this node is the new leader, schedule a state computation for all
    // networks. State reports may have been received before this node
    // assumed leadership duties.
    //
    if (iAmNewLeader) {
        NmpRecomputeNT5NetworkAndInterfaceStates();
    }

    NmpReleaseLock();

    return(ERROR_SUCCESS);

}  // NmpMultiNodeDownEventHandler


DWORD
NmpNodeChange(
    IN DWORD       NodeId,
    IN NODESTATUS  NewStatus
    )
/*++

Routine Description:

    Node status change callback registered with the membership engine
    in NmpMembershipInit. Looks up the node object for NodeId and
    dispatches to the node down or node up handler under the NmpLock.

Arguments:

    NodeId - ID of the node whose status changed. Asserted to lie in
             the range ClusterMinNodeId..NmMaxNodeId.

    NewStatus - NODE_DOWN or NODE_UP.

Return Value:

    ERROR_SUCCESS always.

--*/
{
    PNM_NODE  node;


    CL_ASSERT(
        (NodeId >= ClusterMinNodeId) &&
        (NodeId <= NmMaxNodeId)
        );

    NmpAcquireLock();

    node = NmpIdArray[NodeId];

    CL_ASSERT(node != NULL);

    //
    // Ignore the notification if there is no node object for this ID.
    //
    if (node != NULL) {
        if (NewStatus == NODE_DOWN) {
            NmpNodeDownEventHandler(node);
        }
        else {
            CL_ASSERT(NewStatus == NODE_UP);
            NmpNodeUpEventHandler(node);
        }
    }

    NmpReleaseLock();

    return(ERROR_SUCCESS);

}  // NmpNodeChange


VOID
NmpHoldIoEventHandler(
    VOID
    )
/*++

Routine Description:

    Hold I/O callback registered with the membership engine. Logs the
    request. FmHoldIO is only called when the build defines
    HOLD_IO_IS_SAFE_NOW.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Holding I/O.\n"
        );

#if defined(HOLD_IO_IS_SAFE_NOW)
    FmHoldIO();
#endif

    return;
}


VOID
NmpResumeIoEventHandler(
    VOID
    )
/*++

Routine Description:

    Resume I/O callback registered with the membership engine. Logs
    the request. FmResumeIO is only called when the build defines
    HOLD_IO_IS_SAFE_NOW.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Resuming I/O.\n"
        );

#if defined(HOLD_IO_IS_SAFE_NOW)
    FmResumeIO();
#endif

    return;
}


BOOL
NmpCheckQuorumEventHandler(
    VOID
    )
/*++

Routine Description:

    Quorum check callback registered with the membership engine.
    Arbitrates for the quorum resource, but only if NmpCheckForNetwork
    reports success.

Return Value:

    The result of FmArbitrateQuorumResource if the network check
    passed, FALSE otherwise.

--*/
{
    BOOL  haveQuorum;

    //
    // daviddio 06/19/2000
    //
    // Before asking FM to arbitrate, determine if we have any
    // viable network interfaces. If not, return failure to MM
    // and allow other cluster nodes to arbitrate. The SCM
    // will restart the cluster service, so that if no nodes
    // successfully arbitrate, we will get another shot.
    //
    if (NmpCheckForNetwork()) {

        ClRtlLogPrint(LOG_NOISE,
            "[NM] Checking if we own the quorum resource.\n"
            );

        haveQuorum = FmArbitrateQuorumResource();

        if (haveQuorum) {
            ClRtlLogPrint(LOG_NOISE,
                "[NM] We own the quorum resource.\n"
                );
        }
        else {
            ClRtlLogPrint(LOG_NOISE,
                "[NM] We do not own the quorum resource, status %1!u!.\n",
                GetLastError()
                );

            //[GN] ClusnetHalt( NmClusnetHandle ); => NmpHaltEventHandler
            //
        }

    }
    else {

        ClRtlLogPrint(LOG_CRITICAL,
            "[NM] Abdicating quorum because no valid network "
            "interfaces were detected.\n"
            );
        haveQuorum = FALSE;
    }

    return(haveQuorum);

}  // NmpCheckQuorumEventHandler


void
NmpMsgCleanup1(
    IN DWORD DeadNodeId
    )
/*++

Routine Description:

    Phase 1 message cleanup callback registered with the membership
    engine. Only logs the ID of the dead node.

Arguments:

    DeadNodeId - ID of the node that died.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Phase 1 message cleanup - node %1!u!.\n",
        DeadNodeId
        );

    return;
}


void
NmpMsgCleanup2(
    IN BITSET DownedNodeSet
    )
/*++

Routine Description:

    Phase 2 message cleanup callback registered with the membership
    engine. Clears NmpCleanupIfJoinAborted when the current joiner is
    one of the downed nodes, then synchronously posts
    CLUSTER_EVENT_NODE_DOWN_EX for the downed node set.

Arguments:

    DownedNodeSet - Set of IDs of the nodes that went down.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Phase 2 message cleanup - node %1!04X!.\n",
        DownedNodeSet
        );

    NmpAcquireLock();

    if ( NmpCleanupIfJoinAborted &&
         (NmpJoinerNodeId != ClusterInvalidNodeId) &&
         BitsetIsMember(NmpJoinerNodeId, DownedNodeSet) )
    {
        //
        // Since the joiner is in the DownedNodeSet mask
        // the node down will be delivered on this node by a regroup engine.
        // No need for NmpUpdateAbortJoin to issue a node down.
        //
        NmpCleanupIfJoinAborted = FALSE;

        ClRtlLogPrint(LOG_NOISE,
            "[NM] NmpCleanupIfJoinAborted is set to false. Joiner - %1!u!.\n",
            NmpJoinerNodeId
            );
    }

    NmpReleaseLock();

    //
    // Inform the rest of the service that these nodes are gone.
    // This is posted after the NmpLock has been released.
    //
    ClusterSyncEventEx(
        CLUSTER_EVENT_NODE_DOWN_EX,
        EP_CONTEXT_VALID,
        ULongToPtr(DownedNodeSet)
        );

    return;
}


VOID
NmpHaltEventHandler(
    IN DWORD HaltCode
    )
/*++

Routine Description:

    Halt callback registered with the membership engine.

    If HaltCode is MM_STOP_REQUESTED, the cluster network driver is
    halted, the service is announced as stopped with a success exit
    code, and the process exits.

    For any other code, the cluster network driver is halted, the halt
    code is mapped to a Win32 error code, and CsInconsistencyHalt is
    called with the mapped code.

Arguments:

    HaltCode - Membership engine halt code.

Return Value:

    None.

--*/
{
    //
    // Do a graceful stop if we are shutting down
    //
    if (HaltCode == MM_STOP_REQUESTED) {
        DWORD  Status = ERROR_SUCCESS;

        ClRtlLogPrint(LOG_UNUSUAL,
            "[NM] Prompt shutdown is requested by a membership engine\n"
            );

        ClusnetHalt( NmClusnetHandle );

        CsLogEvent(LOG_NOISE, SERVICE_SUCCESSFUL_TERMINATION);

        CsServiceStatus.dwCurrentState = SERVICE_STOPPED;
        CsServiceStatus.dwControlsAccepted = 0;
        CsServiceStatus.dwCheckPoint = 0;
        CsServiceStatus.dwWaitHint = 0;
        CsServiceStatus.dwWin32ExitCode = Status;
        CsServiceStatus.dwServiceSpecificExitCode = Status;

        CsAnnounceServiceStatus();

        ExitProcess(Status);
    }
    else {
        ClRtlLogPrint(LOG_CRITICAL,
            "[NM] Halting this node due to membership or communications error. Halt code = %1!u!\n",
            HaltCode
            );

        ClusnetHalt( NmClusnetHandle );

        //
        // Adjust membership code to win32 error code. (If mapping exists)
        //
        HaltCode = MMMapHaltCodeToDosError( HaltCode );

        CsInconsistencyHalt(HaltCode);
    }
}


void
NmpJoinFailed(
    void
    )
/*++

Routine Description:

    Join failure callback registered with the membership engine.
    Intentionally does nothing.

--*/
{
    return;
}


DWORD
NmpGumUpdateHandler(
    IN DWORD Context,
    IN BOOL SourceNode,
    IN DWORD BufferLength,
    IN PVOID Buffer
    )
/*++

Routine Description:

    Handles GUM updates for membership events.

Arguments:

    Context - Supplies the update context. This is the message type

    SourceNode - Supplies whether or not the update originated on this node.
                 Not used by this routine.

    BufferLength - Supplies the length of the update.
                   Not used by this routine.

    Buffer - Supplies a pointer to the buffer.

Return Value:

    ERROR_SUCCESS if successful

    Win32 error code otherwise

Notes:

    Only NmUpdateJoinComplete is handled. Any other update type is
    logged, discarded and reported as ERROR_SUCCESS.

--*/
{
    DWORD  status;


    if (Context == NmUpdateJoinComplete) {
        status = NmpUpdateJoinComplete(Buffer);
    }
    else {
        status = ERROR_SUCCESS;
        ClRtlLogPrint(LOG_UNUSUAL,
            "[NM] Discarding unknown gum request %1!u!\n",
            Context
            );
    }

    return(status);

}  // NmpGumUpdateHandler


DWORD
NmpMembershipInit(
    VOID
    )
/*++

Routine Description:

    Initializes the leader change wait list and the membership engine,
    registering this module's callbacks with it. On success,
    NmpMembershipCleanupOk is set so that NmpMembershipShutdown will
    shut the engine down.

Return Value:

    ERROR_SUCCESS if the membership engine initialized.

    Otherwise the MMInit status mapped to a Win32 error code.

--*/
{
    DWORD  status;


    ClRtlLogPrint(LOG_NOISE,"[NM] Initializing membership...\n");

    InitializeListHead(&NmpLeaderChangeWaitList);

    //
    // Initialize membership engine.
    //
    status = MMInit(
                 NmLocalNodeId,
                 NmMaxNodes,
                 NmpNodeChange,
                 NmpCheckQuorumEventHandler,
                 NmpHoldIoEventHandler,
                 NmpResumeIoEventHandler,
                 NmpMsgCleanup1,
                 NmpMsgCleanup2,
                 NmpHaltEventHandler,
                 NmpJoinFailed,
                 NmpMultiNodeDownEventHandler
                 );

    if (status != MM_OK) {
        status = MMMapStatusToDosError(status);
        ClRtlLogPrint(LOG_CRITICAL,
            "[NM] Membership initialization failed, status %1!u!.\n",
            status
            );
        return(status);
    }

    NmpMembershipCleanupOk = TRUE;

    ClRtlLogPrint(LOG_NOISE,"[NM] Membership initialization complete.\n");

    return(ERROR_SUCCESS);

}  // NmpMembershipInit


VOID
NmpMembershipShutdown(
    VOID
    )
/*++

Routine Description:

    Shuts down the membership engine if NmpMembershipInit completed
    successfully, and clears NmpMembershipCleanupOk so that a repeated
    call does nothing.

--*/
{
    if (NmpMembershipCleanupOk) {
        ClRtlLogPrint(LOG_NOISE,"[NM] Shutting down membership...\n");

        MMShutdown();

        NmpMembershipCleanupOk = FALSE;

        ClRtlLogPrint(LOG_NOISE,"[NM] Membership shutdown complete.\n");
    }

    return;

}  // NmpMembershipShutdown