windows-nt/Source/XPSP1/NT/base/cluster/service/fm/fmevent.c

656 lines
17 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
/*++
Copyright (c) 1996 Microsoft Corporation
Module Name:
fmevent.c
Abstract:
Event Handler for the Failover Manager component of the
NT Cluster Service
Author:
Rod Gamache (rodga) 19-Mar-1996
Revision History:
--*/
#include "fmp.h"
#define LOG_MODULE EVENT
//
// Global data initialized in this module
//
//
// Local functions
//
DWORD
WINAPI
FmpEventHandler(
    IN CLUSTER_EVENT Event,
    IN PVOID Context
    )

/*++

Routine Description:

    This routine handles events for the Failover Manager.

    In many cases the request is posted to the FM's work queue, so
    that the mainline event process is not blocked.

Arguments:

    Event - The event to be processed. Only one event at a time.
        If the event is not handled, return ERROR_SUCCESS.

    Context - A pointer to context associated with the particular event.

Returns:

    ERROR_SHUTDOWN_CLUSTER - if the Cluster must be shutdown.

    A Win32 error code on other errors.

Notes:

    The conservation of energy, and laws of inertia apply here.
    If a resource comes online it is because someone requested it to be so.
    Therefore, the energy from that request goes into the state of the Group,
    by requesting the Group to go online.

    However, if a resource goes offline, it could be because of a failure.
    We therefore only mark the state of a Group as offline if all resources
    contained within the group are offline.

--*/

{
    switch ( Event ) {

    case CLUSTER_EVENT_GROUP_FAILED:
        //
        // Defer to the FM worker queue so the event thread is not blocked.
        //
        CL_ASSERT( Context != NULL );
        FmpPostWorkItem( FM_EVENT_GROUP_FAILED, Context, 0 );
        break;

    case CLUSTER_EVENT_NODE_ADDED:
        CL_ASSERT( Context != NULL );
        FmpPostWorkItem( FM_EVENT_NODE_ADDED, Context, 0 );
        break;

    case CLUSTER_EVENT_NODE_UP:
        ClRtlLogPrint(LOG_NOISE,"[FM] Node up event\n");
        //
        // FM no longer cares about node up events.
        //
        break;

    case CLUSTER_EVENT_NODE_DOWN:
        FmpMajorEvent = TRUE;   // Node Down is a major event.
        ClRtlLogPrint(LOG_NOISE,"[FM] FmpEventHandler::Node down event\n");
        FmpHandleNodeDownEvent( Context );
        break;

    default:
        //
        // All other cluster events are ignored by the FM.
        //
        break;
    }

    return(ERROR_SUCCESS);

} // FmEventHandler
DWORD
WINAPI
FmpSyncEventHandler(
    IN CLUSTER_EVENT Event,
    IN PVOID Context
    )

/*++

Routine Description:

    Synchronous handler for node-down cluster events. Records which nodes
    have just gone down so that a rejoin by the same node can be held off
    until the FM worker thread finishes reassigning that node's groups.

Arguments:

    Event - Supplies the type of cluster event.

    Context - Supplies the event-specific context (a bitset of downed nodes).

Return Value:

    ERROR_SUCCESS

--*/

{
    BITSET  downedSet;
    DWORD   id;

    //
    // Only CLUSTER_EVENT_NODE_DOWN_EX is of interest here; everything
    // else is passed through untouched.
    //
    if (Event != CLUSTER_EVENT_NODE_DOWN_EX) {
        return(ERROR_SUCCESS);
    }

    downedSet = (BITSET)((ULONG_PTR)Context);

    //
    // The local node cannot be in the set of nodes reported down.
    //
    CL_ASSERT(BitsetIsNotMember(NmLocalNodeId, downedSet));

    ClRtlLogPrint(LOG_NOISE,
               "[FM] FmpSyncEventHandler:: %1!04X!.\n",
               downedSet);

    //
    // Flag every downed node as "down processing in progress". The flag
    // blocks a join from that node until the worker thread has finished
    // handling the groups the node owned.
    //
    id = ClusterMinNodeId;
    while (id <= NmMaxNodeId) {
        if (BitsetIsMember(id, downedSet)) {
            gFmpNodeArray[id].dwNodeDownProcessingInProgress = 1;
        }
        ++id;
    }

    return(ERROR_SUCCESS);
}
VOID
FmpHandleGroupFailure(
IN PFM_GROUP Group
)
/*++
Routine Description:
Handles Group failure notifications from the resource manager. If the
Group can be moved to some other system and we are within the failover
threshold, then move it. Otherwise, just leave the Group (partially)
online on this system.
Arguments:
Group - a pointer to the Group object for the failed Group.
Returns:
None.
Notes:
Runs with the local group lock held for the duration; all early exits
release the lock before returning.
--*/
{
DWORD status;
DWORD tickCount;
DWORD withinFailoverPeriod;
DWORD failoverPeriodInMs;
BOOL newTime;
PFM_RESOURCE Resource;
PLIST_ENTRY listEntry;
FmpAcquireLocalGroupLock( Group );
// Bail out if the group has been deleted or has already been claimed by
// another node while this notification was in flight.
if ( ( !IS_VALID_FM_GROUP( Group ) ) || ( Group->OwnerNode != NmLocalNode ) ) {
FmpReleaseLocalGroupLock( Group );
return;
}
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpHandleGroupFailure, Entry: Group failure for %1!ws!...\n",
OmObjectId(Group));
//
// Convert Group's failover period from hours to milliseconds.
//
failoverPeriodInMs = Group->FailoverPeriod * (3600*1000);
//
// Get current time (in tick counts). We can save about 1193 hours worth
// of milliseconds (or almost 50 days) in one DWORD.
//
tickCount = GetTickCount();
//
// Compute boolean that indicates if we are within the failover period.
// Unsigned tick subtraction is wraparound-safe.
//
withinFailoverPeriod = ( ((tickCount - Group->FailureTime) <=
failoverPeriodInMs ) ? TRUE : FALSE);
//
// Tally another failure: increment the count inside the current window,
// or start a fresh window at the current tick.
//
if ( withinFailoverPeriod ) {
++Group->NumberOfFailures;
newTime = FALSE;
} else {
Group->FailureTime = tickCount;
Group->NumberOfFailures = 1;
newTime = TRUE;
}
//
// Tell everyone about our new FailureCount. Propagate failure
// count
//
FmpPropagateFailureCount( Group, newTime );
//
// If this group is the same as the quorum group and the quorum
// resource has failed, the cluster cannot continue: clean up the
// quorum resource and halt for inconsistency.
//
if ( ( gpQuoResource->Group == Group ) &&
( gpQuoResource->State == ClusterResourceFailed ) )
{
FmpCleanupQuorumResource(gpQuoResource);
#if DBG
if (IsDebuggerPresent())
{
DebugBreak();
}
#endif
CsInconsistencyHalt(ERROR_QUORUM_RESOURCE_ONLINE_FAILED);
}
//
// First check if we can move the Group someplace else.
//
if ( FmpGroupCanMove( Group ) &&
(Group->NumberOfFailures <= Group->FailoverThreshold) ) {
//
// Chittur Subbaraman (chitturs) - 4/13/99
//
// Now create the FmpDoMoveGroupOnFailure thread to handle the
// group move. The thread will wait until the group state becomes
// stable and then initiate the move.
//
if( !( Group->dwStructState &
FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL ) )
{
PMOVE_GROUP pContext = NULL;
DWORD dwThreadId = 0;
HANDLE hThread = NULL;
pContext = LocalAlloc( LMEM_FIXED, sizeof( MOVE_GROUP ) );
if ( pContext == NULL ) {
status = ERROR_NOT_ENOUGH_MEMORY;
ClRtlLogPrint(LOG_UNUSUAL,
"[FM] Group failure for group <%1!ws!>. Unable to allocate memory.\n",
OmObjectId(Group));
FmpReleaseLocalGroupLock( Group );
return;
}
ClRtlLogPrint(LOG_UNUSUAL,
"[FM] Group failure for group <%1!ws!>. Create thread to take offline and move.\n",
OmObjectId(Group));
//
// Reference the Group object. You don't want the group object
// to be deleted at the time the FmpDoMoveGroupOnFailure thread
// executes. The thread is responsible for the dereference.
//
OmReferenceObject( Group );
pContext->Group = Group;
pContext->DestinationNode = NULL;
hThread = CreateThread( NULL,
0,
FmpDoMoveGroupOnFailure,
pContext,
0,
&dwThreadId );
if ( hThread == NULL ) {
// Thread creation failed: undo the context allocation and
// the group reference taken above.
status = GetLastError();
ClRtlLogPrint(LOG_UNUSUAL,
"[FM] Failed to create FmpDoMoveGroupOnFailure thread for group <%1!ws!>. Error %2!u!.\n",
OmObjectId(Group),
status);
LocalFree( pContext );
OmDereferenceObject( Group );
} else {
CloseHandle( hThread );
//
// Mark the group as being moved on failure. This is necessary
// so that you don't spawn new FmpDoMoveGroupOnFailure threads
// which try to concurrently move the group. Note that the
// worker thread which calls this function may deliver multiple
// failure notifications.
//
Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL;
}
}
} else {
ClRtlLogPrint(LOG_UNUSUAL,
"[FM] Group failure for %1!ws!, but can't move. Failure count = %2!d!.\n",
OmObjectId(Group), Group->NumberOfFailures);
// All attempts to bring group online failed - start the watchdog timer
// to attempt a restart of all failed resources in this group.
for ( listEntry = Group->Contains.Flink;
listEntry != &(Group->Contains);
listEntry = listEntry->Flink )
{
Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
FmpDelayedStartRes(Resource);
}
}
FmpReleaseLocalGroupLock( Group );
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpHandleGroupFailure, Exit: Group failure for %1!ws!...\n",
OmObjectId(Group));
return;
} // FmpHandleGroupFailure
BOOL
FmpGroupCanMove(
    IN PFM_GROUP Group
    )

/*++

Routine Description:

    Indicates whether there is another system that is in the preferred owner
    list that can take a Group.

Arguments:

    Group - the Group to check if it can move.

Returns:

    TRUE - the Group can (probably) move to another system.

    FALSE - there is no place to move this Group.

--*/

{
    //
    // The group can move exactly when some other candidate node exists.
    // (The former unused local status variable has been removed.)
    //
    return( FmpFindAnotherNode( Group, FALSE ) != NULL );

} // FmpGroupCanMove
DWORD
FmpNodeDown(
PVOID Context
)
/*++
Routine Description:
This routine handles a node down event from the NM layer.
Issues a GUM update so that every surviving node reassigns the downed
node's groups; on Whistler-or-later clusters a randomized preferred
owner list is distributed with the update.
Arguments:
Context - The node that went down.
Returns:
ERROR_SUCCESS if everything was handled okay.
ERROR_SHUTDOWN_CLUSTER if catastrophy happens.
Win32 error code otherwise (???).
--*/
{
PNM_NODE pNode = (PNM_NODE)Context;
DWORD dwStatus;
LPCWSTR pszNodeId;
DWORD dwNodeLen;
DWORD dwClusterHighestVersion;
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpNodeDown::Node down %1!ws!\n",
OmObjectId(pNode));
//
// Chittur Subbaraman (chitturs) - 3/30/99
//
// Acquire the global group lock to synchronize with the shutdown
//
FmpAcquireGroupLock();
if (!FmpFMOnline || FmpShutdown)
{
//
// We don't care about membership changes until we have finished
// initializing and we're not shutting down.
//
FmpReleaseGroupLock();
ClRtlLogPrint(LOG_CRITICAL,
"[FM] FmpNodeDown - ignore node down event.\n" );
return(ERROR_SUCCESS);
}
FmpReleaseGroupLock();
//SS: Note all nodes will send this update
//The latter updates should not find any groups that belong to
//this node
//We cant rely on only the locker node making this update
//since the locker node may die before it is able to do this and
//that can result in these groups being orphaned
pszNodeId = OmObjectId(pNode);
dwNodeLen = (lstrlenW(pszNodeId)+1)*sizeof(WCHAR);
NmGetClusterOperationalVersion( &dwClusterHighestVersion,
NULL,
NULL );
//
// If this is a non Win2k-Whistler mixed mode cluster, attempt to randomize the
// group preferred owners list and send it as a part of node down GUM.
// Otherwise fall through to the pre-Whistler update below.
//
if ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) >=
NT51_MAJOR_VERSION )
{
PFM_GROUP_NODE_LIST pGroupNodeList = NULL;
//
// Attempt to get a contiguous buffer containing the list of group IDs and suggested
// owners for them.
//
dwStatus = FmpPrepareGroupNodeList( &pGroupNodeList );
if ( dwStatus != ERROR_SUCCESS )
{
//
// If the call returns ERROR_CLUSTER_INVALID_REQUEST, it means a user has turned
// off the randomization algorithm.
//
if ( dwStatus != ERROR_CLUSTER_INVALID_REQUEST )
ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns %1!u!...\n",
dwStatus);
LocalFree( pGroupNodeList );
goto use_old_gum;
}
//
// If the list does not even contain any entries, just switch to the old gum. No point in
// sending the list header around.
//
if ( pGroupNodeList->cbGroupNodeList < sizeof ( FM_GROUP_NODE_LIST ) )
{
ClRtlLogPrint(LOG_NOISE, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns empty list...\n");
LocalFree( pGroupNodeList );
goto use_old_gum;
}
//
// Invoke GUM to pass around the dead node ID and the randomized group node list
//
dwStatus = GumSendUpdateEx( GumUpdateFailoverManager,
FmUpdateUseRandomizedNodeListForGroups,
2,
dwNodeLen,
pszNodeId,
pGroupNodeList->cbGroupNodeList,
pGroupNodeList );
if ( dwStatus != ERROR_SUCCESS )
{
ClRtlLogPrint(LOG_CRITICAL,
"[FM] FmpNodeDown: GUM update FmUpdateUseRandomizedNodeListForGroups failed %1!d!\n",
dwStatus);
}
LocalFree( pGroupNodeList );
return( ERROR_SUCCESS );
}
use_old_gum:
// Pre-Whistler path: send only the downed node ID; each node assigns
// owners to the orphaned groups deterministically.
dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
FmUpdateAssignOwnerToGroups,
1,
dwNodeLen,
pszNodeId);
if (dwStatus != ERROR_SUCCESS)
{
ClRtlLogPrint(LOG_CRITICAL,
"[FM] FmpNodeDown: Gumupdate failed %1!d!\n",
dwStatus);
}
return(ERROR_SUCCESS);
} // FmpNodeDown
BOOL
WINAPI
FmVerifyNodeDown(
    IN PNM_NODE Node,
    OUT LPBOOL IsDown
    )

/*++

Routine Description:

    This routine attempts to verify whether a given node is down. This can
    only be done if there is some shared resource that the other system
    currently 'owns'. We will attempt to negotiate the shared resource and
    if we 'win' the negotiation we'll declare that other system down. If we
    loose arbitration, we declare the other system as still up.

    The current implementation performs no arbitration and always reports
    that the verification could not be carried out; Node and IsDown are
    not examined.

Arguments:

    Node - A pointer to the node structure for the other system.

    IsDown - A we can perform the verification, this indicates the results of
        that verification.

Returns:

    TRUE - If we can perform the verification.

    FALSE - If we can't perform the verification.

--*/

{
    //
    // Verification is not implemented; report inability to verify.
    //
    return FALSE;

} // FmVerifyNodeDown
DWORD
FmpHandleNodeDownEvent(
    IN PVOID pContext
    )

/*++

Routine Description:

    This function creates a thread to handle the node down event.

Arguments:

    pContext - Pointer to the context structure (the downed node).

Returns:

    ERROR_SUCCESS

--*/

{
    HANDLE  hThread = NULL;
    DWORD   dwThreadId;
    DWORD   dwError;

    //
    // Chittur Subbaraman (chitturs) - 7/31/99
    //
    // Create a thread to handle the FM node down event. Let us not
    // rely on the FM worker thread to handle this. This is because
    // the FM worker thread could be trying to online some resource
    // and that could get stuck for some time since the quorum resource
    // is not online. Now in some cases, only after the node down event
    // is processed the quorum resource could come online. (This is
    // highly likely especially in a 2 node cluster.)
    //
    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpHandleNodeDownEvent - Create thread to handle node down event....\n"
        );

    hThread = CreateThread( NULL,
                            0,
                            FmpNodeDown,
                            pContext,
                            0,
                            &dwThreadId );

    if ( hThread == NULL )
    {
        dwError = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
            "[FM] FmpHandleNodeDownEvent - Unable to create thread to handle node down event. Error=0x%1!08lx!\r\n",
            dwError);
        CsInconsistencyHalt( dwError );
    }
    else
    {
        //
        // Only close the handle on the success path. The original code
        // called CloseHandle unconditionally, passing NULL after a
        // CreateThread failure.
        //
        CloseHandle( hThread );
    }

    return( ERROR_SUCCESS );

} // FmpHandleNodeDownEvent