windows-nt/Source/XPSP1/NT/base/cluster/service/fm/group.c
2020-09-26 16:20:57 +08:00

5306 lines
153 KiB
C
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*++
Copyright (c) 1996 Microsoft Corporation
Module Name:
group.c
Abstract:
Cluster group management routines.
Author:
Rod Gamache (rodga) 8-Mar-1996
Notes:
WARNING: All of the routines in this file assume that the group
lock is held when they are called.
Revision History:
--*/
#include "fmp.h"
#define LOG_MODULE GROUP
//
// Global Data
//
// NOTE(review): presumably the FM-wide lock protecting the set of groups;
// confirm against the code that initializes and enters it.
//
CRITICAL_SECTION FmpGroupLock;
//
// Local function prototypes
//
/////////////////////////////////////////////////////////////////////////////
//
// Group Management Routines
//
/////////////////////////////////////////////////////////////////////////////
BOOL
FmpInPreferredList(
    IN PFM_GROUP Group,
    IN PNM_NODE Node,
    IN BOOL bRecalc,
    IN PFM_RESOURCE pRefResource
    )
/*++
Routine Description:
    Check if a node is in the preferred owners list for the Group.

    If the node is not found and bRecalc is TRUE, the routine checks whether
    the group has a preferred owners value stored in the cluster database.
    When no such value exists (the query returns ERROR_FILE_NOT_FOUND), each
    resource in the group whose possible node list has not been modified by
    the user is re-examined: if the node can now host the resource type but
    is not yet a possible node for the resource, it is added through
    FmChangeResourceNode. The preferred list is then scanned one more time.

Arguments:
    Group - Pointer to the group object with the preferred owners list.
    Node - The Node to check for.
    bRecalc - If set to TRUE, a miss triggers the recalculation described
        above followed by a single rescan of the preferred list.
    pRefResource - Not referenced by this routine; every resource in the
        group is always examined during recalculation.

Return Value:
    TRUE - if the node is in the list.
    FALSE - if the node is NOT in the list, or the group key could not be
        opened during recalculation.
--*/
{
    PLIST_ENTRY         listEntry;
    PPREFERRED_ENTRY    preferredEntry;
    BOOL                bRet = FALSE;

    //
    // Walk the preferred list looking for the node. We come back here once
    // more after a recalculation (with bRecalc cleared).
    //
ChkInPrefList:
    for ( listEntry = Group->PreferredOwners.Flink;
          listEntry != &(Group->PreferredOwners);
          listEntry = listEntry->Flink ) {
        preferredEntry = CONTAINING_RECORD( listEntry,
                                            PREFERRED_ENTRY,
                                            PreferredLinkage );
        if ( preferredEntry->PreferredNode == Node ) {
            return(TRUE);
        }
    }

    if (bRecalc)
    {
        PFM_RESOURCE    pResource;
        DWORD           dwStatus;
        LPWSTR          lpszOwners = NULL;
        DWORD           dwMaxSize=0;
        HDMKEY          hGroupKey;
        DWORD           dwSize = 0;

        hGroupKey = DmOpenKey(DmGroupsKey, OmObjectId(Group),
                        KEY_READ);
        if (hGroupKey == NULL)
        {
            dwStatus = GetLastError();
            //
            // Report the failing status; the previous format string had no
            // insert for it and named the wrong routine.
            //
            ClRtlLogPrint(LOG_CRITICAL,
                "[FM] FmpInPreferredList: Couldnt open group key, status %1!u!\r\n",
                dwStatus);
            CL_UNEXPECTED_ERROR(dwStatus);
            goto FnExit;
        }

        //
        // The group preferred list must not be set by the user. If it is,
        // there is no point in doing this recalculation. Only the query
        // status matters here, so the returned buffer is freed right away.
        //
        dwStatus = DmQueryMultiSz( hGroupKey,
                                   CLUSREG_NAME_GRP_PREFERRED_OWNERS,
                                   &lpszOwners,
                                   &dwMaxSize,
                                   &dwSize );
        if (lpszOwners)
            LocalFree(lpszOwners);
        DmCloseKey(hGroupKey);

        if (dwStatus == ERROR_FILE_NOT_FOUND)
        {
            DWORD dwUserModified;

            for (listEntry = Group->Contains.Flink;
                 listEntry != &(Group->Contains);
                 listEntry = listEntry->Flink)
            {
                pResource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);

                //
                // The resource possible node list must not be set by the
                // user. If it is, this resource is skipped.
                //
                dwStatus = DmQueryDword( pResource->RegistryKey,
                                         CLUSREG_NAME_RES_USER_MODIFIED_POSSIBLE_LIST,
                                         &dwUserModified,
                                         NULL );
                if (dwStatus == ERROR_FILE_NOT_FOUND)
                {
                    FmpSetPossibleNodeForResType(OmObjectId(pResource->Type),
                        TRUE);
                    if (FmpInPossibleListForResType(pResource->Type,Node) &&
                        !FmpInPossibleListForResource(pResource, Node))
                    {
                        //
                        // Add the node to the resource possible node list.
                        // Per the original author's note this is expected to
                        // add the node to the group's preferred list as
                        // well, which is why the list is rescanned below.
                        //
                        FmChangeResourceNode(pResource, Node, TRUE);
                    }
                }
            }

            //
            // Clear bRecalc so that the recalculation is done at most once.
            //
            bRecalc = FALSE;
            goto ChkInPrefList;
        }
    }

FnExit:
    return(bRet);
} // FmpInPreferredList
BOOL
FmpHigherInPreferredList(
    IN PFM_GROUP Group,
    IN PNM_NODE Node1,
    IN PNM_NODE Node2
    )
/*++
Routine Description:
    Check if Node1 is higher (in priority) in the preferred owners list than
    Node2. Only the first Group->OrderedOwners entries of the list are
    considered.

Arguments:
    Group - Pointer to the group object with the preferred owners list.
    Node1 - The Node that should be higher in the list.
    Node2 - The Node that should be lower in the list.

Return Value:
    TRUE - if Node1 is found before Node2 among the ordered owners.
    FALSE - if Node2 is found first, or neither node is found among the
        ordered owners.
--*/
{
    PLIST_ENTRY         listEntry;
    PPREFERRED_ENTRY    preferredEntry;
    DWORD               orderedOwners = 0;

    //
    // For each ordered entry in the Preferred list, check whether Node1 or
    // Node2 is higher.
    //
    // Both conditions must hold for the walk to continue. The original code
    // joined them with the comma operator, which discards the end-of-list
    // test: if OrderedOwners exceeded the number of entries, the walk ran
    // past the list head and treated the head as a PREFERRED_ENTRY.
    //
    for ( listEntry = Group->PreferredOwners.Flink;
          (listEntry != &(Group->PreferredOwners)) &&
              (orderedOwners < Group->OrderedOwners);
          listEntry = listEntry->Flink ) {
        preferredEntry = CONTAINING_RECORD( listEntry,
                                            PREFERRED_ENTRY,
                                            PreferredLinkage );
        if ( preferredEntry->PreferredNode == Node1 ) {
            return(TRUE);
        }
        if ( preferredEntry->PreferredNode == Node2 ) {
            return(FALSE);
        }
        orderedOwners++;
    }

    return(FALSE);
} // FmpHigherInPreferredList
DWORD
FmpSetPreferredEntry(
    IN PFM_GROUP Group,
    IN PNM_NODE Node
    )
/*++
Routine Description:
    Append a node to the tail of the group's preferred owners list, unless
    the node is already on that list.

Arguments:
    Group - The group whose preferred owners list is updated.
    Node - The node to append. A reference is taken on the node object when
        a new entry is created for it.

Return Value:
    ERROR_SUCCESS if the node was added or was already present.
    ERROR_NOT_ENOUGH_MEMORY if the list entry could not be allocated.
--*/
{
    PPREFERRED_ENTRY pNewEntry;

    //
    // Only do work when the node is missing from the list. The lookup is
    // done without recalculation.
    //
    if ( !FmpInPreferredList( Group, Node, FALSE, NULL ) ) {

        pNewEntry = LocalAlloc( LMEM_FIXED, sizeof(PREFERRED_ENTRY) );
        if ( pNewEntry == NULL ) {
            ClRtlLogPrint( LOG_ERROR,
                           "[FM] Error allocating preferred owner entry for group %1!ws!. Stopped adding.\n",
                           OmObjectId(Group));
            return(ERROR_NOT_ENOUGH_MEMORY);
        }

        //
        // The entry holds a reference on the node for as long as it is
        // linked into the list.
        //
        OmReferenceObject( Node );
        pNewEntry->PreferredNode = Node;
        InsertTailList( &Group->PreferredOwners,
                        &pNewEntry->PreferredLinkage );
    }

    return(ERROR_SUCCESS);
} // FmpSetPreferredEntry
BOOL FmpFindNodeThatMightBeAddedToPrefList(
    IN PFM_GROUP pGroup,
    IN PNM_NODE *pDestNode,
    IN PVOID pNode,
    IN LPCWSTR szName)
/*++
Routine Description:
    Node enumeration callback (passed to OmEnumObjects by
    FmpFindAnotherNode). Stops the enumeration at the first node that is
    not the local node, is up, and is in the group's preferred list once
    that list has been recalculated.

Arguments:
    pGroup - The group whose preferred list is consulted.
    pDestNode - Receives the matching node, or NULL when this node does
        not qualify.
    pNode - The node currently being enumerated.
    szName - Not referenced.

Return Value:
    TRUE - continue the enumeration.
    FALSE - stop; *pDestNode holds the node that was found.
--*/
{
    *pDestNode = NULL;

    // The local node is never a candidate.
    if (pNode == NmLocalNode)
    {
        return(TRUE);
    }

    // Neither is a node that is not up.
    if (NmGetNodeState(pNode) != ClusterNodeUp)
    {
        return(TRUE);
    }

    // Ask for a recalculating lookup; a miss means keep enumerating.
    if (!FmpInPreferredList(pGroup, pNode, TRUE, NULL))
    {
        return(TRUE);
    }

    *pDestNode = pNode;
    return(FALSE);
}
PNM_NODE
FmpFindAnotherNode(
    IN PFM_GROUP Group,
    IN BOOL bChooseMostPreferredNode
    )
/*++
Routine Description:
    Check if another node is up that can take the group.

    The candidates are tried in this order:
      1. The node picked by FmpGetNodeNotHostingUndesiredGroups with the
         local node ruled out.
      2. If bChooseMostPreferredNode is TRUE, the first non-local preferred
         node that is up, passed through FmpPickNodeFromPreferredListAtRandom.
         Otherwise, the first up non-local node that follows the local node
         in the preferred list, falling back to the first up non-local node
         in the list.
      3. If nothing was found and the group has no preferred owners value in
         the cluster database, the nodes are enumerated with
         FmpFindNodeThatMightBeAddedToPrefList.

Arguments:
    Group - Pointer to the group object we're checking.
    bChooseMostPreferredNode - Whether to choose the most preferred node or not.

Return Value:
    Pointer to node object that the group can move to.
    NULL if another system is not found.
--*/
{
    PLIST_ENTRY         listEntry;
    PPREFERRED_ENTRY    preferredEntry;
    PNM_NODE            first = NULL;
    BOOLEAN             flag = FALSE;   // TRUE once the local node has been passed

    //
    // First, let us give the anti-affinity algorithm a shot at picking the node.
    //
    first = FmpGetNodeNotHostingUndesiredGroups ( Group,
                                                  TRUE ); // Rule out local node
    if ( first != NULL )
    {
        goto FnExit;
    }

    //
    // For each entry in the Preferred list, find a system (other than the
    // local system that is up).
    //
    if ( bChooseMostPreferredNode )
    {
        first = FmpGetNonLocalPreferredNode( Group );

        //
        // In this case in which you are doing a user-initiated move, give the randomized
        // preferred list algorithm a chance to pick the node. Note that if the randomized
        // algorithm could not pick a node, it will return the supplied suggested node itself.
        //
        if ( first != NULL )
        {
            first = FmpPickNodeFromPreferredListAtRandom ( Group,
                                                           first,   // Suggested default
                                                           TRUE,    // Dont choose local node
                                                           TRUE );  // Check whether randomization
                                                                    // should be disabled
        }
    }
    else
    {
        for ( listEntry = Group->PreferredOwners.Flink;
              listEntry != &(Group->PreferredOwners);
              listEntry = listEntry->Flink ) {
            preferredEntry = CONTAINING_RECORD( listEntry,
                                                PREFERRED_ENTRY,
                                                PreferredLinkage );
            if ( (preferredEntry->PreferredNode != NmLocalNode) &&
                 (NmGetExtendedNodeState(preferredEntry->PreferredNode) == ClusterNodeUp) ) {
                //
                // An up node after the local node wins immediately; an up
                // node before it is only remembered as the fallback.
                //
                if (flag == TRUE)
                    return(preferredEntry->PreferredNode);
                else if (first == NULL)
                    first = preferredEntry->PreferredNode;
            } else if (preferredEntry->PreferredNode == NmLocalNode) {
                flag = TRUE;
            }
        }
    }

    //
    // If we couldnt find a node, we retry since the user might have
    // expanded the possible node list for the resource type since then.
    // If the group preferred list is not set by the user, it is recalculated
    // because it could have grown as a result.
    //
    if (first == NULL)
    {
        LPWSTR          lpszOwners = NULL;
        DWORD           dwMaxSize=0;
        HDMKEY          hGroupKey;
        DWORD           dwSize = 0;
        DWORD           dwStatus;

        hGroupKey = DmOpenKey(DmGroupsKey, OmObjectId(Group),
                        KEY_READ);
        if (hGroupKey == NULL)
        {
            dwStatus = GetLastError();
            //
            // Report the failing status under this routine's own name; the
            // previous message was copied from FmpInPreferredList and had
            // no insert for the status.
            //
            ClRtlLogPrint(LOG_CRITICAL,
                "[FM] FmpFindAnotherNode: Couldnt open group key, status %1!u!\r\n",
                dwStatus);
            CL_UNEXPECTED_ERROR(dwStatus);
            goto FnExit;
        }

        //
        // The group preferred list must not be set by the user. If it is,
        // there is no point in doing this recalculation. Only the query
        // status matters, so the returned buffer is freed right away.
        //
        dwStatus = DmQueryMultiSz( hGroupKey,
                                   CLUSREG_NAME_GRP_PREFERRED_OWNERS,
                                   &lpszOwners,
                                   &dwMaxSize,
                                   &dwSize );
        if (lpszOwners)
            LocalFree(lpszOwners);
        DmCloseKey(hGroupKey);

        if (dwStatus == ERROR_FILE_NOT_FOUND)
        {
            OmEnumObjects(ObjectTypeNode, FmpFindNodeThatMightBeAddedToPrefList,
                Group, &first);
        }
    }

FnExit:
    return(first);
} // FmpFindAnotherNode
PNM_NODE
FmpGetPreferredNode(
    IN PFM_GROUP Group
    )
/*++
Routine Description:
    Pick the best node to host the group. The local node is a valid answer.

Arguments:
    Group - The group a host is wanted for.

Return Value:
    The node chosen by FmpGetNodeNotHostingUndesiredGroups if it picks one;
    otherwise the first node in the group's preferred owners list that is
    up. NULL if no such node exists.
--*/
{
    PLIST_ENTRY     pLink;
    PNM_NODE        pCandidate;
    PNM_NODE        pChosen;

    //
    // The anti-affinity algorithm gets the first say (local node allowed).
    //
    pChosen = FmpGetNodeNotHostingUndesiredGroups ( Group, FALSE );

    if ( pChosen == NULL )
    {
        //
        // Fall back to the first preferred owner that is up.
        //
        pLink = Group->PreferredOwners.Flink;
        while ( pLink != &(Group->PreferredOwners) ) {
            pCandidate = CONTAINING_RECORD( pLink,
                                            PREFERRED_ENTRY,
                                            PreferredLinkage )->PreferredNode;
            if ( NmGetNodeState(pCandidate) == ClusterNodeUp ) {
                pChosen = pCandidate;
                break;
            }
            pLink = pLink->Flink;
        }
    }

    return(pChosen);
} // FmpGetPreferredNode
PNM_NODE
FmpGetNonLocalPreferredNode(
    IN PFM_GROUP Group
    )
/*++
Routine Description:
    Return the first node in the group's preferred owners list that is up
    and is not the local node.

Arguments:
    Group - The group whose preferred owners list is searched.

Return Value:
    The node found, or NULL if the list holds no such node.
--*/
{
    PLIST_ENTRY     pLink = Group->PreferredOwners.Flink;
    PNM_NODE        pCandidate;

    while ( pLink != &(Group->PreferredOwners) ) {
        pCandidate = CONTAINING_RECORD( pLink,
                                        PREFERRED_ENTRY,
                                        PreferredLinkage )->PreferredNode;

        //
        // The local node is skipped without querying its state.
        //
        if ( (pCandidate != NmLocalNode) &&
             (NmGetNodeState(pCandidate) == ClusterNodeUp) ) {
            return(pCandidate);
        }

        pLink = pLink->Flink;
    }

    return(NULL);
} // FmpGetNonLocalPreferredNode
BOOL
FmpIsGroupQuiet(
    IN PFM_GROUP Group,
    IN CLUSTER_GROUP_STATE WantedState
    )
/*++
Routine Description:
    Decide whether the group is free of pending activity that would conflict
    with moving it to WantedState.

Arguments:
    Group - The group to examine.
    WantedState - The state the caller wants the group to reach.

Return Value:
    FALSE - if the group has a moving list, or any contained resource is in
        a conflicting state:
          WantedState ClusterGroupOnline  - a resource is offline pending.
          WantedState ClusterGroupOffline - a resource is online pending.
          any other WantedState           - a resource state is at or above
                                            ClusterResourcePending.
    TRUE otherwise.
--*/
{
    PLIST_ENTRY     pLink;
    PFM_RESOURCE    pRes;
    BOOL            bConflict;

    if ( Group->MovingList != NULL ) {
        return(FALSE);
    }

    //
    // Examine every resource contained in the group.
    //
    pLink = Group->Contains.Flink;
    while ( pLink != &(Group->Contains) ) {
        pRes = CONTAINING_RECORD(pLink, FM_RESOURCE, ContainsLinkage);

        if ( WantedState == ClusterGroupOnline ) {
            // going online: an offline-pending resource is in the way
            bConflict = ( pRes->State == ClusterResourceOfflinePending );
        } else if ( WantedState == ClusterGroupOffline ) {
            // going offline: an online-pending resource is in the way
            bConflict = ( pRes->State == ClusterResourceOnlinePending );
        } else {
            // anything else: any pending state is in the way
            bConflict = ( pRes->State >= ClusterResourcePending );
        }

        if ( bConflict ) {
            return(FALSE);
        }

        pLink = pLink->Flink;
    }

    return(TRUE);
} // FmpIsGroupQuiet
VOID
FmpSetGroupPersistentState(
    IN PFM_GROUP Group,
    IN CLUSTER_GROUP_STATE State
    )
/*++
Routine Description:
    Record a new persistent state for the group, both in the group object
    and, for the online and offline states, in the group's registry key.
    Nothing is done when gbIsQuoResEnoughSpace is FALSE or when the state is
    unchanged.

Arguments:
    Group - The group to update.
    State - The new persistent state.

Returns:
    None. The result of the registry write is not checked.

Notes:
    The local group lock is acquired and released by this routine.
--*/
{
    LPWSTR  pszValueName = CLUSREG_NAME_GRP_PERSISTENT_STATE;
    DWORD   dwRegValue;

    if (!gbIsQuoResEnoughSpace)
        return;

    FmpAcquireLocalGroupLock( Group );

    if ( Group->PersistentState != State ) {
        Group->PersistentState = State;
        CL_ASSERT( Group->RegistryKey != NULL );

        //
        // Only online (1) and offline (0) are written to the registry; any
        // other state is kept in memory only.
        //
        if ( (State == ClusterGroupOnline) ||
             (State == ClusterGroupOffline) ) {
            dwRegValue = ( State == ClusterGroupOnline ) ? 1 : 0;
            DmSetValue( Group->RegistryKey,
                        pszValueName,
                        REG_DWORD,
                        (LPBYTE)&dwRegValue,
                        sizeof(DWORD) );
        }
    }

    FmpReleaseLocalGroupLock( Group );
} // FmpSetGroupPersistentState
DWORD
FmpOnlineGroup(
    IN PFM_GROUP Group,
    IN BOOL ForceOnline
    )
/*++
Routine Description:
    Bring the specified group online. This means bringing all of the
    individual resources contained within the group online. A failure on
    one resource does not stop the remaining resources from being
    attempted; the last failing status is what gets returned.

Arguments:
    Group - Supplies a pointer to the group structure to bring online.
    ForceOnline - TRUE if all resources in the Group should be forced online.

Returns:
    ERROR_SUCCESS if every resource came online (a resource reporting
        ERROR_NODE_CANT_HOST_RESOURCE is not counted as a failure).
    ERROR_IO_PENDING if nothing failed but at least one resource is pending.
    ERROR_HOST_NODE_NOT_RESOURCE_OWNER if this is not the quorum group and
        the local node is not the owner or is not in the preferred list.
    ERROR_INVALID_STATE if the group is not quiet.
    Otherwise the status of the last resource that failed.
--*/
{
    DWORD           status, retstatus = ERROR_SUCCESS;
    PLIST_ENTRY     listEntry;
    PFM_RESOURCE    Resource;
    BOOL            bPending = FALSE;

    //
    // OmObjectId yields a wide string, so the owner id must be formatted
    // with !ws!; the previous !d! insert printed the pointer value.
    //
    ClRtlLogPrint(LOG_NOISE,
               "[FM] OnlineGroup for %1!ws! owner %2!ws!\n",
               OmObjectId(Group), OmObjectId(Group->OwnerNode));

    FmpAcquireLocalGroupLock( Group );

    //
    // Check if we are the owner... if not, return failure. The group that
    // holds the quorum resource is exempt from this check.
    //
    if ( gpQuoResource->Group != Group &&
         ((Group->OwnerNode != NmLocalNode) ||
          !FmpInPreferredList( Group, Group->OwnerNode, TRUE, NULL) ) ) {
        FmpReleaseLocalGroupLock( Group );
        return(ERROR_HOST_NODE_NOT_RESOURCE_OWNER);
    }

    //
    // Make sure the group is quiet
    //
    if ( !FmpIsGroupQuiet( Group, ClusterGroupOnline ) ) {
        FmpReleaseLocalGroupLock( Group );
        return(ERROR_INVALID_STATE);
    }

    //
    // If the quorum resource is in this group bring it online first.
    // This is called when a node goes down and its groups are
    // being reclaimed, the order in which the resources are brought
    // online is important.
    //
    if ( gpQuoResource->Group == Group)
    {
        //
        // SS:: if the quorum resource is in the group, it must be
        // brought online irrespective of the persistent state
        // so we will pass in true here.
        // Apps can mess with persistent state via the common
        // properties and then cause havoc so we need to force the
        // quorum resource online despite that.
        //
        status = FmpDoOnlineResource( gpQuoResource,
                                      TRUE );
        if ( (status != ERROR_SUCCESS) &&
             (status != ERROR_IO_PENDING) ) {
            //
            // Logged and flagged, but the remaining resources are still
            // attempted below.
            //
            ClRtlLogPrint(LOG_NOISE,
                       "[FM] OnlineGroup: Failed on resource %1!ws!. Status %2!u!\n",
                       OmObjectId(gpQuoResource),
                       status);
            CL_UNEXPECTED_ERROR(status);
        }
    }

    //
    // Bring online all of the resources contained within this group.
    //
    for ( listEntry = Group->Contains.Flink;
          listEntry != &(Group->Contains);
          listEntry = listEntry->Flink ) {
        Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
        status = FmpDoOnlineResource( Resource,
                                      ForceOnline );
        if (status == ERROR_IO_PENDING) {
            bPending = TRUE;
        }
        if ( (status != ERROR_SUCCESS) &&
             (status != ERROR_NODE_CANT_HOST_RESOURCE) &&
             (status != ERROR_IO_PENDING) ) {
            ClRtlLogPrint(LOG_NOISE,
                       "[FM] OnlineGroup: Failed on resource %1!ws!. Status %2!u!\n",
                       OmObjectId(Resource),
                       status);
            retstatus = status;
        }
    }

    //
    // Normally bringing the resources online propagates the group state,
    // but in order to get the state right for a group with no resources,
    // manually propagate the state here.
    //
    FmpPropagateGroupState(Group);

    ClRtlLogPrint(LOG_NOISE,
               "[FM] OnlineGroup: setting group state to Online for %1!ws!\n",
               OmObjectId(Group));

    FmpReleaseLocalGroupLock( Group );

    //
    // A real failure takes precedence over a pending indication.
    //
    if (retstatus == ERROR_SUCCESS) {
        if (bPending) {
            retstatus = ERROR_IO_PENDING;
        }
    }

    return(retstatus);
} // FmpOnlineGroup
DWORD
FmpOfflineGroup(
IN PFM_GROUP Group,
IN BOOL OfflineQuorum,
IN BOOL SetPersistent
)
/*++
Routine Description:
Bring the specified group offline. This means bringing all of the
individual resources contained within the group offline. The quorum
resource, if the group contains it, is handled last.
Arguments:
Group - Supplies a pointer to the group structure to bring offline.
OfflineQuorum - TRUE if any quorum resource in this group should
be taken offline. FALSE if the quorum resource should be left online.
SetPersistent - TRUE if the persistent state of each non-quorum resource
should be set to offline, and the group state propagated afterwards.
Returns:
ERROR_SUCCESS if the request was successful.
ERROR_IO_PENDING if at least one resource is still going offline.
ERROR_GROUP_NOT_AVAILABLE if the group fails the IS_VALID_FM_GROUP check.
ERROR_HOST_NODE_NOT_RESOURCE_OWNER if the local node does not own the group.
ERROR_INVALID_STATE if the group is not quiet.
ERROR_RESOURCE_NOT_FOUND if a listed resource can no longer be referenced.
ERROR_QUORUM_RESOURCE if the group contains the quorum resource and
OfflineQuorum is FALSE (this replaces a success or pending status).
Any other Win32 error code returned while listing or offlining resources.
--*/
{
DWORD status;
// NOTE(review): listEntry is declared but never used in this routine.
PLIST_ENTRY listEntry;
PFM_RESOURCE Resource;
DWORD returnStatus = ERROR_SUCCESS;
PRESOURCE_ENUM ResourceEnum=NULL;
DWORD i;
FmpAcquireLocalGroupLock( Group );
//if the group has been marked for delete, then fail this call
if (!IS_VALID_FM_GROUP(Group))
{
FmpReleaseLocalGroupLock( Group);
return (ERROR_GROUP_NOT_AVAILABLE);
}
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpOfflineGroup, Group=%1!ws!\n",
OmObjectId(Group));
//
// Check if we are the owner... if not, return failure.
//
if ( Group->OwnerNode != NmLocalNode ) {
returnStatus = ERROR_HOST_NODE_NOT_RESOURCE_OWNER;
goto error_exit;
}
//
// Make sure the group is quiet
//
if ( !FmpIsGroupQuiet( Group, ClusterGroupOffline ) ) {
returnStatus = ERROR_INVALID_STATE;
goto error_exit;
}
//
// Get the list of resources in the group and their states.
// From here on the resources are looked up by id from this snapshot
// rather than by walking the group's Contains list.
//
returnStatus = FmpGetResourceList( &ResourceEnum, Group );
if ( returnStatus != ERROR_SUCCESS ) {
goto error_exit;
}
// offline all resources except the quorum resource
for ( i = 0; i < ResourceEnum->EntryCount; i++ ) {
Resource = OmReferenceObjectById( ObjectTypeResource,
ResourceEnum->Entry[i].Id );
if ( Resource == NULL ) {
returnStatus = ERROR_RESOURCE_NOT_FOUND;
goto error_exit;
}
//quorum resource is brought offline last
if (Resource->QuorumResource)
{
OmDereferenceObject(Resource);
continue;
}
if (SetPersistent) {
FmpSetResourcePersistentState( Resource, ClusterResourceOffline );
}
status = FmpOfflineResource( Resource, FALSE);
// The reference taken above is dropped before the status is examined,
// so every exit from this iteration leaves the reference count balanced.
OmDereferenceObject( Resource );
if ( (status != ERROR_SUCCESS) &&
(status != ERROR_IO_PENDING) ) {
returnStatus = status;
goto error_exit;
}
// Remember that something is pending, but keep going with the rest.
if ( status == ERROR_IO_PENDING ) {
returnStatus = ERROR_IO_PENDING;
}
}
// bring the quorum resource offline now, if asked to bring quorum offline
// This allows other resources to come offline and save their checkpoints
// The quorum resource offline should block till the resources have
// finished saving the checkpoint
// A non-negative ContainsQuorum is used below as the index of the quorum
// resource within the enumeration.
if (ResourceEnum->ContainsQuorum >= 0)
{
if (!OfflineQuorum)
{
//if the quorum resource should not be taken offline
//(this overrides whatever status the loop above produced)
returnStatus = ERROR_QUORUM_RESOURCE;
}
else if (returnStatus == ERROR_SUCCESS)
{
// Only reached when no other resource failed or is still pending.
CL_ASSERT((DWORD)ResourceEnum->ContainsQuorum < ResourceEnum->EntryCount);
Resource = OmReferenceObjectById( ObjectTypeResource,
ResourceEnum->Entry[ResourceEnum->ContainsQuorum].Id );
if ( Resource == NULL ) {
returnStatus = ERROR_RESOURCE_NOT_FOUND;
goto error_exit;
}
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpOfflineGroup: Bring quorum resource offline\n");
// A quorum resource flagged RESOURCE_WAITING is left alone.
if ( !(Resource->Flags & RESOURCE_WAITING) ) {
// Any state other than offline is overwritten with online before the
// offline request is issued.
// NOTE(review): presumably so FmpOfflineResource accepts the request - confirm.
if (Resource->State != ClusterResourceOffline) {
Resource->State = ClusterResourceOnline; // [HACKHACK]
}
status = FmpOfflineResource( Resource , FALSE);
OmDereferenceObject( Resource );
if ( (status != ERROR_SUCCESS) &&
(status != ERROR_IO_PENDING) )
{
returnStatus = status;
goto error_exit;
}
if ( status == ERROR_IO_PENDING )
returnStatus = ERROR_IO_PENDING;
} else {
OmDereferenceObject( Resource );
}
}
}
//
// Normally bringing the resources offline propagates the group state,
// but in order to get the state right for a group with no resources,
// manually propagate the state here. This is only done when the caller
// asked for persistent state to be updated.
//
if (SetPersistent)
FmpPropagateGroupState(Group);
error_exit:
// Common exit: drop the group lock first, then free the enumeration if
// one was obtained.
FmpReleaseLocalGroupLock( Group );
if (ResourceEnum)
FmpDeleteResourceEnum( ResourceEnum );
return(returnStatus);
} // FmpOfflineGroup
CLUSTER_GROUP_STATE
FmpGetGroupState(
IN PFM_GROUP Group,
IN BOOL IsNormalized
)
/*++
Routine Description:
Get the Group state, either normalized to ClusterGroupOnline or
ClusterGroupOffline or not normalized. The state is derived from the
states of the resources contained in the group; for a group with no
resources the group's PersistentState is returned.
Arguments:
Group - The Group we're interested in.
IsNormalized - Should the Group state be normalized ?
Returns:
The current Group state which is one of (in increasing order of
precedence)
ClusterGroupOnline, ClusterGroupOffline
ClusterGroupPartialOnline
ClusterGroupPending (only if IsNormalized is FALSE)
ClusterGroupFailed (only if IsNormalized is FALSE)
Notes:
The local group lock is acquired and released by this routine.
--*/
{
PLIST_ENTRY listEntry;
PFM_RESOURCE resource;
CLUSTER_GROUP_STATE state;
CLUSTER_RESOURCE_STATE firstResourceState;
CLUSTER_RESOURCE_STATE resourceState;
// Chittur Subbaraman (chitturs) - 09/16/98 (Modified this function
// to work with IsNormalized flag)
FmpAcquireLocalGroupLock( Group );
if ( !IsListEmpty(&Group->Contains) ) {
listEntry = Group->Contains.Flink;
resource = CONTAINING_RECORD(listEntry,
FM_RESOURCE,
ContainsLinkage);
//
// Get the first resource's state. Every other resource is compared
// against it to detect a partially online group.
//
firstResourceState = resource->State;
if ( IsNormalized == FALSE ) {
BOOL IsPending = FALSE;
BOOL IsPartialOnline = FALSE;
//
// First check whether any resource in the group has
// failed. If so, set the group state to ClusterGroupFailed
// and exit immediately. If no resource in the group has
// failed, but at least one of them is in the pending state,
// then set the group state to ClusterGroupPending and exit
// immediately. If no resource in the group is in either
// the failed or in the pending state, then check whether
// some resources in the group are in online and some in the
// offline state. Then, set the group state to
// ClusterGroupPartialOnline and exit immediately.
//
// The walk starts at the first resource (listEntry was set above).
//
for ( ;
listEntry != &(Group->Contains);
listEntry = listEntry->Flink ) {
resource = CONTAINING_RECORD(listEntry,
FM_RESOURCE,
ContainsLinkage);
resourceState = resource->State;
if ( resourceState == ClusterResourceFailed ) {
state = ClusterGroupFailed;
//
// This state has the highest precedence, so
// exit immediately.
//
goto FnExit;
} else if ( (resourceState == ClusterResourceOnlinePending) ||
(resourceState == ClusterResourceOfflinePending) ) {
IsPending = TRUE;
} else {
CL_ASSERT( (resourceState == ClusterResourceOffline) ||
(resourceState == ClusterResourceOnline) ||
(resourceState == ClusterResourceInitializing) );
if ( resourceState == ClusterResourceInitializing ) {
//
// Normalize this state to offline state
//
resourceState = ClusterResourceOffline;
}
if ( firstResourceState == ClusterResourceInitializing ) {
//
// Normalize this state to offline state
//
firstResourceState = ClusterResourceOffline;
}
if ( firstResourceState != resourceState ) {
IsPartialOnline = TRUE;
}
}
}
if ( IsPending == TRUE ) {
state = ClusterGroupPending;
//
// This state has the next highest precedence after
// ClusterGroupFailed state
//
goto FnExit;
}
if ( IsPartialOnline == TRUE ) {
state = ClusterGroupPartialOnline;
//
// This state has the next highest precedence after
// ClusterGroupFailed and ClusterGroupPending states
//
goto FnExit;
}
if ( firstResourceState == ClusterResourceOnline ) {
state = ClusterGroupOnline;
//
// If the first resource is in an online state,
// then the group state should be online.
//
goto FnExit;
}
if ( firstResourceState == ClusterResourceOffline ) {
state = ClusterGroupOffline;
//
// If the first resource is in an offline state,
// then the group state should be offline.
//
goto FnExit;
}
}
//
// Normalized evaluation. Control normally gets here only if
// IsNormalized is TRUE; the non-normalized path above would fall
// through to here only if the first resource were in a state that
// none of its checks handle.
//
if ( (firstResourceState == ClusterResourceOnline) ||
(firstResourceState == ClusterResourceOnlinePending) ) {
state = ClusterGroupOnline;
firstResourceState = ClusterResourceOnline;
} else {
CL_ASSERT( (firstResourceState == ClusterResourceOffline) ||
(firstResourceState == ClusterResourceFailed) ||
(firstResourceState == ClusterResourceOfflinePending) ||
(firstResourceState == ClusterResourceInitializing) );
state = ClusterGroupOffline;
firstResourceState = ClusterResourceOffline;
}
//
// Now check each resource to see if they match the first.
//
for (listEntry = Group->Contains.Flink;
listEntry != &(Group->Contains);
listEntry = listEntry->Flink ) {
resource = CONTAINING_RECORD(listEntry,
FM_RESOURCE,
ContainsLinkage);
resourceState = resource->State;
//
// Normalize pending states to their final state, and Failed and Initializing
// to Offline.
//
if ( resourceState == ClusterResourceOnlinePending ) {
resourceState = ClusterResourceOnline;
} else if ( (resourceState == ClusterResourceOfflinePending) ||
(resourceState == ClusterResourceFailed) ||
(resourceState == ClusterResourceInitializing) ) {
resourceState = ClusterResourceOffline;
}
//
// We only need 1 resource that is not the same as the first resource
// to be in a partially online state.
//
if ( firstResourceState != resourceState ) {
state = ClusterGroupPartialOnline;
break;
}
}
} else {
//
// The group is empty, so there are no resource states to combine;
// report the group's persistent state instead.
//
state = Group->PersistentState;
}
FnExit:
FmpReleaseLocalGroupLock( Group );
return(state);
} // FmpGetGroupState
DWORD
FmpPropagateGroupState(
    IN PFM_GROUP Group
    )
/*++
Routine Description:
    Recompute the normalized state of a group owned by the local node,
    raise the matching local cluster event when the state changed, and send
    a GUM update to the rest of the cluster: a group-state update when the
    state changed, otherwise a group-node update.

Arguments:
    Group - The group whose state is to be propagated.

Return:
    ERROR_SUCCESS if the local node does not own the group or the group
        fails the IS_VALID_FM_GROUP check (nothing is sent in those cases).
    Otherwise the status returned by GumSendUpdateEx.

Notes:
    The group state itself is computed by FmpGetGroupState with
    normalization requested. The group's StateSequence is incremented on
    every call that gets past the ownership and validity checks, whether or
    not the state changed.
--*/
{
    GUM_GROUP_STATE         gumState;
    LPCWSTR                 pszGroupId;
    DWORD                   cbGroupId;
    DWORD                   dwStatus;
    CLUSTER_GROUP_STATE     newState;

    FmpAcquireLocalGroupLock( Group );

    //
    // Nothing to do if the group has migrated to another owner (this can
    // happen when a resource goes offline via a terminate after ownership
    // has moved), or if the group is marked for deletion
    // (Chittur Subbaraman (chitturs) - 6/28/99). Both count as success.
    //
    if ( (Group->OwnerNode != NmLocalNode) ||
         !IS_VALID_FM_GROUP( Group ) ) {
        FmpReleaseLocalGroupLock( Group );
        return(ERROR_SUCCESS);
    }

    newState = FmpGetGroupState( Group, TRUE );

    ++Group->StateSequence;

    if ( newState == Group->State ) {
        //
        // Assume that the state didn't change, but the owning node did.
        // Tell the other systems who the owner is.
        //
        pszGroupId = OmObjectId( Group );
        cbGroupId = (lstrlenW( pszGroupId ) + 1) * sizeof(WCHAR);

        dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
                                   FmUpdateGroupNode,
                                   2,
                                   cbGroupId,
                                   pszGroupId,
                                   (lstrlenW(OmObjectId(NmLocalNode))+1)*sizeof(WCHAR),
                                   OmObjectId(NmLocalNode));
    } else {
        //
        // The state changed: update the local system first.
        //
        Group->State = newState;

        if ( (newState == ClusterGroupOnline) ||
             (newState == ClusterGroupPartialOnline) ) {
            ClusterEvent(CLUSTER_EVENT_GROUP_ONLINE, Group);
        } else if ( (newState == ClusterGroupOffline) ||
                    (newState == ClusterGroupFailed) ) {
            ClusterEvent(CLUSTER_EVENT_GROUP_OFFLINE, Group);
        }

        //
        // Then notify the other systems of the new state.
        //
        pszGroupId = OmObjectId( Group );
        cbGroupId = (lstrlenW( pszGroupId ) + 1) * sizeof(WCHAR);

        gumState.State = newState;
        gumState.PersistentState = Group->PersistentState;
        gumState.StateSequence = Group->StateSequence;

        dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
                                   FmUpdateGroupState,
                                   3,
                                   cbGroupId,
                                   pszGroupId,
                                   (lstrlenW(OmObjectId(NmLocalNode))+1)*sizeof(WCHAR),
                                   OmObjectId(NmLocalNode),
                                   sizeof(gumState),
                                   &gumState);

        ClRtlLogPrint(LOG_NOISE,
                   "[FM] FmpPropagateGroupState: Group %1!ws! state = %2!u!, persistent state = %3!u!\n",
                   OmObjectId(Group),
                   gumState.State,
                   gumState.PersistentState);
    }

    FmpReleaseLocalGroupLock( Group );

    return(dwStatus);
} // FmpPropagateGroupState
DWORD
FmpPropagateFailureCount(
    IN PFM_GROUP Group,
    IN BOOL NewTime
    )
/*++
Routine Description:
    Send the group's NumberOfFailures to the other systems in the cluster
    through a GUM update.

Arguments:
    Group - The group whose failure count is sent.
    NewTime - TRUE if last failure time should be reset also. FALSE
        otherwise. The value is forwarded in the update as a DWORD.

Return:
    ERROR_NOT_ENOUGH_MEMORY if the update buffer could not be allocated.
    Otherwise the status returned by GumSendUpdate.

Notes:
    The Local Group lock must be held.
--*/
{
    LPCWSTR             pszGroupId = OmObjectId( Group );
    PGUM_FAILURE_COUNT  pUpdate;
    DWORD               cbUpdate;
    DWORD               dwStatus;

    //
    // The update is the fixed part of GUM_FAILURE_COUNT followed by the
    // group id string, including its terminator.
    //
    cbUpdate = sizeof(GUM_FAILURE_COUNT) - 1 +
               ((lstrlenW(pszGroupId) + 1) * sizeof(WCHAR));

    pUpdate = LocalAlloc(LMEM_FIXED, cbUpdate);
    if ( pUpdate != NULL ) {
        pUpdate->Count = Group->NumberOfFailures;
        pUpdate->NewTime = (DWORD)NewTime;
        wcscpy(pUpdate->GroupId, pszGroupId);

        dwStatus = GumSendUpdate( GumUpdateFailoverManager,
                                  FmUpdateFailureCount,
                                  cbUpdate,
                                  pUpdate );

        LocalFree( pUpdate );
    } else {
        dwStatus = ERROR_NOT_ENOUGH_MEMORY;
    }

    return(dwStatus);
} // FmpPropagateFailureCount
PFM_GROUP
FmpCreateGroup(
    IN LPWSTR GroupId,
    IN BOOL Initialize
    )
/*++
Routine Description:
    Open the group object with the given id, creating and initializing it
    if it does not exist yet.

Arguments:
    GroupId - The Id of the Group.
    Initialize - Passed through to FmpInitializeGroup; TRUE if the Group
        should be initialized, FALSE otherwise.

Returns:
    A non-NULL pointer to the Group if successful.
    NULL - The Group could not be created. When initialization or insertion
        failed, the failing status is available through GetLastError.

Notes:
    1) Passing Initialize as FALSE allows for creating the group and its
       resources, but complete initialization can happen later.
    2) The Group List lock must be held.
    3) If the Group is created, the reference count on the object is 1. If
       the group already exists, the reference obtained here is dropped
       again and the caller may add a reference as needed.
    4) On failure the local group lock is acquired and FmpDestroyGroup is
       called; per that routine's header it releases the lock.
--*/
{
    PFM_GROUP   pGroup;
    DWORD       dwStatus = ERROR_SUCCESS;
    BOOL        bCreated;

    //
    // Open an existing group or create a new one.
    //
    pGroup = OmCreateObject( ObjectTypeGroup,
                             GroupId,
                             NULL,
                             &bCreated);
    if (pGroup == NULL) {
        return(NULL);
    }

    if (bCreated)
    {
        ClRtlLogPrint(LOG_NOISE,
                   "[FM] Creating group %1!ws!\n",
                   GroupId);

        pGroup->State = ClusterGroupOffline;
        InitializeCriticalSection( &pGroup->Lock );
        pGroup->dwStructState = FM_GROUP_STRUCT_CREATED;

        //
        // Set up the group's lists and read its settings, then insert the
        // object into its list. Insertion is skipped if setup failed.
        //
        dwStatus = FmpInitializeGroup( pGroup , Initialize);
        if ( dwStatus == ERROR_SUCCESS ) {
            dwStatus = OmInsertObject( pGroup );
        }
    }
    else
    {
        ClRtlLogPrint(LOG_NOISE,
                   "[FM] Opened existing group %1!ws!\n",
                   GroupId);

        //
        // An existing group with no registry key while FM is not online:
        // this is the quorum group being recreated, so initialize it again.
        //
        if ((!FmpFMOnline) && (pGroup->RegistryKey == NULL))
        {
            dwStatus = FmpInitializeGroup(pGroup, Initialize);
        }
        OmDereferenceObject( pGroup );
    }

    if (dwStatus != ERROR_SUCCESS)
    {
        FmpAcquireLocalGroupLock( pGroup );
        FmpDestroyGroup( pGroup, FALSE );
        SetLastError(dwStatus);
        pGroup = NULL;
    }

    return(pGroup);
} // FmpCreateGroup
DWORD FmpInitializeGroup(
    IN PFM_GROUP Group,
    IN BOOL Initialize
    )
/*++
Routine Description:
    Reset the group's list heads and moving list, then load the group's
    settings through FmpQueryGroupInfo.

Arguments:
    Group - The group object to set up.
    Initialize - Passed through to FmpQueryGroupInfo.

Return Value:
    The status returned by FmpQueryGroupInfo. A failure is logged before it
    is returned.
--*/
{
    DWORD dwStatus;

    //
    // Start with empty lists and no move in progress.
    //
    InitializeListHead( &(Group->Contains) );
    InitializeListHead( &(Group->PreferredOwners) );
    InitializeListHead( &(Group->DmRundownList) );
    InitializeListHead( &(Group->WaitQueue) );
    Group->MovingList = NULL;

    //
    // Read the registry information if directed to do so.
    //
    dwStatus = FmpQueryGroupInfo( Group, Initialize );
    if ( dwStatus == ERROR_SUCCESS ) {
        return(dwStatus);
    }

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpInitializeGroup: FmpQueryGroupInfo failed, status=%1!u!\n",
        dwStatus);

    return(dwStatus);
}
DWORD
FmpDestroyGroup(
    IN PFM_GROUP Group,
    IN BOOL bDeleteObjOnly
    )
/*++

Routine Description:

    Closes a group.

    Every resource on the group's Contains list is unlinked and handed to
    FmpDestroyResource, the preferred owners list is drained, the group is
    removed from its object type list, and its registry key is closed.

    The routine ends with a single OmDereferenceObject on the group. The
    original authors note that FmpCleanupGroups can enumerate all the
    groups and destroy each one in turn; a group may therefore be destroyed
    multiple times if there are outstanding references to it, which is not
    a problem since no work will be done on subsequent calls.

Arguments:

    Group - The group to destroy.

    bDeleteObjOnly - Passed through to FmpDestroyResource. When FALSE, the
        QuorumResource flag of each contained resource is cleared first.
        Per the original authors: if TRUE, the resource monitor is not
        invoked and group state is not touched.

Return Value:

    The status returned by OmRemoveObject for the group.

Notes:

    The LocalGroupLock MUST be held! This routine will release that lock
    as part of cleanup.

--*/
{
    PLIST_ENTRY         pLink;
    PFM_RESOURCE        pContained;
    PPREFERRED_ENTRY    pPrefEntry;
    DWORD               dwRemoveStatus;

    ClRtlLogPrint(LOG_NOISE,
               "[FM] DestroyGroup: destroying %1!ws!\n",
               OmObjectId(Group));

    //
    // Make sure there are no resources in the Group.
    //
    pLink = Group->Contains.Flink;
    while ( pLink != &(Group->Contains) ) {
        pContained = CONTAINING_RECORD(pLink, FM_RESOURCE, ContainsLinkage);

        // Step to the next entry before this one is unlinked.
        pLink = pLink->Flink;

        RemoveEntryList( &pContained->ContainsLinkage );

        // Dereference for removing from the contains list.
        OmDereferenceObject( pContained );

        FmpAcquireLocalResourceLock( pContained );
        if ( !bDeleteObjOnly ) {
            pContained->QuorumResource = FALSE;
        }

        //
        // The reference count on the group with respect to being
        // referenced by the resource is handled in FmpDestroyResource.
        //
        FmpDestroyResource( pContained, bDeleteObjOnly );
    }

    CL_ASSERT(IsListEmpty(&Group->Contains));

    //
    // Make sure the preferred owners list is drained.
    //
    while ( !IsListEmpty( &Group->PreferredOwners ) ) {
        pLink = RemoveHeadList(&Group->PreferredOwners);
        pPrefEntry = CONTAINING_RECORD( pLink,
                                        PREFERRED_ENTRY,
                                        PreferredLinkage );
        OmDereferenceObject( pPrefEntry->PreferredNode );
        LocalFree( pPrefEntry );
    }

    //
    // Now that there are no remaining resources in this group
    // we're done, so remove it from its object type list.
    //
    dwRemoveStatus = OmRemoveObject( Group );

    //
    // Run down the DM list and close the Group's registry key.
    //
    DmRundownList( &Group->DmRundownList );
    if ( Group->RegistryKey != NULL ) {
        DmCloseKey( Group->RegistryKey );
        Group->RegistryKey = NULL;
        Group->Initialized = FALSE;
    }

    //
    // We must release the lock prior to the dereference, in case this is
    // the last dereference of the object!
    //
    FmpReleaseLocalGroupLock( Group );

    ClRtlLogPrint(LOG_NOISE,
               "[FM] FmpDestroyGroup: Group %1!ws! destroyed.\n",
               OmObjectId(Group));

    OmDereferenceObject( Group );

    return(dwRemoveStatus);

} // FmpDestroyGroup
///////////////////////////////////////////////////////////////////////////
//
// Initialization/Cleanup Routines
//
///////////////////////////////////////////////////////////////////////////
DWORD
FmpInitGroups(
    IN BOOL Initialize
    )
/*++

Routine Description:

    Processes the Cluster group list in the registry. For each
    group key found under DmGroupsKey, FmpCreateGroup is called.

Arguments:

    Initialize - Passed through to FmpCreateGroup. TRUE if the groups
        should be initialized, FALSE otherwise.

Return Value:

    ERROR_SUCCESS once the enumeration runs out of keys.

    The Win32 error code returned by FmpRegEnumerateKey otherwise.

Notes:

    The result of FmpCreateGroup is not examined; a group that fails to
    be created does not stop the enumeration.

--*/
{
    DWORD       dwError;
    DWORD       dwIndex = 0;
    LPWSTR      pszGroupId = NULL;
    DWORD       cbGroupIdMax = 0;
    PFM_GROUP   pUnusedGroup;

    ClRtlLogPrint(LOG_NOISE,"[FM] Processing groups list.\n");

    FmpAcquireGroupLock();

    //
    // Enumerate the subkeys. Each subkey name corresponds to a group name.
    // The loop ends on the first status other than NO_ERROR.
    //
    for ( ;; ) {
        dwError = FmpRegEnumerateKey( DmGroupsKey,
                                      dwIndex,
                                      &pszGroupId,
                                      &cbGroupIdMax );
        if ( dwError != NO_ERROR ) {
            break;
        }

        pUnusedGroup = FmpCreateGroup( pszGroupId,
                                       Initialize );
        dwIndex++;
    }

    if ( dwError == ERROR_NO_MORE_ITEMS ) {
        // Running out of keys is the normal way to finish.
        dwError = NO_ERROR;
    } else {
        ClRtlLogPrint(LOG_NOISE,"[FM] EnumGroup error %1!u!\n", dwError);
    }

    FmpReleaseGroupLock();

    ClRtlLogPrint(LOG_NOISE,"[FM] All groups created.\n");

    // The name buffer handed back by FmpRegEnumerateKey is freed here.
    if ( pszGroupId != NULL ) {
        LocalFree(pszGroupId);
    }

    return(dwError);

} // FmpInitGroups
DWORD
FmpCompleteInitGroup(
    IN PFM_GROUP Group
    )
/*++

Routine Description:

    Finish initialization of all resources within the group by calling
    FmpInitializeResource( Resource, TRUE ) on each entry of the group's
    Contains list while holding the local group lock.

Arguments:

    Group - The group to finish initializing.

Return Value:

    ERROR_SUCCESS always; the per-resource results are not examined.

--*/
{
    PLIST_ENTRY     pLink;

    FmpAcquireLocalGroupLock(Group);

    //
    // For each resource in the Group, make sure that it has been fully
    // initialized.
    //
    pLink = Group->Contains.Flink;
    while ( pLink != &(Group->Contains) ) {
        FmpInitializeResource(
            CONTAINING_RECORD(pLink, FM_RESOURCE, ContainsLinkage),
            TRUE );
        pLink = pLink->Flink;
    }

    FmpReleaseLocalGroupLock(Group);

    return(ERROR_SUCCESS);

} // FmpCompleteInitGroup
DWORD
FmpCleanupGroupsWorker(
    IN PFM_CLEANUP_INFO pFmCleanupInfo
    )
/*++

Routine Description:

    This routine walks through an enumerated list of all the groups
    owned by the local node and tries to shut them down cleanly.

    In the first phase it tries to bring all resources offline except
    the quorum one, without waiting for them to finish.

    In the second phase it waits for each group to reach a stable state
    and then moves it. The quorum group, if it is in the list, is
    processed last so that resources being shut down can still have
    their registry updates logged to the quorum disk.

    If phase 1 reports a hard failure for any group, phase 2 is skipped
    and only the quorum resource is cleaned up (when the list contains
    the quorum group).

Arguments:

    pFmCleanupInfo - ptr to a structure containing the groups to be
        offlined/moved and the time limit in which to do so. It is not
        freed here.

Returns:

    The status of the last phase 1 / phase 2 call that was made.

Notes:

    A group Id that can no longer be resolved to an object is skipped
    rather than dereferenced through a NULL pointer.

--*/
{
    DWORD       Status = ERROR_SUCCESS;
    DWORD       i;
    PFM_GROUP   pGroup;
    PGROUP_ENUM pGroupEnum;
    BOOL        bContainsQuorumGroup;
    BOOL        bQuorumGroup = FALSE;
    DWORD       CleanupStatus = ERROR_SUCCESS;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroupsWorker: Entry\r\n");

    pGroupEnum = pFmCleanupInfo->pGroupEnum;
    bContainsQuorumGroup = pFmCleanupInfo->bContainsQuorumGroup;

    //
    // Phase 1: offline all of the non-quorum resources, but don't wait
    // for them to finish. I.E. get as much work done as possible as fast
    // as possible.
    //
    for ( i = 0; i < pGroupEnum->EntryCount; i++ )
    {
        pGroup = OmReferenceObjectById( ObjectTypeGroup,
                                        pGroupEnum->Entry[i].Id );
        if ( pGroup == NULL )
        {
            // The group is gone from the object manager; nothing to offline.
            ClRtlLogPrint(LOG_NOISE,
                "[FM] FmpCleanupGroupsWorker: Group %1!ws! no longer exists, skipping\r\n",
                pGroupEnum->Entry[i].Id);
            continue;
        }

        // Try and offline all resources except the quorum resource.
        Status = FmpCleanupGroupPhase1(pGroup, pFmCleanupInfo->dwTimeOut);

        // Pending, success and "this is the quorum resource" are all
        // acceptable outcomes of phase 1; anything else is a failure.
        if ((Status != ERROR_IO_PENDING) && (Status != ERROR_SUCCESS) &&
            (Status != ERROR_QUORUM_RESOURCE))
        {
            CleanupStatus = Status;
        }

        OmDereferenceObject(pGroup);
    }

    if (CleanupStatus == ERROR_SUCCESS)
    {
        //
        // Phase 2: if the quorum group is in this list, skip it and
        // process it at the end.
        //
        for ( i = 0; i < pGroupEnum->EntryCount; i++ )
        {
            pGroup = OmReferenceObjectById( ObjectTypeGroup,
                                            pGroupEnum->Entry[i].Id );
            if ( pGroup == NULL )
            {
                ClRtlLogPrint(LOG_NOISE,
                    "[FM] FmpCleanupGroupsWorker: Group %1!ws! no longer exists, skipping\r\n",
                    pGroupEnum->Entry[i].Id);
                continue;
            }

            if (gpQuoResource->Group == pGroup)
            {
                ClRtlLogPrint(LOG_NOISE,
                    "[FM] FmpCleanupGroupsWorker: Quorum group belongs to this node, process phase 2 later\r\n");
                bQuorumGroup = TRUE;
                OmDereferenceObject(pGroup);
                continue;
            }

            // Try and offline the group and move it to another node.
            Status = FmpCleanupGroupPhase2(pGroup);

            OmDereferenceObject(pGroup);
        }

        if (bQuorumGroup)
        {
            Status = FmpCleanupGroupPhase2(gpQuoResource->Group);
        }
    }
    else
    {
        //
        // Phase 1 didn't work for some reason; try and offline the
        // quorum resource alone.
        // TODO::Should we also terminate all resources
        // No way to terminate services ???
        //
        if (bContainsQuorumGroup)
        {
            FmpCleanupQuorumResource(gpQuoResource);
        }
    }

    return(Status);

} // FmpCleanupGroupsWorker
DWORD
FmpCleanupGroupPhase1(
    IN PFM_GROUP Group,
    IN DWORD     dwTimeOut
    )
/*++

Routine Description:

    This routine is the first phase for clean up of all groups owned by the
    node on shutdown.

    If the local node owns the group, the group is polled every half
    second until FmpIsGroupQuiet reports it quiet, and then
    FmpOfflineGroup( Group, FALSE, FALSE ) is issued. The original
    authors note that the quorum resource is not taken offline in this
    phase, since it must be the last one to go offline.

    If the group does not become quiet within the retry budget derived
    from dwTimeOut, the routine gives up with ERROR_REQUEST_ABORTED.

Arguments:

    Group - The Group to offline.

    dwTimeOut - Time budget in milliseconds; it is converted into a count
        of half-second polls.

Returns:

    ERROR_SUCCESS if the group is not owned by the local node.

    ERROR_REQUEST_ABORTED if the group never became quiet.

    Otherwise the status returned by FmpOfflineGroup.

--*/
{
    DWORD   dwResult = ERROR_SUCCESS;
    DWORD   dwPollsLeft = (2 * dwTimeOut)/1000; // one poll every 1/2 sec

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroupsPhase1: Entry, Group = %1!ws!\r\n",
        OmObjectId(Group));

    for ( ;; )
    {
        FmpAcquireLocalGroupLock( Group );

        // Groups owned elsewhere are left alone.
        if ( Group->OwnerNode != NmLocalNode )
        {
            FmpReleaseLocalGroupLock( Group );
            break;
        }

        if ( FmpIsGroupQuiet( Group, ClusterGroupOffline ) )
        {
            //
            // Just take the group offline. Don't wait, don't pass go...
            //
            dwResult = FmpOfflineGroup(Group, FALSE, FALSE);
            FmpReleaseLocalGroupLock( Group );
            break;
        }

        //
        // The group is still busy: drop the lock, pause, and look again
        // unless the retry budget is exhausted.
        //
        FmpReleaseLocalGroupLock( Group );
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpCleanupGroupsPhase1: Group is not quiet, wait\r\n");
        Sleep(500);

        if ( dwPollsLeft-- == 0 )
        {
            dwResult = ERROR_REQUEST_ABORTED;
            break;
        }
    }

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroupsPhase1: Exit, status=%1!u!\r\n",
        dwResult);

    return(dwResult);

} // FmpCleanupGroupsPhase1
DWORD
FmpCleanupGroupPhase2(
    IN PFM_GROUP Group
    )
/*++

Routine Description:

    This routine is the second phase for clean up of all groups owned by
    the node on shutdown.

    If the local node owns the group, FmpOfflineGroup( Group, TRUE, FALSE )
    is issued. While that reports pending (or ERROR_INVALID_STATE for the
    quorum group, see below) the routine waits for the group, sleeps two
    seconds and tries again, up to 120 * 12 retries. Once the offline
    succeeds the group is handed to FmpMoveGroup.

Arguments:

    Group - The Group to offline.

Returns:

    TRUE always.

    NOTE(review): the return type is DWORD and the original header
    promised ERROR_SUCCESS / a Win32 error code, but the locally tracked
    status is never returned. Confirm with callers before changing this.

--*/
{
    DWORD   dwResult = ERROR_SUCCESS;
    DWORD   dwRetriesLeft = 120 * 12;
    BOOL    fStillGoingOffline;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroupsPhase2: Entry, Group = %1!ws!\r\n",
        OmObjectId(Group));

    FmpAcquireLocalGroupLock( Group );

    //
    // Try to move the Group before destroying it if we own it.
    //
    if ( Group->OwnerNode == NmLocalNode )
    {
        for ( ;; )
        {
            //
            // First make sure the group is really offline.
            // In phase 1 we began the offline process... we need to check
            // it here.
            //
            // [GorN] [10/05/1999]
            // We need to wait for the quorum to go offline, otherwise
            // the surviving node will not be able to arbitrate.
            //
            // FmpWaitForGroup keeps issuing RmOffline for the quorum,
            // resrcmon returns ERROR_INVALID_STATE for the second offline,
            // since offline is already in progress. That would cause us to
            // break out of this loop while the quorum resource is still
            // going offline.
            //
            // [HACKHACK] Treating ERROR_INVALID_STATE on the quorum group
            // as "still pending" is a hack. It would be better either to
            // make resmon return IO_PENDING when somebody is trying to
            // offline a resource that is in offline pending, or not to
            // call FmRmOffline the second time in FM.
            //
            dwResult = FmpOfflineGroup(Group, TRUE, FALSE);

            fStillGoingOffline =
                ( dwResult == ERROR_IO_PENDING ) ||
                ( ( dwResult == ERROR_INVALID_STATE ) &&
                  ( Group == gpQuoResource->Group ) );

            if ( !fStillGoingOffline )
            {
                if ( dwResult == ERROR_SUCCESS )
                {
                    // The Move routine frees the LocalGroupLock!
                    FmpMoveGroup( Group, NULL, TRUE, NULL, TRUE );
                    FmpAcquireLocalGroupLock( Group );
                }
                // On any other status we leave with the lock still held.
                break;
            }

            // FmpWaitForGroup() will release the lock.
            dwResult = FmpWaitForGroup(Group);
            ClRtlLogPrint(LOG_NOISE,
                "[FM] FmpCleanupGroupsPhase2: Sleep and retry\r\n");
            Sleep(2*1000);

            // Reacquire the group lock and check if the group is offline.
            FmpAcquireLocalGroupLock(Group);

            if ( dwRetriesLeft-- == 0 )
            {
                break;
            }
        }
    }

    // Every path above arrives here holding the local group lock.
    FmpReleaseLocalGroupLock(Group);

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroupsPhase2: Exit\n");

    return(TRUE);

} // FmpCleanupGroupsPhase2
BOOL
FmpEnumNodeState(
    OUT DWORD *pStatus,
    IN PVOID Context2,
    IN PNM_NODE Node,
    IN LPCWSTR Name
    )
/*++

Routine Description:

    Node enumeration callback for FM shutdown. Queries the state
    of other nodes to see if any are up or paused.

Arguments:

    pStatus - Set to TRUE if a node other than the local node is up or
        paused. Left untouched otherwise.

    Context2 - Not used

    Node - Supplies the node.

    Name - Supplies the node's name. Not used.

Return Value:

    TRUE - to indicate that the enumeration should continue.
    FALSE - to indicate that the enumeration should not continue
        (an up or paused node was found).

--*/
{
    // The local node does not count as "another node".
    if (Node == NmLocalNode) {
        return(TRUE);
    }

    //
    // The state is read once so that both comparisons see the same value.
    // Nodes that are up, as well as nodes that are paused, end the search.
    //
    switch ( NmGetNodeState(Node) ) {
    case ClusterNodeUp:
    case ClusterNodePaused:
        *pStatus = TRUE;
        return(FALSE);

    default:
        break;
    }

    return(TRUE);

} // FmpEnumNodeState
VOID
FmpCleanupGroups(
    IN BOOL ClusterShutDownEvent
    )
/*++

Routine Description:

    This routine kicks off the cleanup of the FM layer.

    It enumerates the groups of the local node, picks a timeout, starts
    FmpCleanupGroupsWorker on a separate thread and waits for that thread
    to finish or for the timeout to expire. On timeout (or if the thread
    cannot be created) the quorum resource is cleaned up directly when the
    enumeration reported that the quorum group is among the groups.

Arguments:

    ClusterShutDownEvent - TRUE when the Cluster Service is shutting down.
        In that case the wait is split into periods of
        CsServiceStatus.dwWaitHint and the service status checkpoint is
        incremented and announced after each period.

Returns:

    None.

Notes:

    The group enumeration, the cleanup info block and the thread handle
    are deliberately not released: the caller is going to exit after this.

--*/
{
    DWORD           Status;
    DWORD           dwTimeOut;
    DWORD           dwDefaultTimeOut;
    HANDLE          hCleanupThread;
    DWORD           otherNodesUp = FALSE;
    DWORD           dwThreadId;
    DWORD           i,dwTimeOutCount;
    PGROUP_ENUM     pGroupEnum;
    BOOL            bQuorumGroup = FALSE;
    PFM_CLEANUP_INFO pFmCleanupInfo;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroups: Entry\r\n");

    //
    // If we don't know the quorum resource or we are not online,
    // then leave immediately
    //
    if ( !gpQuoResource ) {
        goto FnExit;
    }

    ACQUIRE_EXCLUSIVE_LOCK(gQuoChangeLock);
    //
    // If this is called when FmFormPhaseProcessing is going on
    // then the quorum group doesn't exist, other groups don't exist
    // either.
    //
    // Both branches are braced and the assert is terminated explicitly so
    // that the lock release below can never be swallowed by the 'else',
    // whatever CL_ASSERT expands to.
    //
    if (FmpFMFormPhaseProcessing) {
        FmpCleanupQuorumResource(gpQuoResource);
    } else {
        CL_ASSERT(gpQuoResource->Group != NULL);
    }
    RELEASE_LOCK(gQuoChangeLock);

    //
    // Find and sort all known groups, hold the group lock while enumerating
    //
    FmpAcquireGroupLock();

    Status = FmpEnumSortGroups(&pGroupEnum, NmLocalNode, &bQuorumGroup);

    FmpReleaseGroupLock();

    if (Status != ERROR_SUCCESS) {
        goto FnExit;
    }

    //
    // See if any other node in the cluster is up...
    // If so, we will use the default timeout value.
    // Otherwise, we will use what we believe is a more reasonable time.
    //
    OmEnumObjects( ObjectTypeNode,
                   FmpEnumNodeState,
                   &otherNodesUp,
                   NULL );

    switch ( CsShutdownRequest ) {
    case CsShutdownTypeShutdown:
        if ( otherNodesUp ) {
            dwTimeOut = 15;   // other node will time us out quickly - say 15 secs
        } else {
            dwTimeOut = 30;   // otherwise use 30 seconds
        }
        break;

    default:
        // apply default value to registry
        dwDefaultTimeOut = CLUSTER_SHUTDOWN_TIMEOUT; // default timeout (mins)
        //
        // The query status is not examined, so seed the result with the
        // default in case the query leaves it untouched on failure.
        //
        dwTimeOut = dwDefaultTimeOut;
        Status = DmQueryDword( DmClusterParametersKey,
                               CLUSREG_NAME_CLUS_SHUTDOWN_TIMEOUT,
                               &dwTimeOut,
                               &dwDefaultTimeOut);
        dwTimeOut *= 60;      // convert to secs.
        break;
    }

    // convert to msecs
    dwTimeOut *= 1000;

    pFmCleanupInfo = (PFM_CLEANUP_INFO)LocalAlloc(LMEM_FIXED, sizeof(FM_CLEANUP_INFO));
    if (!pFmCleanupInfo)
    {
        Status = ERROR_NOT_ENOUGH_MEMORY;
        goto FnExit;
    }

    pFmCleanupInfo->pGroupEnum = pGroupEnum;
    pFmCleanupInfo->dwTimeOut = dwTimeOut; //in msecs
    pFmCleanupInfo->bContainsQuorumGroup = bQuorumGroup;

    //
    // Start the worker thread to perform cleanup.
    //
    hCleanupThread = CreateThread( NULL,
                                   0,
                                   FmpCleanupGroupsWorker,
                                   pFmCleanupInfo,
                                   0,
                                   &dwThreadId );

    if ( hCleanupThread == NULL ) {
        //SS: if we own the quorum resource should we cleanup the quorum resource
        //this will avoid corruption
        if (bQuorumGroup) {
            FmpCleanupQuorumResource(gpQuoResource);
        }
        goto FnExit;
    }

    // Rohit (rjain): This path is taken when Cluster Service is shutting
    // down. ServiceStatus checkpoint is incremented after every WaitHint
    // units of time. For this the waiting period of dwTimeOut is divided into
    // multiple waiting periods of dwWaitHint units each.
    //
    // A zero wait hint cannot be used as a divisor; in that case fall
    // through to the single wait below.
    if ( (ClusterShutDownEvent==TRUE) &&
         (CsServiceStatus.dwWaitHint != 0) &&
         (dwTimeOut > CsServiceStatus.dwWaitHint) )
    {
        dwTimeOutCount=dwTimeOut/CsServiceStatus.dwWaitHint;
        ClRtlLogPrint(LOG_ERROR,
            "[FM] FmpCleanupGroups: dwTimeOut=%1!u! dwTimoutCount=%2!u! waithint =%3!u! \r\n",
            dwTimeOut,dwTimeOutCount, CsServiceStatus.dwWaitHint);

        for(i=0;i<dwTimeOutCount;i++){
            Status = WaitForSingleObject(hCleanupThread, CsServiceStatus.dwWaitHint);
            switch(Status) {
            case WAIT_OBJECT_0:
                //everything is fine
                ClRtlLogPrint(LOG_NOISE,
                    "[FM] FmpCleanupGroups: Cleanup thread finished in time\r\n");
                break;

            case WAIT_TIMEOUT:
                //should we terminate the thread
                //try and clean up the quorum resource
                //this will avoid corruption on the quorum disk
                //TODO::Should we also terminate all resources
                // No way to terminate services ???
                //
                // Only the last period counts as the real timeout.
                if(i == (dwTimeOutCount-1)){
                    ClRtlLogPrint(LOG_UNUSUAL,
                        "[FM] FmpCleanupGroups: Timed out on the CleanupThread\r\n");
                    if (bQuorumGroup) {
                        FmpCleanupQuorumResource(gpQuoResource);
                    }
                }
                break;

            case WAIT_FAILED:
                ClRtlLogPrint(LOG_UNUSUAL,
                    "[DM] FmpCleanupGroups: wait on CleanupEvent failed 0x%1!08lx!\r\n",
                    GetLastError());
                break;
            }

            // Stop waiting once the thread has finished or the wait failed.
            if(Status== WAIT_OBJECT_0 || Status==WAIT_FAILED) {
                break;
            }

            CsServiceStatus.dwCheckPoint++;
            CsAnnounceServiceStatus();
        }
        goto FnExit;
    }

    //
    // Wait for the thread to complete or a timeout.
    //
    Status = WaitForSingleObject(hCleanupThread, dwTimeOut);

    switch(Status) {
    case WAIT_OBJECT_0:
        //everything is fine
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpCleanupGroups: Cleanup thread finished in time\r\n");
        break;

    case WAIT_TIMEOUT:
        //should we terminate the thread
        //try and clean up the quorum resource
        //this will avoid corruption on the quorum disk
        //TODO::Should we also terminate all resources
        // No way to terminate services ???
        ClRtlLogPrint(LOG_UNUSUAL,
            "[FM] FmpCleanupGroups: Timed out on the CleanupThread\r\n");
        if (bQuorumGroup) {
            FmpCleanupQuorumResource(gpQuoResource);
        }
        break;

    case WAIT_FAILED:
        ClRtlLogPrint(LOG_UNUSUAL,
            "[DM] FmpCleanupGroups: wait on CleanupEvent failed 0x%1!08lx!\r\n",
            GetLastError());
        break;
    }

FnExit:
    //SS: dont bother cleaning up, we are going to exit after this

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupGroups: Exit\r\n");

    return;

} // FmpCleanupGroups
DWORD
FmpCleanupQuorumResource(
    IN PFM_RESOURCE Resource
    )
/*++

Routine Description:

    This routine is for emergency clean up of the quorum resource.

    In this phase, we don't try and acquire any locks. We just terminate
    the resource through FmpRmTerminateResource. Hopefully the api is
    offline and nothing funky is attempted on the quorum group/resource
    during this time. This should only be called during the shutdown of FM.

Arguments:

    Resource - The (quorum) resource to terminate.

Returns:

    ERROR_SUCCESS always; the result of the terminate call is not
    examined.

--*/
{
    DWORD   status = ERROR_SUCCESS;

    ClRtlLogPrint(LOG_NOISE,
               "[FM] FmpCleanupQuorum: Offline resource <%1!ws!> <%2!ws!>\n",
               OmObjectName(Resource),
               OmObjectId(Resource) );

    //
    // If the resource is already offline, then return immediately.
    //
    // We should not have to check if a resource has been initialized,
    // since if it hasn't, then we will return because the pre-initialized
    // state of a resource is Offline.
    //
    if ( Resource->State == ClusterResourceOffline ) {
        //
        // If this is the quorum resource, make sure any reservation
        // threads are stopped!
        //
        FmpRmTerminateResource( Resource );
        return(ERROR_SUCCESS);
    }

    //
    // A resource in a pending state is terminated without the
    // pre-offline notification.
    //
    if (Resource->State > ClusterResourcePending ) {
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpCleanupQuorum: Offline resource <%1!ws!> is in pending state\n",
            OmObjectName(Resource) );
        FmpRmTerminateResource( Resource );
        return(ERROR_SUCCESS);
    }

    // Make sure the quorum logs can be flushed and closed.
    OmNotifyCb(Resource, NOTIFY_RESOURCE_PREOFFLINE);

    // It may not be prudent to call offline without holding any locks,
    // so just call terminate.
    FmpRmTerminateResource( Resource );

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpCleanupQuorum: RmOfflineResource returns %1!u!\r\n",
        status);

    return(status);
}
DWORD
FmpMoveGroup(
    IN PFM_GROUP Group,
    IN PNM_NODE DestinationNode OPTIONAL,
    IN BOOL ShutdownHandler,
    OUT PNM_NODE *pChosenDestinationNode OPTIONAL,
    IN BOOL bChooseMostPreferredNode
    )
/*++

Routine Description:

    Move the specified Group. This means taking all of the individual
    resources contained within the group offline and requesting the
    DestinationNode to bring the Group Online.

    If another node owns the group, the request is forwarded to that node
    through FmcMoveGroupRequest instead.

Arguments:

    Group - Supplies a pointer to the group structure to move.

    DestinationNode - Supplies the node object to move the group to. If not
        present, a node is picked with FmpFindAnotherNode.

    ShutdownHandler - TRUE if the shutdown handler is invoking this function.
        When FALSE, the move is refused while FM is not online or is
        shutting down.

    pChosenDestinationNode - Set to the destination node of the move and
        will be passed on to FmpCompleteMoveGroup, if necessary.

    bChooseMostPreferredNode - If the destination node is not supplied,
        indicates whether to choose the most preferred node or not.

Returns:

    ERROR_SUCCESS if the request was successful.

    ERROR_IO_PENDING if taking the resources offline is still pending; the
        moving list is then left attached to the group.

    A Win32 error code on failure (among them ERROR_CLUSTER_NODE_NOT_READY,
        ERROR_SHUTDOWN_IN_PROGRESS, ERROR_HOST_NODE_NOT_AVAILABLE,
        ERROR_INVALID_STATE and ERROR_CLUSTER_NODE_NOT_FOUND).

Notes:

    It is assumed that the Group and all contained resources are offline
    from the requesting node when this call returns. The Group may or
    may not be online on the DestinationNode, depending on whether the
    online request succeeded. This means that the status return is merely
    the status return for the Online request for the DestinationNode.

    The LocalGroupLock MUST also be held. The LocalGroupLock is released
    by this routine.

--*/
{
    PNM_NODE                node;
    DWORD                   status;
    PRESOURCE_ENUM          resourceList=NULL;
    DWORD                   dwMoveStatus = ERROR_SUCCESS;

    ClRtlLogPrint(LOG_NOISE,
               "[FM] FmpMoveGroup: Entry\r\n");

    if ( !ShutdownHandler )
    {
        if ( !FmpFMOnline )
        {
            status = ERROR_CLUSTER_NODE_NOT_READY;
            goto FnExit;
        }

        if ( FmpShutdown )
        {
            status = ERROR_SHUTDOWN_IN_PROGRESS;
            goto FnExit;
        }
    }

    //
    // See which system owns the group in order to control the move request.
    //
    if ( Group->OwnerNode != NmLocalNode )
    {
        if ( Group->OwnerNode == NULL )
        {
            status = ERROR_HOST_NODE_NOT_AVAILABLE;
            goto FnExit;
        }
        //
        // The other system owns the Group ... let them do the work.
        //
        ClRtlLogPrint(LOG_NOISE,
                   "[FM] FmpMoveGroup: Request node %1!ws! to move Group %2!ws!\n",
                   OmObjectId(Group->OwnerNode),
                   OmObjectId(Group));
        // FmcMoveGroupRequest must release the Group lock.
        status = FmcMoveGroupRequest( Group,
                                      DestinationNode );
        if ( status != ERROR_SUCCESS )
        {
            ClRtlLogPrint(LOG_NOISE,
                       "[FM] FmpMoveGroup: Requested system %1!ws! to move group %2!ws! failed with status %3!u!.\n",
                       OmObjectId(Group->OwnerNode),
                       OmObjectId(Group),
                       status);
        }
        // Reacquire so that the common exit path can release the lock.
        FmpAcquireLocalGroupLock( Group );
        goto FnExit;
    }
    else
    {
        //
        // We control the move.
        //
        if ( !FmpIsGroupQuiet(Group, ClusterGroupStateUnknown) )
        {
            //
            // If a move is pending or resources are pending,
            // then return now.
            //
            ClRtlLogPrint(LOG_NOISE,
                       "[FM] FmpMoveGroup: Request to move group <%1!ws!> when it is busy.\n",
                       OmObjectName(Group) );
            status = ERROR_INVALID_STATE;
            goto FnExit;
        }

        if ( ARGUMENT_PRESENT( DestinationNode ) )
        {
            //
            // Check if we are the destination... if so, we're done.
            //
            if ( NmLocalNode == DestinationNode )
            {
                status = ERROR_SUCCESS;
                goto FnExit;
            }
            node = DestinationNode;
        }
        else
        {
            node = FmpFindAnotherNode( Group, bChooseMostPreferredNode );
            if ( node == NULL )
            {
                status = ERROR_HOST_NODE_NOT_AVAILABLE;
                goto FnExit;
            }
        }

        if ( ARGUMENT_PRESENT ( pChosenDestinationNode ) )
        {
            *pChosenDestinationNode = node;
        }

        ClRtlLogPrint(LOG_NOISE,
                   "[FM] FmpMoveGroup: Moving group %1!ws! to node %2!ws! (%3!d!)\n",
                   OmObjectId(Group),
                   OmObjectId(node),
                   NmGetNodeId(node));

        //
        // If the other system is not up, then fail now.
        //
        if ( NmGetExtendedNodeState(node) != ClusterNodeUp )
        {
            status = ERROR_HOST_NODE_NOT_AVAILABLE;
            goto FnExit;
        }

        //
        // If the other system is not in the preferred list, then fail this
        // now.
        //
        if ( !FmpInPreferredList( Group, node, TRUE, NULL) )
        {
            status = ERROR_CLUSTER_NODE_NOT_FOUND;
            goto FnExit;
        }

        //
        // Get the list of resources in the group and their states.
        //
        status = FmpGetResourceList( &resourceList, Group );
        if ( status != ERROR_SUCCESS )
        {
            goto FnExit;
        }

        Group->MovingList = resourceList;

        //
        // At this point the other system should be up!
        //
        status = FmpOfflineResourceList( resourceList, TRUE );

        //SS: avoid the window when the group lock is released
        //and the moving flag is not set true
        //moving will be continued in another thread context if pending is
        //returned

        if ( status != ERROR_SUCCESS )
        {
            goto FnRestore;
        }

        // for now make sure that the group state is propagated here
        // In general it is propagated by the worker thread. Since
        // the ownership is going to change, we want to make sure that the
        // last known state is propagated from this node to others before
        // that.
        FmpPropagateGroupState(Group);

        //
        // Assume the other node is going to take ownership. This is done
        // before, in case the Group state changes. We want to accept the
        // Group/resource state changes from the remote system when they
        // arrive. We've already verified that node is in the preferred list!
        //

        TESTPT(TpFailPreMoveWithNodeDown)
        {
            ClusterEvent( CLUSTER_EVENT_NODE_DOWN, node );
        }

        //
        // Chittur Subbaraman (chitturs) - 5/18/99
        //
        // Modified to handle the move group request of a quorum group in
        // case the destination node could not arbitrate for the quorum
        // resource.
        //
        do
        {
            //
            // Before making the RPC, set the intended owner of the group
            //
            FmpSetIntendedOwnerForGroup( Group, NmGetNodeId( node ) );

            try {
                ClRtlLogPrint(LOG_NOISE,
                            "[FM] FmpMoveGroup: Take group %2!ws! request to remote node %1!ws!\n",
                            OmObjectId(node),
                            OmObjectId(Group));

                dwMoveStatus = status = FmcTakeGroupRequest( node, OmObjectId( Group ), resourceList );
            } except (I_RpcExceptionFilter(RpcExceptionCode())) {
                LPCWSTR     pszNodeId;
                LPCWSTR     pszGroupId;

                status = GetExceptionCode ();

                ClRtlLogPrint(LOG_NOISE,
                    "[FM] FmpMoveGroup: Exception in FmcTakeGroupRequest %2!ws! request to remote node %1!ws!, status=%3!u!\n",
                    OmObjectId(node),
                    OmObjectId(Group),
                    status);

                //
                // An exception from RPC indicates that the other node is either dead
                // or insane. We dont know whether it took ownership or not.
                // So, let the FM node down handler handle the group.
                //
                GumCommFailure( GumUpdateFailoverManager,
                                NmGetNodeId(node),
                                GetExceptionCode(),
                                TRUE );
                //
                // The new owner node that is now dead might have set the intended
                // owner as NULL or it might not have set this. It might have
                // set the owner node to himself or might not have.
                // If it has set the owner node for this group as himself, then
                // the FM node down handler will assume responsibility for this
                // group. If the target node dies before it sets himself as the owner,
                // then again, the FM node down handler will assume responsibility
                // for the group. We wake up when the gum sync handling is over.
                // Right now, the gum update for the owner node may still be in
                // progress so we cant be sure if that update was completed on
                // all nodes.
                //

                //
                // Chittur Subbaraman (chitturs) - 6/7/99
                //
                // Issue a GUM update to handle this group. Using this
                // GUM update prevents any race condition with the
                // node down processing code.
                //
                // TODO: This does not cover the case in which
                // FmpTakeGroupRequest crashes after setting the
                // intended owner to invalid ID. In such a case,
                // the following handler won't take ownership of the
                // group. Also, claim handler will not touch the
                // group.
                //
                pszNodeId = OmObjectId( node );
                pszGroupId = OmObjectId( Group );

                GumSendUpdateEx( GumUpdateFailoverManager,
                                 FmUpdateCompleteGroupMove,
                                 2,
                                 (lstrlenW(pszNodeId)+1)*sizeof(WCHAR),
                                 pszNodeId,
                                 (lstrlenW(pszGroupId)+1)*sizeof(WCHAR),
                                 pszGroupId);

                status = ERROR_HOST_NODE_NOT_AVAILABLE;
                goto FnExit;
            }

            if ( status == ERROR_RETRY )
            {
                //
                // The destination refused to take the quorum group since it
                // did not win the arbitration. So let us see who won the
                // arbitration.
                //
                DWORD  dwSelectedQuorumOwnerId;

                CL_ASSERT( Group == gpQuoResource->Group );

                ClRtlLogPrint(LOG_NOISE,
                           "[FM] FmpMoveGroup: Remote node asked us to resend take group request for group %1!ws! to another node ...\n",
                           OmObjectId( Group ));

                //
                // Get the ID of the node which the MM believes is the best
                // candidate to own the quorum resource. This is a call that
                // blocks while RGP is in progress.
                //
                MMApproxArbitrationWinner( &dwSelectedQuorumOwnerId );

                if ( ( dwSelectedQuorumOwnerId == NmGetNodeId( NmLocalNode ) ) ||
                     ( dwSelectedQuorumOwnerId == MM_INVALID_NODE ) )
                {
                    //
                    // The local node is chosen by MM or no node is chosen by
                    // the MM. The latter case will happen if no RGP has
                    // occurred at the time this call is made. Let us see if we
                    // can arbitrate for the quorum resource.
                    //
                    status = FmpRmArbitrateResource( gpQuoResource );

                    if ( status != ERROR_SUCCESS )
                    {
                        //
                        // Too bad. We will halt and let FmpNodeDown handler
                        // handle the quorum group.
                        //
                        // The second insert is %2 so that the arbitration
                        // status is actually logged next to the node id.
                        //
                        ClRtlLogPrint(LOG_CRITICAL,
                              "[FM] FmpMoveGroup: Local node %1!u! cannot arbitrate for quorum, Status = %2!u!...\n",
                              dwSelectedQuorumOwnerId,
                              status);
                        CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
                    }
                    // Leave the loop with ERROR_RETRY so that FnRestore
                    // brings the group back online locally.
                    status = ERROR_RETRY;
                    break;
                }

                node = NmReferenceNodeById( dwSelectedQuorumOwnerId );

                if ( node == NULL )
                {
                    ClRtlLogPrint(LOG_CRITICAL,
                                "[FM] FmpMoveGroup: Selected node %1!u! cannot be referenced...\n",
                                dwSelectedQuorumOwnerId);
                    CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
                }
            } // if
        } while ( status == ERROR_RETRY );

        TESTPT(TpFailPostMoveWithNodeDown)
        {
            ClusterEvent( CLUSTER_EVENT_NODE_DOWN, node );
        }

        CL_ASSERT( status != ERROR_IO_PENDING );
        if ( status != ERROR_SUCCESS )
        {
            ClRtlLogPrint(LOG_NOISE,
                       "[FM] FmpMoveGroup: FmcTakeGroupRequest to node %1!ws! to take group %2!ws! failed, status %3!u!.\n",
                       OmObjectId(node),
                       OmObjectId(Group),
                       status );
            goto FnRestore;
        }

        //
        // If the group is empty, then generate a Group state change event.
        //
        if ( IsListEmpty( &Group->Contains ) )
        {
            ClusterWideEvent( CLUSTER_EVENT_GROUP_OFFLINE,
                              Group );
        }
    }

FnRestore:
    if ((status != ERROR_SUCCESS) && (status != ERROR_IO_PENDING))
    {
        //
        // Chittur Subbaraman (chitturs) - 3/22/2000
        //
        // Reset the group's intended owner to invalid node ID if the
        // node down handler did not do that.
        //
        if ( dwMoveStatus != ERROR_SUCCESS )
        {
            if ( FmpSetIntendedOwnerForGroup( Group, ClusterInvalidNodeId )
                     == ERROR_CLUSTER_INVALID_NODE )
            {
                ClRtlLogPrint(LOG_NOISE,
                            "[FM] FmpMoveGroup: Group <%1!ws!> has already been processed by node down handler....\r\n",
                            OmObjectName(Group));
                goto FnExit;
            }
        }

        // the move failed
        // In all failure cases we want to bring the resources
        // back online
        // if it is pending, then we let FmpCompleteMoveGroup finish
        // the work
        if (resourceList)
        {
            //
            // Terminate all of the resources in the group.
            //
            FmpTerminateResourceList( resourceList );

            //
            // Chittur Subbaraman (chitturs) - 4/10/2000
            //
            // Make sure to online the quorum group even if this node is
            // shutting down. This is necessary so that other groups
            // can be brought offline during this node's shutdown. Note
            // that FmpOnlineResourceList would only online a group
            // during a shutdown if the group is the quorum group.
            //
            if ( FmpFMGroupsInited )
            {
                FmpOnlineResourceList( resourceList, Group );
            }
        }
    }

FnExit:
    ClRtlLogPrint(LOG_NOISE,
               "[FM] FmpMoveGroup: Exit group <%1!ws!>, status = %2!u!\r\n",
               OmObjectName(Group),
               status);

    //
    // Unless the move is still pending, the resource list is no longer
    // needed and the group is detached from it.
    //
    if ( status != ERROR_IO_PENDING )
    {
        if (resourceList)
        {
            FmpDeleteResourceEnum( resourceList );
            Group->MovingList = NULL;
        }
    }

    if ( ( status == ERROR_SUCCESS ) || ( status == ERROR_IO_PENDING ) )
    {
        //
        // Chittur Subbaraman (chitturs) - 4/13/99
        //
        // If the FmpDoMoveGroupOnFailure thread is also waiting to do the
        // move, then tell that thread to take its hands off.
        //
        if ( Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL )
        {
            Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE;
        }
    }

    FmpReleaseLocalGroupLock( Group );

    return(status);

} // FmpMoveGroup
DWORD
FmpCompleteMoveGroup(
IN PFM_GROUP Group,
IN PNM_NODE DestinationNode
)
/*++
Routine Description:
Completes a previously started move of a group. The resource list saved
in Group->MovingList is taken offline, the group state is propagated, and
the destination node is asked to take ownership through
FmcTakeGroupRequest. If any step fails, the resources in the list are
terminated and (when FmpFMGroupsInited is set) brought back online on
this node.
This function is called by FmpMovePendingThread() after FmpWaitForGroup()
returns ERROR_SUCCESS.
Arguments:
Group - Supplies a pointer to the group structure to move.
DestinationNode - Supplies the node object to move the group to. It must
not be NULL (checked with CL_ASSERT below).
Returns:
ERROR_SUCCESS if the destination accepted the group, or if there was no
moving list to process.
ERROR_IO_PENDING is treated as "still in progress": no restore is
attempted and the moving list is kept.
ERROR_HOST_NODE_NOT_AVAILABLE if the destination node is not up (non-quorum
groups only) or if the take-group RPC raised an exception.
ERROR_RETRY if the destination refused the quorum group and the local node
(or no node) was reported as the arbitration winner.
Any other Win32 error code returned by the calls below.
Notes:
The LocalGroupLock MUST be held on entry. It is released by this routine
just before it returns (see FnExit).
Unless the status is ERROR_IO_PENDING, the moving list is deleted and
Group->MovingList is reset to NULL before returning.
--*/
{
PNM_NODE node;
DWORD status = ERROR_SUCCESS;
// NOTE(review): 'resource' and 'listEntry' are not referenced anywhere
// in this routine.
PFM_RESOURCE resource;
PLIST_ENTRY listEntry;
PRESOURCE_ENUM resourceList=NULL;
// Result of the last FmcTakeGroupRequest call; stays ERROR_SUCCESS if
// the RPC was never attempted.
DWORD dwMoveStatus = ERROR_SUCCESS;
ClRtlLogPrint(LOG_NOISE,
"[FM] CompleteMoveGroup: Entry for <%1!ws!>\r\n",
OmObjectName(Group) );
resourceList = Group->MovingList;
if ( resourceList == NULL ) {
ClRtlLogPrint( LOG_NOISE,
"[FM] CompleteMoveGroup: No moving list!\n" );
// Nothing to move: with a success status FnRestore only marks the
// regular-move flag (if needed) and falls through to FnExit.
status = ERROR_SUCCESS;
goto FnRestore;
}
node = DestinationNode;
CL_ASSERT( node != NULL );
ClRtlLogPrint(LOG_NOISE,
"[FM] CompleteMoveGroup: Completing the move for group %1!ws! to node %2!ws! (%3!d!)\n",
OmObjectName(Group),
OmObjectId(node),
NmGetNodeId(node));
status = FmpOfflineResourceList( resourceList, TRUE );
if ( status != ERROR_SUCCESS ) {
// By now the group must be offline!
// If not, abandon the move; the resource that fails to go
// offline will force the other resources to come online
// again.
// How do we handle shutdowns?
goto FnRestore;
}
// for now make sure that the group state is propagated here
// In general it is propagated by the worker thread. Since
// the ownership is going to change, we want to make sure that the
// last known state is propagated from this node to others before
// that.
FmpPropagateGroupState(Group);
//
// Chittur Subbaraman (chitturs) - 10/01/1999
//
// If the other system is not up, then fail now. Note that this
// check must be done only AFTER ensuring that the group state
// is stable. Otherwise some funny corner cases can result.
// E.g., If the complete move operation is aborted when one or
// more resources are in offline pending state since the destination
// node went down, then you first terminate the resource list and
// then online the list. As a part of all this, the online pending
// or the online states of the resources could be propagated
// synchronously. Now, the offline notification from the previous
// offline attempt could come in and be processed by the FM worker
// thread way too late and you could have spurious resource states
// in FM while the real resource state is different. Another
// issue here is during the lengthy offline operation here, the
// destination node could go down and come back up soon after and
// so aborting the move may not be prudent in such a case.
//
// But, don't do this optimization for the quorum group. This is
// because once the quorum group is made offline, then MM
// could decide who the group owner is. So, you may not be able to
// bring the group online necessarily in this node. To avoid such
// a case, we let FmcTakeGroupRequest fail and then let either the
// retry loop here move the group somewhere else or let the
// FM node down handler decide on the group's owner consulting
// with MM.
//
if ( ( NmGetExtendedNodeState(node) != ClusterNodeUp ) &&
( Group != gpQuoResource->Group ) )
{
status = ERROR_HOST_NODE_NOT_AVAILABLE;
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Restoring group <%1!ws!> on local node due to destination node unavailability...\n",
OmObjectName(Group));
goto FnRestore;
}
// SS::
// After this point the responsibility of failing the group
// back due to resource failures is with the destination code.
// If there is a failure to bring the resources online,
// the local restart policy on the destination node must kick
// in.
//
// if there is an rpc failure to communicate with the other node
// I suppose we should bring the resources online here again
// However, rpc failures can be pretty non descriptive - there is
// no way to determine from rpc errors if the rpc call actually
// executed on the remote side
//
// but unless we are pretty careful about this and do what gum does
// on rpc failures(banish the destination node) there is no way to
// guarantee that both nodes dont retry to restart the group
// If the destination node begins the process of bringing resources
// in the group online, FmsTakeGroupRequest must return success(note
// it should not return ERROR_IO_PENDING), else
// it returns an error code and this node will bring the group back
// to its previous state.
// Assume the other node is going to take ownership. This is done
// before, in case the Group state changes. We want to accept the
// Group/resource state changes from the remote system when they
// arrive. We've already verified that node is in the preferred list!
//
// we will reacquire the lock after making the rpc call
// NOTE(review): no release of the local group lock is visible in this
// routine before the RPC below; the only release is at FnExit. The line
// above may be stale - confirm.
//
//
// Chittur Subbaraman (chitturs) - 5/18/99
//
// Modified to handle the move group request of a quorum group in
// case the destination node could not arbitrate for the quorum
// resource.
//
do
{
//
// Before making the RPC, set the intended owner of the group
// (the return value of this update is not checked here).
//
FmpSetIntendedOwnerForGroup( Group, NmGetNodeId( node ) );
try {
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Take group %2!ws! request to remote node %1!ws!\n",
OmObjectId(node),
OmObjectId(Group));
dwMoveStatus = status = FmcTakeGroupRequest( node, OmObjectId( Group ), resourceList );
} except (I_RpcExceptionFilter(RpcExceptionCode())) {
LPCWSTR pszNodeId;
LPCWSTR pszGroupId;
status = GetExceptionCode ();
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Exception in FmcTakeGroupRequest %2!ws! request to remote node %1!ws!, status=%3!u!\n",
OmObjectId(node),
OmObjectId(Group),
status);
//
// An exception from RPC indicates that the other node is either dead
// or insane. We dont know whether it took ownership or not.
// So, let the FM node down handler handle the group.
//
GumCommFailure( GumUpdateFailoverManager,
NmGetNodeId(node),
GetExceptionCode(),
TRUE );
//
// The new owner node that is now dead might have set the intended
// owner as NULL or it might not have set this. It might have
// set the owner node to himself or might not have.
// If it has set the owner node for this group as himself, then
// the FM node down handler will assume responsibility for this
// group. If the target node dies before it sets himself as the owner,
// then again, the FM node down handler will assume responsibility
// for the group. We wake up when the gum sync handling is over.
// Right now, the gum update for the owner node may still be in
// progress so we cant be sure if that update was completed on
// all nodes.
//
//
// Chittur Subbaraman (chitturs) - 6/7/99
//
// Issue a GUM update to handle this group. Using this
// GUM update prevents any race condition with the
// node down processing code.
//
//
// TODO: This does not cover the case in which
// FmpTakeGroupRequest crashes after setting the
// intended owner to invalid ID. In such a case,
// the following handler won't take ownership of the
// group. Also, claim handler will not touch the
// group.
//
pszNodeId = OmObjectId( node );
pszGroupId = OmObjectId( Group );
GumSendUpdateEx( GumUpdateFailoverManager,
FmUpdateCompleteGroupMove,
2,
(lstrlenW(pszNodeId)+1)*sizeof(WCHAR),
pszNodeId,
(lstrlenW(pszGroupId)+1)*sizeof(WCHAR),
pszGroupId);
status = ERROR_HOST_NODE_NOT_AVAILABLE;
// Skip the local restore at FnRestore: the group is left to the
// update issued above.
goto FnExit;
}
if ( status == ERROR_RETRY )
{
//
// The destination refused to take the quorum group since it
// did not win the arbitration. So let us see who won the
// arbitration.
//
DWORD dwSelectedQuorumOwnerId;
CL_ASSERT( Group == gpQuoResource->Group );
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Remote node asked us to resend take group request for group %1!ws! to another node ...\n",
OmObjectId( Group ));
//
// Get the ID of the node which the MM believes is the best
// candidate to own the quorum resource. This is a call that
// blocks while RGP is in progress.
//
MMApproxArbitrationWinner( &dwSelectedQuorumOwnerId );
if ( ( dwSelectedQuorumOwnerId == NmGetNodeId( NmLocalNode ) ) ||
( dwSelectedQuorumOwnerId == MM_INVALID_NODE ) )
{
//
// The local node is chosen by MM or no node is chosen by
// the MM. The latter case will happen if no RGP has
// occurred at the time this call is made. Let us see if we
// can arbitrate for the quorum resource.
//
status = FmpRmArbitrateResource( gpQuoResource );
if ( status != ERROR_SUCCESS )
{
//
// Too bad. We will halt and let FmpNodeDown handler
// handle the quorum group.
//
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Local node %1!u! cannot arbitrate for quorum group %3!ws!, Status = %2!u!...\n",
dwSelectedQuorumOwnerId,
status,
OmObjectId( Group ));
CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
}
// Leave the loop with ERROR_RETRY so that FnRestore brings the
// group back online on this node.
status = ERROR_RETRY;
break;
}
// Retry the take-group request against the node selected by MM.
// NOTE(review): NmReferenceNodeById presumably takes a reference on
// the node; no matching dereference is visible in this routine -
// confirm whether that is intended.
node = NmReferenceNodeById( dwSelectedQuorumOwnerId );
if ( node == NULL )
{
ClRtlLogPrint(LOG_CRITICAL,
"[FM] FmpCompleteMoveGroup: Selected node %1!u! cannot be referenced...\n",
dwSelectedQuorumOwnerId);
CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
}
} // if
} while ( status == ERROR_RETRY );
// At this point, the onus of taking care of the group is with the
// destination node whether it means restarting the group or
// failing it back
FnRestore:
//if there is any failure try and restore the previous states
if ((status != ERROR_IO_PENDING) && (status != ERROR_SUCCESS))
{
//
// Chittur Subbaraman (chitturs) - 3/22/2000
//
// Reset the group's intended owner to invalid node ID if the
// node down handler did not do that. This is only attempted when
// a take-group request was actually made and did not succeed.
//
if ( dwMoveStatus != ERROR_SUCCESS )
{
if ( FmpSetIntendedOwnerForGroup( Group, ClusterInvalidNodeId )
== ERROR_CLUSTER_INVALID_NODE )
{
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Group <%1!ws!> has already been processed by node down handler....\r\n",
OmObjectName(Group));
goto FnExit;
}
}
if (resourceList)
{
FmpTerminateResourceList( resourceList );
//
// Chittur Subbaraman (chitturs) - 4/10/2000
//
// Make sure to online the quorum group even if this node is
// shutting down. This is necessary so that other groups
// can be brought offline during this node's shutdown. Note
// that FmpOnlineResourceList would only online a group
// during a shutdown if the group is the quorum group.
//
if ( FmpFMGroupsInited )
FmpOnlineResourceList( resourceList, Group );
}
} else
{
//
// Chittur Subbaraman (chitturs) - 4/19/99
//
// If the FmpDoMoveGroupOnFailure thread is also waiting to do the
// move, then tell that thread to take its hands off.
//
if ( Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL )
{
Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE;
}
}
FnExit:
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpCompleteMoveGroup: Exit, status = %1!u!\r\n",
status);
//if the status is success or some other error, clean up the resource list
if (status != ERROR_IO_PENDING)
{
if (resourceList)
{
FmpDeleteResourceEnum( resourceList );
Group->MovingList = NULL;
}
}
// Release the lock that the caller acquired on our behalf.
FmpReleaseLocalGroupLock( Group );
return(status);
} // FmpCompleteMoveGroup
DWORD
FmpMovePendingThread(
    IN LPVOID Context
    )
/*++

Routine Description:

    Thread routine that finishes a group move which returned
    ERROR_IO_PENDING. It waits (FmpWaitForGroup) for the group to stop
    being pending and then calls FmpCompleteMoveGroup. While
    FmpCompleteMoveGroup keeps returning ERROR_IO_PENDING the routine
    sleeps for 500 ms and waits again; there is no upper bound on the
    number of retries.

Arguments:

    Context - Pointer to the MOVE_GROUP structure describing the move.
              The structure is freed with LocalFree, and the group and
              (if non-NULL) destination node it names are dereferenced,
              before this routine returns.

Returns:

    ERROR_SUCCESS, always. Failures are only logged.

--*/
{
    PMOVE_GROUP moveGroup = (PMOVE_GROUP)Context;
    PFM_GROUP   group;
    PNM_NODE    node;
    DWORD       status;

    group = moveGroup->Group;
    node = moveGroup->DestinationNode;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpMovePendingThread Entry.\n");

    //
    // We must attempt to finish the move request for this Group.
    //
    // We are waiting for a resource to go offline and it finally goes
    // offline and the Group's pending event is set.
    //
    // Or we are waiting for cluster shutdown (FmpShutdownEvent)
    //
WaitSomeMore:
    //
    // FmpWaitForGroup expects the group lock to be held on entry and
    // releases it on every return path.
    //
    FmpAcquireLocalGroupLock( group );

    status = FmpWaitForGroup(group);
    if (status == ERROR_SHUTDOWN_IN_PROGRESS) {
        //
        // We've been asked to shutdown
        //
    } else if (status == ERROR_SUCCESS) {
        //
        // FmpCompleteMoveGroup expects the group lock to be held and
        // releases it before returning.
        //
        FmpAcquireLocalGroupLock( group );
        status = FmpCompleteMoveGroup( group, node );
        if ( status == ERROR_IO_PENDING ) {
            Sleep(500); // [HACKHACK] kludgy, I know, but nice solution might break something else
            goto WaitSomeMore;
        }
    } else {
        ClRtlLogPrint(LOG_UNUSUAL,
            "[FM] FmpMovePendingThread got error %1!d! waiting for group to shutdown.\n",
            status);
    }

    //
    // We're done with the move now.
    //
    if ( status != ERROR_IO_PENDING ) {
        CL_ASSERT( group->MovingList == NULL );
    }

    //
    // Now dereference the Group and node object (if non-NULL) and
    // free our local context.
    //
    OmDereferenceObject( group );
    if ( node != NULL ) {
        OmDereferenceObject( node );
    }
    LocalFree( Context );

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpMovePendingThread Exit.\n");

    return(ERROR_SUCCESS);
} // FmpMovePendingThread
DWORD
FmpCreateMovePendingThread(
    IN PFM_GROUP Group,
    IN PNM_NODE DestinationNode
    )
/*++

Routine Description:

    Create a thread (FmpMovePendingThread) that will continue the move of
    the given Group after the initial move request returned
    ERROR_IO_PENDING.

    A reference is added to the Group and, if present, to the destination
    node on behalf of the new thread. If the thread cannot be created
    those references are dropped again and the context is freed here.

Arguments:

    Group - A pointer to the Group to move.

    DestinationNode - The destination node for the move request. May be
                      NULL.

Returns:

    ERROR_IO_PENDING if the thread was created successfully. This assumes
        that this routine was called because of this error return.

    ERROR_HOST_NODE_NOT_RESOURCE_OWNER if the local node does not own the
        group.

    ERROR_GROUP_NOT_AVAILABLE if FmpIsGroupPending reports the group as
        pending.

    ERROR_NOT_ENOUGH_MEMORY if the thread context cannot be allocated.

    The CreateThread failure code if the thread cannot be created.

Notes:

    The local group lock is acquired and released inside this routine.

--*/
{
    HANDLE      threadHandle=NULL;
    DWORD       threadId;
    PMOVE_GROUP context=NULL;
    DWORD       status=ERROR_IO_PENDING; //assume success

    FmpAcquireLocalGroupLock( Group );

    if ( Group->OwnerNode != NmLocalNode ) {
        status = ERROR_HOST_NODE_NOT_RESOURCE_OWNER;
        goto FnExit;
    }

    //
    // If there is a pending event, then the group is not available for any
    // new requests.
    //
    if ( FmpIsGroupPending(Group) ) {
        status = ERROR_GROUP_NOT_AVAILABLE;
        goto FnExit;
    }

    context = LocalAlloc(LMEM_FIXED, sizeof(MOVE_GROUP));
    if ( context == NULL ) {
        status = ERROR_NOT_ENOUGH_MEMORY;
        goto FnExit;
    }

    //
    // Keep reference on the Group and node object (if present) while we
    // retain pointers.
    //
    OmReferenceObject( Group );
    if ( DestinationNode != NULL ) {
        OmReferenceObject( DestinationNode );
    }

    //
    // Fill in context fields
    //
    context->Group = Group;
    context->DestinationNode = DestinationNode;

    threadHandle = CreateThread( NULL,
                                 0,
                                 FmpMovePendingThread,
                                 context,
                                 0,
                                 &threadId );

    if ( threadHandle == NULL )
    {
        //
        // Capture the failure code before doing any cleanup: the calls
        // below may change the thread's last-error value, and a clobbered
        // value could be mistaken by the caller for success or pending.
        //
        status = GetLastError();

        //
        // Undo the references taken for the thread and free its context.
        //
        OmDereferenceObject( Group );
        if ( DestinationNode != NULL ) {
            OmDereferenceObject( DestinationNode );
        }
        LocalFree(context);
        goto FnExit;
    }

FnExit:
    //
    // The handle is not needed once the thread is running.
    //
    if (threadHandle) CloseHandle( threadHandle );
    FmpReleaseLocalGroupLock( Group );
    return(status);
} // FmpCreateMovePendingThread
DWORD
FmpDoMoveGroup(
    IN PFM_GROUP Group,
    IN PNM_NODE DestinationNode,
    IN BOOL bChooseMostPreferredNode
    )
/*++

Routine Description:

    Entry point for moving a Group. The group is validated under the
    local group lock and the move is handed to FmpMoveGroup. If this
    node owned the group when the request started and the move came back
    as pending, a helper thread is started to finish it.

Arguments:

    Group - The Group to move.

    DestinationNode - The destination node for the move request.

    bChooseMostPreferredNode - Passed through to FmpMoveGroup. When it is
        FALSE and the move succeeded or is pending, a group failover
        event is written to the event log.

Returns:

    ERROR_SUCCESS if successful.

    ERROR_GROUP_NOT_AVAILABLE if the group is marked invalid or is
        pending.

    A Win32 error code on failure.

--*/
{
    PNM_NODE    pTargetNode = NULL;
    PNM_NODE    pOwnerAtEntry;
    DWORD       dwResult;
    BOOL        fMoveUnderway;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpDoMoveGroup: Entry\r\n");

    FmpAcquireLocalGroupLock( Group );

    //
    // Only one request at a time is supported on a group, and a group
    // that has been marked for delete cannot be moved at all.
    //
    if ( !IS_VALID_FM_GROUP(Group) || FmpIsGroupPending(Group) )
    {
        FmpReleaseLocalGroupLock( Group );
        return(ERROR_GROUP_NOT_AVAILABLE);
    }

    //
    // Remember who owned the group before the move was attempted.
    //
    pOwnerAtEntry = Group->OwnerNode;

    //
    // Note: the local group lock is released by the FmpMoveGroup routine.
    //
    dwResult = FmpMoveGroup( Group,
                             DestinationNode,
                             FALSE,
                             &pTargetNode,
                             bChooseMostPreferredNode );

    //
    // A pending move of a locally owned group is completed by a
    // separate thread.
    //
    if ( (dwResult == ERROR_IO_PENDING) &&
         (pOwnerAtEntry == NmLocalNode) )
    {
        dwResult = FmpCreateMovePendingThread( Group, pTargetNode );
    }

    //
    // Chittur Subbaraman (chitturs) - 7/31/2000
    //
    // Log an event to the eventlog if the group is moving due to a failure.
    //
    fMoveUnderway = ( dwResult == ERROR_SUCCESS ) ||
                    ( dwResult == ERROR_IO_PENDING );

    if ( fMoveUnderway && !bChooseMostPreferredNode )
    {
        CsLogEvent3( LOG_NOISE,
                     FM_EVENT_GROUP_FAILOVER,
                     OmObjectName(Group),
                     OmObjectName(NmLocalNode),
                     OmObjectName(pTargetNode) );
    }

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpDoMoveGroup: Exit, status = %1!u!\r\n",
        dwResult);

    return(dwResult);
} // FmpDoMoveGroup
DWORD
FmpTakeGroupRequest(
    IN PFM_GROUP Group,
    IN PRESOURCE_ENUM ResourceList
    )
/*++

Routine Description:

    Performs a Take Group Request from (THE) remote system and returns
    status for that request. On the success path the local node is
    recorded as the group owner through a GUM update, the intended owner
    is reset, and the resources in ResourceList are brought online.

Arguments:

    Group - The Group to take online locally.

    ResourceList - The list of resources and their states.

Return Value:

    ERROR_SUCCESS if ownership was taken, if the local node already owns
        the group, or if the set-owner update returned
        ERROR_GROUP_NOT_AVAILABLE.

    ERROR_RETRY for the quorum group when FM is not online here or the
        local node could not arbitrate for the quorum resource.

    ERROR_CLUSTER_NODE_SHUTTING_DOWN or ERROR_CLUSTER_NODE_NOT_READY when
        FM is not online (non-quorum groups).

    ERROR_CLUSTER_NODE_NOT_FOUND if the local node is not in the group's
        preferred list (non-quorum groups).

    Any other status returned by FmpSetOwnerForGroup.

Notes:

    The local group lock is acquired and released inside this routine.

--*/
{
    DWORD   status = ERROR_SUCCESS;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpTakeGroupRequest: To take group '%1!ws!'.\n",
        OmObjectId(Group) );

    FmpAcquireLocalGroupLock( Group );

    if ( !FmpFMOnline )
    {
        if (FmpShutdown)
            status = ERROR_CLUSTER_NODE_SHUTTING_DOWN;
        else
            status = ERROR_CLUSTER_NODE_NOT_READY;
        CL_LOGFAILURE(status);

        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpTakeGroupRequest: Group '%1!ws!' cannot be accepted, status=%2!u!...\n",
            OmObjectId(Group),
            status);
        //
        // Chittur Subbaraman (chitturs) - 7/5/2000
        //
        // Make sure you ask the source node to relocate the quorum group some place else
        // after consulting with MM.
        //
        if ( gpQuoResource->Group == Group ) status = ERROR_RETRY;
        goto FnExit;
    }

    //every body should be able to host the quorum group
    //so we dont check the preferred owner list for this group
    if ( ( gpQuoResource->Group != Group) &&
         !FmpInPreferredList( Group, NmLocalNode, FALSE, NULL) )
    {
        //
        // Nobody should ever ask us to take a group that can't run here.
        //
        status = ERROR_CLUSTER_NODE_NOT_FOUND;
        CL_LOGFAILURE( status);
        goto FnExit;
    }

    //
    // Take ownership of the Group.
    //
    if ( Group->OwnerNode == NmLocalNode ) {
        //SS:: We are already the owner ?? How did this happen
        status = ERROR_SUCCESS;
        goto FnExit;
    }

    //
    // Chittur Subbaraman (chitturs) - 5/18/99
    //
    // Handle quorum group in a special way. Make sure you can arbitrate
    // for the quorum resource. If not, you could get killed when you
    // try to bring it online and you fail.
    //
    if ( Group == gpQuoResource->Group )
    {
        status = FmpRmArbitrateResource( gpQuoResource );
        if ( status != ERROR_SUCCESS )
        {
            //
            // Log the arbitration status as well; it is replaced by
            // ERROR_RETRY below and would otherwise be lost.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[FM] FmpTakeGroupRequest: MM did not select local node %1!u! as the arbitration winner, Status = %2!u!...\n\r",
                NmLocalNodeId,
                status);
            status = ERROR_RETRY;
            goto FnExit;
        }
    }

    status = FmpSetOwnerForGroup( Group, NmLocalNode );

    if ( status != ERROR_SUCCESS )
    {
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpTakeGroupRequest: Set owner GUM update returns %1!u! for group <%2!ws!>...\n\r",
            status,
            OmObjectId(Group));
        if ( status == ERROR_GROUP_NOT_AVAILABLE )
        {
            //
            // If the node down processing GUM handler has claimed ownership
            // of this group, consider everything as being fine.
            //
            status = ERROR_SUCCESS;
        }
        goto FnExit;
    }

    //
    // The move has landed here; clear the intended owner. The result of
    // this update is not checked.
    //
    FmpSetIntendedOwnerForGroup(Group, ClusterInvalidNodeId);

    // prepare to bring this group online
    FmpPrepareGroupForOnline( Group );

    //
    // Online what needs to be online.
    //
    // SS: Note that we ignore the error from FmpOnlineResourceList
    // This is because at this point the onus of taking care of the group
    // is with us.
    //
    FmpOnlineResourceList( ResourceList, Group );

FnExit:
    FmpReleaseLocalGroupLock( Group );

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpTakeGroupRequest: Exit for group <%1!ws!>, Status = %2!u!...\n",
        OmObjectId(Group),
        status);

    return(status);
} // FmpTakeGroupRequest
DWORD
FmpUpdateChangeGroupName(
    IN BOOL SourceNode,
    IN LPCWSTR GroupId,
    IN LPCWSTR NewName
    )
/*++

Routine Description:

    GUM dispatch routine that renames a group. The group is looked up by
    its ID, its object name is replaced, and a group property change
    event is raised if the rename succeeded.

Arguments:

    SourceNode - Supplies whether or not this node initiated the GUM
                 update. Not used.

    GroupId - Supplies the ID of the group to rename.

    NewName - Supplies the new friendly name.

Return Value:

    ERROR_SUCCESS if the name was changed, or if nothing was done because
        FM groups are not initialized or FM is shutting down.

    ERROR_GROUP_NOT_FOUND if no group with the given ID exists.

    Otherwise the status returned by OmSetObjectName.

--*/
{
    PFM_GROUP   pTargetGroup;
    DWORD       dwResult = ERROR_SUCCESS;

    //
    // Chittur Subbaraman (chitturs) - 4/19/98
    //
    // Only act when FM groups are initialized and FM is not shutting
    // down; otherwise report success without touching anything.
    //
    if ( FmpFMGroupsInited && !FmpShutdown )
    {
        pTargetGroup = OmReferenceObjectById( ObjectTypeGroup, GroupId );
        if ( pTargetGroup == NULL )
        {
            return(ERROR_GROUP_NOT_FOUND);
        }

        dwResult = OmSetObjectName( pTargetGroup, NewName );
        if ( dwResult == ERROR_SUCCESS )
        {
            ClusterEvent( CLUSTER_EVENT_GROUP_PROPERTY_CHANGE, pTargetGroup );
        }

        //
        // Drop the reference taken by the lookup above.
        //
        OmDereferenceObject( pTargetGroup );
    }

    return(dwResult);
} // FmpUpdateChangeGroupName
BOOL
FmpEnumGroupNodeEvict(
    IN PVOID Context1,
    IN PVOID Context2,
    IN PVOID Object,
    IN LPCWSTR Name
    )
/*++

Routine Description:

    Group enumeration callback used when a node is evicted. If the
    evicted node appears in the group's preferred owners list, its entry
    is unlinked, the node reference held by the entry is dropped and the
    entry is freed. A group property change event is raised whether or
    not an entry was removed.

Arguments:

    Context1 - Supplies the node that is being evicted.

    Context2 - Not used.

    Object - Supplies a pointer to the group object.

    Name - Supplies the object name. Not used.

Return Value:

    TRUE, always, so that the enumeration continues.

--*/
{
    PNM_NODE            pEvictedNode = (PNM_NODE)Context1;
    PFM_GROUP           pGroup = (PFM_GROUP)Object;
    PLIST_ENTRY         pLink;
    PPREFERRED_ENTRY    pOwnerEntry;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] EnumGroupNodeEvict: Removing references to node %1!ws! from group %2!ws!\n",
        OmObjectId(pEvictedNode),
        OmObjectId(pGroup));

    FmpAcquireLocalGroupLock(pGroup);

    //
    // Scan the preferred owners list for the evicted node. At most one
    // entry is removed; the scan stops as soon as it is found.
    //
    pLink = pGroup->PreferredOwners.Flink;
    while ( pLink != &(pGroup->PreferredOwners) )
    {
        pOwnerEntry = CONTAINING_RECORD( pLink,
                                         PREFERRED_ENTRY,
                                         PreferredLinkage );

        if ( pOwnerEntry->PreferredNode == pEvictedNode )
        {
            RemoveEntryList(&pOwnerEntry->PreferredLinkage);
            OmDereferenceObject(pOwnerEntry->PreferredNode);
            LocalFree(pOwnerEntry);
            break;
        }

        pLink = pLink->Flink;
    }

    FmpReleaseLocalGroupLock(pGroup);

    ClusterEvent(CLUSTER_EVENT_GROUP_PROPERTY_CHANGE, pGroup);

    return(TRUE);
} // FmpEnumGroupNodeEvict
VOID
FmpSignalGroupWaiters(
    IN PFM_GROUP Group
    )
/*++

Routine Description:

    Releases every thread queued on the group's wait queue. Each wait
    block is removed from the queue, its status is set to ERROR_SUCCESS
    and its event is signalled, all under the local group lock.

Arguments:

    Group - Supplies the group whose waiters are to be woken.

Return Value:

    None.

--*/
{
    PFM_WAIT_BLOCK  pWaiter;
    PLIST_ENTRY     pLink;

    FmpAcquireLocalGroupLock( Group );

    //
    // Drain the queue from the head until nothing is left.
    //
    for ( ;; )
    {
        if ( IsListEmpty(&Group->WaitQueue) )
        {
            break;
        }

        pLink = RemoveHeadList(&Group->WaitQueue);
        pWaiter = CONTAINING_RECORD( pLink,
                                     FM_WAIT_BLOCK,
                                     ListEntry );

        //
        // Publish the result before waking the waiter.
        //
        pWaiter->Status = ERROR_SUCCESS;
        SetEvent(pWaiter->hEvent);
    }

    FmpReleaseLocalGroupLock( Group );
}
DWORD
FmpWaitForGroup(
    IN PFM_GROUP Group
    )
/*++

Routine Description:

    Waits for a group to reach a stable state. If the group is already
    offline, online or failed, or is not in the pending state, the
    routine returns immediately. Otherwise a wait block is queued on the
    group and the routine blocks until either the wait block's event is
    signalled or FmpShutdownEvent is signalled.

Arguments:

    Group - supplies the group

Comments:

    The group lock must be held when this is called. This function
    releases the group lock on every return path, and always before
    blocking.

Return Value:

    ERROR_SUCCESS if no wait was needed.

    ERROR_SHUTDOWN_IN_PROGRESS if the cluster is being shutdown.

    The CreateEvent failure code if the wait event cannot be created.

    Otherwise the status stored in the wait block by whoever signalled it.

--*/
{
    FM_WAIT_BLOCK       WaitBlock;
    HANDLE              WaitArray[2];
    DWORD               Status;
    CLUSTER_GROUP_STATE GroupState;

    WaitBlock.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (WaitBlock.hEvent == NULL) {
        //
        // Read the failure code before releasing the lock so that nothing
        // done during the release can overwrite the last-error value.
        //
        Status = GetLastError();
        FmpReleaseLocalGroupLock( Group );
        return(Status);
    }

    //
    // Check to see if it transitioned before we got the lock.
    //
    GroupState = FmpGetGroupState( Group , TRUE );
    if ((GroupState == ClusterGroupOffline) ||
        (GroupState == ClusterGroupOnline) ||
        (GroupState == ClusterGroupFailed)) {

        CloseHandle( WaitBlock.hEvent );
        FmpReleaseLocalGroupLock( Group );
        return(ERROR_SUCCESS);
    }

    //
    // Chittur Subbaraman (chitturs) - 10/31/1999
    //
    // Now before waiting, really make sure one or more resources in the
    // group is in pending state.
    //
    GroupState = FmpGetGroupState( Group, FALSE );

    if ( GroupState != ClusterGroupPending ) {
        CloseHandle( WaitBlock.hEvent );
        FmpReleaseLocalGroupLock( Group );
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpWaitForGroup: Group <%1!ws!> state is %2!d!, not waiting for event...\r\n",
            OmObjectName(Group),
            GroupState );
        return( ERROR_SUCCESS );
    }

    //
    // Add this wait block to the queue. The block lives on this thread's
    // stack, so it is only valid while this routine is running.
    //
    InsertTailList(&Group->WaitQueue, &WaitBlock.ListEntry);

    FmpReleaseLocalGroupLock( Group );

    //
    // Wait for the group to become stable or for the cluster to shutdown.
    // The shutdown event is first in the array, so a wait result of 0
    // means shutdown.
    //
    WaitArray[0] = FmpShutdownEvent;
    WaitArray[1] = WaitBlock.hEvent;

    Status = WaitForMultipleObjects(2, WaitArray, FALSE, INFINITE);
    CloseHandle(WaitBlock.hEvent);

    if (Status == 0) {
        //
        // NOTE(review): on this path the wait block has not been removed
        // from Group->WaitQueue, yet it goes out of scope on return. A
        // later FmpSignalGroupWaiters call would write through it.
        // Presumably tolerated because the service is shutting down -
        // confirm.
        //
        return(ERROR_SHUTDOWN_IN_PROGRESS);
    } else {
        //
        // NOTE(review): if the wait itself failed, WaitBlock.Status was
        // never written by a signaller - confirm whether that can occur.
        //
        return(WaitBlock.Status);
    }
}
/****
@func       DWORD | FmpDeleteGroup| This makes the gum call to delete the
            group. The update carries a single argument: the group's
            object ID as a NUL-terminated wide string.

@parm       IN PFM_GROUP | pGroup | The group that must be deleted.

@comm       The group lock must be held when calling this api.

@rdesc      Returns the status of the GUM update. ERROR_SUCCESS on success.
****/
DWORD
FmpDeleteGroup(
    IN PFM_GROUP pGroup)
{
    PCWSTR  pszGroupId;
    DWORD   dwGroupLen;
    DWORD   dwStatus;

    pszGroupId = OmObjectId( pGroup );

    //
    // Size of the ID in bytes, including the terminating NUL.
    //
    dwGroupLen = (lstrlenW(pszGroupId)+1) * sizeof(WCHAR);

    //
    // Send message.
    //
    dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
                               FmUpdateDeleteGroup,
                               1,
                               dwGroupLen,
                               pszGroupId);

    return(dwStatus);
}
VOID
FmpGroupLastReference(
    IN PFM_GROUP pGroup
    )
/*++

Routine Description:

    Last dereference to group object processing routine.
    All cleanup for a group should really be done here!

    Drops the reference held on the group's owner node, if there is one,
    and deletes the group's critical section if the group was marked as
    created.

Arguments:

    pGroup - Pointer to the group being removed.

Return Value:

    None.

--*/
{
    //
    // Let go of the owner node, if any.
    //
    if ( pGroup->OwnerNode != NULL )
    {
        OmDereferenceObject(pGroup->OwnerNode);
    }

    //
    // The lock is only torn down for groups flagged as created.
    //
    if ( (pGroup->dwStructState & FM_GROUP_STRUCT_CREATED) != 0 )
    {
        DeleteCriticalSection(&pGroup->Lock);
    }
} // FmpGroupLastReference
DWORD
FmpDoMoveGroupOnFailure(
IN LPVOID pContext
)
/*++
Routine Description:
Move a group after ensuring that all resources in the group are
in stable state. This thread is forked from FmpHandleGroupFailure.
The routine polls, sleeping 200 ms between attempts, until no resource
in the group has a state greater than ClusterResourcePending, and then
calls FmpDoMoveGroup with no destination node. It gives up without
moving if someone else has taken over the move, if FM is shutting down,
or if the group is no longer valid.
Arguments:
pContext - Pointer to the MOVE_GROUP structure to move. It is freed with
LocalFree before this routine returns.
Returns:
ERROR_SUCCESS, always. The result of the move is only logged.
--*/
{
PMOVE_GROUP pMoveGroup = ( PMOVE_GROUP ) pContext;
PFM_GROUP pGroup;
DWORD dwStatus;
PLIST_ENTRY pListEntry;
PFM_RESOURCE pResource;
//
// Chittur Subbaraman (chitturs) - 4/13/99
//
// This thread first waits until all the resources within the
// failed group are in stable state and then initiates the
// move.
//
pGroup = pMoveGroup->Group;
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure: Entry for Group <%1!ws!>...\n",
OmObjectId(pGroup));
// Every pass through the checks below starts here with the lock
// re-acquired; the retry path releases it before sleeping.
TryAgain:
FmpAcquireLocalGroupLock( pGroup );
//
// This thread must yield if someone else takes responsibility for
// the move.
//
// Condition 1: Protects against the case in which someone moves
// the group to another node and back to you while this thread is
// sleeping (very rare, I agree).
//
// Condition 2: Protects against the common move case.
//
// Condition 3: Protects against the case in which the
// FmpMovePendingThread is waiting in FmpWaitForGroup while
// this thread got the group lock and reached here.
//
if ( ( pGroup->dwStructState &
FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE ) ||
( pGroup->OwnerNode != NmLocalNode ) ||
( pGroup->MovingList != NULL ) )
{
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure: Group <%1!ws!> move being yielded to someone else who is moving it...\n",
OmObjectId(pGroup));
goto FnExit;
}
//
// If FM is shutting down, just exit.
//
if ( FmpShutdown )
{
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure: Giving up Group <%1!ws!> move. FM is shutting down ...\n",
OmObjectId(pGroup));
goto FnExit;
}
//
// If the group has been marked for delete, then also exit. This is
// just an optimization. FmpDoMoveGroup does this check also.
//
if ( !IS_VALID_FM_GROUP( pGroup ) )
{
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure: Group <%1!ws!> marked for delete. Exiting ...\n",
OmObjectId(pGroup));
goto FnExit;
}
//
// Wait until all resources within the group become stable. A resource
// whose state compares greater than ClusterResourcePending is treated
// as not yet stable: drop the lock, sleep and start over.
//
for ( pListEntry = pGroup->Contains.Flink;
pListEntry != &(pGroup->Contains);
pListEntry = pListEntry->Flink )
{
pResource = CONTAINING_RECORD( pListEntry,
FM_RESOURCE,
ContainsLinkage );
if ( pResource->State > ClusterResourcePending )
{
FmpReleaseLocalGroupLock( pGroup );
Sleep ( 200 );
goto TryAgain;
}
}
//
// Initiate a move now that the group is quiet. The lock acquired at
// TryAgain is still held by this thread across this call and is
// released at FnExit.
//
dwStatus = FmpDoMoveGroup( pGroup, NULL, FALSE );
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure: FmpDoMoveGroup returns %1!u!\n",
dwStatus);
FnExit:
LocalFree( pContext );
// Clear both move markers while the group lock is still held.
pGroup->dwStructState &=
~( FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL | FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE );
FmpReleaseLocalGroupLock( pGroup );
// Presumably drops the reference taken by whoever started this thread -
// TODO confirm against the creator.
OmDereferenceObject( pGroup );
ClRtlLogPrint(LOG_NOISE,
"[FM] FmpDoMoveGroupOnFailure Exit.\n");
return( ERROR_SUCCESS );
} // FmpDoMoveGroupOnFailure
/****
@func       DWORD | FmpSetIntendedOwnerForGroup| Issues the GUM update
            that records the intended owner of a group before a move.
            The update carries the group's object ID followed by the
            node ID.

@parm       IN PFM_GROUP | pGroup | The group whose intended owner
            is to be set.

@parm       IN DWORD | dwNodeId | The ID of the node that is intended
            to own the group.

@comm       The local group lock is held while making this call.

@rdesc      Returns the status of the GUM update. ERROR_SUCCESS on success.
****/
DWORD FmpSetIntendedOwnerForGroup(
    IN PFM_GROUP pGroup,
    IN DWORD     dwNodeId)
{
    PCWSTR  pszId = OmObjectId( pGroup );
    DWORD   cbId;

    //
    // Byte count of the group ID, terminating NUL included.
    //
    cbId = (lstrlenW(pszId)+1) * sizeof(WCHAR);

    //
    // Hand both arguments to GUM and pass its verdict straight back.
    //
    return(GumSendUpdateEx(GumUpdateFailoverManager,
                           FmUpdateGroupIntendedOwner,
                           2,
                           cbId,
                           pszId,
                           sizeof(DWORD),
                           &dwNodeId
                           ));
}
/****
@func       DWORD | FmpSetOwnerForGroup | Issues the GUM update with
            which the new owner node of a moved group announces its
            ownership. The update carries the group's object ID
            followed by the node's object ID.

@parm       IN PFM_GROUP | pGroup | The group whose owner must be set.

@parm       IN PNM_NODE | pNode | The group's owner node.

@comm       The local group lock is held while making this call.

@rdesc      Returns the status of the GUM update. ERROR_SUCCESS on success.
****/
DWORD FmpSetOwnerForGroup(
    IN PFM_GROUP pGroup,
    IN PNM_NODE  pNode
    )
{
    PCWSTR  pszGroup;
    PCWSTR  pszOwner;
    DWORD   cbGroup;
    DWORD   cbOwner;

    //
    // Both IDs travel as NUL-terminated wide strings; the sizes are in
    // bytes and include the terminator.
    //
    pszGroup = OmObjectId( pGroup );
    cbGroup = (lstrlenW(pszGroup)+1) * sizeof(WCHAR);

    pszOwner = OmObjectId(pNode);
    cbOwner = (lstrlenW(pszOwner)+1) * sizeof(WCHAR);

    //
    // Send message and return whatever GUM reports.
    //
    return(GumSendUpdateEx(GumUpdateFailoverManager,
                           FmUpdateCheckAndSetGroupOwner,
                           2,
                           cbGroup,
                           pszGroup,
                           cbOwner,
                           pszOwner
                           ));
}
PNM_NODE
FmpGetNodeNotHostingUndesiredGroups(
    IN PFM_GROUP pGroup,
    IN BOOL fRuleOutLocalNode
    )

/*++

Routine Description:

    Find a preferred node that does not host groups with CLUSREG_NAME_GRP_ANTI_AFFINITY_CLASS_NAME
    property set to the same value as the supplied group.

Arguments:

    pGroup - Pointer to the group object we're checking.

    fRuleOutLocalNode - TRUE if the local node must not be considered as a candidate.

Return Value:

    Pointer to node object that satisfies the anti-affinity condition.

    NULL if a node cannot be found. NULL is also returned when the cluster is in mixed
    mode, the group has no anti-affinity class name, the candidate list cannot be
    allocated, or no candidate was pruned.

Note:

    The antiaffinity property value is defined as a MULTI_SZ property. However for this implementation
    we ignore all the string values beyond the first value. The MULTI_SZ definition is to allow
    future expansion of the algorithm implemented by this function.

--*/

{
    PLIST_ENTRY                 plistEntry;
    PPREFERRED_ENTRY            pPreferredEntry;
    GROUP_AFFINITY_NODE_INFO    GroupAffinityNodeInfo;
    PNM_NODE                    pNode = NULL;
    DWORD                       dwIndex = 0, i;
    DWORD                       dwClusterHighestVersion;

    GroupAffinityNodeInfo.ppNmNodeList = NULL;

    //
    //  Chittur Subbaraman (chitturs) - 3/6/2001
    //
    //  This function works as follows. First, it makes a list of possible candidate nodes that the
    //  group can be hosted on. Next, it enumerates all groups in the cluster and for those
    //  groups that have the AntiAffinityClassName property set, it will remove those group's
    //  current owner nodes from the list of possible candidate nodes if they are present there.
    //  Note that this function will return a node only if the pruning has positively taken place.
    //  Else, it will return NULL.
    //
    //  IMPORTANT NOTE: This function is called by all nodes from the node down processing FM
    //  GUM handler. For all nodes to reach exactly the same decision on the group placement,
    //  it is crucial that all nodes call this function for groups in exactly the same order.
    //  E.g., if node 1 was hosting groups A, B and C and it died, then all the remaining nodes
    //  must call this function first for group A, then for group B and finally for group C.
    //  This is because once group A is placed by this function, then group B's placement is
    //  influenced by group A's placement and similarly for groups B and C. This order is
    //  ensured since all nodes OM will maintain groups in the same order since OM creates this
    //  list based on enumerating the group key (under Cluster\Groups) and that must occur in the
    //  same order in all nodes.
    //

    //
    //  It is too bad that we can't hold any locks while enumerating groups and looking at the
    //  property field since that will soon result in a deadlock (since we can't hold group locks
    //  from within a GUM and this function is invoked from a GUM).
    //

    //
    //  If we are dealing with the mixed mode cluster or if the group does not have the antiaffinity
    //  property set, then don't do anything.
    //
    NmGetClusterOperationalVersion( &dwClusterHighestVersion,
                                    NULL,
                                    NULL );

    if ( ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) < NT51_MAJOR_VERSION ) ||
         ( pGroup->lpszAntiAffinityClassName == NULL ) )
    {
        goto FnExit;
    }

    //
    //  Initialize the node list. LPTR zero-fills the array, so unused slots read as NULL.
    //
    GroupAffinityNodeInfo.ppNmNodeList = LocalAlloc ( LPTR,
                                                      ClusterDefaultMaxNodes * sizeof ( PNM_NODE ) );

    if ( GroupAffinityNodeInfo.ppNmNodeList == NULL )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpGetNodeNotHostingUndesiredGroups: Failed in alloc, Status %1!d!\n",
                      GetLastError());
        goto FnExit;
    }

    //
    //  For each entry in the preferred list, find a system that is up and that does not
    //  host any groups with an anti-affinity to the supplied group.
    //
    for ( plistEntry = pGroup->PreferredOwners.Flink;
          plistEntry != &(pGroup->PreferredOwners);
          plistEntry = plistEntry->Flink )
    {
        pPreferredEntry = CONTAINING_RECORD( plistEntry,
                                             PREFERRED_ENTRY,
                                             PreferredLinkage );

        if ( NmGetNodeState( pPreferredEntry->PreferredNode ) == ClusterNodeUp )
        {
            if ( ( fRuleOutLocalNode ) &&
                 ( pPreferredEntry->PreferredNode == NmLocalNode ) ) continue;

            //
            //  The candidate array has exactly ClusterDefaultMaxNodes slots. Never write
            //  past it, even if the preferred list turns out to be longer than expected.
            //
            if ( dwIndex >= ClusterDefaultMaxNodes )
            {
                ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpGetNodeNotHostingUndesiredGroups: Candidate list full for group %1!ws!, ignoring remaining preferred owners...\n",
                              OmObjectId(pGroup));
                break;
            }

            GroupAffinityNodeInfo.ppNmNodeList[dwIndex] = pPreferredEntry->PreferredNode;
            dwIndex ++;
        }
    } // for

    //
    //  Initialize the other fields in the GroupAffinityNodeInfo structure.
    //
    GroupAffinityNodeInfo.pGroup = pGroup;
    GroupAffinityNodeInfo.fDidPruningOccur = FALSE;

    //
    //  Enumerate all the groups and rule out nodes that host groups with the supplied
    //  anti-affinity property set.
    //
    OmEnumObjects ( ObjectTypeGroup,
                    FmpCheckForAntiAffinityProperty,
                    pGroup->lpszAntiAffinityClassName,
                    &GroupAffinityNodeInfo );

    //
    //  No pruning occurred so far. So, don't proceed further and let the caller decide on
    //  a best node for the group using some other algorithm.
    //
    if ( GroupAffinityNodeInfo.fDidPruningOccur == FALSE )
    {
        goto FnExit;
    }

    //
    //  Now, pick the first node from the list that is a valid node. Pruned slots were
    //  set to NULL by the enumeration callback.
    //
    for ( i=0; i<ClusterDefaultMaxNodes; i++ )
    {
        if ( GroupAffinityNodeInfo.ppNmNodeList[i] != NULL )
        {
            pNode = GroupAffinityNodeInfo.ppNmNodeList[i];
            ClRtlLogPrint(LOG_NOISE, "[FM] FmpGetNodeNotHostingUndesiredGroups: Choosing node %1!d! for group %2!ws! [%3!ws!]...\n",
                          NmGetNodeId(pNode),
                          OmObjectId(pGroup),
                          OmObjectName(pGroup));
            goto FnExit;
        }
    } // for

FnExit:
    //
    //  The list pointer is NULL on the early-exit paths; that is passed to LocalFree as is.
    //
    LocalFree( GroupAffinityNodeInfo.ppNmNodeList );
    return( pNode );
} // FmpGetNodeNotHostingUndesiredGroups
BOOL
FmpCheckForAntiAffinityProperty(
    IN LPCWSTR lpszAntiAffinityClassName,
    IN PGROUP_AFFINITY_NODE_INFO pGroupAffinityNodeInfo,
    IN PFM_GROUP pGroup,
    IN LPCWSTR lpszGroupName
    )

/*++

Routine Description:

    Group enumeration callback. If the enumerated group carries the anti-affinity class
    name being checked, its owner node is struck from the candidate node list.

Arguments:

    lpszAntiAffinityClassName - The class name to compare against.

    pGroupAffinityNodeInfo - Holds the candidate node list that may be pruned, the group
        being placed, and the flag that records whether pruning happened.

    pGroup - The group currently being enumerated.

    lpszGroupName - The enumerated group's name. Not used by this routine.

Return Value:

    TRUE - to indicate that the enumeration should continue. This routine always
           returns TRUE.

--*/

{
    PNM_NODE    pCandidate;
    DWORD       dwSlot;

    //
    //  Nothing to do for a group that has no anti-affinity class name, that is the very
    //  group being placed, or whose class name differs from the one being checked.
    //
    if ( pGroup->lpszAntiAffinityClassName == NULL )
    {
        return( TRUE );
    }

    if ( pGroup == pGroupAffinityNodeInfo->pGroup )
    {
        return( TRUE );
    }

    if ( lstrcmp ( lpszAntiAffinityClassName, pGroup->lpszAntiAffinityClassName ) != 0 )
    {
        return( TRUE );
    }

    //
    //  The enumerated group shares the class name. Look for its owner node among the
    //  candidates and strike out the first matching slot.
    //
    dwSlot = 0;
    while ( dwSlot < ClusterDefaultMaxNodes )
    {
        pCandidate = pGroupAffinityNodeInfo->ppNmNodeList[dwSlot];

        if ( ( pCandidate != NULL ) &&
             ( pGroup->OwnerNode == pCandidate ) )
        {
            ClRtlLogPrint(LOG_NOISE, "[FM] FmpCheckForAntiAffinityProperty: Pruning node %1!d! for group %2!ws! due to "
                          "group %3!ws!, AntiAffinityClassName=%4!ws!...\n",
                          NmGetNodeId(pCandidate),
                          OmObjectId(pGroupAffinityNodeInfo->pGroup),
                          OmObjectId(pGroup),
                          lpszAntiAffinityClassName);

            pGroupAffinityNodeInfo->ppNmNodeList[dwSlot] = NULL;

            //
            //  Record that a candidate was pruned.
            //
            pGroupAffinityNodeInfo->fDidPruningOccur = TRUE;
            break;
        }

        dwSlot++;
    } // while

    return( TRUE );
} // FmpCheckForAntiAffinityProperty
PNM_NODE
FmpPickNodeFromPreferredListAtRandom(
    IN PFM_GROUP pGroup,
    IN PNM_NODE pSuggestedPreferredNode  OPTIONAL,
    IN BOOL fRuleOutLocalNode,
    IN BOOL fCheckForDisablingRandomization
    )

/*++

Routine Description:

    Find a preferred node for the group that is UP in a random fashion.

Arguments:

    pGroup - Pointer to the group object we're interested in.

    pSuggestedPreferredNode - Suggested fallback option in case this random result is undesired. OPTIONAL

    fRuleOutLocalNode - Should the local node be ruled out from consideration.

    fCheckForDisablingRandomization - Check whether randomization should be disabled.

Return Value:

    The preferred node that is picked. The node is returned without an extra reference;
    this routine relies on the reference held on behalf of the group's preferred list.

    The suggested node (which may be NULL) if no node could be picked at random.

Comments:

    This function is called from both FmpMoveGroup as well as from FmpNodeDown. In the former case,
    we will have a non-NULL suggested preferred node, rule out local node option, check
    for property setting disabling randomization and check for mixed mode clusters to disable
    randomization. In the latter case, these parameters are the opposite.

--*/

{
    const DWORD     dwMaxRetries = 25;
    UUID            uuId;
    USHORT          usHashValue;
    PNM_NODE        pNode = NULL, pSelectedNode = pSuggestedPreferredNode;
    DWORD           dwNodeId;
    DWORD           dwRetry = 0;
    DWORD           dwStatus;
    DWORD           dwDisabled = 0;
    DWORD           dwClusterHighestVersion;

    //
    //  Chittur Subbaraman (chitturs) - 4/18/2001
    //
    if ( fCheckForDisablingRandomization )
    {
        //
        //  If you are here, this means you are coming as a part of a user-initiated move.
        //  Check whether the randomization applies.
        //

        //
        //  First, check if we are operating in a mixed version cluster. If so, don't randomize.
        //
        NmGetClusterOperationalVersion( &dwClusterHighestVersion,
                                        NULL,
                                        NULL );

        if ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) <
                    NT51_MAJOR_VERSION )
        {
            return ( pSelectedNode );
        }

        //
        //  Next check if the user has turned off the randomization algorithm by setting
        //  HKLM\Cluster\DisableGroupPreferredOwnersRandomization DWORD to 1.
        //
        dwStatus = DmQueryDword( DmClusterParametersKey,
                                 CLUSREG_NAME_DISABLE_GROUP_PREFERRED_OWNER_RANDOMIZATION,
                                 &dwDisabled,
                                 NULL );

        if ( ( dwStatus == ERROR_SUCCESS ) &&
             ( dwDisabled == 1 ) )
        {
            return ( pSelectedNode );
        }
    }

    //
    //  This function will attempt to pick a node at random from the group's preferred owners list
    //  in case the caller does not suggest a preferred node which is set by the user. So, first
    //  this function checks this case and bails out if the condition is met. Otherwise, it
    //  will generate a random number between 1 and NmMaxNodeId and see if (a) that node is in
    //  the group's preferred list, and (b) that node is UP. If so, it picks up the node. Note
    //  that the function will try dwMaxRetries (25) times to pick a node and then gives up. If no
    //  node is found, this function will return the suggested node which in some cases could be
    //  NULL.
    //
    ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Picking node for group %1!ws! [%2!ws!], suggested node %3!u!...\n",
                  OmObjectId(pGroup),
                  OmObjectName(pGroup),
                  (pSuggestedPreferredNode == NULL) ? 0:NmGetNodeId(pSuggestedPreferredNode));

    if ( ( pSuggestedPreferredNode != NULL ) &&
         ( FmpIsNodeUserPreferred ( pGroup, pSuggestedPreferredNode ) ) )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Node %2!u! for group %1!ws! is user preferred...\n",
                      OmObjectId(pGroup),
                      NmGetNodeId(pSuggestedPreferredNode));
        goto FnExit;
    }

    if ( pGroup->lpszAntiAffinityClassName != NULL )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Group %1!ws! has antiaffinity property set...\n",
                      OmObjectId(pGroup));
        goto FnExit;
    }

    //
    //  The group ID does not change between retries, so derive its hash once up front
    //  instead of on every pass through the retry loop.
    //
    dwStatus = UuidFromString( ( LPWSTR ) OmObjectId(pGroup), &uuId );

    if ( dwStatus != RPC_S_OK )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPickNodeFromPreferredListAtRandom: Unable to get UUID from string %1!ws!, Status %2!u!...\n",
                      OmObjectId(pGroup),
                      dwStatus);
        goto FnExit;
    }

    usHashValue = UuidHash( &uuId, &dwStatus );

    if ( dwStatus != RPC_S_OK )
    {
        ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPickNodeFromPreferredListAtRandom: Unable to get hash value for UUID %1!ws!, Status %2!u!...\n",
                      OmObjectId(pGroup),
                      dwStatus);
        goto FnExit;
    }

    //
    //  Retry 25 times so that we can have a good chance of getting a valid node. Note that
    //  the random value is scaled to NmMaxNodeId, which per the original author equals the
    //  node limit of 16. So, to get a valid node in a smaller size cluster, we have to have
    //  the retry count to be reasonable.
    //
    while ( dwRetry++ < dwMaxRetries )
    {
        //
        //  Seed the random number generator with a value that is as random as it gets.
        //  The hash is offset by 1 so that a hash value of 0 cannot collapse the seed
        //  to 0 on every retry, which would make every retry draw the same node ID.
        //
        srand( GetTickCount() * ( ( DWORD ) usHashValue + 1 ) * ( dwRetry + 1 ) );

        //
        //  Find the node ID that is between ClusterMinNodeId and NmMaxNodeId. We use NmMaxNodeId
        //  here since there is no simple way to get the count of configured nodes. Note that we
        //  have to ensure that the node ID falls within this range, otherwise assertion trips
        //  in NmReferenceNodeById.
        //
        dwNodeId = ( DWORD ) ( ( double ) rand() / ( double ) ( RAND_MAX ) * NmMaxNodeId ) + 1;

        if ( dwNodeId > NmMaxNodeId ) dwNodeId = NmMaxNodeId;
        if ( dwNodeId < ClusterMinNodeId ) dwNodeId = ClusterMinNodeId;

        //
        //  In case the caller asks you to rule out local node, do so.
        //
        if ( ( fRuleOutLocalNode ) && ( dwNodeId == NmLocalNodeId ) ) continue;

        //
        //  Reference and dereference the node objects. Note that we are only interested in
        //  getting a pointer to the node object and we use the fact that the node in the preferred
        //  list must be referenced.
        //
        pNode = NmReferenceNodeById ( dwNodeId );

        if ( pNode == NULL ) continue;

        if ( ( FmpInPreferredList( pGroup, pNode, FALSE, NULL ) ) &&
             ( NmGetExtendedNodeState( pNode ) == ClusterNodeUp ) )
        {
            //
            //  Leave pNode set; the reference taken above is dropped at FnExit.
            //
            pSelectedNode = pNode;
            break;
        }

        OmDereferenceObject ( pNode );
        pNode = NULL;
    }// while

FnExit:
    if ( pNode != NULL ) OmDereferenceObject ( pNode );

    ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Selected node %2!u! for group %1!ws!...\n",
                  OmObjectId(pGroup),
                  (pSelectedNode == NULL) ? 0:NmGetNodeId(pSelectedNode));

    return ( pSelectedNode );
}// FmpPickNodeFromPreferredListAtRandom
BOOL
FmpIsNodeUserPreferred(
    IN PFM_GROUP pGroup,
    IN PNM_NODE pPreferredNode
    )

/*++

Routine Description:

    Determine whether the supplied node appears in the preferred owners list stored
    under the group's registry key (CLUSREG_NAME_GRP_PREFERRED_OWNERS).

Arguments:

    pGroup - Pointer to the group object we're interested in.

    pPreferredNode - The node to look for.

Return Value:

    TRUE - The supplied node is listed in the stored preferred owners value.

    FALSE - The node is not listed, or the value could not be read.

--*/

{
    LPWSTR      lpmszOwners = NULL;
    LPCWSTR     lpszOwnerId;
    DWORD       cbOwners = 0;
    DWORD       cbAllocated = 0;
    DWORD       dwEntry = 0;
    DWORD       dwStatus;
    PNM_NODE    pListedNode;
    BOOL        fMatch = FALSE;

    //
    //  Read the preferred owners multi-sz from the group's registry key.
    //
    dwStatus = DmQueryMultiSz( pGroup->RegistryKey,
                               CLUSREG_NAME_GRP_PREFERRED_OWNERS,
                               &lpmszOwners,
                               &cbAllocated,
                               &cbOwners );

    if ( dwStatus == ERROR_SUCCESS )
    {
        //
        //  Walk the multi-sz one node ID at a time until the node is found or the
        //  list is exhausted.
        //
        while ( !fMatch )
        {
            lpszOwnerId = ClRtlMultiSzEnum( lpmszOwners,
                                            cbOwners/sizeof(WCHAR),
                                            dwEntry++ );

            if ( lpszOwnerId == NULL )
            {
                break;
            }

            pListedNode = OmReferenceObjectById( ObjectTypeNode,
                                                 lpszOwnerId );

            if ( pListedNode == NULL )
            {
                //
                //  NOTE(review): dwStatus is always ERROR_SUCCESS at this point, so the
                //  status printed here does not describe the lookup failure -- confirm
                //  whether OmReferenceObjectById reports an error that should be logged.
                //
                ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpIsNodeUserPreferred: Unable to reference node %1!ws!, Status %2!u!...\n",
                              lpszOwnerId,
                              dwStatus);
                continue;
            }

            //
            //  Only the pointer identity matters; drop the reference right away.
            //
            fMatch = ( pListedNode == pPreferredNode );
            OmDereferenceObject ( pListedNode );
        } // while
    }

    LocalFree ( lpmszOwners );

    return ( fMatch );
}// FmpIsNodeUserPreferred
DWORD
FmpPrepareGroupNodeList(
    OUT PFM_GROUP_NODE_LIST *ppGroupNodeList
    )

/*++

Routine Description:

    Build a single contiguous buffer that holds, for the enumerated groups, the group ID
    and the ID of a preferred owner node chosen for it.

Arguments:

    ppGroupNodeList - Receives a pointer to the allocated buffer. It is not written when
        randomization is disabled.

Return Value:

    ERROR_CLUSTER_INVALID_REQUEST if randomization has been disabled.

    The LocalAlloc failure code if the initial buffer cannot be allocated.

    Otherwise, the value returned by OmEnumObjects.

--*/

{
    PFM_GROUP_NODE_LIST pList;
    DWORD               cbList = 512;   // Initial buffer size; the enumeration callback may grow it.
    DWORD               dwRandomizationOff = 0;
    DWORD               dwStatus;

    //
    //  The user can turn the randomization algorithm off by setting
    //  HKLM\Cluster\DisableGroupPreferredOwnersRandomization DWORD to 1. Honor that first.
    //
    dwStatus = DmQueryDword( DmClusterParametersKey,
                             CLUSREG_NAME_DISABLE_GROUP_PREFERRED_OWNER_RANDOMIZATION,
                             &dwRandomizationOff,
                             NULL );

    if ( ( dwStatus == ERROR_SUCCESS ) &&
         ( dwRandomizationOff == 1 ) )
    {
        return ( ERROR_CLUSTER_INVALID_REQUEST );
    }

    //
    //  The list lives in one contiguous allocation so that the whole buffer can be
    //  handed to GUM.
    //
    pList = LocalAlloc( LPTR, cbList );
    *ppGroupNodeList = pList;

    if ( pList == NULL )
    {
        dwStatus = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPrepareGroupNodeList: Memory alloc failed, Status %1!u!...\n",
                      dwStatus);
        return ( dwStatus );
    }

    //
    //  An empty list is just the header, i.e. the structure without its first entry.
    //
    pList->cbGroupNodeList = sizeof ( FM_GROUP_NODE_LIST ) -
                                sizeof ( FM_GROUP_NODE_LIST_ENTRY );

    //
    //  Visit every group; the callback picks a (possibly random) preferred owner for
    //  each group and appends it to the buffer.
    //
    return OmEnumObjects ( ObjectTypeGroup,
                           FmpAddGroupNodeToList,
                           ppGroupNodeList,
                           &cbList );
}// FmpPrepareGroupNodeList
DWORD
FmpAddGroupNodeToList(
    IN PFM_GROUP_NODE_LIST *ppGroupNodeList,
    IN LPDWORD pcbBuffer,
    IN PFM_GROUP pGroup,
    IN LPCWSTR lpszGroupId
    )

/*++

Routine Description:

    Find a random preferred owner for the given group and add the info to a buffer.

Arguments:

    ppGroupNodeList - Pointer to a buffer containing group IDs and preferred nodes. The
        buffer is replaced by a larger one when the new entry does not fit.

    pcbBuffer - Size of the buffer in bytes. Updated when the buffer is reallocated.

    pGroup - Group whose preferred node is to be found.

    lpszGroupId - ID of the group.

Return Value:

    Always TRUE, including when the group is skipped or the buffer cannot be grown
    (presumably so that the group enumeration carries on with the next group).

--*/

{
    PNM_NODE                    pNode;
    PFM_GROUP_NODE_LIST_ENTRY   pGroupNodeListEntry;
    PFM_GROUP_NODE_LIST         pBuffer;
    DWORD                       dwStatus;
    DWORD                       cbEntry;
    DWORD                       cbRequired;
    DWORD                       cbNewBuffer;

    //
    //  Get the group lock since you manipulate group lists here.
    //
    FmpAcquireLocalGroupLock ( pGroup );

    //
    //  Skip the quorum group since we cannot randomize its preferred owners list since MM has a
    //  choke hold on the placement of quorum group.
    //
    if ( pGroup == gpQuoResource->Group ) goto FnExit;

    //
    //  Try to pick a preferred node for the group at random.
    //
    pNode = FmpPickNodeFromPreferredListAtRandom( pGroup,
                                                  NULL,     // No suggested preferred owner
                                                  FALSE,    // Can choose local node
                                                  FALSE );  // Skip the check for disabled
                                                            // randomization

    //
    //  If no node could be picked, bail out
    //
    if ( pNode == NULL ) goto FnExit;

    //
    //  Compute the size of the new entry once. The NULL char of the group ID needs no extra
    //  room since 1 WCHAR is already allocated for it in the FM_GROUP_NODE_LIST_ENTRY
    //  structure. The size is rounded up to a multiple of DWORD so that the next entry,
    //  which starts with a DWORD, stays DWORD aligned.
    //
    cbEntry = ( DWORD ) ( ( sizeof ( FM_GROUP_NODE_LIST_ENTRY ) +
                            lstrlenW ( lpszGroupId ) * sizeof ( WCHAR ) +
                            sizeof ( DWORD ) - 1 ) & ~( sizeof ( DWORD ) - 1 ) );

    cbRequired = ( *ppGroupNodeList )->cbGroupNodeList + cbEntry;

    //
    //  Check whether the allocated buffer is big enough to hold the new entry.
    //
    if ( *pcbBuffer < cbRequired )
    {
        //
        //  Keep doubling until the entry is guaranteed to fit; a single doubling is not
        //  enough when the entry is larger than the current buffer.
        //
        cbNewBuffer = ( *pcbBuffer != 0 ) ? *pcbBuffer : cbRequired;

        while ( cbNewBuffer < cbRequired )
        {
            if ( cbNewBuffer > ( ( DWORD ) -1 ) / 2 )
            {
                //
                //  Doubling would overflow; settle for exactly what is needed.
                //
                cbNewBuffer = cbRequired;
                break;
            }
            cbNewBuffer *= 2;
        }

        //
        //  Reallocate a bigger buffer
        //
        pBuffer = LocalAlloc( LPTR, cbNewBuffer );

        if ( pBuffer == NULL )
        {
            dwStatus = GetLastError();
            ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpAddGroupNodeToList: Memory alloc failed, Status %1!u!...\n",
                          dwStatus);
            goto FnExit;
        }

        *pcbBuffer = cbNewBuffer;

        //
        //  Copy the contents of the old list to the new list.
        //
        CopyMemory( pBuffer, *ppGroupNodeList, ( *ppGroupNodeList )->cbGroupNodeList );

        LocalFree ( *ppGroupNodeList );

        *ppGroupNodeList = pBuffer;
    }

    //
    //  Find the pointer to the beginning of the new list entry
    //
    pGroupNodeListEntry = ( PFM_GROUP_NODE_LIST_ENTRY )
                                ( ( LPBYTE ) ( *ppGroupNodeList ) +
                                        ( *ppGroupNodeList )->cbGroupNodeList );

    //
    //  Account for the new entry in the size of the list.
    //
    ( *ppGroupNodeList )->cbGroupNodeList += cbEntry;

    //
    //  Set the contents of the list entry
    //
    pGroupNodeListEntry->dwPreferredNodeId = NmGetNodeId ( pNode );
    lstrcpy( pGroupNodeListEntry->szGroupId, lpszGroupId );

FnExit:
    FmpReleaseLocalGroupLock( pGroup );

    return ( TRUE );
}// FmpAddGroupNodeToList
PNM_NODE
FmpParseGroupNodeListForPreferredOwner(
    IN PFM_GROUP pGroup,
    IN PFM_GROUP_NODE_LIST pGroupNodeList,
    IN PNM_NODE pSuggestedPreferredNode
    )

/*++

Routine Description:

    Parse the supplied group node list looking for a preferred node for the supplied group.

Arguments:

    pGroup - The group whose preferred node must be found.

    pGroupNodeList - The list contains preferred nodes of the group.

    pSuggestedPreferredNode - Suggested preferred node fallback option.

Return Value:

    The preferred node for the group. This is the node recorded for the group in the
    supplied list when that node is valid and UP; otherwise it is the suggested node.
    The node is returned without an extra reference.

--*/

{
    PNM_NODE                    pSelectedNode = pSuggestedPreferredNode;
    PFM_GROUP_NODE_LIST_ENTRY   pGroupNodeListEntry = NULL;
    BOOL                        fFoundGroup = FALSE;
    PNM_NODE                    pNode = NULL;
    DWORD                       dwStatus;
    DWORD                       cbGroupNodeList;

    //
    //  If the suggested node is user preferred or if the group has an anti-affinity class name
    //  property set, don't do anything else. Just return the suggested owner.
    //
    if ( ( FmpIsNodeUserPreferred ( pGroup, pSuggestedPreferredNode ) ) ||
         ( pGroup->lpszAntiAffinityClassName != NULL ) )
    {
        //
        //  The anti-affinity branch can be taken with no suggested node, so do not hand a
        //  NULL node to NmGetNodeId just for the sake of the log.
        //
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Node %2!u! for group %1!ws! is user preferred/antiaffinity property set...\n",
                      OmObjectId(pGroup),
                      (pSuggestedPreferredNode == NULL) ? 0:NmGetNodeId(pSuggestedPreferredNode));
        goto FnExit;
    }

    //
    //  The first entry starts right after the list header.
    //
    cbGroupNodeList = sizeof ( FM_GROUP_NODE_LIST ) -
                          sizeof ( FM_GROUP_NODE_LIST_ENTRY );

    //
    //  Walk the supplied list looking for the group entry.
    //
    while ( cbGroupNodeList < pGroupNodeList->cbGroupNodeList )
    {
        pGroupNodeListEntry = ( PFM_GROUP_NODE_LIST_ENTRY ) ( ( LPBYTE ) pGroupNodeList +
                                                                    cbGroupNodeList );

        if ( lstrcmp( pGroupNodeListEntry->szGroupId, OmObjectId( pGroup ) ) == 0 )
        {
            fFoundGroup = TRUE;
            break;
        }

        //
        //  Step over this entry. Entries are variable length and padded to a multiple
        //  of DWORD; the NULL char of the ID is already part of the entry structure.
        //
        cbGroupNodeList += ( sizeof ( FM_GROUP_NODE_LIST_ENTRY ) +
                                   lstrlenW ( pGroupNodeListEntry->szGroupId ) * sizeof ( WCHAR ) +
                                   sizeof ( DWORD ) - 1 ) & ~( sizeof ( DWORD ) - 1 );
    } // while

    //
    //  Fallback to the suggested option if:
    //      (1) You did not find the group in the list
    //      (2) The preferred node for the group is invalid in the list
    //      (3) The preferred node for the group is down
    //
    if ( fFoundGroup == FALSE )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Did not find group %1!ws! in supplied list...\n",
                      OmObjectId(pGroup));
        goto FnExit;
    }

    if ( ( pGroupNodeListEntry->dwPreferredNodeId == 0 ) ||
         ( pGroupNodeListEntry->dwPreferredNodeId > NmMaxNodeId ) )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Invalid node %1!u! for group %2!ws! in supplied list...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup));
        goto FnExit;
    }

    pNode = NmReferenceNodeById( pGroupNodeListEntry->dwPreferredNodeId );

    if ( pNode == NULL )
    {
        dwStatus = GetLastError();
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpParseGroupNodeListForPreferredOwner: Unable to reference node %1!u! for group %2!ws!, Status %3!u!...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup),
                      dwStatus);
        goto FnExit;
    }

    if ( NmGetNodeState( pNode ) != ClusterNodeUp )
    {
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpParseGroupNodeListForPreferredOwner: Preferred node %1!u! for group %2!ws! is not UP...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup));
        goto FnExit;
    }

    pSelectedNode = pNode;

    ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Selected node %1!u! for group %2!ws! from supplied randomized list...\n",
                  pGroupNodeListEntry->dwPreferredNodeId,
                  OmObjectId(pGroup));

FnExit:
    //
    //  Dereference the node object since we depend on the original reference added to the
    //  group's preferred owner when it was added to the group structure.
    //
    if ( pNode != NULL ) OmDereferenceObject( pNode );

    return ( pSelectedNode );
}// FmpParseGroupNodeListForPreferredOwner