#ifdef __TANDEM #pragma columns 79 #pragma page "srgpif.c - T9050 - interface routines for Regroup Module" #endif /* @@@ START COPYRIGHT @@@ ** Tandem Confidential: Need to Know only ** Copyright (c) 1995, Tandem Computers Incorporated ** Protected as an unpublished work. ** All Rights Reserved. ** ** The computer program listings, specifications, and documentation ** herein are the property of Tandem Computers Incorporated and shall ** not be reproduced, copied, disclosed, or used in whole or in part ** for any reason without the prior express written permission of ** Tandem Computers Incorporated. ** ** @@@ END COPYRIGHT @@@ **/ /*--------------------------------------------------------------------------- * This file (srgpif.c) contains all the external interface routines * of Regroup. *---------------------------------------------------------------------------*/ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #include /************************************************************************ * rgp_estimate_memory * =================== * * Description: * * Routine to find the number of bytes of memory needed by regroup. * The only global memory used by Regroup is for the rgp_control structure. * The caller must allocate and zero out a chunk of this much memory * and then call rgp_init() with a pointer to this memory. * * Parameters: * * None * * Returns: * * int - number of bytes of locked down and initialized (to 0) memory * needed by Regroup. The memory must be 4-byte aligned. * * Algorithm: * * Uses the size of the rgp_control_t to calculate the number of * bytes needed. * ************************************************************************/ _priv _resident int RGP_ESTIMATE_MEMORY(void) { return(sizeof(rgp_control_t)); } /************************************************************************ * rgp_init * ======== * * Description: * * Routine to initialize the global Regroup data structures. 
* * Parameters: * * node_t this_node - * node number of local node; regroup uses bit masks to represent * nodes in the cluster and starts numbering nodes from 0. The OS * starts numbering at LOWEST_NODENUM. This transformation is * maintained in all the regroup interfaces to the OS. * * unsigned int num_nodes - * number of nodes in the configured node number space = * (largest configured node number - LOWEST_NODENUM + 1). * * void *rgp_buffer - * pointer to a block of locked down memory initialized to 0; this is * for use by Regroup as its global memory; must be 4-byte aligned * * int rgp_buflen - * length in bytes of the locked down buffer *rgp_buffer; must be equal * to or greater than the number returned by rgp_estimate_memory() * * rgp_msgsys_p rgp_msgsys_p - * pointer to a common struct used by the message system and * Regroup to co-ordinate regroup related work * * Returns: * * void - no return value * * Algorithm: * * Initializes the Regroup global data structure with default initial * values and the parameters passed in. * ************************************************************************/ _priv _resident void RGP_INIT(node_t this_node, unsigned int num_nodes, void *rgp_buffer, int rgp_buflen, rgp_msgsys_p rgp_msgsys_p) { this_node = INT_NODE(this_node); /* adjust the node number by the offset */ if ((num_nodes > MAX_CLUSTER_SIZE) || (this_node >= (node_t) num_nodes) || (rgp_buflen < rgp_estimate_memory()) /* buffer too small */ || ((ULONG_PTR)rgp_buffer % 4) /* buffer not 4-byte aligned */ ) RGP_ERROR(RGP_INTERNAL_ERROR); #ifdef NSK /* In NSK, the caller must set up the global rgp pointer. 
*/ #else rgp = (rgp_control_t *) rgp_buffer; #endif /* NSK */ rgp->num_nodes = num_nodes; /* # of nodes configured */ rgp->rgp_msgsys_p = rgp_msgsys_p; /* ptr to struct shared with Msgsys */ rgp->mynode = this_node; #if defined (NT) /* Initialize RGP_LOCK, the CRITICALSECTION object that will be used * to synchronize access within the regroup procedures */ InitializeCriticalSection( &rgp->OS_specific_control.RgpCriticalSection ); #endif RGP_CLEANUP(); /* We place a bit pattern in the IamAlive packet. This bit * pattern toggles all the bits. */ rgp->iamalive_pkt.testpattern.words[0] = 0x0055FF6D; rgp->iamalive_pkt.testpattern.words[1] = 0x92CC33E3; rgp->iamalive_pkt.testpattern.words[2] = 0x718E49F0; rgp->iamalive_pkt.testpattern.words[3] = 0x92CC33E3; rgp->iamalive_pkt.testpattern.words[4] = 0x0055FF6D; rgp->iamalive_pkt.testpattern.words[5] = 0x0055FF6D; rgp->iamalive_pkt.testpattern.words[6] = 0x92CC33E3; rgp->iamalive_pkt.testpattern.words[7] = 0x718E49F0; rgp->iamalive_pkt.testpattern.words[8] = 0x92CC33E3; rgp->iamalive_pkt.testpattern.words[9] = 0x0055FF6D; rgp->iamalive_pkt.testpattern.words[10] = 0x55AA55AA; rgp->iamalive_pkt.testpattern.words[11] = 0x55AA55AA; rgp->iamalive_pkt.testpattern.words[12] = 0x55AA55AA; rgp->poison_pkt.pktsubtype = RGP_UNACK_POISON; rgp_init_OS(); /* OS-specific initializations */ rgp_cleanup_OS(); /* OS-specific cleanup */ /* Trace the call after the data structures have been initialized. */ RGP_TRACE( "RGP Init called ", EXT_NODE(this_node), num_nodes, PtrToUlong(rgp_buffer), PtrToUlong(rgp_msgsys_p) ); /* TRACE */ } /************************************************************************** * rgp_cleanup * =========== * Description: * * This function cleans up the RGP structure such that this node is * virtually returned to the state following RGP_INIT and ready to be * "join"ed into the cluster. 
* * Parameters: * * None * * Returns: * * None **************************************************************************/ _priv _resident void RGP_CLEANUP(void) { node_t i; RGP_LOCK; /* Initialize the state of all possible nodes in the cluster. */ for (i = 0; i < (node_t) rgp->num_nodes; i++) { rgp->node_states[i].status = RGP_NODE_DEAD; rgp->node_states[i].pollstate = AWAITING_IAMALIVE; rgp->node_states[i].lostHBs = 0; #if defined( NT ) ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( i ), ClusnetNodeStateDead); #endif // NT } for (i = (node_t)rgp->num_nodes; i < MAX_CLUSTER_SIZE; i++) { rgp->node_states[i].status = RGP_NODE_NOT_CONFIGURED; rgp->node_states[i].pollstate = AWAITING_IAMALIVE; rgp->node_states[i].lostHBs = 0; #if defined( NT ) ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( i ), ClusnetNodeStateNotConfigured); #endif // NT } rgp->rgpinfo.version = RGP_VERSION; rgp->rgpinfo.seqnum = RGP_INITSEQNUM; rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS; rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS; rgp->rgpinfo.Min_Stage1_ticks = RGP_MIN_STAGE1_TICKS; rgp->rgpinfo.a_tick = RGP_INACTIVE_PERIOD; ClusterInit(rgp->rgpinfo.cluster); rgp->rgppkt.stage = RGP_COLDLOADED; rgp->rgpcounter = 0; rgp->restartcount = 0; rgp->tiebreaker = rgp->mynode; /* Initialize the unacknowledged packet buffers */ rgp->rgppkt.pktsubtype = RGP_UNACK_REGROUP; rgp->rgppkt.seqno = rgp->rgpinfo.seqnum; rgp->last_stable_seqno = rgp->rgpinfo.seqnum; ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster); ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster); #if defined( NT ) ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) ); #endif ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster); 
ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster); MatrixInit(rgp->rgppkt.connectivity_matrix); rgp->rgppkt_to_send.pktsubtype = RGP_UNACK_REGROUP; rgp->iamalive_pkt.pktsubtype = RGP_UNACK_IAMALIVE; RGP_UNLOCK; } /*************************************************************************** * rgp_sequence_number * =================== * Description: * * This function returns the regroup sequence number. * * This provides only a subset of the functionality provided by * rgp_getrgpinfo(), but is a simpler function and has no structure * parameters, making it easier to call from PTAL. * * A regroup incident could be in progress when this routine is * called. * * Parameters: * * None * * Returns: * * uint32 - the current regroup sequence number; this reflects * how many regroup incidents have happened since * the system came up. Since one incident can result in * upto RGP_RESTART_MAX restarts each resulting in the * sequence # being bumped, this number does not always * equal the number of regroup incidents. * ***************************************************************************/ _priv _resident uint32 RGP_SEQUENCE_NUMBER(void) { return(rgp->rgpinfo.seqnum); } /************************************************************************ * rgp_getrgpinfo * ============== * * Description: * * Routine to get Regroup parameters. * * Parameters: * * rgpinfo_t *rgpinfo - pointer to struct to be filled with Regroup * parameters. * * Returns: * * int - 0 if successful; -1 if Regroup is perturbed. * * Algorithm: * * Copies the rgpinfo struct from the Regroup global memory into the * struct passed in by the caller. * ************************************************************************/ _priv _resident int RGP_GETRGPINFO(rgpinfo_t *rgpinfo) { int error = 0; /* If no rgpinfo structure is passed OR rgp_init() has not been called * earlier, halt. 
*/ if ((rgpinfo == RGP_NULL_PTR) || (rgp == RGP_NULL_PTR)) RGP_ERROR( RGP_INTERNAL_ERROR ); RGP_LOCK; if (rgp_is_perturbed()) error = -1; else /* Copy the rgpinfo structure from regroup's internal struct. */ *rgpinfo = rgp->rgpinfo; RGP_UNLOCK; return(error); } /************************************************************************ * rgp_setrgpinfo * ============== * * Description: * * Routine to set Regroup parameters. This routine is to be called on * newly booting nodes to set the Regroup parameters to the values * in the master or reloading node. The parameters to be updated * include Regroup timing parameters and the cluster membership; * that is, the current set of nodes in the system. * * This routine can also be called on the first node to boot to * modify the Regroup timing parameters which are set to the default * values when rgp_init() is called. Such modification has to be done * before other nodes are added to the system. * * Parameters: * * rgpinfo_t *rgpinfo - pointer to struct with Regroup parameters to * be modified. * * Returns: * * int - 0 if successful; -1 if there is more than one node in the * cluster. This is to prevent modification of timing parameters * after the second node is added to the system. * * Algorithm: * * Copies the contents of the user-passed struct into the one in the * Regroup global memory and updates related parameters. * ************************************************************************/ _priv _resident int RGP_SETRGPINFO(rgpinfo_t *rgpinfo) { int error = 0; node_t i; /* If no rgpinfo structure is passed OR the version # of the * structure is not understood OR rgp_init() has not been called, * halt. */ if ((rgpinfo == RGP_NULL_PTR) || (rgpinfo->version != RGP_VERSION) || (rgp == RGP_NULL_PTR)) RGP_ERROR( RGP_INTERNAL_ERROR ); RGP_LOCK; /* The following checks must be made before proceeding: * * 1. Regroup must not be perturbed. * * 2. 
If rgp_start() has been called (regroup is in the * RGP_STABILIZED state), only the local node must be in the * cluster when this routine is called. * * 3. If rgp_start() has been called, this routine can be used * only to modify the timing parameters and not to specify the * cluster. * * If these restrictions are not followed, return -1. */ RGP_TRACE( "RGP SetRGPInfo ", rgpinfo->version, /* TRACE */ rgpinfo->seqnum, /* TRACE */ rgpinfo->iamalive_ticks, /* TRACE */ GetCluster( rgpinfo->cluster ) );/* TRACE */ if ( rgp_is_perturbed() || ( (rgp->rgppkt.stage == RGP_STABILIZED) && ( (ClusterNumMembers(rgp->rgpinfo.cluster) > 1) || !ClusterCompare(rgp->rgpinfo.cluster,rgpinfo->cluster) ) ) ) error = -1; else { /* Copy the rgpinfo structure into regroup's internal struct. */ rgp->rgpinfo = *rgpinfo; /* If iamalive_ticks is set to 0, use the default value instead. */ /*F40:KSK06102.2*/ if (rgpinfo->iamalive_ticks == 0) /*F40:KSK06102.3*/ rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS; /*F40:KSK06102.4*/ /*F40:KSK06102.5*/ if (rgpinfo->check_ticks == 0) { rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS; } if (rgpinfo->Min_Stage1_ticks == 0) rgp->rgpinfo.Min_Stage1_ticks = (rgp->rgpinfo.iamalive_ticks * rgp->rgpinfo.check_ticks); if (rgpinfo->a_tick == 0) rgp->rgpinfo.a_tick = RGP_CLOCK_PERIOD; // Tell Timer thread to restart RGP timer SetEvent (rgp->OS_specific_control.TimerSignal); /* The cluster should include the local node even if the cluster * field in the rgpinfo structure does not include it. */ ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode); /* Copy the sequence number into the regroup packet area. */ rgp->rgppkt.seqno = rgp->rgpinfo.seqnum; /* If nodes have been added in the cluster field, they must be * added to all the screens and their status must be set to * alive. 
*/ ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster); ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster); #if defined( NT ) ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) ); ClusterComplement(rgp->ignorescreen, rgp->outerscreen); #endif ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster); ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster); rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster); for (i = 0; i < (node_t) rgp->num_nodes; i++) { if (ClusterMember(rgp->rgpinfo.cluster, i)) { rgp->node_states[i].pollstate = IAMALIVE_RECEIVED; rgp->node_states[i].status = RGP_NODE_ALIVE; #if defined( NT ) ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( i ), ClusnetNodeStateAlive); #endif // NT } } /* Reset the clock counter so that IamAlives are sent when * the next timer tick arrives. */ rgp->clock_ticks = 0; } RGP_UNLOCK; return(error); } /************************************************************************ * rgp_start * ========= * * Description: * * This routine signals the end of node integration into the cluster. * The node can now start participating in the Regroup algorithm. * * Parameters: * * void (*rgp_node_failed)() * pointer to a routine to be called when a node failure is * detected. * * int (*rgp_select_cluster)() * pointer to an optional routine to be called when link failures * cause multiple alternative clusters to be formed. This routine * should select one from a list of suggested clusters. * * Returns: * * void - no return value * * Algorithm: * * Installs the callback routines in the global data structure and * changes the Regroup state to RGP_STABILIZED. 
* ************************************************************************/ _priv _resident void RGP_START(void (*nodedown_callback)(cluster_t failed_nodes), int (*select_cluster)(cluster_t cluster_choices[], int num_clusters) ) { if (rgp == RGP_NULL_PTR) RGP_ERROR( RGP_INTERNAL_ERROR ); RGP_LOCK; RGP_TRACE( "RGP Start called", rgp->rgppkt.stage, /* TRACE */ PtrToUlong(nodedown_callback), /* TRACE */ PtrToUlong(select_cluster), /* TRACE */ 0 ); /* TRACE */ /* Install callback routines for node failure notification and cluster * selection. If no routine is given by the caller, use default ones. */ if (nodedown_callback == RGP_NULL_PTR) { #ifdef NSK /* In NSK, rgp_start() is called from pTAL code and passing routine * addresses is cumbersome. So, RGP_NULL_PTR is passed and we * call the routine rgp_node_failed() which must be supplied by * the message system. */ rgp->nodedown_callback = rgp_node_failed; /* hardcoded name */ #else /* A node down callback routine must be supplied. */ RGP_ERROR( RGP_INTERNAL_ERROR ); #endif /* NSK */ } else rgp->nodedown_callback = nodedown_callback; #if 0 /* The select cluster routine is optional. */ if (select_cluster == RGP_NULL_PTR) rgp->select_cluster = rgp_select_cluster; /* supplied by regroup */ else #endif // // Calling rgp_select_cluster is // not a good idea since it doesn't take into the consideration // quorum owner node. // If rgp->select_cluster == RGP_NULL_PTR, then srgpsm.c uses // rgp_select_cluster_ex, that will try to select the group // that contain the current quorum owner node rgp->select_cluster = select_cluster; #if defined(NT) /* Call the node up callback. This is where the local node gets * the node up callback for itself coming up. Other nodes call * the callback, for this node coming up, in rgp_monitor_node. 
*/ ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode); ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster); if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR ) { (*(rgp->OS_specific_control.UpDownCallback))( EXT_NODE(rgp->mynode), NODE_UP ); } #endif /* NT */ RGP_UNLOCK; } /************************************************************************ * rgp_add_node * ============ * * Description: * * Called to add a newly booting node to the regroup masks. This prevents * Regroup from sending poison packets to the new node when it tries to * contact our node by sending IamAlive messages. * * Parameters: * * node_t node - node to be added to the recognition masks * * Returns: * * int - 0 on success and -1 on failure. The routine fails only if a * regroup incident is in progress. * * Algorithm: * * The node is added to all the recognition masks and its state is * changed to RGP_NODE_COMING_UP. * ************************************************************************/ _priv _resident int RGP_ADD_NODE(node_t node) { int error = 0; RGP_LOCK; RGP_TRACE( "RGP Add node ", node, rgp->rgppkt.stage, GetCluster(rgp->outerscreen), /* TRACE */ GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */ /* Cannot add a node while regroup is perturbed. Return -1 in that case. * The new node booting should fail due to the regroup incident anyway. 
*/ if (rgp_is_perturbed()) error = -1; else { node = INT_NODE(node); /* adjust the node number by the offset */ ClusterInsert(rgp->outerscreen, node); #if defined( NT ) ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) ); #endif ClusterInsert(rgp->innerscreen, node); ClusterInsert(rgp->rgppkt.knownstage1, node); ClusterInsert(rgp->rgppkt.knownstage2, node); ClusterInsert(rgp->rgppkt.knownstage3, node); ClusterInsert(rgp->rgppkt.knownstage4, node); ClusterInsert(rgp->rgppkt.knownstage5, node); ClusterInsert(rgp->rgppkt.pruning_result, node); rgp->node_states[node].pollstate = AWAITING_IAMALIVE; rgp->node_states[node].status = RGP_NODE_COMING_UP; rgp->node_states[node].lostHBs = 0; #if defined( NT ) ClusterDelete( rgp->OS_specific_control.Banished, node ); // // Remove joining node from ignore screen // ClusterDelete( rgp->ignorescreen, node ); PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen); ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( node ), ClusnetNodeStateJoining); #endif // NT } RGP_UNLOCK; return(error); } /************************************************************************ * rgp_monitor_node * ================ * * Description: * * Called by all running nodes to change the status of a newly booted node * to UP. Can be called by the new node also; it is a no-op in this case. * * Parameters: * * node_t node - number of node being declared up * * Returns: * * int - 0 on success and -1 on failure. The routine fails only if the * state of the node is neither RGP_NODE_COMING_UP nor RGP_NODE_ALIVE. * * Algorithm: * * If the node is marked coming up, its state is changed to * RGP_NODE_ALIVE. If the node has already been marked up, * nothing is done. 
*
 ************************************************************************/
_priv _resident
int
RGP_MONITOR_NODE(node_t node)
{
    int error = 0;

    RGP_LOCK;

    /* The trace records the node number as passed in, before INT_NODE(). */
    RGP_TRACE( "RGP Monitor node", node,
               rgp->rgppkt.stage,
               GetCluster(rgp->outerscreen),       /* TRACE */
               GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */

    node = INT_NODE(node); /* adjust the node number by the offset */

    /* Accept the request only if the state of the node is COMING_UP or UP. */

    if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
    {
        ClusterInsert(rgp->rgpinfo.cluster, node);
        rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster);
        rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
        rgp->node_states[node].status = RGP_NODE_ALIVE;

#if defined(NT)
        ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);

        ClusnetSetNodeMembershipState(NmClusnetHandle,
                                      EXT_NODE( node ),
                                      ClusnetNodeStateAlive);

        /* A node came up. Call the node up callback. */
        if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR )
        {
            (*(rgp->OS_specific_control.UpDownCallback))(
                EXT_NODE(node),
                NODE_UP
                );
        }
#endif /* NT */
    }
    else if (rgp->node_states[node].status != RGP_NODE_ALIVE)
        /* Perhaps the booting node failed and regroup has already marked
         * it down. The cluster manager may have invoked a global update
         * resulting in this call before regroup reported the failure
         * of the node.
         */
        error = -1;

    RGP_UNLOCK;

    return(error);
}

/************************************************************************
 * rgp_remove_node
 * ===============
 *
 * Description:
 *
 *    Called by the cluster manager to force out a booting node if booting
 *    fails. Regroup may or may not have already removed the booting node
 *    from the masks and declared it down, depending on what stage the
 *    booting is in and when the booting node failed.
 *
 *    Regroup can remove the node from the masks of all nodes in the cluster
 *    by simply starting a new incident of regroup with any event code.
*    This will force all nodes to come to an agreement on cluster membership
 *    that excludes the booting node. If the booting node is alive, it will
 *    commit suicide since it will be in the incompetent (RGP_COLDLOADED)
 *    state.
 *
 *    Removing the new node from our masks is not necessary since regroup
 *    will detect the node failure and adjust the masks. If we do remove it
 *    from our masks BEFORE initiating regroup, regroup may complete quicker
 *    since we will not wait in stage 1 for the node to check in. Also, this
 *    could allow a node to be removed even after it is fully integrated.
 *    This is because our node will send a poison packet to the removed node
 *    if it tries to contact us.
 *
 *    But this "enhancement" is not implemented because it requires a new
 *    regroup event code which is examined by all nodes and processed
 *    specially. Currently, the regroup event code is used only for
 *    debugging info. Also, there is no guarantee that all nodes see the
 *    same regroup reason code. For instance, some may see a missing
 *    IamAlive while others may see a power failure.
 *
 * Parameters:
 *
 *    node_t node - node to be removed from the recognition masks
 *                  (in external format).
 *
 * Returns:
 *
 *    int - 0 on success and -1 on failure. The routine fails if a
 *    regroup incident is in progress or rgp_start() has not been
 *    called (as in a new node where the booting is not complete).
 *
 * Algorithm:
 *
 *    If the node is still in the recognition masks, a new regroup incident
 *    is started. This incident will result in all nodes declaring the node
 *    dead and removing it from the recognition masks.
* ************************************************************************/ _priv _resident int RGP_REMOVE_NODE(node_t node) { int error = 0; RGP_LOCK; RGP_TRACE( "RGP Remove node ", node, rgp->rgppkt.stage, GetCluster(rgp->outerscreen), /* TRACE */ GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */ if (rgp->rgppkt.stage == RGP_STABILIZED) { if (ClusterMember(rgp->outerscreen, INT_NODE(node))) { /* Node is currently in our screen. The node may have never come up * after rgp_add_node() was called OR regroup may not have figured * out yet that the node is down. In either case, the node must * be forced out and all nodes in the cluster notified (by a regroup * incident). If the node is still running, it will commit suicide * when this regroup incident starts. */ rgp_event_handler(RGP_EVT_LATEPOLLPACKET, node); } else { /* Either the node was not added to the cluster OR regroup has * already figured out that the node is dead and reported this. * In either case, there is nothing more to do. */ } } else error = -1; RGP_UNLOCK; return(error); } /************************************************************************ * rgp_is_perturbed * ================ * * Description: * * Function to check if a regroup incident is in progress. * * Parameters: * * None. * * Returns: * * int - 0 if no regroup is quiescent; non-zero if a regroup incident * is in progress. * * Algorithm: * * Looks at the current state of the Regroup algorithm. * ************************************************************************/ _priv _resident int RGP_IS_PERTURBED(void) { uint8 stage = rgp->rgppkt.stage; return((stage != RGP_STABILIZED) && (stage != RGP_COLDLOADED)); } /************************************************************************ * rgp_periodic_check * ================== * * Description: * * This routine is invoked every RGP_CLOCK_PERIOD by the timer interrupt * handler of the native OS. It performs Regroups's periodic operations. 
* * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * This routine requests Iamalive packets to be sent, checks if * IamAlives have been received (and calls rgp_event_handler() if * not) and sends a clock tick to the regroup algorithm if it is in * progress. * * IamAlives are checked at twice the IamAlive period. The regroup * global variable clock_ticks is incremented in each call. After * the IamAlives are checked, clock_ticks is reset to 0. Thus, the * ticker counts time modulo twice the IamAlive ticks. * ************************************************************************/ _priv _resident void RGP_PERIODIC_CHECK(void) { node_t node; RGP_LOCK; /* If regroup is active, give it a shot at each regroup clock tick. */ if ((rgp->rgppkt.stage != RGP_STABILIZED) && (rgp->rgppkt.stage != RGP_COLDLOADED)) rgp_event_handler(RGP_EVT_CLOCK_TICK, RGP_NULL_NODE); #if !defined( NT ) /* Send IamAlive messages at appropriate intervals. */ if ( (rgp->clock_ticks == 0) || (rgp->clock_ticks == rgp->rgpinfo.iamalive_ticks) ) { rgp_broadcast(RGP_UNACK_IAMALIVE); rgp->clock_ticks++; } /* Check for missing IamAlives at IamAlive sending period, * But flag an error (LATE_POLL) only if "check_ticks" IamAlives missed. * The checking is offset from the sending by one clock tick. */ else if ( rgp->clock_ticks >= (rgp->rgpinfo.iamalive_ticks - 1) ) { /* check all nodes for IamAlives received */ for (node = 0; node < (node_t) rgp->num_nodes; node++) { if (rgp->node_states[node].status == RGP_NODE_ALIVE) { if ( rgp->node_states[node].pollstate == IAMALIVE_RECEIVED ) { /* checked in in time */ #if defined(TDM_DEBUG) if ( rgp->OS_specific_control.debug.doing_tracing ) { printf ("Node %d: Node %d is alive. 
My rgp state=%d\n", EXT_NODE(rgp->mynode), EXT_NODE(node), rgp->rgppkt.stage ); } #endif rgp->node_states[node].pollstate = AWAITING_IAMALIVE; rgp->node_states[node].lostHBs = 0; } else if ( rgp->node_states[node].lostHBs++ < rgp->rgpinfo.check_ticks ) ;// allow upto (check_ticks-1) IamAlives to be lost. else { /* missing IamAlives */ if (node == rgp->mynode) /* missed my own packets */ { /* We should be lenient if we just had a power failure. */ if (rgp->pfail_state == 0) /* no recent power failure */ RGP_ERROR( RGP_MISSED_POLL_TO_SELF ); } else rgp_event_handler(RGP_EVT_LATEPOLLPACKET, EXT_NODE(node)); } } } /* Reset the regroup tick counter after checking for IamAlives. */ rgp->clock_ticks = 0; } /* check all nodes for IamAlives received */ else rgp->clock_ticks++; /* rgp->pfail_state is set to a non-zero value when a pfail event * is reported to regroup. It is decremented at every regroup clock * tick till it reaches zero. While this number is non-zero, missing * self IamAlives are ignored and do not cause the node to halt. * This gives the sending hardware some time to recover from power * failures before self IamAlives are checked. */ if (rgp->pfail_state) rgp->pfail_state--; #endif // NT RGP_UNLOCK; } /* rgp_periodic_check */ /************************************************************************ * rgp_received_packet * =================== * * Description: * * Routine to be called by the message system when an unacknowledged * packet sent by the Regroup module is received from any node. These * packets include IamAlive packets, regroup status packets and poison * packets. * * Parameters: * * node_t node - node from which a packet has been received * * void *packet - address of the received packet data * * int packetlen - length in bytes of the received packet data * * Returns: * * void - no return value * * Algorithm: * * Does different things based on the packet subtype. 
*
 ************************************************************************/
_priv _resident
void
RGP_RECEIVED_PACKET(node_t node, void *packet, int packetlen)
{
    /* NOTE(review): packetlen is not consulted anywhere in this routine;
     * the packet is assumed to be large enough to hold pktsubtype - confirm.
     */
    rgp_unseq_pkt_t *unseq_pkt = (rgp_unseq_pkt_t *) packet;

    node = INT_NODE(node);

    /* If the packet is from a node that cannot be in our cluster,
     * simply ignore it.
     */
    if (node >= (node_t) rgp->num_nodes)
        return;

    /* If the sending node is excluded by the outer screen, then it is
     * not part of the current (most recently known) configuration.
     * Therefore the packet should not be honored, and a poison message
     * should be sent to try to kill this renegade processor unless
     * it is sending US a poison packet. If it is sending us a poison
     * packet, we cannot send it a poison in return because that results
     * in an infinite loop. In this case, we just halt because this
     * situation implies that there is a split brain situation and our
     * split brain avoidance algorithm has failed.
     */

    /* NT Notes
     *
     * even with poison pkts being sent and recv'ed in the kernel, we still
     * want to make these checks since clusnet doesn't have the regroup stage
     * info and regroup packets themselves find their way in here.
     */

    if (!ClusterMember(rgp->outerscreen, node)
#if defined( NT )
        ||
        ClusterMember(rgp->OS_specific_control.Banished, node)
#endif
       )
    {
        if (rgp->rgppkt.stage == RGP_COLDLOADED)
        {
            // We are doing this check in srgpsm.c
            // No need to do it here
            // RGP_ERROR(RGP_RELOADFAILED);
            //
        }
        else if (unseq_pkt->pktsubtype == RGP_UNACK_POISON)
        {
            RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
        } else {
            /* Must send a poison packet to the sender. */
            ClusterInsert(rgp->poison_targets, node);
            rgp_broadcast(RGP_UNACK_POISON);
        }
        return;
    }

    switch (unseq_pkt->pktsubtype)
    {
        case RGP_UNACK_IAMALIVE :
        {
            /* Count the number of IamAlives received */
            if ( node == rgp->mynode )
                RGP_INCREMENT_COUNTER( RcvdLocalIAmAlive );
            else
                RGP_INCREMENT_COUNTER( RcvdRemoteIAmAlive );

            if (rgp->node_states[node].status == RGP_NODE_ALIVE)
                rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;

            else if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
            {
                /* If the node has not yet been marked fully up, it is time to
                 * do so.
                 */
                rgp_monitor_node(EXT_NODE(node));

                /* We must tell the OS that the new node is up in case the
                 * OS needs the IamAlives to figure that out.
                 */
                rgp_newnode_online(EXT_NODE(node));
            }
            else
                /* If the node state is neither alive nor coming up, it
                 * must not be in our outerscreen. The outerscreen check
                 * above must have passed and we should not get here.
                 */
                RGP_ERROR(RGP_INTERNAL_ERROR);
            break;
        }
        case RGP_UNACK_REGROUP :
        {
            /* Count the number of regroup status packets received. */
            RGP_INCREMENT_COUNTER( RcvdRegroup );

            /* Any good packet can be treated as an IamAlive packet. */
            rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;

            RGP_EVENT_HANDLER_EX (RGP_EVT_RECEIVED_PACKET, EXT_NODE(node), (void*)unseq_pkt);
            break;
        }
        case RGP_UNACK_POISON :
        {
            /* If our node is in RGP_PRUNING stage and have been pruned out,
             * the poison packet probably implies that the sender has gone
             * into the next stage and declared us down. In this case, use
             * the more appropriate RGP_PRUNED_OUT halt code. Otherwise,
             * use the poison packet halt code. In either case, we must halt.
             */
            if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
                 !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
                RGP_ERROR(RGP_PRUNED_OUT);
            else
            {
                if (rgp->rgppkt.stage == RGP_COLDLOADED)
                {
                    RGP_ERROR(RGP_RELOADFAILED);
                    return;
                }
                else
                    RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
            }
            break;
        }
        default :
        {
            /* Ignore the unknown packet type. */
            break;
        }
    }
}

/*---------------------------------------------------------------------------*/

#ifdef __cplusplus
}
#endif /* __cplusplus */

#if 0

History of changes to this file:
-------------------------------------------------------------------------
1995, December 13                                           F40:KSK0610          /*F40:KSK06102.6*/

This file is part of the portable Regroup Module used in the NonStop
Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
The last two are simulation files to test the Regroup Module on a
UNIX workstation in user mode with processes simulating processor nodes
and UDP datagrams used to send unacknowledged datagrams.

This file was first submitted for release into NSK on 12/13/95.
------------------------------------------------------------------------------
This change occurred on 19 Jan 1996                                              /*F40:MB06458.1*/
Changes for phase IV Sierra message system release. Includes:                    /*F40:MB06458.2*/
 - Some cleanup of the code                                                      /*F40:MB06458.3*/
 - Increment KCCB counters to count the number of setup messages and             /*F40:MB06458.4*/
   unsequenced messages sent.                                                    /*F40:MB06458.5*/
 - Fixed some bugs                                                               /*F40:MB06458.6*/
 - Disable interrupts before allocating broadcast sibs.                          /*F40:MB06458.7*/
 - Change per-packet-timeout to 5ms                                              /*F40:MB06458.8*/
 - Make the regroup and powerfail broadcast use highest priority                 /*F40:MB06458.9*/
   tnet services queue.                                                          /*F40:MB06458.10*/
 - Call the millicode backdoor to get the processor status from SP               /*F40:MB06458.11*/
 - Fixed expand bug in msg_listen_ and msg_readctrl_                             /*F40:MB06458.12*/
 - Added enhancement to msngr_sendmsg_ so that clients do not need               /*F40:MB06458.13*/
   to be unstoppable before calling this routine.                                /*F40:MB06458.14*/
 - Added new steps in the build file called                                      /*F40:MB06458.15*/
   MSGSYS_C - compiles all the message system C files                            /*F40:MB06458.16*/
   MSDRIVER - compiles all the MSDriver files                                    /*F40:MB06458.17*/
   REGROUP  - compiles all the regroup files                                     /*F40:MB06458.18*/
 - remove #pragma env libspace because we set it as a command line               /*F40:MB06458.19*/
   parameter.                                                                    /*F40:MB06458.20*/
-----------------------------------------------------------------------          /*F40:MB06458.21*/

#endif    /* 0 - change descriptions */