#ifdef __TANDEM #pragma columns 79 #pragma page "srgpos.c - T9050 - OS-dependent routines for Regroup Module" #endif /* @@@ START COPYRIGHT @@@ ** Tandem Confidential: Need to Know only ** Copyright (c) 1995, Tandem Computers Incorporated ** Protected as an unpublished work. ** All Rights Reserved. ** ** The computer program listings, specifications, and documentation ** herein are the property of Tandem Computers Incorporated and shall ** not be reproduced, copied, disclosed, or used in whole or in part ** for any reason without the prior express written permission of ** Tandem Computers Incorporated. ** ** @@@ END COPYRIGHT @@@ **/ /*--------------------------------------------------------------------------- * This file (srgpos.c) contains OS-specific code used by Regroup. *---------------------------------------------------------------------------*/ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #include #ifdef NSK #include #endif /* NSK */ #if defined(NT) DWORD MmSetThreadPriority( VOID ); void NT_timer_thread( void ); PWCHAR RgpGetNodeNameFromId( node_t ); #endif // NT /* The global pointer to regroup's internal data structure. */ #ifdef NSK /* The global regroup pointer is #defined to a pointer in the message * system root structure. */ #endif #if defined(LCU) || defined(UNIX) || defined(NT) rgp_control_t *rgp = (rgp_control_t *) RGP_NULL_PTR; DWORD QuorumOwner = MM_INVALID_NODE; /* quorum owner can be set by the forming node before rgp is initialized */ #endif /* LCU || UNIX || NT */ #ifdef LCU /************************************************************************ * rgp_lcu_serv_listen * =================== * * Description: * * This is an LCU-specific routine that gets called in IPC interrupt * context when a datagram addressed to the Regroup Module is received. * * Parameters: * * void *listen_callarg - required param, unused by regroup * lcumsg_t *lcumsgp - pointer to message * uint moredata - required param, unused by regroup * * Returns: * * int - Always returns ELCU_OK * * Algorithm: * * The routine simply picks apart the arguments and calls * rgp_received_packet(). * * ************************************************************************/ _priv _resident int rgp_lcu_serv_listen(void *listen_callarg, lcumsg_t *lcumsgp, uint moredata) { /* Ignore if the packet is not from the local system. */ if (lcumsgp->lcu_sysnum == rgp->OS_specific_control.my_sysnum) rgp_received_packet(lcumsgp->lcu_node, lcumsgp->lcu_reqmbuf.lcu_ctrlbuf, lcumsgp->lcu_reqmbuf.lcu_ctrllen); return(ELCU_OK); } /************************************************************************ * rgp_lcu_event_callback * ====================== * * Description: * * This is an LCU-specific routine that gets called in IPC interrupt * context when the LCUEV_NODE_UNREACHABLE event is generated. * * Parameters: * * ulong event - event # (= LCUEV_NODE_UNREACHABLE) * sysnum_t sysnum - system # (= local system #) * nodenum_t node - # of node that is unreachable * int event_info - required parameter, unused by regroup * * Returns: * * void - no return value * * Algorithm: * * The routine simply transforms the LCU event into the regroup event * RGP_EVT_NODE_UNREACHABLE and calls rgp_event_handler(). * ************************************************************************/ _priv _resident void rgp_lcu_event_callback( ulong event, sysnum_t sysnum, nodenum_t node, int event_info) { /* Sanity checks: * (1) The event must be LCUEV_NODE_UNREACHABLE, the only event * we asked for. 
* (1) The event must be for the local system, the only system * we asked for. */ if ((event != LCUEV_NODE_UNREACHABLE) || (sysnum != rgp->OS_specific_control.my_sysnum)) RGP_ERROR(RGP_INTERNAL_ERROR); rgp_event_handler(RGP_EVT_NODE_UNREACHABLE, node); } #endif /* LCU */ /************************************************************************ * rgp_init_OS * =========== * * Description: * * This routine does OS-dependent regroup initialization such as * initializing the regroup data structure lock, requesting a * periodic timer to be installed and registering the callback * routine for receiving regroup's unacknowledged packets. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * OS-dependent initializations. * ************************************************************************/ _priv _resident void rgp_init_OS(void) { #ifdef UNIX struct sigaction sig_action; /* to install signals */ #endif #ifdef LCU sysnum_t sysnum; lcumsg_t *lcumsgp; #endif #ifdef NT HANDLE tempHandle; DWORD threadID = 0; #endif #if defined(NSK) || defined(UNIX) || defined(NT) /* * In NSK, the regroup caller ensures that timer and IPC interrupts * are disabled before the regroup routines are called. Therefore, * there is no regroup lock initialization. Also, rather than using * registration of callback routines, the appropriate routine names * are hard coded into routines that must call them. Thus, the timer * routine is called from POLLINGCHECK, the periodic message system * routine, and the packet reception routine is called from the * IPC interrupt handler. */ /* Initialize the unchanging fields in the rgp_msgsys struct. */ rgp->rgp_msgsys_p->regroup_data = (void *) &(rgp->rgppkt_to_send); rgp->rgp_msgsys_p->regroup_datalen = RGPPKTLEN; rgp->rgp_msgsys_p->iamalive_data = (void *) &(rgp->iamalive_pkt); rgp->rgp_msgsys_p->iamalive_datalen = IAMALIVEPKTLEN; rgp->rgp_msgsys_p->poison_data = (void *) &(rgp->poison_pkt); rgp->rgp_msgsys_p->poison_datalen = POISONPKTLEN; #endif /* NSK || UNIX || NT */ #ifdef LCU if (itimeout(rgp_periodic_check, NULL, /* parameter pointer */ ((RGP_CLOCK_PERIOD * HZ) / 100) | TO_PERIODIC, plstr /* interrupt priority level */ ) == 0) RGP_ERROR(RGP_INTERNAL_ERROR); if (lcuxprt_listen(LCU_RGP_PORT, rgp_lcu_serv_listen, NULL /* no call arg */, NULL /* no options */ ) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); if (lcuxprt_config(LCU_GET_MYSYSNUM, &sysnum) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); rgp->OS_specific_control.my_sysnum = sysnum; /* Allocate 3 message buffers to send regroup packets, iamalive packets * and poison packets. 
*/ if ((lcumsgp = lcuxprt_msg_alloc(LCU_UNACKMSG, LCU_RGP_FLAGS)) == NULL) RGP_ERROR(RGP_INTERNAL_ERROR); /* no memory */ rgp->OS_specific_control.lcumsg_regroup_p = lcumsgp; lcumsgp->lcu_tag = NULL; lcumsgp->lcu_sysnum = sysnum; lcumsgp->lcu_port = LCU_RGP_PORT; lcumsgp->lcu_flags = LCUMSG_CRITICAL; lcumsgp->lcu_reqmbuf.lcu_ctrllen = RGPPKTLEN; lcumsgp->lcu_reqmbuf.lcu_ctrlbuf = (char *)&(rgp->rgppkt_to_send); if ((lcumsgp = lcuxprt_msg_alloc(LCU_UNACKMSG, LCU_RGP_FLAGS)) == NULL) RGP_ERROR(RGP_INTERNAL_ERROR); /* no memory */ rgp->OS_specific_control.lcumsg_iamalive_p = lcumsgp; lcumsgp->lcu_tag = NULL; lcumsgp->lcu_sysnum = sysnum; lcumsgp->lcu_port = LCU_RGP_PORT; lcumsgp->lcu_reqmbuf.lcu_ctrllen = IAMALIVEPKTLEN; lcumsgp->lcu_reqmbuf.lcu_ctrlbuf = (char *)&(rgp->iamalive_pkt); if ((lcumsgp = lcuxprt_msg_alloc(LCU_UNACKMSG, LCU_RGP_FLAGS)) == NULL) RGP_ERROR(RGP_INTERNAL_ERROR); /* no memory */ rgp->OS_specific_control.lcumsg_poison_p = lcumsgp; lcumsgp->lcu_tag = NULL; lcumsgp->lcu_sysnum = sysnum; lcumsgp->lcu_port = LCU_RGP_PORT; lcumsgp->lcu_reqmbuf.lcu_ctrllen = POISONPKTLEN; lcumsgp->lcu_reqmbuf.lcu_ctrlbuf = (char *)&(rgp->poison_pkt); /* Register to get the LCUEV_NODE_UNREACHABLE event. */ if (lcuxprt_events(LCU_CATCH_EVENTS, sysnum, LCUEV_NODE_UNREACHABLE, rgp_lcu_event_callback) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); #endif /* LCU */ #ifdef UNIX /* For testing on UNIX at user level, we use alarm() to simulate timer * ticks. */ /* Install the alarm handler. */ sig_action.sa_flags = 0; sig_action.sa_handler = alarm_handler; sigemptyset(&(sig_action.sa_mask)); /* Block messages when handling timer pops. */ sigaddset(&(sig_action.sa_mask), SIGPOLL); sigaction(SIGALRM, &sig_action, NULL); alarm_callback = rgp_periodic_check; /* Round up the alarm period to the next higher second. */ alarm_period = (RGP_CLOCK_PERIOD + 99) / 100; /* Get first timer tick as soon as possible; subsequent ones will be * at alarm_period. */ alarm(1); #endif /* UNIX */ #ifdef NT /* On NT we create a separate thread that will be our timer. */ /* The Timer Thread waits on TimerSignal Event to indicate an RGP rate change. 
*/ /* An RGP rate of 0 is a signal for the Timer Thread to exit */ tempHandle = CreateEvent ( NULL, /* no security */ FALSE, /* Autoreset */ TRUE, /* Initial State is Signalled */ NULL); /* No name */ if ( !tempHandle ) { RGP_ERROR (RGP_INTERNAL_ERROR); } rgp->OS_specific_control.TimerSignal = tempHandle; tempHandle = CreateEvent ( NULL, /* no security */ TRUE, /* Manual reset */ TRUE, /* Initial State is Signalled */ NULL); /* No name */ if ( !tempHandle ) { RGP_ERROR (RGP_INTERNAL_ERROR); } rgp->OS_specific_control.Stabilized = tempHandle; rgp->OS_specific_control.ArbitrationInProgress = FALSE; rgp->OS_specific_control.ArbitratingNode = MM_INVALID_NODE; rgp->OS_specific_control.ApproxArbitrationWinner = MM_INVALID_NODE; rgp->OS_specific_control.ShuttingDown = FALSE; tempHandle = CreateThread( 0, /* security */ 0, /* stack size - use same as primary thread */ (LPTHREAD_START_ROUTINE)NT_timer_thread, /* starting point */ (VOID *) NULL, /* no parameter */ 0, /* create flags - start immediately */ &threadID ); /* thread ID returned here */ if ( !tempHandle ) { RGP_ERROR( RGP_INTERNAL_ERROR ); /* at least for now */ } rgp->OS_specific_control.TimerThread = tempHandle; rgp->OS_specific_control.TimerThreadId = threadID; rgp->OS_specific_control.UpDownCallback = RGP_NULL_PTR; rgp->OS_specific_control.NodesDownCallback = RGP_NULL_PTR; rgp->OS_specific_control.EventEpoch = 0; #if defined TDM_DEBUG rgp->OS_specific_control.debug.frozen = 0; rgp->OS_specific_control.debug.reload_in_progress = 0; rgp->OS_specific_control.debug.timer_frozen = 0; rgp->OS_specific_control.debug.doing_tracing = 0; rgp->OS_specific_control.debug.MyTestPoints.TestPointWord = 0; // seed the random number function used in testing srand((unsigned) time( NULL ) ); #endif #endif /* NT */ } /************************************************************************ * rgp_cleanup_OS * =========== * * Description: * * This routine does OS-dependent cleanup of regroup structures * and timer thread activity to ready for a new JOIN attempt. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * OS-dependent initializations. * ************************************************************************/ _priv _resident void rgp_cleanup_OS(void) { #if defined (NT) // Tell Timer Thread to restart RGP Timer // a_tick might have changed. SetEvent( rgp->OS_specific_control.TimerSignal); #endif // NT } /************************************************************************ * rgp_update_regroup_packet * ========================= * * Description: * * Macro to copy the current regroup status into the regroup packet * sending buffer. * * Parameters: * * None * * Algorithm: * * Copies the status (which is already in the form of a regroup status * packet) into the packet buffer. Then, if we should let others (and * ourselves) know of our stage, the current knownstage field is * updated to include the local node number. * ************************************************************************/ #define rgp_update_regroup_packet \ do \ { \ /* Copy the regroup status to the sending packet area. */ \ rgp->rgppkt_to_send = rgp->rgppkt; \ \ /* If we should let others know of our stage, we must modify the \ * current stage mask to include ourselves. 
\ */ \ if (rgp->sendstage) \ switch (rgp->rgppkt.stage) \ { \ case RGP_ACTIVATED: \ ClusterInsert(rgp->rgppkt_to_send.knownstage1, rgp->mynode); \ break; \ case RGP_CLOSING: \ ClusterInsert(rgp->rgppkt_to_send.knownstage2, rgp->mynode); \ break; \ case RGP_PRUNING: \ ClusterInsert(rgp->rgppkt_to_send.knownstage3, rgp->mynode); \ break; \ case RGP_PHASE1_CLEANUP: \ ClusterInsert(rgp->rgppkt_to_send.knownstage4, rgp->mynode); \ break; \ case RGP_PHASE2_CLEANUP: \ ClusterInsert(rgp->rgppkt_to_send.knownstage5, rgp->mynode); \ break; \ default: \ break; \ } \ } while(0) /************************************************************************ * rgp_update_poison_packet * ======================== * * Description: * * Macro to copy the current regroup status into the poison packet * sending buffer. * * Parameters: * * None * * Algorithm: * * Copies the appropriate regroup status fields into the poison * packet buffer to help debugging when a dump of a poisoned * node is examined. * ************************************************************************/ #define rgp_update_poison_packet \ do \ { \ rgp->poison_pkt.seqno = rgp->rgppkt.seqno; \ rgp->poison_pkt.reason = rgp->rgppkt.reason; \ rgp->poison_pkt.activatingnode = rgp->rgppkt.activatingnode; \ rgp->poison_pkt.causingnode = rgp->rgppkt.causingnode; \ ClusterCopy(rgp->poison_pkt.initnodes, rgp->initnodes); \ ClusterCopy(rgp->poison_pkt.endnodes, rgp->endnodes); \ } while(0) /************************************************************************ * rgp_broadcast * ============= * * Description: * * This routine asks the message system to broadcast an unacknowledged * packet of subtype "packet_subtype" to a set of nodes indicated in * an appropriate field in the rgp control struct. How the broadcast * is implemented depends on the OS. * * Parameters: * * uint8 packet_subtype - type of unsequenced packet to send * * Returns: * * void - no return value * * Algorithm: * * The same data packet is to be sent to the set of nodes indicated * in the rgp control struct field. The sending can be done by queueing * the packets directly to the send engine or the send can be deferred * to a lower priority interrupt level. The former approach reduces * the latency for sending these urgent packets while the latter * approach may reduce the number of sends if several requests to * send the same type of packets (this is true only of regroup * packets) are made in quick succession. In this case, previous * requests are overwritten by later requests. This is OK since the * regroup algorithm has enough redundancy in packet sending. * * In NSK, the message system provides a broadcast facility for * unacknowledged packets. It copies regroup's packet into its own * buffer and issues multiple requests to the SNet services layer. * When it copies the buffer, it disables the timer and IPC * interrupts ensuring that there will be no contention with Regroup. * Therefore, this routine can safely update the packet area here * without checking if the sending apparatus has completed sending * the previous packet. * * This is not true of LCU where the message system does not * provide a broadcast facility. In LCU, the updating of the packet * buffer can be done only when the send engine has completed * sending. This is assured only in the send completion interrupt * handler (rgp_msgsys_work). 
* ************************************************************************/ _priv _resident void rgp_broadcast(uint8 packet_subtype) { cluster_t temp_cluster; switch (packet_subtype) { case RGP_UNACK_REGROUP : /* Trace the queueing of regroup status packets. */ RGP_TRACE( "RGP Send packets", rgp->rgppkt.stage, /* TRACE */ RGP_MERGE_TO_32( rgp->status_targets, /* TRACE */ rgp->rgppkt.knownstage1 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage2, /* TRACE */ rgp->rgppkt.knownstage3 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage4, /* TRACE */ rgp->rgppkt.knownstage5 ) ); /* TRACE */ #if defined(NSK) || defined(UNIX) || defined(NT) /* In NSK, the packet buffer can be updated even if the send * engine is working on the previous send. See algorithm * description above. */ if ((rgp->rgppkt.reason == MM_EVT_LEAVE) && (rgp->rgppkt.causingnode == rgp->mynode)) // If a LEAVE event is in progress exclude our node from knownstage mask rgp->rgppkt_to_send = rgp->rgppkt; else // copy regroup packet and insert our node number into knownstage mask rgp_update_regroup_packet; #endif /* NSK || UNIX || NT */ ClusterUnion(rgp->rgp_msgsys_p->regroup_nodes, rgp->status_targets, rgp->rgp_msgsys_p->regroup_nodes); /* Clear the targets field in the rgp_control struct after * copying this info. The message system must clear the target * bits in the common regroup/msgsys struct after sending the * packets. */ ClusterInit(rgp->status_targets); rgp->rgp_msgsys_p->sendrgppkts = 1; break; case RGP_UNACK_IAMALIVE : /* Count number of IamAlive requests queued. */ RGP_INCREMENT_COUNTER( QueuedIAmAlive ); ClusterUnion(rgp->rgp_msgsys_p->iamalive_nodes, rgp->rgpinfo.cluster, rgp->rgp_msgsys_p->iamalive_nodes); rgp->rgp_msgsys_p->sendiamalives = 1; /* No targets field to clear in the rgp_control struct. * The message system must clear the target bits in the common * regroup/msgsys struct after sending the packets. */ break; case RGP_UNACK_POISON : /* Trace the sending of poison packets. */ RGP_TRACE( "RGP Send poison ", rgp->rgppkt.stage, /* TRACE */ RGP_MERGE_TO_32( rgp->poison_targets, /* TRACE */ rgp->rgppkt.knownstage1 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage2, /* TRACE */ rgp->rgppkt.knownstage3 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage4, /* TRACE */ rgp->rgppkt.knownstage5 ) ); /* TRACE */ /* The poison packet targets must NOT be considered alive. */ ClusterIntersection(temp_cluster, rgp->rgpinfo.cluster, rgp->poison_targets); ClusterDifference(temp_cluster, temp_cluster, rgp->OS_specific_control.Banished); if (ClusterNumMembers(temp_cluster) != 0) RGP_ERROR(RGP_INTERNAL_ERROR); #if defined(NSK) || defined(NT) /* In NSK, the packet buffer can be updated even if the send * engine is working on the previous send. See algorithm * description above. */ rgp_update_poison_packet; #endif /* NSK || NT */ ClusterUnion(rgp->rgp_msgsys_p->poison_nodes, rgp->poison_targets, rgp->rgp_msgsys_p->poison_nodes); /* Clear the targets field in the rgp_control struct after * copying this info. The message system must clear the target * bits in the common regroup/msgsys struct after sending the * packets. 
*/ ClusterInit(rgp->poison_targets); rgp->rgp_msgsys_p->sendpoisons = 1; break; default : RGP_ERROR(RGP_INTERNAL_ERROR); break; } QUEUESEND; /* invoke OS-specific sending function/macro */ } /************************************************************************ * rgp_had_power_failure * ===================== * * Description: * * Tells the OS at the end of a regroup incident if a surviving node * had a power failure. The message system can use this to clear all * bus errors collected so far to node because node seems to have * had a power failure and has now recovered from it. Perhaps, the * bus errors were due to the power failure. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * Calls a message system routine to perform any error clearing. * ************************************************************************/ _priv _resident void rgp_had_power_failure(node_t node) { /* Currently, there is nothing to do. */ RGP_TRACE( "RGP Power fail ", node, 0, 0, 0); } /************************************************************************ * rgp_status_of_node * ================== * * Description: * * Ask the SP to return the status of a node. The SP must return the * current status and not return a stale status. This routine is * called by the split-brain avoidance algorithm in the two-node * case, for the non-tie-breaker to get the status of the tie-breaker * node. * * Parameters: * * node_t node * the node whose status is to be obtained. * * Returns: * * int - the status code of the node returned by the SP, appropriately * encoded into one of the values known to regroup. * * Algorithm: * * Calls a millicode routine to ask the SP for the status of the node. * ************************************************************************/ _priv _resident int rgp_status_of_node(node_t node) { #if defined(NT) /* noone home */ return RGP_NODE_UNREACHABLE; #else return _get_remote_cpu_state_( node ); /*F40:MB06452.1*/ #endif } /************************************************************************ * rgp_newnode_online * ================== * * Description: * * This routine is called if the first IamAlive is received from a * newly booted node before the cluster manager gets a chance to * call rgp_monitor_node(). The OS can use this routine to mark the * node as up if it does not have any other means to detect that * a node has come up. * * Parameters: * * node_t node - * the new node that has just been detected to be up * * Returns: * * void - no return value * * Algorithm: * * This routine marks the state of the node as up as seen by the * native OS. * * In NSK, on the reloader node, the marking of the reloadee as up * is done by the message system when the initial address handshake * packet is received from the reloadee. NSK does not require the * regroup module to report the fact that the reloadee is online. * * The above is probably true for LCU as well. However, the details * are not yet worked out. For now, this routine is a no-op for LCU. * ************************************************************************/ _priv _resident void rgp_newnode_online(node_t newnode) { RGP_TRACE( "RGP New node up ", newnode, 0, 0, 0); } /************************************************************************ * rgp_select_cluster_ex * ===================== * * Description: * * Given an array of cluster choices, this routine picks the best * cluster to keep alive. cluster_choices[] is the array of choices * and num_clusters is the number of entries in the array. 
* * Parameters: * * cluster_t cluster_choices[] * array of cluster choices * * int num_clusters * number of entries (choices) in the array * * node_t key_node * internal node number of the key node or RGP_NULL_NODE * * Returns: * * int - the index of the selected cluster; if no cluster * is viable, -1 is returned. * * Algorithm: * * By default, the best cluster is defined as the largest cluster. * Optionally, a node called key_node can be required to be present * for a cluster to be viable. key_node can be set to RGP_NULL_NODE * to imply that no specific node is required to be present. The * routine returns the index of the best cluster and -1 if none of * the clusters is viable (that is, does not include the key node). * ************************************************************************/ _priv _resident int rgp_select_cluster_ex(cluster_t cluster_choices[], int num_clusters, node_t key_node) { int max_members = 0, num_members; int cluster_selected = -1; int i; #if defined(UNIX) printf("rgp_select_cluster() called with %d choices:", num_clusters); for (i = 0; i < num_clusters; i++) { node_t j; printf("("); for (j = 0; j < (node_t) rgp->num_nodes; j++) { if (ClusterMember(cluster_choices[i], j)) printf("%d,", EXT_NODE(j)); } printf(")"); } printf("\n"); fflush(stdout); #endif /* UNIX */ for (i = 0; i < num_clusters; i++) { /* Skip the current cluster if a key node is defined and is not * in the cluster. */ if ((key_node != RGP_NULL_NODE) && !ClusterMember(cluster_choices[i], key_node)) continue; if ((num_members = ClusterNumMembers(cluster_choices[i])) > max_members) { cluster_selected = i; max_members = num_members; } } #if defined(UNIX) printf("Node %d: rgp_select_cluster() returned %d.\n", EXT_NODE(rgp->mynode), cluster_selected); fflush(stdout); #endif /* UNIX */ return (cluster_selected); } /************************************************************************ * rgp_select_cluster * ================== * * Description: * * Given an array of cluster choices, this routine picks the best * cluster to keep alive. cluster_choices[] is the array of choices * and num_clusters is the number of entries in the array. * * Parameters: * * cluster_t cluster_choices[] * array of cluster choices * * int num_clusters * number of entries (choices) in the array * * Returns: * * int - the index of the selected cluster; if no cluster * is viable, -1 is returned. * * Algorithm: * * By default, the best cluster is defined as the largest cluster. * Optionally, a node called RGP_KEY_NODE can be required to be present * for a cluster to be viable. RGP_KEY_NODE can be set to RGP_NULL_NODE * to imply that no specific node is required to be present. The * routine returns the index of the best cluster and -1 if none of * the clusters is viable (that is, does not include the key node). * ************************************************************************/ _priv _resident int rgp_select_cluster(cluster_t cluster_choices[], int num_clusters) { node_t key_node; if (RGP_KEY_NODE == RGP_NULL_NODE) { key_node = RGP_NULL_NODE; } else { key_node = INT_NODE(RGP_KEY_NODE); } return rgp_select_cluster_ex(cluster_choices , num_clusters, key_node); } #ifdef LCU /************************************************************************ * rgp_msgsys_work * =============== * * Description: * * LCU-specific routine that implements broadcasting of packets by * sending them serially. * * This routine is called from rgp_broadcast() to initiate new sends. 
* It is also the packet send completion interrupt handler (callback * routine), invoked by the LCU message system when the packet buffer * can be reused. * * Parameters: * * lcumsg_t *lcumsgp - * pointer to lcu message if called from the transport's send * completion interrupt handler; NULL if called from * rgp_broadcast() to send a new packet. * * int status - * the message completion status if called from the transport's * send completion interrupt handler; 0 if called from * rgp_broadcast() to send a new packet. * * Returns: * * void - no return value * * Algorithm: * * If called from the send completion interrupt, the routine checks * to see if the packet buffer needs to be refreshed. This is true * if the appropriate bit in the rgp_msgsys struct is set. If so, * the buffer is updated with the current info (using an update * macro). This update is relevant to regroup status packets and * poison packets, but not to IamAlives packets whose contents are * always the same. The bit is cleared after the packet is updated. * * Next, the routine checks if there are more destinations to send * the packet to. If so, it finds the next higher numbered node to * send to, issues a send and returns. * * If invoked from rgp_broadcast() to start a new broadcast, the * routine first checks to see if the previous broadcast of the * same packet is complete. This is indicated by the tag field in * the message struct. The tag is NULL if the broadcast has * completed or has not been initiated. In this case, the tag is * set to a non-NULL value and a new broadcast initiated, with * this routine specified as the callback routine. * * If the previous broadcast has not completed, nothing needs to * be done. The completion interrupt will cause the buffer to be * refreshed and the broadcast to be continued. The broadcast * will then include new targets that may be included in this * new request. * ************************************************************************/ _priv _resident void rgp_msgsys_work(lcumsg_t *lcumsgp, int status) { rgp_unseq_pkt_t *packet; cluster_t *sending_cluster; node_t node; if (lcumsgp == NULL) { /* New work requested. Only one type of work is requested at * a time. */ if (rgp->rgp_msgsys_p->sendrgppkts) { /* Have new regroup status packets to send. First check * if the last regroup status send completed. If so, * we can update the packet and initiate a new send. * If not, we must defer to the completion interrupt * (invocation of this routine with a non-NULL lcumsgp). */ lcumsgp = rgp->OS_specific_control.lcumsg_regroup_p; if (lcumsgp->lcu_tag == NULL) { /* Last send completed. Initiate new send. */ rgp_update_regroup_packet; rgp->rgp_msgsys_p->sendrgppkts = 0; for (node = 0; node < rgp->num_nodes; node++) { if (ClusterMember(rgp->rgp_msgsys_p->regroup_nodes, node)) { ClusterDelete(rgp->rgp_msgsys_p->regroup_nodes, node); lcumsgp->lcu_node = node; lcumsgp->lcu_tag = &(rgp->rgp_msgsys_p->regroup_nodes); if (lcuxprt_msg_send(lcumsgp, NULL, rgp_msgsys_work, 0) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); break; /* can send only to one node at a time */ } } } } else if (rgp->rgp_msgsys_p->sendiamalives) { /* Need to send IamAlives again. First check if the last * IamAlive send completed. If so, we can initiate a new send. * If not, we must defer to the completion interrupt * (invocation of this routine with a non-NULL lcumsgp). */ lcumsgp = rgp->OS_specific_control.lcumsg_iamalive_p; if (lcumsgp->lcu_tag == NULL) { /* Last send completed. Initiate new send. 
*/ rgp->rgp_msgsys_p->sendiamalives = 0; for (node = 0; node < rgp->num_nodes; node++) { if (ClusterMember(rgp->rgp_msgsys_p->iamalive_nodes, node)) { ClusterDelete(rgp->rgp_msgsys_p->iamalive_nodes, node); lcumsgp->lcu_node = node; lcumsgp->lcu_tag = &(rgp->rgp_msgsys_p->iamalive_nodes); if (lcuxprt_msg_send(lcumsgp, NULL, rgp_msgsys_work, 0) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); break; /* can send only to one node at a time */ } } } } else if (rgp->rgp_msgsys_p->sendpoisons) { /* Have new poison packets to send. First check * if the last poison packet send completed. If so, * we can update the packet and initiate a new send. * If not, we must defer to the completion interrupt * (invocation of this routine with a non-NULL lcumsgp). */ lcumsgp = rgp->OS_specific_control.lcumsg_poison_p; if (lcumsgp->lcu_tag == NULL) { /* Last send completed. Initiate new send. */ rgp_update_poison_packet; rgp->rgp_msgsys_p->sendpoisons = 0; for (node = 0; node < rgp->num_nodes; node++) { if (ClusterMember(rgp->rgp_msgsys_p->poison_nodes, node)) { ClusterDelete(rgp->rgp_msgsys_p->poison_nodes, node); lcumsgp->lcu_node = node; lcumsgp->lcu_tag = &(rgp->rgp_msgsys_p->poison_nodes); if (lcuxprt_msg_send(lcumsgp, NULL, rgp_msgsys_work, 0) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); break; /* can send only to one node at a time */ } } } } } /* new work */ else { /* Send completion interrupt; continue the broadcast if * there are targets remaining. */ RGP_LOCK; /* Find what type of packet completed; send the same type. */ packet = (rgp_unseq_pkt_t *) lcumsgp->lcu_reqmbuf.lcu_ctrlbuf; switch (packet->pktsubtype) { case RGP_UNACK_REGROUP : /* Check if packet needs to be updated. */ if (rgp->rgp_msgsys_p->sendrgppkts) { rgp_update_regroup_packet; rgp->rgp_msgsys_p->sendrgppkts = 0; } break; case RGP_UNACK_IAMALIVE : break; case RGP_UNACK_POISON : /* Check if packet needs to be updated. */ if (rgp->rgp_msgsys_p->sendpoisons) { rgp_update_poison_packet; rgp->rgp_msgsys_p->sendpoisons = 0; } break; } /* Check if there is any more node to send the same packet * type to. If not, set the tag to NULL and return. */ sending_cluster = (cluster_t *) (lcumsgp->lcu_tag); if (ClusterNumMembers(*sending_cluster) == 0) { lcumsgp->lcu_tag = NULL; /* indicate that broadcast is complete. */ return; } /* There is at least one more node to send to. Start with * the node with the next higher number than the node we * just finished sending to. * * The loop terminates after posting a send to the next * node to send to. We know there is at least one such node. */ for (node = lcumsgp->lcu_node + 1; node < rgp->num_nodes + 1; node++) { if (node == rgp->num_nodes) node = 0; /* continue the search starting at node 0 */ if (ClusterMember(*sending_cluster, node)) { ClusterDelete(*sending_cluster, node); lcumsgp->lcu_node = node; if (lcuxprt_msg_send(lcumsgp, NULL, rgp_msgsys_work, 0) != ELCU_OK) RGP_ERROR(RGP_INTERNAL_ERROR); break; /* can send only to one node at a time */ } } RGP_UNLOCK; } } #endif /* LCU */ /*---------------------------------------------------------------------------*/ #if defined(LCU) || defined(UNIX) || defined(NT) /*---------------------------------------------------------------------------*/ void rgp_hold_all_io(void) /* Simulates the TNet services routine to pause IO. 
 */
{
#if defined (NT)
  (*(rgp->OS_specific_control.HoldIOCallback))();
#endif
  RGP_TRACE( "RGP Hold all IO ", 0, 0, 0, 0);
}

/*---------------------------------------------------------------------------*/

void rgp_resume_all_io(void)
/* Simulates the TNet services routine to resume IO.
 */
{
#if defined (NT)
  (*(rgp->OS_specific_control.ResumeIOCallback))();
#endif
  RGP_TRACE( "RGP Resume IO ", 0, 0, 0, 0);
}

/*---------------------------------------------------------------------------*/

void RGP_ERROR_EX (uint16 halt_code, char* fname, DWORD lineno)
/* Halt node with error code. */
{
   char *halt_string;
   node_t node = RGP_NULL_NODE;

#if defined( NT )
   char halt_buffer[ 256 ];
   DWORD eventMsgId;
   BOOL skipFormatting = FALSE;

   //
   // If a user initiated a shutdown, the user wants the node to go down
   // and stay down until an explicit start command is given.
   //
   // We map RGP_RELOADFAILED to SHUTDOWN_DURING_REGROUP_ERROR since
   // HaltCallback does a graceful stop for the latter one.
   // SCM won't restart the node after a graceful stop unless
   // it is explicitly told to do so.
   //
   if (halt_code == RGP_RELOADFAILED && rgp->OS_specific_control.ShuttingDown)
   {
      halt_code = RGP_SHUTDOWN_DURING_RGP;
   }
#endif

   if (halt_code == RGP_RELOADFAILED)
   {
      halt_string = "[RGP] Node %d: REGROUP WARNING: reload failed.";
      eventMsgId = MM_EVENT_RELOAD_FAILED;
   }
   else if (halt_code == RGP_INTERNAL_ERROR)
   {
      halt_string =
         "[RGP] Node %d: REGROUP ERROR: consistency check failed in file %s, line %u.";
      eventMsgId = MM_EVENT_INTERNAL_ERROR;
      skipFormatting = TRUE;
      _snprintf(halt_buffer, sizeof( halt_buffer ) - 1,
                halt_string, EXT_NODE(rgp->mynode), fname, lineno);
   }
   else if (halt_code == RGP_MISSED_POLL_TO_SELF)
   {
      halt_string = "[RGP] Node %d: REGROUP ERROR: cannot talk to self.";
      eventMsgId = NM_EVENT_MEMBERSHIP_HALT;
   }
#if !defined(NT)
   else if (halt_code == RGP_AVOID_SPLIT_BRAIN)
   {
      halt_string =
         "[RGP] Node %d: REGROUP ERROR: committing suicide to avoid split brain.";
   }
#endif
   else if (halt_code == RGP_PRUNED_OUT)
   {
      halt_string =
         "[RGP] Node %d: REGROUP ERROR: pruned out due to communication failure.";
      eventMsgId = MM_EVENT_PRUNED_OUT;
   }
   else if ((halt_code >= RGP_PARIAH_FIRST) && (halt_code <= RGP_PARIAH_LAST))
   {
      halt_string =
         "[RGP] Node %d: REGROUP ERROR: poison packet received from node %d.";
      eventMsgId = MM_EVENT_PARIAH;
      node = (node_t)(halt_code - RGP_PARIAH);
   }
   else if (halt_code == RGP_ARBITRATION_FAILED)
   {
      halt_string = "[RGP] Node %d: REGROUP ERROR: arbitration failed.";
      eventMsgId = MM_EVENT_ARBITRATION_FAILED;
   }
   else if (halt_code == RGP_ARBITRATION_STALLED)
   {
      halt_string = "[RGP] Node %d: REGROUP ERROR: arbitration stalled.";
      eventMsgId = MM_EVENT_ARBITRATION_STALLED;
   }
   else if (halt_code == RGP_SHUTDOWN_DURING_RGP)
   {
      halt_string =
         "[RGP] Node %d: REGROUP INFO: regroup engine requested immediate shutdown.";
      eventMsgId = MM_EVENT_SHUTDOWN_DURING_RGP;
   }
   else
   {
      halt_string = "[RGP] Node %d: REGROUP ERROR: unknown halt code (%d).";
      eventMsgId = NM_EVENT_MEMBERSHIP_HALT;
      node = halt_code; // get it printed out by borrowing node
   }

#if defined(UNIX)
   printf(halt_string, EXT_NODE(rgp->mynode), node);
   fflush(stdout);

   /* Simulate a halt by dumping core and exiting the process.
    */
   abort();

#elif defined(NT)

   if ( !skipFormatting )
   {
      _snprintf(halt_buffer, sizeof( halt_buffer ) - 1,
                halt_string, EXT_NODE(rgp->mynode), node);
   }

#if CLUSTER_BETA
   ClRtlLogPrint(LOG_CRITICAL, "%1!hs!\t%2!hs!:%3!d!\n",
                 halt_buffer, fname, lineno);
#else
   ClRtlLogPrint(LOG_CRITICAL, "%1!hs!\n", halt_buffer );
#endif

   if ((halt_code >= RGP_PARIAH_FIRST) && (halt_code <= RGP_PARIAH_LAST))
   {
      WCHAR nodeString[ 16 ];
      PWCHAR nodeName;

      _snwprintf( nodeString, sizeof( nodeString ) / sizeof ( WCHAR ),
                  L"%d", node );
      nodeName = RgpGetNodeNameFromId( node );
      CsLogEvent2( LOG_CRITICAL, eventMsgId, nodeString, nodeName );
      if ( nodeName != NULL )
      {
         LocalFree( nodeName );
      }
   }
   else if ( eventMsgId == NM_EVENT_MEMBERSHIP_HALT )
   {
      WCHAR haltString[ 16 ];

      _snwprintf( haltString, sizeof( haltString ) / sizeof ( WCHAR ),
                  L"%d", halt_code );
      CsLogEvent1( LOG_CRITICAL, eventMsgId, haltString );
   }
   else
   {
      CsLogEvent( LOG_CRITICAL, eventMsgId );
   }

   /* We rely on RGP_ERROR_EX to kill the node immediately.
      rgp_cleanup() can potentially slow us down: 435977 showed that it
      can take up to 25 seconds if there is a lot of IP address activity.
      Since at the end of this function we execute HaltCallback, which
      kills the cluster, we can safely omit rgp_cleanup and rgp_cleanup_OS.

      If JoinFailedCallback is ever enabled, the fate of rgp_cleanup and
      rgp_cleanup_OS should be reevaluated.
   */
#if 0
   rgp_cleanup();
   rgp_cleanup_OS();

   if (halt_code == RGP_RELOADFAILED)
      (*(rgp->OS_specific_control.JoinFailedCallback))();
   else
#endif
      (*(rgp->OS_specific_control.HaltCallback))(halt_code); // does not return

#else
   cmn_err(CE_PANIC, halt_string, EXT_NODE(rgp->mynode), node);
#endif /* UNIX */
}
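
/*
 * Illustrative sketch (kept under #if 0, not compiled): RGP_ERROR() is used
 * throughout this file with only a halt code, while RGP_ERROR_EX() above also
 * expects the failing file name and line number.  The wrapper below is one
 * plausible way the two could be tied together; the real macro is defined in
 * the regroup headers and may differ.
 */
#if 0
#define RGP_ERROR( halt_code )  RGP_ERROR_EX( (halt_code), __FILE__, __LINE__ )

/* Example call site (taken from rgp_init_OS above): a failed consistency
 * check halts the node, and the wrapper captures where the check failed.
 */
if (lcuxprt_config(LCU_GET_MYSYSNUM, &sysnum) != ELCU_OK)
   RGP_ERROR(RGP_INTERNAL_ERROR);
#endif
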
/*---------------------------------------------------------------------------*/

void rgp_start_phase1_cleanup(void)
/* Tells the OS to start cleanup actions for all failed nodes.
 */
{
#if defined (NT)
   node_t i;

   //
   // On NT we saved the nodes-to-be-downed bitmask in NeedsNodeDownCallback.
   //
   for ( i=0; i < (node_t) rgp->num_nodes; i++)
   {
      if ( ClusterMember( rgp->OS_specific_control.NeedsNodeDownCallback, i ) )
      {
         (*(rgp->OS_specific_control.MsgCleanup1Callback))(EXT_NODE(i));
      }
   }
#endif

   RGP_TRACE( "RGP Ph1 cleanup ", 0, 0, 0, 0);
   rgp_event_handler(RGP_EVT_PHASE1_CLEANUP_DONE, RGP_NULL_NODE);
}

/*---------------------------------------------------------------------------*/

void rgp_start_phase2_cleanup(void)
/* The equivalent of NSK's regroupstage4action().
 */
{
#if defined (NT)
   BITSET bitset;
   node_t i;

   //
   // On NT we saved the nodes-to-be-downed bitmask in NeedsNodeDownCallback.
   //
   BitsetInit(bitset);
   for ( i=0; i < (node_t) rgp->num_nodes; i++)
   {
      if ( ClusterMember( rgp->OS_specific_control.NeedsNodeDownCallback, i ) )
      {
         BitsetAdd(bitset, EXT_NODE(i));
      }
   }
   (*(rgp->OS_specific_control.MsgCleanup2Callback))(bitset);
#endif

   RGP_TRACE( "RGP Ph2 cleanup ", 0, 0, 0, 0);
   rgp_event_handler(RGP_EVT_PHASE2_CLEANUP_DONE, RGP_NULL_NODE);
}

/*---------------------------------------------------------------------------*/

void rgp_cleanup_complete(void)
/* The equivalent of NSK's regroupstage5action().
 */
{
#if defined(NT)
#endif
   RGP_TRACE( "RGP completed ", 0, 0, 0, 0);
}

/*---------------------------------------------------------------------------*/

#endif /* LCU || UNIX || NT */
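
/*
 * Standalone model (kept under #if 0, not compiled) of the two cleanup
 * hand-offs above: phase 1 reports each downed node to the OS one at a time,
 * while phase 2 hands over the whole down set at once.  The mask type,
 * callbacks and node numbering below are simplified stand-ins for the
 * project's cluster_t/BITSET types and the MsgCleanup*Callback pointers.
 */
#if 0
#include <stdio.h>

typedef unsigned int demo_mask_t;               /* one bit per node */

static void demo_node_down(int node)            /* phase-1 style callback */
{
   printf("phase 1: node %d is down\n", node);
}

static void demo_nodes_down(demo_mask_t set)    /* phase-2 style callback */
{
   printf("phase 2: down set = 0x%x\n", set);
}

static void demo_cleanup(demo_mask_t needs_down, int num_nodes)
{
   demo_mask_t set = 0;
   int i;

   for (i = 0; i < num_nodes; i++)
   {
      if (needs_down & (1u << i))
      {
         demo_node_down(i);      /* phase 1: one callback per failed node   */
         set |= (1u << i);       /* accumulate the same set for phase 2     */
      }
   }

   demo_nodes_down(set);         /* phase 2: one callback with the full set */
}
#endif
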
#if defined(NT)

/************************************************************************
 * NT_timer_callback
 * =================
 *
 * Description:
 *
 *    This routine is the callback function that gets invoked whenever a
 *    timer pops. The routine calls rgp_periodic_check. The function's
 *    format is defined by the Win32 TimerProc callback.
 *
 * Parameters:
 *
 *    See below. We don't use any of them.
 *
 * Returns:
 *
 *    none.
 *
 * Algorithm:
 *
 *    This routine just calls rgp_periodic_check. The existence of this
 *    routine is solely due to a fixed-format callback defined by
 *    Microsoft.
 *
 ************************************************************************/
VOID CALLBACK NT_timer_callback( VOID )
{
#if defined(TDM_DEBUG)
   if ( !(rgp->OS_specific_control.debug.timer_frozen) &&
        !(rgp->OS_specific_control.debug.frozen) )
#endif
      rgp_periodic_check( );
}

/************************************************************************
 * NT_timer_thread
 * ===============
 *
 * Description:
 *
 *    This routine is executed as a separate thread in the Windows NT
 *    implementation. The thread generates the periodic regroup clock
 *    ticks. It is signalled via an event whenever the rate changes or
 *    to cause termination.
 *
 * Parameters:
 *
 *    None.
 *
 * Returns:
 *
 *    This thread should not go away; it exits only via ExitThread().
 *
 * Algorithm:
 *
 *    This routine is run as a separate thread. It sets up a waitable
 *    timer that pops every a_tick milliseconds (the current regroup
 *    clock period).
 *
 ************************************************************************/
void NT_timer_thread( void )
{
   BOOL Success;
   LARGE_INTEGER DueTime;
   DWORD Error, MyHandleIndex;
   HANDLE MyHandles[2];     /* for use by WaitForMultiple */
   DWORD status;
   DWORD msDueTime;

#define MyHandleSignalIx 0
#define MyHandleTimerIx  1

   MyHandles[MyHandleSignalIx] = rgp->OS_specific_control.TimerSignal;
                                       /* Event signals HB rate change */

   rgp->OS_specific_control.RGPTimer = CreateWaitableTimer(
                                          NULL,    // no security
                                          FALSE,   // auto-reset (bManualReset = FALSE)
                                          NULL );  // No name

   if (rgp->OS_specific_control.RGPTimer == NULL)
   {
      Error = GetLastError();
      RGP_ERROR(RGP_INTERNAL_ERROR);
   }

   status = MmSetThreadPriority();
   if ( status != ERROR_SUCCESS )
   {
      ClRtlLogPrint(LOG_CRITICAL,
                    "[MM] Unable to set timer thread priority, status %1!u!\n",
                    status );
      RGP_ERROR((uint16) status);
      ExitThread(status);
   }

   MyHandles[MyHandleTimerIx] = rgp->OS_specific_control.RGPTimer;

   while (TRUE)
   {
      MyHandleIndex = WaitForMultipleObjects (
                         2,           /* Number of Events */
                         MyHandles,   /* Handle Array */
                         FALSE,       /* Wait for ANY event */
                         INFINITE );  /* Wait forever */

      if (MyHandleIndex == MyHandleSignalIx)  // Timer Change Signal Event
      {
         // RGP rate has changed
         CancelWaitableTimer ( rgp->OS_specific_control.RGPTimer );

         if ( rgp->rgpinfo.a_tick == 0 )      // Time to quit
         {
            CloseHandle ( rgp->OS_specific_control.RGPTimer );
            rgp->OS_specific_control.RGPTimer = 0;
            ExitThread ( 0 );
         }

         // a_tick has the new RGP rate in milliseconds.
         msDueTime = rgp->rgpinfo.a_tick;
         DueTime.QuadPart = -10 * 1000 * msDueTime;

         Success = SetWaitableTimer(
                      rgp->OS_specific_control.RGPTimer,
                      &DueTime,
                      rgp->rgpinfo.a_tick,
                      NULL,
                      NULL,
                      FALSE);

         if (!Success)
         {
            Error = GetLastError();
            RGP_ERROR(RGP_INTERNAL_ERROR);
         }
      } // Timer Change Signal
      else
      {
         // RGP Timer Tick
         NT_timer_callback();
         NmTimerTick(msDueTime);
      }
   } // while
}
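
/*
 * Illustrative sketch (kept under #if 0, not compiled): the due-time
 * arithmetic used by NT_timer_thread above.  SetWaitableTimer takes the
 * first expiration as a count of 100-nanosecond units (negative = relative
 * to now) and the period in plain milliseconds, which is why the code
 * multiplies the millisecond tick by -10 * 1000 for DueTime but passes
 * a_tick unchanged as the period.  The helper name is hypothetical.
 */
#if 0
#include <windows.h>

static BOOL demo_start_periodic_timer(HANDLE timer, DWORD period_ms)
{
   LARGE_INTEGER due;

   /* 1 ms = 10,000 units of 100 ns; a negative value means "relative". */
   due.QuadPart = -(LONGLONG)period_ms * 10 * 1000;

   return SetWaitableTimer(timer,
                           &due,             /* first pop after period_ms */
                           (LONG)period_ms,  /* then every period_ms      */
                           NULL, NULL,       /* no completion routine     */
                           FALSE);           /* do not resume from suspend */
}
#endif
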
PWCHAR RgpGetNodeNameFromId( node_t NodeID )

/*++

Routine Description:

    Given a node ID, issue a get name node control to get the computer
    name of the node. Returned buffer is to be freed by the caller.

Arguments:

    NodeID - ID ( 1, 2, 3, ..) of the node

Return Value:

    pointer to buffer containing name

--*/

{
    PWCHAR buffer;
    DWORD bufferSize = MAX_COMPUTERNAME_LENGTH * sizeof( WCHAR );
    DWORD bytesReturned;
    DWORD bytesRequired;
    PNM_NODE node;

    buffer = LocalAlloc( LMEM_FIXED, bufferSize );
    if ( buffer != NULL )
    {
        node = NmReferenceNodeById( NodeID );
        if ( node != NULL )
        {
            NmNodeControl(node,
                          NULL,                     // HostNode OPTIONAL
                          CLUSCTL_NODE_GET_NAME,
                          NULL,                     // InBuffer
                          0,                        // InBufferSize
                          (PUCHAR)buffer,
                          bufferSize,
                          &bytesReturned,
                          &bytesRequired);

            OmDereferenceObject( node );
        }
    }

    return buffer;
}

#endif /* NT */

#ifdef __cplusplus
}
#endif /* __cplusplus */


#if 0

History of changes to this file:
-------------------------------------------------------------------------
1995, December 13                                        F40:KSK0610       /*F40:KSK06102.2*/

   This file is part of the portable Regroup Module used in the NonStop
   Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
   are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
   srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
   The last two are simulation files to test the Regroup Module on a
   UNIX workstation in user mode with processes simulating processor
   nodes and UDP datagrams used to send unacknowledged datagrams.

   This file was first submitted for release into NSK on 12/13/95.

------------------------------------------------------------------------------
This change occurred on 19 Jan 1996                                       /*F40:MB06458.1*/
Changes for phase IV Sierra message system release. Includes:             /*F40:MB06458.2*/
 - Some cleanup of the code                                               /*F40:MB06458.3*/
 - Increment KCCB counters to count the number of setup messages and      /*F40:MB06458.4*/
   unsequenced messages sent.                                             /*F40:MB06458.5*/
 - Fixed some bugs                                                        /*F40:MB06458.6*/
 - Disable interrupts before allocating broadcast sibs.                   /*F40:MB06458.7*/
 - Change per-packet-timeout to 5ms                                       /*F40:MB06458.8*/
 - Make the regroup and powerfail broadcast use highest priority          /*F40:MB06458.9*/
   tnet services queue.                                                   /*F40:MB06458.10*/
 - Call the millicode backdoor to get the processor status from SP        /*F40:MB06458.11*/
 - Fixed expand bug in msg_listen_ and msg_readctrl_                      /*F40:MB06458.12*/
 - Added enhancement to msngr_sendmsg_ so that clients do not need        /*F40:MB06458.13*/
   to be unstoppable before calling this routine.                         /*F40:MB06458.14*/
 - Added new steps in the build file called                               /*F40:MB06458.15*/
   MSGSYS_C - compiles all the message system C files                     /*F40:MB06458.16*/
   MSDRIVER - compiles all the MSDriver files                             /*F40:MB06458.17*/
   REGROUP - compiles all the regroup files                               /*F40:MB06458.18*/
 - remove #pragma env libspace because we set it as a command line        /*F40:MB06458.19*/
   parameter.                                                             /*F40:MB06458.20*/
-----------------------------------------------------------------------   /*F40:MB06458.21*/

#endif    /* 0 - change descriptions */
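
/*
 * Standalone model (kept under #if 0, not compiled) of the selection rule
 * implemented by rgp_select_cluster_ex() earlier in this file: the best
 * cluster is the largest one, and if a key node is given, only clusters
 * containing that node are viable; -1 means no viable cluster.  A plain
 * unsigned mask stands in for cluster_t here, and the names are
 * hypothetical.
 */
#if 0
#define DEMO_NO_KEY (-1)

static int demo_count_members(unsigned int mask)
{
   int n = 0;
   while (mask) { n += (int)(mask & 1u); mask >>= 1; }
   return n;
}

static int demo_select_cluster(const unsigned int choices[], int num, int key_node)
{
   int best = -1, best_members = 0, members, i;

   for (i = 0; i < num; i++)
   {
      /* Skip clusters that do not contain the required key node. */
      if (key_node != DEMO_NO_KEY && !(choices[i] & (1u << key_node)))
         continue;

      members = demo_count_members(choices[i]);
      if (members > best_members)
      {
         best = i;
         best_members = members;
      }
   }
   return best;
}
#endif
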