/* Include guard; closed by the final #endif of this file. */
#ifndef  _WRGP_H_
#define  _WRGP_H_

/* Pragmas applied only when compiling with __TANDEM defined. */
#ifdef __TANDEM
#pragma columns 79
#pragma page "wrgp.h - T9050 - internal declarations for Regroup Module"
#endif
|
||
|
|
||
|
/* @@@ START COPYRIGHT @@@
|
||
|
** Tandem Confidential: Need to Know only
|
||
|
** Copyright (c) 1995, Tandem Computers Incorporated
|
||
|
** Protected as an unpublished work.
|
||
|
** All Rights Reserved.
|
||
|
**
|
||
|
** The computer program listings, specifications, and documentation
|
||
|
** herein are the property of Tandem Computers Incorporated and shall
|
||
|
** not be reproduced, copied, disclosed, or used in whole or in part
|
||
|
** for any reason without the prior express written permission of
|
||
|
** Tandem Computers Incorporated.
|
||
|
**
|
||
|
** @@@ END COPYRIGHT @@@
|
||
|
**/
|
||
|
|
||
|
/*---------------------------------------------------------------------------
|
||
|
* This file (wrgp.h) contains the cluster_t data type and types used for the
|
||
|
* node pruning algorithm and declares the routines exported by the Cluster
|
||
|
* data type and the node pruning algorithm.
|
||
|
*---------------------------------------------------------------------------*/
|
||
|
|
||
|
/* Give the declarations in this header C linkage when included from C++. */
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
|
||
|
|
||
|
|
||
|
#include <jrgp.h>
|
||
|
#include <wrgpos.h>
|
||
|
#include <bitset.h>
|
||
|
|
||
|
#define RGP_VERSION               1    /* version # of data structures */
#define RGP_INITSEQNUM            0    /* starting seq # of regroup    */
|
||
|
|
||
|
|
||
|
/* Byte lengths of the three packet types declared later in this header.
 * Each expansion is fully parenthesized so it can be used safely inside
 * larger expressions.
 */
#define RGPPKTLEN      (sizeof(rgp_pkt_t))      /* byte length of regroup pkts  */
#define IAMALIVEPKTLEN (sizeof(iamalive_pkt_t)) /* byte length of IamAlive pkts */
#define POISONPKTLEN   (sizeof(poison_pkt_t))   /* byte length of poison pkts   */
|
||
|
|
||
|
|
||
|
/*-------------------------------------------------------*/
/* The following are the stages of the regroup algorithm */
/*-------------------------------------------------------*/

#define RGP_COLDLOADED            0
#define RGP_ACTIVATED             1
#define RGP_CLOSING               2
#define RGP_PRUNING               3
#define RGP_PHASE1_CLEANUP        4
#define RGP_PHASE2_CLEANUP        5
#define RGP_STABILIZED            6
|
||
|
|
||
|
|
||
|
/*--------------------------------------------------------------------*/
/* Macros to transform node numbers used by the OS to node numbers    */
/* used by the Regroup module and vice versa. Regroup's internal node */
/* numbers start at 0 while the OS starts node numbers at             */
/* LOWEST_NODENUM.                                                    */
/*                                                                    */
/* The argument is parenthesized so that an expression argument such  */
/* as (a & b) is converted as a whole rather than being split by      */
/* operator precedence.                                               */
/*--------------------------------------------------------------------*/
#define EXT_NODE(int_node) ((node_t)((int_node) + LOWEST_NODENUM))
#define INT_NODE(ext_node) ((node_t)((ext_node) - LOWEST_NODENUM))
|
||
|
|
||
|
|
||
|
/*----------------------------------------*/
/* Defines for the node pruning algorithm */
/*----------------------------------------*/

/* The data type "cluster_t" is a bit array of size equal to the maximum
 * number of nodes in the cluster. The bit array is implemented as an
 * array of uint8s.
 *
 * Given a node#, its bit position in the bit array is computed by first
 * locating the byte in the array (node# / BYTEL) and then the bit in
 * the byte. Bits in the byte are numbered 0..7 (from left to right).
 * Thus, node 0 is placed in byte 0, bit 0, which is the left-most bit
 * in the bit array.
 */
#define BYTE(cluster, node) ( (cluster)[(node) / BYTEL] ) /* byte# in array */
#define BIT(node)           ( (node) % BYTEL )            /* bit# in byte   */
|
||
|
|
||
|
|
||
|
/* The connectivity matrix is an array of elements of type cluster_t.
|
||
|
* cluster_t is equivalent to a bit array with one bit per node. Thus the
|
||
|
* matrix is equivalent to a two-dimensional bit array, with each
|
||
|
* dimension being MAX_CLUSTER_SIZE large. A bit value of 1 for matrix[i][j]
|
||
|
* represents a unidirectional connection between nodes i and j (a
|
||
|
* regroup packet received on node i from node j).
|
||
|
*/
|
||
|
|
||
|
typedef cluster_t connectivity_matrix_t[MAX_CLUSTER_SIZE];
|
||
|
|
||
|
|
||
|
/* connected(i,j): nonzero iff the matrix records a connection in both
 * directions between nodes i and j (row i has bit j set and row j has
 * bit i set).
 *
 * NOTE: the expansion refers to a connectivity matrix named `c`, which
 * must be in scope at the point of use. Each argument is evaluated twice,
 * so do not pass expressions with side effects.
 */
#define connected(i,j) (ClusterMember(c[(int)(i)], (j)) && \
                        ClusterMember(c[(int)(j)], (i)))  /* bidirectional */
|
||
|
|
||
|
/* Should a node that cannot receive its own regroup packets be considered
 * dead? Not necessarily. It may be able to send packets to others and
 * be considered alive by everyone. There is no real need for the ability
 * to send to yourself on the network. Software bugs could result in
 * such a situation. Therefore, the correct way to check if a node is
 * alive would be to check if there is a non-zero bit in either the row
 * or column corresponding to the node; that is, if the node has
 * received regroup packets from or sent regroup packets to any node,
 * it may be considered alive. But for simplicity, we will assume in
 * the following macro that a node that does not receive its own
 * regroup packets will be considered dead.
 *
 * NOTE: the expansion refers to a connectivity matrix named `c`, which
 * must be in scope at the point of use; the argument is evaluated twice.
 */

#define node_considered_alive(i) ClusterMember(c[(int)(i)], (i))
|
||
|
|
||
|
/* The upper bound on the number of potential fully-connected groups is
 * the lower of 2**N and 2**D where N is the number of live nodes and
 * D is the number of disconnects. If this number exceeds MAX_GROUPS,
 * do not attempt to exhaustively generate all possible groups;
 * just return an arbitrary fully-connected group which includes a
 * node selected by the cluster manager.
 *
 * MAX_GROUPS is derived from LOG2_MAX_GROUPS so the two cannot drift
 * apart; its value is still 256.
 */
#define LOG2_MAX_GROUPS 8                      /* log (base 2) of MAX_GROUPS */
#define MAX_GROUPS      (1 << LOG2_MAX_GROUPS) /* if more than these, pick
                                                * arbitrary group            */

/* min(2**nodes, 2**disconnects) > MAX_GROUPS is equivalent to both
 * exponents exceeding LOG2_MAX_GROUPS. Arguments are parenthesized so
 * that expression arguments compare as a whole.
 */
#define too_many_groups(nodes, disconnects) \
        (((nodes) > LOG2_MAX_GROUPS) && ((disconnects) > LOG2_MAX_GROUPS))
|
||
|
|
||
|
/* The disconnect array is an array of (i,j) pairs which represent a
|
||
|
* break in connectivity between nodes i and j.
|
||
|
*/
|
||
|
|
||
|
typedef node_t disconnect_array [LOG2_MAX_GROUPS * (LOG2_MAX_GROUPS-1)/2] [2];
|
||
|
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* Following are templates for three kinds of unacknowledged datagrams sent */
|
||
|
/* by the regroup module (regroup pkts, IamAlive pkts and poison pkts). */
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
|
||
|
//
// We already hand-packed all on-the-wire structures.
// packon.h will instruct the compiler not to mess with field alignment (kind of).
//
|
||
|
#include <packon.h>
|
||
|
|
||
|
/************************************************************************
|
||
|
* rgp_pkt_t (regroup status packet)
|
||
|
* ---------------------------------
|
||
|
* This structure is used to send the current state of the regroup state
|
||
|
* machine to other nodes.
|
||
|
*
|
||
|
* ___________________________________________________________
|
||
|
* wd0 | pktsubtype | stage | reason | Low8 ignscr |
|
||
|
* |_____________|_______________|_____________________________|
|
||
|
* wd1 | seqno |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd2 | activa- | causingnode | quorumowner |
|
||
|
* | tingnode | Hi8 ignscr | (was hadpowerfail) |
|
||
|
* |_____________|_______________|_____________________________|
|
||
|
* wd3 | knownstage1 | knownstage2 |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd4 | knownstage3 | knownstage4 |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd5 | knownstage5 | pruning_result |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd6 : :
|
||
|
* | connectivity_matrix |
|
||
|
* : :
|
||
|
* wd13|___________________________________________________________|
|
||
|
*
|
||
|
*
|
||
|
* pktsubtype - packet subtype = RGP_UNACK_REGROUP
|
||
|
* stage - current stage (state) of the regroup algorithm
|
||
|
* reason - reason for the activation of regroup
|
||
|
* seqno - sequence number of current regroup incident
|
||
|
* activatingnode - node that calls for a regroup incident
|
||
|
* causingnode - node whose poll packet was missed or which
|
||
|
* had a power failure or otherwise caused
|
||
|
* a regroup incident being called for
|
||
|
* quorumowner - mask of nodes that think they own the quorum resrc
|
||
|
* knownstage1 - mask of nodes known to have entered stage 1
|
||
|
* knownstage2 - mask of nodes known to have entered stage 2
|
||
|
* knownstage3 - mask of nodes known to have entered stage 3
|
||
|
* knownstage4 - mask of nodes known to have entered stage 4
|
||
|
* knownstage5 - mask of nodes known to have entered stage 5
|
||
|
* pruning_result - result of node pruning by tie-breaker node
|
||
|
* connectivity_matrix - current connectivity info for entire cluster
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#ifdef __TANDEM
|
||
|
#pragma fieldalign shared8 rgp_pkt
|
||
|
#endif /* __TANDEM */
|
||
|
|
||
|
typedef struct rgp_pkt
|
||
|
{
|
||
|
uint8 pktsubtype;
|
||
|
uint8 stage;
|
||
|
uint16 reason;
|
||
|
uint32 seqno;
|
||
|
uint8 activatingnode;
|
||
|
uint8 causingnode;
|
||
|
cluster_t quorumowner;
|
||
|
cluster_t knownstage1;
|
||
|
cluster_t knownstage2;
|
||
|
cluster_t knownstage3;
|
||
|
cluster_t knownstage4;
|
||
|
cluster_t knownstage5;
|
||
|
cluster_t pruning_result;
|
||
|
connectivity_matrix_t connectivity_matrix;
|
||
|
} rgp_pkt_t;
|
||
|
|
||
|
/************************************************************************
|
||
|
* iamalive_pkt_t
|
||
|
* --------------
|
||
|
* This structure is used by a node to indicate to another node that it
|
||
|
* is alive and well.
|
||
|
*
|
||
|
* ___________________________________________________________
|
||
|
* wd0 | pktsubtype | filler |
|
||
|
* |_____________|_____________________________________________|
|
||
|
* wd1 : :
|
||
|
* | testpattern |
|
||
|
* : :
|
||
|
* wd13|___________________________________________________________|
|
||
|
*
|
||
|
*
|
||
|
* pktsubtype - packet subtype = RGP_UNACK_IAMALIVE
|
||
|
* testpattern - a bit pattern used for testing
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#ifdef __TANDEM
|
||
|
#pragma fieldalign shared8 iamalive_pkt
|
||
|
#endif /* __TANDEM */
|
||
|
|
||
|
typedef struct iamalive_pkt
|
||
|
{
|
||
|
uint8 pktsubtype;
|
||
|
uint8 filler[3];
|
||
|
union
|
||
|
{
|
||
|
uint8 bytes[RGP_UNACK_PKTLEN - 4];
|
||
|
uint32 words[(RGP_UNACK_PKTLEN - 4)/4];
|
||
|
} testpattern;
|
||
|
} iamalive_pkt_t;
|
||
|
|
||
|
|
||
|
/************************************************************************
|
||
|
* poison_pkt_t
|
||
|
* ------------
|
||
|
* This structure is used to send a poison packet to another node to
|
||
|
* force the other node to halt.
|
||
|
*
|
||
|
* ___________________________________________________________
|
||
|
* wd0 | pktsubtype | unused1 | reason |
|
||
|
* |_____________|_______________|_____________________________|
|
||
|
* wd1 | seqno |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd2 | activa- | causingnode | |
|
||
|
* | tingnode | | unused2 |
|
||
|
* |_____________|_______________|_____________________________|
|
||
|
* wd3 | initnodes | endnodes |
|
||
|
* |_____________________________|_____________________________|
|
||
|
*
|
||
|
*
|
||
|
* pktsubtype - packet subtype = RGP_UNACK_POISON
|
||
|
* reason - reason for the last activation of regroup
|
||
|
* seqno - current regroup sequence number
|
||
|
* (sequence number of last regroup incident)
|
||
|
* activatingnode - node which called for last regroup incident
|
||
|
* causingnode - node whose poll packet was missed or which
|
||
|
* had a power failure or otherwise caused
|
||
|
* the last regroup incident being called for
|
||
|
* initnodes - mask of nodes at beginning of last regroup
|
||
|
* endnodes - mask of nodes at end of last regroup
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#ifdef __TANDEM
|
||
|
#pragma fieldalign shared8 poison_pkt
|
||
|
#endif /* __TANDEM */
|
||
|
|
||
|
typedef struct poison_pkt
|
||
|
{
|
||
|
uint8 pktsubtype;
|
||
|
uint8 unused1;
|
||
|
uint16 reason;
|
||
|
uint32 seqno;
|
||
|
uint8 activatingnode;
|
||
|
uint8 causingnode;
|
||
|
uint16 unused2;
|
||
|
cluster_t initnodes;
|
||
|
cluster_t endnodes;
|
||
|
} poison_pkt_t;
|
||
|
|
||
|
#include <packoff.h>
|
||
|
|
||
|
//
|
||
|
// There is no room for a 16 bit ignorescreen mask
|
||
|
// in rgp_pkt_t structure. We use a few bit from several
|
||
|
// fields to store the ignore screen.
|
||
|
// The following routines do packing and unpacking
|
||
|
// of ignorescreen from/into the packet
|
||
|
//
|
||
|
|
||
|
extern void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from);
|
||
|
extern void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to);
|
||
|
extern void SetMulticastReachable(uint32 mask);
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* This struct is keeps track of the state of each node in the cluster. */
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
typedef struct
|
||
|
{
|
||
|
uint16 status; /* state of node - alive, dead etc. */
|
||
|
uint16 pollstate; /* whether I'm alives have been received */
|
||
|
uint16 lostHBs; /* tracks the number of consecutive I'm alives lost */
|
||
|
} node_state_t;
|
||
|
|
||
|
/* The status and pollstate fields of the node_state_t struct can have the
 * following values.
 */

/* Node status of nodes */

#define RGP_NODE_ALIVE            1    /* node is considered alive     */
#define RGP_NODE_COMING_UP        2    /* node is coming up            */
#define RGP_NODE_DEAD             3    /* node has failed              */
#define RGP_NODE_NOT_CONFIGURED   4    /* node is not even configured  */

/* IamAlive status codes of nodes */

#define AWAITING_IAMALIVE         1    /* awaiting IamAlives           */
#define IAMALIVE_RECEIVED         2    /* got IamAlive                 */

#define RGP_IAMALIVE_THRESHOLD  100    /* after getting this many Iam- *
                                        * Alives, we check if every    *
                                        * node has sent at least one   */
|
||
|
|
||
|
|
||
|
/************************************************************************
|
||
|
* rgp_control_t (regroup's only global data structure)
|
||
|
* ----------------------------------------------------
|
||
|
* This structure holds all the Regroup state and other info.
|
||
|
* This is the only global data structure used by Regroup.
|
||
|
*
|
||
|
* NOTE: The word offsets shown in this picture assume that
|
||
|
* MAX_CLUSTER_SIZE is 16.
|
||
|
*
|
||
|
* ___________________________________________________________
|
||
|
* wd0 | |
|
||
|
* : rgpinfo structure :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
* wd3 | mynode | tiebreaker |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd4 | num_nodes |
|
||
|
* |___________________________________________________________|
|
||
|
* wd5 | clock_ticks | rgpcounter |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd6 | restartcount | pruning_ticks |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd7 | pfail_state | flags |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd8 | outerscreen | innerscreen |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd9 | status_targets | poison_targets |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd10| initnodes | endnodes |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd11| unreachable_nodes | arbitration_ticks |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd12| ignorescreen | filler[0] |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd13| filler[1] | filler[2] |
|
||
|
* |_____________________________|_____________________________|
|
||
|
* wd14| |
|
||
|
* : node_states[MAX_CLUSTER_SIZE] :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
* wd30| *nodedown_callback() |
|
||
|
* |___________________________________________________________|
|
||
|
* wd31| *select_cluster() |
|
||
|
* |___________________________________________________________|
|
||
|
* wd32| *rgp_msgsys_p |
|
||
|
* |___________________________________________________________|
|
||
|
* wd33| *received_pktaddr |
|
||
|
* |___________________________________________________________|
|
||
|
* wd34| |
|
||
|
* : rgppkt :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
* wd48| |
|
||
|
* : rgppkt_to_send :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
* wd62| |
|
||
|
* : iamalive_pkt :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
* wd76| |
|
||
|
* : poison_pkt :
|
||
|
* |___________________________________________________________|
|
||
|
* wd80| |
|
||
|
* : :
|
||
|
* : potential_groups[MAX_GROUPS] :
|
||
|
* : :
|
||
|
* |___________________________________________________________|
|
||
|
*wd208| |
|
||
|
* : last_stable_seqno :
|
||
|
* |___________________________________________________________|
|
||
|
*wd212| |
|
||
|
* : internal_connectivity_matrix :
|
||
|
* |___________________________________________________________|
|
||
|
*wdyyy| |
|
||
|
* : OS_specific_control :
|
||
|
*wdxxx|___________________________________________________________|
|
||
|
*
|
||
|
*
|
||
|
* rgpinfo - contains regroup timing parameters and mask of
|
||
|
* fully-integrated cluster (to send IamAlives and monitor)
|
||
|
*
|
||
|
* mynode - node number of local node
|
||
|
*
|
||
|
* tiebreaker - node selected to act as a tie-breaker in the
|
||
|
* split-brain avoidance algorithm and to run the
|
||
|
* pruning algorithm
|
||
|
*
|
||
|
* num_nodes - number of nodes configured in the system, including
|
||
|
* any unused node numbers in the middle; this is equal
|
||
|
* to (the largest configured node# in the system -
|
||
|
* lowest possible node # + 1).
|
||
|
*
|
||
|
* clock_ticks- regroup's internal clock used for checking if it is
|
||
|
* time to send IamAlive packets and to check if IamAlives
|
||
|
* have been received. It is incremented every
|
||
|
* RGP_CLOCK_PERIOD and reset to 0 after checking
|
||
|
* for IamAlives.
|
||
|
*
|
||
|
* rgpcounter - counts regroup clock ticks in a regroup incident in
|
||
|
* order to detect if the algorithm is stalling.
|
||
|
* This is reset when a new regroup incident begins and
|
||
|
* is incremented at each regroup clock tick while
|
||
|
* regroup is perturbed.
|
||
|
*
|
||
|
* restartcount - counts # of regroup algorithm restarts in each regroup
|
||
|
* incident; the node is halted if there are too many
|
||
|
* restarts.
|
||
|
*
|
||
|
* pruning_ticks - number of regroup clock ticks after the tie-breaker
|
||
|
* has been selected; if there are disconnects, the
|
||
|
* tie-breaker should wait a fixed number of ticks
|
||
|
* before running the pruning algorithm.
|
||
|
*
|
||
|
* pfail_state - set to a +ve value when a pfail event is reported
|
||
|
* to regroup. It is decremented at every regroup
|
||
|
* clock tick till it reaches zero. While this number
|
||
|
* is +ve, missing self IamAlives are ignored and
|
||
|
* do not cause the node to halt. This gives the
|
||
|
* sending hardware some time to recover from power
|
||
|
* failures before self IamAlives are checked.
|
||
|
*
|
||
|
* outerscreen - outer recognition mask: nodes not in this mask are
|
||
|
* considered dead or outcasts; if they try to contact
|
||
|
* us, send them poison packets to make sure they stay down
|
||
|
*
|
||
|
* innerscreen - inner recognition mask: nodes not in this mask are
|
||
|
* considered tardy. Regroup packets from them will be
|
||
|
* ignored. They may survive if they can find some
|
||
|
* node which hasn't eliminated them from this screen.
|
||
|
*
|
||
|
* status_targets - nodes to send regroup status packets to
|
||
|
*
|
||
|
* poison_targets - nodes to send poison packets to
|
||
|
*
|
||
|
* initnodes - nodes alive at the beginning of last regroup incident
|
||
|
*
|
||
|
* endnodes - nodes alive at the end of last regroup incident
|
||
|
*
|
||
|
* unreachable_nodes - stores unreachable_node events till the events
|
||
|
* can be processed
|
||
|
*
|
||
|
* arbitration_ticks - number of regroup clock ticks after the arbitration
|
||
|
* started. If arbitration_ticks counter exceeds
|
||
|
* RGP_ARBITRATION_TIMEOUT number of ticks,
|
||
|
* the arbitrating node will shoot itself, and the rest
|
||
|
* of the group will restart the regroup ignoring stalled
|
||
|
* arbitrator
|
||
|
*
|
||
|
* ignorescreen - this is a local copy of ignorescreen passed as
|
||
|
* a part of the regroup packet. The packets from
|
||
|
* the nodes in this screen are ignored and no wait
|
||
|
* for the nodes in ignorescreen is performed in stage 1
|
||
|
*
|
||
|
* last_stable_seqno - this is a sequence number of the last successful regroup.
|
||
|
* It allows really outdated packets to be detected
|
||
|
*
|
||
|
* flags:
|
||
|
*
|
||
|
* cautiousmode - need to be "cautious"; wait longer in stage 1
|
||
|
*
|
||
|
* sendstage - This flag is used to indicate whether the
|
||
|
* regroup status packets should indicate we
|
||
|
* are in the current stage. When we enter the
|
||
|
* cleanup stages, we don't let others know we
|
||
|
* are in the stage until the cleanup actions
|
||
|
* are completed.
|
||
|
*
|
||
|
* This flag is set when a new regroup incident
|
||
|
* is started. It is then cleared when we enter
|
||
|
* a cleanup stage and set again when the
|
||
|
* cleanup operations are completed.
|
||
|
*
|
||
|
* tiebreaker_selected - set in stage 2 after tie-breaker is selected
|
||
|
*
|
||
|
* has_unreachable_nodes - set when a node_unreachable event is detected
|
||
|
* in stages 1 or 2. checked in stage 3.
|
||
|
*
|
||
|
* flags_unused - 11 unused bits
|
||
|
*
|
||
|
* node_states[MAX_CLUSTER_SIZE] - state of all the nodes
|
||
|
*
|
||
|
* *nodedown_callback() - registered callback routine to be invoked
|
||
|
* to report node failure
|
||
|
*
|
||
|
* *select_cluster() - registered callback routine to be invoked
|
||
|
* when multiple cluster options exist
|
||
|
*
|
||
|
* *rgp_msgsys_p - pointer to struct shared by regroup and message system
|
||
|
*
|
||
|
* *received_pktaddr - address of rgp packet received
|
||
|
*
|
||
|
* rgp_lock - lock to serialize access to this struct
|
||
|
*
|
||
|
* rgppkt - regroup status in the form of a packet
|
||
|
*
|
||
|
* rgppkt_to_send - regroup packet to be broadcast
|
||
|
*
|
||
|
* iamalive_pkt - I am alive packet to be broadcast
|
||
|
*
|
||
|
* poison_pkt - poison packet to be sent
|
||
|
*
|
||
|
* potential_groups[MAX_GROUPS] - scratch pad for pruning algorithm
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#ifdef __TANDEM
|
||
|
#pragma fieldalign shared8 rgp_control
|
||
|
#endif /* __TANDEM */
|
||
|
|
||
|
typedef struct rgp_control
|
||
|
{
|
||
|
/* timing parameters and cluster membership */
|
||
|
rgpinfo_t rgpinfo;
|
||
|
|
||
|
/* node numbers */
|
||
|
node_t mynode;
|
||
|
node_t tiebreaker;
|
||
|
uint32 num_nodes;
|
||
|
|
||
|
/* various counters counting clock ticks */
|
||
|
uint16 clock_ticks;
|
||
|
uint16 rgpcounter;
|
||
|
uint16 restartcount;
|
||
|
uint16 pruning_ticks;
|
||
|
uint16 pfail_state;
|
||
|
|
||
|
/* rgpflags */
|
||
|
uint16 cautiousmode : 1;
|
||
|
uint16 sendstage : 1;
|
||
|
uint16 tiebreaker_selected : 1;
|
||
|
uint16 has_unreachable_nodes : 1;
|
||
|
uint16 arbitration_started : 1;
|
||
|
uint16 flags_unused : 11;
|
||
|
|
||
|
/* cluster masks */
|
||
|
cluster_t outerscreen;
|
||
|
cluster_t innerscreen;
|
||
|
cluster_t status_targets;
|
||
|
cluster_t poison_targets;
|
||
|
cluster_t initnodes;
|
||
|
cluster_t endnodes;
|
||
|
cluster_t unreachable_nodes;
|
||
|
|
||
|
uint16 arbitration_ticks;
|
||
|
cluster_t ignorescreen;
|
||
|
|
||
|
uint16 filler[3]; /* for alignment and future use */
|
||
|
|
||
|
/* node states */
|
||
|
node_state_t node_states[MAX_CLUSTER_SIZE];
|
||
|
|
||
|
/* callback routines */
|
||
|
void (*nodedown_callback)(cluster_t failed_nodes);
|
||
|
int (*select_cluster)(cluster_t cluster_choices[], int num_clusters);
|
||
|
|
||
|
/* pointers to other structures */
|
||
|
rgp_msgsys_p rgp_msgsys_p;
|
||
|
rgp_pkt_t *received_pktaddr;
|
||
|
|
||
|
/* current status in the form of a regroup packet */
|
||
|
rgp_pkt_t rgppkt;
|
||
|
|
||
|
/* packets to be sent */
|
||
|
rgp_pkt_t rgppkt_to_send;
|
||
|
iamalive_pkt_t iamalive_pkt;
|
||
|
poison_pkt_t poison_pkt;
|
||
|
|
||
|
/* scratch pad for node pruning algorithm */
|
||
|
cluster_t potential_groups[MAX_GROUPS];
|
||
|
|
||
|
/* The rest of the struct is an OS-specific substruct
|
||
|
* (defined in wrgpos.h).
|
||
|
*/
|
||
|
uint32 last_stable_seqno;
|
||
|
|
||
|
/* temporary place to collect connectivity information
|
||
|
* while send_stage = 0. (Can't use rgp_pkt conn.matrix,
|
||
|
* because we don't want to see our info until we get
|
||
|
* the first timer tick */
|
||
|
|
||
|
connectivity_matrix_t internal_connectivity_matrix;
|
||
|
OS_specific_rgp_control_t OS_specific_control;
|
||
|
|
||
|
} rgp_control_t;
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* Procedures exported by the Cluster type implementation */
|
||
|
|
||
|
_priv _resident extern void
|
||
|
ClusterInit(cluster_t c);
|
||
|
_priv _resident extern void
|
||
|
ClusterUnion(cluster_t dst, cluster_t src1, cluster_t src2);
|
||
|
_priv _resident extern void
|
||
|
ClusterIntersection(cluster_t dst, cluster_t src1, cluster_t src2);
|
||
|
_priv _resident extern void
|
||
|
ClusterDifference(cluster_t dst, cluster_t src1, cluster_t src2);
|
||
|
_priv _resident extern int
|
||
|
ClusterCompare(cluster_t c1, cluster_t c2);
|
||
|
_priv _resident extern int
|
||
|
ClusterSubsetOf(cluster_t big, cluster_t small);
|
||
|
_priv _resident extern void
|
||
|
ClusterComplement(cluster_t dst, cluster_t src);
|
||
|
_priv _resident extern int
|
||
|
ClusterMember(cluster_t c, node_t i);
|
||
|
_priv _resident extern void
|
||
|
ClusterInsert(cluster_t c, node_t i);
|
||
|
_priv _resident extern void
|
||
|
ClusterDelete(cluster_t c, node_t i);
|
||
|
_priv _resident extern void
|
||
|
ClusterCopy(cluster_t dst, cluster_t src);
|
||
|
_priv _resident extern void
|
||
|
ClusterSwap(cluster_t c1, cluster_t c2);
|
||
|
_priv _resident extern int
|
||
|
ClusterNumMembers(cluster_t c);
|
||
|
extern int
|
||
|
ClusterEmpty(cluster_t c);
|
||
|
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* Function to select the tie-breaker node used in both the split-brain
|
||
|
* avoidance and node pruning algorithms
|
||
|
*/
|
||
|
_priv _resident extern node_t
|
||
|
rgp_select_tiebreaker(cluster_t cluster);
|
||
|
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* Procedures exported by the node pruning algorithm */
|
||
|
|
||
|
_priv _resident extern void MatrixInit(connectivity_matrix_t c);
|
||
|
/* Initialize the matrix c to show 0 connectivity. */
|
||
|
|
||
|
_priv _resident extern void
|
||
|
MatrixSet(connectivity_matrix_t c, int row, int column);
|
||
|
/* Set c[row,column] to 1. */
|
||
|
|
||
|
_priv _resident extern void
|
||
|
MatrixOr(connectivity_matrix_t t, connectivity_matrix_t s);
|
||
|
/* OR in s into t. */
|
||
|
|
||
|
_priv _resident extern int connectivity_complete(connectivity_matrix_t c);
|
||
|
/* Returns 1 if all live nodes are connected to all other live nodes
|
||
|
* and 0 if there is at least one disconnect.
|
||
|
*/
|
||
|
|
||
|
_priv _resident extern int
|
||
|
find_all_fully_connected_groups(connectivity_matrix_t c,
|
||
|
node_t selected_node,
|
||
|
cluster_t groups[]);
|
||
|
/* Analyzes the connectivity matrix and comes up with the list of
|
||
|
* all maximal, fully-connected groups. Returns the number of
|
||
|
* such groups found. 0 is returned iff there are no live nodes.
|
||
|
*/
|
||
|
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
/* Declaration of Regroup's global data structure */
|
||
|
|
||
|
#ifdef NSK
|
||
|
#include <wmsgsac.h>
|
||
|
#define rgp ((rgp_control_t *) MSGROOT->RegroupControlAddr)
|
||
|
#else
|
||
|
extern rgp_control_t *rgp;
|
||
|
#endif /* NSK */
|
||
|
/*---------------------------------------------------------------------------*/
|
||
|
|
||
|
/* Close the extern "C" block opened near the top of this header. */
#ifdef __cplusplus
}
#endif /* __cplusplus */
|
||
|
|
||
|
|
||
|
#if 0
|
||
|
|
||
|
History of changes to this file:
|
||
|
-------------------------------------------------------------------------
|
||
|
1995, December 13 F40:KSK0610 /*F40:KSK06102.1*/
|
||
|
|
||
|
This file is part of the portable Regroup Module used in the NonStop
|
||
|
Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
|
||
|
are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
|
||
|
srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
|
||
|
The last two are simulation files to test the Regroup Module on a
|
||
|
UNIX workstation in user mode with processes simulating processor nodes
|
||
|
and UDP datagrams used to send unacknowledged datagrams.
|
||
|
|
||
|
This file was first submitted for release into NSK on 12/13/95.
|
||
|
------------------------------------------------------------------------------
|
||
|
|
||
|
#endif /* 0 - change descriptions */
|
||
|
|
||
|
|
||
|
#endif /* _WRGP_H_ defined */
|