/*++
|
|
|
|
Copyright(c) 1998,99 Microsoft Corporation
|
|
|
|
Module Name:
|
|
|
|
load.c
|
|
|
|
Abstract:
|
|
|
|
Windows Load Balancing Service (WLBS)
|
|
Driver - load balancing algorithm
|
|
|
|
Author:
|
|
|
|
bbain
|
|
|
|
ToDo:
|
|
Kernel mode queue mgt
|
|
Fail safe mode (single server for everything)
|
|
--*/
|
|
|
|
#ifdef KERNEL_MODE
|
|
|
|
#include <ntddk.h>
|
|
|
|
#include "log.h"
|
|
#include "univ.h"
|
|
#include "main.h" // added for multiple nic
|
|
|
|
static ULONG log_module_id = LOG_MODULE_LOAD;
|
|
|
|
#else
|
|
|
|
#include <stdlib.h>
|
|
#include <windows.h>
|
|
#endif
|
|
|
|
#include <stdio.h>
|
|
#include "wlbsparm.h"
|
|
#include "params.h"
|
|
#include "wlbsiocl.h"
|
|
#include "wlbsip.h"
|
|
#include "load.h"
|
|
|
|
//
|
|
// For WPP Event Tracing
|
|
//
|
|
#include "trace.h" // for event tracing
|
|
#include "load.tmh" // for event tracing
|
|
#ifndef KERNEL_MODE
|
|
|
|
#define UNIV_PRINT(s) { printf s ; printf ("\n"); }
|
|
#define Univ_ulong_to_str(x, y, z) (y)
|
|
|
|
#define LOG_MSG(c,s)
|
|
#define LOG_MSG1(c,s,d1)
|
|
#define LOG_MSG2(c,s,d1,d2)
|
|
#define LOG_MSG3(c,s,d1,d2,d3)
|
|
#define LOG_MSG4(c,s,d1,d2,d3,d4)
|
|
|
|
#else
|
|
|
|
#define malloc(s) ExAllocatePoolWithTag (NonPagedPool, s, UNIV_POOL_TAG)
|
|
#define free(s) ExFreePool (s)
|
|
|
|
#endif
|
|
|
|
|
|
//extern CVY_PARAMS univ_params;
|
|
//#define univ_params ( * (lp -> params))
|
|
|
|
void Bin_state_print(PBIN_STATE bp, ULONG my_host_id);
|
|
void Load_conn_kill(PLOAD_CTXT lp, PBIN_STATE bp); /* v1.32B */
|
|
|
|
|
|
// static WCHAR buf [256]; /* string buffer (V1.1.2) */
|
|
|
|
|
|
/* CONSTANTS */
|
|
|
|
|
|
#if 0 /* v2.06 */
|
|
#define BIN_ALL_ONES ((MAP_T)-1) /* bin map state for 64 ones (v2.04) */
|
|
#endif
|
|
#define BIN_ALL_ONES ((MAP_T)(0xFFFFFFFFFFFFFFF)) /* bin map state for 60 ones (v2.04) */
|
|
|
|
|
|
/* FUNCTIONS */
|
|
|
|
|
|
/* Byte offset of a field in a structure of the specified type: */
|
|
|
|
#define CVY_FIELD_OFFSET(type, field) ((LONG_PTR)&(((type *)0)->field))
|
|
|
|
/*
|
|
* Address of the base of the structure given its type, field name, and the
|
|
* address of a field or field offset within the structure:
|
|
*/
|
|
|
|
#define STRUCT_PTR(address, type, field) ((type *)( \
|
|
(PCHAR)(address) - \
|
|
(PCHAR)CVY_FIELD_OFFSET(type, field)))
|
|
|
|
/*
|
|
* Function: Load_teaming_consistency_notify
|
|
* Description: This function is called to notify a team in which this adapter
|
|
* might be participating whether the teaming configuration in the
|
|
* heartbeats is consistent or not. Inconsistent configuration
|
|
* results in the entire team being marked inactive - meaning that
|
|
* no adapter in the team will handle any traffic, except to the DIP.
|
|
* Parameters: member - a pointer to the team membership information for this adapter.
|
|
* consistent - a boolean indicating the polarity of teaming consistency.
|
|
* Returns: Nothing.
|
|
* Author: shouse, 3.29.01
|
|
* Notes: In order to check to see whether or not this adapter is part of a team,
|
|
* we need to look into the team member information for this adapter. This
|
|
* access should be locked, but for performance reasons, we will only lock
|
|
* and check for sure if we "think" we're part of a team. Worst case is that
|
|
* we are in the process of joining a team and we missed this check - no
|
|
* matter, we'll notify them when/if we see this again.
|
|
*/
|
|
/*
 * Function: Load_teaming_consistency_notify
 * Description: Notify the BDA team in which this adapter may be participating
 *              whether the teaming configuration seen in remote heartbeats is
 *              consistent.  An inconsistent configuration inactivates the whole
 *              team - no adapter in the team handles any traffic, except to the DIP.
 * Parameters: member - pointer to the team membership information for this adapter.
 *             consistent - polarity of teaming consistency.
 * Returns: Nothing.
 * Author: shouse, 3.29.01
 * Notes: The unlocked pre-check of member->active is a deliberate fast path; the
 *        authoritative check is repeated under univ_bda_teaming_lock.  Worst case
 *        we miss a notification while joining a team and deliver it on a later call.
 */
VOID Load_teaming_consistency_notify (IN PBDA_MEMBER member, IN BOOL consistent) {

    /* Make sure that the membership information points to something. */
    ASSERT(member);

    /* Unsynchronized fast-path check to keep the common (non-teaming) case cheap.
       If we don't appear to be part of a team, bail out; if we actually are in
       the middle of joining one, a subsequent call will notify the team. */
    if (!member->active) return;

    NdisAcquireSpinLock(&univ_bda_teaming_lock);

    /* Re-check under the lock - only an active member may touch its team. */
    if (member->active) {
        /* Assert that the team actually points to something. */
        ASSERT(member->bda_team);

        /* Assert that the member ID is valid. */
        ASSERT(member->member_id <= CVY_BDA_MAXIMUM_MEMBER_ID);

        if (consistent) {
            UNIV_PRINT(("Load_teaming_consistency_notify: Consistent configuration detected."));

            /* Mark this member as consistent.  BUGFIX: use an unsigned constant
               for the shift - "1 << id" shifts a signed int, which is not
               well-defined if the member ID can reach bit 31. */
            member->bda_team->consistency_map |= (1UL << member->member_id);
        } else {
            UNIV_PRINT(("Load_teaming_consistency_notify: Inconsistent configuration detected."));

            /* Mark this member as inconsistent (unsigned shift, as above). */
            member->bda_team->consistency_map &= ~(1UL << member->member_id);

            /* Inactivate the team. */
            member->bda_team->active = FALSE;
        }
    }

    NdisReleaseSpinLock(&univ_bda_teaming_lock);
}
|
|
|
|
/*
|
|
* Function: Load_teaming_consistency_check
|
|
* Description: This function is used to check our teaming configuration against the
|
|
* teaming configuration received in a remote heartbeat. It does little
|
|
* more than check the equality of two DWORDS, however, if this is our
|
|
* first notification of bad configuration, it prints a few debug state-
|
|
* ments as well.
|
|
* Parameters: bAlreadyKnown - a boolean indication of whether or not we have already detected bad configuration.
|
|
* If the misconfiguration is already known, no additional logging is done.
|
|
* member - a pointer to the team member structure for this adapter.
|
|
* myConfig - a DWORD containing the teaming "code" for me.
|
|
* theirCofnig - a DWORD containing the teaming "code" received in the heartbeat from them.
|
|
* Returns: BOOLEAN (as ULONG) - TRUE means the configuration is consistent, FALSE indicates that it is not.
|
|
* Author: shouse, 3.29.01
|
|
* Notes: In order to check to see whether or not this adapter is part of a team,
|
|
* we need to look into the team member information for this adapter. This
|
|
* access should be locked, but for performance reasons, we will only lock
|
|
* and check for sure if we "think" we're part of a team. Worst case is that
|
|
* we are in the process of joining a team and we missed this check - no
|
|
* matter, we'll check again on the next heartbeat.
|
|
*/
|
|
/*
 * Function: Load_teaming_consistency_check
 * Description: Compares our bi-directional-affinity teaming configuration code
 *              against the code received in a remote heartbeat.  At its core it
 *              is one DWORD equality test; on the FIRST detection of a mismatch
 *              (bAlreadyKnown == FALSE) it additionally logs which sub-field(s)
 *              of the code disagree.
 * Parameters: bAlreadyKnown - TRUE if misconfiguration was already detected; suppresses re-logging.
 *             member - pointer to the team member structure for this adapter.
 *             myConfig - DWORD teaming "code" for this host.
 *             theirConfig - DWORD teaming "code" received in the remote heartbeat.
 * Returns: BOOLEAN (as ULONG) - TRUE if the configuration is consistent, FALSE if not.
 * Author: shouse, 3.29.01
 * Notes: The unlocked pre-check of member->active is a performance fast path; the
 *        authoritative check is repeated under univ_bda_teaming_lock.  Once team
 *        membership is confirmed the lock is released BEFORE the field-by-field
 *        comparison - the comparison only reads the two DWORD parameters.
 */
ULONG Load_teaming_consistency_check (IN BOOLEAN bAlreadyKnown, IN PBDA_MEMBER member, IN ULONG myConfig, IN ULONG theirConfig) {

    /* Unsynchronized fast-path check: if we don't appear to be part of a team,
       report "consistent" and bail; if we actually are joining a team, we will
       re-check on the next heartbeat. */
    if (!member->active) return TRUE;

    NdisAcquireSpinLock(&univ_bda_teaming_lock);

    /* If we are part of a BDA team, check the BDA teaming configuration consistency. */
    if (member->active) {

        /* Membership confirmed; the rest operates only on the parameters. */
        NdisReleaseSpinLock(&univ_bda_teaming_lock);

        /* If the bi-directional affinity teaming configurations don't match, do something about it. */
        if (myConfig != theirConfig) {

            /* Log details only on first detection. */
            if (!bAlreadyKnown) {
                UNIV_PRINT(("Bad teaming configuration detected: Mine=0x%08x, Theirs=0x%08x", myConfig, theirConfig));

                /* Report whether or not the teaming active flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK)) {
                    UNIV_PRINT(("Teaming active flags do not match: Mine=%d, Theirs=%d",
                                (myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET,
                                (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET));
                }

                /* Report whether or not the master flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK)) {
                    UNIV_PRINT(("Master/slave settings do not match: Mine=%d, Theirs=%d",
                                (myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET,
                                (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET));
                }

                /* Report whether or not the reverse hashing flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK)) {
                    UNIV_PRINT(("Reverse hashing flags do not match: Mine=%d, Theirs=%d",
                                (myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET,
                                (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET));
                }

                /* Report whether or not the number of team members is consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK)) {
                    UNIV_PRINT(("Numbers of team members do not match: Mine=%d, Theirs=%d",
                                (myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET,
                                (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET));
                }

                /* Report whether or not the team membership lists are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK)) {
                    UNIV_PRINT(("Participating members lists do not match: Mine=0x%04x, Theirs=0x%04x",
                                (myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET,
                                (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET));
                }
            }

            return FALSE;
        }

        return TRUE;
    }

    /* Not actually in a team after all - release the lock and report consistent. */
    NdisReleaseSpinLock(&univ_bda_teaming_lock);

    return TRUE;
}
|
|
|
|
/*
|
|
* Function: Load_teaming_code_create
|
|
* Description: This function pieces together the ULONG code that represents the configuration
|
|
* of bi-directional affinity teaming on this adapter. If the adapter is not part
|
|
* of a team, then the code is zero.
|
|
* Parameters: code - a pointer to a ULONG that will receive the 32-bit code word.
|
|
* member - a pointer to the team member structure for this adapter.
|
|
* Returns: Nothing.
|
|
* Author: shouse, 3.29.01
|
|
* Notes: In order to check to see whether or not this adapter is part of a team,
|
|
* we need to look into the team member information for this adapter. This
|
|
* access should be locked, but for performance reasons, we will only lock
|
|
* and check for sure if we "think" we're part of a team. Worst case is that
|
|
* we are in the process of joining a team and we missed this check - no
|
|
* matter, we'll be through here the next time er send a heartbeat anyway.
|
|
*/
|
|
/*
 * Function: Load_teaming_code_create
 * Description: Builds the 32-bit code word that represents this adapter's
 *              bi-directional affinity teaming configuration.  If the adapter
 *              is not part of a team, the code is zero.
 * Parameters: code - pointer to a ULONG that receives the 32-bit code word.
 *             member - pointer to the team member structure for this adapter.
 * Returns: Nothing (result delivered through *code).
 * Author: shouse, 3.29.01
 * Notes: The unlocked pre-check of member->active is a performance fast path;
 *        the authoritative check is repeated under univ_bda_teaming_lock.  If
 *        we miss a just-joined team here, the code is regenerated on the next
 *        heartbeat send anyway.
 */
VOID Load_teaming_code_create (OUT PULONG code, IN PBDA_MEMBER member) {

    /* Assert that the code actually points to something. */
    ASSERT(code);

    /* Assert that the membership information actually points to something. */
    ASSERT(member);

    /* Reset the code - zero is the "not teaming" value. */
    *code = 0;

    /* Unsynchronized fast-path check to keep the common (non-teaming) case cheap. */
    if (!member->active) return;

    NdisAcquireSpinLock(&univ_bda_teaming_lock);

    /* If we are in a team, fill in the team configuration information. */
    if (member->active) {
        /* Assert that the team actually points to something. */
        ASSERT(member->bda_team);

        /* Pack active/master/reverse-hash flags plus the team's member count
           and membership fingerprint into the code word. */
        CVY_BDA_TEAMING_CODE_CREATE(*code,
                                    member->active,
                                    member->master,
                                    member->reverse_hash,
                                    member->bda_team->membership_count,
                                    member->bda_team->membership_fingerprint);
    }

    NdisReleaseSpinLock(&univ_bda_teaming_lock);
}
|
|
|
|
/*
|
|
* Function: Load_add_reference
|
|
* Description: This function adds a reference to the load module of a given adapter.
|
|
* Parameters: pLoad - a pointer to the load module to reference.
|
|
* Returns: ULONG - The incremented value.
|
|
* Author: shouse, 3.29.01
|
|
* Notes:
|
|
*/
|
|
/*
 * Function: Load_add_reference
 * Description: Atomically takes one reference on the given load module.
 * Parameters: pLoad - pointer to the load module to reference.
 * Returns: ULONG - the reference count after the increment.
 * Author: shouse, 3.29.01
 * Notes:
 */
ULONG Load_add_reference (IN PLOAD_CTXT pLoad) {
    ULONG newCount;

    /* A NULL load context is a caller bug. */
    ASSERT(pLoad);

    /* Bump the count atomically and hand the new value back. */
    newCount = NdisInterlockedIncrement(&pLoad->ref_count);

    return newCount;
}
|
|
|
|
/*
|
|
* Function: Load_release_reference
|
|
* Description: This function releases a reference on the load module of a given adapter.
|
|
* Parameters: pLoad - a pointer to the load module to dereference.
|
|
* Returns: ULONG - The decremented value.
|
|
* Author: shouse, 3.29.01
|
|
* Notes:
|
|
*/
|
|
/*
 * Function: Load_release_reference
 * Description: Atomically drops one reference on the given load module.
 * Parameters: pLoad - pointer to the load module to dereference.
 * Returns: ULONG - the reference count after the decrement.
 * Author: shouse, 3.29.01
 * Notes:
 */
ULONG Load_release_reference (IN PLOAD_CTXT pLoad) {
    ULONG newCount;

    /* A NULL load context is a caller bug. */
    ASSERT(pLoad);

    /* Drop the count atomically and hand the new value back. */
    newCount = NdisInterlockedDecrement(&pLoad->ref_count);

    return newCount;
}
|
|
|
|
/*
|
|
* Function: Load_get_reference_count
|
|
* Description: This function returns the current reference count on a given adapter.
|
|
* Parameters: pLoad - a pointer to the load module to check.
|
|
* Returns: ULONG - The current reference count.
|
|
* Author: shouse, 3.29.01
|
|
* Notes:
|
|
*/
|
|
/*
 * Function: Load_get_reference_count
 * Description: Reads the current reference count of the given load module.
 * Parameters: pLoad - pointer to the load module to check.
 * Returns: ULONG - the current reference count (an unsynchronized snapshot).
 * Author: shouse, 3.29.01
 * Notes:
 */
ULONG Load_get_reference_count (IN PLOAD_CTXT pLoad) {
    /* A NULL load context is a caller bug. */
    ASSERT(pLoad);

    /* Plain read of the counter - no interlocked operation needed for a snapshot. */
    return pLoad->ref_count;
}
|
|
|
|
/* Hash routine is based on a public-domain Tiny Encryption Algorithm (TEA) by
|
|
David Wheeler and Roger Needham at the Computer Laboratory of Cambridge
|
|
University. For reference, please consult
|
|
http://vader.brad.ac.uk/tea/tea.shtml */
|
|
|
|
/* Hash routine based on the public-domain Tiny Encryption Algorithm (TEA) by
   David Wheeler and Roger Needham (Computer Laboratory, Cambridge University).
   Mixes the two 32-bit inputs through eight TEA-style rounds with a fixed key
   and folds the result to a single ULONG.
   NOTE(review): the original unrolled these rounds by hand for performance;
   this loop form is round-for-round identical, with the implicit operator
   precedence (binary '+' binds tighter than '^') made explicit. */
ULONG Map (
    ULONG v1,
    ULONG v2) /* v2.06: removed range parameter */
{
    ULONG y = v1;
    ULONG z = v2;
    ULONG sum = 0;
    ULONG round;

    /* Fixed 128-bit key, one constant per quarter. */
    const ULONG a = 0x67; //key [0];
    const ULONG b = 0xdf; //key [1];
    const ULONG c = 0x40; //key [2];
    const ULONG d = 0xd3; //key [3];

    /* Golden-ratio derived round constant, as in standard TEA. */
    const ULONG delta = 0x9E3779B9;

    /* Eight mixing rounds (the original source unrolled these). */
    for (round = 0; round < 8; round++)
    {
        sum += delta;
        y += ((z << 4) + a) ^ (z + sum) ^ ((z >> 5) + b);
        z += ((y << 4) + c) ^ (y + sum) ^ ((y >> 5) + d);
    }

    return y ^ z;
} /* end Map */
|
|
|
|
|
|
BOOLEAN Bin_targ_map_get(
    PLOAD_CTXT lp,
    PBIN_STATE binp, /* ptr. to bin state */
    ULONG my_host_id,
    PMAP_T pmap) /* ptr. to target map */
/*
    Get target map for this host.

    In CVY_SINGLE mode the host with the numerically smallest non-zero load
    value ("highest priority") is given ALL bins and everyone else gets none;
    duplicate priorities inhibit convergence.  In load-balanced mode, bins are
    apportioned across hosts in proportion to their load percentages, moving
    as few bins as possible away from their current owners.

    returns BOOLEAN:
        TRUE  => valid target map is returned via pmap
        FALSE => error occurred (duplicate single-server priorities); no target map returned
*/
{
    ULONG remsz,        /* remainder size: bins left over after proportional division */
          loadsz,       /* size of a load partition */
          first_bit;    /* first bit position of load partition */
    MAP_T targ_map;     /* bit map of load bins for this host */
    ULONG tot_load = 0; /* total of load percentages across all hosts */
    ULONG * pload_list; /* ptr. to list of load balance percentages */
    WCHAR num [20];
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    pload_list = binp->load_amt;

    if (binp->mode == CVY_SINGLE)
    {
        ULONG max_pri; /* best ("highest") priority = smallest non-zero load value */
        ULONG i;

        first_bit = 0;

        /* compute max priority; CVY_MAX_HOSTS+1 is worse than any real priority */
        max_pri = CVY_MAX_HOSTS + 1;

        for (i=0; i<CVY_MAX_HOSTS; i++)
        {
            tot_load += pload_list[i]; /* v2.1 */

            if (pload_list[i] != 0)
            {
                //
                // If another host has the same priority as this host, do not converge
                //
                if (i!= my_host_id && pload_list[i] == pload_list[my_host_id])
                {
                    /* log the duplicate only once (dup_sspri latches) */
                    if (!(lp->dup_sspri))
                    {
                        UNIV_PRINT(("Host %d: duplicate single svr priorities detected", my_host_id));
                        Univ_ulong_to_str (pload_list[my_host_id], num, 10);
                        LOG_MSG(MSG_ERROR_SINGLE_DUP, num);

                        lp->dup_sspri = TRUE;
                    }

                    /* 1.03: return error, which inhibits convergence; note that
                       rule will be automatically reinstated when duplicate server
                       priorities are eliminated */

                    return FALSE;
                }

                /* track the smallest non-zero priority value seen */
                if ( pload_list[i] <= max_pri )
                {
                    max_pri = pload_list[i];
                }
            }
        }

        binp->tot_load = tot_load; /* v2.1 */

        /* now determine if we are the highest priority host: winner takes
           every bin, all others take none */

        if (pload_list[my_host_id] == max_pri)
        {
            loadsz = CVY_MAXBINS;
            targ_map = BIN_ALL_ONES; /* v2.05 */
        }
        else
        {
            loadsz = 0;
            targ_map = 0; /* v2.05 */
        }
    }

    else /* load balanced */
    {
        ULONG i, j;
        ULONG partsz[CVY_MAX_HOSTS+1];
        /* new (target) partition size per host; slot CVY_MAX_HOSTS is a pseudo-host */
        ULONG cur_partsz[CVY_MAX_HOSTS+1];
        /* current partition size per host (v2.05) */
        ULONG cur_host[CVY_MAXBINS];
        /* current owner host for each bin; CVY_MAX_HOSTS marks an orphan (v2.05) */
        ULONG tot_partsz; /* sum of partition sizes */
        ULONG donor;      /* current donor host (v2.05) */
        ULONG cur_nbins;  /* current # bins (v2.05) */

        /* setup current partition sizes and bin-to-host mapping from current map (v2.05) */

        cur_nbins = 0;

        for (j=0; j<CVY_MAXBINS; j++)
            cur_host[j] = CVY_MAX_HOSTS; /* all bins are initially orphans */

        for (i=0; i<CVY_MAX_HOSTS; i++)
        {
            ULONG count = 0L;
            MAP_T cmap = binp->cur_map[i];

            tot_load += pload_list[i]; /* folded into this loop v2.1 */

            /* walk host i's current map bit by bit; stop early once no bits remain */
            for (j=0; j<CVY_MAXBINS && cmap != ((MAP_T)0); j++)
            {
                /* if host i has bin j and it's not a duplicate, set up the mapping
                   (first claimant wins; later duplicates are ignored here) */

                if ((cmap & ((MAP_T)0x1)) != ((MAP_T)0) && cur_host[j] == CVY_MAX_HOSTS)
                {
                    count++;
                    cur_host[j] = i;
                }
                cmap >>= 1;
            }

            cur_partsz[i] = count;
            cur_nbins += count;
        }

        /* sanity clamp - should be impossible since duplicates were filtered above */
        if (cur_nbins > CVY_MAXBINS)
        {
            UNIV_PRINT(("Bin_targ_map_get: error - too many bins found"));
            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);

            cur_nbins = CVY_MAXBINS;
        }

        /* if there are orphan bins, give them to pseudo-host CVY_MAX_HOSTS for now (v2.05) */

        if (cur_nbins < CVY_MAXBINS)
            cur_partsz[CVY_MAX_HOSTS] = CVY_MAXBINS - cur_nbins;
        else
            cur_partsz[CVY_MAX_HOSTS] = 0;

        /* compute total load */

        binp->tot_load = tot_load; /* v2.06 */

        /* now compute tentative partition sizes (proportional share, rounded
           down) and remainder after initially dividing up partitions among hosts */

        tot_partsz = 0;
        first_bit = 0;

        for (i=0; i<CVY_MAX_HOSTS; i++)
        {
            if (tot_load > 0)
                partsz[i] = CVY_MAXBINS * pload_list[i] / tot_load;
            else
                partsz[i] = 0;

            tot_partsz += partsz[i];
        }

        remsz = CVY_MAXBINS - tot_partsz;

        /* check for zero total load - nobody wants bins, return an empty map */

        if (tot_partsz == 0)
        {
            * pmap = 0;
            return TRUE;
        }

        /* first dole out remainder bits to hosts that currently have bins (this
           minimizes the number of bins that have to move) v2.05 */

        if (remsz > 0)
        {
            for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
                if (cur_partsz[i] > 0 && pload_list[i] > 0)
                {
                    partsz[i]++;
                    remsz--;
                }
        }

        /* now dole out remainder bits to hosts that currently have no bins (to maintain
           the target load balance) v2.05 */

        if (remsz > 0)
        {
            for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
                if (cur_partsz[i] == 0 && pload_list[i] > 0)
                {
                    partsz[i]++;
                    remsz--;
                }
        }

        /* now dole out remainder bits among non-zero partitions round robin */

        i = 0;
        while (remsz > 0)
        {
            if (pload_list[i] > 0)
            {
                partsz[i]++;
                remsz--;
            }

            i++;
            if (i == CVY_MAX_HOSTS)
                i = 0;
        }

        /* reallocate bins to target hosts to match new partition sizes (v2.05):
           every host short of its target pulls bins from hosts holding excess */

        donor = 0;
        partsz[CVY_MAX_HOSTS] = 0; /* pseudo-host needs no bins */

        for (i=0; i<CVY_MAX_HOSTS; i++)
        {
            ULONG rcvrsz;  /* current receiver's target partition */
            ULONG donorsz; /* current donor's target partition size */

            /* find and give this host some bins */

            rcvrsz = partsz[i];

            while (rcvrsz > cur_partsz[i])
            {
                /* find a host with too many bins; the donor index only advances,
                   so each donor is drained at most once per pass */

                for (; donor < CVY_MAX_HOSTS; donor++)
                    if (partsz[donor] < cur_partsz[donor])
                        break;

                /* if donor is pseudo-host and it's out of bins, give it more bins
                   to keep algorithm from looping; this should never happen */

                if (donor >= CVY_MAX_HOSTS && cur_partsz[donor] == 0)
                {
                    UNIV_PRINT(("Bin_targ_map_get: error - no donor bins"));
                    LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                    cur_partsz[donor] = CVY_MAXBINS;
                }

                /* now find the donor's bins and give them to the target host */

                donorsz = partsz[donor]; /* donor's target bin count */

                for (j=0; j<CVY_MAXBINS; j++)
                {
                    if (cur_host[j] == donor)
                    {
                        cur_host[j] = i;
                        cur_partsz[donor]--;
                        cur_partsz[i]++;

                        /* if this donor has no more to give, go find the next donor;
                           if this receiver needs no more, go on to next receiver */

                        if (donorsz == cur_partsz[donor] || rcvrsz == cur_partsz[i])
                            break;
                    }
                }

                /* if no bin was found, log a fatal error and exit */

                if (j == CVY_MAXBINS)
                {
                    UNIV_PRINT(("Bin_targ_map_get: error - no bin found"));
                    LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                    break;
                }
            }
        }

        /* finally, compute bit mask for this host (v2.05) */

        targ_map = 0;

        for (j=0; j<CVY_MAXBINS; j++)
        {
            /* any bin still orphaned here indicates an internal error; assign
               it to host 0 so the returned maps remain covering */
            if (cur_host[j] == CVY_MAX_HOSTS)
            {
                UNIV_PRINT(("Bin_targ_map_get: error - incomplete mapping"));
                LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                cur_host[j] = 0;
            }

            if (cur_host[j] == my_host_id)
                targ_map |= ((MAP_T)1) << j;
        }
    }

    * pmap = targ_map;

    return TRUE;

} /* end Bin_targ_map_get */
|
|
|
|
|
|
BOOLEAN Bin_map_check(
|
|
ULONG tot_load, /* total load percentage (v2.06) */
|
|
PMAP_T pbin_map) /* bin map for all hosts */
|
|
{
|
|
MAP_T tot_map, /* total map for all hosts */
|
|
ovr_map, /* overlap map between hosts */
|
|
exp_tot_map; /* expected total map */
|
|
ULONG i;
|
|
|
|
|
|
/* compute expected total map (2.04) */
|
|
|
|
if (tot_load == 0) /* v2.06 */
|
|
return TRUE;
|
|
else
|
|
exp_tot_map = BIN_ALL_ONES;
|
|
|
|
/* compute total map and overlap map */
|
|
|
|
tot_map = ovr_map = 0;
|
|
|
|
for (i=0; i<CVY_MAX_HOSTS; i++)
|
|
{
|
|
ovr_map |= (pbin_map[i] & tot_map);
|
|
tot_map |= pbin_map[i];
|
|
}
|
|
|
|
if (tot_map == exp_tot_map && ovr_map == 0)
|
|
return TRUE;
|
|
else
|
|
return FALSE;
|
|
|
|
} /* end Bin_map_check */
|
|
|
|
|
|
BOOLEAN Bin_map_covering(
|
|
ULONG tot_load, /* total load percentage (v2.06) */
|
|
PMAP_T pbin_map) /* bin map for all hosts */
|
|
{
|
|
MAP_T tot_map, /* total map for all hosts */
|
|
exp_tot_map; /* expected total map */
|
|
ULONG i;
|
|
|
|
|
|
/* compute expected total map (v2.04) */
|
|
|
|
if (tot_load == 0) /* v2.06 */
|
|
return TRUE;
|
|
else
|
|
exp_tot_map = BIN_ALL_ONES;
|
|
|
|
/* compute total map and overlap map */
|
|
|
|
tot_map = 0;
|
|
|
|
for (i=0; i<CVY_MAX_HOSTS; i++)
|
|
{
|
|
tot_map |= pbin_map[i];
|
|
}
|
|
|
|
if (tot_map == exp_tot_map)
|
|
return TRUE;
|
|
else
|
|
return FALSE;
|
|
|
|
} /* end Bin_map_covering */
|
|
|
|
|
|
void Bin_state_init(
    PLOAD_CTXT lp,
    PBIN_STATE binp,    /* ptr. to bin state */
    ULONG index,        /* index of bin state */
    ULONG my_host_id,
    ULONG mode,
    ULONG prot,
    BOOLEAN equal_bal,  /* TRUE => balance equally across hosts */
    USHORT affinity,
    ULONG load_amt)     /* this host's load percentage if unequal */
/*
    Initialize bin state for a port group.

    Resets all maps, per-host load percentages and send/receive bin sets to an
    initial "all idle, nothing owned" state.  Connection counters and the
    connection queue are initialized only on the very first call for this bin
    state (guarded by binp->initialized), so re-initialization preserves any
    existing connection bookkeeping.
*/
{
    ULONG i; /* loop variable */
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    /* parameter sanity: equal balancing and per-host load values above
       CVY_MAX_HOSTS are meaningless in single-server mode, and the bin index
       must be in range */
    if ((equal_bal && mode == CVY_SINGLE) ||
        (mode == CVY_SINGLE && load_amt > CVY_MAX_HOSTS) ||
        index >= CVY_MAXBINS)
    {
        UNIV_ASSERT(FALSE); // This should never happen
    }

    binp->code = CVY_BINCODE; /* structure signature for corruption checks (bbain 8/19/99) */
    binp->equal_bal = equal_bal;
    binp->affinity = affinity;
    binp->index = index;
    binp->compatible = TRUE;
    binp->mode = mode;
    binp->prot = prot;

    /* initialize target and new load maps: own nothing, everyone idle */

    binp->targ_map = 0;
    binp->all_idle_map = BIN_ALL_ONES;
    binp->cmap = 0; /* v2.1 */

    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        binp->new_map[i] = 0;
        binp->cur_map[i] = 0;
        binp->chk_map[i] = 0;
        binp->idle_map[i] = BIN_ALL_ONES;
    }

    /* initialize load percentages for all hosts; equal balancing overrides
       the caller-supplied load percentage */

    if (equal_bal)
    {
        load_amt = CVY_EQUAL_LOAD;
    }

    binp->tot_load = load_amt;

    /* only this host's slot carries a load value until heartbeats arrive */
    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        if (i == my_host_id)
        {
            binp->orig_load_amt =
            binp->load_amt[i] = load_amt;
        }
        else
            binp->load_amt[i] = 0;
    }

    /* initialize requesting state to no requests active and all bins local or none */

    binp->snd_bins = 0;
    binp->rcv_bins = 0;
    binp->rdy_bins = 0;
    binp->idle_bins = BIN_ALL_ONES; /* we are initially idle */

    /* perform first initialization only once (v2.06) - connection counts and
       the connection queue survive re-initialization */

    if (!(binp->initialized))
    {
        binp->tconn = 0;

        for (i=0; i<CVY_MAXBINS; i++)
        {
            binp->nconn[i] = 0;
        }

        Queue_init(&(binp->connq));
        binp->initialized = TRUE;
    }

} /* end Bin_state_init */
|
|
|
|
|
|
BOOLEAN Bin_converge(
    PLOAD_CTXT lp,
    PBIN_STATE binp,    /* ptr. to bin state */
    ULONG my_host_id)
/*
    Explicitly attempt to converge new port group state.

    Computes this host's target bin map, then proposes a new map consisting of
    the bins it currently owns plus any orphan bins (owned by nobody) that fall
    in the target set, and finally validates that the resulting set of new maps
    is consistent and covering.

    returns BOOL:
        TRUE  => all hosts have consistent new state for converging
        FALSE => parameter error or inconsistent convergence state
*/
{
    MAP_T orphan_map; /* map of orphans that this host will now own */
    ULONG i;

    /* determine new target load map; 1.03: return in error if no map generated
       (e.g. duplicate single-server priorities) */

    if (!Bin_targ_map_get(lp, binp, my_host_id, &(binp->targ_map)))
        return FALSE;

    /* compute map of all currently orphan bins; note that all duplicates are
       considered to be orphans */

    orphan_map = 0;

    for (i=0; i<CVY_MAX_HOSTS; i++)
        orphan_map |= binp->cur_map[i];

    /* invert the union of all current maps: set bits = bins nobody owns */
    orphan_map = ~orphan_map;

    /* update our new map to include all current bins and orphans that are in the
       target set */

    binp->new_map[my_host_id] = binp->cmap |                  /* v2.1 */
                                (binp->targ_map & orphan_map); /* 1.03 */

    /* check that new load maps are consistent and covering */

    return Bin_map_check(binp->tot_load, binp->new_map); /* v2.06 */

} /* end Bin_converge */
|
|
|
|
|
|
void Bin_converge_commit(
    PLOAD_CTXT lp,
    PBIN_STATE binp,    /* ptr. to bin state */
    ULONG my_host_id)
/*
    Commit to new port group state.

    Validates the agreed-upon new maps (logging once if they are bad), copies
    them into the current/check maps, and derives the bins this host must hand
    off (rdy_bins) and the bins it must receive (rcv_bins) relative to its
    target map.  Bins being handed off are removed from the current map
    immediately so they are not served while in transit.
*/
{
    ULONG i;
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    /* check that new load maps are consistent and covering */

    if (!(Bin_map_check(binp->tot_load, binp->new_map))) /* v2.06 */
    {
        /* log the bad map only once (bad_map latches) */
        if (!(lp->bad_map))
        {
            UNIV_PRINT(("Bin_converge_commit: bad new map"));
            LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, (ULONG_PTR)binp->new_map);

            lp->bad_map = TRUE;
        }
    }

    /* commit to new current maps */

    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        binp->chk_map[i] =
        binp->cur_map[i] = binp->new_map[i];
    }

    /* setup new send/rcv bins, and new ready to ship bins; note that ready to
       ship bins are cleared from the current map */

    /* rdy_bins: bins we hold but that are no longer in our target set */
    binp->rdy_bins = binp->cur_map[my_host_id] & ~(binp->targ_map); /* 1.03 */

    binp->cur_map[my_host_id] &= ~(binp->rdy_bins);

    /* rcv_bins: target bins we do not yet hold */
    binp->rcv_bins = binp->targ_map & ~(binp->cur_map[my_host_id]);

    /* cache this host's current map (v2.1) */
    binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */

#if 0
    /* simulation output generator (2.05) - disabled; counts bins being lost
       and bins targeted, user-mode diagnostics only */
    {
        ULONG lcount = 0L;
        ULONG ncount = 0L;
        MAP_T bins = binp->rdy_bins;

        for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
            if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
                lcount++;

        bins = binp->targ_map;

        for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
            if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
                ncount++;

        printf("Connverge at host %d pg %d: losing %d, will have %d bins\n", my_host_id, binp->index,
               lcount, ncount);
    }
#endif

} /* end Bin_converge_commit */
|
|
|
|
|
|
BOOLEAN Bin_host_update(
    PLOAD_CTXT      lp,
    PBIN_STATE      binp,           /* ptr. to bin state */
    ULONG           my_host_id,     /* my host's id MINUS one */
    BOOLEAN         converging,     /* TRUE => we are converging now */
    BOOLEAN         rem_converging, /* TRUE => remote host is converging */
    ULONG           rem_host,       /* remote host's id MINUS one */
    MAP_T           cur_map,        /* remote host's current map or 0 if host died */
    MAP_T           new_map,        /* remote host's new map if converging */
    MAP_T           idle_map,       /* remote host's idle map */
    MAP_T           rdy_bins,       /* bins that host is ready to send; ignored
                                       if converging to prevent bin transfers */
    ULONG           pkt_count,      /* remote host's packet count */
    ULONG           load_amt)       /* remote host's load percentage */
/*
  Update hosts's state for a port group

  returns BOOL:
    TRUE  => if not converging, normal return
             otherwise, all hosts have consistent state for converging
    FALSE => parameter error or inconsistent convergence state

  function:
    Updates hosts's state for a port group and attempts to converge new states if
    in convergence mode.  Called when a ping message is received or when a host
    is considered to have died.  Handles case of newly discovered hosts.  Can be
    called multiple times with the same information.

    Caller is expected to hold the load-module lock; this routine reads and
    writes shared per-bin state (cur_map/idle_map/rdy_bins/rcv_bins) without
    its own locking.
*/
{
    ULONG       i;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    /* reject an out-of-range remote host id or a message that claims to be
       from ourselves; both indicate an internal error upstream */

    if (rem_host >= CVY_MAX_HOSTS || rem_host == my_host_id)
    {
        UNIV_PRINT(("Bin_host_update: parameter error"));
        LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, rem_host+1, my_host_id+1);
        return FALSE;
    }

    UNIV_ASSERT(binp->code == CVY_BINCODE); /* (bbain 8/19/99) */

#if 0 /* v2.06 */
    /* update current load balance information */

    if (binp->equal_bal && load_amt > 0)
    {
        load_amt = CVY_EQUAL_LOAD;
    }
#endif

    /* change load percentage if load changed */

    if (load_amt != binp->load_amt[rem_host])
    {
#if 0 /* v2.06 */
        binp->tot_load += (load_amt - binp->load_amt[rem_host]);
#endif
        binp->load_amt[rem_host] = load_amt;
    }

    /* check for non-overlapping maps: no bin may be owned by two hosts at once */

    if ((binp->cmap & cur_map) != 0)        /* v2.1 */
    {
        /* if we have received fewer packets than the other host or have a higher host id,
           remove duplicates from current map; this uses a heuristic that a newly joining
           host that was subnetted probably did not receive packets; we are trying to avoid
           having two hosts answer to the same client while minimizing disruption of service
           (v1.32B) */

        if (lp->send_msg.pkt_count < pkt_count ||
            (lp->send_msg.pkt_count == pkt_count && rem_host < my_host_id))
        {
            MAP_T       dup_map;

            dup_map = binp->cmap & cur_map;             /* v2.1 */

            /* yield the contested bins to the remote host */

            binp->cur_map[my_host_id] &= ~dup_map;
            binp->cmap = binp->cur_map[my_host_id];     /* v2.1 */

            /* drop all connections in this port group since our ownership changed */

            Load_conn_kill(lp, binp);
        }

        if (!converging && !rem_converging)
        {
            /* log the overlap only once per occurrence (flag reset on convergence) */

            if (!(lp->overlap_maps))
            {
                UNIV_PRINT(("Host %d: two hosts with overlapping maps detected %d.", my_host_id, binp->index));
                LOG_MSG2(MSG_WARN_OVERLAP, MSG_NONE, my_host_id+1, binp->index);

                lp->overlap_maps = TRUE;
            }

            /* force convergence if in normal operations */

            return FALSE;
        }
    }

    /* now update remote host's current map */

    binp->cur_map[rem_host] = cur_map;

    /* update idle map and calculate new global idle map if it's changed */

    if (binp->idle_map[rem_host] != idle_map)
    {
        MAP_T       saved_map = binp->all_idle_map;
        MAP_T       new_idle_map = BIN_ALL_ONES;
        MAP_T       tmp_map;

        binp->idle_map[rem_host] = idle_map;

        /* compute new idle map for all other hosts: a bin is "all idle" only
           if every host other than us reports it idle */

        for (i=0; i<CVY_MAX_HOSTS; i++)
            if (i != my_host_id)
                new_idle_map &= binp->idle_map[i];

        binp->all_idle_map = new_idle_map;

        /* see which locally owned bins have gone idle in all other hosts */

        /* NOTE(review): %08x truncates MAP_T (appears to be 64-bit per
           BIN_ALL_ONES) in the debug prints below — diagnostic only */

        tmp_map = new_idle_map & (~saved_map) & binp->cmap;     /* v2.1 */

        if (tmp_map != 0)
        {
            UNIV_PRINT(("Host %d pg %d: detected new all idle %08x for local bins",
                        my_host_id, binp->index, tmp_map));
        }

        tmp_map = saved_map & (~new_idle_map) & binp->cmap;     /* v2.1 */

        if (tmp_map != 0)
        {
            UNIV_PRINT(("Host %d pg %d: detected new non-idle %08x for local bins",
                        my_host_id, binp->index, tmp_map));
        }
    }
    /* 1.03: eliminated else clause */

    /* if we are not converging AND other host not converging, exchange bins;
       convergence must now be complete for both hosts */

    if (!converging)
    {
        if (!rem_converging) {  /* 1.03: reorganized code to exchange bins only when both
                                   hosts are not converging to avoid using stale bins */

            MAP_T       new_bins;       /* incoming bins from the remote host */

            /* check to see if remote host has received some bins from us;
               any bin now in the remote current map no longer needs offering */

            binp->rdy_bins &= (~cur_map);

            /* check to see if we can receive some bins */

            new_bins = binp->rcv_bins & rdy_bins;

            if (new_bins != 0)
            {
                /* receiving a bin we already own indicates an internal error;
                   log once and continue (flag reset on convergence) */

                if ((binp->cmap & new_bins) != 0)       /* v2.1 */
                {
                    if (!(lp->err_rcving_bins))
                    {
                        UNIV_PRINT(("Bin_host_update: receiving bins already own"));
                        LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, binp->cur_map[my_host_id], new_bins);

                        lp->err_rcving_bins = TRUE;
                    }
                }

                /* take ownership of the transferred bins */

                binp->cur_map[my_host_id] |= new_bins;
                binp->rcv_bins &= ~new_bins;

                binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */

                UNIV_PRINT(("====== host %d pg %d: received %08x ; cur now %08x",
                            my_host_id, binp->index, new_bins, binp->cur_map[my_host_id]));
            }

            /* do consistency check that all bins are covered */

            binp->chk_map[rem_host]   = cur_map | rdy_bins;
            binp->chk_map[my_host_id] = binp->cmap | binp->rdy_bins;   /* v2.1 */

            if (!Bin_map_covering(binp->tot_load, binp->chk_map))       /* v2.06 */
            {
                /* orphaned (unowned) bins detected; record once (logging disabled) */

                if (!(lp->err_orphans))
                {
#if 0
                    UNIV_PRINT(("Host %d: orphan bins detected", my_host_id));
                    LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, my_host_id+1);
#endif
                    lp->err_orphans = TRUE;
                }
            }
        }

        return TRUE;
    }

    /* otherwise, store proposed new load map and try to converge current host data */

    else
    {
        binp->chk_map[rem_host] =
        binp->new_map[rem_host] = new_map;

        return Bin_converge(lp, binp, my_host_id);
    }

} /* end Bin_host_update */
|
|
|
|
|
|
void Bin_state_print(
|
|
PBIN_STATE binp, /* ptr. to bin state */
|
|
ULONG my_host_id)
|
|
{
|
|
#if 0
|
|
ULONG i;
|
|
#endif
|
|
|
|
UNIV_PRINT(("hst %d binp %x: maps: targ %x cur %x new %x; eq %d mode %d amt %d tot %d; bins: snd %x rcv %x rdy %x",
|
|
my_host_id, binp, binp->targ_map, binp->cur_map[my_host_id], binp->new_map[my_host_id],
|
|
binp->equal_bal, binp->mode, binp->load_amt[my_host_id],
|
|
binp->tot_load, binp->snd_bins, binp->rcv_bins, binp->rdy_bins));
|
|
|
|
#if 0
|
|
for (i=0; i<CVY_MAX_HOSTS; i++)
|
|
{
|
|
UNIV_PRINT(("host %d: cur map %x new %x load_amt %d", i+1, binp->cur_map[i],
|
|
binp->new_map[i], binp->load_amt[i]));
|
|
}
|
|
|
|
for (i=0; i<CVY_MAXBINS; i++)
|
|
{
|
|
UNIV_PRINT(("bin %d: req_host %d bin_state %d nconn %d", i, binp->req_host[i],
|
|
binp->bin_state[i], binp->nconn[i]));
|
|
}
|
|
#endif
|
|
|
|
} /* end Bin_state_print */
|
|
|
|
|
|
void Load_conn_kill(
|
|
PLOAD_CTXT lp,
|
|
PBIN_STATE bp)
|
|
/*
|
|
Kill all connections in a port group (v1.32B)
|
|
*/
|
|
{
|
|
PCONN_ENTRY ep; /* ptr. to connection entry */
|
|
PCONN_DESCR dp; /* ptr. to connection descriptor */
|
|
QUEUE * qp; /* ptr. to bin's connection queue */
|
|
QUEUE * dqp; /* ptr. to dirty queue */
|
|
QUEUE * fqp; /* ptr. to free queue */
|
|
LONG count[CVY_MAXBINS];
|
|
/* count of cleaned up connections per bin for checking */
|
|
ULONG i;
|
|
BOOLEAN err_bin; /* bin id error detected */
|
|
BOOLEAN err_count; /* connection count error detected */
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
err_bin =
|
|
err_count = FALSE;
|
|
|
|
qp = &(bp->connq);
|
|
dqp = &(lp->conn_dirtyq);
|
|
fqp = &(lp->conn_freeq);
|
|
|
|
for (i=0; i<CVY_MAXBINS; i++)
|
|
count[i] = 0;
|
|
|
|
#ifdef TRACE_DIRTY
|
|
DbgPrint ("marking connections as dirty");
|
|
#endif
|
|
|
|
/* remove connections from bin queue and either make dirty or cleanup */
|
|
|
|
ep = (PCONN_ENTRY)Queue_deq(qp);
|
|
|
|
while (ep != NULL)
|
|
{
|
|
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/19/99) */
|
|
|
|
if (ep->bin >= CVY_MAXBINS)
|
|
{
|
|
if (!err_bin)
|
|
{
|
|
UNIV_PRINT(("Load_conn_kill: bad bin id"));
|
|
LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);
|
|
|
|
err_bin = TRUE;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
count[ep->bin]++;
|
|
}
|
|
|
|
/* make connection and bin dirty if we don't have a zero timeout period so that they
|
|
will not be handled by TCP/IP anymore; this avoids allowing TCP/IP's now stale
|
|
connection state from handling packets for newer connections should traffic be
|
|
directed to this host in the future */
|
|
|
|
if (lp->cln_timeout > 0)
|
|
{
|
|
ep->dirty = TRUE;
|
|
Queue_enq(dqp, &(ep->blink));
|
|
|
|
lp->dirty_bin[ep->bin] = TRUE;
|
|
lp->cln_waiting = TRUE;
|
|
}
|
|
|
|
/* otherwise, just cleanup the connection */
|
|
|
|
else
|
|
{
|
|
CVY_CONN_CLEAR(ep); /* v2.06 */
|
|
|
|
Link_unlink(&(ep->rlink)); /* V2.1.5 */
|
|
|
|
/* if entry is not in the hash table, free the descriptor */
|
|
|
|
if (ep->alloc)
|
|
{
|
|
dp = STRUCT_PTR(ep, CONN_DESCR, entry);
|
|
UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */
|
|
|
|
Link_unlink(&(dp->link));
|
|
Queue_enq(fqp, &(dp->link));
|
|
}
|
|
}
|
|
|
|
ep = (PCONN_ENTRY)Queue_deq(qp);
|
|
}
|
|
|
|
/* now make bins idle */
|
|
|
|
for (i=0; i<CVY_MAXBINS; i++)
|
|
{
|
|
if (bp->nconn[i] != count[i])
|
|
{
|
|
if (!err_count)
|
|
{
|
|
UNIV_PRINT(("Load_conn_kill: bad connection count %d %d bin %d", bp->nconn[i], (LONG)count[i], i));
|
|
|
|
/* KXF 2.1.1 - removed after tripped up at MSFT a few times */
|
|
#if 0
|
|
LOG_MSG3(MSG_ERROR_INTERNAL, MSG_NONE, bp->nconn[i], (LONG)count[i], i);
|
|
#endif
|
|
|
|
err_count = TRUE;
|
|
}
|
|
}
|
|
|
|
bp->nconn[i] = 0;
|
|
}
|
|
|
|
lp->nconn -= bp->tconn; /* v2.1 */
|
|
if (lp->nconn < 0)
|
|
lp->nconn = 0;
|
|
bp->tconn = 0; /* v2.06 */
|
|
|
|
bp->idle_bins = BIN_ALL_ONES;
|
|
|
|
/* if we at least one connection is dirty, restart cleanup timeout period */
|
|
|
|
if (lp->cln_waiting)
|
|
{
|
|
#ifdef TRACE_DIRTY
|
|
DbgPrint ("setting cleanup timeout");
|
|
#endif
|
|
lp->cur_time = 0;
|
|
}
|
|
else
|
|
{
|
|
#ifdef TRACE_DIRTY
|
|
DbgPrint ("no dirty connections found");
|
|
#endif
|
|
}
|
|
|
|
} /* end Load_conn_kill */
|
|
|
|
|
|
void Load_conn_cleanup(
    PLOAD_CTXT      lp)
/*
  Clean up all dirty connections (v1.32B)

  Drains the dirty-connection queue populated by Load_conn_kill: each entry
  is cleared, unmarked dirty, unlinked from its recovery list, and — when it
  was dynamically allocated — its descriptor is returned to the free queue.
  Finally all per-bin dirty flags are reset.

  NOTE(review): presumably invoked when the cleanup timeout expires
  (cln_timeout set by Load_conn_kill) — the caller is not visible here.
  Caller is expected to hold the load-module lock.
*/
{
    PCONN_ENTRY ep;         /* ptr. to connection entry */
    PCONN_DESCR dp;         /* ptr. to connection descriptor */
    QUEUE *     fqp;        /* ptr. to free queue */
    QUEUE *     dqp;        /* ptr. to dirty queue */
    BOOLEAN     err_bin;    /* bin id error detected */
    ULONG       i;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    err_bin = FALSE;

    dqp = &(lp->conn_dirtyq);
    fqp = &(lp->conn_freeq);

#ifdef TRACE_DIRTY
    DbgPrint ("cleaning up dirty connections");
#endif

    /* dequeue and clean up all connections on dirty connection queue */

    ep = (PCONN_ENTRY)Queue_deq(dqp);

    while (ep != NULL)
    {
        UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/19/99) */

        /* a corrupt bin id is diagnostic only here (bin is not indexed);
           log it once */

        if (ep->bin >= CVY_MAXBINS)
        {
            if (!err_bin)
            {
                UNIV_PRINT(("Load_conn_cleanup: bad bin id"));
                LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);

                err_bin = TRUE;
            }
        }

        CVY_CONN_CLEAR(ep);

        ep->dirty = FALSE;

        Link_unlink(&(ep->rlink));      /* V2.1.5 */

        /* if entry is not in the hash table, free the descriptor */

        if (ep->alloc)
        {
            dp = STRUCT_PTR(ep, CONN_DESCR, entry);
            UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */

            Link_unlink(&(dp->link));
            Queue_enq(fqp, &(dp->link));
        }

        ep = (PCONN_ENTRY)Queue_deq(dqp);
    }

    /* clear all dirty bin flags */

    for (i=0; i<CVY_MAXBINS; i++)
        lp->dirty_bin[i] = FALSE;

} /* end Load_conn_cleanup */
|
|
|
|
|
|
void Load_stop(
|
|
PLOAD_CTXT lp)
|
|
{
|
|
ULONG i;
|
|
IRQLEVEL irql;
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
|
|
|
|
if (!(lp->active))
|
|
return;
|
|
|
|
LOCK_ENTER(&(lp->lock), &irql);
|
|
|
|
/* make connections for all rules dirty so they will not be handled */
|
|
|
|
for (i=0; i<lp->send_msg.nrules; i++)
|
|
{
|
|
PBIN_STATE bp; /* ptr. to bin state */
|
|
|
|
bp = &(lp->pg_state[i]);
|
|
UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/21/99) */
|
|
|
|
Load_conn_kill(lp, bp); /* (v1.32B) */
|
|
|
|
/* advertise that we are not handling any load in case a ping is sent out */
|
|
|
|
lp->send_msg.cur_map[i] = 0;
|
|
lp->send_msg.new_map[i] = 0;
|
|
lp->send_msg.idle_map[i] = BIN_ALL_ONES;
|
|
lp->send_msg.rdy_bins[i] = 0;
|
|
lp->send_msg.load_amt[i] = 0;
|
|
}
|
|
|
|
lp->send_msg.state = HST_CVG; /* force convergence (v2.1) */
|
|
|
|
/* go inactive until restarted */
|
|
|
|
lp->active = FALSE;
|
|
lp->nconn = 0; /* v2.1 */
|
|
|
|
LOCK_EXIT(&(lp->lock), irql);
|
|
|
|
} /* end Load_stop */
|
|
|
|
|
|
void Load_start(        /* (v1.32B) */
    PLOAD_CTXT      lp)
/*
  Activate the load module: (lazily) initialize it, reset all per-host
  bookkeeping and error-report flags, build the bin state for every
  configured port rule plus an appended default rule, run an initial local
  convergence per rule, populate the outgoing ping message from the
  resulting state, and enter convergence (HST_CVG) as a joining member.
  No-op when already active.

  NOTE(review): no lock is taken here, unlike Load_stop — presumably the
  caller serializes start/stop; confirm against the caller.
*/
{
    ULONG       i;
    BOOLEAN     ret;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    WCHAR       me[20];     /* my host id (1-based) as a string, for event logging */

    if (!(lp->initialized))
        Load_init(lp, & ctxtp -> params);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */

    if (lp->active)
        return;

    /* host_priority is 1-based in the parameters; internal ids are 0-based */

    lp->my_host_id =(* (lp->params)).host_priority - 1;

    /* start out seeing only ourselves in the cluster */

    lp->ping_map =
    lp->host_map = 1 << lp->my_host_id;

    lp->last_hmap = 0;  /* bbain RTM RC1 6/23/99 */

    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        lp->nmissed_pings[i] = 0;
    }

    lp->min_missed_pings = (* (lp->params)).alive_tolerance;
    lp->cln_timeout      = (* (lp->params)).cleanup_delay;
    lp->def_timeout      = (* (lp->params)).alive_period;
    lp->stable_map       = 0;
    lp->consistent       = TRUE;

    /* Initialize the bad teaming configuration detected flag. */

    lp->bad_team_config  = FALSE;

    /* reset the log-once error flags so each condition is reported afresh */

    lp->dup_hosts        = FALSE;
    lp->dup_sspri        = FALSE;
    lp->bad_map          = FALSE;
    lp->overlap_maps     = FALSE;
    lp->err_rcving_bins  = FALSE;
    lp->err_orphans      = FALSE;
    lp->bad_num_rules    = FALSE;
    lp->alloc_inhibited  = FALSE;
    lp->alloc_failed     = FALSE;
    lp->bad_defrule      = FALSE;

    lp->scale_client     = (BOOLEAN)(* (lp->params)).scale_client;
    lp->my_stable_ct     = 0;
    lp->all_stable_ct    = 0;
    lp->min_stable_ct    = lp->min_missed_pings;

    lp->dscr_per_alloc   = (* (lp->params)).dscr_per_alloc;
    lp->max_dscr_allocs  = (* (lp->params)).max_dscr_allocs;

    lp->pkt_count        = 0;   /* 1.32B */

    /* initialize port group bin states; add a default rule at the end */

    if ((* (lp->params)).num_rules >= (CVY_MAX_RULES - 1))
    {
        UNIV_PRINT(("Load_start: too many rules; using max possible."));
        lp->send_msg.nrules = (USHORT)CVY_MAX_RULES;
    }
    else
        lp->send_msg.nrules = (USHORT)((* (lp->params)).num_rules) + 1;

    for (i=0; i<lp->send_msg.nrules; i++)
    {
        PBIN_STATE  bp;     /* ptr. to bin state */
        PCVY_RULE   rp;     /* ptr. to rules array */

        bp = &(lp->pg_state[i]);
        rp = &((* (lp->params)).port_rules[i]);

        if (i == (((ULONG)lp->send_msg.nrules) - 1))

            /* initialize bin state for default rule to single server with
               host priority */

            Bin_state_init(lp, bp, i, lp->my_host_id, CVY_SINGLE, CVY_TCP_UDP,
                           FALSE, (USHORT)0, (* (lp->params)).host_priority);

        else if (rp->mode == CVY_SINGLE)
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           FALSE, (USHORT)0, rp->mode_data.single.priority);
        else if (rp->mode == CVY_MULTI)
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           (BOOLEAN)(rp->mode_data.multi.equal_load),
                           rp->mode_data.multi.affinity,
                           (rp->mode_data.multi.equal_load ?
                            CVY_EQUAL_LOAD : rp->mode_data.multi.load));

        /* handle CVY_NEVER mode as multi-server. the check for
           those modes is done before attempting to hash to the bin in
           Load_packet_check and Load_conn_advise so bin distribution plays
           no role in the behavior, but simply allows the rule to be valid
           across all of the operational servers */

        else
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           TRUE, (USHORT)0, CVY_EQUAL_LOAD);

        /* run initial convergence with ourselves as the only member */

        ret = Bin_converge(lp, bp, lp->my_host_id);
        if (!ret)
        {
            UNIV_PRINT(("Load_start: initial convergence inconsistent"));
            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
        }

        /* export current port group state to send msg; the default rule
           (last slot) advertises rule code 0 */

        if (i == (((ULONG)(lp->send_msg.nrules)) - 1))
            lp->send_msg.rcode[i]= 0;
        else
            lp->send_msg.rcode[i]= rp->code;

        lp->send_msg.cur_map[i]  = bp->cmap;    /* v2.1 */
        lp->send_msg.new_map[i]  = bp->new_map[lp->my_host_id];
        lp->send_msg.idle_map[i] = bp->idle_bins;
        lp->send_msg.rdy_bins[i] = bp->rdy_bins;
        lp->send_msg.load_amt[i] = bp->load_amt[lp->my_host_id];
        /* ###### for keynote - ramkrish */
        lp->send_msg.pg_rsvd1[i] = (ULONG)bp->all_idle_map;
    }

    /* initialize send msg */

    lp->send_msg.host_id   = (USHORT)(lp->my_host_id);
    lp->send_msg.master_id = (USHORT)(lp->my_host_id);
    lp->send_msg.hcode     = lp->params->install_date;
    lp->send_msg.pkt_count = lp->pkt_count;     /* 1.32B */

    Univ_ulong_to_str (lp->my_host_id+1, me, 10);

    /* Tracking convergence - Starting convergence because this host is joining the cluster. */
    LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, me);
    TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is joining the cluster.", lp->my_host_id+1, lp->my_host_id+1);

    /* Tracking convergence - Starting convergence. */
    lp->send_msg.state = HST_CVG;

    /* activate module */

    lp->active = TRUE;

} /* end Load_start */
|
|
|
|
|
|
void Load_init(
    PLOAD_CTXT      lp,
    PCVY_PARAMS     params)
/*
  One-time initialization of a load context: set up the lock, the hashed
  connection-entry table and its per-hash queues, the free/dirty/recovery
  descriptor queues with the statically allocated descriptors, the dirty-bin
  flags and timeouts, and record the parameter block pointer.  Idempotent:
  a second call only re-inits the lock and resets ref_count.  Does NOT
  activate the module (that is Load_start).
*/
{
    ULONG       i;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    TRACE_INFO("-> Load_init: lp=0x%p, param=0x%p", lp, params);

    LOCK_INIT(&(lp->lock));

    if (!(lp->initialized))
    {
        lp->code = CVY_LOADCODE;        /* (bbain 8/19/99) */

        /* initialize hashed connection descriptors and queues */

        for (i=0; i<CVY_MAX_CHASH; i++)
        {
            PCONN_ENTRY ep;

            ep = &(lp->hashed_conn[i]);

            ep->code = CVY_ENTRCODE;    /* (bbain 8/19/99) */
            ep->alloc = FALSE;          /* embedded in the hash table, not heap-allocated */
            ep->dirty = FALSE;          /* v1.32B */

            CVY_CONN_CLEAR(ep);
            Link_init(&(ep->blink));
            Link_init(&(ep->rlink));    /* V2.1.5 */

            Queue_init(&(lp->connq[i]));
        }

        /* initialize connection free and dirty queues; free descriptors */

        Queue_init(&(lp->conn_freeq));
        Queue_init(&(lp->conn_dirtyq));     /* v1.32B */
        Queue_init(&(lp->conn_rcvryq));     /* V2.1.5 */

        /* seed the free queue with the statically embedded descriptors */

        for (i=0; i<CVY_INIT_QCONN; i++)
        {
            lp->conn_descr[i].code = CVY_DESCCODE;          /* (bbain 8/19/99) */
            Link_init(&(lp->conn_descr[i].link));
            lp->conn_descr[i].entry.code = CVY_ENTRCODE;    /* (bbain 8/21/99) */
            lp->conn_descr[i].entry.alloc = TRUE;           /* lives in a descriptor, recyclable */
            lp->conn_descr[i].entry.dirty = FALSE;          /* v1.32B */

            CVY_CONN_CLEAR(&(lp->conn_descr[i].entry));
            Link_init(&(lp->conn_descr[i].entry.blink));
            Link_init(&(lp->conn_descr[i].entry.rlink));    /* V2.1.5 */

            Queue_enq(&(lp->conn_freeq), &(lp->conn_descr[i].link));
        }

        /* (v1.32B) */

        for (i=0; i<CVY_MAXBINS; i++)
            lp->dirty_bin[i] = FALSE;

        lp->cln_waiting = FALSE;
        lp->def_timeout =
        lp->cur_timeout = params -> alive_period;
        lp->nqalloc     = 0;
        lp->nconn       = 0;        /* v2.1 */
        lp->active      = FALSE;
        lp->initialized = TRUE;

        /* clear list of descriptor queue allocations (bbain 2/25/99) */

        for (i=0; i<CVY_MAX_MAX_DSCR_ALLOCS; i++)
            lp->qalloc_list[i] = (PCONN_DESCR)NULL;

        lp -> params = params;
    }
    else
    {
        UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */
    }

    /* Initialize the reference count on this load module. */
    lp->ref_count = 0;

    /* don't start module (v1.32B) */
    TRACE_INFO("<- Load_init");

} /* end Load_init */
|
|
|
|
|
|
void Load_cleanup( /* (bbain 2/25/99) */
|
|
PLOAD_CTXT lp)
|
|
{
|
|
ULONG i;
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
|
|
|
|
/* free all descriptor queue allocations */
|
|
|
|
if (lp->nqalloc > CVY_MAX_MAX_DSCR_ALLOCS)
|
|
lp->nqalloc = CVY_MAX_MAX_DSCR_ALLOCS;
|
|
|
|
for (i=0; i<lp->nqalloc; i++)
|
|
if (lp->qalloc_list[i] != (PCONN_DESCR)NULL)
|
|
free((PVOID)(lp->qalloc_list[i]));
|
|
|
|
} /* end Load_cleanup */
|
|
|
|
|
|
void Load_convergence_start(
|
|
PLOAD_CTXT lp)
|
|
{
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
lp->consistent = TRUE; /* 1.03 */
|
|
|
|
/* setup initial convergence state */
|
|
|
|
lp->send_msg.state = HST_CVG;
|
|
|
|
lp->stable_map = 0;
|
|
lp->my_stable_ct = 0;
|
|
lp->all_stable_ct = 0;
|
|
|
|
lp->send_msg.master_id = (USHORT)(lp->my_host_id);
|
|
|
|
} /* end Load_convergence_start */
|
|
|
|
|
|
void Load_msg_rcv(
|
|
PLOAD_CTXT lp,
|
|
PPING_MSG pmsg) /* ptr. to ping message */
|
|
{
|
|
ULONG i;
|
|
BOOLEAN consistent;
|
|
ULONG my_host;
|
|
ULONG rem_host;
|
|
ULONG saved_map; /* saved host map */
|
|
PPING_MSG sendp; /* ptr. to my send message */
|
|
IRQLEVEL irql;
|
|
WCHAR me[20];
|
|
WCHAR them[20];
|
|
ULONG map;
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
/* Used for tracking convergence and event logging. */
|
|
BOOLEAN bInconsistentMaster = FALSE;
|
|
BOOLEAN bInconsistentTeaming = FALSE;
|
|
BOOLEAN bInconsistentPortRules = FALSE;
|
|
|
|
UNIV_ASSERT(lp->code == CVY_LOADCODE);
|
|
|
|
TRACE_HB("Recv HB from host %d", (ULONG) pmsg->host_id + 1);
|
|
|
|
if (!(lp->active))
|
|
return;
|
|
|
|
my_host = lp->my_host_id;
|
|
rem_host = (ULONG) pmsg->host_id;
|
|
|
|
Univ_ulong_to_str (my_host+1, me, 10);
|
|
Univ_ulong_to_str (rem_host+1, them, 10);
|
|
|
|
sendp = &(lp->send_msg);
|
|
|
|
if (rem_host >= CVY_MAX_HOSTS)
|
|
return;
|
|
|
|
LOCK_ENTER(&(lp->lock), &irql);
|
|
|
|
/* filter out packets broadcast by this host */
|
|
|
|
if(rem_host == my_host)
|
|
{
|
|
/* if this packet was really from another host, we have duplicate host ids */
|
|
|
|
if (sendp->hcode != pmsg->hcode)
|
|
{
|
|
if (!(lp->dup_hosts))
|
|
{
|
|
UNIV_PRINT(("Duplicate host ids detected."));
|
|
|
|
LOG_MSG(MSG_ERROR_HOST_ID, me);
|
|
|
|
lp->dup_hosts = TRUE;
|
|
}
|
|
|
|
/* Tracking convergence - Starting convergence because duplicate host IDs were detected in the cluster. */
|
|
if (sendp->state == HST_NORMAL) {
|
|
LOG_MSGS(MSG_INFO_CONVERGING_DUPLICATE_HOST_ID, me, them);
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is configured with the same host ID.", my_host+1, rem_host+1);
|
|
}
|
|
|
|
/* Tracking convergence - Starting convergence. */
|
|
Load_convergence_start(lp);
|
|
}
|
|
|
|
/* just update ping and host maps for us */
|
|
lp->ping_map |= (1 << my_host);
|
|
lp->host_map |= (1 << my_host);
|
|
|
|
LOCK_EXIT(&(lp->lock), irql);
|
|
return;
|
|
}
|
|
|
|
if (sendp->nrules != pmsg->nrules)
|
|
{
|
|
if (!(lp->bad_num_rules))
|
|
{
|
|
UNIV_PRINT(("Host %d: Hosts have diff # rules.", my_host));
|
|
|
|
LOG_MSG2(MSG_ERROR_RULES_MISMATCH, them, sendp->nrules, pmsg->nrules);
|
|
|
|
lp->bad_num_rules = TRUE;
|
|
}
|
|
|
|
/* Tracking convergence - Starting convergence because the number of port rules on this host and the remote host do not match. */
|
|
if (sendp->state == HST_NORMAL) {
|
|
LOG_MSGS(MSG_INFO_CONVERGING_NUM_RULES, me, them);
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is configured with a conflicting number of port rules.", my_host+1, rem_host+1);
|
|
}
|
|
|
|
/* Tracking convergence - Starting convergence. */
|
|
Load_convergence_start(lp);
|
|
|
|
/* just update ping and host maps for remote host (bbain 2/17/99) */
|
|
|
|
lp->ping_map |= (1 << rem_host);
|
|
lp->host_map |= (1 << rem_host);
|
|
|
|
LOCK_EXIT(&(lp->lock), irql);
|
|
return;
|
|
}
|
|
|
|
/* update mastership and see if consistent */
|
|
|
|
if (rem_host < sendp->master_id)
|
|
sendp->master_id = (USHORT)rem_host;
|
|
|
|
consistent = sendp->master_id == pmsg->master_id; /* 1.03 */
|
|
|
|
/* For the purposes of logging the reason for convergence, note this inconsistency. */
|
|
if (!consistent) bInconsistentMaster = TRUE;
|
|
|
|
/* update ping and host maps to include remote host */
|
|
|
|
lp->ping_map |= (1 << rem_host);
|
|
|
|
saved_map = lp->host_map;
|
|
lp->host_map |= (1 << rem_host);
|
|
|
|
/* handle host convergence */
|
|
|
|
if (sendp->state != HST_NORMAL)
|
|
{
|
|
/* if master, update stable map for remote host */
|
|
|
|
if (sendp->master_id == my_host)
|
|
{
|
|
if (pmsg->state == HST_STABLE)
|
|
{
|
|
lp->stable_map |= (1 << rem_host);
|
|
}
|
|
else
|
|
{
|
|
lp->stable_map &= ~(1 << rem_host);
|
|
lp->all_stable_ct = 0;
|
|
}
|
|
}
|
|
|
|
/* otherwise, update state if have global stable convergence and the current
|
|
master has signalled completion by returning to the normal state; note
|
|
that we must do this prior to updating port group states */
|
|
|
|
else if (rem_host == sendp->master_id && pmsg->state == HST_NORMAL)
|
|
{
|
|
if (sendp->state == HST_STABLE)
|
|
{
|
|
sendp->state = HST_NORMAL;
|
|
|
|
/* Notify our BDA team that this cluster is consistently configured.
|
|
If we are not part of a BDA team, this call is essentially a no-op. */
|
|
Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);
|
|
|
|
/* Reset the bad teaming configuration detected flag if we are converged. */
|
|
lp->bad_team_config = FALSE;
|
|
|
|
lp->dup_hosts = FALSE;
|
|
lp->dup_sspri = FALSE;
|
|
lp->bad_map = FALSE;
|
|
lp->overlap_maps = FALSE;
|
|
lp->err_rcving_bins = FALSE;
|
|
lp->err_orphans = FALSE;
|
|
lp->bad_num_rules = FALSE;
|
|
lp->pkt_count = 0; /* v1.32B */
|
|
|
|
for (i=0; i<sendp->nrules; i++)
|
|
{
|
|
PBIN_STATE bp;
|
|
|
|
bp = &(lp->pg_state[i]);
|
|
|
|
bp->compatible = TRUE; /* 1.03 */
|
|
|
|
Bin_converge_commit(lp, bp, my_host);
|
|
|
|
UNIV_PRINT(("Host %d pg %d: new cur map %x idle %x all %x",
|
|
my_host, i, bp->cur_map[my_host], bp->idle_bins,
|
|
bp->all_idle_map));
|
|
|
|
#if 0 /* 1.03: only update ping message in Load_timeout to avoid locking send */
|
|
|
|
/* export current port group state */
|
|
|
|
sendp->cur_map[i] = bp->cmap; /* v2.1 */
|
|
sendp->new_map[i] = bp->new_map[my_host];
|
|
sendp->idle_map[i] = bp->idle_bins;
|
|
sendp->rdy_bins[i] = bp->rdy_bins;
|
|
sendp->load_amt[i] = bp->load_amt[my_host];
|
|
#endif
|
|
}
|
|
|
|
#if 0
|
|
sendp->pkt_count = lp->pkt_count; /* 1.32B */
|
|
#endif
|
|
|
|
UNIV_PRINT(("Host %d: converged as slave", my_host));
|
|
/* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
|
|
Load_hosts_query (lp, TRUE, & map);
|
|
lp->last_hmap = lp->host_map;
|
|
}
|
|
else
|
|
{
|
|
/* Tracking convergence - Starting convergence because the DEFAULT host prematurely ended convergence. In this case, we
|
|
are guaranteed to already be in the HST_CVG state, and because this message can be misleading in some circumstances,
|
|
we do not log an event. For instance, due to timing issues, when a host joins a cluster he can receive a HST_NORMAL
|
|
heartbeat from the DEFAULT host while it is still in the HST_CVG state simply because that heartbeat left the DEFAULT
|
|
host before it received our first heartbeat, which initiated convergence. */
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d, the DEFAULT host, prematurely terminated convergence.", my_host+1, rem_host+1);
|
|
|
|
/* Tracking convergence - Starting convergence. */
|
|
Load_convergence_start(lp);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Compare the teaming configuration of this host with the remote host. If the
|
|
two are inconsitent and we are part of a team, we will initiate convergence. */
|
|
if (!Load_teaming_consistency_check(lp->bad_team_config, &ctxtp->bda_teaming, sendp->teaming, pmsg->teaming)) {
|
|
/* Only log an event if the teaming configuration was, but is now not, consistent. */
|
|
if (!lp->bad_team_config) {
|
|
/* Note that we saw this. */
|
|
lp->bad_team_config = TRUE;
|
|
|
|
/* Log the event. */
|
|
LOG_MSG(MSG_ERROR_BDA_BAD_TEAM_CONFIG, them);
|
|
}
|
|
|
|
/* Notify the team that this cluster is NOT consistently configured. */
|
|
Load_teaming_consistency_notify(&ctxtp->bda_teaming, FALSE);
|
|
|
|
/* Mark the heartbeats inconsistent to force and retain convergence. */
|
|
consistent = FALSE;
|
|
|
|
/* For the purposes of logging the reason for convergence, note this inconsistency. */
|
|
bInconsistentTeaming = TRUE;
|
|
}
|
|
|
|
/* update port group state */
|
|
|
|
for (i=0; i<sendp->nrules; i++)
|
|
{
|
|
BOOLEAN ret;
|
|
PBIN_STATE bp;
|
|
|
|
bp = &lp->pg_state[i];
|
|
|
|
/* if rule codes don't match, print message and handle incompatibility (1.03: note
|
|
that we previously marked rule invalid, which would stop processing) */
|
|
|
|
if (sendp->rcode[i] != pmsg->rcode[i])
|
|
{
|
|
/* 1.03: if rule was peviously compatible, print message */
|
|
|
|
if (bp->compatible)
|
|
{
|
|
PCVY_RULE rp;
|
|
|
|
UNIV_PRINT(("Host %d pg %d: rule codes do not match.", lp->my_host_id, i));
|
|
|
|
/* bbain 8/27/99 */
|
|
LOG_MSG4(MSG_ERROR_RULES_MISMATCH, them, rem_host, i, sendp->rcode[i], pmsg->rcode[i]);
|
|
|
|
/* Get the port rule information for this rule. */
|
|
rp = &lp->params->port_rules[i];
|
|
|
|
/* Check to see if this is an issue with a win2k host in a cluster utilizing virtual clusters. */
|
|
if ((rp->virtual_ip_addr != CVY_ALL_VIP_NUMERIC_VALUE) && ((sendp->rcode[i] ^ ~rp->virtual_ip_addr) == pmsg->rcode[i])) {
|
|
UNIV_PRINT((" ** A Windows 2000 or NT4 host MAY be participating in a cluster utilizing virtual cluster support."));
|
|
LOG_MSG(MSG_WARN_VIRTUAL_CLUSTERS, MSG_NONE);
|
|
}
|
|
|
|
bp->compatible = FALSE;
|
|
}
|
|
|
|
/* 1.03: mark rule inconsistent to force and continue convergence */
|
|
|
|
consistent = FALSE;
|
|
|
|
/* For the purposes of logging the reason for convergence, note this inconsistency. */
|
|
bInconsistentPortRules = TRUE;
|
|
|
|
/* don't update bin state */
|
|
|
|
continue;
|
|
}
|
|
|
|
ret = Bin_host_update(lp, bp, my_host, (BOOLEAN)(sendp->state != HST_NORMAL),
|
|
(BOOLEAN)(pmsg->state != HST_NORMAL),
|
|
rem_host, pmsg->cur_map[i], pmsg->new_map[i],
|
|
pmsg->idle_map[i], pmsg->rdy_bins[i],
|
|
pmsg->pkt_count, pmsg->load_amt[i]);
|
|
|
|
#if 0 /* 1.03: only update ping message in Load_timeout to avoid locking send */
|
|
|
|
/* export current port group state */
|
|
|
|
sendp->cur_map[i] = bp->cmap; /* v2.1 */
|
|
sendp->new_map[i] = bp->new_map[my_host];
|
|
sendp->idle_map[i] = bp->idle_bins;
|
|
sendp->rdy_bins[i] = bp->rdy_bins;
|
|
sendp->load_amt[i] = bp->load_amt[my_host];
|
|
#endif
|
|
|
|
if (!ret)
|
|
consistent = FALSE;
|
|
}
|
|
|
|
/* update our consistency state */
|
|
|
|
lp->consistent = consistent;
|
|
|
|
/* if we are in normal operation and we discover a new host or a host goes into
|
|
convergence or we discover an inconsistency, go into convergence */
|
|
|
|
if (sendp->state == HST_NORMAL)
|
|
{
|
|
if (lp->host_map != saved_map || pmsg->state == HST_CVG || !consistent)
|
|
{
|
|
/* If a host has joined the cluster, or if inconsistent teaming configuration or port
|
|
rules were detected, then we need to log an event. However, we segregate the
|
|
inconsistent master host flag because it is set by the initiating host in MANY
|
|
occasions, so we want to log the most specific reason(s) for convergence if
|
|
possible and only report the inconsistent master detection only if nothing more
|
|
specific can be deduced. */
|
|
if (lp->host_map != saved_map || bInconsistentTeaming || bInconsistentPortRules) {
|
|
|
|
/* If the host maps are different, then we know that the host from which we received
|
|
this packet is joining the cluster because the ONLY operation on the host map in
|
|
this function is to ADD a remote host to our map. Otherwise, if the map has not
|
|
changed, then an inconsistent configuration got us into the branch. */
|
|
if (lp->host_map != saved_map) {
|
|
/* Tracking convergence - Starting convergence because another host is joining the cluster. */
|
|
LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, them);
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is joining the cluster.", my_host+1, rem_host+1);
|
|
} else if (bInconsistentTeaming || bInconsistentPortRules) {
|
|
/* Tracking convergence - Starting convergence because inconsistent configuration was detected. */
|
|
LOG_MSGS(MSG_INFO_CONVERGING_BAD_CONFIG, me, them);
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);
|
|
}
|
|
|
|
/* If we have nothing better to report, report convergence for an unspecific reason. */
|
|
} else if (bInconsistentMaster || pmsg->state == HST_CVG) {
|
|
/* Tracking convergence - Starting convergence for unknown reasons. */
|
|
LOG_MSGS(MSG_INFO_CONVERGING_UNKNOWN, me, them);
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is converging for an unknown reason.", my_host+1, rem_host+1);
|
|
}
|
|
|
|
/* Tracking convergence - Starting convergence. */
|
|
Load_convergence_start(lp);
|
|
}
|
|
}
|
|
|
|
/* otherwise, if we are in convergence and we see an inconsistency, just restart
|
|
our local convergence */
|
|
|
|
else
|
|
{
|
|
/* update our consistency state; if we didn't see consistent information,
|
|
restart this host's convergence */
|
|
|
|
if (!consistent)
|
|
{
|
|
/* Tracking convergence - Starting convergence because inconsistent configuration was detected.
|
|
This keeps hosts in a state of convergence when hosts are inconsistently configured. However,
|
|
since the cluster is already in a state of convergece (HST_CVG or HST_STABLE), don't log an
|
|
event, which may confuse a user. */
|
|
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);
|
|
|
|
/* Tracking convergence - Starting convergence. */
|
|
sendp->state = HST_CVG;
|
|
lp->my_stable_ct = 0;
|
|
lp->stable_map &= ~(1 << my_host);
|
|
lp->all_stable_ct = 0;
|
|
}
|
|
}
|
|
|
|
LOCK_EXIT(&(lp->lock), irql);
|
|
|
|
} /* end Load_msg_rcv */
|
|
|
|
|
|
PPING_MSG Load_snd_msg_get(
|
|
PLOAD_CTXT lp)
|
|
{
|
|
return &(lp->send_msg);
|
|
|
|
} /* end Load_snd_msg_get */
|
|
|
|
|
|
BOOLEAN Load_timeout(
    PLOAD_CTXT      lp,
    PULONG          new_timeout,
    PBOOLEAN        pconverging,
    PULONG          pnconn)
/*
  Periodic heartbeat-timer callback for the load module.  On each tick it:
    - runs the deferred dirty-connection cleanup when its timeout expires;
    - ages per-host missed-ping counters and declares hosts dead (including
      ourselves) when too many heartbeats are missed, forcing convergence;
    - advances the convergence state machine (HST_CVG -> HST_STABLE ->
      HST_NORMAL) and, as master, commits the converged bin maps;
    - refreshes the outgoing ping message from current port-group state.

  Returns TRUE iff at least one host (possibly only ourselves) is still in
  the cluster membership map.  Outputs (each optional, may be NULL):
    new_timeout  - next timer period; halved while converging.
    pconverging  - TRUE while not in HST_NORMAL.
    pnconn       - current count of tracked connections.

  Note: we only update ping message in this function since we know that upper level code
  sends out ping messages after calling this routine.  We cannot be sure that Load_msg_rcv
  is sequentialized with sending a message, (1.03)

  Upper level code locks this routine wrt Load_msg_rcv, Load_packet_check, and
  Load_conn_advise. (1.03)
*/
{
    ULONG       missed_pings;   /* bitmask: hosts whose ping was not seen this period */
    ULONG       my_host;        /* our zero-based host id */
    ULONG       i;
    PPING_MSG   sendp;          /* ptr. to my send message */
    IRQLEVEL    irql;
    ULONG       map;            /* returned host map from query */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */

    LOCK_ENTER(&(lp->lock), &irql);

    /* check for cleanup timeout (v1.32B): dirty connections are flushed only
       after cln_timeout has elapsed since they were marked */

    if (lp->cln_waiting)
    {
        lp->cur_time += lp->cur_timeout;

        if (lp->cur_time >= lp->cln_timeout)
        {
            Load_conn_cleanup(lp);

            lp->cln_waiting = FALSE;
        }
    }

    /* return if not active: report defaults and claim no cluster members */

    if (!(lp->active))
    {
        if (new_timeout != NULL)
            * new_timeout = lp->cur_timeout = lp->def_timeout;
        if (pnconn != NULL)     /* v2.1 */
            * pnconn = lp->nconn;
        if (pconverging != NULL)
            * pconverging = FALSE;

        LOCK_EXIT(&(lp->lock), irql);
        return FALSE;
    }

    my_host = lp->my_host_id;
    sendp   = &(lp->send_msg);

    /* compute which hosts missed pings and reset ping map */

    missed_pings = lp->host_map & (~lp->ping_map);

#ifdef NO_CLEANUP
    lp->ping_map = 1 << my_host;
#else
    lp->ping_map = 0;
#endif

    /* check whether any host is dead, including ourselves */

    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        /* if we have a missed ping for this host, increment count */

        if ((missed_pings & 0x1) == 1)
        {
            lp->nmissed_pings[i]++;

            /* if we missed too many pings, declare host dead and force convergence.
               Note: == (not >=) so the expulsion work runs exactly once per episode */

            if (lp->nmissed_pings[i] == lp->min_missed_pings)
            {
                ULONG       j;
                BOOLEAN     ret;
                WCHAR       me [20];
                WCHAR       them [20];

                if (i == my_host)
                {
                    UNIV_PRINT(("Host %d: missed too many pings; this host declared offline", i));

                    /* reset our packet count since we are likely not to be receiving
                       packets from others now; this will make us less favored to
                       handle duplicate bins later (v1.32B) */

                    lp->pkt_count = 0;
                }

                lp->host_map &= ~(1<<i);

                for (j=0; j<sendp->nrules; j++)
                {
                    PBIN_STATE      bp;

                    bp = &(lp->pg_state[j]);
                    UNIV_ASSERT(bp->code == CVY_BINCODE);   /* (bbain 8/19/99) */

                    if (i == my_host)
                    {
                        ULONG       k;

                        /* cleanup connections and restore maps to clean state:
                           we have effectively left the cluster, so reset this
                           port group as if we were a lone, freshly-started host */

                        Load_conn_kill(lp, bp);

                        bp->targ_map     = 0;
                        bp->all_idle_map = BIN_ALL_ONES;
                        bp->cmap         = 0;       /* v2.1 */
                        bp->compatible   = TRUE;    /* v1.03 */

                        for (k=0; k<CVY_MAX_HOSTS; k++)
                        {
                            bp->new_map[k]  = 0;
                            bp->cur_map[k]  = 0;
                            bp->chk_map[k]  = 0;
                            bp->idle_map[k] = BIN_ALL_ONES;

                            /* keep our own advertised load weight */
                            if (k != i)
                                bp->load_amt[k] = 0;
                        }

                        bp->snd_bins =
                        bp->rcv_bins =
                        bp->rdy_bins = 0;
                        bp->idle_bins = BIN_ALL_ONES;

                        /* compute initial new map for convergence as only host in cluster
                           (v 1.3.2B) */

                        ret = Bin_converge(lp, bp, lp->my_host_id);
                        if (!ret)
                        {
                            UNIV_PRINT(("Load_timeout: initial convergence inconsistent"));
                            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                        }
                    }
                    else
                    {
                        /* a remote host died: fold an "empty, idle" view of it
                           into this port group's state */

                        ret = Bin_host_update(lp, bp, my_host, TRUE, TRUE,
                                              i, 0, 0, BIN_ALL_ONES, 0, 0, 0);
                    }
                }

                lp->nmissed_pings[i] = 0;

                /* If a host has dropped out of the cluster, then log an event.  However, we don't
                   log an event when we drop out because the only way for us to drop out of our own
                   cluster is if we are stopping anyway, or if we have lost network connectivity.
                   Logging such events may be misleading, so we won't bother. */
                if (i != my_host) {
                    Univ_ulong_to_str (my_host+1, me, 10);
                    Univ_ulong_to_str (i+1, them, 10);

                    /* Tracking convergence - Starting convergence because a member has fallen out of the cluster. */
                    LOG_MSGS(MSG_INFO_CONVERGING_MEMBER_LOST, me, them);
                    TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is leaving the cluster.", my_host+1, i+1);
                }

                /* Tracking convergence - Starting convergence. */
                Load_convergence_start(lp);
            }
        }

        /* otherwise reset missed ping count */

        else
            lp->nmissed_pings[i] = 0;

        missed_pings >>= 1;
    }

    /* handle convergence */

    if (sendp->state != HST_NORMAL)
    {
        /* check whether we have been consistent and have received our own pings
           for a sufficient period to move to a stable state and announce it to
           other hosts */

        if (sendp->state == HST_CVG)
        {
            if (lp->consistent && ((lp->host_map & (1 << my_host)) != 0))
            {
                lp->my_stable_ct++;
                if (lp->my_stable_ct >= lp->min_stable_ct)
                {
                    sendp->state = HST_STABLE;
                    lp->stable_map |= (1 << my_host);
                }
            }
            else
                lp->my_stable_ct = lp->all_stable_ct = 0;   /* wlb B3RC1 */
        }

        /* otherwise, see if we are the master and everybody's been stable for
           a sufficient period for us to terminate convergence */

        else if (sendp->state == HST_STABLE &&
                 my_host == sendp->master_id &&
                 lp->stable_map == lp->host_map)
        {
            lp->all_stable_ct++;
            if (lp->all_stable_ct >= lp->min_stable_ct)
            {
                sendp->state = HST_NORMAL;

                /* Notify our BDA team that this cluster is consistently configured.
                   If we are not part of BDA team, this call is essentially a no-op. */
                Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);

                /* Reset the bad teaming configuration detected flag if we are converged. */
                lp->bad_team_config = FALSE;

                /* clear all per-episode error/diagnostic latches */
                lp->dup_hosts       = FALSE;
                lp->dup_sspri       = FALSE;
                lp->bad_map         = FALSE;
                lp->overlap_maps    = FALSE;
                lp->err_rcving_bins = FALSE;
                lp->err_orphans     = FALSE;
                lp->bad_num_rules   = FALSE;
                lp->pkt_count       = 0;    /* v1.32B */

                for (i=0; i<sendp->nrules; i++)
                {
                    PBIN_STATE      bp;
                    BOOLEAN         ret;

                    bp = &(lp->pg_state[i]);

                    bp->compatible = TRUE;  /* 1.03 */

                    /* explicitly converge to new map in case we're the only host (v2.06) */

                    ret = Bin_converge(lp, bp, lp->my_host_id);
                    if (!ret)
                    {
                        UNIV_PRINT(("Load_timeout: final convergence inconsistent"));
                        LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                    }

                    Bin_converge_commit(lp, bp, my_host);

                    UNIV_PRINT(("Host %d pg %d: new cur map %x idle %x all %x",
                                my_host, i, bp->cur_map[my_host], bp->idle_bins,
                                bp->all_idle_map));
                }

                UNIV_PRINT(("+++ Host %d: converged as master +++", my_host));
                /* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
                Load_hosts_query (lp, TRUE, & map);
                lp->last_hmap = lp->host_map;
            }
        }
    }

    /* 1.03: update ping message */

    for (i=0; i<sendp->nrules; i++)
    {
        PBIN_STATE      bp;

        bp = &(lp->pg_state[i]);

        /* export current port group state to ping message */

        sendp->cur_map[i]  = bp->cmap;              /* v2.1 */
        sendp->new_map[i]  = bp->new_map[my_host];
        sendp->idle_map[i] = bp->idle_bins;
        sendp->rdy_bins[i] = bp->rdy_bins;
        sendp->load_amt[i] = bp->load_amt[my_host];
        /* ###### for keynote - ramkrish */
        sendp->pg_rsvd1[i] = (ULONG)bp->all_idle_map;
    }

    sendp->pkt_count = lp->pkt_count;   /* 1.32B */

    /* Add configuration information for teaming at each timeout. */
    Load_teaming_code_create(&lp->send_msg.teaming, &ctxtp->bda_teaming);

    /* request fast timeout if converging */

    if (new_timeout != NULL)    /* 1.03 */
    {
        if (sendp->state != HST_NORMAL)
            * new_timeout = lp->cur_timeout = lp->def_timeout / 2;
        else
            * new_timeout = lp->cur_timeout = lp->def_timeout;
    }

    if (pnconn != NULL)         /* v2.1 */
        * pnconn = lp->nconn;
    if (pconverging != NULL)
        * pconverging = (sendp->state != HST_NORMAL);

    LOCK_EXIT(&(lp->lock), irql);

    return ((lp->host_map) != 0);

} /* end Load_timeout */
|
|
|
|
|
|
PBIN_STATE Load_pg_lookup(
|
|
PLOAD_CTXT lp,
|
|
ULONG svr_ipaddr,
|
|
ULONG svr_port,
|
|
BOOLEAN is_tcp)
|
|
{
|
|
PCVY_RULE rp; /* ptr. to rules array */
|
|
PBIN_STATE bp; /* ptr. to bin state */
|
|
ULONG i;
|
|
ULONG nurules; /* # user defined rules */
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
|
|
|
|
rp = (* (lp->params)).port_rules;
|
|
nurules = (* (lp->params)).num_rules;
|
|
|
|
/* check for invalid port value (bbain RC1 6/14/99) */
|
|
|
|
UNIV_ASSERT(svr_port <= CVY_MAX_PORT);
|
|
|
|
/* find server port rule */
|
|
|
|
for (i=0; i<nurules; i++)
|
|
{
|
|
/* For virtual clusters: If the server IP address matches the VIP for the port rule,
|
|
or if the VIP for the port rule is "ALL VIPs", and if the port lies in the range
|
|
for this rule, and if the protocol matches, this is the rule. Notice that this
|
|
give priority to rules for specific VIPs over those for "ALL VIPs", which means
|
|
that this code RELIES on the port rules being sorted by VIP/port where the "ALL
|
|
VIP" ports rules are at the end of the port rule list. */
|
|
if ((svr_ipaddr == rp->virtual_ip_addr || CVY_ALL_VIP_NUMERIC_VALUE == rp->virtual_ip_addr) &&
|
|
(svr_port >= rp->start_port && svr_port <= rp->end_port) &&
|
|
((is_tcp && rp->protocol != CVY_UDP) || (!is_tcp && rp->protocol != CVY_TCP)))
|
|
break;
|
|
else
|
|
rp++;
|
|
}
|
|
|
|
/* use default rule if port not found or rule is invalid */
|
|
|
|
bp = &(lp->pg_state[i]);
|
|
UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/19/99) */
|
|
|
|
return bp;
|
|
|
|
} /* end Load_pg_lookup */
|
|
|
|
|
|
BOOLEAN Load_packet_check(
    PLOAD_CTXT      lp,
    ULONG           svr_ipaddr,
    ULONG           svr_port,
    ULONG           client_ipaddr,
    ULONG           client_port,
    USHORT          protocol,
    BOOLEAN         limit_map_fn)
/*
  Decide whether this host should accept a data packet for the given
  server/client endpoint pair.  Returns TRUE to accept, FALSE to drop.

  Acceptance logic, in order:
    1. hash the client to a bin; accept outright if we own the bin and either
       the packet is session-less or all other hosts are idle on that bin
       (and no dirty-connection cleanup is pending);
    2. otherwise accept only if we are tracking a live (non-dirty) connection
       that exactly matches the 5-tuple, either in the hash-table entry or on
       its overflow descriptor queue.

  limit_map_fn restricts the hash inputs to client-only data (used, per the
  parallel code in Load_conn_advise, when the full 5-tuple must not influence
  the mapping -- NOTE(review): exact caller semantics not visible here).
*/
{
    PBIN_STATE      bp;         /* ptr. to bin state */
    ULONG           id;         /* hash index for the connection */
    ULONG           bin;        /* bin index */
    QUEUE *         qp;         /* ptr. to connection queue */
    IRQLEVEL        irql;
    PMAIN_CTXT      ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    BOOLEAN         is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);
    BOOLEAN         is_session_pkt;

    /* "session" packets require the stricter idle-bin test below; TCP always
       qualifies, and IPSec does when IPSec session support is enabled */

    is_session_pkt = is_tcp_pkt;

    if (NLB_IPSEC_SESSION_SUPPORT_ENABLED() && (protocol == TCPIP_PROTOCOL_IPSEC1))
    {
        is_session_pkt = TRUE;
    }

    UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */

    if (! lp -> active)
        return FALSE;

    lp->pkt_count++;    /* increment count of pkts handled (v1.32B) */

    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* V2.2 make sure that Load_pg_lookup properly handled protocol specific rules */

    UNIV_ASSERT ((is_tcp_pkt && bp->prot != CVY_UDP) || (!is_tcp_pkt && bp->prot != CVY_TCP));

    /* handle CVY_NEVER mode immediately */

    if (bp->mode == CVY_NEVER)
        return FALSE;

    /* lookup connection entry in hash table; the second Map() argument varies
       by affinity mode (and is fixed when limit_map_fn restricts the inputs) */

    if (limit_map_fn) {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
    } else {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, ((svr_port << 16) + client_port));
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, svr_ipaddr);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* now hash client address to bin id */
    bin = id % CVY_MAXBINS;

    LOCK_ENTER(&(lp->lock), &irql);

    /* check bin for residency and all other hosts now idle on their bins; in this
       case and if we do not have dirty connections, we must be able to handle the packet */
    if (((bp->cmap & (((MAP_T) 1) << bin)) != 0) &&     /* v2.1 */
        (!is_session_pkt || (((bp->all_idle_map & (((MAP_T) 1) << bin)) != 0) && (!(lp->cln_waiting)))))    /* v1.32B */
    {
        /* note that we may have missed a connection, but it could also be a stale
           packet so we can't start tracking the connection now */

#ifdef TRACE_LOAD
        DbgPrint("Host %d: check 1 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
                 lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
        LOCK_EXIT(&(lp->lock), irql);
        return TRUE;
    }
    /* otherwise, if we have an active connection for this bin or if we have dirty
       connections for this bin and the bin is resident, check for a match */

    else if (bp->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && ((bp->cmap & (((MAP_T) 1) << bin)) != 0)))
    {
        PCONN_ENTRY     ep;     /* ptr. to connection entry */
        PCONN_DESCR     dp;     /* ptr. to connection descriptor */

        /* now hash client address to conn. hash table index */
        id = id % CVY_MAX_CHASH;

        ep = &(lp->hashed_conn[id]);
        qp = &(lp->connq[id]);

        /* look for a connection match: first the in-table entry, then the
           overflow descriptor queue for this hash slot */
        if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
        {
            /* if connection was dirty, just block the packet since TCP/IP may have stale
               connection state for a previous connection from another host (v1.32B) */

            if (ep->dirty)
            {
                LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
                DbgPrint ("blocking dirty connection from %d to %d\n", client_port, svr_port);
#endif
                return FALSE;
            }

#ifdef TRACE_LOAD
            DbgPrint("Host %d: check 2 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
                     lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
            LOCK_EXIT(&(lp->lock), irql);
            return TRUE;
        }
        else
        {
            for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL;
                 dp = (PCONN_DESCR)Queue_next(qp, &(dp->link)))
            {
                if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
                {
                    /* if connection was dirty, just block the packet since TCP/IP may have
                       stale connection state for a previous connection from another host
                       (v1.32B) */

                    if (dp->entry.dirty)
                    {
                        LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
                        DbgPrint ("blocking dirty connection from %d to %d\n", client_port, svr_port);
#endif
                        return FALSE;
                    }

#ifdef TRACE_LOAD
                    DbgPrint("Host %d: check 3 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
                             lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
                    LOCK_EXIT(&(lp->lock), irql);
                    return TRUE;
                }
            }
        }
    }

    /* no bin ownership and no matching tracked connection: not our packet */

    LOCK_EXIT(&(lp->lock), irql);
    return FALSE;

} /* end Load_packet_check */
|
|
|
|
|
|
BOOLEAN Load_conn_advise(
    PLOAD_CTXT      lp,
    ULONG           svr_ipaddr,
    ULONG           svr_port,
    ULONG           client_ipaddr,
    ULONG           client_port,
    USHORT          protocol,
    ULONG           conn_status,
    BOOLEAN         limit_map_fn)
/*
  Advise the load module of a connection boundary event and decide whether
  this host should handle the packet.  conn_status is one of:
    CVY_CONN_UP    - connection establishing (e.g. TCP SYN): create or reuse
                     a connection-tracking entry if we own the target bin;
    CVY_CONN_DOWN  - connection closing (e.g. TCP FIN): the first FIN of a
                     TCP connection only bumps fin_count; the second releases
                     the tracking entry;
    CVY_CONN_RESET - connection aborted: release the tracking entry at once.

  Returns TRUE to accept the packet, FALSE to drop it (not our bin, dirty
  state, or an event for a connection we are not tracking).

  Tracking entries live either directly in the hash table (hashed_conn) or
  on per-slot overflow descriptor queues; descriptors are grown on demand up
  to max_dscr_allocs pools and thereafter recycled from the LRU recovery
  queue (conn_rcvryq).  Per-bin connection counts drive the idle_bins map
  used by convergence.
*/
{
    BOOLEAN     match,          /* TRUE => we have a record of this connection */
                hit;            /* TRUE => we have a hash entry hit */
    ULONG       id;             /* hash index for the connection */
    ULONG       bin;            /* bin index */
    PBIN_STATE  bp;             /* ptr. to bin state */
    PCONN_ENTRY ep;             /* ptr. to connection entry */
    PCONN_DESCR dp;             /* ptr. to connection descriptor */
    QUEUE *     qp;             /* ptr. to connection queue */
    IRQLEVEL    irql;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    BOOLEAN     is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */

    if (!lp -> active)
        return FALSE;

    lp->pkt_count++;    /* increment count of pkts handled (v1.32B) */

    /* increment bin count */

    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* handle CVY_NEVER immediately */

    if (bp->mode == CVY_NEVER)
        return FALSE;

    /* This function is no longer for TCP only. */
    if (!NLB_SESSION_SUPPORT_ENABLED())
    {
        /* This should never happen with session support disabled anyway - Load_pg_lookup() will
           NEVER return a UDP only rule when the is_tcp_pkt is TRUE, so this isn't necessary. */
        if (bp->prot == CVY_UDP)
            return TRUE;
    }

    /* lookup connection entry in hash table; the second Map() argument varies
       by affinity mode (same scheme as Load_packet_check) */
    if (limit_map_fn) {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
    } else {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, ((svr_port << 16) + client_port));
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, svr_ipaddr);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* now hash client address to bin id and conn. hash table index */

    bin = id % CVY_MAXBINS;
    id  = id % CVY_MAX_CHASH;

    /* if this connection is not in our current map and it is not a connection
       down notification for a non-idle bin, just filter it out */

    if ((bp->cmap & (((MAP_T) 1) << bin)) == 0 &&   /* v2.1 */
        (!((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && bp->nconn[bin] > 0)))
        return FALSE;

    ep = &(lp->hashed_conn[id]);
    UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/21/99) */
    qp = &(lp->connq[id]);

    match = hit = FALSE;

    LOCK_ENTER(&(lp->lock), &irql);

    /* search for this 5-tuple: first the in-table entry, then the slot's
       overflow descriptor queue */

    if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
    {
        hit =
        match = TRUE;
    }
    else
    {
        for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL;
             dp = (PCONN_DESCR)Queue_next(qp, &(dp->link)))
        {
            if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
            {
                match = TRUE;

                UNIV_ASSERT (dp->code == CVY_DESCCODE);     /* (bbain 8/19/99) */
                ep = &(dp->entry);                          /* v 2.06 */
                UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/21/99) */

                /* release connection descriptor if taking down connection */

                if (conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET)
                {
                    /* if connection was dirty, just block the packet since TCP/IP may have
                       stale connection state for a previous connection from another host
                       (v1.32B) */

                    if (ep->dirty)
                    {
                        LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
                        DbgPrint ("blocking dirty FIN from %d to %d\n", client_port, svr_port);
#endif
                        return FALSE;
                    }

                    /* ###### fin count added for keynote - ramkrish. */
                    /* if first fin, then only increment the count and return TRUE */
                    if (conn_status == CVY_CONN_DOWN && ep->fin_count == 0 && is_tcp_pkt)
                    {
                        ep->fin_count++;
                        LOCK_EXIT(&(lp->lock), irql);
                        return TRUE;
                    }

                    /* second FIN or a reset: pull the descriptor off the port
                       group and recovery queues and return it to the free pool;
                       counters are adjusted in the common CONN_DOWN path below */

                    Link_unlink(&(dp->entry.blink));
                    Link_unlink(&(dp->entry.rlink));    /* V2.1.5 */

                    Link_unlink(&(dp->link));
                    Queue_enq(&(lp->conn_freeq), &(dp->link));
                }

                break;
            }
        }
    }

    /* if we see a new connection, handle it */

    if (conn_status == CVY_CONN_UP)
    {
        /* if we don't have a connection match, setup a new connection entry */

        if (!match)
        {
            /* if hash entry table is not available, setup and enqueue a new entry */

            if (CVY_CONN_IN_USE(ep))
            {
                dp = (PCONN_DESCR)Queue_deq(&(lp->conn_freeq));

                if (dp == NULL)
                {
                    /* allocate new queue descriptors if allowed */

                    if (lp->nqalloc < lp->max_dscr_allocs)
                    {
                        UNIV_PRINT(("Load_conn_advise: %d/%d allocating %d descriptors", lp->nqalloc, lp->max_dscr_allocs, lp->dscr_per_alloc));

                        lp->qalloc_list[lp->nqalloc] =      /* (bbain 2/25/99) */
                        dp = (PCONN_DESCR)malloc((lp->dscr_per_alloc) * sizeof(CONN_DESCR));
                        if (dp != NULL)
                        {
                            ULONG       i;
                            PCONN_DESCR tp;
                            QUEUE *     fqp;

                            lp->nqalloc++;

                            /* initialize and link up descriptors; save first descriptor
                               for our use */

                            dp->code = CVY_DESCCODE;    /* (bbain 8/19/99) */
                            Link_init(&(dp->link));
                            ep = &(dp->entry);          /* (bbain 8/21/99) */
                            ep->code  = CVY_ENTRCODE;   /* (bbain 8/19/99) */
                            ep->alloc = TRUE;
                            ep->dirty = FALSE;          /* v1.32B */

                            CVY_CONN_CLEAR(&(dp->entry));
                            Link_init(&(dp->entry.blink));
                            Link_init(&(dp->entry.rlink));  /* V2.1.5 */

                            tp  = dp + 1;
                            fqp = &(lp->conn_freeq);

                            /* remaining descriptors of the pool go straight to
                               the free queue */

                            for (i=1; i<lp->dscr_per_alloc; i++)
                            {
                                tp->code = CVY_DESCCODE;    /* (bbain 8/19/99) */
                                Link_init(&(tp->link));
                                tp->entry.code  = CVY_ENTRCODE; /* (bbain 8/19/99) */
                                tp->entry.alloc = TRUE;
                                tp->entry.dirty = FALSE;        /* v1.32B */

                                CVY_CONN_CLEAR(&(tp->entry));
                                Link_init(&(tp->entry.blink));
                                Link_init(&(tp->entry.rlink));  /* V2.1.5 */

                                Queue_enq(fqp, &(tp->link));

                                tp++;
                            }
                        }
                        else
                        {
                            /* allocation failed: accept the packet untracked and
                               log the failure only once */

                            if (!(lp->alloc_failed))
                            {
                                UNIV_PRINT(("Load_conn_advise: error allocating conn descrs"));
                                LOG_MSG(MSG_ERROR_MEMORY, MSG_NONE);

                                lp->alloc_failed = TRUE;
                            }

                            LOCK_EXIT(&(lp->lock), irql);
                            return TRUE;
                        }
                    }
                    else
                    {
                        /* V2.1.5 - if reached allocation limit - start taking
                           connection descriptors from the recover queue since
                           they are likely to be stale and very old */

                        PBIN_STATE  rbp;
                        LINK *      rlp;

#ifdef TRACE_RCVRY
                        DbgPrint ("Host %d: taking connection from recovery queue\n", lp->my_host_id);
#endif

                        rlp = (LINK *)Queue_deq(&(lp->conn_rcvryq));

                        UNIV_ASSERT (rlp != NULL);

                        /* this should not happen at all but protect anyway */

                        if (rlp == NULL)
                        {
                            if (!(lp->alloc_inhibited))
                            {
                                UNIV_PRINT(("Host %d: cannot allocate conn descriptors.", lp->my_host_id));
                                LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);

                                lp->alloc_inhibited = TRUE;
                            }

                            LOCK_EXIT(&(lp->lock), irql);
                            return TRUE;
                        }

                        ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);
                        UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/19/99) */

                        /* fixed for nt4/sp5 */

                        if (ep->alloc)
                        {
                            /* unlink allocated descriptors from the hash table
                               queue if necessary and set dp so that code below
                               will put it back in the right hash queue */

                            dp = STRUCT_PTR(ep, CONN_DESCR, entry);
                            UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */

                            Link_unlink(&(dp->link));
                        }
                        else
                        {
                            dp = NULL;  /* (bbain 8/21/99) */
                        }

                        /* dirty connections are not counted */

                        if (! ep->dirty)
                        {
                            /* find out which port group we are on so we can clean
                               up its counters */

                            rbp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, is_tcp_pkt);

                            /* correct bad (negative) bin count */

                            if (lp->nconn <= 0)
                                lp->nconn = 0;
                            else
                                lp->nconn--;

                            if (rbp->nconn[ep->bin] <= 0)
                                rbp->nconn[ep->bin] = 0;
                            else
                            {
                                rbp->nconn[ep->bin]--;
                            }

                            if (rbp->tconn <= 0)
                                rbp->tconn = 0;
                            else
                                rbp->tconn--;

                            if (rbp->nconn[ep->bin] == 0)
                            {
                                rbp->idle_bins |= (((MAP_T) 1) << ep->bin);
                            }
                        }

                        /* detach the recycled entry from its old port group and
                           scrub it for reuse */

                        Link_unlink(&(ep->blink));
                        CVY_CONN_CLEAR(ep);
                        ep->dirty = FALSE;
                    }
                }
                /* else dp is not NULL, so setup entry pointer */
                else
                {
                    ep = &(dp->entry);
                    UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/21/99) */
                }

                /* enqueue descriptor in hash table unless it's already a hash table entry
                   (V2.1.5 recovered connection might be in hash table, so make
                   sure we do not end up queueing it) */

                if (dp != NULL)
                {
                    UNIV_ASSERT (dp->code == CVY_DESCCODE);     /* (bbain 8/19/99) */

                    /* enqueue new queue descriptor and setup entry pointer */

                    Queue_enq(qp, &(dp->link));
                }
            }

            /* setup new entry */

            UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/21/99) */

            CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
            ep->bin = (UCHAR)bin;

            /* ###### fin count added for keynote - ramkrish */
            /* initialize the fin count to 0 for a new connection */
            ep->fin_count = 0;

            /* enqueue entry into port group queue */

            Queue_enq(&(bp->connq), &(ep->blink));

            /* V2.1.5 add entry to the tail of connection recovery queue */

            Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));

            /* increment # connections and mark bin not idle if necessary */

            lp->nconn++;    /* v2.1 */
            bp->tconn++;
            bp->nconn[bin]++;
            if (bp->nconn[bin] == 1)
                bp->idle_bins &= ~(((MAP_T) 1) << bin);

#ifdef TRACE_LOAD
            DbgPrint("Host %d: advise starts conn; rule %d bin %d nconn %d\n",
                     lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
        }
        /* otherwise, we have a match; clean up conn entry if dirty since we have a
           new connection, although TCP/IP will likely reject it if it has stale state
           from another connection (v1.32B) */

        else
        {
            if (ep->dirty)
            {
#ifdef TRACE_DIRTY
                DbgPrint ("converting dirty SYN from %d to %d\n", client_port, svr_port);
#endif
                UNIV_ASSERT (ep->code == CVY_ENTRCODE);     /* (bbain 8/21/99) */

                ep->dirty = FALSE;

                /* ###### initialize fin count for this new connection added for keynote - ramkrish */
                /* ###### since we are reusing a dirty connection desc for a new conn., it needs to be reset */
                ep->fin_count = 0;

                UNIV_ASSERT (ep->bin == (USHORT)bin);

                /* unlink and enqueue entry into port group queue */

                Link_unlink(&(ep->blink));
                Queue_enq(&(bp->connq), &(ep->blink));

                /* increment # connections and mark bin not idle if necessary */

                lp->nconn++;    /* v2.1 */
                bp->tconn++;
                bp->nconn[bin]++;
                if (bp->nconn[bin] == 1)
                    bp->idle_bins &= ~(((MAP_T) 1) << bin);
            }
        }
    }

    /* otherwise, if a known connection is going down, remove our connection entry */

    /* ###### check for reset addded for keynote - ramkrish */
    else if ((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && match)
    {
        /* if connection was dirty, just block the packet since TCP/IP may have stale
           connection state for a previous connection from another host (v1.32B) */

        if (ep->dirty)
        {
            LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
            DbgPrint ("blocking dirty FIN from %d to %d\n", client_port, svr_port);
#endif
            return FALSE;
        }

        /* ###### fin count added for keynote - ramkrish */
        /* if this is the first fin, then simply increment the fincount and return */
        if (conn_status == CVY_CONN_DOWN && ep->fin_count == 0 && is_tcp_pkt)
        {
            ep->fin_count++;
            LOCK_EXIT(&(lp->lock), irql);
            return TRUE;
        }

        /* clear hash table entry if we had a hit; enqueued entry was already freed */

        if (hit)
        {
            CVY_CONN_CLEAR(ep);

            /* ###### clear fin count for keynote - ramkrish */
            ep->fin_count = 0;

            Link_unlink(&(ep->rlink));  /* V2.1.5 */
            Link_unlink(&(ep->blink));
        }

        /* decrement # connections and mark bin idle if necessary */

#if 0
        if (bp->nconn[bin] <= 0)
            DbgPrint("WLBS: Load_conn_advise: count was zero %d %d\n", bin, bp->nconn[bin]);
#endif

        UNIV_ASSERT(bp->nconn[bin] > 0 && bp->tconn > 0 && lp->nconn > 0);

        if (lp->nconn <= 0)     /* v2.1 */
            lp->nconn = 0;
        else
            lp->nconn--;

        if (bp->nconn[bin] <= 0)    /* correct bad (negative) bin count */
            bp->nconn[bin] = 0;
        else
            bp->nconn[bin]--;

        if (bp->tconn <= 0)
            bp->tconn = 0;
        else
            bp->tconn--;

        if (bp->nconn[bin] == 0)
        {
            bp->idle_bins |= (((MAP_T) 1) << bin);
        }

#ifdef TRACE_LOAD
        DbgPrint("Host %d: advise removes conn; rule %d bin %d nconn %d\n",
                 lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
    }
    else
    {
        /* event for a connection we are not tracking (or an unknown status):
           reject the packet */

        LOCK_EXIT(&(lp->lock), irql);
        return FALSE;
    }

    LOCK_EXIT(&(lp->lock), irql);
    return TRUE;

} /* end Load_conn_advise */
|
|
|
|
/*
 * Function: Load_create_dscr
 * Description: Finds or creates the connection (session) descriptor for the
 *              tuple (server IP, server port, client IP, client port,
 *              protocol).  If no descriptor tracks the tuple, one is taken
 *              from the free queue, freshly allocated (up to
 *              max_dscr_allocs batches of dscr_per_alloc), or, as a last
 *              resort, recovered (stolen) from the head of the recovery
 *              queue, which holds the least-recently-created connections.
 *              The descriptor is then linked into the hash-table queue, the
 *              port rule's connection queue and the recovery queue, and the
 *              per-bin / per-rule / global connection counts are updated.
 *              A matching descriptor that is dirty is converted back into a
 *              clean, counted connection.
 * Parameters: lp - pointer to the load module context.
 *             svr_ipaddr - server (virtual) IP address.
 *             svr_port - server port.
 *             client_ipaddr - client IP address.
 *             client_port - client port.
 *             protocol - IP protocol of the connection (TCP, UDP, IPSec...).
 *             limit_map_fn - TRUE => hash on client identity only (fixed
 *                            second hash parameter), used when the mapping
 *                            must be agnostic to the server identity (BDA).
 * Returns: BOOLEAN - TRUE if a descriptor now tracks this connection;
 *          FALSE if the load module is inactive, or no descriptor could be
 *          allocated or recovered.
 * Author: shouse, 5.18.01
 * Notes: NOTE(review): this routine manipulates shared queues and counters
 *        without acquiring lp->lock itself - presumably the caller holds the
 *        load lock; confirm against call sites.
 */
BOOLEAN Load_create_dscr(
    PLOAD_CTXT lp,
    ULONG svr_ipaddr,
    ULONG svr_port,
    ULONG client_ipaddr,
    ULONG client_port,
    USHORT protocol,
    BOOLEAN limit_map_fn)
{
    BOOLEAN match = FALSE;     /* TRUE => we have a record of this connection. */
    ULONG id;                  /* Hash index for the connection. */
    ULONG bin;                 /* Bin index. */
    PBIN_STATE bp;             /* Pointer to bin state. */
    PCONN_ENTRY ep;            /* Pointer to connection entry. */
    PCONN_DESCR dp;            /* Pointer to connection descriptor. */
    QUEUE * qp;                /* Pointer to connection queue. */
    PMAIN_CTXT ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
    /* Port rules only distinguish TCP from non-TCP; any non-TCP protocol is
       treated like UDP for the purpose of port rule lookup. */
    BOOLEAN is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* If the load module has been stopped, do not track connections. */
    if (!lp->active)
        return FALSE;

    /* Increment count of packets handled. */
    lp->pkt_count++;

    /* Find the port rule for this connection. */
    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* Hash.  The inputs depend on the rule's affinity setting; when
       limit_map_fn is set, the second parameter is a fixed constant so the
       mapping does not depend on the server identity. */
    if (limit_map_fn) {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
    } else {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, ((svr_port << 16) + client_port));
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, svr_ipaddr);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* Hash client address to bin id and connection hash table index. */
    bin = id % CVY_MAXBINS;
    id = id % CVY_MAX_CHASH;

    /* Get a pointer to the connection entry for this hash ID. */
    ep = &(lp->hashed_conn[id]);

    UNIV_ASSERT (ep->code == CVY_ENTRCODE);

    /* Get a pointer to the connection queue (second-level hashing). */
    qp = &(lp->connq[id]);

    if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
    {
        /* Note that we found a match for this tuple. */
        match = TRUE;
    } else {
        /* Walk the overflow queue for this hash slot looking for the tuple. */
        for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL; dp = (PCONN_DESCR)Queue_next(qp, &(dp->link))) {
            if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
            {
                /* Note that we found a match for this tuple. */
                match = TRUE;

                UNIV_ASSERT (dp->code == CVY_DESCCODE);

                /* Get a pointer to the connection entry. */
                ep = &(dp->entry);

                UNIV_ASSERT (ep->code == CVY_ENTRCODE);

                break;
            }
        }
    }

    /* If we don't have a connection match, setup a new connection entry. */
    if (!match) {
        /* If hash entry table is not available, setup and enqueue a new entry. */
        if (CVY_CONN_IN_USE(ep)) {
            /* Get a pointer to a free descriptor. */
            dp = (PCONN_DESCR)Queue_deq(&(lp->conn_freeq));

            if (dp == NULL) {
                /* Allocate new queue descriptors if allowed. */
                if (lp->nqalloc < lp->max_dscr_allocs) {
                    UNIV_PRINT(("Load_create_dscr: %d/%d allocating %d descriptors", lp->nqalloc, lp->max_dscr_allocs, lp->dscr_per_alloc));

                    dp = lp->qalloc_list[lp->nqalloc] = (PCONN_DESCR)malloc((lp->dscr_per_alloc) * sizeof(CONN_DESCR));

                    if (dp != NULL) {
                        ULONG i;
                        PCONN_DESCR tp;
                        QUEUE * fqp;

                        /* Increment the counter for number of allocations. */
                        lp->nqalloc++;

                        /* Initialize and link up descriptors; save first descriptor for our use. */
                        dp->code = CVY_DESCCODE;
                        Link_init(&(dp->link));

                        /* Initialize the connection entry. */
                        ep = &(dp->entry);
                        ep->code = CVY_ENTRCODE;
                        ep->alloc = TRUE;
                        ep->dirty = FALSE;

                        /* Mark this entry unused. */
                        CVY_CONN_CLEAR(&(dp->entry));

                        Link_init(&(dp->entry.blink));
                        Link_init(&(dp->entry.rlink));

                        tp = dp + 1;
                        fqp = &(lp->conn_freeq);

                        /* Initialize all descriptors and tack them on the free queue.
                           (Index 0 of the batch is kept for this connection.) */
                        for (i = 1; i < lp->dscr_per_alloc; i++, tp++) {
                            /* Initialize the descriptor. */
                            tp->code = CVY_DESCCODE;
                            Link_init(&(tp->link));

                            /* Initialize the connection entry. */
                            tp->entry.code = CVY_ENTRCODE;
                            tp->entry.alloc = TRUE;
                            tp->entry.dirty = FALSE;

                            /* Mark this entry unused. */
                            CVY_CONN_CLEAR(&(tp->entry));

                            Link_init(&(tp->entry.blink));
                            Link_init(&(tp->entry.rlink));

                            /* Queue the descriptor onto the free queue. */
                            Queue_enq(fqp, &(tp->link));
                        }
                    } else {
                        /* Allocation failed, log a message (once) and bail out.
                           NOTE(review): message text says "Load_conn_advise" -
                           presumably copied from that routine; runtime string
                           left untouched. */
                        if (!(lp->alloc_failed)) {
                            UNIV_PRINT(("Load_conn_advise: error allocating conn descrs"));
                            LOG_MSG(MSG_ERROR_MEMORY, MSG_NONE);
                            lp->alloc_failed = TRUE;
                        }

                        return FALSE;
                    }
                } else {
                    /* If we have reached the allocation limit, start taking connection descriptors
                       from the recover queue since they are likely to be stale and very old. */
                    PBIN_STATE rbp;
                    LINK * rlp;

#ifdef TRACE_RCVRY
                    DbgPrint ("Host %d: taking connection from recovery queue\n", lp->my_host_id);
#endif

                    /* Dequeue a descriptor from the recovery queue. */
                    rlp = (LINK *)Queue_deq(&(lp->conn_rcvryq));

                    UNIV_ASSERT (rlp != NULL);

                    /* This should not happen at all but protect anyway. */
                    if (rlp == NULL) {
                        /* Unable to get a descriptor, log a message (once) and bail out. */
                        if (!(lp->alloc_inhibited)) {
                            UNIV_PRINT(("Host %d: cannot allocate conn descriptors.", lp->my_host_id));
                            LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);
                            lp->alloc_inhibited = TRUE;
                        }

                        return FALSE;
                    }

                    /* Grab a pointer to the connection entry. */
                    ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);

                    UNIV_ASSERT (ep->code == CVY_ENTRCODE);

                    if (ep->alloc) {
                        /* Unlink allocated descriptors from the hash table queue if necessary
                           and set dp so that code below will put it back in the right hash queue. */
                        dp = STRUCT_PTR(ep, CONN_DESCR, entry);

                        UNIV_ASSERT (dp->code == CVY_DESCCODE);

                        Link_unlink(&(dp->link));
                    } else {
                        /* A static (first-level) hash table entry stays where it is. */
                        dp = NULL;
                    }

                    /* Dirty connections are not counted, so we don't need to update these counters. */
                    if (! ep->dirty) {
                        /* Find out which port group we are on so we can clean up its counters.
                           NOTE(review): lookup uses is_tcp_pkt of the NEW packet, not the
                           protocol stored in the recovered entry - confirm intended. */
                        rbp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, is_tcp_pkt);

                        /* Decrement counts, clamping at zero to correct any bad
                           (negative) counts. */
                        if (lp->nconn <= 0)
                            lp->nconn = 0;
                        else
                            lp->nconn--;

                        if (rbp->nconn[ep->bin] <= 0)
                            rbp->nconn[ep->bin] = 0;
                        else
                            rbp->nconn[ep->bin]--;

                        if (rbp->tconn <= 0)
                            rbp->tconn = 0;
                        else
                            rbp->tconn--;

                        /* Mark the bin idle if it no longer has connections. */
                        if (rbp->nconn[ep->bin] == 0)
                            rbp->idle_bins |= (((MAP_T) 1) << ep->bin);
                    }

                    /* Remove the recovered entry from its port rule queue. */
                    Link_unlink(&(ep->blink));

                    /* Mark the descriptor as unused. */
                    CVY_CONN_CLEAR(ep);

                    /* Mark the descriptor as clean. */
                    ep->dirty = FALSE;
                }
            } else {
                /* There was a free descriptor, so setup the connection entry pointer. */
                ep = &(dp->entry);

                UNIV_ASSERT (ep->code == CVY_ENTRCODE);
            }

            /* Enqueue descriptor in hash table unless it's already a hash table entry (a recovered
               connection might be in hash table, so make sure we do not end up queueing it) */
            if (dp != NULL) {
                UNIV_ASSERT (dp->code == CVY_DESCCODE);

                Queue_enq(qp, &(dp->link));
            }
        }

        UNIV_ASSERT (ep->code == CVY_ENTRCODE);

        /* Setup a new entry. */
        CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

        ep->bin = (UCHAR)bin;

        /* Initialize the fin count to 0 for a new connection. */
        ep->fin_count = 0;

        /* Enqueue entry into port group queue. */
        Queue_enq(&(bp->connq), &(ep->blink));

        /* Add entry to the tail of connection recovery queue. */
        Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));

        /* Increment number of connections and mark bin not idle if necessary. */
        lp->nconn++;
        bp->tconn++;
        bp->nconn[bin]++;

        if (bp->nconn[bin] == 1) bp->idle_bins &= ~(((MAP_T) 1) << bin);

#ifdef TRACE_LOAD
        DbgPrint("Host %d: advise starts conn; rule %d bin %d nconn %d\n",
                 lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
    } else {
        /* We have a match. Clean up connection entry if it's dirty since we have a new connection,
           although TCP/IP will likely reject it if it has stale state from another connection. */
        if (ep->dirty) {
#ifdef TRACE_DIRTY
            DbgPrint ("converting dirty SYN from %d to %d\n", client_port, svr_port);
#endif
            UNIV_ASSERT (ep->code == CVY_ENTRCODE);

            ep->dirty = FALSE;
            ep->fin_count = 0;

            UNIV_ASSERT (ep->bin == (USHORT)bin);

            /* Unlink and enqueue entry into port group queue. */
            Link_unlink(&(ep->blink));
            Queue_enq(&(bp->connq), &(ep->blink));

            /* Increment # connections and mark bin not idle if necessary.
               (Dirty connections were uncounted, so the counts go back up.) */
            lp->nconn++;
            bp->tconn++;
            bp->nconn[bin]++;

            if (bp->nconn[bin] == 1) bp->idle_bins &= ~(((MAP_T) 1) << bin);
        }
    }

    return TRUE;
}
|
|
|
|
/*
 * Function: Load_port_change
 * Description: Applies a port-rule control operation to every configured port
 *              rule matching the given VIP/port: enable (restore original load
 *              weight), disable (zero weight and kill existing connections),
 *              drain (zero weight but keep serving existing connections), or
 *              set a specific load weight.  Cluster-wide plug/drain commands
 *              additionally include the DEFAULT rule.  If any rule's weight
 *              actually changed, convergence is initiated.
 * Parameters: lp - load module context.
 *             ipaddr - VIP to match, or IOCTL_ALL_VIPS to match all VIPs.
 *             port - port to match, or IOCTL_ALL_PORTS to match all ports.
 *             cmd - IOCTL_CVY_PORT_ON / PORT_OFF / PORT_DRAIN / PORT_SET /
 *                   CLUSTER_PLUG / CLUSTER_DRAIN.
 *             value - new load weight; only used for IOCTL_CVY_PORT_SET.
 * Returns: IOCTL_CVY_OK if at least one rule changed, IOCTL_CVY_ALREADY if
 *          every matching rule was already in the requested state, or
 *          IOCTL_CVY_NOT_FOUND if the module is inactive or no rule matched.
 */
ULONG Load_port_change(
    PLOAD_CTXT lp,
    ULONG ipaddr,
    ULONG port,
    ULONG cmd,
    ULONG value)
{
    PCVY_RULE rp;              /* Pointer to configured port rules. */
    PBIN_STATE bp;             /* Pointer to load module port rule state. */
    ULONG nrules;              /* Number of rules. */
    ULONG i;
    ULONG ret = IOCTL_CVY_NOT_FOUND;
    PMAIN_CTXT ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    if (! lp->active)
        return IOCTL_CVY_NOT_FOUND;

    rp = (* (lp->params)).port_rules;

    /* If we are draining whole cluster, include DEFAULT rule; Otherwise, just
       include the user-defined rules (the DEFAULT rule is the last rule). */

    if (cmd == IOCTL_CVY_CLUSTER_DRAIN || cmd == IOCTL_CVY_CLUSTER_PLUG)
        nrules = (* (lp->params)).num_rules + 1;
    else
        nrules = (* (lp->params)).num_rules;

    for (i=0; i<nrules; i++, rp++)
    {
        /* If the virtual IP address is IOCTL_ALL_VIPS (0x00000000), then we are applying this
           change to all port rules for port X, regardless of VIP.  If the virtual IP address is
           to be applied to a particular VIP, then we apply only to port rules whose VIP matches.
           Similarly, if the change is to apply to an "ALL VIP" rule, then we also apply when the
           VIP matches because the caller uses CVY_ALL_VIP_NUMERIC_VALUE (0xffffffff) as the
           virtual IP address, which is the same value stored in the port rule state. */
        if ((ipaddr == IOCTL_ALL_VIPS || ipaddr == rp->virtual_ip_addr) &&
            (port == IOCTL_ALL_PORTS || (port >= rp->start_port && port <= rp->end_port)))
        {
            bp = &(lp->pg_state[i]);

            UNIV_ASSERT(bp->code == CVY_BINCODE);  /* (bbain 8/19/99) */

            /* If enabling a port rule, set the load amount to original value;
               If disabling a port rule, set the load amount to zero;
               Otherwise, set the load amount it to the specified amount. */
            if (cmd == IOCTL_CVY_PORT_ON || cmd == IOCTL_CVY_CLUSTER_PLUG)
            {
                if (bp->load_amt[lp->my_host_id] == bp->orig_load_amt)
                {
                    /* If we are the first port rule to match, then set the
                       return value to "Already"; Otherwise, we don't want to
                       overwrite some other port rule's return value of "OK"
                       in the case of ALL_VIPS or ALL_PORTS. */
                    if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;

                    continue;
                }

                /* Restore the original load amount. */
                bp->load_amt[lp->my_host_id] = bp->orig_load_amt;
                ret = IOCTL_CVY_OK;
            }
            else if (cmd == IOCTL_CVY_PORT_OFF)
            {

                if (bp->load_amt[lp->my_host_id] == 0)
                {
                    /* If we are the first port rule to match, then set the
                       return value to "Already"; Otherwise, we don't want to
                       overwrite some other port rule's return value of "OK"
                       in the case of ALL_VIPS or ALL_PORTS. */
                    if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;

                    continue;
                }

                bp->load_amt[lp->my_host_id] = 0;

                /* Immediately stop handling all traffic on the port group. */
                bp->cmap = 0;
                bp->cur_map[lp->my_host_id] = 0;
                Load_conn_kill(lp, bp);
                ret = IOCTL_CVY_OK;
            }
            else if (cmd == IOCTL_CVY_PORT_DRAIN || cmd == IOCTL_CVY_CLUSTER_DRAIN)
            {
                if (bp->load_amt[lp->my_host_id] == 0)
                {
                    /* If we are the first port rule to match, then set the
                       return value to "Already"; Otherwise, we don't want to
                       overwrite some other port rule's return value of "OK"
                       in the case of ALL_VIPS or ALL_PORTS. */
                    if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;

                    continue;
                }

                /* Set load weight to zero, but continue to handle existing connections. */
                bp->load_amt[lp->my_host_id] = 0;
                ret = IOCTL_CVY_OK;
            }
            else
            {
                UNIV_ASSERT(cmd == IOCTL_CVY_PORT_SET);

                if (bp->load_amt[lp->my_host_id] == value)
                {
                    /* If we are the first port rule to match, then set the
                       return value to "Already"; Otherwise, we don't want to
                       overwrite some other port rule's return value of "OK"
                       in the case of ALL_VIPS or ALL_PORTS. */
                    if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;

                    continue;
                }

                /* Set the load weight for this port rule. */
                bp->orig_load_amt = value;
                bp->load_amt[lp->my_host_id] = value;
                ret = IOCTL_CVY_OK;
            }

            /* A fully-specified (single VIP, single port) request matches at
               most one rule, so stop scanning. */
            if (port != IOCTL_ALL_PORTS && ipaddr != IOCTL_ALL_VIPS) break;
        }
    }

    /* If the cluster isn't already converging, then initiate convergence if the load weight of a port rule has been modified. */
    if (lp->send_msg.state != HST_CVG && ret == IOCTL_CVY_OK) {
        WCHAR me[20];

        Univ_ulong_to_str (lp->my_host_id+1, me, 10);

        /* Tracking convergence - Starting convergence because our port rule configuration has changed. */
        LOG_MSGS(MSG_INFO_CONVERGING_NEW_RULES, me, me);
        TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has changed its port rule configuration.", lp->my_host_id+1, lp->my_host_id+1);

        /* Tracking convergence. */
        Load_convergence_start(lp);
    }

    return ret;

} /* end Load_port_change */
|
|
|
|
|
|
ULONG Load_hosts_query(
|
|
PLOAD_CTXT lp,
|
|
BOOLEAN internal,
|
|
PULONG host_map)
|
|
{
|
|
WCHAR buf1 [256];
|
|
WCHAR buf2 [256];
|
|
PWCHAR ptr1 = buf1;
|
|
PWCHAR ptr2 = buf2;
|
|
WCHAR num [20]; /* v2.1 */
|
|
WCHAR msk [33];
|
|
ULONG i, j, k;
|
|
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
|
|
|
|
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
|
|
|
|
buf1 [0] = 0;
|
|
buf2 [0] = 0;
|
|
msk [0] = 0;
|
|
num [0] = 0;
|
|
|
|
for (i = 0, j = 0; i < 16; i++)
|
|
{
|
|
if (lp -> host_map & (1 << i))
|
|
{
|
|
ptr1 = Univ_ulong_to_str (i + 1, ptr1, 10);
|
|
|
|
* ptr1 = L',';
|
|
ptr1 ++;
|
|
j ++;
|
|
|
|
msk [i] = L'1';
|
|
}
|
|
else
|
|
msk [i] = L'0';
|
|
}
|
|
|
|
for (i = 16, k = 0; i < 32; i++)
|
|
{
|
|
if (lp -> host_map & (1 << i))
|
|
{
|
|
ptr2 = Univ_ulong_to_str (i + 1, ptr2, 10);
|
|
|
|
* ptr2 = L',';
|
|
ptr2 ++;
|
|
k ++;
|
|
|
|
msk [i] = L'1';
|
|
}
|
|
else
|
|
msk [i] = L'0';
|
|
}
|
|
|
|
if (k)
|
|
{
|
|
ptr2 --;
|
|
// * ptr2 = L'.';
|
|
// ptr2 ++;
|
|
}
|
|
else if (j)
|
|
{
|
|
ptr1 --;
|
|
// * ptr1 = L'.';
|
|
// ptr1 ++;
|
|
}
|
|
|
|
* ptr1 = 0;
|
|
* ptr2 = 0;
|
|
|
|
* host_map = lp->host_map;
|
|
|
|
Univ_ulong_to_str ((* (lp->params)) . host_priority, num, 10); /* v2.1 */
|
|
|
|
if (lp->send_msg.state != HST_NORMAL)
|
|
{
|
|
UNIV_PRINT (("current host map is %08x and converging", lp->host_map));
|
|
if (internal) /* 1.03 */
|
|
{
|
|
LOG_MSGS3 (MSG_INFO_CONVERGING, num, buf1, buf2);
|
|
}
|
|
return IOCTL_CVY_CONVERGING;
|
|
}
|
|
|
|
/* if this host has the bins for the deafult rule, it is the default host (v2.1) */
|
|
|
|
else if (lp->pg_state[(* (lp->params)).num_rules].cmap != 0)
|
|
{
|
|
UNIV_PRINT (("current host map is %08x and converged as DEFAULT", lp->host_map));
|
|
if (internal) /* 1.03 */
|
|
{
|
|
LOG_MSGS3(MSG_INFO_MASTER, num, buf1, buf2);
|
|
}
|
|
return IOCTL_CVY_MASTER;
|
|
}
|
|
else
|
|
{
|
|
UNIV_PRINT (("current host map is %08x and converged (NON-DEFAULT)", lp->host_map));
|
|
if (internal) /* 1.03 */
|
|
{
|
|
LOG_MSGS3(MSG_INFO_SLAVE, num, buf1, buf2);
|
|
}
|
|
return IOCTL_CVY_SLAVE;
|
|
}
|
|
|
|
} /* end Load_hosts_query */
|
|
|
|
/*
 * Function: Load_query_packet_filter
 * Description: Read-only diagnostic counterpart of the packet filtering
 *              decision: for the given connection tuple, reports in pQuery
 *              whether this host WOULD accept or reject the packet, and why
 *              (load module inactive, port rule disabled, unconditional
 *              bucket ownership, matching connection descriptor found,
 *              connection dirty, or bucket owned elsewhere), along with the
 *              hashing results and any matching descriptor's state.  Does
 *              not modify load module state.
 * Parameters: pQuery - out: query results (accept/reject verdict, hash info,
 *                      descriptor info).
 *             lp - load module context.
 *             svr_ipaddr - server (virtual) IP address.
 *             svr_port - server port.
 *             client_ipaddr - client IP address.
 *             client_port - client port.
 *             protocol - IP protocol of the hypothetical packet.
 *             limit_map_fn - TRUE => hash on client identity only (BDA).
 * Returns: VOID - results are returned through pQuery->Results.
 * Author: shouse, 5.18.01
 * Notes: Mirrors the hashing logic of Load_create_dscr; the two must stay in
 *        sync for the report to be accurate.
 */
VOID Load_query_packet_filter (
    PIOCTL_QUERY_STATE_PACKET_FILTER pQuery,
    PLOAD_CTXT lp,
    ULONG svr_ipaddr,
    ULONG svr_port,
    ULONG client_ipaddr,
    ULONG client_port,
    USHORT protocol,
    BOOLEAN limit_map_fn)
{
    PBIN_STATE bp;
    ULONG id;
    ULONG bin;
    QUEUE * qp;

    /* This variable is used for port rule lookup and since the port rules only cover
       UDP and TCP, we categorize as TCP and non-TCP, meaning that any protocol that's
       not TCP will be treated like UDP for the sake of port rule lookup. */
    BOOLEAN is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);

    /* Further, some protocols are treated with "session" semantics, while others are
       not.  For TCP, this "session" is currently a single TCP connection, which is
       tracked from SYN to FIN using a connection descriptor.  IPSec "sessions" are
       also tracked using descriptors, so even though its treated like UDP for port
       rule lookup, its treated with the session semantics resembling TCP.  Therefore,
       by default the determination of a session packet is initially the same as the
       determination of a TCP packet. */
    BOOLEAN is_session_pkt = is_tcp_pkt;

    /* If we have enabled IPSec session tracking, then if the protocol is IPSec, this
       packet should also be treated as part of an existing session. */
    if (NLB_IPSEC_SESSION_SUPPORT_ENABLED() && (protocol == TCPIP_PROTOCOL_IPSEC1)) is_session_pkt = TRUE;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* If the load module has been "turned off", then we drop the packet. */
    if (!lp->active) {
        pQuery->Results.Accept = NLB_REJECT_LOAD_MODULE_INACTIVE;
        return;
    }

    /* Find the port rule for this server IP address / port pair. */
    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    UNIV_ASSERT ((is_tcp_pkt && bp->prot != CVY_UDP) || (!is_tcp_pkt && bp->prot != CVY_TCP));

    /* If the matching port rule is configured as "disabled", which means to drop any
       packets that match the rule, then we drop the packet. */
    if (bp->mode == CVY_NEVER) {
        pQuery->Results.Accept = NLB_REJECT_PORT_RULE_DISABLED;
        return;
    }

    /* Apply the NLB hashing algorithm on the client identification.  If for reasons
       such as BDA teaming, we have chosen to limit the map function, we hard code the
       second parameter, rather than use some of the server identification in an
       effort to make the processing of this packet agnostic to the server identity.
       The hashing parameters also, of course, depend on the configured affinity
       settings for the retrieved port rule. */
    if (limit_map_fn) {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
    } else {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, ((svr_port << 16) + client_port));
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, svr_ipaddr);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* Find the applicable "bucket" by a modulo operation on the number of bins, 60. */
    bin = id % CVY_MAXBINS;

    /* At this point, we can begin providing the requestee some actual information about
       the state of the load module to better inform them as to why the decision we return
       them was actually made.  Here will provide some appropriate information about the
       port rule we are operating on, including the "bucket" ID, the current "bucket"
       ownership map and the number of connections active on this "bucket". */
    pQuery->Results.HashInfo.Valid = TRUE;
    pQuery->Results.HashInfo.Bin = bin;
    pQuery->Results.HashInfo.CurrentMap = bp->cmap;
    pQuery->Results.HashInfo.AllIdleMap = bp->all_idle_map;
    pQuery->Results.HashInfo.ActiveConnections = bp->nconn[bin];

    /* check bin for residency and all other hosts now idle on their bins; in this
       case and if we do not have dirty connections, we must be able to handle the packet */

    /* If we currently own the "bucket" to which this connection maps and either NLB provides
       no session support for this protocol, or all other hosts have no existing connections
       on this "bucket" and we have no dirty connections, then we can safely take the packet
       with no regard to the connection (session) descriptors. */
    if (((bp->cmap & (((MAP_T) 1) << bin)) != 0) && (!is_session_pkt || (((bp->all_idle_map & (((MAP_T) 1) << bin)) != 0) && (!(lp->cln_waiting))))) {
        pQuery->Results.Accept = NLB_ACCEPT_UNCONDITIONAL_OWNERSHIP;
        return;

    /* Otherwise, if there are active connections on this "bucket" or if we own the
       "bucket" and there are dirty connections on it, then we'll walk our descriptor
       lists to determine whether or not we should take the packet or not. */
    } else if (bp->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && ((bp->cmap & (((MAP_T) 1) << bin)) != 0))) {
        PCONN_ENTRY ep;
        PCONN_DESCR dp;

        /* Calculate our index into the descriptor hash table by a modulo operation on the
           length of the static descriptor array, 4096. */
        id = id % CVY_MAX_CHASH;

        /* Grab a pointer to the descriptor in our spot in the hash table. */
        ep = &(lp->hashed_conn[id]);

        /* Grab a pointer to our assigned queue of descriptors - our second level hashing. */
        qp = &(lp->connq[id]);

        /* First look for a match in the first-level hashing array. */
        if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
            /* If we find a match in the static hash table, fill in some descriptor
               information for the user, including whether or not the descriptor was
               allocated or static (static is this case) and the observed FIN count. */
            pQuery->Results.DescriptorInfo.Valid = TRUE;
            pQuery->Results.DescriptorInfo.Alloc = ep->alloc;
            pQuery->Results.DescriptorInfo.Dirty = ep->dirty;
            pQuery->Results.DescriptorInfo.FinCount = ep->fin_count;

            /* If the connection is dirty, we do not take the packet because TCP may
               have stale information for this descriptor. */
            if (ep->dirty) {
                pQuery->Results.Accept = NLB_REJECT_CONNECTION_DIRTY;
                return;
            }

            /* If the connection is not dirty, we'll take the packet, as it belongs
               to an existing connection that we are servicing on this host. */
            pQuery->Results.Accept = NLB_ACCEPT_FOUND_MATCHING_DESCRIPTOR;
            return;

        /* Otherwise, we have to walk the second-level hashing linked list of connection
           (session) descriptors looking for a match. */
        } else {
            /* Walk the queue until we reach the end or find what we're looking for. */
            for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL; dp = (PCONN_DESCR)Queue_next(qp, &(dp->link))) {
                if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
                    /* If we find a match in the static hash table, fill in some descriptor
                       information for the user, including whether or not the descriptor was
                       allocated or static (allocated is this case) and the observed FIN count. */
                    pQuery->Results.DescriptorInfo.Valid = TRUE;
                    pQuery->Results.DescriptorInfo.Alloc = dp->entry.alloc;
                    pQuery->Results.DescriptorInfo.Dirty = dp->entry.dirty;
                    pQuery->Results.DescriptorInfo.FinCount = dp->entry.fin_count;

                    /* If the connection is dirty, we do not take the packet because TCP may
                       have stale information for this descriptor. */
                    if (dp->entry.dirty) {
                        pQuery->Results.Accept = NLB_REJECT_CONNECTION_DIRTY;
                        return;
                    }

                    /* If the connection is not dirty, we'll take the packet, as it belongs
                       to an existing connection that we are servicing on this host. */
                    pQuery->Results.Accept = NLB_ACCEPT_FOUND_MATCHING_DESCRIPTOR;
                    return;
                }
            }
        }
    }

    /* If we get all the way down here, then we aren't going to accept the packet
       because we do not own the "bucket" to which the packet maps and we have no
       existing connection (session) state to allow us to service the packet. */
    pQuery->Results.Accept = NLB_REJECT_OWNED_ELSEWHERE;
    return;
}
|
|
|
|
#if defined (SBH)
|
|
|
|
/*
 * Function: Load_packet_filter
 * Description: Skeleton of a unified packet-filter entry point that would
 *              dispatch on the connection status: descriptor creation for
 *              CVY_CONN_CREATE/CVY_CONN_UP, descriptor removal for
 *              CVY_CONN_DOWN/CVY_CONN_RESET, and a hash/queue lookup for
 *              CVY_CONN_DATA.
 * Parameters: lp - load module context.
 *             svr_ipaddr / svr_port - server (virtual) IP address and port.
 *             client_ipaddr / client_port - client IP address and port.
 *             protocol - IP protocol of the packet.
 *             conn_status - CVY_CONN_* status code driving the dispatch.
 *             limit_map_fn - TRUE => hash on client identity only (BDA).
 * Returns: BOOLEAN - intended accept/reject verdict.
 * Author: shouse, 5.18.01
 * Notes: NOTE(review): this is placeholder code compiled only under SBH.
 *        BIN_LOOKUP/HASH/CREATE_DSCR/REMOVE_DSCR/CHECK_HASH/SEARCH_QUEUE are
 *        presumably macros to be defined elsewhere - they are not visible in
 *        this file.  Also note the function is declared BOOLEAN but contains
 *        no return statement; using its return value is undefined behavior -
 *        must be fixed before SBH is ever enabled.
 */
BOOLEAN Load_packet_filter (
    PLOAD_CTXT lp,
    ULONG svr_ipaddr,
    ULONG svr_port,
    ULONG client_ipaddr,
    ULONG client_port,
    USHORT protocol,
    ULONG conn_status,
    BOOLEAN limit_map_fn)
{

    BIN_LOOKUP();

    HASH();

    switch (conn_status) {
    case CVY_CONN_CREATE:
        CREATE_DSCR();
        break;
    case CVY_CONN_UP:
        CREATE_DSCR();
        break;
    case CVY_CONN_DOWN:
    case CVY_CONN_RESET:
        REMOVE_DSCR();
        break;
    case CVY_CONN_DATA:
        // protocol dependent.
        CHECK_HASH();
        SEARCH_QUEUE();
        break;
    }
}
|
|
|
|
#endif
|