windows-nt/Source/XPSP1/NT/base/cluster/service/gs/receive.c
2020-09-26 16:20:57 +08:00

707 lines
16 KiB
C

/*++
Copyright (c) 2000 Microsoft Corporation
Module Name:
receive.c
Abstract:
Receive-side message handlers; also sends reply and ack packets
Author:
Ahmed Mohamed (ahmedm) 12, 01, 2000
Revision History:
--*/
#include "gs.h"
#include "gsp.h"
#include <stdio.h>
#include <assert.h>
//
// GspDumpQueue
//
// Debug aid: writes one state_log line for every message linked from the
// group's receive-queue head, then one summary line with the head pointer,
// the next-to-dispatch link and the <mseq, bnum> the receiver expects.
//
// If more than 100 entries are walked the list is reported as looping and
// halt(1) is called.
//
void
GspDumpQueue(gs_group_t *gd)
{
    gs_msg_t *cur = gd->g_recv.r_head;
    int walked = 0;

    while (cur != NULL) {
        state_log(("Msg %x: nid %d gid %d type %d mseq %d bnum %d flags %x cnt %d\n",
                   cur, cur->m_hdr.h_sid, cur->m_hdr.h_gid, cur->m_hdr.h_type,
                   cur->m_hdr.h_mseq, cur->m_hdr.h_bnum, cur->m_hdr.h_flags,
                   cur->m_refcnt));

        walked++;
        if (walked > 100) {
            // a sane queue is never this long; assume the links form a cycle
            err_log(("Infinite loop\n"));
            halt(1);
        }

        cur = cur->m_next;
    }

    state_log(("Head %x Next %x expecting <%d, %d.\n",
               gd->g_recv.r_head,
               gd->g_recv.r_next,
               gd->g_recv.r_mseq,
               gd->g_recv.r_bnum));
}
//
// GspRemoveMsg
//
// Takes msg off the group's receive queue (when it is flagged as queued) and
// then hands it to msg_free().
//
// Every entry sitting in front of msg is unlinked from the head and passed to
// msg_free() first. msg itself is only unlinked when it is not a continued
// message; continued messages stay linked to simplify recovery.
//
// gd  - group owning the receive queue
// msg - message to remove
//
void
GspRemoveMsg(gs_group_t *gd, gs_msg_t *msg)
{
    gs_log(("Remove gid %d seq %d msg %x\n", gd->g_id,
            msg->m_hdr.h_mseq, msg));
    GspDumpQueue(gd);

    if (msg->m_hdr.h_flags & GS_FLAGS_QUEUED) {
        // pop everything that precedes msg
        for (;;) {
            gs_msg_t *front = gd->g_recv.r_head;

            if (front == msg) {
                break;
            }
            if (front == NULL) {
                // msg claims to be queued but was not found on the list
                err_log(("Internal error: null head during remove %x\n", msg));
                GspDumpQueue(gd);
                halt(1);
                break;
            }

            front->m_hdr.h_flags &= ~GS_FLAGS_QUEUED;
            gd->g_recv.r_head = front->m_next;
            msg_free(front);
        }

        // delay the freeing of continued messages to simplify recovery
        if (!(msg->m_hdr.h_flags & GS_FLAGS_CONTINUED)) {
            msg->m_refcnt--;
            msg->m_hdr.h_flags &= ~GS_FLAGS_QUEUED;
            gd->g_recv.r_head = msg->m_next;

            // the dispatch cursor must not keep pointing into an unlinked msg
            if (gd->g_recv.r_next == &msg->m_next) {
                gd->g_recv.r_next = &gd->g_recv.r_head;
            }
        }
    }

    msg_free(msg);
    GspDumpQueue(gd);
}
//
// GspCleanQueue
//
// Trims the front of the group's receive queue: every leading entry whose
// h_mseq is below mseq is unlinked and handed to msg_free(). Trimming stops
// at the first entry with h_mseq >= mseq or when the queue is empty.
//
// gd   - group owning the receive queue
// mseq - entries with a sequence number lower than this are discarded
//
void
GspCleanQueue(gs_group_t *gd, gs_sequence_t mseq)
{
    gs_log(("Clean gid %d seq %d\n", gd->g_id, mseq));
    GspDumpQueue(gd);

    for (;;) {
        gs_msg_t *oldest = gd->g_recv.r_head;

        if (oldest == NULL || oldest->m_hdr.h_mseq >= mseq) {
            break;
        }

        // if the dispatch cursor sits on this entry, park it on the head link
        if (gd->g_recv.r_next == &oldest->m_next) {
            gd->g_recv.r_next = &gd->g_recv.r_head;
        }

        oldest->m_hdr.h_flags &= ~GS_FLAGS_QUEUED;
        gd->g_recv.r_head = oldest->m_next;
        msg_free(oldest);
    }

    GspDumpQueue(gd);
}
//
// GspUOrderInsert
//
// Links the chain head..tail into the undelivered part of the receive queue
// (starting at the dispatch cursor r_next), in front of the first entry whose
// h_mseq is greater than mseq. If there is no such entry the chain is
// appended at the end and the queue is dumped to the log.
//
// No duplicate detection is performed here.
//
// gd         - group owning the receive queue
// head, tail - first and last message of the chain to insert
// mseq       - sequence number used for ordering
// bnum       - only logged; not used for ordering
//
void
GspUOrderInsert(gs_group_t *gd, gs_msg_t *head, gs_msg_t *tail,
                gs_sequence_t mseq, gs_sequence_t bnum)
{
    gs_msg_t **link;

    gs_log(("Add ucast gid %d mseq %d,%d head %x tail %x @ next %x\n",
            gd->g_id, mseq, bnum,
            head, tail, gd->g_recv.r_next));

    for (link = gd->g_recv.r_next; *link != NULL; link = &(*link)->m_next) {
        if ((*link)->m_hdr.h_mseq > mseq) {
            // splice in front of the first later message
            tail->m_next = *link;
            *link = head;
            return;
        }
    }

    // reached the end of the list: append
    tail->m_next = NULL;
    *link = head;
    GspDumpQueue(gd);
}
//
// GspOrderInsert
//
// Inserts the chain head..tail into the group's receive queue ordered by
// <mseq, bnum>.
//
// The chain is dropped (msg_free(head)) instead of inserted when:
//   - <mseq, bnum> is older than what the receiver currently expects, or
//   - head is a replayed message and an entry with the same <mseq, bnum> is
//     already present in the delivered part of the queue (r_head up to the
//     dispatch cursor r_next), or
//   - an entry with the same <mseq, bnum> is found in the undelivered part.
//
// gd         - group owning the receive queue
// head, tail - first and last message of the chain to insert
// mseq, bnum - ordering key of the chain
//
void
GspOrderInsert(gs_group_t *gd, gs_msg_t *head, gs_msg_t *tail,
               gs_sequence_t mseq, gs_sequence_t bnum)
{
    gs_msg_t **p;

    // check if we have already processed this sequence
    if (mseq < gd->g_recv.r_mseq || (mseq == gd->g_recv.r_mseq &&
                                     bnum < gd->g_recv.r_bnum)) {
        gs_log(("Droping msg %d,%d @ %d,%d\n", mseq, bnum,
                gd->g_recv.r_mseq, gd->g_recv.r_bnum));
        msg_free(head);
        return;
    }

    if (head->m_hdr.h_flags & GS_FLAGS_REPLAY) {
        // A replayed message may already have been dispatched but still be
        // held on the queue; look for it between the head and the cursor.
        // The link must be advanced on every iteration, otherwise the scan
        // never terminates when the first entry is not the duplicate.
        for (p = &gd->g_recv.r_head;
             p != gd->g_recv.r_next && *p != NULL;
             p = &(*p)->m_next) {
            if ((*p)->m_hdr.h_mseq == mseq && (*p)->m_hdr.h_bnum == bnum) {
                gs_log(("duplicate pending type %d mseq %d bnum %d\n",
                        head->m_hdr.h_type, mseq, bnum));
                msg_free(head);
                return;
            }
        }
    }

    // insert msg into proper order in receive queue
    gs_log(("Add gid %d mseq %d,%d head %x tail %x @ next %x\n",
            gd->g_id, mseq, bnum,
            head, tail, gd->g_recv.r_next));

    p = gd->g_recv.r_next;
    while (*p) {
        if ((*p)->m_hdr.h_mseq > mseq ||
            ((*p)->m_hdr.h_mseq == mseq && (*p)->m_hdr.h_bnum > bnum)) {
            // first entry that sorts after us: splice in front of it
            tail->m_next = *p;
            *p = head;
            return;
        } else if ((*p)->m_hdr.h_mseq == mseq && (*p)->m_hdr.h_bnum == bnum) {
            // only a single replayed message is expected to collide
            assert(head->m_hdr.h_flags & GS_FLAGS_REPLAY);
            assert(head == tail);
            gs_log(("duplicate type %d mseq %d bnum %d\n", head->m_hdr.h_type,mseq, bnum));
            msg_free(head);
            return;
        }
        p = &(*p)->m_next;
    }

    // add at tail of history queue (*p is NULL here)
    tail->m_next = *p;
    *p = head;
    GspDumpQueue(gd);
}
//
// GspReplyMsgHandler
//
// Handler for an incoming reply-type message. Looks up the group and the
// sender-side context named in the header, checks that the context still has
// an outstanding message with the same mseq, and passes the payload plus the
// NTSTATUS carried in h_tag to GspProcessReply(). The message is always
// handed to msg_free() before returning.
//
// A message for a group that cannot be found is dropped, matching the other
// handlers in this file. A context without an outstanding message, or an mseq
// mismatch, is logged as an internal error and halt(1) is called.
//
void
GspReplyMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr;
    gs_group_t *gd;
    gs_context_t *ctx;

    hdr = &msg->m_hdr;

    // find group using group internal identifier
    gd = GspLookupGroup(hdr->h_gid);
    if (gd == NULL) {
        // unknown group: nothing is waiting for this reply
        msg_free(msg);
        return;
    }

    GsLockEnter(gd->g_lock);

    // find context in waiting queue
    ctx = GspLookupContext(gd, hdr->h_cid);
    assert(ctx != NULL);

    if (ctx->ctx_msg == NULL) {
        err_log(("Internal error gid %d ctx %d mseq %d bnum %d flags %x mask %x\n",
                 ctx->ctx_gid, ctx->ctx_id, ctx->ctx_mseq, ctx->ctx_bnum,
                 ctx->ctx_flags, ctx->ctx_mask));
        err_log(("Internal error msg sid %d mid %d gid %d ctx %d mseq %d bnum %d flags %x\n",
                 hdr->h_sid, hdr->h_mid,
                 hdr->h_gid, hdr->h_cid, hdr->h_mseq, hdr->h_bnum, hdr->h_flags));
        halt(1);
    }
    assert(ctx->ctx_msg != NULL);

    // the reply must belong to the message this context is waiting on
    if (ctx->ctx_msg->m_hdr.h_mseq != hdr->h_mseq) {
        err_log(("Internal error ctx %d %d reply %d mismatch %d\n",
                 ctx->ctx_id, hdr->h_cid,
                 hdr->h_mseq,
                 ctx->ctx_msg->m_hdr.h_mseq));
        halt(1);
    }

    // h_tag carries the replier's completion status
    GspProcessReply(gd, ctx, msg->m_hdr.h_sid, msg->m_buf, msg->m_hdr.h_len,
                    *((NTSTATUS *)msg->m_hdr.h_tag));

    GsLockExit(gd->g_lock);
    msg_free(msg);
}
//
// GspSendAck
//
// Acknowledges msg with the given status.
//
// Nothing is done when the message carries the cookie value -1. If the
// message originated on this node the status is fed straight into
// GspProcessReply() for the sender's context; otherwise a zero-length
// GS_MSG_TYPE_ACK header, built from a copy of the original header with the
// status stored in h_tag, is sent back to the originating node.
//
// gd     - group the message belongs to
// msg    - message being acknowledged
// status - completion status to report
//
void
GspSendAck(gs_group_t *gd, gs_msg_t *msg, NTSTATUS status)
{
    gs_msg_hdr_t *src = &msg->m_hdr;
    gs_msg_hdr_t ack;
    gs_context_t *ctx;

    if (src->h_cid == (gs_cookie_t) -1) {
        return;
    }

    gs_log(("Ack nid %d msg %x flags %x\n",src->h_sid, msg,
            msg->m_hdr.h_flags));

    if (src->h_sid == gd->g_nid) {
        // local sender: complete the context directly
        ctx = GspLookupContext(gd, src->h_cid);
        GspProcessReply(gd, ctx, gd->g_nid, NULL, 0, status);
        return;
    }

    // remote sender: turn the header around and ship it back
    memcpy(&ack, src, sizeof(ack));
    ack.h_sid = (gs_memberid_t) gd->g_nid;
    ack.h_mid = src->h_sid;
    ack.h_type = GS_MSG_TYPE_ACK;
    ack.h_len = 0;
    *((NTSTATUS *)ack.h_tag) = status;
    msg_send(src->h_sid, &ack, NULL, 0);
}
//
// GsSendReply
//
// Sends the application's reply for a previously delivered message.
//
// cookie - the message handle that was passed to the group callback
// buf    - reply payload (may be NULL when len is 0)
// len    - payload length in bytes; must be >= 0 and no larger than the
//          reply length recorded in the message header (h_rlen)
// status - completion status to report to the sender
//
// Returns:
//   ERROR_SUCCESS            - reply was processed locally or sent
//   ERROR_INVALID_PARAMETER  - NULL cookie, bad length, or unknown group
//   ERROR_INVALID_OPERATION  - the message has already been replied to
//
// On success the message is handed to msg_free(); on failure it is left
// untouched.
//
NTSTATUS
WINAPI
GsSendReply(HANDLE cookie, PVOID buf, int len, NTSTATUS status)
{
    gs_group_t *gd;
    gs_msg_t *msg = (gs_msg_t *)cookie;
    NTSTATUS err = ERROR_SUCCESS;

    // a negative len would otherwise be truncated by the (UINT16) cast below
    if (msg == NULL || len < 0 || msg->m_hdr.h_rlen < len)
        return ERROR_INVALID_PARAMETER;

    // find group
    gd = GspLookupGroup(msg->m_hdr.h_gid);
    if (gd == NULL)
        return ERROR_INVALID_PARAMETER;

    GsLockEnter(gd->g_lock);

    if (!(msg->m_hdr.h_flags & GS_FLAGS_REPLY) &&
        msg->m_hdr.h_rlen >= len) {

        // mark msg state
        msg->m_hdr.h_flags |= GS_FLAGS_REPLY;

        gs_log(("Reply msg %x flags %x len %x ubuf %x ulen %x\n",msg,
                msg->m_hdr.h_flags, msg->m_hdr.h_rlen, buf, len));

        if (msg->m_hdr.h_sid == gd->g_nid) {
            // local reply
            gs_context_t *ctx;

            // find context in waiting queue
            ctx = GspLookupContext(gd, msg->m_hdr.h_cid);
            assert(ctx != NULL);
            assert(ctx->ctx_msg->m_hdr.h_mseq == msg->m_hdr.h_mseq);

            GspProcessReply(gd, ctx, msg->m_hdr.h_sid, (char *)buf, len, status);
        } else {
            // remote reply: reuse the request header, turned around
            gs_msg_hdr_t rhdr;

            memcpy(&rhdr, &msg->m_hdr, sizeof(rhdr));
            rhdr.h_sid = gd->g_nid;
            rhdr.h_mid = msg->m_hdr.h_sid;
            rhdr.h_type = GS_MSG_TYPE_REPLY;
            rhdr.h_len = (UINT16) len;
            *((NTSTATUS *)rhdr.h_tag) = status;
            msg_send(rhdr.h_mid, &rhdr, (const char *)buf, len);
        }

        // release msg
        msg_free(msg);
    } else {
        gs_log(("Reply failed %x: flags %x len %x ubuf %x ulen %x\n",msg,
                msg->m_hdr.h_flags, msg->m_hdr.h_rlen, buf, len));
        err = ERROR_INVALID_OPERATION;
    }

    GsLockExit(gd->g_lock);
    return err;
}
//
// Maps a message type value (used as the array index) to the event id that
// is reported to the group callback. Slots that have no callback event hold
// GsEventInvalid.
//
static const gs_eventid_t GsTypeToEventId[] = {
    GsEventInvalid,         //  0
    GsEventInvalid,         //  1
    GsEventData,            //  2
    GsEventInvalid,         //  3
    GsEventSingleData,      //  4
    GsEventInvalid,         //  5
    GsEventInvalid,         //  6
    GsEventInvalid,         //  7
    GsEventInvalid,         //  8
    GsEventMemberJoin,      //  9
    GsEventMemberUp,        // 10
    GsEventInvalid,         // 11
    GsEventMemberEvicted,   // 12
    GsEventInvalid,         // 13
    GsEventMemberDown       // 14
};

// number of slots in GsTypeToEventId
#define GS_TYPE_TO_EVENT_COUNT \
    (sizeof(GsTypeToEventId) / sizeof(GsTypeToEventId[0]))

//
// GsMsgTypeToEventId(x)
//
// GS_MSG_TYPE_ABORT maps to GsEventAbort; any other type is looked up in the
// table. A type that falls outside the table yields GsEventInvalid instead of
// reading past the end of the array. The argument is evaluated more than
// once, so it must not have side effects.
//
#define GsMsgTypeToEventId(x) \
    ((x) == GS_MSG_TYPE_ABORT ? GsEventAbort : \
     ((unsigned int)(x) < GS_TYPE_TO_EVENT_COUNT ? \
      GsTypeToEventId[(x)] : GsEventInvalid))
//
// GspSyncMember
//
// Re-sends to member mid every message on the receive queue that
//   - was sent by this node,
//   - has a sequence number greater than mseq,
//   - is not a unicast, and
//   - whose send context does not yet have mid's bit set in ctx_mask.
// The bit is set before the message is sent.
//
// gd   - group whose queue is scanned
// mid  - member to bring up to date
// mseq - messages at or below this sequence number are skipped
//
void
GspSyncMember(gs_group_t *gd, gs_memberid_t mid, gs_sequence_t mseq)
{
    gs_msg_t *cur;

    // forward all messages that we have sent with higher sequence number
    for (cur = gd->g_recv.r_head; cur != NULL; cur = cur->m_next) {
        gs_context_t *ctx;

        if (cur->m_hdr.h_sid != gd->g_nid) {
            continue;           // not ours
        }
        if (cur->m_hdr.h_mseq <= mseq) {
            continue;           // member already covers this sequence
        }
        if (cur->m_hdr.h_type == GS_MSG_TYPE_UCAST) {
            continue;           // unicasts are not forwarded
        }

        ctx = &gd->g_send.s_ctxpool[cur->m_hdr.h_cid];
        assert(ctx->ctx_msg == cur);

        if (ctx->ctx_mask & (1 << mid)) {
            continue;           // member is already part of this send
        }

        recovery_log(("sync node %d mseq %d\n", mid, cur->m_hdr.h_mseq));
        ctx->ctx_mask |= (1 << mid);
        msg_send(mid, &cur->m_hdr, cur->m_buf, cur->m_hdr.h_len);
    }
}
//
// GspDeliverMsg
//
// Delivers one message to the group's callback. Entered and left with the
// group lock held; the lock is released for the duration of the callback.
//
// For a GS_MSG_TYPE_UP message the member named in h_tag is added to the
// group and brought up to date before the callback runs.
//
// A reference is taken on msg before the callback. If the callback returns
// STATUS_PENDING nothing more is done here. Otherwise, if no reply has been
// flagged yet, the message is marked replied, the reference is dropped and an
// ack carrying the callback's status is sent. Unicast messages are then
// removed from the receive queue.
//
void
GspDeliverMsg(gs_group_t *gd, gs_msg_t *msg)
{
    IO_STATUS_BLOCK iosb;
    NTSTATUS rc;

    if (msg->m_hdr.h_type == GS_MSG_TYPE_UP) {
        gs_memberid_t joined = *((gs_memberid_t *)msg->m_hdr.h_tag);

        GspAddMember(gd, joined, *(int *)msg->m_buf);
        GspSyncMember(gd, joined, msg->m_hdr.h_mseq);
        recovery_log(("New membership gid %d view %d,%d sz %d set %x\n",
                      gd->g_id,
                      gd->g_startview, gd->g_curview, gd->g_sz, gd->g_mset));
    }

    // hold msg while the lock is dropped
    msg->m_refcnt++;
    GsLockExit(gd->g_lock);

    iosb.Status = GsMsgTypeToEventId(msg->m_hdr.h_type);
    iosb.Information = msg->m_hdr.h_len;
    rc = gd->g_callback((HANDLE)msg, msg->m_hdr.h_tag, msg->m_buf, &iosb);

    GsLockEnter(gd->g_lock);

    if (rc == STATUS_PENDING) {
        // the callback keeps the handle and replies later
        gs_log(("Reply msg pending %x\n", msg));
        return;
    }

    if (!(msg->m_hdr.h_flags & GS_FLAGS_REPLY)) {
        msg->m_hdr.h_flags |= GS_FLAGS_REPLY;
        // release msg
        msg->m_refcnt--;
        GspSendAck(gd, msg, rc);
    }

    if (msg->m_hdr.h_type == GS_MSG_TYPE_UCAST) {
        msg->m_refcnt++;
        msg->m_hdr.h_flags &= ~GS_FLAGS_CONTINUED;
        GspRemoveMsg(gd, msg);
    }
}
//
// GspDispatch
//
// Delivers, in order, every message at the dispatch cursor (r_next) that is
// ready, and advances the expected <mseq, bnum> after each one.
//
// A non-unicast message is ready when its <mseq, bnum> equals the expected
// pair; a unicast is ready when its mseq is not ahead of the expected mseq.
// The loop stops at the first message that is not ready, at the end of the
// queue, or when g_pending is already set on entry to an iteration.
//
// GS_MSG_TYPE_SKIP messages advance the sequence without being delivered.
//
void
GspDispatch(gs_group_t *gd)
{
    gs_msg_t *cur;

    assert(gd->g_recv.r_next != NULL);

    while (gd->g_pending == 0 && (cur = *(gd->g_recv.r_next)) != NULL) {
        int ready;
        int saved_flags;

        // compare sequence numbers
        if (cur->m_hdr.h_type == GS_MSG_TYPE_UCAST) {
            ready = (gd->g_recv.r_mseq >= cur->m_hdr.h_mseq);
        } else {
            ready = (gd->g_recv.r_mseq == cur->m_hdr.h_mseq &&
                     gd->g_recv.r_bnum == cur->m_hdr.h_bnum);
        }
        if (!ready) {
            break;
        }

        gd->g_pending = 1;

        cur->m_hdr.h_flags &= ~GS_FLAGS_REPLY;
        saved_flags = cur->m_hdr.h_flags;

        gs_log(("dispatch seq <%d, %d> flags %x msg %x @ next %x\n",
                cur->m_hdr.h_mseq,
                cur->m_hdr.h_bnum,
                saved_flags, cur, gd->g_recv.r_next));

        // advance next msg to deliver
        gd->g_recv.r_next = &cur->m_next;

        // only saved_flags is used after delivery; the message itself may
        // have been freed as part of delivery
        if (cur->m_hdr.h_type != GS_MSG_TYPE_SKIP) {
            GspDeliverMsg(gd, cur);
        }

        if (saved_flags & GS_FLAGS_CONTINUED) {
            // continued msg: stay on the same mseq, step the low part of bnum
            if (!(saved_flags & GS_FLAGS_PTP)) {
                gd->g_recv.r_bnum++;
            }
        } else if (saved_flags & GS_FLAGS_LAST) {
            gd->g_recv.r_bnum = 0;
            gd->g_recv.r_mseq++;
        } else if (!(saved_flags & GS_FLAGS_PTP)) {
            gd->g_recv.r_bnum += (1 << 16);
        }

        gd->g_pending = 0;
    }

    gs_log(("waiting gid %d expect <%d, %d>\n",
            gd->g_id, gd->g_recv.r_mseq, gd->g_recv.r_bnum));
    GspDumpQueue(gd);
}
// The block below is compiled out by #if 0 and is not valid C as written
// (untyped parameters, `Return`, `Ios`, an elided argument list). It refers
// to fields such as recv_lock / recv_last that nothing else in this file
// section uses.
// NOTE(review): it reads like a design sketch for a pull-style receive call
// in which the caller fetches the next ready message or queues its request --
// confirm whether it is still wanted or can be deleted.
#if 0
WINAPI
GsReceiveRequest(gd, buf, len, ios)
{
GsLockEnter(gd->recv_lock);
m = gd->recv_last;
// advance receive window
if (m && m->state == MSG_STATE_DELIVERED) {
if (m->flags & GS_FLAGS_DELIVERED) {
msg_send_reply(m->srcid, m->mseq, m->cseq..);
m->reply = 1;
}
m->state = MSG_STATE_DONE;
// check if this msg can be freed before moving to next one
m = m->next;
}
if (m && m->state == MSG_STATE_READY) {
m->state = MSG_STATE_DELIVERED;
GsLockExit(gd->recv_lock);
memcpy(buf, m->data, m->len);
Ios->status = m->srcid;
Ios->information = m->len;
Return SUCCESS;
}
// queue request
irp->next = gd->recv_pending_queue;
gd->recv_pending_queue = irp;
GsLockExit(gd->recv_lock);
Return PENDING;
}
#endif
//
// GspMcastMsgHandler
//
// Handler for an incoming multicast message. The message is dropped unless
// its group exists and its view number validates. Otherwise it is flagged as
// queued, inserted in order into the receive queue, the queue is dispatched,
// and entries below the h_lseq value carried by the message are cleaned out.
//
void
GspMcastMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd = GspLookupGroup(hdr->h_gid);
    gs_sequence_t lseq;

    // accept messages only if in a valid view
    if (gd == NULL || !GspValidateView(gd, msg->m_hdr.h_viewnum)) {
        msg_free(msg);
        return;
    }

    // read h_lseq up front: the insert below may free the message
    lseq = msg->m_hdr.h_lseq;

    GsLockEnter(gd->g_lock);

    hdr->h_flags |= GS_FLAGS_QUEUED;
    // insert msg into dispatch queue at proper order
    GspOrderInsert(gd, msg, msg, hdr->h_mseq, hdr->h_bnum);
    GspDispatch(gd);
    GspCleanQueue(gd, lseq);

    GsLockExit(gd->g_lock);
}
//
// GspUcastMsgHandler
//
// Handler for an incoming unicast message. The message is logged and dropped
// unless its group exists and its view number validates. Otherwise it is
// flagged as queued, inserted with GspUOrderInsert(), the queue is
// dispatched, and entries below the h_lseq value carried by the message are
// cleaned out.
//
void
GspUcastMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd = GspLookupGroup(hdr->h_gid);
    gs_sequence_t lseq;

    if (gd == NULL || !GspValidateView(gd, msg->m_hdr.h_viewnum)) {
        gs_log(("Dropping ucast: gid %d nid %d mseq %d view %d\n", hdr->h_gid,
                hdr->h_mid, hdr->h_mseq, hdr->h_viewnum));
        msg_free(msg);
        return;
    }

    // read h_lseq before the message is handed to the queue
    lseq = msg->m_hdr.h_lseq;

    GsLockEnter(gd->g_lock);

    hdr->h_flags |= GS_FLAGS_QUEUED;
    // insert msg into dispatch queue at proper order
    GspUOrderInsert(gd, msg, msg, hdr->h_mseq, hdr->h_bnum);
    GspDispatch(gd);
    GspCleanQueue(gd, lseq);

    GsLockExit(gd->g_lock);
}
//
// GspSeqAllocMsgHandler
//
// Handler for a sequence-allocation request. If the group exists, the next
// value of the group's global sequence counter is taken under the group lock
// together with the current view number, and both are sent back to the
// requester as a GS_MSG_TYPE_SEQREPLY that reuses the request header.
// The request message is always handed to msg_free().
//
void
GspSeqAllocMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd = GspLookupGroup(hdr->h_gid);
    gs_seq_info_t info;

    if (gd == NULL) {
        msg_free(msg);
        return;
    }

    GsLockEnter(gd->g_lock);
    info.mseq = gd->g_global_seq++;
    info.viewnum = gd->g_curview;
    GsLockExit(gd->g_lock);

    // turn the header around: the requester becomes the destination
    hdr->h_mid = hdr->h_sid;
    hdr->h_sid = gd->g_nid;
    hdr->h_type = GS_MSG_TYPE_SEQREPLY;
    hdr->h_len = sizeof(info);

    gs_log(("SeqAlloc: nid %d mseq %d view %d\n",
            hdr->h_mid, info.mseq, info.viewnum));

    msg_send(hdr->h_mid, hdr, (char *) &info, sizeof(info));

    msg_free(msg);
}
//
// GspSeqReplyMsgHandler
//
// Handler for the reply to a sequence-allocation request. When the group
// exists and the header's view validates, the gs_seq_info_t payload is passed
// to GspProcessWaitQueue() under the group lock, provided that the payload's
// view also validates and the sender is the node recorded in gd->g_mid.
// The message is always handed to msg_free().
//
void
GspSeqReplyMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd;
    gs_seq_info_t *info;

    assert(hdr->h_len == sizeof(gs_seq_info_t));

    // find group using group internal identifier
    gd = GspLookupGroup(hdr->h_gid);
    if (gd == NULL || !GspValidateView(gd, hdr->h_viewnum)) {
        msg_free(msg);
        return;
    }

    info = (gs_seq_info_t *)msg->m_buf;

    GsLockEnter(gd->g_lock);
    // re-check under the lock, this time against the view in the payload
    if (GspValidateView(gd, info->viewnum) && hdr->h_sid == gd->g_mid) {
        GspProcessWaitQueue(gd, info);
    }
    GsLockExit(gd->g_lock);

    msg_free(msg);
}
//
// GspJoinRequestMsgHandler
//
// Handler for a join request. If the group exists, a gs_join_info_t holding
// the next global sequence number, the current view, the member set and the
// group size is captured under the group lock and sent back to the requester
// as a GS_MSG_TYPE_REPLY that reuses the request header.
// The request message is always handed to msg_free().
//
void
GspJoinRequestMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd = GspLookupGroup(hdr->h_gid);
    gs_join_info_t info;

    if (gd == NULL) {
        msg_free(msg);
        return;
    }

    // snapshot the group state atomically
    GsLockEnter(gd->g_lock);
    info.mseq = gd->g_global_seq++;
    info.viewnum = gd->g_curview;
    info.mset = gd->g_mset;
    info.sz = gd->g_sz;
    GsLockExit(gd->g_lock);

    // turn the header around: the requester becomes the destination
    hdr->h_mid = hdr->h_sid;
    hdr->h_sid = gd->g_nid;
    hdr->h_type = GS_MSG_TYPE_REPLY;
    hdr->h_len = sizeof(info);

    msg_send(hdr->h_mid, hdr, (char *) &info, sizeof(info));

    msg_free(msg);
}
//
// GspJoinUpMsgHandler
//
// Handler shared by the join and up message slots. The message is dropped
// unless its group exists and its view number validates. Otherwise it is
// flagged as queued, inserted in order into the receive queue, and the queue
// is dispatched. Unlike the multicast handler, the queue is not cleaned
// afterwards.
//
void
GspJoinUpMsgHandler(gs_msg_t *msg)
{
    gs_msg_hdr_t *hdr = &msg->m_hdr;
    gs_group_t *gd = GspLookupGroup(hdr->h_gid);

    // accept messages only if in a valid view
    if (gd == NULL || !GspValidateView(gd, msg->m_hdr.h_viewnum)) {
        msg_free(msg);
        return;
    }

    GsLockEnter(gd->g_lock);

    hdr->h_flags |= GS_FLAGS_QUEUED;
    // insert msg into dispatch queue at proper order
    GspOrderInsert(gd, msg, msg, hdr->h_mseq, hdr->h_bnum);
    GspDispatch(gd);

    GsLockExit(gd->g_lock);
}
// Handlers implemented outside this section of the file.
void GspInfoMsgHandler(gs_msg_t *);
void GspMmMsgHandler(gs_msg_t *);
void GspRecoveryMsgHandler(gs_msg_t *);
void GspSyncMsgHandler(gs_msg_t *);
// Table of per-message handlers, one slot per position (0..14).
// NOTE(review): presumably indexed by the h_type value of an incoming message
// header; the code that performs the lookup is not visible here -- confirm.
// Slots 11 and 12 are NULL, so whoever indexes this table must check for a
// NULL handler before calling through it.
gs_msg_handler_t gs_msg_handler[] = {
GspSeqAllocMsgHandler, // 0
GspSeqReplyMsgHandler, // 1
GspMcastMsgHandler, // 2
GspReplyMsgHandler, // 3
GspUcastMsgHandler, // 4
GspReplyMsgHandler, // 5: same handler as slot 3
GspInfoMsgHandler, // 6
GspMmMsgHandler, // 7
GspJoinRequestMsgHandler, // 8
GspJoinUpMsgHandler, // 9: join
GspJoinUpMsgHandler, // 10: up
NULL, // 11: evict request (no handler)
NULL, // 12: evict (no handler)
GspRecoveryMsgHandler, // 13
GspSyncMsgHandler // 14
};