windows-nt/Source/XPSP1/NT/base/cluster/resdll/ndquorum/crs.c
2020-09-26 16:20:57 +08:00

1063 lines
27 KiB
C

/*++
Copyright (c) 2001 Microsoft Corporation
Module Name:
crs.c
Abstract:
Implements Consistency Replica Set Algorithm
Author:
Ahmed Mohamed (ahmedm) 1-Jan-2001
Revision History:
--*/
#include <nt.h>
#include <ntdef.h>
#include <ntrtl.h>
#include <nturtl.h>
#include <windows.h>
#include <stdio.h>
#include <assert.h>
#include "crs.h"
#define xmalloc(size) VirtualAlloc(NULL, size,MEM_RESERVE|MEM_COMMIT,PAGE_READWRITE)
#define xfree(buffer) VirtualFree(buffer, 0, MEM_RELEASE|MEM_DECOMMIT)
#define CrspEqual(r1,r2) ((r1)->hdr.seq == (r2)->hdr.seq && \
(r1)->hdr.epoch == (r2)->hdr.epoch && \
(r1)->hdr.state == (r2)->hdr.state)
DWORD CrsForcedQuorumSize = 0xffff;
void
WINAPI
CrsSetForcedQuorumSize(DWORD size)
{
CrsForcedQuorumSize = size;
}
DWORD
CrspFindLast(CrsInfo_t *p, DWORD logsz)
{
CrsRecord_t *rec, *last_rec;
BOOL err;
DWORD n, i;
n = SetFilePointer(p->fh, 0, NULL, FILE_BEGIN);
if (n == INVALID_SET_FILE_POINTER) {
return GetLastError();
}
err = ReadFile(p->fh, p->buf, logsz, &n, NULL);
if (!err)
return GetLastError();
if (n != logsz) {
CrsLog(("Crs%d: failed to load complete file, read %d expected %d\n",
p->lid,
n, logsz));
return ERROR_BAD_LENGTH;
}
ASSERT(p->max_records * CRS_RECORD_SZ == (int)n);
if(p->max_records * CRS_RECORD_SZ != (int)n) {
CrsLog(("Crs%d: unable to load log file %d bytes, got %d bytes\n",
p->lid, n, logsz));
return ERROR_BAD_LENGTH;
}
CrsLog(("Crs%d: loaded %d bytes, %d records\n", p->lid,
n, p->max_records));
last_rec = NULL;
rec = p->buf;
for (i = 0; i < logsz; i += CRS_RECORD_SZ, rec++) {
if (rec->hdr.tag != CRS_TAG) {
CrsLog(("crs%d: Bad record %d, got %x expected %x\n",
p->lid,
i/CRS_RECORD_SZ, rec->hdr.tag, CRS_TAG));
return ERROR_BAD_FORMAT;
}
if (!last_rec ||
rec->hdr.epoch > last_rec->hdr.epoch ||
(rec->hdr.epoch == last_rec->hdr.epoch &&
(rec->hdr.seq > last_rec->hdr.seq))) {
last_rec = rec;
}
}
ASSERT(last_rec);
// make sure only the last record is not committed or aborted
rec = p->buf;
for (i = 0; i < logsz; i += CRS_RECORD_SZ, rec++) {
if (!(rec->hdr.state & (CRS_COMMIT | CRS_ABORT))) {
if (rec != last_rec) {
CrsLog(("crs:%d Bad record %d state %x expected commit|abort\n",
p->lid, i/CRS_RECORD_SZ, rec->hdr.state));
return ERROR_INTERNAL_ERROR;
}
}
}
p->last_record = (int) (last_rec - p->buf);
p->seq = last_rec->hdr.seq;
p->epoch = last_rec->hdr.epoch;
return ERROR_SUCCESS;
}
#define CrspFlush(p,offset) CrspWrite(p,offset, CRS_SECTOR_SZ)
static
DWORD
CrspWrite(CrsInfo_t *p, int offset, DWORD length)
{
DWORD n;
p->pending = FALSE;
n = (DWORD) offset;
// write out last sector, assumes lock is held
ASSERT(offset < p->max_records);
offset = offset / CRS_RECORDS_PER_SECTOR;
CrsLog(("Crs%d: flush %d bytes record %d -> %d,%d\n", p->lid,
length, n,
offset, offset*CRS_SECTOR_SZ));
n = SetFilePointer(p->fh, offset * CRS_SECTOR_SZ, NULL, FILE_BEGIN);
if (n == INVALID_SET_FILE_POINTER) {
return GetLastError();
}
n = 0;
if (WriteFile(p->fh, (PVOID) &p->buf[offset*CRS_RECORDS_PER_SECTOR], length, &n, NULL)) {
if (n != length) {
CrsLog(("Write count mismatch, wrote %d, expected %d\n", n, length));
return ERROR_BAD_LENGTH;
}
return ERROR_SUCCESS;
}
n = GetLastError();
CrsLog(("Crs%d: flush record %d failed err %d\n", p->lid, offset, n));
if (n == ERROR_UNEXP_NET_ERR) {
// repeat the write one more time
p->pending = TRUE;
}
return n;
}
static
DWORD
CrspAppendRecord(CrsInfo_t *p, CrsRecord_t *rr, CrsRecord_t **rec)
{
CrsRecord_t *q;
DWORD err;
// tag record
rr->hdr.tag = CRS_TAG;
// assumes lock is held
if ((p->last_record & CRS_SECTOR_MASK) == CRS_SECTOR_MASK) {
// flush current sector
err = CrspFlush(p, p->last_record);
if (err != ERROR_SUCCESS)
return err;
}
// advance last record
p->last_record++;
if (p->last_record == p->max_records)
p->last_record = 0;
CrsLog(("Crs%d: append record %d epoch %I64d seq %I64d state %x\n",
p->lid, p->last_record,
rr->hdr.epoch, rr->hdr.seq, rr->hdr.state));
// copy record
q = &p->buf[p->last_record];
memcpy((PVOID)q, (PVOID) rr, CRS_RECORD_SZ);
// flush it out now
err = CrspFlush(p, p->last_record);
if (err == ERROR_SUCCESS) {
if (rec) *rec = q;
} else {
if (p->last_record == 0)
p->last_record = p->max_records;
p->last_record--;
}
return err;
}
// NextRecord:
// if seq is null, fill in last record and return SUCCESS
// if seq is not found, return NOT_FOUND
// if seq is last record, return EOF
// otherwise return next record after seq in lrec and SUCCESS
DWORD
CrspNextLogRecord(CrsInfo_t *info, CrsRecord_t *seq,
CrsRecord_t *lrec, BOOLEAN this_flag)
{
CrsRecord_t *last, *p;
DWORD err = ERROR_SUCCESS;
if (lrec == NULL || info == NULL) {
return ERROR_INVALID_PARAMETER;
}
// read record
EnterCriticalSection(&info->lock);
last = &info->buf[info->last_record];
if (seq == NULL) {
CrsLog(("Crs%d: last record %d %I64d %I64d\n",
info->lid, info->last_record, last->hdr.epoch, last->hdr.seq));
// read last record
memcpy(lrec, last, CRS_RECORD_SZ);
} else if (seq->hdr.epoch != last->hdr.epoch ||
seq->hdr.seq != last->hdr.seq) {
int i;
CrsLog(("Crs%d: last record %d %I64d %I64d search %I64d %I64d\n",
info->lid, info->last_record,
last->hdr.epoch, last->hdr.seq,
seq->hdr.epoch, seq->hdr.seq));
// assume we don't have it
p = seq;
seq = NULL;
// do a search instead of index, so that
// seq can be reset as epoch increments
for (i = 0; i < info->max_records; i++) {
last = &info->buf[i];
if (p->hdr.epoch == last->hdr.epoch &&
p->hdr.seq == last->hdr.seq) {
seq = last;
break;
}
}
if (seq != NULL) {
if (this_flag == FALSE) {
// return record after this one
i++;
if (i >= info->max_records)
i = 0;
seq = &info->buf[i];
}
CrsLog(("Crs%d: search found %d %I64d, %I64d\n", info->lid,
seq - info->buf, seq->hdr.epoch, seq->hdr.seq));
memcpy(lrec, seq, CRS_RECORD_SZ);
} else {
err = ERROR_NOT_FOUND;
}
} else {
CrsLog(("Crs%d: reached last record %d %I64d %I64d, %I64d %I64d\n",
info->lid, info->last_record,
last->hdr.epoch, last->hdr.seq,
seq->hdr.epoch, seq->hdr.seq));
if (this_flag == TRUE) {
// we are trying to read the last record
memcpy(lrec, last, CRS_RECORD_SZ);
err = ERROR_SUCCESS;
} else {
err = ERROR_HANDLE_EOF;
}
}
LeaveCriticalSection(&info->lock);
if (err == ERROR_SUCCESS && lrec->hdr.epoch == 0) {
// invalid rec, log is empty
err = ERROR_HANDLE_EOF;
}
return err;
}
// Call into fs with <undo, replay, query, disable, enable, done>
// undo: pass replica in recovery due to a conflict
// replay: replica is missing change, if replay fails with abort, we
// do a full copy; otherwise we issue a skip record
// query: ask replica if record was completed or not
// done: signal end of recovery and pass in new wset, rset
// we silently handle <abort(skip) and epoch records>
// abort: add a skip record
// epoch records: just log it as is
DWORD
CrspReplay(LPVOID rec)
{
CrsRecoveryBlk_t *rr;
CrsInfo_t *info, *minfo;
CrsRecord_t *p, *q;
CrsRecord_t lrec, mlrec;
DWORD err;
rr = (CrsRecoveryBlk_t *) rec;
info = rr->info;
minfo = rr->minfo;
CrsLog(("CrsReplay%d mid %d, lid %d leader_id %d\n",
rr->nid, rr->mid, info->lid, info->leader_id));
do {
p = NULL;
// read last record
err = CrspNextLogRecord(info, NULL, &lrec, FALSE);
if (err != ERROR_SUCCESS) {
CrsLog(("CrsReplay%d: unable to read last record %d\n",
info->lid, err));
break;
}
// find our last record in master replica
q = &lrec;
p = &mlrec;
err = CrspNextLogRecord(minfo, q, p, TRUE);
// if found and consistent with master, no undo
if (err == ERROR_SUCCESS && p->hdr.state == q->hdr.state) {
CrsLog(("CrsReplay%d: last record %I64d, %I64d consistent %x %x\n",
info->lid, q->hdr.epoch, q->hdr.seq,
p->hdr.state, q->hdr.state));
break;
}
if (err != ERROR_SUCCESS) {
CrsLog(("CrsReplay%d: missing lrec %I64d, %I64d in disk %d, err %d\n",
info->lid, q->hdr.epoch, q->hdr.seq, minfo->lid, err));
} else {
CrsLog(("CrsReplay%d: undo last record %I64d, %I64d %x needs %x\n",
info->lid, q->hdr.epoch, q->hdr.seq,
q->hdr.state, p->hdr.state));
ASSERT(p->hdr.state & (CRS_COMMIT|CRS_ABORT));
}
// last record is in conflict, we must undo it first
if (!(q->hdr.state & CRS_EPOCH)) {
// if we found this record in master and a conflict is detected,
// we undo it. Otherwise, we need to do a full copy
if (err == ERROR_SUCCESS) {
ASSERT(p->hdr.state & (CRS_COMMIT|CRS_ABORT));
ASSERT(q->hdr.state & CRS_PREPARE);
err = info->callback(info->callback_arg,
rr->nid, q,
CRS_ACTION_UNDO, rr->mid);
}
} else {
// A missing epoch record doesn't mean we are old. A regroup
// could have happened but no new data records got added. We
// undo it, and continue;
err = STATUS_SUCCESS;
}
if (err == STATUS_SUCCESS) {
// update current record, sequence, epoch
info->buf[info->last_record].hdr.state = 0;
info->buf[info->last_record].hdr.epoch = 0;
info->buf[info->last_record].hdr.seq = 0;
if (info->last_record == 0) {
info->last_record = info->max_records;
}
info->last_record--;
info->seq = info->buf[info->last_record].hdr.seq;
info->epoch = info->buf[info->last_record].hdr.epoch;
CrsLog(("CrsReplay%d: new last record %d %I64d, %I64d\n",
info->lid, info->last_record, info->epoch, info->seq));
} else {
// can't undo it, do full copy and readjust our log
CrsLog(("CrsReplay%d: Unable to undo record %I64d, %I64d\n",
info->lid, q->hdr.epoch, q->hdr.seq));
p = NULL;
}
} while (err == STATUS_SUCCESS && info->state == CRS_STATE_RECOVERY);
while (p != NULL && info->state == CRS_STATE_RECOVERY) {
// read master copy
err = CrspNextLogRecord(minfo, p, &mlrec, FALSE);
if (err != ERROR_SUCCESS) {
if (err == ERROR_HANDLE_EOF) {
CrsLog(("CrsReplay%d: last record %I64d, %I64d in disk %d\n",
info->lid, q->hdr.epoch, q->hdr.seq, minfo->lid));
// the last record is where we are at
info->seq = info->buf[info->last_record].hdr.seq;
info->epoch = info->buf[info->last_record].hdr.epoch;
// we reached the end, signal end of recovery
err = info->callback(info->callback_arg,
rr->nid, p,
CRS_ACTION_DONE, rr->mid);
goto exit;
}
break;
}
p = &mlrec;
if ((p->hdr.state & CRS_EPOCH) || (p->hdr.state & CRS_ABORT)) {
CrsLog(("CrsReplay%d: skip record %I64d, %I64d %x\n",
info->lid, p->hdr.epoch, p->hdr.seq, p->hdr.state));
err = !STATUS_SUCCESS;
} else if (p->hdr.state & CRS_COMMIT) {
err = info->callback(info->callback_arg,
rr->nid, p,
CRS_ACTION_REPLAY, rr->mid);
if (err == STATUS_TRANSACTION_ABORTED) {
CrsLog(("CrsReplay: failed nid %d seq %I64d err %d\n",
rr->nid, p->hdr.seq, err));
break;
}
} else {
ASSERT(p->hdr.state & CRS_PREPARE);
// what if the record is prepared but not yet committed or
// aborted; in transit record.
// stop now
CrsLog(("CrsReplay%d: bad record seq %I64d state %x\n",
rr->nid, p->hdr.seq, p->hdr.state));
break;
}
if (err != STATUS_SUCCESS) {
// add record
err = CrspAppendRecord(info, p, NULL);
if (err != ERROR_SUCCESS) {
CrsLog(("CrsReplay%d: failed append seq %I64d err %d\n",
rr->nid, p->hdr.seq, err));
break;
}
if (p->hdr.state & CRS_EPOCH) {
; //ASSERT(info->epoch+1 == p->hdr.epoch);
} else {
ASSERT(info->epoch == p->hdr.epoch);
ASSERT(info->seq+1 == p->hdr.seq);
}
info->seq = p->hdr.seq;
info->epoch = p->hdr.epoch;
} else {
// make sure we have added it
ASSERT(info->seq == p->hdr.seq);
ASSERT(info->epoch == p->hdr.epoch);
ASSERT(info->buf[info->last_record].hdr.seq == p->hdr.seq);
ASSERT(info->buf[info->last_record].hdr.epoch == p->hdr.epoch);
// Propagate dubious bit
if (p->hdr.state & CRS_DUBIOUS) {
info->buf[info->last_record].hdr.state |= CRS_DUBIOUS;
}
ASSERT(info->buf[info->last_record].hdr.state == p->hdr.state);
}
}
if (p == NULL || err != STATUS_SUCCESS) {
CrsLog(("CrsReplay%d: Full copy from disk %d\n",
info->lid, minfo->lid));
// we are out of date or need full recovery, do a full copy
err = info->callback(info->callback_arg,
rr->nid, NULL,
CRS_ACTION_COPY, rr->mid);
if (err == STATUS_SUCCESS) {
DWORD len;
// we now copy our master log and flush it
ASSERT(minfo->max_records == info->max_records);
len = info->max_records * CRS_RECORD_SZ;
memcpy(info->buf, minfo->buf, len);
err = CrspWrite(info, 0, len);
if (err == ERROR_SUCCESS) {
// adjust our state
info->last_record = minfo->last_record;
info->seq = info->buf[info->last_record].hdr.seq;
info->epoch = info->buf[info->last_record].hdr.epoch;
// we reached the end, signal end of recovery
err = info->callback(info->callback_arg,
rr->nid, p,
CRS_ACTION_DONE, rr->mid);
}
}
}
exit:
CrsLog(("CrsReplay%d mid %d status 0x%x\n", rr->nid, rr->mid, err));
return err;
}
/////////////////////// Public Functions //////////////////////
DWORD
WINAPI
CrsOpen(crs_callback_t callback, PVOID callback_arg, USHORT lid,
WCHAR *log_name, int max_logsectors, HANDLE *outhdl)
{
// Open the log file
// If the file in newly create, set the proper size
// If the file size is not the same size, we need to either
// expand or truncate the file. (truncate needs copy)
// Scan file to locate last sector and record
// If last record hasn't been commited, issue a query.
// If query succeeded then, mark it as committed.
// Set epoch,seq
DWORD status;
HANDLE maph;
CrsInfo_t *p;
int logsz;
if (outhdl == NULL) {
return ERROR_INVALID_PARAMETER;
}
*outhdl = NULL;
p = (CrsInfo_t *) malloc(sizeof(*p));
if (p == NULL) {
return ERROR_NOT_ENOUGH_MEMORY;
}
memset((PVOID) p, 0, sizeof(*p));
CrsLog(("Crs%d file '%S'\n", lid, log_name));
p->lid = lid;
p->callback = callback;
p->callback_arg = callback_arg;
p->pending = FALSE;
// Create log file, and set size of newly created
p->fh = CreateFileW(log_name,
GENERIC_READ | GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_WRITE,
NULL,
OPEN_ALWAYS,
FILE_FLAG_WRITE_THROUGH,
NULL);
status = GetLastError();
if(p->fh == INVALID_HANDLE_VALUE){
free((char *) p);
return status;
}
// acquire an exclusive lock on the whole file
if (!LockFile(p->fh, 0, 0, (DWORD)-1, (DWORD)-1)) {
FILE_FULL_EA_INFORMATION ea[2] = {0};
IO_STATUS_BLOCK ios;
NTSTATUS err;
// get status
status = GetLastError();
// change the ea to cause a notification to happen
ea[0].NextEntryOffset = 0;
ea[0].Flags = 0;
ea[0].EaNameLength = 1;
ea[0].EaValueLength = 1;
ea[0].EaName[0] = 'X';
err = NtSetEaFile(p->fh, &ios, (PVOID) ea, sizeof(ea));
CrsLog(("Crs%d Setting EA %x\n", lid, err));
goto error;
}
if (status == ERROR_ALREADY_EXISTS) {
// todo: compare current file size to new size and adjust file
// size accordingly. For now, just use old size
logsz = GetFileSize(p->fh, NULL);
CrsLog(("Crs%d: Filesz %d max_sec %d\n", lid, logsz, max_logsectors));
ASSERT(logsz == max_logsectors * CRS_SECTOR_SZ);
} else {
//extend the file pointer to max size
logsz = max_logsectors * CRS_SECTOR_SZ;
SetFilePointer(p->fh, logsz, NULL, FILE_BEGIN);
SetEndOfFile(p->fh);
CrsLog(("Crs%d: Set Filesz %d max_sec %d\n", lid, logsz, max_logsectors));
}
// allocate file copy in memory
p->buf = xmalloc(logsz);
if (p->buf == NULL) {
status = ERROR_NOT_ENOUGH_MEMORY;
goto error;
}
// set max record
p->max_records = logsz / CRS_RECORD_SZ;
if (status == ERROR_ALREADY_EXISTS) {
// load file and compute last epoch/seq
status = CrspFindLast(p, logsz);
} else {
status = !ERROR_SUCCESS;
}
// init the file, when we detect a read failure or first time
if (status != ERROR_SUCCESS) {
CrsRecord_t *r;
int i;
// initialize file
p->seq = 0;
p->epoch = 0;
p->last_record = 0;
r = p->buf;
for (i = 0; i < logsz; i+= CRS_RECORD_SZ, r++) {
r->hdr.epoch = p->epoch;
r->hdr.seq = p->seq;
r->hdr.tag = CRS_TAG;
r->hdr.state = CRS_COMMIT | CRS_PREPARE | CRS_EPOCH;
}
status = CrspWrite(p, 0, logsz);
}
if (status != ERROR_SUCCESS) {
goto error;
}
CrsLog(("Crs%d: %x Last record %d max %d epoch %I64d seq %I64d\n", p->lid,
p->fh,
p->last_record, p->max_records, p->epoch, p->seq));
// initialize rest of state
p->state = CRS_STATE_INIT;
p->refcnt = 1;
p->leader_id = 0;
InitializeCriticalSection(&p->lock);
*outhdl = p;
return ERROR_SUCCESS;
error:
CloseHandle(p->fh);
if (p->buf) {
xfree(p->buf);
}
free((PVOID) p);
return status;
}
//
DWORD
WINAPI
CrsStart(PVOID *hdls, ULONG alive_set, int cluster_sz,
ULONG *write_set, ULONG *read_set, ULONG *evict_set)
{
DWORD status;
CrsInfo_t **info = (CrsInfo_t **) hdls;
int i, active_sz, mid;
ULONG mask, active_set, fail_set;
CrsInfo_t *p;
CrsRecord_t *q, *mlrec;
if (write_set) *write_set = 0;
if (read_set) *read_set = 0;
if (evict_set) *evict_set = 0;
// no alive node
if (cluster_sz == 0 || alive_set == 0) {
// nothing to do
return ERROR_WRITE_PROTECT;
}
// scan each hdl and make sure it is initialized and lock all hdls
mask = alive_set;
for (i = 0; mask != 0; i++, mask = mask >> 1) {
if (!(mask & 0x1)) {
continue;
}
p = info[i];
if (p == NULL) {
continue;
}
EnterCriticalSection(&p->lock);
// check the state of the last record
p = info[i];
q = &p->buf[p->last_record];
CrsLog(("Crs%d last record %d epoch %I64d seq %I64d state %x\n",
p->lid, p->last_record,
q->hdr.epoch, q->hdr.seq, q->hdr.state));
}
mid = 0;
mlrec = NULL;
// select master replica
for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
if (!(mask & 0x1)) {
continue;
}
p = info[i];
if (p == NULL)
continue;
q = &p->buf[p->last_record];
if (!mlrec ||
mlrec->hdr.epoch < q->hdr.epoch ||
(mlrec->hdr.epoch == q->hdr.epoch && mlrec->hdr.seq < q->hdr.seq) ||
(mlrec->hdr.epoch == q->hdr.epoch && mlrec->hdr.seq == q->hdr.seq &&
mlrec->hdr.state != q->hdr.state && (q->hdr.state & CRS_COMMIT))) {
mid = i;
mlrec = q;
}
}
ASSERT(mid != 0);
// if master last record is in doubt, query filesystem. If the filesystem
// is certain that the operation has occured, it returns STATUS_SUCCESS for
// COMMIT, STATUS_CANCELLED for ABORT, and STATUS_NOT_FOUND for can't tell.
// All undetermined IO must be undone and redone in all non-master replicas
// to ensure all replicas reach consistency. This statement is true even
// for replicas that are currently absent from our set. We tag such records
// we both COMMIT and ABORT, so that the replay thread issues replay for
// new records and undo,replay for last records
p = info[mid];
p->leader_id = (USHORT) mid;
ASSERT(mlrec != NULL);
if (!(mlrec->hdr.state & (CRS_COMMIT | CRS_ABORT))) {
ASSERT(mlrec->hdr.state & CRS_PREPARE);
status = p->callback(p->callback_arg, p->lid,
mlrec, CRS_ACTION_QUERY,
p->lid);
if (status == STATUS_SUCCESS) {
mlrec->hdr.state |= CRS_COMMIT;
} else if (status == STATUS_CANCELLED) {
mlrec->hdr.state |= CRS_ABORT;
} else if (status == STATUS_NOT_FOUND) {
// assume it is committed, but mark it for undo during recovery
mlrec->hdr.state |= (CRS_COMMIT | CRS_DUBIOUS);
}
// todo: if status == TRANSACTION_ABORTED, we need to bail out since
// must master is dead
// no need to flush, I think!
// CrspFlush(p, p->last_record);
// todo: what if the flush fails here, I am assuming that
// an append will equally fail.
}
ASSERT(mlrec->hdr.state & (CRS_COMMIT | CRS_ABORT));
// compute sync and recovery masks
fail_set = 0;
active_set = 0;
active_sz = 0;
for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
if (!(mask & 0x1)) {
continue;
}
p = info[i];
if (p == NULL) {
continue;
}
// set leader id
p->leader_id = (USHORT) mid;
q = &p->buf[p->last_record];
if (CrspEqual(mlrec, q)) {
ASSERT(q->hdr.state & (CRS_COMMIT | CRS_ABORT));
p->state = CRS_STATE_READ;
active_set |= (1 << i);
active_sz++;
} else if (p->state != CRS_STATE_RECOVERY) {
CrsRecoveryBlk_t rrbuf;
CrsRecoveryBlk_t *rr = &rrbuf;
// recover replica
rr->nid = i;
rr->mid = mid;
rr->info = p;
rr->minfo = info[mid];
// set recovery state
p->state = CRS_STATE_RECOVERY;
status = CrspReplay((LPVOID) rr);
// if we fail, evict this replica
if (status != ERROR_SUCCESS) {
fail_set |= (1 << i);
} else {
// repeat this replica again
i--;
mask = mask << 1;
}
}
}
// assume success
status = ERROR_SUCCESS;
// set read sets
if (read_set) *read_set = active_set;
if (!CRS_QUORUM(active_sz, cluster_sz)) {
CrsLog(("No quorum active %d cluster %d\n", active_sz, cluster_sz));
mid = 0;
status = ERROR_WRITE_PROTECT;
} else {
int pass_cnt = 0;
ULONG pass_set = 0;
// Enable writes on all active replicas
for (i = 0, mask = active_set; mask != 0; i++, mask = mask >> 1) {
CrsRecord_t rec;
if (!(mask & 0x1)) {
continue;
}
p = info[i];
if (p == NULL)
continue;
p->state = CRS_STATE_WRITE;
// we now generate a new epoch and flush it to the disk
p->epoch++;
if (p->epoch == 0)
p->epoch = 1;
// reset seq to zero
p->seq = 0;
// write new epoch now, if not a majority replicas succeeded in writing
// the new <epoch, seq> we fail
rec.hdr.epoch = p->epoch;
rec.hdr.seq = p->seq;
rec.hdr.state = CRS_PREPARE | CRS_COMMIT | CRS_EPOCH;
memset(rec.data, 0, sizeof(rec.data));
if (CrspAppendRecord(p, &rec, NULL) == ERROR_SUCCESS) {
pass_cnt++;
pass_set |= (1 << i);
} else {
fail_set |= (1 << i);
}
}
// Recheck to make sure all replicas have advanced epoch
if (!CRS_QUORUM(pass_cnt, cluster_sz)) {
CrsLog(("No quorum due to error pass %d cluster %d\n", pass_cnt, cluster_sz));
mid = 0;
pass_set = 0;
pass_cnt = 0;
status = ERROR_WRITE_PROTECT;
}
if (pass_cnt != active_sz) {
// some replicas have died
for (i = 0, mask = pass_set; mask != 0; i++, mask = mask >> 1) {
if ((alive_set & (1 << i)) && (!mask & (1 << i))) {
p = info[i];
ASSERT(p != NULL);
p->state = CRS_STATE_READ;
}
}
}
// set write set
if (write_set) *write_set = pass_set;
}
if (evict_set) *evict_set = fail_set;
// unlock all hdls and set new master if any
for (i = 0, mask = alive_set; mask != 0; i++, mask = mask >> 1) {
if (!(mask & 0x1)) {
continue;
}
p = info[i];
if (p == NULL)
continue;
p->leader_id = (USHORT) mid;
LeaveCriticalSection(&p->lock);
}
return status;
}
void
WINAPI
CrsClose(PVOID hd)
{
DWORD err;
CrsInfo_t *info = (CrsInfo_t *) hd;
// If we any recovery threads running, make sure we terminate them first
// before close and free all of this stuff
if (info == NULL) {
CrsLog(("CrsClose: try to close a null handle!\n"));
return;
}
// Flush everything out and close the file
EnterCriticalSection(&info->lock);
// flush
CrspFlush(info, info->last_record);
LeaveCriticalSection(&info->lock);
DeleteCriticalSection(&info->lock);
err = CloseHandle(info->fh);
CrsLog(("Crs%d: %x Closed %d\n", info->fh, info->lid, err));
xfree(info->buf);
free((char *) info);
}
void
WINAPI
CrsFlush(PVOID hd)
{
CrsInfo_t *info = (CrsInfo_t *) hd;
// if we have a commit or abort that isn't flushed yet, flush it now
EnterCriticalSection(&info->lock);
if (info->pending == TRUE) {
CrspFlush(info, info->last_record);
}
LeaveCriticalSection(&info->lock);
}
PVOID
WINAPI
CrsPrepareRecord(PVOID hd, PVOID lrec, crs_id_t id)
{
CrsRecord_t *p = (CrsRecord_t *)lrec;
CrsInfo_t *info = (CrsInfo_t *) hd;
DWORD err;
// move to correct slot in this sector. If we need a new sector,
// read it from the file. Make sure we flush any pending commits on
// current sector before we over write our in memory sector buffer.
// prepare record, if seq none 0 then we are skipping the next sequence
EnterCriticalSection(&info->lock);
if (info->state == CRS_STATE_WRITE ||
(info->state == CRS_STATE_RECOVERY && id != NULL && id[0] != 0)) {
if (id != NULL && id[0] != 0) {
CrsHdr_t *tmp = (CrsHdr_t *) id;
assert(id[0] == info->seq+1);
p->hdr.seq = tmp->seq;
p->hdr.epoch = tmp->epoch;
} else {
p->hdr.seq = info->seq+1;
p->hdr.epoch = info->epoch;
}
p->hdr.state = CRS_PREPARE;
err = CrspAppendRecord(info, p, &p);
if (err == ERROR_SUCCESS) {
// we return with the lock held, gets release on commitorabort
CrsLog(("Crs%d prepare %x seq %I64d\n",info->lid, p, p->hdr.seq));
return p;
}
CrsLog(("Crs%d: Append failed seq %I64%d\n", info->lid, p->hdr.seq));
} else {
CrsLog(("Crs%d: Prepare bad state %d id %x\n", info->lid, info->state, id));
}
LeaveCriticalSection(&info->lock);
return NULL;
}
int
WINAPI
CrsCommitOrAbort(PVOID hd, PVOID lrec, int commit)
{
CrsRecord_t *p = (CrsRecord_t *)lrec;
CrsInfo_t *info = (CrsInfo_t *) hd;
if (p == NULL || info == NULL) {
return ERROR_INVALID_PARAMETER;
}
// update state of record
if (p->hdr.seq != info->seq+1) {
CrsLog(("Crs: sequence mis-match on commit|abort %I64d %I64d\n",
p->hdr.seq, info->seq));
assert(0);
return ERROR_INVALID_PARAMETER;
}
assert(!(p->hdr.state & (CRS_COMMIT | CRS_ABORT)));
// todo: this is wrong, what if one replica succeeds
// and others abort. Now, the others will reuse the
// same seq for a different update and when the
// succeeded replica rejoins it can't tell that the
// sequence got reused.
if (commit == TRUE) {
p->hdr.state |= CRS_COMMIT;
// advance the sequence
info->seq++;
CrsLog(("Crs%d: commit last %d leader %d seq %I64d\n", info->lid,
info->last_record,
info->leader_id, p->hdr.seq));
} else {
p->hdr.state |= CRS_ABORT;
// we need to re-adjust our last record
if (info->last_record == 0) {
info->last_record = info->max_records;
}
info->last_record--;
CrsLog(("Crs%d: abort last %d leader %d seq %I64d\n", info->lid,
info->last_record,
info->leader_id, p->hdr.seq));
}
info->pending = TRUE;
LeaveCriticalSection(&info->lock);
return ERROR_SUCCESS;
}
int
WINAPI
CrsCanWrite(PVOID hd)
{
CrsInfo_t *info = (CrsInfo_t *) hd;
int err;
// do we have a quorm or not
EnterCriticalSection(&info->lock);
err = (info->state == CRS_STATE_WRITE);
LeaveCriticalSection(&info->lock);
return err;
}