// -*- mode: C++; tab-width: 4; indent-tabs-mode: nil -*- (for GNU Emacs) // // Copyright (c) 1985-2000 Microsoft Corporation // // This file is part of the Microsoft Research IPv6 Network Protocol Stack. // You should have received a copy of the Microsoft End-User License Agreement // for this software along with this release; see the file "license.txt". // If not, please see http://www.research.microsoft.com/msripv6/license.htm, // or write to Microsoft Research, One Microsoft Way, Redmond, WA 98052-6399. // // Abstract: // // Code for TCP Control Block management. // #include "oscfg.h" #include "ndis.h" #include "ip6imp.h" #include "ip6def.h" #include "tdi.h" #include "tdint.h" #include "tdistat.h" #include "queue.h" #include "transprt.h" #include "tcp.h" #include "tcb.h" #include "tcpconn.h" #include "tcpsend.h" #include "tcprcv.h" #include "info.h" #include "tcpcfg.h" #include "tcpdeliv.h" #include "route.h" KSPIN_LOCK TCBTableLock; uint TCPTime; uint TCBWalkCount; TCB **TCBTable; TCB *LastTCB; TCB *PendingFreeList; SLIST_HEADER FreeTCBList; KSPIN_LOCK FreeTCBListLock; // Lock to protect TCB free list. extern KSPIN_LOCK AddrObjTableLock; extern SeqNum ISNMonotonicPortion; extern int ISNCredits; extern int ISNMaxCredits; extern uint GetDeltaTime(); uint CurrentTCBs = 0; uint FreeTCBs = 0; uint MaxTCBs = 0xffffffff; #define MAX_FREE_TCBS 1000 #define NUM_DEADMAN_TICKS MS_TO_TICKS(1000) uint MaxFreeTCBs = MAX_FREE_TCBS; uint DeadmanTicks; KTIMER TCBTimer; KDPC TCBTimeoutDpc; // // All of the init code can be discarded. // #ifdef ALLOC_PRAGMA int InitTCB(void); #pragma alloc_text(INIT, InitTCB) #endif // ALLOC_PRAGMA //* ReadNextTCB - Read the next TCB in the table. // // Called to read the next TCB in the table. The needed information // is derived from the incoming context, which is assumed to be valid. // We'll copy the information, and then update the context value with // the next TCB to be read. 
// uint // Returns: TRUE if more data is available to be read, FALSE is not. ReadNextTCB( void *Context, // Pointer to a TCPConnContext. void *Buffer) // Pointer to a TCPConnTableEntry structure. { TCPConnContext *TCContext = (TCPConnContext *)Context; TCP6ConnTableEntry *TCEntry = (TCP6ConnTableEntry *)Buffer; KIRQL OldIrql; TCB *CurrentTCB; uint i; CurrentTCB = TCContext->tcc_tcb; CHECK_STRUCT(CurrentTCB, tcb); KeAcquireSpinLock(&CurrentTCB->tcb_lock, &OldIrql); if (CLOSING(CurrentTCB)) TCEntry->tct_state = TCP_CONN_CLOSED; else TCEntry->tct_state = (uint)CurrentTCB->tcb_state + TCB_STATE_DELTA; TCEntry->tct_localaddr = CurrentTCB->tcb_saddr; TCEntry->tct_localscopeid = CurrentTCB->tcb_sscope_id; TCEntry->tct_localport = CurrentTCB->tcb_sport; TCEntry->tct_remoteaddr = CurrentTCB->tcb_daddr; TCEntry->tct_remotescopeid = CurrentTCB->tcb_dscope_id; TCEntry->tct_remoteport = CurrentTCB->tcb_dport; TCEntry->tct_owningpid = (CurrentTCB->tcb_conn) ? CurrentTCB->tcb_conn->tc_owningpid : 0; KeReleaseSpinLock(&CurrentTCB->tcb_lock, OldIrql); // We've filled it in. Now update the context. if (CurrentTCB->tcb_next != NULL) { TCContext->tcc_tcb = CurrentTCB->tcb_next; return TRUE; } else { // NextTCB is NULL. Loop through the TCBTable looking for a new one. i = TCContext->tcc_index + 1; while (i < TcbTableSize) { if (TCBTable[i] != NULL) { TCContext->tcc_tcb = TCBTable[i]; TCContext->tcc_index = i; return TRUE; break; } else i++; } TCContext->tcc_index = 0; TCContext->tcc_tcb = NULL; return FALSE; } } //* ValidateTCBContext - Validate the context for reading a TCB table. // // Called to start reading the TCB table sequentially. We take in // a context, and if the values are 0 we return information about the // first TCB in the table. Otherwise we make sure that the context value // is valid, and if it is we return TRUE. // We assume the caller holds the TCB table lock. // // Upon return, *Valid is set to true if the context is valid. 
// uint // Returns: TRUE if data in table, FALSE if not. ValidateTCBContext( void *Context, // Pointer to a TCPConnContext. uint *Valid) // Where to return infoformation about context being valid. { TCPConnContext *TCContext = (TCPConnContext *)Context; uint i; TCB *TargetTCB; TCB *CurrentTCB; i = TCContext->tcc_index; TargetTCB = TCContext->tcc_tcb; // // If the context values are 0 and NULL, we're starting from the beginning. // if (i == 0 && TargetTCB == NULL) { *Valid = TRUE; do { if ((CurrentTCB = TCBTable[i]) != NULL) { CHECK_STRUCT(CurrentTCB, tcb); break; } i++; } while (i < TcbTableSize); if (CurrentTCB != NULL) { TCContext->tcc_index = i; TCContext->tcc_tcb = CurrentTCB; return TRUE; } else return FALSE; } else { // // We've been given a context. We just need to make sure that it's // valid. // if (i < TcbTableSize) { CurrentTCB = TCBTable[i]; while (CurrentTCB != NULL) { if (CurrentTCB == TargetTCB) { *Valid = TRUE; return TRUE; break; } else { CurrentTCB = CurrentTCB->tcb_next; } } } // If we get here, we didn't find the matching TCB. *Valid = FALSE; return FALSE; } } //* FindNextTCB - Find the next TCB in a particular chain. // // This routine is used to find the 'next' TCB in a chain. Since we keep // the chain in ascending order, we look for a TCB which is greater than // the input TCB. When we find one, we return it. // // This routine is mostly used when someone is walking the table and needs // to free the various locks to perform some action. // TCB * // Returns: Pointer to the next TCB, or NULL. FindNextTCB( uint Index, // Index into TCBTable. TCB *Current) // Current TCB - we find the one after this one. { TCB *Next; ASSERT(Index < TcbTableSize); Next = TCBTable[Index]; while (Next != NULL && (Next <= Current)) Next = Next->tcb_next; return Next; } //* ResetSendNext - Set the sendnext value of a TCB. // // Called to set the send next value of a TCB. We do that, and adjust all // pointers to the appropriate places. 
//  We assume the caller holds the lock
//  on the TCB.
//
//  Besides tcb_sendnext itself, this repositions tcb_cursend, tcb_sendsize,
//  tcb_sendbuf and tcb_sendofs so that they describe the byte at NewSeq
//  within the send queue.
//
void  // Returns: Nothing.
ResetSendNext(
    TCB *SeqTCB,    // TCB to be updated.
    SeqNum NewSeq)  // Sequence number to set.
{
    TCPSendReq *SendReq;
    uint AmtForward;      // Bytes between senduna and NewSeq still to skip.
    Queue *CurQ;
    PNDIS_BUFFER Buffer;
    uint Offset;

    CHECK_STRUCT(SeqTCB, tcb);
    ASSERT(SEQ_GTE(NewSeq, SeqTCB->tcb_senduna));

    //
    // The new seq must be less than send max, or NewSeq, senduna, sendnext,
    // and sendmax must all be equal (the latter case happens when we're
    // called exiting TIME_WAIT, or possibly when we're retransmitting
    // during a flow controlled situation).
    //
    ASSERT(SEQ_LT(NewSeq, SeqTCB->tcb_sendmax) ||
           (SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendnext) &&
            SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendmax) &&
            SEQ_EQ(SeqTCB->tcb_senduna, NewSeq)));

    AmtForward = NewSeq - SeqTCB->tcb_senduna;

    //
    // Special case: a FIN has been sent, at most one sequence number is
    // outstanding, and we're asked to move one past senduna.  Just set
    // sendnext and clear FIN_OUTSTANDING; the send queue is not touched.
    //
    if ((AmtForward == 1) && (SeqTCB->tcb_flags & FIN_SENT) &&
        !((SeqTCB->tcb_sendnext - SeqTCB->tcb_senduna) > 1) &&
        (SEQ_EQ(SeqTCB->tcb_sendnext,SeqTCB->tcb_sendmax))) {
        KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INFO_RARE,
                   "tcpip6: trying to set sendnext for FIN_SENT\n"));
        SeqTCB->tcb_sendnext = NewSeq;
        SeqTCB->tcb_flags &= ~FIN_OUTSTANDING;
        return;
    }
    if((SeqTCB->tcb_flags & FIN_SENT) &&
       (SEQ_EQ(SeqTCB->tcb_sendnext,SeqTCB->tcb_sendmax)) &&
       ((SeqTCB->tcb_sendnext - NewSeq) == 1) ){
        //
        // There is only FIN that is left beyond sendnext.
        //
        SeqTCB->tcb_sendnext = NewSeq;
        SeqTCB->tcb_flags &= ~FIN_OUTSTANDING;
        return;
    }

    SeqTCB->tcb_sendnext = NewSeq;

    //
    // If we're backing off send next, turn off the FIN_OUTSTANDING flag to
    // maintain a consistent state.
    //
    if (!SEQ_EQ(NewSeq, SeqTCB->tcb_sendmax))
        SeqTCB->tcb_flags &= ~FIN_OUTSTANDING;

    if (SYNC_STATE(SeqTCB->tcb_state) &&
        SeqTCB->tcb_state != TCB_TIME_WAIT) {
        //
        // In these states we need to update the send queue.
        //

        if (!EMPTYQ(&SeqTCB->tcb_sendq)) {
            CurQ = QHEAD(&SeqTCB->tcb_sendq);

            SendReq = (TCPSendReq *)CONTAINING_RECORD(CurQ, TCPReq, tr_q);

            //
            // SendReq points to the first send request on the send queue.
            // Move forward AmtForward bytes on the send queue, and set the
            // TCB pointers to the resultant SendReq, buffer, offset, size.
            //
            while (AmtForward) {

                CHECK_STRUCT(SendReq, tsr);

                if (AmtForward >= SendReq->tsr_unasize) {
                    //
                    // We're going to move completely past this one.  Subtract
                    // his size from AmtForward and get the next one.
                    //
                    AmtForward -= SendReq->tsr_unasize;
                    CurQ = QNEXT(CurQ);
                    ASSERT(CurQ != QEND(&SeqTCB->tcb_sendq));
                    SendReq = (TCPSendReq *)CONTAINING_RECORD(CurQ, TCPReq,
                                                              tr_q);
                } else {
                    //
                    // We're pointing at the proper send req now.  Break out
                    // of this loop and save the information.  Further down
                    // we'll need to walk down the buffer chain to find
                    // the proper buffer and offset.
                    //
                    break;
                }
            }

            //
            // We're pointing at the proper send req now.  We need to go down
            // the buffer chain here to find the proper buffer and offset.
            // AmtForward is now the residual offset into this request.
            //
            SeqTCB->tcb_cursend = SendReq;
            SeqTCB->tcb_sendsize = SendReq->tsr_unasize - AmtForward;
            Buffer = SendReq->tsr_buffer;
            Offset = SendReq->tsr_offset;

            while (AmtForward) {
                // Walk the buffer chain.
                uint Length;

                //
                // We'll need the length of this buffer.  Use the portable
                // macro to get it.  We have to adjust the length by the offset
                // into it, also.
                //
                ASSERT((Offset < NdisBufferLength(Buffer)) ||
                       ((Offset == 0) && (NdisBufferLength(Buffer) == 0)));

                Length = NdisBufferLength(Buffer) - Offset;

                if (AmtForward >= Length) {
                    //
                    // We're moving past this one.  Skip over him, and 0 the
                    // Offset we're keeping.
                    //
                    AmtForward -= Length;
                    Offset = 0;
                    Buffer = NDIS_BUFFER_LINKAGE(Buffer);
                    ASSERT(Buffer != NULL);
                } else
                    break;
            }

            //
            // Save the buffer we found, and the offset into that buffer.
            //
            SeqTCB->tcb_sendbuf = Buffer;
            SeqTCB->tcb_sendofs = Offset + AmtForward;

        } else {
            // Empty send queue: there must be nothing to reposition.
            ASSERT(SeqTCB->tcb_cursend == NULL);
            ASSERT(AmtForward == 0);
        }
    }

    CheckTCBSends(SeqTCB);
}


//* TCPAbortAndIndicateDisconnect
//
//  Abortively closes a TCB and issues a disconnect indication up to the
//  transport user.
//  This function is used to support cancellation of
//  TDI send and receive requests.
//
void  // Returns: Nothing.
TCPAbortAndIndicateDisconnect(
    CONNECTION_CONTEXT ConnectionContext  // Connection ID to find a TCB for.
    )
{
    TCB *AbortTCB;
    KIRQL Irql0, Irql1;  // One per lock nesting level.
    TCPConn *Conn;

    Conn = GetConnFromConnID(PtrToUlong(ConnectionContext), &Irql0);

    if (Conn != NULL) {
        CHECK_STRUCT(Conn, tc);

        AbortTCB = Conn->tc_tcb;

        if (AbortTCB != NULL) {
            //
            // If it's CLOSING or CLOSED, skip it.
            //
            if ((AbortTCB->tcb_state != TCB_CLOSED) && !CLOSING(AbortTCB)) {
                CHECK_STRUCT(AbortTCB, tcb);

                //
                // Take the TCB lock, then drop the connection block lock.
                // The IRQL values are swapped on release so that the
                // outermost saved IRQL (Irql0) is restored last.
                //
                KeAcquireSpinLock(&AbortTCB->tcb_lock, &Irql1);
                KeReleaseSpinLock(&Conn->tc_ConnBlock->cb_lock, Irql1);

                // Re-check the state now that we hold the TCB lock.
                if (AbortTCB->tcb_state == TCB_CLOSED || CLOSING(AbortTCB)) {
                    KeReleaseSpinLock(&AbortTCB->tcb_lock, Irql0);
                    return;
                }

                AbortTCB->tcb_refcnt++;
                AbortTCB->tcb_flags |= NEED_RST;  // send a reset if connected
                TryToCloseTCB(AbortTCB, TCB_CLOSE_ABORTED, Irql0);

                RemoveTCBFromConn(AbortTCB);

                IF_TCPDBG(TCP_DEBUG_IRP) {
                    KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INFO_TCPDBG,
                               "TCPAbortAndIndicateDisconnect, indicating discon\n"));
                }

                NotifyOfDisc(AbortTCB, TDI_CONNECTION_ABORTED);

                KeAcquireSpinLock(&AbortTCB->tcb_lock, &Irql0);
                DerefTCB(AbortTCB, Irql0);

                // TCB lock freed by DerefTCB.

                return;
            } else
                KeReleaseSpinLock(&Conn->tc_ConnBlock->cb_lock, Irql0);
        } else
            KeReleaseSpinLock(&Conn->tc_ConnBlock->cb_lock, Irql0);
    }
}


//* TCBTimeout - Do timeout events on TCBs.
//
//  Called every MS_PER_TICKS milliseconds to do timeout processing on TCBs.
//  We run through the TCB table, decrementing timers.  If one goes to zero
//  we look at its state to decide what to do.
//
//  Whenever a TCB's lock is given up by a callee (TryToCloseTCB, SendSYN,
//  SendKA, DerefTCB are all followed by this pattern below), the walk
//  resumes via FindNextTCB rather than through tcb_next.
//
void  // Returns: Nothing.
TCBTimeout(
    PKDPC MyDpcObject,  // The DPC object describing this routine.
    void *Context,      // The argument we asked to be called with.
    void *Unused1,
    void *Unused2)
{
    uint i;
    TCB *CurrentTCB;
    uint Delayed = FALSE;   // Set when something was queued for the delay Q.
    uint CallRcvComplete;
    int Delta;

    UNREFERENCED_PARAMETER(Context);
    UNREFERENCED_PARAMETER(Unused1);
    UNREFERENCED_PARAMETER(Unused2);

    //
    // Update our free running counter.
    //
    TCPTime++;

    //
    // Mark a table walk as in progress.  While TCBWalkCount is non-zero,
    // RemoveTCB defers frees to PendingFreeList and ScavengeTCB backs off.
    //
    ExInterlockedAddUlong(&TCBWalkCount, 1, &TCBTableLock);

    //
    // Set credits so that some more connections can increment the
    // Initial Sequence Number, during the next 100 ms.
    //
    InterlockedExchange(&ISNCredits, ISNMaxCredits);

    Delta = GetDeltaTime();

    //
    // The increment made is (256)*(Time in milliseconds). This is really close
    // to 25000 increment made originally every 100 ms.
    //
    if (Delta > 0) {
        Delta *= 0x100;
        InterlockedExchangeAdd(&ISNMonotonicPortion, Delta);
    }

    //
    // Loop through each bucket in the table, going down the chain of
    // TCBs on the bucket.
    //
    for (i = 0; i < TcbTableSize; i++) {
        TCB *TempTCB;
        uint maxRexmitCnt;

        CurrentTCB = TCBTable[i];

        while (CurrentTCB != NULL) {
            CHECK_STRUCT(CurrentTCB, tcb);
            KeAcquireSpinLockAtDpcLevel(&CurrentTCB->tcb_lock);

            //
            // If it's CLOSING or CLOSED, skip it.
            //
            if (CurrentTCB->tcb_state == TCB_CLOSED || CLOSING(CurrentTCB)) {
                TempTCB = CurrentTCB->tcb_next;
                KeReleaseSpinLockFromDpcLevel(&CurrentTCB->tcb_lock);
                CurrentTCB = TempTCB;
                continue;
            }

            CheckTCBSends(CurrentTCB);
            CheckTCBRcv(CurrentTCB);

            //
            // First check the rexmit timer.
            //
            if (TCB_TIMER_RUNNING(CurrentTCB->tcb_rexmittimer)) {
                //
                // The timer is running.
                //
                if (--(CurrentTCB->tcb_rexmittimer) == 0) {
                    //
                    // And it's fired.  Figure out what to do now.
                    //
                    if (CurrentTCB->tcb_state == TCB_SYN_SENT) {
                        maxRexmitCnt = MaxConnectRexmitCount;
                    } else {
                        maxRexmitCnt = MaxDataRexmitCount;
                    }

                    //
                    // If we've run out of retransmits or we're in FIN_WAIT2,
                    // time out.
                    //
                    CurrentTCB->tcb_rexmitcnt++;
                    if (CurrentTCB->tcb_rexmitcnt > maxRexmitCnt) {

                        ASSERT(CurrentTCB->tcb_state > TCB_LISTEN);

                        //
                        // This connection has timed out.  Abort it.  First
                        // reference him, then mark as closed, notify the
                        // user, and finally dereference and close him.
                        //
                        // The keepalive code further down also jumps here
                        // when it gives up on a connection.
                        //
                      TimeoutTCB:
                        CurrentTCB->tcb_refcnt++;
                        TryToCloseTCB(CurrentTCB, TCB_CLOSE_TIMEOUT,
                                      DISPATCH_LEVEL);

                        RemoveTCBFromConn(CurrentTCB);
                        NotifyOfDisc(CurrentTCB, TDI_TIMED_OUT);

                        KeAcquireSpinLockAtDpcLevel(&CurrentTCB->tcb_lock);
                        DerefTCB(CurrentTCB, DISPATCH_LEVEL);

                        CurrentTCB = FindNextTCB(i, CurrentTCB);
                        continue;
                    }

                    //
                    // Stop round trip time measurement.
                    //
                    CurrentTCB->tcb_rtt = 0;

                    //
                    // Figure out what our new retransmit timeout should be.
                    // We double it each time we get a retransmit, and reset it
                    // back when we get an ack for new data.
                    //
                    CurrentTCB->tcb_rexmit = MIN(CurrentTCB->tcb_rexmit << 1,
                                                 MAX_REXMIT_TO);

                    //
                    // Reset the sequence number, and reset the congestion
                    // window.
                    //
                    ResetSendNext(CurrentTCB, CurrentTCB->tcb_senduna);

                    if (!(CurrentTCB->tcb_flags & FLOW_CNTLD)) {
                        //
                        // Don't let the slow start threshold go below 2
                        // segments.
                        //
                        CurrentTCB->tcb_ssthresh =
                            MAX(MIN(CurrentTCB->tcb_cwin,
                                    CurrentTCB->tcb_sendwin) / 2,
                                (uint) CurrentTCB->tcb_mss * 2);
                        CurrentTCB->tcb_cwin = CurrentTCB->tcb_mss;
                    } else {
                        //
                        // We're probing, and the probe timer has fired.  We
                        // need to set the FORCE_OUTPUT bit here.
                        //
                        CurrentTCB->tcb_flags |= FORCE_OUTPUT;
                    }

                    //
                    // See if we need to probe for a PMTU black hole.
                    //
                    if (PMTUBHDetect &&
                        CurrentTCB->tcb_rexmitcnt == ((maxRexmitCnt+1)/2)) {
                        //
                        // We may need to probe for a black hole.  If we're
                        // doing MTU discovery on this connection and we
                        // are retransmitting more than a minimum segment
                        // size, or we are probing for a PMTU BH already, turn
                        // off the DF flag and bump the probe count.  If the
                        // probe count gets too big we'll assume it's not
                        // a PMTU black hole, and we'll try to switch the
                        // router.
                        //
                        if ((CurrentTCB->tcb_flags & PMTU_BH_PROBE) ||
                            (CurrentTCB->tcb_sendmax - CurrentTCB->tcb_senduna
                             > 8)) {
                            //
                            // May need to probe.  If we haven't exceeded our
                            // probe count, do so, otherwise restore those
                            // values.
                            //
                            if (CurrentTCB->tcb_bhprobecnt++ < 2) {
                                //
                                // We're going to probe.  Turn on the flag,
                                // drop the MSS, and turn off the don't
                                // fragment bit.
                                //
                                if (!(CurrentTCB->tcb_flags & PMTU_BH_PROBE)) {
                                    CurrentTCB->tcb_flags |= PMTU_BH_PROBE;
                                    CurrentTCB->tcb_slowcount++;
                                    CurrentTCB->tcb_fastchk |= TCP_FLAG_SLOW;

                                    //
                                    // Drop the MSS to the minimum.
                                    //
                                    CurrentTCB->tcb_mss =
                                        MIN(DEFAULT_MSS,
                                            CurrentTCB->tcb_remmss);

                                    ASSERT(CurrentTCB->tcb_mss > 0);

                                    CurrentTCB->tcb_cwin = CurrentTCB->tcb_mss;
                                }

                                //
                                // Drop the rexmit count so we come here again,
                                // and don't retrigger DeadGWDetect.
                                //
                                CurrentTCB->tcb_rexmitcnt--;
                            } else {
                                //
                                // Too many probes.  Stop probing, and allow
                                // fallover to the next gateway.
                                //
                                // Currently this code won't do BH probing on
                                // the 2nd gateway.  The MSS will stay at the
                                // minimum size.  This might be a little
                                // suboptimal, but it's easy to implement for
                                // the Sept. 95 service pack and will keep
                                // connections alive if possible.
                                //
                                // In the future we should investigate doing
                                // dead g/w detect on a per-connection basis,
                                // and then doing PMTU probing for each
                                // connection.
                                //
                                if (CurrentTCB->tcb_flags & PMTU_BH_PROBE) {
                                    CurrentTCB->tcb_flags &= ~PMTU_BH_PROBE;
                                    if (--(CurrentTCB->tcb_slowcount) == 0)
                                        CurrentTCB->tcb_fastchk &=
                                            ~TCP_FLAG_SLOW;
                                }
                                CurrentTCB->tcb_bhprobecnt = 0;
                            }
                        }
                    }

                    //
                    // Since we're retransmitting, our first-hop router
                    // may be down.  Tell IP we're suspicious if this
                    // is the first retransmit.
                    //
                    if (CurrentTCB->tcb_rexmitcnt == 1 &&
                        CurrentTCB->tcb_rce != NULL) {
                        ForwardReachabilityInDoubt(CurrentTCB->tcb_rce);
                    }

                    //
                    // Now handle the various cases.
                    //
                    switch (CurrentTCB->tcb_state) {

                    case TCB_SYN_SENT:
                    case TCB_SYN_RCVD:
                        //
                        // In SYN-SENT or SYN-RCVD we'll need to retransmit
                        // the SYN.
                        //
                        SendSYN(CurrentTCB, DISPATCH_LEVEL);
                        CurrentTCB = FindNextTCB(i, CurrentTCB);
                        continue;

                    case TCB_FIN_WAIT1:
                    case TCB_CLOSING:
                    case TCB_LAST_ACK:
                        //
                        // The call to ResetSendNext (above) will have
                        // turned off the FIN_OUTSTANDING flag.
                        //
                        CurrentTCB->tcb_flags |= FIN_NEEDED;

                        // Falls through to the CLOSE_WAIT/ESTAB handling.

                    case TCB_CLOSE_WAIT:
                    case TCB_ESTAB:
                        //
                        // In this state we have data to retransmit, unless
                        // the window is zero (in which case we need to
                        // probe), or we're just sending a FIN.
                        //
                        CheckTCBSends(CurrentTCB);

                        Delayed = TRUE;
                        DelayAction(CurrentTCB, NEED_OUTPUT);
                        break;

                    case TCB_TIME_WAIT:
                        //
                        // If it's fired in TIME-WAIT, we're all done and
                        // can clean up.  We'll call TryToCloseTCB even
                        // though he's already sort of closed.  TryToCloseTCB
                        // will figure this out and do the right thing.
                        //
                        TryToCloseTCB(CurrentTCB, TCB_CLOSE_SUCCESS,
                                      DISPATCH_LEVEL);
                        CurrentTCB = FindNextTCB(i, CurrentTCB);
                        continue;

                    default:
                        break;
                    }
                }
            }

            //
            // Now check the SWS deadlock timer..
            //
            if (TCB_TIMER_RUNNING(CurrentTCB->tcb_swstimer)) {
                //
                // The timer is running.
                //
                if (--(CurrentTCB->tcb_swstimer) == 0) {
                    //
                    // And it's fired.  Force output now.
                    //
                    CurrentTCB->tcb_flags |= FORCE_OUTPUT;
                    Delayed = TRUE;
                    DelayAction(CurrentTCB, NEED_OUTPUT);
                }
            }

            //
            // Check the push data timer.
            //
            if (TCB_TIMER_RUNNING(CurrentTCB->tcb_pushtimer)) {
                //
                // The timer is running.  Decrement it.
                //
                if (--(CurrentTCB->tcb_pushtimer) == 0) {
                    //
                    // It's fired.
                    //
                    PushData(CurrentTCB);
                    Delayed = TRUE;
                }
            }

            //
            // Check the delayed ack timer.
            //
            if (TCB_TIMER_RUNNING(CurrentTCB->tcb_delacktimer)) {
                //
                // The timer is running.
                //
                if (--(CurrentTCB->tcb_delacktimer) == 0) {
                    //
                    // And it's fired.  Set up to send an ACK.
                    //
                    Delayed = TRUE;
                    DelayAction(CurrentTCB, NEED_ACK);
                }
            }

            //
            // Finally check the keepalive timer.
            //
            if (CurrentTCB->tcb_state == TCB_ESTAB) {
                if ((CurrentTCB->tcb_flags & KEEPALIVE) &&
                    (CurrentTCB->tcb_conn != NULL)) {
                    // Note: this Delta shadows the function-level int Delta.
                    uint Delta;

                    Delta = TCPTime - CurrentTCB->tcb_alive;
                    if (Delta > CurrentTCB->tcb_conn->tc_tcbkatime) {
                        Delta -= CurrentTCB->tcb_conn->tc_tcbkatime;
                        if (Delta > (CurrentTCB->tcb_kacount *
                                     CurrentTCB->tcb_conn->tc_tcbkainterval)) {
                            if (CurrentTCB->tcb_kacount < MaxDataRexmitCount) {
                                SendKA(CurrentTCB, DISPATCH_LEVEL);
                                CurrentTCB = FindNextTCB(i, CurrentTCB);
                                continue;
                            } else
                                goto TimeoutTCB;
                        }
                    } else
                        CurrentTCB->tcb_kacount = 0;
                }
            }

            //
            // If this is an active open connection in SYN-SENT or SYN-RCVD,
            // or we have a FIN pending, check the connect timer.
            //
            if (CurrentTCB->tcb_flags &
                (ACTIVE_OPEN | FIN_NEEDED | FIN_SENT)) {
                TCPConnReq *ConnReq = CurrentTCB->tcb_connreq;

                ASSERT(ConnReq != NULL);
                if (TCB_TIMER_RUNNING(ConnReq->tcr_timeout)) {
                    // Timer is running.
                    if (--(ConnReq->tcr_timeout) == 0) {
                        // The connection timer has timed out.
                        TryToCloseTCB(CurrentTCB, TCB_CLOSE_TIMEOUT,
                                      DISPATCH_LEVEL);
                        CurrentTCB = FindNextTCB(i, CurrentTCB);
                        continue;
                    }
                }
            }

            //
            // Timer isn't running, or didn't fire.
            //
            TempTCB = CurrentTCB->tcb_next;
            KeReleaseSpinLockFromDpcLevel(&CurrentTCB->tcb_lock);
            CurrentTCB = TempTCB;
        }
    }

    //
    // See if we need to call receive complete as part of deadman processing.
    // We do this now because we want to restart the timer before calling
    // receive complete, in case that takes a while.  If we make this check
    // while the timer is running we'd have to lock, so we'll check and save
    // the result now before we start the timer.
    //
    if (DeadmanTicks == TCPTime) {
        CallRcvComplete = TRUE;
        DeadmanTicks += NUM_DEADMAN_TICKS;
    } else
        CallRcvComplete = FALSE;

    //
    // Now check the pending free list.  If it's not null, walk down the
    // list and decrement the walk count.  If the count goes below 2, pull it
    // from the list.  If the count goes to 0, free the TCB.  If the count is
    // at 1 it'll be freed by whoever called RemoveTCB.
    //
    KeAcquireSpinLockAtDpcLevel(&TCBTableLock);
    if (PendingFreeList != NULL) {
        TCB *PrevTCB;

        //
        // Treat the list head as if it were the tcb_delayq.q_next field of
        // a TCB, so unlinking the first element needs no special case.
        //
        PrevTCB = CONTAINING_RECORD(&PendingFreeList, TCB, tcb_delayq.q_next);

        do {
            CurrentTCB = (TCB *)PrevTCB->tcb_delayq.q_next;

            CHECK_STRUCT(CurrentTCB, tcb);

            CurrentTCB->tcb_walkcount--;
            if (CurrentTCB->tcb_walkcount <= 1) {
                *(TCB **)&PrevTCB->tcb_delayq.q_next =
                    (TCB *)CurrentTCB->tcb_delayq.q_next;

                if (CurrentTCB->tcb_walkcount == 0) {
                    FreeTCB(CurrentTCB);
                }
            } else {
                PrevTCB = CurrentTCB;
            }
        } while (PrevTCB->tcb_delayq.q_next != NULL);
    }

    TCBWalkCount--;
    KeReleaseSpinLockFromDpcLevel(&TCBTableLock);

    //
    // Do AddrCheckTable cleanup.
    //
    if (AddrCheckTable) {
        TCPAddrCheckElement *Temp;

        KeAcquireSpinLockAtDpcLevel(&AddrObjTableLock);

        for (Temp = AddrCheckTable;Temp < AddrCheckTable + NTWMaxConnectCount;
             Temp++) {
            if (Temp->TickCount > 0) {
                if ((--(Temp->TickCount)) == 0) {
                    Temp->SourceAddress = UnspecifiedAddr;
                }
            }
        }

        KeReleaseSpinLockFromDpcLevel(&AddrObjTableLock);
    }

    if (Delayed)
        ProcessTCBDelayQ();

    if (CallRcvComplete)
        TCPRcvComplete();
}


#if 0  // We update PMTU lazily to avoid exactly this.
//* SetTCBMTU - Set TCB MTU values.
//
//  A function called by TCBWalk to set the MTU values of all TCBs using
//  a particular path.
//
uint  // Returns: TRUE.
SetTCBMTU(
    TCB *CheckTCB,  // TCB to be checked.
    void *DestPtr,  // Destination address.
    void *SrcPtr,   // Source address.
    void *MTUPtr)   // New MTU.
{
    IPv6Addr *DestAddr = (IPv6Addr *)DestPtr;
    IPv6Addr *SrcAddr = (IPv6Addr *)SrcPtr;
    KIRQL OldIrql;

    CHECK_STRUCT(CheckTCB, tcb);

    KeAcquireSpinLock(&CheckTCB->tcb_lock, &OldIrql);

    if (IP6_ADDR_EQUAL(&CheckTCB->tcb_daddr, DestAddr) &&
        IP6_ADDR_EQUAL(&CheckTCB->tcb_saddr, SrcAddr)) {
        uint MTU = *(uint *)MTUPtr;

        CheckTCB->tcb_mss = (ushort)MIN(MTU, (uint)CheckTCB->tcb_remmss);

        ASSERT(CheckTCB->tcb_mss > 0);

        //
        // Reset the Congestion Window if necessary.
        //
        if (CheckTCB->tcb_cwin < CheckTCB->tcb_mss) {
            CheckTCB->tcb_cwin = CheckTCB->tcb_mss;

            //
            // Make sure the slow start threshold is at least 2 segments.
            //
            if (CheckTCB->tcb_ssthresh < ((uint) CheckTCB->tcb_mss*2)) {
                CheckTCB->tcb_ssthresh = CheckTCB->tcb_mss * 2;
            }
        }
    }

    KeReleaseSpinLock(&CheckTCB->tcb_lock, OldIrql);

    return TRUE;
}
#endif


//* DeleteTCBWithSrc - Delete tcbs with a particular src address.
//
//  A function called by TCBWalk to delete all TCBs with a particular source
//  address.
//
uint  // Returns: FALSE if CheckTCB is to be deleted, TRUE otherwise.
DeleteTCBWithSrc(
    TCB *CheckTCB,  // TCB to be checked.
    void *AddrPtr,  // Pointer to address.
    void *Unused1,  // Go figure.
    void *Unused3)  // What happened to Unused2?
{
    IPv6Addr *Addr = (IPv6Addr *)AddrPtr;

    CHECK_STRUCT(CheckTCB, tcb);

    if (IP6_ADDR_EQUAL(&CheckTCB->tcb_saddr, Addr))
        return FALSE;
    else
        return TRUE;
}


//* TCBWalk - Walk the TCBs in the table, and call a function for each of them.
//
//  Called when we need to repetitively do something to each TCB in the table.
//  We call the specified function with a pointer to the TCB and the input
//  context for each TCB in the table.  If the function returns FALSE, we
//  delete the TCB.
//
//  CallRtn is invoked with the TCB table lock held.
//
void  // Returns: Nothing.
TCBWalk(
    uint (*CallRtn)(struct TCB *, void *, void *, void *),  // Routine to call.
    void *Context1,  // Context to pass to CallRtn.
    void *Context2,  // Second context to pass to call routine.
    void *Context3)  // Third context to pass to call routine.
{
    uint i;
    TCB *CurTCB;
    KIRQL Irql0, Irql1;

    //
    // Loop through each bucket in the table, going down the chain of
    // TCBs on the bucket.  For each one call CallRtn.
    //
    KeAcquireSpinLock(&TCBTableLock, &Irql0);

    for (i = 0; i < TcbTableSize; i++) {

        CurTCB = TCBTable[i];

        //
        // Walk down the chain on this bucket.
        //
        while (CurTCB != NULL) {
            if (!(*CallRtn)(CurTCB, Context1, Context2, Context3)) {
                //
                // Call failed on this one.
                // Notify the client and close the TCB.
                //
                KeAcquireSpinLock(&CurTCB->tcb_lock, &Irql1);
                if (!CLOSING(CurTCB)) {
                    CurTCB->tcb_refcnt++;

                    //
                    // Drop the table lock while we close the TCB; the
                    // reference taken above is released by DerefTCB below.
                    //
                    KeReleaseSpinLock(&TCBTableLock, Irql1);
                    TryToCloseTCB(CurTCB, TCB_CLOSE_ABORTED, Irql0);

                    RemoveTCBFromConn(CurTCB);
                    if (CurTCB->tcb_state != TCB_TIME_WAIT)
                        NotifyOfDisc(CurTCB, TDI_CONNECTION_ABORTED);

                    KeAcquireSpinLock(&CurTCB->tcb_lock, &Irql0);
                    DerefTCB(CurTCB, Irql0);
                    KeAcquireSpinLock(&TCBTableLock, &Irql0);
                } else
                    KeReleaseSpinLock(&CurTCB->tcb_lock, Irql1);

                // Resume from the bucket head rather than via tcb_next.
                CurTCB = FindNextTCB(i, CurTCB);
            } else {
                CurTCB = CurTCB->tcb_next;
            }
        }
    }

    KeReleaseSpinLock(&TCBTableLock, Irql0);
}


//* FindTCB - Find a TCB in the tcb table.
//
//  Called when we need to find a TCB in the TCB table.  We take a quick
//  look at the last TCB we found, and if it matches we return it.  Otherwise
//  we hash into the TCB table and look for it.  We assume the TCB table lock
//  is held when we are called.
//
TCB *  // Returns: Pointer to TCB found, or NULL if none.
FindTCB(
    IPv6Addr *Src,     // Source IP address of TCB to be found.
    IPv6Addr *Dest,    // Destination IP address of TCB to be found.
    uint SrcScopeId,   // Source address scope identifier.
    uint DestScopeId,  // Destination address scope identifier.
    ushort SrcPort,    // Source port of TCB to be found.
    ushort DestPort)   // Destination port of TCB to be found.
{
    TCB *FoundTCB;

    if (LastTCB != NULL) {
        CHECK_STRUCT(LastTCB, tcb);
        if (IP6_ADDR_EQUAL(&LastTCB->tcb_daddr, Dest) &&
            LastTCB->tcb_dscope_id == DestScopeId &&
            LastTCB->tcb_dport == DestPort &&
            IP6_ADDR_EQUAL(&LastTCB->tcb_saddr, Src) &&
            LastTCB->tcb_sscope_id == SrcScopeId &&
            LastTCB->tcb_sport == SrcPort)
            return LastTCB;
    }

    //
    // Didn't find it in our 1 element cache.
    //
    FoundTCB = TCBTable[TCB_HASH(*Dest, *Src, DestPort, SrcPort)];
    while (FoundTCB != NULL) {
        CHECK_STRUCT(FoundTCB, tcb);
        if (IP6_ADDR_EQUAL(&FoundTCB->tcb_daddr, Dest) &&
            FoundTCB->tcb_dscope_id == DestScopeId &&
            FoundTCB->tcb_dport == DestPort &&
            IP6_ADDR_EQUAL(&FoundTCB->tcb_saddr, Src) &&
            FoundTCB->tcb_sscope_id == SrcScopeId &&
            FoundTCB->tcb_sport == SrcPort) {

            //
            // Found it.  Update the cache for next time, and return.
            //
            LastTCB = FoundTCB;
            return FoundTCB;
        } else
            FoundTCB = FoundTCB->tcb_next;
    }

    // Fell off the end of the chain; FoundTCB is NULL here.
    return FoundTCB;
}


//* InsertTCB - Insert a TCB in the tcb table.
//
//  This routine inserts a TCB in the TCB table.  No locks need to be held
//  when this routine is called.  We insert TCBs in ascending address order.
//  Before inserting we make sure that the TCB isn't already in the table.
//
uint  // Returns: TRUE if we inserted, false if we didn't.
InsertTCB(
    TCB *NewTCB)  // TCB to be inserted.
{
    uint TCBIndex;
    KIRQL OldIrql;
    TCB *PrevTCB, *CurrentTCB;
    TCB *WhereToInsert;

    ASSERT(NewTCB != NULL);
    CHECK_STRUCT(NewTCB, tcb);
    TCBIndex = TCB_HASH(NewTCB->tcb_daddr, NewTCB->tcb_saddr,
                        NewTCB->tcb_dport, NewTCB->tcb_sport);

    KeAcquireSpinLock(&TCBTableLock, &OldIrql);
    KeAcquireSpinLockAtDpcLevel(&NewTCB->tcb_lock);

    //
    // Find the proper place in the table to insert him.  While
    // we're walking we'll check to see if a dupe already exists.
    // When we find the right place to insert, we'll remember it, and
    // keep walking looking for a duplicate.
    //
    // PrevTCB starts out as a pseudo-TCB whose tcb_next field overlays the
    // bucket head, so inserting at the front needs no special case.
    //
    PrevTCB = CONTAINING_RECORD(&TCBTable[TCBIndex], TCB, tcb_next);
    WhereToInsert = NULL;

    while (PrevTCB->tcb_next != NULL) {
        CurrentTCB = PrevTCB->tcb_next;

        //
        // NOTE(review): unlike FindTCB, this duplicate test does not compare
        // tcb_sscope_id / tcb_dscope_id, so two TCBs that differ only in
        // scope ID are treated as duplicates here.  Confirm this is intended.
        //
        if (IP6_ADDR_EQUAL(&CurrentTCB->tcb_daddr, &NewTCB->tcb_daddr) &&
            IP6_ADDR_EQUAL(&CurrentTCB->tcb_saddr, &NewTCB->tcb_saddr) &&
            (CurrentTCB->tcb_sport == NewTCB->tcb_sport) &&
            (CurrentTCB->tcb_dport == NewTCB->tcb_dport)) {

            KeReleaseSpinLockFromDpcLevel(&NewTCB->tcb_lock);
            KeReleaseSpinLock(&TCBTableLock, OldIrql);
            return FALSE;

        } else {

            if (WhereToInsert == NULL && CurrentTCB > NewTCB) {
                WhereToInsert = PrevTCB;
            }

            CHECK_STRUCT(PrevTCB->tcb_next, tcb);
            PrevTCB = PrevTCB->tcb_next;
        }
    }

    // No larger address on the chain: append at the tail.
    if (WhereToInsert == NULL) {
        WhereToInsert = PrevTCB;
    }

    NewTCB->tcb_next = WhereToInsert->tcb_next;
    WhereToInsert->tcb_next = NewTCB;
    NewTCB->tcb_flags |= IN_TCB_TABLE;
    TStats.ts_numconns++;

    KeReleaseSpinLockFromDpcLevel(&NewTCB->tcb_lock);
    KeReleaseSpinLock(&TCBTableLock, OldIrql);
    return TRUE;
}


//* RemoveTCB - Remove a TCB from the tcb table.
//
//  Called when we need to remove a TCB from the TCB table.  We assume the
//  TCB table lock and the TCB lock are held when we are called.  If the
//  TCB isn't in the table we won't try to remove him.
//
//  If a table walk is in progress (TCBWalkCount != 0) the TCB is put on
//  PendingFreeList instead, and FALSE is returned.
//
uint  // Returns: TRUE if it's OK to free it, FALSE otherwise.
RemoveTCB(
    TCB *RemovedTCB)  // TCB to be removed.
{
    uint TCBIndex;
    TCB *PrevTCB;
#if DBG
    uint Found = FALSE;
#endif

    CHECK_STRUCT(RemovedTCB, tcb);

    if (RemovedTCB->tcb_flags & IN_TCB_TABLE) {
        TCBIndex = TCB_HASH(RemovedTCB->tcb_daddr, RemovedTCB->tcb_saddr,
                            RemovedTCB->tcb_dport, RemovedTCB->tcb_sport);

        PrevTCB = CONTAINING_RECORD(&TCBTable[TCBIndex], TCB, tcb_next);

        do {
            if (PrevTCB->tcb_next == RemovedTCB) {
                // Found him.
                PrevTCB->tcb_next = RemovedTCB->tcb_next;
                RemovedTCB->tcb_flags &= ~IN_TCB_TABLE;
                TStats.ts_numconns--;
#if DBG
                Found = TRUE;
#endif
                break;
            }
            PrevTCB = PrevTCB->tcb_next;
#if DBG
            if (PrevTCB != NULL)
                CHECK_STRUCT(PrevTCB, tcb);
#endif
        } while (PrevTCB != NULL);

        ASSERT(Found);
    }

    // Don't leave a dangling pointer in FindTCB's one element cache.
    if (LastTCB == RemovedTCB)
        LastTCB = NULL;

    if (TCBWalkCount == 0) {
        return TRUE;
    } else {
        //
        // Someone is walking the table.  Queue the TCB on the pending free
        // list (linked through tcb_delayq.q_next); TCBTimeout counts
        // tcb_walkcount down and unlinks or frees it.
        //
        RemovedTCB->tcb_walkcount = TCBWalkCount + 1;
        *(TCB **)&RemovedTCB->tcb_delayq.q_next = PendingFreeList;
        PendingFreeList = RemovedTCB;
        return FALSE;
    }
}


//* ScavengeTCB - Scavenge a TCB that's in the TIME_WAIT state.
//
//  Called when we're running low on TCBs, and need to scavenge one from
//  TIME_WAIT state.  We'll walk through the TCB table, looking for the oldest
//  TCB in TIME_WAIT.  We'll remove and return a pointer to that TCB.  If we
//  don't find any TCBs in TIME_WAIT, we'll return NULL.
//
//  We also return NULL without looking if a table walk is in progress.
//
TCB *  // Returns: Pointer to a reusable TCB, or NULL.
ScavengeTCB(
    void)
{
    KIRQL Irql0, Irql1, IrqlSave;  // IrqlSave is set with the first find.
    uint Now = SystemUpTime();
    uint Delta = 0;                // Age of the oldest candidate so far.
    uint i;
    TCB *FoundTCB = NULL, *PrevFound;
    TCB *CurrentTCB, *PrevTCB;

    KeAcquireSpinLock(&TCBTableLock, &Irql0);

    if (TCBWalkCount != 0) {
        KeReleaseSpinLock(&TCBTableLock, Irql0);
        return NULL;
    }

    for (i = 0; i < TcbTableSize; i++) {

        PrevTCB = CONTAINING_RECORD(&TCBTable[i], TCB, tcb_next);
        CurrentTCB = PrevTCB->tcb_next;

        while (CurrentTCB != NULL) {
            CHECK_STRUCT(CurrentTCB, tcb);

            KeAcquireSpinLock(&CurrentTCB->tcb_lock, &Irql1);
            if (CurrentTCB->tcb_state == TCB_TIME_WAIT &&
                (CurrentTCB->tcb_refcnt == 0) && !CLOSING(CurrentTCB)){
                //
                // NOTE(review): AllocTCB stamps tcb_alive from TCPTime, but
                // the age here is measured against SystemUpTime() - confirm
                // both are in the same units.
                //
                if (FoundTCB == NULL ||
                    ((Now - CurrentTCB->tcb_alive) > Delta)) {
                    //
                    // Found a new 'older' TCB.  If we already have one, free
                    // the lock on him and get the lock on the new one.
                    //
                    if (FoundTCB != NULL)
                        KeReleaseSpinLock(&FoundTCB->tcb_lock, Irql1);
                    else
                        IrqlSave = Irql1;

                    PrevFound = PrevTCB;
                    FoundTCB = CurrentTCB;
                    Delta = Now - FoundTCB->tcb_alive;
                } else
                    KeReleaseSpinLock(&CurrentTCB->tcb_lock, Irql1);
            } else
                KeReleaseSpinLock(&CurrentTCB->tcb_lock, Irql1);

            //
            // Look at the next one.
            //
            PrevTCB = CurrentTCB;
            CurrentTCB = PrevTCB->tcb_next;
        }
    }

    //
    // If we have one, pull him from the list.
    //
    if (FoundTCB != NULL) {
        PrevFound->tcb_next = FoundTCB->tcb_next;
        FoundTCB->tcb_flags &= ~IN_TCB_TABLE;

        //
        // REVIEW: Is the right place to drop the reference on our RCE?
        // REVIEW: IPv4 called down to IP to close the RCE here.
        //
        if (FoundTCB->tcb_rce != NULL)
            ReleaseRCE(FoundTCB->tcb_rce);

        TStats.ts_numconns--;
        if (LastTCB == FoundTCB) {
            LastTCB = NULL;
        }
        KeReleaseSpinLock(&FoundTCB->tcb_lock, IrqlSave);
    }

    KeReleaseSpinLock(&TCBTableLock, Irql0);
    return FoundTCB;
}


//* AllocTCB - Allocate a TCB.
//
//  Called whenever we need to allocate a TCB.  We try to pull one off the
//  free list, or allocate one if we need one.  We then initialize it, etc.
//
TCB *  // Returns: Pointer to the new TCB, or NULL if we couldn't get one.
AllocTCB(
    void)
{
    TCB *NewTCB;

    //
    // First, see if we have one on the free list.
    //
    PSLIST_ENTRY BufferLink;

    BufferLink = ExInterlockedPopEntrySList(&FreeTCBList, &FreeTCBListLock);

    if (BufferLink != NULL) {
        // FreeTCB links free TCBs through their tcb_next field.
        NewTCB = CONTAINING_RECORD(BufferLink, TCB, tcb_next);
        CHECK_STRUCT(NewTCB, tcb);
        ExInterlockedAddUlong(&FreeTCBs, -1, &FreeTCBListLock);
    } else {
        //
        // We have none on the free list.  If the total number of TCBs
        // outstanding is more than we like to keep on the free list, try
        // to scavenge a TCB from time wait.
        //
        if (CurrentTCBs < MaxFreeTCBs || ((NewTCB = ScavengeTCB()) == NULL)) {
            if (CurrentTCBs < MaxTCBs) {
                NewTCB = ExAllocatePool(NonPagedPool, sizeof(TCB));
                if (NewTCB == NULL) {
                    return NewTCB;
                } else {
                    ExInterlockedAddUlong(&CurrentTCBs, 1, &FreeTCBListLock);
                }
            } else
                return NULL;
        }
    }

    ASSERT(NewTCB != NULL);

    // Whatever its origin, the TCB starts out fully zeroed.
    RtlZeroMemory(NewTCB, sizeof(TCB));
#if DBG
    NewTCB->tcb_sig = tcb_signature;
#endif
    INITQ(&NewTCB->tcb_sendq);
    NewTCB->tcb_cursend = NULL;
    NewTCB->tcb_alive = TCPTime;
    NewTCB->tcb_hops = -1;

    //
    // Initially we're not on the fast path because we're not established.  Set
    // the slowcount to one and set up the fastchk fields so we don't take the
    // fast path.
    //
    NewTCB->tcb_slowcount = 1;
    NewTCB->tcb_fastchk = TCP_FLAG_ACK | TCP_FLAG_SLOW;
    KeInitializeSpinLock(&NewTCB->tcb_lock);

    return NewTCB;
}


//* FreeTCB - Free a TCB.
//
//  Called whenever we need to free a TCB.  The TCB is either returned to
//  the pool or pushed onto FreeTCBList for reuse by AllocTCB.
//
//  Note: This routine may be called with the TCBTableLock held.
//
void  // Returns: Nothing.
FreeTCB(
    TCB *FreedTCB)  // TCB to be freed.
{
    PSLIST_ENTRY BufferLink;

    CHECK_STRUCT(FreedTCB, tcb);

    //
    // Give the memory back once we hold more than twice MaxFreeTCBs TCBs
    // (or, on non-Win64 builds, once the free list depth passes 65000).
    //
#if defined(_WIN64)
    if (CurrentTCBs > 2 * MaxFreeTCBs) {
#else
    if ((CurrentTCBs > 2 * MaxFreeTCBs) || (FreeTCBList.Depth > 65000)) {
#endif
        ExInterlockedAddUlong(&CurrentTCBs, (ulong) - 1, &FreeTCBListLock);
        ExFreePool(FreedTCB);
        return;
    }

    // Reuse the tcb_next field as the free list linkage.
    BufferLink = CONTAINING_RECORD(&(FreedTCB->tcb_next),
                                   SLIST_ENTRY, Next);
    ExInterlockedPushEntrySList(&FreeTCBList, BufferLink, &FreeTCBListLock);
    ExInterlockedAddUlong(&FreeTCBs, 1, &FreeTCBListLock);
}


#pragma BEGIN_INIT

//* InitTCB - Initialize our TCB code.
//
//  Called during init time to initialize our TCB code.  We initialize
//  the TCB table, etc, then return.
//
int  // Returns: TRUE if we did initialize, false if we didn't.
InitTCB( void) { LARGE_INTEGER InitialWakeUp; uint i; TCBTable = ExAllocatePool(NonPagedPool, TcbTableSize * sizeof(TCB*)); if (TCBTable == NULL) { return FALSE; } for (i = 0; i < TcbTableSize; i++) TCBTable[i] = NULL; LastTCB = NULL; ExInitializeSListHead(&FreeTCBList); KeInitializeSpinLock(&TCBTableLock); KeInitializeSpinLock(&FreeTCBListLock); TCPTime = 0; TCBWalkCount = 0; DeadmanTicks = NUM_DEADMAN_TICKS; // // Set up our timer to call TCBTimeout once every MS_PER_TICK milliseconds. // // REVIEW: Switch this to be driven off the IPv6Timeout routine instead // REVIEW: of having two independent timers? // KeInitializeDpc(&TCBTimeoutDpc, TCBTimeout, NULL); KeInitializeTimer(&TCBTimer); InitialWakeUp.QuadPart = -(LONGLONG) MS_PER_TICK * 10000; KeSetTimerEx(&TCBTimer, InitialWakeUp, MS_PER_TICK, &TCBTimeoutDpc); return TRUE; } #pragma END_INIT //* UnloadTCB // // Called during shutdown to uninitialize // in preparation for unloading the stack. // // There are no open sockets (or else we wouldn't be unloading). // Because UnloadTCPSend has already been called, // we are no longer receiving packets from the IPv6 layer. // void UnloadTCB(void) { PSLIST_ENTRY BufferLink; TCB *CurrentTCB; uint i; KIRQL OldIrql; // // First stop TCBTimeout from being called. // KeCancelTimer(&TCBTimer); // // Traverse the buckets looking for TCBs. // REVIEW - Can we have TCBs in states other than time-wait? // for (i = 0; i < TcbTableSize; i++) { while ((CurrentTCB = TCBTable[i]) != NULL) { KeAcquireSpinLock(&CurrentTCB->tcb_lock, &OldIrql); KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INFO_STATE, "UnloadTCB(%p): state %x flags %x refs %x " "reason %x pend %x walk %x\n", CurrentTCB, CurrentTCB->tcb_state, CurrentTCB->tcb_flags, CurrentTCB->tcb_refcnt, CurrentTCB->tcb_closereason, CurrentTCB->tcb_pending, CurrentTCB->tcb_walkcount)); CurrentTCB->tcb_flags |= NEED_RST; TryToCloseTCB(CurrentTCB, TCB_CLOSE_ABORTED, OldIrql); } } // // Now pull TCBs off the free list and really free them. 
// while ((BufferLink = ExInterlockedPopEntrySList(&FreeTCBList, &FreeTCBListLock)) != NULL) { CurrentTCB = CONTAINING_RECORD(BufferLink, TCB, tcb_next); CHECK_STRUCT(CurrentTCB, tcb); ExFreePool(CurrentTCB); } ExFreePool(TCBTable); TCBTable = NULL; } //* CleanupTCBWithIF // // Helper function for TCBWalk, to remove // TCBs that reference the specified interface. // // Returns FALSE if CheckTCB should be deleted, TRUE otherwise. // uint CleanupTCBWithIF( TCB *CheckTCB, void *Context1, void *Context2, void *Context3) { Interface *IF = (Interface *) Context1; RouteCacheEntry *RCE; KIRQL OldIrql; CHECK_STRUCT(CheckTCB, tcb); RCE = CheckTCB->tcb_rce; if (RCE != NULL) { ASSERT(RCE->NTE->IF == RCE->NCE->IF); if (RCE->NTE->IF == IF) return FALSE; // Delete this TCB. } return TRUE; // Do not delete this TCB. } //* TCPRemoveIF // // Remove TCP's references to the specified interface. // void TCPRemoveIF(Interface *IF) { // // Currently, only TCBs hold onto references. // TCBWalk(CleanupTCBWithIF, IF, NULL, NULL); }