windows-nt/Source/XPSP1/NT/base/published/xsum_axp

// TITLE("Compute Checksum")
//++
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Module Name:
//
// xsum.s
//
// Abstract:
//
// This module implements a function to compute the checksum of a buffer.
//
// Author:
//
// John Vert (jvert) 11-Jul-1994
//
// Environment:
//
// Revision History:
//
//--
#include "ksalpha.h"
SBTTL("Compute Checksum")
//++
//
// ULONG
// tcpxsum (
// IN ULONG Checksum,
// IN PUSHORT Source,
// IN ULONG Length
// )
//
// Routine Description:
//
// This function computes the checksum of the specified buffer.
//
// Arguments:
//
// Checksum (a0) - Supplies the initial checksum value.
//
// Source (a1) - Supplies a pointer to the checksum buffer.
//
// Length (a2) - Supplies the length of the buffer in bytes.
//
// Return Value:
//
// The computed checksum is returned as the function value.
//
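// N.B. The checksum is accumulated 64 bits at a time with end-around carry
//      and is then folded down to a 16-bit result, giving the ones' complement
//      sum used by the Internet checksum.
//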
//--
LEAF_ENTRY(tcpxsum)
zap a0, 0xf0, a0 // clear high half of a0
bis a1, zero, t6 // save initial buffer address
bis zero, zero, v0 // clear accumulated checksum
//
// Check if the buffer is quadword aligned.
//
// If the buffer is not quadword aligned, then add the leading words to the
// checksum.
//
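// N.B. If the buffer starts at an odd address, the leading byte is summed into
//      the high byte position and the following bytes are summed with their
//      byte positions reversed; the byte swap at label 60 then restores the
//      result that a word aligned computation would have produced.
//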
ldq_u t0, 0(a1) // get containing quadword of first part
blbc a1, 10f // check for word alignment
beq a2, 65f // if zero bytes, don't do anything
extbl t0, a1, t1 // get leading byte
sll t1, 8, v0 // shift it to correct spot for later byte swap
addq a1, 1, a1 // increment buffer to first full word
subq a2, 1, a2 // decrement byte count
10:
and a1, 6, t2 // check if buffer quadword aligned
beq t2, 20f // if eq, quadword aligned
extql t0, t2, t0 // extract bytes to checksum
and a1, 7, t3 // compute bytes summed from this quadword
subq zero, t3, t3 // negate byte offset within quadword
addq t3, 8, t3 // t3 = 8 - offset = bytes summed
addq a1, 8, a1 // advance buffer address to next qword
bic a1, 7, a1 // round down to quadword boundary
subq a2, t3, t2
blt t2, 55f // if ltz, too many, jump to residual code
addq v0, t0, v0 // add bytes to partial checksum
cmpult v0, t0, t1 // generate carry
addq t1, v0, v0 // add carry back into checksum
bis t2, zero, a2 // reduce count of bytes to checksum
beq t2, 60f // if eq, no more bytes
20:
//
// Compute the checksum in 64-byte blocks
//
bic a2, 7, t4 // subtract out residual bytes
beq t4, 40f // if eq, no quadwords to checksum
subq zero, t4, t2 // compute negative of byte count
and t2, 15 << 2, t3 // compute bytes to skip in first iteration
ldq t0, 0(a1) // get first quadword to checksum
beq t3, 35f // if eq, full 64-byte block
subq a1, t3, a1 // bias buffer address by offset
bic t4, 64-1, t4 // subtract out bytes in first iteration
lda t2, 30f // get base address of code vector
addl t3, t3, t3 // double offset (16 bytes of code per 8-byte quadword)
addq t3, t2, t2 // compute code vector offset
bis t0, zero, t1 // copy first quadword to checksum
jmp (t2) // dispatch
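//
// N.B. The jump above enters the unrolled block partway through so that a
//      leading chunk of fewer than 64 bytes is handled by the same code. Each
//      quadword step is four instructions (16 bytes of code), the buffer
//      address was biased above so the load offsets line up, and the first
//      quadword was copied into both t0 and t1 so the entry step adds it no
//      matter which register it expects.
//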
30:
//
// The following code vector computes the checksum of a 64-byte block.
//
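// Each quadword is folded in with an end-around carry: the cmpult after each
// addq captures the carry out of the 64-bit addition and the next addq adds
// it back into the running sum.
//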
.set noreorder
ldq t1, 8(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 16(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 24(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 32(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 40(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 48(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 56(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
addq a1, 64, a1
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
.set reorder
beq t4, 40f // if zero, end of block
35:
ldq t0, 0(a1)
//
// The following loop is allowed to be reordered by the assembler for
// optimal scheduling. It is never branched into.
//
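// Each pass through the loop consumes 64 bytes. The loads alternate between
// t0 and t1 while the adds consume the previously loaded register, so the
// assembler can schedule the adds into the shadows of the loads.
//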
subq t4, 64, t4 // reduce count of bytes remaining in whole quadwords
ldq t1, 8(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 16(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 24(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 32(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 40(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
ldq t0, 48(a1)
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
ldq t1, 56(a1)
addq v0, t0, v0
cmpult v0, t0, t2
addq v0, t2, v0
addq a1, 64, a1
addq v0, t1, v0
cmpult v0, t1, t2
addq v0, t2, v0
bne t4, 35b // if ne zero, not end of block
40:
//
// Check for any remaining bytes.
//
and a2, 7, a2 // isolate residual bytes
beq a2, 60f // if eq, no residual bytes
50:
//
// Checksum remaining bytes.
//
// The technique we use here is to load the final quadword, then
// zero out the bytes that are not included.
//
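// An all-ones value shifted left by the residual byte count yields a zap mask
// whose low bits are clear, so zap preserves exactly the residual bytes and
// clears the bytes that lie beyond the end of the buffer.
//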
ldq t0, 0(a1) // get quadword surrounding remainder
55:
ornot zero, zero, t1 // get FF mask
sll t1, a2, t2 // shift to produce byte mask
zap t0, t2, t0 // zero out bytes past end of buffer
addq v0, t0, v0 // add quadword to partial checksum
cmpult v0, t0, t1 // generate carry
addq t1, v0, v0 // add carry back into checksum
60:
//
// Byte swap the 64-bit checksum if the start of the buffer was not word aligned.
//
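// This exchanges the two bytes of every word in the 64-bit sum; since the
// final fold is performed modulo 0xffff, this is equivalent to byte swapping
// the folded 16-bit checksum and undoes the one byte offset introduced at the
// start of the buffer.
//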
blbc t6, 65f
zap v0, 0xAA, t0 // isolate even bytes
sll t0, 8, t0 // shift even bytes into odd positions
srl v0, 8, t1 // shift odd bytes into even positions
zap t1, 0xAA, t1 // isolate odd bytes
bis t0, t1, v0 // merge bytes back together
65:
//
// Add the computed checksum to the original checksum, and fold the 64-bit
// result down to 16 bits.
//
addq v0, a0, v0 // add computed checksum to original
cmpult v0, a0, t0 // generate carry
addq v0, t0, v0 // add carry back into checksum
//
// Swap the longwords in order to sum two longwords and their carry in one add.
//
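// Because t5 holds the two longwords of v0 exchanged, the carry out of the
// low half of the addition propagates directly into the high half, which then
// contains low + high + carry.
//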
sll v0, 32, t0 // shift low longword into high
srl v0, 32, t1 // shift high longword into low
bis t1, t0, t5 // merge back together
addq v0, t5, t0 // produce sum + carry in high longword
srl t0, 32, t1 // shift back down to low half
//
// Swap the words in order to sum two words and their carry in one add.
//
sll t1, 16, t2 // shift high word into low
srl t1, 16, t3 // shift low word into high
bis t2, t3, t4 // merge back together
addq t4, t1, t2 // produce sum and carry in high word
extwl t2, 2, v0 // extract result.
ret zero, (ra) // return
.end tcpxsum