//++
//
// Module Name:
//
//    fillmem.s
//
// Abstract:
//
//    This module implements functions to move, zero, and fill blocks
//    of memory.  If the memory is aligned, then these functions are
//    very efficient.
//
// Author:
//
// Environment:
//
//    User or Kernel mode.
//
//--

#include "ksia64.h"

//++
//
// VOID
// RtlFillMemory (
//    IN PVOID destination,
//    IN SIZE_T length,
//    IN UCHAR fill
//    )
//
// Routine Description:
//
//    This function fills memory by first aligning the destination address to
//    a qword boundary, and then filling 8-byte blocks, followed by any
//    remaining bytes.
//
// Arguments:
//
//    destination (a0) - Supplies a pointer to the memory to fill.
//
//    length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
//    fill (a2) - Supplies the fill byte.
//
//    N.B. The alternate entry memset expects the length and fill arguments
//    to be reversed.  It also returns the Destination pointer.
//
// Return Value:
//
//    None.
//
//--
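
//
// For reference, the algorithm described above corresponds roughly to the
// following C sketch (illustration only; the helper name c_fill_memory is
// hypothetical and not part of this module):
//
//      void c_fill_memory(void *destination, size_t length, unsigned char fill)
//      {
//          unsigned char *p = destination;
//          unsigned long long pattern = 0x0101010101010101ULL * fill;
//
//          // copy leading bytes until the address is 8-byte aligned
//          while (length != 0 && ((size_t)p & 7) != 0) {
//              *p++ = fill;
//              length -= 1;
//          }
//
//          // fill 8 bytes at a time (the assembly also unrolls this by four)
//          while (length >= 8) {
//              *(unsigned long long *)p = pattern;
//              p += 8;
//              length -= 8;
//          }
//
//          // store the remaining bytes one at a time
//          while (length != 0) {
//              *p++ = fill;
//              length -= 1;
//          }
//      }
//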

        LEAF_ENTRY(RtlFillMemory)

        lfetch.excl [a0]
        mov      t0 = a0
        add      t4 = 64, a0

        cmp.eq   pt0 = zero, a1              // length == 0 ?
        add      t1 = -1, a0
        zxt1     a2 = a2

        cmp.ge   pt1 = 7, a1
        mov      v0 = a0
(pt0)   br.ret.spnt brp                      // return if length is zero
        ;;

//
// Align the address on a qword boundary by determining the number of bytes
// before the next qword boundary: AND the 2's complement of the address
// with a mask value of 0x7 (e.g., if the address ends in 5, (-address) & 7
// leaves 3 bytes before the next 8-byte boundary).
//

        lfetch.excl [t4], 64
        andcm    t1 = 7, t1                  // t1 = # bytes before qword boundary
(pt1)   br.cond.spnt TailSet                 // 1 <= length <= 7, br to TailSet
        ;;

        cmp.eq   pt2 = zero, t1              // skip HeadSet if t1 is zero
        mux1     t2 = a2, @brcst             // t2 = all 8 bytes = [fill]
        sub      a1 = a1, t1                 // a1 = adjusted length
        ;;

        lfetch.excl [t4], 64
(pt2)   br.cond.sptk SkipHeadSet

//
// Copy the leading bytes until t1 is equal to zero
//

HeadSet:
        st1      [t0] = a2, 1
        add      t1 = -1, t1
        ;;
        cmp.ne   pt0 = zero, t1

(pt0)   br.cond.sptk HeadSet

//
// now the address is qword aligned;
// fall into the qword loops if the remaining length is at least 16;
// else skip the QwordSet loop
//

SkipHeadSet:

        cmp.gt   pt1 = 16, a1
        add      t4 = 64, t0
        cmp.le   pt2 = 8, a1

        add      t3 = 8, t0
        cmp.gt   pt3 = 64, a1
(pt1)   br.cond.spnt SkipQwordSet
        ;;

        lfetch.excl [t4], 64
(pt3)   br.cond.spnt QwordSet

        nop.m    0
        nop.m    0
        nop.i    0

UnrolledQwordSet:

        st8      [t0] = t2, 16
        st8      [t3] = t2, 16
        add      a1 = -64, a1
        ;;

        st8      [t0] = t2, 16
        st8      [t3] = t2, 16
        cmp.le   pt0 = 64, a1
        ;;

        st8      [t0] = t2, 16
        st8      [t3] = t2, 16
        cmp.le   pt2 = 8, a1
        ;;

        st8      [t0] = t2, 16
        nop.f    0
        cmp.gt   pt1 = 16, a1

        st8      [t3] = t2, 16
(pt0)   br.cond.sptk UnrolledQwordSet
(pt1)   br.cond.spnt SkipQwordSet
        ;;

//
// fill 8 bytes at a time until the remaining length is less than 8
//

QwordSet:
        st8      [t0] = t2, 16
        st8      [t3] = t2, 16
        add      a1 = -16, a1
        ;;

        cmp.le   pt0 = 16, a1
        cmp.le   pt2 = 8, a1
(pt0)   br.cond.sptk QwordSet
        ;;

SkipQwordSet:
(pt2)   st8      [t0] = t2, 8
(pt2)   add      a1 = -8, a1
        ;;

        cmp.eq   pt3 = zero, a1              // return now if length equals 0
(pt3)   br.ret.sptk brp
        ;;

//
// copy the remaining bytes one at a time
//

TailSet:
        st1      [t0] = a2, 1
        add      a1 = -1, a1
        nop.i    0
        ;;

        cmp.ne   pt0, pt3 = 0, a1
(pt0)   br.cond.dptk TailSet
(pt3)   br.ret.dpnt brp
        ;;

        LEAF_EXIT(RtlFillMemory)

//++
//
// VOID
// RtlFillMemoryUlong (
//    IN PVOID Destination,
//    IN SIZE_T Length,
//    IN ULONG Pattern
//    )
//
// Routine Description:
//
//    This function fills memory with the specified longword pattern
//    4 bytes at a time.
//
//    N.B. This routine assumes that the destination address is aligned
//    on a longword boundary and that the length is an even multiple
//    of longwords.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the memory to fill.
//
//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
//    Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
//    None.
//
//--
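
//
// For reference, this routine corresponds roughly to the following C sketch
// (illustration only; the helper name c_fill_memory_ulong is hypothetical):
//
//      void c_fill_memory_ulong(void *destination, size_t length, unsigned int pattern)
//      {
//          unsigned int *p = destination;
//          size_t count = length / 4;        // extr.u a1 = a1, 2, 30
//
//          while (count-- != 0) {
//              *p++ = pattern;               // st4 [a0] = a2, 4
//          }
//      }
//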

        LEAF_ENTRY(RtlFillMemoryUlong)

        .prologue
        .save    ar.lc, t22
        mov      t22 = ar.lc
        extr.u   a1 = a1, 2, 30
        ;;

        PROLOGUE_END

        cmp.eq   pt0, pt1 = zero, a1
        add      a1 = -1, a1
        ;;

        nop.m    0
(pt1)   mov      ar.lc = a1
(pt0)   br.ret.spnt brp
        ;;
Rfmu10:
        st4      [a0] = a2, 4
        br.cloop.dptk.few Rfmu10
        ;;

        nop.m    0
        mov      ar.lc = t22
        br.ret.sptk brp

        LEAF_EXIT(RtlFillMemoryUlong)

//++
//
// VOID
// RtlFillMemoryUlonglong (
//    IN PVOID Destination,
//    IN SIZE_T Length,
//    IN ULONGLONG Pattern
//    )
//
// Routine Description:
//
//    This function fills memory with the specified pattern
//    8 bytes at a time.
//
//    N.B. This routine assumes that the destination address is aligned
//    on a quadword boundary and that the length is an even multiple
//    of quadwords.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the memory to fill.
//
//    Length (a1) - Supplies the length, in bytes, of the memory to be filled.
//
//    Pattern (a2) - Supplies the fill pattern.
//
// Return Value:
//
//    None.
//
//--
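
//
// This routine is the 8-byte counterpart of RtlFillMemoryUlong; in C it is
// roughly (illustration only; the helper name is hypothetical):
//
//      void c_fill_memory_ulonglong(void *destination, size_t length,
//                                   unsigned long long pattern)
//      {
//          unsigned long long *p = destination;
//          size_t count = length / 8;        // extr.u a1 = a1, 3, 29
//
//          while (count-- != 0) {
//              *p++ = pattern;               // st8 [a0] = a2, 8
//          }
//      }
//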

        LEAF_ENTRY(RtlFillMemoryUlonglong)

        .prologue
        .save    ar.lc, t22
        mov      t22 = ar.lc
        extr.u   a1 = a1, 3, 29
        ;;

        PROLOGUE_END

        cmp.eq   pt0, pt1 = zero, a1
        add      a1 = -1, a1
        ;;

        nop.m    0
(pt1)   mov      ar.lc = a1
(pt0)   br.ret.spnt brp
        ;;
Rfmul10:
        st8      [a0] = a2, 8
        br.cloop.dptk.few Rfmul10
        ;;

        nop.m    0
        mov      ar.lc = t22
        br.ret.sptk brp
        ;;

        LEAF_EXIT(RtlFillMemoryUlonglong)

//++
//
// VOID
// RtlZeroMemory (
//    IN PVOID Destination,
//    IN SIZE_T Length
//    )
//
// Routine Description:
//
//    This function simply sets up the fill value (out2) and branches
//    directly to RtlFillMemory.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the memory to zero.
//
//    Length (a1) - Supplies the length, in bytes, of the memory to be zeroed.
//
// Return Value:
//
//    None.
//
//--
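
//
// In C terms this is simply (illustration only):
//
//      void c_zero_memory(void *destination, size_t length)
//      {
//          RtlFillMemory(destination, length, 0);    // out2 = 0, br RtlFillMemory
//      }
//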

        LEAF_ENTRY(RtlZeroMemory)

        alloc    t22 = ar.pfs, 0, 0, 3, 0
        mov      out2 = 0
        br       RtlFillMemory

        LEAF_EXIT(RtlZeroMemory)

//++
//
// VOID
// RtlMoveMemory (
//    IN PVOID Destination,
//    IN PVOID Source,
//    IN SIZE_T Length
//    )
//
// Routine Description:
//
//    This function moves memory either forward or backward, aligned or
//    unaligned.
//
//    Algorithm:
//    1) if Length equals zero, return immediately
//    2) if Destination is above Source and the ranges overlap, copy from
//       high to low addresses (CopyDown); else copy from low to high
//    3) if Source & Destination are both 8-byte aligned, copy 8 bytes
//       at a time and copy the remaining bytes one at a time.
//    4) if Source & Destination have the same alignment within an 8-byte
//       boundary, copy bytes up to that boundary, then proceed as in 3).
//    5) else, for moves longer than 16 bytes, load aligned 8-byte words
//       and merge them with shrp in a software-pipelined loop; shorter
//       moves are copied one byte at a time.
//
// Arguments:
//
//    Destination (a0) - Supplies a pointer to the destination address of
//       the move operation.
//
//    Source (a1) - Supplies a pointer to the source address of the move
//       operation.
//
//    Length (a2) - Supplies the length, in bytes, of the memory to be moved.
//
// Return Value:
//
//    None.
//
//--
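
//
// The overlap test and direction choice are, in C terms, roughly the classic
// memmove pattern (illustration only; the helper name is hypothetical):
//
//      void c_move_memory(void *destination, const void *source, size_t length)
//      {
//          unsigned char *d = destination;
//          const unsigned char *s = source;
//
//          if (d > s && d < s + length) {
//              // destination starts inside the source: copy backward
//              while (length-- != 0) {
//                  d[length] = s[length];
//              }
//          } else {
//              // no harmful overlap: copy forward (8 bytes at a time when aligned)
//              while (length-- != 0) {
//                  *d++ = *s++;
//              }
//          }
//      }
//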

        LEAF_ENTRY(memcpy)
        ALTERNATE_ENTRY(memmove)
        ALTERNATE_ENTRY(RtlMoveMemory)
        ALTERNATE_ENTRY(RtlCopyMemory)
        ALTERNATE_ENTRY(RtlCopyMemoryNonTemporal)

        .prologue
        .regstk  3,7,0,8
        alloc    t17 = ar.pfs,3,31,0,32
        .save    pr, r64
        mov      r64 = pr
        and      t3 = -32, a1
        ;;

        lfetch   [t3], 32                    //0
        .save    ar.lc, r65
        mov.i    r65 = ar.lc
        and      t1 = 7, a1
        ;;

        .body
        lfetch   [t3], 32                    //32
        mov      v0 = a0
        and      t0 = 7, a0
        ;;

        add      t21 = a1, a2
        cmp.gtu  pt0 = a0, a1
        or       t2 = t0, t1
        ;;

(pt0)   cmp.ltu.unc pt0 = a0, t21
        cmp.eq   pt1 = zero, a2
(pt1)   br.ret.spnt brp

        lfetch   [t3], 32                    //64
        cmp.lt   pt2 = 16, a2
(pt0)   br.cond.spnt CopyDown
        ;;

        lfetch   [t3], 32                    //96
        cmp.lt   pt6 = 127, a2
        cmp.le   pt4 = 8, a2
        ;;

(pt6)   lfetch   [t3], 32                    //128
(pt4)   cmp.eq.unc pt3 = 0, t2
(pt4)   cmp.eq.unc pt5 = t0, t1

(pt3)   br.cond.sptk QwordMoveUp
(pt5)   br.cond.spnt AlignedMove
(pt2)   br.cond.sptk UnalignedMove

ByteMoveUpLoop:
        ld1      t10 = [a1], 1
        nop.f    0
        add      a2 = -1, a2
        ;;

        st1      [a0] = t10, 1
        cmp.ne   pt1 = zero, a2
(pt1)   br.cond.sptk ByteMoveUpLoop

        nop.m    0
        nop.f    0
        br.ret.sptk brp

UnalignedMove:
        cmp.eq   pt0 = 0, t1
        sub      t1 = 8, t1
(pt0)   br.cond.spnt SkipUnalignedMoveByteLoop
        ;;

UnalignedMoveByteLoop:
        ld1      t10 = [a1], 1
        add      t1 = -1, t1
        add      a2 = -1, a2
        ;;

        st1      [a0] = t10, 1
        cmp.eq   p0, pt1 = zero, t1
(pt1)   br.cond.sptk UnalignedMoveByteLoop
        ;;

SkipUnalignedMoveByteLoop:
        and      t0 = 7, a0
        mov      pr.rot = 3<<16
        or       t1 = a1, r0
        ;;

        add      t2 = a2, t0
        mov.i    ar.ec = 32
        sub      t21 = 8, t0
        ;;

        sub      t4 = a0, t0
        shr      t10 = t2, 3
        shl      t21 = t21, 3
        ;;

        ld8      r33 = [t4], 0
        add      t10 = -1,t10
        and      t2 = 7, t2
        ;;

        cmp.eq   pt0 = 2, t0
        cmp.eq   pt3 = 4, t0
        cmp.eq   pt5 = 6, t0
        ;;

        nop.m    0
        shl      r33 = r33,t21               // Prime r39
        mov.i    ar.lc = t10

(pt0)   br.cond.spnt SpecialLoop2
(pt3)   br.cond.spnt SpecialLoop4
(pt5)   br.cond.spnt SpecialLoop6

        cmp.eq   pt1 = 3, t0
        cmp.eq   pt4 = 5, t0
        cmp.eq   pt6 = 7, t0

(pt1)   br.cond.spnt SpecialLoop3
(pt4)   br.cond.spnt SpecialLoop5
(pt6)   br.cond.spnt SpecialLoop7
        ;;
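
//
// Each SpecialLoopN below handles one possible destination byte offset.
// The loop is software pipelined with rotating registers: (p16) ld8 reads
// aligned 8-byte source words, and once the pipeline fills, the rotating
// predicates enable the shrp/st8 pair, which merges two adjacent source
// words into one destination-aligned word.  In C terms each stored word
// is roughly (illustration only):
//
//      unsigned long long lo;          // word loaded earlier (lower address)
//      unsigned long long hi;          // word loaded 8 bytes later
//      unsigned long long out;
//
//      out = (lo >> count) | (hi << (64 - count));   // shrp out = hi, lo, count
//
// where count is the hard-coded shift in each loop (56, 48, ... 8), equal to
// 64 minus eight times the destination's byte offset within its 8-byte word.
//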

SpecialLoop1:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop1E, SpecialLoop1

SpecialLoop1E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,56
        br.ctop.sptk.many SpecialLoop1

        br       UnalignedByteDone

SpecialLoop2:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop2E, SpecialLoop2

SpecialLoop2E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,48
        br.ctop.sptk.many SpecialLoop2

        br       UnalignedByteDone

SpecialLoop3:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop3E, SpecialLoop3

SpecialLoop3E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,40
        br.ctop.sptk.many SpecialLoop3

        br       UnalignedByteDone

SpecialLoop4:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop4E, SpecialLoop4

SpecialLoop4E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,32
        br.ctop.sptk.many SpecialLoop4

        br       UnalignedByteDone

SpecialLoop5:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop5E, SpecialLoop5

SpecialLoop5E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,24
        br.ctop.sptk.many SpecialLoop5

        br       UnalignedByteDone

SpecialLoop6:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop6E, SpecialLoop6

SpecialLoop6E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,16
        br.ctop.sptk.many SpecialLoop6

        br       UnalignedByteDone

SpecialLoop7:
(p16)   ld8      r32 = [t1], 8
        nop.f    0
        brp.sptk.imp SpecialLoop7E, SpecialLoop7

SpecialLoop7E:
(p48)   st8      [t4] = r10, 8
(p47)   shrp     r10 = r62,r63,8
        br.ctop.sptk.many SpecialLoop7;;

UnalignedByteDone:
        sub      t1 = t1, t0
        mov      pr = r64
        mov.i    ar.lc = r65
        ;;

        cmp.eq   pt0 = zero, t2
(pt0)   br.ret.spnt brp

UnAlignedByteDoneLoop:
        ld1      t10 = [t1], 1
        add      t2 = -1, t2
        ;;
        cmp.ne   pt1 = zero, t2

        st1      [t4] = t10, 1
(pt1)   br.cond.sptk UnAlignedByteDoneLoop
        br.ret.spnt brp

AlignedMove:
        add      t4 = 64, t3
(pt6)   lfetch   [t3], 32                    //160
        sub      t22 = 8, t0
        ;;

(pt6)   lfetch   [t3], 64                    //192
(pt6)   lfetch   [t4], 96                    //224
        sub      a2 = a2, t22
        ;;

AlignedMoveByteLoop:
        ld1      t10 = [a1], 1
        nop.f    0
        add      t22 = -1, t22
        ;;

        st1      [a0] = t10, 1
        cmp.ne   pt1 = zero, t22
(pt1)   br.cond.sptk AlignedMoveByteLoop
        ;;

(pt6)   lfetch   [t3], 32                    //256
        cmp.eq.unc pt0 = zero, a2
        cmp.gt   pt2 = 8, a2

(pt6)   lfetch   [t4], 128                   //320
(pt0)   br.ret.spnt brp
(pt2)   br.cond.sptk ByteMoveUpLoop
        ;;

//
// both src & dest are now 8-byte aligned
//

QwordMoveUp:
        add      t3 = 128, a1
        add      t4 = 288, a1
        add      t7 = 8, a1

        add      t8 = 8, a0
        cmp.gt   pt3 = 64, a2
(pt3)   br.cond.spnt QwordMoveUpLoop
        ;;

UnrolledQwordMoveUpLoop:

        ld8      t10 = [a1], 16
        ld8      t11 = [t7], 16
        add      a2 = -64, a2
        ;;

        ld8      t12 = [a1], 16
        ld8      t13 = [t7], 16
        cmp.le   pt3 = 128, a2
        ;;

        ld8      t14 = [a1], 16
        ld8      t15 = [t7], 16
        cmp.gt   pt2 = 8, a2
        ;;

        ld8      t16 = [a1], 16
        ld8      t17 = [t7], 16
        ;;

(pt3)   lfetch   [t3], 64
(pt3)   lfetch   [t4], 64

        st8      [a0] = t10, 16
        st8      [t8] = t11, 16
        ;;

        st8      [a0] = t12, 16
        st8      [t8] = t13, 16
        ;;

        st8      [a0] = t14, 16
        st8      [t8] = t15, 16
        ;;

        st8      [a0] = t16, 16
        st8      [t8] = t17, 16
(pt3)   br.cond.dptk UnrolledQwordMoveUpLoop

(pt2)   br.cond.spnt ByteMoveUp
        ;;

QwordMoveUpLoop:

        ld8      t10 = [a1], 8
        add      a2 = -8, a2
        ;;
        cmp.le   pt1 = 8, a2

        st8      [a0] = t10, 8
(pt1)   br.cond.sptk QwordMoveUpLoop
        ;;

ByteMoveUp:
        cmp.eq   pt0 = zero, a2
(pt0)   br.ret.spnt brp
        ;;

AlignedByteDoneLoop:
        ld1      t10 = [a1], 1
        add      a2 = -1, a2
        ;;
        cmp.ne   pt1 = zero, a2

        st1      [a0] = t10, 1
(pt1)   br.cond.sptk AlignedByteDoneLoop
        br.ret.spnt brp
        ;;

CopyDown:
        cmp.eq   pt0 = zero, a2
        cmp.ne   pt6 = t0, t1
(pt0)   br.ret.spnt brp                      // return if length is zero

        cmp.gt   pt4 = 16, a2
        add      t20 = a2, a0
        add      t21 = a2, a1

        nop.m    0
(pt4)   br.cond.sptk ByteMoveDown            // less than 16 bytes to copy
(pt6)   br.cond.spnt UnalignedMoveDown       // incompatible alignment
        ;;

        nop.m    0
        nop.m    0
        and      t22 = 0x7, t21
        ;;

        add      t20 = -1, t20
        add      t21 = -1, t21
        sub      a2 = a2, t22
        ;;

TailMove:
        cmp.eq   pt0, pt1 = zero, t22
        ;;

(pt1)   ld1      t10 = [t21], -1
(pt1)   add      t22 = -1, t22
        ;;

(pt1)   st1      [t20] = t10, -1
(pt1)   br.cond.sptk TailMove

Block8Move:
        nop.m    0
        add      t20 = -7, t20
        add      t21 = -7, t21
        ;;

Block8MoveLoop:
        cmp.gt   pt5, pt6 = 8, a2
        ;;

(pt6)   ld8      t10 = [t21], -8
(pt6)   add      a2 = -8, a2
        ;;

(pt6)   st8      [t20] = t10, -8
(pt6)   br.cond.sptk Block8MoveLoop

        add      t20 = 8, t20                // adjust dest
        add      t21 = 8, t21                // adjust source
        br.cond.sptk ByteMoveDown
        ;;

UnalignedMoveDown:
        and      t1 = 7, t21
        ;;
        cmp.eq   pt0 = 0, t1
(pt0)   br.cond.spnt SkipUnalignedMoveDownByteLoop
        ;;

        add      t20 = -1, t20
        add      t21 = -1, t21
        ;;

UnalignedMoveDownByteLoop:
        ld1      t10 = [t21], -1
        add      t1 = -1, t1
        add      a2 = -1, a2
        ;;

        st1      [t20] = t10, -1
        cmp.eq   p0, pt1 = zero, t1
(pt1)   br.cond.sptk UnalignedMoveDownByteLoop
        ;;

        add      t20 = 1, t20
        add      t21 = 1, t21
        ;;

SkipUnalignedMoveDownByteLoop:
        add      t21 = -8, t21
        ;;

        and      t0 = 7, t20
        mov      pr.rot = 3<<16
        or       t1 = t21, r0
        ;;

        sub      t7 = 8, t0
        ;;

        add      t2 = a2, t7
        mov.i    ar.ec = 32
        ;;

        sub      t4 = t20, t0
        shr      t10 = t2, 3
        shl      t6 = t0, 3
        ;;

        ld8      r33 = [t4], 0
        add      t10 = -1,t10
        and      t2 = 7, t2
        ;;

        cmp.eq   pt0 = 2, t0
        cmp.eq   pt3 = 4, t0
        cmp.eq   pt5 = 6, t0
        ;;

        shr      r33 = r33,t6                // Prime r39
        mov.i    ar.lc = t10

(pt0)   br.cond.spnt SpecialLoopDown2
(pt3)   br.cond.spnt SpecialLoopDown4
(pt5)   br.cond.spnt SpecialLoopDown6

        cmp.eq   pt1 = 3, t0
        cmp.eq   pt4 = 5, t0
        cmp.eq   pt6 = 7, t0

(pt1)   br.cond.spnt SpecialLoopDown3
(pt4)   br.cond.spnt SpecialLoopDown5
(pt6)   br.cond.spnt SpecialLoopDown7
        ;;

SpecialLoopDown1:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown1E, SpecialLoopDown1

SpecialLoopDown1E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,56
        br.ctop.sptk.many SpecialLoopDown1

        br       UnalignedByteDownDone

SpecialLoopDown2:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown2E, SpecialLoopDown2

SpecialLoopDown2E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,48
        br.ctop.sptk.many SpecialLoopDown2

        br       UnalignedByteDownDone

SpecialLoopDown3:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown3E, SpecialLoopDown3

SpecialLoopDown3E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,40
        br.ctop.sptk.many SpecialLoopDown3

        br       UnalignedByteDownDone

SpecialLoopDown4:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown4E, SpecialLoopDown4

SpecialLoopDown4E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,32
        br.ctop.sptk.many SpecialLoopDown4

        br       UnalignedByteDownDone

SpecialLoopDown5:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown5E, SpecialLoopDown5

SpecialLoopDown5E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,24
        br.ctop.sptk.many SpecialLoopDown5

        br       UnalignedByteDownDone

SpecialLoopDown6:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown6E, SpecialLoopDown6

SpecialLoopDown6E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,16
        br.ctop.sptk.many SpecialLoopDown6

        br       UnalignedByteDownDone

SpecialLoopDown7:
(p16)   ld8      r32 = [t1], -8
        nop.f    0
        brp.sptk.imp SpecialLoopDown7E, SpecialLoopDown7

SpecialLoopDown7E:
(p48)   st8      [t4] = r10, -8
(p47)   shrp     r10 = r63,r62,8
        br.ctop.sptk.many SpecialLoopDown7;;

UnalignedByteDownDone:
        add      t1 = 7, t1
        add      t4 = 7, t4
        ;;

        add      t1 = t1, t7
        mov      pr = r64
        mov.i    ar.lc = r65
        ;;

        cmp.eq   pt0 = zero, t2
(pt0)   br.ret.spnt brp
        ;;

UnAlignedByteDoneDownLoop:
        ld1      t10 = [t1], -1
        add      t2 = -1, t2
        ;;
        cmp.ne   pt1 = zero, t2

        st1      [t4] = t10, -1
(pt1)   br.cond.sptk UnAlignedByteDoneDownLoop
        br.ret.spnt brp

ByteMoveDown:
        nop.m    0
        add      t20 = -1, t20               // adjust destination
        add      t21 = -1, t21               // adjust source
        ;;

ByteMoveDownLoop:
        cmp.ne   pt1 = zero, a2
        ;;
(pt1)   ld1      t10 = [t21], -1
(pt1)   add      a2 = -1, a2
        ;;

(pt1)   st1      [t20] = t10, -1
(pt1)   br.cond.sptk ByteMoveDownLoop
        br.ret.spnt brp
        ;;
        LEAF_EXIT(RtlMoveMemory)
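

//++
//
// SIZE_T
// RtlCompareMemory (
//    IN const VOID *Source1,
//    IN const VOID *Source2,
//    IN SIZE_T Length
//    )
//
// Routine Description:
//
//    This function compares two blocks of memory one byte at a time and
//    returns the number of leading bytes that compare as equal; the loop
//    below stops at the first mismatch or after Length bytes.
//
// Arguments:
//
//    Source1 (a0) - Supplies a pointer to the first block of memory.
//
//    Source2 (a1) - Supplies a pointer to the second block of memory.
//
//    Length (a2) - Supplies the length, in bytes, to be compared.
//
// Return Value:
//
//    The number of bytes that compared equal (v0).
//
//--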

        LEAF_ENTRY(RtlCompareMemory)

        cmp.eq   pt0 = 0, a2
        mov      v0 = 0
(pt0)   br.ret.spnt.many brp
        ;;

        add      t2 = -1, a2

Rcmp10:
        ld1      t0 = [a0], 1
        ld1      t1 = [a1], 1
        ;;
        cmp4.eq  pt2 = t0, t1
        ;;

(pt2)   cmp.ne.unc pt1 = v0, t2
(pt2)   add      v0 = 1, v0
(pt1)   br.cond.dptk.few Rcmp10

        br.ret.sptk.many brp

        LEAF_EXIT(RtlCompareMemory)

//++
//
// VOID
// RtlCopyIa64FloatRegisterContext (
//    PFLOAT128 Destination,
//    PFLOAT128 Source,
//    ULONGLONG Length
//    )
//
// Routine Description:
//
//    This routine copies floating point context from one place to
//    another.  It assumes both the source and the destination are
//    16-byte aligned and that the buffer contains only the memory image
//    of floating point registers.  Note that Length must be greater
//    than 0 and a multiple of 16.
//
// Arguments:
//
//    a0 - Destination
//    a1 - Source
//    a2 - Length
//
// Return Value:
//
//    None.
//
//--
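
//
// In C terms the copy loop below is roughly (illustration only; FLOAT128 is
// treated as an opaque 16-byte element, moved here with ldf.fill/stf.spill):
//
//      void c_copy_float_context(FLOAT128 *destination, const FLOAT128 *source,
//                                unsigned long long length)
//      {
//          unsigned long long count = length / 16;   // shr t0 = a2, 4
//
//          while (count-- != 0) {
//              *destination++ = *source++;           // 16 bytes per iteration
//          }
//      }
//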

        NESTED_ENTRY(RtlCopyIa64FloatRegisterContext)

        .prologue
        .save    ar.lc, t22
        mov      t22 = ar.lc
        shr      t0 = a2, 4
        ;;

        cmp.gtu  pt0, pt1 = 16, a2
        add      t0 = -1, t0
        ;;

        PROLOGUE_END

(pt1)   mov      ar.lc = t0
(pt0)   br.ret.spnt brp

Rcf10:

        ldf.fill ft0 = [a1], 16
        nop.m    0
        nop.i    0
        ;;

        stf.spill [a0] = ft0, 16
        nop.i    0
        br.cloop.dptk Rcf10
        ;;

        nop.m    0
        mov      ar.lc = t22
        br.ret.sptk brp
        ;;

        NESTED_EXIT(RtlCopyIa64FloatRegisterContext)
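

//++
//
// RtlpCopyContextSubSet (
//    Destination (a0),
//    Source (a1)
//    )
//
// Routine Description:
//
//    This routine copies a subset of one CONTEXT record to another: the
//    leading 8-byte field, the floating point registers starting at offsets
//    CxFltS0 and CxFltS4, and the 8-byte fields from CxStFPSR up to CxStIFS.
//    The exact field layout comes from the Cx* offsets defined in ksia64.h.
//
//--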

        NESTED_ENTRY(RtlpCopyContextSubSet)

        .prologue
        .save    ar.lc, t22
        mov      t22 = ar.lc
        mov      t0 = a0
        mov      t1 = a1
        ;;

        PROLOGUE_END

        ld8      t3 = [t1], CxFltS0
        ;;
        st8      [t0] = t3, CxFltS0
        mov      t2 = 3

        add      t10 = CxFltS4, a0
        add      t11 = CxFltS4, a1
        ;;
        mov      ar.lc = t2

Rcc10:
        ldf.fill ft0 = [t1], 16
        ;;
        stf.spill [t0] = ft0, 16
        mov      t2 = 15
        br.cloop.dptk.few Rcc10
        ;;

        mov      t0 = CxStIFS
        mov      t1 = CxStFPSR
        mov      ar.lc = t2

Rcc20:
        ldf.fill ft0 = [t11], 16
        ;;
        stf.spill [t10] = ft0, 16
        sub      t2 = t0, t1
        br.cloop.dptk.few Rcc20
        ;;

        add      t11 = CxStFPSR, a1
        add      t10 = CxStFPSR, a0
        shr      t2 = t2, 3
        ;;

        mov      ar.lc = t2
        ;;

Rcc30:
        ld8      t0 = [t11], 8
        ;;
        st8      [t10] = t0, 8
        nop.i    0

        br.cloop.dptk.few Rcc30
        ;;

        nop.m    0
        mov      ar.lc = t22
        br.ret.sptk brp

        NESTED_EXIT(RtlpCopyContextSubSet)

//++
//
// VOID
// RtlPrefetchMemoryNonTemporal (
//    IN PVOID Source,
//    IN SIZE_T Length
//    )
//
// Routine Description:
//
//    This routine prefetches memory at Source, for Length bytes, into the
//    cache closest to the processor.
//
//    N.B. Currently this code assumes a line size of 32 bytes.  At
//    some stage it should be altered to determine and use the processor's
//    actual line size.
//
// Arguments:
//
//    a0 - Source
//    a1 - Length
//
// Return Value:
//
//    None.
//
//--
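
//
// In C terms the routine is roughly (illustration only; lfetch.nta is
// represented here by GCC's __builtin_prefetch with locality hint 0):
//
//      void c_prefetch_non_temporal(const char *source, size_t length)
//      {
//          size_t lines = length / 32;        // shr a1 = a1, 5 (32-byte lines)
//
//          __builtin_prefetch(source, 0, 0);  // first line fetched up front
//          source += 32;
//
//          if (lines <= 2) {                  // nothing (much) left to fetch
//              return;
//          }
//          while (lines-- > 1) {              // fetch the remaining lines
//              __builtin_prefetch(source, 0, 0);
//              source += 32;
//          }
//      }
//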

        LEAF_ENTRY(RtlPrefetchMemoryNonTemporal)

        .prologue
        lfetch.nta [a0], 32                  // get first line coming
        .save    ar.lc, t0
        mov.i    t0 = ar.lc                  // save loop counter
        shr      a1 = a1, 5                  // determine loop count
        ;;
        .body

        add      t2 = -1, a1                 // subtract out already fetched line
        cmp.lt   pt0, pt1 = 2, a1            // check if less than one line to fetch
        ;;

(pt0)   mov      ar.lc = t2                  // set loop count
(pt1)   br.ret.spnt.few brp                  // return if no more lines to fetch
        ;;

Rpmnt10:
        lfetch.nta [a0], 32                  // fetch next line
        br.cloop.dptk.many Rpmnt10           // loop while more lines to fetch
        ;;

        mov      ar.lc = t0                  // restore loop counter
        br.ret.sptk.many brp                 // return

        LEAF_EXIT(RtlPrefetchMemoryNonTemporal)