windows-nt/Source/XPSP1/NT/base/crts/crtw32/string/ia64/memset.s
2020-09-26 16:20:57 +08:00

171 lines
4.6 KiB
ArmAsm

// memset.s: function to set a number of bytes to a char value
// Copyright (C) 1998 Intel Corporation.
//
// The information and source code contained herein is the exclusive property
// of Intel Corporation and may not be disclosed, examined, or
// reproduced in whole or in part without explicit written authorization from
// the Company.
// Author: Steve Skedzielewski
// Date: June, 2000
//
.section .text
// -- Begin memset
.proc memset#
.align 32
// Replicate the value into all bytes using mmx broadcast
// live out: r21 (alignment), r11(ar.lc), r33(replicated c),
// r32(s), r34(n)
.global memset#
.prologue
memset:
and r21=7,r32
.save ar.lc,r11,t01
[t01:] mov r11=ar.lc //0: 2 MS
brp.dptk.imp Longloop, Longloop_br
mov r8=r32 //0:
mux1 r33=r33,@brcst
;;
// If we're not on an 8-byte boundary, move to one
// live out: r11(ar.lc), r33(unsigned c), r32(sext s), r34(unsigned n)
// p14 (n>=MINIMUM_LONG)
.body
MINIMUM_LONG=0x4f
Check_align:
cmp.le p14,p0=MINIMUM_LONG,r34 //0: MINIMUM_LONG < n?
cmp.ne p15,p0=0,r21 //0: Low 3 bits zero?
(p15) br.cond.dpnt Align //0:
;;
// Now that p is aligned,
// use straight-line code for n<=64, a loop otherwise
// Exit if n<=0
// live out: r11(ar.lc), r33(unsigned c), r32(sext s), r34(n)
// r17(s+8), p13(n>8), p12(n>16), p14 (n>=MINIMUM_LONG)
Is_aligned:
cmp.ge p15,p0=0,r34 //0: n <= 0?
cmp.le p13,p0=0x10,r34 //0: 16 <= n?
cmp.le p12,p0=0x20,r34 //0: 32 <= n?
add r17=8,r32 //0: second pointer
(p15) br.cond.dpnt Exit //0: 21 MS
(p14) br.cond.dpnt Long //0: 21 MS
;;
// Short memsets are done with predicated straightline code
// live out: r8 (return value, original value of r32
;; // stall 1 cycle for MMX to complete
(p13) st8 [r32]=r33,16 //0:
(p13) st8 [r17]=r33,16 //0:
cmp.le p11,p0=0x30,r34 //0: 48 <= n?
;;
(p12) st8 [r32]=r33,16 //1:
(p12) st8 [r17]=r33,16 //1:
cmp.le p10,p0=0x40,r34 //1: 64 <= n?
;;
(p11) st8 [r32]=r33,16 //2:
(p11) st8 [r17]=r33,16 //2:
tbit.nz p9,p0=r34,3 //2: odd number of st8s?
;;
(p10) st8 [r32]=r33,16 //3:
(p10) st8 [r17]=r33 //3:
tbit.nz p8,p0=r34,2 //3: bit 2 on?
;;
(p9) st8 [r32]=r33,8 //4:
tbit.nz p7,p0=r34,1 //4: bit 1 on?
and r18=1,r34 //4: bit 0 on?
;;
//
// Clean up any partial word stores.
//
(p8) st4 [r32]=r33,4 //5:
;;
(p7) st2 [r32]=r33,2 //6:
cmp.ne p6,p0=0,r18 //6:
;;
(p6) st1 [r32]=r33,1 //7:
br.ret.sptk.many b0 //7:
;;
// Cycles = 8 , Instr = 21
//
// Block 11: Bchanged Pred: 8 Succ: 15
// Counted loop setup. We know n>0 (exit above otherwise),
// so we can just shift n right 4 bits (2 st8/iteration)
// live out: r8(return value), r11(ar.lc), r17(s+8), r32(sext s)
// r33(replicated c), r34(n), p5(n&4), p6(n&8)
Long:
add r17=8,r32 //0: second pointer
shr.u r30=r34,4 //0: 29 MS
and r18=0x8,r34 //0:
and r19=0x4,r34 //0:
;;
cmp.ne p6,p0=0,r18 //1:
add r30=-2,r30 //1:
cmp.ne p15,p0=0,r19 //1:
;;
st8 [r32]=r33,16 //2: Use the otherwise empty
st8 [r17]=r33,16 //2: m slots
mov ar.lc=r30 //2:
;;
// Cycles = 2, Instr = 9
// Block 15: lentry lexit Bchanged Pred: 15 11 Succ: 15 13
// Counted loop storing 16 bytes/iteration, TAR hinted.
// live out: r11(ar.lc), r17(s+8), r21(n&7), r32(sext s)
// r33(replicated c), r34(n), p5(n&4), p6(n&8)
Longloop:
Longloop_br:
st8 [r32]=r33,16 //0: 30 MS
st8 [r17]=r33,16 //0: 31 MS
br.cloop.sptk Longloop //0: 0 MS
;;
// Cycles = 1, Instr = 3
// Block 13: Pred: 11 15 Succ: 12 24
// Exit, or cleanup and exit
// live out: r17(s+8), r32(sext s), r33(replicated c), r34(n)
// p4(n&2), p5(n&4)
Loopdone:
(p6) st8 [r32]=r33,8 //0:
tbit.nz p14,p0=r34,1 //0:
;;
// Block 24: Bchanged Pred: 13 Succ:
// Cleanup partial words after loop
(p15) st4 [r32]=r33,4 //0:
;;
(p14) st2 [r32]=r33,2 //1:
tbit.nz p13,p0=r34,0 //1:
;;
Loopexit:
(p13) st1 [r32]=r33 //2:
mov ar.lc=r11 //2:
br.ret.sptk.many b0 //2:
;;
// Cycles = 6, Instr = 12
///
/// Align the input pointer to an 8-byte boundary
// Block 5: lentry lexit Bchanged Pred: 3 6 Succ: 8 6
// Freq 0, prob 0
.b1_5:
Align:
cmp.ge p9,p0=0,r34 //0: 18 MS
sub r22=8,r21 //0: 16 B6 MS S
(p9) br.cond.dpnt Exit //0: 18 MS
;;
// Cycles = 1, Instr = 3
// Block 6: lexit Bchanged Pred: 5 Succ: 5 8
Align_loop:
st1 [r32]=r33,1 //0: 19 MS
cmp.ge p10,p0=1,r22 //0:
add r34=-1,r34 //0:
(p10) br.cond.dpnt Is_aligned //0:
;;
add r22=-1,r22
cmp.lt p9,p0=0,r34 //0: 16 MS
(p9) br.cond.dptk Align_loop //0: 16 MS
;;
// Cycles = 2, Instr = 6
Exit:
br.ret.sptk.many b0 //0:
;;
// Cycles = 1, Instr = 3
// -- End memset
.endp memset#
// End