171 lines
4.6 KiB
ArmAsm
171 lines
4.6 KiB
ArmAsm
|
// memset.s: function to set a number of bytes to a char value
|
||
|
|
||
|
// Copyright (C) 1998 Intel Corporation.
|
||
|
//
|
||
|
// The information and source code contained herein is the exclusive property
|
||
|
// of Intel Corporation and may not be disclosed, examined, or
|
||
|
// reproduced in whole or in part without explicit written authorization from
|
||
|
// the Company.
|
||
|
|
||
|
// Author: Steve Skedzielewski
|
||
|
// Date: June, 2000
|
||
|
//
|
||
|
.section .text
|
||
|
// -- Begin memset
//
// memset(s, c, n): store the low byte of c into n consecutive bytes at s.
//   In:    r32 = s, r33 = c, r34 = n
//   Out:   r8  = original value of s
//   Saved: ar.lc is copied to r11 here; only the Long path overwrites ar.lc,
//          and it restores it from r11 before returning.
//   Uses:  r17-r19, r21, r22, r30 and p6-p15 as scratch; r32-r34 are modified.
|
||
|
.proc memset#
|
||
|
.align 32
|
||
|
// Replicate the value into all bytes using mmx broadcast
|
||
|
// live out: r21 (alignment), r11(ar.lc), r33(replicated c),
|
||
|
// r32(s), r34(n), r8(return value)
|
||
|
.global memset#
|
||
|
.prologue
|
||
|
memset:
|
||
|
// r21 = s & 7: byte offset of s within its 8-byte word (0 means aligned).
and r21=7,r32
|
||
|
// Unwind annotation: ar.lc is saved in r11 by the instruction tagged t01.
.save ar.lc,r11,t01
|
||
|
[t01:] mov r11=ar.lc //0: 2 MS
|
||
|
// Branch-prediction hint (brp) for the loop branch of the Long path.
brp.dptk.imp Longloop, Longloop_br
|
||
|
mov r8=r32 //0: return value = original s
|
||
|
// Copy byte 0 of r33 into all eight bytes so st8/st4/st2/st1 can all use it.
mux1 r33=r33,@brcst
|
||
|
;;
|
||
|
// If we're not on an 8-byte boundary, move to one
|
||
|
// live out: r11(ar.lc), r33(unsigned c), r32(sext s), r34(unsigned n)
|
||
|
// p14 (n>=MINIMUM_LONG)
// Note: p14 is computed here, before any alignment bytes are stored, so it
// reflects the caller's n rather than the n that later reaches Is_aligned.
// NOTE(review): cmp.le is a signed compare; an n with bit 63 set compares
// as negative here - confirm such values are never passed.
|
||
|
.body
|
||
|
// Smallest n that takes the counted-loop (Long) path. The Align loop stores at
// most 7 bytes (r22 = 8 - r21 with r21 in 1..7), so Long sees n >= 0x4f-7 = 72.
MINIMUM_LONG=0x4f
|
||
|
Check_align:
|
||
|
cmp.le p14,p0=MINIMUM_LONG,r34 //0: MINIMUM_LONG <= n?
|
||
|
cmp.ne p15,p0=0,r21 //0: Low 3 bits of s nonzero?
|
||
|
(p15) br.cond.dpnt Align //0: unaligned: store leading bytes first
|
||
|
;;
|
||
|
// Now that s (r32) is aligned,
|
||
|
// use straight-line code for n<MINIMUM_LONG, a loop otherwise
// (the loop decision is p14, taken from n as it was at Check_align)
|
||
|
// Exit if n<=0
// NOTE(review): cmp.ge/cmp.le are signed compares, so an n with bit 63 set
// takes the Exit branch and nothing is stored - confirm this is intended.
|
||
|
// live out: r11(ar.lc), r33(unsigned c), r32(sext s), r34(n)
|
||
|
// r17(s+8), p13(n>=16), p12(n>=32), p14 (n>=MINIMUM_LONG)
|
||
|
Is_aligned:
|
||
|
cmp.ge p15,p0=0,r34 //0: n <= 0?
|
||
|
cmp.le p13,p0=0x10,r34 //0: 16 <= n?
|
||
|
cmp.le p12,p0=0x20,r34 //0: 32 <= n?
|
||
|
add r17=8,r32 //0: second pointer
|
||
|
(p15) br.cond.dpnt Exit //0: 21 MS
|
||
|
(p14) br.cond.dpnt Long //0: 21 MS
|
||
|
;;
|
||
|
// Short memsets are done with predicated straightline code
// Each 16-byte step stores through two pointers, r32 and r17 (= r32 + 8),
// both post-incremented by 16. Steps are enabled by n>=16, 32, 48, 64
// (p13, p12, p11, p10); the tail then stores 8/4/2/1 bytes through r32
// according to bits 3..0 of n (p9, p8, p7, p6).
|
||
|
// live out: r8 (return value, original value of r32)
|
||
|
;; // stall 1 cycle for MMX to complete
|
||
|
(p13) st8 [r32]=r33,16 //0:
|
||
|
(p13) st8 [r17]=r33,16 //0:
|
||
|
cmp.le p11,p0=0x30,r34 //0: 48 <= n?
|
||
|
;;
|
||
|
(p12) st8 [r32]=r33,16 //1:
|
||
|
(p12) st8 [r17]=r33,16 //1:
|
||
|
cmp.le p10,p0=0x40,r34 //1: 64 <= n?
|
||
|
;;
|
||
|
(p11) st8 [r32]=r33,16 //2:
|
||
|
(p11) st8 [r17]=r33,16 //2:
|
||
|
tbit.nz p9,p0=r34,3 //2: odd number of st8s?
|
||
|
;;
|
||
|
(p10) st8 [r32]=r33,16 //3:
|
||
|
(p10) st8 [r17]=r33 //3: no post-increment: r17 is not used again
|
||
|
tbit.nz p8,p0=r34,2 //3: bit 2 on?
|
||
|
;;
|
||
|
(p9) st8 [r32]=r33,8 //4:
|
||
|
tbit.nz p7,p0=r34,1 //4: bit 1 on?
|
||
|
and r18=1,r34 //4: bit 0 on?
|
||
|
;;
|
||
|
//
|
||
|
// Clean up any partial word stores.
|
||
|
//
|
||
|
(p8) st4 [r32]=r33,4 //5:
|
||
|
;;
|
||
|
(p7) st2 [r32]=r33,2 //6:
|
||
|
cmp.ne p6,p0=0,r18 //6: p6 = bit 0 of n
|
||
|
;;
|
||
|
(p6) st1 [r32]=r33,1 //7:
|
||
|
br.ret.sptk.many b0 //7: ar.lc was never changed on this path
|
||
|
;;
|
||
|
// Cycles = 8 , Instr = 21
|
||
|
//
|
||
|
// Block 11: Bchanged Pred: 8 Succ: 15
|
||
|
// Counted loop setup. We know n>0 (exit above otherwise),
|
||
|
// so we can just shift n right 4 bits (2 st8/iteration)
// ar.lc = (n>>4) - 2: one 16-byte pair is stored here before the loop, and
// br.cloop runs the loop body ar.lc+1 times, giving n>>4 pairs in total.
// The n & 15 tail is stored after the loop; p6 and p15 are set up for it here.
|
||
|
// live out: r8(return value), r11(ar.lc), r17(s+8), r32(sext s)
|
||
|
// r33(replicated c), r34(n), p15(n&4), p6(n&8)
|
||
|
Long:
|
||
|
// Recomputes the same s+8 that Is_aligned already placed in r17.
add r17=8,r32 //0: second pointer
|
||
|
shr.u r30=r34,4 //0: 29 MS
|
||
|
and r18=0x8,r34 //0:
|
||
|
and r19=0x4,r34 //0:
|
||
|
;;
|
||
|
cmp.ne p6,p0=0,r18 //1: p6 = bit 3 of n
|
||
|
add r30=-2,r30 //1:
|
||
|
cmp.ne p15,p0=0,r19 //1: p15 = bit 2 of n
|
||
|
;;
|
||
|
st8 [r32]=r33,16 //2: Use the otherwise empty
|
||
|
st8 [r17]=r33,16 //2: m slots
|
||
|
mov ar.lc=r30 //2:
|
||
|
;;
|
||
|
// Cycles = 2, Instr = 9
// NOTE(review): the per-instruction cycle tags above run 0..2 - confirm the count.
|
||
|
// Block 15: lentry lexit Bchanged Pred: 15 11 Succ: 15 13
|
||
|
// Counted loop storing 16 bytes/iteration, TAR hinted.
// Two st8 per iteration through r32 and r17, each post-incremented by 16;
// br.cloop repeats while ar.lc is nonzero, decrementing it each time.
|
||
|
// live out: r11(ar.lc), r17(s+8), r21(s&7 from entry), r32(sext s)
|
||
|
// r33(replicated c), r34(n), p15(n&4), p6(n&8)
|
||
|
Longloop:
|
||
|
Longloop_br:
|
||
|
st8 [r32]=r33,16 //0: 30 MS
|
||
|
st8 [r17]=r33,16 //0: 31 MS
|
||
|
br.cloop.sptk Longloop //0: 0 MS
|
||
|
;;
|
||
|
// Cycles = 1, Instr = 3
|
||
|
// Block 13: Pred: 11 15 Succ: 12 24
|
||
|
// Exit, or cleanup and exit
// Stores the n & 15 tail through r32: 8 bytes if p6, then 4 if p15,
// 2 if p14, 1 if p13. p6 and p15 were set at Long; p14 and p13 are set here.
|
||
|
// live out: r17(s+8), r32(sext s), r33(replicated c), r34(n)
|
||
|
// p14(n&2), p15(n&4)
|
||
|
Loopdone:
|
||
|
(p6) st8 [r32]=r33,8 //0:
|
||
|
tbit.nz p14,p0=r34,1 //0: p14 = bit 1 of n
|
||
|
;;
|
||
|
// Block 24: Bchanged Pred: 13 Succ:
|
||
|
// Cleanup partial words after loop
|
||
|
(p15) st4 [r32]=r33,4 //0:
|
||
|
;;
|
||
|
(p14) st2 [r32]=r33,2 //1:
|
||
|
tbit.nz p13,p0=r34,0 //1: p13 = bit 0 of n
|
||
|
;;
|
||
|
Loopexit:
|
||
|
(p13) st1 [r32]=r33 //2: last store, no post-increment needed
|
||
|
mov ar.lc=r11 //2: restore the caller's ar.lc saved at entry
|
||
|
br.ret.sptk.many b0 //2:
|
||
|
;;
|
||
|
// Cycles = 6, Instr = 12
|
||
|
///
|
||
|
/// Align the input pointer to an 8-byte boundary
// Reached from Check_align only when r21 (= s & 7) is nonzero.
// Returns immediately via Exit when n <= 0; otherwise sets r22 = 8 - r21,
// the number of single-byte stores needed to reach the next 8-byte boundary.
|
||
|
// Block 5: lentry lexit Bchanged Pred: 3 6 Succ: 8 6
|
||
|
// Freq 0, prob 0
|
||
|
.b1_5:
|
||
|
Align:
|
||
|
cmp.ge p9,p0=0,r34 //0: 18 MS
|
||
|
sub r22=8,r21 //0: 16 B6 MS S
|
||
|
(p9) br.cond.dpnt Exit //0: 18 MS
|
||
|
;;
|
||
|
// Cycles = 1, Instr = 3
|
||
|
// Block 6: lexit Bchanged Pred: 5 Succ: 5 8
// Stores one byte per iteration, advancing r32 and decrementing n (r34).
// r22 counts the bytes still needed to reach the boundary: when it is <= 1
// the byte just stored was the last one and control goes to Is_aligned.
// Otherwise the loop repeats while n > 0, and falls through to Exit when n
// runs out before the boundary is reached.
|
||
|
Align_loop:
|
||
|
st1 [r32]=r33,1 //0: 19 MS
|
||
|
cmp.ge p10,p0=1,r22 //0: last alignment byte?
|
||
|
add r34=-1,r34 //0:
|
||
|
(p10) br.cond.dpnt Is_aligned //0:
|
||
|
;;
|
||
|
add r22=-1,r22
|
||
|
cmp.lt p9,p0=0,r34 //0: 16 MS
|
||
|
(p9) br.cond.dptk Align_loop //0: 16 MS
|
||
|
;;
|
||
|
// Cycles = 2, Instr = 6
|
||
|
// Early return used by Is_aligned, Align and the Align_loop fall-through.
// None of those paths write ar.lc, so it is not restored here; r8 already
// holds the return value set at entry.
Exit:
|
||
|
br.ret.sptk.many b0 //0:
|
||
|
;;
|
||
|
// Cycles = 1, Instr = 3
// NOTE(review): only one instruction is visible in this block; "Instr = 3"
// may be counting bundle slots - confirm.
|
||
|
// -- End memset
|
||
|
.endp memset#
|
||
|
// End
|