335 lines
8.3 KiB
ArmAsm
335 lines
8.3 KiB
ArmAsm
//##########################################################################
|
|
//**
|
|
//** Copyright (C) 1996-2000 Intel Corporation. All rights reserved.
|
|
//**
|
|
//** The information and source code contained herein is the exclusive
|
|
//** property of Intel Corporation and may not be disclosed, examined
|
|
//** or reproduced in whole or in part without explicit written authorization
|
|
//** from the company.
|
|
//**
|
|
//###########################################################################
|
|
|
|
.file "rerun.s"
|
|
.section .text
|
|
.align 32
|
|
|
|
.proc _xrun1args#
|
|
.global _xrun1args#
|
|
.align 32
|
|
_xrun1args:
|
|
alloc r31=ar.pfs,4,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39
|
|
|
|
// OpCode is in r32
|
|
// &fpsr is in r33
|
|
// &fr1 (output) is in r34
|
|
// &fr2 (input) is in r35
|
|
|
|
// save old FPSR in r36
|
|
mov.m r36 = ar40
|
|
// save predicates in r38
|
|
mov r38 = pr;;
|
|
// load fpsr in r37
|
|
ld8 r37 = [r33];;
|
|
// set new value of FPSR
|
|
mov ar40 = r37;;
|
|
// clear predicates
|
|
movl r39 = 0x0000000000000001;;
|
|
// load clear predicates from r39
|
|
mov pr = r39,0x1ffff;;
|
|
// load input argument into f8
|
|
ldf.fill f8 = [r35];;
|
|
cmp4.eq p1, p2 = 1, r32;; // fprsqrta [not used]
|
|
(p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpcvt_fx
|
|
(p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpcvt_fxu
|
|
(p4) cmp4.eq.unc p4, p5 = 4, r32;; // fpcvt_fx_trunc
|
|
(p5) cmp4.eq.unc p5, p6 = 5, r32;; // fpcvt_fxu_trunc
|
|
(p1) fprsqrta.s0 f9,p7 = f8;; // 1/sqrt(f8) in f9
|
|
(p2) fpcvt.fx.s0 f9 = f8;;
|
|
(p3) fpcvt.fxu.s0 f9 = f8;;
|
|
(p4) fpcvt.fx.trunc.s0 f9 = f8;;
|
|
(p5) fpcvt.fxu.trunc.s0 f9 = f8;;
|
|
(p6) mov f9 = f0 // return 0
|
|
// restore predicates from r38
|
|
mov pr = r38,0x1ffff;;
|
|
// store result
|
|
stf.spill [r34] = f9;;
|
|
// save FPSR
|
|
mov.m r37 = ar40;;
|
|
st8 [r33] = r37
|
|
// restore FPSR
|
|
mov ar40 = r36;;
|
|
// return
|
|
br.ret.sptk b0
|
|
.endp _xrun1args
|
|
|
|
|
|
.proc _xrun2args#
|
|
.global _xrun2args#
|
|
.align 32
|
|
|
|
_xrun2args:
|
|
alloc r31=ar.pfs,5,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39, r40
|
|
|
|
// OpCode is in r32
|
|
// &fpsr is in r33
|
|
// &fr1 (output) is in r34
|
|
// &fr2 (input) is in r35
|
|
// &fr3 (input) is in r36
|
|
|
|
// save old FPSR in r37
|
|
mov r37 = ar40
|
|
// save predicates in r39
|
|
mov r39 = pr;;
|
|
// load fpsr in r38
|
|
ld8 r38 = [r33];;
|
|
// set new value of FPSR
|
|
mov ar40 = r38;;
|
|
// clear predicates
|
|
movl r40 = 0x0000000000000001;;
|
|
// load clear predicates from r40
|
|
mov pr = r40,0x1ffff;;
|
|
// load first input argument into f8
|
|
ldf.fill f8 = [r35]
|
|
// load second input argument into f9
|
|
ldf.fill f9 = [r36];;
|
|
cmp4.eq p1, p2 = 1, r32;; // fprcpa [not used - fprcpa not re-executed]
|
|
(p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpcmp_eq
|
|
(p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpcmp_lt
|
|
(p4) cmp4.eq.unc p4, p5 = 4, r32;; // fpcmp_le
|
|
(p5) cmp4.eq.unc p5, p6 = 5, r32;; // fpcmp_unord
|
|
(p6) cmp4.eq.unc p6, p7 = 6, r32;; // fpcmp_neq
|
|
(p7) cmp4.eq.unc p7, p8 = 7, r32;; // fpcmp_nlt
|
|
(p8) cmp4.eq.unc p8, p9 = 8, r32;; // fpcmp_nle
|
|
(p9) cmp4.eq.unc p9, p10 = 9, r32;; // fpcmp_ord
|
|
(p10) cmp4.eq.unc p10, p11 = 10, r32;; // fpmin
|
|
(p11) cmp4.eq.unc p11, p12 = 11, r32;; // fpmax
|
|
(p12) cmp4.eq.unc p12, p13 = 12, r32;; // fpamin
|
|
(p13) cmp4.eq.unc p13, p14 = 13, r32;; // fpamax
|
|
(p1) fprcpa.s0 f10 , p15 = f8, f9;; // 1 / f3 in f4
|
|
(p2) fpcmp.eq.s0 f10 = f8, f9;;
|
|
(p3) fpcmp.lt.s0 f10 = f8, f9;;
|
|
(p4) fpcmp.le.s0 f10 = f8, f9;;
|
|
(p5) fpcmp.unord.s0 f10 = f8, f9;;
|
|
(p6) fpcmp.neq.s0 f10 = f8, f9;;
|
|
(p7) fpcmp.nlt.s0 f10 = f8, f9;;
|
|
(p8) fpcmp.nle.s0 f10 = f8, f9;;
|
|
(p9) fpcmp.ord.s0 f10 = f8, f9;;
|
|
(p10) fpmin.s0 f10 = f8, f9;;
|
|
(p11) fpmax.s0 f10 = f8, f9;;
|
|
(p12) fpamin.s0 f10 = f8, f9;;
|
|
(p13) fpamax.s0 f10 = f8, f9;;
|
|
(p14) mov f10 = f0 // return 0
|
|
// restore predicates from r39
|
|
mov pr = r39,0x1ffff;;
|
|
// store result
|
|
stf.spill [r34] = f10
|
|
// save FPSR
|
|
mov.m r38 = ar40;;
|
|
st8 [r33] = r38
|
|
// restore FPSR
|
|
mov ar40 = r37;;
|
|
// return
|
|
br.ret.sptk b0
|
|
.endp _xrun2args
|
|
|
|
|
|
.proc _xrun3args#
|
|
.global _xrun3args#
|
|
.align 32
|
|
|
|
_xrun3args:
|
|
alloc r31=ar.pfs,6,4,0,0 // r32, r33, r34, r35, r36, r37, r38, r39, r40, r41
|
|
|
|
// OpCode is in r32
|
|
// &fpsr is in r33
|
|
// &fr1 (output) is in r34
|
|
// &fr2 (input) is in r35
|
|
// &fr3 (input) is in r36
|
|
// &fr4 (input) is in r37
|
|
|
|
// save old FPSR in r38
|
|
mov r38 = ar40
|
|
// save predicates in r40
|
|
mov r40 = pr;;
|
|
// load fpsr in r39
|
|
ld8 r39 = [r33];;
|
|
// set new value of FPSR
|
|
mov ar40 = r39;;
|
|
// clear predicates
|
|
movl r41 = 0x0000000000000001;;
|
|
// load clear predicates from r41
|
|
mov pr = r41,0x1ffff;;
|
|
// load first input argument into f8
|
|
ldf.fill f8 = [r35]
|
|
// load second input argument into f9
|
|
ldf.fill f9 = [r36];;
|
|
// load third input argument into f10
|
|
ldf.fill f10 = [r37];;
|
|
cmp4.eq p1, p2 = 1, r32;; // fpma
|
|
(p2) cmp4.eq.unc p2, p3 = 2, r32;; // fpms
|
|
(p3) cmp4.eq.unc p3, p4 = 3, r32;; // fpnma
|
|
(p1) fpma.s0 f11 = f8, f9, f10;; // f11 = f8 * f9 + f10
|
|
(p2) fpms.s0 f11 = f8, f9, f10;; // f11 = f8 * f9 - f10
|
|
(p3) fpnma.s0 f11 = f8, f9, f10;; // f11 = -f8 * f9 + f10
|
|
(p4) mov f11 = f0 // return 0
|
|
// restore predicates from r40
|
|
mov pr = r40,0x1ffff;;
|
|
// store result
|
|
stf.spill [r34] = f11
|
|
// save FPSR
|
|
mov.m r39 = ar40;;
|
|
st8 [r33] = r39
|
|
// restore FPSR
|
|
mov ar40 = r38
|
|
// return
|
|
br.ret.sptk b0
|
|
.endp _xrun3args
|
|
|
|
|
|
.proc _thmB#
|
|
.global _thmB#
|
|
.align 32
|
|
|
|
_thmB:
|
|
alloc r31=ar.pfs,4,2,0,0 // r32, r33, r34, r35, r36, r37
|
|
|
|
// &a is in r32
|
|
// &b is in r33
|
|
// &div is in r34 (the address of the divide result)
|
|
// &fpsr is in r35
|
|
|
|
// general registers used: r31, r32, r33, r34, r35, r36, r37
|
|
// predicate registers used: p6
|
|
// floating-point registers used: f6, f7, f8
|
|
|
|
// save old FPSR in r36
|
|
mov r36 = ar40
|
|
// load fpsr in r37
|
|
ld8 r37 = [r35];;
|
|
// set new value of FPSR
|
|
mov ar40 = r37
|
|
// load a, the first argument, in f6
|
|
ldfs f6 = [r32];;
|
|
// load b, the second argument, in f7
|
|
ldfs f7 = [r33];;
|
|
// Step (1)
|
|
// y0 = 1 / b in f8
|
|
frcpa.s0 f8,p6=f6,f7;;
|
|
// Step (2)
|
|
// q0 = a * y0 in f6
|
|
(p6) fma.s1 f6=f6,f8,f0
|
|
// Step (3)
|
|
// e0 = 1 - b * y0 in f7
|
|
(p6) fnma.s1 f7=f7,f8,f1;;
|
|
// Step (4)
|
|
// q1 = q0 + e0 * q0 in f6
|
|
(p6) fma.s1 f6=f7,f6,f6
|
|
// Step (5)
|
|
// e1 = e0 * e0 in f7
|
|
(p6) fma.s1 f7=f7,f7,f0;;
|
|
// Step (6)
|
|
// q2 = q1 + e1 * q1 in f6
|
|
(p6) fma.s1 f6=f7,f6,f6
|
|
// Step (7)
|
|
// e2 = e1 * e1 in f7
|
|
(p6) fma.s1 f7=f7,f7,f0;;
|
|
// Step (8)
|
|
// q3 = q2 + e2 * q2 in f6
|
|
(p6) fma.d.s1 f6=f7,f6,f6;;
|
|
// Step (9)
|
|
// q3' = q3 in f8
|
|
(p6) fma.s.s0 f8=f6,f1,f0;;
|
|
// store result
|
|
stfs [r34]=f8
|
|
// save fpsr
|
|
mov.m r37 = ar40;;
|
|
st8 [r35] = r37
|
|
// restore FPSR
|
|
mov ar40 = r36;;
|
|
// return
|
|
br.ret.sptk b0
|
|
|
|
.endp _thmB
|
|
|
|
|
|
.proc _thmH#
|
|
.global _thmH#
|
|
.align 32
|
|
|
|
_thmH:
|
|
alloc r31=ar.pfs,3,2,0,0 // r32, r33, r34, r35, r36
|
|
|
|
// &a is in r32
|
|
// &sqrt is in r33 (the address of the sqrt result)
|
|
// &fpsr in r34
|
|
|
|
// general registers used: r31, r32, r33, r34, r35
|
|
// predicate registers used: p6
|
|
// floating-point registers used: f6, f7, f8, f9, f10, f11, f12
|
|
|
|
// save old FPSR in r35
|
|
mov r35 = ar40
|
|
// load fpsr in r36
|
|
ld8 r36 = [r34];;
|
|
// set new value of FPSR
|
|
mov ar40 = r36
|
|
// exponent of +1/2 in r2
|
|
movl r2 = 0x0fffe;;
|
|
// +1/2 in f7
|
|
setf.exp f7 = r2
|
|
// load the argument a in f6
|
|
ldfs f6 = [r32];;
|
|
// Step (1)
|
|
// y0 = 1/sqrt(a) in f8
|
|
frsqrta.s0 f8,p6=f6;;
|
|
// Step (2)
|
|
// h = +1/2 * a in f9
|
|
(p6) fma.s1 f9=f7,f6,f0
|
|
// Step (3)
|
|
// t1 = y0 * y0 in f10
|
|
(p6) fma.s1 f10=f8,f8,f0;;
|
|
// Step (4)
|
|
// t2 = 1/2 - t1 * h in f10
|
|
(p6) fnma.s1 f10=f10,f9,f7;;
|
|
// Step (5)
|
|
// y1 = y0 + t2 * y0 in f8
|
|
(p6) fma.s1 f8=f10,f8,f8;;
|
|
// Step (6)
|
|
// S = a * y1 in f10
|
|
(p6) fma.s1 f10=f6,f8,f0
|
|
// Step (7)
|
|
// t3 = y1 * h in f9
|
|
(p6) fma.s1 f9=f8,f9,f0
|
|
// Step (8)
|
|
// H = 1/2 * y1 in f11
|
|
(p6) fma.s1 f11=f7,f8,f0;;
|
|
// Step (9)
|
|
// d = a - S * S in f12
|
|
(p6) fnma.s1 f12=f10,f10,f6
|
|
// Step (10)
|
|
// t4 = 1/2 - t3 * y1 in f7
|
|
(p6) fnma.s1 f7=f9,f8,f7;;
|
|
// Step (11)
|
|
// S1 = S + d * H in f8
|
|
(p6) fma.s.s1 f8=f12,f11,f10
|
|
// Step (12)
|
|
// H1 = H + t4 * H in f7
|
|
(p6) fma.s1 f7=f7,f11,f11;;
|
|
// Step (13)
|
|
// d1 = a - S1 * S1 in f6
|
|
(p6) fnma.s1 f6=f8,f8,f6;;
|
|
// Step (14)
|
|
// R = S1 + d1 * H1 in f8
|
|
(p6) fma.s.s0 f8=f6,f7,f8;;
|
|
// store result
|
|
stfs [r33]=f8
|
|
// save fpsr
|
|
mov.m r36 = ar40;;
|
|
st8 [r34] = r36
|
|
// restore FPSR
|
|
mov ar40 = r35;;
|
|
// return
|
|
br.ret.sptk b0
|
|
|
|
.endp _thmH
|