windows-nt/Source/XPSP1/NT/base/ntdll/i386/emfsqrt.asm
2020-09-26 16:20:57 +08:00

268 lines
10 KiB
NASM

subttl emfsqrt.asm - FSQRT instruction
page
;*******************************************************************************
;emfsqrt.asm - FSQRT instruction
; by Tim Paterson
;
; Microsoft Confidential
;
; Copyright (c) Microsoft Corporation 1991
; All Rights Reserved
;
;Inputs:
; edi = [CURstk]
;
;Revision History:
;
; [] 09/05/91 TP Initial 32-bit version.
;
;*******************************************************************************
;A linear approximation of the square root function is used to get the
;intial value for Newton-Raphson iteration. This approximation gives
;nearly 5-bit accuracy over the required input interval, [1,4). The
;equation for the linear approximation of y = sqrt(x) is y = mx + b,
;where m is the slope (named SQRT_COEF) and b is the y-intercept (named
;SQRT_INTERCEPT).
;
;(The values for m and b were computed with Excel Solver in two passes:
;the first pass computed them full precision, minimizing absolute error;
;the second computed only b after m was rounded to an 8-bit value.)
;
;The resulting values have the following maximum error:
;
;inp. value --> 1 2.18972 3.82505
;----------------------------------------------------------------
;abs. err., full prec. 0.04544 -0.03233 0.04423
;abs. err., truncated 0.04544 -0.04546 0.04423
;
;The three input values shown represent the left end point, the maximum
;error (derivative of absolute error == 0), and the right end point.
;The right end point is not 4 because the approximation reaches 2.000
;at the value given--we abandon the linear approximation at that point
;and use that same value for all greater input values. This linear
;approximation is computed with 8-bit operations, so truncations can
;add a negative error. This increases maximum error only when it is
;already negative, as shown in the table.
;
;Each iteration of Newton-Raphson approximation more than doubles the
;number of bits of accuracy. Suppose the current guess is A, and it has
;an absolute error of e (i.e., A+e or A-e is the root). Then the absolute
;error after the next iteration is e^2/2A. This error is always positive.
;However, the divide instruction truncates, which introduces an error
;that is always negative. Sometimes a constant or rounding bit is added
;to balance the positive and negative errors. The maximum possible error
;is given in comments below for each iteration. (Note that when we compute
;the error from e^2/2A, A could be in the range 1 to 2--we use 1 to get
;max error.) Remember that the binary point is to the RIGHT of the MSB
;when looking at these error numbers.
;SQRT_INTERCEPT is used when the binary point is to the right of the MSB.
;Multiplying it by 64K would put the binary point to the left of the MSB,
;so it must be divided by two to be aligned.
SQRT_INTERCEPT equ 23185 ; 0.70755 * 65536 / 2
;SQRT_COEF would have the binary point to the left of the MSB if multiplied
;by 256. However, this would leave it with a leading zero, so we multiply
;it by two more to normalize it.
SQRT_COEF equ 173 ; 0.33789 * 256 * 2
SqrtSpcl:
cmp al,bTAG_DEN
jz SqrtDen
cmp al,bTAG_INF
jnz SpclDestNotDen
;Have infinity
or ah,ah ;Is it negative?
js ReturnIndefinite
SqrtRet:
ret
MaxStartRoot:
;The first iteration is calculated as (ax / bh) * 100H + bx. The first
;trial root in bx should be 10000H (which is too big). But it's very
;easy to calculate (ax / 100H) * 100H + 10000H = ax.
mov bx,ax
cmp ax,-1 ;Would subsequent DIV overflow?
jb FirstTrialRoot
;The reduced argument is so close to 4.0 that the 16-bit DIV instruction
;used in the next iteration would overflow. If the argument is 4-A
;then a guess of 2.0 is in error by approximately A/4. [This is not
;an upper bound. The error is a little by more than this by an
;addition with the magnitude of A^2. This is an insignificant amount
;when A is small.] This means that the first guess of 2.0 is quite
;accurate, and we'll use it to bypass some of the iteration steps.
;This will eliminate the DIV overflow by skipping the DIV.
;
;One iteration is performed by: (Arg/Guess + Guess)/2. When Guess = 2,
;this becomes (Arg/2 + 2)/2 = Arg/4 + 1. We get Arg/2 just by assuming
;the binary point is one bit further left; then a single right shift is
;needed to get Arg/4. By shifting in a 1 bit on the left, we account for
;adding 1 at the same time. [Note that if Arg = 4 - A, then Arg/4 + 1
; = (4 - A)/4 + 1 = 1 - A/4 + 1 = 2 - A/4. In other words, we just
;subtract out exactly what we estimate our error to be, A/4.]
;
;Since the upper 16 bits are 0FFFFH, A <= 2^-14, so error <= 2^-16 =
; +0.00001526, -0.
mov ebx,esi ;Return root in ebx
sar ebx,1 ;Trial root = arg/2
cmp esi,ebx ;Will 32-bit division overflow?
jb StartThirdIteration ;No, our 32-bit guess is good
;Argument is really, really close to 4.0: with an initial trial root of
;2.0, max absolute error is 2^-32 = +2.328E-10, -0. One trivial
;iteration will get us 65-bit accuracy, max abs. error = +2.71E-20, -0.
mov ebx,esi
mov eax,ecx ;65-bit root*2 in ebx:eax (MSB implied)
shl ecx,2 ;ecx = low half*4
jmp RoundRoot
SqrtDen:
mov EMSEG:[CURerr],Denormal
test EMSEG:[CWmask],Denormal ;Is denormal exception masked?
jnz SqrtRet ;If not, quit
;******
EM_ENTRY eFSQRT
eFSQRT:
;******
mov eax,EMSEG:[edi].ExpSgn
cmp al,bTAG_ZERO
jz SqrtRet
ja SqrtSpcl
or ah,ah
js ReturnIndefinite
mov esi,EMSEG:[edi].lManHi
mov ecx,EMSEG:[edi].lManLo
sar EMSEG:[edi].wExp,1 ;Divide exponent by two
mov edi,0 ;Extend mantissa
jc RootAligned ;If odd exponent, leave it normalized
shrd edi,ecx,1
shrd ecx,esi,1
shr esi,1 ;Denormalize, extending into edi
RootAligned:
;esi:ecx:edi has mantissa, 2 MSBs are left of binary point. Range is [1,4).
shld eax,esi,16 ;Get high word of mantissa
movzx ebx,ah ;High byte to bl
;UNDONE: MASM 6 bug!!
;UNDONE: SQRT_COEF (=0AEH) get sign extended!!
mov dx,SQRT_COEF ;UNDONE
imul bx,dx ;UNDONE
;UNDONE imul bx,SQRT_COEF ;Product in bx
;Multiply by SQRT_COEF causes binary point to shift left 1 bit.
add bx,SQRT_INTERCEPT ;5-bit approx. square root in bh
jc MaxStartRoot
;Max absolute error is +/- 0.04546
div bh ;See how close we are
add bh,al ;quotient + divisor (always sets CY)
FirstTrialRoot:
;Avoid RCR because it takes 9 clocks on 386. Use SHRD (3 clocks) instead.
mov dl,1 ;Need bit set
shrd bx,dx,1 ;(quotient + divisor)/2
;bx has 9-bit approx. square root, normalized
;Max absolute error is +0.001033, -0.003906
movzx eax,si
shld edx,esi,16 ;dx:ax has high half mantissa
div bx ;Test our approximation
add ebx,eax ;quotient + divisor
shl ebx,15 ;Normalize (quotient + divisor)/2
;ebx has 17-bit approx. square root, normalized
;Max absolute error is +0.000007629, -0.00001526
;Add adjustment factor to center the error range at +/-0.00001144
or bh,20H ;Add in 0.000003815
StartThirdIteration:
mov edx,esi
mov eax,ecx
div ebx ;Test approximation
stc ;Set bit for rounding (= 2.328E-10)
adc ebx,eax ;quotient + divisor + round bit
;Avoid RCR because it takes 9 clocks on 386. Use SHRD (3 clocks) instead.
mov dl,1 ;Need bit set
shrd ebx,edx,1 ;(quotient + divisor)/2, rounded
;ebx has 32-bit approx. square root, normalized
;Max absolute error is +2.983E-10, -2.328E-10
mov edx,esi ;Last time we need high half
mov eax,ecx
shld ecx,edi,2 ;ecx = low half*4, w/extension back in
div ebx ;Test approximation
xchg edi,eax ;Save 1st quotient, get extension
mov esi,eax
or esi,edx ;Any remainder?
jz HaveRoot ;Result is ebx:esi
div ebx ;edi:eax is 64-bit quotient
add ebx,edi ;quotient + divisor (always sets CY)
RoundRoot:
mov esi,eax ;Save low half root*2
;We have 65-bit root*2 in ebx:esi (eax==esi) (MSB is implied one).
;Max absolute error is +4.450E-20, -5.421E-20. This maximum error
;corresponds to just less than +/- 1 in the last (65th) bit.
;
;We have to determine if this error is positive or negative so
;we can tell if we rounded up or down (and set the status bit
;accordingly). This is done by squaring the root and comparing the
;that result with the input.
;
;Squaring the sample root requires summing partial products:
; lo*lo + lo*hi + hi*lo + hi*hi. lo*hi == hi*lo, so only one multiply
;is needed there. The low half of lo*lo isn't relevant, we know it
;is non-zero. Only the low few bits of hi*hi are needed, so we can use
;an 8-bit multiply there. Since the MSB is implied, we need to add in
;two 1*lo products (shifted up 64 bits). We only need bits 64 - 71 of
;the 130-bit product (the action happens near bit 65). What we're
;squaring is root*2, so the result is square*4. ecx already has arg*4.
mul eax ;Low partial product of square
mov edi,edx ;Only high half counts
mov eax,ebx
mul esi ;Middle partial product of square
add eax,eax ;There are two of these
adc edx,edx
add edi,eax
adc edx,0 ;edx:edi = lo*lo + lo*hi + hi*lo
add edx,esi ;lo*implied msb
add edx,esi ;lo*implied msb again
mov al,bl
mul al ;hi*hi - only low 8 bits are valid
add al,dl ;Bits 64 - 71 of product
or al,1 ;Account for sticky bits 0 - 63
sub cl,al ;Compare product with argument
;Sign flag set if product is larger. In this case, subtract 1 from root.
add cl,cl ;Set CY if sign is set
SubOneFromRoot:
sbb esi,0 ;Reduce root if product was too big
sbb ebx,0
ShiftRoot:
;ebx:esi = root*2
;Absolute error is in the range (0, -5.421E-20). This is equivalent to
;less than +1, -0 in last bit. Thus LSB is correct rounding bit as
;long as we set a sticky bit below it.
;
;Now divide root*2 by 2, preserving LSB as rounding bit and filling
;eax with 1's as sticky bits.
;
;Avoid RCR because it takes 9 clocks on 386. Use SHRD (3 clocks) instead.
mov eax,-1
shrd eax,esi,1 ;Move round bit to MSB of eax
shrd esi,ebx,1
shrd ebx,eax,1 ;Shift 1 into MSB of ebx
StoreRoot:
mov edi,EMSEG:[CURstk]
mov EMSEG:[Result],edi
mov ecx,EMSEG:[edi].ExpSgn
;mantissa in ebx:esi:eax, exponent in high ebx, sign in bh bit 7
jmp EMSEG:[RoundMode]
HaveRoot:
;esi = eax = edx = 0
cmp edi,ebx ;Does quotient == divisor?
jz StoreRoot ;If so, we're done
;Quotient != divisor, so answer is not exact. Since remainder is zero,
;the division was exact. The only error in the result is e^2/2A, which
;is always positive. We need the error to be only negative so that
;the rounding routine can properly tell if it rounded up.
add ebx,edi ;quotient + divisor (always sets CY)
jmp SubOneFromRoot ;Reduce root to ensure negative error