subttl	emfsqrt.asm - FSQRT instruction
	page
;*******************************************************************************
;emfsqrt.asm - FSQRT instruction
;	by Tim Paterson
;
;	 Microsoft Confidential
;
;	 Copyright (c) Microsoft Corporation 1991
;	 All Rights Reserved
;
;Inputs:
;	edi = [CURstk]
;
;Revision History:
;
; []	09/05/91  TP	Initial 32-bit version.
;
;*******************************************************************************


;A linear approximation of the square root function is used to get the
;initial value for Newton-Raphson iteration.  This approximation gives
;nearly 5-bit accuracy over the required input interval, [1,4).  The
;equation for the linear approximation of y = sqrt(x) is y = mx + b,
;where m is the slope (named SQRT_COEF) and b is the y-intercept (named
;SQRT_INTERCEPT).
;
;(The values for m and b were computed with Excel Solver in two passes: 
;the first pass computed them full precision, minimizing absolute error;
;the second computed only b after m was rounded to an 8-bit value.)
;
;The resulting values have the following maximum error:
;
;inp. value -->		1		 2.18972	3.82505
;----------------------------------------------------------------
;abs. err., full prec.	0.04544		-0.03233	0.04423
;abs. err., truncated	0.04544		-0.04546	0.04423
;
;The three input values shown represent the left end point, the maximum 
;error (derivative of absolute error == 0), and the right end point.  
;The right end point is not 4 because the approximation reaches 2.000
;at the value given--we abandon the linear approximation at that point
;and use that same value for all greater input values.	This linear
;approximation is computed with 8-bit operations, so truncations can
;add a negative error.  This increases maximum error only when it is
;already negative, as shown in the table.
;
;Each iteration of Newton-Raphson approximation more than doubles the
;number of bits of accuracy.  Suppose the current guess is A, and it has
;an absolute error of e (i.e., A+e or A-e is the root).  Then the absolute
;error after the next iteration is e^2/2A.  This error is always positive.
;However, the divide instruction truncates, which introduces an error
;that is always negative.  Sometimes a constant or rounding bit is added
;to balance the positive and negative errors.  The maximum possible error 
;is given in comments below for each iteration.  (Note that when we compute 
;the error from e^2/2A, A could be in the range 1 to 2--we use 1 to get
;max error.)  Remember that the binary point is to the RIGHT of the MSB
;when looking at these error numbers.


;SQRT_INTERCEPT is used when the binary point is to the right of the MSB.
;Multiplying it by 64K would put the binary point to the left of the MSB,
;so it must be divided by two to be aligned.
;23185 (5A91H) / 32768 = 0.70755, the y-intercept b of y = mx + b.
SQRT_INTERCEPT	equ	23185		; 0.70755 * 65536 / 2

;SQRT_COEF would have the binary point to the left of the MSB if multiplied
;by 256.  However, this would leave it with a leading zero, so we multiply
;it by two more to normalize it.
;173 (0ADH) / 512 = 0.33789, the slope m rounded to 8 bits.  Bit 7 is set,
;so the value is not representable as a sign-extended 8-bit immediate.
SQRT_COEF	equ	173		; 0.33789 * 256 * 2

SqrtSpcl:
;Reached from eFSQRT when the operand's tag byte (al) compares above
;bTAG_ZERO.  ah still holds the sign byte of ExpSgn (sign in bit 7).
;Sort out which kind of special operand we have.
	cmp	al,bTAG_DEN
	je	SqrtDen			;Denormal gets its own handling
	cmp	al,bTAG_INF
	jne	SpclDestNotDen		;Not denormal and not infinity
;Operand is infinity
	test	ah,ah			;Check sign bit of the infinity
	js	ReturnIndefinite	;Negative infinity has no real root
SqrtRet:
;Common exit: nothing is stored, so the operand is left as it was.
	ret


MaxStartRoot:
;Entered from RootAligned when adding SQRT_INTERCEPT carried out of bx,
;i.e. the linear approximation reached 2.0 or more.  On entry:
;	ax	  = high word of the mantissa
;	esi:ecx   = high 64 bits of the mantissa
;	high word of ebx is zero (from the MOVZX in RootAligned)
;
;The first iteration is calculated as  (ax / bh) * 100H + bx.  The first
;trial root in bx should be 10000H (which is too big).  But it's very
;easy to calculate (ax / 100H) * 100H + 10000H = ax.
	mov	bx,ax
	cmp	ax,-1			;Would subsequent DIV overflow?
	jb	FirstTrialRoot
;The reduced argument is so close to 4.0 that the 16-bit DIV instruction
;used in the next iteration would overflow.  If the argument is 4-A
;then a guess of 2.0 is in error by approximately A/4.  [This is not
;an upper bound.  The error is slightly more than this, by an additional
;amount on the order of A^2.  This is an insignificant amount
;when A is small.]  This means that the first guess of 2.0 is quite
;accurate, and we'll use it to bypass some of the iteration steps.
;This will eliminate the DIV overflow by skipping the DIV.
;
;One iteration is performed by: (Arg/Guess + Guess)/2.  When Guess = 2,
;this becomes (Arg/2 + 2)/2 = Arg/4 + 1.  We get Arg/2 just by assuming
;the binary point is one bit further left; then a single right shift is
;needed to get Arg/4.  By shifting in a 1 bit on the left, we account for
;adding 1 at the same time.  [Note that if Arg = 4 - A, then Arg/4 + 1
; = (4 - A)/4 + 1 = 1 - A/4 + 1 = 2 - A/4.  In other words, we just
;subtract out exactly what we estimate our error to be, A/4.]
;
;Since the upper 16 bits are 0FFFFH, A <= 2^-14, so error <= 2^-16 =
; +0.00001526, -0.
	mov	ebx,esi			;Return root in ebx
;The MSB of esi is set here (its high word is 0FFFFH), so SAR shifts a 1
;in at the top: that is the "+1" of Arg/4 + 1.
	sar	ebx,1			;Trial root = Arg/4 + 1
	cmp	esi,ebx			;Will 32-bit division overflow?
	jb	StartThirdIteration	;No, our 32-bit guess is good
;Argument is really, really close to 4.0: with an initial trial root of
;2.0, max absolute error is 2^-32 = +2.328E-10, -0.  One trivial
;iteration will get us 65-bit accuracy, max abs. error = +2.71E-20, -0.
	mov	ebx,esi
	mov	eax,ecx			;65-bit root*2 in ebx:eax (MSB implied)
;RoundRoot expects arg*4 (low half) in ecx for its final comparison.
	shl	ecx,2			;ecx = low half*4
	jmp	RoundRoot

SqrtDen:
;Denormal operand, reached from SqrtSpcl with eax = ExpSgn of the operand
;(SqrtSpcl only compares al, so eax is intact).  Flag the denormal
;exception.  If the exception is unmasked (mask bit clear in CWmask, as
;in the x87 control word), return with the operand untouched.  If it is
;masked, go on and take the root.
;
;The masked case must enter eFSQRT *after* its tag dispatch.  The tag in
;memory is still bTAG_DEN, so falling into the top of eFSQRT would send
;us straight back through SqrtSpcl to here, forever.
;NOTE(review): this assumes a bTAG_DEN operand carries a normalized
;mantissa that the code below can treat like a valid number--confirm
;against the load/normalize routines.
	mov	EMSEG:[CURerr],Denormal
	test	EMSEG:[CWmask],Denormal ;Is denormal exception masked?
	jz	SqrtRet			;If not, quit
	jmp	SqrtDenMasked		;If so, take root like a valid number

;******
EM_ENTRY eFSQRT
eFSQRT:
;******
;Replace the number at [edi] with its square root.
;Inputs:
;	edi = [CURstk]
;Zero is returned unchanged.  Tags above bTAG_ZERO are sorted out by
;SqrtSpcl.  A negative operand goes to ReturnIndefinite.
	mov	eax,EMSEG:[edi].ExpSgn
	cmp	al,bTAG_ZERO
	jz	SqrtRet			;Root of zero is zero
	ja	SqrtSpcl		;Tag above bTAG_ZERO: special operand
SqrtDenMasked:
;eax = ExpSgn.  Sign is bit 7 of ah.
	or	ah,ah
	js	ReturnIndefinite
	mov	esi,EMSEG:[edi].lManHi
	mov	ecx,EMSEG:[edi].lManLo
	sar	EMSEG:[edi].wExp,1	;Divide exponent by two
;Must be MOV, not XOR: CY from the SAR above is tested by the JC below,
;and MOV leaves the flags alone.
	mov	edi,0			;Extend mantissa
	jc	RootAligned		;If odd exponent, leave it normalized
;Even exponent: shift the 96-bit value esi:ecx:edi right one bit.  The
;low dwords are shifted first so each still sees its upper neighbor's
;original low bit.
	shrd	edi,ecx,1
	shrd	ecx,esi,1
	shr	esi,1			;Denormalize, extending into edi
RootAligned:
;esi:ecx:edi has mantissa, 2 MSBs are left of binary point. Range is [1,4).
;
;Linear first guess: bh = high byte of (SQRT_COEF * high byte of mantissa
; + SQRT_INTERCEPT).  A carry out of the ADD means the guess reached 2.0,
;which is handled at MaxStartRoot.
	shld	eax,esi,16		;Get high word of mantissa
	movzx	ebx,ah			;High byte to bl
;UNDONE:  MASM 6 bug!!
;UNDONE:  SQRT_COEF (=0ADH) get sign extended!!
	mov	dx,SQRT_COEF		;UNDONE
	imul	bx,dx			;UNDONE
;UNDONE imul	bx,SQRT_COEF		;Product in bx
;Multiply by SQRT_COEF causes binary point to shift left 1 bit.
	add	bx,SQRT_INTERCEPT	;5-bit approx. square root in bh
	jc	MaxStartRoot
;Max absolute error is +/- 0.04546
;First Newton-Raphson step, 8 bits: ax (high word of mantissa) / bh,
;quotient in al.
	div	bh			;See how close we are
	add	bh,al			;quotient + divisor (always sets CY)
FirstTrialRoot:
;bx = low 16 bits of (quotient + divisor); the 17th bit is an implied 1
;(the carry out of the ADD above, or the 10000H term when arriving from
;MaxStartRoot).  Bit 0 of dx supplies that 1 as bx is shifted right.
;Avoid RCR because it takes 9 clocks on 386.  Use SHRD (3 clocks) instead.
	mov	dl,1			;Need bit set
	shrd	bx,dx,1			;(quotient + divisor)/2
;bx has 9-bit approx. square root, normalized
;Max absolute error is +0.001033, -0.003906
;Second Newton-Raphson step, 16 bits: high dword of mantissa / bx.
	movzx	eax,si
	shld	edx,esi,16		;dx:ax has high half mantissa
	div	bx			;Test our approximation
;High words of eax and ebx are zero, so the sum is a 17-bit value in ebx.
;Shifting left 15 puts its top bit in bit 31, which both halves the sum
;and normalizes it.
	add	ebx,eax			;quotient + divisor
	shl	ebx,15			;Normalize (quotient + divisor)/2
;ebx has 17-bit approx. square root, normalized
;Max absolute error is +0.000007629, -0.00001526
;Add adjustment factor to center the error range at +/-0.00001144
;(bit 5 of bh is bit 13 of ebx = 2^-18 with the binary point right of
;the MSB; the low 15 bits are zero after the shift, so OR acts as ADD)
	or	bh,20H			;Add in 0.000003815
StartThirdIteration:
;On entry: ebx = normalized trial root (17 bits or better),
;esi:ecx = high 64 bits of mantissa, edi = mantissa extension.
;Third Newton-Raphson step, 32 bits: esi:ecx / ebx.
	mov	edx,esi
	mov	eax,ecx
	div	ebx			;Test approximation
	stc				;Set bit for rounding (= 2.328E-10)
	adc	ebx,eax			;quotient + divisor + round bit
;The sum carries out of 32 bits; bit 0 of edx supplies the 1 that is
;shifted back in at the top.
;Avoid RCR because it takes 9 clocks on 386.  Use SHRD (3 clocks) instead.
	mov	dl,1			;Need bit set
	shrd	ebx,edx,1		;(quotient + divisor)/2, rounded
;ebx has 32-bit approx. square root, normalized
;Max absolute error is +2.983E-10, -2.328E-10
;Fourth step: 96-bit mantissa / ebx, done as two chained 64/32 divides
;giving a 64-bit quotient.
	mov	edx,esi			;Last time we need high half
	mov	eax,ecx
;ecx is rebuilt here as arg*4 (low part) for the comparison in RoundRoot.
;This must happen before the XCHG below, while edi is still the extension.
	shld	ecx,edi,2		;ecx = low half*4, w/extension back in
	div	ebx			;Test approximation
	xchg	edi,eax			;Save 1st quotient, get extension
;If both the extension and the remainder are zero, the division was
;exact and the second divide is unnecessary.
	mov	esi,eax
	or	esi,edx			;Any remainder?
	jz	HaveRoot		;Result is ebx:esi
	div	ebx			;edi:eax is 64-bit quotient
	add	ebx,edi			;quotient + divisor (always sets CY)
RoundRoot:
;On entry: ebx:eax = low 64 bits of root*2 (the 65th bit, the MSB, is an
;implied one), ecx = low part of arg*4.
	mov	esi,eax			;Save low half root*2

;We have 65-bit root*2 in ebx:esi (eax==esi) (MSB is implied one).
;Max absolute error is +4.450E-20, -5.421E-20.	This maximum error
;corresponds to just less than +/- 1 in the last (65th) bit.
;
;We have to determine if this error is positive or negative so
;we can tell if we rounded up or down (and set the status bit
;accordingly).	This is done by squaring the root and comparing
;that result with the input.
;
;Squaring the sample root requires summing partial products:
; lo*lo + lo*hi + hi*lo + hi*hi.  lo*hi == hi*lo, so only one multiply
;is needed there.  The low half of lo*lo isn't relevant, we know it
;is non-zero.  Only the low few bits of hi*hi are needed, so we can use
;an 8-bit multiply there.  Since the MSB is implied, we need to add in
;two 1*lo products (shifted up 64 bits).  We only need bits 64 - 71 of
;the 130-bit product (the action happens near bit 65).	What we're
;squaring is root*2, so the result is square*4.  ecx already has arg*4.

	mul	eax			;Low partial product of square
	mov	edi,edx			;Only high half counts
	mov	eax,ebx
	mul	esi			;Middle partial product of square
	add	eax,eax			;There are two of these
	adc	edx,edx
	add	edi,eax
	adc	edx,0			;edx:edi = lo*lo + lo*hi + hi*lo
	add	edx,esi			;lo*implied msb
	add	edx,esi			;lo*implied msb again
	mov	al,bl
	mul	al			;hi*hi - only low 8 bits are valid
	add	al,dl			;Bits 64 - 71 of product
	or	al,1			;Account for sticky bits 0 - 63
;Only the low byte is compared: the two values are close enough that
;the sign of the 8-bit difference tells which is larger.
	sub	cl,al			;Compare product with argument
;Sign flag set if product is larger.  In this case, subtract 1 from root.
;Doubling cl moves its sign bit into CY, where the SBB pair can use it.
	add	cl,cl			;Set CY if sign is set
SubOneFromRoot:
;Entered with CY set if the root must be reduced by one (also jumped to
;from HaveRoot, which always arrives with CY set).
	sbb	esi,0			;Reduce root if product was too big
	sbb	ebx,0
ShiftRoot:
;ebx:esi = root*2
;Absolute error is in the range (0, -5.421E-20).  This is equivalent to
;less than +1, -0 in last bit.	Thus LSB is correct rounding bit as
;long as we set a sticky bit below it.
;
;Now divide root*2 by 2, preserving LSB as rounding bit and filling
;eax with 1's as sticky bits.
;
;Avoid RCR because it takes 9 clocks on 386.  Use SHRD (3 clocks) instead.
	mov	eax,-1
	shrd	eax,esi,1		;Move round bit to MSB of eax
	shrd	esi,ebx,1
;Bit 0 of eax is still 1 (from the -1 above); it becomes the MSB of ebx,
;restoring the implied leading one of the root.
	shrd	ebx,eax,1		;Shift 1 into MSB of ebx
StoreRoot:
;Hand the result to the rounding routine selected by RoundMode.  Also
;reached from HaveRoot with an exact root in ebx and esi = eax = 0.
	mov	edi,EMSEG:[CURstk]
	mov	EMSEG:[Result],edi
	mov	ecx,EMSEG:[edi].ExpSgn
;mantissa in ebx:esi:eax, ExpSgn in ecx: sign in ch bit 7, exponent
;presumably in the high word of ecx (ebx holds mantissa, so it cannot
;carry the exponent) -- confirm against the rounding routines.
	jmp	EMSEG:[RoundMode]

HaveRoot:
;Reached from the fourth iteration when the 64/32 divide left no
;remainder and the mantissa extension was zero.
;On entry: edi = quotient, ebx = divisor (32-bit trial root).
;esi = eax = edx = 0
	cmp	edi,ebx			;Does quotient == divisor?
;If they match, ebx squared is exactly the argument; ebx:esi:eax is
;already the finished mantissa with zero round and sticky bits.
	jz	StoreRoot		;If so, we're done
;Quotient != divisor, so answer is not exact.  Since remainder is zero,
;the division was exact.  The only error in the result is e^2/2A, which
;is always positive.  We need the error to be only negative so that
;the rounding routine can properly tell if it rounded up.
;The carry out of this ADD is what makes the SBB pair at SubOneFromRoot
;subtract one from ebx:esi.
	add	ebx,edi			;quotient + divisor (always sets CY)
	jmp	SubOneFromRoot		;Reduce root to ensure negative error