#include "stdafx.h"
#pragma  hdrstop

/***************************************************************************
*
*                INTEL Corporation Proprietary Information  
*
*      
*                  Copyright (c) 1996 Intel Corporation.
*                         All rights reserved.
*
***************************************************************************
			AUTHOR:  Kumar Balasubramanian 
***************************************************************************

** Pentium version of the "integer LLM mode" within IJG decompressor code.
** The following is a non-MMX Pentium implementation of the integer slow mode
** IDCT within the IJG code.
*/


#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"		/* Private declarations for DCT subsystem */

#ifdef DCT_ISLOW_SUPPORTED


/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif


/*
 * The inline assembly in this file hard-codes the 8-bit-sample scaling:
 * shifts of 13, 11 and 18, a 0x3ff range mask, byte-wide loads from the
 * range-limit table and packed byte output stores.  Refuse to build for any
 * other sample size instead of silently producing wrong pixels.
 */
#if defined(BITS_IN_JSAMPLE) && BITS_IN_JSAMPLE != 8
  Sorry, this code only copes with 8-bit samples. /* deliberate syntax err */
#endif

#define CONST_BITS  13
#define PASS1_BITS  2

/*
 * Define the constants for the case BITS_IN_JSAMPLE = 8.
 *
 * Each multiplier is a fixed-point value scaled by 2^13 (8192).  The decimal
 * value and the approximate real number it represents are given alongside.
 *
 * NOTE: the names carry magnitudes only.  const_0_3901, const_0_899,
 * const_1_8477, const_1_961 and const_2_562 are stored as 32-bit two's
 * complement NEGATIVE numbers; the assembly relies on that and combines
 * their products with add.
 */

static const INT32 const_0_2986	=	0x0000098E ;	/*   2446 ~  0.2986 * 8192 */
static const INT32 const_0_3901	=	0x0fffff384;	/*  -3196 ~ -0.3901 * 8192 */
static const INT32 const_0_54119	=	0x00001151;	/*   4433 ~  0.5412 * 8192 */
static const INT32 const_0_7653	=	0x0000187E;	/*   6270 ~  0.7653 * 8192 */
static const INT32 const_0_899	=	0x0ffffe333;	/*  -7373 ~ -0.8999 * 8192 */
static const INT32 const_1_175	=	0x000025a1;	/*   9633 ~  1.1759 * 8192 */
static const INT32 const_1_501	=	0x0000300b;	/*  12299 ~  1.5013 * 8192 */
static const INT32 const_1_8477	=	0x0ffffc4df;	/* -15137 ~ -1.8477 * 8192 */
static const INT32 const_1_961	=	0x0ffffc13b;	/* -16069 ~ -1.9616 * 8192 */
static const INT32 const_2_053	=	0x000041b3;	/*  16819 ~  2.0531 * 8192 */
static const INT32 const_2_562	=	0x0ffffadfd;	/* -20995 ~ -2.5629 * 8192 */
static const INT32 const_3_072	=	0x00006254;	/*  25172 ~  3.0727 * 8192 */

/* Rounding term added before the column-pass shift right by 11 (1 << 10). */
static const INT32 const_round	=	0x00000400;
/* Rounding term added before the row-pass shift right by 18 (1 << 17). */
static const INT32 const_round_row	=	0x00020000;
/* Mask applied to a row-pass result before it indexes the range-limit table. */
static const INT32 const_mask		=	0x000003ff;


/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * The work is done by two passes of a 1-D integer IDCT written as 32-bit
 * x86 inline assembly:
 *   pass 1 walks the eight columns of inptr, multiplies each coefficient by
 *          the matching quantptr entry, and stores 32-bit intermediates in
 *          wsptr after computing (x + 0x400) >> 11;
 *   pass 2 walks the eight rows of wsptr, computes (x + 0x20000) >> 18,
 *          masks the result with 0x3ff and maps it through range_limit to
 *          produce eight output bytes per row.
 *
 * inptr       64 coefficients, read as signed 16-bit values, 8 per row
 *             (row stride 16 bytes).
 * quantptr    64 signed 16-bit multipliers in the same layout as inptr.
 * wsptr       scratch area.  Although declared short *, it is written and
 *             read as an 8x8 array of 32-bit values (row stride 32 bytes),
 *             so it must provide at least 256 bytes.
 * output_buf  array of row pointers; entries 0..7 are used, each fetched as
 *             a 4-byte value.
 * output_col  byte offset added to each row pointer to locate the first
 *             output sample of that row.
 * range_limit byte table indexed by (value & 0x3ff).
 *
 * Notes:
 *  - Pointers are kept in INT32 locals and 32-bit registers, so the routine
 *    is only valid for 32-bit x86 builds.
 *  - Shift counts, rounding constants and the byte-wide output are fixed in
 *    the assembly; CONST_BITS / PASS1_BITS are not consulted here.
 *  - const_0_3901, const_0_899, const_1_8477, const_1_961 and const_2_562
 *    hold NEGATIVE values despite their names, which is why their products
 *    are combined with add.
 *  - Each output row is written with two 32-bit stores (8 bytes starting at
 *    row pointer + output_col); the first dword of the row is also read
 *    once before it is written.
 *  - NOTE(review): instructions are laid out in blank-line separated pairs,
 *    presumably for Pentium U/V-pipe pairing; registers are reused right
 *    after being spilled, so do not reorder without re-checking liveness.
 */

GLOBAL(void)
pidct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
		 JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
{

/*
 * Spill slots used by the assembly below:
 *   locdwinptr / locdwqptr / locdwwsptr  running pointers for the loops
 *   locdwtmp0..3                         odd-part inputs, later odd-part results
 *   locdwtmp00..03                       even-part results tmp10, tmp11, tmp12, tmp13
 *   locdwtmp10..13                       odd-part inputs scaled by their own constant
 *   locdwcounter                         remaining iterations of the current pass
 *   locdwrowctr                          byte offset into output_buf for the row pass
 */
INT32   locdwinptr, locdwqptr, locdwwsptr, locdwtmp0, locdwtmp1 ;
INT32   locdwtmp2, locdwtmp3, locdwtmp00, locdwtmp01, locdwtmp02 ;
INT32   locdwtmp03, locdwtmp10, locdwtmp11, locdwtmp12 ;
INT32   locdwtmp13, locdwcounter, locdwrowctr ;	


// Inline assembly to do the IDCT and store the result

__asm {

/* Copy the pointer arguments into locals so the column loop can advance them. */
mov		esi, inptr	; point to start of source
mov		edi, quantptr	;

mov		eax, wsptr
mov		locdwinptr, esi	; point to start of source

mov		locdwqptr, edi	;
mov		locdwwsptr, eax

/*
 * The loads below only touch the coefficient and quantization blocks; every
 * register they write is overwritten before its value is used.
 */
mov	locdwcounter, 8
mov		eax, [esi]		; warm up the cache

mov		ebx, [esi+32]
mov		ecx, [esi+64]

mov		edx, [esi+96]
mov		eax, [edi]

mov		ebx, [edi+32]
mov		ecx, [edi+64]

mov		edx, [edi+96]

/*
 * Pass 1.  One iteration per column: element k of the column is the 16-bit
 * word at [esi+16*k], its multiplier is at [edi+16*k].  16-bit loads are
 * widened either with shl 16 / sar 16 or with movsx.
 */
;; 1D-IDCT of all the eight columns
idct_column:

mov		esi, locdwinptr	; point to start of source
mov		edi, locdwqptr		;

;; do the even part

mov		ax, [esi+16*2]
mov		bx, [edi+16*2]

shl		eax, 16		; sign extend the i/p
mov		cx, [esi+16*6]

sar		eax, 16
mov		dx, [edi+16*6]

shl		ebx, 16		; sign extend the quant factor

sar		ebx, 16

imul	eax, ebx	; dequantized C2 = z2

shl		ecx, 16

sar		ecx, 16

shl		edx, 16

sar		edx, 16

imul	ecx, edx	; dequantized C6 = z3

mov		ebx, eax	; copy of z2

imul	eax, const_0_7653

add		ebx, ecx	; z2 + z3

imul	ecx, const_1_8477

imul	ebx, const_0_54119	; z1

mov		dx, [edi+16*4]	; quant factor for C4
add		ecx, ebx	; tmp2

add		eax, ebx	; tmp3
mov		locdwtmp2, ecx

mov		locdwtmp3, eax

mov		cx, [esi+16*4]	; C4
mov		ax, [esi+16*0]	; C0

mov		bx, [edi+16*0]	; quant factor for C0

movsx	edx, dx

movsx	ecx, cx

movsx	eax, ax

movsx	ebx, bx

imul	ecx, edx	; dequantize C4 = z3

imul	eax, ebx	; dequantize C0 = z2

mov		edx, ecx	; copy of z3
add		ecx, eax	; z2 + z3

shl		ecx, 13		; tmp0
sub		eax, edx	; z2 - z3

shl		eax, 13		; tmp1
mov		ebx, ecx	; copy of tmp0

add		ecx, locdwtmp3	; tmp10
mov		edx, eax	; copy of tmp1

add		eax, locdwtmp2	; tmp11
mov		locdwtmp00, ecx

sub		ebx, locdwtmp3	; tmp13
mov		locdwtmp01, eax

sub		edx, locdwtmp2	; tmp12
mov		locdwtmp03, ebx

mov		ax, [esi+16*7]	; C7 for the odd part
mov		locdwtmp02, edx

mov		bx, [edi+16*7]	; quant factor for C7

;; now do the odd part

shl		eax, 16
mov		cx, [esi+16*3]

sar		eax, 16
mov		dx, [edi+16*3]

shl		ebx, 16

sar		ebx, 16

imul	eax, ebx		; dequantized C7 = tmp0

shl		ecx, 16

sar		ecx, 16

shl		edx, 16

sar		edx, 16
mov		bx, [esi+16*1]

imul	ecx, edx		; dequantized C3 = tmp2

shl		ebx, 16
mov		dx, [edi+16*1]

sar		ebx, 16

shl		edx, 16

sar		edx, 16

imul	ebx, edx		; dequantized C1 = tmp3

mov		locdwtmp0, eax
mov		locdwtmp2, ecx

mov		ax, [esi+16*5]
mov		dx, [edi+16*5]

shl		eax, 16

sar		eax, 16

shl		edx, 16

sar		edx, 16

imul	eax, edx	; dequantized C5 = tmp1

imul	ecx, const_3_072	; tmp2

mov		locdwtmp3, ebx
mov		edx, locdwtmp0

imul	ebx, const_1_501	; tmp3

imul	edx, const_0_2986	; tmp0

mov		locdwtmp1, eax	; store tmp1
mov		locdwtmp10, edx

imul	eax, const_2_053	; tmp1

mov		locdwtmp11, eax
mov		locdwtmp12, ecx

mov		locdwtmp13, ebx
mov		eax, locdwtmp0

mov		ebx, locdwtmp1
mov		ecx, eax

mov		edx, ebx
add		eax, locdwtmp3	; z1

add		ebx, locdwtmp3	; z4
add		ecx, locdwtmp2	; z3

/* esi is free from here on in this iteration; it is reloaded at idct_column. */
add		edx, locdwtmp2	; z2
mov		esi, ecx	; copy of z3

imul	eax,  const_0_899	; z1

imul	edx,  const_2_562	; z2

add		esi, ebx	; z3 + z4

imul	esi,  const_1_175	; z5

imul	ecx,  const_1_961	; z3

imul	ebx,  const_0_3901	; z4

add		ecx, esi	; z3
add		ebx, esi	; z4

mov		esi, eax	; copy of z1
add		eax, ecx	; z1 + z3

add		esi, ebx	; z1 + z4
add		ecx, edx	; z3 + z2

add		edx, ebx	; z2 + z4
add		eax, locdwtmp10		; tmp0

add		edx, locdwtmp11		; tmp1
add		ecx, locdwtmp12		; tmp2

add		esi, locdwtmp13		; tmp3
mov		ebx, locdwtmp03

/*
 * Combine even and odd parts.  wK below is the value stored at workspace
 * row K of this column, i.e. [edi+32*K], after adding const_round and
 * shifting right arithmetically by 11.
 */
sub		ebx, eax			; w4
add		eax, locdwtmp03		; w3

add		ebx,  const_round
mov		edi, locdwwsptr		; keep in mind that wsptr stores 32 bit values

sar		ebx, 11				; So store/update the pointer accordingly
add		eax,  const_round

sar		eax, 11
mov		[edi+32*4], ebx

mov		[edi+32*3], eax
mov		ebx, locdwtmp02

mov		eax, locdwtmp01
sub		ebx, edx			; w5

add		edx, locdwtmp02		; w2
sub		eax, ecx			; w6

add		ecx, locdwtmp01		; w1
add		ebx,  const_round

sar		ebx, 11
add		eax,  const_round

sar		eax, 11
add		edx,  const_round

add		ecx,  const_round
mov		[edi+32*5], ebx

sar		edx, 11
mov		[edi+32*6], eax

sar		ecx, 11
mov		[edi+32*2], edx

mov		eax, locdwtmp00
mov		[edi+32*1], ecx

mov		ebx, eax
sub		eax, esi			; w7

add		ebx, esi			; w0
add		eax, const_round

sar		eax, 11
add		ebx, const_round

sar		ebx, 11
mov		[edi+32*7], eax

mov		[edi+32*0], ebx
mov		eax, locdwcounter

/*
 * Step to the next column: 2 bytes in the 16-bit inputs, 4 bytes in the
 * 32-bit workspace.  The final mov does not change flags, so jnz tests the
 * result of dec eax.
 */
add		locdwinptr, 2
add		locdwwsptr, 4		; wsptr stores 32 bit quantities

add		locdwqptr, 2
dec		eax

mov		locdwcounter, eax
jnz		idct_column

;; End of 1D-idct of all the columns

;; get ready for the 1D-idct of the rows

mov		esi, wsptr
mov		locdwcounter, 8

mov		locdwrowctr, 0
mov		locdwwsptr, esi


/*
 * Pass 2.  One iteration per row: esi points at eight consecutive 32-bit
 * workspace values, edi at the first output byte of the row
 * (output_buf entry selected by locdwrowctr, plus output_col).
 */
;; 1D-IDCT of all the eight rows
idct_row:

mov		esi, locdwwsptr	; point to start of source
mov		edi, output_buf	

add		edi, locdwrowctr
mov		edi, [edi]

add		locdwrowctr, 4
add		edi, output_col	; this is the dest start addr for this row


;; do the even part

mov		eax, [esi+4*2]
mov		ecx, [esi+4*6]

mov		ebx, eax	; copy of z2
mov		edx, [edi]	; warm up the cache for writing this output row

imul	eax, const_0_7653

add		ebx, ecx	; z2 + z3

imul	ecx,  const_1_8477

imul	ebx,  const_0_54119	; z1

add		ecx, ebx	; tmp2
add		eax, ebx	; tmp3

mov		locdwtmp2, ecx
mov		locdwtmp3, eax

mov		ecx, [esi+4*4]	; C4
mov		eax, [esi+4*0]	; C0

mov		edx, ecx	; copy of z3

add		ecx, eax	; z2 + z3
sub		eax, edx	; z2 - z3

shl		ecx, 13		; tmp0

shl		eax, 13		; tmp1
mov		ebx, ecx	; copy of tmp0

add		ecx, locdwtmp3	; tmp10
mov		edx, eax	; copy of tmp1

add		eax, locdwtmp2	; tmp11
mov		locdwtmp00, ecx

sub		ebx, locdwtmp3	; tmp13
mov		locdwtmp01, eax

sub		edx, locdwtmp2	; tmp12
mov		locdwtmp03, ebx

mov		eax, [esi+4*7]	; C7 for the odd part
mov		locdwtmp02, edx

;; now do the odd part

mov		ecx, [esi+4*3]
mov		ebx, [esi+4*1]

mov		locdwtmp0, eax
mov		locdwtmp2, ecx

mov		eax, [esi+4*5]
mov		locdwtmp3, ebx

imul	ecx,  const_3_072	; tmp2

mov		edx, locdwtmp0

imul	ebx,  const_1_501	; tmp3

imul	edx,  const_0_2986	; tmp0

mov		locdwtmp1, eax	; store tmp1

imul	eax, const_2_053	; tmp1

mov		locdwtmp10, edx
mov		locdwtmp11, eax

mov		locdwtmp12, ecx
mov		locdwtmp13, ebx

mov		eax, locdwtmp0
mov		ebx, locdwtmp1

mov		ecx, eax
mov		edx, ebx

add		eax, locdwtmp3	; z1
add		edx, locdwtmp2	; z2

add		ebx, locdwtmp3	; z4
add		ecx, locdwtmp2	; z3

/* All row inputs have been loaded; esi becomes a scratch register. */
mov		esi, ecx	; copy of z3

imul	eax,  const_0_899	; z1

imul	edx, const_2_562	; z2

add		esi, ebx	; z3 + z4

imul	esi, const_1_175	; z5

imul	ecx, const_1_961	; z3

imul	ebx,  const_0_3901	; z4

add		ecx, esi	; z3
add		ebx, esi	; z4

mov		esi, eax	; copy of z1
add		eax, ecx	; z1 + z3

add		esi, ebx	; z1 + z4
add		ecx, edx	; z3 + z2

add		edx, ebx	; z2 + z4
add		eax, locdwtmp10		; tmp0

add		edx, locdwtmp11		; tmp1
add		ecx, locdwtmp12		; tmp2

/* Spill the four odd-part results; locdwtmp0..3 are reused to hold them. */
add		esi, locdwtmp13		; tmp3
mov		locdwtmp0, eax

mov		locdwtmp1, edx
mov		locdwtmp2, ecx

mov		locdwtmp3, esi
mov		ebx, locdwtmp03

/*
 * Produce the eight output bytes.  Each value gets const_round_row added,
 * is shifted right arithmetically by 18, masked with const_mask and used as
 * a byte index into range_limit (held in esi).
 *
 * eax collects out3, out2, out1, out0 and edx collects out7, out6, out5,
 * out4, most significant byte first: a byte is loaded into al / dl and then
 * moved up by shl 8.  The three shifts also push out whatever the upper 24
 * bits held beforehand.  Stored as dwords, out0 lands at [edi] and out4 at
 * [edi+4].
 */
add		ebx, locdwtmp0	; out3
mov		ecx, locdwtmp00

sub		ecx, locdwtmp3	; out7
add		ebx,  const_round_row

sar		ebx, 18
add		ecx,  const_round_row

sar		ecx, 18
mov		esi, range_limit

and		ebx,  const_mask
and		ecx, const_mask

mov		al, [esi][ebx]
mov		dl, [esi][ecx]

mov		ebx, locdwtmp02
mov		ecx, locdwtmp01

add		ebx, locdwtmp1	; out2
sub		ecx, locdwtmp2	; out6

shl		eax, 8		; get ready to receive next output byte
add		ebx,  const_round_row

shl		edx, 8		; get ready to receive next output byte
add		ecx, const_round_row

sar		ebx, 18

sar		ecx, 18
and		ebx,  const_mask

and		ecx,  const_mask
mov		al, [esi][ebx]

mov		dl, [esi][ecx]
mov		ebx, locdwtmp01

mov		ecx, locdwtmp02
add		ebx, locdwtmp2	; out1

shl		eax, 8		; get ready to receive next output byte
sub		ecx, locdwtmp1	; out5

shl		edx, 8		; get ready to receive next output byte
add		ebx,  const_round_row

sar		ebx, 18
add		ecx,  const_round_row

sar		ecx, 18
and		ebx,  const_mask

and		ecx,  const_mask
mov		al, [esi][ebx]	; out1

mov		dl, [esi][ecx]	; out5
mov		ebx, locdwtmp00

mov		ecx, locdwtmp03
add		ebx, locdwtmp3	; out0

shl		eax, 8		; get ready to receive next output byte
sub		ecx, locdwtmp0	; out4

shl		edx, 8		; get ready to receive next output byte
add		ebx,  const_round_row

sar		ebx, 18
add		ecx,  const_round_row

sar		ecx, 18
and		ebx,  const_mask

and		ecx,  const_mask
mov		al, [esi][ebx]	; out0

mov		dl, [esi][ecx]	; out4
mov		[edi], eax		; store the first four bytes

mov		[edi+4], edx	; store the next four bytes of this row
mov		eax, locdwcounter

/*
 * Step to the next workspace row (8 values * 4 bytes).  As in pass 1, the
 * final mov leaves the flags from dec eax intact for jnz.
 */
add		locdwwsptr, 32		; wsptr stores 32 bit quantities
dec		eax

mov		locdwcounter, eax
jnz		idct_row

} //end of __asm

}

#endif /* DCT_ISLOW_SUPPORTED */