#include "stdafx.h"
#pragma hdrstop
/***************************************************************************
*
*                INTEL Corporation Proprietary Information
*
*
*                  Copyright (c) 1996 Intel Corporation.
*                         All rights reserved.
*
***************************************************************************
            AUTHOR: Kumar Balasubramanian
***************************************************************************

** Pentium version of the "integer LLM mode" within IJG decompressor code.
** The following is a non-MMX Pentium implementation of the integer slow mode
** IDCT within the IJG code.
*/

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h"        /* Private declarations for DCT subsystem */

#ifdef DCT_ISLOW_SUPPORTED

/*
 * This module is specialized to the case DCTSIZE = 8.
 */

#if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif

/*
 * NOTE(review): CONST_BITS and PASS1_BITS are not referenced anywhere in the
 * code below.  The assembly hard-codes the shift counts 13, 11 and 18, which
 * match the BITS_IN_JSAMPLE == 8 settings (13, 13 - 2, 13 + 2 + 3), and it
 * stores one byte per output sample.  The #else values therefore appear to
 * have no effect on this routine -- confirm before building with any other
 * sample size.
 */
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1        /* lose a little precision to avoid overflow */
#endif

/*
 * Define the constants for the case BITS_IN_JSAMPLE = 8
 *
 * Each multiplier is the value in its name scaled by 2^13 (8192).  The
 * entries written with a leading 0x0ffff... are negative when read as the
 * 32-bit two's-complement operand that imul uses, i.e. the minus sign of the
 * corresponding term is folded into the constant.  The decimal value of each
 * 32-bit pattern is given alongside.
 */
static const INT32 const_0_2986 = 0x0000098E ;    /*   2446 */
static const INT32 const_0_3901 = 0x0fffff384;    /*  -3196 */
static const INT32 const_0_54119 = 0x00001151;    /*   4433 */
static const INT32 const_0_7653 = 0x0000187E;     /*   6270 */
static const INT32 const_0_899 = 0x0ffffe333;     /*  -7373 */
static const INT32 const_1_175 = 0x000025a1;      /*   9633 */
static const INT32 const_1_501 = 0x0000300b;      /*  12299 */
static const INT32 const_1_8477 = 0x0ffffc4df;    /* -15137 */
static const INT32 const_1_961 = 0x0ffffc13b;     /* -16069 */
static const INT32 const_2_053 = 0x000041b3;      /*  16819 */
static const INT32 const_2_562 = 0x0ffffadfd;     /* -20995 */
static const INT32 const_3_072 = 0x00006254;      /*  25172 */
static const INT32 const_round = 0x00000400;      /* 1 << 10, rounding for the >> 11 in the column pass */
static const INT32 const_round_row = 0x00020000;  /* 1 << 17, rounding for the >> 18 in the row pass */
static const INT32 const_mask = 0x000003ff;       /* 10-bit mask applied to each range_limit index */

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * Parameters, as used by the assembly below:
 *   inptr        8x8 block of 16-bit coefficients.  Element (row, col) is
 *                read from byte offset 16*row + 2*col and sign-extended.
 *   quantptr     16-bit multipliers in the same layout.  Every coefficient
 *                is multiplied by its matching entry in the column pass.
 *   wsptr        intermediate workspace.  Although declared short *, the
 *                column pass stores 64 32-bit values here (row stride 32
 *                bytes, column stride 4 bytes), so 256 bytes are written.
 *   output_buf   array of row pointers.  Entries 0..7 are fetched at 4-byte
 *                intervals, one per output row.
 *   output_col   byte offset added to each fetched row pointer.
 *   range_limit  byte table.  Each descaled result is masked with
 *                const_mask and used as an index into it.
 *
 * Pass 1 walks the eight columns of the input: dequantize, 1-D IDCT,
 * round with const_round, shift right by 11 and store into wsptr.
 * Pass 2 walks the eight rows of wsptr: 1-D IDCT, round with
 * const_round_row, shift right by 18, mask, look up in range_limit and
 * write 8 bytes to output_buf[row] + output_col as two 32-bit stores.
 *
 * The local "locdw..." variables are scratch storage for the assembly.
 * locdwinptr, locdwqptr and locdwwsptr hold addresses in INT32 variables and
 * output_buf entries are stepped by 4 bytes, so the code relies on 32-bit
 * pointers.
 */

GLOBAL(void)
pidct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
             JSAMPARRAY output_buf, JDIMENSION output_col,
             JSAMPLE *range_limit )
{
    INT32 locdwinptr, locdwqptr, locdwwsptr, locdwtmp0, locdwtmp1 ;
    INT32 locdwtmp2, locdwtmp3, locdwtmp00, locdwtmp01, locdwtmp02 ;
    INT32 locdwtmp03, locdwtmp10, locdwtmp11, locdwtmp12 ;
    INT32 locdwtmp13, locdwcounter, locdwrowctr ;

    /* Inline assembly to do the IDCT and store the result */
    __asm {

    /* Keep the three base pointers in locals.  esi and edi are reused as
       scratch inside each loop and are reloaded from these copies. */
    mov esi, inptr ; point to start of source
    mov edi, quantptr ;
    mov eax, wsptr

    mov locdwinptr, esi ; point to start of source
    mov locdwqptr, edi ;
    mov locdwwsptr, eax

    mov locdwcounter, 8

    /* Read four addresses 32 bytes apart from each input table.  The loaded
       values are not used. */
    mov eax, [esi] ; warm up the cache
    mov ebx, [esi+32]
    mov ecx, [esi+64]
    mov edx, [esi+96]
    mov eax, [edi]
    mov ebx, [edi+32]
    mov ecx, [edi+64]
    mov edx, [edi+96]

    ;; 1D-IDCT of all the eight columns
    /* One iteration per column.  [esi+16*k] is the coefficient in row k of
       the current column and [edi+16*k] is its multiplier. */
idct_column:
    mov esi, locdwinptr ; point to start of source
    mov edi, locdwqptr ;

    ;; do the even part
    mov ax, [esi+16*2]
    mov bx, [edi+16*2]
    shl eax, 16 ; sign extend the i/p
    mov cx, [esi+16*6]
    sar eax, 16
    mov dx, [edi+16*6]
    shl ebx, 16 ; sign extend the quant factor
    sar ebx, 16
    imul eax, ebx ; dequantized C2 = z2
    shl ecx, 16
    sar ecx, 16
    shl edx, 16
    sar edx, 16
    imul ecx, edx ; dequantized C6 = z3
    mov ebx, eax ; copy of z2
    imul eax, const_0_7653
    add ebx, ecx ; z2 + z3
    imul ecx, const_1_8477
    imul ebx, const_0_54119 ; z1
    mov dx, [edi+16*4] ; quant factor for C4
    add ecx, ebx ; tmp2
    add eax, ebx ; tmp3
    mov locdwtmp2, ecx
    mov locdwtmp3, eax

    mov cx, [esi+16*4] ; C4
    mov ax, [esi+16*0] ; C0
    mov bx, [edi+16*0] ; quant factor for C0
    movsx edx, dx
    movsx ecx, cx
    movsx eax, ax
    movsx ebx, bx
    imul ecx, edx ; dequantize C4 = z3
    imul eax, ebx ; dequantize C0 = z2
    mov edx, ecx ; copy of z3
    add ecx, eax ; z2 + z3
    shl ecx, 13 ; tmp0
    sub eax, edx ; z2 - z3
    shl eax, 13 ; tmp1

    /* The even-part results tmp10..tmp13 are parked in locdwtmp00..03. */
    mov ebx, ecx ; copy of tmp0
    add ecx, locdwtmp3 ; tmp10
    mov edx, eax ; copy of tmp1
    add eax, locdwtmp2 ; tmp11
    mov locdwtmp00, ecx
    sub ebx, locdwtmp3 ; tmp13
    mov locdwtmp01, eax
    sub edx, locdwtmp2 ; tmp12
    mov locdwtmp03, ebx
    mov ax, [esi+16*7] ; C7 for the odd part
    mov locdwtmp02, edx
    mov bx, [edi+16*7] ; quant factor for C7

    ;; now do the odd part
    shl eax, 16
    mov cx, [esi+16*3]
    sar eax, 16
    mov dx, [edi+16*3]
    shl ebx, 16
    sar ebx, 16
    imul eax, ebx ; dequantized C7 = tmp0
    shl ecx, 16
    sar ecx, 16
    shl edx, 16
    sar edx, 16
    mov bx, [esi+16*1]
    imul ecx, edx ; dequantized C3 = tmp2
    shl ebx, 16
    mov dx, [edi+16*1]
    sar ebx, 16
    shl edx, 16
    sar edx, 16
    imul ebx, edx ; dequantized C1 = tmp3

    /* locdwtmp0..3 keep the unscaled odd inputs.  Their scaled products go
       to locdwtmp10..13. */
    mov locdwtmp0, eax
    mov locdwtmp2, ecx
    mov ax, [esi+16*5]
    mov dx, [edi+16*5]
    shl eax, 16
    sar eax, 16
    shl edx, 16
    sar edx, 16
    imul eax, edx ; dequantized C5 = tmp1
    imul ecx, const_3_072 ; tmp2
    mov locdwtmp3, ebx
    mov edx, locdwtmp0
    imul ebx, const_1_501 ; tmp3
    imul edx, const_0_2986 ; tmp0
    mov locdwtmp1, eax ; store tmp1
    mov locdwtmp10, edx
    imul eax, const_2_053 ; tmp1
    mov locdwtmp11, eax
    mov locdwtmp12, ecx
    mov locdwtmp13, ebx

    mov eax, locdwtmp0
    mov ebx, locdwtmp1
    mov ecx, eax
    mov edx, ebx
    add eax, locdwtmp3 ; z1
    add ebx, locdwtmp3 ; z4
    add ecx, locdwtmp2 ; z3
    add edx, locdwtmp2 ; z2
    mov esi, ecx ; copy of z3
    imul eax, const_0_899 ; z1
    imul edx, const_2_562 ; z2
    add esi, ebx ; z3 + z4
    imul esi, const_1_175 ; z5
    imul ecx, const_1_961 ; z3
    imul ebx, const_0_3901 ; z4
    add ecx, esi ; z3
    add ebx, esi ; z4
    mov esi, eax ; copy of z1
    add eax, ecx ; z1 + z3
    add esi, ebx ; z1 + z4
    add ecx, edx ; z3 + z2
    add edx, ebx ; z2 + z4
    add eax, locdwtmp10 ; tmp0
    add edx, locdwtmp11 ; tmp1
    add ecx, locdwtmp12 ; tmp2
    add esi, locdwtmp13 ; tmp3

    /* Combine even and odd parts.  Each result wk is rounded with
       const_round, shifted right by 11 and stored at row k of the current
       workspace column, [edi+32*k]. */
    mov ebx, locdwtmp03
    sub ebx, eax ; w4
    add eax, locdwtmp03 ; w3
    add ebx, const_round
    mov edi, locdwwsptr ; keep in mind that wsptr stores 32 bit values
    sar ebx, 11 ; So store/update the pointer accordingly
    add eax, const_round
    sar eax, 11
    mov [edi+32*4], ebx
    mov [edi+32*3], eax

    mov ebx, locdwtmp02
    mov eax, locdwtmp01
    sub ebx, edx ; w5
    add edx, locdwtmp02 ; w2
    sub eax, ecx ; w6
    add ecx, locdwtmp01 ; w1
    add ebx, const_round
    sar ebx, 11
    add eax, const_round
    sar eax, 11
    add edx, const_round
    add ecx, const_round
    mov [edi+32*5], ebx
    sar edx, 11
    mov [edi+32*6], eax
    sar ecx, 11
    mov [edi+32*2], edx

    mov eax, locdwtmp00
    mov [edi+32*1], ecx
    mov ebx, eax
    sub eax, esi ; w7
    add ebx, esi ; w0
    add eax, const_round
    sar eax, 11
    add ebx, const_round
    sar ebx, 11
    mov [edi+32*7], eax
    mov [edi+32*0], ebx

    /* Step to the next column.  The dec sets the zero flag and the mov that
       follows leaves it unchanged for the jnz. */
    mov eax, locdwcounter
    add locdwinptr, 2
    add locdwwsptr, 4 ; wsptr stores 32 bit quantities
    add locdwqptr, 2
    dec eax
    mov locdwcounter, eax
    jnz idct_column

    ;; End of 1D-idct of all the columns

    ;; get ready for the 1D-idct of the rows
    mov esi, wsptr
    mov locdwcounter, 8
    mov locdwrowctr, 0
    mov locdwwsptr, esi

    ;; 1D-IDCT of all the eight rows
    /* One iteration per row.  edi becomes output_buf[row] + output_col, and
       locdwrowctr advances by 4 bytes, one 32-bit row pointer per row. */
idct_row:
    mov esi, locdwwsptr ; point to start of source
    mov edi, output_buf
    add edi, locdwrowctr
    mov edi, [edi]
    add locdwrowctr, 4
    add edi, output_col ; this is the dest start addr for this row

    /* The row inputs are the 32-bit values stored by the column pass, so
       there is no sign extension or multiply by quantptr here. */
    ;; do the even part
    mov eax, [esi+4*2]
    mov ecx, [esi+4*6]
    mov ebx, eax ; copy of z2
    mov edx, [edi] ; warm up the cache for writing this output row
    imul eax, const_0_7653
    add ebx, ecx ; z2 + z3
    imul ecx, const_1_8477
    imul ebx, const_0_54119 ; z1
    add ecx, ebx ; tmp2
    add eax, ebx ; tmp3
    mov locdwtmp2, ecx
    mov locdwtmp3, eax

    mov ecx, [esi+4*4] ; C4
    mov eax, [esi+4*0] ; C0
    mov edx, ecx ; copy of z3
    add ecx, eax ; z2 + z3
    sub eax, edx ; z2 - z3
    shl ecx, 13 ; tmp0
    shl eax, 13 ; tmp1

    mov ebx, ecx ; copy of tmp0
    add ecx, locdwtmp3 ; tmp10
    mov edx, eax ; copy of tmp1
    add eax, locdwtmp2 ; tmp11
    mov locdwtmp00, ecx
    sub ebx, locdwtmp3 ; tmp13
    mov locdwtmp01, eax
    sub edx, locdwtmp2 ; tmp12
    mov locdwtmp03, ebx
    mov eax, [esi+4*7] ; C7 for the odd part
    mov locdwtmp02, edx

    ;; now do the odd part
    mov ecx, [esi+4*3]
    mov ebx, [esi+4*1]
    mov locdwtmp0, eax
    mov locdwtmp2, ecx
    mov eax, [esi+4*5]
    mov locdwtmp3, ebx
    imul ecx, const_3_072 ; tmp2
    mov edx, locdwtmp0
    imul ebx, const_1_501 ; tmp3
    imul edx, const_0_2986 ; tmp0
    mov locdwtmp1, eax ; store tmp1
    imul eax, const_2_053 ; tmp1
    mov locdwtmp10, edx
    mov locdwtmp11, eax
    mov locdwtmp12, ecx
    mov locdwtmp13, ebx

    mov eax, locdwtmp0
    mov ebx, locdwtmp1
    mov ecx, eax
    mov edx, ebx
    add eax, locdwtmp3 ; z1
    add edx, locdwtmp2 ; z2
    add ebx, locdwtmp3 ; z4
    add ecx, locdwtmp2 ; z3
    mov esi, ecx ; copy of z3
    imul eax, const_0_899 ; z1
    imul edx, const_2_562 ; z2
    add esi, ebx ; z3 + z4
    imul esi, const_1_175 ; z5
    imul ecx, const_1_961 ; z3
    imul ebx, const_0_3901 ; z4
    add ecx, esi ; z3
    add ebx, esi ; z4
    mov esi, eax ; copy of z1
    add eax, ecx ; z1 + z3
    add esi, ebx ; z1 + z4
    add ecx, edx ; z3 + z2
    add edx, ebx ; z2 + z4
    add eax, locdwtmp10 ; tmp0
    add edx, locdwtmp11 ; tmp1
    add ecx, locdwtmp12 ; tmp2
    add esi, locdwtmp13 ; tmp3

    /* The final odd-part values are written back to locdwtmp0..3 so that all
       four registers are free for the output stage. */
    mov locdwtmp0, eax
    mov locdwtmp1, edx
    mov locdwtmp2, ecx
    mov locdwtmp3, esi

    /* Output stage.  Each result is rounded with const_round_row, shifted
       right by 18, masked with const_mask and looked up in range_limit.
       Bytes are produced in the order 3, 2, 1, 0 into al and 7, 6, 5, 4
       into dl, with a left shift by 8 between lookups, so out0 and out4 end
       up in the low byte of eax and edx respectively. */
    mov ebx, locdwtmp03
    add ebx, locdwtmp0 ; out3
    mov ecx, locdwtmp00
    sub ecx, locdwtmp3 ; out7
    add ebx, const_round_row
    sar ebx, 18
    add ecx, const_round_row
    sar ecx, 18
    mov esi, range_limit
    and ebx, const_mask
    and ecx, const_mask
    mov al, [esi][ebx]
    mov dl, [esi][ecx]

    mov ebx, locdwtmp02
    mov ecx, locdwtmp01
    add ebx, locdwtmp1 ; out2
    sub ecx, locdwtmp2 ; out6
    shl eax, 8 ; get ready to receive next output byte
    add ebx, const_round_row
    shl edx, 8 ; get ready to receive next output byte
    add ecx, const_round_row
    sar ebx, 18
    sar ecx, 18
    and ebx, const_mask
    and ecx, const_mask
    mov al, [esi][ebx]
    mov dl, [esi][ecx]

    mov ebx, locdwtmp01
    mov ecx, locdwtmp02
    add ebx, locdwtmp2 ; out1
    shl eax, 8 ; get ready to receive next output byte
    sub ecx, locdwtmp1 ; out5
    shl edx, 8 ; get ready to receive next output byte
    add ebx, const_round_row
    sar ebx, 18
    add ecx, const_round_row
    sar ecx, 18
    and ebx, const_mask
    and ecx, const_mask
    mov al, [esi][ebx] ; out1
    mov dl, [esi][ecx] ; out5

    mov ebx, locdwtmp00
    mov ecx, locdwtmp03
    add ebx, locdwtmp3 ; out0
    shl eax, 8 ; get ready to receive next output byte
    sub ecx, locdwtmp0 ; out4
    shl edx, 8 ; get ready to receive next output byte
    add ebx, const_round_row
    sar ebx, 18
    add ecx, const_round_row
    sar ecx, 18
    and ebx, const_mask
    and ecx, const_mask
    mov al, [esi][ebx] ; out0
    mov dl, [esi][ecx] ; out4

    mov [edi], eax ; store the first four bytes
    mov [edi+4], edx ; store the next four bytes of this row

    /* Step to the next workspace row, 8 values of 4 bytes each. */
    mov eax, locdwcounter
    add locdwwsptr, 32 ; wsptr stores 32 bit quantities
    dec eax
    mov locdwcounter, eax
    jnz idct_row

    } //end of __asm
}

#endif /* DCT_ISLOW_SUPPORTED */