windows-nt/Source/XPSP1/NT/shell/shell32/tngen/mfint.cpp

2138 lines
53 KiB
C++
Raw Normal View History

2020-09-26 03:20:57 -05:00
#include "stdafx.h"
#pragma hdrstop
/***************************************************************************
*
* INTEL Corporation Proprietary Information
*
*
* Copyright (c) 1996 Intel Corporation.
* All rights reserved.
*
***************************************************************************
*/
/*
* jfdctint.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains a slow-but-accurate integer implementation of the
* forward DCT (Discrete Cosine Transform).
*
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
* on each column. Direct algorithms are also available, but they are
* much more complex and seem not to be any faster when reduced to code.
*
* This implementation is based on an algorithm described in
* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
* The primary algorithm described there uses 11 multiplies and 29 adds.
* We use their alternate method with 12 multiplies and 32 adds.
* The advantage of this method is that no data path contains more than one
* multiplication; this allows a very simple and accurate implementation in
* scaled fixed-point arithmetic, with a minimal number of shifts.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
#ifdef DCT_ISLOW_SUPPORTED
/*
* This module is specialized to the case DCTSIZE = 8.
*/
#if DCTSIZE != 8
Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif
/*
* The poop on this scaling stuff is as follows:
*
* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
* larger than the true DCT outputs. The final outputs are therefore
* a factor of N larger than desired; since N=8 this can be cured by
* a simple right shift at the end of the algorithm. The advantage of
* this arrangement is that we save two multiplications per 1-D DCT,
* because the y0 and y4 outputs need not be divided by sqrt(N).
* In the IJG code, this factor of 8 is removed by the quantization step
* (in jcdctmgr.c), NOT in this module.
*
* We have to do addition and subtraction of the integer inputs, which
* is no problem, and multiplication by fractional constants, which is
* a problem to do in integer arithmetic. We multiply all the constants
* by CONST_SCALE and convert them to integer constants (thus retaining
* CONST_BITS bits of precision in the constants). After doing a
* multiplication we have to divide the product by CONST_SCALE, with proper
* rounding, to produce the correct output. This division can be done
* cheaply as a right shift of CONST_BITS bits. We postpone shifting
* as long as possible so that partial sums can be added together with
* full fractional precision.
*
* The outputs of the first pass are scaled up by PASS1_BITS bits so that
* they are represented to better-than-integral precision. These outputs
* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
* with the recommended scaling. (For 12-bit sample data, the intermediate
* array is INT32 anyway.)
*
* To avoid overflow of the 32-bit intermediate results in pass 2, we must
* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
* shows that the values given below are the most effective.
*/
/*
 * Fixed-point precision selection.
 * CONST_BITS  - fractional bits kept in the multiplier constants.
 * PASS1_BITS  - extra fractional bits carried by the first (row) pass output.
 *
 * NOTE(review): the MMX code in mfdct8x8llm does not reference these macros;
 * it uses literal shift counts of 2, 11 and 15 and rounding terms of 1024 and
 * 16384, which match PASS1_BITS, CONST_BITS-PASS1_BITS and
 * CONST_BITS+PASS1_BITS only for the 8-bit-sample settings (13 and 2).
 * Changing the values here will not change the assembly.
 */
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS 13
#define PASS1_BITS 2
#else
#define CONST_BITS 13
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
#endif
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
* causing a lot of useless floating-point operations at run time.
* To get around this we use the following pre-calculated constants.
* If you change CONST_BITS you may want to add appropriate values.
* (With a reasonable C compiler, you can just rely on the FIX() macro...)
*/
/*
 * Scaled DCT multiplier constants.
 * In the CONST_BITS == 13 branch each value is the named real constant
 * multiplied by 2^13 (8192) and rounded to the nearest integer, e.g.
 * 0.541196100 * 8192 = 4433.48 -> 4433.
 * The other branch defers to the FIX() macro, which is not defined in this
 * file (presumably supplied by jdct.h -- confirm).
 * The same CONST_BITS == 13 values are repeated in hexadecimal inside the
 * packed Const_*FIX_* 64-bit tables that follow these macros; keep the two
 * sets in sync if any value is changed.
 */
#if CONST_BITS == 13
#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
#else
#define FIX_0_298631336 FIX(0.298631336)
#define FIX_0_390180644 FIX(0.390180644)
#define FIX_0_541196100 FIX(0.541196100)
#define FIX_0_765366865 FIX(0.765366865)
#define FIX_0_899976223 FIX(0.899976223)
#define FIX_1_175875602 FIX(1.175875602)
#define FIX_1_501321110 FIX(1.501321110)
#define FIX_1_847759065 FIX(1.847759065)
#define FIX_1_961570560 FIX(1.961570560)
#define FIX_2_053119869 FIX(2.053119869)
#define FIX_2_562915447 FIX(2.562915447)
#define FIX_3_072711026 FIX(3.072711026)
#endif
/*
 * Packed 64-bit operands for the MMX code in mfdct8x8llm.  Each one is used
 * directly as the memory operand of an MMX instruction (paddd, paddw, pxor,
 * pmaddwd or movq).
 *
 * Helper values:
 *   Const_1     - 1 in each 32-bit lane.  Added after an XOR with Const_FFFF
 *                 to finish a two's-complement negation of both dwords.
 *   Const_2     - 2 in each 16-bit lane.  Rounding term added before the
 *                 "psraw 2" descale in the column pass.
 *   Const_1024  - 1024 in each 32-bit lane.  Rounding term added before the
 *                 "psrad 11" descale.
 *   Const_16384 - 16384 in each 32-bit lane.  Rounding term added before the
 *                 "psrad 15" descale.
 *   Const_FFFF  - all bits set.  XOR mask used to invert a register.
 */
const __int64 Const_1 = 0x0000000100000001;
const __int64 Const_2 = 0x0002000200020002;
const __int64 Const_1024 = 0x0000040000000400;
const __int64 Const_16384 = 0x0000400000004000;
const __int64 Const_FFFF = 0xFFFFFFFFFFFFFFFF;
/*
 * Multiplier pairs for pmaddwd.  The hexadecimal word in each entry is the
 * CONST_BITS == 13 value of the like-named FIX_* macro (for example
 * 0x098e == 2446 == FIX_0_298631336).
 *
 *   Const_0xFIX_<c> - constant in the low word of each dword, zero in the
 *                     high word.  pmaddwd against a register of four words
 *                     therefore yields | w2*c | w0*c | (the even elements).
 *   Const_FIX_<c>x0 - constant in the high word of each dword, zero in the
 *                     low word.  pmaddwd yields | w3*c | w1*c | (the odd
 *                     elements).
 *
 * The even and odd dword results are later re-interleaved with
 * punpckldq/punpckhdq and narrowed back to words with packssdw.
 */
const __int64 Const_0xFIX_0_298631336 = 0x0000098e0000098e;
const __int64 Const_FIX_0_298631336x0 = 0x098e0000098e0000;
const __int64 Const_0xFIX_0_390180644 = 0x00000c7c00000c7c;
const __int64 Const_FIX_0_390180644x0 = 0x0c7c00000c7c0000;
const __int64 Const_0xFIX_0_541196100 = 0x0000115100001151;
const __int64 Const_FIX_0_541196100x0 = 0x1151000011510000;
const __int64 Const_0xFIX_0_765366865 = 0x0000187e0000187e;
const __int64 Const_FIX_0_765366865x0 = 0x187e0000187e0000;
const __int64 Const_0xFIX_0_899976223 = 0x00001ccd00001ccd;
const __int64 Const_FIX_0_899976223x0 = 0x1ccd00001ccd0000;
const __int64 Const_0xFIX_1_175875602 = 0x000025a1000025a1;
const __int64 Const_FIX_1_175875602x0 = 0x25a1000025a10000;
const __int64 Const_0xFIX_1_501321110 = 0x0000300b0000300b;
const __int64 Const_FIX_1_501321110x0 = 0x300b0000300b0000;
const __int64 Const_0xFIX_1_847759065 = 0x00003b2100003b21;
const __int64 Const_FIX_1_847759065x0 = 0x3b2100003b210000;
const __int64 Const_0xFIX_1_961570560 = 0x00003ec500003ec5;
const __int64 Const_FIX_1_961570560x0 = 0x3ec500003ec50000;
const __int64 Const_0xFIX_2_053119869 = 0x000041b3000041b3;
const __int64 Const_FIX_2_053119869x0 = 0x41b3000041b30000;
const __int64 Const_0xFIX_2_562915447 = 0x0000520300005203;
const __int64 Const_FIX_2_562915447x0 = 0x5203000052030000;
const __int64 Const_0xFIX_3_072711026 = 0x0000625400006254;
const __int64 Const_FIX_3_072711026x0 = 0x6254000062540000;
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
* For 8-bit samples with the recommended scaling, all the variable
* and constant values involved are no more than 16 bits wide, so a
* 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
* For 12-bit samples, a full 32-bit multiplication will be needed.
*/
/*
 * MULTIPLY(var,const): product of a variable and a scaled constant.
 * For 8-bit samples it forwards to MULTIPLY16C16, which is not defined in
 * this file (presumably declared in jdct.h -- confirm); otherwise it is a
 * plain C multiplication.
 * NOTE(review): the macro is not referenced by the visible MMX code, which
 * performs its multiplications with pmaddwd; it appears to be retained from
 * the C implementation this file was derived from.
 */
#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const) ((var) * (const))
#endif
/*
 * Byte stride between consecutive rows of the coefficient block as addressed
 * by the MMX code in mfdct8x8llm, which forms addresses as
 * [edi][DATASIZE*row + byte_offset].  32 bytes corresponds to 8 elements of
 * 4 bytes each.
 * NOTE(review): this assumes DCTELEM is 32 bits wide (the assembly loads
 * pairs of 32-bit values and packs them to 16 bits) -- confirm against the
 * DCTELEM typedef in jdct.h.
 */
#define DATASIZE 32
/*
* Perform the forward DCT on one block of samples.
*/
GLOBAL(void)
mfdct8x8llm (DCTELEM * data)
{
__int64 qwTemp0, qwTemp2, qwTemp4, qwTemp6;
__int64 qwZ1, qwZ2, qwZ4_even, qwZ4_odd;
__int64 qwTmp4_Z3_Even, qwTmp4_Z3_Odd;
__int64 qwTmp6_Z3_Even, qwTmp6_Z3_Odd;
__int64 qwTmp5_Z4_Even, qwTmp5_Z4_Odd;
__int64 qwScratch7, qwScratch6, qwScratch5;
__asm{
mov edi, [data]
// transpose the bottom right quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
// Get the 32-bit quantities and pack into 16 bits
movq mm5, [edi][DATASIZE*4+16] //| w41 | w40 |
movq mm3, [edi][DATASIZE*4+24] //| w43 | w42 |
movq mm6, [edi][DATASIZE*5+16]
packssdw mm5, mm3 //|w43|w42|w41|w40|
movq mm7, [edi][DATASIZE*5+24]
movq mm4, mm5 // copy w4---0,1,3,5,6
movq mm3, [edi][DATASIZE*6+16]
packssdw mm6, mm7
movq mm2, [edi][DATASIZE*6+24]
punpcklwd mm5, mm6 //mm6 = w5
movq mm1, [edi][DATASIZE*7+16]
packssdw mm3, mm2
movq mm0, [edi][DATASIZE*7+24]
punpckhwd mm4, mm6 //---0,1,3,5,6
packssdw mm1, mm0
movq mm7, mm3 //---0,1,2,3,5,6 w6
punpcklwd mm3, mm1 //mm1 = w7
movq mm0, mm5 //---0,2,3,4,5,6,7
movq mm2, [edi][DATASIZE*4] //| w01 | w00 |
punpckhdq mm0, mm3 // transposed w5---0,2,4,6,7
punpckhwd mm7, mm1 //---0,2,3,5,6,7
movq mm1, [edi][DATASIZE*5+8]
movq mm6, mm4 //---0,2,3,4,6,7
movq [edi][DATASIZE*5+16], mm0 // store w5
punpckldq mm5, mm3 // transposed w4
movq mm3, [edi][DATASIZE*5]
punpckldq mm4, mm7 // transposed w6
movq mm0, [edi][DATASIZE*4+8] //| w03 | w02 |
punpckhdq mm6, mm7 // transposed w7---0,3,6,7
// transpose the bottom left quadrant(4X4) of the matrix and place
// in the top right quadrant while doing the same for the top
// right quadrant
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq [edi][DATASIZE*4+16], mm5 // store w4
packssdw mm2, mm0 //|w03|w02|w01|w00|
movq mm5, [edi][DATASIZE*7]
packssdw mm3, mm1
movq mm0, [edi][DATASIZE*7+8]
movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
packssdw mm5, mm0
movq mm6, [edi][DATASIZE*6]
movq mm0, mm2 // copy w0---0,1,3,5,6
movq mm7, [edi][DATASIZE*6+8]
punpcklwd mm2, mm3 //mm6 = w1
movq [edi][DATASIZE*6+16], mm4 // store w6---3,5,6,7
packssdw mm6, mm7
movq mm1, [edi][DATASIZE*0+24]
punpckhwd mm0, mm3 //---0,1,3,5,6
movq mm7, mm6 //---0,1,2,3,5,6 w2
punpcklwd mm6, mm5 //mm1 = w3
movq mm3, [edi][DATASIZE*0+16]
punpckhwd mm7, mm5 //---0,2,3,5,6,7
movq mm4, [edi][DATASIZE*2+24]
packssdw mm3, mm1
movq mm1, mm2 //---0,2,3,4,5,6,7
punpckldq mm2, mm6 // transposed w4
movq mm5, [edi][DATASIZE*2+16]
punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
movq [edi][DATASIZE*0+16], mm2 // store w4
packssdw mm5, mm4
movq mm4, [edi][DATASIZE*1+16]
movq mm6, mm0 //---0,2,3,4,6,7
movq mm2, [edi][DATASIZE*1+24]
punpckldq mm0, mm7 // transposed w6
movq [edi][DATASIZE*1+16], mm1 // store w5
punpckhdq mm6, mm7 // transposed w7---0,3,6,7
movq mm7, [edi][DATASIZE*3+24]
packssdw mm4, mm2
movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
movq mm1, mm3 // copy w4---0,1,3,5,6
movq mm2, [edi][DATASIZE*3+16]
punpcklwd mm3, mm4 //mm6 = w5
movq [edi][DATASIZE*3+16], mm6 // store w7---5,6,7
packssdw mm2, mm7
// transpose the bottom left quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq mm6, [edi][DATASIZE*0] //| w01 | w00 |
punpckhwd mm1, mm4 //---0,1,3,5,6
movq mm7, mm5 //---0,1,2,3,5,6 w6
punpcklwd mm5, mm2 //mm1 = w7
movq mm4, [edi][DATASIZE*0+8] //| w03 | w02 |
punpckhwd mm7, mm2 //---0,2,3,5,6,7
movq mm0, mm3 //---0,2,3,4,5,6,7
packssdw mm6, mm4 //|w03|w02|w01|w00|
movq mm2, [edi][DATASIZE*2+8]
punpckldq mm3, mm5 // transposed w4
movq mm4, [edi][DATASIZE*1]
punpckhdq mm0, mm5 // transposed w5---0,2,4,6,7
movq [edi][DATASIZE*4], mm3 // store w4
movq mm5, mm1 //---0,2,3,4,6,7
movq mm3, [edi][DATASIZE*2]
punpckldq mm1, mm7 // transposed w6
movq [edi][DATASIZE*5], mm0 // store w5
punpckhdq mm5, mm7 // transposed w7---0,3,6,7
movq mm7, [edi][DATASIZE*1+8]
packssdw mm3, mm2
movq [edi][DATASIZE*7], mm5 // store w7---5,6,7
movq mm2, mm6 // copy w0---0,1,3,5,6
movq [edi][DATASIZE*6], mm1 // store w6---3,5,6,7
packssdw mm4, mm7
// transpose the top left quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
// Get the 32-bit quantities and pack into 16 bits
movq mm1, [edi][DATASIZE*3]
punpcklwd mm6, mm4 //mm6 = w1
movq mm0, [edi][DATASIZE*3+8]
punpckhwd mm2, mm4 //---0,1,3,5,6
packssdw mm1, mm0
movq mm5, mm3 //---0,1,2,3,5,6 w2
punpcklwd mm3, mm1 //mm1 = w3
movq mm0, mm6 //---0,2,3,4,5,6,7
movq mm4, [edi][DATASIZE*7]
punpckhwd mm5, mm1 //---0,2,3,5,6,7
movq mm1, [edi][DATASIZE*4]
punpckhdq mm6, mm3 // transposed w4
punpckldq mm0, mm3 // transposed w5---0,2,4,6,7
movq mm3, mm2 //---0,2,3,4,6,7
movq [edi][DATASIZE*0], mm0 // store w4
punpckldq mm2, mm5 // transposed w6
movq [edi][DATASIZE*1], mm6 // store w5
punpckhdq mm3, mm5 // transposed w7---0,3,6,7
movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
paddw mm0, mm4
movq [edi][DATASIZE*3], mm3 // store w7---5,6,7
paddw mm3, mm1
//******************************************************************************
// End of transpose. Begin row dct.
//******************************************************************************
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
movq mm7, mm0
paddw mm0, mm3 //tmp10
paddw mm6, [edi][DATASIZE*6]
psubw mm7, mm3 //tmp13
paddw mm2, [edi][DATASIZE*5]
movq mm1, mm6
// tmp10 = tmp0 + tmp3;
paddw mm1, mm2 //tmp11
psubw mm6, mm2 //tmp12
// dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
// dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
movq mm3, mm0
paddw mm0, mm1 //tmp10 + tmp11
psubw mm3, mm1 //tmp10 - tmp11
psllw mm0, 2 // descale it
movq mm1, mm6 //copy tmp12
psllw mm3, 2 // descale it
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
movq qwTemp0, mm0 //store
paddw mm1, mm7 //tmp12 + tmp13
movq mm2, mm1 //copy
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
// CONST_BITS-PASS1_BITS);
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
// CONST_BITS-PASS1_BITS);
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
movq mm4, mm7
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
movq mm0, mm6
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
paddd mm7, mm1 // add z1
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
paddd mm7, Const_1024
paddd mm4, mm2
paddd mm4, Const_1024
psrad mm7, 11 // descale it | |R2| |R0|
//!!!!!! Negate the results in mm6 and mm0
pxor mm6, Const_FFFF //invert result
psrad mm4, 11 // descale it | |R3| |R1|
paddd mm6, Const_1 // 2's complement
movq mm5, mm7
pxor mm0, Const_FFFF //invert result
punpckldq mm7, mm4 //| |R1| |R0|
paddd mm0, Const_1 // 2's complement
punpckhdq mm5, mm4 //| |R3| |R2|
movq qwTemp4, mm3 //store
packssdw mm7, mm5
movq mm5, Const_1024
paddd mm6, mm1 // add z1
movq qwTemp2, mm7 //store
paddd mm6, mm5
paddd mm0, mm2
psrad mm6, 11 // descale it | |R2| |R0|
paddd mm0, mm5
movq mm5, mm6
movq mm4, [edi][DATASIZE*3]
psrad mm0, 11 // descale it | |R3| |R1|
psubw mm4, [edi][DATASIZE*4]
punpckldq mm6, mm0 //| |R1| |R0|
movq mm7, [edi][DATASIZE*0]
punpckhdq mm5, mm0 //| |R3| |R2|
psubw mm7, [edi][DATASIZE*7]
packssdw mm6, mm5
// tmp4 = dataptr[3] - dataptr[4];
movq mm5, [edi][DATASIZE*2]
movq mm0, mm4
psubw mm5, [edi][DATASIZE*5]
movq mm2, mm4
movq qwTemp6, mm6 //store
paddw mm0, mm7 //z1
movq mm6, [edi][DATASIZE*1]
movq mm1, mm5
psubw mm6, [edi][DATASIZE*6]
movq mm3, mm5
// z1 = tmp4 + tmp7;
movq qwScratch5, mm5
paddw mm3, mm7 //z4
movq qwScratch7, mm7
paddw mm2, mm6 //z3
movq qwZ1, mm0 //store
paddw mm1, mm6 //z2
// z3 = MULTIPLY(z3, - FIX_1_961570560);
// z4 = MULTIPLY(z4, - FIX_0_390180644);
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
movq mm0, Const_FFFF
movq mm5, mm2
movq qwZ2, mm1
movq mm7, mm2
pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
paddw mm2, mm3 //z3 + z4
pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
movq mm1, mm3
movq qwScratch6, mm6
movq mm6, mm2
// z3 += z5;
//!!!!!! Negate the results
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
pxor mm5, mm0 //invert result
paddd mm5, Const_1 // 2's complement
pxor mm7, mm0 //invert result
pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
paddd mm5, mm2 //z3_even
paddd mm7, Const_1 // 2's complement
pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
pxor mm3, mm0 //invert result
// z4 += z5;
//!!!!!! Negate the results
paddd mm3, Const_1 // 2's complement
pxor mm1, mm0 //invert result
paddd mm1, Const_1 // 2's complement
paddd mm3, mm2
movq mm0, qwScratch6
movq mm2, mm4
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
paddd mm7, mm6 //z3_odd
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
paddd mm1, mm6
movq mm6, mm0
paddd mm4, mm5
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
paddd mm2, mm7
pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
movq qwTmp4_Z3_Odd, mm2
movq qwTmp4_Z3_Even, mm4
paddd mm6, mm5
movq mm5, qwScratch5
paddd mm0, mm7
movq mm7, qwScratch7
movq mm2, mm5
movq qwTmp6_Z3_Even, mm6
movq mm6, mm7
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
paddd mm5, mm3
movq qwTmp6_Z3_Odd, mm0
paddd mm2, mm1
movq qwTmp5_Z4_Even, mm5
paddd mm7, mm3
movq mm0, qwZ1
paddd mm6, mm1
// z1 = MULTIPLY(z1, - FIX_0_899976223);
movq mm1, Const_FFFF
movq mm4, mm0
//!!!!!! Negate the results
pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
movq mm3, qwTmp4_Z3_Even
movq qwTmp5_Z4_Odd, mm2
pxor mm0, mm1 //invert result
movq mm2, qwTmp4_Z3_Odd
pxor mm4, mm1 //invert result
paddd mm4, Const_1 // 2's complement
paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
paddd mm0, Const_1 // 2's complement
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
// dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
paddd mm7, Const_1024 //rounding adj
paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
paddd mm6, Const_1024 //rounding adj
psrad mm7, 11 // descale it | |R2| |R0|
psrad mm6, 11 // descale it | |R3| |R1|
movq mm5, mm7
punpckldq mm7, mm6 //| |R1| |R0|
// dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
punpckhdq mm5, mm6 //| |R3| |R2|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
paddd mm3, Const_1024 //rounding adj
packssdw mm7, mm5
paddd mm2, Const_1024 //rounding adj
psrad mm3, 11 // descale it | |R2| |R0|
movq mm0, qwZ2
psrad mm2, 11 // descale it | |R3| |R1|
movq mm5, mm3
movq mm4, mm0
// z2 = MULTIPLY(z2, - FIX_2_562915447);
pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
punpckldq mm3, mm2 //| |R1| |R0|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
punpckhdq mm5, mm2 //| |R3| |R2|
movq mm2, Const_FFFF
packssdw mm3, mm5
movq [edi][DATASIZE*1], mm7 //store
//!!!!!! Negate the results
pxor mm0, mm2 //invert result
movq mm5, Const_1
pxor mm4, mm2 //invert result
movq [edi][DATASIZE*7], mm3 //store
paddd mm0, mm5 // 2's complement
movq mm7, qwTmp6_Z3_Even
paddd mm4, mm5 // 2's complement
// dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
movq mm2, qwTmp6_Z3_Odd
paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
paddd mm7, Const_1024 //rounding adj
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
paddd mm2, Const_1024 //rounding adj
psrad mm7, 11 // descale it | |R2| |R0|
movq mm6, qwTemp0 //restore
psrad mm2, 11 // descale it | |R3| |R1|
movq mm3, qwTmp5_Z4_Even
movq mm5, mm7
movq [edi][DATASIZE*0], mm6 //store
punpckldq mm7, mm2 //| |R1| |R0|
movq mm1, qwTmp5_Z4_Odd
punpckhdq mm5, mm2 //| |R3| |R2|
movq mm6, qwTemp2 //restore
packssdw mm7, mm5
movq mm5, Const_1024
paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
// dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
movq [edi][DATASIZE*3], mm7 //store
paddd mm1, mm4 //tmp5 + z2 + z4 ODD
movq mm7, qwTemp4 //restore
paddd mm3, mm5 //rounding adj
movq [edi][DATASIZE*2], mm6 //store
paddd mm1, mm5 //rounding adj
movq [edi][DATASIZE*4], mm7 //store
psrad mm3, 11 // descale it | |R2| |R0|
movq mm6, qwTemp6 //restore
psrad mm1, 11 // descale it | |R3| |R1|
movq mm0, [edi][DATASIZE*0+16]
movq mm5, mm3
movq [edi][DATASIZE*6], mm6 //store
punpckldq mm3, mm1 //| |R1| |R0|
paddw mm0, [edi][DATASIZE*7+16]
punpckhdq mm5, mm1 //| |R3| |R2|
movq mm1, [edi][DATASIZE*1+16]
packssdw mm3, mm5
paddw mm1, [edi][DATASIZE*6+16]
movq mm7, mm0
movq [edi][DATASIZE*5], mm3 //store
movq mm6, mm1
//******************************************************************************
// This completes 4x8 dct locations. Copy to do other 4x8.
//******************************************************************************
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
movq mm3, [edi][DATASIZE*3+16]
paddw mm3, [edi][DATASIZE*4+16]
movq mm2, [edi][DATASIZE*2+16]
paddw mm0, mm3 //tmp10
paddw mm2, [edi][DATASIZE*5+16]
psubw mm7, mm3 //tmp13
// tmp10 = tmp0 + tmp3;
paddw mm1, mm2 //tmp11
psubw mm6, mm2 //tmp12
// dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
// dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
movq mm3, mm0
paddw mm0, mm1 //tmp10 + tmp11
psubw mm3, mm1 //tmp10 - tmp11
psllw mm0, 2 // descale it
movq mm1, mm6 //copy tmp12
psllw mm3, 2 // descale it
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
movq qwTemp0, mm0 //store
paddw mm1, mm7 //tmp12 + tmp13
//;;; movq [edi][DATASIZE*6+16], mm4 ; store w6---3,5,6,7
movq mm2, mm1 //copy
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
// CONST_BITS-PASS1_BITS);
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
// CONST_BITS-PASS1_BITS);
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
movq mm4, mm7
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
movq mm0, mm6
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
paddd mm7, mm1 // add z1
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
paddd mm7, Const_1024
paddd mm4, mm2
paddd mm4, Const_1024
psrad mm7, 11 // descale it | |R2| |R0|
//!!!!!! Negate the results in mm6 and mm0
pxor mm6, Const_FFFF //invert result
psrad mm4, 11 // descale it | |R3| |R1|
paddd mm6, Const_1 // 2's complement
movq mm5, mm7
pxor mm0, Const_FFFF //invert result
punpckldq mm7, mm4 //| |R1| |R0|
paddd mm0, Const_1 // 2's complement
punpckhdq mm5, mm4 //| |R3| |R2|
movq qwTemp4, mm3 //store
packssdw mm7, mm5
movq mm5, Const_1024
paddd mm6, mm1 // add z1
movq qwTemp2, mm7 //store
paddd mm0, mm2
movq mm4, [edi][DATASIZE*3+16]
paddd mm6, mm5
psubw mm4, [edi][DATASIZE*4+16]
psrad mm6, 11 // descale it | |R2| |R0|
paddd mm0, mm5
movq mm5, mm6
movq mm7, [edi][DATASIZE*0+16]
psrad mm0, 11 // descale it | |R3| |R1|
psubw mm7, [edi][DATASIZE*7+16]
punpckldq mm6, mm0 //| |R1| |R0|
punpckhdq mm5, mm0 //| |R3| |R2|
movq mm0, mm4
packssdw mm6, mm5
movq mm2, mm4
// tmp4 = dataptr[3] - dataptr[4];
movq mm5, [edi][DATASIZE*2+16]
paddw mm0, mm7 //z1
psubw mm5, [edi][DATASIZE*5+16]
movq qwTemp6, mm6 //store
movq mm1, mm5
movq mm6, [edi][DATASIZE*1+16]
movq mm3, mm5
// z1 = tmp4 + tmp7;
psubw mm6, [edi][DATASIZE*6+16]
paddw mm3, mm7 //z4
movq qwScratch7, mm7
paddw mm2, mm6 //z3
movq qwScratch5, mm5
paddw mm1, mm6 //z2
// z3 = MULTIPLY(z3, - FIX_1_961570560);
// z4 = MULTIPLY(z4, - FIX_0_390180644);
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
movq qwZ1, mm0 //store
movq mm5, mm2
movq qwZ2, mm1
movq mm7, mm2
movq mm0, Const_FFFF
paddw mm2, mm3 //z3 + z4
pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
movq mm1, mm3
pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
movq qwScratch6, mm6
movq mm6, mm2
// z3 += z5//
//!!!!!! Negate the results
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
pxor mm5, mm0 //invert result
paddd mm5, Const_1 // 2's complement
pxor mm7, mm0 //invert result
pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
paddd mm5, mm2 //z3_even
paddd mm7, Const_1 // 2's complement
pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
pxor mm3, mm0 //invert result
// z4 += z5;
//!!!!!! Negate the results
paddd mm3, Const_1 // 2's complement
pxor mm1, mm0 //invert result
paddd mm1, Const_1 // 2's complement
paddd mm3, mm2
movq mm0, qwScratch6
movq mm2, mm4
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
paddd mm7, mm6 //z3_odd
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
paddd mm1, mm6
movq mm6, mm0
paddd mm4, mm5
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
paddd mm2, mm7
pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
movq qwTmp4_Z3_Odd, mm2
movq qwTmp4_Z3_Even, mm4
paddd mm6, mm5
movq mm5, qwScratch5
paddd mm0, mm7
movq mm7, qwScratch7
movq mm2, mm5
movq qwTmp6_Z3_Even, mm6
movq mm6, mm7
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
paddd mm5, mm3
movq qwTmp6_Z3_Odd, mm0
paddd mm2, mm1
movq qwTmp5_Z4_Even, mm5
paddd mm7, mm3
movq mm0, qwZ1
paddd mm6, mm1
// z1 = MULTIPLY(z1, - FIX_0_899976223);
movq mm1, Const_FFFF
movq mm4, mm0
//!!!!!! Negate the results
pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
movq mm3, qwTmp4_Z3_Even
movq qwTmp5_Z4_Odd, mm2
pxor mm0, mm1 //invert result
movq mm2, qwTmp4_Z3_Odd
pxor mm4, mm1 //invert result
paddd mm4, Const_1 // 2's complement
paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
paddd mm0, Const_1 // 2's complement
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
// dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
paddd mm7, Const_1024 //rounding adj
paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
paddd mm6, Const_1024 //rounding adj
psrad mm7, 11 // descale it | |R2| |R0|
psrad mm6, 11 // descale it | |R3| |R1|
movq mm5, mm7
punpckldq mm7, mm6 //| |R1| |R0|
// dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
punpckhdq mm5, mm6 //| |R3| |R2|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
paddd mm3, Const_1024 //rounding adj
packssdw mm7, mm5
paddd mm2, Const_1024 //rounding adj
psrad mm3, 11 // descale it | |R2| |R0|
movq mm0, qwZ2
psrad mm2, 11 // descale it | |R3| |R1|
movq mm5, mm3
movq mm4, mm0
// z2 = MULTIPLY(z2, - FIX_2_562915447);
pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
punpckldq mm3, mm2 //| |R1| |R0|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
punpckhdq mm5, mm2 //| |R3| |R2|
movq mm2, Const_FFFF
packssdw mm3, mm5
movq [edi][DATASIZE*1+16], mm7 //store
//!!!!!! Negate the results
pxor mm0, mm2 //invert result
movq mm5, Const_1
pxor mm4, mm2 //invert result
movq [edi][DATASIZE*7+16], mm3 //store
paddd mm0, mm5 // 2's complement
movq mm7, qwTmp6_Z3_Even
paddd mm4, mm5 // 2's complement
// dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
movq mm2, qwTmp6_Z3_Odd
paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
paddd mm7, Const_1024 //rounding adj
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
paddd mm2, Const_1024 //rounding adj
psrad mm7, 11 // descale it | |R2| |R0|
movq mm6, qwTemp0 //restore
psrad mm2, 11 // descale it | |R3| |R1|
movq mm5, mm7
movq [edi][DATASIZE*0+16], mm6 //store
punpckldq mm7, mm2 //| |R1| |R0|
movq mm3, qwTmp5_Z4_Even
punpckhdq mm5, mm2 //| |R3| |R2|
movq mm1, qwTmp5_Z4_Odd
packssdw mm7, mm5
movq mm6, qwTemp2 //restore
paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
// dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
movq mm0, Const_1024
paddd mm1, mm4 //tmp5 + z2 + z4 ODD
movq [edi][DATASIZE*3+16], mm7 //store
paddd mm3, mm0 //rounding adj
movq mm7, qwTemp4 //restore
paddd mm1, mm0 //rounding adj
movq [edi][DATASIZE*2+16], mm6 //store
psrad mm3, 11 // descale it | |R2| |R0|
movq mm6, qwTemp6 //restore
psrad mm1, 11 // descale it | |R3| |R1|
movq [edi][DATASIZE*4+16], mm7 //store
movq mm5, mm3
movq [edi][DATASIZE*6+16], mm6 //store
punpckldq mm3, mm1 //| |R1| |R0|
punpckhdq mm5, mm1 //| |R3| |R2|
movq mm0, mm7 // copy w4---0,1,3,5,6
movq mm1, [edi][DATASIZE*7+16]
packssdw mm3, mm5
movq [edi][DATASIZE*5+16], mm3 //store
punpcklwd mm7, mm3 //mm6 = w5
//******************************************************************************
//******************************************************************************
// This completes all 8x8 dct locations for the row case.
// Now transpose the data for the columns.
//******************************************************************************
// transpose the bottom right quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq mm4, mm7 //---0,2,3,4,5,6,7
punpckhwd mm0, mm3 //---0,1,3,5,6
movq mm2, mm6 //---0,1,2,3,5,6 w6
punpcklwd mm6, mm1 //mm1 = w7
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7]//
movq mm5, [edi][DATASIZE*5]
punpckldq mm7, mm6 // transposed w4
punpckhdq mm4, mm6 // transposed w5---0,2,4,6,7
movq mm6, mm0 //---0,2,3,4,6,7
movq [edi][DATASIZE*4+16], mm7 // store w4
punpckhwd mm2, mm1 //---0,2,3,5,6,7
movq [edi][DATASIZE*5+16], mm4 // store w5
punpckldq mm0, mm2 // transposed w6
movq mm7, [edi][DATASIZE*4]
punpckhdq mm6, mm2 // transposed w7---0,3,6,7
movq [edi][DATASIZE*6+16], mm0 // store w6---3,5,6,7
movq mm0, mm7 // copy w0---0,1,3,5,6
movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
punpcklwd mm7, mm5 //mm6 = w1
// transpose the bottom left quadrant(4X4) of the matrix and place
// in the top right quadrant while doing the same for the top
// right quadrant
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq mm3, [edi][DATASIZE*6]
punpckhwd mm0, mm5 //---0,1,3,5,6
movq mm1, [edi][DATASIZE*7]
movq mm2, mm3 //---0,1,2,3,5,6 w2
movq mm6, [edi][DATASIZE*0+16]
punpcklwd mm3, mm1 //mm1 = w3
movq mm5, [edi][DATASIZE*1+16]
punpckhwd mm2, mm1 //---0,2,3,5,6,7
movq mm4, mm7 //---0,2,3,4,5,6,7
punpckldq mm7, mm3 // transposed w4
punpckhdq mm4, mm3 // transposed w5---0,2,4,6,7
movq mm3, mm0 //---0,2,3,4,6,7
movq [edi][DATASIZE*0+16], mm7 // store w4
punpckldq mm0, mm2 // transposed w6
movq mm1, [edi][DATASIZE*2+16]
punpckhdq mm3, mm2 // transposed w7---0,3,6,7
movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
movq mm0, mm6 // copy w4---0,1,3,5,6
movq mm7, [edi][DATASIZE*3+16]
punpcklwd mm6, mm5 //mm6 = w5
movq [edi][DATASIZE*1+16], mm4 // store w5
punpckhwd mm0, mm5 //---0,1,3,5,6
// transpose the top right quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq mm2, mm1 //---0,1,2,3,5,6 w6
punpcklwd mm1, mm7 //mm1 = w7
movq mm4, mm6 //---0,2,3,4,5,6,7
punpckldq mm6, mm1 // transposed w4
movq [edi][DATASIZE*3+16], mm3 // store w7---5,6,7
punpckhdq mm4, mm1 // transposed w5---0,2,4,6,7
movq [edi][DATASIZE*4], mm6 // store w4
punpckhwd mm2, mm7 //---0,2,3,5,6,7
movq mm7, [edi][DATASIZE*0]
movq mm1, mm0 //---0,2,3,4,6,7
movq mm3, [edi][DATASIZE*1]
punpckldq mm0, mm2 // transposed w6
movq [edi][DATASIZE*5], mm4 // store w5
punpckhdq mm1, mm2 // transposed w7---0,3,6,7
movq [edi][DATASIZE*6], mm0 // store w6---3,5,6,7
movq mm2, mm7 // copy w0---0,1,3,5,6
movq mm4, [edi][DATASIZE*3]
punpcklwd mm7, mm3 //mm6 = w1
// transpose the top left quadrant(4X4) of the matrix
// --------- ---------
// | M1 | M2 | | M1'| M3'|
// --------- --> ---------
// | M3 | M4 | | M2'| M4'|
// --------- ---------
movq mm6, [edi][DATASIZE*2]
punpckhwd mm2, mm3 //---0,1,3,5,6
movq mm0, mm6 //---0,1,2,3,5,6 w2
punpcklwd mm6, mm4 //mm1 = w3
movq [edi][DATASIZE*7], mm1 // store w7---5,6,7
punpckhwd mm0, mm4 //---0,2,3,5,6,7
movq mm1, mm7 //---0,2,3,4,5,6,7
punpckldq mm7, mm6 // transposed w4
punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
movq mm6, mm2 //---0,2,3,4,6,7
movq [edi][DATASIZE*0], mm7 // store w4
punpckldq mm2, mm0 // transposed w6
paddw mm7, [edi][DATASIZE*7]
punpckhdq mm6, mm0 // transposed w7---0,3,6,7
movq [edi][DATASIZE*3], mm6 // store w7---5,6,7
movq mm4, mm7
paddw mm6, [edi][DATASIZE*4]
movq [edi][DATASIZE*1], mm1 // store w5
paddw mm7, mm6 //tmp10
//******************************************************************************
// This begins the column dct
//******************************************************************************
paddw mm1, [edi][DATASIZE*6]
psubw mm4, mm6 //tmp13
movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
movq mm6, mm1
paddw mm2, [edi][DATASIZE*5]
movq mm3, mm7
paddw mm1, mm2 //tmp11
psubw mm6, mm2 //tmp12
// dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
// dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
paddw mm7, mm1 //tmp10 + tmp11
paddw mm7, Const_2 // round add 2 to each element
psubw mm3, mm1 //tmp10 - tmp11
paddw mm3, Const_2 // round add 2 to each element
psraw mm7, 2 // descale it
// unpack word to dword sign extended
movq mm5, mm7
punpcklwd mm7, mm7
psrad mm7, 16 // even results store in Temp0
punpckhwd mm5, mm5
psrad mm5, 16 // odd results store in array
movq mm1, mm6 //copy tmp12
movq qwTemp0, mm7 //store
psraw mm3, 2 // descale it
movq [edi][DATASIZE*0+8], mm5
movq mm5, mm3
punpcklwd mm3, mm3
paddw mm1, mm4 //tmp12 + tmp13
psrad mm3, 16 // even results store in Temp4
movq mm2, mm1 //copy
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
punpckhwd mm5, mm5
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
movq mm7, mm4
// dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
// CONST_BITS+PASS1_BITS);
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
psrad mm5, 16 // odd results store in array
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
movq mm0, mm6
// dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
// CONST_BITS+PASS1_BITS);
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
movq qwTemp4, mm3 //store
paddd mm4, mm2
paddd mm4, Const_16384
paddd mm7, mm1 // add z1
paddd mm7, Const_16384
psrad mm4, 15 // descale it | |R3| |R1|
movq [edi][DATASIZE*4+8], mm5
psrad mm7, 15 // descale it | |R2| |R0|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
movq mm5, mm7
//!!!!!! Negate result
movq mm3, Const_1
punpckldq mm7, mm4 //| |R1| |R0|
pxor mm6, Const_FFFF //invert result
punpckhdq mm5, mm4 //| |R3| |R2|
movq qwTemp2, mm7 //store
paddd mm6, mm3 // 2's complement
pxor mm0, Const_FFFF //invert result
paddd mm6, mm1 // add z1
movq [edi][DATASIZE*2+8], mm5 //write out 2nd half in unused memory
paddd mm0, mm3 // 2's complement
movq mm3, Const_16384
paddd mm0, mm2
movq mm7, [edi][DATASIZE*0]
paddd mm6, mm3
movq mm4, [edi][DATASIZE*3]
paddd mm0, mm3
psubw mm7, [edi][DATASIZE*7]
psrad mm6, 15 // descale it | |R2| |R0|
psubw mm4, [edi][DATASIZE*4]
psrad mm0, 15 // descale it | |R3| |R1|
movq mm3, [edi][DATASIZE*2]
movq mm5, mm6
psubw mm3, [edi][DATASIZE*5]
punpckldq mm6, mm0 //| |R1| |R0|
punpckhdq mm5, mm0 //| |R3| |R2|
movq mm0, mm4
movq qwTemp6, mm6 //store
movq mm2, mm4
// tmp4 = dataptr[3] - dataptr[4];
// z1 = tmp4 + tmp7;
movq mm6, [edi][DATASIZE*1]
paddw mm0, mm7 //z1
movq [edi][DATASIZE*6+8], mm5 //write out 2nd half in unused memory
movq mm1, mm3
psubw mm6, [edi][DATASIZE*6]
movq mm5, mm3
movq qwZ1, mm0 //store
paddw mm5, mm7 //z4
movq qwScratch7, mm7
paddw mm1, mm6 //z2
movq qwScratch5, mm3
paddw mm2, mm6 //z3
movq qwZ2, mm1
movq mm3, mm2
// z3 = MULTIPLY(z3, - FIX_1_961570560);
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
// z4 = MULTIPLY(z4, - FIX_0_390180644);
movq qwScratch6, mm6
movq mm1, mm2
pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
movq mm7, mm5
movq mm6, Const_FFFF
paddw mm2, mm5 //z3 + z4
pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
movq mm0, mm2
pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
//!!!!!! Negate the results
pxor mm3, mm6 //invert result
pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
pxor mm1, mm6 //invert result
pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
//!!!!!! Negate the results
pxor mm7, mm6 //invert result
paddd mm3, Const_1 // 2's complement
pxor mm5, mm6 //invert result
// z3 += z5;
paddd mm1, Const_1 // 2's complement
paddd mm3, mm2 //z3_even
paddd mm5, Const_1 // 2's complement
paddd mm1, mm0 //z3_odd
// z4 += z5;
paddd mm7, Const_1 // 2's complement
paddd mm5, mm2
paddd mm7, mm0
movq mm2, mm4
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
movq qwZ4_even, mm5
movq qwZ4_odd, mm7
paddd mm4, mm3
movq mm6, qwScratch6
paddd mm2, mm1
movq qwTmp4_Z3_Even, mm4
movq mm5, mm6
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
movq qwTmp4_Z3_Odd, mm2
movq mm4, qwZ4_even
paddd mm6, mm3
movq mm3, qwScratch5
paddd mm5, mm1
movq qwTmp6_Z3_Even, mm6
movq mm2, mm3
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
movq qwTmp6_Z3_Odd, mm5
movq mm0, qwZ4_odd
paddd mm3, mm4
movq mm7, qwScratch7
paddd mm2, mm0
movq qwTmp5_Z4_Even, mm3
movq mm6, mm7
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
movq mm3, qwZ1
movq qwTmp5_Z4_Odd, mm2
paddd mm7, mm4
movq mm5, Const_FFFF
movq mm4, mm3
// z1 = MULTIPLY(z1, - FIX_0_899976223);
pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
paddd mm6, mm0
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
movq mm2, qwTmp4_Z3_Odd
//!!!!!! Negate the results
pxor mm3, mm5 //invert result
paddd mm3, Const_1 // 2's complement
pxor mm4, mm5 //invert result
paddd mm4, Const_1 // 2's complement
paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
// dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
// CONST_BITS+PASS1_BITS);
paddd mm7, Const_16384 //rounding adj
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
paddd mm6, Const_16384 //rounding adj
psrad mm7, 15 // descale it | |R2| |R0|
movq mm0, qwTmp4_Z3_Even
psrad mm6, 15 // descale it | |R3| |R1|
paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
movq mm5, mm7
movq mm3, qwTemp0 //restore
punpckldq mm7, mm6 //| |R1| |R0|
paddd mm0, Const_16384 //rounding adj
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
movq [edi][DATASIZE*0], mm3 //store
punpckhdq mm5, mm6 //| |R3| |R2|
// dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
// CONST_BITS+PASS1_BITS);
paddd mm2, Const_16384 //rounding adj
psrad mm0, 15 // descale it | |R2| |R0|
movq mm6, qwZ2
psrad mm2, 15 // descale it | |R3| |R1|
movq [edi][DATASIZE*1+8], mm5 //store
movq mm4, mm6
// z2 = MULTIPLY(z2, - FIX_2_562915447);
pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
movq mm5, mm0
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
punpckldq mm0, mm2 //| |R1| |R0|
movq mm3, Const_FFFF
punpckhdq mm5, mm2 //| |R3| |R2|
movq [edi][DATASIZE*1], mm7 //store
//!!!!!! Negate the results
pxor mm6, mm3 //invert result
movq mm1, Const_1
pxor mm4, mm3 //invert result
movq mm7, qwTmp6_Z3_Even
paddd mm6, mm1 // 2's complement
movq mm2, qwTmp6_Z3_Odd
paddd mm4, mm1 // 2's complement
// dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
// CONST_BITS+PASS1_BITS);
movq [edi][DATASIZE*7], mm0 //store
paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
movq mm1, Const_16384
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
movq mm3, qwTemp2 //restore
paddd mm7, mm1 //rounding adj
movq [edi][DATASIZE*7+8], mm5 //store
paddd mm2, mm1 //rounding adj
movq [edi][DATASIZE*2], mm3 //store
psrad mm7, 15 // descale it | |R2| |R0|
movq mm0, qwTemp4 //restore
psrad mm2, 15 // descale it | |R3| |R1|
movq mm3, qwTmp5_Z4_Even
movq mm5, mm7
movq [edi][DATASIZE*4], mm0 //store
paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
movq mm6, qwTmp5_Z4_Odd
punpckldq mm7, mm2 //| |R1| |R0|
punpckhdq mm5, mm2 //| |R3| |R2|
paddd mm6, mm4 //tmp5 + z2 + z4 ODD
movq [edi][DATASIZE*3], mm7 //store
paddd mm3, mm1 //rounding adj
// dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
// CONST_BITS+PASS1_BITS);
movq mm0, qwTemp6 //restore
paddd mm6, mm1 //rounding adj
movq [edi][DATASIZE*3+8], mm5 //store
psrad mm3, 15 // descale it | |R2| |R0|
movq [edi][DATASIZE*6], mm0 //store
psrad mm6, 15 // descale it | |R3| |R1|
movq mm7, [edi][DATASIZE*0+16]
movq mm5, mm3
paddw mm7, [edi][DATASIZE*7+16]
punpckldq mm3, mm6 //| |R1| |R0|
movq mm1, [edi][DATASIZE*1+16]
punpckhdq mm5, mm6 //| |R3| |R2|
paddw mm1, [edi][DATASIZE*6+16]
movq mm4, mm7
//******************************************************************************
// This completes the first 4x8 half of the column dct. The code that follows is a copy of the above that processes the other 4x8 half (byte offset +16).
//******************************************************************************
movq mm6, [edi][DATASIZE*3+16]
paddw mm6, [edi][DATASIZE*4+16]
movq mm2, [edi][DATASIZE*2+16]
psubw mm4, mm6 //tmp13
paddw mm2, [edi][DATASIZE*5+16]
paddw mm7, mm6 //tmp10
movq [edi][DATASIZE*5], mm3 //store
movq mm6, mm1
movq [edi][DATASIZE*5+8], mm5 //store
paddw mm1, mm2 //tmp11
psubw mm6, mm2 //tmp12
movq mm3, mm7
// dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
// dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
paddw mm7, mm1 //tmp10 + tmp11
paddw mm7, Const_2 // round add 2 to each element
psubw mm3, mm1 //tmp10 - tmp11
paddw mm3, Const_2 // round add 2 to each element
psraw mm7, 2 // descale it
// unpack word to dword sign extended
movq mm5, mm7
punpcklwd mm7, mm7
psrad mm7, 16 // even results store in Temp0
punpckhwd mm5, mm5
psrad mm5, 16 // odd results store in array
movq mm1, mm6 //copy tmp12
movq qwTemp0, mm7 //store
psraw mm3, 2 // descale it
movq [edi][DATASIZE*0+24], mm5
movq mm5, mm3
punpcklwd mm3, mm3
paddw mm1, mm4 //tmp12 + tmp13
psrad mm3, 16 // even results store in Temp4
movq mm2, mm1 //copy
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
punpckhwd mm5, mm5
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
movq mm7, mm4
// dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
// CONST_BITS+PASS1_BITS);
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
psrad mm5, 16 // odd results store in array
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
movq mm0, mm6
// dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
// CONST_BITS+PASS1_BITS);
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
movq qwTemp4, mm3 //store
paddd mm4, mm2
paddd mm4, Const_16384
paddd mm7, mm1 // add z1
paddd mm7, Const_16384
psrad mm4, 15 // descale it | |R3| |R1|
movq [edi][DATASIZE*4+24], mm5
psrad mm7, 15 // descale it | |R2| |R0|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
movq mm5, mm7
//!!!!!! Negate result
movq mm3, Const_1
punpckldq mm7, mm4 //| |R1| |R0|
pxor mm6, Const_FFFF //invert result
punpckhdq mm5, mm4 //| |R3| |R2|
movq qwTemp2, mm7 //store
paddd mm6, mm3 // 2's complement
pxor mm0, Const_FFFF //invert result
paddd mm6, mm1 // add z1
movq [edi][DATASIZE*2+24], mm5 //write out 2nd half in unused memory
paddd mm0, mm3 // 2's complement
movq mm3, Const_16384
paddd mm0, mm2
movq mm7, [edi][DATASIZE*0+16]
paddd mm6, mm3
movq mm4, [edi][DATASIZE*3+16]
paddd mm0, mm3
psubw mm7, [edi][DATASIZE*7+16]
psrad mm6, 15 // descale it | |R2| |R0|
psubw mm4, [edi][DATASIZE*4+16]
psrad mm0, 15 // descale it | |R3| |R1|
movq mm3, [edi][DATASIZE*2+16]
movq mm5, mm6
psubw mm3, [edi][DATASIZE*5+16]
punpckldq mm6, mm0 //| |R1| |R0|
punpckhdq mm5, mm0 //| |R3| |R2|
movq mm0, mm4
movq qwTemp6, mm6 //store
movq mm2, mm4
// tmp4 = dataptr[3] - dataptr[4];
// z1 = tmp4 + tmp7;
movq mm6, [edi][DATASIZE*1+16]
paddw mm0, mm7 //z1
movq [edi][DATASIZE*6+24], mm5 //write out 2nd half in unused memory
movq mm1, mm3
psubw mm6, [edi][DATASIZE*6+16]
movq mm5, mm3
movq qwZ1, mm0 //store
paddw mm5, mm7 //z4
movq qwScratch7, mm7
paddw mm1, mm6 //z2
movq qwScratch5, mm3
paddw mm2, mm6 //z3
movq qwZ2, mm1
movq mm3, mm2
// z3 = MULTIPLY(z3, - FIX_1_961570560);
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
// z4 = MULTIPLY(z4, - FIX_0_390180644);
movq qwScratch6, mm6
movq mm1, mm2
pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
movq mm7, mm5
movq mm6, Const_FFFF
paddw mm2, mm5 //z3 + z4
pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
movq mm0, mm2
pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
//!!!!!! Negate the results
pxor mm3, mm6 //invert result
pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
pxor mm1, mm6 //invert result
pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
//!!!!!! Negate the results
pxor mm7, mm6 //invert result
paddd mm3, Const_1 // 2's complement
pxor mm5, mm6 //invert result
// z3 += z5;
paddd mm1, Const_1 // 2's complement
paddd mm3, mm2 //z3_even
paddd mm5, Const_1 // 2's complement
paddd mm1, mm0 //z3_odd
// z4 += z5;
paddd mm7, Const_1 // 2's complement
paddd mm5, mm2
paddd mm7, mm0
movq mm2, mm4
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
movq qwZ4_even, mm5
movq qwZ4_odd, mm7
paddd mm4, mm3
movq mm6, qwScratch6
paddd mm2, mm1
movq qwTmp4_Z3_Even, mm4
movq mm5, mm6
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
movq qwTmp4_Z3_Odd, mm2
movq mm4, qwZ4_even
paddd mm6, mm3
movq mm3, qwScratch5
paddd mm5, mm1
movq qwTmp6_Z3_Even, mm6
movq mm2, mm3
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
movq qwTmp6_Z3_Odd, mm5
movq mm0, qwZ4_odd
paddd mm3, mm4
movq mm7, qwScratch7
paddd mm2, mm0
movq qwTmp5_Z4_Even, mm3
movq mm6, mm7
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
movq mm3, qwZ1
movq qwTmp5_Z4_Odd, mm2
paddd mm7, mm4
movq mm5, Const_FFFF
movq mm4, mm3
// z1 = MULTIPLY(z1, - FIX_0_899976223);
pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
paddd mm6, mm0
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
movq mm2, qwTmp4_Z3_Odd
//!!!!!! Negate the results
pxor mm3, mm5 //invert result
paddd mm3, Const_1 // 2's complement
pxor mm4, mm5 //invert result
paddd mm4, Const_1 // 2's complement
paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
// dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
// CONST_BITS+PASS1_BITS);
paddd mm7, Const_16384 //rounding adj
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
paddd mm6, Const_16384 //rounding adj
psrad mm7, 15 // descale it | |R2| |R0|
movq mm0, qwTmp4_Z3_Even
psrad mm6, 15 // descale it | |R3| |R1|
paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
movq mm5, mm7
movq mm3, qwTemp0 //restore
punpckldq mm7, mm6 //| |R1| |R0|
paddd mm0, Const_16384 //rounding adj
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
movq [edi][DATASIZE*0+16], mm3 //store
punpckhdq mm5, mm6 //| |R3| |R2|
// dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
// CONST_BITS+PASS1_BITS);
paddd mm2, Const_16384 //rounding adj
psrad mm0, 15 // descale it | |R2| |R0|
movq mm6, qwZ2
psrad mm2, 15 // descale it | |R3| |R1|
movq [edi][DATASIZE*1+24], mm5 //store
movq mm4, mm6
// z2 = MULTIPLY(z2, - FIX_2_562915447);
pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
movq mm5, mm0
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
punpckldq mm0, mm2 //| |R1| |R0|
movq mm3, Const_FFFF
punpckhdq mm5, mm2 //| |R3| |R2|
movq [edi][DATASIZE*1+16], mm7 //store
//!!!!!! Negate the results
pxor mm6, mm3 //invert result
movq mm1, Const_1
pxor mm4, mm3 //invert result
movq mm7, qwTmp6_Z3_Even
paddd mm6, mm1 // 2's complement
movq mm2, qwTmp6_Z3_Odd
paddd mm4, mm1 // 2's complement
// dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
// CONST_BITS+PASS1_BITS);
movq [edi][DATASIZE*7+16], mm0 //store
paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
movq mm1, Const_16384
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
movq mm3, qwTemp2 //restore
paddd mm7, mm1 //rounding adj
movq [edi][DATASIZE*7+24], mm5 //store
paddd mm2, mm1 //rounding adj
movq [edi][DATASIZE*2+16], mm3 //store
psrad mm7, 15 // descale it | |R2| |R0|
movq mm3, qwTmp5_Z4_Even
psrad mm2, 15 // descale it | |R3| |R1|
movq mm5, mm7
paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
movq mm6, qwTmp5_Z4_Odd
punpckldq mm7, mm2 //| |R1| |R0|
punpckhdq mm5, mm2 //| |R3| |R2|
paddd mm6, mm4 //tmp5 + z2 + z4 ODD
movq [edi][DATASIZE*3+16], mm7 //store
paddd mm3, mm1 //rounding adj
// dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
// CONST_BITS+PASS1_BITS);
movq mm7, qwTemp4 //restore
paddd mm6, mm1 //rounding adj
movq [edi][DATASIZE*3+24], mm5 //store
psrad mm3, 15 // descale it | |R2| |R0|
movq [edi][DATASIZE*4+16], mm7 //store
psrad mm6, 15 // descale it | |R3| |R1|
movq mm7, qwTemp6 //restore
movq mm5, mm3
punpckldq mm3, mm6 //| |R1| |R0|
movq [edi][DATASIZE*6+16], mm7 //store
punpckhdq mm5, mm6 //| |R3| |R2|
movq [edi][DATASIZE*5+16], mm3 //store
movq [edi][DATASIZE*5+24], mm5 //store
//******************************************************************************
// This completes all 8x8 dct locations for the column case.
//******************************************************************************
emms
}
}
#endif /* DCT_ISLOW_SUPPORTED */