2138 lines
53 KiB
C++
2138 lines
53 KiB
C++
#include "stdafx.h"
|
|
#pragma hdrstop
|
|
|
|
/***************************************************************************
|
|
*
|
|
* INTEL Corporation Proprietary Information
|
|
*
|
|
*
|
|
* Copyright (c) 1996 Intel Corporation.
|
|
* All rights reserved.
|
|
*
|
|
***************************************************************************
|
|
*/
|
|
/*
|
|
* jfdctint.c
|
|
*
|
|
* Copyright (C) 1991-1996, Thomas G. Lane.
|
|
* This file is part of the Independent JPEG Group's software.
|
|
* For conditions of distribution and use, see the accompanying README file.
|
|
*
|
|
* This file contains a slow-but-accurate integer implementation of the
|
|
* forward DCT (Discrete Cosine Transform).
|
|
*
|
|
* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
|
|
* on each column. Direct algorithms are also available, but they are
|
|
* much more complex and seem not to be any faster when reduced to code.
|
|
*
|
|
* This implementation is based on an algorithm described in
|
|
* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
|
|
* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
|
|
* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
|
|
* The primary algorithm described there uses 11 multiplies and 29 adds.
|
|
* We use their alternate method with 12 multiplies and 32 adds.
|
|
* The advantage of this method is that no data path contains more than one
|
|
* multiplication; this allows a very simple and accurate implementation in
|
|
* scaled fixed-point arithmetic, with a minimal number of shifts.
|
|
*/
|
|
|
|
#define JPEG_INTERNALS
|
|
#include "jinclude.h"
|
|
#include "jpeglib.h"
|
|
#include "jdct.h" /* Private declarations for DCT subsystem */
|
|
|
|
#ifdef DCT_ISLOW_SUPPORTED
|
|
|
|
|
|
/*
|
|
* This module is specialized to the case DCTSIZE = 8.
|
|
*/
|
|
|
|
#if DCTSIZE != 8
|
|
Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
|
|
#endif
|
|
|
|
|
|
/*
|
|
* The poop on this scaling stuff is as follows:
|
|
*
|
|
* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
|
|
* larger than the true DCT outputs. The final outputs are therefore
|
|
* a factor of N larger than desired; since N=8 this can be cured by
|
|
* a simple right shift at the end of the algorithm. The advantage of
|
|
* this arrangement is that we save two multiplications per 1-D DCT,
|
|
* because the y0 and y4 outputs need not be divided by sqrt(N).
|
|
* In the IJG code, this factor of 8 is removed by the quantization step
|
|
* (in jcdctmgr.c), NOT in this module.
|
|
*
|
|
* We have to do addition and subtraction of the integer inputs, which
|
|
* is no problem, and multiplication by fractional constants, which is
|
|
* a problem to do in integer arithmetic. We multiply all the constants
|
|
* by CONST_SCALE and convert them to integer constants (thus retaining
|
|
* CONST_BITS bits of precision in the constants). After doing a
|
|
* multiplication we have to divide the product by CONST_SCALE, with proper
|
|
* rounding, to produce the correct output. This division can be done
|
|
* cheaply as a right shift of CONST_BITS bits. We postpone shifting
|
|
* as long as possible so that partial sums can be added together with
|
|
* full fractional precision.
|
|
*
|
|
* The outputs of the first pass are scaled up by PASS1_BITS bits so that
|
|
* they are represented to better-than-integral precision. These outputs
|
|
* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
|
|
* with the recommended scaling. (For 12-bit sample data, the intermediate
|
|
* array is INT32 anyway.)
|
|
*
|
|
* To avoid overflow of the 32-bit intermediate results in pass 2, we must
|
|
* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
|
|
* shows that the values given below are the most effective.
|
|
*/
|
|
|
|
/*
 * Fixed-point scaling parameters.
 * CONST_BITS is the number of fractional bits carried by the FIX_*
 * multiplier constants; PASS1_BITS is the extra precision kept in the
 * outputs of the first (row) pass.
 *
 * NOTE(review): the MMX code in mfdct8x8llm hard-codes the 8-bit settings
 * (psllw by 2 == PASS1_BITS, psrad by 11 == CONST_BITS - PASS1_BITS, and a
 * rounding term of 1024 == 1 << 10). The 12-bit branch below therefore does
 * not appear to be honoured by the assembly -- confirm before building with
 * BITS_IN_JSAMPLE != 8.
 */
#if BITS_IN_JSAMPLE == 8
|
|
#define CONST_BITS 13
|
|
#define PASS1_BITS 2
|
|
#else
|
|
#define CONST_BITS 13
|
|
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
|
|
#endif
|
|
|
|
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
|
|
* causing a lot of useless floating-point operations at run time.
|
|
* To get around this we use the following pre-calculated constants.
|
|
* If you change CONST_BITS you may want to add appropriate values.
|
|
* (With a reasonable C compiler, you can just rely on the FIX() macro...)
|
|
*/
|
|
|
|
/*
 * DCT multiplier constants in CONST_BITS fixed point.
 * With CONST_BITS == 13 each value is the real constant multiplied by 8192
 * and rounded to the nearest integer (e.g. 0.298631336 * 8192 = 2446.39,
 * stored as 2446). For any other CONST_BITS the values are produced by the
 * FIX() macro, which is not defined in this file (presumably it comes from
 * one of the included headers -- confirm).
 */
#if CONST_BITS == 13
|
|
#define FIX_0_298631336 ((INT32) 2446) /* FIX(0.298631336) */
|
|
#define FIX_0_390180644 ((INT32) 3196) /* FIX(0.390180644) */
|
|
#define FIX_0_541196100 ((INT32) 4433) /* FIX(0.541196100) */
|
|
#define FIX_0_765366865 ((INT32) 6270) /* FIX(0.765366865) */
|
|
#define FIX_0_899976223 ((INT32) 7373) /* FIX(0.899976223) */
|
|
#define FIX_1_175875602 ((INT32) 9633) /* FIX(1.175875602) */
|
|
#define FIX_1_501321110 ((INT32) 12299) /* FIX(1.501321110) */
|
|
#define FIX_1_847759065 ((INT32) 15137) /* FIX(1.847759065) */
|
|
#define FIX_1_961570560 ((INT32) 16069) /* FIX(1.961570560) */
|
|
#define FIX_2_053119869 ((INT32) 16819) /* FIX(2.053119869) */
|
|
#define FIX_2_562915447 ((INT32) 20995) /* FIX(2.562915447) */
|
|
#define FIX_3_072711026 ((INT32) 25172) /* FIX(3.072711026) */
|
|
#else
|
|
#define FIX_0_298631336 FIX(0.298631336)
|
|
#define FIX_0_390180644 FIX(0.390180644)
|
|
#define FIX_0_541196100 FIX(0.541196100)
|
|
#define FIX_0_765366865 FIX(0.765366865)
|
|
#define FIX_0_899976223 FIX(0.899976223)
|
|
#define FIX_1_175875602 FIX(1.175875602)
|
|
#define FIX_1_501321110 FIX(1.501321110)
|
|
#define FIX_1_847759065 FIX(1.847759065)
|
|
#define FIX_1_961570560 FIX(1.961570560)
|
|
#define FIX_2_053119869 FIX(2.053119869)
|
|
#define FIX_2_562915447 FIX(2.562915447)
|
|
#define FIX_3_072711026 FIX(3.072711026)
|
|
#endif
|
|
|
|
/*
 * 64-bit helper constants read as memory operands by the MMX code in
 * mfdct8x8llm.
 *
 *   Const_1     - the value 1 in each 32-bit half; added after a pxor with
 *                 Const_FFFF to complete a two's-complement negation.
 *   Const_2     - the value 2 in each of the four 16-bit words.
 *   Const_1024  - the value 1024 in each 32-bit half; rounding term added
 *                 before the arithmetic right shift by 11.
 *   Const_16384 - the value 16384 in each 32-bit half.
 *   Const_FFFF  - all 64 bits set; pxor mask that inverts every bit.
 */
const __int64 Const_1 = 0x0000000100000001;
|
|
const __int64 Const_2 = 0x0002000200020002;
|
|
const __int64 Const_1024 = 0x0000040000000400;
|
|
const __int64 Const_16384 = 0x0000400000004000;
|
|
const __int64 Const_FFFF = 0xFFFFFFFFFFFFFFFF;
|
|
|
|
/*
 * Packed copies of the FIX_* multipliers (the CONST_BITS == 13 values, in
 * hex) laid out for pmaddwd. Viewing a quadword as four 16-bit words
 * w3 w2 w1 w0, with c the multiplier:
 *
 *   Const_0xFIX_c : | 0 | c | 0 | c |   products come from words 2 and 0
 *   Const_FIX_cx0 : | c | 0 | c | 0 |   products come from words 3 and 1
 *
 * pmaddwd multiplies word-by-word and sums adjacent pairs into two 32-bit
 * results; because one word of every pair is multiplied by zero, each
 * result is a single 16x16->32 product. Using both layouts on copies of
 * the same register gives the "even" and "odd" products for all four words.
 */
const __int64 Const_0xFIX_0_298631336 = 0x0000098e0000098e;
|
|
const __int64 Const_FIX_0_298631336x0 = 0x098e0000098e0000;
|
|
const __int64 Const_0xFIX_0_390180644 = 0x00000c7c00000c7c;
|
|
const __int64 Const_FIX_0_390180644x0 = 0x0c7c00000c7c0000;
|
|
const __int64 Const_0xFIX_0_541196100 = 0x0000115100001151;
|
|
const __int64 Const_FIX_0_541196100x0 = 0x1151000011510000;
|
|
const __int64 Const_0xFIX_0_765366865 = 0x0000187e0000187e;
|
|
const __int64 Const_FIX_0_765366865x0 = 0x187e0000187e0000;
|
|
const __int64 Const_0xFIX_0_899976223 = 0x00001ccd00001ccd;
|
|
const __int64 Const_FIX_0_899976223x0 = 0x1ccd00001ccd0000;
|
|
const __int64 Const_0xFIX_1_175875602 = 0x000025a1000025a1;
|
|
const __int64 Const_FIX_1_175875602x0 = 0x25a1000025a10000;
|
|
const __int64 Const_0xFIX_1_501321110 = 0x0000300b0000300b;
|
|
const __int64 Const_FIX_1_501321110x0 = 0x300b0000300b0000;
|
|
const __int64 Const_0xFIX_1_847759065 = 0x00003b2100003b21;
|
|
const __int64 Const_FIX_1_847759065x0 = 0x3b2100003b210000;
|
|
const __int64 Const_0xFIX_1_961570560 = 0x00003ec500003ec5;
|
|
const __int64 Const_FIX_1_961570560x0 = 0x3ec500003ec50000;
|
|
const __int64 Const_0xFIX_2_053119869 = 0x000041b3000041b3;
|
|
const __int64 Const_FIX_2_053119869x0 = 0x41b3000041b30000;
|
|
const __int64 Const_0xFIX_2_562915447 = 0x0000520300005203;
|
|
const __int64 Const_FIX_2_562915447x0 = 0x5203000052030000;
|
|
const __int64 Const_0xFIX_3_072711026 = 0x0000625400006254;
|
|
const __int64 Const_FIX_3_072711026x0 = 0x6254000062540000;
|
|
|
|
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
|
|
* For 8-bit samples with the recommended scaling, all the variable
|
|
* and constant values involved are no more than 16 bits wide, so a
|
|
* 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
|
|
* For 12-bit samples, a full 32-bit multiplication will be needed.
|
|
*/
|
|
|
|
/*
 * MULTIPLY(var,const): product of a variable and a fixed-point constant.
 * 8-bit sample builds map it to MULTIPLY16C16, which is not defined in this
 * file (presumably supplied by one of the included headers -- confirm);
 * all other builds use a plain multiplication.
 * NOTE(review): inside mfdct8x8llm the multiplications are done with
 * pmaddwd and MULTIPLY shows up only in the reference-C comments, so this
 * macro may be unused -- confirm against the rest of the file.
 */
#if BITS_IN_JSAMPLE == 8
|
|
#define MULTIPLY(var,const) MULTIPLY16C16(var,const)
|
|
#else
|
|
#define MULTIPLY(var,const) ((var) * (const))
|
|
#endif
|
|
|
|
/*
 * DATASIZE: byte distance between consecutive rows of the data block as
 * addressed by the assembly below ([edi][DATASIZE*row + offset]). The
 * routine reads each input row as four quadwords at offsets 0, 8, 16 and
 * 24, i.e. 32 bytes per row (eight 32-bit elements, per the "32-bit
 * quantities" packing comments in the routine).
 */
#define DATASIZE 32
|
|
/*
|
|
* Perform the forward DCT on one block of samples.
|
|
*/
|
|
|
|
GLOBAL(void)
|
|
mfdct8x8llm (DCTELEM * data)
|
|
{
|
|
__int64 qwTemp0, qwTemp2, qwTemp4, qwTemp6;
|
|
__int64 qwZ1, qwZ2, qwZ4_even, qwZ4_odd;
|
|
__int64 qwTmp4_Z3_Even, qwTmp4_Z3_Odd;
|
|
__int64 qwTmp6_Z3_Even, qwTmp6_Z3_Odd;
|
|
__int64 qwTmp5_Z4_Even, qwTmp5_Z4_Odd;
|
|
__int64 qwScratch7, qwScratch6, qwScratch5;
|
|
|
|
__asm{
|
|
|
|
mov edi, [data]
|
|
|
|
|
|
// transpose the bottom right quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
// Get the 32-bit quantities and pack into 16 bits
|
|
|
|
movq mm5, [edi][DATASIZE*4+16] //| w41 | w40 |
|
|
|
|
movq mm3, [edi][DATASIZE*4+24] //| w43 | w42 |
|
|
|
|
movq mm6, [edi][DATASIZE*5+16]
|
|
packssdw mm5, mm3 //|w43|w42|w41|w40|
|
|
|
|
movq mm7, [edi][DATASIZE*5+24]
|
|
movq mm4, mm5 // copy w4---0,1,3,5,6
|
|
|
|
movq mm3, [edi][DATASIZE*6+16]
|
|
packssdw mm6, mm7
|
|
|
|
movq mm2, [edi][DATASIZE*6+24]
|
|
punpcklwd mm5, mm6 //mm6 = w5
|
|
|
|
movq mm1, [edi][DATASIZE*7+16]
|
|
packssdw mm3, mm2
|
|
|
|
movq mm0, [edi][DATASIZE*7+24]
|
|
punpckhwd mm4, mm6 //---0,1,3,5,6
|
|
|
|
packssdw mm1, mm0
|
|
movq mm7, mm3 //---0,1,2,3,5,6 w6
|
|
|
|
punpcklwd mm3, mm1 //mm1 = w7
|
|
movq mm0, mm5 //---0,2,3,4,5,6,7
|
|
|
|
movq mm2, [edi][DATASIZE*4] //| w01 | w00 |
|
|
punpckhdq mm0, mm3 // transposed w5---0,2,4,6,7
|
|
|
|
punpckhwd mm7, mm1 //---0,2,3,5,6,7
|
|
|
|
movq mm1, [edi][DATASIZE*5+8]
|
|
movq mm6, mm4 //---0,2,3,4,6,7
|
|
|
|
movq [edi][DATASIZE*5+16], mm0 // store w5
|
|
punpckldq mm5, mm3 // transposed w4
|
|
|
|
movq mm3, [edi][DATASIZE*5]
|
|
punpckldq mm4, mm7 // transposed w6
|
|
|
|
movq mm0, [edi][DATASIZE*4+8] //| w03 | w02 |
|
|
punpckhdq mm6, mm7 // transposed w7---0,3,6,7
|
|
|
|
|
|
// transpose the bottom left quadrant(4X4) of the matrix and place
|
|
// in the top right quadrant while doing the same for the top
|
|
// right quadrant
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq [edi][DATASIZE*4+16], mm5 // store w4
|
|
packssdw mm2, mm0 //|w03|w02|w01|w00|
|
|
|
|
movq mm5, [edi][DATASIZE*7]
|
|
packssdw mm3, mm1
|
|
|
|
movq mm0, [edi][DATASIZE*7+8]
|
|
|
|
movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
|
|
packssdw mm5, mm0
|
|
|
|
movq mm6, [edi][DATASIZE*6]
|
|
movq mm0, mm2 // copy w0---0,1,3,5,6
|
|
|
|
movq mm7, [edi][DATASIZE*6+8]
|
|
punpcklwd mm2, mm3 //mm6 = w1
|
|
|
|
movq [edi][DATASIZE*6+16], mm4 // store w6---3,5,6,7
|
|
packssdw mm6, mm7
|
|
|
|
movq mm1, [edi][DATASIZE*0+24]
|
|
punpckhwd mm0, mm3 //---0,1,3,5,6
|
|
|
|
movq mm7, mm6 //---0,1,2,3,5,6 w2
|
|
punpcklwd mm6, mm5 //mm1 = w3
|
|
|
|
movq mm3, [edi][DATASIZE*0+16]
|
|
punpckhwd mm7, mm5 //---0,2,3,5,6,7
|
|
|
|
movq mm4, [edi][DATASIZE*2+24]
|
|
packssdw mm3, mm1
|
|
|
|
movq mm1, mm2 //---0,2,3,4,5,6,7
|
|
punpckldq mm2, mm6 // transposed w4
|
|
|
|
movq mm5, [edi][DATASIZE*2+16]
|
|
punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
|
|
|
|
movq [edi][DATASIZE*0+16], mm2 // store w4
|
|
packssdw mm5, mm4
|
|
|
|
movq mm4, [edi][DATASIZE*1+16]
|
|
movq mm6, mm0 //---0,2,3,4,6,7
|
|
|
|
movq mm2, [edi][DATASIZE*1+24]
|
|
punpckldq mm0, mm7 // transposed w6
|
|
|
|
movq [edi][DATASIZE*1+16], mm1 // store w5
|
|
punpckhdq mm6, mm7 // transposed w7---0,3,6,7
|
|
|
|
movq mm7, [edi][DATASIZE*3+24]
|
|
packssdw mm4, mm2
|
|
|
|
movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
|
|
movq mm1, mm3 // copy w4---0,1,3,5,6
|
|
|
|
movq mm2, [edi][DATASIZE*3+16]
|
|
punpcklwd mm3, mm4 //mm6 = w5
|
|
|
|
movq [edi][DATASIZE*3+16], mm6 // store w7---5,6,7
|
|
packssdw mm2, mm7
|
|
|
|
|
|
// transpose the bottom left quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq mm6, [edi][DATASIZE*0] //| w01 | w00 |
|
|
punpckhwd mm1, mm4 //---0,1,3,5,6
|
|
|
|
movq mm7, mm5 //---0,1,2,3,5,6 w6
|
|
punpcklwd mm5, mm2 //mm1 = w7
|
|
|
|
movq mm4, [edi][DATASIZE*0+8] //| w03 | w02 |
|
|
punpckhwd mm7, mm2 //---0,2,3,5,6,7
|
|
|
|
movq mm0, mm3 //---0,2,3,4,5,6,7
|
|
packssdw mm6, mm4 //|w03|w02|w01|w00|
|
|
|
|
movq mm2, [edi][DATASIZE*2+8]
|
|
punpckldq mm3, mm5 // transposed w4
|
|
|
|
movq mm4, [edi][DATASIZE*1]
|
|
punpckhdq mm0, mm5 // transposed w5---0,2,4,6,7
|
|
|
|
movq [edi][DATASIZE*4], mm3 // store w4
|
|
movq mm5, mm1 //---0,2,3,4,6,7
|
|
|
|
movq mm3, [edi][DATASIZE*2]
|
|
punpckldq mm1, mm7 // transposed w6
|
|
|
|
movq [edi][DATASIZE*5], mm0 // store w5
|
|
punpckhdq mm5, mm7 // transposed w7---0,3,6,7
|
|
|
|
movq mm7, [edi][DATASIZE*1+8]
|
|
packssdw mm3, mm2
|
|
|
|
movq [edi][DATASIZE*7], mm5 // store w7---5,6,7
|
|
movq mm2, mm6 // copy w0---0,1,3,5,6
|
|
|
|
movq [edi][DATASIZE*6], mm1 // store w6---3,5,6,7
|
|
packssdw mm4, mm7
|
|
|
|
// transpose the top left quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
// Get the 32-bit quantities and pack into 16 bits
|
|
movq mm1, [edi][DATASIZE*3]
|
|
punpcklwd mm6, mm4 //mm6 = w1
|
|
|
|
movq mm0, [edi][DATASIZE*3+8]
|
|
punpckhwd mm2, mm4 //---0,1,3,5,6
|
|
|
|
packssdw mm1, mm0
|
|
movq mm5, mm3 //---0,1,2,3,5,6 w2
|
|
|
|
punpcklwd mm3, mm1 //mm1 = w3
|
|
movq mm0, mm6 //---0,2,3,4,5,6,7
|
|
|
|
movq mm4, [edi][DATASIZE*7]
|
|
punpckhwd mm5, mm1 //---0,2,3,5,6,7
|
|
|
|
movq mm1, [edi][DATASIZE*4]
|
|
punpckhdq mm6, mm3 // transposed w4
|
|
|
|
punpckldq mm0, mm3 // transposed w5---0,2,4,6,7
|
|
movq mm3, mm2 //---0,2,3,4,6,7
|
|
|
|
movq [edi][DATASIZE*0], mm0 // store w4
|
|
punpckldq mm2, mm5 // transposed w6
|
|
|
|
movq [edi][DATASIZE*1], mm6 // store w5
|
|
punpckhdq mm3, mm5 // transposed w7---0,3,6,7
|
|
|
|
movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
|
|
paddw mm0, mm4
|
|
|
|
movq [edi][DATASIZE*3], mm3 // store w7---5,6,7
|
|
paddw mm3, mm1
|
|
|
|
|
|
//******************************************************************************
|
|
// End of transpose. Begin row dct.
|
|
//******************************************************************************
|
|
|
|
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
|
|
|
|
movq mm7, mm0
|
|
paddw mm0, mm3 //tmp10
|
|
|
|
paddw mm6, [edi][DATASIZE*6]
|
|
psubw mm7, mm3 //tmp13
|
|
|
|
paddw mm2, [edi][DATASIZE*5]
|
|
movq mm1, mm6
|
|
|
|
// tmp10 = tmp0 + tmp3;
|
|
|
|
paddw mm1, mm2 //tmp11
|
|
psubw mm6, mm2 //tmp12
|
|
|
|
// dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
|
|
// dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
|
|
|
|
movq mm3, mm0
|
|
paddw mm0, mm1 //tmp10 + tmp11
|
|
|
|
psubw mm3, mm1 //tmp10 - tmp11
|
|
psllw mm0, 2 // descale it
|
|
|
|
movq mm1, mm6 //copy tmp12
|
|
psllw mm3, 2 // descale it
|
|
|
|
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
|
|
|
|
movq qwTemp0, mm0 //store
|
|
paddw mm1, mm7 //tmp12 + tmp13
|
|
|
|
movq mm2, mm1 //copy
|
|
|
|
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
|
|
// CONST_BITS-PASS1_BITS);
|
|
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
|
|
// CONST_BITS-PASS1_BITS);
|
|
|
|
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
|
|
movq mm4, mm7
|
|
|
|
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
|
|
movq mm0, mm6
|
|
|
|
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
|
|
|
|
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
|
|
|
|
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
|
|
paddd mm7, mm1 // add z1
|
|
|
|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
|
|
|
|
paddd mm7, Const_1024
|
|
paddd mm4, mm2
|
|
|
|
paddd mm4, Const_1024
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
//!!!!!! Negate the results in mm6 and mm0
|
|
pxor mm6, Const_FFFF //invert result
|
|
psrad mm4, 11 // descale it | |R3| |R1|
|
|
|
|
paddd mm6, Const_1 // 2's complement
|
|
movq mm5, mm7
|
|
|
|
pxor mm0, Const_FFFF //invert result
|
|
punpckldq mm7, mm4 //| |R1| |R0|
|
|
|
|
paddd mm0, Const_1 // 2's complement
|
|
punpckhdq mm5, mm4 //| |R3| |R2|
|
|
|
|
movq qwTemp4, mm3 //store
|
|
packssdw mm7, mm5
|
|
|
|
movq mm5, Const_1024
|
|
paddd mm6, mm1 // add z1
|
|
|
|
movq qwTemp2, mm7 //store
|
|
paddd mm6, mm5
|
|
|
|
paddd mm0, mm2
|
|
psrad mm6, 11 // descale it | |R2| |R0|
|
|
|
|
paddd mm0, mm5
|
|
movq mm5, mm6
|
|
|
|
movq mm4, [edi][DATASIZE*3]
|
|
psrad mm0, 11 // descale it | |R3| |R1|
|
|
|
|
psubw mm4, [edi][DATASIZE*4]
|
|
punpckldq mm6, mm0 //| |R1| |R0|
|
|
|
|
movq mm7, [edi][DATASIZE*0]
|
|
punpckhdq mm5, mm0 //| |R3| |R2|
|
|
|
|
psubw mm7, [edi][DATASIZE*7]
|
|
packssdw mm6, mm5
|
|
|
|
// tmp4 = dataptr[3] - dataptr[4];
|
|
|
|
movq mm5, [edi][DATASIZE*2]
|
|
movq mm0, mm4
|
|
|
|
psubw mm5, [edi][DATASIZE*5]
|
|
movq mm2, mm4
|
|
|
|
movq qwTemp6, mm6 //store
|
|
paddw mm0, mm7 //z1
|
|
|
|
movq mm6, [edi][DATASIZE*1]
|
|
movq mm1, mm5
|
|
|
|
psubw mm6, [edi][DATASIZE*6]
|
|
movq mm3, mm5
|
|
|
|
// z1 = tmp4 + tmp7;
|
|
|
|
movq qwScratch5, mm5
|
|
paddw mm3, mm7 //z4
|
|
|
|
movq qwScratch7, mm7
|
|
paddw mm2, mm6 //z3
|
|
|
|
movq qwZ1, mm0 //store
|
|
paddw mm1, mm6 //z2
|
|
|
|
// z3 = MULTIPLY(z3, - FIX_1_961570560);
|
|
// z4 = MULTIPLY(z4, - FIX_0_390180644);
|
|
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
|
|
|
|
movq mm0, Const_FFFF
|
|
movq mm5, mm2
|
|
|
|
movq qwZ2, mm1
|
|
movq mm7, mm2
|
|
|
|
pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
|
|
paddw mm2, mm3 //z3 + z4
|
|
|
|
pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
|
|
movq mm1, mm3
|
|
|
|
movq qwScratch6, mm6
|
|
movq mm6, mm2
|
|
|
|
// z3 += z5;
|
|
|
|
//!!!!!! Negate the results
|
|
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
|
|
pxor mm5, mm0 //invert result
|
|
|
|
paddd mm5, Const_1 // 2's complement
|
|
pxor mm7, mm0 //invert result
|
|
|
|
pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
|
|
|
|
pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
|
|
paddd mm5, mm2 //z3_even
|
|
|
|
paddd mm7, Const_1 // 2's complement
|
|
|
|
pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
|
|
pxor mm3, mm0 //invert result
|
|
|
|
// z4 += z5;
|
|
|
|
//!!!!!! Negate the results
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm1, mm0 //invert result
|
|
|
|
paddd mm1, Const_1 // 2's complement
|
|
paddd mm3, mm2
|
|
|
|
movq mm0, qwScratch6
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
|
|
|
|
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
|
|
paddd mm7, mm6 //z3_odd
|
|
|
|
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
|
|
paddd mm1, mm6
|
|
|
|
movq mm6, mm0
|
|
paddd mm4, mm5
|
|
|
|
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
|
|
|
|
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
|
|
paddd mm2, mm7
|
|
|
|
pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
|
|
|
|
movq qwTmp4_Z3_Odd, mm2
|
|
|
|
movq qwTmp4_Z3_Even, mm4
|
|
paddd mm6, mm5
|
|
|
|
movq mm5, qwScratch5
|
|
paddd mm0, mm7
|
|
|
|
movq mm7, qwScratch7
|
|
movq mm2, mm5
|
|
|
|
movq qwTmp6_Z3_Even, mm6
|
|
movq mm6, mm7
|
|
|
|
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
|
|
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
|
|
|
|
pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
|
|
|
|
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
|
|
|
|
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
|
|
|
|
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
|
|
paddd mm5, mm3
|
|
|
|
movq qwTmp6_Z3_Odd, mm0
|
|
paddd mm2, mm1
|
|
|
|
movq qwTmp5_Z4_Even, mm5
|
|
paddd mm7, mm3
|
|
|
|
movq mm0, qwZ1
|
|
paddd mm6, mm1
|
|
|
|
// z1 = MULTIPLY(z1, - FIX_0_899976223);
|
|
|
|
movq mm1, Const_FFFF
|
|
movq mm4, mm0
|
|
|
|
//!!!!!! Negate the results
|
|
pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
|
|
|
|
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
|
|
|
|
movq mm3, qwTmp4_Z3_Even
|
|
|
|
movq qwTmp5_Z4_Odd, mm2
|
|
pxor mm0, mm1 //invert result
|
|
|
|
movq mm2, qwTmp4_Z3_Odd
|
|
pxor mm4, mm1 //invert result
|
|
|
|
paddd mm4, Const_1 // 2's complement
|
|
paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
|
|
|
|
paddd mm0, Const_1 // 2's complement
|
|
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
|
|
|
|
// dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
|
|
|
|
paddd mm7, Const_1024 //rounding adj
|
|
paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
|
|
|
|
paddd mm6, Const_1024 //rounding adj
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
psrad mm6, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm7
|
|
punpckldq mm7, mm6 //| |R1| |R0|
|
|
|
|
// dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
|
|
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
|
|
|
|
paddd mm3, Const_1024 //rounding adj
|
|
packssdw mm7, mm5
|
|
|
|
paddd mm2, Const_1024 //rounding adj
|
|
psrad mm3, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm0, qwZ2
|
|
psrad mm2, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm3
|
|
movq mm4, mm0
|
|
|
|
// z2 = MULTIPLY(z2, - FIX_2_562915447);
|
|
|
|
pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
|
|
punpckldq mm3, mm2 //| |R1| |R0|
|
|
|
|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq mm2, Const_FFFF
|
|
packssdw mm3, mm5
|
|
|
|
movq [edi][DATASIZE*1], mm7 //store
|
|
//!!!!!! Negate the results
|
|
pxor mm0, mm2 //invert result
|
|
|
|
movq mm5, Const_1
|
|
pxor mm4, mm2 //invert result
|
|
|
|
movq [edi][DATASIZE*7], mm3 //store
|
|
paddd mm0, mm5 // 2's complement
|
|
|
|
movq mm7, qwTmp6_Z3_Even
|
|
paddd mm4, mm5 // 2's complement
|
|
|
|
// dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
|
|
|
|
movq mm2, qwTmp6_Z3_Odd
|
|
paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
|
|
|
|
paddd mm7, Const_1024 //rounding adj
|
|
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
|
|
|
|
paddd mm2, Const_1024 //rounding adj
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwTemp0 //restore
|
|
psrad mm2, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm3, qwTmp5_Z4_Even
|
|
movq mm5, mm7
|
|
|
|
movq [edi][DATASIZE*0], mm6 //store
|
|
punpckldq mm7, mm2 //| |R1| |R0|
|
|
|
|
movq mm1, qwTmp5_Z4_Odd
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq mm6, qwTemp2 //restore
|
|
packssdw mm7, mm5
|
|
|
|
movq mm5, Const_1024
|
|
paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
|
|
|
|
// dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
|
|
|
|
movq [edi][DATASIZE*3], mm7 //store
|
|
paddd mm1, mm4 //tmp5 + z2 + z4 ODD
|
|
|
|
movq mm7, qwTemp4 //restore
|
|
paddd mm3, mm5 //rounding adj
|
|
|
|
movq [edi][DATASIZE*2], mm6 //store
|
|
paddd mm1, mm5 //rounding adj
|
|
|
|
movq [edi][DATASIZE*4], mm7 //store
|
|
psrad mm3, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwTemp6 //restore
|
|
psrad mm1, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm0, [edi][DATASIZE*0+16]
|
|
movq mm5, mm3
|
|
|
|
movq [edi][DATASIZE*6], mm6 //store
|
|
punpckldq mm3, mm1 //| |R1| |R0|
|
|
|
|
paddw mm0, [edi][DATASIZE*7+16]
|
|
punpckhdq mm5, mm1 //| |R3| |R2|
|
|
|
|
movq mm1, [edi][DATASIZE*1+16]
|
|
packssdw mm3, mm5
|
|
|
|
paddw mm1, [edi][DATASIZE*6+16]
|
|
movq mm7, mm0
|
|
|
|
movq [edi][DATASIZE*5], mm3 //store
|
|
movq mm6, mm1
|
|
|
|
//******************************************************************************
|
|
// This completes 4x8 dct locations. Copy to do other 4x8.
|
|
//******************************************************************************
|
|
|
|
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
|
|
|
|
movq mm3, [edi][DATASIZE*3+16]
|
|
|
|
paddw mm3, [edi][DATASIZE*4+16]
|
|
|
|
movq mm2, [edi][DATASIZE*2+16]
|
|
paddw mm0, mm3 //tmp10
|
|
|
|
paddw mm2, [edi][DATASIZE*5+16]
|
|
psubw mm7, mm3 //tmp13
|
|
|
|
// tmp10 = tmp0 + tmp3;
|
|
|
|
paddw mm1, mm2 //tmp11
|
|
psubw mm6, mm2 //tmp12
|
|
|
|
// dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
|
|
// dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
|
|
|
|
movq mm3, mm0
|
|
paddw mm0, mm1 //tmp10 + tmp11
|
|
|
|
psubw mm3, mm1 //tmp10 - tmp11
|
|
psllw mm0, 2 // descale it
|
|
|
|
movq mm1, mm6 //copy tmp12
|
|
psllw mm3, 2 // descale it
|
|
|
|
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
|
|
|
|
movq qwTemp0, mm0 //store
|
|
paddw mm1, mm7 //tmp12 + tmp13
|
|
|
|
//;;; movq [edi][DATASIZE*6+16], mm4 ; store w6---3,5,6,7
|
|
movq mm2, mm1 //copy
|
|
|
|
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
|
|
// CONST_BITS-PASS1_BITS);
|
|
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
|
|
// CONST_BITS-PASS1_BITS);
|
|
|
|
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
|
|
movq mm4, mm7
|
|
|
|
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
|
|
movq mm0, mm6
|
|
|
|
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
|
|
|
|
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
|
|
|
|
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
|
|
paddd mm7, mm1 // add z1
|
|
|
|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
|
|
|
|
paddd mm7, Const_1024
|
|
paddd mm4, mm2
|
|
|
|
paddd mm4, Const_1024
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
//!!!!!! Negate the results in mm6 and mm0
|
|
pxor mm6, Const_FFFF //invert result
|
|
psrad mm4, 11 // descale it | |R3| |R1|
|
|
|
|
paddd mm6, Const_1 // 2's complement
|
|
movq mm5, mm7
|
|
|
|
pxor mm0, Const_FFFF //invert result
|
|
punpckldq mm7, mm4 //| |R1| |R0|
|
|
|
|
paddd mm0, Const_1 // 2's complement
|
|
punpckhdq mm5, mm4 //| |R3| |R2|
|
|
|
|
movq qwTemp4, mm3 //store
|
|
packssdw mm7, mm5
|
|
|
|
movq mm5, Const_1024
|
|
paddd mm6, mm1 // add z1
|
|
|
|
movq qwTemp2, mm7 //store
|
|
paddd mm0, mm2
|
|
|
|
movq mm4, [edi][DATASIZE*3+16]
|
|
paddd mm6, mm5
|
|
|
|
psubw mm4, [edi][DATASIZE*4+16]
|
|
psrad mm6, 11 // descale it | |R2| |R0|
|
|
|
|
paddd mm0, mm5
|
|
movq mm5, mm6
|
|
|
|
movq mm7, [edi][DATASIZE*0+16]
|
|
psrad mm0, 11 // descale it | |R3| |R1|
|
|
|
|
psubw mm7, [edi][DATASIZE*7+16]
|
|
punpckldq mm6, mm0 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm0 //| |R3| |R2|
|
|
movq mm0, mm4
|
|
|
|
packssdw mm6, mm5
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = dataptr[3] - dataptr[4];
|
|
|
|
movq mm5, [edi][DATASIZE*2+16]
|
|
paddw mm0, mm7 //z1
|
|
|
|
psubw mm5, [edi][DATASIZE*5+16]
|
|
|
|
movq qwTemp6, mm6 //store
|
|
movq mm1, mm5
|
|
|
|
movq mm6, [edi][DATASIZE*1+16]
|
|
movq mm3, mm5
|
|
|
|
// z1 = tmp4 + tmp7;
|
|
|
|
psubw mm6, [edi][DATASIZE*6+16]
|
|
paddw mm3, mm7 //z4
|
|
|
|
movq qwScratch7, mm7
|
|
paddw mm2, mm6 //z3
|
|
|
|
movq qwScratch5, mm5
|
|
paddw mm1, mm6 //z2
|
|
|
|
// z3 = MULTIPLY(z3, - FIX_1_961570560);
|
|
// z4 = MULTIPLY(z4, - FIX_0_390180644);
|
|
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
|
|
|
|
movq qwZ1, mm0 //store
|
|
movq mm5, mm2
|
|
|
|
movq qwZ2, mm1
|
|
movq mm7, mm2
|
|
|
|
movq mm0, Const_FFFF
|
|
paddw mm2, mm3 //z3 + z4
|
|
|
|
pmaddwd mm5, Const_0xFIX_1_961570560 //z32, z30
|
|
movq mm1, mm3
|
|
|
|
pmaddwd mm7, Const_FIX_1_961570560x0 //z33, z31
|
|
|
|
movq qwScratch6, mm6
|
|
movq mm6, mm2
|
|
|
|
// z3 += z5//
|
|
|
|
//!!!!!! Negate the results
|
|
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
|
|
pxor mm5, mm0 //invert result
|
|
|
|
paddd mm5, Const_1 // 2's complement
|
|
pxor mm7, mm0 //invert result
|
|
|
|
pmaddwd mm3, Const_0xFIX_0_390180644 //z42, z40
|
|
|
|
pmaddwd mm1, Const_FIX_0_390180644x0 //z43, z41
|
|
paddd mm5, mm2 //z3_even
|
|
|
|
paddd mm7, Const_1 // 2's complement
|
|
|
|
pmaddwd mm6, Const_FIX_1_175875602x0 //z53, z51
|
|
pxor mm3, mm0 //invert result
|
|
|
|
// z4 += z5;
|
|
|
|
//!!!!!! Negate the results
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm1, mm0 //invert result
|
|
|
|
paddd mm1, Const_1 // 2's complement
|
|
paddd mm3, mm2
|
|
|
|
movq mm0, qwScratch6
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
|
|
|
|
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
|
|
paddd mm7, mm6 //z3_odd
|
|
|
|
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
|
|
paddd mm1, mm6
|
|
|
|
movq mm6, mm0
|
|
paddd mm4, mm5
|
|
|
|
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
|
|
|
|
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
|
|
paddd mm2, mm7
|
|
|
|
pmaddwd mm0, Const_FIX_3_072711026x0 //T63, T61
|
|
|
|
movq qwTmp4_Z3_Odd, mm2
|
|
|
|
movq qwTmp4_Z3_Even, mm4
|
|
paddd mm6, mm5
|
|
|
|
movq mm5, qwScratch5
|
|
paddd mm0, mm7
|
|
|
|
movq mm7, qwScratch7
|
|
movq mm2, mm5
|
|
|
|
movq qwTmp6_Z3_Even, mm6
|
|
movq mm6, mm7
|
|
|
|
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
|
|
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
|
|
|
|
pmaddwd mm5, Const_0xFIX_2_053119869 //T52, T50
|
|
|
|
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
|
|
|
|
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
|
|
|
|
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
|
|
paddd mm5, mm3
|
|
|
|
movq qwTmp6_Z3_Odd, mm0
|
|
paddd mm2, mm1
|
|
|
|
movq qwTmp5_Z4_Even, mm5
|
|
paddd mm7, mm3
|
|
|
|
movq mm0, qwZ1
|
|
paddd mm6, mm1
|
|
|
|
// z1 = MULTIPLY(z1, - FIX_0_899976223);
|
|
|
|
movq mm1, Const_FFFF
|
|
movq mm4, mm0
|
|
|
|
//!!!!!! Negate the results
|
|
pmaddwd mm0, Const_0xFIX_0_899976223 //z12, z10
|
|
|
|
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
|
|
|
|
movq mm3, qwTmp4_Z3_Even
|
|
|
|
movq qwTmp5_Z4_Odd, mm2
|
|
pxor mm0, mm1 //invert result
|
|
|
|
movq mm2, qwTmp4_Z3_Odd
|
|
pxor mm4, mm1 //invert result
|
|
|
|
paddd mm4, Const_1 // 2's complement
|
|
paddd mm7, mm0 //tmp7 + z1 + z4 EVEN
|
|
|
|
paddd mm0, Const_1 // 2's complement
|
|
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
|
|
|
|
// dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
|
|
|
|
paddd mm7, Const_1024 //rounding adj
|
|
paddd mm3, mm0 //tmp4 + z1 + z3 EVEN
|
|
|
|
paddd mm6, Const_1024 //rounding adj
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
psrad mm6, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm7
|
|
punpckldq mm7, mm6 //| |R1| |R0|
|
|
|
|
// dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
|
|
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
|
|
|
|
paddd mm3, Const_1024 //rounding adj
|
|
packssdw mm7, mm5
|
|
|
|
paddd mm2, Const_1024 //rounding adj
|
|
psrad mm3, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm0, qwZ2
|
|
psrad mm2, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm3
|
|
movq mm4, mm0
|
|
|
|
// z2 = MULTIPLY(z2, - FIX_2_562915447);
|
|
|
|
pmaddwd mm0, Const_0xFIX_2_562915447 //z22, z20
|
|
punpckldq mm3, mm2 //| |R1| |R0|
|
|
|
|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq mm2, Const_FFFF
|
|
packssdw mm3, mm5
|
|
|
|
movq [edi][DATASIZE*1+16], mm7 //store
|
|
//!!!!!! Negate the results
|
|
pxor mm0, mm2 //invert result
|
|
|
|
movq mm5, Const_1
|
|
pxor mm4, mm2 //invert result
|
|
|
|
movq [edi][DATASIZE*7+16], mm3 //store
|
|
paddd mm0, mm5 // 2's complement
|
|
|
|
movq mm7, qwTmp6_Z3_Even
|
|
paddd mm4, mm5 // 2's complement
|
|
|
|
// dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
|
|
|
|
movq mm2, qwTmp6_Z3_Odd
|
|
paddd mm7, mm0 //tmp6 + z2 + z3 EVEN
|
|
|
|
paddd mm7, Const_1024 //rounding adj
|
|
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
|
|
|
|
paddd mm2, Const_1024 //rounding adj
|
|
psrad mm7, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwTemp0 //restore
|
|
psrad mm2, 11 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm7
|
|
|
|
movq [edi][DATASIZE*0+16], mm6 //store
|
|
punpckldq mm7, mm2 //| |R1| |R0|
|
|
|
|
movq mm3, qwTmp5_Z4_Even
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq mm1, qwTmp5_Z4_Odd
|
|
packssdw mm7, mm5
|
|
|
|
movq mm6, qwTemp2 //restore
|
|
paddd mm3, mm0 //tmp5 + z2 + z4 EVEN
|
|
|
|
// dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
|
|
|
|
movq mm0, Const_1024
|
|
paddd mm1, mm4 //tmp5 + z2 + z4 ODD
|
|
|
|
movq [edi][DATASIZE*3+16], mm7 //store
|
|
paddd mm3, mm0 //rounding adj
|
|
|
|
movq mm7, qwTemp4 //restore
|
|
paddd mm1, mm0 //rounding adj
|
|
|
|
movq [edi][DATASIZE*2+16], mm6 //store
|
|
psrad mm3, 11 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwTemp6 //restore
|
|
psrad mm1, 11 // descale it | |R3| |R1|
|
|
|
|
movq [edi][DATASIZE*4+16], mm7 //store
|
|
movq mm5, mm3
|
|
|
|
movq [edi][DATASIZE*6+16], mm6 //store
|
|
punpckldq mm3, mm1 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm1 //| |R3| |R2|
|
|
movq mm0, mm7 // copy w4---0,1,3,5,6
|
|
|
|
movq mm1, [edi][DATASIZE*7+16]
|
|
packssdw mm3, mm5
|
|
|
|
movq [edi][DATASIZE*5+16], mm3 //store
|
|
punpcklwd mm7, mm3 //mm6 = w5
|
|
|
|
//******************************************************************************
|
|
|
|
//******************************************************************************
|
|
// This completes all 8x8 dct locations for the row case.
|
|
// Now transpose the data for the columns.
|
|
//******************************************************************************
|
|
|
|
// transpose the bottom right quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq mm4, mm7 //---0,2,3,4,5,6,7
|
|
punpckhwd mm0, mm3 //---0,1,3,5,6
|
|
|
|
movq mm2, mm6 //---0,1,2,3,5,6 w6
|
|
punpcklwd mm6, mm1 //mm1 = w7
|
|
|
|
// tmp0 = dataptr[DATASIZE*0] + dataptr[DATASIZE*7];
|
|
|
|
movq mm5, [edi][DATASIZE*5]
|
|
punpckldq mm7, mm6 // transposed w4
|
|
|
|
punpckhdq mm4, mm6 // transposed w5---0,2,4,6,7
|
|
movq mm6, mm0 //---0,2,3,4,6,7
|
|
|
|
movq [edi][DATASIZE*4+16], mm7 // store w4
|
|
punpckhwd mm2, mm1 //---0,2,3,5,6,7
|
|
|
|
movq [edi][DATASIZE*5+16], mm4 // store w5
|
|
punpckldq mm0, mm2 // transposed w6
|
|
|
|
movq mm7, [edi][DATASIZE*4]
|
|
punpckhdq mm6, mm2 // transposed w7---0,3,6,7
|
|
|
|
movq [edi][DATASIZE*6+16], mm0 // store w6---3,5,6,7
|
|
movq mm0, mm7 // copy w0---0,1,3,5,6
|
|
|
|
movq [edi][DATASIZE*7+16], mm6 // store w7---5,6,7
|
|
punpcklwd mm7, mm5 //mm6 = w1
|
|
|
|
// transpose the bottom left quadrant(4X4) of the matrix and place
|
|
// in the top right quadrant while doing the same for the top
|
|
// right quadrant
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq mm3, [edi][DATASIZE*6]
|
|
punpckhwd mm0, mm5 //---0,1,3,5,6
|
|
|
|
movq mm1, [edi][DATASIZE*7]
|
|
movq mm2, mm3 //---0,1,2,3,5,6 w2
|
|
|
|
movq mm6, [edi][DATASIZE*0+16]
|
|
punpcklwd mm3, mm1 //mm1 = w3
|
|
|
|
movq mm5, [edi][DATASIZE*1+16]
|
|
punpckhwd mm2, mm1 //---0,2,3,5,6,7
|
|
|
|
movq mm4, mm7 //---0,2,3,4,5,6,7
|
|
punpckldq mm7, mm3 // transposed w4
|
|
|
|
punpckhdq mm4, mm3 // transposed w5---0,2,4,6,7
|
|
movq mm3, mm0 //---0,2,3,4,6,7
|
|
|
|
movq [edi][DATASIZE*0+16], mm7 // store w4
|
|
punpckldq mm0, mm2 // transposed w6
|
|
|
|
movq mm1, [edi][DATASIZE*2+16]
|
|
punpckhdq mm3, mm2 // transposed w7---0,3,6,7
|
|
|
|
movq [edi][DATASIZE*2+16], mm0 // store w6---3,5,6,7
|
|
movq mm0, mm6 // copy w4---0,1,3,5,6
|
|
|
|
movq mm7, [edi][DATASIZE*3+16]
|
|
punpcklwd mm6, mm5 //mm6 = w5
|
|
|
|
movq [edi][DATASIZE*1+16], mm4 // store w5
|
|
punpckhwd mm0, mm5 //---0,1,3,5,6
|
|
|
|
// transpose the top right quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq mm2, mm1 //---0,1,2,3,5,6 w6
|
|
punpcklwd mm1, mm7 //mm1 = w7
|
|
|
|
movq mm4, mm6 //---0,2,3,4,5,6,7
|
|
punpckldq mm6, mm1 // transposed w4
|
|
|
|
movq [edi][DATASIZE*3+16], mm3 // store w7---5,6,7
|
|
punpckhdq mm4, mm1 // transposed w5---0,2,4,6,7
|
|
|
|
movq [edi][DATASIZE*4], mm6 // store w4
|
|
punpckhwd mm2, mm7 //---0,2,3,5,6,7
|
|
|
|
movq mm7, [edi][DATASIZE*0]
|
|
movq mm1, mm0 //---0,2,3,4,6,7
|
|
|
|
movq mm3, [edi][DATASIZE*1]
|
|
punpckldq mm0, mm2 // transposed w6
|
|
|
|
movq [edi][DATASIZE*5], mm4 // store w5
|
|
punpckhdq mm1, mm2 // transposed w7---0,3,6,7
|
|
|
|
movq [edi][DATASIZE*6], mm0 // store w6---3,5,6,7
|
|
movq mm2, mm7 // copy w0---0,1,3,5,6
|
|
|
|
movq mm4, [edi][DATASIZE*3]
|
|
punpcklwd mm7, mm3 //mm6 = w1
|
|
|
|
// transpose the top left quadrant(4X4) of the matrix
|
|
// --------- ---------
|
|
// | M1 | M2 | | M1'| M3'|
|
|
// --------- --> ---------
|
|
// | M3 | M4 | | M2'| M4'|
|
|
// --------- ---------
|
|
|
|
movq mm6, [edi][DATASIZE*2]
|
|
punpckhwd mm2, mm3 //---0,1,3,5,6
|
|
|
|
movq mm0, mm6 //---0,1,2,3,5,6 w2
|
|
punpcklwd mm6, mm4 //mm1 = w3
|
|
|
|
movq [edi][DATASIZE*7], mm1 // store w7---5,6,7
|
|
punpckhwd mm0, mm4 //---0,2,3,5,6,7
|
|
|
|
movq mm1, mm7 //---0,2,3,4,5,6,7
|
|
punpckldq mm7, mm6 // transposed w4
|
|
|
|
punpckhdq mm1, mm6 // transposed w5---0,2,4,6,7
|
|
movq mm6, mm2 //---0,2,3,4,6,7
|
|
|
|
movq [edi][DATASIZE*0], mm7 // store w4
|
|
punpckldq mm2, mm0 // transposed w6
|
|
|
|
paddw mm7, [edi][DATASIZE*7]
|
|
punpckhdq mm6, mm0 // transposed w7---0,3,6,7
|
|
|
|
movq [edi][DATASIZE*3], mm6 // store w7---5,6,7
|
|
movq mm4, mm7
|
|
|
|
paddw mm6, [edi][DATASIZE*4]
|
|
|
|
movq [edi][DATASIZE*1], mm1 // store w5
|
|
paddw mm7, mm6 //tmp10
|
|
|
|
|
|
//******************************************************************************
|
|
// This begins the column dct
|
|
//******************************************************************************
|
|
|
|
paddw mm1, [edi][DATASIZE*6]
|
|
psubw mm4, mm6 //tmp13
|
|
|
|
movq [edi][DATASIZE*2], mm2 // store w6---3,5,6,7
|
|
movq mm6, mm1
|
|
|
|
paddw mm2, [edi][DATASIZE*5]
|
|
movq mm3, mm7
|
|
|
|
paddw mm1, mm2 //tmp11
|
|
psubw mm6, mm2 //tmp12
|
|
|
|
// dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
|
|
// dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
|
|
|
|
paddw mm7, mm1 //tmp10 + tmp11
|
|
|
|
paddw mm7, Const_2 // round add 2 to each element
|
|
psubw mm3, mm1 //tmp10 - tmp11
|
|
|
|
paddw mm3, Const_2 // round add 2 to each element
|
|
psraw mm7, 2 // descale it
|
|
|
|
// unpack word to dword sign extended
|
|
movq mm5, mm7
|
|
punpcklwd mm7, mm7
|
|
|
|
psrad mm7, 16 // even results store in Temp0
|
|
punpckhwd mm5, mm5
|
|
|
|
psrad mm5, 16 // odd results store in array
|
|
movq mm1, mm6 //copy tmp12
|
|
|
|
movq qwTemp0, mm7 //store
|
|
psraw mm3, 2 // descale it
|
|
|
|
movq [edi][DATASIZE*0+8], mm5
|
|
movq mm5, mm3
|
|
|
|
punpcklwd mm3, mm3
|
|
paddw mm1, mm4 //tmp12 + tmp13
|
|
|
|
psrad mm3, 16 // even results store in Temp4
|
|
movq mm2, mm1 //copy
|
|
|
|
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
|
|
|
|
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
|
|
punpckhwd mm5, mm5
|
|
|
|
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
|
|
movq mm7, mm4
|
|
|
|
// dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
|
|
psrad mm5, 16 // odd results store in array
|
|
|
|
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
|
|
movq mm0, mm6
|
|
|
|
// dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
|
|
|
|
movq qwTemp4, mm3 //store
|
|
paddd mm4, mm2
|
|
|
|
paddd mm4, Const_16384
|
|
paddd mm7, mm1 // add z1
|
|
|
|
paddd mm7, Const_16384
|
|
psrad mm4, 15 // descale it | |R3| |R1|
|
|
|
|
movq [edi][DATASIZE*4+8], mm5
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
|
|
movq mm5, mm7
|
|
|
|
//!!!!!! Negate result
|
|
movq mm3, Const_1
|
|
punpckldq mm7, mm4 //| |R1| |R0|
|
|
|
|
pxor mm6, Const_FFFF //invert result
|
|
punpckhdq mm5, mm4 //| |R3| |R2|
|
|
|
|
movq qwTemp2, mm7 //store
|
|
paddd mm6, mm3 // 2's complement
|
|
|
|
pxor mm0, Const_FFFF //invert result
|
|
paddd mm6, mm1 // add z1
|
|
|
|
movq [edi][DATASIZE*2+8], mm5 //write out 2nd half in unused memory
|
|
paddd mm0, mm3 // 2's complement
|
|
|
|
movq mm3, Const_16384
|
|
paddd mm0, mm2
|
|
|
|
movq mm7, [edi][DATASIZE*0]
|
|
paddd mm6, mm3
|
|
|
|
movq mm4, [edi][DATASIZE*3]
|
|
paddd mm0, mm3
|
|
|
|
psubw mm7, [edi][DATASIZE*7]
|
|
psrad mm6, 15 // descale it | |R2| |R0|
|
|
|
|
psubw mm4, [edi][DATASIZE*4]
|
|
psrad mm0, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm3, [edi][DATASIZE*2]
|
|
movq mm5, mm6
|
|
|
|
psubw mm3, [edi][DATASIZE*5]
|
|
punpckldq mm6, mm0 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm0 //| |R3| |R2|
|
|
movq mm0, mm4
|
|
|
|
movq qwTemp6, mm6 //store
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = dataptr[DATASIZE*3] - dataptr[DATASIZE*4];
|
|
// z1 = tmp4 + tmp7;
|
|
|
|
movq mm6, [edi][DATASIZE*1]
|
|
paddw mm0, mm7 //z1
|
|
|
|
movq [edi][DATASIZE*6+8], mm5 //write out 2nd half in unused memory
|
|
movq mm1, mm3
|
|
|
|
psubw mm6, [edi][DATASIZE*6]
|
|
movq mm5, mm3
|
|
|
|
movq qwZ1, mm0 //store
|
|
paddw mm5, mm7 //z4
|
|
|
|
movq qwScratch7, mm7
|
|
paddw mm1, mm6 //z2
|
|
|
|
movq qwScratch5, mm3
|
|
paddw mm2, mm6 //z3
|
|
|
|
movq qwZ2, mm1
|
|
movq mm3, mm2
|
|
|
|
// z3 = MULTIPLY(z3, - FIX_1_961570560);
|
|
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
|
|
// z4 = MULTIPLY(z4, - FIX_0_390180644);
|
|
|
|
movq qwScratch6, mm6
|
|
movq mm1, mm2
|
|
|
|
pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
|
|
movq mm7, mm5
|
|
|
|
movq mm6, Const_FFFF
|
|
paddw mm2, mm5 //z3 + z4
|
|
|
|
pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
|
|
movq mm0, mm2
|
|
|
|
pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
|
|
//!!!!!! Negate the results
|
|
pxor mm3, mm6 //invert result
|
|
|
|
pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
|
|
|
|
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
|
|
pxor mm1, mm6 //invert result
|
|
|
|
pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
|
|
//!!!!!! Negate the results
|
|
pxor mm7, mm6 //invert result
|
|
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm5, mm6 //invert result
|
|
|
|
// z3 += z5;
|
|
|
|
paddd mm1, Const_1 // 2's complement
|
|
paddd mm3, mm2 //z3_even
|
|
|
|
paddd mm5, Const_1 // 2's complement
|
|
paddd mm1, mm0 //z3_odd
|
|
|
|
// z4 += z5;
|
|
|
|
paddd mm7, Const_1 // 2's complement
|
|
paddd mm5, mm2
|
|
|
|
paddd mm7, mm0
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
|
|
|
|
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
|
|
|
|
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
|
|
|
|
movq qwZ4_even, mm5
|
|
|
|
movq qwZ4_odd, mm7
|
|
paddd mm4, mm3
|
|
|
|
movq mm6, qwScratch6
|
|
paddd mm2, mm1
|
|
|
|
movq qwTmp4_Z3_Even, mm4
|
|
movq mm5, mm6
|
|
|
|
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
|
|
|
|
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
|
|
|
|
pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
|
|
|
|
movq qwTmp4_Z3_Odd, mm2
|
|
|
|
movq mm4, qwZ4_even
|
|
paddd mm6, mm3
|
|
|
|
movq mm3, qwScratch5
|
|
paddd mm5, mm1
|
|
|
|
movq qwTmp6_Z3_Even, mm6
|
|
movq mm2, mm3
|
|
|
|
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
|
|
|
|
pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
|
|
|
|
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
|
|
|
|
movq qwTmp6_Z3_Odd, mm5
|
|
|
|
movq mm0, qwZ4_odd
|
|
paddd mm3, mm4
|
|
|
|
movq mm7, qwScratch7
|
|
paddd mm2, mm0
|
|
|
|
movq qwTmp5_Z4_Even, mm3
|
|
movq mm6, mm7
|
|
|
|
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
|
|
|
|
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
|
|
|
|
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
|
|
|
|
movq mm3, qwZ1
|
|
|
|
movq qwTmp5_Z4_Odd, mm2
|
|
paddd mm7, mm4
|
|
|
|
movq mm5, Const_FFFF
|
|
movq mm4, mm3
|
|
|
|
// z1 = MULTIPLY(z1, - FIX_0_899976223);
|
|
|
|
pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
|
|
paddd mm6, mm0
|
|
|
|
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
|
|
|
|
movq mm2, qwTmp4_Z3_Odd
|
|
//!!!!!! Negate the results
|
|
pxor mm3, mm5 //invert result
|
|
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm4, mm5 //invert result
|
|
|
|
paddd mm4, Const_1 // 2's complement
|
|
paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
|
|
|
|
// dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
paddd mm7, Const_16384 //rounding adj
|
|
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
|
|
|
|
paddd mm6, Const_16384 //rounding adj
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm0, qwTmp4_Z3_Even
|
|
psrad mm6, 15 // descale it | |R3| |R1|
|
|
|
|
paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
|
|
movq mm5, mm7
|
|
|
|
movq mm3, qwTemp0 //restore
|
|
punpckldq mm7, mm6 //| |R1| |R0|
|
|
|
|
paddd mm0, Const_16384 //rounding adj
|
|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
|
|
|
|
movq [edi][DATASIZE*0], mm3 //store
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
|
|
// dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
paddd mm2, Const_16384 //rounding adj
|
|
psrad mm0, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwZ2
|
|
psrad mm2, 15 // descale it | |R3| |R1|
|
|
|
|
movq [edi][DATASIZE*1+8], mm5 //store
|
|
movq mm4, mm6
|
|
|
|
// z2 = MULTIPLY(z2, - FIX_2_562915447);
|
|
|
|
pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
|
|
movq mm5, mm0
|
|
|
|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
|
|
punpckldq mm0, mm2 //| |R1| |R0|
|
|
|
|
movq mm3, Const_FFFF
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq [edi][DATASIZE*1], mm7 //store
|
|
//!!!!!! Negate the results
|
|
pxor mm6, mm3 //invert result
|
|
|
|
movq mm1, Const_1
|
|
pxor mm4, mm3 //invert result
|
|
|
|
movq mm7, qwTmp6_Z3_Even
|
|
paddd mm6, mm1 // 2's complement
|
|
|
|
movq mm2, qwTmp6_Z3_Odd
|
|
paddd mm4, mm1 // 2's complement
|
|
|
|
// dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
movq [edi][DATASIZE*7], mm0 //store
|
|
paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
|
|
|
|
movq mm1, Const_16384
|
|
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
|
|
|
|
movq mm3, qwTemp2 //restore
|
|
paddd mm7, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*7+8], mm5 //store
|
|
paddd mm2, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*2], mm3 //store
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm0, qwTemp4 //restore
|
|
psrad mm2, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm3, qwTmp5_Z4_Even
|
|
movq mm5, mm7
|
|
|
|
movq [edi][DATASIZE*4], mm0 //store
|
|
paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
|
|
|
|
movq mm6, qwTmp5_Z4_Odd
|
|
punpckldq mm7, mm2 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
paddd mm6, mm4 //tmp5 + z2 + z4 ODD
|
|
|
|
movq [edi][DATASIZE*3], mm7 //store
|
|
paddd mm3, mm1 //rounding adj
|
|
|
|
// dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
movq mm0, qwTemp6 //restore
|
|
paddd mm6, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*3+8], mm5 //store
|
|
psrad mm3, 15 // descale it | |R2| |R0|
|
|
|
|
movq [edi][DATASIZE*6], mm0 //store
|
|
psrad mm6, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm7, [edi][DATASIZE*0+16]
|
|
movq mm5, mm3
|
|
|
|
paddw mm7, [edi][DATASIZE*7+16]
|
|
punpckldq mm3, mm6 //| |R1| |R0|
|
|
|
|
movq mm1, [edi][DATASIZE*1+16]
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
|
|
paddw mm1, [edi][DATASIZE*6+16]
|
|
movq mm4, mm7
|
|
|
|
//******************************************************************************
|
|
// This completes 4x8 dct locations. Copy to do other 4x8.
|
|
//******************************************************************************
|
|
|
|
movq mm6, [edi][DATASIZE*3+16]
|
|
|
|
paddw mm6, [edi][DATASIZE*4+16]
|
|
|
|
movq mm2, [edi][DATASIZE*2+16]
|
|
psubw mm4, mm6 //tmp13
|
|
|
|
paddw mm2, [edi][DATASIZE*5+16]
|
|
paddw mm7, mm6 //tmp10
|
|
|
|
movq [edi][DATASIZE*5], mm3 //store
|
|
movq mm6, mm1
|
|
|
|
movq [edi][DATASIZE*5+8], mm5 //store
|
|
paddw mm1, mm2 //tmp11
|
|
|
|
psubw mm6, mm2 //tmp12
|
|
movq mm3, mm7
|
|
|
|
// dataptr[DATASIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
|
|
// dataptr[DATASIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
|
|
|
|
paddw mm7, mm1 //tmp10 + tmp11
|
|
|
|
paddw mm7, Const_2 // round add 2 to each element
|
|
psubw mm3, mm1 //tmp10 - tmp11
|
|
|
|
paddw mm3, Const_2 // round add 2 to each element
|
|
psraw mm7, 2 // descale it
|
|
|
|
// unpack word to dword sign extended
|
|
movq mm5, mm7
|
|
punpcklwd mm7, mm7
|
|
|
|
psrad mm7, 16 // even results store in Temp0
|
|
punpckhwd mm5, mm5
|
|
|
|
psrad mm5, 16 // odd results store in array
|
|
movq mm1, mm6 //copy tmp12
|
|
|
|
movq qwTemp0, mm7 //store
|
|
psraw mm3, 2 // descale it
|
|
|
|
movq [edi][DATASIZE*0+24], mm5
|
|
movq mm5, mm3
|
|
|
|
punpcklwd mm3, mm3
|
|
paddw mm1, mm4 //tmp12 + tmp13
|
|
|
|
psrad mm3, 16 // even results store in Temp4
|
|
movq mm2, mm1 //copy
|
|
|
|
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
|
|
|
|
pmaddwd mm1, Const_0xFIX_0_541196100 //| z12 | z10 |
|
|
punpckhwd mm5, mm5
|
|
|
|
pmaddwd mm2, Const_FIX_0_541196100x0 //| z13 | z11 |
|
|
movq mm7, mm4
|
|
|
|
// dataptr[DATASIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
pmaddwd mm4, Const_FIX_0_765366865x0 //| r3 | r1 |
|
|
psrad mm5, 16 // odd results store in array
|
|
|
|
pmaddwd mm7, Const_0xFIX_0_765366865 //| r2 | r0 |
|
|
movq mm0, mm6
|
|
|
|
// dataptr[DATASIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
pmaddwd mm6, Const_0xFIX_1_847759065 //| r2 | r0 |
|
|
|
|
movq qwTemp4, mm3 //store
|
|
paddd mm4, mm2
|
|
|
|
paddd mm4, Const_16384
|
|
paddd mm7, mm1 // add z1
|
|
|
|
paddd mm7, Const_16384
|
|
psrad mm4, 15 // descale it | |R3| |R1|
|
|
|
|
movq [edi][DATASIZE*4+24], mm5
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
pmaddwd mm0, Const_FIX_1_847759065x0 //| r3 | r1 |
|
|
movq mm5, mm7
|
|
|
|
//!!!!!! Negate result
|
|
movq mm3, Const_1
|
|
punpckldq mm7, mm4 //| |R1| |R0|
|
|
|
|
pxor mm6, Const_FFFF //invert result
|
|
punpckhdq mm5, mm4 //| |R3| |R2|
|
|
|
|
movq qwTemp2, mm7 //store
|
|
paddd mm6, mm3 // 2's complement
|
|
|
|
pxor mm0, Const_FFFF //invert result
|
|
paddd mm6, mm1 // add z1
|
|
|
|
movq [edi][DATASIZE*2+24], mm5 //write out 2nd half in unused memory
|
|
paddd mm0, mm3 // 2's complement
|
|
|
|
movq mm3, Const_16384
|
|
paddd mm0, mm2
|
|
|
|
movq mm7, [edi][DATASIZE*0+16]
|
|
paddd mm6, mm3
|
|
|
|
movq mm4, [edi][DATASIZE*3+16]
|
|
paddd mm0, mm3
|
|
|
|
psubw mm7, [edi][DATASIZE*7+16]
|
|
psrad mm6, 15 // descale it | |R2| |R0|
|
|
|
|
psubw mm4, [edi][DATASIZE*4+16]
|
|
psrad mm0, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm3, [edi][DATASIZE*2+16]
|
|
movq mm5, mm6
|
|
|
|
psubw mm3, [edi][DATASIZE*5+16]
|
|
punpckldq mm6, mm0 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm0 //| |R3| |R2|
|
|
movq mm0, mm4
|
|
|
|
movq qwTemp6, mm6 //store
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = dataptr[DATASIZE*3] - dataptr[DATASIZE*4];
|
|
// z1 = tmp4 + tmp7;
|
|
|
|
movq mm6, [edi][DATASIZE*1+16]
|
|
paddw mm0, mm7 //z1
|
|
|
|
movq [edi][DATASIZE*6+24], mm5 //write out 2nd half in unused memory
|
|
movq mm1, mm3
|
|
|
|
psubw mm6, [edi][DATASIZE*6+16]
|
|
movq mm5, mm3
|
|
|
|
movq qwZ1, mm0 //store
|
|
paddw mm5, mm7 //z4
|
|
|
|
movq qwScratch7, mm7
|
|
paddw mm1, mm6 //z2
|
|
|
|
movq qwScratch5, mm3
|
|
paddw mm2, mm6 //z3
|
|
|
|
movq qwZ2, mm1
|
|
movq mm3, mm2
|
|
|
|
// z3 = MULTIPLY(z3, - FIX_1_961570560);
|
|
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
|
|
// z4 = MULTIPLY(z4, - FIX_0_390180644);
|
|
|
|
movq qwScratch6, mm6
|
|
movq mm1, mm2
|
|
|
|
pmaddwd mm3, Const_0xFIX_1_961570560 //z32, z30
|
|
movq mm7, mm5
|
|
|
|
movq mm6, Const_FFFF
|
|
paddw mm2, mm5 //z3 + z4
|
|
|
|
pmaddwd mm1, Const_FIX_1_961570560x0 //z33, z31
|
|
movq mm0, mm2
|
|
|
|
pmaddwd mm7, Const_FIX_0_390180644x0 //z43, z41
|
|
//!!!!!! Negate the results
|
|
pxor mm3, mm6 //invert result
|
|
|
|
pmaddwd mm5, Const_0xFIX_0_390180644 //z42, z40
|
|
|
|
pmaddwd mm2, Const_0xFIX_1_175875602 //z52, z50
|
|
pxor mm1, mm6 //invert result
|
|
|
|
pmaddwd mm0, Const_FIX_1_175875602x0 //z53, z51
|
|
//!!!!!! Negate the results
|
|
pxor mm7, mm6 //invert result
|
|
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm5, mm6 //invert result
|
|
|
|
// z3 += z5;
|
|
|
|
paddd mm1, Const_1 // 2's complement
|
|
paddd mm3, mm2 //z3_even
|
|
|
|
paddd mm5, Const_1 // 2's complement
|
|
paddd mm1, mm0 //z3_odd
|
|
|
|
// z4 += z5;
|
|
|
|
paddd mm7, Const_1 // 2's complement
|
|
paddd mm5, mm2
|
|
|
|
paddd mm7, mm0
|
|
movq mm2, mm4
|
|
|
|
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336);
|
|
|
|
pmaddwd mm4, Const_0xFIX_0_298631336 //T42, T40
|
|
|
|
pmaddwd mm2, Const_FIX_0_298631336x0 //T43, T41
|
|
|
|
movq qwZ4_even, mm5
|
|
|
|
movq qwZ4_odd, mm7
|
|
paddd mm4, mm3
|
|
|
|
movq mm6, qwScratch6
|
|
paddd mm2, mm1
|
|
|
|
movq qwTmp4_Z3_Even, mm4
|
|
movq mm5, mm6
|
|
|
|
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026);
|
|
|
|
pmaddwd mm6, Const_0xFIX_3_072711026 //T62, T60
|
|
|
|
pmaddwd mm5, Const_FIX_3_072711026x0 //T63, T61
|
|
|
|
movq qwTmp4_Z3_Odd, mm2
|
|
|
|
movq mm4, qwZ4_even
|
|
paddd mm6, mm3
|
|
|
|
movq mm3, qwScratch5
|
|
paddd mm5, mm1
|
|
|
|
movq qwTmp6_Z3_Even, mm6
|
|
movq mm2, mm3
|
|
|
|
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869);
|
|
|
|
pmaddwd mm3, Const_0xFIX_2_053119869 //T52, T50
|
|
|
|
pmaddwd mm2, Const_FIX_2_053119869x0 //T53, T51
|
|
|
|
movq qwTmp6_Z3_Odd, mm5
|
|
|
|
movq mm0, qwZ4_odd
|
|
paddd mm3, mm4
|
|
|
|
movq mm7, qwScratch7
|
|
paddd mm2, mm0
|
|
|
|
movq qwTmp5_Z4_Even, mm3
|
|
movq mm6, mm7
|
|
|
|
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110);
|
|
|
|
pmaddwd mm7, Const_0xFIX_1_501321110 //T72, T70
|
|
|
|
pmaddwd mm6, Const_FIX_1_501321110x0 //T73, T71
|
|
|
|
movq mm3, qwZ1
|
|
|
|
movq qwTmp5_Z4_Odd, mm2
|
|
paddd mm7, mm4
|
|
|
|
movq mm5, Const_FFFF
|
|
movq mm4, mm3
|
|
|
|
// z1 = MULTIPLY(z1, - FIX_0_899976223);
|
|
|
|
pmaddwd mm3, Const_0xFIX_0_899976223 //z12, z10
|
|
paddd mm6, mm0
|
|
|
|
pmaddwd mm4, Const_FIX_0_899976223x0 //z13, z11
|
|
|
|
movq mm2, qwTmp4_Z3_Odd
|
|
//!!!!!! Negate the results
|
|
pxor mm3, mm5 //invert result
|
|
|
|
paddd mm3, Const_1 // 2's complement
|
|
pxor mm4, mm5 //invert result
|
|
|
|
paddd mm4, Const_1 // 2's complement
|
|
paddd mm7, mm3 //tmp7 + z1 + z4 EVEN
|
|
|
|
// dataptr[DATASIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
paddd mm7, Const_16384 //rounding adj
|
|
paddd mm6, mm4 //tmp7 + z1 + z4 ODD
|
|
|
|
paddd mm6, Const_16384 //rounding adj
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm0, qwTmp4_Z3_Even
|
|
psrad mm6, 15 // descale it | |R3| |R1|
|
|
|
|
paddd mm0, mm3 //tmp4 + z1 + z3 EVEN
|
|
movq mm5, mm7
|
|
|
|
movq mm3, qwTemp0 //restore
|
|
punpckldq mm7, mm6 //| |R1| |R0|
|
|
|
|
paddd mm0, Const_16384 //rounding adj
|
|
paddd mm2, mm4 //tmp4 + z1 + z3 ODD
|
|
|
|
movq [edi][DATASIZE*0+16], mm3 //store
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
|
|
// dataptr[DATASIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
paddd mm2, Const_16384 //rounding adj
|
|
psrad mm0, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm6, qwZ2
|
|
psrad mm2, 15 // descale it | |R3| |R1|
|
|
|
|
movq [edi][DATASIZE*1+24], mm5 //store
|
|
movq mm4, mm6
|
|
|
|
// z2 = MULTIPLY(z2, - FIX_2_562915447);
|
|
|
|
pmaddwd mm6, Const_0xFIX_2_562915447 //z22, z20
|
|
movq mm5, mm0
|
|
|
|
pmaddwd mm4, Const_FIX_2_562915447x0 //z23, z21
|
|
punpckldq mm0, mm2 //| |R1| |R0|
|
|
|
|
movq mm3, Const_FFFF
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
|
|
movq [edi][DATASIZE*1+16], mm7 //store
|
|
//!!!!!! Negate the results
|
|
pxor mm6, mm3 //invert result
|
|
|
|
movq mm1, Const_1
|
|
pxor mm4, mm3 //invert result
|
|
|
|
movq mm7, qwTmp6_Z3_Even
|
|
paddd mm6, mm1 // 2's complement
|
|
|
|
movq mm2, qwTmp6_Z3_Odd
|
|
paddd mm4, mm1 // 2's complement
|
|
|
|
// dataptr[DATASIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
movq [edi][DATASIZE*7+16], mm0 //store
|
|
paddd mm7, mm6 //tmp6 + z2 + z3 EVEN
|
|
|
|
movq mm1, Const_16384
|
|
paddd mm2, mm4 //tmp6 + z2 + z3 ODD
|
|
|
|
movq mm3, qwTemp2 //restore
|
|
paddd mm7, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*7+24], mm5 //store
|
|
paddd mm2, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*2+16], mm3 //store
|
|
psrad mm7, 15 // descale it | |R2| |R0|
|
|
|
|
movq mm3, qwTmp5_Z4_Even
|
|
psrad mm2, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm5, mm7
|
|
paddd mm3, mm6 //tmp5 + z2 + z4 EVEN
|
|
|
|
movq mm6, qwTmp5_Z4_Odd
|
|
punpckldq mm7, mm2 //| |R1| |R0|
|
|
|
|
punpckhdq mm5, mm2 //| |R3| |R2|
|
|
paddd mm6, mm4 //tmp5 + z2 + z4 ODD
|
|
|
|
movq [edi][DATASIZE*3+16], mm7 //store
|
|
paddd mm3, mm1 //rounding adj
|
|
|
|
// dataptr[DATASIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
|
|
// CONST_BITS+PASS1_BITS);
|
|
|
|
movq mm7, qwTemp4 //restore
|
|
paddd mm6, mm1 //rounding adj
|
|
|
|
movq [edi][DATASIZE*3+24], mm5 //store
|
|
psrad mm3, 15 // descale it | |R2| |R0|
|
|
|
|
movq [edi][DATASIZE*4+16], mm7 //store
|
|
psrad mm6, 15 // descale it | |R3| |R1|
|
|
|
|
movq mm7, qwTemp6 //restore
|
|
movq mm5, mm3
|
|
|
|
punpckldq mm3, mm6 //| |R1| |R0|
|
|
|
|
movq [edi][DATASIZE*6+16], mm7 //store
|
|
punpckhdq mm5, mm6 //| |R3| |R2|
|
|
|
|
movq [edi][DATASIZE*5+16], mm3 //store
|
|
|
|
movq [edi][DATASIZE*5+24], mm5 //store
|
|
|
|
//******************************************************************************
|
|
// This completes all 8x8 dct locations for the column case.
|
|
//******************************************************************************
|
|
|
|
emms
|
|
}
|
|
}
|
|
|
|
#endif /* DCT_ISLOW_SUPPORTED */
|