1575 lines
33 KiB
NASM
1575 lines
33 KiB
NASM
;***************************************************************************/
|
|
;*
|
|
;* INTEL Corporation Proprietary Information
|
|
;*
|
|
;*
|
|
;* Copyright (c) 1996 Intel Corporation.
|
|
;* All rights reserved.
|
|
;*
|
|
;***************************************************************************/
|
|
; AUTHOR: Kumar Balasubramanian
|
|
;***************************************************************************/
|
|
|
|
;; MMX version of the "integer fast mode" within IJG decompressor code.
|
|
|
|
|
|
.nolist
|
|
include iammx.inc ; IAMMX Emulator Macros
|
|
MMWORD TEXTEQU <DWORD>
|
|
.list
|
|
|
|
.586
|
|
.model flat
|
|
|
|
; Static data for the MMX IDCT: packed constants, a zero quadword and four
; quadword spill slots. The spill slots are written at run time by PackMulW
; and by the IDCT procedure, so code using them shares one global copy.
_DATA SEGMENT PARA PUBLIC USE32 'DATA'
|
|
; words (low to high): 0001h, 0001h, 0002h, 0005h
x0005000200010001 DQ 0005000200010001h
|
|
; only the top word is non-zero (0040h)
x0040000000000000 DQ 40000000000000h
|
|
|
|
; Multipliers consumed through scratch3 by PackMulW (pmaddwd, then psrad 8).
; Each is stored as words (c, 0, c, 0) with c = factor * 256 as a signed word.
; NOTE(review): the label names and the earlier annotations (23170, 21407,
; 17734, 25080) do not match the stored words; the values below are what the
; assembler actually emits.
x5a825a825a825a82 DW 16ah, 0h, 16ah, 0h ; c = 016Ah = 362 ~= 1.414 * 256
|
|
x539f539f539f539f DW 0fd63h, 0h, 0fd63h, 0h ; c = 0FD63h = -669 (signed) ~= -2.613 * 256
|
|
x4546454645464546 DW 115h, 0h, 115h, 0h ; c = 0115h = 277 ~= 1.082 * 256
|
|
x61f861f861f861f8 DW 1d9h, 0h, 1d9h, 0h ; c = 01D9h = 473 ~= 1.847 * 256
|
|
|
|
; four words of 03FFh (the same 10-bit mask value the output code applies with "and eax, 03ffh")
const_mask DQ 3ff03ff03ff03ffh
|
|
; all-zero quadword; PackMulW unpacks against it to pair each word with 0
const_zero DQ 0
|
|
; scratch1: PackMulW operand on entry and result on exit
scratch1 DQ 0
|
|
; scratch3: PackMulW multiplier operand
scratch3 DQ 0
|
|
; scratch5: register spill slot used around PackMulW invocations
scratch5 DQ 0
|
|
; scratch7: register spill slot used around PackMulW invocations
scratch7 DQ 0
|
|
; for debug only
|
|
x0 DQ 0
|
|
|
|
|
|
|
|
; 8 rows of 8 words (64 words), every entry 16384 (4000h).
; NOTE(review): name suggests a pre-scale table -- confirm against its users.
preSC DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
DW 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384
|
|
|
|
|
|
|
|
|
|
_DATA ENDS
|
|
|
|
|
|
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
|
|
|
|
|
|
;-----------------------------------------------------------------------
; PackMulW -- scale the four signed words held in scratch1.
;
; In:    scratch1   = four 16-bit operands w0..w3
;        scratch3   = multiplier words; each operand is paired with a
;                     zero word before pmaddwd, so only the even-indexed
;                     words of scratch3 contribute to the products
;        const_zero = all-zero quadword
; Out:   scratch1 = mm0 = low 16 bits of ((w_i * multiplier) >> 8),
;                   lanes in the original order
; Clobb: mm0, mm1 (mm1 is left holding an intermediate interleave)
;-----------------------------------------------------------------------
PackMulW MACRO

        ;; Fetch the operand vector twice: mm0 handles w0/w1, mm1 handles w2/w3.
        movq        mm0, mmword ptr scratch1
        movq        mm1, mmword ptr scratch1

        ;; Widen each word to a (word, 0) pair so pmaddwd yields one product per dword.
        punpcklwd   mm0, mmword ptr const_zero
        punpckhwd   mm1, mmword ptr const_zero

        ;; 32-bit products, then drop the 8 fractional bits (arithmetic shift).
        pmaddwd     mm0, mmword ptr scratch3
        pmaddwd     mm1, mmword ptr scratch3
        psrad       mm0, 8
        psrad       mm1, 8

        ;; Park the high-half products in memory and keep a copy of the low-half ones.
        movq        mmword ptr scratch1, mm1
        movq        mm1, mm0

        ;; Two-step word interleave that gathers the low word of every dword
        ;; product back into lane order w0, w1, w2, w3.
        punpcklwd   mm0, mmword ptr scratch1
        punpckhwd   mm1, mmword ptr scratch1
        punpcklwd   mm0, mm1

        ;; Publish the packed result for the caller.
        movq        mmword ptr scratch1, mm0

ENDM
|
|
|
|
COMMENT ^
|
|
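NOTE(review): the one-argument prototype below looks stale. The procedure
body reads six dword stack arguments, at esp+32 through esp+52 once the
return address and the seven USES registers are on the stack:
  esp+32  source coefficients (words)
  esp+36  temp results / workspace (words)
  esp+40  quant factors (words)
  esp+44  output_buf  (table of row pointers)
  esp+48  output_col  (added to every row pointer)
  esp+52  range       (byte lookup table)
Exact C types are not visible in this file -- confirm against the caller.

void idct8x8aan (
int16 *src_result);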
|
|
^
|
|
public _idct8x8aan
|
|
_idct8x8aan proc USES eax ebx ecx edx esi edi ebp
|
|
|
|
|
|
|
|
mov ebx, DWORD PTR [esp+32] ; source coeff
|
|
mov esi, DWORD PTR [esp+36] ; temp results
|
|
mov edi, DWORD PTR [esp+40] ; quant factors
|
|
;slot
|
|
|
|
; column 0: even part
|
|
; use V4, V12, V0, V8 to produce V22..V25
|
|
;slot
|
|
|
|
movq mm0, mmword ptr [ebx+8*12] ; V12
|
|
pmullw mm0, mmword ptr [edi+8*12]
|
|
;slot
|
|
|
|
movq mm1, mmword ptr [ebx+8*4] ; V4
|
|
pmullw mm1, mmword ptr [edi+8*4]
|
|
;slot
|
|
|
|
movq mm3, mmword ptr [ebx+8*0] ; V0
|
|
pmullw mm3, mmword ptr [edi+8*0]
|
|
;slot
|
|
|
|
movq mm2, mm1 ; duplicate V4
|
|
|
|
movq mm5, mmword ptr [ebx+8*8] ; V8
|
|
pmullw mm5, mmword ptr [edi+8*8]
|
|
psubw mm1, mm0 ; V16
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
paddw mm2, mm0 ; V17
|
|
|
|
movq mm0, mm2 ; duplicate V17
|
|
|
|
movq mm4, mm3 ; duplicate V0
|
|
|
|
paddw mm3, mm5 ; V19
|
|
psubw mm4, mm5 ; V20 ;mm5 free
|
|
|
|
movq mm6, mm3 ; duplicate t74=t81
|
|
|
|
psubw mm1, mm0 ; V21 ; mm0 free
|
|
paddw mm3, mm2 ; V22
|
|
|
|
movq mm5, mm1 ; duplicate V21
|
|
paddw mm1, mm4 ; V23
|
|
|
|
movq mmword ptr [esi+8*4], mm3 ; V22
|
|
psubw mm4, mm5 ; V24; mm5 free
|
|
|
|
movq mmword ptr [esi+8*12], mm1 ; V23
|
|
psubw mm6, mm2 ; V25; mm2 free
|
|
|
|
movq mmword ptr [esi+8*0], mm4 ; V24
|
|
;slot
|
|
|
|
|
|
movq mm7, mmword ptr [ebx+8*10] ; V10
|
|
pmullw mm7, mmword ptr [edi+8*10]
|
|
;slot
|
|
|
|
movq mm0, mmword ptr [ebx+8*6] ; V6
|
|
pmullw mm0, mmword ptr [edi+8*6]
|
|
;slot
|
|
|
|
movq mm3, mm7 ; duplicate V10
|
|
|
|
movq mm5, mmword ptr [ebx+8*2] ; V2
|
|
pmullw mm5, mmword ptr [edi+8*2]
|
|
;slot
|
|
|
|
psubw mm7, mm0 ; V26
|
|
|
|
movq mm4, mmword ptr [ebx+8*14] ; V14
|
|
pmullw mm4, mmword ptr [edi+8*14]
|
|
paddw mm3, mm0 ; V29 ; free mm0
|
|
|
|
movq mm1, mm7 ; duplicate V26
|
|
|
|
movq mmword ptr scratch1, mm7
|
|
movq mm7, mmword ptr x539f539f539f539f ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm7
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm7, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
movq mm0, mm5 ; duplicate V2
|
|
|
|
paddw mm5, mm4 ; V27
|
|
psubw mm0, mm4 ; V28 ; free mm4
|
|
|
|
movq mm2, mm0 ; duplicate V28
|
|
|
|
|
|
movq mmword ptr scratch1, mm0
|
|
movq mm0, mmword ptr x4546454645464546 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm0, mmword ptr scratch1
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
movq mm4, mm5 ; duplicate t90=t93
|
|
paddw mm1, mm2 ; V32 ; free mm2
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x61f861f861f861f8 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
paddw mm5, mm3 ; V31
|
|
psubw mm4, mm3 ; V30 ; free mm3
|
|
|
|
|
|
movq mmword ptr scratch1, mm4
|
|
movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm4
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm4, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
psubw mm0, mm1 ; V38
|
|
paddw mm1, mm7 ; V37 ; free mm7
|
|
|
|
movq mm3, mm6 ; duplicate V25
|
|
|
|
;move from the next block
|
|
movq mm7, mmword ptr [esi+8*4] ; V22
|
|
|
|
psubw mm1, mm5 ; V39 (mm5 still needed for next block)
|
|
|
|
;move from the next block
|
|
movq mm2, mmword ptr [esi+8*12] ; V23
|
|
|
|
psubw mm4, mm1 ; V40
|
|
|
|
paddw mm0, mm4 ; V41; free mm0
|
|
|
|
; column 0: output butterfly
|
|
|
|
psubw mm6, mm0 ; tm6
|
|
paddw mm3, mm0 ; tm8; free mm1
|
|
|
|
movq mm0, mm1 ; line added by Kumar
|
|
|
|
movq mm1, mm7 ; duplicate V22
|
|
paddw mm7, mm5 ; tm0
|
|
|
|
movq mmword ptr [esi+8*8], mm3 ; tm8; free mm3
|
|
psubw mm1, mm5 ; tm14; free mm5
|
|
|
|
movq mmword ptr [esi+8*6], mm6 ; tm6; free mm6
|
|
movq mm3, mm2 ; duplicate t117=t125
|
|
|
|
movq mm6, mmword ptr [esi+8*0] ; V24
|
|
paddw mm2, mm0 ; tm2
|
|
|
|
movq mmword ptr [esi+8*0], mm7 ; tm0; free mm7
|
|
psubw mm3, mm0 ; tm12; free mm0
|
|
|
|
movq mmword ptr [esi+8*14], mm1 ; tm14; free mm1
|
|
|
|
movq mmword ptr [esi+8*2], mm2 ; tm2; free mm2
|
|
movq mm0, mm6 ; duplicate t119=t123
|
|
|
|
movq mmword ptr [esi+8*12], mm3 ; tm12; free mm3
|
|
paddw mm6, mm4 ; tm4
|
|
|
|
psubw mm0, mm4 ; tm10; free mm4
|
|
|
|
movq mm1, mmword ptr [ebx+8*5] ; V5
|
|
pmullw mm1, mmword ptr [edi+8*5]
|
|
|
|
movq mmword ptr [esi+8*4], mm6 ; tm4; free mm6
|
|
|
|
movq mmword ptr [esi+8*10], mm0 ; tm10; free mm0
|
|
|
|
; column 1: even part
|
|
; use V5, V13, V1, V9 to produce V56..V59
|
|
|
|
movq mm7, mmword ptr [ebx+8*13] ; V13
|
|
pmullw mm7, mmword ptr [edi+8*13]
|
|
movq mm2, mm1 ; duplicate t128=t130
|
|
|
|
movq mm3, mmword ptr [ebx+8*1] ; V1
|
|
pmullw mm3, mmword ptr [edi+8*1]
|
|
|
|
psubw mm1, mm7 ; V50
|
|
|
|
movq mm5, mmword ptr [ebx+8*9] ; V9
|
|
pmullw mm5, mmword ptr [edi+8*9]
|
|
paddw mm2, mm7 ; V51
|
|
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
movq mm6, mm2 ; duplicate V51
|
|
|
|
movq mm4, mm3 ; duplicate V1
|
|
|
|
paddw mm3, mm5 ; V53
|
|
|
|
psubw mm4, mm5 ; V54 ;mm5 free
|
|
movq mm7, mm3 ; duplicate V53
|
|
|
|
psubw mm1, mm6 ; V55 ; mm6 free
|
|
paddw mm3, mm2 ; V56
|
|
|
|
movq mm5, mm4 ; duplicate t140=t142
|
|
paddw mm4, mm1 ; V57
|
|
|
|
movq mmword ptr [esi+8*5], mm3 ; V56
|
|
psubw mm5, mm1 ; V58; mm1 free
|
|
|
|
movq mmword ptr [esi+8*13], mm4 ; V57
|
|
psubw mm7, mm2 ; V59; mm2 free
|
|
|
|
movq mmword ptr [esi+8*9], mm5 ; V58
|
|
|
|
|
|
movq mm0, mmword ptr [ebx+8*11] ; V11
|
|
pmullw mm0, mmword ptr [edi+8*11]
|
|
|
|
movq mm6, mmword ptr [ebx+8*7] ; V7
|
|
pmullw mm6, mmword ptr [edi+8*7]
|
|
|
|
movq mm3, mm0 ; duplicate V11
|
|
|
|
movq mm4, mmword ptr [ebx+8*15] ; V15
|
|
pmullw mm4, mmword ptr [edi+8*15]
|
|
|
|
movq mm5, mmword ptr [ebx+8*3] ; V3
|
|
pmullw mm5, mmword ptr [edi+8*3]
|
|
paddw mm0, mm6 ; V63
|
|
|
|
; note that V15 computation has a correction step:
|
|
; this is a 'magic' constant that rebiases the results to be closer to the expected result
|
|
; this magic constant can be refined to reduce the error even more
|
|
; by doing the correction step in a later stage when the number is actually multiplied by 16
|
|
psubw mm3, mm6 ; V60 ; free mm6
|
|
|
|
movq mm1, mm3 ; duplicate V60
|
|
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x539f539f539f539f ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
movq mm6, mm5 ; duplicate V3
|
|
|
|
paddw mm5, mm4 ; V61
|
|
psubw mm6, mm4 ; V62 ; free mm4
|
|
|
|
movq mm4, mm5 ; duplicate V61
|
|
|
|
paddw mm5, mm0 ; V65 -> result
|
|
psubw mm4, mm0 ; V64 ; free mm0
|
|
|
|
|
|
movq mmword ptr scratch1, mm4
|
|
movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm4
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm4, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
paddw mm3, mm6 ; V66
|
|
movq mm2, mm5 ; duplicate V65
|
|
|
|
|
|
movq mmword ptr scratch1, mm3
|
|
movq mm3, mmword ptr x61f861f861f861f8 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm3
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm3, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
movq mmword ptr scratch1, mm6
|
|
movq mm6, mmword ptr x4546454645464546 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm6
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm6, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
movq mm0, mmword ptr [esi+8*5] ; V56
|
|
|
|
psubw mm6, mm3 ; V72
|
|
paddw mm3, mm1 ; V71 ; free mm1
|
|
|
|
psubw mm3, mm2 ; V73 ; free mm2
|
|
|
|
psubw mm4, mm3 ; V74
|
|
|
|
;moved from next block
|
|
movq mm1, mm0 ; duplicate t177=t188
|
|
|
|
paddw mm6, mm4 ; V75
|
|
|
|
;moved from next block
|
|
paddw mm0, mm5 ; tm1
|
|
|
|
;location
|
|
; 5 - V56
|
|
; 13 - V57
|
|
; 9 - V58
|
|
; X - V59, mm7
|
|
; X - V65, mm5
|
|
; X - V73, mm6
|
|
; X - V74, mm4
|
|
; X - V75, mm3
|
|
; free mm0, mm1 & mm2
|
|
;move above
|
|
|
|
movq mm2, mmword ptr [esi+8*13] ; V57
|
|
psubw mm1, mm5 ; tm15; free mm5
|
|
|
|
movq mmword ptr [esi+8*1], mm0 ; tm1; free mm0
|
|
|
|
;save the store as used directly in the transpose
|
|
;movq mmword ptr [esi+8*15], mm1 ; tm15; free mm1
|
|
movq mm5, mm7 ; duplicate t182=t184
|
|
|
|
psubw mm7, mm6 ; tm7
|
|
|
|
paddw mm5, mm6 ; tm9; free mm3
|
|
;slot
|
|
|
|
movq mm6, mm3
|
|
|
|
movq mm0, mmword ptr [esi+8*9] ; V58
|
|
movq mm3, mm2 ; duplicate V57
|
|
|
|
movq mmword ptr [esi+8*7], mm7 ; tm7; free mm7
|
|
psubw mm3, mm6 ; tm13
|
|
|
|
paddw mm2, mm6 ; tm3 ; free mm6
|
|
|
|
movq mm6, mm0 ; duplicate V58
|
|
|
|
movq mmword ptr [esi+8*3], mm2 ; tm3; free mm2
|
|
paddw mm0, mm4 ; tm5
|
|
|
|
psubw mm6, mm4 ; tm11; free mm4
|
|
|
|
movq mmword ptr [esi+8*5], mm0 ; tm5; free mm0
|
|
|
|
|
|
; transpose the bottom right quadrant(4X4) of the matrix
|
|
; --------- ---------
|
|
; | M1 | M2 | | M1'| M3'|
|
|
; --------- --> ---------
|
|
; | M3 | M4 | | M2'| M4'|
|
|
; --------- ---------
|
|
|
|
movq mm0, mm5 ; copy w4---0,1,3,5,6
|
|
punpcklwd mm5, mm6 ;
|
|
|
|
punpckhwd mm0, mm6 ;---0,1,3,5,6
|
|
movq mm6, mmword ptr [esi+8*0] ;get w0 of top left quadrant
|
|
|
|
movq mm2, mm3 ;---0,1,2,3,5,6
|
|
punpcklwd mm3, mm1 ;
|
|
|
|
movq mm7, mmword ptr [esi+8*2] ;get w1 of top left quadrant
|
|
punpckhwd mm2, mm1 ;---0,2,3,5,6,7
|
|
|
|
movq mm4, mm5 ;---0,2,3,4,5,6,7
|
|
punpckldq mm5, mm3 ; transposed w4
|
|
|
|
movq mmword ptr [esi+8*9], mm5 ; store w4
|
|
punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
|
|
|
|
movq mm3, mm0 ;---0,2,3,4,6,7
|
|
punpckldq mm0, mm2 ; transposed w6
|
|
|
|
movq mmword ptr [esi+8*11], mm4 ; store w5
|
|
punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
|
|
|
|
movq mmword ptr [esi+8*13], mm0 ; store w6---3,5,6,7
|
|
movq mm5, mm6 ; copy w0
|
|
|
|
movq mmword ptr [esi+8*15], mm3 ; store w7---5,6,7
|
|
punpcklwd mm6, mm7
|
|
|
|
; transpose the top left quadrant(4X4) of the matrix
|
|
|
|
punpckhwd mm5, mm7 ;---5,6,7
|
|
movq mm7, mmword ptr [esi+8*4] ; get w2 of TL quadrant
|
|
|
|
movq mm4, mmword ptr [esi+8*6] ; get w3 of TL quadrant
|
|
movq mm3, mm7 ; copy w2---3,4,5,6,7
|
|
|
|
movq mm2, mm6
|
|
punpcklwd mm7, mm4 ;---2,3,4,5,6,7
|
|
|
|
punpckhwd mm3, mm4 ;---2,3,4,5,6,7
|
|
movq mm4, mm5 ;
|
|
|
|
movq mm1, mm5
|
|
punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
|
|
|
|
movq mmword ptr [esi+8*0], mm6 ; store w0 of TL quadrant
|
|
punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
|
|
|
|
movq mmword ptr [esi+8*2], mm2 ; store w1 of TL quadrant
|
|
punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
|
|
|
|
movq mmword ptr [esi+8*4], mm5 ; store w2 of TL quadrant
|
|
punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
|
|
|
|
movq mmword ptr [esi+8*6], mm1 ; store w3 of TL quadrant
|
|
|
|
|
|
; transpose the top right quadrant(4X4) of the matrix
|
|
|
|
movq mm0, mmword ptr [esi+8*1] ;---0
|
|
|
|
movq mm1, mmword ptr [esi+8*3] ;---0,1,2
|
|
movq mm2, mm0
|
|
|
|
movq mm3, mmword ptr [esi+8*5]
|
|
punpcklwd mm0, mm1 ;---0,1,2,3
|
|
|
|
punpckhwd mm2, mm1
|
|
movq mm1, mmword ptr [esi+8*7] ;---0,1,2,3
|
|
|
|
movq mm4, mm3
|
|
punpcklwd mm3, mm1 ;---0,1,2,3,4
|
|
|
|
punpckhwd mm4, mm1 ;---0,1,2,3,4
|
|
movq mm1, mm0
|
|
|
|
movq mm5, mm2
|
|
punpckldq mm0, mm3 ;---0,1,2,3,4,5
|
|
|
|
punpckhdq mm1, mm3 ;---0,1,2,3,4,5
|
|
movq mm3, mmword ptr [esi+8*8]
|
|
|
|
movq mmword ptr [esi+8*8], mm0
|
|
punpckldq mm2, mm4 ;---1,2,3,4,5
|
|
|
|
punpckhdq mm5, mm4 ;---1,2,3,4,5
|
|
movq mm4, mmword ptr [esi+8*10]
|
|
|
|
; transpose the bottom left quadrant(4X4) of the matrix
|
|
; Also store w1,w2,w3 of top right quadrant into
|
|
; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
|
|
; of BL is already done.
|
|
|
|
movq mmword ptr [esi+8*10], mm1
|
|
movq mm1, mm3 ;---1,2,3,4,5
|
|
|
|
movq mm0, mmword ptr [esi+8*12]
|
|
punpcklwd mm3, mm4 ;---0,1,2,3,4,5
|
|
|
|
punpckhwd mm1, mm4 ;---0,1,2,3,4,5
|
|
movq mm4, mmword ptr [esi+8*14]
|
|
|
|
movq mmword ptr [esi+8*12], mm2
|
|
movq mm2, mm0
|
|
|
|
movq mmword ptr [esi+8*14], mm5
|
|
punpcklwd mm0, mm4 ;---0,1,2,3,4
|
|
|
|
punpckhwd mm2, mm4 ;---0,1,2,3,4
|
|
movq mm4, mm3
|
|
|
|
movq mm5, mm1
|
|
punpckldq mm3, mm0 ;---0,1,2,3,4,5
|
|
|
|
movq mmword ptr [esi+8*1], mm3
|
|
punpckhdq mm4, mm0 ;---1,2,4,5
|
|
|
|
movq mmword ptr [esi+8*3], mm4
|
|
punpckldq mm1, mm2 ;---1,2,5
|
|
|
|
movq mmword ptr [esi+8*5], mm1
|
|
punpckhdq mm5, mm2 ;---5
|
|
|
|
movq mmword ptr [esi+8*7], mm5
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
;;;;;;;;; 1D IDCT of the rows ;;;;;;;;;;;
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
|
|
|
|
|
|
mov esi, DWORD PTR [esp+36] ; source
|
|
;slot
|
|
|
|
; column 0: even part
|
|
; use V4, V12, V0, V8 to produce V22..V25
|
|
|
|
movq mm0, mmword ptr [esi+8*12] ; V12
|
|
|
|
movq mm1, mmword ptr [esi+8*4] ; V4
|
|
|
|
movq mm3, mmword ptr [esi+8*0] ; V0
|
|
|
|
movq mm2, mm1 ; duplicate V4
|
|
|
|
movq mm5, mmword ptr [esi+8*8] ; V8
|
|
psubw mm1, mm0 ; V16
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
paddw mm2, mm0 ; V17
|
|
|
|
movq mm0, mm2 ; duplicate V17
|
|
|
|
movq mm4, mm3 ; duplicate V0
|
|
|
|
paddw mm3, mm5 ; V19
|
|
psubw mm4, mm5 ; V20 ;mm5 free
|
|
|
|
;moved from the block below
|
|
movq mm6, mm3 ; duplicate t74=t81
|
|
|
|
psubw mm1, mm0 ; V21 ; mm0 free
|
|
paddw mm3, mm2 ; V22
|
|
|
|
movq mm5, mm1 ; duplicate V21
|
|
paddw mm1, mm4 ; V23
|
|
|
|
movq mmword ptr [esi+8*4], mm3 ; V22
|
|
psubw mm4, mm5 ; V24; mm5 free
|
|
|
|
movq mmword ptr [esi+8*12], mm1 ; V23
|
|
psubw mm6, mm2 ; V25; mm2 free
|
|
|
|
movq mmword ptr [esi+8*0], mm4 ; V24
|
|
|
|
; keep mm6 alive all along the next block
|
|
|
|
; column 0: odd part
|
|
; use V2, V6, V10, V14 to produce V31, V39, V40, V41
|
|
|
|
|
|
movq mm7, mmword ptr [esi+8*10] ; V10
|
|
|
|
movq mm0, mmword ptr [esi+8*6] ; V6
|
|
|
|
movq mm3, mm7 ; duplicate V10
|
|
|
|
movq mm5, mmword ptr [esi+8*2] ; V2
|
|
|
|
psubw mm7, mm0 ; V26
|
|
|
|
movq mm4, mmword ptr [esi+8*14] ; V14
|
|
paddw mm3, mm0 ; V29 ; free mm0
|
|
|
|
movq mm1, mm7 ; duplicate V26
|
|
|
|
movq mmword ptr scratch1, mm7
|
|
movq mm7, mmword ptr x539f539f539f539f ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm7
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm7, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
|
|
movq mm0, mm5 ; duplicate V2
|
|
|
|
paddw mm5, mm4 ; V27
|
|
psubw mm0, mm4 ; V28 ; free mm4
|
|
|
|
movq mm2, mm0 ; duplicate V28
|
|
|
|
|
|
movq mmword ptr scratch1, mm0
|
|
movq mm0, mmword ptr x4546454645464546 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm0, mmword ptr scratch1
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
movq mm4, mm5 ; duplicate t90=t93
|
|
paddw mm1, mm2 ; V32 ; free mm2
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x61f861f861f861f8 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
|
|
paddw mm5, mm3 ; V31
|
|
psubw mm4, mm3 ; V30 ; free mm3
|
|
|
|
|
|
movq mmword ptr scratch1, mm4
|
|
movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm4
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm4, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
psubw mm0, mm1 ; V38
|
|
paddw mm1, mm7 ; V37 ; free mm7
|
|
|
|
;move from the next block
|
|
movq mm3, mm6 ; duplicate V25
|
|
|
|
;move from the next block
|
|
movq mm7, mmword ptr [esi+8*4] ; V22
|
|
|
|
psubw mm1, mm5 ; V39 (mm5 still needed for next block)
|
|
|
|
;move from the next block
|
|
movq mm2, mmword ptr [esi+8*12] ; V23
|
|
|
|
psubw mm4, mm1 ; V40
|
|
|
|
paddw mm0, mm4 ; V41; free mm0
|
|
;move from the next block
|
|
|
|
; column 0: output butterfly
|
|
;move above
|
|
psubw mm6, mm0 ; tm6
|
|
paddw mm3, mm0 ; tm8; free mm1
|
|
|
|
movq mm0, mm1 ; line added by Kumar
|
|
|
|
movq mm1, mm7 ; duplicate V22
|
|
paddw mm7, mm5 ; tm0
|
|
|
|
movq mmword ptr [esi+8*8], mm3 ; tm8; free mm3
|
|
psubw mm1, mm5 ; tm14; free mm5
|
|
|
|
movq mmword ptr [esi+8*6], mm6 ; tm6; free mm6
|
|
movq mm3, mm2 ; duplicate t117=t125
|
|
|
|
movq mm6, mmword ptr [esi+8*0] ; V24
|
|
paddw mm2, mm0 ; tm2
|
|
|
|
movq mmword ptr [esi+8*0], mm7 ; tm0; free mm7
|
|
psubw mm3, mm0 ; tm12; free mm0
|
|
|
|
movq mmword ptr [esi+8*14], mm1 ; tm14; free mm1
|
|
|
|
movq mmword ptr [esi+8*2], mm2 ; tm2; free mm2
|
|
movq mm0, mm6 ; duplicate t119=t123
|
|
|
|
movq mmword ptr [esi+8*12], mm3 ; tm12; free mm3
|
|
paddw mm6, mm4 ; tm4
|
|
|
|
;moved from next block
|
|
psubw mm0, mm4 ; tm10; free mm4
|
|
|
|
;moved from next block
|
|
movq mm1, mmword ptr [esi+8*5] ; V5
|
|
|
|
movq mmword ptr [esi+8*4], mm6 ; tm4; free mm6
|
|
|
|
movq mmword ptr [esi+8*10], mm0 ; tm10; free mm0
|
|
|
|
; column 1: even part
|
|
; use V5, V13, V1, V9 to produce V56..V59
|
|
;moved to prev block
|
|
|
|
movq mm7, mmword ptr [esi+8*13] ; V13
|
|
movq mm2, mm1 ; duplicate t128=t130
|
|
|
|
movq mm3, mmword ptr [esi+8*1] ; V1
|
|
|
|
psubw mm1, mm7 ; V50
|
|
|
|
movq mm5, mmword ptr [esi+8*9] ; V9
|
|
paddw mm2, mm7 ; V51
|
|
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
|
|
movq mm6, mm2 ; duplicate V51
|
|
|
|
movq mm4, mm3 ; duplicate V1
|
|
|
|
paddw mm3, mm5 ; V53
|
|
|
|
psubw mm4, mm5 ; V54 ;mm5 free
|
|
movq mm7, mm3 ; duplicate V53
|
|
|
|
;moved from next block
|
|
|
|
psubw mm1, mm6 ; V55 ; mm6 free
|
|
paddw mm3, mm2 ; V56
|
|
|
|
movq mm5, mm4 ; duplicate t140=t142
|
|
paddw mm4, mm1 ; V57
|
|
|
|
movq mmword ptr [esi+8*5], mm3 ; V56
|
|
psubw mm5, mm1 ; V58; mm1 free
|
|
|
|
movq mmword ptr [esi+8*13], mm4 ; V57
|
|
psubw mm7, mm2 ; V59; mm2 free
|
|
|
|
movq mmword ptr [esi+8*9], mm5 ; V58
|
|
|
|
; keep mm7 alive all along the next block
|
|
|
|
movq mm0, mmword ptr [esi+8*11] ; V11
|
|
|
|
|
|
movq mm6, mmword ptr [esi+8*7] ; V7
|
|
|
|
movq mm3, mm0 ; duplicate V11
|
|
|
|
movq mm4, mmword ptr [esi+8*15] ; V15
|
|
|
|
|
|
movq mm5, mmword ptr [esi+8*3] ; V3
|
|
paddw mm0, mm6 ; V63
|
|
|
|
; note that V15 computation has a correction step:
|
|
; this is a 'magic' constant that rebiases the results to be closer to the expected result
|
|
; this magic constant can be refined to reduce the error even more
|
|
; by doing the correction step in a later stage when the number is actually multiplied by 16
|
|
psubw mm3, mm6 ; V60 ; free mm6
|
|
|
|
movq mm1, mm3 ; duplicate V60
|
|
|
|
|
|
movq mmword ptr scratch1, mm1
|
|
movq mm1, mmword ptr x539f539f539f539f ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm1
|
|
movq mmword ptr scratch5, mm0
|
|
PackMulW
|
|
movq mm1, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
|
|
|
|
|
|
movq mm6, mm5 ; duplicate V3
|
|
|
|
|
|
paddw mm5, mm4 ; V61
|
|
psubw mm6, mm4 ; V62 ; free mm4
|
|
|
|
movq mm4, mm5 ; duplicate V61
|
|
|
|
paddw mm5, mm0 ; V65 -> result
|
|
psubw mm4, mm0 ; V64 ; free mm0
|
|
|
|
|
|
movq mmword ptr scratch1, mm4
|
|
movq mm4, mmword ptr x5a825a825a825a82 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm4
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm4, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
paddw mm3, mm6 ; V66
|
|
movq mm2, mm5 ; duplicate V65
|
|
|
|
|
|
movq mmword ptr scratch1, mm3
|
|
movq mm3, mmword ptr x61f861f861f861f8 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm3
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm3, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
|
|
movq mmword ptr scratch1, mm6
|
|
movq mm6, mmword ptr x4546454645464546 ; 23170 ->V18
|
|
movq mmword ptr scratch3, mm6
|
|
movq mmword ptr scratch5, mm0
|
|
movq mmword ptr scratch7, mm1
|
|
PackMulW
|
|
movq mm6, mmword ptr scratch1
|
|
movq mm0, mmword ptr scratch5
|
|
movq mm1, mmword ptr scratch7
|
|
|
|
;moved from next block
|
|
movq mm0, mmword ptr [esi+8*5] ; V56
|
|
|
|
|
|
psubw mm6, mm3 ; V72
|
|
paddw mm3, mm1 ; V71 ; free mm1
|
|
|
|
psubw mm3, mm2 ; V73 ; free mm2
|
|
|
|
psubw mm4, mm3 ; V74
|
|
|
|
;moved from next block
|
|
movq mm1, mm0 ; duplicate t177=t188
|
|
|
|
paddw mm6, mm4 ; V75
|
|
|
|
;moved from next block
|
|
paddw mm0, mm5 ; tm1
|
|
|
|
;location
|
|
; 5 - V56
|
|
; 13 - V57
|
|
; 9 - V58
|
|
; X - V59, mm7
|
|
; X - V65, mm5
|
|
; X - V73, mm6
|
|
; X - V74, mm4
|
|
; X - V75, mm3
|
|
; free mm0, mm1 & mm2
|
|
;move above
|
|
|
|
movq mm2, mmword ptr [esi+8*13] ; V57
|
|
psubw mm1, mm5 ; tm15; free mm5
|
|
|
|
movq mmword ptr [esi+8*1], mm0 ; tm1; free mm0
|
|
|
|
;save the store as used directly in the transpose
|
|
movq mm5, mm7 ; duplicate t182=t184
|
|
|
|
psubw mm7, mm6 ; tm7
|
|
|
|
paddw mm5, mm6 ; tm9; free mm3
|
|
|
|
movq mm6, mm3
|
|
|
|
movq mm0, mmword ptr [esi+8*9] ; V58
|
|
movq mm3, mm2 ; duplicate V57
|
|
|
|
movq mmword ptr [esi+8*7], mm7 ; tm7; free mm7
|
|
psubw mm3, mm6 ; tm13
|
|
|
|
paddw mm2, mm6 ; tm3 ; free mm6
|
|
|
|
movq mm6, mm0 ; duplicate V58
|
|
|
|
movq mmword ptr [esi+8*3], mm2 ; tm3; free mm2
|
|
paddw mm0, mm4 ; tm5
|
|
|
|
psubw mm6, mm4 ; tm11; free mm4
|
|
|
|
movq mmword ptr [esi+8*5], mm0 ; tm5; free mm0
|
|
|
|
|
|
|
|
; Final results to be stored after the transpose
|
|
; transpose the bottom right quadrant(4X4) of the matrix
|
|
; --------- ---------
|
|
; | M1 | M2 | | M1'| M3'|
|
|
; --------- --> ---------
|
|
; | M3 | M4 | | M2'| M4'|
|
|
; --------- ---------
|
|
;
|
|
; get the pointer to array "range"
|
|
mov edi, [esp+52]
|
|
|
|
; calculate the destination address
|
|
mov ebp, [esp+44] ; get output_buf[4]
|
|
|
|
mov ebx, [ebp+20]
|
|
mov ecx, [ebp+24]
|
|
mov edx, [ebp+28]
|
|
mov ebp, [ebp+16]
|
|
|
|
add ebp, [esp+48] ; add to output_col
|
|
add ebx, [esp+48] ; add to output_col
|
|
add ecx, [esp+48] ; add to output_col
|
|
add edx, [esp+48] ; add to output_col
|
|
|
|
|
|
movq mm0, mm5 ; copy w4---0,1,3,5,6
|
|
punpcklwd mm5, mm6 ;
|
|
|
|
punpckhwd mm0, mm6 ;---0,1,3,5,6
|
|
movq mm6, mmword ptr [esi+8*0] ;get w0 of top left quadrant
|
|
|
|
movq mm2, mm3 ;---0,1,2,3,5,6
|
|
punpcklwd mm3, mm1 ;
|
|
|
|
movq mm7, mmword ptr [esi+8*2] ;get w1 of top left quadrant
|
|
punpckhwd mm2, mm1 ;---0,2,3,5,6,7
|
|
|
|
movq mm4, mm5 ;---0,2,3,4,5,6,7
|
|
punpckldq mm5, mm3 ; transposed w4
|
|
|
|
psrlw mm5, 5
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+4], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+5], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+6], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+7], al
|
|
|
|
|
|
punpckhdq mm4, mm3 ; transposed w5---0,2,4,6,7
|
|
|
|
movq mm3, mm0 ;---0,2,3,4,6,7
|
|
punpckldq mm0, mm2 ; transposed w6
|
|
|
|
psrlw mm4, 5
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+4], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+5], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+6], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+7], al
|
|
|
|
punpckhdq mm3, mm2 ; transposed w7---0,3,6,7
|
|
|
|
psrlw mm0, 5
|
|
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+4], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+5], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+6], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+7], al
|
|
|
|
movq mm5, mm6 ; copy w0
|
|
|
|
psrlw mm3, 5
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+4], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+5], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+6], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+7], al
|
|
|
|
punpcklwd mm6, mm7
|
|
|
|
; transpose the top left quadrant(4X4) of the matrix
|
|
|
|
; calculate the destination address
|
|
mov ebp, [esp+44] ; get output_buf[0]
|
|
|
|
mov ebx, [ebp+4]
|
|
mov ecx, [ebp+8]
|
|
mov edx, [ebp+12]
|
|
mov ebp, [ebp+0]
|
|
|
|
add ebp, [esp+48] ; add to output_col
|
|
add ebx, [esp+48] ; add to output_col
|
|
add ecx, [esp+48] ; add to output_col
|
|
add edx, [esp+48] ; add to output_col
|
|
|
|
punpckhwd mm5, mm7 ;---5,6,7
|
|
movq mm7, mmword ptr [esi+8*4] ; get w2 of TL quadrant
|
|
|
|
movq mm4, mmword ptr [esi+8*6] ; get w3 of TL quadrant
|
|
movq mm3, mm7 ; copy w2---3,4,5,6,7
|
|
|
|
movq mm2, mm6
|
|
punpcklwd mm7, mm4 ;---2,3,4,5,6,7
|
|
|
|
punpckhwd mm3, mm4 ;---2,3,4,5,6,7
|
|
movq mm4, mm5 ;
|
|
|
|
movq mm1, mm5
|
|
punpckldq mm6, mm7 ;---1,2,3,4,5,6,7
|
|
|
|
psrlw mm6, 5
|
|
movd eax, mm6
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp], al
|
|
|
|
psrlq mm6, 16
|
|
movd eax, mm6
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+1], al
|
|
|
|
psrlq mm6, 16
|
|
movd eax, mm6
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+2], al
|
|
|
|
psrlq mm6, 16
|
|
movd eax, mm6
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+3], al
|
|
|
|
punpckhdq mm2, mm7 ;---1,2,3,4,5,6,7
|
|
|
|
psrlw mm2, 5
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+1], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+2], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+3], al
|
|
|
|
punpckldq mm5, mm3 ;---1,2,3,4,5,6,7
|
|
|
|
psrlw mm5, 5
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+1], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+2], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+3], al
|
|
|
|
punpckhdq mm1, mm3 ;---1,2,3,4,5,6,7
|
|
|
|
psrlw mm1, 5
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+1], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+2], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+3], al
|
|
|
|
|
|
|
|
; transpose the top right quadrant(4X4) of the matrix
|
|
|
|
; calculate the destination address for **bottom left quadrant
|
|
mov ebp, [esp+44] ; get output_buf[4]
|
|
|
|
mov ebx, [ebp+20]
|
|
mov ecx, [ebp+24]
|
|
mov edx, [ebp+28]
|
|
mov ebp, [ebp+16]
|
|
|
|
add ebp, [esp+48] ; add to output_col
|
|
add ebx, [esp+48] ; add to output_col
|
|
add ecx, [esp+48] ; add to output_col
|
|
add edx, [esp+48] ; add to output_col
|
|
|
|
|
|
movq mm0, mmword ptr [esi+8*1] ;---0
|
|
|
|
movq mm1, mmword ptr [esi+8*3] ;---0,1,2
|
|
movq mm2, mm0
|
|
|
|
movq mm3, mmword ptr [esi+8*5]
|
|
punpcklwd mm0, mm1 ;---0,1,2,3
|
|
|
|
punpckhwd mm2, mm1
|
|
movq mm1, mmword ptr [esi+8*7] ;---0,1,2,3
|
|
|
|
movq mm4, mm3
|
|
punpcklwd mm3, mm1 ;---0,1,2,3,4
|
|
|
|
punpckhwd mm4, mm1 ;---0,1,2,3,4
|
|
movq mm1, mm0
|
|
|
|
movq mm5, mm2
|
|
punpckldq mm0, mm3 ;---0,1,2,3,4,5
|
|
|
|
punpckhdq mm1, mm3 ;---0,1,2,3,4,5
|
|
movq mm3, mmword ptr [esi+8*8]
|
|
|
|
psrlw mm0, 5
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+1], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+2], al
|
|
|
|
psrlq mm0, 16
|
|
movd eax, mm0
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+3], al
|
|
|
|
|
|
punpckldq mm2, mm4 ;---1,2,3,4,5
|
|
|
|
punpckhdq mm5, mm4 ;---1,2,3,4,5
|
|
movq mm4, mmword ptr [esi+8*10]
|
|
|
|
; transpose the bottom left quadrant(4X4) of the matrix
|
|
; Also store w1,w2,w3 of top right quadrant into
|
|
; w5,w6,w7 of bottom left quadrant. Storing w0 of TR in w4
|
|
; of BL is already done.
|
|
|
|
psrlw mm1, 5
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+1], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+2], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+3], al
|
|
|
|
movq mm1, mm3 ;---1,2,3,4,5
|
|
|
|
movq mm0, mmword ptr [esi+8*12]
|
|
punpcklwd mm3, mm4 ;---0,1,2,3,4,5
|
|
|
|
punpckhwd mm1, mm4 ;---0,1,2,3,4,5
|
|
movq mm4, mmword ptr [esi+8*14]
|
|
|
|
psrlw mm2, 5
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+1], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+2], al
|
|
|
|
psrlq mm2, 16
|
|
movd eax, mm2
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+3], al
|
|
|
|
|
|
movq mm2, mm0
|
|
|
|
psrlw mm5, 5
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx], al
|
|
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+1], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+2], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+3], al
|
|
|
|
|
|
punpcklwd mm0, mm4 ;---0,1,2,3,4
|
|
|
|
punpckhwd mm2, mm4 ;---0,1,2,3,4
|
|
movq mm4, mm3
|
|
|
|
movq mm5, mm1
|
|
punpckldq mm3, mm0 ;---0,1,2,3,4,5
|
|
|
|
; calculate the destination addresses for the top right quadrant of the output (rows 0-3, columns 4-7)
|
|
mov ebp, [esp+44] ; get output_buf[0]
|
|
|
|
mov ebx, [ebp+4]
|
|
mov ecx, [ebp+8]
|
|
mov edx, [ebp+12]
|
|
mov ebp, [ebp+0]
|
|
|
|
add ebp, [esp+48] ; add to output_col
|
|
add ebx, [esp+48] ; add to output_col
|
|
add ecx, [esp+48] ; add to output_col
|
|
add edx, [esp+48] ; add to output_col
|
|
|
|
|
|
psrlw mm3, 5
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+4], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+5], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+6], al
|
|
|
|
psrlq mm3, 16
|
|
movd eax, mm3
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebp+7], al
|
|
|
|
|
|
punpckhdq mm4, mm0 ;---1,2,4,5
|
|
|
|
psrlw mm4, 5
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+4], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+5], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+6], al
|
|
|
|
psrlq mm4, 16
|
|
movd eax, mm4
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ebx+7], al
|
|
|
|
|
|
punpckldq mm1, mm2 ;---1,2,5
|
|
|
|
psrlw mm1, 5
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+4], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+5], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+6], al
|
|
|
|
psrlq mm1, 16
|
|
movd eax, mm1
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [ecx+7], al
|
|
|
|
|
|
punpckhdq mm5, mm2 ;---5
|
|
|
|
psrlw mm5, 5
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+4], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+5], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+6], al
|
|
|
|
psrlq mm5, 16
|
|
movd eax, mm5
|
|
and eax, 03ffh
|
|
mov al, byte ptr [edi][eax]
|
|
mov byte ptr [edx+7], al
|
|
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
|
|
emms
|
|
|
|
|
|
ret
|
|
|
|
_idct8x8aan ENDP
|
|
_TEXT ENDS
|
|
|
|
END
|