1452 lines
42 KiB
C++
1452 lines
42 KiB
C++
//
|
|
// Copyright (c) 1996-2000 Microsoft Corporation. All rights reserved.
|
|
// Mmx.cpp
|
|
// MMX Mix engines for Microsoft synth
|
|
|
|
/*
|
|
Variable usage.
|
|
|
|
Variable register
|
|
pfSamplePos eax
|
|
pfPitch ebx
|
|
dwI ecx
|
|
dwIncDelta edx (edx is sometimes a temporary register)
|
|
dwPosition1 esi
|
|
dwPosition2 edi
|
|
|
|
vfLVFract and vfRVFract (volumes << 8) mm0
|
|
vfRVolume, vfLVolume mm2
|
|
|
|
mm4 - mm7 are temporary mmx registers.
|
|
*/
|
|
|
|
// Notes about calculation.
|
|
|
|
// Loop is unrolled once.
|
|
// *1 shifting volume to 15 bit values to get rid of shifts and simplify code.
// This makes the packed multiply work better later since I keep the interpolated
// wave value as a 16 bit signed value. For a PMULHW, this results in 15 bit results
// which is the same as the original code.
|
|
|
|
|
|
// *2 linear interpolation can be done very quickly with MMX by re-arranging the
|
|
// way that the interpolation is done. Here is code in C that shows the difference.
|
|
// Original C code
|
|
//lM1 = ((pcWave[dwPosition1 + 1] - pcWave[dwPosition1]) * dwFract1) >> 12;
|
|
//lM2 = ((pcWave[dwPosition2 + 1] - pcWave[dwPosition2]) * dwFract2) >> 12;
|
|
//lM1 += pcWave[dwPosition1];
|
|
//lM2 += pcWave[dwPosition2];
|
|
|
|
// Equivalent C Code that can be done with a pmadd
|
|
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
|
|
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
|
|
|
|
|
|
#include "common.h"
|
|
|
|
#define STR_MODULENAME "DDKSynth.sys:MMX: "
|
|
|
|
typedef unsigned __int64 QWORD;
|
|
|
|
#pragma code_seg()
|
|
/*****************************************************************************
|
|
* CDigitalAudio::MixMono8X()
|
|
*****************************************************************************
|
|
* Implement a mono eight-bit mix.
|
|
* Heavily optimized for MMX.
|
|
*/
|
|
DWORD CDigitalAudio::MixMono8X(short * pBuffer, DWORD dwLength,
|
|
DWORD dwDeltaPeriod, VFRACT vfDeltaVolume,
|
|
PFRACT pfDeltaPitch, PFRACT pfSampleLength,
|
|
PFRACT pfLoopLength)
|
|
{
|
|
DWORD dwI,dwIncDelta = dwDeltaPeriod;
|
|
|
|
char * pcWave = (char *) m_pnWave;
|
|
PFRACT pfSamplePos = m_pfLastSample;
|
|
VFRACT vfVolume = m_vfLastLVolume;
|
|
PFRACT pfPitch = m_pfLastPitch;
|
|
PFRACT pfPFract = pfPitch << 8;
|
|
VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
|
|
|
|
|
|
QWORD dwFractMASK = 0x000000000FFF0FFF;
|
|
QWORD dwFractOne = 0x0000000010001000;
|
|
QWORD wordmask = 0x0000FFFF0000FFFF;
|
|
QWORD vfDeltaLandRVolume;
|
|
|
|
_asm{
|
|
|
|
// vfLVFract and vfRVFract are in mm0
|
|
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
|
|
//VFRACT vfRVFract = vfRVolume1 << 8;
|
|
|
|
movd mm0, vfVolume
|
|
movd mm7, vfVolume
|
|
|
|
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
|
|
movd mm1, vfDeltaVolume
|
|
movd mm6, vfDeltaVolume
|
|
|
|
punpckldq mm1, mm6
|
|
|
|
// dwI = 0
|
|
mov ecx, 0
|
|
movq vfDeltaLandRVolume, mm1
|
|
|
|
|
|
movq mm1, dwFractOne
|
|
movq mm4, dwFractMASK
|
|
|
|
mov eax, pfSamplePos
|
|
|
|
|
|
punpckldq mm0, mm7
|
|
mov ebx, pfPitch
|
|
|
|
pslld mm0, 8
|
|
mov edx, dwIncDelta
|
|
|
|
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
|
|
// need to be set before first pass.
|
|
|
|
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
|
|
psrld mm2, 5
|
|
|
|
//for (dwI = 0; dwI < dwLength; )
|
|
//{
|
|
mainloop:
|
|
cmp ecx, dwLength
|
|
jae done
|
|
|
|
|
|
|
|
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
|
|
jb NotPastEndOfSample1 //{
|
|
|
|
cmp pfLoopLength, 0 //if (!pfLoopLength)
|
|
|
|
je done // break;
|
|
|
|
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
|
|
|
|
NotPastEndOfSample1: //}
|
|
|
|
mov esi, eax // dwPosition1 = pfSamplePos;
|
|
add eax, ebx // pfSamplePos += pfPitch;
|
|
|
|
sub edx, 2 // dwIncDelta-=2;
|
|
jnz DontIncreaseValues1 //if (!dwIncDelta) {
|
|
|
|
// Since edx was use for dwIncDelta and now its zero, we can use if for a temporary
|
|
// for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
|
|
// if it goes below zero.
|
|
|
|
paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
|
|
// vfVFract += vfDeltaVolume;
|
|
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
|
|
|
|
|
|
mov edx, pfPFract // Temp = pfPFract;
|
|
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
|
|
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
|
|
|
|
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
|
|
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
|
|
// TestRVol = vfRVFract & (~TestRVol);
|
|
|
|
mov pfPFract, edx // pfPFract = Temp;
|
|
movq mm2, mm5 // vfLVolume = TestLVol;
|
|
// vfRVolume = TestRVol;
|
|
|
|
|
|
shr edx, 8 // Temp = Temp >> 8;
|
|
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
|
|
// vfRVolume = vfRVolume >> 5;
|
|
|
|
mov ebx, edx // pfPitch = Temp;
|
|
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
|
|
|
|
//}
|
|
DontIncreaseValues1:
|
|
|
|
movd mm6, esi // dwFract1 = dwPosition1;
|
|
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
|
|
|
|
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
|
|
inc ecx //dwI++;
|
|
|
|
// if ( dwI < dwLength) break;
|
|
cmp ecx, dwLength
|
|
jae StoreOne
|
|
|
|
//if (pfSamplePos >= pfSampleLength)
|
|
//{
|
|
cmp eax, pfSampleLength
|
|
jb NotPastEndOfSample2
|
|
|
|
// Original if in C was not negated
|
|
//if (!pfLoopLength)
|
|
cmp pfLoopLength, 0
|
|
//break;
|
|
je StoreOne
|
|
//else
|
|
//pfSamplePos -= pfLoopLength;
|
|
sub eax, pfLoopLength
|
|
//}
|
|
NotPastEndOfSample2:
|
|
|
|
//shl esi, 1 // do not shift left since pcWave is array of chars
|
|
mov edi, eax // dwPosition2 = pfSamplePos;
|
|
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
movd mm7, eax // dwFract2 = pfSamplePos;
|
|
|
|
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
|
|
punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
|
|
// 0, 0, dwFract2, dwFract1
|
|
|
|
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
|
|
|
|
movzx esi, word ptr[esi] //lLM1 = pcWave[dwPosition1];
|
|
movd mm3, esi
|
|
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
|
|
|
|
//shl edi, 1 //do not shift left since pcWave is array of chars
|
|
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
|
|
|
|
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
|
|
mov esi, ecx // Temp = dWI;
|
|
|
|
shl esi, 1 // Temp = Temp << 1;
|
|
|
|
movzx edi, word ptr[edi] //lLM2 = pcWave[dwPoisition2];
|
|
movd mm6, edi
|
|
|
|
pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
|
|
|
|
// low 4 bytes in mm3
|
|
punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
add esi, pBuffer //
|
|
punpcklbw mm7, mm3 // low four bytes bytes in
|
|
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
pmaddwd mm7, mm5 // high dword = lM2 =
|
|
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
|
|
// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
movq mm3, mm2 // put left and right volume levels in mm3
|
|
add eax, ebx //pfSamplePos += pfPitch;
|
|
|
|
packssdw mm3, mm2 // words in mm7
|
|
// vfVolume, vfVolume, vfVolume, vfVolume
|
|
|
|
movd mm5, dword ptr[esi-2] // Load values from buffer
|
|
inc ecx // dwI++;
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits.
|
|
|
|
packssdw mm7, mm4 // only need one word in mono case.
|
|
// low word are lm2 and lm1
|
|
|
|
// above multiplies and shifts are all done with this one pmul. Low two word are only
|
|
// interest in mono case
|
|
pmulhw mm3, mm7 // lLM1 *= vfVolume;
|
|
// lLM2 *= vfVolume;
|
|
|
|
|
|
paddsw mm5, mm3 // Add values to buffer with saturation
|
|
movd dword ptr[esi-2], mm5 // Store values back into buffer.
|
|
|
|
// }
|
|
jmp mainloop
|
|
|
|
// Need to write only one.
|
|
//if (dwI < dwLength)
|
|
//{
|
|
StoreOne:
|
|
#if 1
|
|
// Linearly interpolate between points and store only one value.
|
|
// combine dwFract Values.
|
|
|
|
// Make mm7 zero for unpacking
|
|
|
|
//shl esi, 1 // do not shift left since pcWave is array of chars
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
pxor mm7, mm7
|
|
|
|
//lLM1 = pcWave[dwPosition1];
|
|
movzx esi, word ptr[esi]
|
|
|
|
// Doing AND that was not done for dwFract1 and dwFract2
|
|
pand mm6, mm4
|
|
|
|
// words in MMX register after operation is complete.
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
|
|
punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
|
|
|
|
// put values of pcWave into MMX registers. They are read into a regular register so
|
|
// that the routine does not read past the end of the buffer otherwise, it could read
|
|
// directly into the MMX registers.
|
|
|
|
// words in MMX registers
|
|
pxor mm7, mm7
|
|
// low four bytes
|
|
movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
// 8 bytes after unpakc
|
|
punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
|
|
|
|
// *2 pmadd efficent code.
|
|
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
|
|
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
|
|
|
|
pmaddwd mm7, mm5// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits
|
|
|
|
movq mm5, mm2 // move volume into mm5
|
|
/*
|
|
// Set lLM to be same as lM
|
|
lLM1 = lM1;
|
|
|
|
lLM1 *= vfLVolume1;
|
|
lLM1 >>= 5; // Signal bumps up to 15 bits.
|
|
lM1 *= vfRVolume1;
|
|
lM1 >>= 5;
|
|
|
|
// Set lLM to be same as lM
|
|
lLM2 = lM2;
|
|
|
|
lLM2 *= vfLVolume2;
|
|
lLM2 >>= 5; // Signal bumps up to 15 bits.
|
|
lM2 *= vfRVolume2;
|
|
lM2 >>= 5;
|
|
*/
|
|
// above multiplies and shifts are all done with this one pmul
|
|
pmulhw mm5, mm7
|
|
|
|
// calculate buffer location.
|
|
mov edi, ecx
|
|
shl edi, 1
|
|
add edi, pBuffer
|
|
|
|
movd edx, mm5
|
|
|
|
//pBuffer[dwI+1] += (short) lM1;
|
|
add word ptr[edi-2], dx
|
|
jno no_oflowr1
|
|
//pBuffer[dwI+1] = 0x7fff;
|
|
mov word ptr[edi-2], 0x7fff
|
|
js no_oflowr1
|
|
//pBuffer[dwI+1] = (short) 0x8000;
|
|
mov word ptr[edi-2], 0x8000
|
|
no_oflowr1:
|
|
//}
|
|
#endif
|
|
done:
|
|
|
|
mov edx, this // get address of class object
|
|
|
|
//m_vfLastLVolume = vfVolume;
|
|
//m_vfLastRVolume = vfVolume;
|
|
// need to shift volume back down to 12 bits before storing
|
|
psrld mm2, 3
|
|
movd [edx]this.m_vfLastLVolume, mm2
|
|
movd [edx]this.m_vfLastRVolume, mm2
|
|
|
|
//m_pfLastPitch = pfPitch;
|
|
mov [edx]this.m_pfLastPitch, ebx
|
|
|
|
//m_pfLastSample = pfSamplePos;
|
|
mov [edx]this.m_pfLastSample, eax
|
|
|
|
// put value back into dwI to be returned. This could just be passed back in eax I think.
|
|
mov dwI, ecx
|
|
emms
|
|
} // ASM block
|
|
return (dwI);
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
* CDigitalAudio::Mix8X()
|
|
*****************************************************************************
|
|
* Implement a stereo eight-bit mix.
|
|
* Heavily optimized for MMX.
|
|
*/
|
|
DWORD CDigitalAudio::Mix8X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod,
|
|
VFRACT vfDeltaLVolume, VFRACT vfDeltaRVolume,
|
|
PFRACT pfDeltaPitch, PFRACT pfSampleLength, PFRACT pfLoopLength)
|
|
|
|
{
|
|
DWORD dwI;
|
|
//DWORD dwPosition1, dwPosition2;
|
|
//long lM1, lLM1;
|
|
//long lM2, lLM2;
|
|
DWORD dwIncDelta = dwDeltaPeriod;
|
|
//VFRACT dwFract1, dwFract2;
|
|
char * pcWave = (char *) m_pnWave;
|
|
PFRACT pfSamplePos = m_pfLastSample;
|
|
VFRACT vfLVolume = m_vfLastLVolume;
|
|
VFRACT vfRVolume = m_vfLastRVolume;
|
|
|
|
VFRACT vfLVolume2 = m_vfLastLVolume;
|
|
VFRACT vfRVolume2 = m_vfLastRVolume;
|
|
|
|
PFRACT pfPitch = m_pfLastPitch;
|
|
PFRACT pfPFract = pfPitch << 8;
|
|
dwLength <<= 1;
|
|
|
|
QWORD dwFractMASK = 0x000000000FFF0FFF;
|
|
QWORD dwFractOne = 0x0000000010001000;
|
|
QWORD wordmask = 0x0000FFFF0000FFFF;
|
|
QWORD vfDeltaLandRVolume;
|
|
|
|
_asm{
|
|
|
|
// vfLVFract and vfRVFract are in mm0
|
|
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
|
|
//VFRACT vfRVFract = vfRVolume1 << 8;
|
|
|
|
movd mm0, vfLVolume
|
|
movd mm7, vfRVolume
|
|
|
|
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
|
|
movd mm1, vfDeltaLVolume
|
|
movd mm6, vfDeltaRVolume
|
|
|
|
punpckldq mm1, mm6
|
|
|
|
// dwI = 0
|
|
mov ecx, 0
|
|
movq vfDeltaLandRVolume, mm1
|
|
|
|
|
|
movq mm1, dwFractOne
|
|
movq mm4, dwFractMASK
|
|
|
|
mov eax, pfSamplePos
|
|
|
|
|
|
punpckldq mm0, mm7
|
|
mov ebx, pfPitch
|
|
|
|
pslld mm0, 8
|
|
mov edx, dwIncDelta
|
|
|
|
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
|
|
// need to be set before first pass.
|
|
|
|
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
|
|
psrld mm2, 5
|
|
|
|
//for (dwI = 0; dwI < dwLength; )
|
|
//{
|
|
mainloop:
|
|
cmp ecx, dwLength
|
|
jae done
|
|
|
|
|
|
|
|
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
|
|
jb NotPastEndOfSample1 //{
|
|
|
|
cmp pfLoopLength, 0 //if (!pfLoopLength)
|
|
|
|
je done // break;
|
|
|
|
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
|
|
|
|
NotPastEndOfSample1: //}
|
|
|
|
mov esi, eax // dwPosition1 = pfSamplePos;
|
|
add eax, ebx // pfSamplePos += pfPitch;
|
|
|
|
sub edx, 2 // dwIncDelta-=2;
|
|
jnz DontIncreaseValues1 //if (!dwIncDelta) {
|
|
|
|
// Since edx was use for dwIncDelta and now its zero, we can use if for a temporary
|
|
// for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
|
|
// if it goes below zero.
|
|
|
|
paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
|
|
// vfRVFract += vfDeltaRVolume;
|
|
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
|
|
|
|
|
|
mov edx, pfPFract // Temp = pfPFract;
|
|
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
|
|
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
|
|
|
|
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
|
|
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
|
|
// TestRVol = vfRVFract & (~TestRVol);
|
|
|
|
mov pfPFract, edx // pfPFract = Temp;
|
|
movq mm2, mm5 // vfLVolume = TestLVol;
|
|
// vfRVolume = TestRVol;
|
|
|
|
|
|
shr edx, 8 // Temp = Temp >> 8;
|
|
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
|
|
// vfRVolume = vfRVolume >> 5;
|
|
|
|
mov ebx, edx // pfPitch = Temp;
|
|
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
|
|
|
|
//}
|
|
DontIncreaseValues1:
|
|
|
|
movd mm6, esi // dwFract1 = dwPosition1;
|
|
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
|
|
|
|
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
|
|
add ecx, 2 //dwI += 2;
|
|
|
|
// if ( dwI < dwLength) break;
|
|
cmp ecx, dwLength
|
|
jae StoreOne
|
|
|
|
//if (pfSamplePos >= pfSampleLength)
|
|
//{
|
|
cmp eax, pfSampleLength
|
|
jb NotPastEndOfSample2
|
|
|
|
// Original if in C was not negated
|
|
//if (!pfLoopLength)
|
|
cmp pfLoopLength, 0
|
|
//break;
|
|
je StoreOne
|
|
//else
|
|
//pfSamplePos -= pfLoopLength;
|
|
sub eax, pfLoopLength
|
|
//}
|
|
NotPastEndOfSample2:
|
|
|
|
//shl esi, 1 // do not shift left since pcWave is array of chars
|
|
mov edi, eax // dwPosition2 = pfSamplePos;
|
|
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
movd mm7, eax // dwFract2 = pfSamplePos;
|
|
|
|
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
|
|
punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
|
|
// 0, 0, dwFract2, dwFract1
|
|
|
|
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
|
|
|
|
movzx esi, word ptr[esi] //lLM1 = pcWave[dwPosition1];
|
|
|
|
movd mm3, esi
|
|
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
|
|
|
|
//shl edi, 1 // do not shift left since pcWave is array of chars
|
|
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
|
|
|
|
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
|
|
mov esi, ecx // Temp = dWI;
|
|
|
|
shl esi, 1 // Temp = Temp << 1;
|
|
|
|
|
|
movzx edi, word ptr[edi] //lLM2 = pcWave[dwPosition2];
|
|
movd mm6, edi
|
|
|
|
pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
|
|
|
|
// low 4 bytes bytes in mm3
|
|
punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
add esi, pBuffer //
|
|
punpcklbw mm7, mm3 // bytes in mm7
|
|
// pcWave[dwPos2+1], 0, pcWave[dwPos2], 0, pcWave[dwPos1+1], pcWave[dwPos1], 0
|
|
|
|
pmaddwd mm7, mm5 // high dword = lM2 =
|
|
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
|
|
// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
movq mm3, mm2 // put left and right volume levels in mm3
|
|
|
|
add eax, ebx //pfSamplePos += pfPitch;
|
|
packssdw mm3, mm2 // words in mm3
|
|
// vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
|
|
|
|
movq mm5, qword ptr[esi-4] // Load values from buffer
|
|
add ecx, 2 // dwI += 2;
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits.
|
|
|
|
pand mm7, wordmask // combine results to get ready to multiply by left and right
|
|
movq mm6, mm7 // volume levels.
|
|
pslld mm6, 16 //
|
|
por mm7, mm6 // words in mm7
|
|
// lM2, lM2, lM1, lM1
|
|
|
|
// above multiplies and shifts are all done with this one pmul
|
|
pmulhw mm3, mm7 // lLM1 *= vfLVolume;
|
|
// lM1 *= vfRVolume;
|
|
// lLM2 *= vfLVolume;
|
|
// lM2 *= vfRVolume;
|
|
|
|
paddsw mm5, mm3 // Add values to buffer with saturation
|
|
movq qword ptr[esi-4], mm5 // Store values back into buffer.
|
|
|
|
// }
|
|
jmp mainloop
|
|
|
|
// Need to write only one.
|
|
//if (dwI < dwLength)
|
|
//{
|
|
StoreOne:
|
|
#if 1
|
|
// Linearly interpolate between points and store only one value.
|
|
// combine dwFract Values.
|
|
|
|
// Make mm7 zero for unpacking
|
|
|
|
//shl esi, 1 // do not shift left since pcWave is array of chars
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
pxor mm7, mm7
|
|
|
|
//lLM1 = pcWave[dwPosition1];
|
|
movzx esi, word ptr[esi]
|
|
|
|
// Doing AND that was not done for dwFract1 and dwFract2
|
|
pand mm6, mm4
|
|
|
|
// words in MMX register after operation is complete.
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
|
|
punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
|
|
|
|
// put values of pcWave into MMX registers. They are read into a regular register so
|
|
// that the routine does not read past the end of the buffer otherwise, it could read
|
|
// directly into the MMX registers.
|
|
|
|
pxor mm7, mm7
|
|
// byte in MMX registers
|
|
movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
|
|
|
|
// *2 pmadd efficent code.
|
|
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
|
|
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
|
|
|
|
pmaddwd mm7, mm5// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits
|
|
|
|
pand mm7, wordmask // combine results to get ready to multiply by left and right
|
|
movq mm6, mm7 // volume levels.
|
|
pslld mm6, 16 //
|
|
por mm7, mm6 // words in mm7
|
|
// lM2, lM2, lM1, lM1
|
|
|
|
pxor mm6, mm6
|
|
|
|
movq mm5, mm2 // move volume1 into mm5
|
|
|
|
// use pack to get 4 volume values together for multiplication.
|
|
packssdw mm5, mm6 // words in mm7
|
|
// 0, 0, vfRVolume1, vfLVolume1
|
|
/*
|
|
// Set lLM to be same as lM
|
|
lLM1 = lM1;
|
|
|
|
lLM1 *= vfLVolume1;
|
|
lLM1 >>= 5; // Signal bumps up to 15 bits.
|
|
lM1 *= vfRVolume1;
|
|
lM1 >>= 5;
|
|
|
|
// Set lLM to be same as lM
|
|
lLM2 = lM2;
|
|
|
|
lLM2 *= vfLVolume2;
|
|
lLM2 >>= 5; // Signal bumps up to 15 bits.
|
|
lM2 *= vfRVolume2;
|
|
lM2 >>= 5;
|
|
*/
|
|
// above multiplies and shifts are all done with this one pmul
|
|
pmulhw mm5, mm7
|
|
|
|
// calculate buffer location.
|
|
mov edi, ecx
|
|
shl edi, 1
|
|
add edi, pBuffer
|
|
|
|
/*
|
|
add word ptr[edi-4], si
|
|
jno no_oflowl1
|
|
// pBuffer[dwI] = 0x7fff;
|
|
mov word ptr[edi-4], 0x7fff
|
|
js no_oflowl1
|
|
//pBuffer[dwI] = (short) 0x8000;
|
|
mov word ptr[edi-4], 0x8000
|
|
no_oflowl1:
|
|
//pBuffer[dwI+1] += (short) lM1;
|
|
add word ptr[edi-2], dx
|
|
jno no_oflowr1
|
|
//pBuffer[dwI+1] = 0x7fff;
|
|
mov word ptr[edi-2], 0x7fff
|
|
js no_oflowr1
|
|
//pBuffer[dwI+1] = (short) 0x8000;
|
|
mov word ptr[edi-2], 0x8000
|
|
no_oflowr1:
|
|
*/
|
|
movd mm7, dword ptr[edi-4]
|
|
paddsw mm7, mm5
|
|
movd dword ptr[edi-4], mm7
|
|
//}
|
|
#endif
|
|
done:
|
|
|
|
mov edx, this // get address of class object
|
|
|
|
//m_vfLastLVolume = vfLVolume;
|
|
//m_vfLastRVolume = vfRVolume;
|
|
// need to shift volume back down to 12 bits before storing
|
|
psrld mm2, 3
|
|
movd [edx]this.m_vfLastLVolume, mm2
|
|
psrlq mm2, 32
|
|
movd [edx]this.m_vfLastRVolume, mm2
|
|
|
|
//m_pfLastPitch = pfPitch;
|
|
mov [edx]this.m_pfLastPitch, ebx
|
|
|
|
//m_pfLastSample = pfSamplePos;
|
|
mov [edx]this.m_pfLastSample, eax
|
|
|
|
// put value back into dwI to be returned. This could just be passed back in eax I think.
|
|
mov dwI, ecx
|
|
emms
|
|
} // ASM block
|
|
return (dwI >> 1);
|
|
}
|
|
|
|
|
|
/*****************************************************************************
|
|
* CDigitalAudio::MixMono16X()
|
|
*****************************************************************************
|
|
* Implement a mono sixteen-bit mix.
|
|
* Heavily optimized for MMX.
|
|
*/
|
|
DWORD CDigitalAudio::MixMono16X(short * pBuffer, DWORD dwLength,
|
|
DWORD dwDeltaPeriod,VFRACT vfDeltaVolume,
|
|
PFRACT pfDeltaPitch,PFRACT pfSampleLength,
|
|
PFRACT pfLoopLength)
|
|
{
|
|
DWORD dwI,dwIncDelta = dwDeltaPeriod;
|
|
|
|
short * pcWave = (short*) m_pnWave;
|
|
PFRACT pfSamplePos = m_pfLastSample;
|
|
VFRACT vfVolume = m_vfLastLVolume;
|
|
PFRACT pfPitch = m_pfLastPitch;
|
|
PFRACT pfPFract = pfPitch << 8;
|
|
VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
|
|
|
|
|
|
QWORD dwFractMASK = 0x000000000FFF0FFF;
|
|
QWORD dwFractOne = 0x0000000010001000;
|
|
QWORD wordmask = 0x0000FFFF0000FFFF;
|
|
QWORD vfDeltaLandRVolume;
|
|
|
|
_asm{
|
|
|
|
// vfLVFract and vfRVFract are in mm0
|
|
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
|
|
//VFRACT vfRVFract = vfRVolume1 << 8;
|
|
|
|
movd mm0, vfVolume
|
|
movd mm7, vfVolume
|
|
|
|
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
|
|
movd mm1, vfDeltaVolume
|
|
movd mm6, vfDeltaVolume
|
|
|
|
punpckldq mm1, mm6
|
|
|
|
// dwI = 0
|
|
mov ecx, 0
|
|
movq vfDeltaLandRVolume, mm1
|
|
|
|
|
|
movq mm1, dwFractOne
|
|
movq mm4, dwFractMASK
|
|
|
|
mov eax, pfSamplePos
|
|
|
|
|
|
punpckldq mm0, mm7
|
|
mov ebx, pfPitch
|
|
|
|
pslld mm0, 8
|
|
mov edx, dwIncDelta
|
|
|
|
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
|
|
// need to be set before first pass.
|
|
|
|
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
|
|
psrld mm2, 5
|
|
|
|
//for (dwI = 0; dwI < dwLength; )
|
|
//{
|
|
mainloop:
|
|
cmp ecx, dwLength
|
|
jae done
|
|
|
|
|
|
|
|
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
|
|
jb NotPastEndOfSample1 //{
|
|
|
|
cmp pfLoopLength, 0 //if (!pfLoopLength)
|
|
|
|
je done // break;
|
|
|
|
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
|
|
|
|
NotPastEndOfSample1: //}
|
|
|
|
mov esi, eax // dwPosition1 = pfSamplePos;
|
|
add eax, ebx // pfSamplePos += pfPitch;
|
|
|
|
sub edx, 2 // dwIncDelta-=2;
|
|
jnz DontIncreaseValues1 //if (!dwIncDelta) {
|
|
|
|
// Since edx was use for dwIncDelta and now its zero, we can use if for a temporary
|
|
// for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
|
|
// if it goes below zero.
|
|
|
|
paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume;
|
|
// vfVFract += vfDeltaVolume;
|
|
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
|
|
|
|
|
|
mov edx, pfPFract // Temp = pfPFract;
|
|
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
|
|
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
|
|
|
|
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
|
|
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
|
|
// TestRVol = vfRVFract & (~TestRVol);
|
|
|
|
mov pfPFract, edx // pfPFract = Temp;
|
|
movq mm2, mm5 // vfLVolume = TestLVol;
|
|
// vfRVolume = TestRVol;
|
|
|
|
|
|
shr edx, 8 // Temp = Temp >> 8;
|
|
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
|
|
// vfRVolume = vfRVolume >> 5;
|
|
|
|
mov ebx, edx // pfPitch = Temp;
|
|
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
|
|
|
|
//}
|
|
DontIncreaseValues1:
|
|
|
|
movd mm6, esi // dwFract1 = dwPosition1;
|
|
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
|
|
|
|
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
|
|
inc ecx //dwI++;
|
|
|
|
// if ( dwI < dwLength) break;
|
|
cmp ecx, dwLength
|
|
jae StoreOne
|
|
|
|
//if (pfSamplePos >= pfSampleLength)
|
|
//{
|
|
cmp eax, pfSampleLength
|
|
jb NotPastEndOfSample2
|
|
|
|
// Original if in C was not negated
|
|
//if (!pfLoopLength)
|
|
cmp pfLoopLength, 0
|
|
//break;
|
|
je StoreOne
|
|
//else
|
|
//pfSamplePos -= pfLoopLength;
|
|
sub eax, pfLoopLength
|
|
//}
|
|
NotPastEndOfSample2:
|
|
|
|
shl esi, 1 // shift left since pcWave is array of shorts
|
|
mov edi, eax // dwPosition2 = pfSamplePos;
|
|
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
movd mm7, eax // dwFract2 = pfSamplePos;
|
|
|
|
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
|
|
punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
|
|
// 0, 0, dwFract2, dwFract1
|
|
|
|
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
|
|
|
|
movd mm7, dword ptr[esi] //lLM1 = pcWave[dwPosition1];
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
|
|
|
|
shl edi, 1 // shift left since pcWave is array of shorts
|
|
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
|
|
|
|
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
|
|
mov esi, ecx // Temp = dWI;
|
|
|
|
shl esi, 1 // Temp = Temp << 1;
|
|
movq mm3, mm2 // put left and right volume levels in mm3
|
|
|
|
|
|
movd mm6, dword ptr[edi] //lLM2 = pcWave[dwPosition2];
|
|
packssdw mm3, mm2 // words in mm7
|
|
// vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
|
|
|
|
add esi, pBuffer //
|
|
punpckldq mm7, mm6 // low four bytes bytes in
|
|
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
pmaddwd mm7, mm5 // high dword = lM2 =
|
|
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
|
|
// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
add eax, ebx //pfSamplePos += pfPitch;
|
|
|
|
movd mm5, dword ptr[esi-2] // Load values from buffer
|
|
inc ecx // dwI++;
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits.
|
|
|
|
packssdw mm7, mm4 // only need one word in mono case.
|
|
// low word are lm2 and lm1
|
|
|
|
// above multiplies and shifts are all done with this one pmul. Low two word are only
|
|
// interest in mono case
|
|
pmulhw mm3, mm7 // lLM1 *= vfVolume;
|
|
// lLM2 *= vfVolume;
|
|
|
|
|
|
paddsw mm5, mm3 // Add values to buffer with saturation
|
|
movd dword ptr[esi-2], mm5 // Store values back into buffer.
|
|
|
|
// }
|
|
jmp mainloop
|
|
|
|
// Need to write only one.
|
|
//if (dwI < dwLength)
|
|
//{
|
|
StoreOne:
|
|
#if 1
|
|
// Linearly interpolate between points and store only one value.
|
|
// combine dwFract Values.
|
|
|
|
// Make mm7 zero for unpacking
|
|
|
|
shl esi, 1 // shift left since pcWave is array of shorts
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
pxor mm7, mm7
|
|
|
|
//lLM1 = pcWave[dwPosition1];
|
|
mov esi, dword ptr[esi]
|
|
|
|
// Doing AND that was not done for dwFract1 and dwFract2
|
|
pand mm6, mm4
|
|
|
|
// words in MMX register after operation is complete.
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
|
|
punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
|
|
|
|
// put values of pcWave into MMX registers. They are read into a regular register so
|
|
// that the routine does not read past the end of the buffer otherwise, it could read
|
|
// directly into the MMX registers.
|
|
|
|
// words in MMX registers
|
|
movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
// *2 pmadd efficent code.
|
|
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
|
|
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
|
|
|
|
pmaddwd mm7, mm5// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits
|
|
|
|
movq mm5, mm2 // move volume into mm5
|
|
/*
|
|
// Set lLM to be same as lM
|
|
lLM1 = lM1;
|
|
|
|
lLM1 *= vfLVolume1;
|
|
lLM1 >>= 5; // Signal bumps up to 15 bits.
|
|
lM1 *= vfRVolume1;
|
|
lM1 >>= 5;
|
|
|
|
// Set lLM to be same as lM
|
|
lLM2 = lM2;
|
|
|
|
lLM2 *= vfLVolume2;
|
|
lLM2 >>= 5; // Signal bumps up to 15 bits.
|
|
lM2 *= vfRVolume2;
|
|
lM2 >>= 5;
|
|
*/
|
|
// above multiplies and shifts are all done with this one pmul
|
|
pmulhw mm5, mm7
|
|
|
|
// calculate buffer location.
|
|
mov edi, ecx
|
|
shl edi, 1
|
|
add edi, pBuffer
|
|
|
|
movd edx, mm5
|
|
|
|
//pBuffer[dwI+1] += (short) lM1;
|
|
add word ptr[edi-2], dx
|
|
jno no_oflowr1
|
|
//pBuffer[dwI+1] = 0x7fff;
|
|
mov word ptr[edi-2], 0x7fff
|
|
js no_oflowr1
|
|
//pBuffer[dwI+1] = (short) 0x8000;
|
|
mov word ptr[edi-2], 0x8000
|
|
no_oflowr1:
|
|
//}
|
|
#endif
|
|
done:
|
|
|
|
mov edx, this // get address of class object
|
|
|
|
//m_vfLastLVolume = vfVolume;
|
|
//m_vfLastRVolume = vfVolume;
|
|
// need to shift volume back down to 12 bits before storing
|
|
psrld mm2, 3
|
|
movd [edx]this.m_vfLastLVolume, mm2
|
|
movd [edx]this.m_vfLastRVolume, mm2
|
|
|
|
//m_pfLastPitch = pfPitch;
|
|
mov [edx]this.m_pfLastPitch, ebx
|
|
|
|
//m_pfLastSample = pfSamplePos;
|
|
mov [edx]this.m_pfLastSample, eax
|
|
|
|
// put value back into dwI to be returned. This could just be passed back in eax I think.
|
|
mov dwI, ecx
|
|
emms
|
|
} // ASM block
|
|
return (dwI);
|
|
}
|
|
|
|
|
|
/*****************************************************************************
 * CDigitalAudio::Mix16X()
 *****************************************************************************
 * Implement a stereo sixteen-bit mix.
 * Heavily optimized for MMX.
 *
 * Reads 16-bit wave data through m_pnWave, linearly interpolates between
 * adjacent wave samples (12-bit fraction), scales the result by the left and
 * right volumes and adds it, with signed saturation, to the shorts already
 * in pBuffer.  The main loop is unrolled once: each pass produces two output
 * samples (four shorts); a single left-over sample is handled at StoreOne.
 *
 * Parameters (as used by the code below):
 *   pBuffer         - destination buffer of shorts; pBuffer[dwI] receives the
 *                     left value and pBuffer[dwI+1] the right value.
 *   dwLength        - doubled on entry and then compared against dwI, which
 *                     advances by two shorts per output sample.
 *   dwDeltaPeriod   - reload value for the countdown that triggers a
 *                     volume/pitch update.  The countdown is decremented by 2
 *                     per loop pass and tested only for zero.
 *                     NOTE(review): with an odd dwDeltaPeriod the countdown
 *                     never equals zero before wrapping - confirm that
 *                     callers always pass an even value.
 *   vfDeltaLVolume,
 *   vfDeltaRVolume  - added to the (volume << 8) accumulators on each update;
 *                     an accumulator that goes negative is clamped to zero.
 *   pfDeltaPitch    - added to pfPFract (pitch << 8) on each update; the
 *                     working pitch is then pfPFract >> 8.
 *   pfSampleLength  - once pfSamplePos reaches this value the position is
 *                     wrapped back by pfLoopLength, or mixing stops.
 *   pfLoopLength    - amount subtracted from pfSamplePos when the end is
 *                     reached; zero means there is no loop, so mixing stops.
 *
 * Returns dwI >> 1 (dwI counts shorts, two per output sample).
 *
 * On exit m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and m_pfLastSample
 * are updated from the working registers.
 */
|
|
DWORD CDigitalAudio::Mix16X(short * pBuffer, DWORD dwLength,
|
|
DWORD dwDeltaPeriod, VFRACT vfDeltaLVolume,
|
|
VFRACT vfDeltaRVolume,PFRACT pfDeltaPitch,
|
|
PFRACT pfSampleLength,PFRACT pfLoopLength)
|
|
{
|
|
// dwI receives the final value of ecx (index into pBuffer, in shorts);
// dwIncDelta is the initial value of the update countdown kept in edx.
DWORD dwI,dwIncDelta = dwDeltaPeriod;
|
|
// The commented-out locals below belong to the original C version; their
// roles are played by registers in the assembly.
//DWORD dwPosition1, dwPosition2;
|
|
//long lM1, lLM1;
|
|
//long lM2, lLM2;
|
|
//VFRACT dwFract1, dwFract2;
|
|
short * pcWave = (short *) m_pnWave;
|
|
PFRACT pfSamplePos = m_pfLastSample;
|
|
VFRACT vfLVolume = m_vfLastLVolume;
|
|
VFRACT vfRVolume = m_vfLastRVolume;
|
|
|
|
// NOTE(review): vfLVolume2 and vfRVolume2 are not referenced by any
// instruction in the assembly below (they only appear in comments).
VFRACT vfLVolume2 = m_vfLastLVolume;
|
|
VFRACT vfRVolume2 = m_vfLastRVolume;
|
|
|
|
PFRACT pfPitch = m_pfLastPitch;
|
|
PFRACT pfPFract = pfPitch << 8; // pitch with 8 extra low bits; pfDeltaPitch accumulates here
|
|
dwLength <<= 1; // two shorts (left, right) per output sample
|
|
|
|
QWORD dwFractMASK = 0x000000000FFF0FFF; // keeps the low 12 bits of the two low words
|
|
QWORD dwFractOne = 0x0000000010001000; // 0x1000 in each of the two low words
|
|
QWORD wordmask = 0x0000FFFF0000FFFF; // keeps the low word of each dword
|
|
QWORD vfDeltaLandRVolume; // low dword = vfDeltaLVolume, high dword = vfDeltaRVolume (stored below)
|
|
|
|
_asm{
|
|
|
|
// Register use in this block:
//   eax = pfSamplePos      ebx = pfPitch      ecx = dwI
//   edx = dwIncDelta (also used as a temporary during an update)
//   esi = dwPosition1 / scratch             edi = dwPosition2 / scratch
//   mm0 = left (low dword) and right (high dword) volume << 8
//   mm1 = dwFractOne       mm4 = dwFractMASK
//   mm2 = left/right volume as 15 bit values (mm0 >> 5)
//   mm3, mm5, mm6, mm7 = temporaries

// vfLVFract and vfRVFract are in mm0
|
|
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
|
|
//VFRACT vfRVFract = vfRVolume1 << 8;
|
|
|
|
movd mm0, vfLVolume
|
|
movd mm7, vfRVolume
|
|
|
|
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
|
|
movd mm1, vfDeltaLVolume
|
|
movd mm6, vfDeltaRVolume
|
|
|
|
punpckldq mm1, mm6
|
|
|
|
// dwI = 0
|
|
mov ecx, 0
|
|
movq vfDeltaLandRVolume, mm1
|
|
|
|
|
|
movq mm1, dwFractOne
|
|
movq mm4, dwFractMASK
|
|
|
|
mov eax, pfSamplePos
|
|
|
|
|
|
punpckldq mm0, mm7 // mm0 = vfRVolume : vfLVolume
|
|
mov ebx, pfPitch
|
|
|
|
pslld mm0, 8 // vfLVFract = vfLVolume << 8; vfRVFract = vfRVolume << 8;
|
|
mov edx, dwIncDelta
|
|
|
|
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
|
|
// need to be set before first pass.
|
|
|
|
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
// (<< 8 above followed by >> 5 here is a net shift left by 3).
|
|
psrld mm2, 5
|
|
|
|
//for (dwI = 0; dwI < dwLength; )
|
|
//{
|
|
mainloop:
|
|
cmp ecx, dwLength
|
|
jae done
|
|
|
|
|
|
|
|
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
|
|
jb NotPastEndOfSample1 //{
|
|
|
|
cmp pfLoopLength, 0 //if (!pfLoopLength)
|
|
|
|
je done // break;
|
|
|
|
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
|
|
|
|
NotPastEndOfSample1: //}
|
|
|
|
mov esi, eax // dwPosition1 = pfSamplePos;
|
|
add eax, ebx // pfSamplePos += pfPitch;
|
|
|
|
sub edx, 2 // dwIncDelta-=2;
|
|
jnz DontIncreaseValues1 //if (!dwIncDelta) {
|
|
|
|
// Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
|
|
// for a bit. All that the TestLVol and TestRVol code is doing is zeroing out the volume
|
|
// if it goes below zero.
|
|
|
|
paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
|
|
// vfRVFract += vfDeltaRVolume;
|
|
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
|
|
|
|
|
|
mov edx, pfPFract // Temp = pfPFract;
|
|
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
|
|
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
|
|
|
|
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
|
|
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
|
|
// TestRVol = vfRVFract & (~TestRVol);
|
|
|
|
mov pfPFract, edx // pfPFract = Temp;
|
|
movq mm2, mm5 // vfLVolume = TestLVol;
|
|
// vfRVolume = TestRVol;
|
|
|
|
|
|
shr edx, 8 // Temp = Temp >> 8;
|
|
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
|
|
// vfRVolume = vfRVolume >> 5;
|
|
|
|
mov ebx, edx // pfPitch = Temp;
|
|
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
|
|
|
|
//}
|
|
DontIncreaseValues1:
|
|
|
|
// mm6 takes the full 32-bit position here (before esi is shifted down);
// the fraction bits are isolated later with the AND against mm4.
movd mm6, esi // dwFract1 = dwPosition1;
|
|
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
|
|
|
|
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
|
|
add ecx, 2 //dwI += 2;
|
|
|
|
// if (dwI >= dwLength) only one sample is left: finish it at StoreOne.
|
|
cmp ecx, dwLength
|
|
jae StoreOne
|
|
|
|
//if (pfSamplePos >= pfSampleLength)
|
|
//{
|
|
cmp eax, pfSampleLength
|
|
jb NotPastEndOfSample2
|
|
|
|
// Original if in C was not negated
|
|
//if (!pfLoopLength)
|
|
cmp pfLoopLength, 0
|
|
//break;   (the first sample of the pair is still written, at StoreOne)
|
|
je StoreOne
|
|
//else
|
|
//pfSamplePos -= pfLoopLength;
|
|
sub eax, pfLoopLength
|
|
//}
|
|
NotPastEndOfSample2:
|
|
|
|
shl esi, 1 // shift left since pcWave is array of shorts
|
|
mov edi, eax // dwPosition2 = pfSamplePos;
|
|
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
movd mm7, eax // dwFract2 = pfSamplePos;
|
|
|
|
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
|
|
punpcklwd mm6, mm7 // combine dwFract Values. The two low words in mm6 after unpack are
|
|
// dwFract2, dwFract1 (the two high words are cleared by the AND below)
|
|
|
|
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
|
|
|
|
movd mm7, dword ptr[esi] //lLM1 = pcWave[dwPosition1];  (loads pcWave[dwPosition1 + 1] as well)
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
|
|
|
|
shl edi, 1 // shift left since pcWave is array of shorts
|
|
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
|
|
|
|
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
|
|
mov esi, ecx // Temp = dWI;
|
|
|
|
shl esi, 1 // Temp = Temp << 1;
|
|
movq mm3, mm2 // put left and right volume levels in mm3
|
|
|
|
|
|
movd mm6, dword ptr[edi] //lLM2 = pcWave[dwPosition2];  (loads pcWave[dwPosition2 + 1] as well)
|
|
packssdw mm3, mm2 // words in mm3
|
|
// vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
|
|
|
|
add esi, pBuffer // esi = &pBuffer[dwI]; dwI was already advanced once, so [esi-4] is the first sample of the pair
|
|
punpckldq mm7, mm6 // words in mm7
|
|
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
pmaddwd mm7, mm5 // high dword = lM2 =
|
|
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
|
|
// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
add eax, ebx //pfSamplePos += pfPitch;
|
|
|
|
movq mm5, qword ptr[esi-4] // Load values from buffer (four shorts: both samples, left and right)
|
|
add ecx, 2 // dwI += 2;
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits.
|
|
|
|
pand mm7, wordmask // combine results to get ready to multiply by left and right
|
|
movq mm6, mm7 // volume levels.
|
|
pslld mm6, 16 // copy each low word into the high word of its dword
|
|
por mm7, mm6 // words in mm7
|
|
// lM2, lM2, lM1, lM1
|
|
|
|
// the per-channel multiplies and shifts of the C version are all done with this one pmul
|
|
pmulhw mm3, mm7 // lLM1 *= vfLVolume;
|
|
// lM1 *= vfRVolume;
|
|
// lLM2 *= vfLVolume;
|
|
// lM2 *= vfRVolume;
|
|
|
|
paddsw mm5, mm3 // Add values to buffer with saturation
|
|
movq qword ptr[esi-4], mm5 // Store values back into buffer.
|
|
|
|
// }
|
|
jmp mainloop
|
|
|
|
// Need to write only one.
|
|
//if (dwI < dwLength)
|
|
//{
|
|
StoreOne:
|
|
#if 1
|
|
// Linearly interpolate between points and store only one value.
|
|
// On entry: esi = dwPosition1 (already >> 12), mm6 = 32-bit position of
// sample 1, mm5 = copy of dwFractOne, ecx already advanced past this sample.
|
|
|
|
// Make mm7 zero for unpacking
|
|
|
|
shl esi, 1 // shift left since pcWave is array of shorts
|
|
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
|
|
pxor mm7, mm7
|
|
|
|
//lLM1 = pcWave[dwPosition1];  (the dword load also brings in pcWave[dwPosition1 + 1])
|
|
mov esi, dword ptr[esi]
|
|
|
|
// Doing AND that was not done for dwFract1 and dwFract2
|
|
pand mm6, mm4
|
|
|
|
// words in MMX register after operation is complete.
// (Word 1 of mm6 holds masked upper position bits rather than a second
// fraction; the lanes built from it are multiplied by the zero upper words
// of mm7 in the pmaddwd below, so they do not affect the result.)
|
|
psubw mm5, mm6 // 0, 0, 0x1000 - (word 1 of mm6), 0x1000 - dwFract1
|
|
punpcklwd mm5, mm6 // (unused), (unused), dwFract1, 0x1000 - dwFract1
|
|
|
|
// put values of pcWave into MMX registers. They are read into a regular register so
|
|
// that the routine does not read past the end of the buffer otherwise, it could read
|
|
// directly into the MMX registers.
|
|
|
|
// words in MMX registers
|
|
movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
|
|
|
|
// *2 pmadd efficient code.
|
|
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
|
|
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
|
|
|
|
pmaddwd mm7, mm5// low dword = lM1 =
|
|
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
|
|
|
|
psrad mm7, 12 // shift back down to 16 bits
|
|
|
|
pand mm7, wordmask // combine results to get ready to multiply by left and right
|
|
movq mm6, mm7 // volume levels.
|
|
pslld mm6, 16 // copy the low word into the high word of its dword
|
|
por mm7, mm6 // words in mm7
|
|
// 0, 0, lM1, lM1
|
|
|
|
pxor mm6, mm6
|
|
|
|
movq mm5, mm2 // move volume1 into mm5
|
|
|
|
// use pack to get the volume values together for multiplication.
|
|
packssdw mm5, mm6 // words in mm5
|
|
// 0, 0, vfRVolume1, vfLVolume1
|
|
/*
   Equivalent C for the multiply below (kept for reference):

   // Set lLM to be same as lM
   lLM1 = lM1;

   lLM1 *= vfLVolume1;
   lLM1 >>= 5; // Signal bumps up to 15 bits.
   lM1 *= vfRVolume1;
   lM1 >>= 5;

   // Set lLM to be same as lM
   lLM2 = lM2;

   lLM2 *= vfLVolume2;
   lLM2 >>= 5; // Signal bumps up to 15 bits.
   lM2 *= vfRVolume2;
   lM2 >>= 5;
*/
|
|
// above multiplies and shifts are all done with this one pmul
|
|
pmulhw mm5, mm7
|
|
|
|
// calculate buffer location: edi = &pBuffer[dwI]; [edi-4] is the sample just computed.
|
|
mov edi, ecx
|
|
shl edi, 1
|
|
add edi, pBuffer
|
|
|
|
/*
   Earlier scalar add-with-clamp version, superseded by the paddsw below:

   add word ptr[edi-4], si
   jno no_oflowl1
   // pBuffer[dwI] = 0x7fff;
   mov word ptr[edi-4], 0x7fff
   js no_oflowl1
   //pBuffer[dwI] = (short) 0x8000;
   mov word ptr[edi-4], 0x8000
   no_oflowl1:
   //pBuffer[dwI+1] += (short) lM1;
   add word ptr[edi-2], dx
   jno no_oflowr1
   //pBuffer[dwI+1] = 0x7fff;
   mov word ptr[edi-2], 0x7fff
   js no_oflowr1
   //pBuffer[dwI+1] = (short) 0x8000;
   mov word ptr[edi-2], 0x8000
   no_oflowr1:
*/
|
|
movd mm7, dword ptr[edi-4] // load the existing left/right pair from the buffer
|
|
paddsw mm7, mm5 // add with signed saturation
|
|
movd dword ptr[edi-4], mm7 // store the pair back
|
|
//}
|
|
#endif
|
|
done:
|
|
|
|
mov edx, this // get address of class object
|
|
|
|
//m_vfLastLVolume = vfLVolume;
|
|
//m_vfLastRVolume = vfRVolume;
|
|
// need to shift volume back down to 12 bits before storing
|
|
psrld mm2, 3
|
|
movd [edx]this.m_vfLastLVolume, mm2
|
|
psrlq mm2, 32 // bring the right volume down into the low dword
|
|
movd [edx]this.m_vfLastRVolume, mm2
|
|
|
|
//m_pfLastPitch = pfPitch;
|
|
mov [edx]this.m_pfLastPitch, ebx
|
|
|
|
//m_pfLastSample = pfSamplePos;
|
|
mov [edx]this.m_pfLastSample, eax
|
|
|
|
// put value back into dwI to be returned. This could just be passed back in eax I think.
|
|
mov dwI, ecx
|
|
emms // leave MMX state before returning to C++ code
|
|
} // ASM block
|
|
return (dwI >> 1); // dwI counts shorts; two shorts per output sample
|
|
}
|
|
|
|
/*****************************************************************************
|
|
* MMXDisabled()
|
|
*****************************************************************************
|
|
* Check the registry key to determine whether to ignore MMX.
|
|
*/
|
|
static BOOL MMXDisabled()
|
|
{
|
|
ULONG ulValue;
|
|
|
|
if (!GetRegValueDword(
|
|
TEXT("Software\\Microsoft\\DirectMusic"),
|
|
TEXT("MMXDisabled"),
|
|
&ulValue))
|
|
{
|
|
return FALSE;
|
|
}
|
|
|
|
return (BOOL)ulValue;
|
|
}
|
|
|
|
// Raw encoding of the CPUID instruction (opcode bytes 0x0F 0xA2), emitted
// byte by byte with _emit.  Meant to be used inside an _asm block.
#define CPU_ID _asm _emit 0x0f _asm _emit 0xa2
|
|
|
|
/*****************************************************************************
|
|
* MultiMediaInstructionsSupported()
|
|
*****************************************************************************
|
|
* Returns whether this CPU supports MMX.
|
|
*/
|
|
BOOL MultiMediaInstructionsSupported()
|
|
{
|
|
BOOL bMultiMediaInstructionsSupported;
|
|
|
|
if (!MMXDisabled())
|
|
{
|
|
_asm
|
|
{
|
|
pushfd // Store original EFLAGS on stack
|
|
pop eax // Get original EFLAGS in EAX
|
|
mov ecx, eax // Duplicate original EFLAGS in ECX for toggle check
|
|
xor eax, 0x00200000L // Flip ID bit in EFLAGS
|
|
push eax // Save new EFLAGS value on stack
|
|
popfd // Replace current EFLAGS value
|
|
pushfd // Store new EFLAGS on stack
|
|
pop eax // Get new EFLAGS in EAX
|
|
xor eax, ecx // Can we toggle ID bit?
|
|
jz Done // Jump if no, Processor is older than a Pentium so CPU_ID is not supported
|
|
mov eax, 1 // Set EAX to tell the CPUID instruction what to return
|
|
push ebx
|
|
CPU_ID // Get family/model/stepping/features
|
|
pop ebx
|
|
xor eax,eax // Assume failure
|
|
test edx, 0x00800000L // Check if mmx technology available
|
|
jz Done // Jump if no
|
|
// Tests passed, this machine supports MMX
|
|
inc eax // Set to success
|
|
Done:
|
|
mov bMultiMediaInstructionsSupported, eax
|
|
}
|
|
} else {
|
|
bMultiMediaInstructionsSupported = 0;
|
|
}
|
|
|
|
return (bMultiMediaInstructionsSupported);
|
|
}
|
|
|