windows-nt/Source/XPSP1/NT/drivers/ddk/wdmaudio/ddksynth/mmx.cpp
2020-09-26 16:20:57 +08:00

1452 lines
42 KiB
C++

//
// Copyright (c) 1996-2000 Microsoft Corporation. All rights reserved.
// Mmx.cpp
// MMX Mix engines for Microsoft synth
/*
Variable usage.
Variable register
pfSamplePos eax
pfPitch ebx
dwI ecx
dwIncDelta edx (edx is sometimes a temporary register)
dwPosition1 esi
dwPosition2 edi
vfLVFract and vfRVFract (high resolution volumes, << 8) mm0
dwFractOne mm1
vfLVolume and vfRVolume (15 bit volumes) mm2
mm3 - mm7 are temporary mmx registers (mm4 holds dwFractMASK inside the main loop).
*/
// Notes about calculation.
// Loop is unrolled once.
// *1 Shifting the volume to 15 bit values gets rid of shifts and simplifies the code.
// This makes the packed multiply work better later, since I keep the interpolated
// wave value as a 16 bit signed value. For a PMULHW, this results in 15 bit results,
// which is the same as the original code.
// *2 linear interpolation can be done very quickly with MMX by re-arranging the
// way that the interpolation is done. Here is code in C that shows the difference.
// Original C code
//lM1 = ((pcWave[dwPosition1 + 1] - pcWave[dwPosition1]) * dwFract1) >> 12;
//lM2 = ((pcWave[dwPosition2 + 1] - pcWave[dwPosition2]) * dwFract2) >> 12;
//lM1 += pcWave[dwPosition1];
//lM2 += pcWave[dwPosition2];
// Equivalent C Code that can be done with a pmadd
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
// Project header. CDigitalAudio, VFRACT and PFRACT are not declared in the
// visible part of this file, so they presumably come from here -- TODO confirm.
#include "common.h"
// Module name prefix string. Not referenced in the visible part of this file;
// presumably consumed by debug/trace macros -- TODO confirm.
#define STR_MODULENAME "DDKSynth.sys:MMX: "
// 64-bit unsigned integer. The mix routines use it for the constants and the
// scratch value that are moved to and from MMX registers with movq.
typedef unsigned __int64 QWORD;
// code_seg with no arguments: the functions that follow are placed in the
// default code section.
#pragma code_seg()
/*****************************************************************************
* CDigitalAudio::MixMono8X()
*****************************************************************************
* Implement a mono eight-bit mix.
* Heavily optimized for MMX.
*
* Linearly interpolates 8-bit wave data (read through m_pnWave), scales it by
* the current volume and adds it with saturation into the 16-bit buffer.
* The main loop produces two output samples per pass; a single trailing
* sample is handled at the StoreOne label.
*
* pBuffer        - 16-bit output buffer; mixed samples are added into it.
* dwLength       - maximum number of output samples to produce.
* dwDeltaPeriod  - reload value for the counter that paces the volume and
*                  pitch updates. The counter is decremented by 2 per loop
*                  pass and tested for exactly zero, so this presumably has
*                  to be a non-zero even value -- TODO confirm with caller.
* vfDeltaVolume  - added to the high resolution (<< 8) volume at each update.
* pfDeltaPitch   - added to the high resolution (<< 8) pitch at each update.
* pfSampleLength - position (12 fraction bits) at which the sample ends.
* pfLoopLength   - subtracted from the position when the end is reached;
*                  0 means there is no loop and mixing stops there.
*
* Returns the number of output samples that were mixed.
* Writes back m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and
* m_pfLastSample so that a later call can continue from this state.
*/
DWORD CDigitalAudio::MixMono8X(short * pBuffer, DWORD dwLength,
DWORD dwDeltaPeriod, VFRACT vfDeltaVolume,
PFRACT pfDeltaPitch, PFRACT pfSampleLength,
PFRACT pfLoopLength)
{
// dwI receives the sample count from ecx at the end; dwIncDelta seeds edx.
DWORD dwI,dwIncDelta = dwDeltaPeriod;
char * pcWave = (char *) m_pnWave; // wave data addressed as bytes
PFRACT pfSamplePos = m_pfLastSample; // continue from the stored position
VFRACT vfVolume = m_vfLastLVolume; // mono: only the left volume is read
PFRACT pfPitch = m_pfLastPitch;
PFRACT pfPFract = pfPitch << 8; // high res pitch, updated by the asm below
VFRACT vfVFract = vfVolume << 8; // Keep high res version around. (The asm keeps its own copy in mm0 and never references this local.)
QWORD dwFractMASK = 0x000000000FFF0FFF; // 0x0FFF in the two low words: keeps the 12 fraction bits
QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
QWORD wordmask = 0x0000FFFF0000FFFF; // not referenced in this function
QWORD vfDeltaLandRVolume; // vfDeltaVolume in both dwords; filled in by the asm
_asm{
// Register setup. The high resolution volume (vfVolume << 8) lives in mm0,
// duplicated in both dwords; the C equivalent is
//VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
movd mm0, vfVolume
movd mm7, vfVolume
// vfDeltaVolume is duplicated into both dwords of mm1 so that it can be stored in vfDeltaLandRVolume
movd mm1, vfDeltaVolume
movd mm6, vfDeltaVolume
punpckldq mm1, mm6
// dwI = 0
mov ecx, 0
movq vfDeltaLandRVolume, mm1
movq mm1, dwFractOne // mm1 = 0, 0, 0x1000, 0x1000 for the rest of the routine
movq mm4, dwFractMASK // mm4 = 0, 0, 0x0FFF, 0x0FFF during the main loop
mov eax, pfSamplePos
punpckldq mm0, mm7 // mm0 = vfVolume, vfVolume
mov ebx, pfPitch
pslld mm0, 8 // mm0 = high res volume (<< 8)
mov edx, dwIncDelta
movq mm2, mm0 // working volume in mm2 (both dwords)
// need to be set before first pass.
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
psrld mm2, 5
//for (dwI = 0; dwI < dwLength; )
//{
mainloop:
cmp ecx, dwLength
jae done
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
jb NotPastEndOfSample1 //{
cmp pfLoopLength, 0 //if (!pfLoopLength)
je done // break;
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
NotPastEndOfSample1: //}
mov esi, eax // dwPosition1 = pfSamplePos;
add eax, ebx // pfSamplePos += pfPitch;
sub edx, 2 // dwIncDelta-=2;
jnz DontIncreaseValues1 //if (!dwIncDelta) {
// Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
// for a bit. All that the TestLVol and TestRVol code does is zero out the working
// volume if the high res volume went below zero (mm0 itself is left unclamped).
paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume; (both dwords)
pxor mm5, mm5 // TestVol = 0;
mov edx, pfPFract // Temp = pfPFract;
pcmpgtd mm5, mm0 // if (TestVol > vfVFract) TestVol = 0xffffffff;
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
pandn mm5, mm0 // TestVol = vfVFract & (~TestVol);
mov pfPFract, edx // pfPFract = Temp;
movq mm2, mm5 // vfVolume = TestVol;
shr edx, 8 // Temp = Temp >> 8;
psrld mm2, 5 // vfVolume = vfVolume >> 5; (15 bit value, see *1)
mov ebx, edx // pfPitch = Temp; (takes effect at the next position add)
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
//}
DontIncreaseValues1:
movd mm6, esi // dwFract1 = dwPosition1; (masked to 12 bits later)
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
inc ecx //dwI++;
// if (dwI >= dwLength) only the first sample of the pair fits: goto StoreOne
cmp ecx, dwLength
jae StoreOne
//if (pfSamplePos >= pfSampleLength)
//{
cmp eax, pfSampleLength
jb NotPastEndOfSample2
// Original if in C was not negated
//if (!pfLoopLength)
cmp pfLoopLength, 0
// no loop: there is no second sample, store the first one only
je StoreOne
//else
//pfSamplePos -= pfLoopLength;
sub eax, pfLoopLength
//}
NotPastEndOfSample2:
//shl esi, 1 // do not shift left since pcWave is array of chars
mov edi, eax // dwPosition2 = pfSamplePos;
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
movd mm7, eax // dwFract2 = pfSamplePos;
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
punpcklwd mm6, mm7 // combine dwFract Values. The two low words of mm6 after the
// unpack are the low words of position 2 and position 1
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff; words = 0, 0, dwFract2, dwFract1
movzx esi, word ptr[esi] // two bytes: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
movd mm3, esi
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
//shl edi, 1 //do not shift left since pcWave is array of chars
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
mov esi, ecx // Temp = dwI; (dwI was already incremented once)
shl esi, 1 // Temp = Temp << 1; (byte offset of a 16-bit buffer entry)
movzx edi, word ptr[edi] // two bytes: pcWave[dwPosition2 + 1], pcWave[dwPosition2]
movd mm6, edi
pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
// low 4 bytes in mm3
punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
add esi, pBuffer // esi = &pBuffer[dwI]; the first sample of the pair is at esi-2
punpcklbw mm7, mm3 // each wave byte becomes the high byte of a word (sample << 8):
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
pmaddwd mm7, mm5 // high dword = lM2 =
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
movq mm3, mm2 // put the volume (both dwords) in mm3
add eax, ebx //pfSamplePos += pfPitch;
packssdw mm3, mm2 // words in mm3
// vfVolume, vfVolume, vfVolume, vfVolume
movd mm5, dword ptr[esi-2] // Load two 16-bit values from buffer
inc ecx // dwI++;
psrad mm7, 12 // shift back down to 16 bits.
packssdw mm7, mm4 // only the two low words matter in the mono case:
// they are lM2 and lM1 (the high words come from mm4 and are never stored)
// The volume multiply and shift are done with this one pmul. Only the low two words
// are of interest in the mono case
pmulhw mm3, mm7 // lM1 *= vfVolume;
// lM2 *= vfVolume;
paddsw mm5, mm3 // Add values to buffer with saturation
movd dword ptr[esi-2], mm5 // Store the two values back into buffer.
// }
jmp mainloop
// Need to write only one.
// Reached after the first sample of a pair when either the output length is
// used up or the position is past the end of a sample that has no loop.
//{
StoreOne:
#if 1
// Linearly interpolate between points and store only one value.
// combine dwFract Values.
// Make mm7 zero for unpacking
//shl esi, 1 // do not shift left since pcWave is array of chars
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
pxor mm7, mm7
// two bytes: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
movzx esi, word ptr[esi]
// Doing the AND that was not done yet for dwFract1
pand mm6, mm4
// Only the two low words of mm5 matter below: the wave words that meet the
// upper pair in the pmaddwd are zero.
psubw mm5, mm6 // low word = 0x1000 - dwFract1
punpcklwd mm5, mm6 // low two words = dwFract1, 0x1000 - dwFract1
// put values of pcWave into MMX registers. They are read into a regular register so
// that the routine does not read past the end of the buffer otherwise, it could read
// directly into the MMX registers.
// (mm7 is still zero here, so this second pxor has no effect.)
pxor mm7, mm7
// low four bytes (mm4 no longer holds dwFractMASK from here on)
movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
// 8 bytes after unpack
punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
// *2 pmadd efficient code.
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
pmaddwd mm7, mm5// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
psrad mm7, 12 // shift back down to 16 bits
movq mm5, mm2 // move volume into mm5
/*
C equivalent of the multiply below (mono, one sample):
lM1 *= vfVolume;
with the shifts folded into the 15 bit volume and the high-word multiply.
*/
// the multiply and shift are done with this one pmul (low word is the result)
pmulhw mm5, mm7
// calculate buffer location.
mov edi, ecx
shl edi, 1
add edi, pBuffer // edi = &pBuffer[dwI]; the sample to update is at edi-2
movd edx, mm5
// pBuffer[dwI-1] += (short) lM1; with manual saturation. The mov
// instructions below leave the flags of the add untouched.
add word ptr[edi-2], dx
jno no_oflowr1
// overflow: assume positive saturation first
mov word ptr[edi-2], 0x7fff
js no_oflowr1
// the wrapped sum was not negative, so the true sum was below -32768
mov word ptr[edi-2], 0x8000
no_oflowr1:
//}
#endif
done:
mov edx, this // get address of class object
//m_vfLastLVolume = vfVolume;
//m_vfLastRVolume = vfVolume;
// need to shift volume back down to 12 bits before storing
psrld mm2, 3
movd [edx]this.m_vfLastLVolume, mm2
movd [edx]this.m_vfLastRVolume, mm2
//m_pfLastPitch = pfPitch;
mov [edx]this.m_pfLastPitch, ebx
//m_pfLastSample = pfSamplePos;
mov [edx]this.m_pfLastSample, eax
// put value back into dwI to be returned. This could just be passed back in eax I think.
mov dwI, ecx
emms // leave MMX state before returning to C++ code
} // ASM block
return (dwI);
}
/*****************************************************************************
* CDigitalAudio::Mix8X()
*****************************************************************************
* Implement a stereo eight-bit mix.
* Heavily optimized for MMX.
*
* Linearly interpolates 8-bit wave data (read through m_pnWave), scales each
* interpolated sample by the left and the right volume and adds the pair with
* saturation into the interleaved (left, right) 16-bit buffer. The main loop
* produces two stereo frames per pass; a single trailing frame is handled at
* the StoreOne label.
*
* pBuffer        - interleaved 16-bit output buffer; samples are added into it.
* dwLength       - maximum number of stereo frames to produce (doubled
*                  internally, because the loop counts 16-bit buffer entries).
* dwDeltaPeriod  - reload value for the counter that paces the volume and
*                  pitch updates. The counter is decremented by 2 per loop
*                  pass and tested for exactly zero, so this presumably has
*                  to be a non-zero even value -- TODO confirm with caller.
* vfDeltaLVolume - added to the high resolution (<< 8) left volume at each update.
* vfDeltaRVolume - added to the high resolution (<< 8) right volume at each update.
* pfDeltaPitch   - added to the high resolution (<< 8) pitch at each update.
* pfSampleLength - position (12 fraction bits) at which the sample ends.
* pfLoopLength   - subtracted from the position when the end is reached;
*                  0 means there is no loop and mixing stops there.
*
* Returns the number of stereo frames that were mixed.
* Writes back m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and
* m_pfLastSample so that a later call can continue from this state.
*/
DWORD CDigitalAudio::Mix8X(short * pBuffer, DWORD dwLength, DWORD dwDeltaPeriod,
VFRACT vfDeltaLVolume, VFRACT vfDeltaRVolume,
PFRACT pfDeltaPitch, PFRACT pfSampleLength, PFRACT pfLoopLength)
{
DWORD dwI; // receives the buffer entry count from ecx at the end
//DWORD dwPosition1, dwPosition2;
//long lM1, lLM1;
//long lM2, lLM2;
DWORD dwIncDelta = dwDeltaPeriod; // seeds edx
//VFRACT dwFract1, dwFract2;
char * pcWave = (char *) m_pnWave; // wave data addressed as bytes
PFRACT pfSamplePos = m_pfLastSample; // continue from the stored position
VFRACT vfLVolume = m_vfLastLVolume;
VFRACT vfRVolume = m_vfLastRVolume;
VFRACT vfLVolume2 = m_vfLastLVolume; // not referenced below in this function
VFRACT vfRVolume2 = m_vfLastRVolume; // not referenced below in this function
PFRACT pfPitch = m_pfLastPitch;
PFRACT pfPFract = pfPitch << 8; // high res pitch, updated by the asm below
dwLength <<= 1; // frames -> 16-bit buffer entries (two per frame)
QWORD dwFractMASK = 0x000000000FFF0FFF; // 0x0FFF in the two low words: keeps the 12 fraction bits
QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
QWORD wordmask = 0x0000FFFF0000FFFF; // keeps the low word of each dword
QWORD vfDeltaLandRVolume; // low dword = vfDeltaLVolume, high dword = vfDeltaRVolume; filled in by the asm
_asm{
// vfLVFract and vfRVFract are in mm0
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
//VFRACT vfRVFract = vfRVolume1 << 8;
movd mm0, vfLVolume
movd mm7, vfRVolume
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
movd mm1, vfDeltaLVolume
movd mm6, vfDeltaRVolume
punpckldq mm1, mm6
// dwI = 0
mov ecx, 0
movq vfDeltaLandRVolume, mm1
movq mm1, dwFractOne // mm1 = 0, 0, 0x1000, 0x1000 for the rest of the routine
movq mm4, dwFractMASK // mm4 = 0, 0, 0x0FFF, 0x0FFF during the main loop
mov eax, pfSamplePos
punpckldq mm0, mm7 // mm0 = vfRVolume (high dword), vfLVolume (low dword)
mov ebx, pfPitch
pslld mm0, 8 // mm0 = high res volumes (<< 8)
mov edx, dwIncDelta
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
// need to be set before first pass.
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
psrld mm2, 5
//for (dwI = 0; dwI < dwLength; )
//{
mainloop:
cmp ecx, dwLength
jae done
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
jb NotPastEndOfSample1 //{
cmp pfLoopLength, 0 //if (!pfLoopLength)
je done // break;
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
NotPastEndOfSample1: //}
mov esi, eax // dwPosition1 = pfSamplePos;
add eax, ebx // pfSamplePos += pfPitch;
sub edx, 2 // dwIncDelta-=2;
jnz DontIncreaseValues1 //if (!dwIncDelta) {
// Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
// for a bit. All that the TestLVol and TestRVol code does is zero out a working
// volume if its high res value went below zero (mm0 itself is left unclamped).
paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
// vfRVFract += vfDeltaRVolume;
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
mov edx, pfPFract // Temp = pfPFract;
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
// TestRVol = vfRVFract & (~TestRVol);
mov pfPFract, edx // pfPFract = Temp;
movq mm2, mm5 // vfLVolume = TestLVol;
// vfRVolume = TestRVol;
shr edx, 8 // Temp = Temp >> 8;
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
// vfRVolume = vfRVolume >> 5;
mov ebx, edx // pfPitch = Temp; (takes effect at the next position add)
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
//}
DontIncreaseValues1:
movd mm6, esi // dwFract1 = dwPosition1; (masked to 12 bits later)
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
add ecx, 2 //dwI += 2;
// if (dwI >= dwLength) only the first frame of the pair fits: goto StoreOne
cmp ecx, dwLength
jae StoreOne
//if (pfSamplePos >= pfSampleLength)
//{
cmp eax, pfSampleLength
jb NotPastEndOfSample2
// Original if in C was not negated
//if (!pfLoopLength)
cmp pfLoopLength, 0
// no loop: there is no second frame, store the first one only
je StoreOne
//else
//pfSamplePos -= pfLoopLength;
sub eax, pfLoopLength
//}
NotPastEndOfSample2:
//shl esi, 1 // do not shift left since pcWave is array of chars
mov edi, eax // dwPosition2 = pfSamplePos;
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
movd mm7, eax // dwFract2 = pfSamplePos;
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
punpcklwd mm6, mm7 // combine dwFract Values. The two low words of mm6 after the
// unpack are the low words of position 2 and position 1
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff; words = 0, 0, dwFract2, dwFract1
movzx esi, word ptr[esi] // two bytes: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
movd mm3, esi
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
//shl edi, 1 // do not shift left since pcWave is array of chars
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
mov esi, ecx // Temp = dwI; (dwI was already advanced by 2)
shl esi, 1 // Temp = Temp << 1; (byte offset of a 16-bit buffer entry)
movzx edi, word ptr[edi] // two bytes: pcWave[dwPosition2 + 1], pcWave[dwPosition2]
movd mm6, edi
pxor mm7, mm7 // zero out mm7 to make 8 bit into 16 bit
// low 4 bytes in mm3
punpcklwd mm3, mm6 // pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
add esi, pBuffer // esi = &pBuffer[dwI]; the first frame of the pair is at esi-4
punpcklbw mm7, mm3 // each wave byte becomes the high byte of a word (sample << 8); bytes in mm7
// pcWave[dwPos2+1], 0, pcWave[dwPos2], 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
pmaddwd mm7, mm5 // high dword = lM2 =
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
movq mm3, mm2 // put left and right volume levels in mm3
add eax, ebx //pfSamplePos += pfPitch;
packssdw mm3, mm2 // words in mm3
// vfRVolume, vfLVolume, vfRVolume, vfLVolume
movq mm5, qword ptr[esi-4] // Load four 16-bit values (two frames) from buffer
add ecx, 2 // dwI += 2;
psrad mm7, 12 // shift back down to 16 bits.
pand mm7, wordmask // combine results to get ready to multiply by left and right
movq mm6, mm7 // volume levels.
pslld mm6, 16 //
por mm7, mm6 // words in mm7
// lM2, lM2, lM1, lM1
// above multiplies and shifts are all done with this one pmul
pmulhw mm3, mm7 // lLM1 *= vfLVolume;
// lM1 *= vfRVolume;
// lLM2 *= vfLVolume;
// lM2 *= vfRVolume;
paddsw mm5, mm3 // Add values to buffer with saturation
movq qword ptr[esi-4], mm5 // Store values back into buffer.
// }
jmp mainloop
// Need to write only one.
// Reached after the first frame of a pair when either the output length is
// used up or the position is past the end of a sample that has no loop.
//{
StoreOne:
#if 1
// Linearly interpolate between points and store only one value.
// combine dwFract Values.
// Make mm7 zero for unpacking
//shl esi, 1 // do not shift left since pcWave is array of chars
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
pxor mm7, mm7
// two bytes: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
movzx esi, word ptr[esi]
// Doing the AND that was not done yet for dwFract1
pand mm6, mm4
// Only the two low words of mm5 matter below: the wave words that meet the
// upper pair in the pmaddwd are zero.
psubw mm5, mm6 // low word = 0x1000 - dwFract1
punpcklwd mm5, mm6 // low two words = dwFract1, 0x1000 - dwFract1
// put values of pcWave into MMX registers. They are read into a regular register so
// that the routine does not read past the end of the buffer otherwise, it could read
// directly into the MMX registers.
// (mm7 is still zero here, so this second pxor has no effect.)
pxor mm7, mm7
// bytes in MMX registers (mm4 no longer holds dwFractMASK from here on)
movd mm4, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
punpcklbw mm7, mm4 // 0, 0, 0, 0, pcWave[dwPos1+1], 0, pcWave[dwPos1], 0
// *2 pmadd efficient code.
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
pmaddwd mm7, mm5// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
psrad mm7, 12 // shift back down to 16 bits
pand mm7, wordmask // combine results to get ready to multiply by left and right
movq mm6, mm7 // volume levels.
pslld mm6, 16 //
por mm7, mm6 // words in mm7
// 0, 0, lM1, lM1
pxor mm6, mm6
movq mm5, mm2 // move volume1 into mm5
// use pack to get the volume values together as words for multiplication.
packssdw mm5, mm6 // words in mm5
// 0, 0, vfRVolume1, vfLVolume1
/*
// Set lLM to be same as lM
lLM1 = lM1;
lLM1 *= vfLVolume1;
lLM1 >>= 5; // Signal bumps up to 15 bits.
lM1 *= vfRVolume1;
lM1 >>= 5;
// Set lLM to be same as lM
lLM2 = lM2;
lLM2 *= vfLVolume2;
lLM2 >>= 5; // Signal bumps up to 15 bits.
lM2 *= vfRVolume2;
lM2 >>= 5;
*/
// above multiplies and shifts are all done with this one pmul
pmulhw mm5, mm7
// calculate buffer location.
mov edi, ecx
shl edi, 1
add edi, pBuffer // edi = &pBuffer[dwI]; the frame to update is at edi-4
/*
Earlier scalar version of the saturating store, kept for reference:
add word ptr[edi-4], si
jno no_oflowl1
// pBuffer[dwI] = 0x7fff;
mov word ptr[edi-4], 0x7fff
js no_oflowl1
//pBuffer[dwI] = (short) 0x8000;
mov word ptr[edi-4], 0x8000
no_oflowl1:
//pBuffer[dwI+1] += (short) lM1;
add word ptr[edi-2], dx
jno no_oflowr1
//pBuffer[dwI+1] = 0x7fff;
mov word ptr[edi-2], 0x7fff
js no_oflowr1
//pBuffer[dwI+1] = (short) 0x8000;
mov word ptr[edi-2], 0x8000
no_oflowr1:
*/
// add the left/right pair into the buffer with saturation
movd mm7, dword ptr[edi-4]
paddsw mm7, mm5
movd dword ptr[edi-4], mm7
//}
#endif
done:
mov edx, this // get address of class object
//m_vfLastLVolume = vfLVolume;
//m_vfLastRVolume = vfRVolume;
// need to shift volume back down to 12 bits before storing
psrld mm2, 3
movd [edx]this.m_vfLastLVolume, mm2
psrlq mm2, 32 // bring the right volume down into the low dword
movd [edx]this.m_vfLastRVolume, mm2
//m_pfLastPitch = pfPitch;
mov [edx]this.m_pfLastPitch, ebx
//m_pfLastSample = pfSamplePos;
mov [edx]this.m_pfLastSample, eax
// put value back into dwI to be returned. This could just be passed back in eax I think.
mov dwI, ecx
emms // leave MMX state before returning to C++ code
} // ASM block
return (dwI >> 1); // buffer entries -> stereo frames
}
/*****************************************************************************
* CDigitalAudio::MixMono16X()
*****************************************************************************
* Implement a mono sixteen-bit mix.
* Heavily optimized for MMX.
*
* Linearly interpolates 16-bit wave data (read through m_pnWave), scales it by
* the current volume and adds it with saturation into the 16-bit buffer.
* The main loop produces two output samples per pass; a single trailing
* sample is handled at the StoreOne label.
*
* pBuffer        - 16-bit output buffer; mixed samples are added into it.
* dwLength       - maximum number of output samples to produce.
* dwDeltaPeriod  - reload value for the counter that paces the volume and
*                  pitch updates. The counter is decremented by 2 per loop
*                  pass and tested for exactly zero, so this presumably has
*                  to be a non-zero even value -- TODO confirm with caller.
* vfDeltaVolume  - added to the high resolution (<< 8) volume at each update.
* pfDeltaPitch   - added to the high resolution (<< 8) pitch at each update.
* pfSampleLength - position (12 fraction bits) at which the sample ends.
* pfLoopLength   - subtracted from the position when the end is reached;
*                  0 means there is no loop and mixing stops there.
*
* Returns the number of output samples that were mixed.
* Writes back m_vfLastLVolume, m_vfLastRVolume, m_pfLastPitch and
* m_pfLastSample so that a later call can continue from this state.
*/
DWORD CDigitalAudio::MixMono16X(short * pBuffer, DWORD dwLength,
DWORD dwDeltaPeriod,VFRACT vfDeltaVolume,
PFRACT pfDeltaPitch,PFRACT pfSampleLength,
PFRACT pfLoopLength)
{
// dwI receives the sample count from ecx at the end; dwIncDelta seeds edx.
DWORD dwI,dwIncDelta = dwDeltaPeriod;
short * pcWave = (short*) m_pnWave; // wave data addressed as 16-bit values
PFRACT pfSamplePos = m_pfLastSample; // continue from the stored position
VFRACT vfVolume = m_vfLastLVolume; // mono: only the left volume is read
PFRACT pfPitch = m_pfLastPitch;
PFRACT pfPFract = pfPitch << 8; // high res pitch, updated by the asm below
VFRACT vfVFract = vfVolume << 8; // Keep high res version around. (The asm keeps its own copy in mm0 and never references this local.)
QWORD dwFractMASK = 0x000000000FFF0FFF; // 0x0FFF in the two low words: keeps the 12 fraction bits
QWORD dwFractOne = 0x0000000010001000; // 0x1000 (1.0 with 12 fraction bits) in the two low words
QWORD wordmask = 0x0000FFFF0000FFFF; // not referenced in this function
QWORD vfDeltaLandRVolume; // vfDeltaVolume in both dwords; filled in by the asm
_asm{
// Register setup. The high resolution volume (vfVolume << 8) lives in mm0,
// duplicated in both dwords; the C equivalent is
//VFRACT vfVFract = vfVolume << 8; // Keep high res version around.
movd mm0, vfVolume
movd mm7, vfVolume
// vfDeltaVolume is duplicated into both dwords of mm1 so that it can be stored in vfDeltaLandRVolume
movd mm1, vfDeltaVolume
movd mm6, vfDeltaVolume
punpckldq mm1, mm6
// dwI = 0
mov ecx, 0
movq vfDeltaLandRVolume, mm1
movq mm1, dwFractOne // mm1 = 0, 0, 0x1000, 0x1000 for the rest of the routine
movq mm4, dwFractMASK // mm4 = 0, 0, 0x0FFF, 0x0FFF for the rest of the routine
mov eax, pfSamplePos
punpckldq mm0, mm7 // mm0 = vfVolume, vfVolume
mov ebx, pfPitch
pslld mm0, 8 // mm0 = high res volume (<< 8)
mov edx, dwIncDelta
movq mm2, mm0 // working volume in mm2 (both dwords)
// need to be set before first pass.
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
psrld mm2, 5
//for (dwI = 0; dwI < dwLength; )
//{
mainloop:
cmp ecx, dwLength
jae done
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
jb NotPastEndOfSample1 //{
cmp pfLoopLength, 0 //if (!pfLoopLength)
je done // break;
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
NotPastEndOfSample1: //}
mov esi, eax // dwPosition1 = pfSamplePos;
add eax, ebx // pfSamplePos += pfPitch;
sub edx, 2 // dwIncDelta-=2;
jnz DontIncreaseValues1 //if (!dwIncDelta) {
// Since edx was used for dwIncDelta and now it is zero, we can use it as a temporary
// for a bit. All that the TestLVol and TestRVol code does is zero out the working
// volume if the high res volume went below zero (mm0 itself is left unclamped).
paddd mm0, vfDeltaLandRVolume // vfVFract += vfDeltaVolume; (both dwords)
pxor mm5, mm5 // TestVol = 0;
mov edx, pfPFract // Temp = pfPFract;
pcmpgtd mm5, mm0 // if (TestVol > vfVFract) TestVol = 0xffffffff;
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
pandn mm5, mm0 // TestVol = vfVFract & (~TestVol);
mov pfPFract, edx // pfPFract = Temp;
movq mm2, mm5 // vfVolume = TestVol;
shr edx, 8 // Temp = Temp >> 8;
psrld mm2, 5 // vfVolume = vfVolume >> 5; (15 bit value, see *1)
mov ebx, edx // pfPitch = Temp; (takes effect at the next position add)
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
//}
DontIncreaseValues1:
movd mm6, esi // dwFract1 = dwPosition1; (masked to 12 bits later)
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
inc ecx //dwI++;
// if (dwI >= dwLength) only the first sample of the pair fits: goto StoreOne
cmp ecx, dwLength
jae StoreOne
//if (pfSamplePos >= pfSampleLength)
//{
cmp eax, pfSampleLength
jb NotPastEndOfSample2
// Original if in C was not negated
//if (!pfLoopLength)
cmp pfLoopLength, 0
// no loop: there is no second sample, store the first one only
je StoreOne
//else
//pfSamplePos -= pfLoopLength;
sub eax, pfLoopLength
//}
NotPastEndOfSample2:
shl esi, 1 // shift left since pcWave is array of shorts
mov edi, eax // dwPosition2 = pfSamplePos;
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
movd mm7, eax // dwFract2 = pfSamplePos;
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
punpcklwd mm6, mm7 // combine dwFract Values. The two low words of mm6 after the
// unpack are the low words of position 2 and position 1
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff; words = 0, 0, dwFract2, dwFract1
movd mm7, dword ptr[esi] // two shorts: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
shl edi, 1 // shift left since pcWave is array of shorts
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
mov esi, ecx // Temp = dwI; (dwI was already incremented once)
shl esi, 1 // Temp = Temp << 1; (byte offset of a 16-bit buffer entry)
movq mm3, mm2 // put the volume (both dwords) in mm3
movd mm6, dword ptr[edi] // two shorts: pcWave[dwPosition2 + 1], pcWave[dwPosition2]
packssdw mm3, mm2 // words in mm3
// vfVolume, vfVolume, vfVolume, vfVolume
add esi, pBuffer // esi = &pBuffer[dwI]; the first sample of the pair is at esi-2
punpckldq mm7, mm6 // words in mm7
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
pmaddwd mm7, mm5 // high dword = lM2 =
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
add eax, ebx //pfSamplePos += pfPitch;
movd mm5, dword ptr[esi-2] // Load two 16-bit values from buffer
inc ecx // dwI++;
psrad mm7, 12 // shift back down to 16 bits.
packssdw mm7, mm4 // only the two low words matter in the mono case:
// they are lM2 and lM1 (the high words come from mm4 and are never stored)
// The volume multiply and shift are done with this one pmul. Only the low two words
// are of interest in the mono case
pmulhw mm3, mm7 // lM1 *= vfVolume;
// lM2 *= vfVolume;
paddsw mm5, mm3 // Add values to buffer with saturation
movd dword ptr[esi-2], mm5 // Store the two values back into buffer.
// }
jmp mainloop
// Need to write only one.
// Reached after the first sample of a pair when either the output length is
// used up or the position is past the end of a sample that has no loop.
//{
StoreOne:
#if 1
// Linearly interpolate between points and store only one value.
// combine dwFract Values.
shl esi, 1 // shift left since pcWave is array of shorts
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
pxor mm7, mm7 // (mm7 is overwritten by the movd below before it is used)
// two shorts: pcWave[dwPosition1 + 1], pcWave[dwPosition1]
mov esi, dword ptr[esi]
// Doing the AND that was not done yet for dwFract1
pand mm6, mm4
// Only the two low words of mm5 matter below: the wave words that meet the
// upper pair in the pmaddwd are zero.
psubw mm5, mm6 // low word = 0x1000 - dwFract1
punpcklwd mm5, mm6 // low two words = dwFract1, 0x1000 - dwFract1
// put values of pcWave into MMX registers. They are read into a regular register so
// that the routine does not read past the end of the buffer otherwise, it could read
// directly into the MMX registers.
// words in MMX registers
movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
// *2 pmadd efficient code.
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
pmaddwd mm7, mm5// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
psrad mm7, 12 // shift back down to 16 bits
movq mm5, mm2 // move volume into mm5
/*
C equivalent of the multiply below (mono, one sample):
lM1 *= vfVolume;
with the shifts folded into the 15 bit volume and the high-word multiply.
*/
// the multiply and shift are done with this one pmul (low word is the result)
pmulhw mm5, mm7
// calculate buffer location.
mov edi, ecx
shl edi, 1
add edi, pBuffer // edi = &pBuffer[dwI]; the sample to update is at edi-2
movd edx, mm5
// pBuffer[dwI-1] += (short) lM1; with manual saturation. The mov
// instructions below leave the flags of the add untouched.
add word ptr[edi-2], dx
jno no_oflowr1
// overflow: assume positive saturation first
mov word ptr[edi-2], 0x7fff
js no_oflowr1
// the wrapped sum was not negative, so the true sum was below -32768
mov word ptr[edi-2], 0x8000
no_oflowr1:
//}
#endif
done:
mov edx, this // get address of class object
//m_vfLastLVolume = vfVolume;
//m_vfLastRVolume = vfVolume;
// need to shift volume back down to 12 bits before storing
psrld mm2, 3
movd [edx]this.m_vfLastLVolume, mm2
movd [edx]this.m_vfLastRVolume, mm2
//m_pfLastPitch = pfPitch;
mov [edx]this.m_pfLastPitch, ebx
//m_pfLastSample = pfSamplePos;
mov [edx]this.m_pfLastSample, eax
// put value back into dwI to be returned. This could just be passed back in eax I think.
mov dwI, ecx
emms // leave MMX state before returning to C++ code
} // ASM block
return (dwI);
}
/*****************************************************************************
* CDigitalAudio::Mix16X()
*****************************************************************************
* Implement a stereo sixteen-bit mix.
* Heavily optimized for MMX.
*/
DWORD CDigitalAudio::Mix16X(short * pBuffer, DWORD dwLength,
DWORD dwDeltaPeriod, VFRACT vfDeltaLVolume,
VFRACT vfDeltaRVolume,PFRACT pfDeltaPitch,
PFRACT pfSampleLength,PFRACT pfLoopLength)
{
DWORD dwI,dwIncDelta = dwDeltaPeriod;
//DWORD dwPosition1, dwPosition2;
//long lM1, lLM1;
//long lM2, lLM2;
//VFRACT dwFract1, dwFract2;
short * pcWave = (short *) m_pnWave;
PFRACT pfSamplePos = m_pfLastSample;
VFRACT vfLVolume = m_vfLastLVolume;
VFRACT vfRVolume = m_vfLastRVolume;
VFRACT vfLVolume2 = m_vfLastLVolume;
VFRACT vfRVolume2 = m_vfLastRVolume;
PFRACT pfPitch = m_pfLastPitch;
PFRACT pfPFract = pfPitch << 8;
dwLength <<= 1;
QWORD dwFractMASK = 0x000000000FFF0FFF;
QWORD dwFractOne = 0x0000000010001000;
QWORD wordmask = 0x0000FFFF0000FFFF;
QWORD vfDeltaLandRVolume;
_asm{
// vfLVFract and vfRVFract are in mm0
//VFRACT vfLVFract = vfLVolume1 << 8; // Keep high res version around.
//VFRACT vfRVFract = vfRVolume1 << 8;
movd mm0, vfLVolume
movd mm7, vfRVolume
// vfDeltaLVolume and vfDeltaRVolume are put in mm1 so that they can be stored in vfDeltaLandRVolume
movd mm1, vfDeltaLVolume
movd mm6, vfDeltaRVolume
punpckldq mm1, mm6
// dwI = 0
mov ecx, 0
movq vfDeltaLandRVolume, mm1
movq mm1, dwFractOne
movq mm4, dwFractMASK
mov eax, pfSamplePos
punpckldq mm0, mm7
mov ebx, pfPitch
pslld mm0, 8
mov edx, dwIncDelta
movq mm2, mm0 // vfLVolume and vfRVolume in mm2
// need to be set before first pass.
// *1 I shift by 5 so that volume is a 15 bit value instead of a 12 bit value
psrld mm2, 5
//for (dwI = 0; dwI < dwLength; )
//{
mainloop:
cmp ecx, dwLength
jae done
cmp eax, pfSampleLength //if (pfSamplePos >= pfSampleLength)
jb NotPastEndOfSample1 //{
cmp pfLoopLength, 0 //if (!pfLoopLength)
je done // break;
sub eax, pfLoopLength // else pfSamplePos -= pfLoopLength;
NotPastEndOfSample1: //}
mov esi, eax // dwPosition1 = pfSamplePos;
add eax, ebx // pfSamplePos += pfPitch;
sub edx, 2 // dwIncDelta-=2;
jnz DontIncreaseValues1 //if (!dwIncDelta) {
// Since edx was use for dwIncDelta and now its zero, we can use if for a temporary
// for a bit. All code that TestLVol and TestRVol is doing is zeroing out the volume
// if it goes below zero.
paddd mm0, vfDeltaLandRVolume // vfLVFract += vfDeltaLVolume;
// vfRVFract += vfDeltaRVolume;
pxor mm5, mm5 // TestLVol = 0; TestRVol = 0;
mov edx, pfPFract // Temp = pfPFract;
pcmpgtd mm5, mm0 // if (TestLVol > vfLVFract) TestLVol = 0xffffffff;
// if (TestRVol > vfRVFract) TestRVol = 0xffffffff;
add edx, pfDeltaPitch // Temp += pfDeltaPitch;
pandn mm5, mm0 // TestLVol = vfLVFract & (~TestLVol);
// TestRVol = vfRVFract & (~TestRVol);
mov pfPFract, edx // pfPFract = Temp;
movq mm2, mm5 // vfLVolume = TestLVol;
// vfRVolume = TestRVol;
shr edx, 8 // Temp = Temp >> 8;
psrld mm2, 5 // vfLVolume = vfLVolume >> 5;
// vfRVolume = vfRVolume >> 5;
mov ebx, edx // pfPitch = Temp;
mov edx, dwDeltaPeriod //dwIncDelta = dwDeltaPeriod;
//}
DontIncreaseValues1:
movd mm6, esi // dwFract1 = dwPosition1;
movq mm5, mm1 // words in mm5 = 0, 0, 0x1000, 0x1000
shr esi, 12 // dwPosition1 = dwPosition1 >> 12;
add ecx, 2 //dwI += 2;
// if ( dwI < dwLength) break;
cmp ecx, dwLength
jae StoreOne
//if (pfSamplePos >= pfSampleLength)
//{
cmp eax, pfSampleLength
jb NotPastEndOfSample2
// Original if in C was not negated
//if (!pfLoopLength)
cmp pfLoopLength, 0
//break;
je StoreOne
//else
//pfSamplePos -= pfLoopLength;
sub eax, pfLoopLength
//}
NotPastEndOfSample2:
shl esi, 1 // shift left since pcWave is array of shorts
mov edi, eax // dwPosition2 = pfSamplePos;
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
movd mm7, eax // dwFract2 = pfSamplePos;
shr edi, 12 // dwPosition2 = dwPosition2 >> 12;
punpcklwd mm6, mm7 // combine dwFract Values. Words in mm6 after unpack are
// 0, 0, dwFract2, dwFract1
pand mm6, mm4 // dwFract2 &= 0xfff; dwFract1 &= 0xfff;
movd mm7, dword ptr[esi] //lLM1 = pcWave[dwPosition1];
psubw mm5, mm6 // 0, 0, 0x1000 - dwFract2, 0x1000 - dwFract1
shl edi, 1 // shift left since pcWave is array of shorts
punpcklwd mm5, mm6 // dwFract2, 0x1000 - dwFract2, dwFract1, 0x1000 - dwFract1
add edi, pcWave // Put address of pcWave[dwPosition2] in edi
mov esi, ecx // Temp = dWI;
shl esi, 1 // Temp = Temp << 1;
movq mm3, mm2 // put left and right volume levels in mm3
movd mm6, dword ptr[edi] //lLM2 = pcWave[dwPosition2];
packssdw mm3, mm2 // words in mm7
// vfRVolume2, vfLVolume2, vfRVolume1, vfLVolume1
add esi, pBuffer //
punpckldq mm7, mm6 // low four bytes bytes in
// pcWave[dwPos2+1], pcWave[dwPos2], pcWave[dwPos1+1], pcWave[dwPos1]
pmaddwd mm7, mm5 // high dword = lM2 =
//(pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2))
// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
add eax, ebx //pfSamplePos += pfPitch;
movq mm5, qword ptr[esi-4] // Load values from buffer
add ecx, 2 // dwI += 2;
psrad mm7, 12 // shift back down to 16 bits.
pand mm7, wordmask // combine results to get ready to multiply by left and right
movq mm6, mm7 // volume levels.
pslld mm6, 16 //
por mm7, mm6 // words in mm7
// lM2, lM2, lM1, lM1
// above multiplies and shifts are all done with this one pmul
pmulhw mm3, mm7 // lLM1 *= vfLVolume;
// lM1 *= vfRVolume;
// lLM2 *= vfLVolume;
// lM2 *= vfRVolume;
paddsw mm5, mm3 // Add values to buffer with saturation
movq qword ptr[esi-4], mm5 // Store values back into buffer.
// }
jmp mainloop
// Need to write only one.
//if (dwI < dwLength)
//{
StoreOne:
#if 1
// Linearly interpolate between points and store only one value.
// combine dwFract Values.
// Make mm7 zero for unpacking
shl esi, 1 // shift left since pcWave is array of shorts
add esi, pcWave // Put address of pcWave[dwPosition1] in esi
pxor mm7, mm7
//lLM1 = pcWave[dwPosition1];
mov esi, dword ptr[esi]
// Doing AND that was not done for dwFract1 and dwFract2
pand mm6, mm4
// words in MMX register after operation is complete.
psubw mm5, mm6 // 0, 0, 0x1000 - 0, 0x1000 - dwFract1
punpcklwd mm5, mm6 // 0 , 0x1000 - 0, dwFract1, 0x1000 - dwFract1
// put values of pcWave into MMX registers. They are read into a regular register so
// that the routine does not read past the end of the buffer otherwise, it could read
// directly into the MMX registers.
// words in MMX registers
movd mm7, esi // 0, 0, pcWave[dwPos1+1], pcWave[dwPos1]
// *2 pmadd efficent code.
//lM2 = (pcWave[dwPosition2 + 1] * dwFract2 + pcWave[dwPosition2]*(0x1000-dwFract2)) >> 12;
//lM1 = (pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1)) >> 12;
pmaddwd mm7, mm5// low dword = lM1 =
//(pcWave[dwPosition1 + 1] * dwFract1 + pcWave[dwPosition1]*(0x1000-dwFract1))
psrad mm7, 12 // shift back down to 16 bits
pand mm7, wordmask // combine results to get ready to multiply by left and right
movq mm6, mm7 // volume levels.
pslld mm6, 16 //
por mm7, mm6 // words in mm7
// lM2, lM2, lM1, lM1
pxor mm6, mm6
movq mm5, mm2 // move volume1 into mm5
// use pack to get 4 volume values together for multiplication.
packssdw mm5, mm6 // words in mm7
// 0, 0, vfRVolume1, vfLVolume1
/*
// Set lLM to be same as lM
lLM1 = lM1;
lLM1 *= vfLVolume1;
lLM1 >>= 5; // Signal bumps up to 15 bits.
lM1 *= vfRVolume1;
lM1 >>= 5;
// Set lLM to be same as lM
lLM2 = lM2;
lLM2 *= vfLVolume2;
lLM2 >>= 5; // Signal bumps up to 15 bits.
lM2 *= vfRVolume2;
lM2 >>= 5;
*/
// above multiplies and shifts are all done with this one pmul
pmulhw mm5, mm7
// calculate buffer location.
mov edi, ecx
shl edi, 1
add edi, pBuffer
/*
add word ptr[edi-4], si
jno no_oflowl1
// pBuffer[dwI] = 0x7fff;
mov word ptr[edi-4], 0x7fff
js no_oflowl1
//pBuffer[dwI] = (short) 0x8000;
mov word ptr[edi-4], 0x8000
no_oflowl1:
//pBuffer[dwI+1] += (short) lM1;
add word ptr[edi-2], dx
jno no_oflowr1
//pBuffer[dwI+1] = 0x7fff;
mov word ptr[edi-2], 0x7fff
js no_oflowr1
//pBuffer[dwI+1] = (short) 0x8000;
mov word ptr[edi-2], 0x8000
no_oflowr1:
*/
movd mm7, dword ptr[edi-4]
paddsw mm7, mm5
movd dword ptr[edi-4], mm7
//}
#endif
done:
mov edx, this // get address of class object
//m_vfLastLVolume = vfLVolume;
//m_vfLastRVolume = vfRVolume;
// need to shift volume back down to 12 bits before storing
psrld mm2, 3
movd [edx]this.m_vfLastLVolume, mm2
psrlq mm2, 32
movd [edx]this.m_vfLastRVolume, mm2
//m_pfLastPitch = pfPitch;
mov [edx]this.m_pfLastPitch, ebx
//m_pfLastSample = pfSamplePos;
mov [edx]this.m_pfLastSample, eax
// put value back into dwI to be returned. This could just be passed back in eax I think.
mov dwI, ecx
emms
} // ASM block
return (dwI >> 1);
}
/*****************************************************************************
 * MMXDisabled()
 *****************************************************************************
 * Look up the "MMXDisabled" DWORD under Software\Microsoft\DirectMusic.
 *
 * Returns the stored value cast to BOOL (non-zero asks the caller to ignore
 * MMX).  Returns FALSE when GetRegValueDword() reports failure, so MMX stays
 * enabled by default.
 */
static BOOL MMXDisabled()
{
    ULONG ulMmxDisabled;

    if (GetRegValueDword(TEXT("Software\\Microsoft\\DirectMusic"),
                         TEXT("MMXDisabled"),
                         &ulMmxDisabled))
    {
        // Lookup succeeded: pass the stored value through unchanged.
        return (BOOL)ulMmxDisabled;
    }

    // Lookup failed (presumably the value is absent): do not disable MMX.
    return FALSE;
}
// CPU_ID places the two raw bytes 0x0F 0xA2 into the instruction stream of an
// _asm block.  Those bytes are the encoding of the x86 CPUID instruction.
// NOTE(review): presumably emitted by hand because the assembler in use did
// not accept the CPUID mnemonic -- confirm before replacing.
// Because the instruction is emitted as opaque bytes, code using this macro
// must account for the registers CPUID writes (eax, ebx, ecx, edx) itself.
#define CPU_ID _asm _emit 0x0f _asm _emit 0xa2
/*****************************************************************************
 * MultiMediaInstructionsSupported()
 *****************************************************************************
 * Report whether the MMX mix engines may be used on this machine.
 *
 * Returns 0 when MMXDisabled() is non-zero, when the EFLAGS ID bit cannot be
 * toggled (no CPUID instruction), or when CPUID function 1 does not report
 * the MMX feature bit.  Returns 1 otherwise.
 */
BOOL MultiMediaInstructionsSupported()
{
    BOOL fMmxPresent = 0;

    // The registry override wins: answer "no MMX" without probing the CPU.
    if (MMXDisabled())
    {
        return (fMmxPresent);
    }

    _asm
    {
        // ---- Step 1: is CPUID available? --------------------------------
        // Try to flip EFLAGS bit 21 (0x00200000, the ID flag).  If the bit
        // does not change, the processor has no CPUID instruction.
        pushfd
        pop     eax                     // eax = current EFLAGS
        mov     ecx, eax                // ecx = untouched copy for comparison
        xor     eax, 0x00200000L        // flip the ID bit
        push    eax
        popfd                           // attempt to load the modified EFLAGS
        pushfd
        pop     eax                     // eax = EFLAGS as actually stored
        xor     eax, ecx                // eax = bits that changed
        jz      StoreResult             // nothing changed: eax is already 0

        // ---- Step 2: ask CPUID for the feature flags --------------------
        mov     eax, 1                  // function 1: family/model/stepping/features
        push    ebx                     // CPUID writes ebx; keep the caller's value
        CPU_ID
        pop     ebx
        xor     eax, eax                // start from "not supported"
        test    edx, 0x00800000L        // feature flags bit 23 = MMX technology
        jz      StoreResult             // bit clear: leave eax at 0
        inc     eax                     // bit set: eax = 1

    StoreResult:
        mov     fMmxPresent, eax
    }

    return (fMmxPresent);
}