/*********************************************************************
Silence.Cpp - Code for detecting silence on an incoming audio stream

begun 5/14/94 by Mike Rozak
Modified 12/10/96 by John Merrill to fix up alignment problems
*/

#include "stdafx.h"
// NOTE(review): the header name of the system #include that stood here was
// lost from the source text.  <stdlib.h> and <string.h> declare the
// malloc/free/memset calls this file makes - confirm against the project.
#include <stdlib.h>
#include <string.h>
#include "silence.h"

// temporary
#pragma warning(disable: 4100 4244)

/*********************************************************************
LowPassFilter - Low-pass filters 16-bit mono PCM data from one buffer
   into another by keeping a running average over a short window.

inputs
   short    *lpSrc - Source buffer
   DWORD    dwNumSamples - Number of samples in the source buffer
   short    *lpDst - Destination buffer. This is filled in with the
               low-passed version of the source, lagging it by about one
               window. This must be as large as lpSrc.
   short    *psMax - Filled in with the maximum filtered value.
               If NULL then nothing is copied.
   short    *psMin - Filled in with the minimum filtered value.
               If NULL then nothing is copied.
   short    *psAvg - Filled in with the average filtered value.
               If NULL then nothing is copied.
   DWORD    dwSamplesPerSec - Sampling rate. Rates below 13000 use an
               8-sample window, every other rate a 16-sample window.
returns
   DWORD - Number of samples written to lpDst. This is dwNumSamples minus
            the window size, or 0 when the source holds no more than one
            window; in that case lpDst and the psMax/psMin/psAvg outputs
            are left untouched.
*/
DWORD LowPassFilter (short *lpSrc, DWORD dwNumSamples, short *lpDst,
   short *psMax, short *psMin, short *psAvg, DWORD dwSamplesPerSec)
{
   SPDBG_FUNC( "LowPassFilter" );

   // The window is a power of two so that averaging is a right shift.
   // These used to be function-local #defines that leaked to file scope.
   const int   nShift   = (dwSamplesPerSec < 13000) ? 3 : 4;
   const DWORD dwWindow = (DWORD) 1 << nShift;

   DWORD i;
   long  lSum;
   short sSum, sMax, sMin;

   // Need one full window to prime the average plus at least one more
   // sample to produce output.
   if (dwNumSamples < dwWindow + 1)
      return 0;

   // Prime the running average with the first window of samples.
   lSum = 0;
   for (i = 0; i < dwWindow; i++)
      lSum += lpSrc[i];
   sSum = (short) (lSum >> nShift);

   // Slide the window over the remaining samples: drop the scaled sample
   // leaving the window, add the scaled sample entering it.
   const short *pLag   = lpSrc;
   const short *pLead  = lpSrc + dwWindow;
   const DWORD dwOut   = dwNumSamples - dwWindow;

   lSum = 0;      // now accumulates the filtered output for the average
   sMax = -32768;
   sMin = 32767;
   for (i = 0; i < dwOut; i++) {
      sSum = (short) (sSum - (pLag[i] >> nShift) + (pLead[i] >> nShift));
      lpDst[i] = sSum;
      lSum += sSum;
      if (sSum > sMax)
         sMax = sSum;
      if (sSum < sMin)
         sMin = sSum;
   }

   // Report the statistics the caller asked for.
   if (psMax)
      *psMax = sMax;
   if (psMin)
      *psMin = sMin;
   if (psAvg && i)
      *psAvg = (short) (lSum / (long) i);

   return i;
}


/*********************************************************************
QuantSamples - Quantizes the samples to +1, 0, or -1 (in place),
   depending on how each value compares with the thresholds:
      > sPositive then +1
      < sNegative then -1
      else 0

inputs
   short    *pSamples - Samples, overwritten with the quantized values
   DWORD    dwNumSamples - Number of samples
   short    sPositive - Positive threshold
   short    sNegative - Negative threshold
returns
   none
*/
void QuantSamples (short *pSamples, DWORD dwNumSamples,
   short sPositive, short sNegative)
{
   SPDBG_FUNC( "QuantSamples" );

   for (DWORD i = 0; i < dwNumSamples; i++) {
      const short sValue = pSamples[i];

      if (sValue > sPositive)
         pSamples[i] = 1;
      else if (sValue < sNegative)
         pSamples[i] = -1;
      else
         pSamples[i] = 0;
   }
}


/*********************************************************************
FindZC - This searches through the samples for the first zero crossing.
   The returned point will have its previous sample at <= 0, and the new
   one at >0.
inputs short *pSamples - Samples; DWORD dwNumSamples - Number of samples returns DWORD - first sampe number which is positive, or 0 if cant find */ DWORD FindZC (short *pSamples, DWORD dwNumSamples) { SPDBG_FUNC( "FindZC" ); DWORD i; for (i = 1; i < dwNumSamples; i++) if ((pSamples[i] > 0) && (pSamples[i-1] <= 0)) return i; // else cant find return 0; } /********************************************************************* CompareSegments - This compares two wave segments and sees how much alike they are, returning a confidence that they are the same. inputs short *pA - Samples. This assumes that the samples are -1, 0, or +1. short *pB - Samples for B. Should be -1, 0, or +1 DWORD dwNumSamples - Number of samples in each of them returns WORD - Confidence from 0 to 0xffff (highest confidence) Notes about the algo: Each sample will score a "similarity point" for like signs, or if one of the values is a 0. */ WORD CompareSegments (short *pA, short *pB, DWORD dwNumSamples) { SPDBG_FUNC( "CompareSegments" ); DWORD dwSimilar = 0; DWORD dwLeft; for (dwLeft = dwNumSamples; dwLeft; pA++, pB++, dwLeft--) if ((*pA == *pB) || (*pA == 0) || (*pB == 0)) dwSimilar++; return (WORD) ((dwSimilar * 0xffff) / dwNumSamples); } /********************************************************************* FindMostLikelyWaveLen - This Searches through wave data and finds the most likeley wavelength for voiced audio. it returns a condifence score from 0 to ffff (ffff is 100% positive). inputs short *pSamples - Samples DWORD dwNumSamples - Number of samples DWORD dwMinWaveLen - Minimum accepatble wavelength DWORD dwMaxWaveLen - Maximum acceptable wavelength WORD *pwConfidence - Filled in with confidence rating. returns DWORD - Wavelength found. 
0 if can't deteermine anything */ DWORD FindMostLikelyWaveLen (short *pSamples, DWORD dwNumSamples, DWORD dwMinWaveLen, DWORD dwMaxWaveLen, WORD *pwConfidence) { SPDBG_FUNC( "FindMostLikelyWaveLen" ); #define NUMCOMP (3) DWORD dwFirstZC, i; DWORD dwBestWaveLen; WORD wBestConfidence; DWORD dwCurZC, dwCurWaveLen, dwTemp; WORD wConf, wTemp; // Step one, find the first zero crossing dwFirstZC = FindZC (pSamples, dwNumSamples); if (!dwFirstZC) return 0; // error // Start at a minimum-wavelength away and start finding a wave // which repeats three times and compares well. dwBestWaveLen = 0; // best wavelength found so far wBestConfidence = 0; // confidence of the best wavelength dwCurWaveLen = dwMinWaveLen; while (dwCurWaveLen <= dwMaxWaveLen) { // Try the first comparison dwCurZC = dwFirstZC + dwCurWaveLen; if (dwCurZC >= dwNumSamples) break; // no more samples left // find first zero crossing from the current wavelen dwTemp = FindZC (pSamples + dwCurZC, dwNumSamples - dwCurZC); if (!dwTemp) break; // no more samples left dwCurZC += dwTemp; dwCurWaveLen += dwTemp; // Make sure that we have three wavelength's worth if ((dwFirstZC + (NUMCOMP+1)*dwCurWaveLen) >= dwNumSamples) break; // cant compare this // Do two confidence tests and multiply them toegther to // get the confidence for this wavelength wConf = 0xffff; for (i = 0; i < NUMCOMP; i++) { wTemp = CompareSegments (pSamples + dwFirstZC /* + i * dwCurWaveLen */, pSamples + (dwFirstZC + (i+1) * dwCurWaveLen), dwCurWaveLen); wConf = (WORD) (((DWORD) wConf * (DWORD) wTemp) >> 16); }; // If we're more confident about this one than others then use it if (wConf >= wBestConfidence) { wBestConfidence = wConf; dwBestWaveLen = dwCurWaveLen; }; // Up the current wavelength just a tad dwCurWaveLen++; }; *pwConfidence = wBestConfidence; return dwBestWaveLen; } /********************************************************************* IsSegmentVoiced - This detects if the segment if voiced or not. 
inputs short *pSamples - Sample data DWORD dwNumSamples - number of samples DWORD dwSamplesPerSec - Number of sample sper second WORD wMinConfidence - Minimum condifence returns BOOL - TRUE if its definately voiced, FALSE if not or cant tell */ BOOL CSilence::IsSegmentVoiced (short *pSamples, DWORD dwNumSamples, DWORD dwSamplesPerSec, WORD wMinConfidence, short *asFiltered) { SPDBG_FUNC( "CSilence::IsSegmentVoiced" ); //#define FILTERNUM (1024) // max # samples i nthe filter //#define MAXVOICEHZ (300) // maximum voicce pitchm in hz //#define MINVOICEHZ (50) // minimum voice pitch in hz // #define MINCONFIDENCE (0x6000) // minimum confidence // This means that 70% of the samples line up from one wavelength // to another DWORD dwNumFilter; //short asFiltered[FILTERNUM]; short sMax, sMin, sAvg; DWORD dwWaveLen; WORD wConfidence; short sPositive, sNegative; // Filter it first so we just get the voiced audio range if (dwNumSamples > FILTERNUM) dwNumSamples = FILTERNUM; dwNumFilter = LowPassFilter (pSamples, dwNumSamples, asFiltered, &sMax, &sMin, &sAvg, m_dwSamplesPerSec); // Truncate the wave samples to +1, 0, -1 sPositive = sAvg; sNegative = sAvg; QuantSamples (asFiltered, dwNumFilter, sPositive, sNegative); // look through the voiced wavelengths for a frequency dwWaveLen = FindMostLikelyWaveLen (asFiltered, dwNumFilter, dwSamplesPerSec / m_dwHighFreq, dwSamplesPerSec / MINVOICEHZ, &wConfidence); return (dwWaveLen && (wConfidence >= wMinConfidence)); } /********************************************************************* TrimMaxAmp - This extracts the maximum amplitude range of the wave file segment. 
inputs short * lpS - samples to look through WORD dwNum - number of samples returns WORD - maximum amplitude range */ WORD NEAR PASCAL TrimMaxAmp (short * lpS, DWORD dwNum) { SPDBG_FUNC( "TrimMaxAmp" ); DWORD i; short sMin, sMax, sTemp; sMin = 32767; sMax = (short) -32768; for (i = dwNum; i; i--) { sTemp = *(lpS++); if (sTemp < sMin) sMin = sTemp; if (sTemp > sMax) sMax = sTemp; }; // If we're clipping at all then claim that we've maxed out. // Some sound cards have bad DC offsets if ((sMax >= 0x7f00) || (sMin <= -0x7f00)) return 0xffff; return (WORD) (sMax - sMin); } /******************************************************************** TrimMaxAmpDelta - This extracts the maximum amplitude range and calculates the maximum delta of the wave file segment. inputs PBLOCKCHAR pBlockChar - Pointer to a block characteristic structure which is filled in. short * lpS - deltas to look through WORD dwNum - number of samples returns nothing */ void TrimMaxAmpDelta(PBLOCKCHAR pBlockChar, short *lpS, DWORD dwNum) { SPDBG_FUNC( "TrimMaxAmpDelta" ); DWORD i; WORD wMax = 0; WORD wTemp; short sMin, sMax, sCur, sLast; // BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta sLast = sMin = sMax = *(lpS++); for (i = dwNum - 1; i; i--, sLast = sCur) { sCur = *(lpS++); // TrimMaxAmp if (sCur < sMin) sMin = sCur; if (sCur > sMax) sMax = sCur; // TrimMaxDelta wTemp = sCur > sLast ? (WORD) (sCur - sLast) : (WORD) (sLast - sCur); if (wTemp > wMax) wMax = wTemp; } // If we're clipping at all then claim that we've maxed out. // Some sound cards have bad DC offsets pBlockChar->wMaxLevel = ((sMax >= 0x7F00) || (sMin <= -0x7F00)) ? 0xFFFF : (WORD) (sMax - sMin); pBlockChar->wMaxDelta = wMax; } /* End of TrimMaxAmpDelta() */ /********************************************************************* GetBlockChar - This gets the characteristics of a block of audio. This characteristics can then be used to determine if the block is silent or not. 
inputs short *lpS - sample data DWORD dwNum - number of samples PBLOCKCHAR pBlockChar - Pointer to a block characteristic structure which is filled in. BOOL fTestVoiced - Voicce testing will only be done if this is TTRUE (in order to save processor). returns none */ void GetBlockChar(short *lpS, DWORD dwNum, PBLOCKCHAR pBlockChar, BOOL fTestVoiced) { SPDBG_FUNC( "GetBlockChar" ); // BUGFIX: 4303 Merge TrimMaxAmp and TrimMaxDelta TrimMaxAmpDelta(pBlockChar, lpS, dwNum); pBlockChar->bIsVoiced = pBlockChar->bHighLevel = pBlockChar->bHighDelta = SIL_UNKNOWN; } /********************************************************************* IsBlockSound - This detects whether the block is silent or not. inputs PBLOCKCHAR pBlockInQuestion - Block in question. This has the bHighLevel and bHighDelta flags modified PBLOCKCHAR pBlockSilence - Silent block BOOL fInUtterance - TRUE if we're in an utterance (which means be more sensative), FALSE if we're not returns BOOL - TTRUE if has sound, FALSE if it is silent */ BOOL IsBlockSound (PBLOCKCHAR pBlockInQuestion, PBLOCKCHAR pBlockSilence, BOOL fInUtterance) { SPDBG_FUNC( "IsBlockSound" ); #ifdef SOFTEND // Use so that catches a soft ending to phrases #define SENSINV_THRESHHOLD_LEVEL(x) (((x)/4)*3) #define SENSINV_THRESHHOLD_DELTA(x) (((x)/4)*3) #else #define SENSINV_THRESHHOLD_LEVEL(x) ((x)/2) #define SENSINV_THRESHHOLD_DELTA(x) ((x)/2) #endif #define NORMINV_THRESHHOLD_LEVEL(x) ((x)/2) #define NORMINV_THRESHHOLD_DELTA(x) ((x)/2) if (fInUtterance) { pBlockInQuestion->bHighLevel = SENSINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel; pBlockInQuestion->bHighDelta = SENSINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta; } else { pBlockInQuestion->bHighLevel = NORMINV_THRESHHOLD_LEVEL(pBlockInQuestion->wMaxLevel) >= pBlockSilence->wMaxLevel; pBlockInQuestion->bHighDelta = NORMINV_THRESHHOLD_DELTA(pBlockInQuestion->wMaxDelta) >= pBlockSilence->wMaxDelta; }; return 
pBlockInQuestion->bHighLevel || pBlockInQuestion->bHighDelta; } /********************************************************************* ReEvaluateSilence - This takes the values used for silence and re-evaluates them based upon new data which indicates what silence is. It automatically adjusts to the noise level in the room over a few seconds. NOTE: This should not be called when an utterance is happening, or when it might be starting. inputs PBLOCKCHAR pSilence - This is the silence block, and should start out with values in it. It will be modified so to incorporate the new silence information. PBLOCKCHAR pNew - New block which is known to be silence. BYTE bWeight - This is the weighting of the new block in influencing the old block, in a value from 0 to 255. 256 means that the value of the new silence completely overpowers the old one, 0 means that it doesnt have any affect. returns none */ void ReEvaluateSilence (PBLOCKCHAR pSilence, PBLOCKCHAR pNew, BYTE bWeight) { SPDBG_FUNC( "ReEvaluateSilence" ); #define ADJUST(wOrig,wNew,bWt) \ (WORD) (( \ ((DWORD) (wOrig) * (DWORD) (256 - (bWt))) + \ ((DWORD) (wNew) * (DWORD) (bWt)) \ ) >> 8); pSilence->wMaxLevel = ADJUST (pSilence->wMaxLevel, pNew->wMaxLevel, bWeight); pSilence->wMaxDelta = ADJUST (pSilence->wMaxDelta, pNew->wMaxDelta, bWeight); // If it's way too silence (and too good to be true) then assume // a default silece // if (!pNew->wMaxLevel && !pNew->wMaxDelta) { // if (pSilence->wMaxLevel < 2500) // pSilence->wMaxLevel = 2500; // if (pSilence->wMaxDelta < 400) // pSilence->wMaxDelta = 400; // } } /********************************************************************* WhatsTheNewState - This takes in a stream of bit-field indicating which of the last 32 blocks were detected as having sound, and what our state was the last time this was called (utterance or not). It then figureous out if we're still in an utterance, or we just entered one. It also says how many buffers ago that was. 
inputs DWORD dwSoundBits - This is a bit-field of the last 32 audio blocks. A 1 in the field indicates that there was sound there, a 0 indicates no sound. The low bit corresponds to the most recent block, and high bit the oldest. DWORD dwVoicedBits - Just like sound bits except that it indicates voiced sections of sound. BOOL fWasInUtterance - This is true is we had an utterance the last time this called, FALSE if there was silence BOOL fLongUtterance - If this is a long utterance then dont react for 1/4 second, otherwise use 1/8 second for short utterance WORD wBlocksPerSec - How many of the above-mentioned blocks fit into a second. WORD *wStarted - If a transition occurs from no utterance to an utterance, then this fills in the number of of blocks ago that the utterance started, into *wStarted. Otherwise it is not changed. WORD wReaction - Reaction time (in blocks) after an utterance is finished returns BOOL - TRUE if we're in an utterance now, FALSE if we're in silence */ BOOL CSilence::WhatsTheNewState (DWORD dwSoundBits, DWORD dwVoicedBits, BOOL fWasInUtterance, BOOL fLongUtterance, WORD wBlocksPerSec, WORD *wStarted, WORD wReaction) { SPDBG_FUNC( "CSilence::WhatsTheNewState" ); WORD wCount, wOneBits; WORD wTimeToCheck; DWORD dwTemp, dwMask; if (fWasInUtterance) wTimeToCheck = wReaction; else wTimeToCheck = (wBlocksPerSec/4); // 1/4 second if (!wTimeToCheck) wTimeToCheck = 1; for (wOneBits = 0, wCount = wTimeToCheck, dwTemp = dwSoundBits; wCount; dwTemp /= 2, wCount--) if (dwTemp & 0x01) wOneBits++; if (fWasInUtterance) { // If we were in an utterance, then we still are in an utterance // UNLESS the number of bits which are turned on for the last // 0.5 seconds is less that 1/4 of what should be turned on. if ( (wOneBits >= 1)) return TRUE; else return FALSE; } else { // We are in silence. 
We cannot possible go into an utterance // until the current block is voicced if (!(dwVoicedBits & 0x01)) return FALSE; // If we were in silence then we're still in silence // UNLESS the number of bits which are turned on for the last // 0.5 seconds is more than 1/2 of what should be turned on. // If so, then start the utterance 0.75 seconds ago. if (wOneBits >= (wTimeToCheck / 2)) { // we're not in an utterance // Look back until get 1/8 second of silence, and include // that in the data returned dwTemp = dwSoundBits; // dwMask = (1 << (wBlocksPerSec / 8)) - 1; // for (wCount = wBlocksPerSec/8; dwTemp & dwMask; dwTemp >>= 1, wCount++); dwMask = (1 << (wBlocksPerSec / m_wAddSilenceDiv)) - 1; for (wCount = wBlocksPerSec/m_wAddSilenceDiv; dwTemp & dwMask; dwTemp >>= 1, wCount++); *wStarted = wCount; return TRUE; } else return FALSE; }; } /********************************************************************* CSilence::CSilence - This creates the silence class. inputs WORD wBlocksPerSec - Number of blocks per second. The blocks will be passed down through AddBlock(). returns class */ CSilence::CSilence (WORD wBlocksPerSec) { SPDBG_FUNC( "CSilence::CSilence" ); m_wBlocksPerSec = min(wBlocksPerSec, 32); // no more than the # bits in a DWORD m_wBlocksInQueue = m_wBlocksPerSec; // 1 second worth. m_wLatestBlock = 0; m_paBlockInfo = NULL; m_dwSoundBits = m_dwVoicedBits = 0; m_fFirstBlock = TRUE; m_fInUtterance = FALSE; m_dwUtteranceLength = 0; m_dwSamplesPerSec = 11025; } /********************************************************************* CSilence::~CSilence - Free up everything. */ CSilence::~CSilence (void) { SPDBG_FUNC( "CSilence::~CSilence" ); WORD i; if (m_paBlockInfo) { for (i = 0; i < m_wBlocksInQueue; i++) if (m_paBlockInfo[i].pSamples) free(m_paBlockInfo[i].pSamples); free(m_paBlockInfo); } if (m_pASFiltered) free(m_pASFiltered); } /********************************************************************* CSilence::Init - This initializes the silence code. 
It basically allocates memory. It should be called immediately after the object is created and then not again. inputs none returns BOOL - TRUE if succeded, else out of memory */ BOOL CSilence::Init(BOOL fPhoneOptimized, DWORD dwSamplesPerSec) { SPDBG_FUNC( "CSilence::Init" ); m_dwSamplesPerSec = dwSamplesPerSec; if (fPhoneOptimized) { m_wAddSilenceDiv = (WORD) PHADD_BEGIN_SILENCE; m_dwHighFreq = PHMAXVOICEHZ; } else { m_wAddSilenceDiv = (WORD) PCADD_BEGIN_SILENCE; m_dwHighFreq = PCMAXVOICEHZ; } if ((m_pASFiltered = (short *) malloc((sizeof(short)) * FILTERNUM)) == NULL) return (FALSE); // Initialize memory for the blocks and clear it. if (m_paBlockInfo) return (TRUE); m_paBlockInfo = (PBINFO) malloc(m_wBlocksInQueue * sizeof(BINFO)); if (!m_paBlockInfo) return (FALSE); if (m_wBlocksInQueue && m_paBlockInfo) memset(m_paBlockInfo, 0, m_wBlocksInQueue * sizeof(BINFO)); return (TRUE); } /* End of Init() */ /********************************************************************* CSilence::AddBlock - This does the following: - Add the block the the queue. Free up an old block if needed. The block should be 1/wBlocksPerSec long (about). - Analyze the block to see if its got sound or is quiet. - Fill in *wVU with a VU level. - Return TRUE if we're in an utterance, FALSE if its silence now. If TRUE then app should call GetBlock() until no more blocks left, and pass them to the SR engine. inputs short *pSamples - Pointer to samples. This memory should be allocaed with malloc(), and may be freed by the object. DWORD dwNumSamples - Number of samples WORD *wVU - This is fille in with the VU meter for the block QWORD qwTimeStamp - Time stamp for this buffer. 
returns BOOL - TRUE if an utterance is taking place, FALSE if its silent */ BOOL CSilence::AddBlock (short *pSamples, DWORD dwNumSamples, WORD *wVU, QWORD qwTimeStamp) { SPDBG_FUNC( "CSilence::AddBlock" ); BLOCKCHAR bcNew; BOOL fSound, fUtt; PBINFO pbInfo; WORD wUttStart, i; // Dont add empty blocks if (!dwNumSamples) { if (pSamples) free (pSamples); return m_fInUtterance; }; // Analyze the block for characteristics. GetBlockChar (pSamples, dwNumSamples, &bcNew, !m_fInUtterance); // fill in the vu *wVU = bcNew.wMaxLevel; // see if it's silent or not if (m_fFirstBlock) { // first block, so of course its silent m_bcSilence = bcNew; m_fFirstBlock = FALSE; fSound = FALSE; // BUGFIX 2466 - If it's way too silence (and too good to be true) then assume // a default silece if ((m_bcSilence.wMaxLevel < 500) || (m_bcSilence.wMaxDelta < 100)) { m_bcSilence.wMaxLevel = 2500; m_bcSilence.wMaxDelta = 400; }; // If it's way too loud then cut down if ((m_bcSilence.wMaxLevel > 2500) || (m_bcSilence.wMaxDelta > 1500)) { m_bcSilence.wMaxLevel = min (m_bcSilence.wMaxLevel, 2500); m_bcSilence.wMaxDelta = min (m_bcSilence.wMaxDelta, 1500); }; } else { fSound = IsBlockSound (&bcNew, &m_bcSilence, m_fInUtterance); }; // Test to see if the block is voiced if: // - The amplitude level is more than background sound // - We're not yet in an utterance (to save processor) if (bcNew.bHighLevel && !m_fInUtterance) { WORD wNoise; wNoise = (m_dwSamplesPerSec <= 13000) ? m_wNoiseThresh : ((m_wNoiseThresh / 3) * 2); bcNew.bIsVoiced = this->IsSegmentVoiced (pSamples, dwNumSamples, m_dwSamplesPerSec, wNoise, m_pASFiltered) ? SIL_YES : SIL_NO; } // add the block m_dwVoicedBits = (m_dwVoicedBits << 1) | ( (bcNew.bIsVoiced == SIL_YES) ? 1 : 0 ); m_dwSoundBits = (m_dwSoundBits << 1) | (fSound ? 
1 : 0); m_wLatestBlock++; if (m_wLatestBlock >= m_wBlocksInQueue) m_wLatestBlock = 0; pbInfo = m_paBlockInfo + m_wLatestBlock; if (pbInfo->pSamples) free (pbInfo->pSamples); pbInfo->pSamples = pSamples; pbInfo->dwNumSamples = dwNumSamples; // BUGFIX: Alignment code. We need to store the timestamp for // the BEGINNING of the block, not the end! pbInfo->qwTimeStamp = qwTimeStamp - dwNumSamples * sizeof(WORD); // What's our utterance state? fUtt = this->WhatsTheNewState (m_dwSoundBits, m_dwVoicedBits, m_fInUtterance, m_dwUtteranceLength >= m_wBlocksPerSec, m_wBlocksPerSec, &wUttStart, m_wReaction); if (fUtt && !m_fInUtterance) { // We just entered an utterance, so wUttStart has a valid teerm // in it. Go through the buffer queue and free all buffers which // are older than wUttStart. Remembeer, this is a circular buffer for (i = 0; i < (m_wBlocksInQueue - wUttStart); i++) { pbInfo = m_paBlockInfo + ( (m_wLatestBlock + i + 1) % m_wBlocksInQueue); if (pbInfo->pSamples) free (pbInfo->pSamples); pbInfo->pSamples = NULL; }; // Since we just entered an utterance clear the utterance length counter m_dwUtteranceLength = 0; }; m_fInUtterance = fUtt; // Remember how long this utterance has done on. Long utterances // deserve more patience as far as silence goes m_dwUtteranceLength++; // Adjust the silence level if we're not in an utterance // Requiring !fSound so that we dont accidentally indclude any // utterance sections in the sound calculations if (!m_fInUtterance /* && !fSound */) { ReEvaluateSilence (&m_bcSilence, &bcNew, 255 / m_wBlocksPerSec); } else if (m_dwUtteranceLength >= ((DWORD)m_wBlocksPerSec * 30)) // if we have a very long utterance (> 30 second) then it's not ReEvaluateSilence (&m_bcSilence, &bcNew, 255 / m_wBlocksPerSec); // done return m_fInUtterance; } /********************************************************************* CSilence::ExpectNoiseChange - Sent to the silence detection algorithm when it should expect the noise floor to go up/down. 
inputs WORD wValue - Amount that noise floor should change. 0x100 = no change. > 0x100 => louder, < 0x100 => quieter returns */ void CSilence::ExpectNoiseChange (WORD wValue) { SPDBG_FUNC( "CSilence::ExpectNoiseChange" ); DWORD dwTemp; dwTemp = ((DWORD) m_bcSilence.wMaxLevel * wValue) >> 8; if (dwTemp > 0xffff) dwTemp = 0xffff; m_bcSilence.wMaxLevel = (WORD) dwTemp; dwTemp = ((DWORD) m_bcSilence.wMaxDelta * wValue) >> 8; if (dwTemp > 0xffff) dwTemp = 0xffff; m_bcSilence.wMaxDelta = (WORD) dwTemp; } /********************************************************************* CSilence::GetBlock - This gets a block from the queue. This will fail if there are no more blocks left to get OR if there's not utterance. inputs DWORD *pdwNumSamples - If a block is returned then this will be filled in with the number of samples in the block. QWORD *pqwTimeStamp - Filled in woth the time-stamp for the buffer. returns short * - Pointer to a block of samples. This memory is the caller's property and can be freed with free(). */ short * CSilence::GetBlock (DWORD *pdwNumSamples, QWORD * pqwTimeStamp) { SPDBG_FUNC( "CSilence::GetBlock" ); PBINFO pbInfo; WORD i, wCount; short *pSamples; if (!m_fInUtterance) return NULL; // find the first occurance i = (m_wLatestBlock + 1) % m_wBlocksInQueue; for (wCount = m_wBlocksInQueue; wCount; i = ((i < (m_wBlocksInQueue-1)) ? (i+1) : 0), wCount-- ) { pbInfo = m_paBlockInfo + i; if (pbInfo->pSamples) { *pdwNumSamples = pbInfo->dwNumSamples; *pqwTimeStamp = pbInfo->qwTimeStamp; pSamples = pbInfo->pSamples; pbInfo->pSamples = NULL; return pSamples; }; }; // if got here then couldnt find anything return NULL; } /********************************************************************* CSilence::KillUtterance - Kills an exitsing utterance. inputs none returns none */ void CSilence::KillUtterance (void) { SPDBG_FUNC( "CSilence::KillUtterance" ); m_fInUtterance = FALSE; m_dwSoundBits = 0; m_dwVoicedBits = 0; }