/****************************************************************************** * Backend.h * *-----------* * This is the header file for the CBackend implementation. *------------------------------------------------------------------------------ * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99 * All Rights Reserved * *********************************************************************** MC ****/ #ifndef Backend_H #define Backend_H #ifndef ReverbFX_H #include "ReverbFX.h" #endif #ifndef FeedChain_H #include "FeedChain.h" #endif #ifndef __spttseng_h__ #include "spttseng.h" #endif #ifndef SPDebug_h #include #endif #ifndef SPCollec_h #include #endif #include "SpTtsEngDebug.h" static const short MAX_TARGETS_PER_UNIT = 3; // Max number of knots allowed static const short MIN_VOICE_PITCH = 10; // Lowest voiced pitch (hertz) static const short UNIT_SIL = 0; // Silence phon static const short SPEECH_FRAME_SIZE = 5000; // Output audio uffer... static const short SPEECH_FRAME_OVER = 1000; // ...plus pad //---------------------------------------------------------- // find a yn corresponding to xn, // given (x0, y0), (x1, y1), x0 <= xn <= x1 //---------------------------------------------------------- inline float LinInterp( float x0, float xn, float x1, float y0, float y1 ) { return y0 + (y1-y0)*(xn-x0)/(x1-x0); } // Math marcos #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MAX(x,y) (((x) >= (y)) ? (x) : (y)) #define MIN(x,y) (((x) <= (y)) ? (x) : (y)) static const float LINEAR_BKPT = 0.1f; static const float LOG_RANGE = (-25.0f); //******************************************************************** // // CBackend keeps track of all the state information for the // synthesis process. // //******************************************************************** class CBackend { public: /*--- Constructors/Destructors ---*/ CBackend (); ~CBackend (); /*=== Methods =======*/ HRESULT Init( IMSVoiceData* pVoiceDataObj, CFeedChain *pSrcObj, MSVOICEINFO* pVoiceInfo ); SPEECH_STATE GetSpeechState() {return m_SpeechState;} void PrepareSpeech( ISpTTSEngineSite* outputSite ); HRESULT RenderFrame( ); private: HRESULT StartNewUnit(); long ProsodyMod( UNITINFO *pCurUnit, long cInEpochs, float durationMpy); void CleanUpSynth(); void ResRecons( float *pInRes, long InSize, float *pOutRes, long OutSize, float scale ); void LPCFilter( float *pCurLPC, float *pCurRes, long len, float gain ); void FreeSynth( MSUNITDATA* pSynth ); void PSOLA_Stretch( float *pInRes, long InSize, float *pOutRes, long OutSize, float *pWindow, long cWindowSize ); void CvtToShort( float *pSrc, long blocksize, long stereoOut, float audioGain ); void Release( ); /*=== Member Data ===*/ CFeedChain *m_pSrcObj; // Backend gets its input from here MSUNITDATA m_Synth; // Unit data from 'Voicedataobj' float *m_pHistory; // LPC delays unsigned long m_fModifiers; float *m_pHistory2; // IIR delays float *m_pFilter; // IIR/FIR coefficients long m_cNumTaps; // Coefficient count LP_CReverbFX m_pReverb; // Reverb object long *m_pMap; // in/out epoch map float *m_pOutEpoch; // epoch sizes short *m_pRevFlag; // true = rev unvoiced float *m_pInRes; // m_pSynth.pRes float *m_pInEpoch; // m_pSynth.pEpoch float *m_pLPC; // m_pSynth->pLPC long m_cOutSamples_Phon; // sample count long m_durationTarget; // target sample total long m_silMode; float *m_pSynthTime; // pCurUnit->pTime float *m_pSynthAmp; // pCurUnit->pAmp long m_nKnots; // pCurUnit->nKnots SPEECH_STATE m_SpeechState; // Either continue or done long m_cOutSamples_Frame; // Audio output sample count for frame float *m_pSpeechBuf; // Audio output sample buffer ULONG m_cOutSamples_Total; // Audio output sample count for Speak long m_EpochIndex; // Index for render long m_cOutEpochs; // Count for render long m_vibrato_Phase1; // Current vibrato phase index float m_VibratoDepth; // Vibrato gain float m_VibratoFreq; // Vibrato speed long m_StereoOut; // TRUE = stereo output long m_BytesPerSample; // 2 = mono, 4 = stereo IMSVoiceData* m_pVoiceDataObj; // Voice object ULONG m_cOrder; // LPC filter order float m_SampleRate; // I/O rate float* m_pWindow; // Hanning Window long m_FFTSize; // FFT length // User Controls float m_UnitVolume; // 0 - 1.0 (linear) long m_MasterVolume; // 0 - 100 (linear) float m_linearScale; // Linear taper region scale // SAPI audio sink ISpTTSEngineSite* m_pOutputSite; bool m_HasSpeech; }; //-------------------------------- // Unimplemented //-------------------------------- static const long BACKEND_BITFLAG_WHISPER = (1 << 0); static const long BACKEND_BITFLAG_FIR = (1 << 1); static const long BACKEND_BITFLAG_IIR = (1 << 2); static const long BACKEND_BITFLAG_REVERB = (1 << 3); static const float VIBRATO_DEFAULT_DEPTH = 0.05f; static const float VIBRATO_DEFAULT_FREQ = 3.0f; // hz #endif //--- This must be the last line in the file