/****************************************************************************** * VoiceDataObj.h * *----------------* * This is the header file for the CVoiceDataObj implementation. This object * is used to provide shared access to a specific voice data file. *------------------------------------------------------------------------------ * Copyright (C) 1999 Microsoft Corporation Date: 05/06/99 * All Rights Reserved * *********************************************************************** EDC ***/ #ifndef VoiceDataObj_h #define VoiceDataObj_h //--- Additional includes #include "ms_entropicengine.h" #include #include #include #include "resource.h" #include "SpTtsEngDebug.h" //=== Constants ==================================================== static const long VOICE_VERSION = 0x10001; static const long HEADER_VERSION = 0x10000; static const long MS_VOICE_TYPE = MAKEFOURCC('V','o','i','s'); static const long MS_DATA_TYPE = MAKEFOURCC('D','a','t','a'); static const float SIL_DURATION = 0.01f; //=== Class, Enum, Struct and Union Declarations =================== //------------------------------------ // Selector for 'GetData()' // For accessing voice data blocks //------------------------------------ enum VOICEDATATYPE { MSVD_PHONE, MSVD_SENONE, MSVD_TREEIMAGE, MSVD_INVENTORY, MSVD_ALLOID }; //--------------------------- // VOICEINFO data types //--------------------------- enum GENDER { GENDER_NEUTRAL = 0, GENDER_FEMALE, GENDER_MALE }; enum COMPRESS_TYPE { COMPRESS_NONE = 0, COMPRESS_LPC }; // THis is the data #pragma pack (1) struct VOICEINFO { long Type; // Always 'MS_VOICE_TYPE' ULONG Version; // Always 'VOICE_VERSION' WCHAR Copyright[256]; // INFO: WCHAR VoiceName[64]; // INFO: WCHAR Example[64]; // INFO: LCID LangID; GENDER Gender; // INFO: Male, female or neuter ULONG Age; // INFO: Speaker age in years ULONG Rate; // INFO & FE: Words-per-minute ULONG Pitch; // INFO & FE: Average pitch in Hz COMPRESS_TYPE CompressionType; // BE: Always 'COMPRESS_LPC' REVERBTYPE ReverbType; // BE: Reverb param ULONG NumOfTaps; // BE: Whisper param float TapCoefficients[8]; // BE: Whisper param ULONG ProsodyGain; // FE: 0 = monotone float VibratoFreq; // Hertz ULONG VibratoDepth; // 0 - 100% ULONG SampleRate; // 22050 typical GUID formatID; // SAPI audio format ID long Unused[4]; }; #pragma pack () typedef VOICEINFO *PVOICEINFO; //--------------------------------------------------- // Header definition for voice data block //--------------------------------------------------- #pragma pack (1) struct VOICEBLOCKOFFSETS { long Type; // Always 'MS_DATA_TYPE' long Version; // Always 'HEADER_VERSION' GUID DataID; // File ID long PhonOffset; // Offset to PHON block (from beginning of file) long PhonLen; // Length of PHON block long SenoneOffset; // Offset to SENONE block (from beginning of file) long SenoneLen; // Length of SENONE block long TreeOffset; // Offset to TREE block (from beginning of file) long TreeLen; // Length of TREE block long InvOffset; // Offset to INV block (from beginning of file) long InvLen; // Length of INV block long AlloIDOffset; // Offset to AlloId block (from beginning of file) long AlloIDLen; // Length of AlloID block }; #pragma pack () // Single VQ Codebook #pragma pack (1) typedef struct Book { long cCodeSize; // Number of codewords long cCodeDim; // Dimension of codeword long pData; // Offset to data (INVENTORY rel) } BOOK, *PBOOK; #pragma pack () static const long BOOKSHELF = 32; #pragma pack (1) typedef struct Inventory { long SampleRate; // Sample rate in Hz long cNumLPCBooks; // Number of LPC Codebooks long cNumResBooks; // Number of Residual Codebooks long cNumDresBooks; // Number of Delta Residual Codebooks BOOK LPCBook[BOOKSHELF]; // LPC Codebook array BOOK ResBook[BOOKSHELF]; // Residual Codebook array BOOK DresBook[BOOKSHELF]; // Delta residual Codebook array long cNumUnits; // Total number of units long UnitsOffset; // Offset to offset array to unit data (INVENTORY rel) long cOrder; // LPC analysis order long FFTSize; // Size of FFT long FFTOrder; // Order of FFT long TrigOffset; // Offset to sine table (INVENTORY rel) long WindowOffset; // Offset to Hanning Window (INVENTORY rel) long pGaussOffset; // Offset to Gaussian Random noise (INVENTORY rel) long GaussID; // Gaussian sample index } INVENTORY, *PINVENTORY; #pragma pack () //------------------------ // LPC order * 2 //------------------------ static const long MAXNO = 40; static const float KONEPI = 3.1415926535897931032f; static const float KTWOPI = (KONEPI * 2); static const float K2 = 0.70710678118655f; #pragma pack (1) typedef struct { long val; // Phon ID long obj; // Offset to phon string } HASH_ENTRY; #pragma pack () #pragma pack (1) typedef struct { long size; // Number entries in the table (127 typ.) long UNUSED1; long entryArrayOffs; // Offset to HASH_ENTRY array long UNUSED2; long UNUSED3; long UNUSED4; long UNUSED5; } HASH_TABLE; #pragma pack () #pragma pack (1) typedef struct { HASH_TABLE phonHash; long phones_list; // Offset to offsets to phon strings long numPhones; long numCiPhones; // Number of context ind. phones } PHON_DICT; #pragma pack () #pragma pack (1) typedef struct { long nfeat; long nint32perq; long b_ques; long e_ques; long s_ques; long eors_ques; long wwt_ques; long nstateq; } FEATURE; #pragma pack () #pragma pack (1) typedef struct { long prod; // For leaves, it means the counts. // For non-leaves, it is the offset // into TRIPHONE_TREE.prodspace. short yes; // Negative means there is no child. so this is a leaf short no; // for leaves, it is lcdsid short shallow_lcdsid; // negative means this is NOT a shallow leaf } C_NODE; #pragma pack () #pragma pack (1) typedef struct { short nnodes; short nleaves; long nodes; // Offset }TREE_ELEM; #define NUM_PHONS_MAX 64 #pragma pack (1) typedef struct { FEATURE feat; long UNUSED; // PHON_DICT *pd usually long nsenones; long silPhoneId; long nonSilCxt; long nclass; long gsOffset[NUM_PHONS_MAX]; // nclass+1 entries TREE_ELEM tree[NUM_PHONS_MAX]; long nuniq_prod; // not used for detailed tree long uniq_prod_Offset; // Offset to table long nint32perProd; } TRIPHONE_TREE; #pragma pack () static const long NO_PHON = (-1); #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MAX(x,y) (((x) >= (y)) ? (x) : (y)) #define MIN(x,y) (((x) <= (y)) ? (x) : (y)) #pragma pack (1) typedef struct { float dur; float durSD; float amp; float ampRatio; } UNIT_STATS; #pragma pack () //=== Enumerated Set Definitions =================================== //=== Function Type Definitions ==================================== //=== Class, Struct and Union Definitions ========================== /*** CVoiceDataObj COM object ******************************** */ class CVoiceData { /*=== Methods =======*/ public: CVoiceData(); ~CVoiceData(); private: /*--- Non interface methods ---*/ HRESULT MapFile(const WCHAR * pszTokenValName, HANDLE * phMapping, void ** ppvData); HRESULT GetDataBlock( VOICEDATATYPE type, char **ppvOut, ULONG *pdwSize ); HRESULT InitVoiceData(); HRESULT DecompressUnit( ULONG UnitID, MSUNITDATA* pSynth ); long DecompressEpoch( signed char *rgbyte, long cNumEpochs, float *pEpoch ); long OrderLSP( PFLOAT pLSPFrame, INT cOrder ); void LSPtoPC( float *pLSP, float *pLPC, long cOrder, long frame ); void PutSpectralBand( float *pFFT, float *pBand, long StartBin, long cNumBins, long FFTSize ); void AddSpectralBand( float *pFFT, float *pBand, long StartBin, long cNumBins, long FFTSize ); void InverseFFT( float *pDest, long fftSize, long fftOrder, float *sinePtr ); void SetEpochLen( float *pOutRes, long OutSize, float *pInRes, long InSize ); void GainDeNormalize( float *pRes, long FFTSize, float Gain ); long PhonToID( PHON_DICT *pd, char *phone_str ); char *PhonFromID( PHON_DICT *pd, long phone_id ); HRESULT GetTriphoneID( TRIPHONE_TREE *forest, long phon, // target phon long leftPhon, // left context long rightPhon, // right context long pos, // word position ("b", "e" or "s" PHON_DICT *pd, ULONG *pResult ); long PhonHashLookup( PHON_DICT *pPD, // the hash table char *sym, // The symbol to look up long *val ); // Phon ID void FIR_Filter( float *pVector, long cNumSamples, float *pFilter, float *pHistory, long cNumTaps ); void IIR_Filter( float *pVector, long cNumSamples, float *pFilter, float *pHistory, long cNumTaps ); HRESULT GetUnitDur( ULONG UnitID, float* pDur ); /*=== Interfaces ====*/ public: STDMETHOD(GetVoiceInfo)( MSVOICEINFO* pVoiceInfo ); STDMETHOD(GetUnitIDs)( UNIT_CVT* pUnits, ULONG cUnits ); STDMETHOD(GetUnitData)( ULONG unitID, MSUNITDATA* pUnitData ); STDMETHOD(AlloToUnit)( short allo, long attributes, long* pUnitID ); STDMETHOD(SetObjectToken)( ISpObjectToken *pToken ); private: /*=== Member Data ===*/ CComPtr m_cpToken; HANDLE m_hVoiceDef; HANDLE m_hVoiceData; VOICEINFO* m_pVoiceDef; VOICEBLOCKOFFSETS* m_pVoiceData; PHON_DICT* m_pd; TRIPHONE_TREE* m_pForest; long* m_SenoneBlock; ULONG m_First_Context_Phone; ULONG m_Sil_Index; // Unit Inventory INVENTORY* m_pInv; float m_SampleRate; long m_cOrder; long *m_pUnit; // Pointer to offsets to unit data float *m_pTrig; // Sine table float *m_pWindow; // Hanning Window float *m_pGauss; // Gaussian Random noise COMPRESS_TYPE m_CompressionType; ULONG m_FFTSize; long m_GaussID; short *m_AlloToUnitTbl; long m_NumOfAllos; ULONG m_NumOfUnits; // Inventory size }; #endif //--- This must be the last line in the file