windows-nt/Source/XPSP1/NT/enduser/speech/tts/ms_entropic/voicedataobj.h
2020-09-26 16:20:57 +08:00

366 lines
12 KiB
C++

/******************************************************************************
* VoiceDataObj.h *
*----------------*
* This is the header file for the CVoiceDataObj implementation. This object
* is used to provide shared access to a specific voice data file.
*------------------------------------------------------------------------------
* Copyright (C) 1999 Microsoft Corporation Date: 05/06/99
* All Rights Reserved
*
*********************************************************************** EDC ***/
#ifndef VoiceDataObj_h
#define VoiceDataObj_h
//--- Additional includes
#include "ms_entropicengine.h"
#include <spddkhlp.h>
#include <sphelper.h>
#include <MMREG.H>
#include "resource.h"
#include "SpTtsEngDebug.h"
//=== Constants ====================================================
// Format revision stamps: VOICE_VERSION is the value expected in
// VOICEINFO::Version, HEADER_VERSION the one in VOICEBLOCKOFFSETS::Version.
static const long VOICE_VERSION  = 0x00010001L;
static const long HEADER_VERSION = 0x00010000L;
// Four-character tags stored in the 'Type' field of the voice
// definition record (VOICEINFO) and of the voice data header
// (VOICEBLOCKOFFSETS) respectively.
static const long MS_VOICE_TYPE =
    MAKEFOURCC( 'V', 'o', 'i', 's' );
static const long MS_DATA_TYPE =
    MAKEFOURCC( 'D', 'a', 't', 'a' );
// NOTE(review): presumably the duration assigned to silence, in seconds -- confirm against the users of this constant.
static const float SIL_DURATION = 1.0e-2f;
//=== Class, Enum, Struct and Union Declarations ===================
//------------------------------------------------------------
// VOICEDATATYPE
//   Selector passed to 'GetData()' to name the voice data
//   block being accessed. Values are consecutive from zero.
//------------------------------------------------------------
enum VOICEDATATYPE
{
    MSVD_PHONE     = 0,
    MSVD_SENONE    = 1,
    MSVD_TREEIMAGE = 2,
    MSVD_INVENTORY = 3,
    MSVD_ALLOID    = 4
};
//------------------------------------------------------------
// Enumerations used by fields of VOICEINFO
//------------------------------------------------------------

// Speaker gender as recorded in VOICEINFO::Gender.
enum GENDER
{
    GENDER_NEUTRAL = 0,
    GENDER_FEMALE  = 1,
    GENDER_MALE    = 2
};
// Compression scheme recorded in VOICEINFO::CompressionType
// (that field is documented as always holding COMPRESS_LPC).
enum COMPRESS_TYPE
{
    COMPRESS_NONE = 0,
    COMPRESS_LPC  = 1
};
//------------------------------------------------------------------
// VOICEINFO: the voice definition record.
//
// Declared under 1-byte packing, so members sit back to back with no
// padding between them. Field order and types therefore define the
// exact byte layout.
// NOTE(review): presumably this mirrors the layout of a memory-mapped
// voice definition file (see CVoiceData::m_pVoiceDef / MapFile) --
// confirm before changing any field.
//
// Field comments carry the original tags INFO / FE / BE.
// NOTE(review): FE / BE presumably stand for the engine's front end
// and back end -- confirm.
//------------------------------------------------------------------
#pragma pack (1)
struct VOICEINFO
{
long Type; // Always 'MS_VOICE_TYPE'
ULONG Version; // Always 'VOICE_VERSION'
WCHAR Copyright[256]; // INFO: copyright text (fixed 256-WCHAR buffer)
WCHAR VoiceName[64]; // INFO: voice name (fixed 64-WCHAR buffer)
WCHAR Example[64]; // INFO: example text (fixed 64-WCHAR buffer)
LCID LangID; // Locale identifier (LCID) for the voice
GENDER Gender; // INFO: Male, female or neuter
ULONG Age; // INFO: Speaker age in years
ULONG Rate; // INFO & FE: Words-per-minute
ULONG Pitch; // INFO & FE: Average pitch in Hz
COMPRESS_TYPE CompressionType; // BE: Always 'COMPRESS_LPC'
REVERBTYPE ReverbType; // BE: Reverb param
ULONG NumOfTaps; // BE: Whisper param
float TapCoefficients[8]; // BE: Whisper param
ULONG ProsodyGain; // FE: 0 = monotone
float VibratoFreq; // Hertz
ULONG VibratoDepth; // 0 - 100%
ULONG SampleRate; // 22050 typical
GUID formatID; // SAPI audio format ID
long Unused[4]; // Spare slots; not given a meaning in this header
};
#pragma pack ()
// Pointer alias for VOICEINFO.
typedef VOICEINFO *PVOICEINFO;
//---------------------------------------------------
// Header definition for voice data block
//
// Directory of the voice data: a type/version stamp, a file ID, and
// one (offset, length) pair per data block. All offsets are measured
// from the beginning of the file. Declared under 1-byte packing, so
// the members are contiguous with no padding.
// NOTE(review): the five blocks appear to correspond one-to-one to
// the VOICEDATATYPE selectors (PHONE, SENONE, TREEIMAGE, INVENTORY,
// ALLOID), and the lengths are presumably byte counts -- confirm.
//---------------------------------------------------
#pragma pack (1)
struct VOICEBLOCKOFFSETS
{
long Type; // Always 'MS_DATA_TYPE'
long Version; // Always 'HEADER_VERSION'
GUID DataID; // File ID
long PhonOffset; // Offset to PHON block (from beginning of file)
long PhonLen; // Length of PHON block
long SenoneOffset; // Offset to SENONE block (from beginning of file)
long SenoneLen; // Length of SENONE block
long TreeOffset; // Offset to TREE block (from beginning of file)
long TreeLen; // Length of TREE block
long InvOffset; // Offset to INV block (from beginning of file)
long InvLen; // Length of INV block
long AlloIDOffset; // Offset to AlloId block (from beginning of file)
long AlloIDLen; // Length of AlloID block
};
#pragma pack ()
// Single VQ Codebook
// Descriptor for one codebook: its dimensions plus where its data
// lives. Declared under 1-byte packing (three contiguous longs).
// Note that 'pData', despite the pointer-style prefix, is a 'long'
// offset relative to the INVENTORY block, not a pointer.
#pragma pack (1)
typedef struct Book
{
long cCodeSize; // Number of codewords
long cCodeDim; // Dimension of codeword
long pData; // Offset to data (INVENTORY rel)
} BOOK, *PBOOK;
#pragma pack ()
// Fixed capacity of each codebook array in INVENTORY below.
static const long BOOKSHELF = 32;
// INVENTORY: header of the unit inventory block.
// Holds the sample rate, three fixed-size arrays of codebook
// descriptors (LPC, residual, delta residual) with a count field for
// each, and offsets to the unit table and to the sine, window and
// noise tables. Every "Offset" member is relative to the start of the
// INVENTORY block, as are the BOOK::pData members it contains.
// Declared under 1-byte packing, so members are contiguous.
// NOTE(review): the cNum*Books counts presumably give how many of the
// BOOKSHELF slots in the matching array are in use -- confirm.
#pragma pack (1)
typedef struct Inventory
{
long SampleRate; // Sample rate in Hz
long cNumLPCBooks; // Number of LPC Codebooks
long cNumResBooks; // Number of Residual Codebooks
long cNumDresBooks; // Number of Delta Residual Codebooks
BOOK LPCBook[BOOKSHELF]; // LPC Codebook array
BOOK ResBook[BOOKSHELF]; // Residual Codebook array
BOOK DresBook[BOOKSHELF]; // Delta residual Codebook array
long cNumUnits; // Total number of units
long UnitsOffset; // Offset to offset array to unit data (INVENTORY rel)
long cOrder; // LPC analysis order
long FFTSize; // Size of FFT
long FFTOrder; // Order of FFT
long TrigOffset; // Offset to sine table (INVENTORY rel)
long WindowOffset; // Offset to Hanning Window (INVENTORY rel)
long pGaussOffset; // Offset to Gaussian Random noise (INVENTORY rel)
long GaussID; // Gaussian sample index
} INVENTORY, *PINVENTORY;
#pragma pack ()
//------------------------------------------------
// MAXNO: LPC order * 2
//------------------------------------------------
static const long MAXNO = 40L;
// Single-precision math constants: pi, two pi, and K2 (numerically 1/sqrt(2)).
static const float KONEPI = 3.1415926535897931032f;
static const float KTWOPI = 2.0f * KONEPI;
static const float K2     = 0.70710678118655f;
// HASH_ENTRY: one slot of the phone hash table, pairing a phone ID
// with the offset of that phone's string. Declared under 1-byte
// packing (two contiguous longs).
// NOTE(review): the base that 'obj' is relative to is not stated in
// this header -- confirm against PhonHashLookup().
#pragma pack (1)
typedef struct
{
long val; // Phon ID
long obj; // Offset to phon string
} HASH_ENTRY;
#pragma pack ()
// HASH_TABLE: header of the phone hash table. Only 'size' and
// 'entryArrayOffs' are given a meaning here; the UNUSED* members are
// placeholders that keep the record at seven longs. Declared under
// 1-byte packing.
// NOTE(review): the UNUSED slots presumably held pointers or
// bookkeeping in the tool that wrote the data -- confirm before
// reusing them.
#pragma pack (1)
typedef struct
{
long size; // Number entries in the table (127 typ.)
long UNUSED1;
long entryArrayOffs; // Offset to HASH_ENTRY array
long UNUSED2;
long UNUSED3;
long UNUSED4;
long UNUSED5;
} HASH_TABLE;
#pragma pack ()
// PHON_DICT: phone dictionary header. Embeds the phone hash table
// header and adds the offset of the phone-string offset list plus two
// phone counts. Declared under 1-byte packing. This is the type
// passed to PhonToID(), PhonFromID(), PhonHashLookup() and
// GetTriphoneID() in CVoiceData.
#pragma pack (1)
typedef struct
{
HASH_TABLE phonHash; // Hash table header for phone-string lookup
long phones_list; // Offset to offsets to phon strings
long numPhones; // Number of phones (NOTE(review): presumably the total, including context-dependent ones -- confirm)
long numCiPhones; // Number of context ind. phones
} PHON_DICT;
#pragma pack ()
// FEATURE: eight contiguous longs (1-byte packing) embedded as the
// first member of TRIPHONE_TREE.
// NOTE(review): this header does not document the individual fields.
// From the names, 'nfeat' is presumably a feature count, 'nint32perq'
// the number of 32-bit words per question, and the *_ques members
// question indices -- 'b', 'e' and 's' may match the word positions
// ("b", "e" or "s") accepted by GetTriphoneID(). Confirm against the
// tree-walking code before relying on any of this.
#pragma pack (1)
typedef struct
{
long nfeat;
long nint32perq;
long b_ques;
long e_ques;
long s_ques;
long eors_ques;
long wwt_ques;
long nstateq;
} FEATURE;
#pragma pack ()
// C_NODE: one node of a decision tree. The same fields are read
// differently for leaf and non-leaf nodes, as described per member.
// A node is a leaf when 'yes' is negative. Declared under 1-byte
// packing, so the record is one long followed directly by three
// shorts, with no tail padding.
// NOTE(review): 'yes' / 'no' are presumably child indices for the two
// answers of the node's question -- confirm.
#pragma pack (1)
typedef struct
{
long prod; // For leaves, it means the counts.
// For non-leaves, it is the offset
// into TRIPHONE_TREE.prodspace.
short yes; // Negative means there is no child. so this is a leaf
short no; // for leaves, it is lcdsid
short shallow_lcdsid; // negative means this is NOT a shallow leaf
} C_NODE;
#pragma pack ()
// TREE_ELEM: descriptor for a single tree; TRIPHONE_TREE holds an
// array of NUM_PHONS_MAX of these. Declared under 1-byte packing
// (two shorts followed directly by a long).
// NOTE(review): 'nnodes' / 'nleaves' are presumably the node and leaf
// counts, and 'nodes' presumably locates the tree's C_NODE array --
// confirm.
#pragma pack (1)
typedef struct
{
    short nnodes;
    short nleaves;
    long  nodes;    // Offset
} TREE_ELEM;
// Restore default packing, like every other packed struct in this
// header, so 1-byte packing does not leak into later declarations.
#pragma pack ()
// Fixed capacity of the per-phone arrays in TRIPHONE_TREE.
#define NUM_PHONS_MAX 64
// TRIPHONE_TREE: header of the decision-tree forest that
// GetTriphoneID() takes as its 'forest' argument. Holds the FEATURE
// record, a few scalar IDs and counts, and two fixed-size arrays of
// NUM_PHONS_MAX entries each. Declared under 1-byte packing, so
// members are contiguous.
// NOTE(review): 'tree' is presumably indexed by phone ID, and
// 'nclass' presumably says how many slots are in use -- confirm.
#pragma pack (1)
typedef struct
{
FEATURE feat;
long UNUSED; // PHON_DICT *pd usually
long nsenones; // NOTE(review): presumably the number of senones -- confirm
long silPhoneId; // NOTE(review): presumably the phone ID of silence -- confirm
long nonSilCxt;
long nclass;
long gsOffset[NUM_PHONS_MAX]; // nclass+1 entries
TREE_ELEM tree[NUM_PHONS_MAX];
long nuniq_prod; // not used for detailed tree
long uniq_prod_Offset; // Offset to table
long nint32perProd;
} TRIPHONE_TREE;
#pragma pack ()
// NOTE(review): presumably the phone ID used to signal "no phone" -- confirm against callers.
static const long NO_PHON = -1L;

// Generic numeric helper macros. Each one expands its arguments more
// than once, so do not pass expressions with side effects (e.g. i++).
#define ABS(v)      ((v) >= 0 ? (v) : -(v))
#define MAX(a,b)    (((a) >= (b)) ? (a) : (b))
#define MIN(a,b)    (((a) <= (b)) ? (a) : (b))
// UNIT_STATS: four contiguous floats (1-byte packing).
// NOTE(review): this header does not document the fields. From the
// names, 'dur' / 'durSD' are presumably a duration and its standard
// deviation, and 'amp' / 'ampRatio' an amplitude and an amplitude
// ratio; units are not stated -- confirm.
#pragma pack (1)
typedef struct
{
float dur;
float durSD;
float amp;
float ampRatio;
} UNIT_STATS;
#pragma pack ()
//=== Enumerated Set Definitions ===================================
//=== Function Type Definitions ====================================
//=== Class, Struct and Union Definitions ==========================
/*** CVoiceDataObj COM object ********************************
*   Declares class CVoiceData. Per the file header, this object is
*   used to provide shared access to a specific voice data file.
*
*   The public surface is the constructor/destructor plus the five
*   STDMETHOD entry points under "Interfaces"; everything else is a
*   private helper or private state.
*
*   NOTE(review): the banner says "CVoiceDataObj" but the class is
*   named CVoiceData, and no COM base interface appears in this
*   declaration even though the entry points use STDMETHOD -- confirm
*   which is intended.
*   NOTE(review): the class holds raw HANDLEs and raw pointers and
*   declares no copy constructor or assignment operator; if the
*   destructor releases those resources, copying an instance would be
*   unsafe -- confirm.
*/
class CVoiceData
{
/*=== Methods =======*/
public:
CVoiceData();
~CVoiceData();
private:
/*--- Non interface methods ---*/
// File / data-block access. NOTE(review): MapFile presumably maps the
// file named by a value on the object token and returns the mapping
// handle and base address -- confirm in the implementation.
HRESULT MapFile(const WCHAR * pszTokenValName, HANDLE * phMapping, void ** ppvData);
HRESULT GetDataBlock( VOICEDATATYPE type, char **ppvOut, ULONG *pdwSize );
HRESULT InitVoiceData();
// Unit decompression and signal-processing helpers.
HRESULT DecompressUnit( ULONG UnitID, MSUNITDATA* pSynth );
long DecompressEpoch( signed char *rgbyte, long cNumEpochs, float *pEpoch );
long OrderLSP( PFLOAT pLSPFrame, INT cOrder );
void LSPtoPC( float *pLSP, float *pLPC, long cOrder, long frame );
void PutSpectralBand( float *pFFT, float *pBand, long StartBin,
long cNumBins, long FFTSize );
void AddSpectralBand( float *pFFT, float *pBand, long StartBin,
long cNumBins, long FFTSize );
void InverseFFT( float *pDest, long fftSize, long fftOrder, float *sinePtr );
void SetEpochLen( float *pOutRes, long OutSize, float *pInRes,
long InSize );
void GainDeNormalize( float *pRes, long FFTSize, float Gain );
// Phone dictionary and triphone tree lookups.
long PhonToID( PHON_DICT *pd, char *phone_str );
char *PhonFromID( PHON_DICT *pd, long phone_id );
HRESULT GetTriphoneID( TRIPHONE_TREE *forest,
long phon, // target phon
long leftPhon, // left context
long rightPhon, // right context
long pos, // word position ("b", "e" or "s")
PHON_DICT *pd,
ULONG *pResult );
long PhonHashLookup( PHON_DICT *pPD, // the hash table
char *sym, // The symbol to look up
long *val ); // Phon ID
// Filters operating on a sample vector with caller-supplied
// coefficients and history buffers.
void FIR_Filter( float *pVector, long cNumSamples, float *pFilter,
float *pHistory, long cNumTaps );
void IIR_Filter( float *pVector, long cNumSamples, float *pFilter,
float *pHistory, long cNumTaps );
HRESULT GetUnitDur( ULONG UnitID, float* pDur );
/*=== Interfaces ====*/
public:
STDMETHOD(GetVoiceInfo)( MSVOICEINFO* pVoiceInfo );
STDMETHOD(GetUnitIDs)( UNIT_CVT* pUnits, ULONG cUnits );
STDMETHOD(GetUnitData)( ULONG unitID, MSUNITDATA* pUnitData );
STDMETHOD(AlloToUnit)( short allo, long attributes, long* pUnitID );
STDMETHOD(SetObjectToken)( ISpObjectToken *pToken );
private:
/*=== Member Data ===*/
CComPtr<ISpObjectToken> m_cpToken; // Object token (see SetObjectToken)
// NOTE(review): the two handles are presumably the file mappings
// behind m_pVoiceDef and m_pVoiceData, as produced by MapFile -- confirm.
HANDLE m_hVoiceDef;
HANDLE m_hVoiceData;
VOICEINFO* m_pVoiceDef; // Voice definition record
VOICEBLOCKOFFSETS* m_pVoiceData; // Voice data header (block offsets)
PHON_DICT* m_pd; // Phone dictionary
TRIPHONE_TREE* m_pForest; // Triphone tree forest
long* m_SenoneBlock;
ULONG m_First_Context_Phone;
ULONG m_Sil_Index;
// Unit Inventory
INVENTORY* m_pInv;
float m_SampleRate;
long m_cOrder;
long *m_pUnit; // Pointer to offsets to unit data
float *m_pTrig; // Sine table
float *m_pWindow; // Hanning Window
float *m_pGauss; // Gaussian Random noise
COMPRESS_TYPE m_CompressionType;
ULONG m_FFTSize;
long m_GaussID;
short *m_AlloToUnitTbl;
long m_NumOfAllos;
ULONG m_NumOfUnits; // Inventory size
};
#endif //--- This must be the last line in the file