/*****************************************************************************
* spttseng.idl *
*--------------*
*   Description:
*       This is the idl file for the Microsoft Text To Speech Driver.
*-----------------------------------------------------------------------------
*   Creation: 03/01/99
*   Copyright (C) Microsoft Corporation 1999
*   All rights reserved.
*
****************************************************************** EDC ******/

//--- Import base idl
import "oaidl.idl";
import "ocidl.idl";
import "sapiddk.idl";

//=== Forward References ======================================================
interface IMSVoiceData;
interface IMSTTSEngineInit;

//=== Enumerations and structures =============================================

/*** ENGPARTOFSPEECH
*   Engine part-of-speech codes. The first group aliases the SPPS_* values
*   one-for-one; the remaining groups are engine-specific codes expressed as
*   offsets from SPPS_Noun, SPPS_Modifier and SPPS_Function. The punctuation
*   codes are offsets from MS_Punctuation, which is itself SPPS_Function + 11.
*/
typedef enum ENGPARTOFSPEECH
{
    MS_NotOverriden = SPPS_NotOverriden,
    MS_Unknown      = SPPS_Unknown,         // Probably from user lexicon
    MS_Noun         = SPPS_Noun,
    MS_Verb         = SPPS_Verb,
    MS_Modifier     = SPPS_Modifier,
    MS_Function     = SPPS_Function,
    MS_Interjection = SPPS_Interjection,

    // MS Nouns
    MS_Pron         = ( SPPS_Noun + 1 ),
    MS_SubjPron     = ( SPPS_Noun + 2 ),
    MS_ObjPron      = ( SPPS_Noun + 3 ),
    MS_RelPron      = ( SPPS_Noun + 4 ),
    MS_PosNoun      = ( SPPS_Noun + 9 ),

    // MS Modifiers
    MS_Adj          = ( SPPS_Modifier + 1 ),
    MS_Adv          = ( SPPS_Modifier + 2 ),

    // MS Function Words
    MS_VAux         = ( SPPS_Function + 1 ),
    MS_Conj         = ( SPPS_Function + 3 ),
    MS_CConj        = ( SPPS_Function + 4 ),
    MS_Interr       = ( SPPS_Function + 5 ),
    MS_Det          = ( SPPS_Function + 6 ),
    MS_Contr        = ( SPPS_Function + 7 ),
    MS_Prep         = ( SPPS_Function + 9 ),

    // MS Punctuation
    MS_Punctuation  = ( SPPS_Function + 11 ),
    MS_GroupBegin   = ( MS_Punctuation + 1 ),
    MS_GroupEnd     = ( MS_Punctuation + 2 ),
    MS_EOSItem      = ( MS_Punctuation + 3 ),
    MS_MiscPunc     = ( MS_Punctuation + 4 ),
    MS_Quotation    = ( MS_Punctuation + 5 )
} ENGPARTOFSPEECH;

/*** TTSItemType
*   Classification of a sentence item. Values are split into two ranges:
*   eWORDLIST_IS_VALID (0x1000) + n for the word/number/date/time types, and
*   eWORDLIST_NOT_VALID (0x0000) + n for the punctuation types.
*   NOTE(review): presumably the range tells whether TTSSentItem::Words is
*   populated for the item - confirm against the engine sources.
*/
typedef enum TTSItemType
{
    eWORDLIST_NOT_VALID         = 0x0000,
    eWORDLIST_IS_VALID          = 0x1000,

    //--- Items based at eWORDLIST_IS_VALID
    eUNMATCHED                  = eWORDLIST_IS_VALID + 1,
    eALPHA_WORD                 = eWORDLIST_IS_VALID + 2,
    eABBREVIATION               = eWORDLIST_IS_VALID + 3,
    eABBREVIATION_NORMALIZE     = eWORDLIST_IS_VALID + 4,
    eINITIALISM                 = eWORDLIST_IS_VALID + 5,
    eNUM_CARDINAL               = eWORDLIST_IS_VALID + 6,
    eNUM_ORDINAL                = eWORDLIST_IS_VALID + 7,
    eNUM_DECIMAL                = eWORDLIST_IS_VALID + 8,
    eNUM_PERCENT                = eWORDLIST_IS_VALID + 9,
    eNUM_DEGREES                = eWORDLIST_IS_VALID + 10,
    eNUM_SQUARED                = eWORDLIST_IS_VALID + 11,
    eNUM_CUBED                  = eWORDLIST_IS_VALID + 12,
    eNUM_CURRENCY               = eWORDLIST_IS_VALID + 13,
    eNUM_FRACTION               = eWORDLIST_IS_VALID + 14,
    eNUM_MIXEDFRACTION          = eWORDLIST_IS_VALID + 15,
    eNUM_ROMAN_NUMERAL          = eWORDLIST_IS_VALID + 16,
    eNUM_ROMAN_NUMERAL_ORDINAL  = eWORDLIST_IS_VALID + 17,
    eNUM_PHONENUMBER            = eWORDLIST_IS_VALID + 18,
    eNUM_ZIPCODE                = eWORDLIST_IS_VALID + 19,
    eDATE_YEAR                  = eWORDLIST_IS_VALID + 20,
    eDATE                       = eWORDLIST_IS_VALID + 21,
    eDATE_LONGFORM              = eWORDLIST_IS_VALID + 22,
    eDECADE                     = eWORDLIST_IS_VALID + 23,
    eTIMEOFDAY                  = eWORDLIST_IS_VALID + 24,
    eTIME                       = eWORDLIST_IS_VALID + 25,
    eSPELLOUT                   = eWORDLIST_IS_VALID + 26,
    eHYPHENATED_STRING          = eWORDLIST_IS_VALID + 27,
    eSTATE_AND_ZIPCODE          = eWORDLIST_IS_VALID + 28,
    eTIME_RANGE                 = eWORDLIST_IS_VALID + 29,
    eNUM_RANGE                  = eWORDLIST_IS_VALID + 30,
    eTEMP_NUMBER                = eWORDLIST_IS_VALID + 31,
    eTEMP_PERCENT               = eWORDLIST_IS_VALID + 32,
    eTEMP_DEGREES               = eWORDLIST_IS_VALID + 33,
    eTEMP_NUM_FRACTION          = eWORDLIST_IS_VALID + 34,
    eTEMP_NUM_MIXEDFRACTION     = eWORDLIST_IS_VALID + 35,
    eTEMP_NUM_DECIMAL           = eWORDLIST_IS_VALID + 36,
    eTEMP_NUM_ORDINAL           = eWORDLIST_IS_VALID + 37,
    eTEMP_NUM_CURRENCY          = eWORDLIST_IS_VALID + 38,
    eNEWNUM_PHONENUMBER         = eWORDLIST_IS_VALID + 39,
    eNUM_CURRENCYRANGE          = eWORDLIST_IS_VALID + 40,
    eSUFFIX                     = eWORDLIST_IS_VALID + 41,

    //--- Items based at eWORDLIST_NOT_VALID
    eOPEN_PARENTHESIS           = eWORDLIST_NOT_VALID + 1,
    eOPEN_BRACKET               = eWORDLIST_NOT_VALID + 2,
    eOPEN_BRACE                 = eWORDLIST_NOT_VALID + 3,
    eCLOSE_PARENTHESIS          = eWORDLIST_NOT_VALID + 4,
    eCLOSE_BRACKET              = eWORDLIST_NOT_VALID + 5,
    eCLOSE_BRACE                = eWORDLIST_NOT_VALID + 6,
    eSINGLE_QUOTE               = eWORDLIST_NOT_VALID + 7,
    eDOUBLE_QUOTE               = eWORDLIST_NOT_VALID + 8,
    ePERIOD                     = eWORDLIST_NOT_VALID + 9,
    eEXCLAMATION                = eWORDLIST_NOT_VALID + 10,
    eQUESTION                   = eWORDLIST_NOT_VALID + 11,
    eCOMMA                      = eWORDLIST_NOT_VALID + 12,
    eSEMICOLON                  = eWORDLIST_NOT_VALID + 13,
    eCOLON                      = eWORDLIST_NOT_VALID + 14,
    eHYPHEN                     = eWORDLIST_NOT_VALID + 15,
    eELLIPSIS                   = eWORDLIST_NOT_VALID + 16,
} TTSItemType;

/*** TTSWord
*   One word of a sentence item.
*/
typedef struct TTSWord
{
    const SPVSTATE*     pXmlState;          // The XML State of the word
    LPCWSTR             pWordText;          // Pointer to the orthographic form of the word
    ULONG               ulWordLen;          // Length of the word, in WCHARs
    LPCWSTR             pLemma;             // Pointer to the orthographic form of the root word
    ULONG               ulLemmaLen;         // Length of the lemma, in WCHARs
    SPPHONEID*          pWordPron;          // Pointer to the NULL terminated pronunciation of the word
    ENGPARTOFSPEECH     eWordPartOfSpeech;  // The part of speech of the word - Is this needed???
} TTSWord;

/*** TTSItemInfo
*   Per-item information; currently carries only the item's TTSItemType.
*/
typedef struct TTSItemInfo
{
    TTSItemType Type;
} TTSItemInfo;

/*** TTSSentItem
*   One item of a sentence: the original source text span plus the words it
*   produced after normalization.
*/
typedef struct TTSSentItem
{
    LPCWSTR             pItemSrcText;       // Pointer to original text of the item
    ULONG               ulItemSrcLen;       // Length of the original text of the item
    ULONG               ulItemSrcOffset;    // Offset of the original text of the item
    TTSWord*            Words;              // The words of the item, post normalization
    ULONG               ulNumWords;         // The number of words of the item, post normalization
    ENGPARTOFSPEECH     eItemPartOfSpeech;  // The part of speech of the entire item
    TTSItemInfo*        pItemInfo;          // Item type information (see TTSItemInfo)
} TTSSentItem;

//=== Constants ===============================================================
typedef enum INVCONST
{
    MAX_LPCORDER = 30,
    MAX_FFTSIZE  = 512
} INVCONST;

//=== Interface definitions ===================================================

///// NOTE: This section to be moved to SAPI.IDL in SAPI6

//--- IEnumSENTITEM ---------------------------------------------------------
//  Enumerator over the TTSSentItem entries of a sentence.
//
[
    object,
    local,
    uuid(E0F4088D-CD08-11d2-B503-00C04F797396),
    helpstring("IEnumSENTITEM Interface"),
    pointer_default(unique)
]
interface IEnumSENTITEM : IUnknown
{
    HRESULT Next( [out] TTSSentItem *pItemEnum );
    HRESULT Reset(void);
};

//--- IEnumSpSentence -------------------------------------------------------
//  This structure points to a text fragment within the input stream and
//  the rendering attributes that are described by associated XML tags
//  NOTE(review): the description above speaks of a structure, but what is
//  declared here is an enumerator interface that takes an SPVTEXTFRAG list
//  and hands out IEnumSENTITEM enumerators - confirm and reword.
//
[
    object,
    local,
    uuid(299A9157-CD08-11d2-B503-00C04F797396),
    helpstring("IEnumSpSentence Interface"),
    pointer_default(unique)
]
interface IEnumSpSentence : IUnknown
{
    HRESULT SetFragList( [in] const SPVTEXTFRAG* pTextFragList, [in] DWORD dwSpeakFlags);
    HRESULT Next( [out]IEnumSENTITEM **ppSentItemEnum );
    HRESULT Previous( [out]IEnumSENTITEM **ppSentItemEnum );
    HRESULT Reset(void);
};

///// End SAPI6 section

// Max number of POS per pronunciation
enum
{
    POS_MAX = 4
};

// Pronunciation source
typedef enum PRONSRC
{
    PRON_LEX = 0,
    PRON_LTS,
} PRONSRC;

//------------------------
// POS subset for prosody
//------------------------
enum PROSODY_POS
{
    POS_UNK,        // unknown
    POS_FUNC,       // any function word
    POS_CONTENT,    // any content word
    POS_AUX,
};

// Reverb delay presets
typedef enum REVERBTYPE
{
    REVERB_TYPE_OFF = 0,
    REVERB_TYPE_BATHTUB,
    REVERB_TYPE_ROOM,
    REVERB_TYPE_HALL,
    REVERB_TYPE_CHURCH,
    REVERB_TYPE_STADIUM,
    REVERB_TYPE_ECHO,
    REVERB_TYPE_ROBOSEQ,    // Robot with 'sequencer'
} REVERBTYPE;

// Unit position flags (bit mask)
typedef enum UNITFLAGS
{
    WORD_START_FLAG = (1L << 0),    // Word starts on this unit
    SENT_START_FLAG = (1L << 1),    // Sentence starts on this unit
} UNITFLAGS;

// Size of the MSVOICEINFO::TapCoefficients array
typedef enum TAPS
{
    MAXTAPS = 8
} TAPS;

// User rate range
enum USER_RATE_VALUE
{
    MIN_USER_RATE       = (-18),
    MAX_USER_RATE       = 18,
    DEFAULT_USER_RATE   = 0     // None
};

// Change to new rate if value is NOT this
enum
{
    NO_RATE_CHANGE = MAX_USER_RATE + 1
};

/*** UNIT_CVT
*   This describes the unit info structure
*/
typedef struct UNIT_CVT
{
    ULONG   PhonID;         // [in]  Phoneme ID
    ULONG   flags;          // [in]  Position flags
    ULONG   UnitID;         // [out] Inventory table ID
    ULONG   SenoneID;       // [out] Context offset
    float   Dur;            // [out] duration in seconds
    float   Amp;            // [out] Amplitude
    float   AmpRatio;       // [out] Amplitude gain
    CHAR    szUnitName[15]; // [out] name string
} UNIT_CVT;

/*** MSVOICEINFO
*   This describes the voice data object
*/
typedef struct MSVOICEINFO
{
    WAVEFORMATEX    WaveFormatEx;               // Voice data format.
    LCID            LangID;                     // Voice data language ID
    ULONG           Rate;                       // Words-per-minute
    ULONG           Pitch;                      // Average pitch in Hz
    REVERBTYPE      eReverbType;                // Reverb param
    ULONG           ProsodyGain;                // 0 = monotone
    ULONG           NumOfTaps;                  // BE: Whisper param
    float           TapCoefficients[MAXTAPS];   // BE: Whisper param
    float           VibratoFreq;                // Hertz
    ULONG           VibratoDepth;               // 0 - 100%
    ULONG           SampleRate;                 // 22050 typical
    ULONG           LPCOrder;                   // Number of LPC coefficients
    ULONG           FFTSize;                    // FFT window length
    float*          pWindow;                    // Hanning Window
} MSVOICEINFO;

/*** MSUNITDATA
*   This is the result of a unit fetch
*/
typedef struct MSUNITDATA
{
    ULONG   cNumEpochs;
    ULONG   cNumSamples;
    ULONG   cOrder;
    float  *pEpoch;
    float  *pLPC;
    float  *pRes;
    float  *pGain;
} MSUNITDATA;

// AlloToUnit() attributes
enum
{
    ALLO_IS_STRESSED = (1 << 0)
};

/*** IMSVoiceData
*   Private interface on TTS voice data objects. A voice data object encapsulates
*   the voice data with the necessary lookup logic.
*/
[
    object,
    local,
    uuid(6265B7E1-0340-11d3-B50C-00C04F797396),
    helpstring("IMSVoiceData Interface"),
    pointer_default(unique)
]
interface IMSVoiceData : IUnknown
{
    HRESULT GetVoiceInfo( [out]MSVOICEINFO* pVoiceInfo );
    HRESULT GetUnitIDs( [in,out]UNIT_CVT* pUnits, [in]ULONG cUnits );
    HRESULT GetUnitData( [in]ULONG unitID, [out]MSUNITDATA* pUnitData );
    HRESULT AlloToUnit( [in]short allo, [in]long attributes, [out]long* pUnitID );
};

/*** IMSTTSEngineInit
*   Private engine initialization interface used to connect the voice
*   object to the synthesizer.
*/
[
    object,
    local,
    uuid(8A7C38EB-D8B0-11d2-B504-00C04F797396),
    helpstring("IMSTTSEngineInit Interface"),
    pointer_default(unique)
]
interface IMSTTSEngineInit : IUnknown
{
    HRESULT VoiceInit( [in]IMSVoiceData* pVoiceData );
};

//=== CoClass definitions =====================================================
[
    uuid(3F7C4D29-D007-11D2-B503-00C04F797396),
    version(1.0),
    helpstring("MS TTS Engine 1.0 Type Library")
]
library MSTTSENGINELib
{
    importlib("stdole32.tlb");
    importlib("stdole2.tlb");

    //--- This object is used to load the voice data files
    //    and expose them to the driver.
    [
        uuid(65DBDDEF-0725-11d3-B50C-00C04F797396),
        helpstring("MSVoiceData Class")
    ]
    coclass MSVoiceData
    {
        [default] interface IMSVoiceData;
    };

    //--- This is the synthesizer object
    [
        uuid(B93AE09F-D033-11D2-B503-00C04F797396),
        helpstring("MSTTSEngine Class")
    ]
    coclass MSTTSEngine
    {
        [default] interface ISpTTSEngine;
        interface IMSTTSEngineInit;
    };
};