/****************************************************************************** * AlloOps.h * *-----------* * This is the header file for the following clsses: * CAlloCell * CAlloList * CDuration * CSyllableTagger * CToneTargets * CPitchProsody *------------------------------------------------------------------------------ * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99 * All Rights Reserved * *********************************************************************** MC ****/ #ifndef AlloOps_H #define AlloOps_H #include "stdafx.h" #include "commonlx.h" #include "ms_entropicengine.h" #include "FeedChain.h" #include #include "SpTtsEngDebug.h" //*************************** // Allophones //*************************** typedef enum { _IY_, _IH_, _EH_, _AE_, _AA_, _AH_, _AO_, _UH_, _AX_, _ER_, _EY_, _AY_, _OY_, _AW_, _OW_, _UW_, _IX_, _SIL_, _w_, _y_, _r_, _l_, _h_, _m_, _n_, _NG_, _f_, _v_, _TH_, _DH_, _s_, _z_, _SH_, _ZH_, _p_, _b_, _t_, _d_, _k_, _g_, _CH_, _JH_, _DX_, _STRESS1_, _STRESS2_, _EMPHSTRESS_, _SYLLABLE_, } ALLO_CODE; static const long NUMBER_OF_ALLO = (_SYLLABLE_ + 1); //----------------------------------- // For 2-word allo conversion //----------------------------------- static const short NO_IPA = 0; // XXXX XXXX XXXX XXXX XXXX bLis ssoo ttBB // X = unused // B = boundary type // t = syllable type // o = vowel order // s = stress type // i = word initial consonant // L = syLlable start // b = break enum ALLOTAGS { WORD_START = (1 << 0), TERM_BOUND = (1 << 1), BOUNDARY_TYPE_FIELD = WORD_START | TERM_BOUND, // mask WORD_END_SYLL = (1 << 2), TERM_END_SYLL = (1 << 3), SYLLABLE_TYPE_FIELD = WORD_END_SYLL | TERM_END_SYLL, // mask FIRST_SYLLABLE_IN_WORD = (1 << 4), // in multi-syllable word MID_SYLLABLE_IN_WORD = (2 << 4), LAST_SYLLABLE_IN_WORD = (3 << 4), MORE_THAN_ONE_SYLLABLE_IN_WORD = LAST_SYLLABLE_IN_WORD, // either bit is set ONE_OR_NO_SYLLABLE_IN_WORD = 0x0000, // niether bits are set SYLLABLE_ORDER_FIELD = LAST_SYLLABLE_IN_WORD, // mask PRIMARY_STRESS = (1 << 6), SECONDARY_STRESS = (1 << 7), EMPHATIC_STRESS = (1 << 8), IS_STRESSED = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, PRIM_OR_EMPH_STRESS = PRIMARY_STRESS | EMPHATIC_STRESS, STRESS_FIELD = PRIMARY_STRESS | SECONDARY_STRESS | EMPHATIC_STRESS, // mask WORD_INITIAL_CONSONANT = (1 << 9), // up to 1st vowel in word STRESSED_INITIAL_CONS = (IS_STRESSED + WORD_INITIAL_CONSONANT), SYLLABLE_START = (1 << 10), SIL_BREAK = (1 << 11), }; //*************************** // AlloFlags //*************************** enum ALLOFLAGS { KVOWELF = (1<<0), KCONSONANTF = (1<<1), KVOICEDF = (1<<2), KVOWEL1F = (1<<3), KSONORANTF = (1<<4), KSONORANT1F = (1<<5), KNASALF = (1<<6), KLIQGLIDEF = (1<<7), KSONORCONSONF = (1<<8), KPLOSIVEF = (1<<9), KPLOSFRICF = (1<<10), KOBSTF = (1<<11), KSTOPF = (1<<12), KALVEOLARF = (1<<13), KVELAR = (1<<14), KLABIALF = (1<<15), KDENTALF = (1<<16), KPALATALF = (1<<17), KYGLIDESTARTF = (1<<18), KYGLIDEENDF = (1<<19), KGSTOPF = (1<<20), KFRONTF = (1<<21), KDIPHTHONGF = (1<<22), KHASRELEASEF = (1<<23), KAFFRICATEF = (1<<24), KLIQGLIDE2F = (1<<25), KVOCLIQ = (1<<26), KFRIC = (1<<27), KFLAGMASK1 = (KLABIALF+KDENTALF+KPALATALF+KALVEOLARF+KVELAR+KGSTOPF), KFLAGMASK2 = (KALVEOLARF-1), }; #define BOUNDARY_BASE 1000 enum TOBI_BOUNDARY { K_NOBND = 0, K_LMINUS = BOUNDARY_BASE, // fall K_HMINUS, // none K_LMINUSLPERC, K_LMINUSHPERC, K_HMINUSHPERC, K_HMINUSLPERC, }; enum TUNE_TYPE { NULL_BOUNDARY = 0, // no boundary NOTE: always put this at the beginning PHRASE_BOUNDARY, // comma EXCLAM_BOUNDARY, // exclamatory utterance terminator YN_QUEST_BOUNDARY, // yes-no question terminator WH_QUEST_BOUNDARY, // yes-no question terminator DECLAR_BOUNDARY, // declarative terminator PAREN_L_BOUNDARY, // left paren PAREN_R_BOUNDARY, // right paren QUOTE_L_BOUNDARY, // left quote QUOTE_R_BOUNDARY, // right quote PHONE_BOUNDARY, TOD_BOUNDARY, ELLIPSIS_BOUNDARY, SUB_BOUNDARY_1, // NOTE: always put these at the end SUB_BOUNDARY_2, SUB_BOUNDARY_3, SUB_BOUNDARY_4, SUB_BOUNDARY_5, SUB_BOUNDARY_6, NUMBER_BOUNDARY, TAIL_BOUNDARY, }; //*************************** // ToBI Constants //*************************** // !H is removed from consideration in the first pass processing // !H can possibly be recovered from analysis of the labeling and // contour at later stages (tilt, prominence, pitch range, downstep) #define ACCENT_BASE 1 enum TOBI_ACCENT { K_NOACC = 0, K_HSTAR = ACCENT_BASE, // peak rise / fall K_LSTAR, // acc syll nucleus valley early fall K_LSTARH, // late rise K_RSTAR, // K_LHSTAR, // early rise K_DHSTAR, // K_HSTARLSTAR, }; enum BOUNDARY_SOURCE { BND_NoSource = 0, //-- Phrase boundary rules BND_PhraseRule1, BND_PhraseRule2, BND_PhraseRule3, BND_PhraseRule4, BND_PhraseRule5, BND_PhraseRule6, BND_PhraseRule7, BND_PhraseRule8, BND_PhraseRule9, BND_PhraseRule10, BND_PhraseRule11, BND_PhraseRule12, BND_PhraseRule13, //-- ToBI BND_YNQuest, BND_WHQuest, BND_Period, BND_Comma, //--Templates BND_NumberTemplate, // Should never get this! BND_IntegerQuant, BND_Currency_DOLLAR, BND_Frac_Num, BND_Phone_COUNTRY, BND_Phone_AREA, BND_Phone_ONE, BND_Phone_DIGITS, BND_TimeOFDay_HR, BND_TimeOFDay_AB, BND_Ellipsis, BND_ForcedTerm, // Should never get this! BND_IDontKnow, }; enum ACCENT_SOURCE { ACC_NoSource = 0, //-- Phrase boundary rules ACC_PhraseRule1, ACC_PhraseRule2, ACC_PhraseRule3, ACC_PhraseRule4, ACC_PhraseRule5, ACC_PhraseRule6, ACC_PhraseRule7, ACC_PhraseRule8, ACC_PhraseRule9, ACC_PhraseRule10, ACC_PhraseRule11, ACC_PhraseRule12, ACC_PhraseRule13, //-- ToBI ACC_InitialVAux, ACC_FunctionSeq, ACC_ContentSeq, ACC_YNQuest, ACC_Period, ACC_Comma, //--Templates ACC_IntegerGroup, ACC_NumByNum, ACC_Frac_DEN, // "half", "tenths", etc. ACC_Phone_1stArea, // 1st digit in area code ACC_Phone_3rdArea, // 3rd digit in area code ACC_Phone_1st3, ACC_Phone_3rd3, ACC_Phone_1st4, ACC_Phone_3rd4, ACC_TimeOFDay_HR, ACC_TimeOFDay_1stMin, ACC_TimeOFDay_M, ACC_PhoneBnd_AREA, ACC_PhoneBnd_34, ACC_PhoneBnd_4, ACC_IDontKnow, }; enum SILENCE_SOURCE { SIL_NoSource = 0, SIL_Term, SIL_QuoteStart, SIL_QuoteEnd, SIL_ParenStart, SIL_ParenEnd, SIL_Emph, SIL_SubBound, // Should never see this (gets removed) SIL_XML, //-- Prosody templates SIL_TimeOfDay_HR, SIL_TimeOfDay_AB, SIL_Phone_COUNTRY, SIL_Phone_AREA, SIL_Phone_ONE, SIL_Phone_DIGITS, SIL_Fractions_NUM, SIL_Currency_DOLLAR, SIL_Integer_Quant, SIL_Head, SIL_Tail, SIL_Ellipsis, SIL_ForcedTerm, // Should never get this! }; static const short TOKEN_LEN_MAX = 20; class CFEToken { public: CFEToken(); ~CFEToken(); WCHAR tokStr[TOKEN_LEN_MAX]; long tokLen; PRONSRC m_PronType; long phon_Len; ALLO_CODE phon_Str[SP_MAX_PRON_LENGTH]; // Allo string ENGPARTOFSPEECH POScode; PROSODY_POS m_posClass; ULONG srcPosition; // Source position for this token ULONG srcLen; // Source length for this token ULONG sentencePosition; // Source position for sentence ULONG sentenceLen; // Source length for sentence ULONG user_Volume; // 1 - 101 long user_Rate; // -10 - 10 long user_Pitch; // -10 - 10 long user_Emph; // 0 or 5 ULONG user_Break; // ms of silence CBookmarkList *pBMObj; TOBI_ACCENT m_Accent; // accent prosodic control long m_Accent_Prom; // prominence prosodic control TOBI_BOUNDARY m_Boundary; // boundary tone prosodic control long m_Boundary_Prom; // prominence prosodic control TUNE_TYPE m_TuneBoundaryType; // Current token is a boundary float m_TermSil; // Pad word with silence (in sec) float m_DurScale; // Duration ratio float m_ProsodyDurScale; float m_PitchBaseOffs; // Relative baseline pitch offset in octaves float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0) //--- Diagnostic ACCENT_SOURCE m_AccentSource; BOUNDARY_SOURCE m_BoundarySource; SILENCE_SOURCE m_SilenceSource; }; typedef CSPList CFETokenList; class CAlloCell { public: CAlloCell(); ~CAlloCell(); CAlloCell* operator =( CAlloCell *pNewCell ); //-------------------------------- // Member Vars //-------------------------------- ALLO_CODE m_allo; short m_dur; float m_ftDuration; float m_UnitDur; short m_knots; float m_ftTime[KNOTS_PER_PHON]; float m_ftPitch[KNOTS_PER_PHON]; long m_ctrlFlags; TOBI_ACCENT m_ToBI_Accent; long m_Accent_Prom; // prominence prosodic control TOBI_BOUNDARY m_ToBI_Boundary; long m_Boundary_Prom; // prominence prosodic control long m_PitchBufStart; long m_PitchBufEnd; ULONG m_user_Volume; long m_user_Rate; long m_user_Pitch; long m_user_Emph; ULONG m_user_Break; ULONG m_Sil_Break; float m_Pitch_HI; float m_Pitch_LO; ULONG m_SrcPosition; ULONG m_SrcLen; ULONG m_SentencePosition; // Source position for sentence ULONG m_SentenceLen; // Source length for sentence TUNE_TYPE m_TuneBoundaryType; TUNE_TYPE m_NextTuneBoundaryType; CBookmarkList *m_pBMObj; float m_DurScale; // Duration ratio float m_ProsodyDurScale; float m_PitchBaseOffs; // Relative baseline pitch offset in octaves float m_PitchRangeScale; // Pitch range offset scale (0 - 2.0) //--- Diagnostic ACCENT_SOURCE m_AccentSource; BOUNDARY_SOURCE m_BoundarySource; SILENCE_SOURCE m_SilenceSource; char *m_pTextStr; }; class CAlloList { public: CAlloList(); ~CAlloList(); //-------------------------------- // Methods //-------------------------------- CAlloCell *GetCell( long index ); CAlloCell *GetTailCell(); long GetCount(); bool WordToAllo( CFEToken *pPrevTok, CFEToken *pTok, CFEToken *pNextTok, CAlloCell *pEndCell ); CAlloCell *GetHeadCell() { m_ListPos = m_AlloCellList.GetHeadPosition(); return m_AlloCellList.GetNext( m_ListPos ); } CAlloCell *GetNextCell() { if( m_ListPos ) { return m_AlloCellList.GetNext( m_ListPos ); } else { //-- We're at end of list! return NULL; } } //-- For debug only void OutAllos(); private: //-------------------------------- // Member Vars //-------------------------------- long m_cAllos; SPLISTPOS m_ListPos; CSPList m_AlloCellList; }; //----------------------------------- // Speaking Rate parameters //----------------------------------- static const float MAX_SIL_DUR = 1.0f; // seconds static const float MIN_ALLO_DUR = 0.011f; // seconds static const float MAX_ALLO_DUR = 5.0f; // seconds class CDuration { public: //-------------------------------- // Methods //-------------------------------- void AlloDuration( CAlloList *pAllos, float rateRatio ); private: void Pause_Insertion( long userDuration, long silBreak ); void PhraseFinal_Lengthen( long cellCount ); long Emphatic_Lenghen( long lastStress ); //-------------------------------- // Member vars //-------------------------------- float m_DurHold; float m_TotalDurScale; float m_durationPad; ALLO_CODE m_cur_Phon; long m_cur_PhonCtrl; long m_cur_PhonFlags; long m_cur_SyllableType; short m_cur_VowelFlag; long m_cur_Stress; ALLO_CODE m_prev_Phon; long m_prev_PhonCtrl; long m_prev_PhonFlags; ALLO_CODE m_next_Phon; long m_next_PhonCtrl; long m_next_PhonFlags; ALLO_CODE m_next2_Phon; long m_next2_PhonCtrl; long m_next2_PhonFlags; TUNE_TYPE m_NextBoundary, m_CurBoundary; }; typedef struct { ALLO_CODE allo; long ctrlFlags; }ALLO_ARRAY; class CSyllableTagger { public: //-------------------------------- // Methods //-------------------------------- void TagSyllables( CAlloList *pAllos ); private: void MarkSyllableOrder( long scanIndex); void MarkSyllableBoundry( long scanIndex); void MarkSyllableStart(); short Find_Next_Word_Bound( short index ); short If_Consonant_Cluster( ALLO_CODE Consonant_1st, ALLO_CODE Consonant_2nd); void ListToArray( CAlloList *pAllos ); void ArrayToList( CAlloList *pAllos ); //-------------------------------- // Member vars //-------------------------------- ALLO_ARRAY *m_pAllos; long m_numOfCells; }; enum { TARG_PER_ALLO_MAX = 2 }; // One for accent and one for boundary enum TUNE_STYLE { FLAT_TUNE = 0, // flat DESCEND_TUNE, // go down ASCEND_TUNE, // go up }; //------------------ // Global Constants //------------------ static const float PITCH_BUF_RES = (float)0.005; static const float K_HSTAR_OFFSET = (float)0.5; static const float K_HDOWNSTEP_COEFF = (float)0.5; //------------------ // Macros //------------------ #define CeilVal(x) ((m_CeilSlope * x) + m_CeilStart) #define FloorVal(x) ((m_FloorSlope * x) + m_FloorStart) #define RefVal(x) ((m_RefSlope * x) + m_RefStart) class CPitchProsody { public: //-------------------------------- // Methods //-------------------------------- CPitchProsody() { m_pContBuf = NULL; }; ~CPitchProsody() { if ( m_pContBuf ) delete m_pContBuf; }; void AlloPitch( CAlloList *pAllos, int baseLine, int pitchRange ); void GetContour( float**, ULONG* ); private: float DoPitchControl( long pitchControl, float basePitch ); void PitchTrack(); void SetDefaultPitch(); void GetKnots(); void NewTarget( long index, float value ); //-------------------------------- // Member vars //-------------------------------- CAlloList *m_pAllos; long m_numOfCells; float m_TotalDur; // phrase duration in seconds TUNE_STYLE m_Tune_Style; float *m_pContBuf; ULONG m_ulNumPoints; float m_OffsTime; TOBI_ACCENT m_CurAccent; //------------------------ // Diagnostic //------------------------ ACCENT_SOURCE m_CurAccentSource; BOUNDARY_SOURCE m_CurBoundarySource; char *m_pCurTextStr; }; #endif //--- This must be the last line in the file