/****************************************************************************** * StdSentEnum.h * *---------------* * This is the header file for the CStdSentEnum implementation. *------------------------------------------------------------------------------ * Copyright (C) 1999 Microsoft Corporation Date: 03/01/99 * All Rights Reserved * *********************************************************************** EDC ***/ #ifndef StdSentEnum_h #define StdSentEnum_h //--- Additional includes #include "stdafx.h" #include "spttseng.h" #include "resource.h" #include "SentItemMemory.h" #include "morph.h" //=== Constants ==================================================== typedef enum SEPARATOR_AND_DECIMAL { PERIOD_COMMA = (1L << 0), COMMA_PERIOD = (1L << 1) } SEPARATOR_AND_DECIMAL; typedef enum SHORT_DATE_ORDER { MONTH_DAY_YEAR = (1L << 0), DAY_MONTH_YEAR = (1L << 1), YEAR_MONTH_DAY = (1L << 2) } SHORT_DATE_ORDER; //--- Vowel WCHAR values - used to disambiguate pronunciations of certain words const WCHAR g_Vowels[] = { 0x0a, // AA 0x0b, // AE 0x0c, // AH 0x0d, // AO 0x0e, // AW 0x0f, // AX 0x10, // AY 0x15, // EH 0x16, // ER 0x17, // EY 0x1b, // IH 0x1c, // IY 0x23, // OW 0x24, // OY 0x2a, // UH 0x2b, // UW }; //--- Normalization constants - see NormData.cpp extern const char g_pFlagCharacter; extern const unsigned char g_AnsiToAscii[256]; extern const SPLSTR g_O; extern const SPLSTR g_negative; extern const SPLSTR g_decimalpoint; extern const SPLSTR g_to; extern const SPLSTR g_a; extern const SPLSTR g_of; extern const SPLSTR g_percent; extern const SPLSTR g_degree; extern const SPLSTR g_degrees; extern const SPLSTR g_squared; extern const SPLSTR g_cubed; extern const SPLSTR g_ones[10]; extern const SPLSTR g_tens[10]; extern const SPLSTR g_teens[10]; extern const SPLSTR g_onesOrdinal[10]; extern const SPLSTR g_tensOrdinal[10]; extern const SPLSTR g_teensOrdinal[10]; extern const SPLSTR g_quantifiers[6]; extern const SPLSTR g_quantifiersOrdinal[6]; extern const SPLSTR g_dash; extern WCHAR g_Euro[2]; struct CurrencySign { SPLSTR Sign; SPLSTR MainUnit; SPLSTR SecondaryUnit; }; struct StateStruct { SPLSTR Abbreviation; SPLSTR FullName; }; extern const StateStruct g_StateAbbreviations[63]; extern const CurrencySign g_CurrencySigns[14]; extern const SPLSTR g_SingularPrimaryCurrencySigns[14]; extern const SPLSTR g_SingularSecondaryCurrencySigns[14]; extern const WCHAR g_DateDelimiters[3]; extern const SPLSTR g_months[12]; extern const SPLSTR g_monthAbbreviations[13]; extern const SPLSTR g_days[7]; extern const SPLSTR g_dayAbbreviations[10]; extern const SPLSTR g_Area; extern const SPLSTR g_Country; extern const SPLSTR g_Code; extern const SPLSTR g_Half; extern const SPLSTR g_Tenths; extern const SPLSTR g_Sixteenths; extern const SPLSTR g_Hundredths; extern const SPLSTR g_Over; extern const SPLSTR g_PluralDenominators[10]; extern const SPLSTR g_A; extern const SPLSTR g_M; extern const SPLSTR g_P; extern const SPLSTR g_OClock; extern const SPLSTR g_hundred; extern const SPLSTR g_hour; extern const SPLSTR g_hours; extern const SPLSTR g_minute; extern const SPLSTR g_minutes; extern const SPLSTR g_second; extern const SPLSTR g_seconds; extern const SPLSTR g_ANSICharacterProns[256]; extern const SPVSTATE g_DefaultXMLState; extern const SPLSTR g_And; extern const SPLSTR g_comma; extern const SPLSTR g_period; extern const SPLSTR g_periodString; extern const SPLSTR g_slash; extern const SPLSTR g_Decades[]; extern const SPLSTR g_Zeroes; extern const SPLSTR g_Hundreds; #define DAYMAX 31 #define DAYMIN 1 #define MONTHMAX 12 #define MONTHMIN 1 #define YEARMAX 9999 #define YEARMIN 0 #define HOURMIN 1 #define HOURMAX 23 #define MINUTEMIN 0 #define MINUTEMAX 59 #define SECONDMIN 0 #define SECONDMAX 59 //--- POS Tagger Constants - see MiscData.cpp typedef enum TEMPLATETYPE { PREV1T, NEXT1T, PREV2T, NEXT2T, PREV1OR2T, NEXT1OR2T, PREV1OR2OR3T, NEXT1OR2OR3T, PREV1TNEXT1T, PREV1TNEXT2T, PREV2TNEXT1T, NOTCAP, CAP, PREVNOTCAP, PREVCAP, PREV1W, NEXT1W, PREV2W, NEXT2W, PREV1OR2W, NEXT1OR2W, CURRWPREV1W, CURRWNEXT1W, CURRWPREV1T, CURRWNEXT1T, CURRW, PREV1WT, NEXT1WT, CURRWPREV1WT, CURRWNEXT1WT } TEMPLATETYPE; struct BrillPatch { ENGPARTOFSPEECH eCurrentPOS; ENGPARTOFSPEECH eConvertToPOS; TEMPLATETYPE eTemplateType; ENGPARTOFSPEECH eTemplatePOS1; ENGPARTOFSPEECH eTemplatePOS2; const WCHAR* pTemplateWord1; const WCHAR* pTemplateWord2; }; extern const BrillPatch g_POSTaggerPatches [63]; //=== Class, Enum, Struct and Union Declarations =================== typedef CSPList CWordList; typedef CSPList CItemList; //--- Structs used for normalization typedef enum { PRECEDING, FOLLOWING, UNATTACHED } NORM_POSITION; struct NumberGroup { BOOL fOnes; // "one" through "nineteen" BOOL fTens; // "twenty" through "ninety" BOOL fHundreds; // "one hundred" through "nine hundred" BOOL fQuantifier; // "thousand" through "quadrillion" }; struct TTSIntegerItemInfo { long lNumGroups; NumberGroup Groups[6]; BOOL fOrdinal; BOOL fDigitByDigit; ULONG ulNumDigits; //--- Normalization internal only long lLeftOver; BOOL fSeparators; const WCHAR* pStartChar; const WCHAR* pEndChar; }; struct TTSDigitsItemInfo : TTSItemInfo { const WCHAR* pFirstDigit; ULONG ulNumDigits; }; struct TTSNumberItemInfo; struct TTSFractionItemInfo { BOOL fIsStandard; TTSNumberItemInfo* pNumerator; TTSNumberItemInfo* pDenominator; //--- Normalization internal only const WCHAR* pVulgar; }; struct TTSNumberItemInfo : TTSItemInfo { BOOL fNegative; TTSIntegerItemInfo* pIntegerPart; TTSDigitsItemInfo* pDecimalPart; TTSFractionItemInfo* pFractionalPart; //--- Normalization internal only const WCHAR* pStartChar; const WCHAR* pEndChar; CWordList* pWordList; }; struct TTSPhoneNumberItemInfo : TTSItemInfo { //--- Country code members TTSNumberItemInfo* pCountryCode; //--- Area code members TTSDigitsItemInfo* pAreaCode; BOOL fIs800; BOOL fOne; //--- Main number members TTSDigitsItemInfo** ppGroups; ULONG ulNumGroups; }; struct TTSZipCodeItemInfo : TTSItemInfo { TTSDigitsItemInfo* pFirstFive; TTSDigitsItemInfo* pLastFour; }; struct TTSStateAndZipCodeItemInfo : TTSItemInfo { TTSZipCodeItemInfo* pZipCode; }; struct TTSCurrencyItemInfo : TTSItemInfo { TTSNumberItemInfo* pPrimaryNumberPart; TTSNumberItemInfo* pSecondaryNumberPart; BOOL fQuantifier; long lNumPostNumberStates; long lNumPostSymbolStates; }; struct TTSYearItemInfo : TTSItemInfo { const WCHAR* pYear; ULONG ulNumDigits; }; struct TTSRomanNumeralItemInfo : TTSItemInfo { TTSItemInfo* pNumberInfo; }; struct TTSDecadeItemInfo : TTSItemInfo { const WCHAR* pCentury; ULONG ulDecade; }; struct TTSDateItemInfo : TTSItemInfo { ULONG ulDayIndex; ULONG ulMonthIndex; TTSIntegerItemInfo* pDay; TTSYearItemInfo* pYear; }; typedef enum { AM, PM, UNDEFINED } TIMEABBREVIATION; struct TTSTimeOfDayItemInfo : TTSItemInfo { BOOL fTimeAbbreviation; BOOL fTwentyFourHour; BOOL fMinutes; }; struct TTSTimeItemInfo : TTSItemInfo { TTSNumberItemInfo* pHours; TTSNumberItemInfo* pMinutes; const WCHAR* pSeconds; }; struct TTSHyphenatedStringInfo : TTSItemInfo { TTSItemInfo* pFirstChunkInfo; TTSItemInfo* pSecondChunkInfo; const WCHAR* pFirstChunk; const WCHAR* pSecondChunk; }; struct TTSSuffixItemInfo : TTSItemInfo { const WCHAR* pFirstChar; ULONG ulNumChars; }; struct TTSNumberRangeItemInfo : TTSItemInfo { TTSItemInfo *pFirstNumberInfo; TTSItemInfo *pSecondNumberInfo; }; struct TTSTimeRangeItemInfo : TTSItemInfo { TTSTimeOfDayItemInfo *pFirstTimeInfo; TTSTimeOfDayItemInfo *pSecondTimeInfo; }; struct AbbrevRecord { const WCHAR* pOrth; WCHAR* pPron1; ENGPARTOFSPEECH POS1; WCHAR* pPron2; ENGPARTOFSPEECH POS2; WCHAR* pPron3; ENGPARTOFSPEECH POS3; int iSentBreakDisambig; int iPronDisambig; }; struct TTSAbbreviationInfo : TTSItemInfo { const AbbrevRecord* pAbbreviation; }; //--- Structs used for Lex Lookup typedef enum { PRON_A = 0, PRON_B = 1 }; struct PRONUNIT { ULONG phon_Len; WCHAR phon_Str[SP_MAX_PRON_LENGTH]; // Allo string ULONG POScount; ENGPARTOFSPEECH POScode[POS_MAX]; }; struct PRONRECORD { WCHAR orthStr[SP_MAX_WORD_LENGTH]; // Orth text WCHAR lemmaStr[SP_MAX_WORD_LENGTH]; // Root word ULONG pronType; // Pronunciation is lex or LTS PRONUNIT pronArray[2]; ENGPARTOFSPEECH POSchoice; ENGPARTOFSPEECH XMLPartOfSpeech; bool hasAlt; ULONG altChoice; BOOL fUsePron; }; //--- Miscellaneous structs and typedefs struct SentencePointer { const WCHAR *pSentenceStart; const SPVTEXTFRAG *pSentenceFrag; }; //=== Function Definitions =========================================== // Misc Number Normalization functions and helpers int MatchCurrencySign( const WCHAR*& pStartChar, const WCHAR*& pEndChar, NORM_POSITION& ePosition ); //=== Classes /*** CSentenceStack ************************************************* * This class is used to maintain a stack of sentences for the Skip * call to utilize. */ class CSentenceStack { public: /*--- Methods ---*/ CSentenceStack() { m_StackPtr = -1; } int GetCount( void ) { return m_StackPtr + 1; } virtual SentencePointer& Pop( void ) { SPDBG_ASSERT( m_StackPtr > -1 ); return m_Stack[m_StackPtr--]; } virtual HRESULT Push( const SentencePointer& val ) { ++m_StackPtr; return m_Stack.SetAtGrow( m_StackPtr, val ); } virtual void Reset( void ) { m_StackPtr = -1; } protected: /*--- Member data ---*/ CSPArray m_Stack; int m_StackPtr; }; /*** CSentItem * This object is a helper class */ class CSentItem : public TTSSentItem { public: CSentItem() { memset( this, 0, sizeof(*this) ); } CSentItem( TTSSentItem& Other ) { memcpy( this, &Other, sizeof( Other ) ); } }; /*** CSentItemEnum * This object is designed to be used by a single thread. */ class ATL_NO_VTABLE CSentItemEnum : public CComObjectRootEx, public IEnumSENTITEM { /*=== ATL Setup ===*/ public: DECLARE_PROTECT_FINAL_CONSTRUCT() BEGIN_COM_MAP(CSentItemEnum) COM_INTERFACE_ENTRY(IEnumSENTITEM) END_COM_MAP() /*=== Methods =======*/ public: /*--- Constructors/Destructors ---*/ /*--- Non interface methods ---*/ void _SetOwner( IUnknown* pOwner ) { m_cpOwner = pOwner; } CItemList& _GetList( void ) { return m_ItemList; } CSentItemMemory& _GetMemoryManager( void ) { return m_MemoryManager; } /*=== Interfaces ====*/ public: //--- IEnumSpSentence ---------------------------------------- STDMETHOD(Next)( TTSSentItem *pItemEnum ); STDMETHOD(Reset)( void ); /*=== Member data ===*/ private: CComPtr m_cpOwner; CItemList m_ItemList; SPLISTPOS m_ListPos; CSentItemMemory m_MemoryManager; }; /*** CStdSentEnum COM object */ class ATL_NO_VTABLE CStdSentEnum : public CComObjectRootEx, public IEnumSpSentence { /*=== ATL Setup ===*/ public: DECLARE_GET_CONTROLLING_UNKNOWN() DECLARE_PROTECT_FINAL_CONSTRUCT() BEGIN_COM_MAP(CStdSentEnum) COM_INTERFACE_ENTRY(IEnumSpSentence) END_COM_MAP() /*=== Methods =======*/ public: /*--- Constructors/Destructors ---*/ HRESULT FinalConstruct(); void FinalRelease(); /*--- Non interface methods ---*/ HRESULT InitAggregateLexicon( void ); HRESULT AddLexiconToAggregate( ISpLexicon *pAddLexicon, DWORD dwFlags ); HRESULT InitMorphLexicon( void ); //--- Abbreviation Sentence Breaking Disambiguation Functions HRESULT IsAbbreviationEOS( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT IfEOSNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT IfEOSAndLowercaseNotAbbreviation( const AbbrevRecord* pAbbreviation, CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); //--- Abbreviation Pronunciation Disambiguation Functions HRESULT SingleOrPluralAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT DoctorDriveAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AbbreviationFollowedByDigit( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AllCapsAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT CapitalizedAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT SECAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT DegreeAbbreviation( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT AbbreviationModifier( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT ADisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); HRESULT PolishDisambig( const AbbrevRecord* pAbbrevInfo, PRONRECORD* pPron, CItemList& ItemList, SPLISTPOS ListPos ); //--- Word Pronunciation Disambiguation Functions HRESULT MeasurementDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT TheDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT ReadDisambig( const AbbrevRecord* pAbbrevInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); private: //--- Pronunciation Table init helper HRESULT InitPron( WCHAR** OriginalPron ); //--- Sentence breaking helpers ---// HRESULT GetNextSentence( IEnumSENTITEM** pItemEnum ); HRESULT AddNextSentItem( CItemList& ItemList, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); HRESULT SkipWhiteSpaceAndTags( const WCHAR*& pStartChar, const WCHAR*& pEndChar, const SPVTEXTFRAG*& pCurrFrag, CSentItemMemory& pMemoryManager, BOOL fAddToItemList = false, CItemList* pItemList = NULL ); const WCHAR* FindTokenEnd( const WCHAR* pStartChar, const WCHAR* pEndChar ); //--- Lexicon and POS helpers ---// HRESULT DetermineProns( CItemList& ItemList, CSentItemMemory& MemoryManager ); HRESULT Pronounce( PRONRECORD *pPron ); //--- Normalization helpers ---// HRESULT Normalize( CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT MatchCategory( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT ExpandCategory( TTSItemInfo*& pItemNormInfo, CItemList& ItemList, SPLISTPOS ListPos, CSentItemMemory& MemoryManager ); HRESULT DoUnicodeToAsciiMap( const WCHAR *pUnicodeString, ULONG ulUnicodeStringLength, WCHAR *ppConvertedString ); HRESULT IsAlphaWord( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT IsInitialism( CItemList& ItemList, SPLISTPOS ItemPos, CSentItemMemory& MemoryManager, BOOL* pfIsEOS ); //--- Various Number Related Normalization helpers ---// HRESULT IsNumberCategory( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager, BOOL fMultiItem = true ); HRESULT IsInteger( const WCHAR* pStartChar, TTSIntegerItemInfo*& pIntegerInfo, CSentItemMemory& MemoryManager ); HRESULT IsDigitString( const WCHAR* pStartChar, TTSDigitsItemInfo*& pDigitsInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandNumber( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandPercent( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandDegrees( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandSquare( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandCube( TTSNumberItemInfo* pItemInfo, CWordList& WordList ); void ExpandInteger( TTSIntegerItemInfo* pItemInfo, const WCHAR* Context, CWordList &WordList ); void ExpandDigit( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandTwoDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandThreeDigits( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandDigitOrdinal( const WCHAR Number, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandTwoOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandThreeOrdinal( const WCHAR* NumberString, NumberGroup& NormGroupInfo, CWordList& WordList ); void ExpandDigits( TTSDigitsItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsFraction( const WCHAR* pStartChar, TTSFractionItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandFraction( TTSFractionItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsRomanNumeral( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsPhoneNumber( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsZipCode( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT ExpandZipCode( TTSZipCodeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsCurrency( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsNumberRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandNumberRange( TTSNumberRangeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsCurrencyRange( TTSItemInfo*& pItemInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); //--- Date Related Normalization helpers ---// HRESULT IsNumericCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsMonthStringCompactDate( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT IsLongFormDate_DMDY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsLongFormDate_DDMY( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT ExpandDate( TTSDateItemInfo* pItemInfo, CWordList& WordList ); HRESULT ExpandYear( TTSYearItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsDecade( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandDecade( TTSDecadeItemInfo* pItemInfo, CWordList& WordList ); ULONG MatchMonthString( WCHAR*& pMonth, ULONG ulLength ); ULONG MatchDayString( WCHAR*& pDayString, WCHAR* pEndChar ); bool MatchDateDelimiter( WCHAR **DateString ); //--- Time Related Normalization helpers ---// HRESULT IsTimeOfDay( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList, BOOL fMultiItem = true ); HRESULT IsTime( TTSItemInfo*& pItemNormInfo, const WCHAR* Context, CSentItemMemory& MemoryManager ); HRESULT ExpandTime( TTSTimeItemInfo* pItemInfo, CWordList& WordList ); HRESULT IsTimeRange( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); //--- SPELL tag normalization helper HRESULT SpellOutString( CWordList& WordList ); void ExpandPunctuation( CWordList& WordList, WCHAR wc ); //--- Default normalization helper HRESULT ExpandUnrecognizedString( CWordList& WordList, CSentItemMemory& MemoryManager ); //--- Misc. normalization helpers HRESULT IsStateAndZipcode( TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager, CWordList& WordList ); HRESULT IsHyphenatedString( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandHyphenatedString( TTSHyphenatedStringInfo* pItemInfo, CWordList& WordList ); HRESULT IsSuffix( const WCHAR* pStartChar, const WCHAR* pEndChar, TTSItemInfo*& pItemNormInfo, CSentItemMemory& MemoryManager ); HRESULT ExpandSuffix( TTSSuffixItemInfo* pItemInfo, CWordList& WordList ); bool Zeroes( const WCHAR* ); bool ThreeZeroes( const WCHAR* ); bool IsPunctuation(const TTSSentItem *Item); /*=== Interfaces ====*/ public: //--- IEnumSpSentence ---------------------------------------- STDMETHOD(SetFragList)( const SPVTEXTFRAG* pTextFragList, DWORD dwFlags ); STDMETHOD(Next)( IEnumSENTITEM **ppSentItemEnum ); STDMETHOD(Previous)( IEnumSENTITEM **ppSentItemEnum ); STDMETHOD(Reset)( void ); //=== Data members === private: CComPtr m_cpAggregateLexicon; CComPtr m_cpPhonemeConverter; CSMorph* m_pMorphLexicon; DWORD m_dwSpeakFlags; const SPVTEXTFRAG* m_pTextFragList; const SPVTEXTFRAG* m_pCurrFrag; const WCHAR* m_pNextChar; const WCHAR* m_pEndChar; const WCHAR* m_pEndOfCurrToken; const WCHAR* m_pEndOfCurrItem; CSentenceStack m_SentenceStack; SEPARATOR_AND_DECIMAL m_eSeparatorAndDecimal; SHORT_DATE_ORDER m_eShortDateOrder; static CComAutoCriticalSection m_AbbrevTableCritSec; }; //--- Structs and typedefs used for abbreviation stuff typedef HRESULT (CStdSentEnum::* SentBreakDisambigFunc)(const AbbrevRecord*, CItemList& , SPLISTPOS, CSentItemMemory&, BOOL*); typedef HRESULT (CStdSentEnum::* PronDisambigFunc) ( const AbbrevRecord*, PRONRECORD*, CItemList&, SPLISTPOS ); typedef HRESULT (CStdSentEnum::* PostLexLookupDisambigFunc) ( const AbbrevRecord*, CItemList&, SPLISTPOS, CSentItemMemory& ); extern AbbrevRecord g_AbbreviationTable[177]; extern const PronDisambigFunc g_PronDisambigTable[]; extern const SentBreakDisambigFunc g_SentBreakDisambigTable[]; extern AbbrevRecord g_AmbiguousWordTable[72]; extern const PronDisambigFunc g_AmbiguousWordDisambigTable[]; extern AbbrevRecord g_PostLexLookupWordTable[41]; extern const PostLexLookupDisambigFunc g_PostLexLookupDisambigTable[]; extern WCHAR *g_pOfA; extern WCHAR *g_pOfAn; extern BOOL g_fAbbrevTablesInitialized; extern void CleanupAbbrevTables( void ); //--- First words table - used in sentence breaking extern const SPLSTR g_FirstWords[163]; // //=== Inlines // inline ULONG my_wcstoul( const WCHAR *pStartChar, WCHAR **ppEndChar ) { if ( iswdigit( *pStartChar ) ) { return wcstoul( pStartChar, ppEndChar, 10 ); } else { if ( ppEndChar ) { *ppEndChar = (WCHAR*) pStartChar; } return 0; } } inline ENGPARTOFSPEECH ConvertItemTypeToPartOfSp( TTSItemType ItemType ) { switch ( ItemType ) { case eOPEN_PARENTHESIS: case eOPEN_BRACKET: case eOPEN_BRACE: return MS_GroupBegin; case eCLOSE_PARENTHESIS: case eCLOSE_BRACKET: case eCLOSE_BRACE: return MS_GroupEnd; case eSINGLE_QUOTE: case eDOUBLE_QUOTE: return MS_Quotation; case ePERIOD: case eQUESTION: case eEXCLAMATION: return MS_EOSItem; case eCOMMA: case eCOLON: case eSEMICOLON: case eHYPHEN: case eELLIPSIS: return MS_MiscPunc; default: return MS_Unknown; } } inline bool MatchPhoneNumberDelimiter( const WCHAR wc ) { return ( wc == L' ' || wc == L'-' || wc == L'.' ); } inline bool NeedsToBeNormalized( const AbbrevRecord* pAbbreviation ) { if( !wcscmp( pAbbreviation->pOrth, L"jan" ) || !wcscmp( pAbbreviation->pOrth, L"feb" ) || !wcscmp( pAbbreviation->pOrth, L"mar" ) || !wcscmp( pAbbreviation->pOrth, L"apr" ) || !wcscmp( pAbbreviation->pOrth, L"jun" ) || !wcscmp( pAbbreviation->pOrth, L"jul" ) || !wcscmp( pAbbreviation->pOrth, L"aug" ) || !wcscmp( pAbbreviation->pOrth, L"sep" ) || !wcscmp( pAbbreviation->pOrth, L"sept" ) || !wcscmp( pAbbreviation->pOrth, L"oct" ) || !wcscmp( pAbbreviation->pOrth, L"nov" ) || !wcscmp( pAbbreviation->pOrth, L"dec" ) || !wcscmp( pAbbreviation->pOrth, L"mon" ) || !wcscmp( pAbbreviation->pOrth, L"tue" ) || !wcscmp( pAbbreviation->pOrth, L"tues" ) || !wcscmp( pAbbreviation->pOrth, L"wed" ) || !wcscmp( pAbbreviation->pOrth, L"thu" ) || !wcscmp( pAbbreviation->pOrth, L"thur" ) || !wcscmp( pAbbreviation->pOrth, L"thurs" ) || !wcscmp( pAbbreviation->pOrth, L"fri" ) || !wcscmp( pAbbreviation->pOrth, L"sat" ) || !wcscmp( pAbbreviation->pOrth, L"sun" ) ) { return true; } else { return false; } } inline HRESULT SetWordList( CSentItem& Item, CWordList& WordList, CSentItemMemory& MemoryManager ) { HRESULT hr = S_OK; SPLISTPOS WordListPos = WordList.GetHeadPosition(); Item.ulNumWords = WordList.GetCount(); Item.Words = (TTSWord*) MemoryManager.GetMemory( Item.ulNumWords * sizeof(TTSWord), &hr ); if ( SUCCEEDED( hr ) ) { ULONG ulIndex = 0; while ( WordListPos ) { SPDBG_ASSERT( ulIndex < Item.ulNumWords ); Item.Words[ulIndex++] = WordList.GetNext( WordListPos ); } } return hr; } inline int CompareStringAndSPLSTR( const void* _String, const void* _SPLSTR ) { int _StringLen = wcslen( (const WCHAR*) _String ); int _SPLSTRLen = ( (const SPLSTR*) _SPLSTR )->Len; if ( _StringLen < _SPLSTRLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ); if ( Result != 0 ) { return Result; } else { return -1; } } else if ( _StringLen > _SPLSTRLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _SPLSTRLen ); if ( Result != 0 ) { return Result; } else { return 1; } } else { return ( wcsnicmp( (const WCHAR*) _String , ( (const SPLSTR*) _SPLSTR )->pStr, _StringLen ) ); } } inline int CompareStringAndStateStruct( const void* _String, const void* _StateStruct ) { int _StringLen = wcslen( (const WCHAR*) _String ); int _StateStructLen = ( (const StateStruct*) _StateStruct )->Abbreviation.Len; if ( _StringLen < _StateStructLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StringLen ); if ( Result != 0 ) { return Result; } else { return -1; } } else if ( _StringLen > _StateStructLen ) { int Result = wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StateStructLen ); if ( Result != 0 ) { return Result; } else { return 1; } } else { return ( wcsnicmp( (const WCHAR*) _String , ( (const StateStruct*) _StateStruct )->Abbreviation.pStr, _StringLen ) ); } } inline int CompareStringAndAbbrevRecord( const void* _String, const void* _AbbrevRecord ) { return ( _wcsicmp( (const WCHAR*) _String, ( (const AbbrevRecord*) _AbbrevRecord )->pOrth ) ); } inline int CompareWCHARAndWCHAR( const void *pWCHAR_1, const void *pWCHAR_2 ) { return ( *( (WCHAR*) pWCHAR_1) - *( (WCHAR*) pWCHAR_2) ); } inline BOOL IsSpace( WCHAR wc ) { return ( ( wc == 0x20 ) || ( wc == 0x9 ) || ( wc == 0xD ) || ( wc == 0xA ) || ( wc == 0x200B ) ); } inline BOOL IsCapital( WCHAR wc ) { return ( ( wc >= L'A' ) && ( wc <= L'Z' ) ); } inline TTSItemType IsGroupBeginning( WCHAR wc ) { if ( wc == L'(' ) { return eOPEN_PARENTHESIS; } else if ( wc == L'[' ) { return eOPEN_BRACKET; } else if ( wc == L'{' ) { return eOPEN_BRACE; } else { return eUNMATCHED; } } inline TTSItemType IsGroupEnding( WCHAR wc ) { if ( wc == L')' ) { return eCLOSE_PARENTHESIS; } else if ( wc == L']' ) { return eCLOSE_BRACKET; } else if ( wc == L'}' ) { return eCLOSE_BRACE; } else { return eUNMATCHED; } } inline TTSItemType IsQuotationMark( WCHAR wc ) { if ( wc == L'\'' ) { return eSINGLE_QUOTE; } else if ( wc == L'\"' ) { return eDOUBLE_QUOTE; } else { return eUNMATCHED; } } inline TTSItemType IsEOSItem( WCHAR wc ) { if ( wc == L'.' ) { return ePERIOD; } else if ( wc == L'!' ) { return eEXCLAMATION; } else if ( wc == L'?' ) { return eQUESTION; } else { return eUNMATCHED; } } inline TTSItemType IsMiscPunctuation( WCHAR wc ) { if ( wc == L',' ) { return eCOMMA; } else if ( wc == L';' ) { return eSEMICOLON; } else if ( wc == L':' ) { return eCOLON; } else if ( wc == L'-' ) { return eHYPHEN; } else { return eUNMATCHED; } } #endif //--- This must be the last line in the file