1082 lines
43 KiB
C++
1082 lines
43 KiB
C++
/*******************************************************************************
|
|
* StdSentEnum.cpp *
|
|
*-----------------*
|
|
* Description:
|
|
* This module is the main implementation file for the CStdSentEnum class.
|
|
*-------------------------------------------------------------------------------
|
|
* Created By: EDC Date: 03/19/99
|
|
* Copyright (C) 1999 Microsoft Corporation
|
|
* All Rights Reserved
|
|
*
|
|
*******************************************************************************/
|
|
|
|
//--- Additional includes
|
|
#include "stdafx.h"
|
|
#ifndef StdSentEnum_h
|
|
#include "stdsentenum.h"
|
|
#endif
|
|
#include "spttsengdebug.h"
|
|
#include "SpAutoObjectLock.h"
|
|
|
|
//--- Locals
|
|
//--- Definition of the static critical section that FinalConstruct locks while it
//--- checks g_fAbbrevTablesInitialized and converts the shared pronunciation tables.
CComAutoCriticalSection CStdSentEnum::m_AbbrevTableCritSec;
|
|
|
|
//=== CStdSentEnum ============================================================
|
|
//
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::InitPron *
|
|
*------------------------*
|
|
* Description:
|
|
* Inits pron tables
|
|
********************************************************************* AH ***/
|
|
HRESULT CStdSentEnum::InitPron( WCHAR** OriginalPron )
|
|
{
|
|
HRESULT hr = S_OK;
|
|
WCHAR *NewPron = NULL;
|
|
|
|
NewPron = new WCHAR[ wcslen( *OriginalPron ) ];
|
|
hr = m_cpPhonemeConverter->PhoneToId( *OriginalPron, NewPron );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
*OriginalPron = NewPron;
|
|
}
|
|
|
|
return hr;
|
|
} /* InitPron */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::FinalConstruct *
|
|
*------------------------------*
|
|
* Description:
|
|
* Constructor
|
|
********************************************************************* EDC ***/
|
|
HRESULT CStdSentEnum::FinalConstruct()
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::FinalConstruct" );
|
|
HRESULT hr = S_OK;
|
|
m_dwSpeakFlags = 0;
|
|
m_pTextFragList = NULL;
|
|
m_pMorphLexicon = NULL;
|
|
m_fHaveNamesLTS = false;
|
|
m_eSeparatorAndDecimal = COMMA_PERIOD;
|
|
m_eShortDateOrder = MONTH_DAY_YEAR;
|
|
/*** Create phone converter ***/
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
hr = SpCreatePhoneConverter( 1033, NULL, NULL, &m_cpPhonemeConverter );
|
|
m_AbbrevTableCritSec.Lock();
|
|
if ( !g_fAbbrevTablesInitialized )
|
|
{
|
|
for ( ULONG i = 0; SUCCEEDED( hr ) && i < sp_countof( g_AbbreviationTable ); i++ )
|
|
{
|
|
if ( g_AbbreviationTable[i].pPron1 )
|
|
{
|
|
hr = InitPron( &g_AbbreviationTable[i].pPron1 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_AbbreviationTable[i].pPron2 )
|
|
{
|
|
hr = InitPron( &g_AbbreviationTable[i].pPron2 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_AbbreviationTable[i].pPron3 )
|
|
{
|
|
hr = InitPron( &g_AbbreviationTable[i].pPron3 );
|
|
}
|
|
}
|
|
for ( i = 0; SUCCEEDED( hr ) && i < sp_countof( g_AmbiguousWordTable ); i++ )
|
|
{
|
|
if ( g_AmbiguousWordTable[i].pPron1 )
|
|
{
|
|
hr = InitPron( &g_AmbiguousWordTable[i].pPron1 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_AmbiguousWordTable[i].pPron2 )
|
|
{
|
|
hr = InitPron( &g_AmbiguousWordTable[i].pPron2 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_AmbiguousWordTable[i].pPron3 )
|
|
{
|
|
hr = InitPron( &g_AmbiguousWordTable[i].pPron3 );
|
|
}
|
|
}
|
|
for ( i = 0; SUCCEEDED( hr ) && i < sp_countof( g_PostLexLookupWordTable ); i++ )
|
|
{
|
|
if ( g_PostLexLookupWordTable[i].pPron1 )
|
|
{
|
|
hr = InitPron( &g_PostLexLookupWordTable[i].pPron1 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_PostLexLookupWordTable[i].pPron2 )
|
|
{
|
|
hr = InitPron( &g_PostLexLookupWordTable[i].pPron2 );
|
|
}
|
|
if ( SUCCEEDED( hr ) &&
|
|
g_PostLexLookupWordTable[i].pPron3 )
|
|
{
|
|
hr = InitPron( &g_PostLexLookupWordTable[i].pPron3 );
|
|
}
|
|
}
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
hr = InitPron( &g_pOfA );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
hr = InitPron( &g_pOfAn );
|
|
}
|
|
}
|
|
}
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
g_fAbbrevTablesInitialized = true;
|
|
}
|
|
m_AbbrevTableCritSec.Unlock();
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::FinalConstruct */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::FinalRelease *
|
|
*----------------------------*
|
|
* Description:
|
|
* Destructor
|
|
********************************************************************* EDC ***/
|
|
void CStdSentEnum::FinalRelease()
|
|
{
|
|
SPDBG_FUNC( "CStdSentEnum::FinalRelease" );
|
|
|
|
if ( m_pMorphLexicon )
|
|
{
|
|
delete m_pMorphLexicon;
|
|
}
|
|
|
|
} /* CStdSentEnum::FinalRelease */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::SetFragList *
|
|
*---------------------------*
|
|
* The text fragment list passed in is guaranteed to be valid for the lifetime
|
|
* of this object. Each time this method is called, the sentence enumerator
|
|
* should reset its state.
|
|
********************************************************************* EDC ***/
|
|
STDMETHODIMP CStdSentEnum::
|
|
SetFragList( const SPVTEXTFRAG* pTextFragList, DWORD dwSpeakFlags )
|
|
{
|
|
SPAUTO_OBJ_LOCK;
|
|
SPDBG_FUNC( "CStdSentEnum::SetFragList" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Check args
|
|
if( SP_IS_BAD_READ_PTR( pTextFragList ) ||
|
|
( dwSpeakFlags & SPF_UNUSED_FLAGS ) )
|
|
{
|
|
hr = E_INVALIDARG;
|
|
}
|
|
else
|
|
{
|
|
m_dwSpeakFlags = dwSpeakFlags;
|
|
m_pTextFragList = pTextFragList;
|
|
|
|
//--- grab normalization preferences from the registry
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
CComPtr<ISpObjectToken> cpToken;
|
|
CSpDynamicString dstrTokenKeyName;
|
|
hr = StringFromCLSID( CLSID_MSE_TTSEngine, &dstrTokenKeyName );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
hr = SpCreateNewToken( L"HKEY_CURRENT_USER\\Software\\Microsoft\\Speech\\Voices", dstrTokenKeyName,
|
|
&cpToken );
|
|
}
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
DWORD dwTemp;
|
|
if ( SUCCEEDED( cpToken->GetDWORD( L"SeparatorAndDecimal", &dwTemp ) ) )
|
|
{
|
|
m_eSeparatorAndDecimal = (SEPARATOR_AND_DECIMAL) dwTemp;
|
|
}
|
|
if ( SUCCEEDED( cpToken->GetDWORD( L"ShortDateOrder", &dwTemp ) ) )
|
|
{
|
|
m_eShortDateOrder = (SHORT_DATE_ORDER) dwTemp;
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Reset state
|
|
Reset();
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::SetFragList */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::Next *
|
|
*--------------------*
|
|
*
|
|
********************************************************************* EDC ***/
|
|
STDMETHODIMP CStdSentEnum::Next( IEnumSENTITEM **ppSentItemEnum )
|
|
{
|
|
SPAUTO_OBJ_LOCK;
|
|
SPDBG_FUNC( "CStdSentEnum::Next" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Check args
|
|
if( SPIsBadWritePtr( ppSentItemEnum, sizeof( IEnumSENTITEM* ) ) )
|
|
{
|
|
hr = E_INVALIDARG;
|
|
}
|
|
else
|
|
{
|
|
//--- If this is NULL then the enum needs to be reset
|
|
if( m_pCurrFrag )
|
|
{
|
|
SentencePointer NewSentencePointer;
|
|
NewSentencePointer.pSentenceFrag = m_pCurrFrag;
|
|
NewSentencePointer.pSentenceStart = m_pNextChar;
|
|
|
|
hr = GetNextSentence( ppSentItemEnum );
|
|
if( hr == S_OK )
|
|
{
|
|
//--- Update Sentence Pointer List
|
|
hr = m_SentenceStack.Push( NewSentencePointer );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
hr = S_FALSE;
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::Next */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::Previous *
|
|
*--------------------*
|
|
*
|
|
********************************************************************* AH ****/
|
|
STDMETHODIMP CStdSentEnum::Previous( IEnumSENTITEM **ppSentItemEnum )
|
|
{
|
|
SPAUTO_OBJ_LOCK;
|
|
SPDBG_FUNC( "CStdSentEnum::Previous" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Check args
|
|
if( SPIsBadWritePtr( ppSentItemEnum, sizeof( IEnumSENTITEM* ) ) )
|
|
{
|
|
hr = E_INVALIDARG;
|
|
}
|
|
else
|
|
{
|
|
//--- Don't care if m_pCurrFrag is NULL, as long as we have enough on the SentenceStack
|
|
//--- to skip backwards...
|
|
if( m_SentenceStack.GetCount() >= 2 )
|
|
{
|
|
//--- Get the previous Sentence from the Sentence List, and then remove the Current Sentence
|
|
SentencePointer &PreviousSentence = m_SentenceStack.Pop();
|
|
PreviousSentence = m_SentenceStack.Pop();
|
|
|
|
//--- Reset the current frag and the current text pointer position
|
|
m_pCurrFrag = PreviousSentence.pSentenceFrag;
|
|
m_pNextChar = PreviousSentence.pSentenceStart;
|
|
m_pEndChar = m_pCurrFrag->pTextStart + m_pCurrFrag->ulTextLen;
|
|
|
|
hr = GetNextSentence( ppSentItemEnum );
|
|
if( hr == S_OK )
|
|
{
|
|
//--- Update Sentence Pointer List
|
|
hr = m_SentenceStack.Push( PreviousSentence );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
hr = S_FALSE;
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::Previous */
|
|
|
|
/*****************************************************************************
|
|
* SkipWhiteSpaceAndTags *
|
|
*-----------------------*
|
|
* Skips m_pNextChar ahead to the next non-whitespace character (skipping
|
|
* ahead in the frag list, if necessary) or sets it to NULL if it hits the
|
|
* end of the frag list text...
|
|
********************************************************************* AH ****/
|
|
HRESULT CStdSentEnum::SkipWhiteSpaceAndTags( const WCHAR*& pStartChar, const WCHAR*& pEndChar,
|
|
const SPVTEXTFRAG*& pCurrFrag, CSentItemMemory& MemoryManager,
|
|
BOOL fAddToItemList, CItemList* pItemList )
|
|
{
|
|
SPDBG_ASSERT( pStartChar <= pEndChar );
|
|
HRESULT hr = S_OK;
|
|
|
|
while ( pStartChar &&
|
|
( IsSpace( *pStartChar ) ||
|
|
pStartChar == pEndChar ) )
|
|
{
|
|
//--- Skip whitespace
|
|
while ( pStartChar < pEndChar &&
|
|
IsSpace( *pStartChar ) )
|
|
{
|
|
++pStartChar;
|
|
}
|
|
//--- Skip to next spoken frag, if necessary
|
|
if ( pStartChar == pEndChar )
|
|
{
|
|
pCurrFrag = pCurrFrag->pNext;
|
|
while ( pCurrFrag &&
|
|
pCurrFrag->State.eAction != SPVA_Speak &&
|
|
pCurrFrag->State.eAction != SPVA_SpellOut )
|
|
{
|
|
pStartChar = (WCHAR*) pCurrFrag->pTextStart;
|
|
pEndChar = (WCHAR*) pStartChar + pCurrFrag->ulTextLen;
|
|
//--- Add non-spoken fragments, if fAddToItemList is true.
|
|
if ( fAddToItemList )
|
|
{
|
|
//-- Check for names lexicon XML tag...
|
|
if( !m_fNameItem &&
|
|
m_pCurrFrag->ulTextLen == 6 &&
|
|
!_wcsnicmp( L"<NAME>", m_pCurrFrag->pTextStart, m_pCurrFrag->ulTextLen ) )
|
|
{
|
|
m_fNameItem = true;
|
|
}
|
|
else if( m_fNameItem &&
|
|
m_pCurrFrag->ulTextLen == 7 &&
|
|
!_wcsnicmp( L"</NAME>", m_pCurrFrag->pTextStart, m_pCurrFrag->ulTextLen ) )
|
|
{
|
|
m_fNameItem = false;
|
|
}
|
|
|
|
CSentItem Item;
|
|
Item.pItemSrcText = pCurrFrag->pTextStart;
|
|
Item.ulItemSrcLen = pCurrFrag->ulTextLen;
|
|
Item.ulItemSrcOffset = pCurrFrag->ulTextSrcOffset;
|
|
Item.ulNumWords = 1;
|
|
Item.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( Item.Words, sizeof(TTSWord) );
|
|
Item.Words[0].pXmlState = &pCurrFrag->State;
|
|
Item.Words[0].eWordPartOfSpeech = MS_Unknown;
|
|
Item.eItemPartOfSpeech = MS_Unknown;
|
|
Item.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
Item.pItemInfo->Type = eWORDLIST_IS_VALID;
|
|
pItemList->AddTail( Item );
|
|
}
|
|
}
|
|
}
|
|
pCurrFrag = pCurrFrag->pNext;
|
|
}
|
|
if ( !pCurrFrag )
|
|
{
|
|
pStartChar = NULL;
|
|
pEndChar = NULL;
|
|
}
|
|
else
|
|
{
|
|
pStartChar = (WCHAR*) pCurrFrag->pTextStart;
|
|
pEndChar = (WCHAR*) pStartChar + pCurrFrag->ulTextLen;
|
|
}
|
|
}
|
|
}
|
|
return hr;
|
|
} /* SkipWhiteSpaceAndTags */
|
|
|
|
/*****************************************************************************
|
|
* FindTokenEnd *
|
|
*--------------*
|
|
* Returns the position of the first whitespace character after pStartChar,
|
|
* or pEndChar, or the character after SP_MAX_WORD_LENGTH, whichever comes first.
|
|
********************************************************************* AH ****/
|
|
const WCHAR* CStdSentEnum::FindTokenEnd( const WCHAR* pStartChar, const WCHAR* pEndChar )
|
|
{
|
|
SPDBG_ASSERT( pStartChar < pEndChar );
|
|
ULONG ulNumChars = 1;
|
|
const WCHAR *pPos = pStartChar;
|
|
|
|
while ( pPos &&
|
|
pPos < pEndChar &&
|
|
!IsSpace( *pPos ) &&
|
|
ulNumChars < SP_MAX_WORD_LENGTH )
|
|
{
|
|
pPos++;
|
|
ulNumChars++;
|
|
}
|
|
|
|
return pPos;
|
|
} /* FindTokenEnd */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::AddNextSentItem *
|
|
*-------------------------------*
|
|
* Locates the next sentence item in the stream and adds it to the list.
|
|
* Returns true if the last item added is the end of the sentence.
|
|
********************************************************************* AH ****/
|
|
HRESULT CStdSentEnum::AddNextSentItem( CItemList& ItemList, CSentItemMemory& MemoryManager, BOOL* pfIsEOS )
|
|
{
|
|
SPDBG_ASSERT( m_pNextChar && pfIsEOS );
|
|
HRESULT hr = S_OK;
|
|
BOOL fHitPauseItem = false;
|
|
CSentItem Item;
|
|
ULONG ulTrailItems = 0;
|
|
TTSItemType ItemType = eUNMATCHED;
|
|
*pfIsEOS = false;
|
|
|
|
//--- Skip initial whitespace characters and XML markup (by skipping ahead in the frag list).
|
|
hr = SkipWhiteSpaceAndTags( m_pNextChar, m_pEndChar, m_pCurrFrag, MemoryManager, true, &ItemList );
|
|
|
|
//--- This will happen when we hit the end of the frag list
|
|
if ( !m_pNextChar )
|
|
{
|
|
return S_OK;
|
|
}
|
|
|
|
//--- Find end of the next token (next whitespace character, hyphen, or m_pEndChar).
|
|
m_pEndOfCurrToken = FindTokenEnd( m_pNextChar, m_pEndChar );
|
|
|
|
//--- Get Primary Insert Position
|
|
SPLISTPOS ItemPos = ItemList.AddTail( Item );
|
|
|
|
//--- Try looking up this token in the User Lexicon...
|
|
WCHAR Temp = *( (WCHAR*) m_pEndOfCurrToken );
|
|
*( (WCHAR*) m_pEndOfCurrToken ) = 0;
|
|
SPWORDPRONUNCIATIONLIST SPList;
|
|
ZeroMemory( &SPList, sizeof( SPWORDPRONUNCIATIONLIST ) );
|
|
|
|
hr = m_cpAggregateLexicon->GetPronunciations( m_pNextChar, 1033, eLEXTYPE_USER, &SPList );
|
|
if( SPList.pvBuffer )
|
|
{
|
|
::CoTaskMemFree( SPList.pvBuffer );
|
|
}
|
|
|
|
*( (WCHAR*) m_pEndOfCurrToken ) = Temp;
|
|
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
Item.eItemPartOfSpeech = MS_Unknown;
|
|
Item.pItemSrcText = m_pNextChar;
|
|
Item.ulItemSrcLen = (ULONG) ( m_pEndOfCurrToken - m_pNextChar );
|
|
Item.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset +
|
|
(ULONG)( m_pNextChar - m_pCurrFrag->pTextStart );
|
|
Item.ulNumWords = 1;
|
|
Item.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( Item.Words, sizeof(TTSWord) );
|
|
Item.Words[0].pXmlState = &m_pCurrFrag->State;
|
|
Item.Words[0].pWordText = m_pNextChar;
|
|
Item.Words[0].ulWordLen = Item.ulItemSrcLen;
|
|
Item.Words[0].pLemma = Item.Words[0].pWordText;
|
|
Item.Words[0].ulLemmaLen = Item.Words[0].ulWordLen;
|
|
Item.Words[0].eWordPartOfSpeech = MS_Unknown;
|
|
Item.eItemPartOfSpeech = MS_Unknown;
|
|
Item.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo*), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
Item.pItemInfo->Type = eALPHA_WORD;
|
|
ItemList.SetAt( ItemPos, Item );
|
|
}
|
|
}
|
|
m_pNextChar = m_pEndOfCurrToken;
|
|
}
|
|
//--- Not in the user lex - itemize, normalize, etc.
|
|
else if ( hr == SPERR_NOT_IN_LEX )
|
|
{
|
|
hr = S_OK;
|
|
|
|
//--- convert text from Unicode to Ascii
|
|
hr = DoUnicodeToAsciiMap( m_pNextChar, (ULONG)( m_pEndOfCurrToken - m_pNextChar ), (WCHAR*)m_pNextChar );
|
|
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
//--- Find end of the next token (next whitespace character, hyphen, or m_pEndChar)
|
|
//--- AGAIN, since the mapping may have introduced new whitespace characters...
|
|
m_pEndOfCurrToken = FindTokenEnd( m_pNextChar, m_pEndChar );
|
|
|
|
//--- Insert lead items (group beginnings, quotation marks)
|
|
while ( m_pNextChar < m_pEndOfCurrToken &&
|
|
( ( ItemType = IsGroupBeginning( *m_pNextChar ) ) != eUNMATCHED ||
|
|
( ItemType = IsQuotationMark( *m_pNextChar ) ) != eUNMATCHED ) )
|
|
{
|
|
CSentItem LeadItem;
|
|
LeadItem.pItemSrcText = m_pNextChar;
|
|
LeadItem.ulItemSrcLen = 1;
|
|
LeadItem.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset +
|
|
(ULONG)(( m_pNextChar - m_pCurrFrag->pTextStart ));
|
|
LeadItem.ulNumWords = 1;
|
|
LeadItem.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( LeadItem.Words, sizeof(TTSWord) );
|
|
LeadItem.Words[0].pXmlState = &m_pCurrFrag->State;
|
|
LeadItem.Words[0].eWordPartOfSpeech = ConvertItemTypeToPartOfSp( ItemType );
|
|
LeadItem.eItemPartOfSpeech = ConvertItemTypeToPartOfSp( ItemType );
|
|
LeadItem.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
LeadItem.pItemInfo->Type = ItemType;
|
|
if ( m_dwSpeakFlags & SPF_NLP_SPEAK_PUNC ||
|
|
m_pCurrFrag->State.eAction == SPVA_SpellOut )
|
|
{
|
|
CWordList TempWordList;
|
|
ExpandPunctuation( TempWordList, *m_pNextChar );
|
|
hr = SetWordList( LeadItem, TempWordList, MemoryManager );
|
|
LeadItem.pItemInfo->Type = eUNMATCHED;
|
|
}
|
|
ItemList.InsertBefore( ItemPos, LeadItem );
|
|
m_pNextChar++;
|
|
}
|
|
}
|
|
ItemType = eUNMATCHED;
|
|
}
|
|
|
|
//--- Insert trail items (group endings, quotation marks, misc. punctuation, EOS Items)
|
|
m_pEndOfCurrItem = m_pEndOfCurrToken;
|
|
BOOL fAddTrailItem = true;
|
|
BOOL fAbbreviation = false;
|
|
while ( (m_pEndOfCurrItem - 1) >= m_pNextChar &&
|
|
fAddTrailItem )
|
|
{
|
|
fAddTrailItem = false;
|
|
fAbbreviation = false;
|
|
|
|
//--- Check group endings, quotation marks, misc. punctuation.
|
|
if ( ( ItemType = IsGroupEnding( *(m_pEndOfCurrItem - 1) ) ) != eUNMATCHED ||
|
|
( ItemType = IsQuotationMark( *(m_pEndOfCurrItem - 1) ) ) != eUNMATCHED ||
|
|
( ItemType = IsMiscPunctuation( *(m_pEndOfCurrItem - 1) ) ) != eUNMATCHED )
|
|
{
|
|
fAddTrailItem = true;
|
|
if ( ItemType == eCOMMA ||
|
|
ItemType == eCOLON ||
|
|
ItemType == eSEMICOLON )
|
|
{
|
|
fHitPauseItem = true;
|
|
}
|
|
}
|
|
//--- Check EOS Items, except periods preceded by alpha characters
|
|
else if ( ( ItemType = IsEOSItem( *(m_pEndOfCurrItem - 1) ) ) != eUNMATCHED &&
|
|
! ( ItemType == ePERIOD &&
|
|
( m_pEndOfCurrItem - 2 >= m_pNextChar ) &&
|
|
( iswalpha( *(m_pEndOfCurrItem - 2) ) ) ) )
|
|
{
|
|
//--- Check for ellipses
|
|
if ( ItemType == ePERIOD )
|
|
{
|
|
if ( m_pEndOfCurrItem == m_pEndOfCurrToken &&
|
|
( m_pEndOfCurrItem - 2 >= m_pNextChar ) &&
|
|
( ( ItemType = IsEOSItem( *(m_pEndOfCurrItem - 2) ) ) == ePERIOD ) &&
|
|
( m_pEndOfCurrItem - 3 == m_pNextChar ) &&
|
|
( ( ItemType = IsEOSItem( *(m_pEndOfCurrItem - 3) ) ) == ePERIOD ) )
|
|
{
|
|
fAddTrailItem = true;
|
|
ItemType = eELLIPSIS;
|
|
}
|
|
else
|
|
{
|
|
ItemType = ePERIOD;
|
|
fAddTrailItem = true;
|
|
*pfIsEOS = true;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
fAddTrailItem = true;
|
|
*pfIsEOS = true;
|
|
}
|
|
}
|
|
//--- Period preceded by alpha character - determine whether it is EOS.
|
|
else if ( ItemType == ePERIOD )
|
|
{
|
|
//--- Is it an Initialism ( e.g. "e.g." )? If so, only EOS if the next
|
|
//--- word is in the common first words list...
|
|
hr = IsInitialism( ItemList, ItemPos, MemoryManager, pfIsEOS );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
if ( *pfIsEOS )
|
|
{
|
|
//--- Did we see a pause item earlier? In that case, we should NOT listen to this
|
|
//--- IsEOS decision from IsInitialism...
|
|
if ( fHitPauseItem )
|
|
{
|
|
*pfIsEOS = false;
|
|
}
|
|
else
|
|
{
|
|
fAddTrailItem = true;
|
|
fAbbreviation = true;
|
|
}
|
|
}
|
|
}
|
|
else if ( hr == E_INVALIDARG )
|
|
{
|
|
const WCHAR temp = (WCHAR) *( m_pEndOfCurrItem - 1 );
|
|
*( (WCHAR*) ( m_pEndOfCurrItem - 1 ) ) = 0;
|
|
|
|
const AbbrevRecord* pAbbrevRecord =
|
|
(AbbrevRecord*) bsearch( (void*) m_pNextChar, (void*) g_AbbreviationTable,
|
|
sp_countof( g_AbbreviationTable ), sizeof( AbbrevRecord ),
|
|
CompareStringAndAbbrevRecord );
|
|
|
|
*( (WCHAR*) ( m_pEndOfCurrItem - 1 ) ) = temp;
|
|
|
|
if ( pAbbrevRecord )
|
|
{
|
|
//--- Matched an abbreviation
|
|
if ( pAbbrevRecord->iSentBreakDisambig < 0 )
|
|
{
|
|
//--- Abbreviation will never end a sentence - just insert into ItemList
|
|
*pfIsEOS = false;
|
|
hr = S_OK;
|
|
|
|
Item.pItemSrcText = m_pNextChar;
|
|
Item.ulItemSrcLen = (ULONG)(m_pEndOfCurrItem - m_pNextChar);
|
|
Item.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset +
|
|
(ULONG)( m_pNextChar - m_pCurrFrag->pTextStart );
|
|
Item.ulNumWords = 1;
|
|
Item.Words = (TTSWord*) MemoryManager.GetMemory( sizeof( TTSWord ), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( Item.Words, sizeof( TTSWord ) );
|
|
Item.Words[0].pXmlState = &m_pCurrFrag->State;
|
|
Item.Words[0].pWordText = Item.pItemSrcText;
|
|
Item.Words[0].ulWordLen = Item.ulItemSrcLen;
|
|
Item.Words[0].pLemma = Item.pItemSrcText;
|
|
Item.Words[0].ulLemmaLen = Item.ulItemSrcLen;
|
|
Item.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSAbbreviationInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
if ( NeedsToBeNormalized( pAbbrevRecord ) )
|
|
{
|
|
Item.pItemInfo->Type = eABBREVIATION_NORMALIZE;
|
|
}
|
|
else
|
|
{
|
|
Item.pItemInfo->Type = eABBREVIATION;
|
|
}
|
|
( (TTSAbbreviationInfo*) Item.pItemInfo )->pAbbreviation = pAbbrevRecord;
|
|
ItemList.SetAt( ItemPos, Item );
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//--- Need to do some disambiguation to determine whether,
|
|
//--- a) this is indeed an abbreviation (e.g. "Ed.")
|
|
//--- b) the period doubles as EOS
|
|
hr = ( this->*g_SentBreakDisambigTable[pAbbrevRecord->iSentBreakDisambig] )
|
|
( pAbbrevRecord, ItemList, ItemPos, MemoryManager, pfIsEOS );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
if ( *pfIsEOS )
|
|
{
|
|
if ( fHitPauseItem )
|
|
{
|
|
*pfIsEOS = false;
|
|
}
|
|
else
|
|
{
|
|
fAddTrailItem = true;
|
|
fAbbreviation = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( hr == E_INVALIDARG )
|
|
{
|
|
//--- Just check for periods internal to the item - this catches stuff like
|
|
//--- 10:30p.m.
|
|
for ( const WCHAR* pIterator = m_pNextChar; pIterator < m_pEndOfCurrItem - 1; pIterator++ )
|
|
{
|
|
if ( *pIterator == L'.' )
|
|
{
|
|
*pfIsEOS = false;
|
|
break;
|
|
}
|
|
}
|
|
//--- If all previous checks have failed, it is EOS.
|
|
if ( pIterator == ( m_pEndOfCurrItem - 1 ) &&
|
|
!fHitPauseItem )
|
|
{
|
|
hr = S_OK;
|
|
fAddTrailItem = true;
|
|
*pfIsEOS = true;
|
|
}
|
|
else if ( hr == E_INVALIDARG )
|
|
{
|
|
hr = S_OK;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Add trail item.
|
|
if ( fAddTrailItem )
|
|
{
|
|
ulTrailItems++;
|
|
CSentItem TrailItem;
|
|
if ( ItemType == eELLIPSIS )
|
|
{
|
|
TrailItem.pItemSrcText = m_pEndOfCurrItem - 3;
|
|
TrailItem.ulItemSrcLen = 3;
|
|
TrailItem.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset +
|
|
(ULONG)( m_pEndOfCurrItem - m_pCurrFrag->pTextStart - 3 );
|
|
}
|
|
else
|
|
{
|
|
TrailItem.pItemSrcText = m_pEndOfCurrItem - 1;
|
|
TrailItem.ulItemSrcLen = 1;
|
|
TrailItem.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset +
|
|
(ULONG)( m_pEndOfCurrItem - m_pCurrFrag->pTextStart - 1 );
|
|
}
|
|
TrailItem.ulNumWords = 1;
|
|
TrailItem.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( TrailItem.Words, sizeof(TTSWord) );
|
|
TrailItem.Words[0].pXmlState = &m_pCurrFrag->State;
|
|
TrailItem.Words[0].eWordPartOfSpeech = ConvertItemTypeToPartOfSp( ItemType );
|
|
TrailItem.eItemPartOfSpeech = ConvertItemTypeToPartOfSp( ItemType );
|
|
TrailItem.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
TrailItem.pItemInfo->Type = ItemType;
|
|
if ( m_dwSpeakFlags & SPF_NLP_SPEAK_PUNC ||
|
|
( m_pCurrFrag->State.eAction == SPVA_SpellOut &&
|
|
!fAbbreviation ) )
|
|
{
|
|
CWordList TempWordList;
|
|
ExpandPunctuation( TempWordList, *(m_pEndOfCurrItem - 1) );
|
|
hr = SetWordList( TrailItem, TempWordList, MemoryManager );
|
|
TrailItem.pItemInfo->Type = eUNMATCHED;
|
|
}
|
|
ItemList.InsertAfter( ItemPos, TrailItem );
|
|
if ( !fAbbreviation )
|
|
{
|
|
if ( ItemType == eELLIPSIS )
|
|
{
|
|
m_pEndOfCurrItem -= 3;
|
|
ulTrailItems = 3;
|
|
}
|
|
else
|
|
{
|
|
m_pEndOfCurrItem--;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
ItemType = eUNMATCHED;
|
|
if ( fAbbreviation )
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Do Main Item Insertion
|
|
if ( SUCCEEDED( hr ) &&
|
|
m_pNextChar == m_pEndOfCurrItem )
|
|
{
|
|
ItemList.RemoveAt( ItemPos );
|
|
}
|
|
else if ( SUCCEEDED( hr ) )
|
|
{
|
|
hr = Normalize( ItemList, ItemPos, MemoryManager );
|
|
}
|
|
|
|
if( m_fNameItem )
|
|
{
|
|
wcscpy( ItemList.GetAt( ItemPos ).CustomLtsToken, L"Names" );
|
|
}
|
|
|
|
//--- Advance m_pNextChar to m_pEndOfCurrItem + once for each trail item matched.
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
if ( !fAbbreviation &&
|
|
m_pEndOfCurrItem + ulTrailItems != m_pEndOfCurrToken )
|
|
{
|
|
//--- Multi-token item matched in Normalize()... Remove all previously matched trail items,
|
|
//--- as they were matched as part of the larger item...
|
|
m_pNextChar = m_pEndOfCurrItem;
|
|
Item = ItemList.GetNext( ItemPos );
|
|
while ( ItemPos )
|
|
{
|
|
SPLISTPOS RemovePos = ItemPos;
|
|
Item = ItemList.GetNext( ItemPos );
|
|
ItemList.RemoveAt( RemovePos );
|
|
}
|
|
}
|
|
else
|
|
{
|
|
m_pNextChar = m_pEndOfCurrToken;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return hr;
|
|
} /* CStdSentEnum::AddNextSentItem */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::GetNextSentence *
|
|
*-------------------------------*
|
|
* This method is used to create a sentence item enumerator and populate it
|
|
* with items. If the SPF_NLP_PASSTHROUGH flag is set, each item is the block
|
|
* of text between XML states. If the SPF_NLP_PASSTHROUGH flag is not set, each
|
|
* item is an individual word that is looked up in the current lexicon(s).
|
|
********************************************************************* EDC ***/
|
|
HRESULT CStdSentEnum::GetNextSentence( IEnumSENTITEM** ppItemEnum )
|
|
{
|
|
HRESULT hr = S_OK;
|
|
ULONG ulNumItems = 0;
|
|
const SPVTEXTFRAG* pPrevFrag = m_pCurrFrag;
|
|
|
|
//--- Is there any work to do
|
|
if( m_pCurrFrag == NULL ) return S_FALSE;
|
|
|
|
//--- Create sentence enum
|
|
CComObject<CSentItemEnum> *pItemEnum;
|
|
hr = CComObject<CSentItemEnum>::CreateInstance( &pItemEnum );
|
|
|
|
if( SUCCEEDED( hr ) )
|
|
{
|
|
pItemEnum->AddRef();
|
|
pItemEnum->_SetOwner( GetControllingUnknown() );
|
|
*ppItemEnum = pItemEnum;
|
|
}
|
|
|
|
if( SUCCEEDED( hr ) )
|
|
{
|
|
BOOL fSentDone = false;
|
|
BOOL fGoToNextFrag = false;
|
|
CItemList& ItemList = pItemEnum->_GetList();
|
|
CSentItemMemory& MemoryManager = pItemEnum->_GetMemoryManager();
|
|
|
|
while( SUCCEEDED(hr) && m_pCurrFrag && !fSentDone && ulNumItems < 50 )
|
|
{
|
|
ulNumItems++;
|
|
if( m_pCurrFrag->State.eAction == SPVA_Speak ||
|
|
m_pCurrFrag->State.eAction == SPVA_SpellOut )
|
|
{
|
|
hr = AddNextSentItem( ItemList, MemoryManager, &fSentDone );
|
|
|
|
//--- Advance fragment?
|
|
if( SUCCEEDED( hr ) &&
|
|
m_pNextChar &&
|
|
m_pEndChar &&
|
|
m_pNextChar >= m_pEndChar )
|
|
{
|
|
fGoToNextFrag = true;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
//-- Check for lexicon
|
|
if( !m_fNameItem &&
|
|
m_pCurrFrag->ulTextLen == 6 &&
|
|
!_wcsnicmp( L"<NAME>", m_pCurrFrag->pTextStart, m_pCurrFrag->ulTextLen ) )
|
|
{
|
|
m_fNameItem = true;
|
|
}
|
|
else if( m_fNameItem &&
|
|
m_pCurrFrag->ulTextLen == 7 &&
|
|
!_wcsnicmp( L"</NAME>", m_pCurrFrag->pTextStart, m_pCurrFrag->ulTextLen ) )
|
|
{
|
|
m_fNameItem = false;
|
|
}
|
|
|
|
//--- Add non spoken fragments
|
|
CSentItem Item;
|
|
Item.pItemSrcText = m_pCurrFrag->pTextStart;
|
|
Item.ulItemSrcLen = m_pCurrFrag->ulTextLen;
|
|
Item.ulItemSrcOffset = m_pCurrFrag->ulTextSrcOffset;
|
|
Item.ulNumWords = 1;
|
|
Item.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( Item.Words, sizeof(TTSWord) );
|
|
Item.Words[0].pXmlState = &m_pCurrFrag->State;
|
|
Item.Words[0].eWordPartOfSpeech = MS_Unknown;
|
|
Item.eItemPartOfSpeech = MS_Unknown;
|
|
Item.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
Item.pItemInfo->Type = eWORDLIST_IS_VALID;
|
|
ItemList.AddTail( Item );
|
|
}
|
|
}
|
|
fGoToNextFrag = true;
|
|
}
|
|
|
|
if( SUCCEEDED( hr ) &&
|
|
fGoToNextFrag )
|
|
{
|
|
fGoToNextFrag = false;
|
|
pPrevFrag = m_pCurrFrag;
|
|
m_pCurrFrag = m_pCurrFrag->pNext;
|
|
if( m_pCurrFrag )
|
|
{
|
|
m_pNextChar = m_pCurrFrag->pTextStart;
|
|
m_pEndChar = m_pNextChar + m_pCurrFrag->ulTextLen;
|
|
}
|
|
else
|
|
{
|
|
m_pNextChar = NULL;
|
|
m_pEndChar = NULL;
|
|
}
|
|
}
|
|
} // end while
|
|
|
|
//--- If no period has been added, add one now - this will happen if the text
|
|
//--- is ONLY XML markup...
|
|
if ( SUCCEEDED(hr) && !fSentDone )
|
|
{
|
|
CSentItem EOSItem;
|
|
EOSItem.pItemSrcText = g_period.pStr;
|
|
EOSItem.ulItemSrcLen = g_period.Len;
|
|
EOSItem.ulItemSrcOffset = pPrevFrag->ulTextSrcOffset + pPrevFrag->ulTextLen;
|
|
EOSItem.ulNumWords = 1;
|
|
EOSItem.Words = (TTSWord*) MemoryManager.GetMemory( sizeof(TTSWord), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
ZeroMemory( EOSItem.Words, sizeof(TTSWord) );
|
|
EOSItem.Words[0].pXmlState = &g_DefaultXMLState;
|
|
EOSItem.Words[0].eWordPartOfSpeech = MS_EOSItem;
|
|
EOSItem.eItemPartOfSpeech = MS_EOSItem;
|
|
EOSItem.pItemInfo = (TTSItemInfo*) MemoryManager.GetMemory( sizeof(TTSItemInfo), &hr );
|
|
if ( SUCCEEDED( hr ) )
|
|
{
|
|
EOSItem.pItemInfo->Type = ePERIOD;
|
|
ItemList.AddTail( EOSItem );
|
|
}
|
|
}
|
|
}
|
|
|
|
//--- Output debugging information, if sentence breaks are desired
|
|
TTSDBG_LOGITEMLIST( pItemEnum->_GetList(), STREAM_SENTENCEBREAKS );
|
|
|
|
if( SUCCEEDED( hr ) )
|
|
{
|
|
hr = DetermineProns( pItemEnum->_GetList(), pItemEnum->_GetMemoryManager() );
|
|
}
|
|
|
|
pItemEnum->Reset();
|
|
|
|
//--- Output debugging information, if POS or Pronunciations are desired
|
|
TTSDBG_LOGITEMLIST( pItemEnum->_GetList(), STREAM_LEXLOOKUP );
|
|
|
|
}
|
|
return hr;
|
|
} /* CStdSentEnum::GetNextSentence */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::Reset *
|
|
*---------------------*
|
|
*
|
|
********************************************************************* EDC ***/
|
|
STDMETHODIMP CStdSentEnum::Reset( void )
|
|
{
|
|
SPAUTO_OBJ_LOCK;
|
|
SPDBG_FUNC( "CStdSentEnum::Reset" );
|
|
HRESULT hr = S_OK;
|
|
m_pCurrFrag = m_pTextFragList;
|
|
m_pNextChar = m_pCurrFrag->pTextStart;
|
|
m_pEndChar = m_pNextChar + m_pCurrFrag->ulTextLen;
|
|
m_SentenceStack.Reset();
|
|
m_fNameItem = false;
|
|
return hr;
|
|
} /* CStdSentEnum::Reset */
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::InitAggregateLexicon *
|
|
*------------------------------------*
|
|
*
|
|
********************************************************************* AH ****/
|
|
HRESULT CStdSentEnum::InitAggregateLexicon( void )
|
|
{
|
|
return m_cpAggregateLexicon.CoCreateInstance(CLSID_SpLexicon);
|
|
}
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::AddLexiconToAggregate *
|
|
*-------------------------------------*
|
|
*
|
|
********************************************************************* AH ****/
|
|
HRESULT CStdSentEnum::AddLexiconToAggregate( ISpLexicon *pAddLexicon, DWORD dwFlags )
|
|
{
|
|
return m_cpAggregateLexicon->AddLexicon( pAddLexicon, dwFlags );
|
|
}
|
|
|
|
/*****************************************************************************
|
|
* CStdSentEnum::InitMorphLexicon *
|
|
*--------------------------------*
|
|
*
|
|
********************************************************************* AH ****/
|
|
HRESULT CStdSentEnum::InitMorphLexicon( void )
|
|
{
|
|
HRESULT hr = S_OK;
|
|
|
|
m_pMorphLexicon = new CSMorph( m_cpAggregateLexicon, &hr );
|
|
|
|
return hr;
|
|
}
|
|
|
|
void CStdSentEnum::fNamesLTS( bool fHaveNamesLTS )
|
|
{
|
|
m_fHaveNamesLTS = fHaveNamesLTS;
|
|
}
|
|
|
|
//
|
|
//=== CSentItemEnum =========================================================
|
|
//
|
|
|
|
/*****************************************************************************
|
|
* CSentItemEnum::Next *
|
|
*---------------------*
|
|
*
|
|
********************************************************************* EDC ***/
|
|
STDMETHODIMP CSentItemEnum::
|
|
Next( TTSSentItem *pItemEnum )
|
|
{
|
|
SPDBG_FUNC( "CSentItemEnum::Next" );
|
|
HRESULT hr = S_OK;
|
|
|
|
//--- Check args
|
|
if( SPIsBadWritePtr( pItemEnum, sizeof( TTSSentItem ) ) )
|
|
{
|
|
hr = E_INVALIDARG;
|
|
}
|
|
else
|
|
{
|
|
if ( m_ListPos )
|
|
{
|
|
*pItemEnum = m_ItemList.GetNext( m_ListPos );
|
|
}
|
|
else
|
|
{
|
|
hr = S_FALSE;
|
|
}
|
|
}
|
|
return hr;
|
|
} /* CSentItemEnum::Next */
|
|
|
|
/*****************************************************************************
|
|
* CSentItemEnum::Reset *
|
|
*----------------------*
|
|
*
|
|
********************************************************************* EDC ***/
|
|
STDMETHODIMP CSentItemEnum::Reset( void )
|
|
{
|
|
SPDBG_FUNC( "CSentItemEnum::Reset" );
|
|
HRESULT hr = S_OK;
|
|
m_ListPos = m_ItemList.GetHeadPosition();
|
|
return hr;
|
|
} /* CSentItemEnum::Reset */
|