//////////////////////////////////////////////////////////////////////////////// // // Filename : Tokenizer.h // Purpose : Tokenizer declerations // // Project : WordBreakers // Component: English word breaker // // Author : yairh // // Log: // // Jan 06 2000 yairh creation // Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size // problems. (Fix Bug 15449). // May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText // Nov 11 2000 dovh - Special underscore treatment // Added inline support routines (FindLeftmostUnderscore etc.) // //////////////////////////////////////////////////////////////////////////////// #ifndef _TOKENIZER_H_ #define _TOKENIZER_H_ #include "tracer.h" #include "PropArray.h" #include "Query.h" #include "stdafx.h" #include "cierror.h" #include "LangSupport.h" #include "Formats.h" #define TOKENIZER_MAXBUFFERLIMIT 1024 // max size of a token is 1024 chars DECLARE_TAG(s_tagTokenizer, "Tokenizer"); DECLARE_TAG(s_tagTokenizerOutput, "Tokenizer Output"); DECLARE_TAG(s_tagTokenizerTrace, "Tokenizer Trace"); DECLARE_TAG(s_tagTokenizerDecision, "Tokenizer Decision"); DECLARE_TAG(s_tagTokenizerSuspect, "Tokenizer Suspect"); #if defined(DEBUG) /////////////////////////////////////////////////////////////////////////////// // Class CTraceWordSink /////////////////////////////////////////////////////////////////////////////// class CTraceWordSink : public IWordSink { public: CTraceWordSink(IWordSink* p) : m_apWordSink(p) { } ULONG __stdcall AddRef() { return 1; } ULONG __stdcall Release() { return 0; } STDMETHOD(QueryInterface)( IN REFIID riid, IN void **ppvObject) { Assert(false); return E_FAIL; } STDMETHOD(PutWord)( ULONG cwc, WCHAR const* pwcInBuf, ULONG cwcSrcLen, ULONG cwcSrcPos) { Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10); #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER) if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput)) { Trace( elVerbose, s_tagTokenizerOutput, ("PutWord: %*.*S, %d, %d, %d", cwc, cwc, 
pwcInBuf, cwc, cwcSrcLen, cwcSrcPos)); } #endif return m_apWordSink->PutWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos); } STDMETHOD(PutAltWord)( ULONG cwc, WCHAR const* pwcInBuf, ULONG cwcSrcLen, ULONG cwcSrcPos) { Assert(cwc < TOKENIZER_MAXBUFFERLIMIT + 10); #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER) if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput)) { Trace( elVerbose, s_tagTokenizerOutput, ("PutAltWord: %*.*S, %d, %d, %d", cwc, cwc, pwcInBuf, cwc, cwcSrcLen, cwcSrcPos)); } #endif return m_apWordSink->PutAltWord(cwc, pwcInBuf, cwcSrcLen, cwcSrcPos); } STDMETHOD(StartAltPhrase)() { Trace( elVerbose, s_tagTokenizerOutput, ("StartAltPhrase")); return m_apWordSink->StartAltPhrase(); } STDMETHOD(EndAltPhrase)() { Trace( elVerbose, s_tagTokenizerOutput, ("EndAltPhrase")); return m_apWordSink->EndAltPhrase(); } STDMETHOD(PutBreak)(WORDREP_BREAK_TYPE breakType) { WCHAR* p; #if (defined (DEBUG) && !defined(_NO_TRACER)) || defined(USE_TRACER) if (CheckTraceRestrictions(elVerbose, s_tagTokenizerOutput)) { switch (breakType) { case WORDREP_BREAK_EOW: p = L"WORDREP_BREAK_EOW"; break; case WORDREP_BREAK_EOS: p = L"WORDREP_BREAK_EOS"; break; case WORDREP_BREAK_EOP: p = L"WORDREP_BREAK_EOP"; break; case WORDREP_BREAK_EOC: p = L"WORDREP_BREAK_EOC"; break; default: p = L"Unknown break type"; } Trace( elVerbose, s_tagTokenizerOutput, ("PutBreak %S", p)); } #endif return m_apWordSink->PutBreak(breakType); } CTraceWordSink* operator ->() { return this; } private: CComPtr m_apWordSink; }; #endif /////////////////////////////////////////////////////////////////////////////// // Class CTokenState /////////////////////////////////////////////////////////////////////////////// class CTokenState { public: // // methods // CTokenState(); CTokenState(CTokenState& s); CTokenState& operator = (CTokenState& S); void Clear(ULONG ulEnd); public: // // members // ULONG m_ulStart; ULONG m_ulEnd; CPropFlag m_Properties; WCHAR* m_pwcsToken; }; inline 
CTokenState::CTokenState() : m_ulStart(0), m_ulEnd(0) { } inline CTokenState::CTokenState(CTokenState& s) : m_ulStart(s.m_ulStart), m_ulEnd(s.m_ulEnd), m_pwcsToken(s.m_pwcsToken), m_Properties(s.m_Properties) { } inline CTokenState& CTokenState::operator = (CTokenState& S) { m_ulStart = S.m_ulStart; m_ulEnd = S.m_ulEnd; m_Properties = S.m_Properties; m_pwcsToken = S.m_pwcsToken; return *this; } inline void CTokenState::Clear(ULONG ulEnd) { m_ulStart = 0; m_ulEnd = ulEnd; m_Properties.Clear(); m_pwcsToken = NULL; } /////////////////////////////////////////////////////////////////////////////// // Class CToken /////////////////////////////////////////////////////////////////////////////// class CToken { public: // // methods // CToken(ULONG ulMaxTokenSize); bool IsNotEmpty(); void Clear(); bool IsFull(); void MarkEndToken(ULONG ulCurPosInTxtSourceBuffer); ULONG RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State); ULONG RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State); void ComputeStateProperties(CTokenState& State); ULONG CalculateStateOffsetInTxtSourceBuffer(CTokenState& State); ULONG FindLeftmostUnderscore(CTokenState& State); ULONG FindRightmostUnderscore(CTokenState& State); public: // // members // ULONG m_ulBufPos; bool m_fHasEos; ULONG m_ulOffsetInTxtSourceBuffer; ULONG m_ulMaxTokenSize; CTokenState m_State; WCHAR m_awchBuf[TOKENIZER_MAXBUFFERLIMIT + 1]; }; inline CToken::CToken(ULONG ulMaxTokenSize) : m_ulBufPos(0), m_fHasEos(false), m_ulOffsetInTxtSourceBuffer(0), m_ulMaxTokenSize(ulMaxTokenSize) { m_awchBuf[0] = L'\0'; } inline bool CToken::IsNotEmpty() { return (m_ulBufPos > 0); } inline void CToken::Clear() { m_ulBufPos = 0; m_awchBuf[0] = L'\0'; m_State.Clear(0); m_fHasEos = false; m_ulOffsetInTxtSourceBuffer = 0; } inline bool CToken::IsFull() { return (m_ulBufPos == m_ulMaxTokenSize); } inline void CToken::MarkEndToken(ULONG ulCurPosInTxtSourceBuffer) { Assert(m_ulBufPos < m_ulMaxTokenSize + 1); m_awchBuf[m_ulBufPos] = L'\0'; 
m_State.m_pwcsToken = m_awchBuf; m_State.m_ulStart = 0; m_State.m_ulEnd = m_ulBufPos; if (TEST_PROP(m_State.m_Properties, PROP_EOS) && (m_ulBufPos < m_ulMaxTokenSize)) { ULONG ulCur = m_State.m_ulEnd - 1; while (TEST_PROP(GET_PROP(m_awchBuf[ulCur]), EOS_SUFFIX)) { ulCur--; } if (IS_EOS(m_awchBuf[ulCur])) { m_fHasEos = true; } } // // BUGBUG need to enalble the assert // // Assert(ulCurPosInTxtSourceBuffer > m_ulBufPos); m_ulOffsetInTxtSourceBuffer = ulCurPosInTxtSourceBuffer - m_ulBufPos; } inline ULONG CToken::CalculateStateOffsetInTxtSourceBuffer(CTokenState& State) { ULONG ulOffset = m_ulOffsetInTxtSourceBuffer + (State.m_pwcsToken - m_awchBuf) + State.m_ulStart; return ulOffset; } inline ULONG CToken::RemoveHeadPunct(CPropFlag& PunctProperties, CTokenState& State) { Assert(m_State.m_ulStart <= State.m_ulStart); Assert(State.m_ulStart <= State.m_ulEnd); Assert(State.m_ulEnd <= m_State.m_ulEnd); for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++) { if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul]), PunctProperties) ) { break; } } State.m_ulStart = ul; // // return num of characters removed // return ul; } inline ULONG CToken::RemoveTailPunct(CPropFlag& PunctProperties, CTokenState& State) { Assert(m_State.m_ulStart <= State.m_ulStart); Assert(State.m_ulStart <= State.m_ulEnd); Assert(State.m_ulEnd <= m_State.m_ulEnd); for (ULONG ul = State.m_ulEnd; ul > State.m_ulStart; ul--) { if (!TEST_PROP1(GET_PROP(State.m_pwcsToken[ul - 1]), PunctProperties) ) { break; } } ULONG ulNumOfRemovedChars = State.m_ulEnd - ul; State.m_ulEnd = ul; return ulNumOfRemovedChars; } inline void CToken::ComputeStateProperties(CTokenState& State) { Assert(m_State.m_ulStart <= State.m_ulStart); Assert(State.m_ulStart <= State.m_ulEnd); Assert(State.m_ulEnd <= m_State.m_ulEnd); State.m_Properties.Clear(); for (ULONG ul = State.m_ulStart; ul < State.m_ulEnd; ul++) { State.m_Properties |= GET_PROP(State.m_pwcsToken[ul]); } } 
////////////////////////////////////////////////////////////////////////////////
//
//  Support routines for UNDERSCORE '_' treatment.
//
//  Current algorithm has the following behavior for tokens containing
//  ALPHANUMERIC characters and UNDERSCORES:
//
//  1.  Single underscores and consecutive underscore sequences surrounded by
//      alphanumeric characters (i.e. underscores buried within words) are
//      treated as alphanumeric characters, and do not break words, or get
//      omitted. Examples: Foo_Bar => Foo_Bar, and X___Y => X___Y
//
//  2.  An underscore / underscore sequence tacked to the right (left) end
//      of an alphanumeric (+ embedded underscores) token, will be part of
//      the token, as long as the sequence is attached only to one side of the
//      alphanumeric token. If there are BOTH header and trailer consecutive
//      underscore sequences, both header & trailer sequence will be omitted.
//      Examples: __Foo_Bar => __Foo_Bar , alpha_beta_ => alpha_beta_ ,
//      __HEADERFILE__ => __HEADERFILE__ , __MY_FILE_H__ => MY_FILE_H
//
//  3.  Caveat: Note that other than the two rules stated above underscores are
//      NOT treated as ALPHANUMERIC characters. The behavior on a mixed sequence
//      of underscores, and other non-alphanumeric characters is undefined!
// //////////////////////////////////////////////////////////////////////////////// // // Assumes: on entry State.m_ulStart is the first alphanumeric in token // returns: num of underscores scanned // inline ULONG CToken::FindLeftmostUnderscore(CTokenState& State) { Assert(m_State.m_ulStart < State.m_ulStart); Assert(State.m_ulStart <= State.m_ulEnd); Assert(State.m_ulEnd <= m_State.m_ulEnd); Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulStart-1]), PROP_UNDERSCORE) ); ULONG ulNumUnderscores = 0; for (ULONG ul = State.m_ulStart; (ul > m_State.m_ulStart) && (TEST_PROP(GET_PROP(State.m_pwcsToken[ul-1]), PROP_UNDERSCORE) ); ul--) ; ulNumUnderscores = State.m_ulStart - ul; State.m_ulStart = ul; // // return num of underscores scanned // return (ulNumUnderscores); } // CToken::FindLeftmostUnderscore // // Assumes: on entry State.m_ulEnd is the last alphanumeric in token // returns: num of underscores scanned // inline ULONG CToken::FindRightmostUnderscore(CTokenState& State) { Assert(m_State.m_ulStart <= State.m_ulStart); Assert(State.m_ulStart <= State.m_ulEnd); Assert(State.m_ulEnd < m_State.m_ulEnd); Assert( TEST_PROP(GET_PROP(State.m_pwcsToken[State.m_ulEnd]), PROP_UNDERSCORE) ); ULONG ulNumUnderscores = 0; for (ULONG ul = State.m_ulEnd; (ul < m_State.m_ulEnd) && (TEST_PROP(GET_PROP(State.m_pwcsToken[ul]), PROP_UNDERSCORE) ); ul++) ; ulNumUnderscores = ul - State.m_ulEnd; State.m_ulEnd = ul; // // return num of underscores scanned // return (ulNumUnderscores); } // CToken::FindRightmostUnderscore /////////////////////////////////////////////////////////////////////////////// // Class CTokenizer /////////////////////////////////////////////////////////////////////////////// class CTokenizer { public: CTokenizer( TEXT_SOURCE* pTxtSource, IWordSink * pWordSink, IPhraseSink * pPhraseSink, LCID lcid, BOOL bQueryTime, ULONG ulMaxTokenSize); // destructor frees the passed buffer, if it exists virtual ~CTokenizer(void) { } void BreakText(); protected: // // methods 
// void ProcessToken(); void ProcessTokenInternal(); void BreakCompundString(CTokenState& State, CPropFlag& prop); HRESULT FillBuffer(); void CalculateUpdateEndOfBuffer(); bool CheckAndCreateNumber( WCHAR* pwcsStr, ULONG ulLen, WCHAR* pwcsOut, ULONG* pulOffsetToTxt, ULONG* pulOutLen); int CheckAndCreateNumber( WCHAR* pwcsStr, ULONG ulLen, WCHAR wchSDecimal, WCHAR wchSThousand, WCHAR* pwcsOut, ULONG* pulOffsetToTxt, ULONG* pulOutLen); short ConvertHexCharToNumber(WCHAR wch); void GetValuesFromDateString( CDateTerm* pFormat, WCHAR* pwcsDate, LONG* plD_M1, // we can't tell in this stage whether this is a Day or a month. LONG* plD_M2, LONG* plYear); void GetValuesFromTimeString( CTimeTerm* pFormat, WCHAR* pwcsTime, LONG* plHour, LONG* plMin, LONG* plSec, TimeFormat* pAmPm); LONG ConvertCharToDigit(WCHAR wch); #ifdef DEBUG void TraceToken(); #endif DEBUG bool VerifyAlphaUrl(); bool VerifyWwwUrl(); bool VerifyAcronym(); bool VerifyAbbreviation(); bool VerifySpecialAbbreviation(); bool VerifyHyphenation(); bool VerifyParens(); const CCliticsTerm* VerifyClitics(CTokenState& State); bool VerifyNumber(CTokenState& State); bool VerifyNumberOrTimeOrDate(); bool VerifyTime(CTokenState& State); bool VerifyDate(CTokenState& State); bool VerifyCurrency(); bool VerifyMisc(); bool VerifyCommersialSign(); void ProcessDefault(); ULONG AddBackUnderscores( IN CTokenState& State, IN bool hasFrontUnderscore, IN bool hasBackUnderscore ); bool CheckAndRemoveOneSidedUnderscores(CTokenState& State); void OutputUrl( CTokenState& State); void OutputAcronym( CTokenState& State, const CCliticsTerm* pCliticsTerm); void OutputAbbreviation( CTokenState& State); void OutputSpecialAbbreviation( CTokenState& State, CAbbTerm* pTerm, const CCliticsTerm* pCliticsTerm); virtual void OutputHyphenation( CTokenState& State, const CCliticsTerm* pCliticsTerm); void OutputParens( CTokenState& State); void OutputNumbers( CTokenState& State, ULONG ulLen, WCHAR* pwcsNumber, const CCliticsTerm* pCliticsTerm); void 
OutputTime( WCHAR* pwcsTime, CTokenState& State); void OutputDate( WCHAR* pwcsDate1, WCHAR* pwcsDate2, CTokenState& State); virtual void OutputSimpleToken( CTokenState& State, const CCliticsTerm* pTerm); void OutputCurrency( ULONG ulLen, WCHAR* pwcsCurrency, CTokenState& State, const CCliticsTerm* pTerm); void OutputMisc( CTokenState& State, bool bPatternContainOnlyUpperCase, ULONG ulSuffixSize, const CCliticsTerm* pCliticsTerm); void OutputCommersialSignToken(CTokenState& State); // // members // LCID m_Lcid; CAutoClassPointer m_apLangSupport; CToken* m_pCurToken; CToken m_Token; #if defined(DEBUG) CTraceWordSink m_apWordSink; #else CComPtr m_apWordSink; #endif CComPtr m_apPhraseSink; TEXT_SOURCE* m_pTxtSource; BOOL m_bQueryTime; ULONG m_ulUpdatedEndOfBuffer; bool m_bNoMoreTxt; // // All Chunks in buffer have a white space // bool m_bWhiteSpaceGuarranteed; ULONG m_ulMaxTokenSize; }; inline HRESULT CTokenizer::FillBuffer() { Trace( elVerbose, s_tagTokenizer, ("WBreakGetNextChar: Filling the buffer")); HRESULT hr; if (!m_bNoMoreTxt) { do { // // this loop usually performs only one rotations. we use it to solve the // problem when the user return 0 characters and a success return code. // the following code assumes that in case you get a success return code then // the buffer is not empty. // hr = m_pTxtSource->pfnFillTextBuffer(m_pTxtSource); } while ((m_pTxtSource->iEnd <= m_pTxtSource->iCur) && SUCCEEDED(hr)); if ( FAILED(hr)) { m_bNoMoreTxt = true; } } if (m_bNoMoreTxt && m_pTxtSource->iCur >= m_pTxtSource->iEnd) { // // we reached the end of the buffer. // return WBREAK_E_END_OF_TEXT; } CalculateUpdateEndOfBuffer(); return S_OK; } inline void CTokenizer::CalculateUpdateEndOfBuffer() { // // m_ulUpdatedEndOfBuffer is a marker for the last character that we can read // from the current buffer before and additional call to fill buffer is needed. // we use this marker to avoid terms spitted between two consecutive buffers. 
// in order to achieve the above m_ulUpdatedEndOfBuffer will point to a breaker // character. (the only exception to that is when we have a very long term that does // not contains breaker characters). // // // we split the buffer into chunks of TOKENIZER_MAXBUFFERLIMIT size. in each // chunk we make sure that there is a breaker. // ULONG ulStartChunk = m_pTxtSource->iCur; ULONG ulEndChunk ; bool fLastRound = false; Assert(m_pTxtSource->iEnd > m_pTxtSource->iCur); ulEndChunk = m_pTxtSource->iCur + m_ulMaxTokenSize > (m_pTxtSource->iEnd - 1) ? (m_pTxtSource->iEnd - 1) : m_pTxtSource->iCur + m_ulMaxTokenSize; ULONG ulCur; ULONG ulBreakerMarker = 0; m_bWhiteSpaceGuarranteed = false; while(true) { ulCur = ulEndChunk; // // per each chunk we go backward and try to find a WS. // while ((ulCur > ulStartChunk) && (!IS_WS(m_pTxtSource->awcBuffer[ulCur]))) { ulCur--; } if (ulCur == ulStartChunk) { // // the last chunk that we checked did not contain any WS // if (m_ulMaxTokenSize == (ulEndChunk - ulStartChunk)) { // // full buffer case. we look for a default breaker. // ulCur = ulEndChunk; while ( (ulCur > ulStartChunk) && !IS_BREAKER( m_pTxtSource->awcBuffer[ulCur] ) ) { ulCur--; } // // if we found a breaker then ulBreakerMarker will set to it else // the term does not contain any breakers and we set the ulBreakerMarker // to the end of the term. this is the only case that we spilt terms. // ulBreakerMarker = ulCur > ulStartChunk ? ulCur : ulEndChunk; } else { if (ulStartChunk > m_pTxtSource->iCur) { // // case we had a previous chunk. in this case ulStartChunk points to // a breaker // // // ulStart points to the WS from the previous chunk. 
// ulBreakerMarker = ulStartChunk; } else { ulBreakerMarker = m_pTxtSource->iEnd; } } break; } if (fLastRound) { // // ulCur points to a WS // ulBreakerMarker = ulCur + 1; m_bWhiteSpaceGuarranteed = true; break; } // // move to the next chunk // ulStartChunk = ulCur + 1; // ulStarChunk will points to a breaker if (ulStartChunk + m_ulMaxTokenSize < (m_pTxtSource->iEnd - 1)) { ulEndChunk = ulStartChunk + m_ulMaxTokenSize; } else { ulEndChunk = m_pTxtSource->iEnd - 1; fLastRound = true; } } Assert(ulBreakerMarker <= m_pTxtSource->iEnd); m_ulUpdatedEndOfBuffer = ulBreakerMarker; } inline short CTokenizer::ConvertHexCharToNumber(WCHAR wch) { // // assumes wch is a valid HEX character // Assert(wch >= L'0'); if (wch <= L'9') { return (wch - L'0'); } else if (wch <= L'F') { Assert(wch >= L'A'); return (wch - L'A' + 10); } else if (wch <= L'f') { Assert(wch >= L'a'); return (wch - L'a' + 10); } else if (wch <= 0xFF19) { Assert(wch >= 0xFF10); return (wch - 0xFF10); } else if (wch <= 0xFF26) { Assert(wch >= 0xFF21); return (wch - 0xFF21 + 10); } else { Assert((wch >= 0xFF41) && (wch <= 0xFF46)); return (wch - 0xFF41 + 10); } } inline LONG CTokenizer::ConvertCharToDigit(WCHAR wch) { Assert((wch >= L'0' && wch <= L'9') || ((wch >= 0xFF10) && (wch <= 0xFF19))); if (wch <= L'9') { return (wch - L'0'); } return (wch - 0xFF10); // Full width characters. } #endif _TOKENIZER_H_