// ITWBRK.H:  (from Tripoli) IWordBreaker, IWordSink, IPhraseSink, IStem
//            (from InfoTech) IWordBreakerConfig
//            (from Tripoli and InfoTech) Supporting definitions.
|
|
|
|
#ifndef __ITWBRK_H__
|
|
#define __ITWBRK_H__
|
|
|
|
#include <comdef.h>
|
|
#include <itstem.h>
|
|
|
|
|
|
#ifndef __IPhraseSink_FWD_DEFINED__
|
|
#define __IPhraseSink_FWD_DEFINED__
|
|
typedef interface IPhraseSink IPhraseSink;
|
|
#endif /* __IPhraseSink_FWD_DEFINED__ */
|
|
|
|
|
|
#ifndef __IWordSink_FWD_DEFINED__
|
|
#define __IWordSink_FWD_DEFINED__
|
|
typedef interface IWordSink IWordSink;
|
|
#endif /* __IWordSink_FWD_DEFINED__ */
|
|
|
|
|
|
#ifndef __IWordBreaker_FWD_DEFINED__
|
|
#define __IWordBreaker_FWD_DEFINED__
|
|
typedef interface IWordBreaker IWordBreaker;
|
|
#endif /* __IWordBreaker_FWD_DEFINED__ */
|
|
|
|
|
|
#ifndef __IWordBreakerConfig_FWD_DEFINED__
|
|
#define __IWordBreakerConfig_FWD_DEFINED__
|
|
typedef interface IWordBreakerConfig IWordBreakerConfig;
|
|
#endif /* __IWordBreakerConfig_FWD_DEFINED__ */
|
|
|
|
|
|
#ifndef __IITStopWordList_FWD_DEFINED__
|
|
#define __IITStopWordList_FWD_DEFINED__
|
|
typedef interface IITStopWordList IITStopWordList;
|
|
#endif /* __IITStopWordList_FWD_DEFINED__ */
|
|
|
|
|
|
// Supporting definitions for IWordBreaker.
|
|
typedef struct tagTEXT_SOURCE TEXT_SOURCE;
|
|
typedef SCODE (__stdcall *PFNFILLTEXTBUFFER)(TEXT_SOURCE *pTextSource);
|
|
|
|
typedef struct tagTEXT_SOURCE
|
|
{
|
|
PFNFILLTEXTBUFFER pfnFillTextBuffer;
|
|
WCHAR *awcBuffer;
|
|
ULONG iEnd;
|
|
ULONG iCur;
|
|
} TEXT_SOURCE;
|
|
|
|
|
|
DECLARE_INTERFACE_(IWordBreaker, IUnknown)
|
|
{
|
|
STDMETHOD(Init)(BOOL fQuery, ULONG ulMaxTokenSize, BOOL *pfLicense) PURE;
|
|
STDMETHOD(BreakText)(TEXT_SOURCE *pTextSource, IWordSink *pWordSink,
|
|
IPhraseSink *pPhraseSink) PURE;
|
|
STDMETHOD(ComposePhrase)(WCHAR const *pwcNoun, ULONG cwcNoun,
|
|
WCHAR const *pwcModifier, ULONG cwcModifier,
|
|
ULONG ulAttachmentType, WCHAR *pwcPhrase,
|
|
ULONG *pcwcPhrase) PURE;
|
|
STDMETHOD(GetLicenseToUse)(WCHAR const **ppwcsLicense) PURE;
|
|
};
|
|
|
|
typedef IWordBreaker *PIWBRK;
|
|
|
|
|
|
// Break word types accepted by IWordBreakerConfig::SetBreakWordType.
#define IITWBC_BREAKTYPE_TEXT       ((DWORD)0)
#define IITWBC_BREAKTYPE_NUMBER     ((DWORD)1)
#define IITWBC_BREAKTYPE_DATE       ((DWORD)2)
#define IITWBC_BREAKTYPE_TIME       ((DWORD)3)
#define IITWBC_BREAKTYPE_EPOCH      ((DWORD)4)
|
|
|
|
|
|
// Breaker control flags accepted by IWordBreakerConfig::SetControlInfo.

// Interpret wildcard chars as such.
#define IITWBC_BREAK_ACCEPT_WILDCARDS   0x00000001

// Stem words after breaking them.
#define IITWBC_BREAK_AND_STEM           0x00000002
|
|
|
|
// External data types accepted by
// IWordBreakerConfig::LoadExternalBreakerData.
#define IITWBC_EXTDATA_CHARTABLE        ((DWORD)0)
#define IITWBC_EXTDATA_STOPWORDLIST     ((DWORD)1)
|
|
|
|
|
|
DECLARE_INTERFACE_(IWordBreakerConfig, IUnknown)
|
|
{
|
|
// Sets/gets locale info that will affect the word breaking
|
|
// behavior of IWordBreaker::BreakText.
|
|
// Returns S_OK if locale described by params is supported
|
|
// by the breaker object; E_INVALIDARG otherwise.
|
|
STDMETHOD(SetLocaleInfo)(DWORD dwCodePageID, LCID lcid) PURE;
|
|
STDMETHOD(GetLocaleInfo)(DWORD *pdwCodePageID, LCID *plcid) PURE;
|
|
|
|
// Sets/gets the type of words the breaker should expect
|
|
// to see in all subsequent calls to IWordBreaker::BreakText.
|
|
// Returns S_OK if the type is understood by the breaker
|
|
// object; E_INVALIDARG otherwise.
|
|
STDMETHOD(SetBreakWordType)(DWORD dwBreakWordType) PURE;
|
|
STDMETHOD(GetBreakWordType)(DWORD *pdwBreakWordType) PURE;
|
|
|
|
// Sets/gets info that controls certain aspects of word breaking.
|
|
// This method currently accepts only the following set of flags
|
|
// in grfBreakFlags:
|
|
// IITWBC_BREAK_ACCEPT_WILDCARDS
|
|
// IITWBC_BREAK_AND_STEM
|
|
// In the future, additional information may be passed in through
|
|
// dwReserved.
|
|
STDMETHOD(SetControlInfo)(DWORD grfBreakFlags, DWORD dwReserved) PURE;
|
|
STDMETHOD(GetControlInfo)(DWORD *pgrfBreakFlags, DWORD *pdwReserved) PURE;
|
|
|
|
// Will load external breaker data, such as a table containing
|
|
// char-by-char break information or a list of stop words.
|
|
// Although the format of the data in the stream is entirely
|
|
// implementation-specific, this interface does define a couple
|
|
// of general types for that data which can be passed in
|
|
// dwStreamDataType:
|
|
// IITWBC_EXTDATA_CHARTABLE
|
|
// IITWBC_EXTDATA_STOPWORDLIST
|
|
STDMETHOD(LoadExternalBreakerData)(IStream *pStream,
|
|
DWORD dwExtDataType) PURE;
|
|
|
|
// These methods allow a stemmer to be associated with the breaker. The
|
|
// breaker will take responsibility for calling
|
|
// IPersistStreamInit::Load/Save when it is loaded/saved if the stemmer
|
|
// supports that interface.
|
|
STDMETHOD(SetWordStemmer)(REFCLSID rclsid, IStemmer *pStemmer) PURE;
|
|
STDMETHOD(GetWordStemmer)(IStemmer **ppStemmer) PURE;
|
|
};
|
|
|
|
typedef IWordBreakerConfig *PIWBRKC;
|
|
|
|
|
|
// Supporting definitions for IWordSink.

// Break kinds accepted by IWordSink::PutBreak.
// NOTE(review): the suffixes presumably stand for end of word / sentence /
// paragraph / chapter -- confirm against the word-breaker documentation.
typedef enum tagWORDREP_BREAK_TYPE
{
    WORDREP_BREAK_EOW = 0,
    WORDREP_BREAK_EOS = 1,
    WORDREP_BREAK_EOP = 2,
    WORDREP_BREAK_EOC = 3
} WORDREP_BREAK_TYPE;
|
|
|
|
|
|
DECLARE_INTERFACE_(IWordSink, IUnknown)
|
|
{
|
|
STDMETHOD(PutWord)(WCHAR const *pwcInBuf, ULONG cwc,
|
|
ULONG cwcSrcLen, ULONG cwcSrcPos) PURE;
|
|
STDMETHOD(PutAltWord)(WCHAR const *pwcInBuf, ULONG cwc,
|
|
ULONG cwcSrcLen, ULONG cwcSrcPos) PURE;
|
|
STDMETHOD(StartAltPhrase)(void) PURE;
|
|
STDMETHOD(EndAltPhrase)(void) PURE;
|
|
STDMETHOD(PutBreak)(WORDREP_BREAK_TYPE breakType) PURE;
|
|
};
|
|
|
|
typedef IWordSink *PIWRDSNK;
|
|
|
|
|
|
DECLARE_INTERFACE_(IPhraseSink, IUnknown)
|
|
{
|
|
STDMETHOD(PutSmallPhrase)(WCHAR const *pwcNoun, ULONG cwcNoun,
|
|
WCHAR const *pwcModifier,
|
|
ULONG cwcModifier,
|
|
ULONG ulAttachmentType) PURE;
|
|
STDMETHOD(PutPhrase)(WCHAR const *pwcPhrase, ULONG cwcPhrase) PURE;
|
|
};
|
|
|
|
typedef IPhraseSink *PIPHRSNK;
|
|
|
|
|
|
// WBreakGetWChar: function (C++) or macro (C) that a breaker implementation
// can use to pull characters from the caller's text source.
//
// When the buffer is exhausted (iCur == iEnd), pfnFillTextBuffer is called
// to refill it; if that call fails, 0xFFFF is returned as the end-of-text
// marker.  Otherwise the character at awcBuffer[iCur] is returned and iCur
// is advanced by one.
#ifdef __cplusplus

inline WCHAR WBreakGetWChar(TEXT_SOURCE *pTextSource)
{
    if (pTextSource->iCur == pTextSource->iEnd)
    {
        if (FAILED(pTextSource->pfnFillTextBuffer(pTextSource)))
        {
            return 0xFFFF;      // Unicode EOF
        }
    }

    return pTextSource->awcBuffer[pTextSource->iCur++];
}

#else

// C form.  Every use of the argument and the expansion as a whole are
// parenthesized, so the macro can be embedded in a larger expression
// (e.g. WBreakGetWChar(p) == 0xFFFF) and can be given a non-trivial
// argument expression.  The argument is still evaluated more than once:
// pass an expression without side effects.
#define WBreakGetWChar(pTextSource) \
    (((pTextSource)->iCur == (pTextSource)->iEnd) \
        ? (FAILED((pTextSource)->pfnFillTextBuffer(pTextSource)) \
            ? 0xFFFF \
            : (pTextSource)->awcBuffer[(pTextSource)->iCur++]) \
        : (pTextSource)->awcBuffer[(pTextSource)->iCur++])

#endif
|
|
|
|
|
|
DECLARE_INTERFACE_(IITStopWordList, IUnknown)
|
|
{
|
|
STDMETHOD(AddWord)(WCHAR const *pwcInBuf, ULONG cwc) PURE;
|
|
STDMETHOD(LookupWord)(WCHAR const *pwcInBuf, ULONG cwc) PURE;
|
|
};
|
|
|
|
typedef IITStopWordList *PIITSTWDL;
|
|
|
|
|
|
#endif // __ITWBRK_H__
|