486 lines
13 KiB
C++
486 lines
13 KiB
C++
//+---------------------------------------------------------------------------
|
|
//
|
|
// Microsoft Windows
|
|
// Copyright (C) Microsoft Corporation, 1994 - 1997
|
|
//
|
|
// File: IWBreak.cxx
|
|
//
|
|
// Contents: FarEast Word Breaker glue code
|
|
//
|
|
// History: 01-Jul-96 PatHal Created.
|
|
// weibz Merged and modified to NT5
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
#include "pch.cxx"
|
|
#pragma hdrstop
|
|
|
|
#include "iwbreak.hxx"
|
|
|
|
extern long gulcInstances;
|
|
|
|
#ifdef IWBDBG
|
|
//
//  WbDbgOutputInt
//
//  Debug-only helper (built only when IWBDBG is defined).  Sends pTitle to
//  the debugger output, followed by data rendered as upper-case hexadecimal,
//  right-justified in a field of at least 7 characters.
//
//  Arguments:  [pTitle] -- NUL-terminated label written before the number
//              [data]   -- value to print; formatted as its unsigned bit
//                          pattern, so negative values print as their
//                          two's-complement hex digits
//
//  Notes:      The buffer is sized for every hex digit an INT can hold, so
//              large or negative values can no longer run the write index
//              off the front of the buffer.  Zero prints as "0" instead of
//              an all-blank field.
//
void WbDbgOutputInt(WCHAR *pTitle, INT data)
{
    static const WCHAR awcHex[] = L"0123456789ABCDEF";

    // Two hex digits per byte is the most any INT value can need; 7 is the
    // field width the original output used.
    enum { cwcMaxDigits = sizeof(INT) * 2, cwcMinField = 7 };

    WCHAR        Outdbg[cwcMaxDigits + 1];
    unsigned int uValue = (unsigned int)data;
    int          ii;
    int          iStart;

    OutputDebugStringW(pTitle);

    // Pre-fill with blanks so unused leading positions act as padding.
    for (ii = 0; ii < cwcMaxDigits; ii++)
        Outdbg[ii] = 0x0020;

    Outdbg[cwcMaxDigits] = 0x0000;

    // Emit digits from least significant to most significant, filling the
    // buffer from the right.  do/while guarantees at least one digit.
    ii = cwcMaxDigits - 1;
    do {
        Outdbg[ii--] = awcHex[uValue & 0xF];
        uValue >>= 4;
    } while (uValue != 0);

    // ii + 1 is the index of the most significant digit.  Start printing at
    // the 7-wide field boundary unless the number needs more room.
    iStart = cwcMaxDigits - cwcMinField;
    if (ii + 1 < iStart)
        iStart = ii + 1;

    OutputDebugStringW(Outdbg + iStart);
}
|
|
|
|
#endif
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::CWordBreaker
|
|
//
|
|
// Synopsis: Constructor for the CWordBreaker class.
|
|
//
|
|
// Arguments: [lcid] -- locale id
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CWordBreaker::CWordBreaker( LCID lcid )
|
|
: _cRefs(1),
|
|
_lcid(lcid)
|
|
{
|
|
InterlockedIncrement( &gulcInstances );
|
|
#if defined(TH_LOG)
|
|
_hLog = ThLogOpen( "log.utf");
|
|
#endif
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::~CWordBreaker
|
|
//
|
|
// Synopsis: Destructor for the CWordBreaker class.
|
|
//
|
|
// Notes: All termination/deallocation is done by embedded smart pointers
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
CWordBreaker::~CWordBreaker()
|
|
{
|
|
InterlockedDecrement( &gulcInstances );
|
|
#if defined(TH_LOG)
|
|
ThLogClose( _hLog );
|
|
#endif
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::QueryInterface
|
|
//
|
|
// Synopsis: Rebind to other interface
|
|
//
|
|
// Arguments: [riid] -- IID of new interface
|
|
// [ppvObject] -- New interface * returned here
|
|
//
|
|
// Returns: S_OK if bind succeeded, E_NOINTERFACE if bind failed
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::QueryInterface( REFIID riid, void ** ppvObject)
|
|
{
|
|
//
|
|
// Optimize QueryInterface by only checking minimal number of bytes.
|
|
//
|
|
// IID_IUnknown = 00000000-0000-0000-C000-000000000046
|
|
// IID_IWordBreaker = D53552C8-77E3-101A-B552-08002B33B0E6
|
|
// --------
|
|
// |
|
|
// +--- Unique!
|
|
//
|
|
|
|
Assert( (IID_IUnknown.Data1 & 0x000000FF) == 0x00 );
|
|
Assert( (IID_IWordBreaker.Data1 & 0x000000FF) == 0xC8 );
|
|
|
|
IUnknown *pUnkTemp;
|
|
SCODE sc = S_OK;
|
|
|
|
switch( riid.Data1 )
|
|
{
|
|
case 0x00000000:
|
|
if ( memcmp( &IID_IUnknown, &riid, sizeof(riid) ) == 0 )
|
|
pUnkTemp = (IUnknown *)this;
|
|
else
|
|
sc = E_NOINTERFACE;
|
|
break;
|
|
|
|
case 0xD53552C8:
|
|
if ( memcmp( &IID_IWordBreaker, &riid, sizeof(riid) ) == 0 )
|
|
pUnkTemp = (IUnknown *)(IWordBreaker *)this;
|
|
else
|
|
sc = E_NOINTERFACE;
|
|
break;
|
|
|
|
default:
|
|
pUnkTemp = 0;
|
|
sc = E_NOINTERFACE;
|
|
break;
|
|
}
|
|
|
|
if( 0 != pUnkTemp )
|
|
{
|
|
*ppvObject = (void * )pUnkTemp;
|
|
pUnkTemp->AddRef();
|
|
}
|
|
else
|
|
*ppvObject = 0;
|
|
return(sc);
|
|
}
|
|
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::AddRef
|
|
//
|
|
// Synopsis: Increments refcount
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE
|
|
CWordBreaker::AddRef()
|
|
{
|
|
return InterlockedIncrement( &_cRefs );
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::Release
|
|
//
|
|
// Synopsis: Decrement refcount. Delete if necessary.
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
ULONG STDMETHODCALLTYPE
|
|
CWordBreaker::Release()
|
|
{
|
|
unsigned long uTmp = InterlockedDecrement( &_cRefs );
|
|
|
|
if ( 0 == uTmp )
|
|
delete this;
|
|
|
|
return(uTmp);
|
|
}
|
|
|
|
//+-------------------------------------------------------------------------
|
|
//
|
|
// Method: CWordBreaker::Init
|
|
//
|
|
// Synopsis: Initialize word-breaker
|
|
//
|
|
// Arguments: [fQuery] -- TRUE if query-time
|
|
// [ulMaxTokenSize] -- Maximum size token stored by caller
|
|
// [pfLicense] -- Set to true if use restricted
|
|
//
|
|
// Returns: Status code
|
|
//
|
|
//--------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::Init(
|
|
BOOL fQuery,
|
|
ULONG ulMaxTokenSize,
|
|
BOOL *pfLicense )
|
|
{
|
|
|
|
if ( NULL == pfLicense )
|
|
return E_INVALIDARG;
|
|
|
|
if (IsBadWritePtr(pfLicense, sizeof(DWORD))) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
*pfLicense = TRUE;
|
|
_fQuery = fQuery;
|
|
_ulMaxTokenSize = ulMaxTokenSize;
|
|
|
|
return S_OK;
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::ComposePhrase
|
|
//
|
|
// Synopsis: Convert a noun and a modifier into a phrase.
|
|
//
|
|
// Arguments: [pwcNoun] -- pointer to noun.
|
|
// [cwcNoun] -- count of chars in pwcNoun
|
|
// [pwcModifier] -- pointer to word modifying pwcNoun
|
|
// [cwcModifier] -- count of chars in pwcModifier
|
|
// [ulAttachmentType] -- relationship between pwcNoun &pwcModifier
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::ComposePhrase(
|
|
WCHAR const *pwcNoun,
|
|
ULONG cwcNoun,
|
|
WCHAR const *pwcModifier,
|
|
ULONG cwcModifier,
|
|
ULONG ulAttachmentType,
|
|
WCHAR *pwcPhrase,
|
|
ULONG *pcwcPhrase )
|
|
{
|
|
//
|
|
// Need to code in later
|
|
//
|
|
if ( _fQuery )
|
|
return( E_NOTIMPL );
|
|
else
|
|
return ( WBREAK_E_QUERY_ONLY );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::GetLicenseToUse
|
|
//
|
|
// Synopsis: Returns a pointer to vendors license information
|
|
//
|
|
// Arguments: [ppwcsLicense] -- ptr to ptr to which license info is returned
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::GetLicenseToUse(
|
|
const WCHAR **ppwcsLicense )
|
|
{
|
|
static WCHAR const * wcsCopyright = L"Copyright Microsoft, 1991-1998";
|
|
|
|
if ( NULL == ppwcsLicense )
|
|
return E_INVALIDARG;
|
|
|
|
if (IsBadWritePtr(ppwcsLicense, sizeof(DWORD))) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
*ppwcsLicense = wcsCopyright;
|
|
return( S_OK );
|
|
}
|
|
|
|
//+---------------------------------------------------------------------------
|
|
//
|
|
// Member: CWordBreaker::BreakText
|
|
//
|
|
// Synopsis: Break input stream into words.
|
|
//
|
|
// Arguments: [pTextSource] -- source of Unicode text
|
|
// [pWordSink] -- sink for collecting words
|
|
// [pPhraseSink] -- sink for collecting phrases
|
|
//
|
|
// Notes: Since the input buffer may be greater than MAX_BUFFER_LEN
|
|
// we process the buffer in chunks of length MAX_BUFFER_LEN.
|
|
//
|
|
//----------------------------------------------------------------------------
|
|
|
|
SCODE STDMETHODCALLTYPE
|
|
CWordBreaker::BreakText(
|
|
TEXT_SOURCE *pTextSource,
|
|
IWordSink *pWordSink,
|
|
IPhraseSink *pPhraseSink )
|
|
{
|
|
|
|
SCODE sc = S_OK;
|
|
|
|
if ( NULL == pWordSink ) {
|
|
// BUGBUG, propagate the null word sink error code
|
|
return sc;
|
|
}
|
|
|
|
// BUGBUG, need to normalize nums within T-Hammer, pass as flag?
|
|
|
|
// turn on noun phrase analysis if there is a phrase sink
|
|
if ( 0 != pPhraseSink ) {
|
|
// BUGBUG, do we need to pass a separate flag to T-Hammer for this?
|
|
// ignore the phrase sink for now
|
|
// return sc;
|
|
}
|
|
|
|
if ( ( NULL == pTextSource ) ||
|
|
( pTextSource->iEnd < pTextSource->iCur ) ) {
|
|
return E_INVALIDARG;
|
|
}
|
|
|
|
if (pTextSource->iEnd == pTextSource->iCur) {
|
|
return S_OK;
|
|
}
|
|
|
|
CONST WCHAR *pwcInput, *pwcStem;
|
|
ULONG cwc, cwcTail, iwcCurrent;
|
|
|
|
DWORD i;
|
|
BYTE ct;
|
|
BOOL fRomanWord = FALSE;
|
|
|
|
__try {
|
|
|
|
cwcTail = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
|
|
#ifdef IWBDBG
|
|
{
|
|
WCHAR tmp[2];
|
|
DWORD ii;
|
|
|
|
WbDbgOutputInt(L"\niCur=", pTextSource->iCur);
|
|
WbDbgOutputInt(L"\niEnd=", pTextSource->iEnd);
|
|
|
|
OutputDebugStringW(L"\n the Source String is:\n");
|
|
for (ii=pTextSource->iCur; ii<pTextSource->iEnd; ii++) {
|
|
tmp[0] = *(pTextSource->awcBuffer + ii);
|
|
tmp[1] = L'\0';
|
|
OutputDebugStringW(tmp);
|
|
}
|
|
|
|
OutputDebugStringW(L"\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
do {
|
|
|
|
cwc = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
// Reinit the callback data structure
|
|
iwcCurrent = pTextSource->iCur;
|
|
pwcStem = pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
|
|
|
|
for (i=0; i< cwc; i++, pwcInput++) {
|
|
|
|
if (*(pwcInput) != 0) {
|
|
ct = GetCharType(*pwcInput);
|
|
|
|
if (ct == CH) {
|
|
if (!fRomanWord) {
|
|
pwcStem = pwcInput;
|
|
fRomanWord = TRUE;
|
|
}
|
|
}
|
|
else {
|
|
if (fRomanWord) {
|
|
DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
|
|
if (cwcTemp > 0) {
|
|
(pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
|
|
iwcCurrent + (i - cwcTemp));
|
|
}
|
|
fRomanWord = FALSE;
|
|
}
|
|
// else {
|
|
switch (ct) {
|
|
case PS:
|
|
(pWordSink->PutBreak)( WORDREP_BREAK_EOS );
|
|
case WS:
|
|
break;
|
|
default:
|
|
(pWordSink->PutWord)(1, pwcInput, 1, iwcCurrent + i);
|
|
break;
|
|
}
|
|
// }
|
|
}
|
|
}
|
|
}
|
|
|
|
if ( !fRomanWord )
|
|
pTextSource->iCur += i;
|
|
else {
|
|
CONST WCHAR *pStart;
|
|
|
|
pStart = pTextSource->awcBuffer + pTextSource->iCur;
|
|
pTextSource->iCur += (DWORD)(pwcStem - pStart);
|
|
|
|
fRomanWord = FALSE;
|
|
}
|
|
|
|
cwcTail = pTextSource->iEnd - pTextSource->iCur;
|
|
|
|
} while ( SUCCEEDED(pTextSource->pfnFillTextBuffer(pTextSource)) );
|
|
|
|
// Don't ignore the tail HPB
|
|
if (cwcTail > 0) {
|
|
|
|
iwcCurrent = pTextSource->iCur;
|
|
pwcInput = pTextSource->awcBuffer + pTextSource->iCur;
|
|
|
|
|
|
for (i=0; i< cwcTail; i++, pwcInput++) {
|
|
if (*(pwcInput) != 0) {
|
|
ct = GetCharType(*pwcInput);
|
|
|
|
if (ct == CH) {
|
|
if (!fRomanWord) {
|
|
pwcStem = pwcInput;
|
|
fRomanWord = TRUE;
|
|
}
|
|
}
|
|
else {
|
|
if (fRomanWord) {
|
|
DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
|
|
(pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
|
|
iwcCurrent + (i - cwcTemp));
|
|
fRomanWord = FALSE;
|
|
}
|
|
// else {
|
|
switch (ct) {
|
|
case PS:
|
|
(pWordSink->PutBreak)( WORDREP_BREAK_EOS );
|
|
case WS:
|
|
break;
|
|
default:
|
|
(pWordSink->PutWord)(1, pwcInput, 1, iwcCurrent + i);
|
|
break;
|
|
}
|
|
// }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// put the last English word
|
|
if (fRomanWord) {
|
|
DWORD cwcTemp = (DWORD)(pwcInput - pwcStem);
|
|
|
|
assert( cwcTemp > 0);
|
|
|
|
if ( 0 == *(pwcInput-1) ) {
|
|
i--;
|
|
cwcTemp--;
|
|
}
|
|
|
|
(pWordSink->PutWord)(cwcTemp, pwcStem, cwcTemp,
|
|
iwcCurrent + (i - cwcTemp));
|
|
|
|
fRomanWord = FALSE;
|
|
}
|
|
|
|
} __except(1) {
|
|
|
|
sc = E_UNEXPECTED;
|
|
}
|
|
|
|
return sc;
|
|
}
|
|
|