windows-nt/Source/XPSP1/NT/inetsrv/intlwb/enu/wordbreaker/custombreaking.cpp
2020-09-26 16:20:57 +08:00

323 lines
8.2 KiB
C++

////////////////////////////////////////////////////////////////////////////////
//
// Filename : CustomBreaking.cpp
// Purpose : Custom breaking definitions
//
// Project : WordBreakers
// Component: English word breaker
//
// Author : yairh
//
// Log:
//
// Jan 06 2000 yairh creation
// Apr 05 2000 dovh - Fixed two problematic debug / tracer buffer size
// problems. (Fix Bug 15449).
// May 07 2000 dovh - USE_WS_SENTINEL algorithm in BreakText
//
////////////////////////////////////////////////////////////////////////////////
#include "base.h"
#include "CustomBreaking.h"
#include "proparray.h"
#include "AutoPtr.h"
#include "excption.h"
#include "SpanishUtils.h"
#include "WbUtils.h"
#ifndef WHISTLER_BUILD
#include "LanguageResources_i.c"
#endif // WHISTLER_BUILD
//
// Global custom breaker instances, one per supported language, each wrapped in
// a CAutoClassPointer.
// NOTE(review): the Eng / EngUK / Frn / Spn / Itl prefixes presumably stand for
// English (US), English (UK), French, Spanish and Italian - confirm against
// the code that creates and assigns them, which is not part of this file.
//
CAutoClassPointer<CCustomBreaker> g_apEngCustomBreaker;
CAutoClassPointer<CCustomBreaker> g_apEngUKCustomBreaker;
CAutoClassPointer<CCustomBreaker> g_apFrnCustomBreaker;
CAutoClassPointer<CCustomBreaker> g_apSpnCustomBreaker;
CAutoClassPointer<CCustomBreaker> g_apItlCustomBreaker;
//
// CCustomWordTerm constructor
//
// Stores a private copy of the term text and records where its base form lies:
//   m_ulStartTxt - index of the first character that is not CUSTOM_PUNCT_HEAD
//   m_ulEndTxt   - one past the last character that is not CUSTOM_PUNCT_TAIL
//   m_ulLen      - length of the whole term, punctuation included
//
// Throws E_INVALIDARG (through THROW_HRESULT_EXCEPTION) when the term has no
// base form, i.e. it is empty or nothing remains once the head and tail
// punctuation are stripped.
//
CCustomWordTerm::CCustomWordTerm(const WCHAR* pwcs) :
    m_ulStartTxt(0),
    m_ulEndTxt(0),
    m_pwcs(NULL)
{
    const ULONG ulTermLen = wcslen(pwcs);

    // The copy is held by an auto pointer until the term is known to be valid,
    // so the throws below do not leak it.
    CAutoArrayPointer<WCHAR> apCopy;
    apCopy = new WCHAR[ulTermLen + 1];
    wcscpy(apCopy.Get(), pwcs);
    WCHAR* pwcsCopy = apCopy.Get();

    // Skip the leading punctuation.
    ULONG ulHead = 0;
    for (;
         (ulHead < ulTermLen) &&
         TEST_PROP(GET_PROP(pwcsCopy[ulHead]), CUSTOM_PUNCT_HEAD);
         ++ulHead)
    {
    }

    if (ulTermLen == ulHead)
    {
        // Nothing but head punctuation (or an empty string).
        THROW_HRESULT_EXCEPTION(E_INVALIDARG);
    }

    // Skip the trailing punctuation, walking backwards from the end.
    ULONG ulTail = ulTermLen;
    for (;
         ulTail &&
         TEST_PROP(GET_PROP(pwcsCopy[ulTail - 1]), CUSTOM_PUNCT_TAIL);
         --ulTail)
    {
    }

    if (ulTail <= ulHead)
    {
        // The tail scan ran into (or past) the head: no base form is left.
        THROW_HRESULT_EXCEPTION(E_INVALIDARG);
    }

    m_ulStartTxt = ulHead;
    m_ulEndTxt = ulTail;
    m_ulLen = ulTermLen;
    m_pwcs = apCopy.Detach();
}
//
// CCustomWordTerm::CheckWord
//
// Checks whether this term, including its head and tail punctuation, appears
// in pwcsBuf around a base word that has already been located.
//
// Parameters:
//   ulBufLen           - number of characters in pwcsBuf.
//   ulOffsetToBaseWord - index in pwcsBuf of the first base-word character.
//   ulBaseWordLen      - length of the base word in pwcsBuf.
//   pwcsBuf            - the text being examined.
//   pulMatchOffset     - receives the index in pwcsBuf where the match starts.
//   pulMatchLen        - receives the length of the match.
//
// Returns true and fills the two out parameters when every head punctuation
// character of the term precedes the base word in the buffer and every tail
// punctuation character follows it. Returns false otherwise and leaves the out
// parameters untouched. The base word characters themselves are not compared
// here.
//
bool CCustomWordTerm::CheckWord(
    const ULONG ulBufLen,
    ULONG ulOffsetToBaseWord,
    ULONG ulBaseWordLen,
    const WCHAR* pwcsBuf,
    ULONG* pulMatchOffset,
    ULONG* pulMatchLen)
{
    //
    // Head: walk backwards from the base word, matching the term's leading
    // punctuation against the characters that precede the base word in the
    // buffer. The characters compared are the ones just BEFORE the current
    // positions (index - 1), so that the term's first character (index 0) is
    // verified as well and the first base character is not re-checked.
    //
    ULONG ulStartTxt = m_ulStartTxt;

    while (ulOffsetToBaseWord &&
           ulStartTxt &&
           (m_pwcs[ulStartTxt - 1] == pwcsBuf[ulOffsetToBaseWord - 1]))
    {
        ulOffsetToBaseWord--;
        ulStartTxt--;
        ulBaseWordLen++;
    }

    if (ulStartTxt)
    {
        // Some head punctuation of the term is missing from the buffer.
        return false;
    }

    //
    // Tail: walk forwards from the end of the base word, matching the term's
    // trailing punctuation against the characters that follow in the buffer.
    //
    ULONG ulEndTxt = m_ulEndTxt;

    while ((ulEndTxt < m_ulLen) &&
           (ulOffsetToBaseWord + ulBaseWordLen < ulBufLen) &&
           (m_pwcs[ulEndTxt] == pwcsBuf[ulOffsetToBaseWord + ulBaseWordLen]))
    {
        ulEndTxt++;
        ulBaseWordLen++;
    }

    if (ulEndTxt != m_ulLen)
    {
        // Some tail punctuation of the term is missing from the buffer.
        return false;
    }

    *pulMatchOffset = ulOffsetToBaseWord;
    *pulMatchLen = ulBaseWordLen;
    return true;
}
void CCustomWordCollection::AddWord(const WCHAR* pwcs)
{
CAutoClassPointer<CCustomWordTerm> ap;
ap = new CCustomWordTerm(pwcs);
m_vaWordCollection[m_ulCount] = ap.Get();
m_ulCount++;
ap.Detach();
}
bool CCustomWordCollection::CheckWord(
const ULONG ulLen,
const ULONG ulOffsetToBaseWord,
const ULONG ulBaseWordLen,
const WCHAR* pwcsBuf,
ULONG* pulMatchOffset,
ULONG* pulMatchLen)
{
for (ULONG ul = 0; ul < m_ulCount; ul++)
{
bool fRet = m_vaWordCollection[ul]->CheckWord(
ulLen,
ulOffsetToBaseWord,
ulBaseWordLen,
pwcsBuf,
pulMatchOffset,
pulMatchLen);
if (fRet)
{
return true;
}
}
return false;
}
//
// CCustomBreaker constructor
//
// Loads the custom word list for the given locale into the trie.
//
// The file path is obtained from GetCustomWBFilePath(lcid, ...). When no path
// is available, or the file cannot be opened, the breaker is left empty
// (m_ulWordCount stays 0).
//
// The file is read one line at a time, one term per line. For each term the
// base form (the term without head / tail punctuation) is the trie key and the
// value is the collection of all terms that share that base form.
//
// Lines whose term has no base form (CCustomWordTerm throws E_INVALIDARG) are
// skipped. Any other CHresultException is rethrown to the caller.
//
// Note: m_ulWordCount counts every line read from the file, including empty
// and rejected ones.
// Note: a line longer than the read buffer is returned by fgetws in several
// pieces, and each piece is handled as a term of its own.
//
CCustomBreaker::CCustomBreaker(LCID lcid) :
    m_Trie(true),
    m_ulWordCount(0)
{
    CVarString vsPath;

    if (false == GetCustomWBFilePath(lcid, vsPath))
    {
        return;
    }

    CStandardCFile Words((LPWSTR)vsPath, L"r", false);
    if (!((FILE*)Words))
    {
        return;
    }

    const int cchLineBuf = 64;
    WCHAR pwcsBuf[cchLineBuf];

    while (fgetws(pwcsBuf, cchLineBuf, (FILE*)Words))
    {
        m_ulWordCount++;

        // Drop the trailing newline that fgetws keeps.
        ULONG ulLen = wcslen(pwcsBuf);
        if (ulLen && pwcsBuf[ulLen - 1] == L'\n')
        {
            pwcsBuf[ulLen - 1] = L'\0';
            ulLen--;
        }

        if (0 == ulLen)
        {
            continue;
        }

        try
        {
            // A fresh collection holding just this term; it is handed over to
            // the trie only if the base form is not there yet.
            CAutoClassPointer<CCustomWordCollection> apCollection = new CCustomWordCollection;
            apCollection->AddWord(pwcsBuf);

            // Cut the base form out of the line buffer in place. The term
            // keeps its own copy of the full text, so truncating pwcsBuf here
            // does not affect it.
            WCHAR* pwcsKey = pwcsBuf + apCollection->GetFirstWord()->GetTxtStart();
            pwcsBuf[apCollection->GetFirstWord()->GetTxtEnd()] = L'\0';

            CCustomWordCollection* pExistingCollection;
            DictStatus status = m_Trie.trie_Insert(
                                            pwcsKey,
                                            TRIE_DEFAULT,
                                            apCollection.Get(),
                                            &pExistingCollection);

            if (DICT_ITEM_ALREADY_PRESENT == status)
            {
                // Same base form as an earlier term: add this term to the
                // collection already in the trie. The temporary collection is
                // released by its auto pointer.
                pExistingCollection->AddWord(apCollection->GetFirstWord()->GetTxt());
            }
            else if (DICT_SUCCESS == status)
            {
                // The trie refers to the collection now.
                apCollection.Detach();
            }
        }
        catch (CHresultException& h)
        {
            if (E_INVALIDARG != (HRESULT)h)
            {
                // Rethrow the original exception object rather than a copy.
                throw;
            }

            // E_INVALIDARG: the line has no base form - skip it.
        }
    }
}
//
// The idea behind the algorithm is to store a list of special patterns that should not
// be broken. We also want to be able to recognize those patterns when a few punctuation
// characters are attached to them. For example, if .NET is a special pattern, then in the
// patterns (.NET) .NET! .NET? we also want to recognize the .NET pattern and emit .NET
// The next case is more complicated - NET!. The expected behavior is not to break it.
// So the algorithm needs to identify when a punctuation character is part of the token
// and must not be broken, and when it is just a breaker.
// The algorithm is:
// 1. Initialization.
// For each token in the file:
// a. Remove punctuation from the beginning and the end of the token - we will
// refer to the result as the base form of the token.
// b. Insert the base form into a dictionary. Each base form points to the
// token that generated it. Several tokens can map to the same base form
// (NET? and NET!), so each base form points to a collection of generating tokens.
// 2. Breaking.
// For each pattern taken from the document:
// a. Perform 1a.
// b. Look up the resulting base form in the dictionary.
// c. For each item in the collection, check whether the generating token exists in
// the pattern taken from the document.
//
//
// CCustomBreaker::BreakText
//
// Looks for a custom (non-breakable) term inside the first ulLen characters of
// pwcsBuf.
//
// Parameters:
//   ulLen     - number of characters of pwcsBuf to examine.
//   pwcsBuf   - the pattern taken from the document.
//   pulOutLen - receives the length of the matched term.
//   pulOffset - receives the offset of the matched term inside pwcsBuf.
//
// Returns true when a term was recognized (the out parameters are then set by
// the collection's CheckWord), false otherwise.
//
bool CCustomBreaker::BreakText(
    ULONG ulLen,
    WCHAR* pwcsBuf,
    ULONG* pulOutLen,
    ULONG* pulOffset)
{
    // Step over the leading punctuation to find where the base form starts.
    ULONG ulOffsetToBase = 0;
    for (;
         (ulOffsetToBase < ulLen) &&
         TEST_PROP(GET_PROP(pwcsBuf[ulOffsetToBase]), CUSTOM_PUNCT_HEAD);
         ++ulOffsetToBase)
    {
    }

    if (ulLen == ulOffsetToBase)
    {
        // Only head punctuation (or nothing at all).
        return false;
    }

    // Step back over the trailing punctuation to find where the base form ends.
    ULONG ulBaseEnd = ulLen;
    for (;
         ulBaseEnd &&
         TEST_PROP(GET_PROP(pwcsBuf[ulBaseEnd - 1]), CUSTOM_PUNCT_TAIL);
         --ulBaseEnd)
    {
    }

    if (ulBaseEnd <= ulOffsetToBase)
    {
        // No base form is left between the head and the tail punctuation.
        return false;
    }

    // Look the base form up in the trie. Only the hit count is examined; the
    // returned status is stored but not used.
    CCustomWordCollection* pCollection;
    short sCount = 0;
    DictStatus status;

    status = m_Trie.trie_Find(
                        pwcsBuf + ulOffsetToBase,
                        TRIE_LONGEST_MATCH,
                        1,
                        &pCollection,
                        &sCount);

    if (!sCount)
    {
        return false;
    }

    // Let the terms that share this base form check their punctuation against
    // the buffer. CheckWord takes the offset out parameter before the length.
    return pCollection->CheckWord(
                            ulLen,
                            ulOffsetToBase,
                            ulBaseEnd - ulOffsetToBase,
                            pwcsBuf,
                            pulOffset,
                            pulOutLen);
}