804 lines
26 KiB
C++
804 lines
26 KiB
C++
|
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
|
||
|
//
|
||
|
// Copyright (c) 2001 Microsoft Corporation. All rights reserved.
|
||
|
//
|
||
|
// Module:
|
||
|
// volcano/dll/lattice-lm.cpp
|
||
|
//
|
||
|
// Description:
|
||
|
// Interface between the recognizer and IFELang3. Unlike the rest of
|
||
|
// the recognizer this is written in C++ because IFELang3 uses a COM
|
||
|
// interface.
|
||
|
//
|
||
|
// Author:
|
||
|
// hrowley
|
||
|
//
|
||
|
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
|
||
|
|
||
|
// Conditional define for use of IFELang3, so it can be turned off for WinCE
|
||
|
#ifdef USE_IFELANG3
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include <limits.h>
|
||
|
|
||
|
// Make sure the GUIDs defined in imlang.h actually get instantiated
|
||
|
#define INITGUID
|
||
|
|
||
|
// Uncomment to dump out the lattice fed to the language model
|
||
|
//#define DUMP_LM_LATTICE
|
||
|
// Uncomment to append all the lattices into one file rather than overwriting.
|
||
|
//#define DUMP_LM_LATTICE_APPEND
|
||
|
#define LM_LATTICE_FILENAME L"c:/temp/cht-lm-log.txt"
|
||
|
// Uncomment this to get the dumped lattice in the form expected by the CHS test program
|
||
|
//#define LM_LATTICE_UNICODE
|
||
|
//#define LM_LATTICE_CHS
|
||
|
#define LM_LATTICE_CHT
|
||
|
|
||
|
#if defined(HWX_TUNE) || defined(DUMP_LM_LATTICE)
|
||
|
#include <stdio.h>
|
||
|
#endif
|
||
|
|
||
|
#include "volcanop.h"
|
||
|
|
||
|
// Japanese and IFELang3 generic stuff
|
||
|
#include "imlang.h"
|
||
|
|
||
|
// Simplified and traditional Chinese
|
||
|
#include "fel3user.h"
|
||
|
|
||
|
#ifdef USE_IFELANG3_BIGRAMS
|
||
|
#include "lattice-bigram.h"
|
||
|
#endif
|
||
|
|
||
|
// A global pointer to the IIMLanguage interface object
|
||
|
static IIMLanguage *g_pIFELang3=NULL;
|
||
|
|
||
|
// Each language has its own set of GUIDs used as command codes, so that the IIMLanguage interface can direct requests to
|
||
|
// the appropriate language model.
|
||
|
static int SetLanguageModelCommands(IMLANGUAGE_LM aLMITEM[3], IMLANGUAGE_LM_PARAM *sTuneParam, SImeLMParam *sHWTip1Param)
|
||
|
{
|
||
|
int nLMITEM = 0;
|
||
|
|
||
|
// Japanese
|
||
|
if (wcsicmp(g_szRecognizerLanguage,L"JPN")==0) {
|
||
|
/* aLMITEM[nLMITEM].guid = CLSID_MakeLatticeForChar;
|
||
|
aLMITEM[nLMITEM].pParam = NULL;
|
||
|
nLMITEM++;
|
||
|
aLMITEM[nLMITEM].guid = CLSID_AttachNGram;
|
||
|
aLMITEM[nLMITEM].pParam = NULL;
|
||
|
nLMITEM++;
|
||
|
aLMITEM[nLMITEM].guid = CLSID_SearchBestPath;
|
||
|
aLMITEM[nLMITEM].pParam = NULL;
|
||
|
nLMITEM++; */
|
||
|
aLMITEM[nLMITEM].guid = CLSID_JPN_IMELM_BBO_OS;
|
||
|
aLMITEM[nLMITEM].pParam = NULL;
|
||
|
nLMITEM++;
|
||
|
}
|
||
|
|
||
|
// For now, disable the use of the Chinese and Korean language models, since
|
||
|
// at least the Chinese language models currently are hurting accuracy.
|
||
|
|
||
|
// Simplified Chinese
|
||
|
if (wcsicmp(g_szRecognizerLanguage,L"CHS")==0) {
|
||
|
sHWTip1Param->dwLicenseId = CHINESE_IMELM_LICENSEID;
|
||
|
sHWTip1Param->flWeight = (float)(1.0);
|
||
|
sTuneParam->guid = GUID_CHINESE_IMELM_PARAM;
|
||
|
sTuneParam->dwSize = sizeof(SImeLMParam);
|
||
|
sTuneParam->pbData = (BYTE*)sHWTip1Param;
|
||
|
aLMITEM[nLMITEM].guid = CLSID_CHS_IMELM_BBO_OS;
|
||
|
aLMITEM[nLMITEM].pParam = sTuneParam;
|
||
|
nLMITEM++;
|
||
|
}
|
||
|
|
||
|
#if 0
|
||
|
// Traditional Chinese
|
||
|
if (wcsicmp(g_szRecognizerLanguage,L"CHT")==0) {
|
||
|
sHWTip1Param->dwLicenseId = CHINESE_IMELM_LICENSEID;
|
||
|
sHWTip1Param->flWeight = (float)(1.0);
|
||
|
sTuneParam->guid = GUID_CHINESE_IMELM_PARAM;
|
||
|
sTuneParam->dwSize = sizeof(SImeLMParam);
|
||
|
sTuneParam->pbData = (BYTE*)sHWTip1Param;
|
||
|
aLMITEM[nLMITEM].guid = CLSID_CHT_IMELM_BBO_OS;
|
||
|
aLMITEM[nLMITEM].pParam = sTuneParam;
|
||
|
nLMITEM++;
|
||
|
}
|
||
|
|
||
|
// Korean
|
||
|
if (wcsicmp(g_szRecognizerLanguage,L"KOR")==0) {
|
||
|
aLMITEM[nLMITEM].guid = CLSID_KOR_IMELM_BBO_OS;
|
||
|
aLMITEM[nLMITEM].pParam = NULL;
|
||
|
nLMITEM++;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
// Return the number of command codes, or 0 if there was no match for the current language
|
||
|
return nLMITEM;
|
||
|
}
|
||
|
|
||
|
// This function tests the IIMLanguage interface to see if the appropriate language model is
|
||
|
// installed.
|
||
|
BOOL TryDummyLattice()
|
||
|
{
|
||
|
IMLANGUAGE_LM_PARAM sTuneParam;
|
||
|
SImeLMParam sHWTip1Param;
|
||
|
IMLANGUAGE_LM aLMITEM[3];
|
||
|
int nLMITEM = 0;
|
||
|
HRESULT rc;
|
||
|
|
||
|
// Set return code to look like an error
|
||
|
rc = E_FAIL;
|
||
|
|
||
|
// Set up the language specific commands
|
||
|
nLMITEM = SetLanguageModelCommands(aLMITEM, &sTuneParam, &sHWTip1Param);
|
||
|
|
||
|
if (nLMITEM>0) {
|
||
|
// If there are any commands for that language, then apply them to the lattice.
|
||
|
rc = g_pIFELang3->GetLatticeMorphResult(nLMITEM, aLMITEM, 0, NULL, 0, NULL);
|
||
|
}
|
||
|
|
||
|
return (SUCCEEDED(rc));
|
||
|
}
|
||
|
|
||
|
// Load up IIMLanguage and check if it has a language model for this recognizer's language.
|
||
|
// If successful, returns TRUE, otherwise returns FALSE.
|
||
|
BOOL LatticeConfigIFELang3()
|
||
|
{
|
||
|
if (g_pIFELang3 != NULL)
|
||
|
{
|
||
|
return FALSE;
|
||
|
}
|
||
|
HRESULT res = CoCreateInstance(CLSID_IMLanguage_OS, NULL, CLSCTX_INPROC_SERVER, IID_IMLanguage, (void**)&g_pIFELang3);
|
||
|
// TPDBG_DMSG2("%08X: CoCreate(CLSID_IMLanguage_OS) -> %08X\n", GetCurrentThreadId(), res);
|
||
|
if (SUCCEEDED(res)) {
|
||
|
if (TryDummyLattice()) {
|
||
|
// TPDBG_DMSG1("%08X: TryDummyLattice() -> TRUE\n", GetCurrentThreadId());
|
||
|
return TRUE;
|
||
|
}
|
||
|
}
|
||
|
// Clean up if things didn't work.
|
||
|
// TPDBG_DMSG1("%08X: TryDummyLattice() -> FALSE\n", GetCurrentThreadId());
|
||
|
LatticeUnconfigIFELang3();
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
// Check whether IIMLanguage is loaded.
|
||
|
BOOL LatticeIFELang3Available()
|
||
|
{
|
||
|
return (g_pIFELang3!=NULL);
|
||
|
}
|
||
|
|
||
|
// Unload IIMLanguage if it is loaded.
|
||
|
BOOL LatticeUnconfigIFELang3()
|
||
|
{
|
||
|
if (g_pIFELang3!=NULL) {
|
||
|
// TPDBG_DMSG1("%08X: g_pIFELang3->Release()\n", GetCurrentThreadId());
|
||
|
g_pIFELang3->Release();
|
||
|
g_pIFELang3=NULL;
|
||
|
}
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
// qsort: compare the start time of element
|
||
|
int __cdecl CompareElemTime(const void *ps1, const void *ps2)
|
||
|
{
|
||
|
return ((IMLANGUAGE_ELEMENT*)ps1)->dwFrameStart -
|
||
|
((IMLANGUAGE_ELEMENT*)ps2)->dwFrameStart;
|
||
|
}
|
||
|
|
||
|
BOOL ProbIsBad(LATTICE *lat, float flProb)
|
||
|
{
|
||
|
if (lat->fUseGuide)
|
||
|
{
|
||
|
return (flProb < g_vtuneInfo.pTune->flStringHwxThreshold);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
return (flProb < g_vtuneInfo.pTune->flFreeHwxThreshold);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Figure out which strokes are valid end of character in paths that span the
|
||
|
// whole lattice.
|
||
|
static BOOL CheckIfValid(LATTICE *lat, int iStroke, BYTE *pValidEnd)
|
||
|
{
|
||
|
LATTICE_ALT_LIST *pAlts;
|
||
|
int ii;
|
||
|
BOOL fValidPath;
|
||
|
|
||
|
// Have we already checked this position?
|
||
|
if (pValidEnd[iStroke] == LCF_INVALID) {
|
||
|
return FALSE;
|
||
|
} else if (pValidEnd[iStroke] == LCF_VALID) {
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
// Assume no valid path, until proved otherwise.
|
||
|
fValidPath = FALSE;
|
||
|
|
||
|
// Process each element ending at this position in lattice.
|
||
|
pAlts = lat->pAltList + iStroke;
|
||
|
for (ii = 0; ii < pAlts->nUsed; ++ii) {
|
||
|
LATTICE_ELEMENT *pElem;
|
||
|
int prevEnd;
|
||
|
|
||
|
pElem = pAlts->alts + ii;
|
||
|
prevEnd = iStroke - pElem->nStrokes;
|
||
|
ASSERT(prevEnd >= -1);
|
||
|
if (ProbIsBad(lat, pElem->logProb)) {
|
||
|
// Pretend that the really bad scores don't exist!
|
||
|
// This test MUST match code building up lattice to use!
|
||
|
} else if (prevEnd < 0) {
|
||
|
// Reached start.
|
||
|
fValidPath = TRUE;
|
||
|
} else if (CheckIfValid(lat, prevEnd, pValidEnd)) {
|
||
|
// Have path to start.
|
||
|
fValidPath = TRUE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Did one (or more) paths reach the start?
|
||
|
if (fValidPath) {
|
||
|
pValidEnd[iStroke] = LCF_VALID;
|
||
|
return TRUE;
|
||
|
} else {
|
||
|
pValidEnd[iStroke] = LCF_INVALID;
|
||
|
return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Apply language model to produce a better current path
|
||
|
void ApplyLanguageModel(LATTICE *lat, wchar_t *wszCorrectAnswer)
|
||
|
{
|
||
|
DWORD dwNeutralSize;
|
||
|
IMLANGUAGE_ELEMENT *pLattice;
|
||
|
IMLANGUAGE_INFO *pDataList;
|
||
|
#ifdef USE_IFELANG3_BIGRAMS
|
||
|
DWORD dwBigramSize;
|
||
|
LATTICE_BIGRAM_INFO_LIST *pBigramInfo;
|
||
|
#endif
|
||
|
int cElem, cData;
|
||
|
int cLattice;
|
||
|
DWORD cElemOut;
|
||
|
IMLANGUAGE_ELEMENT *pElemOut;
|
||
|
IMLANGUAGE_LM_PARAM sTuneParam;
|
||
|
SImeLMParam sHWTip1Param;
|
||
|
IMLANGUAGE_LM aLMITEM[3];
|
||
|
IMLANGUAGE_INFO_NEUTRAL1 *pNeutral;
|
||
|
BYTE *pValidEnd;
|
||
|
int size;
|
||
|
int nLMITEM;
|
||
|
|
||
|
DWORD ii, jj;
|
||
|
HRESULT rc;
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
FILE *f;
|
||
|
#endif
|
||
|
int iStroke, iAlt;
|
||
|
|
||
|
int cPreContext = 0, cPostContext = 0;
|
||
|
// DebugBreak();
|
||
|
// return;
|
||
|
|
||
|
// fprintf(stderr,"ApplyLanguageModel() fUseIFELang3=%d\n", lat->fUseIFELang3);
|
||
|
ASSERT(lat!=NULL);
|
||
|
if (!LatticeIFELang3Available()) return;
|
||
|
if (!lat->fUseLM || !lat->fUseIFELang3) return;
|
||
|
if (lat->nStrokes==0) return;
|
||
|
|
||
|
// Check if there are any alphabetic characters in the best path. If so, skip IFELang3
|
||
|
for (iStroke = 0; iStroke < lat->nStrokes; iStroke++)
|
||
|
{
|
||
|
for (iAlt = 0; iAlt < lat->pAltList[iStroke].nUsed; iAlt++)
|
||
|
{
|
||
|
if (lat->pAltList[iStroke].alts[iAlt].fCurrentPath)
|
||
|
{
|
||
|
wchar_t dch = lat->pAltList[iStroke].alts[iAlt].wChar;
|
||
|
if (dch != SYM_UNKNOWN)
|
||
|
{
|
||
|
wchar_t wch = LocRunDense2Unicode(&g_locRunInfo, dch);
|
||
|
if ((wch >= L'a' && wch <= L'z') || (wch >= L'A' && wch <= L'Z'))
|
||
|
{
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Check if we have a character of context that needs to be included in the lattice
|
||
|
if (lat->wszBefore != NULL)
|
||
|
cPreContext = wcslen(lat->wszBefore);
|
||
|
if (lat->wszAfter != NULL)
|
||
|
cPostContext = wcslen(lat->wszAfter);
|
||
|
|
||
|
// How big does each lattice element need to be? The structure includes space for
|
||
|
// one character, and it needs to be followed by 2 NUL characters.
|
||
|
dwNeutralSize = sizeof(IMLANGUAGE_INFO_NEUTRAL1) + sizeof(wchar_t)*2;
|
||
|
|
||
|
// Figure out valid character transition points by stroke. We don't wan't to include
|
||
|
// pieces of lattice that don't actually connect up for the full run. This uses
|
||
|
// a recursive algorithm to find valid paths. Since it marks the paths as it
|
||
|
// finds them, it can quickly test and not redo work.
|
||
|
size = sizeof(BYTE) * lat->nStrokes;
|
||
|
pValidEnd = (BYTE *)ExternAlloc(size);
|
||
|
// If we failed to allocate memory, just return
|
||
|
if (pValidEnd==NULL) return;
|
||
|
|
||
|
memset(pValidEnd, 0, size);
|
||
|
if (!CheckIfValid(lat, lat->nStrokes - 1, pValidEnd)) {
|
||
|
// No valid paths found?!?!
|
||
|
// This can happen... so clean up and return.
|
||
|
// We'll just get the best path selected by the HWX engine.
|
||
|
goto allocated_valid;
|
||
|
}
|
||
|
|
||
|
// First allocate space to build the lattice up in.
|
||
|
cElem = lat->nStrokes * MaxAltsPerStroke + (cPreContext + cPostContext);
|
||
|
cData = cElem;
|
||
|
#ifdef USE_IFELANG3_BIGRAMS
|
||
|
cData += lat->nStrokes*MaxAltsPerStroke;
|
||
|
#endif
|
||
|
pDataList = (IMLANGUAGE_INFO*)ExternAlloc(sizeof(IMLANGUAGE_INFO)*cData);
|
||
|
if (pDataList==NULL) goto allocated_valid;
|
||
|
pLattice = (IMLANGUAGE_ELEMENT*)ExternAlloc(sizeof(IMLANGUAGE_ELEMENT)*cElem);
|
||
|
if (pLattice==NULL) goto allocated_data;
|
||
|
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
#ifdef DUMP_LM_LATTICE_APPEND
|
||
|
f=_wfopen(LM_LATTICE_FILENAME,L"ab");
|
||
|
#else
|
||
|
f=_wfopen(LM_LATTICE_FILENAME,L"wb");
|
||
|
#endif
|
||
|
fwprintf(f, L"<S>\r\n\r\n");
|
||
|
#endif
|
||
|
|
||
|
#ifdef HWX_TUNE
|
||
|
if (g_pTuneFile != NULL)
|
||
|
{
|
||
|
for (int i = 0; i < (int) wcslen(wszCorrectAnswer); i++)
|
||
|
{
|
||
|
if (i > 0)
|
||
|
{
|
||
|
fprintf(g_pTuneFile, " ");
|
||
|
}
|
||
|
fprintf(g_pTuneFile, "%04X", wszCorrectAnswer[i]);
|
||
|
}
|
||
|
fprintf(g_pTuneFile, "\n");
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
cLattice = 0;
|
||
|
cData = 0;
|
||
|
if (lat->wszBefore != NULL)
|
||
|
{
|
||
|
int iSrc;
|
||
|
int iFrame = 1;
|
||
|
for (iSrc = wcslen(lat->wszBefore) - 1; iSrc >= 0; iSrc--)
|
||
|
{
|
||
|
// If we have a context character, then create a lattice element for it.
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1*)ExternAlloc(dwNeutralSize);
|
||
|
if (pNeutral == NULL) goto allocated_elements;
|
||
|
pNeutral->dwUnigram = 0;
|
||
|
pNeutral->fHypothesis = FALSE;
|
||
|
pNeutral->wsz[0] = LocRunDense2Unicode(&g_locRunInfo, lat->wszBefore[iSrc]);
|
||
|
pNeutral->wsz[1] = 0;
|
||
|
pNeutral->wsz[2] = 0;
|
||
|
|
||
|
pDataList[cData].guid = GUID_IMLANGUAGE_INFO_NEUTRAL1;
|
||
|
pDataList[cData].dwSize = dwNeutralSize;
|
||
|
pDataList[cData].pbData = (BYTE*)pNeutral;
|
||
|
|
||
|
pLattice[cLattice].dwFrameStart = iFrame++;
|
||
|
pLattice[cLattice].dwFrameLen = 1;
|
||
|
pLattice[cLattice].dwTotalInfo = 1;
|
||
|
pLattice[cLattice].pInfo = pDataList + cLattice;
|
||
|
|
||
|
#ifdef HWX_TUNE
|
||
|
if (g_pTuneFile != NULL)
|
||
|
{
|
||
|
fprintf(g_pTuneFile, "%04X %d %d %f\n",
|
||
|
pNeutral->wsz[0],
|
||
|
pLattice[cLattice].dwFrameStart,
|
||
|
pLattice[cLattice].dwFrameLen,
|
||
|
0.0);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
if (f!=NULL) {
|
||
|
char sz[256];
|
||
|
ZeroMemory( sz, 256 );
|
||
|
#if defined(LM_LATTICE_CHS)
|
||
|
WideCharToMultiByte( 936, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_CHT)
|
||
|
WideCharToMultiByte( 950, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_JPN)
|
||
|
WideCharToMultiByte( 932, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#endif
|
||
|
#if defined(LM_LATTICE_CHS) || defined(LM_LATTICE_CHT)
|
||
|
fwprintf(f, L"%s, %d, %d, %d\r\n",
|
||
|
pNeutral->wsz, pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram);
|
||
|
#elif defined(LM_LATTICE_UNICODE)
|
||
|
fwprintf(f, L"%3d %2d %10d %s U+%04X\r\n",
|
||
|
pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram,
|
||
|
pNeutral->wsz, pNeutral->wsz[0] );
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|
||
|
cData++;
|
||
|
cLattice++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Now process each position.
|
||
|
for (iStroke=0; iStroke<lat->nStrokes; iStroke++) {
|
||
|
// Do we actually want any paths that end at this location?
|
||
|
if (pValidEnd[iStroke] != LCF_VALID) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// int max=5;
|
||
|
// if (iStroke==0) max=2;
|
||
|
|
||
|
for (iAlt=0; iAlt<lat->pAltList[iStroke].nUsed; iAlt++) if (1 /*iAlt<max || lat->pAltList[iStroke].alts[iAlt].fCurrentPath*/) {
|
||
|
wchar_t unicode;
|
||
|
float flScore;
|
||
|
|
||
|
// Prune really bad scores. This MUST match test in path
|
||
|
// checking code!
|
||
|
if (ProbIsBad(lat, lat->pAltList[iStroke].alts[iAlt].logProb)) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Fill in lattice element
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1*)ExternAlloc(dwNeutralSize);
|
||
|
if (pNeutral==NULL) goto allocated_elements;
|
||
|
flScore = -lat->pAltList[iStroke].alts[iAlt].logProb;
|
||
|
if (lat->fUseGuide)
|
||
|
{
|
||
|
flScore *= g_vtuneInfo.pTune->flStringHwxWeight;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
flScore *= g_vtuneInfo.pTune->flFreeHwxWeight * lat->pAltList[iStroke].alts[iAlt].nStrokes;
|
||
|
}
|
||
|
if (flScore < 0) flScore = (float)0;
|
||
|
if (flScore > INT_MAX) flScore = (float)INT_MAX;
|
||
|
pNeutral->dwUnigram = (DWORD)(flScore);
|
||
|
if (lat->pAltList[iStroke].alts[iAlt].wChar==SYM_UNKNOWN)
|
||
|
{
|
||
|
unicode=L' ';
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
unicode = LocRunDense2Unicode(
|
||
|
&g_locRunInfo, lat->pAltList[iStroke].alts[iAlt].wChar
|
||
|
);
|
||
|
}
|
||
|
pNeutral->fHypothesis = FALSE;
|
||
|
pNeutral->wsz[0] = unicode;
|
||
|
pNeutral->wsz[1] = 0;
|
||
|
pNeutral->wsz[2] = 0;
|
||
|
// Check for EUDC codes, which we shouldn't be generating (for debugging)
|
||
|
// if (pNeutral->wsz[0]>=0xE000 && pNeutral->wsz[0]<0xF900) {
|
||
|
// fprintf(stderr,"EUDC code in lattice (stroke %d alt %d): U+%04X\n",iStroke,iAlt,pNeutral->wsz[0]);
|
||
|
// }
|
||
|
|
||
|
pLattice[cLattice].dwFrameStart = (iStroke - lat->pAltList[iStroke].alts[iAlt].nStrokes) + 2 + cPreContext;
|
||
|
pLattice[cLattice].dwFrameLen=lat->pAltList[iStroke].alts[iAlt].nStrokes;
|
||
|
pLattice[cLattice].dwTotalInfo = 1;
|
||
|
pLattice[cLattice].pInfo = pDataList + cData;
|
||
|
|
||
|
pDataList[cData].guid = GUID_IMLANGUAGE_INFO_NEUTRAL1;
|
||
|
pDataList[cData].dwSize = dwNeutralSize;
|
||
|
pDataList[cData].pbData = (BYTE*)pNeutral;
|
||
|
cData++;
|
||
|
|
||
|
#ifdef HWX_TUNE
|
||
|
if (g_pTuneFile != NULL)
|
||
|
{
|
||
|
fprintf(g_pTuneFile, "%04X %d %d %f\n",
|
||
|
pNeutral->wsz[0],
|
||
|
pLattice[cLattice].dwFrameStart,
|
||
|
pLattice[cLattice].dwFrameLen,
|
||
|
lat->pAltList[iStroke].alts[iAlt].logProb);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef USE_IFELANG3_BIGRAMS
|
||
|
lat->pAltList[iStroke].alts[iAlt].indexIFELang3=cLattice;
|
||
|
if (lat->pAltList[iStroke].alts[iAlt].nBigrams>0) {
|
||
|
dwBigramSize=sizeof(LATTICE_BIGRAM_INFO_LIST)+
|
||
|
sizeof(LATTICE_BIGRAM_INFO)*(lat->pAltList[iStroke].alts[iAlt].nBigrams-1);
|
||
|
pBigramInfo = (LATTICE_BIGRAM_INFO_LIST*)ExternAlloc(dwBigramSize);
|
||
|
if (pBigramInfo==NULL) goto allocated_elements;
|
||
|
|
||
|
int iPrevStart=iStroke-lat->pAltList[iStroke].alts[iAlt].nStrokes;
|
||
|
for (int i=0; i<lat->pAltList[iStroke].alts[iAlt].nBigrams; i++) {
|
||
|
pBigramInfo->bigrams[i].dwBigram=(DWORD)(lat->pAltList[iStroke].alts[iAlt].bigramLogProbs[i]);
|
||
|
pBigramInfo->bigrams[i].dwPrevElement=
|
||
|
lat->pAltList[iPrevStart].alts[lat->pAltList[iStroke].alts[iAlt].bigramAlts[i]].indexIFELang3;
|
||
|
}
|
||
|
|
||
|
pDataList[cData].guid = GUID_LATTICE_BIGRAM_INFO_LIST;
|
||
|
pDataList[cData].dwSize = dwBigramSize;
|
||
|
pDataList[cData].pbData = (BYTE*)pBigramInfo;
|
||
|
cData++;
|
||
|
pLattice[cLattice].dwTotalInfo++;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
if (f!=NULL) {
|
||
|
char sz[256];
|
||
|
ZeroMemory( sz, 256 );
|
||
|
#if defined(LM_LATTICE_CHS)
|
||
|
WideCharToMultiByte( 936, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_CHT)
|
||
|
WideCharToMultiByte( 950, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_JPN)
|
||
|
WideCharToMultiByte( 932, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#endif
|
||
|
// fprintf(f,"%3d %2d %10d %s\n",pLattice[cLattice].dwFrameStart,pLattice[cLattice].dwFrameLen,pNeutral->dwUnigram,sz);
|
||
|
// fprintf(f,"%3d %2d %10d %s U+%04X\n",pLattice[cLattice].dwFrameStart,pLattice[cLattice].dwFrameLen,pNeutral->dwUnigram,sz,pNeutral->wsz[0]);
|
||
|
#if defined(LM_LATTICE_CHS) || defined(LM_LATTICE_CHT)
|
||
|
fwprintf(f, L"%s, %d, %d, %d\r\n",
|
||
|
pNeutral->wsz, pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram);
|
||
|
#elif defined(LM_LATTICE_UNICODE)
|
||
|
fwprintf(f, L"%3d %2d %10d %s U+%04X\r\n",
|
||
|
pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram,
|
||
|
pNeutral->wsz, pNeutral->wsz[0] );
|
||
|
#endif
|
||
|
#ifdef USE_IFELANG3_BIGRAMS
|
||
|
fprintf(f,"%d",lat->pAltList[iStroke].alts[iAlt].nBigrams);
|
||
|
int iPrevStart=iStroke-lat->pAltList[iStroke].alts[iAlt].nStrokes;
|
||
|
for (int i=0; i<lat->pAltList[iStroke].alts[iAlt].nBigrams; i++) {
|
||
|
fprintf(f," %d %d\n",
|
||
|
lat->pAltList[iPrevStart].alts[lat->pAltList[iStroke].alts[iAlt].bigramAlts[i]].indexIFELang3,
|
||
|
lat->pAltList[iStroke].alts[iAlt].bigramLogProbs);
|
||
|
}
|
||
|
fprintf(f,"\n");
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|
||
|
cLattice++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (lat->wszAfter != NULL)
|
||
|
{
|
||
|
int iFrame = pLattice[cLattice].dwFrameStart = (lat->nStrokes - 1) + 2 + cPreContext;
|
||
|
for (int iSrc = 0; iSrc < (int) wcslen(lat->wszAfter); iSrc++)
|
||
|
{
|
||
|
// If we have a context character, then create a lattice element for it.
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1*)ExternAlloc(dwNeutralSize);
|
||
|
if (pNeutral == NULL) goto allocated_elements;
|
||
|
pNeutral->dwUnigram = 0;
|
||
|
pNeutral->fHypothesis = FALSE;
|
||
|
pNeutral->wsz[0] = LocRunDense2Unicode(&g_locRunInfo, lat->wszAfter[iSrc]);
|
||
|
pNeutral->wsz[1] = 0;
|
||
|
pNeutral->wsz[2] = 0;
|
||
|
|
||
|
pDataList[cData].guid = GUID_IMLANGUAGE_INFO_NEUTRAL1;
|
||
|
pDataList[cData].dwSize = dwNeutralSize;
|
||
|
pDataList[cData].pbData = (BYTE*)pNeutral;
|
||
|
|
||
|
pLattice[cLattice].dwFrameStart = iFrame++;
|
||
|
pLattice[cLattice].dwFrameLen = 1;
|
||
|
pLattice[cLattice].dwTotalInfo = 1;
|
||
|
pLattice[cLattice].pInfo = pDataList + cLattice;
|
||
|
|
||
|
#ifdef HWX_TUNE
|
||
|
if (g_pTuneFile != NULL)
|
||
|
{
|
||
|
fprintf(g_pTuneFile, "%04X %d %d %f\n",
|
||
|
pNeutral->wsz[0],
|
||
|
pLattice[cLattice].dwFrameStart,
|
||
|
pLattice[cLattice].dwFrameLen,
|
||
|
0.0);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
if (f!=NULL) {
|
||
|
char sz[256];
|
||
|
ZeroMemory( sz, 256 );
|
||
|
#if defined(LM_LATTICE_CHS)
|
||
|
WideCharToMultiByte( 936, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_CHT)
|
||
|
WideCharToMultiByte( 950, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#elif defined(LM_LATTICE_JPN)
|
||
|
WideCharToMultiByte( 932, 0, pNeutral->wsz, -1, sz, 256, 0, 0 );
|
||
|
#endif
|
||
|
#if defined(LM_LATTICE_CHS) || defined(LM_LATTICE_CHT)
|
||
|
fwprintf(f, L"%s, %d, %d, %d\r\n",
|
||
|
pNeutral->wsz, pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram);
|
||
|
#elif defined(LM_LATTICE_UNICODE)
|
||
|
fwprintf(f, L"%3d %2d %10d %s U+%04X\r\n",
|
||
|
pLattice[cLattice].dwFrameStart, pLattice[cLattice].dwFrameLen, pNeutral->dwUnigram,
|
||
|
pNeutral->wsz, pNeutral->wsz[0] );
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|
||
|
cData++;
|
||
|
cLattice++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#ifdef DUMP_LM_LATTICE
|
||
|
if (f!=NULL) {
|
||
|
fwprintf(f, L"</S>\r\n");
|
||
|
fclose(f);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#ifdef HWX_TUNE
|
||
|
if (g_pTuneFile != NULL)
|
||
|
{
|
||
|
fprintf(g_pTuneFile, "\n");
|
||
|
fflush(g_pTuneFile);
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
// define "batch" for IIMLanguage
|
||
|
nLMITEM = SetLanguageModelCommands(aLMITEM, &sTuneParam, &sHWTip1Param);
|
||
|
|
||
|
// Apply IIMLanguage
|
||
|
rc = g_pIFELang3->GetLatticeMorphResult(
|
||
|
nLMITEM, aLMITEM,
|
||
|
cLattice, pLattice, &cElemOut, &pElemOut
|
||
|
);
|
||
|
|
||
|
// TPDBG_DMSG2("%08X: GetLatticeMorphResult -> %08X\n", GetCurrentThreadId(), rc);
|
||
|
|
||
|
if (SUCCEEDED(rc)) {
|
||
|
// Let's validate what came back from IFELang3. The main array of
|
||
|
// results must be writable, so we can sort it later.
|
||
|
if (cElemOut <= 0 || IsBadWritePtr(pElemOut, cElemOut * sizeof(*pElemOut)))
|
||
|
{
|
||
|
ASSERT(("IFELang3 pElemOut not readable and writable", FALSE));
|
||
|
goto free_lattice;
|
||
|
}
|
||
|
// Then validate each of the pointers in the above array
|
||
|
for (ii = 0; ii < cElemOut; ii++)
|
||
|
{
|
||
|
IMLANGUAGE_ELEMENT *pElem = pElemOut + ii;
|
||
|
// If the element doesn't have any info items in it, then skip it.
|
||
|
if (pElem->dwTotalInfo <= 0)
|
||
|
continue;
|
||
|
// Otherwise validate the array of info items
|
||
|
if (IsBadReadPtr(pElem->pInfo, pElem->dwTotalInfo * sizeof(*(pElem->pInfo))))
|
||
|
{
|
||
|
ASSERT(("IFELang3 pElemOut->pInfo not readable", FALSE));
|
||
|
goto free_lattice;
|
||
|
}
|
||
|
for (jj = 0; jj < pElem->dwTotalInfo; jj++)
|
||
|
{
|
||
|
// Make sure the info is readable and the size it is supposed to be
|
||
|
if (IsBadReadPtr(pElem->pInfo[jj].pbData, pElem->pInfo[jj].dwSize))
|
||
|
{
|
||
|
ASSERT(("IFELang3 pElemOut->pInfo->pbData not readable", FALSE));
|
||
|
goto free_lattice;
|
||
|
}
|
||
|
// We only care about info of this type
|
||
|
if (pElem->pInfo[jj].guid == GUID_IMLANGUAGE_INFO_NEUTRAL1)
|
||
|
{
|
||
|
// Get the pointer to the actual info.
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1 *) pElem->pInfo[jj].pbData;
|
||
|
// cLattice + 1 is a very loose upper bound on the number of characters
|
||
|
// which should be returned; then use the string length to check that
|
||
|
// the specified size is correct.
|
||
|
if (IsBadStringPtrW(pNeutral->wsz, cLattice + 1) ||
|
||
|
sizeof(IMLANGUAGE_INFO_NEUTRAL1) +
|
||
|
sizeof(WCHAR) * (wcslen(pNeutral->wsz) + 1) > pElem->pInfo[jj].dwSize)
|
||
|
{
|
||
|
ASSERT(("IFELang3 pElemOut->pInfo->pbData->wsz not readable", FALSE));
|
||
|
goto free_lattice;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If the language model worked,
|
||
|
// trace the path that came back from the model.
|
||
|
int cChars = 0;
|
||
|
wchar_t *wszBestPath = NULL;
|
||
|
|
||
|
// Sort the output lattice elements into time order
|
||
|
qsort(pElemOut, cElemOut, sizeof(IMLANGUAGE_ELEMENT), CompareElemTime);
|
||
|
|
||
|
// Count the number of characters in the path
|
||
|
for (ii=0; ii<cElemOut; ii++) {
|
||
|
IMLANGUAGE_ELEMENT *pElem = pElemOut + ii;
|
||
|
for (jj = 0; jj < pElem->dwTotalInfo; jj++ ) {
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1 *) pElem->pInfo[jj].pbData;
|
||
|
if (pElem->pInfo[jj].guid==GUID_IMLANGUAGE_INFO_NEUTRAL1) {
|
||
|
/* FILE *f=fopen("/log.txt","a+");
|
||
|
fprintf(f,"element %d has %d chars hypothesis %d\n",cChars,wcslen(pNeutral->wsz),pNeutral->fHypothesis);
|
||
|
fclose(f); */
|
||
|
cChars += wcslen(pNeutral->wsz);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Allocate space for the path
|
||
|
wszBestPath = (wchar_t*)ExternAlloc(sizeof(wchar_t)*(cChars+1));
|
||
|
|
||
|
// If we failed, then just don't update the path
|
||
|
if (wszBestPath != NULL) {
|
||
|
// Copy the path out of the lattice
|
||
|
wszBestPath[0] = 0;
|
||
|
for (ii=0; ii<cElemOut; ii++) {
|
||
|
IMLANGUAGE_ELEMENT *pElem = pElemOut + ii;
|
||
|
for (jj = 0; jj < pElem->dwTotalInfo; jj++ ) {
|
||
|
pNeutral = (IMLANGUAGE_INFO_NEUTRAL1 *) pElem->pInfo[jj].pbData;
|
||
|
if (pElem->pInfo[jj].guid==GUID_IMLANGUAGE_INFO_NEUTRAL1)
|
||
|
wcscat(wszBestPath, pNeutral->wsz);
|
||
|
}
|
||
|
}
|
||
|
// Wipe out the post-context character(s)
|
||
|
wszBestPath[wcslen(wszBestPath) - cPostContext] = 0;
|
||
|
// Skip the pre-context character(s)
|
||
|
wszBestPath += cPreContext;
|
||
|
|
||
|
// Trace the string through the original lattice, using the
|
||
|
// same code that is used for the separator.
|
||
|
int nSubs=SearchForTargetResultInternal(lat, wszBestPath);
|
||
|
// fprintf(stderr, "Searching for target, nSubs=%d\n", nSubs);
|
||
|
// Put the context character(s) back
|
||
|
wszBestPath -= cPreContext;
|
||
|
if (nSubs!=0) {
|
||
|
// FILE *f = fopen("c:/log.txt", "a");
|
||
|
// fprintf(f, "Lost path with %d/%d/%d chars\n",
|
||
|
// cPreContext, wcslen(wszBestPath) - cPreContext, cPostContext);
|
||
|
// fclose(f);
|
||
|
} else {
|
||
|
// FILE *f = fopen("c:/log.txt", "a");
|
||
|
// fprintf(f, "Found path with %d/%d/%d chars\n",
|
||
|
// cPreContext, wcslen(wszBestPath) - cPreContext, cPostContext);
|
||
|
// fclose(f);
|
||
|
}
|
||
|
ExternFree(wszBestPath);
|
||
|
}
|
||
|
|
||
|
free_lattice:
|
||
|
// And free the memory allocated for us by IIMLanguage.
|
||
|
if (cElemOut > 0 && !IsBadReadPtr(pElemOut, cElemOut * sizeof(*pElemOut)))
|
||
|
{
|
||
|
for (ii=0; ii < cElemOut; ii++) {
|
||
|
IMLANGUAGE_ELEMENT *pElem = pElemOut + ii;
|
||
|
if (!IsBadReadPtr(pElem->pInfo, sizeof(*(pElem->pInfo)) * pElem->dwTotalInfo))
|
||
|
{
|
||
|
// Free each data item in the element.
|
||
|
for (jj = 0; jj < pElem->dwTotalInfo; jj++)
|
||
|
{
|
||
|
if (!IsBadReadPtr(pElem->pInfo[jj].pbData, pElem->pInfo[jj].dwSize))
|
||
|
{
|
||
|
CoTaskMemFree(pElem->pInfo[jj].pbData);
|
||
|
}
|
||
|
}
|
||
|
// Free the element array itself.
|
||
|
CoTaskMemFree(pElem->pInfo);
|
||
|
}
|
||
|
}
|
||
|
CoTaskMemFree(pElemOut);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Free all the memory we allocated.
|
||
|
allocated_elements:
|
||
|
for (ii = 0; ii < (DWORD)cData; ++ii) {
|
||
|
if (pDataList[ii].pbData!=NULL)
|
||
|
ExternFree(pDataList[ii].pbData);
|
||
|
}
|
||
|
ExternFree(pLattice);
|
||
|
allocated_data:
|
||
|
ExternFree(pDataList);
|
||
|
allocated_valid:
|
||
|
ExternFree(pValidEnd);
|
||
|
}
|
||
|
|
||
|
|
||
|
#endif
|
||
|
// USE_IFELANG3
|