windows-nt/Source/XPSP1/NT/shell/ext/mlang/detect.h

479 lines
14 KiB
C
Raw Normal View History

2020-09-26 03:20:57 -05:00
/*
* Automatic language and codepage detector
*
* Bob Powell, 2/97
* Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
*/
#ifdef __cplusplus
#include <wtypes.h>
#include <limits.h>
#include "lcdetect.h"
#include "lccommon.h"
#include <qsort.h>
// Debug tracing support.
// Turn this on in SOURCES to enable debug output
// (the switch tested below is the DEBUG_LCDETECT preprocessor symbol).
#ifdef DEBUG_LCDETECT
#include <stdio.h>
// Run-time switch: debug() bodies execute only while this is non-zero.
extern int g_fDebug;
// debug(x): run statement/expression x when g_fDebug is set.
// NOTE: this expands to a bare { } block, not do { } while (0), so
// "if (c) debug(x); else ..." will not compile -- wrap such uses in braces.
#define debug(x) { if (g_fDebug) { x; }}
// unmapch(x): values >= 2 become the letter 'a' + (x - 2); smaller values
// become a space. Presumably used to print histogram indices as readable
// characters -- confirm against the character maps.
#define unmapch(x) ((x) >= 2 ? (x)+'a'-2 : ' ')
#else
// Non-debug builds: debug() expands to nothing and unmapch is not defined.
#define debug(x)
#endif
// Forward declarations and pointer typedefs for the classes defined later
// in this header, so they can refer to one another before being defined.

// The detector object itself.
class LCDetect;
typedef LCDetect *PLCDetect;
// Language base class and its 7-bit, 8-bit and Unicode specializations.
class Language;
class Language7Bit;
class Language8Bit;
class LanguageUnicode;
typedef Language *PLanguage;
typedef Language7Bit *PLanguage7Bit;
typedef Language8Bit *PLanguage8Bit;
typedef LanguageUnicode *PLanguageUnicode;
// A single score, and a collection of scores.
class CScore;
class CScores;
/****************************************************************/
// Scoring limits.
// MAXSCORES also sizes LCDetect::m_paHElt8Bit.
#define MAXSCORES 50 // Max possible simultaneous # of scores
// CScores::RemoveZeroScores() keeps only scores strictly greater than
// MINRAWSCORE.
#define MINRAWSCORE 100 // Score threshold (weight * char count)
// for further processing
/****************************************************************/
// Histogram
//
// Holds a table of n-gram occurrence counts. Each count is an HElt (at
// present an unsigned char). The in-memory structure is similar to the
// file's, and the element array pointer m_panElts points straight into the
// mapped file image.
//
// The Ref() overloads perform no range checking on their indices.
class Histogram
{
public:
    Histogram(const PFileHistogramSection pHS, const PHIdx pMap);
    Histogram(const Histogram &H, const PHIdx pMap);
    virtual ~Histogram();

    DWORD Validate(DWORD nBytes) const;

    // Plain accessors for the descriptive fields.
    UCHAR Dimensionality()
    {
        return m_nDimensionality;
    }
    UCHAR EdgeSize()
    {
        return m_nEdgeSize;
    }
    USHORT CodePage()
    {
        return m_nCodePage;
    }
    USHORT GetRangeID()
    {
        return m_nRangeID;
    }
    USHORT NElts()
    {
        return m_nElts;
    }
    PHIdx GetMap()
    {
        return m_pMap;
    }

    // Element lookup with one, two or three indices. Multi-index lookups
    // are flattened with the first index as the most significant, using
    // m_nEdgeSize as the stride.
    HElt Ref(USHORT i1) const
    {
        return m_panElts[i1];
    }
    HElt Ref(UCHAR i1, UCHAR i2) const
    {
        const int nFlat = (i1 * m_nEdgeSize) + i2;
        return m_panElts[nFlat];
    }
    HElt Ref(UCHAR i1, UCHAR i2, UCHAR i3) const
    {
        const int nRow = (i1 * m_nEdgeSize) + i2;
        const int nFlat = nRow * m_nEdgeSize + i3;
        return m_panElts[nFlat];
    }

    // Direct access to the underlying element array.
    HElt *Array()
    {
        return m_panElts;
    }

protected:
    UCHAR m_nDimensionality;    // 1=unigram, 2=digram etc.
    UCHAR m_nEdgeSize;          // edge size (is a function of char map)
    union {
        USHORT m_nCodePage;     // For 7 and 8-bit, is code page
        USHORT m_nRangeID;      // For Unicode, is sub-language range ID
    };
    USHORT m_nElts;             // (edge size ^ dimensionality)
    PHIdx m_pMap;               // char/WCHAR to histogram idx mapping
    HElt *m_panElts;            // array of elements / counts
};

typedef Histogram *PHistogram;
/****************************************************************/
// Language
//
// Abstract base class holding all the detection state for one language,
// i.e. one Win32 primary language ID. The concrete flavours are
// Language7Bit, Language8Bit and LanguageUnicode; the GetLanguageXxx()
// virtuals return NULL here and are overridden by the matching subclass
// to return itself.
class Language
{
public:
    // nCodePages and the sub-language count are the same value (they share
    // storage in the union below).
    Language(PLCDetect pL, int nLangID, int nCodePages, int nRangeID = 0);
    virtual ~Language() { }

    // Attach histogram section nIdx to this language; each subclass
    // supplies its own implementation.
    virtual DWORD AddHistogram(PFileHistogramSection pHS, DWORD nBytes, int nIdx) = 0;

    // Score the code pages for this language
    virtual void ScoreCodePage(LPCSTR, int nCh, CScore &S, int &idx) const;

    // Identification.
    int LanguageID() const
    {
        return m_nLangID;
    }
    int NCodePages() const
    {
        return m_nCodePages;
    }
    int NSubLangs() const
    {
        return m_nSubLangs;
    }
    int RangeID() const
    {
        return m_nRangeID;
    }

    // Base offset of this language's slots in the score arrays.
    int GetScoreIdx() const
    {
        return m_nScoreIdx;
    }
    void SetScoreIdx(int nScoreIdx)
    {
        m_nScoreIdx = nScoreIdx;
    }

    // Per-entry lookups; the base class answers 0 for all of them.
    virtual int GetCodePage(int n) const
    {
        return 0;
    }
    virtual int GetSublangRangeID(int n) const
    {
        return 0;
    }
    virtual int GetSublangID(int n) const
    {
        return 0;
    }

    // Which kind of detection this language takes part in.
    virtual DetectionType Type() = 0;

    // Checked downcasts: NULL unless the object is of the named subclass.
    virtual Language7Bit const *GetLanguage7Bit() const
    {
        return NULL;
    }
    virtual Language8Bit const *GetLanguage8Bit() const
    {
        return NULL;
    }
    virtual LanguageUnicode const *GetLanguageUnicode() const
    {
        return NULL;
    }

protected:
    PLCDetect m_pLC;
    int m_nLangID;          // Win32 primary language ID
    int m_nRangeID;         // Unicode range ID, for Unicode langs
    union {
        int m_nCodePages;   // # of code pages trained for this language
        int m_nSubLangs;
    };
    int m_nScoreIdx;        // Used to create a unique index into the score
                            // arrays for each lang + cp combination, to
                            // eliminate the need to search the arrays to
                            // merge scores. Add the code page index to this
                            // to get the array index.
};
////////////////////////////////////////////////////////////////
// Language7Bit
//
// Language flavour whose Type() is DETECT_7BIT. Keeps one histogram for the
// language plus an array of per-code-page histograms.
// Indexed accessors do not range-check their argument.
class Language7Bit : public Language
{
public:
    Language7Bit(PLCDetect pL, int nLangID, int nCodePages);
    virtual ~Language7Bit();

    // Overrides of the Language interface.
    virtual DWORD AddHistogram(PFileHistogramSection pHS, DWORD nBytes, int nIdx);
    virtual void ScoreCodePage(LPCSTR, int nCh, CScore &S, int &idx) const;
    virtual int GetCodePage(int n) const
    {
        return m_ppCodePageHistogram[n]->CodePage();
    }
    virtual DetectionType Type()
    {
        return DETECT_7BIT;
    }

    // Histogram access.
    PHistogram GetLangHistogram() const
    {
        return m_pLangHistogram;
    }
    PHistogram GetCodePageHistogram(int i) const
    {
        return m_ppCodePageHistogram[i];
    }

    virtual Language7Bit const *GetLanguage7Bit() const
    {
        return this;
    }

    // Array of element pointers kept alongside the code-page histograms.
    const PHElt *GetPHEltArray() const
    {
        return m_paHElt;
    }

private:
    PHistogram m_pLangHistogram;
    PHistogram m_ppCodePageHistogram[MAXSUBLANG];
    PHElt m_paHElt[MAXSUBLANG];
};
////////////////////////////////////////////////////////////////
// Language8Bit
//
// Language flavour whose Type() is DETECT_8BIT. Holds an array of
// histograms; GetCodePage(n) reports the code page stored in histogram n.
// Indexed accessors do not range-check their argument.
class Language8Bit : public Language
{
public:
    Language8Bit(PLCDetect pL, int nLangID, int nCodePages);
    virtual ~Language8Bit();

    // Overrides of the Language interface.
    virtual DWORD AddHistogram(PFileHistogramSection pHS, DWORD nBytes, int nIdx);
    virtual int GetCodePage(int n) const
    {
        return m_ppHistogram[n]->CodePage();
    }
    virtual DetectionType Type()
    {
        return DETECT_8BIT;
    }

    PHistogram GetHistogram(int i) const
    {
        return m_ppHistogram[i];
    }

    virtual Language8Bit const *GetLanguage8Bit() const
    {
        return this;
    }

private:
    PHistogram m_ppHistogram[MAXSUBLANG];
};
////////////////////////////////////////////////////////////////
// LanguageUnicode
//
// Language flavour whose Type() is DETECT_UNICODE. Holds an array of
// sub-language histograms; the range ID of sub-language i is read from
// histogram i. Indexed accessors do not range-check their argument.
class LanguageUnicode : public Language
{
public:
    LanguageUnicode(PLCDetect pL, int nLangID, int nRecordCount, int nRangeID);
    virtual ~LanguageUnicode();

    // Overrides of the Language interface.
    virtual DWORD AddHistogram(PFileHistogramSection pHS, DWORD nBytes, int nIdx);
    virtual int GetSublangRangeID(int i) const
    {
        return GetHistogram(i)->GetRangeID();
    }
    virtual DetectionType Type()
    {
        return DETECT_UNICODE;
    }

    void ScoreSublanguages(LPCWSTR wcs, int nch, CScores &S) const;

    // Defined inline after class LCDetect, whose definition it needs.
    PLanguageUnicode GetSublanguage(int n) const;

    PHistogram GetHistogram(int i) const
    {
        return m_ppSubLangHistogram[i];
    }

    virtual LanguageUnicode const *GetLanguageUnicode() const
    {
        return this;
    }

    // Array of element pointers kept alongside the sub-language histograms.
    const PHElt *GetPHEltArray() const
    {
        return m_paHElt;
    }

private:
    PHistogram m_ppSubLangHistogram[MAXSUBLANG];
    PHElt m_paHElt[MAXSUBLANG];
};
/****************************************************************/
// Charmap
//
// Read-only view of one character-map section. The ID, size and
// unique-value count are copied from the section header; the table itself
// is not copied -- m_pElts points at the memory immediately following the
// header.
class Charmap
{
public:
    Charmap(PFileMapSection pMS)
        : m_nID(pMS->m_dwID),
          m_nSize(pMS->m_dwSize),
          m_nUnique(pMS->m_dwNUnique),
          m_pElts((PHIdx)(pMS + 1))
    {
    }

    // int ID (void) const { return m_nID; }
    int Size() const
    {
        return m_nSize;
    }
    int NUnique() const
    {
        return m_nUnique;
    }

    // The whole table.
    PHIdx Map() const
    {
        return m_pElts;
    }

    // A single entry. No check is made that x lies within Size().
    HIdx Map(WCHAR x) const
    {
        return m_pElts[x];
    }

private:
    int m_nID;          // ID by which hardwired code finds the table
    int m_nSize;        // size of table (256 or 65536)
    int m_nUnique;      // # of unique output values
    PHIdx m_pElts;
};

typedef Charmap *PCharmap;
/****************************************************************/
// class CScore -- score for one lang and/or code page, variously used for
// individual chunks and also for an entire document.
class CScore {
public:
// Only these two slots need to be initialized
CScore (void) : m_nScore(0), m_nChars(0) {}
~CScore (void) { }
const PLanguage GetLang (void) const { return m_pLang; }
int GetScore (void) const { return m_nScore; }
unsigned short GetCodePage (void) const { return m_nCodePage; }
unsigned short GetCharCount (void) const { return m_nChars; }
void SetLang (PLanguage p) { m_pLang = p; }
void SetScore (int x) { m_nScore = x; }
void SetCharCount (unsigned x) { m_nChars = (unsigned short)x; }
void SetCodePage (unsigned x) { m_nCodePage = (unsigned short)x; }
void Add (CScore &S) {
SetLang(S.GetLang());
SetCodePage(S.GetCodePage());
SetScore(GetScore() + S.GetScore());
SetCharCount(GetCharCount() + S.GetCharCount());
}
CScore & operator += (CScore &S) { Add (S); return *this; }
int operator <= (CScore &S) {
// Special: always put 8-bit langs first since the code page
// matters more for them.
if (GetLang()->Type() != S.GetLang()->Type())
return GetLang()->Type() == DETECT_8BIT ? -1 : 1;
return GetScore() <= S.GetScore();
}
#ifdef DEBUG_LCDETECT
void Print(void) {
printf("Lang=%d CodePage=%d Score=%d NChars=%d\n",
GetLang() ? GetLang()->LanguageID() : -1,
GetCodePage(), GetScore(), GetCharCount());
}
#endif
private:
PLanguage m_pLang;
int m_nScore;
unsigned short m_nCodePage;
unsigned short m_nChars;
};
typedef CScore *PScore;
////////////////////////////////////////////////////////////////
// class CScores
//
// A collection of CScore entries stored in an array supplied by the
// caller (typically the one embedded in a TScores<N>). This class does not
// allocate or free the array.
//
// For SBCS detection, the index e.g. Ref(i) is the language+codepage index,
// one of a contiguous set of values which identifies each unique supported
// language and codepage combination.
//
// For DBCS detection, the index is just the Unicode language group.
//
// m_nUsed is a high-water mark: one past the highest index handed out by
// Ref() since the last Reset().
class CScores {
public:
    // nAlloc: capacity of the array at p. p: storage for the scores.
    CScores (int nAlloc, PScore p) : m_nAlloc(nAlloc), m_nUsed(0), m_p(p) { }
    virtual ~CScores (void) { }

    // Zero-fill the entries in use and mark the collection empty.
    void Reset (void) {
        memset ((void *)m_p, 0, sizeof(CScore) * m_nUsed);
        m_nUsed = 0;
    }

    // Number of entries in use, returned by reference so that callers can
    // also change it.
    unsigned int &NElts (void) { return m_nUsed; }

    // Access entry n, raising the high-water mark to cover it.
    // No check is made against m_nAlloc; the caller must keep n in range.
    CScore &Ref (unsigned int n) {
        if (m_nUsed <= n)
            m_nUsed = n + 1;
        return m_p[n];
    }

    void SelectCodePages (void);

    // Compact the array in place, keeping only entries whose score is
    // strictly greater than MINRAWSCORE (so more than just zero scores are
    // dropped). The relative order of the survivors is preserved.
    void RemoveZeroScores (void) {
        // The write index is declared outside the for statement because it
        // is needed after the loop; a name declared in the for-init is out
        // of scope there in standard C++.
        unsigned int j = 0;
        for (unsigned int i = 0; i < m_nUsed; i++)
        {
            if (m_p[i].GetScore() > MINRAWSCORE)
                m_p[j++] = m_p[i];
        }
        m_nUsed = j;
    }

    // Sort by decreasing score.
    // Instantiates template qsort using CScore::operator <=
    // Low-scoring entries are removed first (see RemoveZeroScores).
    void SortByScore (void) {
        RemoveZeroScores ();
        if (m_nUsed)
            QSort (m_p, m_nUsed, FALSE);
    }

    // Return the entry with the largest strictly positive score; on ties
    // the earliest such entry wins. If no entry has a positive score
    // (including when the collection is empty) entry 0 is returned.
    CScore & FindHighScore (void) {
        int highscore = 0;
        // Declared outside the for statement because it is used after the
        // loop (see RemoveZeroScores).
        unsigned int highidx = 0;
        for (unsigned int i = 0; i < m_nUsed; i++) {
            if (m_p[i].GetScore() > highscore)
            {
                highscore = m_p[i].GetScore();
                highidx = i;
            }
        }
        return m_p[highidx];
    }

protected:
    unsigned int m_nAlloc;
    unsigned int m_nUsed; // high water mark to optimize NElts(), Reset()
    PScore m_p; // score array, typically per TScores<NNN>
};
// TScores<Size>
//
// A CScores that carries its own fixed-size backing array of Size entries.
// The base class stores a pointer to the embedded array, so a copied
// TScores would still refer to the original object's array -- do not copy.
template <ULONG Size>
class TScores : public CScores
{
public:
    TScores() : CScores(Size, m_S) { }
    virtual ~TScores() { }

private:
    CScore m_S[Size];
};
////////////////////////////////////////////////////////////////
// LCDetect
//
// The detector object. It holds the tables of character maps and of
// 7-bit, 8-bit and Unicode languages, and exposes the two public entry
// points DetectA (narrow text) and DetectW (wide text).
// The indexed getters do not range-check their argument.
class LCDetect
{
public:
    LCDetect(HMODULE hM);
    ~LCDetect();

    // Table sizes.
    unsigned int GetNCharmaps() const
    {
        return m_nCharmaps;
    }
    unsigned int GetN7BitLanguages() const
    {
        return m_n7BitLanguages;
    }
    unsigned int GetN8BitLanguages() const
    {
        return m_n8BitLanguages;
    }
    unsigned int GetNUnicodeLanguages() const
    {
        return m_nUnicodeLanguages;
    }

    // Table entries.
    PLanguage7Bit Get7BitLanguage(int i) const
    {
        return m_pp7BitLanguages[i];
    }
    PLanguage8Bit Get8BitLanguage(int i) const
    {
        return m_pp8BitLanguages[i];
    }
    PLanguageUnicode GetUnicodeLanguage(int i) const
    {
        return m_ppUnicodeLanguages[i];
    }
    PHIdx GetMap(int i) const
    {
        return m_ppCharmaps[i]->Map();
    }

    // The built-in default configuration.
    const LCDConfigure &GetConfig() const
    {
        return m_LCDConfigureDefault;
    }

    DWORD LoadState();

    // Detection entry points for narrow and wide input.
    DWORD DetectA(LPCSTR pStr, int nChars, PLCDScore paScores,
                  int *pnScores, PCLCDConfigure pLCDC) const;
    DWORD DetectW(LPCWSTR wcs, int nInputChars, PLCDScore paScores,
                  int *pnScores, PCLCDConfigure pLCDC) const;

private:
    // Helpers for building the in-memory state from the training data.
    DWORD Initialize7BitLanguage(PFileLanguageSection pLS, PLanguage *ppL);
    DWORD Initialize8BitLanguage(PFileLanguageSection pLS, PLanguage *ppL);
    DWORD InitializeUnicodeLanguage(PFileLanguageSection pLS, PLanguage *ppL);
    DWORD LoadLanguageSection(void *pv, int nSectionSize, PLanguage *ppL);
    DWORD LoadHistogramSection(void *pv, int nSectionSize, PLanguage pL);
    DWORD LoadMapSection(void *pv, int nSectionSize);
    DWORD BuildState(DWORD nFileSize);

    // Scoring helpers.
    void Score7Bit(LPCSTR pcszText, int nChars, CScores &S) const;
    void Score8Bit(LPCSTR pcszText, int nChars, CScores &S) const;
    int ScoreCodePage(LPCSTR pStr, int nChars, CScore &S) const;
    int ChooseDetectionType(LPCSTR pcszText, int nChars) const;
    void ScoreLanguageA(LPCSTR pStr, int nChars, CScores &S) const;
    void ScoreLanguageW(LPCWSTR wcs, int nChars, CScores &S, PCLCDConfigure) const;
    void ScoreLanguageAsSBCS(LPCWSTR wcs, int nch, CScores &S) const;
    void ScoreUnicodeSublanguages(PLanguageUnicode pL, LPCWSTR wcs,
                                  int nch, CScores &S) const;

    // Language training info virtual-mapped in training file
    unsigned int m_nCharmaps;
    unsigned int m_n7BitLanguages;
    unsigned int m_n8BitLanguages;
    unsigned int m_nUnicodeLanguages;
    PCharmap *m_ppCharmaps;
    PLanguage7Bit *m_pp7BitLanguages;
    PLanguage8Bit *m_pp8BitLanguages;
    PLanguageUnicode *m_ppUnicodeLanguages;

    // Cached information for the optimized scoring inner-loops.
    PHElt m_paHElt7Bit[MAX7BITLANG];
    PHElt m_paHElt8Bit[MAXSCORES];
    int m_nHElt8Bit;

    // Special 7-bit lang histogram for ScoreLanguageAsSBCS()
    PHistogram m_pHU27Bit;

    // Initialization state variables
    unsigned int m_n7BitLangsRead;
    unsigned int m_n8BitLangsRead;
    unsigned int m_nUnicodeLangsRead;
    unsigned int m_nMapsRead;
    int m_nHistogramsRead;
    int m_nScoreIdx;

    // Default configuration to use when NULL parameter passed to detect
    LCDConfigure m_LCDConfigureDefault;

    // File mapping information for the training data file
    HANDLE m_hf;
    HANDLE m_hmap;
    void *m_pv;
    HMODULE m_hModule;
};
////////////////////////////////////////////////////////////////
// Return the Unicode language object for sub-language n.
// The range ID stored in sub-language n's histogram is used directly as the
// index into the detector's Unicode language table; neither n nor the range
// ID is range-checked here.
inline PLanguageUnicode
LanguageUnicode::GetSublanguage (int n) const
{
    const int nRangeID = GetSublangRangeID(n);
    return m_pLC->GetUnicodeLanguage(nRangeID);
}
#endif // __cplusplus