windows-nt/Source/XPSP1/NT/shell/ext/mlang/lcinit.cpp

/*
 * Automatic language and codepage detector
 * 
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 * 
 *  History:    1-Feb-97    BobP      Created
 *              5-Aug-97    BobP      Unicode support; Charmaps in data file.
 */
#include "private.h"
/****************************************************************/


// Wrap the raw histogram section at *pHS.  Nothing is copied: the element
// table is referenced in place, immediately after the header struct, and
// pMap is stored as the map used with this histogram.
Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
	: m_nDimensionality ((UCHAR)pHS->m_dwDimensionality),
	  m_nEdgeSize ((UCHAR)pHS->m_dwEdgeSize),
	  m_nCodePage ((USHORT)pHS->m_dwCodePage),
	  m_pMap (pMap),
	  m_panElts ((HElt *)&pHS[1])		// first byte past the header
{
	// Element count = edge size raised to the number of dimensions
	// (#unique character values ^ #dimensions).

	m_nElts = 1;

	for (UCHAR nLeft = m_nDimensionality; nLeft != 0; nLeft--)
	{
		m_nElts *= m_nEdgeSize;
	}
}

DWORD
Histogram::Validate (DWORD nBytes) const
//
// Check that this histogram is usable.  nBytes is the number of bytes
// available for the element table.
//
// Returns NO_ERROR, or ERROR_INTERNAL_DB_CORRUPTION when the dimensionality
// exceeds 4 or the table would not fit in nBytes.
{
	if (m_nDimensionality > 4)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Compare by division instead of computing m_nElts * sizeof(HElt):
	// that product can wrap around where size_t is 32 bits wide, which
	// would let an oversized table pass the check.

	if (m_nElts > nBytes / sizeof(HElt))
		return ERROR_INTERNAL_DB_CORRUPTION;

	return NO_ERROR;
}

// Clone constructor: take every attribute of H, including the pointer to
// its element table, but use pMap in place of H's own Charmap.
Histogram::Histogram (const Histogram &H, const PHIdx pMap)
	: m_nDimensionality (H.m_nDimensionality),
	  m_nEdgeSize (H.m_nEdgeSize),
	  m_nCodePage (H.m_nCodePage),
	  m_nElts (H.m_nElts),
	  m_pMap (pMap),				// the only member that differs from H
	  m_panElts (H.m_panElts)		// same table as H, not a copy
{
}

// Nothing to release: the pointer members refer to the mapped file and
// are not freed here.
Histogram::~Histogram ()
{
}

/****************************************************************/

// Base-class constructor: remember the owning detector object and the
// language ID, code page count and range ID given by the caller.
Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
	: m_pLC (pL),
	  m_nLangID (nLangID),
	  m_nCodePages (nCodePages),
	  m_nRangeID (nRangeID)
{
}

// Construct a 7-bit language with no histograms attached yet.
Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
	: Language (pL, nLangID, nCodePages),
	  m_pLangHistogram (NULL)
{
	// Clear every slot of the code page histogram table so the
	// destructor can tell which ones were filled in.

	memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
}

// Free the language histogram and every code page histogram owned by
// this language.
Language7Bit::~Language7Bit ()
{
	// delete on a null pointer does nothing, so empty slots need no test.

	delete m_pLangHistogram;

	for (int nSlot = 0; nSlot < MAXSUBLANG; nSlot++)
	{
		delete m_ppCodePageHistogram[nSlot];
	}
}

DWORD
Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS in the mapped file to this language object.
// The histograms must be for 7-bit detection.
//
// pHS     header of the histogram section; the table follows it in the file
// nBytes  number of bytes available for the table, passed to Validate()
// nIdx    0 for the language-detection table, 1..n for the code page tables
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.
{
	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	if (nIdx == 0)
	{
		// The first histogram for a language is its language-detection table.
		// It is stored before validation, so the destructor frees it even
		// when validation fails.

		if ( (m_pLangHistogram = new Histogram (pHS, pMap)) == NULL)
			return ERROR_OUTOFMEMORY;

		return m_pLangHistogram->Validate (nBytes);
	}

	// Each subsequent histogram is a code page detection table.  Besides
	// the count declared for this language, the slot must also fit the
	// fixed-size tables, which hold MAXSUBLANG entries.

	const int nSlot = nIdx - 1;

	if (nSlot < 0 || nSlot >= m_nCodePages || nSlot >= MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	const DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		// Not yet owned by the table, so it must be freed here.
		delete pH;
		return hr;
	}

	m_ppCodePageHistogram[nSlot] = pH;

	// Cache for the scoring vector math

	m_paHElt[nSlot] = pH->Array();

	return NO_ERROR;
}

/****************************************************************/

// Construct an 8-bit language with an empty histogram table.
Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
	: Language (pL, nLangID, nCodePages)
{
	// Start with every slot null; AddHistogram() fills them in.

	memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));
}

// Free every histogram owned by this language.
Language8Bit::~Language8Bit ()
{
	// Empty slots are null, and deleting a null pointer is a no-op.

	for (int nSlot = 0; nSlot < MAXSUBLANG; nSlot++)
	{
		delete m_ppHistogram[nSlot];
	}
}

DWORD
Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object.
// This language is known to use 8-bit detection.
//
// pHS     header of the histogram section; the table follows it in the file
// nBytes  number of bytes available for the table, passed to Validate()
// nIdx    zero-based position of this histogram within the language
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.
{
	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	// The histograms are the direct language-code page tables.  The index
	// must respect both the count declared for this language and the
	// fixed size of m_ppHistogram, which holds MAXSUBLANG entries.

	if (nIdx < 0 || nIdx >= m_nCodePages || nIdx >= MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	const DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		// Not yet owned by the table, so it must be freed here.
		delete pH;
		return hr;
	}

	m_ppHistogram[nIdx] = pH;

	return NO_ERROR;
}

/****************************************************************/

// Construct a Unicode language group with an empty sublanguage table.
// nSubLangs is handed to the base class in its code-page-count position.
LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,
	int nSubLangs, int nRangeID)
	: Language (pL, nLangID, nSubLangs, nRangeID)
{
	// Start with every slot null; AddHistogram() fills them in.

	memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
}

// Free every sublanguage histogram owned by this language group.
LanguageUnicode::~LanguageUnicode ()
{
	// Empty slots are null, and deleting a null pointer is a no-op.

	for (int nSlot = 0; nSlot < MAXSUBLANG; nSlot++)
	{
		delete m_ppSubLangHistogram[nSlot];
	}
}

DWORD
LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this Unicode language group.
//
// pHS     header of the histogram section; the table follows it in the file
// nBytes  number of bytes available for the table, passed to Validate()
// nIdx    zero-based position of this histogram within the language
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.
{
	// All histograms are for sublanguage detection.  The index must
	// respect both the declared sublanguage count and the fixed size of
	// m_ppSubLangHistogram, which holds MAXSUBLANG entries.

	if (nIdx < 0 || nIdx >= m_nSubLangs || nIdx >= MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Get the custom charmap used for scoring this sublanguage group

	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	const DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		// Not yet owned by the table, so it must be freed here.
		delete pH;
		return hr;
	}

	m_ppSubLangHistogram[nIdx] = pH;

	// Cache the table pointer alongside the histogram

	m_paHElt[nIdx] = pH->Array();

	return NO_ERROR;
}

/****************************************************************/

// Construct an empty detector for the module hM.  Every count starts at
// zero and every pointer and handle starts out null.
LCDetect::LCDetect (HMODULE hM)
	: m_hModule (hM),
	  // counts
	  m_nCharmaps (0),
	  m_n7BitLanguages (0),
	  m_n8BitLanguages (0),
	  m_nUnicodeLanguages (0),
	  // progress counters
	  m_n7BitLangsRead (0),
	  m_n8BitLangsRead (0),
	  m_nUnicodeLangsRead (0),
	  m_nMapsRead (0),
	  m_nHistogramsRead (0),
	  m_nScoreIdx (0),
	  // pointer tables
	  m_pp7BitLanguages (NULL),
	  m_pp8BitLanguages (NULL),
	  m_ppUnicodeLanguages (NULL),
	  m_ppCharmaps (NULL),
	  // file view and handles
	  m_pv (NULL),
	  m_hmap (0),
	  m_hf (0),
	  m_pHU27Bit (0)
{
}

LCDetect::~LCDetect ()
//
// Release everything the detector owns: the cloned histogram, the language
// and charmap objects with their pointer tables, the file view and the
// mapping and file handles.  Must cope with a load that failed part-way.
{
	// Declared once for all loops: a variable declared in a for-init
	// statement is not visible after that loop in standard C++.
	unsigned int i;

	delete m_pHU27Bit;

	// 7-bit, 8-bit and charmap slots are filled in order, so only the
	// first "read" entries were ever assigned.  The tables come from
	// new[], so they must be released with delete [].

	if (m_pp7BitLanguages != NULL)
	{
		for (i = 0; i < m_n7BitLangsRead; i++)
			delete m_pp7BitLanguages[i];
		delete [] m_pp7BitLanguages;
	}

	if (m_pp8BitLanguages != NULL)
	{
		for (i = 0; i < m_n8BitLangsRead; i++)
			delete m_pp8BitLanguages[i];
		delete [] m_pp8BitLanguages;
	}

	if (m_ppUnicodeLanguages != NULL)
	{
		// Unicode slots are indexed by range ID, so assigned entries may
		// be anywhere in the table.  The table is cleared before the first
		// entry is read; if nothing was read its contents are not trusted.

		if (m_nUnicodeLangsRead > 0)
		{
			for (i = 0; i < m_nUnicodeLanguages; i++)
				delete m_ppUnicodeLanguages[i];
		}
		delete [] m_ppUnicodeLanguages;
	}

	if (m_ppCharmaps != NULL)
	{
		for (i = 0; i < m_nMapsRead; i++)
			delete m_ppCharmaps[i];
		delete [] m_ppCharmaps;
	}

	if (m_pv)
		UnmapViewOfFile (m_pv);

	// Only close handles that were actually opened.

	if (m_hmap != NULL && m_hmap != INVALID_HANDLE_VALUE)
		CloseHandle (m_hmap);

	if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
		CloseHandle (m_hf);
}

DWORD
LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
//
// Create the Language7Bit object described by *pLS, assign its score index
// range and store it in the next free 7-bit slot.
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is written only on success.
{
	// nRecordCount is lang histogram (1) + # of code page histograms

	if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// The per-language code page tables hold MAXSUBLANG entries.

	if (pLS->m_dwRecordCount - 1 > (DWORD)MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Each 7-bit lang uses one score index slot per code page.
	// The range starts with the 7-bit langs, since both the 8-bit
	// and Unicode langs follow it.  Checked before allocating so that
	// nothing has to be released on this failure.

	if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);

	if (pL == NULL)
		return ERROR_OUTOFMEMORY;

	pL->SetScoreIdx(m_nScoreIdx);

	m_nScoreIdx += pLS->m_dwRecordCount - 1;	// skip 1st record (Language)

	m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;

	*ppL = pL;

	return NO_ERROR;
}

DWORD
LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Create the Language8Bit object described by *pLS, assign its score index
// range and store it in the next free 8-bit slot.
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is written only on success.
{
	// nRecordCount is # of combined language / code page histograms

	if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// The per-language histogram table holds MAXSUBLANG entries.

	if (pLS->m_dwRecordCount > (DWORD)MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);

	if (pL == NULL)
		return ERROR_OUTOFMEMORY;

	// The 8-bit score indices follow the 7-bit languages

	// Each 8-bit lang uses a score index slot for each of its code pages,
	// since all the code pages are scored in the initial scoring pass.
	// The number of slots equals the record count, because every record
	// of an 8-bit language is a combined language / code page histogram.

	pL->SetScoreIdx(m_nScoreIdx);
	m_nScoreIdx += pLS->m_dwRecordCount;

	m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;

	*ppL = pL;

	return NO_ERROR;
}

DWORD
LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Create the LanguageUnicode object described by *pLS, assign its score
// index and store it in the slot selected by its Unicode range ID.
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is written only on success.
{
	// nRecordCount is # of sublanguage histograms

	if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
		 pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	// The per-language sublanguage table holds MAXSUBLANG entries.

	if (pLS->m_dwRecordCount > (DWORD)MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// The table is cleared before any section is read, so a non-null slot
	// means this range ID was already used.  Storing again would lose the
	// only pointer to the earlier object.

	if (m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] != NULL)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID, 
						pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);

	if (pL == NULL)
		return ERROR_OUTOFMEMORY;

	// The Unicode score indices follow the 7-bit languages, and overlay the
	// 8-bit slots since they aren't used at the same time.

	if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
		m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

	// Each Unicode entry uses exactly one score index.  SBCS subdetection
	// (Latin group) uses the slots for the corresponding 7-bit languages,
	// and Unicode subdetection (CJK) uses the slots already defined for the
	// Unicode sub-languages.

	pL->SetScoreIdx(m_nScoreIdx);

	m_nScoreIdx++;

	// For Unicode, the range ID is used as the Language array index.

	m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
	m_nUnicodeLangsRead++;

	*ppL = pL;

	return NO_ERROR;
}

DWORD
LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
//
// A language section begins the definition of data for a language.
// Each language has exactly one of these records.  One or more
// histogram sections follow each language, and are always associated
// with the language of the preceding language section.
//
// pv            start of the section, i.e. its FileSection header
// nSectionSize  size of the whole section in bytes, header included
//
// Set *ppL to the Language object created from this section.
//
// Returns the result of the matching Initialize*Language() call, or
// ERROR_INTERNAL_DB_CORRUPTION when the section is too small to hold a
// language record or names an unknown detection type.
{
	DWORD hr = NO_ERROR;

	PFileLanguageSection pLS;

	// The language record must lie entirely inside the section.

	if (nSectionSize < 0 ||
		(size_t)nSectionSize < sizeof(FileSection) + sizeof(*pLS))
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

	switch ( pLS->m_dwDetectionType ) {

	case DETECT_7BIT:
		hr = Initialize7BitLanguage (pLS, ppL);
		break;

	case DETECT_8BIT:
		hr = Initialize8BitLanguage (pLS, ppL);
		break;

	case DETECT_UNICODE:
		hr = InitializeUnicodeLanguage (pLS, ppL);
		break;

	default:
		// No object is created, so *ppL would not describe this section.
		// Reporting success would attach the histograms that follow to
		// the wrong language.
		hr = ERROR_INTERNAL_DB_CORRUPTION;
		break;
	}

	return hr;
}

DWORD
LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
//
// Hand the histogram section at *pv to the language pL.
//
// pv            start of the section, i.e. its FileSection header
// nSectionSize  size of the whole section in bytes, header included
// pL            language that receives the histogram
//
// Returns the result of pL->AddHistogram(), or ERROR_INTERNAL_DB_CORRUPTION
// when there is no language or the section cannot hold a histogram header.
{
	PFileHistogramSection pHS;

	if (pL == NULL)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Reject sections too small for both headers.  Otherwise the
	// subtraction below would go negative and turn into a huge byte
	// count once converted to DWORD.

	if (nSectionSize < 0 ||
		(size_t)nSectionSize < sizeof(FileSection) + sizeof(*pHS))
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

	// Bytes left for the element table after the two headers

	const DWORD nBytes = (DWORD)(nSectionSize - sizeof(FileSection) - sizeof(*pHS));

	return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
}

DWORD
LCDetect::LoadMapSection (void *pv, int nSectionSize)
//
// Create a Charmap from the map section at *pv and store it in the next
// free charmap slot.
//
// pv            start of the section, i.e. its FileSection header
// nSectionSize  size of the whole section in bytes, header included
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the section cannot hold a map header or more maps are present than the
// file header declared.
{
	PFileMapSection pMS;

	// The map header must lie entirely inside the section.
	// NOTE(review): the size of the map data that follows the header is
	// not checked here; whether Charmap validates it is not visible from
	// this function.

	if (nSectionSize < 0 ||
		(size_t)nSectionSize < sizeof(FileSection) + sizeof(*pMS))
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

	if (m_nMapsRead >= m_nCharmaps)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PCharmap pM = new Charmap (pMS);

	if (pM == NULL)
		return ERROR_OUTOFMEMORY;

	m_ppCharmaps[ m_nMapsRead++ ]  = pM;

	return NO_ERROR;
}

DWORD
LCDetect::BuildState (DWORD nFileSize)
//
// Build the detection structures from the mapped training file image at *m_pv
//
// nFileSize  size in bytes of the image at *m_pv
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the image fails a consistency check.  On failure the objects created so
// far remain attached to this detector.
{
	PLanguage pL = NULL;		// language that receives histogram sections
	PFileHeader pFH;
	PFileSection pFS;
	unsigned int i;				// shared by the loops below; a for-init
								// variable is not visible after its loop

	DWORD hr = NO_ERROR;

	// Validate header

	pFH = (PFileHeader) m_pv;

	if ( nFileSize < sizeof(*pFH) || 
		 pFH->m_dwAppSig != APP_SIGNATURE ||
		 pFH->m_dwVersion != APP_VERSION ||
		 pFH->m_dwHdrSizeBytes >= nFileSize ||
		 pFH->m_dwN7BitLanguages == 0 ||
		 pFH->m_dwN8BitLanguages == 0 ||
		 pFH->m_dwNUnicodeLanguages == 0 ||
		 pFH->m_dwNCharmaps == 0 )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	// Allocate the pointer tables per header.  Each table is cleared as
	// soon as it exists and its count is published only afterwards, so
	// that every slot within the published count is either null or a
	// valid object, even if loading stops part-way.

	m_pp7BitLanguages = new PLanguage7Bit [pFH->m_dwN7BitLanguages];
	if (m_pp7BitLanguages == NULL)
		return ERROR_OUTOFMEMORY;
	memset (m_pp7BitLanguages, 0, sizeof(PLanguage7Bit) * pFH->m_dwN7BitLanguages);
	m_n7BitLanguages = pFH->m_dwN7BitLanguages;

	m_pp8BitLanguages = new PLanguage8Bit [pFH->m_dwN8BitLanguages];
	if (m_pp8BitLanguages == NULL)
		return ERROR_OUTOFMEMORY;
	memset (m_pp8BitLanguages, 0, sizeof(PLanguage8Bit) * pFH->m_dwN8BitLanguages);
	m_n8BitLanguages = pFH->m_dwN8BitLanguages;

	// Unicode slots are indexed by range ID, so not all may be assigned.

	m_ppUnicodeLanguages = new PLanguageUnicode [pFH->m_dwNUnicodeLanguages];
	if (m_ppUnicodeLanguages == NULL)
		return ERROR_OUTOFMEMORY;
	memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * pFH->m_dwNUnicodeLanguages);
	m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;

	m_ppCharmaps = new PCharmap [pFH->m_dwNCharmaps];
	if (m_ppCharmaps == NULL)
		return ERROR_OUTOFMEMORY;
	memset (m_ppCharmaps, 0, sizeof(PCharmap) * pFH->m_dwNCharmaps);
	m_nCharmaps = pFH->m_dwNCharmaps;

	// Remember other header info

	m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
	m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
	m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
	m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
	m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
	m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;

	// Position to first section

	const char *pEnd = (const char *)m_pv + nFileSize;	// one past the image

	pFS = (PFileSection) &((char *)m_pv)[pFH->m_dwHdrSizeBytes];

	// Read and process each file section.  pFS always points inside the
	// image here: the header size was checked above, and each advance is
	// bounded below.

	while ( hr == NO_ERROR ) {

		// check alignment

		if (((DWORD_PTR)pFS & 3) != 0) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		// the size field itself must lie inside the image before it is read

		if ((const char *)(&pFS->m_dwSizeBytes + 1) > pEnd) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		const DWORD dwSize = pFS->m_dwSizeBytes;

		// zero-length section marks end of data

		if (dwSize == 0)
			break;

		// A section includes its own header, and must end before the end
		// of the image.  Comparing sizes rather than forming the end
		// pointer avoids address wrap-around on a huge size value.

		const DWORD dwRemaining = (DWORD)(pEnd - (const char *)pFS);

		if (dwSize < sizeof(FileSection) || dwSize >= dwRemaining) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		switch ( pFS->m_dwType ) {

		case SECTION_TYPE_LANGUAGE:								// sets pL
			pL = NULL;		// never keep the previous language if none is set
			hr = LoadLanguageSection ((void*)pFS, dwSize, &pL);
			m_nHistogramsRead = 0;
			break;

		case SECTION_TYPE_HISTOGRAM:							// uses pL
			if (pL == NULL)			// histogram without an owning language
				hr = ERROR_INTERNAL_DB_CORRUPTION;
			else
				hr = LoadHistogramSection ((void*)pFS, dwSize, pL);
			break;

		case SECTION_TYPE_MAP:
			hr = LoadMapSection ((void*)pFS, dwSize);
			break;

		default:					// ignore unrecognized sections
			break;
		}

		pFS = (PFileSection) &((char *)pFS)[dwSize];
	}

	if (hr != NO_ERROR)
		return hr;

	if ( m_nMapsRead != m_nCharmaps )
		return ERROR_INTERNAL_DB_CORRUPTION;


	// Set up quick-reference arrays used by the scoring inner loops.
	// A language that was declared but never read, or that never received
	// a histogram, shows up as a null pointer and is reported as
	// corruption.
	// NOTE(review): the capacities of m_paHElt7Bit and m_paHElt8Bit are
	// not visible here -- confirm they cover the counts in the header.

	for (i = 0; i < GetN7BitLanguages(); i++)
	{
		PLanguage7Bit p7 = Get7BitLanguage(i);

		if (p7 == NULL || p7->GetLangHistogram() == NULL)
			return ERROR_INTERNAL_DB_CORRUPTION;

		m_paHElt7Bit[i] = p7->GetLangHistogram()->Array();
	}

	m_nHElt8Bit = 0;
	for (i = 0; i < GetN8BitLanguages(); i++) 
	{
		PLanguage8Bit p8 = Get8BitLanguage(i);

		if (p8 == NULL)
			return ERROR_INTERNAL_DB_CORRUPTION;

		for (int j = 0; j < p8->NCodePages(); j++)
		{
			if (p8->GetHistogram(j) == NULL)
				return ERROR_INTERNAL_DB_CORRUPTION;

			m_paHElt8Bit[m_nHElt8Bit++] = p8->GetHistogram(j)->Array();
		}
	}

	// Set up the Histogram used for ScoreVectorW() for scoring Unicode
	// text for 7-bit language detection.  Clone the first 7-bit language
	// histogram and replace its map with CHARMAP_U27BIT.

	if (Get7BitLanguage(0) == NULL ||
		Get7BitLanguage(0)->GetLangHistogram() == NULL)
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
								 GetMap(CHARMAP_U27BIT));

	if (m_pHU27Bit == NULL)
		return ERROR_OUTOFMEMORY;

	return hr;
}


DWORD
LCDetect::LoadState (void)
//
// Overall initialization and state loading.  Open the compiled training
// file from the directory that contains this module, map it into memory
// and assemble the in-memory detection tables from its contents.
//
// Returns NO_ERROR on success.  On failure returns a Win32 error code,
// E_FAIL if the data file cannot be opened, or the error reported by
// BuildState().  Handles closed here on failure are reset to null so they
// are not closed a second time later.
{
	DWORD hr = NO_ERROR;
	DWORD nFileSize;
	DWORD nLen;
#define MODULENAMELEN 100
	// MODULENAMELEN no longer sizes the buffer: it was too small for long
	// install paths.  The definition is left in place in case other code
	// refers to it.
	char szFilename[MAX_PATH+50], *p;

	// Open the training data file,
	// look in the directory that contains the DLL.

	nLen = GetModuleFileNameA (m_hModule, szFilename, MAX_PATH);

	if (nLen == 0)
		return GetLastError();

	// A return value equal to the buffer size means the path was
	// truncated, and the string may then lack a terminating null.

	if (nLen >= MAX_PATH)
		return ERROR_INSUFFICIENT_BUFFER;

	// Cut the path after the last '\' (or ':'), leaving the directory

	if ( (p = strrchr (szFilename, '\\')) != NULL ||
		 (p = strrchr (szFilename, ':')) != NULL )
	{
		*++p = 0;
	}
	else
		*szFilename = 0;

	if (strlen (szFilename) + strlen (DETECTION_DATA_FILENAME) >= sizeof(szFilename))
		return ERROR_INSUFFICIENT_BUFFER;

	strcat (szFilename, DETECTION_DATA_FILENAME);

	m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ, 
					NULL, OPEN_EXISTING, 
					FILE_ATTRIBUTE_NORMAL, NULL);

	if (m_hf == INVALID_HANDLE_VALUE)
	{
		m_hf = NULL;			// back to the constructor's "no handle" value
		return E_FAIL;
	}

	if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
		hr = GetLastError();
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	// Virtual-map the file.  The mapping is unnamed, so the ANSI entry
	// point behaves the same on every platform and no OS version test is
	// needed.

	m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);

	if (m_hmap == NULL) {
		hr = GetLastError();
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
		hr = GetLastError();
		CloseHandle (m_hmap);
		m_hmap = NULL;
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	// Build the in-memory structures from the file.  If this fails the
	// view and handles stay open; the destructor releases them.

	hr = BuildState (nFileSize);

	return hr;
}

/****************************************************************/
Add source files 2020-09-26 03:20:57 -05:00			`/*`
			`* Automatic language and codepage detector`
			`*`
			`* Bob Powell, 2/97`
			`* Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.`
			`*`
			`* History: 1-Feb-97 BobP Created`
			`* 5-Aug-97 BobP Unicode support; Charmaps in data file.`
			`*/`
			`#include "private.h"`
			`/****************************************************************/`



			`Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)`
			`: m_nDimensionality((UCHAR)pHS->m_dwDimensionality),`
			`m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),`
			`m_nCodePage((USHORT)pHS->m_dwCodePage),`
			`m_pMap(pMap),`
			`m_panElts((HElt *)&pHS[1]) // table follows header struct in the file`
			`{`
			`// #elements = #unique character values ^ #dimensions`

			`m_nElts = 1;`
			`for (UCHAR i = 0; i < m_nDimensionality; i++)`
			`m_nElts *= m_nEdgeSize;`
			`}`

			`DWORD`
			`Histogram::Validate (DWORD nBytes) const`
			`{`
			`if ( nBytes < m_nElts * sizeof(HElt) \|\|`
			`m_nDimensionality > 4 )`
			`{`
			`return ERROR_INTERNAL_DB_CORRUPTION;`
			`}`

			`return NO_ERROR;`
			`}`

			`Histogram::Histogram (const Histogram &H, const PHIdx pMap)`
			`: m_nDimensionality(H.m_nDimensionality),`
			`m_nEdgeSize(H.m_nEdgeSize),`
			`m_nCodePage(H.m_nCodePage),`
			`m_nElts(H.m_nElts),`
			`m_pMap(pMap),`
			`m_panElts(H.m_panElts)`
			`//`
			`// Clone a histogram but use a different Charmap.`
			`{`
			`}`

			`Histogram::~Histogram (void)`
			`//`
			`// The pointer members point to the mapped file and do not need to be freed.`
			`{`
			`}`

			`/****************************************************************/`

			`Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)`
			`: m_pLC(pL),`
			`m_nLangID(nLangID),`
			`m_nCodePages(nCodePages),`
			`m_nRangeID(nRangeID)`
			`{`
			`}`

			`Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)`
			`: Language(pL, nLangID, nCodePages),`
			`m_pLangHistogram(NULL)`
			`{`
			`memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));`
			`}`

			`Language7Bit::~Language7Bit (void)`
			`{`
			`if (m_pLangHistogram)`
			`delete m_pLangHistogram;`

			`for (int i = 0; i < MAXSUBLANG; i++)`
			`if (m_ppCodePageHistogram[i])`
			`delete m_ppCodePageHistogram[i];`
			`}`

			`DWORD`
			`Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)`
			`//`
			`// Add the raw histogram at *pHS in the mapped file to this language object.`
			`// The histograms must be for 7-bit detection.`
			`{`
			`DWORD hr = NO_ERROR;`

			`PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );`

			`if (nIdx == 0)`
			`{`
			`// The first histogram for a language is its language-detection table.`

			`if ( (m_pLangHistogram = new Histogram (pHS, pMap)) == NULL)`
			`return ERROR_OUTOFMEMORY;`

			`if ((hr = m_pLangHistogram->Validate (nBytes)) != NO_ERROR)`
			`return hr;`
			`}`
			`else`
			`{`
			`// Each subsequent histogram is a code page detection table.`

			`if (nIdx - 1 >= m_nCodePages)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`Histogram *pH;`

			`if ((pH = new Histogram (pHS, pMap)) == NULL)`
			`return ERROR_OUTOFMEMORY;`

			`if ((hr = pH->Validate (nBytes)) != NO_ERROR)`
			`return hr;`

			`m_ppCodePageHistogram[nIdx - 1] = pH;`

			`// Cache for the scoring vector math`

			`m_paHElt[nIdx - 1] = pH->Array();`
			`}`

			`return hr;`
			`}`

			`/****************************************************************/`

			`Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)`
			`: Language(pL, nLangID, nCodePages)`
			`{`
			`memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));`
			`}`

			`Language8Bit::~Language8Bit (void)`
			`{`
			`for (int i = 0; i < MAXSUBLANG; i++)`
			`if (m_ppHistogram[i])`
			`delete m_ppHistogram[i];`
			`}`

			`DWORD`
			`Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)`
			`//`
			`// Add the raw histogram at *pHS to this language object.`
			`// This language is known to use 8-bit detection.`
			`{`
			`DWORD hr = NO_ERROR;`

			`PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );`

			`// The histograms are the direct language-code page tables`

			`if (nIdx >= m_nCodePages)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`Histogram *pH;`

			`if ((pH = new Histogram (pHS, pMap)) == NULL)`
			`return ERROR_OUTOFMEMORY;`

			`if ((hr = pH->Validate (nBytes)) != NO_ERROR)`
			`return hr;`

			`m_ppHistogram[nIdx] = pH;`

			`return hr;`
			`}`

			`/****************************************************************/`

			`LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,`
			`int nSubLangs, int nRangeID)`
			`: Language(pL, nLangID, nSubLangs, nRangeID)`
			`{`
			`memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));`
			`}`

			`LanguageUnicode::~LanguageUnicode (void)`
			`{`
			`for (int i = 0; i < MAXSUBLANG; i++)`
			`if (m_ppSubLangHistogram[i])`
			`delete m_ppSubLangHistogram[i];`
			`}`

			`DWORD`
			`LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)`
			`{`
			`DWORD hr = NO_ERROR;`

			`// All histograms for are sublanguage detection`

			`if (nIdx >= m_nSubLangs)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`// Get the custom charmap used for scoring this sublanguage group`

			`PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );`

			`Histogram *pH;`

			`if ((pH = new Histogram (pHS, pMap)) == NULL)`
			`return ERROR_OUTOFMEMORY;`

			`if ((hr = pH->Validate (nBytes)) != NO_ERROR)`
			`return hr;`

			`m_ppSubLangHistogram[nIdx] = pH;`

			`m_paHElt[nIdx] = pH->Array();`

			`return hr;`
			`}`

			`/****************************************************************/`

			`LCDetect::LCDetect (HMODULE hM)`
			`: m_hModule(hM),`
			`m_nCharmaps(0),`
			`m_n7BitLanguages(0),`
			`m_n8BitLanguages(0),`
			`m_nUnicodeLanguages(0),`
			`m_n7BitLangsRead(0),`
			`m_n8BitLangsRead(0),`
			`m_nUnicodeLangsRead(0),`
			`m_nMapsRead(0),`
			`m_nHistogramsRead(0),`
			`m_nScoreIdx(0),`
			`m_pp7BitLanguages(NULL),`
			`m_pp8BitLanguages(NULL),`
			`m_ppUnicodeLanguages(NULL),`
			`m_ppCharmaps(NULL),`
			`m_pv(NULL),`
			`m_hmap(0),`
			`m_hf(0),`
			`m_pHU27Bit(0)`
			`{`
			`}`

			`LCDetect::~LCDetect ()`
			`{`
			`delete m_pHU27Bit;`

			`for (unsigned int i = 0; i < m_n7BitLanguages; i++)`
			`delete m_pp7BitLanguages[i];`
			`delete m_pp7BitLanguages;`

			`for (i = 0; i < m_n8BitLanguages; i++)`
			`delete m_pp8BitLanguages[i];`
			`delete m_pp8BitLanguages;`

			`for (i = 0; i < m_nUnicodeLanguages; i++)`
			`delete m_ppUnicodeLanguages[i];`
			`delete m_ppUnicodeLanguages;`

			`for (i = 0; i < m_nCharmaps; i++)`
			`delete m_ppCharmaps[i];`
			`delete m_ppCharmaps;`

			`if (m_pv)`
			`UnmapViewOfFile (m_pv);`

			`CloseHandle (m_hmap);`
			`CloseHandle (m_hf);`
			`}`

			`DWORD`
			`LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)`
			`//`
			`// Set *ppL to the Language object created from this section.`
			`{`
			`// nRecordCount is lang histogram (1) + # of code page histograms`

			`if ( m_n7BitLangsRead >= m_n7BitLanguages \|\| pLS->m_dwRecordCount < 1)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);`

			`if (pL == NULL)`
			`return ERROR_OUTOFMEMORY;`


			`// Each 7-bit lang uses one score index slot per code page.`
			`// The range starts with the 7-bit langs, since both the 8-bit`
			`// and Unicode langs follow it.`

			`if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)`
			`return ERROR_INTERNAL_DB_CORRUPTION;;`

			`pL->SetScoreIdx(m_nScoreIdx);`

			`m_nScoreIdx += pLS->m_dwRecordCount - 1; // skip 1st record (Language)`

			`m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;`

			`*ppL = pL;`

			`return NO_ERROR;`
			`}`

			`DWORD`
			`LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)`
			`//`
			`// Set *ppL to the Language object created from this section.`
			`{`
			`// nRecordCount is # of combined language / code page histograms`

			`if ( m_n8BitLangsRead >= m_n8BitLanguages \|\| pLS->m_dwRecordCount < 1)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);`

			`if (pL == NULL)`
			`return ERROR_OUTOFMEMORY;`


			`// The 8-bit score indices follow the 7-bit languages`

			`// Each 8-bit lang uses a score index slot for each of its code pages,`
			`// since all the code pages are scored in the initial scoring pass.`
			`// The number of slots is the number of code page histograms, which is`
			`// one less than the number of records following this language.`

			`pL->SetScoreIdx(m_nScoreIdx);`
			`m_nScoreIdx += pLS->m_dwRecordCount;`


			`m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;`

			`*ppL = pL;`

			`return NO_ERROR;`
			`}`

			`DWORD`
			`LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)`
			`//`
			`// Set *ppL to the Language object created from this section.`
			`{`
			`// nRecordCount is # of sublanguage histograms`

			`if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages \|\|`
			`pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )`
			`{`
			`return ERROR_INTERNAL_DB_CORRUPTION;`
			`}`

			`PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,`
			`pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);`

			`if (pL == NULL)`
			`return ERROR_OUTOFMEMORY;`


			`// The Unicode score indices follow the 7-bit languages, and overlay the`
			`// 8-bit slots since they aren't used at the same time.`

			`if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)`
			`m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();`

			`// Each Unicode entry uses exactly one score index. SBCS subdetection`
			`// (Latin group) uses the slots for the corresponding 7-bit languages,`
			`// and Unicode subdetection (CJK) uses the slots already defined for the`
			`// Unicode sub-languages.`

			`pL->SetScoreIdx(m_nScoreIdx);`

			`m_nScoreIdx++;`

			`// For Unicode, the range ID is used as the Language array index.`

			`m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;`
			`m_nUnicodeLangsRead++;`

			`*ppL = pL;`

			`return NO_ERROR;`
			`}`

			`DWORD`
			`LCDetect::LoadLanguageSection (void pv, int nSectionSize, PLanguage ppL)`
			`//`
			`// A language section begins the definition of data for a language.`
			`// Each language has exactly one of these records. One or more`
			`// histogram sections follow each language, and are always associated`
			`// with the language of the preceding language section.`
			`//`
			`// Set *ppL to the Language object created from this section.`
			`{`
			`DWORD hr = NO_ERROR;`

			`PFileLanguageSection pLS;`

			`pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];`

			`switch ( pLS->m_dwDetectionType ) {`

			`case DETECT_7BIT:`
			`hr = Initialize7BitLanguage (pLS, ppL);`
			`break;`

			`case DETECT_8BIT:`
			`hr = Initialize8BitLanguage (pLS, ppL);`
			`break;`

			`case DETECT_UNICODE:`
			`hr = InitializeUnicodeLanguage (pLS, ppL);`
			`break;`
			`}`

			`return hr;`
			`}`

			`DWORD`
			`LCDetect::LoadHistogramSection (void pv, int nSectionSize, Language pL)`
			`{`
			`PFileHistogramSection pHS;`

			`pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];`

			`int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pHS);`

			`return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);`
			`}`

			`DWORD`
			`LCDetect::LoadMapSection (void *pv, int nSectionSize)`
			`{`
			`PFileMapSection pMS;`

			`pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];`

			`int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pMS);`

			`if (m_nMapsRead >= m_nCharmaps)`
			`return ERROR_INTERNAL_DB_CORRUPTION;`

			`PCharmap pM = new Charmap (pMS);`

			`if (pM == NULL)`
			`return ERROR_OUTOFMEMORY;`

			`m_ppCharmaps[ m_nMapsRead++ ] = pM;`

			`return NO_ERROR;`
			`}`

			`DWORD`
			`LCDetect::BuildState (DWORD nFileSize)`
			`//`
			`// Build the detection structures from the mapped training file image at *m_pv`
			`{`
			`PLanguage pL;`
			`PFileHeader pFH;`
			`PFileSection pFS;`

			`DWORD hr = NO_ERROR;`

			`// Validate header`

			`pFH = (PFileHeader) m_pv;`

			`if ( nFileSize < sizeof(*pFH) \|\|`
			`pFH->m_dwAppSig != APP_SIGNATURE \|\|`
			`pFH->m_dwVersion != APP_VERSION \|\|`
			`pFH->m_dwHdrSizeBytes >= nFileSize \|\|`
			`pFH->m_dwN7BitLanguages == 0 \|\|`
			`pFH->m_dwN8BitLanguages == 0 \|\|`
			`pFH->m_dwNUnicodeLanguages == 0 \|\|`
			`pFH->m_dwNCharmaps == 0 )`
			`{`
			`return ERROR_INTERNAL_DB_CORRUPTION;`
			`}`

			`// Allocate language pointer table per header`

			`m_n7BitLanguages = pFH->m_dwN7BitLanguages;`
			`m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];`

			`m_n8BitLanguages = pFH->m_dwN8BitLanguages;`
			`m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];`

			`m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;`
			`m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];`

			`m_nCharmaps = pFH->m_dwNCharmaps;`
			`m_ppCharmaps = new PCharmap [m_nCharmaps];`

			`if ( m_pp7BitLanguages == NULL \|\|`
			`m_pp8BitLanguages == NULL \|\|`
			`m_ppUnicodeLanguages == NULL \|\|`
			`m_ppCharmaps == NULL )`
			`{`
			`return ERROR_OUTOFMEMORY;`
			`}`

			`// Clear, because not all slots may be assigned`
			`memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * m_nUnicodeLanguages);`

			`// Remember other header info`

			`m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;`
			`m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;`
			`m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;`
			`m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;`
			`m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;`
			`m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;`

			`// Position to first section`

			`pFS = (PFileSection) &((char *)m_pv)[pFH->m_dwHdrSizeBytes];`

			`// Read and process each file section`

			`while ( hr == NO_ERROR ) {`

			`// check alignment`

			`if (((DWORD_PTR)pFS & 3) != 0) {`
			`hr = ERROR_INTERNAL_DB_CORRUPTION;`
			`break;`
			`}`

			`// zero-length section marks end of data`

			`if (pFS->m_dwSizeBytes == 0)`
			`break;`

			`if ( &((char )pFS)[pFS->m_dwSizeBytes] >= &((char )m_pv)[nFileSize]) {`
			`hr = ERROR_INTERNAL_DB_CORRUPTION;`
			`break;`
			`}`

			`switch ( pFS->m_dwType ) {`

			`case SECTION_TYPE_LANGUAGE: // sets pL`
			`hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);`
			`m_nHistogramsRead = 0;`
			`break;`

			`case SECTION_TYPE_HISTOGRAM: // uses pL`
			`hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);`
			`break;`

			`case SECTION_TYPE_MAP:`
			`hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);`
			`break;`

			`default: // ignore unrecognized sections`
			`break;`
			`}`

			`pFS = (PFileSection) &((char *)pFS)[pFS->m_dwSizeBytes];`
			`}`

			`if (hr != NO_ERROR)`
			`return hr;`

			`if ( m_nMapsRead != m_nCharmaps )`
			`return ERROR_INTERNAL_DB_CORRUPTION;`


			`// Set up quick-reference arrays used by the scoring inner loops`

			`for (unsigned int i = 0; i < GetN7BitLanguages(); i++)`
			`m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();`

			`m_nHElt8Bit = 0;`
			`for (i = 0; i < GetN8BitLanguages(); i++)`
			`{`
			`PLanguage8Bit pL = Get8BitLanguage(i);`

			`for (int j = 0; j < pL->NCodePages(); j++)`
			`m_paHElt8Bit[m_nHElt8Bit++] = pL->GetHistogram(j)->Array();`
			`}`

			`// Set up the Histogram used for ScoreVectorW() for scoring Unicode`
			`// text for 7-bit language detection. Clone the first 7-bit language`
			`// histogram and replace its map with CHARMAP_U27BIT.`

			`m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),`
			`GetMap(CHARMAP_U27BIT));`

			`return hr;`
			`}`


			`DWORD`
			`LCDetect::LoadState (void)`
			`//`
			`// Overall initialization and state loading. Open the compiled training`
			`// file from its fixed location in the System32 directory, and assemble`
			`// in-memory detection tables from its contents.`
			`{`
			`DWORD hr = NO_ERROR;`
			`DWORD nFileSize;`
			`#define MODULENAMELEN 100`
			`char szFilename[MODULENAMELEN+50], *p;`

			`// Find out if NT or Windows`

			`OSVERSIONINFOA OSVersionInfo;`
			`int nOSWinNT = 0;`
			`OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA );`
			`if ( GetVersionExA( &OSVersionInfo ) )`
			`nOSWinNT = OSVersionInfo.dwPlatformId;`

			`// Open the training data file,`
			`// look in the directory that contains the DLL.`

			`if (GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN) == 0)`
			`return GetLastError();`

			`if ( (p = strrchr (szFilename, '\\')) != NULL \|\|`
			`(p = strrchr (szFilename, ':')) != NULL )`
			`{`
			`*++p = 0;`
			`}`
			`else`
			`*szFilename = 0;`
			`strcat (szFilename, DETECTION_DATA_FILENAME);`

			`if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,`
			`NULL, OPEN_EXISTING,`
			`FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE)`
			`{`
			`return E_FAIL;`
			`}`

			`if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {`
			`hr = GetLastError();`
			`CloseHandle (m_hf);`
			`return hr;`
			`}`

			`// Virtual-map the file`

			`if ( nOSWinNT == VER_PLATFORM_WIN32_NT )`
			`m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);`
			`else`
			`m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);`

			`if (m_hmap == NULL) {`
			`hr = GetLastError();`
			`CloseHandle (m_hf);`
			`return hr;`
			`}`

			`if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {`
			`hr = GetLastError();`
			`CloseHandle (m_hmap);`
			`CloseHandle (m_hf);`
			`return hr;`
			`}`

			`// Build the in-memory structures from the file`

			`hr = BuildState (nFileSize);`

			`return hr;`
			`}`

			`/****************************************************************/`