windows-nt/Source/XPSP1/NT/shell/ext/mlang/lcinit.cpp

/*
 * Automatic language and codepage detector
 *
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp.  All rights reserved.
 *
 *  History:    1-Feb-97    BobP      Created
 *              5-Aug-97    BobP      Unicode support; Charmaps in data file.
 */
#include "private.h"
/****************************************************************/


Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
: m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
  m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
  m_nCodePage((USHORT)pHS->m_dwCodePage),
  m_pMap(pMap),
  m_panElts((HElt *)&pHS[1])	// element table starts right after the header struct
//
// Wrap the raw histogram section at *pHS.  Nothing is copied: the element
// table pointer refers directly to the memory that follows the header, and
// pMap is remembered as the character map to use with this table.
{
	// Total element count is the edge size raised to the number of
	// dimensions, i.e. #unique character values ^ #dimensions.

	m_nElts = 1;

	for (UCHAR nDimsLeft = m_nDimensionality; nDimsLeft != 0; nDimsLeft--)
		m_nElts *= m_nEdgeSize;
}

DWORD
Histogram::Validate (DWORD nBytes) const
//
// Sanity-check this histogram against the nBytes of table data available
// to it.  Returns NO_ERROR when the table fits and the dimensionality is
// at most 4, otherwise ERROR_INTERNAL_DB_CORRUPTION.
{
	// More than four dimensions is never valid

	if (m_nDimensionality > 4)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// The whole element table must fit in the bytes supplied

	if (nBytes < m_nElts * sizeof(HElt))
		return ERROR_INTERNAL_DB_CORRUPTION;

	return NO_ERROR;
}

Histogram::Histogram (const Histogram &H, const PHIdx pMap)
: m_nDimensionality(H.m_nDimensionality),
  m_nEdgeSize(H.m_nEdgeSize),
  m_nCodePage(H.m_nCodePage),
  m_nElts(H.m_nElts),
  m_pMap(pMap),					// the only member not taken from H
  m_panElts(H.m_panElts)		// shares H's element table; no copy is made
//
// Make a histogram that is identical to H in every respect except that it
// is paired with the character map pMap instead of H's own map.
{
}

Histogram::~Histogram (void)
{
	// Nothing to release: the pointer members refer into the mapped file
	// and are not owned by this object.
}

/****************************************************************/

Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
: m_pLC(pL),
  m_nLangID(nLangID),
  m_nCodePages(nCodePages),
  m_nRangeID(nRangeID)
//
// Common base initialization: record the owning detector, the language ID,
// the number of code pages and the range ID exactly as supplied.
{
}

Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
: Language(pL, nLangID, nCodePages),
  m_pLangHistogram(NULL)
//
// Start with no language histogram and an all-zero code page histogram
// table; the histograms are attached later through AddHistogram().
{
	memset ((void *)&m_ppCodePageHistogram[0], 0, sizeof m_ppCodePageHistogram);
}

Language7Bit::~Language7Bit (void)
//
// Free the Histogram objects owned by this language.  Deleting a null
// pointer is a no-op, so empty slots need no special handling.
{
	delete m_pLangHistogram;

	for (int nSlot = 0; nSlot != MAXSUBLANG; nSlot++)
		delete m_ppCodePageHistogram[nSlot];
}

DWORD
Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS in the mapped file to this language object.
// The histograms must be for 7-bit detection.
//
//   pHS    - histogram section header; the table data follows it
//   nBytes - number of table bytes available after the header
//   nIdx   - 0 for the language-detection table, 1..m_nCodePages for the
//            code page detection tables
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.  On any
// failure nothing is installed and nothing is leaked.
{
	// Reject an unusable slot before allocating anything.  The code page
	// table is walked up to MAXSUBLANG by the destructor, so that is the
	// hard limit on the slot index regardless of what the file declares.

	if (nIdx < 0)
		return ERROR_INTERNAL_DB_CORRUPTION;

	if (nIdx > 0 && (nIdx - 1 >= m_nCodePages || nIdx - 1 >= MAXSUBLANG))
		return ERROR_INTERNAL_DB_CORRUPTION;

	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	// Validate before installing, and free the object if it is rejected;
	// otherwise it would be leaked (or left installed in a bad state).

	DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		delete pH;
		return hr;
	}

	if (nIdx == 0)
	{
		// The first histogram for a language is its language-detection table.
		// Release any previous occupant so a repeated index cannot leak.

		delete m_pLangHistogram;
		m_pLangHistogram = pH;
	}
	else
	{
		// Each subsequent histogram is a code page detection table.

		delete m_ppCodePageHistogram[nIdx - 1];
		m_ppCodePageHistogram[nIdx - 1] = pH;

		// Cache for the scoring vector math

		m_paHElt[nIdx - 1] = pH->Array();
	}

	return NO_ERROR;
}

/****************************************************************/

Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
: Language(pL, nLangID, nCodePages)
//
// Start with an all-zero histogram table; the entries are attached later
// through AddHistogram().
{
	memset ((void *)&m_ppHistogram[0], 0, sizeof m_ppHistogram);
}

Language8Bit::~Language8Bit (void)
//
// Free every Histogram object in the table.  Deleting a null pointer is a
// no-op, so empty slots need no special handling.
{
	for (int nSlot = 0; nSlot != MAXSUBLANG; nSlot++)
		delete m_ppHistogram[nSlot];
}

DWORD
Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object.
// This language is known to use 8-bit detection.
//
//   pHS    - histogram section header; the table data follows it
//   nBytes - number of table bytes available after the header
//   nIdx   - zero-based index of the language/code page table being added
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.  On any
// failure nothing is installed and nothing is leaked.
{
	// The histograms are the direct language-code page tables.  The table
	// is walked up to MAXSUBLANG by the destructor, so that is the hard
	// limit on the slot index regardless of what the file declares.

	if (nIdx < 0 || nIdx >= m_nCodePages || nIdx >= MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	// Free the object if validation rejects it; it is not yet owned by the
	// table, so nobody else would.

	DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		delete pH;
		return hr;
	}

	// Release any previous occupant so a repeated index cannot leak.

	delete m_ppHistogram[nIdx];
	m_ppHistogram[nIdx] = pH;

	return NO_ERROR;
}

/****************************************************************/

LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID,
	int nSubLangs, int nRangeID)
: Language(pL, nLangID, nSubLangs, nRangeID)
//
// The sublanguage count is handed to the base class in its code page count
// position.  Start with an all-zero sublanguage histogram table; entries
// are attached later through AddHistogram().
{
	memset ((void *)&m_ppSubLangHistogram[0], 0, sizeof m_ppSubLangHistogram);
}

LanguageUnicode::~LanguageUnicode (void)
//
// Free every sublanguage Histogram object.  Deleting a null pointer is a
// no-op, so empty slots need no special handling.
{
	for (int nSlot = 0; nSlot != MAXSUBLANG; nSlot++)
		delete m_ppSubLangHistogram[nSlot];
}

DWORD
LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this Unicode language object.
//
//   pHS    - histogram section header; the table data follows it
//   nBytes - number of table bytes available after the header
//   nIdx   - zero-based index of the sublanguage table being added
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the index is out of range or the histogram fails validation.  On any
// failure nothing is installed and nothing is leaked.
{
	// All histograms here are for sublanguage detection.  The table is
	// walked up to MAXSUBLANG by the destructor, so that is the hard limit
	// on the slot index regardless of what the file declares.

	if (nIdx < 0 || nIdx >= m_nSubLangs || nIdx >= MAXSUBLANG)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Get the custom charmap used for scoring this sublanguage group

	PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

	Histogram *pH = new Histogram (pHS, pMap);

	if (pH == NULL)
		return ERROR_OUTOFMEMORY;

	// Free the object if validation rejects it; it is not yet owned by the
	// table, so nobody else would.

	DWORD hr = pH->Validate (nBytes);

	if (hr != NO_ERROR)
	{
		delete pH;
		return hr;
	}

	// Release any previous occupant so a repeated index cannot leak.

	delete m_ppSubLangHistogram[nIdx];
	m_ppSubLangHistogram[nIdx] = pH;

	// Cache for the scoring vector math

	m_paHElt[nIdx] = pH->Array();

	return NO_ERROR;
}

/****************************************************************/

LCDetect::LCDetect (HMODULE hM)
: m_hModule(hM),

  // Counts declared by the data file header
  m_nCharmaps(0),
  m_n7BitLanguages(0),
  m_n8BitLanguages(0),
  m_nUnicodeLanguages(0),

  // Progress counters used while the data file is being read
  m_n7BitLangsRead(0),
  m_n8BitLangsRead(0),
  m_nUnicodeLangsRead(0),
  m_nMapsRead(0),
  m_nHistogramsRead(0),
  m_nScoreIdx(0),

  // Tables and file mapping state; nothing is allocated or opened yet
  m_pp7BitLanguages(NULL),
  m_pp8BitLanguages(NULL),
  m_ppUnicodeLanguages(NULL),
  m_ppCharmaps(NULL),
  m_pv(NULL),
  m_hmap(0),
  m_hf(0),
  m_pHU27Bit(0)
//
// Construct an empty detector that remembers the module handle hM.  All
// counters start at zero and all pointers and handles start out null.
{
}

LCDetect::~LCDetect ()
//
// Release everything the detector owns: the cloned Unicode-to-7-bit
// histogram, the language and charmap objects with their pointer tables,
// the file view, and the mapping and file handles.
{
    delete m_pHU27Bit;

    // The 7-bit, 8-bit and charmap tables are filled sequentially while the
    // data file is read, so only the first "read" entries of each hold
    // objects.  Walking the declared count instead could delete
    // uninitialized pointers after a load that failed part way through.
    // The tables themselves come from new[], so they need delete [].

    if (m_pp7BitLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_n7BitLangsRead; i++)
            delete m_pp7BitLanguages[i];
        delete [] m_pp7BitLanguages;
    }

    if (m_pp8BitLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_n8BitLangsRead; i++)
            delete m_pp8BitLanguages[i];
        delete [] m_pp8BitLanguages;
    }

    // The Unicode table is indexed by range ID rather than filled in order,
    // so every slot has to be visited.  NOTE(review): this relies on
    // BuildState() having cleared the table after allocating it.

    if (m_ppUnicodeLanguages != NULL)
    {
        for (unsigned int i = 0; i < m_nUnicodeLanguages; i++)
            delete m_ppUnicodeLanguages[i];
        delete [] m_ppUnicodeLanguages;
    }

    if (m_ppCharmaps != NULL)
    {
        for (unsigned int i = 0; i < m_nMapsRead; i++)
            delete m_ppCharmaps[i];
        delete [] m_ppCharmaps;
    }

    if (m_pv)
        UnmapViewOfFile (m_pv);

    // Only close handles that were actually opened.  The members start out
    // as 0, and a failed CreateFileA leaves INVALID_HANDLE_VALUE in m_hf.

    if (m_hmap != NULL)
        CloseHandle (m_hmap);

    if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
        CloseHandle (m_hf);
}

DWORD
LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
//
// Create the Language7Bit object described by the language section *pLS,
// assign its score index range, append it to the 7-bit language table and
// set *ppL to it.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the section is inconsistent with the header or with the required
// ordering.  *ppL is written only on success.
{
	// nRecordCount is lang histogram (1) + # of code page histograms

	if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// Each 7-bit lang uses one score index slot per code page.
	// The range starts with the 7-bit langs, since both the 8-bit
	// and Unicode langs follow it.
	//
	// This is checked before allocating so that a rejected section does
	// not leak the language object.

	if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount - 1);

	if (pL == NULL)
		return ERROR_OUTOFMEMORY;

	pL->SetScoreIdx(m_nScoreIdx);

	m_nScoreIdx += pLS->m_dwRecordCount - 1;	// skip 1st record (Language)

	m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;

	*ppL = pL;

	return NO_ERROR;
}

DWORD
LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Create the Language8Bit object described by the language section *pLS,
// assign its score index range, append it to the 8-bit language table and
// set *ppL to it.
{
	// The header fixes how many 8-bit languages may appear

	if (m_n8BitLangsRead >= m_n8BitLanguages)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// nRecordCount is the number of combined language / code page
	// histograms, and there has to be at least one

	if (pLS->m_dwRecordCount < 1)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PLanguage8Bit pLang = new Language8Bit (this, pLS->m_dwLangID, pLS->m_dwRecordCount);

	if (pLang == NULL)
		return ERROR_OUTOFMEMORY;

	// The 8-bit score indices follow those of the 7-bit languages.  All of
	// an 8-bit language's code pages are scored in the initial scoring
	// pass, so it takes one score index slot per histogram record of this
	// section.

	pLang->SetScoreIdx(m_nScoreIdx);
	m_nScoreIdx += pLS->m_dwRecordCount;

	m_pp8BitLanguages[m_n8BitLangsRead] = pLang;
	m_n8BitLangsRead++;

	*ppL = pLang;

	return NO_ERROR;
}

DWORD
LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Create the LanguageUnicode object described by the language section
// *pLS, assign its score index, store it in the Unicode language table at
// the slot given by its range ID, and set *ppL to it.
{
	// The header fixes how many Unicode languages may appear

	if (m_nUnicodeLangsRead >= m_nUnicodeLanguages)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// The range ID doubles as the table index, so it must be in range

	if (pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages)
		return ERROR_INTERNAL_DB_CORRUPTION;

	// nRecordCount is the number of sublanguage histograms

	PLanguageUnicode pLang = new LanguageUnicode (this, pLS->m_dwLangID,
						pLS->m_dwRecordCount, pLS->m_dwUnicodeRangeID);

	if (pLang == NULL)
		return ERROR_OUTOFMEMORY;

	// Unicode score indices come after the 7-bit languages and share the
	// slots of the 8-bit languages, because the two are never in use at the
	// same time.  So the first Unicode language rewinds the running index
	// to where the first 8-bit language starts.

	if (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0)
		m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

	// Exactly one score index per Unicode entry.  SBCS subdetection (Latin
	// group) uses the slots of the corresponding 7-bit languages, and
	// Unicode subdetection (CJK) uses the slots already defined for the
	// Unicode sub-languages.

	pLang->SetScoreIdx(m_nScoreIdx);
	m_nScoreIdx++;

	// NOTE(review): a second section with the same range ID overwrites the
	// slot without freeing the earlier object -- confirm the data file
	// never repeats a range ID.

	m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pLang;
	m_nUnicodeLangsRead++;

	*ppL = pLang;

	return NO_ERROR;
}

DWORD
LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
//
// A language section begins the definition of data for a language.
// Each language has exactly one of these records.  One or more
// histogram sections follow each language, and are always associated
// with the language of the preceding language section.
//
//   pv           - start of the section, i.e. its FileSection header
//   nSectionSize - total size of the section in bytes, header included
//   ppL          - receives the Language object created from this section
//
// Returns NO_ERROR on success.  Returns ERROR_INTERNAL_DB_CORRUPTION when
// the section is too small to hold a language record or names an unknown
// detection type; *ppL is not written in that case.
{
	DWORD hr = NO_ERROR;

	PFileLanguageSection pLS;

	pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

	// The language record is read field by field below, so the section has
	// to be large enough to contain all of it.

	if ( nSectionSize < 0 ||
		 (DWORD)nSectionSize < sizeof(FileSection) + sizeof(*pLS) )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	switch ( pLS->m_dwDetectionType ) {

	case DETECT_7BIT:
		hr = Initialize7BitLanguage (pLS, ppL);
		break;

	case DETECT_8BIT:
		hr = Initialize8BitLanguage (pLS, ppL);
		break;

	case DETECT_UNICODE:
		hr = InitializeUnicodeLanguage (pLS, ppL);
		break;

	default:
		// An unknown type creates no Language object.  Reporting success
		// here would leave *ppL unset, and the histogram sections that
		// follow would then be attached through a stale pointer.
		hr = ERROR_INTERNAL_DB_CORRUPTION;
		break;
	}

	return hr;
}

DWORD
LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
//
// Attach the histogram section at *pv to the language *pL as that
// language's next histogram.
//
//   pv           - start of the section, i.e. its FileSection header
//   nSectionSize - total size of the section in bytes, header included
//   pL           - language created by the preceding language section
//
// Returns the result of Language::AddHistogram(), or
// ERROR_INTERNAL_DB_CORRUPTION when there is no language to attach to or
// the section is too small to hold a histogram header.
{
	PFileHistogramSection pHS;

	// A histogram section is only meaningful after a language section

	if (pL == NULL)
		return ERROR_INTERNAL_DB_CORRUPTION;

	pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

	// The section must contain both headers.  Without this check the
	// subtraction below can go negative, and the negative value turns into
	// a huge byte count when it is passed on as a DWORD, which would make
	// Histogram::Validate() accept a table that is not there.

	if ( nSectionSize < 0 ||
		 (DWORD)nSectionSize < sizeof(FileSection) + sizeof(*pHS) )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	// Bytes left over for the element table itself

	int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pHS);

	return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
}

DWORD
LCDetect::LoadMapSection (void *pv, int nSectionSize)
//
// Create a Charmap from the map section at *pv and append it to the
// charmap table.
//
//   pv           - start of the section, i.e. its FileSection header
//   nSectionSize - total size of the section in bytes, header included
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the section is too small to hold a map header or the file contains more
// maps than its header declared.
{
	PFileMapSection pMS;

	pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

	// The section must at least contain both headers before the map
	// header is handed to the Charmap constructor.
	// NOTE(review): the size of the map data that follows the header is not
	// checked here; Charmap would have to validate it.

	if ( nSectionSize < 0 ||
		 (DWORD)nSectionSize < sizeof(FileSection) + sizeof(*pMS) )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	if (m_nMapsRead >= m_nCharmaps)
		return ERROR_INTERNAL_DB_CORRUPTION;

	PCharmap pM = new Charmap (pMS);

	if (pM == NULL)
		return ERROR_OUTOFMEMORY;

	m_ppCharmaps[ m_nMapsRead++ ]  = pM;

	return NO_ERROR;
}

DWORD
LCDetect::BuildState (DWORD nFileSize)
//
// Build the detection structures from the mapped training file image at *m_pv
//
//   nFileSize - size in bytes of the image at *m_pv
//
// Validates the file header, allocates the language and charmap pointer
// tables, walks the file sections to populate them, and finally sets up
// the quick-reference arrays used by the scoring loops.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION when
// the image is malformed or incomplete.
{
	PLanguage pL = NULL;		// language that owns the histogram sections being read
	PFileHeader pFH;
	PFileSection pFS;

	DWORD hr = NO_ERROR;

	// Validate header

	pFH = (PFileHeader) m_pv;

	if ( pFH == NULL ||
		 nFileSize < sizeof(*pFH) ||
		 pFH->m_dwAppSig != APP_SIGNATURE ||
		 pFH->m_dwVersion != APP_VERSION ||
		 pFH->m_dwHdrSizeBytes >= nFileSize ||
		 pFH->m_dwN7BitLanguages == 0 ||
		 pFH->m_dwN8BitLanguages == 0 ||
		 pFH->m_dwNUnicodeLanguages == 0 ||
		 pFH->m_dwNCharmaps == 0 )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	// Allocate language pointer table per header

	m_n7BitLanguages = pFH->m_dwN7BitLanguages;
	m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];

	m_n8BitLanguages = pFH->m_dwN8BitLanguages;
	m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];

	m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;
	m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];

	m_nCharmaps = pFH->m_dwNCharmaps;
	m_ppCharmaps = new PCharmap [m_nCharmaps];

	if ( m_pp7BitLanguages == NULL ||
		 m_pp8BitLanguages == NULL ||
		 m_ppUnicodeLanguages == NULL ||
		 m_ppCharmaps == NULL )
	{
		// Give back whatever was obtained and forget the counts, so that
		// the destructor never walks a missing or uninitialized table.

		delete [] m_pp7BitLanguages;
		m_pp7BitLanguages = NULL;
		m_n7BitLanguages = 0;

		delete [] m_pp8BitLanguages;
		m_pp8BitLanguages = NULL;
		m_n8BitLanguages = 0;

		delete [] m_ppUnicodeLanguages;
		m_ppUnicodeLanguages = NULL;
		m_nUnicodeLanguages = 0;

		delete [] m_ppCharmaps;
		m_ppCharmaps = NULL;
		m_nCharmaps = 0;

		return ERROR_OUTOFMEMORY;
	}

	// Clear every table.  Not all Unicode slots may be assigned, and if the
	// load fails part way through, the unfilled slots of the other tables
	// must not look like object pointers to the destructor.

	memset (m_pp7BitLanguages, 0, sizeof(PLanguage7Bit) * m_n7BitLanguages);
	memset (m_pp8BitLanguages, 0, sizeof(PLanguage8Bit) * m_n8BitLanguages);
	memset (m_ppUnicodeLanguages, 0, sizeof(PLanguageUnicode) * m_nUnicodeLanguages);
	memset (m_ppCharmaps, 0, sizeof(PCharmap) * m_nCharmaps);

	// Remember other header info

	m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
	m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
	m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
	m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
	m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
	m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;

	// Position to first section.  The position is tracked as an offset
	// that always stays below nFileSize, so a bogus section size cannot
	// wrap the address computation around.

	DWORD nOffset = pFH->m_dwHdrSizeBytes;

	// Read and process each file section

	while ( hr == NO_ERROR ) {

		pFS = (PFileSection) &((char *)m_pv)[nOffset];

		// check alignment

		if (((DWORD_PTR)pFS & 3) != 0) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		// the size field itself must lie inside the file before it is read

		if ( (char *)(&pFS->m_dwSizeBytes + 1) > &((char *)m_pv)[nFileSize] ) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		// zero-length section marks end of data

		if (pFS->m_dwSizeBytes == 0)
			break;

		// A real section includes its own header and must end before the
		// end of the file, leaving room for the next section header.

		if ( pFS->m_dwSizeBytes < sizeof(FileSection) ||
			 pFS->m_dwSizeBytes >= nFileSize - nOffset ) {
			hr = ERROR_INTERNAL_DB_CORRUPTION;
			break;
		}

		switch ( pFS->m_dwType ) {

		case SECTION_TYPE_LANGUAGE:								// sets pL
			pL = NULL;
			hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);
			m_nHistogramsRead = 0;
			break;

		case SECTION_TYPE_HISTOGRAM:							// uses pL
			// A histogram with no language to belong to is malformed data
			if (pL == NULL)
				hr = ERROR_INTERNAL_DB_CORRUPTION;
			else
				hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);
			break;

		case SECTION_TYPE_MAP:
			hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);
			break;

		default:					// ignore unrecognized sections
			break;
		}

		nOffset += pFS->m_dwSizeBytes;
	}

	if (hr != NO_ERROR)
		return hr;

	// Everything that is dereferenced below must really have been read.
	// The Unicode table is exempt because its slots may legitimately stay
	// unassigned.

	if ( m_nMapsRead != m_nCharmaps ||
		 m_n7BitLangsRead != m_n7BitLanguages ||
		 m_n8BitLangsRead != m_n8BitLanguages )
	{
		return ERROR_INTERNAL_DB_CORRUPTION;
	}

	// Set up quick-reference arrays used by the scoring inner loops
	// NOTE(review): the capacities of m_paHElt7Bit and m_paHElt8Bit are not
	// visible here -- confirm they cover the counts a data file can declare.

	for (unsigned int i = 0; i < GetN7BitLanguages(); i++)
	{
		// a 7-bit language that never received its language histogram
		if (Get7BitLanguage(i)->GetLangHistogram() == NULL)
			return ERROR_INTERNAL_DB_CORRUPTION;

		m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
	}

	m_nHElt8Bit = 0;
	for (unsigned int k = 0; k < GetN8BitLanguages(); k++)
	{
		PLanguage8Bit pL8 = Get8BitLanguage(k);

		for (int j = 0; j < pL8->NCodePages(); j++)
		{
			// fewer histograms were read than the language declared
			if (pL8->GetHistogram(j) == NULL)
				return ERROR_INTERNAL_DB_CORRUPTION;

			m_paHElt8Bit[m_nHElt8Bit++] = pL8->GetHistogram(j)->Array();
		}
	}

	// Set up the Histogram used for ScoreVectorW() for scoring Unicode
	// text for 7-bit language detection.  Clone the first 7-bit language
	// histogram and replace its map with CHARMAP_U27BIT.

	m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
								 GetMap(CHARMAP_U27BIT));

	if (m_pHU27Bit == NULL)
		return ERROR_OUTOFMEMORY;

	return hr;
}


DWORD
LCDetect::LoadState (void)
//
// Overall initialization and state loading.  Open the compiled training
// file from the directory that contains this module, map it into memory,
// and assemble the in-memory detection tables from its contents.
//
// Returns NO_ERROR on success.  On failure returns a Win32 error code, or
// E_FAIL when the data file cannot be opened.  Handles that are closed on
// a failure path are also cleared in the object, so the destructor does
// not close them a second time.
{
	DWORD hr = NO_ERROR;
	DWORD nFileSize;
#define MODULENAMELEN 100		// no longer used here; kept in case later code refers to it
	char szFilename[MAX_PATH+50], *p;

	// Open the training data file,
	// look in the directory that contains the DLL.

	DWORD cchPath = GetModuleFileNameA (m_hModule, szFilename, MAX_PATH);

	if (cchPath == 0)
		return GetLastError();

	// A return value equal to the buffer size means the path was truncated,
	// and a truncated path is not guaranteed to be null-terminated.

	if (cchPath >= MAX_PATH)
		return ERROR_INSUFFICIENT_BUFFER;

	if ( (p = strrchr (szFilename, '\\')) != NULL ||
		 (p = strrchr (szFilename, ':')) != NULL )
	{
		*++p = 0;
	}
	else
		*szFilename = 0;

	// Make sure the data file name fits before appending it

	if (strlen (szFilename) + strlen (DETECTION_DATA_FILENAME) >= sizeof(szFilename))
		return ERROR_INSUFFICIENT_BUFFER;

	strcat (szFilename, DETECTION_DATA_FILENAME);

	if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,
					NULL, OPEN_EXISTING,
					FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE)
	{
		m_hf = NULL;			// nothing for the destructor to close
		return E_FAIL;
	}

	if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff) {
		hr = GetLastError();
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	// Virtual-map the file.  The mapping is unnamed, so the ANSI entry
	// point can be used directly with no need to check the OS platform.

	m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);

	if (m_hmap == NULL) {
		hr = GetLastError();
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL) {
		hr = GetLastError();
		CloseHandle (m_hmap);
		m_hmap = NULL;
		CloseHandle (m_hf);
		m_hf = NULL;
		return hr;
	}

	// Build the in-memory structures from the file.  If this fails, the
	// view and the handles are left for the destructor to release.

	hr = BuildState (nFileSize);

	return hr;
}

/****************************************************************/