/*
 * Automatic language and codepage detector
 *
 * Bob Powell, 2/97
 * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved.
 *
 * History: 1-Feb-97 BobP Created
 *          5-Aug-97 BobP Unicode support; Charmaps in data file.
 */

#include "private.h"

/****************************************************************/

Histogram::Histogram (const PFileHistogramSection pHS, const PHIdx pMap)
    : m_nDimensionality((UCHAR)pHS->m_dwDimensionality),
      m_nEdgeSize((UCHAR)pHS->m_dwEdgeSize),
      m_nCodePage((USHORT)pHS->m_dwCodePage),
      m_pMap(pMap),
      m_panElts((HElt *)&pHS[1])    // table follows header struct in the file
//
// Wrap the raw histogram section at *pHS.  The element table is not copied;
// m_panElts points directly at the data that follows the section header.
// Call Validate() before using the table.
{
    // #elements = #unique character values ^ #dimensions
    m_nElts = 1;
    for (UCHAR i = 0; i < m_nDimensionality; i++)
        m_nElts *= m_nEdgeSize;
}

DWORD
Histogram::Validate (DWORD nBytes) const
//
// Check that the histogram described by the header fits in the nBytes of
// table data that follow it.
//
// Returns NO_ERROR, or ERROR_INTERNAL_DB_CORRUPTION if the dimensionality
// exceeds 4 or the table would extend past nBytes.
{
    // Reject absurd dimensionality first; m_nElts is only meaningful when
    // the dimensionality is within range.
    if (m_nDimensionality > 4)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Compare by division rather than multiplying m_nElts by the element
    // size, so a very large element count cannot overflow and slip through.
    if (nBytes / sizeof(HElt) < m_nElts)
        return ERROR_INTERNAL_DB_CORRUPTION;

    return NO_ERROR;
}

Histogram::Histogram (const Histogram &H, const PHIdx pMap)
    : m_nDimensionality(H.m_nDimensionality),
      m_nEdgeSize(H.m_nEdgeSize),
      m_nCodePage(H.m_nCodePage),
      m_nElts(H.m_nElts),
      m_pMap(pMap),
      m_panElts(H.m_panElts)
//
// Clone a histogram but use a different Charmap.
// The element table is shared with H, not copied.
{
}

Histogram::~Histogram (void)
//
// The pointer members point to the mapped file and do not need to be freed.
{
}

/****************************************************************/

Language::Language (PLCDetect pL, int nLangID, int nCodePages, int nRangeID)
    : m_pLC(pL),
      m_nLangID(nLangID),
      m_nCodePages(nCodePages),
      m_nRangeID(nRangeID)
//
// Record the owning detector and the identifying values for this language.
{
}

Language7Bit::Language7Bit (PLCDetect pL, int nLangID, int nCodePages)
    : Language(pL, nLangID, nCodePages),
      m_pLangHistogram(NULL)
//
// Start with no language histogram and an empty code page histogram table.
{
    memset ((void *)m_ppCodePageHistogram, 0, sizeof(m_ppCodePageHistogram));
}

Language7Bit::~Language7Bit (void)
//
// Free the Histogram objects owned by this language.
{
    if (m_pLangHistogram)
        delete m_pLangHistogram;

    for (int i = 0; i < MAXSUBLANG; i++)
        if (m_ppCodePageHistogram[i])
            delete m_ppCodePageHistogram[i];
}

DWORD
Language7Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS in the mapped file to this language object.
// The histograms must be for 7-bit detection.
//
// nBytes is the amount of table data following the histogram header, and
// nIdx is the zero-based position of this histogram within the language.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
{
    DWORD hr = NO_ERROR;

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    if (nIdx == 0)
    {
        // The first histogram for a language is its language-detection table.
        // It is owned by this object as soon as it is assigned, so the
        // destructor frees it even when validation fails.

        if ((m_pLangHistogram = new Histogram (pHS, pMap)) == NULL)
            return ERROR_OUTOFMEMORY;

        if ((hr = m_pLangHistogram->Validate (nBytes)) != NO_ERROR)
            return hr;
    }
    else
    {
        // Each subsequent histogram is a code page detection table.

        const int nSlot = nIdx - 1;

        // The slot must be within both the count declared for this language
        // and the fixed size of the histogram pointer table.
        if (nSlot < 0 || nSlot >= m_nCodePages || nSlot >= MAXSUBLANG)
            return ERROR_INTERNAL_DB_CORRUPTION;

        Histogram *pH;

        if ((pH = new Histogram (pHS, pMap)) == NULL)
            return ERROR_OUTOFMEMORY;

        if ((hr = pH->Validate (nBytes)) != NO_ERROR)
        {
            // Not stored anywhere yet, so it must be freed here.
            delete pH;
            return hr;
        }

        m_ppCodePageHistogram[nSlot] = pH;

        // Cache for the scoring vector math
        // NOTE(review): assumes m_paHElt has at least MAXSUBLANG entries -- confirm.
        m_paHElt[nSlot] = pH->Array();
    }

    return hr;
}

/****************************************************************/

Language8Bit::Language8Bit (PLCDetect pL, int nLangID, int nCodePages)
    : Language(pL, nLangID, nCodePages)
//
// Start with an empty histogram table.
{
    memset ((void *)m_ppHistogram, 0, sizeof(m_ppHistogram));
}

Language8Bit::~Language8Bit (void)
//
// Free the Histogram objects owned by this language.
{
    for (int i = 0; i < MAXSUBLANG; i++)
        if (m_ppHistogram[i])
            delete m_ppHistogram[i];
}

DWORD
Language8Bit::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object.
// This language is known to use 8-bit detection.
//
// nBytes is the amount of table data following the histogram header, and
// nIdx is the zero-based position of this histogram within the language.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
{
    DWORD hr = NO_ERROR;

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    // The histograms are the direct language-code page tables.
    // The index must be within both the count declared for this language
    // and the fixed size of the histogram pointer table.

    if (nIdx < 0 || nIdx >= m_nCodePages || nIdx >= MAXSUBLANG)
        return ERROR_INTERNAL_DB_CORRUPTION;

    Histogram *pH;

    if ((pH = new Histogram (pHS, pMap)) == NULL)
        return ERROR_OUTOFMEMORY;

    if ((hr = pH->Validate (nBytes)) != NO_ERROR)
    {
        // Not stored anywhere yet, so it must be freed here.
        delete pH;
        return hr;
    }

    m_ppHistogram[nIdx] = pH;

    return hr;
}

/****************************************************************/

LanguageUnicode::LanguageUnicode (PLCDetect pL, int nLangID, int nSubLangs, int nRangeID)
    : Language(pL, nLangID, nSubLangs, nRangeID)
//
// Start with an empty sublanguage histogram table.
{
    memset ((void *)m_ppSubLangHistogram, 0, sizeof(m_ppSubLangHistogram));
}

LanguageUnicode::~LanguageUnicode (void)
//
// Free the Histogram objects owned by this language.
{
    for (int i = 0; i < MAXSUBLANG; i++)
        if (m_ppSubLangHistogram[i])
            delete m_ppSubLangHistogram[i];
}

DWORD
LanguageUnicode::AddHistogram (PFileHistogramSection pHS, DWORD nBytes, int nIdx)
//
// Add the raw histogram at *pHS to this language object as the
// sublanguage detection table at position nIdx.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
{
    DWORD hr = NO_ERROR;

    // All histograms are for sublanguage detection.
    // The index must be within both the declared sublanguage count and the
    // fixed size of the histogram pointer table.

    if (nIdx < 0 || nIdx >= m_nSubLangs || nIdx >= MAXSUBLANG)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Get the custom charmap used for scoring this sublanguage group

    PHIdx pMap = m_pLC->GetMap( pHS->m_dwMappingID );

    Histogram *pH;

    if ((pH = new Histogram (pHS, pMap)) == NULL)
        return ERROR_OUTOFMEMORY;

    if ((hr = pH->Validate (nBytes)) != NO_ERROR)
    {
        // Not stored anywhere yet, so it must be freed here.
        delete pH;
        return hr;
    }

    m_ppSubLangHistogram[nIdx] = pH;

    // NOTE(review): assumes m_paHElt has at least MAXSUBLANG entries -- confirm.
    m_paHElt[nIdx] = pH->Array();

    return hr;
}

/****************************************************************/

LCDetect::LCDetect (HMODULE hM)
    : m_hModule(hM),
      m_nCharmaps(0),
      m_n7BitLanguages(0),
      m_n8BitLanguages(0),
      m_nUnicodeLanguages(0),
      m_n7BitLangsRead(0),
      m_n8BitLangsRead(0),
      m_nUnicodeLangsRead(0),
      m_nMapsRead(0),
      m_nHistogramsRead(0),
      m_nScoreIdx(0),
      m_pp7BitLanguages(NULL),
      m_pp8BitLanguages(NULL),
      m_ppUnicodeLanguages(NULL),
      m_ppCharmaps(NULL),
      m_pv(NULL),
      m_hmap(0),
      m_hf(0),
      m_pHU27Bit(0)
//
// Construct an empty detector.  Nothing is loaded until LoadState().
{
}

LCDetect::~LCDetect ()
//
// Free everything built by BuildState() and release the mapped file.
//
// The pointer tables are allocated with new[] and are zero-filled by
// BuildState(), so entries never assigned during a failed or partial load
// are NULL and safe to delete.
{
    unsigned int i;

    delete m_pHU27Bit;

    if (m_pp7BitLanguages != NULL)
    {
        for (i = 0; i < m_n7BitLanguages; i++)
            delete m_pp7BitLanguages[i];
        delete [] m_pp7BitLanguages;
    }

    if (m_pp8BitLanguages != NULL)
    {
        for (i = 0; i < m_n8BitLanguages; i++)
            delete m_pp8BitLanguages[i];
        delete [] m_pp8BitLanguages;
    }

    if (m_ppUnicodeLanguages != NULL)
    {
        for (i = 0; i < m_nUnicodeLanguages; i++)
            delete m_ppUnicodeLanguages[i];
        delete [] m_ppUnicodeLanguages;
    }

    if (m_ppCharmaps != NULL)
    {
        for (i = 0; i < m_nCharmaps; i++)
            delete m_ppCharmaps[i];
        delete [] m_ppCharmaps;
    }

    if (m_pv)
        UnmapViewOfFile (m_pv);

    // Only close handles that were actually opened and are still owned.
    if (m_hmap != NULL)
        CloseHandle (m_hmap);

    if (m_hf != NULL && m_hf != INVALID_HANDLE_VALUE)
        CloseHandle (m_hf);
}

DWORD
LCDetect::Initialize7BitLanguage (PFileLanguageSection pLS, PLanguage *ppL)
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is left untouched on failure.
{
    // nRecordCount is lang histogram (1) + # of code page histograms

    if ( m_n7BitLangsRead >= m_n7BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Each 7-bit lang uses one score index slot per code page.
    // The range starts with the 7-bit langs, since both the 8-bit
    // and Unicode langs follow it.
    // Checked before allocating so nothing needs to be freed on failure.

    if (m_n7BitLangsRead == 0 && m_nScoreIdx != 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage7Bit pL = new Language7Bit (this, pLS->m_dwLangID,
                                         pLS->m_dwRecordCount - 1);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    pL->SetScoreIdx(m_nScoreIdx);
    m_nScoreIdx += pLS->m_dwRecordCount - 1;   // skip 1st record (Language)

    m_pp7BitLanguages[ m_n7BitLangsRead++ ] = pL;
    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::Initialize8BitLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is left untouched on failure.
{
    // nRecordCount is # of combined language / code page histograms

    if ( m_n8BitLangsRead >= m_n8BitLanguages || pLS->m_dwRecordCount < 1)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguage8Bit pL = new Language8Bit (this, pLS->m_dwLangID,
                                         pLS->m_dwRecordCount);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // The 8-bit score indices follow the 7-bit languages.
    // Each 8-bit lang uses a score index slot for each of its code pages,
    // since all the code pages are scored in the initial scoring pass.
    // The number of slots is the number of code page histograms, which is
    // the record count of this language section.

    pL->SetScoreIdx(m_nScoreIdx);
    m_nScoreIdx += pLS->m_dwRecordCount;

    m_pp8BitLanguages[ m_n8BitLangsRead++ ] = pL;
    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::InitializeUnicodeLanguage (PFileLanguageSection pLS, Language **ppL)
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
// *ppL is left untouched on failure.
{
    // nRecordCount is # of sublanguage histograms

    if ( m_nUnicodeLangsRead >= m_nUnicodeLanguages ||
         pLS->m_dwUnicodeRangeID >= m_nUnicodeLanguages )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // For Unicode, the range ID is used as the Language array index, so a
    // repeated range ID would overwrite (and leak) the earlier object.

    if (m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] != NULL)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // The first Unicode language takes its score index from the first
    // 8-bit language, so that language must already have been read.

    const bool fFirstUnicode =
        (m_nUnicodeLangsRead == 0 && GetN8BitLanguages() > 0);

    if (fFirstUnicode && m_n8BitLangsRead == 0)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PLanguageUnicode pL = new LanguageUnicode (this, pLS->m_dwLangID,
                                               pLS->m_dwRecordCount,
                                               pLS->m_dwUnicodeRangeID);
    if (pL == NULL)
        return ERROR_OUTOFMEMORY;

    // The Unicode score indices follow the 7-bit languages, and overlay the
    // 8-bit slots since they aren't used at the same time.

    if (fFirstUnicode)
        m_nScoreIdx = Get8BitLanguage(0)->GetScoreIdx();

    // Each Unicode entry uses exactly one score index.  SBCS subdetection
    // (Latin group) uses the slots for the corresponding 7-bit languages,
    // and Unicode subdetection (CJK) uses the slots already defined for the
    // Unicode sub-languages.

    pL->SetScoreIdx(m_nScoreIdx);
    m_nScoreIdx++;

    // For Unicode, the range ID is used as the Language array index.

    m_ppUnicodeLanguages[ pLS->m_dwUnicodeRangeID ] = pL;
    m_nUnicodeLangsRead++;
    *ppL = pL;

    return NO_ERROR;
}

DWORD
LCDetect::LoadLanguageSection (void *pv, int nSectionSize, PLanguage *ppL)
//
// A language section begins the definition of data for a language.
// Each language has exactly one of these records.  One or more
// histogram sections follow each language, and are always associated
// with the language of the preceding language section.
//
// Set *ppL to the Language object created from this section.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
{
    DWORD hr = NO_ERROR;

    PFileLanguageSection pLS;

    pLS = (PFileLanguageSection)&((char *)pv)[sizeof(FileSection)];

    // The section must be large enough to hold the language record that
    // is about to be read.
    if (nSectionSize < 0 ||
        (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pLS))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    switch ( pLS->m_dwDetectionType )
    {
    case DETECT_7BIT:
        hr = Initialize7BitLanguage (pLS, ppL);
        break;

    case DETECT_8BIT:
        hr = Initialize8BitLanguage (pLS, ppL);
        break;

    case DETECT_UNICODE:
        hr = InitializeUnicodeLanguage (pLS, ppL);
        break;

    default:
        // An unknown detection type would leave *ppL unset, and the
        // histograms that follow would be attached to the wrong language.
        hr = ERROR_INTERNAL_DB_CORRUPTION;
        break;
    }

    return hr;
}

DWORD
LCDetect::LoadHistogramSection (void *pv, int nSectionSize, Language *pL)
//
// Add the histogram section at *pv to the language pL, as the next
// histogram in sequence for that language.
//
// Returns the result of Language::AddHistogram(), or
// ERROR_INTERNAL_DB_CORRUPTION if there is no current language or the
// section is too small to hold a histogram header.
{
    PFileHistogramSection pHS;

    pHS = (PFileHistogramSection)&((char *)pv)[sizeof(FileSection)];

    // A histogram must follow a language section.
    if (pL == NULL)
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Check the size before subtracting, so an undersized section cannot
    // yield a negative count that turns into a huge unsigned value.
    if (nSectionSize < 0 ||
        (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pHS))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    int nBytes = nSectionSize - sizeof(FileSection) - sizeof(*pHS);

    return pL->AddHistogram ( pHS, nBytes, m_nHistogramsRead++);
}

DWORD
LCDetect::LoadMapSection (void *pv, int nSectionSize)
//
// Create a Charmap from the map section at *pv and append it to the
// charmap table.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
{
    PFileMapSection pMS;

    pMS = (PFileMapSection)&((char *)pv)[sizeof(FileSection)];

    // The section must be large enough to hold the map header.
    if (nSectionSize < 0 ||
        (size_t)nSectionSize < sizeof(FileSection) + sizeof(*pMS))
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    if (m_nMapsRead >= m_nCharmaps)
        return ERROR_INTERNAL_DB_CORRUPTION;

    PCharmap pM = new Charmap (pMS);
    if (pM == NULL)
        return ERROR_OUTOFMEMORY;

    m_ppCharmaps[ m_nMapsRead++ ] = pM;

    return NO_ERROR;
}

DWORD
LCDetect::BuildState (DWORD nFileSize)
//
// Build the detection structures from the mapped training file image at *m_pv
//
// nFileSize is the size in bytes of the image at *m_pv.
//
// Returns NO_ERROR, ERROR_OUTOFMEMORY, or ERROR_INTERNAL_DB_CORRUPTION.
// On failure the partially built state is left for the destructor to free.
{
    PLanguage pL = NULL;        // language of the most recent language section
    PFileHeader pFH;
    PFileSection pFS;
    DWORD hr = NO_ERROR;
    unsigned int i;

    // Validate header

    pFH = (PFileHeader) m_pv;

    if ( nFileSize < sizeof(*pFH) ||
         pFH->m_dwAppSig != APP_SIGNATURE ||
         pFH->m_dwVersion != APP_VERSION ||
         pFH->m_dwHdrSizeBytes >= nFileSize ||
         pFH->m_dwN7BitLanguages == 0 ||
         pFH->m_dwN8BitLanguages == 0 ||
         pFH->m_dwNUnicodeLanguages == 0 ||
         pFH->m_dwNCharmaps == 0 )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // Allocate language pointer table per header

    m_n7BitLanguages = pFH->m_dwN7BitLanguages;
    m_pp7BitLanguages = new PLanguage7Bit [m_n7BitLanguages];

    m_n8BitLanguages = pFH->m_dwN8BitLanguages;
    m_pp8BitLanguages = new PLanguage8Bit [m_n8BitLanguages];

    m_nUnicodeLanguages = pFH->m_dwNUnicodeLanguages;
    m_ppUnicodeLanguages = new PLanguageUnicode [m_nUnicodeLanguages];

    m_nCharmaps = pFH->m_dwNCharmaps;
    m_ppCharmaps = new PCharmap [m_nCharmaps];

    if ( m_pp7BitLanguages == NULL ||
         m_pp8BitLanguages == NULL ||
         m_ppUnicodeLanguages == NULL ||
         m_ppCharmaps == NULL )
    {
        return ERROR_OUTOFMEMORY;
    }

    // Clear every table.  Not all Unicode slots may be assigned, and if
    // loading fails part way through, the destructor walks all of the
    // tables and must find NULL in the unassigned entries.

    memset (m_pp7BitLanguages, 0, sizeof(PLanguage7Bit) * m_n7BitLanguages);
    memset (m_pp8BitLanguages, 0, sizeof(PLanguage8Bit) * m_n8BitLanguages);
    memset (m_ppUnicodeLanguages, 0,
            sizeof(PLanguageUnicode) * m_nUnicodeLanguages);
    memset (m_ppCharmaps, 0, sizeof(PCharmap) * m_nCharmaps);

    // Remember other header info

    m_LCDConfigureDefault.nMin7BitScore = pFH->m_dwMin7BitScore;
    m_LCDConfigureDefault.nMin8BitScore = pFH->m_dwMin8BitScore;
    m_LCDConfigureDefault.nMinUnicodeScore = pFH->m_dwMinUnicodeScore;
    m_LCDConfigureDefault.nRelativeThreshhold = pFH->m_dwRelativeThreshhold;
    m_LCDConfigureDefault.nDocPctThreshhold = pFH->m_dwDocPctThreshhold;
    m_LCDConfigureDefault.nChunkSize = pFH->m_dwChunkSize;

    // Position to first section.  Sections are tracked by file offset so
    // the bounds checks below cannot wrap around the address space.
    // Invariant: nOffset < nFileSize.

    char *pBase = (char *)m_pv;
    DWORD nOffset = pFH->m_dwHdrSizeBytes;

    // Read and process each file section

    while ( hr == NO_ERROR )
    {
        pFS = (PFileSection) &pBase[nOffset];

        // check alignment

        if (((DWORD_PTR)pFS & 3) != 0)
        {
            hr = ERROR_INTERNAL_DB_CORRUPTION;
            break;
        }

        // zero-length section marks end of data

        if (pFS->m_dwSizeBytes == 0)
            break;

        // A section includes its own header, and must end strictly before
        // the end of the file since the end-of-data marker follows it.

        if ( pFS->m_dwSizeBytes < sizeof(FileSection) ||
             pFS->m_dwSizeBytes >= nFileSize - nOffset )
        {
            hr = ERROR_INTERNAL_DB_CORRUPTION;
            break;
        }

        switch ( pFS->m_dwType )
        {
        case SECTION_TYPE_LANGUAGE:     // sets pL
            hr = LoadLanguageSection ((void*)pFS, pFS->m_dwSizeBytes, &pL);
            m_nHistogramsRead = 0;
            break;

        case SECTION_TYPE_HISTOGRAM:    // uses pL
            hr = LoadHistogramSection ((void*)pFS, pFS->m_dwSizeBytes, pL);
            break;

        case SECTION_TYPE_MAP:
            hr = LoadMapSection ((void*)pFS, pFS->m_dwSizeBytes);
            break;

        default:                        // ignore unrecognized sections
            break;
        }

        nOffset += pFS->m_dwSizeBytes;
    }

    if (hr != NO_ERROR)
        return hr;

    if ( m_nMapsRead != m_nCharmaps )
        return ERROR_INTERNAL_DB_CORRUPTION;

    // Every 7-bit and 8-bit language promised by the header must have been
    // read, because the loops below dereference each of them.

    if ( m_n7BitLangsRead != m_n7BitLanguages ||
         m_n8BitLangsRead != m_n8BitLanguages )
    {
        return ERROR_INTERNAL_DB_CORRUPTION;
    }

    // Set up quick-reference arrays used by the scoring inner loops
    // NOTE(review): the capacities of m_paHElt7Bit and m_paHElt8Bit are not
    // visible here; confirm they cover the counts accepted from the file.

    for (i = 0; i < GetN7BitLanguages(); i++)
    {
        // A 7-bit language with no histogram sections has no language table.
        if (Get7BitLanguage(i)->GetLangHistogram() == NULL)
            return ERROR_INTERNAL_DB_CORRUPTION;

        m_paHElt7Bit[i] = Get7BitLanguage(i)->GetLangHistogram()->Array();
    }

    m_nHElt8Bit = 0;
    for (i = 0; i < GetN8BitLanguages(); i++)
    {
        PLanguage8Bit pL8 = Get8BitLanguage(i);

        for (int j = 0; j < pL8->NCodePages(); j++)
        {
            // Fewer histogram sections than the language declared.
            if (pL8->GetHistogram(j) == NULL)
                return ERROR_INTERNAL_DB_CORRUPTION;

            m_paHElt8Bit[m_nHElt8Bit++] = pL8->GetHistogram(j)->Array();
        }
    }

    // Set up the Histogram used for ScoreVectorW() for scoring Unicode
    // text for 7-bit language detection.  Clone the first 7-bit language
    // histogram and replace its map with CHARMAP_U27BIT.

    m_pHU27Bit = new Histogram ( *Get7BitLanguage(0)->GetLangHistogram(),
                                 GetMap(CHARMAP_U27BIT));
    if (m_pHU27Bit == NULL)
        return ERROR_OUTOFMEMORY;

    return hr;
}

DWORD
LCDetect::LoadState (void)
//
// Overall initialization and state loading.  Open the compiled training
// file from the directory that contains this module, map it into memory,
// and assemble the in-memory detection tables from its contents.
//
// Returns NO_ERROR on success.  On failure returns a Win32 error code, or
// E_FAIL if the training file cannot be opened.
{
    DWORD hr = NO_ERROR;
    DWORD nFileSize;
    DWORD nLen;

#define MODULENAMELEN 100
    char szFilename[MODULENAMELEN+50], *p;

    // Find out if NT or Windows

    OSVERSIONINFOA OSVersionInfo;
    int nOSWinNT = 0;

    OSVersionInfo.dwOSVersionInfoSize = sizeof( OSVERSIONINFOA );
    if ( GetVersionExA( &OSVersionInfo ) )
        nOSWinNT = OSVersionInfo.dwPlatformId;

    // Open the training data file,
    // look in the directory that contains the DLL.

    if ((nLen = GetModuleFileNameA (m_hModule, szFilename, MODULENAMELEN)) == 0)
        return GetLastError();

    // A return value equal to the buffer size means the path was truncated;
    // the directory derived from it below would be wrong.
    if (nLen >= MODULENAMELEN)
        return ERROR_INSUFFICIENT_BUFFER;

    if ( (p = strrchr (szFilename, '\\')) != NULL ||
         (p = strrchr (szFilename, ':')) != NULL )
    {
        *++p = 0;
    }
    else
        *szFilename = 0;

    // Make sure the data file name fits before appending it.
    if (strlen (szFilename) + strlen (DETECTION_DATA_FILENAME)
            >= sizeof(szFilename))
    {
        return ERROR_INSUFFICIENT_BUFFER;
    }

    strcat (szFilename, DETECTION_DATA_FILENAME);

    if ((m_hf = CreateFileA (szFilename, GENERIC_READ, FILE_SHARE_READ,
                             NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL,
                             NULL)) == INVALID_HANDLE_VALUE)
    {
        return E_FAIL;
    }

    // On each failure below, the handles closed here are also cleared so
    // the destructor does not close them a second time.

    if ((nFileSize = GetFileSize (m_hf, NULL)) == 0xffffffff)
    {
        hr = GetLastError();
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }

    // Virtual-map the file

    if ( nOSWinNT == VER_PLATFORM_WIN32_NT )
        m_hmap = CreateFileMapping (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);
    else
        m_hmap = CreateFileMappingA (m_hf, NULL, PAGE_READONLY, 0, nFileSize, NULL);

    if (m_hmap == NULL)
    {
        hr = GetLastError();
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }

    if ((m_pv = MapViewOfFile (m_hmap, FILE_MAP_READ, 0, 0, 0 )) == NULL)
    {
        hr = GetLastError();
        CloseHandle (m_hmap);
        m_hmap = NULL;
        CloseHandle (m_hf);
        m_hf = NULL;
        return hr;
    }

    // Build the in-memory structures from the file

    hr = BuildState (nFileSize);

    return hr;
}

/****************************************************************/