/* * Declarations common to compiler and detector. * * Copyright (C) 1996, 1997, Microsoft Corp. All rights reserved. * * History: 1-Feb-97 BobP Created * 5-Aug-97 BobP Added Unicode support, and persisting * Charmaps in the data file. */ #ifndef __INC_LCDCOMP_COMMON_H #define __INC_LCDCOMP_COMMON_H /****************************************************************/ // Compiled detection data file, in lcdetect.dll module directory #define DETECTION_DATA_FILENAME "mlang.dat" // Limits #define MAX7BITLANG 30 #define MAX8BITLANG 30 #define MAXUNICODELANG 30 #define MAXSUBLANG 5 // max # of sublanguages or codepages per lang #define MAXCHARMAPS 10 // max # of Charmaps, overall // Special case entries for the training script and detector. // These language IDs are never returned by the detector. #define LANGID_UNKNOWN 0x400 #define LANGID_LATIN_GROUP 0x401 #define LANGID_CJK_GROUP 0x402 // Value type of a histogram array index. // This is the output value of the SBCS/DBCS or WCHAR reduction mapping, // and is used as the index into the n-gram arrays and for the Unicode // language group IDs. // typedef unsigned char HIdx; typedef HIdx *PHIdx; #define HIDX_MAX UCHAR_MAX // keep consistent w/ HIdx // Fixed index values for mapped characters #define HIDX_IGNORE 0 #define HIDX_EXTD 1 #define HIDX_LETTER_A 2 #define HIDX_LETTER_Z (HIDX_LETTER_A + 25) // Value type of a histogram element typedef unsigned char HElt; typedef HElt *PHElt; #define HELT_MAX UCHAR_MAX // keep consistent w/ HElt #define LANG7_DIM 3 // 7-bit language uses trigrams // Fixed IDs of the Charmaps #define CHARMAP_UNICODE 0 // Built from RANGE directives #define CHARMAP_7BITLANG 1 // Built from CHARMAP 1 #define CHARMAP_8BITLANG 2 // From CHARMAP 2 #define CHARMAP_CODEPAGE 3 // From CHARMAP 3 #define CHARMAP_U27BIT 4 // Built internally for Unicode to 7-bit lang #define CHARMAP_NINTERNAL 5 // First ID for dynamic subdetection maps #define DEFAULT_7BIT_EDGESIZE 28 #define DEFAULT_8BIT_EDGESIZE 155 #define UNICODE_DEFAULT_CHAR_SCORE 50 /****************************************************************/ // Compiled file format. // These declarations directly define the raw file format. // Be careful making changes here, and be sure to change the // header version number when appropriate. #define APP_SIGNATURE 0x5444434C // "LCDT" #define APP_VERSION 2 enum SectionTypes { // for m_dwType below SECTION_TYPE_LANGUAGE = 1, // any language definition SECTION_TYPE_HISTOGRAM = 2, // any histogram SECTION_TYPE_MAP = 3 // any character mapping table }; enum DetectionType { // SBCS/DBCS detection types DETECT_NOTDEFINED = 0, DETECT_7BIT, DETECT_8BIT, DETECT_UNICODE, DETECT_NTYPES }; // FileHeader -- one-time header at start of file typedef struct FileHeader { DWORD m_dwAppSig; // 'DTCT' DWORD m_dwVersion; DWORD m_dwHdrSizeBytes; // byte offset of 1st real section DWORD m_dwN7BitLanguages; DWORD m_dwN8BitLanguages; DWORD m_dwNUnicodeLanguages; DWORD m_dwNCharmaps; DWORD m_dwMin7BitScore; DWORD m_dwMin8BitScore; DWORD m_dwMinUnicodeScore; DWORD m_dwRelativeThreshhold; DWORD m_dwDocPctThreshhold; DWORD m_dwChunkSize; } FileHeader; typedef FileHeader *PFileHeader; // FileSection -- common header that begins each file section typedef struct FileSection { DWORD m_dwSizeBytes; // section size incl. header (offset to next) DWORD m_dwType; // type of entry this section } FileSection; typedef FileSection *PFileSection; // FileLanguageSection -- 1st entry of sequence for an SBCS/DBCS language // // Followed by 1 or more histogram sections typedef struct FileLanguageSection { // preceded by struct FileSection DWORD m_dwDetectionType; DWORD m_dwLangID; DWORD m_dwUnicodeRangeID; // Unicode range mapping value for this lang DWORD m_dwRecordCount; // # of histograms following this record } FileLanguageSection; typedef FileLanguageSection *PFileLanguageSection; // FileHistogramSection -- entry for one histogram (SBCS/DBCS or WCHAR) typedef struct FileHistogramSection { // preceded by struct FileSection union { DWORD m_dwCodePage; // for 7 or 8-bit, Codepage this indicates DWORD m_dwRangeID; // for Unicode, the sublanguage group ID }; DWORD m_dwDimensionality; DWORD m_dwEdgeSize; DWORD m_dwMappingID; // ID of Charmap to use // HElt m_Elts[] } FileHistogramSection; typedef struct FileHistogramSection *PFileHistogramSection; // FileMapSection -- entry for one character map (SBCS/DBCS or WCHAR) typedef struct FileMapSection { // preceded by struct FileSection DWORD m_dwID; // ID by which hardwired code finds the table DWORD m_dwSize; // size of table (256 or 65536) DWORD m_dwNUnique; // # of unique output values // HIdx m_map[] } FileMapSection; typedef struct FileMapSection *PFileMapSection; //////////////////////////////////////////////////////////////// // LangNames - lookup table to get from English-localized names to a Win32 // primary language ID. struct LangNames { LPCSTR pcszName; unsigned short nLangID; }; LPCSTR GetLangName (int id); int GetLangID (LPCSTR pcszName); extern const struct LangNames LangNames[]; //////////////////////////////////////////////////////////////// #endif