windows-nt/Source/XPSP1/NT/enduser/stuff/itss/lzx/encoder/encdefs.h

/*
 * encdefs.h
 *
 * Encoder #define's and structure definitions.
 */

/*
 * NOTES:
 *
 * To maximise compression one can set both BREAK_LENGTH
 * and FAST_DECISION_THRESHOLD to 250, define
 * INSERT_NEAR_LONG_MATCHES, and crank up EXTRA_SIZE to
 * a larger value (don't get too large, otherwise we
 * might overflow our ushort cumbits[]), but the improvement
 * is really  marginal; e.g. 3600 bytes on winword.exe
 * (3.9 MB compressed).  It really hurts performance too.
 */


/*
 * See optenc.c
 *
 * EXTRA_SIZE is the amount of extra data we allocate in addition
 * to the window, and LOOK is the amount of data the optimal
 * parser will look ahead.  LOOK is dependent on EXTRA_SIZE.
 *
 * Changing EXTRA_SIZE to 8K doesn't really do anything for
 * compression.  4K is a fairly optimal value.
 *
 * Be careful; our cumbits[] array and counters are all
 * ushort's in optenc.c, so make sure they don't overflow
 * (e.g. outputting all LOOK bytes as 9 bit uncompressed
 * symbols, say).  If necessary, change the typedef in optenc.c
 * to ulong.
 */
#define EXTRA_SIZE   4096
#define LOOK		(EXTRA_SIZE-MAX_MATCH-2)


/*
 * Number of search trees used (for storing root nodes)
 */
#define NUM_SEARCH_TREES 65536


/*
 * Chunk size required by FCI
 */
#define CHUNK_SIZE 32768


/*
 * The maximum amount of data we will allow in our output buffer before
 * calling lzx_output_callback() to get rid of it.  Since we do this
 * for every 32K of input data, the output buffer only has to be able
 * to contain 32K + some spillover, which won't be much, because we
 * output uncompressed blocks if we determine a block is going to be
 * too large.
 */
#define OUTPUT_BUFFER_SIZE (CHUNK_SIZE+MAX_GROWTH)


/*
 * Maximum allowable number of block splits per 32K of uncompressed
 * data; if increased, then MAX_GROWTH will have to be increased also.
 */
#define MAX_BLOCK_SPLITS    4


/*
 * Max growth is calculated as follows:
 *
 * TREE AND BLOCK INFO
 * ===================
 *
 * The very first time the encoder is run, it outputs a 32 bit
 * file translation size.
 *
 * 3 bits to output block type
 * 24 bits for block size in uncompressed bytes.
 *
 * Max size of a tree of n elements is 20*4 + 5*n bits
 *
 * There is a main tree of max 700 elements which is really encoded
 * as two separate trees of 256 and 444(max).  There is also a
 * secondary length tree of 249 elements.
 *
 * That is 1360 bits, plus 2300 bits, plus 1325 bits.
 *
 * There may also be an aligned offset tree, which is 24 bits.
 *
 * Flushing output bit buffer; max 16 bits.
 *
 * Grand total: 5084 bits/block.
 *
 *
 * PARSER INFO
 * ===========
 *
 * Parser worst case scenario is with 2 MB buffer (50 position slots),
 * all matches of length 2, distributed over slots 32 and 33 (since
 * matches of length 2 further away than 128K are prohibited).  These
 * slots have 15 verbatim bits.  Maximum size per code is then
 * 2 bits to say which slot (taking into account that there will be
 * at least another code in the tree) plus 15 verbatim bits, for a
 * total of 17 bits.  Max growth on 32K of input data is therefore
 * 1/16 * 32K, or 2K bytes.
 *
 * Alternatively, if there is only one match and everything else
 * is a character, then 255 characters will be length 8, and one
 * character and the match will be length 9.  Assume the true
 * frequency of the demoted character is almost a 1 in 2^7
 * probability (it got remoted from a 2^8, but it was fairly
 * close to being 2^7).  If there are 32768/256, or 128, occurrences
 * of each character, but, say, almost 256 for the demoted character,
 * then the demoted character will expand the data by less than
 * 1 bit * 256, or 256 bits.  The match will take a little to
 * output, but max growth for "all characters" is about 256 bits.
 *
 *
 * END RESULT
 * ==========
 *
 * The maximum number of blocks which can be output is limited to
 * 4 per 32K of uncompressed data.
 *
 * Therefore, max growth is 4*5084 bits, plus 2K bytes, or 4590
 * bytes.
 */
#define     MAX_GROWTH          6144

/*
 * Don't allow match length 2's which are further away than this
 * (see above)
 */
#define     MAX_LENGTH_TWO_OFFSET (128*1024)


/*
 * When we find a match which is at least this long, prematurely
 * exit the binary search.
 *
 * This avoids us inserting huge match lengths of 257 zeroes, for
 * example.  Compression will improve very *very* marginally by
 * increasing this figure, but it will seriously impact
 * performance.
 *
 * Don't make this number >= (MAX_MATCH-2); see bsearch.c.
 */
#define BREAK_LENGTH 50


/*
 * If this option is defined, the parser will insert all bytes of
 * matches with lengths >= 16 with a distance of 1; this is a bad
 * idea, since matches like that are generally zeroes, which we
 * want to avoid inserting into the search tree.
 */
//#define INSERT_NEAR_LONG_MATCHES


/*
 * If the optimal parser finds a match which is this long or
 * longer, it will take it automatically.  The compression
 * penalty is basically zero, and it helps performance.
 */
#define FAST_DECISION_THRESHOLD 50


/*
 * Every TREE_CREATE_INTERVAL items, recreate the trees from
 * the literals we've encountered so far, to update our cost
 * estimations.
 *
 * 4K seems pretty optimal.
 */
#define TREE_CREATE_INTERVAL 4096


/*
 * When we're forced to break in our parsing (we exceed
 * our span), don't output a match length 2 if it is
 * further away than this.
 *
 * Could make this a variable rather than a constant
 *
 * On a bad binary file, two chars    = 18 bits
 * On a good text file, two chars     = 12 bits
 *
 * But match length two's are very uncommon on text files.
 */
#define BREAK_MAX_LENGTH_TWO_OFFSET 2048


/*
 * When MatchPos >= MPSLOT3_CUTOFF, extra_bits[MP_SLOT(MatchPos)] >= 3
 *
 * matchpos:  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
 * extrabits: 0,0,0,0,1,1,1,1,2,2, 2, 2, 2, 2, 2, 2, 3, ...
 *
 * Used for aligned offset blocks and statistics.
 */
#define MPSLOT3_CUTOFF 16


/*
 * Number of elements in the main tree
 */
#define MAIN_TREE_ELEMENTS			(NUM_CHARS+(((long) context->enc_num_position_slots) << NL_SHIFT))


/*
 * Max number of literals to hold.
 *
 * Memory required is MAX_LITERAL_ITEMS for enc_LitData[] array,
 * plus MAX_LITERAL_ITEMS/8 for enc_ItemType[] array.
 *
 * Must not exceed 64K, since that will cause our ushort
 * frequencies to overflow.
 */
#define MAX_LITERAL_ITEMS  65536


/*
 * Max number of distances to hold
 *
 * Memory required is MAX_DIST_ITEMS*4 for enc_DistData[] array
 *
 * MAX_DIST_ITEMS should never be greater than MAX_LITERAL_ITEMS,
 * since that just wastes space.
 *
 * However, it's extremely unlikely that one will get 65536 match
 * length 2's!  In any case, the literal and distance buffers
 * are checked independently, and a block is output if either
 * overflows.
 *
 * Bitmaps are highly redundant, though; lots of matches.
 */
#define MAX_DIST_ITEMS     32768