/*
 * encdefs.h
 *
 * Encoder #define's and structure definitions.
 */

/*
 * NOTES:
 *
 * To maximise compression one can set both BREAK_LENGTH
 * and FAST_DECISION_THRESHOLD to 250, define
 * INSERT_NEAR_LONG_MATCHES, and crank up EXTRA_SIZE to
 * a larger value (don't make it too large, otherwise we
 * might overflow our ushort cumbits[]), but the improvement
 * is really marginal; e.g. 3600 bytes on winword.exe
 * (3.9 MB compressed).  It really hurts performance too.
 */


/*
 * See optenc.c
 *
 * EXTRA_SIZE is the amount of extra data we allocate in addition
 * to the window, and LOOK is the amount of data the optimal
 * parser will look ahead.  LOOK is derived from EXTRA_SIZE.
 *
 * Changing EXTRA_SIZE to 8K doesn't really do anything for
 * compression.  4K is a fairly optimal value.
 *
 * Be careful: our cumbits[] array and counters are all
 * ushort's in optenc.c, so make sure they cannot overflow
 * (e.g. when outputting all LOOK bytes as 9 bit uncompressed
 * symbols).  If necessary, change the typedef in optenc.c
 * to ulong.
 */
#define EXTRA_SIZE 4096
#define LOOK       (EXTRA_SIZE-MAX_MATCH-2)


/*
 * Number of search trees used (for storing root nodes)
 */
#define NUM_SEARCH_TREES 65536


/*
 * Chunk size required by FCI
 */
#define CHUNK_SIZE 32768


/*
 * The maximum amount of data we will allow in our output buffer before
 * calling lzx_output_callback() to get rid of it.  Since we do this
 * for every 32K of input data, the output buffer only has to be able
 * to contain 32K + some spillover, which won't be much, because we
 * output uncompressed blocks if we determine a block is going to be
 * too large.
 *
 * The spillover allowance is MAX_GROWTH.
 */
#define OUTPUT_BUFFER_SIZE (CHUNK_SIZE+MAX_GROWTH)


/*
 * Maximum allowable number of block splits per 32K of uncompressed
 * data; if increased, then MAX_GROWTH will have to be increased also.
 */
#define MAX_BLOCK_SPLITS 4


/*
 * Max growth is calculated as follows:
 *
 * TREE AND BLOCK INFO
 * ===================
 *
 * The very first time the encoder is run, it outputs a 32 bit
 * file translation size.
 *
 * 3 bits to output block type
 * 24 bits for block size in uncompressed bytes.
 *
 * Max size of a tree of n elements is 20*4 + 5*n bits
 *
 * There is a main tree of max 700 elements which is really encoded
 * as two separate trees of 256 and 444 (max).  There is also a
 * secondary length tree of 249 elements.
 *
 * That is 1360 bits, plus 2300 bits, plus 1325 bits.
 *
 * There may also be an aligned offset tree, which is 24 bits.
 *
 * Flushing output bit buffer; max 16 bits.
 *
 * Grand total: 5084 bits/block.
 *
 *
 * PARSER INFO
 * ===========
 *
 * Parser worst case scenario is with 2 MB buffer (50 position slots),
 * all matches of length 2, distributed over slots 32 and 33 (since
 * matches of length 2 further away than 128K are prohibited).  These
 * slots have 15 verbatim bits.  Maximum size per code is then
 * 2 bits to say which slot (taking into account that there will be
 * at least another code in the tree) plus 15 verbatim bits, for a
 * total of 17 bits.  Max growth on 32K of input data is therefore
 * 1/16 * 32K, or 2K bytes.
 *
 * Alternatively, if there is only one match and everything else
 * is a character, then 255 characters will be length 8, and one
 * character and the match will be length 9.  Assume the true
 * frequency of the demoted character is almost a 1 in 2^7
 * probability (it got demoted from a 2^8, but it was fairly
 * close to being 2^7).  If there are 32768/256, or 128, occurrences
 * of each character, but, say, almost 256 for the demoted character,
 * then the demoted character will expand the data by less than
 * 1 bit * 256, or 256 bits.  The match will take a little to
 * output, but max growth for "all characters" is about 256 bits.
 *
 *
 * END RESULT
 * ==========
 *
 * The maximum number of blocks which can be output is limited to
 * 4 per 32K of uncompressed data.
 *
 * Therefore, max growth is 4*5084 bits, plus 2K bytes, or 4590
 * bytes.
 *
 * The value used below, 6144, leaves headroom above that bound.
 */
#define MAX_GROWTH 6144

/*
 * Don't allow match length 2's which are further away than this
 * (128K; this is the limit assumed by the MAX_GROWTH calculation)
 */
#define MAX_LENGTH_TWO_OFFSET (128*1024)


/*
 * When we find a match which is at least this long, prematurely
 * exit the binary search.
 *
 * This avoids us inserting huge match lengths of 257 zeroes, for
 * example.  Compression will improve very *very* marginally by
 * increasing this figure, but it will seriously impact
 * performance.
 *
 * Don't make this number >= (MAX_MATCH-2); see bsearch.c.
 */
#define BREAK_LENGTH 50


/*
 * If this option is defined, the parser will insert all bytes of
 * matches with lengths >= 16 with a distance of 1; this is a bad
 * idea, since matches like that are generally zeroes, which we
 * want to avoid inserting into the search tree.
 *
 * Left undefined by default.
 */
//#define INSERT_NEAR_LONG_MATCHES


/*
 * If the optimal parser finds a match which is this long or
 * longer, it will take it automatically.  The compression
 * penalty is basically zero, and it helps performance.
 */
#define FAST_DECISION_THRESHOLD 50


/*
 * Every TREE_CREATE_INTERVAL items, recreate the trees from
 * the literals we've encountered so far, to update our cost
 * estimations.
 *
 * 4K seems pretty optimal.
 */
#define TREE_CREATE_INTERVAL 4096


/*
 * When we're forced to break in our parsing (we exceed
 * our span), don't output a match length 2 if it is
 * further away than this.
 *
 * Could make this a variable rather than a constant
 *
 * On a bad binary file, two chars = 18 bits
 * On a good text file, two chars = 12 bits
 *
 * But match length two's are very uncommon on text files.
 */
#define BREAK_MAX_LENGTH_TWO_OFFSET 2048


/*
 * When MatchPos >= MPSLOT3_CUTOFF, extra_bits[MP_SLOT(MatchPos)] >= 3
 *
 * matchpos:  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
 * extrabits: 0,0,0,0,1,1,1,1,2,2, 2, 2, 2, 2, 2, 2, 3, ...
 *
 * Used for aligned offset blocks and statistics.
 */
#define MPSLOT3_CUTOFF 16


/*
 * Number of elements in the main tree
 *
 * This expands to an expression that reads
 * context->enc_num_position_slots, so a variable named `context`
 * must be in scope wherever the macro is used.
 */
#define MAIN_TREE_ELEMENTS (NUM_CHARS+(((long) context->enc_num_position_slots) << NL_SHIFT))


/*
 * Max number of literals to hold.
 *
 * Memory required is MAX_LITERAL_ITEMS for enc_LitData[] array,
 * plus MAX_LITERAL_ITEMS/8 for enc_ItemType[] array.
 *
 * Must not exceed 64K, since that will cause our ushort
 * frequencies to overflow.
 *
 * NOTE(review): if ushort is 16 bits, a single counter tops out at
 * 65535, one less than this limit -- confirm that the code which
 * flushes a block keeps any one frequency from reaching 65536.
 */
#define MAX_LITERAL_ITEMS 65536


/*
 * Max number of distances to hold
 *
 * Memory required is MAX_DIST_ITEMS*4 for enc_DistData[] array
 *
 * MAX_DIST_ITEMS should never be greater than MAX_LITERAL_ITEMS,
 * since that just wastes space.
 *
 * However, it's extremely unlikely that one will get 65536 match
 * length 2's!  In any case, the literal and distance buffers
 * are checked independently, and a block is output if either
 * overflows.
 *
 * Bitmaps are highly redundant, though; lots of matches.
 */
#define MAX_DIST_ITEMS 32768