/*************************************************************************
*                                                                        *
*  ENCODE.C                                                              *
*                                                                        *
*  Copyright (C) Microsoft Corporation 1990-1994                         *
*  All Rights reserved.                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Module Intent                                                         *
*      General encoding & decoding techniques                            *
*                                                                        *
**************************************************************************
*                                                                        *
*  Current Owner: BinhN                                                  *
*                                                                        *
**************************************************************************
*                                                                        *
*  Released by Development:     (date)                                   *
*                                                                        *
*************************************************************************/

/* NOTE(review): the first three #include directives below have no header
 * name in this copy of the file; the names appear to have been lost and
 * must be restored from the original sources before this will compile.
 */
#include
#include
#include
#include "common.h"
#include "index.h"

/* Structure to access bits and bytes of a DWORD */

/* Two consecutive 16-bit words, in memory order (w1 first, w2 second). */
typedef struct {
    unsigned short w1;
    unsigned short w2;
} TWOWORD;

/* Four consecutive bytes, in memory order (b1 first ... b4 last). */
typedef struct {
    unsigned char b1;
    unsigned char b2;
    unsigned char b3;
    unsigned char b4;
} FOURBYTE;

/* Overlay of one long value with its word view and its byte view. */
typedef union {
    unsigned long dwVal;
    TWOWORD dw;
    FOURBYTE fb;
} WORDLONG;

/* Accessors that reinterpret the storage of the variable p through the
 * WORDLONG overlay.  Each expansion is an lvalue, so it can be read or
 * assigned.  HI_WORD selects the second word in memory (w2) and BYTE1
 * selects the fourth byte in memory (b4); those are the high-order word
 * and the most significant byte only on a little-endian layout.
 * NOTE(review): the argument is used as "&p" without parentheses, so pass
 * a plain variable (or other simple lvalue) only.
 */
#define HI_WORD(p) (((WORDLONG FAR *)&p)->dw.w2)
#define LO_WORD(p) (((WORDLONG FAR *)&p)->dw.w1)
#define BYTE1(p) (((WORDLONG FAR *)&p)->fb.b4)
#define BYTE2(p) (((WORDLONG FAR *)&p)->fb.b3)
#define BYTE3(p) (((WORDLONG FAR *)&p)->fb.b2)
#define BYTE4(p) (((WORDLONG FAR *)&p)->fb.b1)

/*************************************************************************
 *
 *                    INTERNAL PRIVATE FUNCTIONS
 *
 *  All of them should be declared near
 *
 *************************************************************************/
PRIVATE LPB PASCAL NEAR LongValPack (LPB, DWORD);
PRIVATE LPB PASCAL NEAR LongValUnpack (LPB, LPDW);

/*************************************************************************
 *
 *                    INTERNAL PUBLIC FUNCTIONS
 *
 *  All of them should be declared far, unless we know they belong to
 *  the same segment.
They should be included in some include files * *************************************************************************/ PUBLIC CB PASCAL NEAR CbBytePack(LPB, DWORD); PUBLIC CB PASCAL NEAR OccurrencePack (LPB, LPOCC, WORD); PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB, LPB, WORD); PUBLIC void PASCAL NEAR OccurrenceUnpack(LPOCC, LPB, OCCF); PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD); /************************************************************************* * * @doc INTERNAL INDEX * * @func LPB PASCAL NEAR | LongValPack | * The function packs and writes out an encoded 4-bytes value. * The encoding scheme is as followed: * - High 3 bit: used to tell how many bytes are to follow * the current byte * - The packed value * Ex: * 0x1 will be output as 0x1 * 0x1F 0x1F * 0x2F 0x202F (0010 0000 0010 1111) * * @parm LPB | lpbOut | * Pointer to the output buffer * * @parm DWORD | dwVal | * 4-bytes value to be packed and emitted * * @rdesc * The buffer pointer is advanced and returned. * * @comm No validity check is done for the the output buffer *************************************************************************/ PRIVATE LPB PASCAL NEAR LongValPack (LPB lpbOut, DWORD dwVal) { if (HI_WORD(dwVal) > 0x1fff) { *lpbOut++ = 4 << 5; // 4 bytes follow this byte goto Copy4Bytes; } if (HI_WORD(dwVal) > 0x001f) { BYTE1(dwVal) |= 3 << 5; /* 3 bytes follows this byte */ goto Copy4Bytes; } if (HI_WORD(dwVal) > 0 || LO_WORD(dwVal) > 0x1fff) { BYTE2(dwVal) |= 2 << 5; /* 2 bytes follows this byte */ goto Copy3Bytes; } if (LO_WORD(dwVal) > 0x001f) { BYTE3(dwVal) |= 1 << 5; /* 1 bytes follows this byte */ goto Copy2Bytes; } else goto Copy1Bytes; Copy4Bytes: *lpbOut ++ = BYTE1(dwVal); Copy3Bytes: *lpbOut ++ = BYTE2(dwVal); Copy2Bytes: *lpbOut ++ = BYTE3(dwVal); Copy1Bytes: *lpbOut ++ = BYTE4(dwVal); return lpbOut; } /************************************************************************* * * @doc INTERNAL INDEX * * @func LPB PASCAL NEAR | LongValUnpack | * This is the reverse on 
LongValPack. Given a buffer containing * a packed 4-byte value, the function will unpack and return the * value. The pointer to the input buffer is updated and returned * * @parm LPB | lpbIn | * Input buffer containing the packed value * * @parm LPDW | lpdw | * Place to store the unpacked value * * @rdesc The new updated input buffer pointer * * @comm No validity check for lpbIn is done because of speed * *************************************************************************/ PRIVATE LPB PASCAL NEAR LongValUnpack (LPB lpbIn, LPDW lpdw) { DWORD dwVal = 0; register int cbByteCopied; /* Get the number of bytes to be copied */ cbByteCopied = *lpbIn >> 5; *lpbIn &= 0x1f; switch (cbByteCopied) { case 4: lpbIn++; case 3: BYTE1(dwVal) = *lpbIn++; case 2: BYTE2(dwVal) = *lpbIn++; case 1: BYTE3(dwVal) = *lpbIn++; case 0: BYTE4(dwVal) = *lpbIn++; } *lpdw = dwVal; return lpbIn; } /************************************************************************* * * @doc INTERNAL INDEX * * @func CB PASCAL NEAR | OccurrencePack | * Packs and emits all occurrence's fields * * @parm LPB | lpbOut | * Place to store the packed occurrence's fields * * @parm LPOCC | lpOccIn | * Pointer to occurrence structure * * @parm WORD | occf | * Occurrence flags telling which fields are present * * @rdesc The number of bytes written * *************************************************************************/ PUBLIC CB PASCAL NEAR OccurrencePack (register LPB lpbOut, LPOCC lpOccIn, register WORD occf) { DWORD dwVal; LPB lpbSaved = lpbOut; while (occf) { if (occf & OCCF_FIELDID) { dwVal = lpOccIn->dwFieldId; occf &= ~OCCF_FIELDID; } else if (occf & OCCF_TOPICID) { dwVal = lpOccIn->dwTopicID; occf &= ~OCCF_TOPICID; } else if (occf & OCCF_COUNT) { dwVal = lpOccIn->dwCount; occf &= ~OCCF_COUNT; } else if (occf & OCCF_OFFSET) { dwVal = lpOccIn->dwOffset; occf &= ~OCCF_OFFSET; } else if (occf & OCCF_LENGTH) { dwVal = lpOccIn->wWordLen; occf &= ~OCCF_LENGTH; } else { break; } if (HI_WORD(dwVal) > 0x1fff) { 
*lpbOut++ = 4 << 5; // 4 bytes follow this byte goto Copy4Bytes; } if (HI_WORD(dwVal) > 0x001f) { BYTE1(dwVal) |= 3 << 5; /* 3 bytes follows this byte */ goto Copy4Bytes; } if (HI_WORD(dwVal) > 0 || LO_WORD(dwVal) > 0x1fff) { BYTE2(dwVal) |= 2 << 5; /* 2 bytes follows this byte */ goto Copy3Bytes; } if (LO_WORD(dwVal) > 0x001f) { BYTE3(dwVal) |= 1 << 5; /* 1 bytes follows this byte */ goto Copy2Bytes; } else goto Copy1Bytes; #if 1 Copy4Bytes: *lpbOut ++ = BYTE1(dwVal); Copy3Bytes: *lpbOut ++ = BYTE2(dwVal); Copy2Bytes: *lpbOut ++ = BYTE3(dwVal); Copy1Bytes: *lpbOut ++ = BYTE4(dwVal); } return (CB)(lpbOut - lpbSaved); #else Copy4Bytes: *(LPDW)lpbOut = dwVal; lpbOut += 4; continue; Copy3Bytes: *lpbOut ++ = BYTE2(dwVal); Copy2Bytes: *(LPW)lpbOut = LO_WORD(dwVal); lpbOut += 2; continue; Copy1Bytes: *lpbOut ++ = BYTE4(dwVal); continue; } #endif return (CB)(lpbOut - lpbSaved); } /************************************************************************* * @doc INTERNAL INDEX * * @func CB PASCAL NEAR | CbCopySortPackedOcc | * Copy the packed occurrence structure * * @parm LPB | lpbDst | * Pointer to destination buffer * @parm LPB | lpbSrc | * Pointer to source buffer * @parm WORD | uiNumOcc | * Number of occurrence fields (>= 1) * @rdesc * return the number of bytes copied *************************************************************************/ PUBLIC CB PASCAL NEAR CbCopySortPackedOcc (LPB lpbDst, LPB lpbSrc, WORD uiNumOcc) { register int cbByteCopied; LPB lpbSaved = lpbDst; do { for (cbByteCopied = *lpbSrc >> 5; cbByteCopied >= 0; cbByteCopied--) *lpbDst++ = *lpbSrc++; uiNumOcc--; } while (uiNumOcc > 0); return (CB)(lpbDst - lpbSaved); } PUBLIC void PASCAL NEAR OccurrenceUnpack(LPOCC lpOccOut, register LPB lpbIn, register OCCF occf) { DWORD dwVal = 0; LPDW lpdw; register int cbByteCopied; while (occf) { DWORD dwTmp; if (occf & OCCF_FIELDID) { lpdw = &lpOccOut->dwFieldId; occf &= ~OCCF_FIELDID; } else if (occf & OCCF_TOPICID) { lpdw = &lpOccOut->dwTopicID; occf &= 
~OCCF_TOPICID; } else if (occf & OCCF_COUNT) { lpdw = &lpOccOut->dwCount; occf &= ~OCCF_COUNT; } else if (occf & OCCF_OFFSET) { lpdw = &lpOccOut->dwOffset; occf &= ~OCCF_OFFSET; } else if (occf & OCCF_LENGTH) { dwTmp = lpOccOut->wWordLen; lpdw = &dwTmp; occf &= ~OCCF_LENGTH; } else { break; } dwVal = 0; /* Get the number of bytes to be copied */ cbByteCopied = *lpbIn >> 5; *lpbIn &= 0x1f; #if 1 switch (cbByteCopied) { case 4: lpbIn++; case 3: BYTE1(dwVal) = *lpbIn++; case 2: BYTE2(dwVal) = *lpbIn++; case 1: BYTE3(dwVal) = *lpbIn++; case 0: BYTE4(dwVal) = *lpbIn++; } #else switch (cbByteCopied) { case 4: lpbIn++; case 3: dwVal = *(LPDW)lpbIn; lpbIn += 4; break; case 2: BYTE1(dwVal) = *lpbIn++; case 1: LO_WORD(dwVal) = *(LPW)lpbIn; lpbIn += 2; break; case 0: BYTE4(dwVal) = *lpbIn++; } #endif *lpdw = dwVal; } } PUBLIC CBIT PASCAL NEAR CbitBitsDw (DWORD dwVal) { register WORD wVal; //Value to be scanned register WORD cBitCount; // Number of bit if (HI_WORD(dwVal)) { /* We will look at the hi-word only, but add 16 to the result */ cBitCount = 16; wVal = HI_WORD(dwVal); } else { /* We look at the lo-word only */ cBitCount = 0; wVal = LO_WORD(dwVal); } /* Now do the shift */ while (wVal) { cBitCount++; wVal >>= 1; } return cBitCount; } // - - - - - - - - - // This function figures out how best to encode a set of values. It // uses an array of statistics about the data in order to make this // determination. The array conveys to the algorithm the number of // values that require a particular number of bits to represent. For // the "fixed" and "bell" schemes, this is all the information that's // needed in order to make a judgment as to which scheme is best. // // The inner workings of this are bitching hard to understand, so you // should probably read any occurence compression external documentation // you can find before you try to tackle this function. 
//
//  -   -   -   -   -   -   -   -   -
//
//  Information about the "bitstream" scheme:
//
//  The number of bits necessary to encode the values using the
//  "bitstream" scheme is spoon-fed into the algorithm via a parameter,
//  because it's not possible to derive this value using the statistics
//  array.
//
//  -   -   -   -   -   -   -   -   -
//
//  Information about the "bell" scheme:
//
//  Here's a bell grid, which I hope will provide some documentation as
//  to the characteristics of the bell scheme.  It is possible to figure
//  out how many bits a given sample will take to encode, given a
//  particular bell "center" value, but the algorithm is complicated and
//  non-intuitive.
//
//                              Bell Center
//
//                  0     1     2     3     4     5    ...  31
//              +--------------------------------------- ... ------
//            0 |   1(c)  2     3     4     5     6    ...  32
//            1 |   2(c)  2(c)  3     4     5     6    ...  32
//            2 |   4     3(c)  3(c)  4     5     6    ...  32
//  Size in   3 |   6     5     4(c)  4(c)  5     6    ...  32
//  bits of   4 |   8     7     6     5(c)  5(c)  6    ...  32
//  value to  5 |  10     9     8     7     6(c)  6(c) ..   32
//  encode    6 |  12    11    10     9     8     7(c) ..   32
//            7 |  14    13    12    11    10     9    ...  32
//            8 |  16    15    14    13    12    11    ...  32
//            9 |  18    17    16    15    14    13    ...  32
//           .. .  ..    ..    ..    ..    ..    ..    ...  ..
//           32 |  64    63    62    61    60    59    ...  33(c)
//
//  The numbers in this table represent the number of bits necessary to
//  encode a given value, using a given bell center.  The "(c)" represents
//  the point of minimum waste.  There are two of these for each "center".
//  The waste at (c) is guaranteed to be exactly one bit.
//
//  It would be possible for the bell center to be equal to 32, but this
//  would mess up my life since I only store center values in 5 bits, and
//  32 would take 6 bits.  Upon examination, though, it can be shown that
//  there are no cases where a ceiling value of 32 is any better than a
//  ceiling value of 31, so I can rule out 32.
//
//  -   -   -   -   -   -   -   -   -
//
//  Information about the "fixed" scheme:
//
//  The "center" as calculated by this algorithm is the number of bits
//  necessary to represent the largest value in the sample.
// // Since this value can be 32, but I'm only using 5 bits to store center // values, I subtract one from this value, which I will add back in // during decompression. This means that I can't store zero, size // 0 - 1 = -1, which is 31 if we've got a 5-bit quantity. So I don't // allow the fixed scheme to use zero as a center. If the best value // comes up as zero, I make it one instead. // - - - - - - - - - PUBLIC void NEAR PASCAL VGetBestScheme( LPCKEY lpckey, // Output compression key. LRGDW lrgdwStats, // Each dword (N) in this array at // a given array index (M) represents // a count of the number of values in // the sample that require M bits to // store. If (lrgdwStats[6] == 17), // there were 17 values in the sample // that required 6 bits to store. DWORD lcbitRawBitstreamBits, // This is lcbitBITSTREAM_ILLEGAL if // bitstream packing is not allowed, // else it is equal to the number of // bits necessary to encode all of // the values using bitstream // encoding. int fNoFixedScheme) // Set if we don't want fixed scheme { register short iStats; // Scratch index. DWORD argdwBellBits[ // This is used to compute bell cbitCENTER_MAX]; // values. Its sole purpose is to // save a bunch of multiplies that // I'd have to do if it didn't exist. DWORD lcbitBell; // Total number of bits used if I // adopt the bell scheme to encode // this sample. DWORD lcbitFixed; // Total number of bits used if I // adopt the fixed scheme to encode // this sample. DWORD lcbitBitstream; // Total number of bits used if I // adopt the scheme scheme to encode // this sample. DWORD lcTotalEncodedValues; // The total number of values that I // have to encode. short idwCeiling; // The size of "lrgdwStats" if you // trim off all of the high-end zero // elements. short idwBellCeiling; // This is "idwCeiling" unless the // value of "idwCeiling" is // cbitCENTER_MAX, in which case // it's "idwCeiling - 1". 
CBIT cbitBellCenter; // This will be the best "center" // value found for the bell scheme. CBIT cbitFixedCenter; // This will be the "center" value for // the "fixed" scheme. // // Determine the value of "idwCeiling", which is used to trim off // consecutive zero values at the top end of the statistics // array. // for (iStats = cbitCENTER_MAX - 1; iStats >= 0; iStats--) if (lrgdwStats[iStats]) break; idwCeiling = iStats + 1; // // Initialize variables used in bell computation. // for (iStats = 0; iStats < idwCeiling; iStats++) argdwBellBits[iStats] = lrgdwStats[iStats] * (DWORD)(iStats * 2 + 1); lcbitBell = (DWORD)-1L; cbitBellCenter = 0; lcTotalEncodedValues = 0L; idwBellCeiling = (idwCeiling == cbitCENTER_MAX) ? cbitCENTER_MAX - 1 : idwCeiling; // // Each pass through the following loop generates a value, // "lcbitBellTotal", which is equal to the number of bits // necessary to encode all of the values, using a "center" value // equal to the loop index ("iStats"). This value is checked // against "lcbitBell", if it's less it becomes the new // "lcbitBell", and the center is stored in "cbitBellCenter". // for (iStats = 0; iStats < idwBellCeiling; iStats++) { DWORD lcbitBellTotal; register short i; lcTotalEncodedValues += lrgdwStats[iStats]; lcbitBellTotal = 0L; for (i = 0; i <= iStats; i++) { // Adjust values below center. lcbitBellTotal += argdwBellBits[i]; argdwBellBits[i] += lrgdwStats[i]; } for (; i < idwCeiling; i++) { // Adjust values above center. argdwBellBits[i] -= lrgdwStats[i]; lcbitBellTotal += argdwBellBits[i]; } if (lcbitBellTotal < lcbitBell) { lcbitBell = lcbitBellTotal; cbitBellCenter = iStats; } } // // As of this point the best bell center is stored in // "cbitBellCenter", although given the obscurity of the logic in // the above loop you might have to take my word for it. The // number of bits necessary to bell encode the values using // "cbitBellCenter" as the center is in "lcbitBell". 
// // This next bit of code figures out which scheme to use, and // sets up the returned compression key ("lpckey") with this // result. // lcbitBell += cbitWASTED_BELL; cbitFixedCenter = (idwCeiling <= 1) ? 1 : idwCeiling - 1; lcbitFixed = (DWORD)cbitFixedCenter * // Get total "fixed" bits. lcTotalEncodedValues + cbitWASTED_FIXED; lcbitBitstream = (lcbitRawBitstreamBits == lcbitBITSTREAM_ILLEGAL) ? (DWORD)-1L : // Get total "bitstream" bits. lcbitRawBitstreamBits + cbitWASTED_BITSTREAM; if ((lcbitFixed <= lcbitBell && fNoFixedScheme == FALSE) && (lcbitFixed <= lcbitBitstream)) { lpckey->cschScheme = CSCH_FIXED; // Best scheme was lpckey->ucCenter = // "fixed". Note (BYTE)(cbitFixedCenter - 1); // the "- 1". } else if (lcbitBitstream <= lcbitBell) lpckey->cschScheme = CSCH_NONE; // Best scheme was // "bitstream". else { lpckey->cschScheme = CSCH_BELL; // Best scheme was lpckey->ucCenter = // "bell". (BYTE)cbitBellCenter; } }