From 0c5ba4e32c2aa3f51152749e33331d216d1a6d1d Mon Sep 17 00:00:00 2001 From: Dennis Ranke Date: Sat, 25 May 2024 22:02:32 +0200 Subject: [PATCH] add highlevel description of compressed format in unpack.c --- c_unpacker/unpack.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/c_unpacker/unpack.c b/c_unpacker/unpack.c index d317e02..1ec6dea 100644 --- a/c_unpacker/unpack.c +++ b/c_unpacker/unpack.c @@ -1,3 +1,56 @@ +/* + A simple C unpacker for upkr compressed data. + + This implements two variants, selected by the UPKR_BITSTREAM define: + - normal: faster and smaller on modern hardware as whole bytes are shifted into + the rANS state at a time, but requires 20 bits for the state + - bitstream: only single bits are shifted into the rANS state at a time + which allows the state to always fit in 16 bits which is a boon + on very old CPUs. + The encoder and decoder need to be configured to use the same variant. + + upkr compressed data is a rANS byte-/bit-stream encoding a series of literal + byte values and back-references as probability encoded bits. + + upkr_decode_bit reads one bit from the rANS stream, taking a probability context + as parameter. The probability context is a byte estimating the probability of + a bit encoded in this context being set. It is updated by upkr_decode_bit + after each decoded bit to reflect the observed past frequencies of on/off bits. + + There are a number of different contexts used in the compressed format. The order in the + upkr_probs array is arbitrary; the only requirement for the unpacker is that all bits + that shared the same context while encoding also share the same context while decoding.
+ The contexts are: + - is match + - has offset + - literal bit N (0-7) with already decoded highest bits of literal == M (255 total) + - offset bit N (one less than max offset bits) + - has offset bit N (max offset bits) + - length bit N (one less than max length bits) + - has length bit N (max length bits) + + Literal bytes are encoded from highest to lowest bit, with the bit position and + the already decoded bits as context. + + Offset and Length are encoded in an interlaced variant of Elias gamma coding. They + are encoded from lowest to highest bits. For each bit, first one bit is read in the + "has offset/length bit N" context. If this is set, offset/length bit N is read in its context + and the decoding continues with the next bit. If the "has bit N" bit is read as false, a + fixed 1 bit is added as the top bit at this position. + + The high-level decode loop then looks like this: + loop: + if read_bit(IS_MATCH): + if prev_was_match || read_bit(HAS_OFFSET): + offset = read_length_or_offset(OFFSET) - 1 + if offset == 0: + break + length = read_length_or_offset(LENGTH) + copy_bytes_from_offset(length, offset) + else: + read_and_push(literal) +*/ + typedef unsigned char u8; typedef unsigned short u16; typedef unsigned long u32; @@ -14,6 +67,7 @@ u32 upkr_state; int upkr_decode_bit(int context_index) { #ifdef UPKR_BITSTREAM + // shift in single bits until rANS state is >= 32768 while(upkr_state < 32768) { if(upkr_bits_left == 0) { upkr_current_byte = *upkr_data_ptr++; @@ -24,6 +78,7 @@ int upkr_decode_bit(int context_index) { --upkr_bits_left; } #else + // shift in a full byte until rANS state is >= 4096 while(upkr_state < 4096) { upkr_state = (upkr_state << 8) | *upkr_data_ptr++; } @@ -32,6 +87,8 @@ int upkr_decode_bit(int context_index) { int prob = upkr_probs[context_index]; int bit = (upkr_state & 255) < prob ?
1 : 0; + // rANS state and context probability update + // for the latter, add 1/16th (rounded) of the difference from either 0 or 256 if(bit) { upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255); prob += (256 - prob + 8) >> 4; @@ -60,6 +117,7 @@ void* upkr_unpack(void* destination, void* compressed_data) { #ifdef UPKR_BITSTREAM upkr_bits_left = 0; #endif + // all contexts are initialized to 128 = equal probability of 0 and 1 for(int i = 0; i < sizeof(upkr_probs); ++i) upkr_probs[i] = 128; @@ -68,10 +126,13 @@ void* upkr_unpack(void* destination, void* compressed_data) { int prev_was_match = 0; int offset = 0; for(;;) { + // is match if(upkr_decode_bit(0)) { + // has offset if(prev_was_match || upkr_decode_bit(256)) { offset = upkr_decode_length(257) - 1; if(offset == 0) { + // a 0 offset signals the end of the compressed data break; } } @@ -82,6 +143,9 @@ void* upkr_unpack(void* destination, void* compressed_data) { } prev_was_match = 1; } else { + // byte contains the previously read bits and indicates the number of + // read bits by the set top bit. Therefore it can be directly used as the + // context index. The set top bit ends up at bit position 8 and is not stored. int byte = 1; while(byte < 256) { int bit = upkr_decode_bit(byte);