add highlevel description of compressed format in unpack.c

2026-01-20 03:26:43 +01:00 · 2024-05-25 22:02:32 +02:00
parent f33bcb2396
commit 0c5ba4e32c
1 changed files with 64 additions and 0 deletions
--- a/c_unpacker/unpack.c
+++ b/c_unpacker/unpack.c
@@ -1,3 +1,56 @@
+/*
+    A simple C unpacker for upkr compressed data.
+
+    This implements two variants, selected by the UPKR_BITSTREAM define:
+    - normal: faster and smaller on modern hardware as whole bytes are shifted into
+              the rANS state at a time, but requires 20bits for the state
+    - bitstream: only single bits are shifted into the rANS state at a time
+                 which allows the state to always fit in 16bits which is a boon
+                 on very old CPUs.
+    The encoder and decoder need to be configured to use the same varianet.
+
+    upkr compressed data is a rANS byte-/bit-stream encoding a series of literal
+    byte values and back-references as probability encoded bits.
+
+    upkr_decode_bit reads one bit from the rANS stream, taking a probability context
+    as parameter. The probability context is a byte estimating the probability of
+    a bit encoded in this context being set. It is updated by upkr_decode_bit
+    after each decoded bit to reflect the observed past frequencies of on/off bits.
+
+    There are a number of different contexts used in the compressed format. The order in the
+    upkr_probs array is arbitrary, the only requirement for the unpacker is that all bits
+    that shared the same context while encoding also share the same context while decoding.
+    The contexts are:
+    - is match
+    - has offset
+    - literal bit N (0-7) with already decoded highest bits of literal == M (255 total)
+    - offset bit N (one less than max offset bits)
+    - has offset bit N (max offset bits)
+    - length bit N (one less then max length bits)
+    - has length bit N (max length bits)
+
+    Literal bytes are encoded from highest to lowest bit, with the bit position and
+    the already decoded bits as context.
+
+    Offst and Length are encoded in an interlaced variant of elias gamma coding. They
+    are encoded from lowest to highest bits. For each bit, first one bit is read in the
+    "has offset/length bit N)". If this is set, offset/length bit N is read in it's context
+    and the decoding continues with the next bit. If the "has bit N" is read as false, a
+    fixed 1 bit is added as the top bit at this position.
+
+    The highlevel decode loop then looks like this:
+    loop:
+        if read_bit(IS_MATCH):
+            if prev_was_match || read_bit(HAS_OFFSET):
+                offset = read_length_or_offset(OFFSET) - 1
+                if offset == 0:
+                    break
+            length = read_length_or_offset(LENGTH)
+            copy_bytes_from_offset(length, offset)
+        else:
+            read_and_push(literal)
+*/
+
 typedef unsigned char u8;
 typedef unsigned short u16;
 typedef unsigned long u32;
@@ -14,6 +67,7 @@ u32 upkr_state;

 int upkr_decode_bit(int context_index) {
 #ifdef UPKR_BITSTREAM
+    // shift in single bits until rANS state is >= 32768
    while(upkr_state < 32768) {
        if(upkr_bits_left == 0) {
            upkr_current_byte = *upkr_data_ptr++;
@@ -24,6 +78,7 @@ int upkr_decode_bit(int context_index) {
        --upkr_bits_left;
    }
 #else
+    // shift in a full byte until rANS state is >= 4096
    while(upkr_state < 4096) {
        upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
    }
@@ -32,6 +87,8 @@ int upkr_decode_bit(int context_index) {
    int prob = upkr_probs[context_index];
    int bit = (upkr_state & 255) < prob ? 1 : 0;
    
+    // rANS state and context probability update
+    // for the later, add 1/16th (rounded) of difference from either 0 or 256
    if(bit) {
        upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255);
        prob += (256 - prob + 8) >> 4;
@@ -60,6 +117,7 @@ void* upkr_unpack(void* destination, void* compressed_data) {
 #ifdef UPKR_BITSTREAM
    upkr_bits_left = 0;
 #endif
+    // all contexts are initialized to 128 = equal probability of 0 and 1
    for(int i = 0; i < sizeof(upkr_probs); ++i)
        upkr_probs[i] = 128;
    
@@ -68,10 +126,13 @@ void* upkr_unpack(void* destination, void* compressed_data) {
    int prev_was_match = 0;
    int offset = 0;
    for(;;) {
+        // is match
        if(upkr_decode_bit(0)) {
+            // has offset
            if(prev_was_match || upkr_decode_bit(256)) {
                offset = upkr_decode_length(257) - 1;
                if(offset == 0) {
+                    // a 0 offset signals the end of the compressed data
                    break;
                }
            }
@@ -82,6 +143,9 @@ void* upkr_unpack(void* destination, void* compressed_data) {
            }
            prev_was_match = 1;
        } else {
+            // byte contains the previously read bits and indicates the number of
+            // read bits by the set top bit. Therefore it can be directly used as the
+            // context index. The set top bit ends up at bit position 8 and is not stored.
            int byte = 1;
            while(byte < 256) {
                int bit = upkr_decode_bit(byte);