add highlevel description of compressed format in unpack.c

This commit is contained in:
2024-05-25 22:02:32 +02:00
parent f33bcb2396
commit 0c5ba4e32c

View File

@@ -1,3 +1,56 @@
/*
A simple C unpacker for upkr compressed data.
This implements two variants, selected by the UPKR_BITSTREAM define:
- normal: faster and smaller on modern hardware as a whole byte at a time is
shifted into the rANS state, but requires 20 bits for the state
- bitstream: only single bits are shifted into the rANS state at a time
which allows the state to always fit in 16bits which is a boon
on very old CPUs.
The encoder and decoder need to be configured to use the same variant.
upkr compressed data is a rANS byte-/bit-stream encoding a series of literal
byte values and back-references as probability encoded bits.
upkr_decode_bit reads one bit from the rANS stream, taking a probability context
as parameter. The probability context is a byte estimating the probability of
a bit encoded in this context being set. It is updated by upkr_decode_bit
after each decoded bit to reflect the observed past frequencies of on/off bits.
There are a number of different contexts used in the compressed format. The order in the
upkr_probs array is arbitrary, the only requirement for the unpacker is that all bits
that shared the same context while encoding also share the same context while decoding.
The contexts are:
- is match
- has offset
- literal bit N (0-7) with already decoded highest bits of literal == M (255 total)
- offset bit N (one less than max offset bits)
- has offset bit N (max offset bits)
- length bit N (one less than max length bits)
- has length bit N (max length bits)
Literal bytes are encoded from highest to lowest bit, with the bit position and
the already decoded bits as context.
Offset and Length are encoded in an interlaced variant of Elias gamma coding. They
are encoded from lowest to highest bits. For each bit, first one bit is read in the
"has offset/length bit N" context. If this is set, offset/length bit N is read in its context
and the decoding continues with the next bit. If the "has bit N" is read as false, a
fixed 1 bit is added as the top bit at this position.
The high-level decode loop then looks like this:
loop:
if read_bit(IS_MATCH):
if prev_was_match || read_bit(HAS_OFFSET):
offset = read_length_or_offset(OFFSET) - 1
if offset == 0:
break
length = read_length_or_offset(LENGTH)
copy_bytes_from_offset(length, offset)
else:
read_and_push(literal)
*/
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned long u32;
@@ -14,6 +67,7 @@ u32 upkr_state;
int upkr_decode_bit(int context_index) {
#ifdef UPKR_BITSTREAM
// shift in single bits until rANS state is >= 32768
while(upkr_state < 32768) {
if(upkr_bits_left == 0) {
upkr_current_byte = *upkr_data_ptr++;
@@ -24,6 +78,7 @@ int upkr_decode_bit(int context_index) {
--upkr_bits_left;
}
#else
// shift in a full byte until rANS state is >= 4096
while(upkr_state < 4096) {
upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
}
@@ -32,6 +87,8 @@ int upkr_decode_bit(int context_index) {
int prob = upkr_probs[context_index];
int bit = (upkr_state & 255) < prob ? 1 : 0;
// rANS state and context probability update
// for the latter, add 1/16th (rounded) of difference from either 0 or 256
if(bit) {
upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255);
prob += (256 - prob + 8) >> 4;
@@ -60,6 +117,7 @@ void* upkr_unpack(void* destination, void* compressed_data) {
#ifdef UPKR_BITSTREAM
upkr_bits_left = 0;
#endif
// all contexts are initialized to 128 = equal probability of 0 and 1
for(int i = 0; i < sizeof(upkr_probs); ++i)
upkr_probs[i] = 128;
@@ -68,10 +126,13 @@ void* upkr_unpack(void* destination, void* compressed_data) {
int prev_was_match = 0;
int offset = 0;
for(;;) {
// is match
if(upkr_decode_bit(0)) {
// has offset
if(prev_was_match || upkr_decode_bit(256)) {
offset = upkr_decode_length(257) - 1;
if(offset == 0) {
// a 0 offset signals the end of the compressed data
break;
}
}
@@ -82,6 +143,9 @@ void* upkr_unpack(void* destination, void* compressed_data) {
}
prev_was_match = 1;
} else {
// byte contains the previously read bits and indicates the number of
// read bits by the set top bit. Therefore it can be directly used as the
// context index. The set top bit ends up at bit position 8 and is not stored.
int byte = 1;
while(byte < 256) {
int bit = upkr_decode_bit(byte);