mirror of
https://github.com/exoticorn/upkr.git
synced 2026-01-20 11:36:42 +01:00
add highlevel description of compressed format in unpack.c
This commit is contained in:
@@ -1,3 +1,56 @@
|
|||||||
|
/*
|
||||||
|
A simple C unpacker for upkr compressed data.
|
||||||
|
|
||||||
|
This implements two variants, selected by the UPKR_BITSTREAM define:
|
||||||
|
- normal: faster and smaller on modern hardware as whole bytes are shifted into
|
||||||
|
the rANS state at a time, but requires 20 bits for the state
|
||||||
|
- bitstream: only single bits are shifted into the rANS state at a time
|
||||||
|
which allows the state to always fit in 16 bits, which is a boon
|
||||||
|
on very old CPUs.
|
||||||
|
The encoder and decoder need to be configured to use the same variant.
|
||||||
|
|
||||||
|
upkr compressed data is a rANS byte-/bit-stream encoding a series of literal
|
||||||
|
byte values and back-references as probability encoded bits.
|
||||||
|
|
||||||
|
upkr_decode_bit reads one bit from the rANS stream, taking a probability context
|
||||||
|
as parameter. The probability context is a byte estimating the probability of
|
||||||
|
a bit encoded in this context being set. It is updated by upkr_decode_bit
|
||||||
|
after each decoded bit to reflect the observed past frequencies of on/off bits.
|
||||||
|
|
||||||
|
There are a number of different contexts used in the compressed format. The order in the
|
||||||
|
upkr_probs array is arbitrary, the only requirement for the unpacker is that all bits
|
||||||
|
that shared the same context while encoding also share the same context while decoding.
|
||||||
|
The contexts are:
|
||||||
|
- is match
|
||||||
|
- has offset
|
||||||
|
- literal bit N (0-7) with already decoded highest bits of literal == M (255 total)
|
||||||
|
- offset bit N (one less than max offset bits)
|
||||||
|
- has offset bit N (max offset bits)
|
||||||
|
- length bit N (one less than max length bits)
|
||||||
|
- has length bit N (max length bits)
|
||||||
|
|
||||||
|
Literal bytes are encoded from highest to lowest bit, with the bit position and
|
||||||
|
the already decoded bits as context.
|
||||||
|
|
||||||
|
Offset and Length are encoded in an interlaced variant of Elias gamma coding. They
|
||||||
|
are encoded from lowest to highest bits. For each bit, first one bit is read in the
|
||||||
|
"has offset/length bit N" context. If this is set, offset/length bit N is read in its context
|
||||||
|
and the decoding continues with the next bit. If the "has bit N" is read as false, a
|
||||||
|
fixed 1 bit is added as the top bit at this position.
|
||||||
|
|
||||||
|
The high-level decode loop then looks like this:
|
||||||
|
loop:
|
||||||
|
if read_bit(IS_MATCH):
|
||||||
|
if prev_was_match || read_bit(HAS_OFFSET):
|
||||||
|
offset = read_length_or_offset(OFFSET) - 1
|
||||||
|
if offset == 0:
|
||||||
|
break
|
||||||
|
length = read_length_or_offset(LENGTH)
|
||||||
|
copy_bytes_from_offset(length, offset)
|
||||||
|
else:
|
||||||
|
read_and_push(literal)
|
||||||
|
*/
|
||||||
|
|
||||||
typedef unsigned char u8;
|
typedef unsigned char u8;
|
||||||
typedef unsigned short u16;
|
typedef unsigned short u16;
|
||||||
typedef unsigned long u32;
|
typedef unsigned long u32;
|
||||||
@@ -14,6 +67,7 @@ u32 upkr_state;
|
|||||||
|
|
||||||
int upkr_decode_bit(int context_index) {
|
int upkr_decode_bit(int context_index) {
|
||||||
#ifdef UPKR_BITSTREAM
|
#ifdef UPKR_BITSTREAM
|
||||||
|
// shift in single bits until rANS state is >= 32768
|
||||||
while(upkr_state < 32768) {
|
while(upkr_state < 32768) {
|
||||||
if(upkr_bits_left == 0) {
|
if(upkr_bits_left == 0) {
|
||||||
upkr_current_byte = *upkr_data_ptr++;
|
upkr_current_byte = *upkr_data_ptr++;
|
||||||
@@ -24,6 +78,7 @@ int upkr_decode_bit(int context_index) {
|
|||||||
--upkr_bits_left;
|
--upkr_bits_left;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
// shift in a full byte until rANS state is >= 4096
|
||||||
while(upkr_state < 4096) {
|
while(upkr_state < 4096) {
|
||||||
upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
|
upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
|
||||||
}
|
}
|
||||||
@@ -32,6 +87,8 @@ int upkr_decode_bit(int context_index) {
|
|||||||
int prob = upkr_probs[context_index];
|
int prob = upkr_probs[context_index];
|
||||||
int bit = (upkr_state & 255) < prob ? 1 : 0;
|
int bit = (upkr_state & 255) < prob ? 1 : 0;
|
||||||
|
|
||||||
|
// rANS state and context probability update
|
||||||
|
// for the latter, add 1/16th (rounded) of the difference from either 0 or 256
|
||||||
if(bit) {
|
if(bit) {
|
||||||
upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255);
|
upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255);
|
||||||
prob += (256 - prob + 8) >> 4;
|
prob += (256 - prob + 8) >> 4;
|
||||||
@@ -60,6 +117,7 @@ void* upkr_unpack(void* destination, void* compressed_data) {
|
|||||||
#ifdef UPKR_BITSTREAM
|
#ifdef UPKR_BITSTREAM
|
||||||
upkr_bits_left = 0;
|
upkr_bits_left = 0;
|
||||||
#endif
|
#endif
|
||||||
|
// all contexts are initialized to 128 = equal probability of 0 and 1
|
||||||
for(int i = 0; i < sizeof(upkr_probs); ++i)
|
for(int i = 0; i < sizeof(upkr_probs); ++i)
|
||||||
upkr_probs[i] = 128;
|
upkr_probs[i] = 128;
|
||||||
|
|
||||||
@@ -68,10 +126,13 @@ void* upkr_unpack(void* destination, void* compressed_data) {
|
|||||||
int prev_was_match = 0;
|
int prev_was_match = 0;
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for(;;) {
|
for(;;) {
|
||||||
|
// is match
|
||||||
if(upkr_decode_bit(0)) {
|
if(upkr_decode_bit(0)) {
|
||||||
|
// has offset
|
||||||
if(prev_was_match || upkr_decode_bit(256)) {
|
if(prev_was_match || upkr_decode_bit(256)) {
|
||||||
offset = upkr_decode_length(257) - 1;
|
offset = upkr_decode_length(257) - 1;
|
||||||
if(offset == 0) {
|
if(offset == 0) {
|
||||||
|
// a 0 offset signals the end of the compressed data
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,6 +143,9 @@ void* upkr_unpack(void* destination, void* compressed_data) {
|
|||||||
}
|
}
|
||||||
prev_was_match = 1;
|
prev_was_match = 1;
|
||||||
} else {
|
} else {
|
||||||
|
// byte contains the previously read bits and indicates the number of
|
||||||
|
// read bits by the set top bit. Therefore it can be directly used as the
|
||||||
|
// context index. The set top bit ends up at bit position 8 and is not stored.
|
||||||
int byte = 1;
|
int byte = 1;
|
||||||
while(byte < 256) {
|
while(byte < 256) {
|
||||||
int bit = upkr_decode_bit(byte);
|
int bit = upkr_decode_bit(byte);
|
||||||
|
|||||||
Reference in New Issue
Block a user