8 Commits

16 changed files with 832 additions and 187 deletions

58
Cargo.lock generated
View File

@@ -95,6 +95,24 @@ dependencies = [
"winapi",
]
[[package]]
name = "proc-macro2"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd7356a8122b6c4a24a82b278680c73357984ca2fc79a0f9fa6dea7dced7c58"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
[[package]]
name = "sacabase"
version = "2.0.0"
@@ -104,6 +122,37 @@ dependencies = [
"num-traits",
]
[[package]]
name = "syn"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a99cb8c4b9a8ef0e7907cd3b617cc8dc04d571c4e73c8ae403d80ac160bb122"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a891860d3c8d66fec8e73ddb3765f90082374dbaaa833407b904a94f1a7eb43"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "time"
version = "0.1.44"
@@ -115,14 +164,21 @@ dependencies = [
"winapi",
]
[[package]]
name = "unicode-ident"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
[[package]]
name = "upkr"
version = "0.1.0"
version = "0.2.0-pre3"
dependencies = [
"anyhow",
"cdivsufsort",
"lexopt",
"pbr",
"thiserror",
]
[[package]]

View File

@@ -1,6 +1,6 @@
[package]
name = "upkr"
version = "0.2.0"
version = "0.2.0-pre3"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -9,4 +9,5 @@ edition = "2021"
cdivsufsort = "2"
lexopt = "0.2.1"
anyhow = "1"
thiserror = "1.0.36"
pbr = "1"

View File

@@ -6,10 +6,25 @@ test_riscv64: build/unpack_riscv64
qemu-riscv64 $< test_data.upk /tmp/out.bin
cmp test_data.bin /tmp/out.bin
build/unpack_riscv64.bin: unpack_riscv.S
build/unpack_riscv64.o: unpack_riscv.S
mkdir -p build
riscv64-linux-gnu-gcc -c -o build/unpack_riscv64.o $?
riscv64-linux-gnu-objcopy -O binary --only-section=.text build/unpack_riscv64.o $@
riscv64-linux-gnu-gcc -c -o $@ $?
build/unpack_riscv64.bin: build/unpack_riscv64.o
riscv64-linux-gnu-objcopy -O binary --only-section=.text $? $@
disas-riscv64: build/unpack_riscv64.o
riscv64-linux-gnu-objdump -d $?
build/unpack_riscv32.o: unpack_riscv.S
mkdir -p build
riscv64-linux-gnu-gcc -march=rv32imc -mabi=ilp32 -c -o $@ $?
build/unpack_riscv32.bin: build/unpack_riscv32.o
riscv64-linux-gnu-objcopy -O binary --only-section=.text $? $@
disas-riscv32: build/unpack_riscv32.o
riscv64-linux-gnu-objdump -d $?
build/unpack_armv6m: ../c_unpacker/main.c unpack_armv6m.S
mkdir -p build
@@ -32,5 +47,5 @@ test_c: build/unpack_c
$< test_data.upk /tmp/out.bin
cmp test_data.bin /tmp/out.bin
sizes: build/unpack_armv6m.bin build/unpack_riscv64.bin
sizes: build/unpack_armv6m.bin build/unpack_riscv64.bin build/unpack_riscv32.bin
ls -l build/*.bin

View File

@@ -1,6 +1,6 @@
.section .text
#define FRAME_SIZE (256+64*4+4)
#define FRAME_SIZE (256+32*4+4)
// x8 prob array ptr
// x9 prev was literal
@@ -28,39 +28,39 @@ upkr_unpack:
jal upkr_decode_bit
beqz x15, .Lliteral
li x14, 256
beqz x9, .Lread_offset
slli x14, x14, 8
beqz x9, .Lread_offset_inc_x14
jal upkr_decode_bit
beqz x15, .Lskip_offset
.Lread_offset:
jal t3, upkr_decode_number
addi x12, x9, -1
beqz x12, .Ldone
bnez x15, .Lread_offset
.Lskip_offset:
li x14, 256+64
.Lfinished_offset:
addi x14, x14, 64
jal t3, upkr_decode_number
1:
sub x15, x10, x12
lbu x15, (x15)
sb x15, (x10)
add x14, x10, t0
lbu x14, (x14)
.Lstore_byte:
sb x14, (x10)
addi x10, x10, 1
addi x9, x9, -1
bnez x9, 1b
addi x9, x9, 1
blt x9, x0, 1b
j .Lmainloop
.Lliteral:
li x14, 1
1:
jal upkr_decode_bit
addi x14, x14, -1
slli x14, x14, 1
add x14, x14, x15
srli x9, x14, 8
beqz x9, 1b
sb x14, 0(x10)
addi x10, x10, 1
j .Lmainloop
beqz x9, .Lliteral
j .Lstore_byte
.Lread_offset_inc_x14:
addi x14, x14, 1
.Lread_offset:
jal t3, upkr_decode_number
addi t0, x9, 1
bnez t0, .Lfinished_offset
.Ldone:
addi sp, sp, FRAME_SIZE
mv x8, x17
@@ -68,16 +68,14 @@ upkr_unpack:
jr t4
// x14 context index
// return: x9 decoded number
// return: x9 negative decoded number
upkr_decode_number:
mv t5, x14
li x9, 0
li x8, 1
li x8, -1
1:
addi x14, x14, 1
jal upkr_decode_bit
beqz x15, 1f
addi x14, x14, 1
jal upkr_decode_bit
beqz x15, 2f
add x9, x9, x8
@@ -99,46 +97,46 @@ upkr_load_byte:
// x11 in ptr
// x13 state
// x14 context index
// return: x15 decoded bit
// return:
// x14 context index + 1
// x15 decoded bit
upkr_decode_bit:
srli x15, x13, 12
beqz x15, upkr_load_byte
mv t0, x9
mv t1, x14
mv t2, x10
add x14, x14, sp
lbu x9, 0(x14)
lbu x12, 0(x14)
andi x10, x13, 255
sltu x15, x10, x9
sltu x15, x10, x12
srli x13, x13, 8
beqz x15, .Lelse
mul x13, x13, x9
mul x13, x13, x12
add x13, x13, x10
li x10, 256 + 8
sub x10, x10, x9
sub x10, x10, x12
srli x10, x10, 4
add x9, x9, x10
add x12, x12, x10
j .Lendif
.Lelse:
li x16, 256
sub x16, x16, x9
sub x16, x16, x12
mul x13, x13, x16
add x13, x13, x10
sub x13, x13, x9
addi x10, x9, 8
sub x13, x13, x12
addi x10, x12, 8
srli x10, x10, 4
sub x9, x9, x10
sub x12, x12, x10
.Lendif:
sb x9, 0(x14)
sb x12, 0(x14)
mv x9, t0
mv x14, t1
addi x14, t1, 1
mv x10, t2
ret

3
fuzz/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
target
corpus
artifacts

247
fuzz/Cargo.lock generated Normal file
View File

@@ -0,0 +1,247 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "anyhow"
version = "1.0.65"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602"
[[package]]
name = "arbitrary"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f44124848854b941eafdb34f05b3bcf59472f643c7e151eba7c2b69daa469ed5"
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "cc"
version = "1.0.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
dependencies = [
"jobserver",
]
[[package]]
name = "cdivsufsort"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c"
dependencies = [
"cc",
"sacabase",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "crossbeam-channel"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
dependencies = [
"cfg-if",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "jobserver"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b"
dependencies = [
"libc",
]
[[package]]
name = "lexopt"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478ee9e62aaeaf5b140bd4138753d1f109765488581444218d3ddda43234f3e8"
[[package]]
name = "libc"
version = "0.2.133"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966"
[[package]]
name = "libfuzzer-sys"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae185684fe19814afd066da15a7cc41e126886c21282934225d9fc847582da58"
dependencies = [
"arbitrary",
"cc",
"once_cell",
]
[[package]]
name = "num-traits"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
[[package]]
name = "pbr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2"
dependencies = [
"crossbeam-channel",
"libc",
"time",
"winapi",
]
[[package]]
name = "proc-macro2"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd7356a8122b6c4a24a82b278680c73357984ca2fc79a0f9fa6dea7dced7c58"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
[[package]]
name = "sacabase"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84"
dependencies = [
"num-traits",
]
[[package]]
name = "syn"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a99cb8c4b9a8ef0e7907cd3b617cc8dc04d571c4e73c8ae403d80ac160bb122"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a891860d3c8d66fec8e73ddb3765f90082374dbaaa833407b904a94f1a7eb43"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "time"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
dependencies = [
"libc",
"wasi",
"winapi",
]
[[package]]
name = "unicode-ident"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
[[package]]
name = "upkr"
version = "0.2.0-pre3"
dependencies = [
"anyhow",
"cdivsufsort",
"lexopt",
"pbr",
"thiserror",
]
[[package]]
name = "upkr-fuzz"
version = "0.0.0"
dependencies = [
"libfuzzer-sys",
"upkr",
]
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

31
fuzz/Cargo.toml Normal file
View File

@@ -0,0 +1,31 @@
[package]
name = "upkr-fuzz"
version = "0.0.0"
authors = ["Automatically generated"]
publish = false
edition = "2018"
[package.metadata]
cargo-fuzz = true
[dependencies]
libfuzzer-sys = "0.4"
[dependencies.upkr]
path = ".."
# Prevent this from interfering with workspaces
[workspace]
members = ["."]
[[bin]]
name = "all_configs"
path = "fuzz_targets/all_configs.rs"
test = false
doc = false
[[bin]]
name = "unpack"
path = "fuzz_targets/unpack.rs"
test = false
doc = false

View File

@@ -0,0 +1,29 @@
#![no_main]
use libfuzzer_sys::fuzz_target;
fuzz_target!(|data: &[u8]| {
    // Round-trip fuzzing across encoder configurations. When more than two bytes are
    // available, the first two are consumed as flag bytes that select the configuration
    // and the compression level; the remainder is the payload. Shorter inputs are packed
    // unchanged with the default configuration at level 1.
    let mut config = upkr::Config::default();
    let (level, data) = if data.len() > 2 {
        let (flags, payload) = data.split_at(2);
        let (flags1, flags2) = (flags[0], flags[1]);
        let is_set = |byte: u8, mask: u8| (byte & mask) != 0;

        // Every bit of the first flag byte toggles one configuration option.
        config.use_bitstream = is_set(flags1, 1);
        config.parity_contexts = if is_set(flags1, 2) { 2 } else { 1 };
        config.invert_bit_encoding = is_set(flags1, 4);
        config.is_match_bit = is_set(flags1, 8);
        config.new_offset_bit = is_set(flags1, 16);
        config.continue_value_bit = is_set(flags1, 32);
        config.bitstream_is_big_endian = is_set(flags1, 64);
        config.simplified_prob_update = is_set(flags1, 128);

        // Second flag byte: bits 0-2 and 5 pick format limits, bits 3-4 pick the level.
        config.no_repeated_offsets = is_set(flags2, 32);
        config.eof_in_length = is_set(flags2, 1);
        config.max_offset = if is_set(flags2, 2) { 32 } else { usize::MAX };
        config.max_length = if is_set(flags2, 4) { 5 } else { usize::MAX };

        ((flags2 >> 3) & 3, payload)
    } else {
        (1, data)
    };

    // Whatever the configuration, unpacking the packed payload must reproduce it exactly.
    let packed = upkr::pack(data, level, &config, None);
    let unpacked = upkr::unpack(&packed, &config, 1024 * 1024).unwrap();
    assert!(unpacked == data);
});

View File

@@ -0,0 +1,6 @@
#![no_main]
use libfuzzer_sys::fuzz_target;
fuzz_target!(|data: &[u8]| {
    // Run the decoder on arbitrary bytes using the default configuration and a
    // 64 KiB limit on the unpacked size. The result is deliberately discarded:
    // this target only exercises the decoder on untrusted input.
    let config = upkr::Config::default();
    let _ = upkr::unpack(data, &config, 64 * 1024);
});

View File

@@ -1,4 +1,7 @@
use crate::rans::{PROB_BITS, ONE_PROB};
use crate::{
rans::{ONE_PROB, PROB_BITS},
Config,
};
const INIT_PROB: u16 = 1 << (PROB_BITS - 1);
const UPDATE_RATE: u32 = 4;
@@ -7,6 +10,8 @@ const UPDATE_ADD: u32 = 8;
#[derive(Clone)]
pub struct ContextState {
contexts: Vec<u8>,
invert_bit_encoding: bool,
simplified_prob_update: bool,
}
pub struct Context<'a> {
@@ -15,9 +20,11 @@ pub struct Context<'a> {
}
impl ContextState {
pub fn new(size: usize) -> ContextState {
pub fn new(size: usize, config: &Config) -> ContextState {
ContextState {
contexts: vec![INIT_PROB as u8; size],
invert_bit_encoding: config.invert_bit_encoding,
simplified_prob_update: config.simplified_prob_update,
}
}
@@ -33,10 +40,21 @@ impl<'a> Context<'a> {
pub fn update(&mut self, bit: bool) {
let old = self.state.contexts[self.index];
self.state.contexts[self.index] = if bit {
self.state.contexts[self.index] = if self.state.simplified_prob_update {
let offset = if bit ^ self.state.invert_bit_encoding {
ONE_PROB as i32 >> UPDATE_RATE
} else {
0
};
(offset + old as i32 - ((old as i32 + UPDATE_ADD as i32) >> UPDATE_RATE)) as u8
} else {
if bit ^ self.state.invert_bit_encoding {
old + ((ONE_PROB - old as u32 + UPDATE_ADD) >> UPDATE_RATE) as u8
} else {
old - ((old as u32 + UPDATE_ADD) >> UPDATE_RATE) as u8
}
};
}
}

View File

@@ -1,17 +1,16 @@
use crate::lz;
use crate::match_finder::MatchFinder;
use crate::rans::RansCoder;
use crate::ProgressCallback;
use crate::{lz, Config};
pub fn pack(
data: &[u8],
use_bitstream: bool,
parity_contexts: usize,
config: &Config,
mut progress_callback: Option<ProgressCallback>,
) -> Vec<u8> {
let mut match_finder = MatchFinder::new(data);
let mut rans_coder = RansCoder::new(use_bitstream);
let mut state = lz::CoderState::new(parity_contexts);
let mut rans_coder = RansCoder::new(config);
let mut state = lz::CoderState::new(config);
let mut pos = 0;
while pos < data.len() {
@@ -20,15 +19,16 @@ pub fn pack(
}
let mut encoded_match = false;
if let Some(m) = match_finder.matches(pos).next() {
let max_offset = 1 << (m.length * 3 - 1).min(31);
let max_offset = config.max_offset.min(1 << (m.length * 3 - 1).min(31));
let offset = pos - m.pos;
if offset < max_offset {
if offset < max_offset && m.length >= config.min_length() {
let length = m.length.min(config.max_length);
lz::Op::Match {
offset: offset as u32,
len: m.length as u32,
len: length as u32,
}
.encode(&mut rans_coder, &mut state);
pos += m.length;
.encode(&mut rans_coder, &mut state, config);
pos += length;
encoded_match = true;
}
}
@@ -40,13 +40,14 @@ pub fn pack(
.iter()
.zip(data[(pos - offset)..].iter())
.take_while(|(a, b)| a == b)
.count();
if length > 0 {
.count()
.min(config.max_length);
if length >= config.min_length() {
lz::Op::Match {
offset: offset as u32,
len: length as u32,
}
.encode(&mut rans_coder, &mut state);
.encode(&mut rans_coder, &mut state, config);
pos += length;
encoded_match = true;
}
@@ -54,11 +55,11 @@ pub fn pack(
}
if !encoded_match {
lz::Op::Literal(data[pos]).encode(&mut rans_coder, &mut state);
lz::Op::Literal(data[pos]).encode(&mut rans_coder, &mut state, config);
pos += 1;
}
}
lz::encode_eof(&mut rans_coder, &mut state);
lz::encode_eof(&mut rans_coder, &mut state, config);
rans_coder.finish()
}

View File

@@ -5,13 +5,28 @@ mod match_finder;
mod parsing_packer;
mod rans;
pub use lz::unpack;
pub use lz::{calculate_margin, unpack, UnpackError};
pub type ProgressCallback<'a> = &'a mut dyn FnMut(usize);
/// Options describing the packed stream format and the limits the packers obey.
///
/// The same values have to be used for packing and unpacking a given stream.
#[derive(Debug)]
pub struct Config {
/// Selects bitstream mode (`-b` / `--bitstream` on the command line).
pub use_bitstream: bool,
/// Number of literal context sets; the set used for a literal is chosen by
/// `position % parity_contexts`.
pub parity_contexts: usize,
/// XORed with the coded bit when a context's probability is updated.
pub invert_bit_encoding: bool,
/// Bit value that marks an operation as a match; literals use the opposite value.
pub is_match_bit: bool,
/// Bit value signalling that a match carries a new offset instead of reusing
/// the previous one.
pub new_offset_bit: bool,
/// Bit value meaning "more bits follow" when a length/offset value is coded.
pub continue_value_bit: bool,
/// Set together with `use_bitstream` by `--big-endian-bitstream`.
// NOTE(review): presumably controls the bit order inside the rANS bitstream — confirm in rans.rs.
pub bitstream_is_big_endian: bool,
/// Selects the alternative (simplified) probability update formula for contexts.
pub simplified_prob_update: bool,
/// When set, the "new offset" flag bit is never coded and every match carries
/// an explicit offset.
pub no_repeated_offsets: bool,
/// When set, end of stream is signalled by a match length of 1 (offsets are then
/// coded without the `+ 1` bias); otherwise it is signalled by a decoded offset of 0.
pub eof_in_length: bool,
/// Largest match offset the packers will emit.
pub max_offset: usize,
/// Largest match length the packers will emit; longer matches are clamped.
pub max_length: usize,
}
impl Default for Config {
@@ -19,6 +34,30 @@ impl Default for Config {
Config {
use_bitstream: false,
parity_contexts: 1,
invert_bit_encoding: false,
is_match_bit: true,
new_offset_bit: true,
continue_value_bit: true,
bitstream_is_big_endian: false,
simplified_prob_update: false,
no_repeated_offsets: false,
eof_in_length: false,
max_offset: usize::MAX,
max_length: usize::MAX,
}
}
}
impl Config {
    /// Shortest match length that may be encoded under this configuration.
    ///
    /// Returns 2 when `eof_in_length` is set and 1 otherwise.
    pub fn min_length(&self) -> usize {
        // `usize::from(bool)` is 0 for `false` and 1 for `true`.
        1 + usize::from(self.eof_in_length)
    }
}
@@ -26,24 +65,13 @@ impl Default for Config {
pub fn pack(
data: &[u8],
level: u8,
config: Config,
config: &Config,
progress_callback: Option<ProgressCallback>,
) -> Vec<u8> {
if level == 0 {
greedy_packer::pack(
data,
config.use_bitstream,
config.parity_contexts,
progress_callback,
)
greedy_packer::pack(data, config, progress_callback)
} else {
parsing_packer::pack(
data,
level,
config.use_bitstream,
config.parity_contexts,
progress_callback,
)
parsing_packer::pack(data, level, config, progress_callback)
}
}

177
src/lz.rs
View File

@@ -1,5 +1,7 @@
use crate::context_state::ContextState;
use crate::rans::{EntropyCoder, RansDecoder};
use crate::Config;
use thiserror::Error;
#[derive(Copy, Clone, Debug)]
pub enum Op {
@@ -8,11 +10,11 @@ pub enum Op {
}
impl Op {
pub fn encode(&self, coder: &mut dyn EntropyCoder, state: &mut CoderState) {
pub fn encode(&self, coder: &mut dyn EntropyCoder, state: &mut CoderState, config: &Config) {
let literal_base = state.pos % state.parity_contexts * 256;
match self {
&Op::Literal(lit) => {
encode_bit(coder, state, literal_base, false);
encode_bit(coder, state, literal_base, !config.is_match_bit);
let mut context_index = 1;
for i in (0..8).rev() {
let bit = (lit >> i) & 1 != 0;
@@ -23,22 +25,30 @@ impl Op {
state.pos += 1;
}
&Op::Match { offset, len } => {
encode_bit(coder, state, literal_base, true);
if !state.prev_was_match {
encode_bit(coder, state, literal_base, config.is_match_bit);
let mut new_offset = true;
if !state.prev_was_match && !config.no_repeated_offsets {
new_offset = offset != state.last_offset;
encode_bit(
coder,
state,
256 * state.parity_contexts,
offset != state.last_offset,
new_offset == config.new_offset_bit,
);
} else {
assert!(offset != state.last_offset);
}
if offset != state.last_offset {
encode_length(coder, state, 256 * state.parity_contexts + 1, offset + 1);
assert!(offset as usize <= config.max_offset);
if new_offset {
encode_length(
coder,
state,
256 * state.parity_contexts + 1,
offset + if config.eof_in_length { 0 } else { 1 },
config,
);
state.last_offset = offset;
}
encode_length(coder, state, 256 * state.parity_contexts + 65, len);
assert!(len as usize >= config.min_length() && len as usize <= config.max_length);
encode_length(coder, state, 256 * state.parity_contexts + 65, len, config);
state.prev_was_match = true;
state.pos += len as usize;
}
@@ -46,12 +56,27 @@ impl Op {
}
}
pub fn encode_eof(coder: &mut dyn EntropyCoder, state: &mut CoderState) {
encode_bit(coder, state, state.pos % state.parity_contexts * 256, true);
if !state.prev_was_match {
encode_bit(coder, state, 256 * state.parity_contexts, true);
pub fn encode_eof(coder: &mut dyn EntropyCoder, state: &mut CoderState, config: &Config) {
encode_bit(
coder,
state,
state.pos % state.parity_contexts * 256,
config.is_match_bit,
);
if !state.prev_was_match && !config.no_repeated_offsets {
encode_bit(
coder,
state,
256 * state.parity_contexts,
config.new_offset_bit ^ config.eof_in_length,
);
}
if !config.eof_in_length || state.prev_was_match || config.no_repeated_offsets {
encode_length(coder, state, 256 * state.parity_contexts + 1, 1, config);
}
if config.eof_in_length {
encode_length(coder, state, 256 * state.parity_contexts + 65, 1, config);
}
encode_length(coder, state, 256 * state.parity_contexts + 1, 1);
}
fn encode_bit(
@@ -68,36 +93,37 @@ fn encode_length(
state: &mut CoderState,
context_start: usize,
mut value: u32,
config: &Config,
) {
assert!(value >= 1);
let mut context_index = context_start;
while value >= 2 {
encode_bit(coder, state, context_index, true);
encode_bit(coder, state, context_index, config.continue_value_bit);
encode_bit(coder, state, context_index + 1, value & 1 != 0);
context_index += 2;
value >>= 1;
}
encode_bit(coder, state, context_index, false);
encode_bit(coder, state, context_index, !config.continue_value_bit);
}
#[derive(Clone)]
pub struct CoderState {
contexts: ContextState,
parity_contexts: usize,
last_offset: u32,
prev_was_match: bool,
pos: usize,
parity_contexts: usize,
}
impl CoderState {
pub fn new(parity_contexts: usize) -> CoderState {
pub fn new(config: &Config) -> CoderState {
CoderState {
contexts: ContextState::new((1 + 255) * parity_contexts + 1 + 64 + 64),
contexts: ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64, config),
last_offset: 0,
parity_contexts,
prev_was_match: false,
pos: 0,
parity_contexts: config.parity_contexts,
}
}
@@ -106,42 +132,89 @@ impl CoderState {
}
}
pub fn unpack(packed_data: &[u8], config: crate::Config) -> Vec<u8> {
let mut decoder = RansDecoder::new(packed_data, config.use_bitstream);
let mut contexts = ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64);
/// Errors reported while decoding a packed stream.
#[derive(Error, Debug)]
pub enum UnpackError {
/// A match referenced data before the start of the output
/// (the decoded offset was larger than the current output position).
#[error("match offset out of range: {offset} > {position}")]
OffsetOutOfRange { offset: usize, position: usize },
/// The total unpacked size exceeded the caller-supplied limit.
#[error("Unpacked data over size limit: {size} > {limit}")]
OverSize { size: usize, limit: usize },
/// The decoder ran out of input; converted from the rANS decoder's error via `From`.
#[error("Unexpected end of input data")]
UnexpectedEOF {
#[from]
source: crate::rans::UnexpectedEOF,
},
/// A length/offset value needed 32 or more bits.
#[error("Overflow while reading value")]
ValueOverflow,
}
pub fn unpack(
packed_data: &[u8],
config: &Config,
max_size: usize,
) -> Result<Vec<u8>, UnpackError> {
let mut result = vec![];
let mut offset = 0;
let _ = unpack_internal(Some(&mut result), packed_data, config, max_size)?;
Ok(result)
}
/// Calculates the margin for overlapped unpacking of `packed_data`.
///
/// Decodes the stream without storing any output (no result buffer is passed and
/// the size limit is `usize::MAX`) and returns the value computed by
/// `unpack_internal`: the largest amount by which the output position was ahead of
/// the input read position at the start of any decoding step, plus the final read
/// position minus the final output position.
///
/// # Errors
///
/// Returns an [`UnpackError`] if decoding the stream fails.
pub fn calculate_margin(packed_data: &[u8], config: &Config) -> Result<isize, UnpackError> {
unpack_internal(None, packed_data, config, usize::MAX)
}
pub fn unpack_internal(
mut result: Option<&mut Vec<u8>>,
packed_data: &[u8],
config: &Config,
max_size: usize,
) -> Result<isize, UnpackError> {
let mut decoder = RansDecoder::new(packed_data, &config);
let mut contexts = ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64, &config);
let mut offset = usize::MAX;
let mut position = 0usize;
let mut prev_was_match = false;
let mut margin = 0isize;
fn decode_length(
decoder: &mut RansDecoder,
contexts: &mut ContextState,
mut context_index: usize,
) -> usize {
config: &Config,
) -> Result<usize, UnpackError> {
let mut length = 0;
let mut bit_pos = 0;
while decoder.decode_with_context(&mut contexts.context_mut(context_index)) {
length |= (decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))
while decoder.decode_with_context(&mut contexts.context_mut(context_index))?
== config.continue_value_bit
{
length |= (decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))?
as usize)
<< bit_pos;
bit_pos += 1;
if bit_pos >= 32 {
return Err(UnpackError::ValueOverflow);
}
context_index += 2;
}
length | (1 << bit_pos)
Ok(length | (1 << bit_pos))
}
loop {
let literal_base = result.len() % config.parity_contexts * 256;
if decoder.decode_with_context(&mut contexts.context_mut(literal_base)) {
if prev_was_match
margin = margin.max(position as isize - decoder.pos() as isize);
let literal_base = position % config.parity_contexts * 256;
if decoder.decode_with_context(&mut contexts.context_mut(literal_base))?
== config.is_match_bit
{
if config.no_repeated_offsets
|| prev_was_match
|| decoder
.decode_with_context(&mut contexts.context_mut(256 * config.parity_contexts))
.decode_with_context(&mut contexts.context_mut(256 * config.parity_contexts))?
== config.new_offset_bit
{
offset = decode_length(
&mut decoder,
&mut contexts,
256 * config.parity_contexts + 1,
) - 1;
&config,
)? - if config.eof_in_length { 0 } else { 1 };
if offset == 0 {
break;
}
@@ -150,24 +223,50 @@ pub fn unpack(packed_data: &[u8], config: crate::Config) -> Vec<u8> {
&mut decoder,
&mut contexts,
256 * config.parity_contexts + 65,
);
for _ in 0..length {
result.push(result[result.len() - offset]);
&config,
)?;
if config.eof_in_length && length == 1 {
break;
}
if offset > position {
return Err(UnpackError::OffsetOutOfRange { offset, position });
}
if let Some(ref mut result) = result {
for _ in 0..length {
if result.len() < max_size {
result.push(result[result.len() - offset]);
} else {
break;
}
}
}
position += length;
prev_was_match = true;
} else {
let mut context_index = 1;
let mut byte = 0;
for i in (0..8).rev() {
let bit = decoder
.decode_with_context(&mut contexts.context_mut(literal_base + context_index));
.decode_with_context(&mut contexts.context_mut(literal_base + context_index))?;
context_index = (context_index << 1) | bit as usize;
byte |= (bit as u8) << i;
}
if let Some(ref mut result) = result {
if result.len() < max_size {
result.push(byte);
}
}
position += 1;
prev_was_match = false;
}
}
result
if position > max_size {
return Err(UnpackError::OverSize {
size: position,
limit: max_size,
});
}
Ok(margin + decoder.pos() as isize - position as isize)
}

View File

@@ -8,9 +8,11 @@ fn main() -> Result<()> {
let mut config = upkr::Config::default();
let mut reverse = false;
let mut unpack = false;
let mut calculate_margin = false;
let mut level = 2;
let mut infile: Option<PathBuf> = None;
let mut outfile: Option<PathBuf> = None;
let mut max_unpacked_size = 512 * 1024 * 1024;
let mut parser = lexopt::Parser::from_env();
while let Some(arg) = parser.next()? {
@@ -19,9 +21,41 @@ fn main() -> Result<()> {
Short('b') | Long("bitstream") => config.use_bitstream = true,
Short('p') | Long("parity") => config.parity_contexts = parser.value()?.parse()?,
Short('r') | Long("reverse") => reverse = true,
Long("invert-is-match-bit") => config.is_match_bit = false,
Long("invert-new-offset-bit") => config.new_offset_bit = false,
Long("invert-continue-value-bit") => config.continue_value_bit = false,
Long("invert-bit-encoding") => config.invert_bit_encoding = true,
Long("simplified-prob-update") => config.simplified_prob_update = true,
Long("big-endian-bitstream") => {
config.use_bitstream = true;
config.bitstream_is_big_endian = true;
}
Long("no-repeated-offsets") => config.no_repeated_offsets = true,
Long("eof-in-length") => config.eof_in_length = true,
Long("max-offset") => config.max_offset = parser.value()?.parse()?,
Long("max-length") => config.max_length = parser.value()?.parse()?,
Long("z80") => {
config.use_bitstream = true;
config.bitstream_is_big_endian = true;
config.invert_bit_encoding = true;
config.simplified_prob_update = true;
level = 9;
}
Long("x86") => {
config.use_bitstream = true;
config.continue_value_bit = false;
config.is_match_bit = false;
config.new_offset_bit = false;
}
Short('u') | Long("unpack") => unpack = true,
Long("margin") => calculate_margin = true,
Short('l') | Long("level") => level = parser.value()?.parse()?,
Short(n) if n.is_ascii_digit() => level = n as u8 - b'0',
Short('h') | Long("help") => print_help(0),
Long("max-unpacked-size") => max_unpacked_size = parser.value()?.parse()?,
Value(val) if infile.is_none() => infile = Some(val.try_into()?),
Value(val) if outfile.is_none() => outfile = Some(val.try_into()?),
_ => return Err(arg.unexpected().into()),
@@ -53,7 +87,7 @@ fn main() -> Result<()> {
process::exit(1);
}
if !unpack {
if !unpack && !calculate_margin {
let mut data = vec![];
File::open(infile)?.read_to_end(&mut data)?;
if reverse {
@@ -65,7 +99,7 @@ fn main() -> Result<()> {
let mut packed_data = upkr::pack(
&data,
level,
config,
&config,
Some(&mut |pos| {
pb.set(pos as u64);
}),
@@ -89,12 +123,17 @@ fn main() -> Result<()> {
if reverse {
data.reverse();
}
let mut unpacked_data = upkr::unpack(&data, config);
if unpack {
let mut unpacked_data = upkr::unpack(&data, &config, max_unpacked_size)?;
if reverse {
unpacked_data.reverse();
}
File::create(outfile)?.write_all(&unpacked_data)?;
}
if calculate_margin {
println!("{}", upkr::calculate_margin(&data, &config)?);
}
}
Ok(())
}
@@ -103,13 +142,34 @@ fn print_help(exit_code: i32) -> ! {
eprintln!("Usage:");
eprintln!(" upkr [-l level(0-9)] [config options] <infile> [<outfile>]");
eprintln!(" upkr -u [config options] <infile> [<outfile>]");
eprintln!(" upkr --margin [config options] <infile>");
eprintln!();
eprintln!(" -l, --level N compression level 0-9");
eprintln!(" -0, ..., -9 short form for setting compression level");
eprintln!(" -u, --unpack unpack infile");
eprintln!(" --margin calculate margin for overlapped unpacking of a packed file");
eprintln!();
eprintln!("Config presets for specific unpackers:");
eprintln!(" --z80 --big-endian-bitstream --invert-bit-encoding --simplified-prob-update -9");
eprintln!(
" --x86 --bitstream --invert-is-match-bit --invert-continue-value-bit --invert-new-offset-bit"
);
eprintln!();
eprintln!("Config options (need to match when packing/unpacking):");
eprintln!(" -b, --bitstream bitstream mode");
eprintln!(" -p, --parity N use N (2/4) parity contexts");
eprintln!(" -r, --reverse reverse input & output");
eprintln!();
eprintln!("Config options to tailor output to specific optimized unpackers:");
eprintln!(" --invert-is-match-bit");
eprintln!(" --invert-new-offset-bit");
eprintln!(" --invert-continue-value-bit");
eprintln!(" --invert-bit-encoding");
eprintln!(" --simplified-prob-update");
eprintln!(" --big-endian-bitstream (implies --bitstream)");
eprintln!(" --no-repeated-offsets");
eprintln!(" --eof-in-length");
eprintln!(" --max-offset N");
eprintln!(" --max-length N");
process::exit(exit_code);
}

View File

@@ -9,27 +9,21 @@ use crate::{lz, ProgressCallback};
/// Compresses `data` and returns the packed bytes.
///
/// `level` is turned into parser settings with `Config::from_level`.
/// `config` is the encoding configuration; the same reference is handed to
/// the parser, the coder state, the rANS coder and every op that is encoded.
/// `progress_cb` is passed through to the parser unchanged.
pub fn pack(
    data: &[u8],
    level: u8,
    config: &crate::Config,
    progress_cb: Option<ProgressCallback>,
) -> Vec<u8> {
    let mut parse = parse(data, Config::from_level(level), config, progress_cb);

    // The parse result is walked through its `prev` links; the ops collected
    // this way are then encoded in the reverse of the walking order.
    let mut ops = vec![];
    while let Some(link) = parse {
        ops.push(link.op);
        parse = link.prev.clone();
    }

    let mut state = lz::CoderState::new(config);
    let mut coder = RansCoder::new(config);
    for op in ops.into_iter().rev() {
        op.encode(&mut coder, &mut state, config);
    }
    lz::encode_eof(&mut coder, &mut state, config);
    coder.finish()
}
@@ -49,7 +43,7 @@ type Arrivals = HashMap<usize, Vec<Arrival>>;
fn parse(
data: &[u8],
config: Config,
parity_contexts: usize,
encoding_config: &crate::Config,
mut progress_cb: Option<ProgressCallback>,
) -> Option<Rc<Parse>> {
let mut match_finder = MatchFinder::new(data)
@@ -111,17 +105,22 @@ fn parse(
cost_counter: &mut CostCounter,
pos: usize,
offset: usize,
length: usize,
mut length: usize,
arrival: &Arrival,
max_arrivals: usize,
config: &crate::Config,
) {
if length < config.min_length() {
return;
}
length = length.min(config.max_length);
cost_counter.reset();
let mut state = arrival.state.clone();
let op = lz::Op::Match {
offset: offset as u32,
len: length as u32,
};
op.encode(cost_counter, &mut state);
op.encode(cost_counter, &mut state, config);
add_arrival(
arrivals,
pos + length,
@@ -141,13 +140,13 @@ fn parse(
0,
Arrival {
parse: None,
state: lz::CoderState::new(parity_contexts),
state: lz::CoderState::new(encoding_config),
cost: 0.0,
},
max_arrivals,
);
let cost_counter = &mut CostCounter::new();
let cost_counter = &mut CostCounter::new(encoding_config);
let mut best_per_offset = HashMap::new();
for pos in 0..data.len() {
let match_length = |offset: usize| {
@@ -188,6 +187,7 @@ fn parse(
for m in match_finder.matches(pos) {
closest_match = Some(closest_match.unwrap_or(0).max(m.pos));
let offset = pos - m.pos;
if offset <= encoding_config.max_offset {
found_last_offset |= offset as u32 == arrival.state.last_offset();
add_match(
&mut arrivals,
@@ -197,11 +197,13 @@ fn parse(
m.length,
&arrival,
max_arrivals,
encoding_config,
);
if m.length >= config.greedy_size {
break 'arrival_loop;
}
}
}
let mut near_matches_left = config.num_near_matches;
let mut match_pos = last_seen[data[pos] as usize];
@@ -210,6 +212,9 @@ fn parse(
&& closest_match.iter().all(|p| *p < match_pos)
{
let offset = pos - match_pos;
if offset > encoding_config.max_offset {
break;
}
let length = match_length(offset);
assert!(length > 0);
add_match(
@@ -220,6 +225,7 @@ fn parse(
length,
&arrival,
max_arrivals,
encoding_config,
);
found_last_offset |= offset as u32 == arrival.state.last_offset();
if offset < near_matches.len() {
@@ -240,6 +246,7 @@ fn parse(
length,
&arrival,
max_arrivals,
encoding_config,
);
}
}
@@ -247,7 +254,7 @@ fn parse(
cost_counter.reset();
let mut state = arrival.state;
let op = lz::Op::Literal(data[pos]);
op.encode(cost_counter, &mut state);
op.encode(cost_counter, &mut state, encoding_config);
add_arrival(
&mut arrivals,
pos + 1,

View File

@@ -1,4 +1,5 @@
use crate::context_state::Context;
use crate::{context_state::Context, Config};
use thiserror::Error;
/// Number of low state bits that the decoder's state update splits off
/// (`state >> PROB_BITS` / `state & PROB_MASK`).
pub const PROB_BITS: u32 = 8;
/// `1 << PROB_BITS`; `CostCounter` uses `ONE_PROB - prob` as the probability
/// of the opposite bit and sizes its lookup table as `0..ONE_PROB`.
pub const ONE_PROB: u32 = 1 << PROB_BITS;
@@ -15,20 +16,25 @@ pub trait EntropyCoder {
/// Coder that buffers every encoded bit together with its probability.
pub struct RansCoder {
/// Entries pushed by `encode_bit`: the probability in the low 15 bits and
/// the bit flag in bit 15.
bits: Vec<u16>,
/// Copied from `Config::use_bitstream`; selects the bit-wise flush closure
/// when the coder state is written out.
use_bitstream: bool,
/// Copied from `Config::bitstream_is_big_endian`; selects the order in which
/// flushed bits are packed into an output byte.
bitstream_is_big_endian: bool,
/// Copied from `Config::invert_bit_encoding`; when set, `encode_bit` flips
/// the bit before storing it.
invert_bit_encoding: bool,
}
impl EntropyCoder for RansCoder {
    /// Records `bit` with its probability `prob` as a single buffered entry.
    ///
    /// The entry keeps `prob` in the low 15 bits and the bit flag in bit 15.
    ///
    /// # Panics
    ///
    /// Panics if `prob` does not fit in 15 bits (`prob >= 32768`).
    fn encode_bit(&mut self, bit: bool, prob: u16) {
        assert!(prob < 32768);
        // Exactly one entry per call: the stored flag is the bit itself, or
        // its complement when inverted bit encoding is configured.
        let stored_bit = bit ^ self.invert_bit_encoding;
        self.bits.push(prob | ((stored_bit as u16) << 15));
    }
}
impl RansCoder {
pub fn new(use_bitstream: bool) -> RansCoder {
pub fn new(config: &Config) -> RansCoder {
RansCoder {
bits: Vec::new(),
use_bitstream,
use_bitstream: config.use_bitstream,
bitstream_is_big_endian: config.bitstream_is_big_endian,
invert_bit_encoding: config.invert_bit_encoding,
}
}
@@ -38,8 +44,20 @@ impl RansCoder {
let mut state = 1 << l_bits;
let mut byte = 0u8;
let mut bit = 8;
let mut bit = if self.bitstream_is_big_endian { 0 } else { 8 };
let mut flush_state: Box<dyn FnMut(&mut u32)> = if self.use_bitstream {
if self.bitstream_is_big_endian {
Box::new(|state: &mut u32| {
byte |= ((*state & 1) as u8) << bit;
bit += 1;
if bit == 8 {
buffer.push(byte);
byte = 0;
bit = 0;
}
*state >>= 1;
})
} else {
Box::new(|state: &mut u32| {
bit -= 1;
byte |= ((*state & 1) as u8) << bit;
@@ -50,6 +68,7 @@ impl RansCoder {
}
*state >>= 1;
})
}
} else {
Box::new(|state: &mut u32| {
buffer.push(*state as u8);
@@ -91,10 +110,11 @@ impl RansCoder {
/// `EntropyCoder` implementation that the parser passes to `op.encode` to
/// price an op instead of emitting output.
// NOTE(review): the code that adds to `cost` is outside this view; presumably
// the cost is measured in bits via `log2_table` — confirm.
pub struct CostCounter {
/// Running cost; starts at 0.0 in `new`.
cost: f64,
/// Lookup table built in `new` with one entry per value in `0..ONE_PROB`.
log2_table: Vec<f64>,
/// Copied from `Config::invert_bit_encoding`; XORed with the bit in
/// `encode_bit` to decide whether `prob` or `ONE_PROB - prob` is used.
invert_bit_encoding: bool,
}
impl CostCounter {
pub fn new() -> CostCounter {
pub fn new(config: &Config) -> CostCounter {
let log2_table = (0..ONE_PROB)
.map(|prob| {
let inv_prob = ONE_PROB as f64 / prob as f64;
@@ -104,6 +124,7 @@ impl CostCounter {
CostCounter {
cost: 0.0,
log2_table,
invert_bit_encoding: config.invert_bit_encoding,
}
}
@@ -118,7 +139,7 @@ impl CostCounter {
impl EntropyCoder for CostCounter {
fn encode_bit(&mut self, bit: bool, prob: u16) {
let prob = if bit {
let prob = if bit ^ self.invert_bit_encoding {
prob as u32
} else {
ONE_PROB - prob as u32
@@ -129,48 +150,73 @@ impl EntropyCoder for CostCounter {
/// Decoder that reads coded bits back out of a borrowed byte slice.
pub struct RansDecoder<'a> {
/// Input being decoded.
data: &'a [u8],
/// Index into `data` of the next byte to read; advanced by one per byte
/// consumed and exposed through `pos()`.
pos: usize,
/// Decoder state; refilled from the input while it is below 32768
/// (bitstream mode) or 4096 (byte mode).
state: u32,
/// Copied from `Config::use_bitstream`; selects bit-wise instead of
/// byte-wise refilling of `state`.
use_bitstream: bool,
/// Bitstream mode only: the input byte currently being consumed bit by bit.
byte: u8,
/// Bitstream mode only: number of bits of `byte` not yet consumed.
bits_left: u8,
/// Copied from `Config::invert_bit_encoding`; XORed with each decoded bit
/// before it is returned.
invert_bit_encoding: bool,
/// Copied from `Config::bitstream_is_big_endian`; when set, bits are taken
/// from the top of `byte` (`byte >> 7`), otherwise from the bottom
/// (`byte & 1`).
bitstream_is_big_endian: bool,
}
/// Mask selecting the low `PROB_BITS` bits of the decoder state.
const PROB_MASK: u32 = ONE_PROB - 1;
/// Error returned by `RansDecoder` when it needs another input byte but
/// `pos` has already reached the end of `data`.
#[derive(Debug, Error)]
#[error("Unexpected end of input")]
pub struct UnexpectedEOF;
impl<'a> RansDecoder<'a> {
pub fn new(data: &'a [u8], use_bitstream: bool) -> RansDecoder<'a> {
pub fn new(data: &'a [u8], config: &Config) -> RansDecoder<'a> {
RansDecoder {
data,
pos: 0,
state: 0,
use_bitstream,
use_bitstream: config.use_bitstream,
byte: 0,
bits_left: 0,
invert_bit_encoding: config.invert_bit_encoding,
bitstream_is_big_endian: config.bitstream_is_big_endian,
}
}
pub fn decode_with_context(&mut self, context: &mut Context) -> bool {
let bit = self.decode_bit(context.prob());
pub fn pos(&self) -> usize {
self.pos
}
pub fn decode_with_context(&mut self, context: &mut Context) -> Result<bool, UnexpectedEOF> {
let bit = self.decode_bit(context.prob())?;
context.update(bit);
bit
Ok(bit)
}
pub fn decode_bit(&mut self, prob: u16) -> bool {
pub fn decode_bit(&mut self, prob: u16) -> Result<bool, UnexpectedEOF> {
let prob = prob as u32;
if self.use_bitstream {
while self.state < 32768 {
if self.bits_left == 0 {
self.byte = self.data[0];
self.data = &self.data[1..];
if self.pos >= self.data.len() {
return Err(UnexpectedEOF);
}
self.byte = self.data[self.pos];
self.pos += 1;
self.bits_left = 8;
}
if self.bitstream_is_big_endian {
self.state = (self.state << 1) | (self.byte >> 7) as u32;
self.byte <<= 1;
} else {
self.state = (self.state << 1) | (self.byte & 1) as u32;
self.byte >>= 1;
}
self.bits_left -= 1;
}
} else {
while self.state < 4096 {
self.state = (self.state << 8) | self.data[0] as u32;
self.data = &self.data[1..];
if self.pos >= self.data.len() {
return Err(UnexpectedEOF);
}
self.state = (self.state << 8) | self.data[self.pos] as u32;
self.pos += 1;
}
}
@@ -183,6 +229,6 @@ impl<'a> RansDecoder<'a> {
};
self.state = prob * (self.state >> PROB_BITS) + (self.state & PROB_MASK) - start;
bit
Ok(bit ^ self.invert_bit_encoding)
}
}