first version, only very simple greedy packer

This commit is contained in:
2021-11-19 21:40:37 +01:00
commit 8f23ec711f
9 changed files with 544 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/target

50
Cargo.lock generated Normal file
View File

@@ -0,0 +1,50 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
[[package]]
name = "cc"
version = "1.0.72"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
[[package]]
name = "cdivsufsort"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c"
dependencies = [
"cc",
"sacabase",
]
[[package]]
name = "num-traits"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
"autocfg",
]
[[package]]
name = "sacabase"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84"
dependencies = [
"num-traits",
]
[[package]]
name = "upkr"
version = "0.1.0"
dependencies = [
"cdivsufsort",
]

9
Cargo.toml Normal file
View File

@@ -0,0 +1,9 @@
[package]
name = "upkr"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
cdivsufsort = "2"

39
src/context_state.rs Normal file
View File

@@ -0,0 +1,39 @@
/// Probability every context starts with (0x8000 on a 16-bit scale).
const INIT_PROB: u16 = 0x8000;
/// Right shift applied to the remaining distance on every adaptation step.
const UPDATE_RATE: u32 = 4;

/// A table of adaptive 16-bit probabilities, one entry per context index.
#[derive(Clone)]
pub struct ContextState {
    contexts: Vec<u16>,
}

/// Mutable handle to a single entry of a `ContextState`.
pub struct Context<'a> {
    state: &'a mut ContextState,
    index: usize,
}

impl ContextState {
    /// Creates `size` contexts, all initialised to `INIT_PROB`.
    pub fn new(size: usize) -> ContextState {
        let contexts = vec![INIT_PROB; size];
        ContextState { contexts }
    }

    /// Returns a handle to the context at `index`.
    ///
    /// The index is not checked here; an out-of-range index panics when the
    /// handle is first used.
    pub fn context_mut(&mut self, index: usize) -> Context<'_> {
        Context { index, state: self }
    }
}

impl<'a> Context<'a> {
    /// Current probability value stored for this context.
    pub fn prob(&self) -> u16 {
        let table = &self.state.contexts;
        table[self.index]
    }

    /// Adapts the stored probability after observing `bit`.
    ///
    /// A `true` bit moves the value towards `1 << 16`, a `false` bit moves it
    /// towards zero; each step covers `1 / 2^UPDATE_RATE` of the remaining
    /// distance (rounded down).
    pub fn update(&mut self, bit: bool) {
        let slot = &mut self.state.contexts[self.index];
        let current = *slot;
        *slot = match bit {
            true => {
                let headroom = (1u32 << 16) - u32::from(current);
                current + (headroom >> UPDATE_RATE) as u16
            }
            false => current - (current >> UPDATE_RATE),
        };
    }
}

44
src/greedy_packer.rs Normal file
View File

@@ -0,0 +1,44 @@
use crate::lz::LzCoder;
use crate::match_finder::MatchFinder;
/// Compresses `data` with a greedy strategy and returns the packed bytes.
///
/// At every position the first applicable option is taken:
/// 1. the first match reported by the match finder, provided its offset is
///    below a limit that grows with the match length (`1 << (3 * length - 1)`,
///    shift capped at 31);
/// 2. a match of any non-zero length at the most recently encoded offset;
/// 3. a single literal byte.
pub fn pack(data: &[u8]) -> Vec<u8> {
    let finder = MatchFinder::new(data);
    let mut coder = LzCoder::new();
    let mut cursor = 0;

    while cursor < data.len() {
        // Option 1: best candidate from the match finder, if it is close enough.
        let fresh = finder.matches(cursor).next().and_then(|found| {
            let limit = 1usize << (found.length * 3 - 1).min(31);
            let distance = cursor - found.pos;
            (distance < limit).then(|| (distance, found.length))
        });

        // Option 2: reuse the previous offset for however many bytes agree.
        let chosen = fresh.or_else(|| {
            let previous = coder.last_offset();
            if previous == 0 {
                return None;
            }
            let run = data[cursor..]
                .iter()
                .zip(data[(cursor - previous)..].iter())
                .take_while(|(a, b)| a == b)
                .count();
            (run > 0).then(|| (previous, run))
        });

        match chosen {
            Some((offset, length)) => {
                coder.encode_match(offset, length);
                cursor += length;
            }
            // Option 3: nothing matched, emit the byte itself.
            None => {
                coder.encode_literal(data[cursor]);
                cursor += 1;
            }
        }
    }

    coder.finish()
}

116
src/lz.rs Normal file
View File

@@ -0,0 +1,116 @@
use crate::context_state::ContextState;
use crate::range_coder::{RangeCoder, RangeDecoder};
/// Encoder for the LZ token stream: literals and (offset, length) matches are
/// turned into context-modelled bits and fed to a range coder.
pub struct LzCoder {
// Adaptive probabilities for every bit position of the token stream.
contexts: ContextState,
// Entropy coder that receives the modelled bits.
range_coder: RangeCoder,
// Offset of the most recent match that carried an explicit offset; 0 until
// the first such match has been encoded.
last_offset: usize,
}
impl LzCoder {
    /// Creates an encoder with freshly initialised contexts and no previous offset.
    ///
    /// Context layout: 0 = literal/match flag, 1..=255 = literal bit tree,
    /// 256 = "new offset" flag, 257.. = offset value, 257 + 64.. = length value.
    pub fn new() -> LzCoder {
        let contexts = ContextState::new(1 + 255 + 1 + 64 + 64);
        LzCoder {
            contexts,
            range_coder: RangeCoder::new(),
            last_offset: 0,
        }
    }

    /// Encodes one literal byte, most significant bit first, using the bits
    /// already written as the context for the next one.
    pub fn encode_literal(&mut self, byte: u8) {
        self.bit(false, 0);
        let mut node = 1usize;
        for shift in (0..8).rev() {
            let is_set = byte & (1 << shift) != 0;
            self.bit(is_set, node);
            node = node * 2 + usize::from(is_set);
        }
    }

    /// Encodes a match of `length` bytes at distance `offset`.
    ///
    /// The offset is only written when it differs from the previous one; it
    /// is stored as `offset + 1` because the value 1 is used by `finish` as
    /// the end marker.
    pub fn encode_match(&mut self, offset: usize, length: usize) {
        self.bit(true, 0);
        let offset_changed = offset != self.last_offset;
        self.bit(offset_changed, 256);
        if offset_changed {
            self.last_offset = offset;
            self.length(offset + 1, 257);
        }
        self.length(length, 257 + 64);
    }

    /// Writes the end marker (a match with encoded offset value 1) and
    /// returns the packed bytes.
    pub fn finish(mut self) -> Vec<u8> {
        self.bit(true, 0);
        self.bit(true, 256);
        self.length(1, 257);
        self.range_coder.finish()
    }

    /// Offset of the last match that carried an explicit offset, or 0 if none.
    pub fn last_offset(&self) -> usize {
        self.last_offset
    }

    /// Writes `value` (which must be at least 1) as a sequence of
    /// (continue, data-bit) pairs below the leading one bit, followed by a
    /// stop bit. Each pair uses its own two contexts starting at
    /// `context_start`.
    fn length(&mut self, value: usize, context_start: usize) {
        assert!(value >= 1);
        let payload_bits = usize::BITS - 1 - value.leading_zeros();
        let slots = (context_start..).step_by(2);
        for (shift, slot) in (0..payload_bits).rev().zip(slots) {
            self.bit(true, slot);
            self.bit((value >> shift) & 1 != 0, slot + 1);
        }
        self.bit(false, context_start + 2 * payload_bits as usize);
    }

    /// Encodes one bit with the context at `context_index`.
    fn bit(&mut self, b: bool, context_index: usize) {
        let mut context = self.contexts.context_mut(context_index);
        self.range_coder.encode_with_context(b, &mut context);
    }
}
/// Decodes a stream produced by `LzCoder` and returns the original bytes.
///
/// Decoding stops at the end marker, a match whose decoded offset is 0.
/// The input is not validated: a match that reaches before the start of the
/// output panics on the out-of-range index.
pub fn unpack(packed_data: &[u8]) -> Vec<u8> {
    /// Reads a number written as (continue, data-bit) pairs; the implicit
    /// leading one bit means the result is always at least 1.
    fn read_number(
        decoder: &mut RangeDecoder,
        contexts: &mut ContextState,
        first_context: usize,
    ) -> usize {
        let mut value = 1;
        let mut slot = first_context;
        loop {
            if !decoder.decode_with_context(&mut contexts.context_mut(slot)) {
                return value;
            }
            let data_bit = decoder.decode_with_context(&mut contexts.context_mut(slot + 1));
            value = (value << 1) | data_bit as usize;
            slot += 2;
        }
    }

    let mut decoder = RangeDecoder::new(packed_data);
    let mut contexts = ContextState::new(1 + 255 + 1 + 64 + 64);
    let mut output = vec![];
    let mut offset = 0;

    loop {
        let is_match = decoder.decode_with_context(&mut contexts.context_mut(0));
        if !is_match {
            // Literal: walk the bit tree; after eight bits the node index is
            // 256 + byte, so the low eight bits are the decoded byte.
            let mut node = 1usize;
            for _ in 0..8 {
                let bit = decoder.decode_with_context(&mut contexts.context_mut(node));
                node = (node << 1) | bit as usize;
            }
            output.push((node & 0xff) as u8);
            continue;
        }

        let has_new_offset = decoder.decode_with_context(&mut contexts.context_mut(256));
        if has_new_offset {
            offset = read_number(&mut decoder, &mut contexts, 257) - 1;
            if offset == 0 {
                break;
            }
        }

        let length = read_number(&mut decoder, &mut contexts, 257 + 64);
        // Copy byte by byte so that overlapping matches (offset < length) work.
        for _ in 0..length {
            let byte = output[output.len() - offset];
            output.push(byte);
        }
    }

    output
}

16
src/main.rs Normal file
View File

@@ -0,0 +1,16 @@
mod context_state;
mod greedy_packer;
mod lz;
mod match_finder;
mod range_coder;
// Round-trip smoke test: packs an embedded test file, unpacks the result and
// asserts that the original bytes come back. Sizes are printed with `dbg!`.
fn main() {
// Embedded at compile time; the path is relative to this source file.
let test_data = include_bytes!("../testcases/skipahead.wasm");
let packed = greedy_packer::pack(test_data);
// Original size versus packed size.
dbg!((test_data.len(), packed.len()));
let unpacked = lz::unpack(&packed);
dbg!(unpacked.len());
// Plain `assert!` rather than `assert_eq!`, so a mismatch does not dump both buffers.
assert!(test_data == unpacked.as_slice());
}

158
src/match_finder.rs Normal file
View File

@@ -0,0 +1,158 @@
use std::collections::BinaryHeap;
use std::ops::Range;
/// Index over the input used to enumerate earlier occurrences of the data at
/// a given position.
pub struct MatchFinder {
// Array filled by `cdivsufsort::sort_in_place`; each entry is a start
// position in the data (presumably in lexicographic suffix order).
suffixes: Vec<i32>,
// Inverse of `suffixes`: maps a data position to its index in `suffixes`.
rev_suffixes: Vec<u32>,
// `lcp[i]` is the number of leading bytes shared by the suffixes at
// `suffixes[i]` and `suffixes[i + 1]`; the last entry stays 0.
lcp: Vec<u32>,
// Upper bound on the number of matches queued by one `Matches` iterator.
max_matches: usize,
// Value `patience_left` is reset to whenever a candidate is accepted.
patience: usize,
// Iteration stops once the current match length falls more than this far
// below the longest length seen so far.
max_length_diff: usize,
}
impl MatchFinder {
    /// Builds the suffix index for `data`: the sorted suffix array, its
    /// inverse, and the common-prefix lengths of neighbouring suffixes.
    pub fn new(data: &[u8]) -> MatchFinder {
        let len = data.len();

        let mut suffixes = vec![0i32; len];
        cdivsufsort::sort_in_place(data, &mut suffixes);

        // Inverse permutation: data position -> index in `suffixes`.
        let mut rev_suffixes = vec![0u32; len];
        for (rank, &start) in suffixes.iter().enumerate() {
            rev_suffixes[start as usize] = rank as u32;
        }

        // Walk the data in position order. The shared length found for one
        // position, minus one, is carried over as the starting point for the
        // next position, so bytes are not compared again from scratch.
        let mut lcp = vec![0u32; len];
        let mut shared = 0usize;
        for &rank in &rev_suffixes {
            let rank = rank as usize;
            if let Some(&neighbour) = suffixes.get(rank + 1) {
                let here = suffixes[rank] as usize;
                let there = neighbour as usize;
                while matches!(
                    (data.get(here + shared), data.get(there + shared)),
                    (Some(a), Some(b)) if a == b
                ) {
                    shared += 1;
                }
                lcp[rank] = shared as u32;
            }
            shared = shared.saturating_sub(1);
        }

        MatchFinder {
            suffixes,
            rev_suffixes,
            lcp,
            max_matches: 10,
            patience: 10,
            max_length_diff: 2,
        }
    }

    /// Returns an iterator over matches for the data starting at `pos`.
    ///
    /// Only candidates that start before `pos` are reported. The iterator
    /// starts at the suffix-array slot of `pos` and is primed one step in
    /// both directions.
    pub fn matches(&self, pos: usize) -> Matches<'_> {
        let start = self.rev_suffixes[pos] as usize;
        let mut cursor = Matches {
            finder: self,
            pos_range: 0..pos,
            left_index: start,
            left_length: usize::MAX,
            right_index: start,
            right_length: usize::MAX,
            current_length: 0,
            patience_left: 0,
            matches_left: self.max_matches,
            max_length: 0,
            queue: BinaryHeap::new(),
        };
        cursor.move_left();
        cursor.move_right();
        cursor
    }
}
/// Iterator over match candidates for one position, created by
/// `MatchFinder::matches`. It walks outwards from the position's slot in the
/// suffix array, to the left and to the right.
pub struct Matches<'a> {
finder: &'a MatchFinder,
// Data positions that are acceptable as match sources (`0..pos`).
pos_range: Range<usize>,
// Suffix-array index of the current left neighbour.
left_index: usize,
// Minimum of the `lcp` entries crossed so far on the left side.
left_length: usize,
// Suffix-array index of the current right neighbour.
right_index: usize,
// Minimum of the `lcp` entries crossed so far on the right side.
right_length: usize,
// Match length reported for everything currently in `queue`.
current_length: usize,
// Set to `finder.patience` when a candidate is accepted and to 0 when one is rejected.
patience_left: usize,
// Number of matches that may still be queued.
matches_left: usize,
// Largest `current_length` seen so far.
max_length: usize,
// Accepted source positions of the current length; popped largest first.
queue: BinaryHeap<usize>,
}
/// One match candidate produced by `Matches`.
#[derive(Debug)]
pub struct Match {
// Data position where the matching bytes start.
pub pos: usize,
// Number of matching bytes.
pub length: usize,
}
impl<'a> Iterator for Matches<'a> {
    type Item = Match;

    /// Returns the next match candidate, or `None` when no further candidate
    /// is available.
    ///
    /// Whenever the queue is empty, the longer of the two neighbouring
    /// lengths becomes the current length. Iteration ends if that length is
    /// below 2 or more than `max_length_diff` shorter than the longest length
    /// seen. Otherwise neighbours of exactly that length are collected from
    /// both sides until the match budget or the patience runs out. Queued
    /// positions are returned largest first, i.e. closest to the searched
    /// position.
    fn next(&mut self) -> Option<Match> {
        if self.queue.is_empty() {
            self.current_length = self.left_length.max(self.right_length);
            self.max_length = self.max_length.max(self.current_length);
            if self.current_length < 2
                || self.current_length + self.finder.max_length_diff < self.max_length
            {
                return None;
            }
            self.patience_left = self.finder.patience;
            while self.matches_left > 0
                && self.patience_left > 0
                && (self.left_length == self.current_length
                    || self.right_length == self.current_length)
            {
                if self.left_length == self.current_length {
                    self.add_to_queue(self.finder.suffixes[self.left_index]);
                    self.move_left();
                }
                // The left side may just have used the last slot of the
                // budget. Without this re-check `add_to_queue` would
                // decrement `matches_left` below zero, which panics in debug
                // builds and wraps around (disabling the limit) in release
                // builds.
                if self.matches_left > 0 && self.right_length == self.current_length {
                    self.add_to_queue(self.finder.suffixes[self.right_index]);
                    self.move_right();
                }
            }
        }
        let length = self.current_length;
        self.queue.pop().map(|pos| Match { pos, length })
    }
}
impl<'a> Matches<'a> {
    /// Steps one slot to the left in the suffix array and narrows
    /// `left_length` by the `lcp` entry that was crossed. At the left edge
    /// the length is set to 0 instead.
    fn move_left(&mut self) {
        if self.left_index == 0 {
            self.left_length = 0;
            return;
        }
        self.left_index -= 1;
        let crossed = self.finder.lcp[self.left_index] as usize;
        self.left_length = self.left_length.min(crossed);
    }

    /// Steps one slot to the right in the suffix array and narrows
    /// `right_length` by the `lcp` entry that was crossed.
    fn move_right(&mut self) {
        let crossed = self.finder.lcp[self.right_index] as usize;
        self.right_index += 1;
        self.right_length = self.right_length.min(crossed);
    }

    /// Queues `pos` if it lies inside `pos_range`, consuming one unit of the
    /// match budget and refilling the patience; otherwise sets the patience
    /// to 0.
    ///
    /// `matches_left` must be greater than 0 when an in-range position is
    /// passed, because the budget is decremented unconditionally.
    fn add_to_queue(&mut self, pos: i32) {
        let candidate = pos as usize;
        if !self.pos_range.contains(&candidate) {
            self.patience_left = 0;
            return;
        }
        self.queue.push(candidate);
        self.matches_left -= 1;
        self.patience_left = self.finder.patience;
    }
}

111
src/range_coder.rs Normal file
View File

@@ -0,0 +1,111 @@
use crate::context_state::Context;
/// Binary range encoder working on a 40-bit window, emitting one byte at a time.
pub struct RangeCoder {
// Bytes emitted so far.
buffer: Vec<u8>,
// Lower bound of the current interval within the 40-bit window.
low: u64,
// Width of the current interval; starts at 1 << 40.
range: u64,
}
// Denominator of the probability scale: each coded bit splits the current
// range into `prob` and `TOTAL - prob` parts out of `TOTAL`.
const TOTAL: u32 = 65536;
impl RangeCoder {
    /// Creates an encoder with an empty output buffer covering the full
    /// 40-bit interval.
    pub fn new() -> RangeCoder {
        RangeCoder {
            buffer: vec![],
            low: 0,
            range: 1 << 40,
        }
    }

    /// Encodes `bit` with the probability stored in `context`, then lets the
    /// context adapt to the bit.
    pub fn encode_with_context(&mut self, bit: bool, context: &mut Context) {
        self.encode_bit(bit, context.prob() as u32);
        context.update(bit);
    }

    /// Encodes `bit`, where `prob` out of `TOTAL` is the share of the
    /// interval assigned to a `true` bit.
    pub fn encode_bit(&mut self, bit: bool, prob: u32) {
        // `true` takes the lower `prob` parts, `false` the remaining upper parts.
        let (start, size) = if bit { (0, prob) } else { (prob, TOTAL - prob) };
        self.range /= TOTAL as u64;
        self.low += start as u64 * self.range;
        self.range *= size as u64;
        // Emit every leading byte that both ends of the interval agree on.
        while (self.low >> 32) == (self.low + self.range - 1) >> 32 {
            self.emit_byte();
        }
        // The interval straddles a top-byte boundary but has become too
        // narrow: force two bytes out and keep only the part below the
        // boundary, so no carry ever has to be propagated into emitted bytes.
        if self.range < 1 << 24 {
            self.emit_byte();
            self.emit_byte();
            self.range = (1 << 40) - self.low;
        }
    }

    /// Flushes the encoder and returns the encoded bytes.
    ///
    /// Emits the shortest sequence of one to five bytes whose value, padded
    /// with zero bytes (which is what the decoder supplies once its input is
    /// exhausted), lies inside the final interval `[low, low + range)`.
    ///
    /// The earlier implementation added `1 << 32` to `low` and emitted the
    /// top byte. When the interval straddled a top-byte boundary and `low`
    /// sat just below it, that addition needed a carry into the byte already
    /// written and the `u8` conversion panicked.
    pub fn finish(mut self) -> Vec<u8> {
        let low = self.low;
        // The interval never extends past the 40-bit window; the clamp only
        // guards the byte slicing below.
        let end = (low + self.range).min(1 << 40);
        // Smallest value >= `low` whose bits below the first `bytes` bytes are zero.
        let round_up = |bytes: usize| {
            let mask = (1u64 << (40 - 8 * bytes)) - 1;
            (low + mask) & !mask
        };
        // With all five bytes the value is `low` itself, which always qualifies.
        let byte_count = (1..5).find(|&bytes| round_up(bytes) < end).unwrap_or(5);
        let code = round_up(byte_count);
        // `code` fits in 40 bits, so the window is the low five of the eight
        // big-endian bytes.
        self.buffer
            .extend_from_slice(&code.to_be_bytes()[3..3 + byte_count]);
        self.buffer
    }

    /// Moves the top byte of the window to the output and shifts the window
    /// up by eight bits.
    fn emit_byte(&mut self) {
        // The conversion doubles as a check that `low` stays within 40 bits.
        self.buffer.push((self.low >> 32).try_into().unwrap());
        self.low = (self.low & 0xffffffff) << 8;
        self.range *= 256;
    }
}
/// Decoder counterpart of `RangeCoder`; tracks the same interval as the
/// encoder while reading the encoded bytes.
pub struct RangeDecoder<'a> {
// Encoded bytes that have not been consumed yet.
data: &'a [u8],
// Window of the most recently read input bytes, aligned with `low`.
code: u64,
// Lower bound of the current interval.
low: u64,
// Width of the current interval; starts at 1 and grows as bytes are read.
range: u64,
}
impl<'a> RangeDecoder<'a> {
    /// Creates a decoder over `data`. No bytes are read until the first bit
    /// is decoded.
    pub fn new(data: &'a [u8]) -> RangeDecoder<'a> {
        RangeDecoder {
            low: 0,
            range: 1,
            code: 0,
            data,
        }
    }

    /// Decodes one bit with the probability stored in `context`, then lets
    /// the context adapt to the decoded bit.
    pub fn decode_with_context(&mut self, context: &mut Context) -> bool {
        let prob = context.prob() as u32;
        let bit = self.decode_bit(prob);
        context.update(bit);
        bit
    }

    /// Decodes one bit, where `prob` out of `TOTAL` is the share of the
    /// interval assigned to a `true` bit.
    pub fn decode_bit(&mut self, prob: u32) -> bool {
        // Same renormalisation as the encoder, performed before the split
        // instead of after it.
        while self.low >> 32 == (self.low + self.range - 1) >> 32 {
            self.append_byte();
        }
        if self.range < 1 << 24 {
            self.append_byte();
            self.append_byte();
            self.range = (1 << 40) - self.low;
        }

        let unit = self.range / TOTAL as u64;
        let threshold = prob as u64;
        let bit = (self.code - self.low) / unit < threshold;
        if bit {
            // Lower part of the interval: `low` stays where it is.
            self.range = unit * threshold;
        } else {
            self.low += threshold * unit;
            self.range = unit * (TOTAL - prob) as u64;
        }
        bit
    }

    /// Shifts the window up by eight bits and pulls the next input byte into
    /// `code`; once the input is exhausted, zero bytes are supplied.
    fn append_byte(&mut self) {
        let incoming = match self.data.split_first() {
            Some((&byte, rest)) => {
                self.data = rest;
                byte as u64
            }
            None => 0,
        };
        self.code = ((self.code & 0xffffffff) << 8) | incoming;
        self.low = (self.low & 0xffffffff) << 8;
        self.range <<= 8;
    }
}