commit 8f23ec711fd63e7cc9f22ee53a913258bafdb9bc Author: Dennis Ranke Date: Fri Nov 19 21:40:37 2021 +0100 first version, only very simple greedy packer diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..c241a73 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,50 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "cc" +version = "1.0.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" + +[[package]] +name = "cdivsufsort" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c" +dependencies = [ + "cc", + "sacabase", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "sacabase" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84" +dependencies = [ + "num-traits", +] + +[[package]] +name = "upkr" +version = "0.1.0" +dependencies = [ + "cdivsufsort", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1bb3bfa --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "upkr" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+cdivsufsort = "2"
diff --git a/src/context_state.rs b/src/context_state.rs
new file mode 100644
index 0000000..81cb3e0
--- /dev/null
+++ b/src/context_state.rs
@@ -0,0 +1,53 @@
+//! Adaptive bit probabilities ("contexts") shared by the encoder and the decoder.
+
+/// Initial probability of every context: 0x8000 / 0x10000 = 50%.
+const INIT_PROB: u16 = 0x8000;
+/// Adaptation shift: every update moves a probability 1/16 of the way towards the coded bit.
+const UPDATE_RATE: u32 = 4;
+
+/// A table of 16-bit probabilities, one per context index.
+#[derive(Clone)]
+pub struct ContextState {
+    contexts: Vec<u16>,
+}
+
+/// Mutable handle to one entry of a [`ContextState`].
+pub struct Context<'a> {
+    state: &'a mut ContextState,
+    index: usize,
+}
+
+impl ContextState {
+    /// Creates `size` contexts, each starting at `INIT_PROB`.
+    pub fn new(size: usize) -> ContextState {
+        ContextState {
+            contexts: vec![INIT_PROB; size],
+        }
+    }
+
+    /// Returns a handle to the context at `index`.
+    ///
+    /// `index` is not checked here; an out-of-range index panics when the handle is used.
+    pub fn context_mut(&mut self, index: usize) -> Context<'_> {
+        Context { state: self, index }
+    }
+}
+
+impl<'a> Context<'a> {
+    /// Current probability, scaled by 2^16, that the next bit is `true`.
+    pub fn prob(&self) -> u16 {
+        self.state.contexts[self.index]
+    }
+
+    /// Moves the probability 1/16 of the remaining distance towards `bit`.
+    pub fn update(&mut self, bit: bool) {
+        let old = self.state.contexts[self.index];
+        // The step rounds down to zero close to either extreme, so the stored value
+        // never reaches 0 and never overflows past 0xffff.
+        self.state.contexts[self.index] = if bit {
+            old + (((1 << 16) - old as u32) >> UPDATE_RATE) as u16
+        } else {
+            old - (old >> UPDATE_RATE)
+        };
+    }
+}
diff --git a/src/greedy_packer.rs b/src/greedy_packer.rs
new file mode 100644
index 0000000..684b719
--- /dev/null
+++ b/src/greedy_packer.rs
@@ -0,0 +1,56 @@
+//! Greedy parser: at each position take the first acceptable match, otherwise a literal.
+
+use crate::lz::LzCoder;
+use crate::match_finder::MatchFinder;
+
+/// Compresses `data` and returns the packed bytes.
+///
+/// At every position the packer tries, in this order:
+/// 1. the first match reported by the [`MatchFinder`], if its offset is small enough for
+///    its length,
+/// 2. a match of any length at the most recently coded offset,
+/// 3. a literal byte.
+pub fn pack(data: &[u8]) -> Vec<u8> {
+    let match_finder = MatchFinder::new(data);
+    let mut lz = LzCoder::new();
+
+    let mut pos = 0;
+    while pos < data.len() {
+        let mut encoded_match = false;
+        if let Some(m) = match_finder.matches(pos).next() {
+            // Longer matches may reach further back: the limit is 2^(3 * length - 1),
+            // capped at 2^31.
+            let max_offset = 1 << (m.length * 3 - 1).min(31);
+            let offset = pos - m.pos;
+            if offset < max_offset {
+                lz.encode_match(offset, m.length);
+                pos += m.length;
+                encoded_match = true;
+            }
+        }
+
+        if !encoded_match {
+            // A repeated offset is coded as a flag only, so even a one-byte match is used.
+            let offset = lz.last_offset();
+            if offset != 0 {
+                let length = data[pos..]
+                    .iter()
+                    .zip(data[(pos - offset)..].iter())
+                    .take_while(|(a, b)| a == b)
+                    .count();
+                if length > 0 {
+                    lz.encode_match(offset, length);
+                    pos += length;
+                    encoded_match = true;
+                }
+            }
+        }
+
+        if !encoded_match {
+            lz.encode_literal(data[pos]);
+            pos += 1;
+        }
+    }
+
+    lz.finish()
+}
diff --git a/src/lz.rs b/src/lz.rs
new file mode 100644
index 0000000..f220a28
--- /dev/null
+++ b/src/lz.rs
@@ -0,0 +1,157 @@
+//! LZ77-style stream layout on top of the adaptive range coder.
+
+use crate::context_state::ContextState;
+use crate::range_coder::{RangeCoder, RangeDecoder};
+
+/// Context 0: literal (`false`) or match (`true`).
+const CONTEXT_IS_MATCH: usize = 0;
+/// Contexts 1..=255: binary tree over the already coded bits of a literal byte.
+const CONTEXT_LITERAL: usize = 1;
+/// Context 256: the match uses a new offset (`true`) or repeats the last one (`false`).
+const CONTEXT_NEW_OFFSET: usize = 256;
+/// First of the 64 contexts used for offset values.
+const CONTEXT_OFFSET: usize = 257;
+/// First of the 64 contexts used for match lengths.
+const CONTEXT_LENGTH: usize = 257 + 64;
+/// Total number of contexts used by the format.
+const NUM_CONTEXTS: usize = 1 + 255 + 1 + 64 + 64;
+
+/// Encoder for the literal/match stream.
+pub struct LzCoder {
+    contexts: ContextState,
+    range_coder: RangeCoder,
+    last_offset: usize,
+}
+
+impl LzCoder {
+    /// Creates an encoder with fresh contexts and no previous offset.
+    pub fn new() -> LzCoder {
+        LzCoder {
+            contexts: ContextState::new(NUM_CONTEXTS),
+            range_coder: RangeCoder::new(),
+            last_offset: 0,
+        }
+    }
+
+    /// Encodes one literal byte, most significant bit first.
+    pub fn encode_literal(&mut self, byte: u8) {
+        self.bit(false, CONTEXT_IS_MATCH);
+        let mut context_index = CONTEXT_LITERAL;
+        for i in (0..8).rev() {
+            let bit = (byte >> i) & 1 != 0;
+            self.bit(bit, context_index);
+            context_index = (context_index << 1) | bit as usize;
+        }
+    }
+
+    /// Encodes a match of `length` bytes starting `offset` bytes back.
+    ///
+    /// The offset is only written when it differs from the previous match's offset.
+    /// `offset` must be non-zero: a new offset of 0 is the end-of-stream marker.
+    pub fn encode_match(&mut self, offset: usize, length: usize) {
+        self.bit(true, CONTEXT_IS_MATCH);
+        if offset != self.last_offset {
+            self.last_offset = offset;
+            self.bit(true, CONTEXT_NEW_OFFSET);
+            self.length(offset + 1, CONTEXT_OFFSET);
+        } else {
+            self.bit(false, CONTEXT_NEW_OFFSET);
+        }
+        self.length(length, CONTEXT_LENGTH);
+    }
+
+    /// Writes the end-of-stream marker (a new offset of 0) and returns the packed bytes.
+    pub fn finish(mut self) -> Vec<u8> {
+        self.bit(true, CONTEXT_IS_MATCH);
+        self.bit(true, CONTEXT_NEW_OFFSET);
+        self.length(1, CONTEXT_OFFSET);
+        self.range_coder.finish()
+    }
+
+    /// Offset of the most recently encoded match, or 0 if there has been none.
+    pub fn last_offset(&self) -> usize {
+        self.last_offset
+    }
+
+    /// Encodes `value >= 1`: for every bit below the top set bit a `true` flag followed by
+    /// that bit, then a terminating `false` flag.
+    fn length(&mut self, value: usize, context_start: usize) {
+        assert!(value >= 1);
+        let top_bit = usize::BITS - 1 - value.leading_zeros();
+        let mut context_index = context_start;
+        for i in (0..top_bit).rev() {
+            self.bit(true, context_index);
+            self.bit((value >> i) & 1 != 0, context_index + 1);
+            context_index += 2;
+        }
+        self.bit(false, context_index);
+    }
+
+    /// Codes one bit with the context at `context_index`.
+    fn bit(&mut self, b: bool, context_index: usize) {
+        self.range_coder
+            .encode_with_context(b, &mut self.contexts.context_mut(context_index));
+    }
+}
+
+/// Decompresses a stream produced by [`LzCoder`].
+///
+/// # Panics
+///
+/// Panics if `packed_data` is corrupt, e.g. when a match refers to data before the start
+/// of the output.
+pub fn unpack(packed_data: &[u8]) -> Vec<u8> {
+    let mut decoder = RangeDecoder::new(packed_data);
+    let mut contexts = ContextState::new(NUM_CONTEXTS);
+    let mut result = vec![];
+    let mut offset = 0;
+
+    // Inverse of `LzCoder::length`.
+    fn decode_length(
+        decoder: &mut RangeDecoder,
+        contexts: &mut ContextState,
+        mut context_index: usize,
+    ) -> usize {
+        let mut length = 1;
+        while decoder.decode_with_context(&mut contexts.context_mut(context_index)) {
+            length = (length << 1)
+                | decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))
+                    as usize;
+            context_index += 2;
+        }
+        length
+    }
+
+    loop {
+        if decoder.decode_with_context(&mut contexts.context_mut(CONTEXT_IS_MATCH)) {
+            if decoder.decode_with_context(&mut contexts.context_mut(CONTEXT_NEW_OFFSET)) {
+                offset = decode_length(&mut decoder, &mut contexts, CONTEXT_OFFSET) - 1;
+                if offset == 0 {
+                    break;
+                }
+            }
+            let length = decode_length(&mut decoder, &mut contexts, CONTEXT_LENGTH);
+            // Fail with a clear message instead of an arithmetic/index panic below.
+            assert!(
+                offset != 0 && offset <= result.len(),
+                "corrupt input: match offset {} at output length {}",
+                offset,
+                result.len()
+            );
+            for _ in 0..length {
+                result.push(result[result.len() - offset]);
+            }
+        } else {
+            let mut context_index = CONTEXT_LITERAL;
+            let mut byte = 0;
+            for i in (0..8).rev() {
+                let bit = decoder.decode_with_context(&mut contexts.context_mut(context_index));
+                context_index = (context_index << 1) | bit as usize;
+                byte |= (bit as u8) << i;
+            }
+            result.push(byte);
+        }
+    }
+
+    result
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..82705bb
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,18 @@
+//! Round-trip smoke test: pack the bundled test file, unpack it, compare.
+
+mod context_state;
+mod greedy_packer;
+mod lz;
+mod match_finder;
+mod range_coder;
+
+fn main() {
+    let test_data = include_bytes!("../testcases/skipahead.wasm");
+
+    let packed = greedy_packer::pack(test_data);
+    dbg!((test_data.len(), packed.len()));
+
+    let unpacked = lz::unpack(&packed);
+    dbg!(unpacked.len());
+    assert!(test_data == unpacked.as_slice());
+}
diff --git a/src/match_finder.rs b/src/match_finder.rs
new file mode 100644
index 0000000..e576019
--- /dev/null
+++ b/src/match_finder.rs
@@ -0,0 +1,182 @@
+//! Suffix-array based match finder.
+
+use std::collections::BinaryHeap;
+use std::ops::Range;
+
+/// Finds earlier occurrences of the data starting at a given position.
+pub struct MatchFinder {
+    /// Start positions of all suffixes in sorted order.
+    suffixes: Vec<i32>,
+    /// Inverse of `suffixes`: rank of the suffix starting at each position.
+    rev_suffixes: Vec<u32>,
+    /// `lcp[i]` is the length of the common prefix of sorted suffixes `i` and `i + 1`.
+    lcp: Vec<u32>,
+
+    /// Maximum number of candidates queued per [`Matches`] iterator.
+    max_matches: usize,
+    /// Value `patience_left` is reset to whenever a candidate is accepted.
+    patience: usize,
+    /// Matches more than this much shorter than the longest one are not reported.
+    max_length_diff: usize,
+}
+
+impl MatchFinder {
+    /// Builds the suffix array, its inverse and the LCP table for `data`.
+    pub fn new(data: &[u8]) -> MatchFinder {
+        let mut suffixes = vec![0i32; data.len()];
+        cdivsufsort::sort_in_place(data, &mut suffixes);
+
+        let mut rev_suffixes = vec![0u32; data.len()];
+        for (suffix_index, index) in suffixes.iter().enumerate() {
+            rev_suffixes[*index as usize] = suffix_index as u32;
+        }
+
+        // Walk the suffixes in text order; the common prefix with the next sorted suffix can
+        // shrink by at most one per step, so `length` is carried over instead of recomputed.
+        let mut lcp = vec![0u32; data.len()];
+        let mut length = 0usize;
+        for suffix_index in &rev_suffixes {
+            if *suffix_index as usize + 1 < suffixes.len() {
+                let i = suffixes[*suffix_index as usize] as usize;
+                let j = suffixes[*suffix_index as usize + 1] as usize;
+                while i + length < data.len()
+                    && j + length < data.len()
+                    && data[i + length] == data[j + length]
+                {
+                    length += 1;
+                }
+                lcp[*suffix_index as usize] = length as u32;
+            }
+            length = length.saturating_sub(1);
+        }
+
+        MatchFinder {
+            suffixes,
+            rev_suffixes,
+            lcp,
+            max_matches: 10,
+            patience: 10,
+            max_length_diff: 2,
+        }
+    }
+
+    /// Returns an iterator over matches for the data at `pos` that start before `pos`.
+    ///
+    /// Batches of equal-length matches come longest first; each batch is ordered closest first.
+    pub fn matches(&self, pos: usize) -> Matches<'_> {
+        let index = self.rev_suffixes[pos] as usize;
+        let mut matches = Matches {
+            finder: self,
+            pos_range: 0..pos,
+            left_index: index,
+            left_length: usize::MAX,
+            right_index: index,
+            right_length: usize::MAX,
+            current_length: 0,
+            patience_left: 0,
+            matches_left: self.max_matches,
+            max_length: 0,
+            queue: BinaryHeap::new(),
+        };
+
+        matches.move_left();
+        matches.move_right();
+
+        matches
+    }
+}
+
+/// Iterator returned by [`MatchFinder::matches`].
+pub struct Matches<'a> {
+    finder: &'a MatchFinder,
+    pos_range: Range<usize>,
+    left_index: usize,
+    left_length: usize,
+    right_index: usize,
+    right_length: usize,
+    current_length: usize,
+    patience_left: usize,
+    matches_left: usize,
+    max_length: usize,
+    queue: BinaryHeap<usize>,
+}
+
+/// An earlier occurrence: `length` bytes starting at `pos`.
+#[derive(Debug)]
+pub struct Match {
+    pub pos: usize,
+    pub length: usize,
+}
+
+impl<'a> Iterator for Matches<'a> {
+    type Item = Match;
+
+    fn next(&mut self) -> Option<Match> {
+        if self.queue.is_empty() {
+            // Refill the queue with candidates of the longest remaining match length.
+            self.current_length = self.left_length.max(self.right_length);
+            self.max_length = self.max_length.max(self.current_length);
+            if self.current_length < 2
+                || self.current_length + self.finder.max_length_diff < self.max_length
+            {
+                return None;
+            }
+            self.patience_left = self.finder.patience;
+            while self.matches_left > 0
+                && self.patience_left > 0
+                && (self.left_length == self.current_length
+                    || self.right_length == self.current_length)
+            {
+                if self.left_length == self.current_length {
+                    self.add_to_queue(self.finder.suffixes[self.left_index]);
+                    self.move_left();
+                }
+                if self.right_length == self.current_length {
+                    self.add_to_queue(self.finder.suffixes[self.right_index]);
+                    self.move_right();
+                }
+            }
+        }
+
+        // The heap pops the largest position, i.e. the smallest offset, first.
+        self.queue.pop().map(|pos| Match {
+            pos,
+            length: self.current_length,
+        })
+    }
+}
+
+impl<'a> Matches<'a> {
+    /// Steps to the previous suffix in sorted order and narrows the common length.
+    fn move_left(&mut self) {
+        if self.left_index > 0 {
+            self.left_index -= 1;
+            self.left_length = self
+                .left_length
+                .min(self.finder.lcp[self.left_index] as usize);
+        } else {
+            self.left_length = 0;
+        }
+    }
+
+    /// Steps to the next suffix in sorted order and narrows the common length.
+    fn move_right(&mut self) {
+        self.right_index += 1;
+        self.right_length = self
+            .right_length
+            .min(self.finder.lcp[self.right_index - 1] as usize);
+    }
+
+    /// Queues `pos` if it lies before the current position, otherwise clears `patience_left`.
+    fn add_to_queue(&mut self, pos: i32) {
+        if self.pos_range.contains(&(pos as usize)) {
+            self.queue.push(pos as usize);
+            // Both neighbours can be queued in one refill step, so the budget may already
+            // be 0 here; a plain `-= 1` would underflow (panic in debug builds).
+            self.matches_left = self.matches_left.saturating_sub(1);
+            self.patience_left = self.finder.patience;
+        } else {
+            self.patience_left = 0;
+        }
+    }
+}
diff --git a/src/range_coder.rs b/src/range_coder.rs
new file mode 100644
index 0000000..89ea633
--- /dev/null
+++ b/src/range_coder.rs
@@ -0,0 +1,142 @@
+//! Binary adaptive range coder working on a 40-bit window.
+
+use crate::context_state::Context;
+
+/// Range encoder; bytes are emitted from bits 32..40 of `low`.
+pub struct RangeCoder {
+    buffer: Vec<u8>,
+    low: u64,
+    range: u64,
+}
+
+/// Probabilities are expressed as fractions of `TOTAL`.
+const TOTAL: u32 = 65536;
+
+impl RangeCoder {
+    /// Creates an encoder covering the full 40-bit interval.
+    pub fn new() -> RangeCoder {
+        RangeCoder {
+            buffer: vec![],
+            low: 0,
+            range: 1 << 40,
+        }
+    }
+
+    /// Encodes `bit` with the probability stored in `context`, then adapts the context.
+    pub fn encode_with_context(&mut self, bit: bool, context: &mut Context) {
+        self.encode_bit(bit, context.prob() as u32);
+        context.update(bit);
+    }
+
+    /// Encodes `bit`, where `prob / TOTAL` is the probability of `true`.
+    pub fn encode_bit(&mut self, bit: bool, prob: u32) {
+        let (start, size) = if bit { (0, prob) } else { (prob, TOTAL - prob) };
+        self.range /= TOTAL as u64;
+        self.low += start as u64 * self.range;
+        self.range *= size as u64;
+
+        // Emit every top byte that can no longer change.
+        while (self.low >> 32) == (self.low + self.range - 1) >> 32 {
+            self.emit_byte();
+        }
+
+        // The interval straddles a top-byte boundary but has become too small: emit two
+        // bytes of `low` and keep only the part of the interval up to the end of the window.
+        if self.range < 1 << 24 {
+            self.emit_byte();
+            self.emit_byte();
+            self.range = (1 << 40) - self.low;
+        }
+    }
+
+    /// Flushes the encoder and returns the encoded bytes.
+    pub fn finish(mut self) -> Vec<u8> {
+        while self.range < 1 << 32 {
+            self.emit_byte();
+        }
+        // Emit one final byte, rounded up so that the encoded value is not below `low`.
+        self.low += 1 << 32;
+        if self.low >> 40 != 0 {
+            // The pending byte was 0xff, so rounding up overflows it. Carry into the bytes
+            // already written instead of failing the `u8` conversion in `emit_byte`.
+            self.low &= (1 << 40) - 1;
+            for byte in self.buffer.iter_mut().rev() {
+                let (sum, overflowed) = byte.overflowing_add(1);
+                *byte = sum;
+                if !overflowed {
+                    break;
+                }
+            }
+        }
+        self.emit_byte();
+        self.buffer
+    }
+
+    /// Moves the top byte of the window into the output and shifts the window by 8 bits.
+    fn emit_byte(&mut self) {
+        self.buffer.push((self.low >> 32).try_into().unwrap());
+        self.low = (self.low & 0xffffffff) << 8;
+        self.range *= 256;
+    }
+}
+
+/// Range decoder mirroring [`RangeCoder`].
+pub struct RangeDecoder<'a> {
+    data: &'a [u8],
+    code: u64,
+    low: u64,
+    range: u64,
+}
+
+impl<'a> RangeDecoder<'a> {
+    /// Creates a decoder for `data`; input bytes are pulled in lazily by `decode_bit`.
+    pub fn new(data: &'a [u8]) -> RangeDecoder<'a> {
+        RangeDecoder {
+            data,
+            code: 0,
+            low: 0,
+            range: 1,
+        }
+    }
+
+    /// Decodes one bit with the probability stored in `context`, then adapts the context.
+    pub fn decode_with_context(&mut self, context: &mut Context) -> bool {
+        let bit = self.decode_bit(context.prob() as u32);
+        context.update(bit);
+        bit
+    }
+
+    /// Decodes one bit, where `prob / TOTAL` is the probability of `true`.
+    pub fn decode_bit(&mut self, prob: u32) -> bool {
+        // Same renormalisation as the encoder, but performed before the bit is decoded.
+        while self.low >> 32 == (self.low + self.range - 1) >> 32 {
+            self.append_byte();
+        }
+
+        if self.range < 1 << 24 {
+            self.append_byte();
+            self.append_byte();
+            self.range = (1 << 40) - self.low;
+        }
+
+        let bit = (self.code - self.low) / (self.range / TOTAL as u64) < prob as u64;
+
+        let (start, size) = if bit { (0, prob) } else { (prob, TOTAL - prob) };
+        self.range /= TOTAL as u64;
+        self.low += start as u64 * self.range;
+        self.range *= size as u64;
+
+        bit
+    }
+
+    /// Shifts the window by 8 bits and reads the next input byte (0 once the input ends).
+    fn append_byte(&mut self) {
+        self.code = (self.code & 0xffffffff) << 8;
+        if !self.data.is_empty() {
+            self.code |= self.data[0] as u64;
+            self.data = &self.data[1..];
+        }
+        self.low = (self.low & 0xffffffff) << 8;
+        self.range <<= 8;
+    }
+}