mirror of
https://github.com/exoticorn/upkr.git
synced 2026-01-20 11:36:42 +01:00
first version, only very simple greedy packer
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/target
|
||||
50
Cargo.lock
generated
Normal file
50
Cargo.lock
generated
Normal file
@@ -0,0 +1,50 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.72"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
|
||||
|
||||
[[package]]
|
||||
name = "cdivsufsort"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"sacabase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sacabase"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "upkr"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"cdivsufsort",
|
||||
]
|
||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "upkr"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
cdivsufsort = "2"
|
||||
39
src/context_state.rs
Normal file
39
src/context_state.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
/// Starting value of every context: exactly half of the 16-bit probability scale.
const INIT_PROB: u16 = 0x8000;
/// Adaptation shift: every update moves a probability by 1/2^UPDATE_RATE of the
/// distance to the end of the scale it is heading for.
const UPDATE_RATE: u32 = 4;

/// A table of adaptive 16-bit probabilities, addressed by context index.
#[derive(Clone)]
pub struct ContextState {
    contexts: Vec<u16>,
}

/// A mutable handle onto one slot of a [`ContextState`].
pub struct Context<'a> {
    state: &'a mut ContextState,
    index: usize,
}

impl ContextState {
    /// Builds a table with `size` slots, each initialised to `INIT_PROB`.
    pub fn new(size: usize) -> ContextState {
        let mut contexts = Vec::with_capacity(size);
        contexts.resize(size, INIT_PROB);
        ContextState { contexts }
    }

    /// Returns a handle for slot `index`.
    ///
    /// The index is not validated here; an out-of-range index panics on the
    /// first `prob`/`update` call made through the handle.
    pub fn context_mut(&mut self, index: usize) -> Context<'_> {
        Context { index, state: self }
    }
}

impl<'a> Context<'a> {
    /// Current probability stored in this slot.
    pub fn prob(&self) -> u16 {
        let slot = self.index;
        self.state.contexts[slot]
    }

    /// Adapts the slot after observing `bit`: a `true` bit raises the stored
    /// value towards 65536, a `false` bit lowers it towards 0.
    pub fn update(&mut self, bit: bool) {
        let slot = &mut self.state.contexts[self.index];
        let current = *slot;
        let adapted = if bit {
            // Distance to the top of the scale, computed in u32 because 1 << 16
            // does not fit in a u16.
            let headroom = (1u32 << 16) - u32::from(current);
            current + (headroom >> UPDATE_RATE) as u16
        } else {
            current - (current >> UPDATE_RATE)
        };
        *slot = adapted;
    }
}
|
||||
44
src/greedy_packer.rs
Normal file
44
src/greedy_packer.rs
Normal file
@@ -0,0 +1,44 @@
|
||||
use crate::lz::LzCoder;
|
||||
use crate::match_finder::MatchFinder;
|
||||
|
||||
pub fn pack(data: &[u8]) -> Vec<u8> {
|
||||
let match_finder = MatchFinder::new(data);
|
||||
let mut lz = LzCoder::new();
|
||||
|
||||
let mut pos = 0;
|
||||
while pos < data.len() {
|
||||
let mut encoded_match = false;
|
||||
if let Some(m) = match_finder.matches(pos).next() {
|
||||
let max_offset = 1 << (m.length * 3 - 1).min(31);
|
||||
let offset = pos - m.pos;
|
||||
if offset < max_offset {
|
||||
lz.encode_match(offset, m.length);
|
||||
pos += m.length;
|
||||
encoded_match = true;
|
||||
}
|
||||
}
|
||||
|
||||
if !encoded_match {
|
||||
let offset = lz.last_offset();
|
||||
if offset != 0 {
|
||||
let length = data[pos..]
|
||||
.iter()
|
||||
.zip(data[(pos - offset)..].iter())
|
||||
.take_while(|(a, b)| a == b)
|
||||
.count();
|
||||
if length > 0 {
|
||||
lz.encode_match(offset, length);
|
||||
pos += length;
|
||||
encoded_match = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !encoded_match {
|
||||
lz.encode_literal(data[pos]);
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
|
||||
lz.finish()
|
||||
}
|
||||
116
src/lz.rs
Normal file
116
src/lz.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
use crate::context_state::ContextState;
|
||||
use crate::range_coder::{RangeCoder, RangeDecoder};
|
||||
|
||||
pub struct LzCoder {
|
||||
contexts: ContextState,
|
||||
range_coder: RangeCoder,
|
||||
last_offset: usize,
|
||||
}
|
||||
|
||||
impl LzCoder {
|
||||
pub fn new() -> LzCoder {
|
||||
LzCoder {
|
||||
contexts: ContextState::new(1 + 255 + 1 + 64 + 64),
|
||||
range_coder: RangeCoder::new(),
|
||||
last_offset: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_literal(&mut self, byte: u8) {
|
||||
self.bit(false, 0);
|
||||
let mut context_index = 1;
|
||||
for i in (0..8).rev() {
|
||||
let bit = (byte >> i) & 1 != 0;
|
||||
self.bit(bit, context_index);
|
||||
context_index = (context_index << 1) | bit as usize;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_match(&mut self, offset: usize, length: usize) {
|
||||
self.bit(true, 0);
|
||||
if offset != self.last_offset {
|
||||
self.last_offset = offset;
|
||||
self.bit(true, 256);
|
||||
self.length(offset + 1, 257);
|
||||
} else {
|
||||
self.bit(false, 256);
|
||||
}
|
||||
self.length(length, 257 + 64);
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Vec<u8> {
|
||||
self.bit(true, 0);
|
||||
self.bit(true, 256);
|
||||
self.length(1, 257);
|
||||
self.range_coder.finish()
|
||||
}
|
||||
|
||||
pub fn last_offset(&self) -> usize {
|
||||
self.last_offset
|
||||
}
|
||||
|
||||
fn length(&mut self, value: usize, context_start: usize) {
|
||||
assert!(value >= 1);
|
||||
let top_bit = usize::BITS - 1 - value.leading_zeros();
|
||||
let mut context_index = context_start;
|
||||
for i in (0..top_bit).rev() {
|
||||
self.bit(true, context_index);
|
||||
self.bit((value >> i) & 1 != 0, context_index + 1);
|
||||
context_index += 2;
|
||||
}
|
||||
self.bit(false, context_index);
|
||||
}
|
||||
|
||||
fn bit(&mut self, b: bool, context_index: usize) {
|
||||
self.range_coder
|
||||
.encode_with_context(b, &mut self.contexts.context_mut(context_index));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unpack(packed_data: &[u8]) -> Vec<u8> {
|
||||
let mut decoder = RangeDecoder::new(packed_data);
|
||||
let mut contexts = ContextState::new(1 + 255 + 1 + 64 + 64);
|
||||
let mut result = vec![];
|
||||
let mut offset = 0;
|
||||
|
||||
fn decode_length(
|
||||
decoder: &mut RangeDecoder,
|
||||
contexts: &mut ContextState,
|
||||
mut context_index: usize,
|
||||
) -> usize {
|
||||
let mut length = 1;
|
||||
while decoder.decode_with_context(&mut contexts.context_mut(context_index)) {
|
||||
length = (length << 1)
|
||||
| decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))
|
||||
as usize;
|
||||
context_index += 2;
|
||||
}
|
||||
length
|
||||
}
|
||||
|
||||
loop {
|
||||
if decoder.decode_with_context(&mut contexts.context_mut(0)) {
|
||||
if decoder.decode_with_context(&mut contexts.context_mut(256)) {
|
||||
offset = decode_length(&mut decoder, &mut contexts, 257) - 1;
|
||||
if offset == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let length = decode_length(&mut decoder, &mut contexts, 257 + 64);
|
||||
for _ in 0..length {
|
||||
result.push(result[result.len() - offset]);
|
||||
}
|
||||
} else {
|
||||
let mut context_index = 1;
|
||||
let mut byte = 0;
|
||||
for i in (0..8).rev() {
|
||||
let bit = decoder.decode_with_context(&mut contexts.context_mut(context_index));
|
||||
context_index = (context_index << 1) | bit as usize;
|
||||
byte |= (bit as u8) << i;
|
||||
}
|
||||
result.push(byte);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
16
src/main.rs
Normal file
16
src/main.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
mod context_state;
|
||||
mod greedy_packer;
|
||||
mod lz;
|
||||
mod match_finder;
|
||||
mod range_coder;
|
||||
|
||||
fn main() {
|
||||
let test_data = include_bytes!("../testcases/skipahead.wasm");
|
||||
|
||||
let packed = greedy_packer::pack(test_data);
|
||||
dbg!((test_data.len(), packed.len()));
|
||||
|
||||
let unpacked = lz::unpack(&packed);
|
||||
dbg!(unpacked.len());
|
||||
assert!(test_data == unpacked.as_slice());
|
||||
}
|
||||
158
src/match_finder.rs
Normal file
158
src/match_finder.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use std::ops::Range;
|
||||
|
||||
pub struct MatchFinder {
|
||||
suffixes: Vec<i32>,
|
||||
rev_suffixes: Vec<u32>,
|
||||
lcp: Vec<u32>,
|
||||
|
||||
max_matches: usize,
|
||||
patience: usize,
|
||||
max_length_diff: usize,
|
||||
}
|
||||
|
||||
impl MatchFinder {
|
||||
pub fn new(data: &[u8]) -> MatchFinder {
|
||||
let mut suffixes = vec![0i32; data.len()];
|
||||
cdivsufsort::sort_in_place(data, &mut suffixes);
|
||||
|
||||
let mut rev_suffixes = vec![0u32; data.len()];
|
||||
for (suffix_index, index) in suffixes.iter().enumerate() {
|
||||
rev_suffixes[*index as usize] = suffix_index as u32;
|
||||
}
|
||||
|
||||
let mut lcp = vec![0u32; data.len()];
|
||||
let mut length = 0usize;
|
||||
for suffix_index in &rev_suffixes {
|
||||
if *suffix_index as usize + 1 < suffixes.len() {
|
||||
let i = suffixes[*suffix_index as usize] as usize;
|
||||
let j = suffixes[*suffix_index as usize + 1] as usize;
|
||||
while i + length < data.len()
|
||||
&& j + length < data.len()
|
||||
&& data[i + length] == data[j + length]
|
||||
{
|
||||
length += 1;
|
||||
}
|
||||
lcp[*suffix_index as usize] = length as u32;
|
||||
}
|
||||
length = length.saturating_sub(1);
|
||||
}
|
||||
|
||||
MatchFinder {
|
||||
suffixes,
|
||||
rev_suffixes,
|
||||
lcp,
|
||||
max_matches: 10,
|
||||
patience: 10,
|
||||
max_length_diff: 2,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn matches(&self, pos: usize) -> Matches {
|
||||
let index = self.rev_suffixes[pos] as usize;
|
||||
let mut matches = Matches {
|
||||
finder: self,
|
||||
pos_range: 0..pos,
|
||||
left_index: index,
|
||||
left_length: usize::MAX,
|
||||
right_index: index,
|
||||
right_length: usize::MAX,
|
||||
current_length: 0,
|
||||
patience_left: 0,
|
||||
matches_left: self.max_matches,
|
||||
max_length: 0,
|
||||
queue: BinaryHeap::new(),
|
||||
};
|
||||
|
||||
matches.move_left();
|
||||
matches.move_right();
|
||||
|
||||
matches
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Matches<'a> {
|
||||
finder: &'a MatchFinder,
|
||||
pos_range: Range<usize>,
|
||||
left_index: usize,
|
||||
left_length: usize,
|
||||
right_index: usize,
|
||||
right_length: usize,
|
||||
current_length: usize,
|
||||
patience_left: usize,
|
||||
matches_left: usize,
|
||||
max_length: usize,
|
||||
queue: BinaryHeap<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Match {
|
||||
pub pos: usize,
|
||||
pub length: usize,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Matches<'a> {
|
||||
type Item = Match;
|
||||
|
||||
fn next(&mut self) -> Option<Match> {
|
||||
if self.queue.is_empty() {
|
||||
self.current_length = self.left_length.max(self.right_length);
|
||||
self.max_length = self.max_length.max(self.current_length);
|
||||
if self.current_length < 2
|
||||
|| self.current_length + self.finder.max_length_diff < self.max_length
|
||||
{
|
||||
return None;
|
||||
}
|
||||
self.patience_left = self.finder.patience;
|
||||
while self.matches_left > 0
|
||||
&& self.patience_left > 0
|
||||
&& (self.left_length == self.current_length
|
||||
|| self.right_length == self.current_length)
|
||||
{
|
||||
if self.left_length == self.current_length {
|
||||
self.add_to_queue(self.finder.suffixes[self.left_index]);
|
||||
self.move_left();
|
||||
}
|
||||
if self.right_length == self.current_length {
|
||||
self.add_to_queue(self.finder.suffixes[self.right_index]);
|
||||
self.move_right();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.queue.pop().map(|pos| Match {
|
||||
pos,
|
||||
length: self.current_length,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Matches<'a> {
|
||||
fn move_left(&mut self) {
|
||||
if self.left_index > 0 {
|
||||
self.left_index -= 1;
|
||||
self.left_length = self
|
||||
.left_length
|
||||
.min(self.finder.lcp[self.left_index] as usize);
|
||||
} else {
|
||||
self.left_length = 0;
|
||||
}
|
||||
}
|
||||
|
||||
fn move_right(&mut self) {
|
||||
self.right_index += 1;
|
||||
self.right_length = self
|
||||
.right_length
|
||||
.min(self.finder.lcp[self.right_index - 1] as usize);
|
||||
}
|
||||
|
||||
fn add_to_queue(&mut self, pos: i32) {
|
||||
if self.pos_range.contains(&(pos as usize)) {
|
||||
self.queue.push(pos as usize);
|
||||
self.matches_left -= 1;
|
||||
self.patience_left = self.finder.patience;
|
||||
} else {
|
||||
self.patience_left = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
111
src/range_coder.rs
Normal file
111
src/range_coder.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
use crate::context_state::Context;
|
||||
|
||||
pub struct RangeCoder {
|
||||
buffer: Vec<u8>,
|
||||
low: u64,
|
||||
range: u64,
|
||||
}
|
||||
|
||||
const TOTAL: u32 = 65536;
|
||||
|
||||
impl RangeCoder {
|
||||
pub fn new() -> RangeCoder {
|
||||
RangeCoder {
|
||||
buffer: vec![],
|
||||
low: 0,
|
||||
range: 1 << 40,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_with_context(&mut self, bit: bool, context: &mut Context) {
|
||||
self.encode_bit(bit, context.prob() as u32);
|
||||
context.update(bit);
|
||||
}
|
||||
|
||||
pub fn encode_bit(&mut self, bit: bool, prob: u32) {
|
||||
let (start, size) = if bit { (0, prob) } else { (prob, TOTAL - prob) };
|
||||
self.range /= TOTAL as u64;
|
||||
self.low += start as u64 * self.range;
|
||||
self.range *= size as u64;
|
||||
|
||||
while (self.low >> 32) == (self.low + self.range - 1) >> 32 {
|
||||
self.emit_byte();
|
||||
}
|
||||
|
||||
if self.range < 1 << 24 {
|
||||
self.emit_byte();
|
||||
self.emit_byte();
|
||||
self.range = (1 << 40) - self.low;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> Vec<u8> {
|
||||
while self.range < 1 << 32 {
|
||||
self.emit_byte();
|
||||
}
|
||||
self.low += 1 << 32;
|
||||
self.emit_byte();
|
||||
self.buffer
|
||||
}
|
||||
|
||||
fn emit_byte(&mut self) {
|
||||
self.buffer.push((self.low >> 32).try_into().unwrap());
|
||||
self.low = (self.low & 0xffffffff) << 8;
|
||||
self.range *= 256;
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RangeDecoder<'a> {
|
||||
data: &'a [u8],
|
||||
code: u64,
|
||||
low: u64,
|
||||
range: u64,
|
||||
}
|
||||
|
||||
impl<'a> RangeDecoder<'a> {
|
||||
pub fn new(data: &'a [u8]) -> RangeDecoder<'a> {
|
||||
RangeDecoder {
|
||||
data,
|
||||
code: 0,
|
||||
low: 0,
|
||||
range: 1,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_with_context(&mut self, context: &mut Context) -> bool {
|
||||
let bit = self.decode_bit(context.prob() as u32);
|
||||
context.update(bit);
|
||||
bit
|
||||
}
|
||||
|
||||
pub fn decode_bit(&mut self, prob: u32) -> bool {
|
||||
while self.low >> 32 == (self.low + self.range - 1) >> 32 {
|
||||
self.append_byte();
|
||||
}
|
||||
|
||||
if self.range < 1 << 24 {
|
||||
self.append_byte();
|
||||
self.append_byte();
|
||||
self.range = (1 << 40) - self.low;
|
||||
}
|
||||
|
||||
let bit = (self.code - self.low) / (self.range / TOTAL as u64) < prob as u64;
|
||||
|
||||
let (start, size) = if bit { (0, prob) } else { (prob, TOTAL - prob) };
|
||||
self.range /= TOTAL as u64;
|
||||
self.low += start as u64 * self.range;
|
||||
self.range *= size as u64;
|
||||
|
||||
bit
|
||||
}
|
||||
|
||||
fn append_byte(&mut self) {
|
||||
self.code = (self.code & 0xffffffff) << 8;
|
||||
if !self.data.is_empty() {
|
||||
self.code |= self.data[0] as u64;
|
||||
self.data = &self.data[1..];
|
||||
}
|
||||
self.low = (self.low & 0xffffffff) << 8;
|
||||
self.range <<= 8;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user