implement parsing packer + fix huge bug in match finder

This commit is contained in:
2021-11-24 09:28:18 +01:00
parent 470b778340
commit 11b43f9bc4
7 changed files with 277 additions and 95 deletions

View File

@@ -1,9 +1,11 @@
use crate::lz::LzCoder; use crate::lz;
use crate::match_finder::MatchFinder; use crate::match_finder::MatchFinder;
use crate::rans::RansCoder;
pub fn pack(data: &[u8]) -> Vec<u8> { pub fn pack(data: &[u8]) -> Vec<u8> {
let match_finder = MatchFinder::new(data); let match_finder = MatchFinder::new(data);
let mut lz = LzCoder::new(); let mut rans_coder = RansCoder::new();
let mut state = lz::CoderState::new();
let mut pos = 0; let mut pos = 0;
while pos < data.len() { while pos < data.len() {
@@ -12,14 +14,18 @@ pub fn pack(data: &[u8]) -> Vec<u8> {
let max_offset = 1 << (m.length * 3 - 1).min(31); let max_offset = 1 << (m.length * 3 - 1).min(31);
let offset = pos - m.pos; let offset = pos - m.pos;
if offset < max_offset { if offset < max_offset {
lz.encode_match(offset, m.length); lz::Op::Match {
offset: offset as u32,
len: m.length as u32,
}
.encode(&mut rans_coder, &mut state);
pos += m.length; pos += m.length;
encoded_match = true; encoded_match = true;
} }
} }
if !encoded_match { if !encoded_match {
let offset = lz.last_offset(); let offset = state.last_offset() as usize;
if offset != 0 { if offset != 0 {
let length = data[pos..] let length = data[pos..]
.iter() .iter()
@@ -27,7 +33,11 @@ pub fn pack(data: &[u8]) -> Vec<u8> {
.take_while(|(a, b)| a == b) .take_while(|(a, b)| a == b)
.count(); .count();
if length > 0 { if length > 0 {
lz.encode_match(offset, length); lz::Op::Match {
offset: offset as u32,
len: length as u32,
}
.encode(&mut rans_coder, &mut state);
pos += length; pos += length;
encoded_match = true; encoded_match = true;
} }
@@ -35,10 +45,11 @@ pub fn pack(data: &[u8]) -> Vec<u8> {
} }
if !encoded_match { if !encoded_match {
lz.encode_literal(data[pos]); lz::Op::Literal(data[pos]).encode(&mut rans_coder, &mut state);
pos += 1; pos += 1;
} }
} }
lz.finish() lz::encode_eof(&mut rans_coder, &mut state);
rans_coder.finish()
} }

View File

@@ -3,6 +3,8 @@ mod greedy_packer;
mod lz; mod lz;
mod match_finder; mod match_finder;
mod rans; mod rans;
mod parsing_packer;
pub use greedy_packer::pack; pub use greedy_packer::pack as pack_fast;
pub use parsing_packer::pack;
pub use lz::unpack; pub use lz::unpack;

128
src/lz.rs
View File

@@ -1,70 +1,86 @@
use crate::context_state::ContextState; use crate::context_state::ContextState;
use crate::rans::{RansCoder, RansDecoder}; use crate::rans::{EntropyCoder, RansDecoder};
pub struct LzCoder { #[derive(Copy, Clone, Debug)]
contexts: ContextState, pub enum Op {
range_coder: RansCoder, Literal(u8),
last_offset: usize, Match { offset: u32, len: u32 },
} }
impl LzCoder { impl Op {
pub fn new() -> LzCoder { pub fn encode(&self, coder: &mut dyn EntropyCoder, state: &mut CoderState) {
LzCoder { match self {
&Op::Literal(lit) => {
encode_bit(coder, state, 0, false);
let mut context_index = 1;
for i in (0..8).rev() {
let bit = (lit >> i) & 1 != 0;
encode_bit(coder, state, context_index, bit);
context_index = (context_index << 1) | bit as usize;
}
}
&Op::Match { offset, len } => {
encode_bit(coder, state, 0, true);
encode_bit(coder, state, 256, offset != state.last_offset);
if offset != state.last_offset {
encode_length(coder, state, 257, offset + 1);
state.last_offset = offset;
}
encode_length(coder, state, 257 + 64, len);
}
}
}
}
pub fn encode_eof(coder: &mut dyn EntropyCoder, state: &mut CoderState) {
encode_bit(coder, state, 0, true);
encode_bit(coder, state, 256, true);
encode_length(coder, state, 257, 1);
}
fn encode_bit(
coder: &mut dyn EntropyCoder,
state: &mut CoderState,
context_index: usize,
bit: bool,
) {
coder.encode_with_context(bit, &mut state.contexts.context_mut(context_index));
}
fn encode_length(
coder: &mut dyn EntropyCoder,
state: &mut CoderState,
context_start: usize,
value: u32,
) {
assert!(value >= 1);
let top_bit = u32::BITS - 1 - value.leading_zeros();
let mut context_index = context_start;
for i in 0..top_bit {
encode_bit(coder, state, context_index, true);
encode_bit(coder, state, context_index + 1, (value >> i) & 1 != 0);
context_index += 2;
}
encode_bit(coder, state, context_index, false);
}
#[derive(Clone)]
pub struct CoderState {
contexts: ContextState,
last_offset: u32,
}
impl CoderState {
pub fn new() -> CoderState {
CoderState {
contexts: ContextState::new(1 + 255 + 1 + 64 + 64), contexts: ContextState::new(1 + 255 + 1 + 64 + 64),
range_coder: RansCoder::new(),
last_offset: 0, last_offset: 0,
} }
} }
pub fn encode_literal(&mut self, byte: u8) { pub fn last_offset(&self) -> u32 {
self.bit(false, 0);
let mut context_index = 1;
for i in (0..8).rev() {
let bit = (byte >> i) & 1 != 0;
self.bit(bit, context_index);
context_index = (context_index << 1) | bit as usize;
}
}
pub fn encode_match(&mut self, offset: usize, length: usize) {
self.bit(true, 0);
if offset != self.last_offset {
self.last_offset = offset;
self.bit(true, 256);
self.length(offset + 1, 257);
} else {
self.bit(false, 256);
}
self.length(length, 257 + 64);
}
pub fn finish(mut self) -> Vec<u8> {
self.bit(true, 0);
self.bit(true, 256);
self.length(1, 257);
self.range_coder.finish()
}
pub fn last_offset(&self) -> usize {
self.last_offset self.last_offset
} }
fn length(&mut self, value: usize, context_start: usize) {
assert!(value >= 1);
let top_bit = usize::BITS - 1 - value.leading_zeros();
let mut context_index = context_start;
for i in 0..top_bit {
self.bit(true, context_index);
self.bit((value >> i) & 1 != 0, context_index + 1);
context_index += 2;
}
self.bit(false, context_index);
}
fn bit(&mut self, b: bool, context_index: usize) {
self.range_coder
.encode_with_context(b, &mut self.contexts.context_mut(context_index));
}
} }
pub fn unpack(packed_data: &[u8]) -> Vec<u8> { pub fn unpack(packed_data: &[u8]) -> Vec<u8> {

View File

@@ -1,6 +1,6 @@
use std::{fs::File, path::PathBuf};
use std::io::prelude::*;
use anyhow::{bail, Result}; use anyhow::{bail, Result};
use std::io::prelude::*;
use std::{fs::File, path::PathBuf};
fn main() -> Result<()> { fn main() -> Result<()> {
let mut args = pico_args::Arguments::from_env(); let mut args = pico_args::Arguments::from_env();
@@ -8,12 +8,18 @@ fn main() -> Result<()> {
match args.subcommand()?.as_ref().map(|s| s.as_str()) { match args.subcommand()?.as_ref().map(|s| s.as_str()) {
None => print_help(), None => print_help(),
Some("pack") => { Some("pack") => {
let fast = args.contains("--fast");
let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?; let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?; let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let mut data = vec![]; let mut data = vec![];
File::open(infile)?.read_to_end(&mut data)?; File::open(infile)?.read_to_end(&mut data)?;
let packed_data = upkr::pack(&data); let packed_data = if fast {
upkr::pack_fast(&data)
} else {
upkr::pack(&data)
};
File::create(outfile)?.write_all(&packed_data)?; File::create(outfile)?.write_all(&packed_data)?;
} }
Some("unpack") => { Some("unpack") => {

View File

@@ -58,7 +58,6 @@ impl MatchFinder {
right_index: index, right_index: index,
right_length: usize::MAX, right_length: usize::MAX,
current_length: 0, current_length: 0,
patience_left: 0,
matches_left: self.max_matches, matches_left: self.max_matches,
max_length: 0, max_length: 0,
queue: BinaryHeap::new(), queue: BinaryHeap::new(),
@@ -79,7 +78,6 @@ pub struct Matches<'a> {
right_index: usize, right_index: usize,
right_length: usize, right_length: usize,
current_length: usize, current_length: usize,
patience_left: usize,
matches_left: usize, matches_left: usize,
max_length: usize, max_length: usize,
queue: BinaryHeap<usize>, queue: BinaryHeap<usize>,
@@ -103,9 +101,7 @@ impl<'a> Iterator for Matches<'a> {
{ {
return None; return None;
} }
self.patience_left = self.finder.patience;
while self.matches_left > 0 while self.matches_left > 0
&& self.patience_left > 0
&& (self.left_length == self.current_length && (self.left_length == self.current_length
|| self.right_length == self.current_length) || self.right_length == self.current_length)
{ {
@@ -129,30 +125,43 @@ impl<'a> Iterator for Matches<'a> {
impl<'a> Matches<'a> { impl<'a> Matches<'a> {
fn move_left(&mut self) { fn move_left(&mut self) {
if self.left_index > 0 { let mut patience = self.finder.patience;
while self.left_length > 0 && patience > 0 && self.left_index > 0 {
self.left_index -= 1; self.left_index -= 1;
self.left_length = self self.left_length = self
.left_length .left_length
.min(self.finder.lcp[self.left_index] as usize); .min(self.finder.lcp[self.left_index] as usize);
} else { if self
self.left_length = 0; .pos_range
.contains(&(self.finder.suffixes[self.left_index] as usize))
{
return;
} }
patience -= 1;
}
self.left_length = 0;
} }
fn move_right(&mut self) { fn move_right(&mut self) {
let mut patience = self.finder.patience;
while self.right_length > 0 && patience > 0 && self.right_index + 1 < self.finder.suffixes.len() {
self.right_index += 1; self.right_index += 1;
self.right_length = self self.right_length = self
.right_length .right_length
.min(self.finder.lcp[self.right_index - 1] as usize); .min(self.finder.lcp[self.right_index - 1] as usize);
if self
.pos_range
.contains(&(self.finder.suffixes[self.right_index] as usize))
{
return;
}
patience -= 1;
}
self.right_length = 0;
} }
fn add_to_queue(&mut self, pos: i32) { fn add_to_queue(&mut self, pos: i32) {
if self.pos_range.contains(&(pos as usize)) {
self.queue.push(pos as usize); self.queue.push(pos as usize);
self.matches_left -= 1; self.matches_left -= 1;
self.patience_left = self.finder.patience;
} else {
self.patience_left = 0;
}
} }
} }

122
src/parsing_packer.rs Normal file
View File

@@ -0,0 +1,122 @@
use std::collections::HashMap;
use std::rc::Rc;
use crate::match_finder::MatchFinder;
use crate::rans::{RansCoder, CostCounter};
use crate::lz;
pub fn pack(data: &[u8]) -> Vec<u8> {
let mut parse = parse(data);
let mut ops = vec![];
while let Some(link) = parse {
ops.push(link.op);
parse = link.prev.clone();
}
let mut state = lz::CoderState::new();
let mut coder = RansCoder::new();
for op in ops.into_iter().rev() {
op.encode(&mut coder, &mut state);
}
lz::encode_eof(&mut coder, &mut state);
coder.finish()
}
struct Parse {
prev: Option<Rc<Parse>>,
op: lz::Op,
}
struct Arrival {
parse: Option<Rc<Parse>>,
state: lz::CoderState,
cost: f64,
}
type Arrivals = HashMap<usize, Vec<Arrival>>;
const MAX_ARRIVALS: usize = 16;
fn parse(data: &[u8]) -> Option<Rc<Parse>> {
let match_finder = MatchFinder::new(data);
let mut arrivals: Arrivals = HashMap::new();
fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival) {
let vec = arrivals.entry(pos).or_default();
if vec.len() < MAX_ARRIVALS || vec[MAX_ARRIVALS - 1].cost > arrival.cost {
vec.push(arrival);
vec.sort_by(|a, b| {
a.cost
.partial_cmp(&b.cost)
.unwrap_or(std::cmp::Ordering::Equal)
});
if vec.len() > MAX_ARRIVALS {
vec.pop();
}
}
}
fn add_match(arrivals: &mut Arrivals, pos: usize, offset: usize, length: usize, arrival: &Arrival) {
let mut cost_counter = CostCounter(0.);
let mut state = arrival.state.clone();
let op = lz::Op::Match { offset: offset as u32, len: length as u32 };
op.encode(&mut cost_counter, &mut state);
add_arrival(
arrivals,
pos + length,
Arrival {
parse: Some(Rc::new(Parse {
prev: arrival.parse.clone(),
op,
})),
state,
cost: arrival.cost + cost_counter.0,
},
);
}
add_arrival(
&mut arrivals,
0,
Arrival {
parse: None,
state: lz::CoderState::new(),
cost: 0.0,
},
);
for pos in 0..data.len() {
for arrival in arrivals.remove(&pos).unwrap() {
let mut found_last_offset = false;
for m in match_finder.matches(pos) {
let offset = pos - m.pos;
if offset as u32 == arrival.state.last_offset() {
found_last_offset = true;
}
add_match(&mut arrivals, pos, offset, m.length, &arrival);
}
if !found_last_offset && arrival.state.last_offset() > 0 {
let offset = arrival.state.last_offset() as usize;
let length = data[pos..].iter().zip(data[(pos - offset)..].iter()).take_while(|(a, b)| a == b).count();
if length > 0 {
add_match(&mut arrivals, pos, offset, length, &arrival);
}
}
let mut cost_counter = CostCounter(0.);
let mut state = arrival.state;
let op = lz::Op::Literal(data[pos]);
op.encode(&mut cost_counter, &mut state);
add_arrival(
&mut arrivals,
pos + 1,
Arrival {
parse: Some(Rc::new(Parse {
prev: arrival.parse,
op,
})),
state,
cost: arrival.cost + cost_counter.0,
},
);
}
}
arrivals.remove(&data.len()).unwrap()[0].parse.clone()
}

View File

@@ -4,23 +4,29 @@ const L_BITS: u32 = 16;
pub const PROB_BITS: u32 = 12; pub const PROB_BITS: u32 = 12;
pub const ONE_PROB: u32 = 1 << PROB_BITS; pub const ONE_PROB: u32 = 1 << PROB_BITS;
pub trait EntropyCoder {
fn encode_bit(&mut self, bit: bool, prob: u16);
fn encode_with_context(&mut self, bit: bool, context: &mut Context) {
self.encode_bit(bit, context.prob());
context.update(bit);
}
}
pub struct RansCoder(Vec<u16>); pub struct RansCoder(Vec<u16>);
impl EntropyCoder for RansCoder {
fn encode_bit(&mut self, bit: bool, prob: u16) {
assert!(prob < 32768);
self.0.push(prob | ((bit as u16) << 15));
}
}
impl RansCoder { impl RansCoder {
pub fn new() -> RansCoder { pub fn new() -> RansCoder {
RansCoder(Vec::new()) RansCoder(Vec::new())
} }
pub fn encode_with_context(&mut self, bit: bool, context: &mut Context) {
self.encode_bit(bit, context.prob());
context.update(bit);
}
pub fn encode_bit(&mut self, bit: bool, prob: u16) {
assert!(prob < 32768);
self.0.push(prob | ((bit as u16) << 15));
}
pub fn finish(self) -> Vec<u8> { pub fn finish(self) -> Vec<u8> {
let mut buffer = vec![]; let mut buffer = vec![];
let mut state = 1 << L_BITS; let mut state = 1 << L_BITS;
@@ -51,6 +57,16 @@ impl RansCoder {
} }
} }
pub struct CostCounter(pub f64);
impl EntropyCoder for CostCounter {
fn encode_bit(&mut self, bit: bool, prob: u16) {
let prob = if bit { prob as u32 } else { ONE_PROB - prob as u32 };
let inv_prob = ONE_PROB as f64 / prob as f64;
self.0 += inv_prob.log2();
}
}
pub struct RansDecoder<'a> { pub struct RansDecoder<'a> {
data: &'a [u8], data: &'a [u8],
state: u32, state: u32,