speed optimizations + progress bar

This commit is contained in:
2021-11-25 22:43:39 +01:00
parent 5e82c65c18
commit 5fedc032a9
8 changed files with 183 additions and 44 deletions

View File

@@ -3,7 +3,7 @@ use crate::match_finder::MatchFinder;
use crate::rans::RansCoder;
pub fn pack(data: &[u8]) -> Vec<u8> {
let match_finder = MatchFinder::new(data);
let mut match_finder = MatchFinder::new(data);
let mut rans_coder = RansCoder::new();
let mut state = lz::CoderState::new();

View File

@@ -7,4 +7,6 @@ mod parsing_packer;
pub use greedy_packer::pack as pack_fast;
pub use parsing_packer::pack;
pub use lz::unpack;
pub use lz::unpack;
/// Progress-reporting hook: a mutable borrow of a closure that takes a `usize`.
///
/// The parsing packer accepts it wrapped in an `Option` and, when present,
/// calls it with `pos + 1` after it finishes each input position, i.e. the
/// number of input bytes handled so far.
pub type ProgressCallback<'a> = &'a mut dyn FnMut(usize);

View File

@@ -18,8 +18,18 @@ fn main() -> Result<()> {
let packed_data = if fast {
upkr::pack_fast(&data)
} else {
upkr::pack(&data)
let mut pb = pbr::ProgressBar::new(data.len() as u64);
pb.set_units(pbr::Units::Bytes);
let packed_data = upkr::pack(
&data,
Some(&mut |pos| {
pb.set(pos as u64);
}),
);
pb.finish();
packed_data
};
println!("Compressed {} bytes to {} bytes ({}%)", data.len(), packed_data.len(), packed_data.len() as f32 * 100. / data.len() as f32);
File::create(outfile)?.write_all(&packed_data)?;
}
Some("unpack") => {

View File

@@ -10,6 +10,8 @@ pub struct MatchFinder {
max_matches_per_length: usize,
patience: usize,
max_length_diff: usize,
queue: BinaryHeap<usize>
}
impl MatchFinder {
@@ -43,15 +45,17 @@ impl MatchFinder {
suffixes,
rev_suffixes,
lcp,
max_queue_size: 100,
max_matches_per_length: 5,
patience: 100,
max_length_diff: 2,
max_queue_size: 1000,
max_matches_per_length: 10,
patience: 1000,
max_length_diff: 4,
queue: BinaryHeap::new()
}
}
pub fn matches(&self, pos: usize) -> Matches {
pub fn matches(&mut self, pos: usize) -> Matches {
let index = self.rev_suffixes[pos] as usize;
self.queue.clear();
let mut matches = Matches {
finder: self,
pos_range: 0..pos,
@@ -62,7 +66,6 @@ impl MatchFinder {
current_length: usize::MAX,
matches_left: 0,
max_length: 0,
queue: BinaryHeap::new(),
};
matches.move_left();
@@ -73,7 +76,7 @@ impl MatchFinder {
}
pub struct Matches<'a> {
finder: &'a MatchFinder,
finder: &'a mut MatchFinder,
pos_range: Range<usize>,
left_index: usize,
left_length: usize,
@@ -82,7 +85,6 @@ pub struct Matches<'a> {
current_length: usize,
matches_left: usize,
max_length: usize,
queue: BinaryHeap<usize>,
}
#[derive(Debug)]
@@ -95,8 +97,8 @@ impl<'a> Iterator for Matches<'a> {
type Item = Match;
fn next(&mut self) -> Option<Match> {
if self.queue.is_empty() || self.matches_left == 0 {
self.queue.clear();
if self.finder.queue.is_empty() || self.matches_left == 0 {
self.finder.queue.clear();
self.current_length = self.current_length.saturating_sub(1).min(self.left_length.max(self.right_length));
self.max_length = self.max_length.max(self.current_length);
if self.current_length < 2
@@ -104,16 +106,16 @@ impl<'a> Iterator for Matches<'a> {
{
return None;
}
while self.queue.len() < self.finder.max_queue_size
while self.finder.queue.len() < self.finder.max_queue_size
&& (self.left_length == self.current_length
|| self.right_length == self.current_length)
{
if self.left_length == self.current_length {
self.add_to_queue(self.finder.suffixes[self.left_index]);
self.finder.queue.push(self.finder.suffixes[self.left_index] as usize);
self.move_left();
}
if self.right_length == self.current_length {
self.add_to_queue(self.finder.suffixes[self.right_index]);
self.finder.queue.push(self.finder.suffixes[self.right_index] as usize);
self.move_right();
}
}
@@ -122,7 +124,7 @@ impl<'a> Iterator for Matches<'a> {
self.matches_left = self.matches_left.saturating_sub(1);
self.queue.pop().map(|pos| Match {
self.finder.queue.pop().map(|pos| Match {
pos,
length: self.current_length,
})
@@ -168,8 +170,4 @@ impl<'a> Matches<'a> {
}
self.right_length = 0;
}
fn add_to_queue(&mut self, pos: i32) {
self.queue.push(pos as usize);
}
}

View File

@@ -1,12 +1,12 @@
use std::collections::HashMap;
use std::rc::Rc;
use crate::lz;
use crate::{ProgressCallback, lz};
use crate::match_finder::MatchFinder;
use crate::rans::{CostCounter, RansCoder};
pub fn pack(data: &[u8]) -> Vec<u8> {
let mut parse = parse(data);
pub fn pack(data: &[u8], progress_cb: Option<ProgressCallback>) -> Vec<u8> {
let mut parse = parse(data, progress_cb);
let mut ops = vec![];
while let Some(link) = parse {
ops.push(link.op);
@@ -34,10 +34,10 @@ struct Arrival {
type Arrivals = HashMap<usize, Vec<Arrival>>;
/// Arrival limit used by the parser.
///
/// Only the value introduced by this commit is kept; the earlier `4` was a
/// second definition of the same name and cannot coexist with this one.
// NOTE(review): presumably the cap on arrivals kept per position — the code
// that reads this constant is not visible here, confirm against `add_arrival`.
const MAX_ARRIVALS: usize = 256;
fn parse(data: &[u8]) -> Option<Rc<Parse>> {
let match_finder = MatchFinder::new(data);
fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Parse>> {
let mut match_finder = MatchFinder::new(data);
let mut near_matches = [usize::MAX; 1024];
let mut last_seen = [usize::MAX; 256];
@@ -58,18 +58,19 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
}
fn add_match(
arrivals: &mut Arrivals,
cost_counter: &mut CostCounter,
pos: usize,
offset: usize,
length: usize,
arrival: &Arrival,
) {
let mut cost_counter = CostCounter(0.);
cost_counter.reset();
let mut state = arrival.state.clone();
let op = lz::Op::Match {
offset: offset as u32,
len: length as u32,
};
op.encode(&mut cost_counter, &mut state);
op.encode(cost_counter, &mut state);
add_arrival(
arrivals,
pos + length,
@@ -79,7 +80,7 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
op,
})),
state,
cost: arrival.cost + cost_counter.0,
cost: arrival.cost + cost_counter.cost(),
},
);
}
@@ -92,6 +93,8 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
cost: 0.0,
},
);
let cost_counter = &mut CostCounter::new();
let mut best_per_offset = HashMap::new();
for pos in 0..data.len() {
let match_length = |offset: usize| {
@@ -117,8 +120,8 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
*per_offset = per_offset.min(arrival.cost);
}
for arrival in here_arrivals {
if arrival.cost > (best_cost + 32.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) {
'arrival_loop: for arrival in here_arrivals {
if arrival.cost > (best_cost + 16.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) {
continue;
}
let mut found_last_offset = false;
@@ -127,10 +130,13 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
closest_match = Some(closest_match.unwrap_or(0).max(m.pos));
let offset = pos - m.pos;
found_last_offset |= offset as u32 == arrival.state.last_offset();
add_match(&mut arrivals, pos, offset, m.length, &arrival);
add_match(&mut arrivals, cost_counter, pos, offset, m.length, &arrival);
if m.length > 64 {
break 'arrival_loop;
}
}
let mut near_matches_left = 4;
let mut near_matches_left = 8;
let mut match_pos = last_seen[data[pos] as usize];
while near_matches_left > 0
&& match_pos != usize::MAX
@@ -139,7 +145,7 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
let offset = pos - match_pos;
let length = match_length(offset);
assert!(length > 0);
add_match(&mut arrivals, pos, offset, length, &arrival);
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival);
found_last_offset |= offset as u32 == arrival.state.last_offset();
if offset < near_matches.len() {
match_pos = near_matches[match_pos % near_matches.len()];
@@ -151,14 +157,14 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
let offset = arrival.state.last_offset() as usize;
let length = match_length(offset);
if length > 0 {
add_match(&mut arrivals, pos, offset, length, &arrival);
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival);
}
}
let mut cost_counter = CostCounter(0.);
cost_counter.reset();
let mut state = arrival.state;
let op = lz::Op::Literal(data[pos]);
op.encode(&mut cost_counter, &mut state);
op.encode(cost_counter, &mut state);
add_arrival(
&mut arrivals,
pos + 1,
@@ -168,12 +174,15 @@ fn parse(data: &[u8]) -> Option<Rc<Parse>> {
op,
})),
state,
cost: arrival.cost + cost_counter.0,
cost: arrival.cost + cost_counter.cost(),
},
);
}
near_matches[pos % near_matches.len()] = last_seen[data[pos] as usize];
last_seen[data[pos] as usize] = pos;
if let Some(ref mut cb) = progress_cb {
cb(pos + 1);
}
}
arrivals.remove(&data.len()).unwrap()[0].parse.clone()
}

View File

@@ -57,13 +57,42 @@ impl RansCoder {
}
}
pub struct CostCounter(pub f64);
pub struct CostCounter {
cost: f64,
log2_table: Vec<f64>,
}
impl CostCounter {
pub fn new() -> CostCounter {
let log2_table = (0..ONE_PROB)
.map(|prob| {
let inv_prob = ONE_PROB as f64 / prob as f64;
inv_prob.log2()
})
.collect();
CostCounter {
cost: 0.0,
log2_table,
}
}
pub fn cost(&self) -> f64 {
self.cost
}
pub fn reset(&mut self) {
self.cost = 0.0;
}
}
impl EntropyCoder for CostCounter {
    /// Adds the cost of coding `bit` to the running total.
    ///
    /// `prob` is the probability of a set bit on a scale where `ONE_PROB`
    /// means certainty; a cleared bit is therefore charged using
    /// `ONE_PROB - prob`. The cost comes from the precomputed table and is
    /// added exactly once per call.
    fn encode_bit(&mut self, bit: bool, prob: u16) {
        // Probability of the bit value that actually occurred.
        let prob = if bit {
            prob as u32
        } else {
            ONE_PROB - prob as u32
        };
        self.cost += self.log2_table[prob as usize];
    }
}