add compression levels

This commit is contained in:
2021-11-26 00:01:33 +01:00
parent 5fedc032a9
commit c0560f99a8
3 changed files with 184 additions and 33 deletions

View File

@@ -8,20 +8,21 @@ fn main() -> Result<()> {
match args.subcommand()?.as_ref().map(|s| s.as_str()) { match args.subcommand()?.as_ref().map(|s| s.as_str()) {
None => print_help(), None => print_help(),
Some("pack") => { Some("pack") => {
let fast = args.contains("--fast"); let level = args.opt_value_from_str(["-l", "--level"])?.unwrap_or(2u8);
let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?; let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?; let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let mut data = vec![]; let mut data = vec![];
File::open(infile)?.read_to_end(&mut data)?; File::open(infile)?.read_to_end(&mut data)?;
let packed_data = if fast { let packed_data = if level == 0 {
upkr::pack_fast(&data) upkr::pack_fast(&data)
} else { } else {
let mut pb = pbr::ProgressBar::new(data.len() as u64); let mut pb = pbr::ProgressBar::new(data.len() as u64);
pb.set_units(pbr::Units::Bytes); pb.set_units(pbr::Units::Bytes);
let packed_data = upkr::pack( let packed_data = upkr::pack(
&data, &data,
level,
Some(&mut |pos| { Some(&mut |pos| {
pb.set(pos as u64); pb.set(pos as u64);
}), }),
@@ -29,7 +30,12 @@ fn main() -> Result<()> {
pb.finish(); pb.finish();
packed_data packed_data
}; };
println!("Compressed {} bytes to {} bytes ({}%)", data.len(), packed_data.len(), packed_data.len() as f32 * 100. / data.len() as f32); println!(
"Compressed {} bytes to {} bytes ({}%)",
data.len(),
packed_data.len(),
packed_data.len() as f32 * 100. / data.len() as f32
);
File::create(outfile)?.write_all(&packed_data)?; File::create(outfile)?.write_all(&packed_data)?;
} }
Some("unpack") => { Some("unpack") => {

View File

@@ -45,14 +45,34 @@ impl MatchFinder {
suffixes, suffixes,
rev_suffixes, rev_suffixes,
lcp, lcp,
max_queue_size: 1000, max_queue_size: 100,
max_matches_per_length: 10, max_matches_per_length: 5,
patience: 1000, patience: 100,
max_length_diff: 4, max_length_diff: 2,
queue: BinaryHeap::new() queue: BinaryHeap::new()
} }
} }
/// Builder-style setter: stores `v` in `max_queue_size` and hands the finder
/// back so further `with_*` calls can be chained.
///
/// The finder is taken by value, so the returned instance is the only one
/// that carries the new setting.
// NOTE(review): presumably this bounds the internal candidate `queue`;
// confirm against `matches`.
#[must_use]
pub fn with_max_queue_size(mut self, v: usize) -> MatchFinder {
    self.max_queue_size = v;
    self
}
/// Builder-style setter: stores `v` in `patience` and hands the finder back
/// so further `with_*` calls can be chained.
///
/// The finder is taken by value, so the returned instance is the only one
/// that carries the new setting.
// NOTE(review): presumably a search-effort budget used by `matches`;
// confirm against its implementation.
#[must_use]
pub fn with_patience(mut self, v: usize) -> MatchFinder {
    self.patience = v;
    self
}
/// Builder-style setter: stores `v` in `max_matches_per_length` and hands the
/// finder back so further `with_*` calls can be chained.
///
/// The finder is taken by value, so the returned instance is the only one
/// that carries the new setting.
// NOTE(review): presumably limits how many candidates of equal length are
// reported; confirm against `matches`.
#[must_use]
pub fn with_max_matches_per_length(mut self, v: usize) -> MatchFinder {
    self.max_matches_per_length = v;
    self
}
/// Builder-style setter: stores `v` in `max_length_diff` and hands the finder
/// back so further `with_*` calls can be chained.
///
/// The finder is taken by value, so the returned instance is the only one
/// that carries the new setting.
// NOTE(review): presumably how far below the best match length candidates
// are still reported; confirm against `matches`.
#[must_use]
pub fn with_max_length_diff(mut self, v: usize) -> MatchFinder {
    self.max_length_diff = v;
    self
}
pub fn matches(&mut self, pos: usize) -> Matches { pub fn matches(&mut self, pos: usize) -> Matches {
let index = self.rev_suffixes[pos] as usize; let index = self.rev_suffixes[pos] as usize;
self.queue.clear(); self.queue.clear();

View File

@@ -1,12 +1,13 @@
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
use std::mem;
use std::rc::Rc; use std::rc::Rc;
use crate::{ProgressCallback, lz};
use crate::match_finder::MatchFinder; use crate::match_finder::MatchFinder;
use crate::rans::{CostCounter, RansCoder}; use crate::rans::{CostCounter, RansCoder};
use crate::{lz, ProgressCallback};
pub fn pack(data: &[u8], progress_cb: Option<ProgressCallback>) -> Vec<u8> { pub fn pack(data: &[u8], level: u8, progress_cb: Option<ProgressCallback>) -> Vec<u8> {
let mut parse = parse(data, progress_cb); let mut parse = parse(data, Config::from_level(level), progress_cb);
let mut ops = vec![]; let mut ops = vec![];
while let Some(link) = parse { while let Some(link) = parse {
ops.push(link.op); ops.push(link.op);
@@ -34,27 +35,64 @@ struct Arrival {
type Arrivals = HashMap<usize, Vec<Arrival>>; type Arrivals = HashMap<usize, Vec<Arrival>>;
const MAX_ARRIVALS: usize = 256; fn parse(
data: &[u8],
fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Parse>> { config: Config,
let mut match_finder = MatchFinder::new(data); mut progress_cb: Option<ProgressCallback>,
) -> Option<Rc<Parse>> {
let mut match_finder = MatchFinder::new(data)
.with_max_queue_size(config.max_queue_size)
.with_patience(config.patience)
.with_max_matches_per_length(config.max_matches_per_length)
.with_max_length_diff(config.max_length_diff);
let mut near_matches = [usize::MAX; 1024]; let mut near_matches = [usize::MAX; 1024];
let mut last_seen = [usize::MAX; 256]; let mut last_seen = [usize::MAX; 256];
let max_arrivals = config.max_arrivals;
let mut arrivals: Arrivals = HashMap::new(); let mut arrivals: Arrivals = HashMap::new();
fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival) { fn sort_arrivals(vec: &mut Vec<Arrival>, max_arrivals: usize) {
let vec = arrivals.entry(pos).or_default(); if max_arrivals == 0 {
if vec.len() < MAX_ARRIVALS || vec[MAX_ARRIVALS - 1].cost > arrival.cost { return;
vec.push(arrival); }
vec.sort_by(|a, b| { vec.sort_by(|a, b| {
a.cost a.cost
.partial_cmp(&b.cost) .partial_cmp(&b.cost)
.unwrap_or(std::cmp::Ordering::Equal) .unwrap_or(std::cmp::Ordering::Equal)
}); });
if vec.len() > MAX_ARRIVALS { let mut seen_offsets = HashSet::new();
vec.pop(); let mut remaining = Vec::new();
for arr in mem::replace(vec, Vec::new()) {
if seen_offsets.insert(arr.state.last_offset()) {
if vec.len() < max_arrivals {
vec.push(arr);
}
} else {
remaining.push(arr);
} }
} }
for arr in remaining {
if vec.len() >= max_arrivals {
break;
}
vec.push(arr);
}
}
fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival, max_arrivals: usize) {
let vec = arrivals.entry(pos).or_default();
if max_arrivals == 0 {
if vec.is_empty() {
vec.push(arrival);
} else if vec[0].cost > arrival.cost {
vec[0] = arrival;
}
return;
}
vec.push(arrival);
if vec.len() > max_arrivals * 2 {
sort_arrivals(vec, max_arrivals);
}
} }
fn add_match( fn add_match(
arrivals: &mut Arrivals, arrivals: &mut Arrivals,
@@ -63,6 +101,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
offset: usize, offset: usize,
length: usize, length: usize,
arrival: &Arrival, arrival: &Arrival,
max_arrivals: usize,
) { ) {
cost_counter.reset(); cost_counter.reset();
let mut state = arrival.state.clone(); let mut state = arrival.state.clone();
@@ -82,6 +121,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state, state,
cost: arrival.cost + cost_counter.cost(), cost: arrival.cost + cost_counter.cost(),
}, },
max_arrivals,
); );
} }
add_arrival( add_arrival(
@@ -92,6 +132,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state: lz::CoderState::new(), state: lz::CoderState::new(),
cost: 0.0, cost: 0.0,
}, },
max_arrivals,
); );
let cost_counter = &mut CostCounter::new(); let cost_counter = &mut CostCounter::new();
@@ -105,7 +146,8 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
.count() .count()
}; };
let here_arrivals = if let Some(arr) = arrivals.remove(&pos) { let here_arrivals = if let Some(mut arr) = arrivals.remove(&pos) {
sort_arrivals(&mut arr, max_arrivals);
arr arr
} else { } else {
continue; continue;
@@ -121,7 +163,12 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
} }
'arrival_loop: for arrival in here_arrivals { 'arrival_loop: for arrival in here_arrivals {
if arrival.cost > (best_cost + 16.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) { if arrival.cost
> (best_cost + config.max_cost_delta).min(
*best_per_offset.get(&arrival.state.last_offset()).unwrap()
+ config.max_offset_cost_delta,
)
{
continue; continue;
} }
let mut found_last_offset = false; let mut found_last_offset = false;
@@ -130,13 +177,21 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
closest_match = Some(closest_match.unwrap_or(0).max(m.pos)); closest_match = Some(closest_match.unwrap_or(0).max(m.pos));
let offset = pos - m.pos; let offset = pos - m.pos;
found_last_offset |= offset as u32 == arrival.state.last_offset(); found_last_offset |= offset as u32 == arrival.state.last_offset();
add_match(&mut arrivals, cost_counter, pos, offset, m.length, &arrival); add_match(
if m.length > 64 { &mut arrivals,
cost_counter,
pos,
offset,
m.length,
&arrival,
max_arrivals,
);
if m.length >= config.greedy_size {
break 'arrival_loop; break 'arrival_loop;
} }
} }
let mut near_matches_left = 8; let mut near_matches_left = config.num_near_matches;
let mut match_pos = last_seen[data[pos] as usize]; let mut match_pos = last_seen[data[pos] as usize];
while near_matches_left > 0 while near_matches_left > 0
&& match_pos != usize::MAX && match_pos != usize::MAX
@@ -145,7 +200,15 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
let offset = pos - match_pos; let offset = pos - match_pos;
let length = match_length(offset); let length = match_length(offset);
assert!(length > 0); assert!(length > 0);
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); add_match(
&mut arrivals,
cost_counter,
pos,
offset,
length,
&arrival,
max_arrivals,
);
found_last_offset |= offset as u32 == arrival.state.last_offset(); found_last_offset |= offset as u32 == arrival.state.last_offset();
if offset < near_matches.len() { if offset < near_matches.len() {
match_pos = near_matches[match_pos % near_matches.len()]; match_pos = near_matches[match_pos % near_matches.len()];
@@ -157,7 +220,15 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
let offset = arrival.state.last_offset() as usize; let offset = arrival.state.last_offset() as usize;
let length = match_length(offset); let length = match_length(offset);
if length > 0 { if length > 0 {
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); add_match(
&mut arrivals,
cost_counter,
pos,
offset,
length,
&arrival,
max_arrivals,
);
} }
} }
@@ -176,6 +247,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state, state,
cost: arrival.cost + cost_counter.cost(), cost: arrival.cost + cost_counter.cost(),
}, },
max_arrivals,
); );
} }
near_matches[pos % near_matches.len()] = last_seen[data[pos] as usize]; near_matches[pos % near_matches.len()] = last_seen[data[pos] as usize];
@@ -186,3 +258,56 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
} }
arrivals.remove(&data.len()).unwrap()[0].parse.clone() arrivals.remove(&data.len()).unwrap()[0].parse.clone()
} }
/// Tuning parameters for `parse` and the `MatchFinder` it drives, derived
/// from a numeric compression level by [`Config::from_level`].
struct Config {
    /// Cap on the arrivals kept per position; `0` keeps only the single
    /// cheapest arrival.
    max_arrivals: usize,
    /// An arrival is skipped when its cost exceeds the best cost at the
    /// position by more than this.
    max_cost_delta: f64,
    /// Same slack, measured against the best cost recorded for the arrival's
    /// last offset.
    max_offset_cost_delta: f64,
    /// Initial budget of near matches tried per arrival.
    num_near_matches: usize,
    /// A match at least this long ends the arrival loop for the position.
    greedy_size: usize,
    /// Forwarded to `MatchFinder::with_max_queue_size`.
    max_queue_size: usize,
    /// Forwarded to `MatchFinder::with_patience`.
    patience: usize,
    /// Forwarded to `MatchFinder::with_max_matches_per_length`.
    max_matches_per_length: usize,
    /// Forwarded to `MatchFinder::with_max_length_diff`.
    max_length_diff: usize,
}

impl Config {
    /// Builds the parameter set for compression level `level`.
    ///
    /// Every `u8` is accepted. The table-driven values stop changing at
    /// level 9, while the values computed from the level keep growing.
    fn from_level(level: u8) -> Config {
        // Arrival caps for levels 0 through 8; anything higher gets 128.
        const ARRIVAL_CAPS: [usize; 9] = [0, 0, 2, 4, 8, 16, 32, 64, 96];

        let lvl = usize::from(level);

        // The per-offset slack only opens up from level 5 on.
        let offset_slack = if level <= 4 {
            0.0
        } else if level <= 8 {
            4.0
        } else {
            8.0
        };

        Config {
            max_arrivals: ARRIVAL_CAPS.get(lvl).copied().unwrap_or(128),
            max_cost_delta: 16.0,
            max_offset_cost_delta: offset_slack,
            num_near_matches: lvl.saturating_sub(1),
            greedy_size: 3 * lvl * lvl + 4,
            max_queue_size: 100 * lvl,
            patience: 100 * lvl,
            max_matches_per_length: lvl,
            // One extra step of allowed length difference every two levels,
            // capped at 4.
            max_length_diff: (lvl / 2).min(4),
        }
    }
}