add compression levels

This commit is contained in:
2021-11-26 00:01:33 +01:00
parent 5fedc032a9
commit c0560f99a8
3 changed files with 184 additions and 33 deletions

View File

@@ -8,20 +8,21 @@ fn main() -> Result<()> {
match args.subcommand()?.as_ref().map(|s| s.as_str()) {
None => print_help(),
Some("pack") => {
let fast = args.contains("--fast");
let level = args.opt_value_from_str(["-l", "--level"])?.unwrap_or(2u8);
let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
let mut data = vec![];
File::open(infile)?.read_to_end(&mut data)?;
let packed_data = if fast {
let packed_data = if level == 0 {
upkr::pack_fast(&data)
} else {
let mut pb = pbr::ProgressBar::new(data.len() as u64);
pb.set_units(pbr::Units::Bytes);
let packed_data = upkr::pack(
&data,
level,
Some(&mut |pos| {
pb.set(pos as u64);
}),
@@ -29,7 +30,12 @@ fn main() -> Result<()> {
pb.finish();
packed_data
};
println!("Compressed {} bytes to {} bytes ({}%)", data.len(), packed_data.len(), packed_data.len() as f32 * 100. / data.len() as f32);
println!(
"Compressed {} bytes to {} bytes ({}%)",
data.len(),
packed_data.len(),
packed_data.len() as f32 * 100. / data.len() as f32
);
File::create(outfile)?.write_all(&packed_data)?;
}
Some("unpack") => {

View File

@@ -45,14 +45,34 @@ impl MatchFinder {
suffixes,
rev_suffixes,
lcp,
max_queue_size: 1000,
max_matches_per_length: 10,
patience: 1000,
max_length_diff: 4,
max_queue_size: 100,
max_matches_per_length: 5,
patience: 100,
max_length_diff: 2,
queue: BinaryHeap::new()
}
}
/// Builder-style setter: stores `v` in `max_queue_size` and hands the finder
/// back so that further `with_*` calls can be chained.
pub fn with_max_queue_size(self, v: usize) -> MatchFinder {
    let mut finder = self;
    finder.max_queue_size = v;
    finder
}
/// Builder-style setter: stores `v` in `patience` and hands the finder back
/// so that further `with_*` calls can be chained.
pub fn with_patience(self, v: usize) -> MatchFinder {
    let mut finder = self;
    finder.patience = v;
    finder
}
/// Builder-style setter: stores `v` in `max_matches_per_length` and hands the
/// finder back so that further `with_*` calls can be chained.
pub fn with_max_matches_per_length(self, v: usize) -> MatchFinder {
    let mut finder = self;
    finder.max_matches_per_length = v;
    finder
}
/// Builder-style setter: stores `v` in `max_length_diff` and hands the finder
/// back so that further `with_*` calls can be chained.
pub fn with_max_length_diff(self, v: usize) -> MatchFinder {
    let mut finder = self;
    finder.max_length_diff = v;
    finder
}
pub fn matches(&mut self, pos: usize) -> Matches {
let index = self.rev_suffixes[pos] as usize;
self.queue.clear();

View File

@@ -1,12 +1,13 @@
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::mem;
use std::rc::Rc;
use crate::{ProgressCallback, lz};
use crate::match_finder::MatchFinder;
use crate::rans::{CostCounter, RansCoder};
use crate::{lz, ProgressCallback};
pub fn pack(data: &[u8], progress_cb: Option<ProgressCallback>) -> Vec<u8> {
let mut parse = parse(data, progress_cb);
pub fn pack(data: &[u8], level: u8, progress_cb: Option<ProgressCallback>) -> Vec<u8> {
let mut parse = parse(data, Config::from_level(level), progress_cb);
let mut ops = vec![];
while let Some(link) = parse {
ops.push(link.op);
@@ -34,26 +35,63 @@ struct Arrival {
type Arrivals = HashMap<usize, Vec<Arrival>>;
const MAX_ARRIVALS: usize = 256;
fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Parse>> {
let mut match_finder = MatchFinder::new(data);
fn parse(
data: &[u8],
config: Config,
mut progress_cb: Option<ProgressCallback>,
) -> Option<Rc<Parse>> {
let mut match_finder = MatchFinder::new(data)
.with_max_queue_size(config.max_queue_size)
.with_patience(config.patience)
.with_max_matches_per_length(config.max_matches_per_length)
.with_max_length_diff(config.max_length_diff);
let mut near_matches = [usize::MAX; 1024];
let mut last_seen = [usize::MAX; 256];
let max_arrivals = config.max_arrivals;
let mut arrivals: Arrivals = HashMap::new();
fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival) {
let vec = arrivals.entry(pos).or_default();
if vec.len() < MAX_ARRIVALS || vec[MAX_ARRIVALS - 1].cost > arrival.cost {
vec.push(arrival);
fn sort_arrivals(vec: &mut Vec<Arrival>, max_arrivals: usize) {
if max_arrivals == 0 {
return;
}
vec.sort_by(|a, b| {
a.cost
.partial_cmp(&b.cost)
.unwrap_or(std::cmp::Ordering::Equal)
});
if vec.len() > MAX_ARRIVALS {
vec.pop();
let mut seen_offsets = HashSet::new();
let mut remaining = Vec::new();
for arr in mem::replace(vec, Vec::new()) {
if seen_offsets.insert(arr.state.last_offset()) {
if vec.len() < max_arrivals {
vec.push(arr);
}
} else {
remaining.push(arr);
}
}
for arr in remaining {
if vec.len() >= max_arrivals {
break;
}
vec.push(arr);
}
}
/// Records `arrival` as a candidate for reaching `pos`.
///
/// * `max_arrivals == 0`: at most one candidate is kept per position; the
///   stored one is replaced only when the new one is strictly cheaper.
/// * otherwise: the candidate is appended, and once the list grows beyond
///   `2 * max_arrivals` entries it is handed to `sort_arrivals` to be pruned.
fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival, max_arrivals: usize) {
    let bucket = arrivals.entry(pos).or_default();

    if max_arrivals != 0 {
        bucket.push(arrival);
        // Pruning is deferred until the list has doubled, so the sort is not
        // paid on every insertion.
        if bucket.len() > max_arrivals * 2 {
            sort_arrivals(bucket, max_arrivals);
        }
        return;
    }

    // Single-slot mode: keep only the cheapest arrival seen so far.
    match bucket.first_mut() {
        None => bucket.push(arrival),
        Some(best) => {
            if best.cost > arrival.cost {
                *best = arrival;
            }
        }
    }
}
fn add_match(
@@ -63,6 +101,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
offset: usize,
length: usize,
arrival: &Arrival,
max_arrivals: usize,
) {
cost_counter.reset();
let mut state = arrival.state.clone();
@@ -82,6 +121,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state,
cost: arrival.cost + cost_counter.cost(),
},
max_arrivals,
);
}
add_arrival(
@@ -92,6 +132,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state: lz::CoderState::new(),
cost: 0.0,
},
max_arrivals,
);
let cost_counter = &mut CostCounter::new();
@@ -105,7 +146,8 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
.count()
};
let here_arrivals = if let Some(arr) = arrivals.remove(&pos) {
let here_arrivals = if let Some(mut arr) = arrivals.remove(&pos) {
sort_arrivals(&mut arr, max_arrivals);
arr
} else {
continue;
@@ -121,7 +163,12 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
}
'arrival_loop: for arrival in here_arrivals {
if arrival.cost > (best_cost + 16.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) {
if arrival.cost
> (best_cost + config.max_cost_delta).min(
*best_per_offset.get(&arrival.state.last_offset()).unwrap()
+ config.max_offset_cost_delta,
)
{
continue;
}
let mut found_last_offset = false;
@@ -130,13 +177,21 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
closest_match = Some(closest_match.unwrap_or(0).max(m.pos));
let offset = pos - m.pos;
found_last_offset |= offset as u32 == arrival.state.last_offset();
add_match(&mut arrivals, cost_counter, pos, offset, m.length, &arrival);
if m.length > 64 {
add_match(
&mut arrivals,
cost_counter,
pos,
offset,
m.length,
&arrival,
max_arrivals,
);
if m.length >= config.greedy_size {
break 'arrival_loop;
}
}
let mut near_matches_left = 8;
let mut near_matches_left = config.num_near_matches;
let mut match_pos = last_seen[data[pos] as usize];
while near_matches_left > 0
&& match_pos != usize::MAX
@@ -145,7 +200,15 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
let offset = pos - match_pos;
let length = match_length(offset);
assert!(length > 0);
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival);
add_match(
&mut arrivals,
cost_counter,
pos,
offset,
length,
&arrival,
max_arrivals,
);
found_last_offset |= offset as u32 == arrival.state.last_offset();
if offset < near_matches.len() {
match_pos = near_matches[match_pos % near_matches.len()];
@@ -157,7 +220,15 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
let offset = arrival.state.last_offset() as usize;
let length = match_length(offset);
if length > 0 {
add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival);
add_match(
&mut arrivals,
cost_counter,
pos,
offset,
length,
&arrival,
max_arrivals,
);
}
}
@@ -176,6 +247,7 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
state,
cost: arrival.cost + cost_counter.cost(),
},
max_arrivals,
);
}
near_matches[pos % near_matches.len()] = last_seen[data[pos] as usize];
@@ -186,3 +258,56 @@ fn parse(data: &[u8], mut progress_cb: Option<ProgressCallback>) -> Option<Rc<Pa
}
arrivals.remove(&data.len()).unwrap()[0].parse.clone()
}
/// Tuning knobs for the parse, derived from a numeric compression level via
/// [`Config::from_level`].
struct Config {
    /// Cap on the arrivals kept per position; `0` selects the mode in which
    /// only the single cheapest arrival is kept.
    max_arrivals: usize,
    /// An arrival is skipped when it costs more than the best cost at the
    /// position plus this delta.
    max_cost_delta: f64,
    /// An arrival is skipped when it costs more than the best cost recorded
    /// for its last offset plus this delta.
    max_offset_cost_delta: f64,
    /// Starting budget for the near-match loop at each position.
    num_near_matches: usize,
    /// A match at least this long ends the processing of further arrivals at
    /// the current position.
    greedy_size: usize,
    /// Forwarded to `MatchFinder::with_max_queue_size`.
    max_queue_size: usize,
    /// Forwarded to `MatchFinder::with_patience`.
    patience: usize,
    /// Forwarded to `MatchFinder::with_max_matches_per_length`.
    max_matches_per_length: usize,
    /// Forwarded to `MatchFinder::with_max_length_diff`.
    max_length_diff: usize,
}

impl Config {
    /// Builds the parameter set for compression level `level`.
    ///
    /// Levels above 9 reuse the level-9 values for the table-driven settings
    /// (`max_arrivals`, the cost deltas, `max_length_diff`); the settings
    /// computed arithmetically keep growing with `level`.
    fn from_level(level: u8) -> Config {
        // Arrival cap per level, indexed by level; anything past the table
        // gets the top value.
        const ARRIVALS_BY_LEVEL: [usize; 9] = [0, 0, 2, 4, 8, 16, 32, 64, 96];

        let lvl = usize::from(level);

        let max_offset_cost_delta = if level <= 4 {
            0.0
        } else if level <= 8 {
            4.0
        } else {
            8.0
        };

        Config {
            max_arrivals: ARRIVALS_BY_LEVEL.get(lvl).copied().unwrap_or(128),
            max_cost_delta: 16.0,
            max_offset_cost_delta,
            num_near_matches: lvl.saturating_sub(1),
            greedy_size: 4 + 3 * lvl * lvl,
            max_queue_size: 100 * lvl,
            patience: 100 * lvl,
            max_matches_per_length: lvl,
            // One step per two levels, capped at 4 from level 8 upwards.
            max_length_diff: (lvl / 2).min(4),
        }
    }
}