diff --git a/src/main.rs b/src/main.rs index 93a8c7d..bba8243 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,20 +8,21 @@ fn main() -> Result<()> { match args.subcommand()?.as_ref().map(|s| s.as_str()) { None => print_help(), Some("pack") => { - let fast = args.contains("--fast"); + let level = args.opt_value_from_str(["-l", "--level"])?.unwrap_or(2u8); let infile = args.free_from_os_str::(|s| Ok(s.into()))?; let outfile = args.free_from_os_str::(|s| Ok(s.into()))?; let mut data = vec![]; File::open(infile)?.read_to_end(&mut data)?; - let packed_data = if fast { + let packed_data = if level == 0 { upkr::pack_fast(&data) } else { let mut pb = pbr::ProgressBar::new(data.len() as u64); pb.set_units(pbr::Units::Bytes); let packed_data = upkr::pack( &data, + level, Some(&mut |pos| { pb.set(pos as u64); }), @@ -29,7 +30,12 @@ fn main() -> Result<()> { pb.finish(); packed_data }; - println!("Compressed {} bytes to {} bytes ({}%)", data.len(), packed_data.len(), packed_data.len() as f32 * 100. / data.len() as f32); + println!( + "Compressed {} bytes to {} bytes ({}%)", + data.len(), + packed_data.len(), + packed_data.len() as f32 * 100. / data.len() as f32 + ); File::create(outfile)?.write_all(&packed_data)?; } Some("unpack") => { diff --git a/src/match_finder.rs b/src/match_finder.rs index 5b6a3f4..070eb67 100644 --- a/src/match_finder.rs +++ b/src/match_finder.rs @@ -45,14 +45,34 @@ impl MatchFinder { suffixes, rev_suffixes, lcp, - max_queue_size: 1000, - max_matches_per_length: 10, - patience: 1000, - max_length_diff: 4, + max_queue_size: 100, + max_matches_per_length: 5, + patience: 100, + max_length_diff: 2, queue: BinaryHeap::new() } } + pub fn with_max_queue_size(mut self, v: usize) -> MatchFinder { + self.max_queue_size = v; + self + } + + pub fn with_patience(mut self, v: usize) -> MatchFinder { + self.patience = v; + self + } + + pub fn with_max_matches_per_length(mut self, v: usize) -> MatchFinder { + self.max_matches_per_length = v; + self + } + + pub fn with_max_length_diff(mut self, v: usize) -> MatchFinder { + self.max_length_diff = v; + self + } + pub fn matches(&mut self, pos: usize) -> Matches { let index = self.rev_suffixes[pos] as usize; self.queue.clear(); diff --git a/src/parsing_packer.rs b/src/parsing_packer.rs index 0a423d2..1cdb29f 100644 --- a/src/parsing_packer.rs +++ b/src/parsing_packer.rs @@ -1,12 +1,13 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; +use std::mem; use std::rc::Rc; -use crate::{ProgressCallback, lz}; use crate::match_finder::MatchFinder; use crate::rans::{CostCounter, RansCoder}; +use crate::{lz, ProgressCallback}; -pub fn pack(data: &[u8], progress_cb: Option) -> Vec { - let mut parse = parse(data, progress_cb); +pub fn pack(data: &[u8], level: u8, progress_cb: Option) -> Vec { + let mut parse = parse(data, Config::from_level(level), progress_cb); let mut ops = vec![]; while let Some(link) = parse { ops.push(link.op); @@ -34,27 +35,64 @@ struct Arrival { type Arrivals = HashMap>; -const MAX_ARRIVALS: usize = 256; - -fn parse(data: &[u8], mut progress_cb: Option) -> Option> { - let mut match_finder = MatchFinder::new(data); +fn parse( + data: &[u8], + config: Config, + mut progress_cb: Option, +) -> Option> { + let mut match_finder = MatchFinder::new(data) + .with_max_queue_size(config.max_queue_size) + .with_patience(config.patience) + .with_max_matches_per_length(config.max_matches_per_length) + .with_max_length_diff(config.max_length_diff); let mut near_matches = [usize::MAX; 1024]; let mut last_seen = [usize::MAX; 256]; + let max_arrivals = config.max_arrivals; + let mut arrivals: Arrivals = HashMap::new(); - fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival) { - let vec = arrivals.entry(pos).or_default(); - if vec.len() < MAX_ARRIVALS || vec[MAX_ARRIVALS - 1].cost > arrival.cost { - vec.push(arrival); - vec.sort_by(|a, b| { - a.cost - .partial_cmp(&b.cost) - .unwrap_or(std::cmp::Ordering::Equal) - }); - if vec.len() > MAX_ARRIVALS { - vec.pop(); + fn sort_arrivals(vec: &mut Vec, max_arrivals: usize) { + if max_arrivals == 0 { + return; + } + vec.sort_by(|a, b| { + a.cost + .partial_cmp(&b.cost) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let mut seen_offsets = HashSet::new(); + let mut remaining = Vec::new(); + for arr in mem::replace(vec, Vec::new()) { + if seen_offsets.insert(arr.state.last_offset()) { + if vec.len() < max_arrivals { + vec.push(arr); + } + } else { + remaining.push(arr); } } + for arr in remaining { + if vec.len() >= max_arrivals { + break; + } + vec.push(arr); + } + } + + fn add_arrival(arrivals: &mut Arrivals, pos: usize, arrival: Arrival, max_arrivals: usize) { + let vec = arrivals.entry(pos).or_default(); + if max_arrivals == 0 { + if vec.is_empty() { + vec.push(arrival); + } else if vec[0].cost > arrival.cost { + vec[0] = arrival; + } + return; + } + vec.push(arrival); + if vec.len() > max_arrivals * 2 { + sort_arrivals(vec, max_arrivals); + } } fn add_match( arrivals: &mut Arrivals, @@ -63,6 +101,7 @@ fn parse(data: &[u8], mut progress_cb: Option) -> Option) -> Option) -> Option) -> Option) -> Option (best_cost + 16.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) { + if arrival.cost + > (best_cost + config.max_cost_delta).min( + *best_per_offset.get(&arrival.state.last_offset()).unwrap() + + config.max_offset_cost_delta, + ) + { continue; } let mut found_last_offset = false; @@ -130,13 +177,21 @@ fn parse(data: &[u8], mut progress_cb: Option) -> Option 64 { + add_match( + &mut arrivals, + cost_counter, + pos, + offset, + m.length, + &arrival, + max_arrivals, + ); + if m.length >= config.greedy_size { break 'arrival_loop; } } - let mut near_matches_left = 8; + let mut near_matches_left = config.num_near_matches; let mut match_pos = last_seen[data[pos] as usize]; while near_matches_left > 0 && match_pos != usize::MAX @@ -145,7 +200,15 @@ fn parse(data: &[u8], mut progress_cb: Option) -> Option 0); - add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); + add_match( + &mut arrivals, + cost_counter, + pos, + offset, + length, + &arrival, + max_arrivals, + ); found_last_offset |= offset as u32 == arrival.state.last_offset(); if offset < near_matches.len() { match_pos = near_matches[match_pos % near_matches.len()]; @@ -157,7 +220,15 @@ fn parse(data: &[u8], mut progress_cb: Option) -> Option 0 { - add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); + add_match( + &mut arrivals, + cost_counter, + pos, + offset, + length, + &arrival, + max_arrivals, + ); } } @@ -176,6 +247,7 @@ fn parse(data: &[u8], mut progress_cb: Option) -> Option) -> Option Config { + let max_arrivals = match level { + 0..=1 => 0, + 2 => 2, + 3 => 4, + 4 => 8, + 5 => 16, + 6 => 32, + 7 => 64, + 8 => 96, + _ => 128, + }; + let (max_cost_delta, max_offset_cost_delta) = match level { + 0..=4 => (16.0, 0.0), + 5..=8 => (16.0, 4.0), + _ => (16.0, 8.0), + }; + let num_near_matches = level.saturating_sub(1) as usize; + let greedy_size = 4 + level as usize * level as usize * 3; + let max_length_diff = match level { + 0..=1 => 0, + 2..=3 => 1, + 4..=5 => 2, + 6..=7 => 3, + _ => 4, + }; + Config { + max_arrivals, + max_cost_delta, + max_offset_cost_delta, + num_near_matches, + greedy_size, + max_queue_size: level as usize * 100, + patience: level as usize * 100, + max_matches_per_length: level as usize, + max_length_diff, + } + } +}