diff --git a/Cargo.lock b/Cargo.lock index d93dffd..c3dd33f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,44 @@ dependencies = [ "sacabase", ] +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119" + [[package]] name = "num-traits" version = "0.2.14" @@ -39,6 +77,18 @@ dependencies = [ "autocfg", ] +[[package]] +name = "pbr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2" +dependencies = [ + "crossbeam-channel", + "libc", + "time", + "winapi", +] + [[package]] name = "pico-args" version = "0.4.2" @@ -54,11 +104,51 @@ dependencies = [ "num-traits", ] +[[package]] +name = "time" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +dependencies = [ + "libc", + "wasi", + "winapi", +] + [[package]] name = "upkr" version = "0.1.0" dependencies = [ "anyhow", "cdivsufsort", + "pbr", "pico-args", ] + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index 9269315..ec6ef90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,4 +8,5 @@ edition = "2021" [dependencies] cdivsufsort = "2" pico-args = "0.4" -anyhow = "1" \ No newline at end of file +anyhow = "1" +pbr = "1" \ No newline at end of file diff --git a/src/greedy_packer.rs b/src/greedy_packer.rs index 2b7450f..d31fa08 100644 --- a/src/greedy_packer.rs +++ b/src/greedy_packer.rs @@ -3,7 +3,7 @@ use crate::match_finder::MatchFinder; use crate::rans::RansCoder; pub fn pack(data: &[u8]) -> Vec { - let match_finder = MatchFinder::new(data); + let mut match_finder = MatchFinder::new(data); let mut rans_coder = RansCoder::new(); let mut state = lz::CoderState::new(); diff --git a/src/lib.rs b/src/lib.rs index 70b1b2b..b8db242 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,4 +7,6 @@ mod parsing_packer; pub use greedy_packer::pack as pack_fast; pub use parsing_packer::pack; -pub use lz::unpack; \ No newline at end of file +pub use lz::unpack; + +pub type ProgressCallback<'a> = &'a mut dyn FnMut(usize); \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 0e11b89..93a8c7d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,8 +18,18 @@ fn main() -> Result<()> { let packed_data = if fast { upkr::pack_fast(&data) } else { - upkr::pack(&data) + let mut pb = pbr::ProgressBar::new(data.len() as u64); + pb.set_units(pbr::Units::Bytes); + let packed_data = upkr::pack( + &data, + Some(&mut |pos| { + pb.set(pos as u64); + }), + ); + pb.finish(); + packed_data }; + println!("Compressed {} bytes to {} bytes ({}%)", data.len(), packed_data.len(), packed_data.len() as f32 * 100. / data.len() as f32); File::create(outfile)?.write_all(&packed_data)?; } Some("unpack") => { diff --git a/src/match_finder.rs b/src/match_finder.rs index fc8e57a..5b6a3f4 100644 --- a/src/match_finder.rs +++ b/src/match_finder.rs @@ -10,6 +10,8 @@ pub struct MatchFinder { max_matches_per_length: usize, patience: usize, max_length_diff: usize, + + queue: BinaryHeap } impl MatchFinder { @@ -43,15 +45,17 @@ impl MatchFinder { suffixes, rev_suffixes, lcp, - max_queue_size: 100, - max_matches_per_length: 5, - patience: 100, - max_length_diff: 2, + max_queue_size: 1000, + max_matches_per_length: 10, + patience: 1000, + max_length_diff: 4, + queue: BinaryHeap::new() } } - pub fn matches(&self, pos: usize) -> Matches { + pub fn matches(&mut self, pos: usize) -> Matches { let index = self.rev_suffixes[pos] as usize; + self.queue.clear(); let mut matches = Matches { finder: self, pos_range: 0..pos, @@ -62,7 +66,6 @@ impl MatchFinder { current_length: usize::MAX, matches_left: 0, max_length: 0, - queue: BinaryHeap::new(), }; matches.move_left(); @@ -73,7 +76,7 @@ impl MatchFinder { } pub struct Matches<'a> { - finder: &'a MatchFinder, + finder: &'a mut MatchFinder, pos_range: Range, left_index: usize, left_length: usize, @@ -82,7 +85,6 @@ pub struct Matches<'a> { current_length: usize, matches_left: usize, max_length: usize, - queue: BinaryHeap, } #[derive(Debug)] @@ -95,8 +97,8 @@ impl<'a> Iterator for Matches<'a> { type Item = Match; fn next(&mut self) -> Option { - if self.queue.is_empty() || self.matches_left == 0 { - self.queue.clear(); + if self.finder.queue.is_empty() || self.matches_left == 0 { + self.finder.queue.clear(); self.current_length = self.current_length.saturating_sub(1).min(self.left_length.max(self.right_length)); self.max_length = self.max_length.max(self.current_length); if self.current_length < 2 @@ -104,16 +106,16 @@ impl<'a> Iterator for Matches<'a> { { return None; } - while self.queue.len() < self.finder.max_queue_size + while self.finder.queue.len() < self.finder.max_queue_size && (self.left_length == self.current_length || self.right_length == self.current_length) { if self.left_length == self.current_length { - self.add_to_queue(self.finder.suffixes[self.left_index]); + self.finder.queue.push(self.finder.suffixes[self.left_index] as usize); self.move_left(); } if self.right_length == self.current_length { - self.add_to_queue(self.finder.suffixes[self.right_index]); + self.finder.queue.push(self.finder.suffixes[self.right_index] as usize); self.move_right(); } } @@ -122,7 +124,7 @@ impl<'a> Iterator for Matches<'a> { self.matches_left = self.matches_left.saturating_sub(1); - self.queue.pop().map(|pos| Match { + self.finder.queue.pop().map(|pos| Match { pos, length: self.current_length, }) @@ -168,8 +170,4 @@ impl<'a> Matches<'a> { } self.right_length = 0; } - - fn add_to_queue(&mut self, pos: i32) { - self.queue.push(pos as usize); - } } diff --git a/src/parsing_packer.rs b/src/parsing_packer.rs index 7837da7..0a423d2 100644 --- a/src/parsing_packer.rs +++ b/src/parsing_packer.rs @@ -1,12 +1,12 @@ use std::collections::HashMap; use std::rc::Rc; -use crate::lz; +use crate::{ProgressCallback, lz}; use crate::match_finder::MatchFinder; use crate::rans::{CostCounter, RansCoder}; -pub fn pack(data: &[u8]) -> Vec { - let mut parse = parse(data); +pub fn pack(data: &[u8], progress_cb: Option) -> Vec { + let mut parse = parse(data, progress_cb); let mut ops = vec![]; while let Some(link) = parse { ops.push(link.op); @@ -34,10 +34,10 @@ struct Arrival { type Arrivals = HashMap>; -const MAX_ARRIVALS: usize = 4; +const MAX_ARRIVALS: usize = 256; -fn parse(data: &[u8]) -> Option> { - let match_finder = MatchFinder::new(data); +fn parse(data: &[u8], mut progress_cb: Option) -> Option> { + let mut match_finder = MatchFinder::new(data); let mut near_matches = [usize::MAX; 1024]; let mut last_seen = [usize::MAX; 256]; @@ -58,18 +58,19 @@ fn parse(data: &[u8]) -> Option> { } fn add_match( arrivals: &mut Arrivals, + cost_counter: &mut CostCounter, pos: usize, offset: usize, length: usize, arrival: &Arrival, ) { - let mut cost_counter = CostCounter(0.); + cost_counter.reset(); let mut state = arrival.state.clone(); let op = lz::Op::Match { offset: offset as u32, len: length as u32, }; - op.encode(&mut cost_counter, &mut state); + op.encode(cost_counter, &mut state); add_arrival( arrivals, pos + length, @@ -79,7 +80,7 @@ fn parse(data: &[u8]) -> Option> { op, })), state, - cost: arrival.cost + cost_counter.0, + cost: arrival.cost + cost_counter.cost(), }, ); } @@ -92,6 +93,8 @@ fn parse(data: &[u8]) -> Option> { cost: 0.0, }, ); + + let cost_counter = &mut CostCounter::new(); let mut best_per_offset = HashMap::new(); for pos in 0..data.len() { let match_length = |offset: usize| { @@ -117,8 +120,8 @@ fn parse(data: &[u8]) -> Option> { *per_offset = per_offset.min(arrival.cost); } - for arrival in here_arrivals { - if arrival.cost > (best_cost + 32.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) { + 'arrival_loop: for arrival in here_arrivals { + if arrival.cost > (best_cost + 16.0).min(*best_per_offset.get(&arrival.state.last_offset()).unwrap()) { continue; } let mut found_last_offset = false; @@ -127,10 +130,13 @@ fn parse(data: &[u8]) -> Option> { closest_match = Some(closest_match.unwrap_or(0).max(m.pos)); let offset = pos - m.pos; found_last_offset |= offset as u32 == arrival.state.last_offset(); - add_match(&mut arrivals, pos, offset, m.length, &arrival); + add_match(&mut arrivals, cost_counter, pos, offset, m.length, &arrival); + if m.length > 64 { + break 'arrival_loop; + } } - let mut near_matches_left = 4; + let mut near_matches_left = 8; let mut match_pos = last_seen[data[pos] as usize]; while near_matches_left > 0 && match_pos != usize::MAX @@ -139,7 +145,7 @@ fn parse(data: &[u8]) -> Option> { let offset = pos - match_pos; let length = match_length(offset); assert!(length > 0); - add_match(&mut arrivals, pos, offset, length, &arrival); + add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); found_last_offset |= offset as u32 == arrival.state.last_offset(); if offset < near_matches.len() { match_pos = near_matches[match_pos % near_matches.len()]; @@ -151,14 +157,14 @@ fn parse(data: &[u8]) -> Option> { let offset = arrival.state.last_offset() as usize; let length = match_length(offset); if length > 0 { - add_match(&mut arrivals, pos, offset, length, &arrival); + add_match(&mut arrivals, cost_counter, pos, offset, length, &arrival); } } - let mut cost_counter = CostCounter(0.); + cost_counter.reset(); let mut state = arrival.state; let op = lz::Op::Literal(data[pos]); - op.encode(&mut cost_counter, &mut state); + op.encode(cost_counter, &mut state); add_arrival( &mut arrivals, pos + 1, @@ -168,12 +174,15 @@ fn parse(data: &[u8]) -> Option> { op, })), state, - cost: arrival.cost + cost_counter.0, + cost: arrival.cost + cost_counter.cost(), }, ); } near_matches[pos % near_matches.len()] = last_seen[data[pos] as usize]; last_seen[data[pos] as usize] = pos; + if let Some(ref mut cb) = progress_cb { + cb(pos + 1); + } } arrivals.remove(&data.len()).unwrap()[0].parse.clone() } diff --git a/src/rans.rs b/src/rans.rs index 1f4445a..bead28a 100644 --- a/src/rans.rs +++ b/src/rans.rs @@ -57,13 +57,42 @@ impl RansCoder { } } -pub struct CostCounter(pub f64); +pub struct CostCounter { + cost: f64, + log2_table: Vec, +} + +impl CostCounter { + pub fn new() -> CostCounter { + let log2_table = (0..ONE_PROB) + .map(|prob| { + let inv_prob = ONE_PROB as f64 / prob as f64; + inv_prob.log2() + }) + .collect(); + CostCounter { + cost: 0.0, + log2_table, + } + } + + pub fn cost(&self) -> f64 { + self.cost + } + + pub fn reset(&mut self) { + self.cost = 0.0; + } +} impl EntropyCoder for CostCounter { fn encode_bit(&mut self, bit: bool, prob: u16) { - let prob = if bit { prob as u32 } else { ONE_PROB - prob as u32 }; - let inv_prob = ONE_PROB as f64 / prob as f64; - self.0 += inv_prob.log2(); + let prob = if bit { + prob as u32 + } else { + ONE_PROB - prob as u32 + }; + self.cost += self.log2_table[prob as usize]; } }