From cab51e06ff542c7446f0790791cdc66b1b753b51 Mon Sep 17 00:00:00 2001 From: Dennis Ranke Date: Sun, 23 Oct 2022 23:06:09 +0200 Subject: [PATCH] implement heatmap calculation --- README.md | 12 ++++++++ src/heatmap.rs | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 ++- src/lz.rs | 31 +++++++++++++++++--- src/main.rs | 76 +++++++++++++++++++++++++++++++++++--------------- src/rans.rs | 24 ++++++++++++---- 6 files changed, 189 insertions(+), 32 deletions(-) create mode 100644 src/heatmap.rs diff --git a/README.md b/README.md index d22c401..54af2e6 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,13 @@ The 16 bit dos unpacker also uses some variations. (`upkr --x86`) ``` upkr [-l level(0-9)] [config options] [] upkr -u [config options] [] + upkr --heatmap [config options] [] upkr --margin [config options] -l, --level N compression level 0-9 -0, ..., -9 short form for setting compression level -u, --unpack unpack infile + --heatmap calculate heatmap from compressed file --margin calculate margin for overlapped unpacking of a packed file Config presets for specific unpackers: @@ -56,3 +58,13 @@ Config options to tailor output to specific optimized unpackers: --max-offset N --max-length N ``` + +## Heatmap + +By default, the `--heatmap` flag writes out the heatmap data as a binary file. The heatmap file is +the same size as the unpacked data. 
Each byte can be interpreted like this:
+
+```
+is_literal = byte & 1; // whether the byte was encoded as a literal (as opposed to a match)
+size_in_bits = 2.0 ** (((byte >> 1) - 64) / 8.0); // the size this byte takes up in the compressed data
+```
diff --git a/src/heatmap.rs b/src/heatmap.rs
new file mode 100644
index 0000000..9d507a6
--- /dev/null
+++ b/src/heatmap.rs
@@ -0,0 +1,93 @@
+/// Per-byte statistics collected while unpacking: the unpacked byte, its cost (in
+/// bits of packed data) and the index of the literal the byte originates from.
+pub struct Heatmap {
+    data: Vec<u8>,
+    cost: Vec<f32>,
+    literal_index: Vec<usize>,
+}
+
+impl Heatmap {
+    /// Creates an empty heatmap.
+    pub fn new() -> Heatmap {
+        Heatmap {
+            data: Vec::new(),
+            cost: Vec::new(),
+            literal_index: Vec::new(),
+        }
+    }
+
+    /// Appends a literal `byte` with the given `cost`; a literal is its own origin.
+    pub fn add_literal(&mut self, byte: u8, cost: f32) {
+        self.data.push(byte);
+        self.cost.push(cost);
+        self.literal_index.push(self.literal_index.len());
+    }
+
+    /// Appends `length` bytes copied from `offset` bytes back, splitting `cost` evenly
+    /// between them. Each copied byte inherits the origin literal of its source byte.
+    ///
+    /// Panics if `length` is non-zero and `offset` is zero or larger than `len()`.
+    pub fn add_match(&mut self, offset: usize, length: usize, mut cost: f32) {
+        cost /= length as f32;
+        for _ in 0..length {
+            self.data.push(self.data[self.data.len() - offset]);
+            self.literal_index
+                .push(self.literal_index[self.literal_index.len() - offset]);
+            self.cost.push(cost);
+        }
+    }
+
+    /// Balances costs between literals and the bytes copied from them: every byte takes
+    /// `(origin cost - own cost) / reference count of the origin` away from its origin.
+    ///
+    /// Has to run before `reverse`, as origins are expected at or before each byte.
+    pub fn finish(&mut self) {
+        let mut ref_count = vec![0usize; self.literal_index.len()];
+        for &index in &self.literal_index {
+            ref_count[index] += 1;
+        }
+
+        let mut shifted = vec![];
+        for (&index, &cost) in self.literal_index.iter().zip(self.cost.iter()) {
+            let delta = (self.cost[index] - cost) / ref_count[index] as f32;
+            shifted.push(delta);
+            shifted[index] -= delta;
+        }
+
+        for (cost, delta) in self.cost.iter_mut().zip(shifted.into_iter()) {
+            *cost += delta;
+        }
+    }
+
+    /// Reverses the byte order of the heatmap, remapping the origin indices so they
+    /// keep pointing at the same bytes.
+    pub fn reverse(&mut self) {
+        self.data.reverse();
+        self.cost.reverse();
+        self.literal_index.reverse();
+        for index in self.literal_index.iter_mut() {
+            // The element formerly at `i` now lives at `len - 1 - i`.
+            *index = self.data.len() - 1 - *index;
+        }
+    }
+
+    /// Number of bytes recorded.
+    pub fn len(&self) -> usize {
+        self.cost.len()
+    }
+
+    /// Whether the byte at `index` was encoded as a literal (as opposed to a match).
+    pub fn is_literal(&self, index: usize) -> bool {
+        self.literal_index[index] == index
+    }
+
+    /// Cost of the byte at `index`.
+    pub fn cost(&self, index: usize) -> f32 {
+        self.cost[index]
+    }
+
+    /// Value of the byte at `index`.
+    pub fn byte(&self, index: 
usize) -> u8 { + self.data[index] + } +} diff --git a/src/lib.rs b/src/lib.rs index 7728f6b..103a260 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,11 +1,13 @@ mod context_state; mod greedy_packer; +mod heatmap; mod lz; mod match_finder; mod parsing_packer; mod rans; -pub use lz::{calculate_margin, unpack, UnpackError}; +pub use heatmap::Heatmap; +pub use lz::{calculate_margin, create_heatmap, unpack, UnpackError}; pub type ProgressCallback<'a> = &'a mut dyn FnMut(usize); diff --git a/src/lz.rs b/src/lz.rs index 21699cc..d2f8e7a 100644 --- a/src/lz.rs +++ b/src/lz.rs @@ -1,4 +1,5 @@ use crate::context_state::ContextState; +use crate::heatmap::Heatmap; use crate::rans::{EntropyCoder, RansDecoder}; use crate::Config; use thiserror::Error; @@ -153,21 +154,32 @@ pub fn unpack( max_size: usize, ) -> Result, UnpackError> { let mut result = vec![]; - let _ = unpack_internal(Some(&mut result), packed_data, config, max_size)?; + let _ = unpack_internal(Some(&mut result), None, packed_data, config, max_size)?; Ok(result) } pub fn calculate_margin(packed_data: &[u8], config: &Config) -> Result { - unpack_internal(None, packed_data, config, usize::MAX) + unpack_internal(None, None, packed_data, config, usize::MAX) } -pub fn unpack_internal( +pub fn create_heatmap( + packed_data: &[u8], + config: &Config, + max_size: usize, +) -> Result { + let mut heatmap = Heatmap::new(); + let _ = unpack_internal(None, Some(&mut heatmap), packed_data, config, max_size)?; + Ok(heatmap) +} + +fn unpack_internal( mut result: Option<&mut Vec>, + mut heatmap: Option<&mut Heatmap>, packed_data: &[u8], config: &Config, max_size: usize, ) -> Result { - let mut decoder = RansDecoder::new(packed_data, &config); + let mut decoder = RansDecoder::new(packed_data, &config)?; let mut contexts = ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64, &config); let mut offset = usize::MAX; let mut position = 0usize; @@ -198,6 +210,7 @@ pub fn unpack_internal( } loop { + let prev_decoder = 
decoder.clone(); margin = margin.max(position as isize - decoder.pos() as isize); let literal_base = position % config.parity_contexts * 256; if decoder.decode_with_context(&mut contexts.context_mut(literal_base))? @@ -231,6 +244,9 @@ pub fn unpack_internal( if offset > position { return Err(UnpackError::OffsetOutOfRange { offset, position }); } + if let Some(ref mut heatmap) = heatmap { + heatmap.add_match(offset, length, decoder.cost(&prev_decoder)); + } if let Some(ref mut result) = result { for _ in 0..length { if result.len() < max_size { @@ -251,6 +267,9 @@ pub fn unpack_internal( context_index = (context_index << 1) | bit as usize; byte |= (bit as u8) << i; } + if let Some(ref mut heatmap) = heatmap { + heatmap.add_literal(byte, decoder.cost(&prev_decoder)); + } if let Some(ref mut result) = result { if result.len() < max_size { result.push(byte); @@ -261,6 +280,10 @@ pub fn unpack_internal( } } + if let Some(heatmap) = heatmap { + heatmap.finish(); + } + if position > max_size { return Err(UnpackError::OverSize { size: position, diff --git a/src/main.rs b/src/main.rs index 5135f42..242f3c5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,7 @@ fn main() -> Result<()> { let mut reverse = false; let mut unpack = false; let mut calculate_margin = false; + let mut create_heatmap = false; let mut level = 2; let mut infile: Option = None; let mut outfile: Option = None; @@ -58,6 +59,7 @@ fn main() -> Result<()> { Short('u') | Long("unpack") => unpack = true, Long("margin") => calculate_margin = true, + Long("heatmap") => create_heatmap = true, Short('l') | Long("level") => level = parser.value()?.parse()?, Short(n) if n.is_ascii_digit() => level = n as u8 - b'0', Short('h') | Long("help") => print_help(0), @@ -73,33 +75,46 @@ fn main() -> Result<()> { } let infile = infile.unwrap_or_else(|| print_help(1)); - let outfile = outfile.unwrap_or_else(|| { - let mut name = infile.clone(); - if unpack { - if name.extension().filter(|&e| e == "upk").is_some() { - 
name.set_extension(""); - } else { - name.set_extension("bin"); + enum OutFileType { + Packed, + Unpacked, + Heatmap, + } + let outfile = |tpe: OutFileType| { + outfile.clone().unwrap_or_else(|| { + let mut name = infile.clone(); + match tpe { + OutFileType::Packed => { + let mut filename = name + .file_name() + .unwrap_or_else(|| OsStr::new("")) + .to_os_string(); + filename.push(".upk"); + name.set_file_name(filename); + } + OutFileType::Unpacked => { + if name.extension().filter(|&e| e == "upk").is_some() { + name.set_extension(""); + } else { + name.set_extension("bin"); + } + } + OutFileType::Heatmap => { + name.set_extension("heatmap"); + } } - } else { - let mut filename = name - .file_name() - .unwrap_or_else(|| OsStr::new("")) - .to_os_string(); - filename.push(".upk"); - name.set_file_name(filename); - } - name - }); + name + }) + }; if config.parity_contexts != 1 && config.parity_contexts != 2 && config.parity_contexts != 4 { eprintln!("--parity has to be 1, 2, or 4"); process::exit(1); } - if !unpack && !calculate_margin { + if !unpack && !calculate_margin && !create_heatmap { let mut data = vec![]; - File::open(infile)?.read_to_end(&mut data)?; + File::open(&infile)?.read_to_end(&mut data)?; if reverse { data.reverse(); } @@ -126,10 +141,10 @@ fn main() -> Result<()> { packed_data.len(), packed_data.len() as f32 * 100. 
/ data.len() as f32 ); - File::create(outfile)?.write_all(&packed_data)?; + File::create(outfile(OutFileType::Packed))?.write_all(&packed_data)?; } else { let mut data = vec![]; - File::open(infile)?.read_to_end(&mut data)?; + File::open(&infile)?.read_to_end(&mut data)?; if reverse { data.reverse(); } @@ -138,7 +153,22 @@ fn main() -> Result<()> { if reverse { unpacked_data.reverse(); } - File::create(outfile)?.write_all(&unpacked_data)?; + File::create(outfile(OutFileType::Unpacked))?.write_all(&unpacked_data)?; + } + if create_heatmap { + let mut heatmap = upkr::create_heatmap(&data, &config, max_unpacked_size)?; + if reverse { + heatmap.reverse(); + } + let mut heatmap_bin = Vec::with_capacity(heatmap.len()); + for i in 0..heatmap.len() { + let cost = (heatmap.cost(i).log2() * 8. + 64.) + .round() + .max(0.) + .min(127.) as u8; + heatmap_bin.push((cost << 1) | heatmap.is_literal(i) as u8); + } + File::create(outfile(OutFileType::Heatmap))?.write_all(&heatmap_bin)?; } if calculate_margin { println!("{}", upkr::calculate_margin(&data, &config)?); @@ -152,11 +182,13 @@ fn print_help(exit_code: i32) -> ! 
{ eprintln!("Usage:"); eprintln!(" upkr [-l level(0-9)] [config options] []"); eprintln!(" upkr -u [config options] []"); + eprintln!(" upkr --heatmap [config options] []"); eprintln!(" upkr --margin [config options] "); eprintln!(); eprintln!(" -l, --level N compression level 0-9"); eprintln!(" -0, ..., -9 short form for setting compression level"); eprintln!(" -u, --unpack unpack infile"); + eprintln!(" --heatmap calculate heatmap from compressed file"); eprintln!(" --margin calculate margin for overlapped unpacking of a packed file"); eprintln!(); eprintln!("Version: {}", env!("CARGO_PKG_VERSION")); diff --git a/src/rans.rs b/src/rans.rs index d1a24b6..2a0405c 100644 --- a/src/rans.rs +++ b/src/rans.rs @@ -148,6 +148,7 @@ impl EntropyCoder for CostCounter { } } +#[derive(Clone)] pub struct RansDecoder<'a> { data: &'a [u8], pos: usize, @@ -166,8 +167,8 @@ const PROB_MASK: u32 = ONE_PROB - 1; pub struct UnexpectedEOF; impl<'a> RansDecoder<'a> { - pub fn new(data: &'a [u8], config: &Config) -> RansDecoder<'a> { - RansDecoder { + pub fn new(data: &'a [u8], config: &Config) -> Result, UnexpectedEOF> { + let mut decoder = RansDecoder { data, pos: 0, state: 0, @@ -176,7 +177,9 @@ impl<'a> RansDecoder<'a> { bits_left: 0, invert_bit_encoding: config.invert_bit_encoding, bitstream_is_big_endian: config.bitstream_is_big_endian, - } + }; + decoder.refill()?; + Ok(decoder) } pub fn pos(&self) -> usize { @@ -189,8 +192,7 @@ impl<'a> RansDecoder<'a> { Ok(bit) } - pub fn decode_bit(&mut self, prob: u16) -> Result { - let prob = prob as u32; + fn refill(&mut self) -> Result<(), UnexpectedEOF> { if self.use_bitstream { while self.state < 32768 { if self.bits_left == 0 { @@ -219,6 +221,13 @@ impl<'a> RansDecoder<'a> { self.pos += 1; } } + Ok(()) + } + + pub fn decode_bit(&mut self, prob: u16) -> Result { + self.refill()?; + + let prob = prob as u32; let bit = (self.state & PROB_MASK) < prob; @@ -231,4 +240,9 @@ impl<'a> RansDecoder<'a> { Ok(bit ^ self.invert_bit_encoding) } + + 
pub fn cost(&self, prev: &RansDecoder) -> f32 { + f32::log2(prev.state as f32) - f32::log2(self.state as f32) + + (self.pos - prev.pos) as f32 * 8. + } }