implement dictionary support when packing

This commit is contained in:
2025-03-11 10:52:24 +01:00
parent 88cbda559c
commit 130bf821fa
5 changed files with 19 additions and 3 deletions

BIN
README.md.upk Normal file

Binary file not shown.

View File

@@ -12,7 +12,7 @@ pub fn pack(
let mut rans_coder = RansCoder::new(config);
let mut state = lz::CoderState::new(config);
let mut pos = 0;
let mut pos = config.dictionary_size;
while pos < data.len() {
if let Some(ref mut cb) = progress_callback {
cb(pos);

View File

@@ -71,6 +71,9 @@ pub struct Config {
pub max_offset: usize,
/// The maximum match length value to encode when compressing.
pub max_length: usize,
/// Size of the dictionary at the beginning of the data (the number of bytes to skip when compressing).
pub dictionary_size: usize,
}
impl Default for Config {
@@ -92,6 +95,8 @@ impl Default for Config {
max_offset: usize::MAX,
max_length: usize::MAX,
dictionary_size: 0,
}
}
}

View File

@@ -16,6 +16,7 @@ fn main() -> Result<()> {
let mut level = 2;
let mut infile: Option<PathBuf> = None;
let mut outfile: Option<PathBuf> = None;
let mut dictionary: Option<PathBuf> = None;
let mut max_unpacked_size = 512 * 1024 * 1024;
let mut parser = lexopt::Parser::from_env();
@@ -74,6 +75,7 @@ fn main() -> Result<()> {
process::exit(0);
}
Long("max-unpacked-size") => max_unpacked_size = parser.value()?.parse()?,
Long("dictionary") => dictionary = Some(parser.value()?.try_into()?),
Value(val) if infile.is_none() => infile = Some(val.try_into()?),
Value(val) if outfile.is_none() => outfile = Some(val.try_into()?),
_ => return Err(arg.unexpected().into()),
@@ -94,6 +96,15 @@ fn main() -> Result<()> {
data.reverse();
}
if let Some(dictionary) = dictionary {
let mut dict = vec![];
File::open(dictionary)?.read_to_end(&mut dict)?;
config.dictionary_size = dict.len();
// prepend dict
dict.append(&mut data);
data = dict;
}
#[cfg(feature = "terminal")]
let mut packed_data = {
let mut pb = pbr::ProgressBar::on(std::io::stderr(), data.len() as u64);

View File

@@ -137,7 +137,7 @@ fn parse(
}
add_arrival(
&mut arrivals,
0,
encoding_config.dictionary_size,
Arrival {
parse: None,
state: lz::CoderState::new(encoding_config),
@@ -148,7 +148,7 @@ fn parse(
let cost_counter = &mut CostCounter::new(encoding_config);
let mut best_per_offset = HashMap::new();
for pos in 0..data.len() {
for pos in encoding_config.dictionary_size..data.len() {
let match_length = |offset: usize| {
data[pos..]
.iter()