add missing fields to Cargo.toml

prepare to publish crate / apply clippy fixes
add highlevel description of compressed format in unpack.c
2026-01-20 11:36:42 +01:00 · 2025-07-16 21:16:44 +02:00 · 2025-07-16 21:05:57 +02:00 · 2024-05-25 22:02:47 +02:00 · 2024-05-15 10:56:25 +02:00 · 2024-04-10 23:53:46 +02:00
60 changed files with 4218 additions and 223 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,24 +1,33 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 [[package]]
 name = "anyhow"
-version = "1.0.47"
+version = "1.0.98"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e"
+checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
 [[package]]
 name = "autocfg"
-version = "1.0.1"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 [[package]]
 name = "bitflags"
 version = "2.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
 [[package]]
 name = "cc"
-version = "1.0.72"
+version = "1.2.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
+checksum = "5c1599538de2394445747c8cf7935946e3cc27e9625f889d979bfb2aaf569362"
 dependencies = [
 "shlex",
 ]
 [[package]]
 name = "cdivsufsort"
@@ -32,68 +41,172 @@ dependencies = [
 [[package]]
 name = "cfg-if"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.1"
+version = "0.5.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
+checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
 dependencies = [
 "cfg-if",
 "crossbeam-utils",
 ]
 [[package]]
 name = "crossbeam-utils"
-version = "0.8.5"
+version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
 [[package]]
 name = "crossterm"
 version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b"
 dependencies = [
- "cfg-if",
+ "bitflags",
- "lazy_static",
+ "document-features",
 "parking_lot",
 "rustix",
 ]
 [[package]]
-name = "lazy_static"
+name = "document-features"
-version = "1.4.0"
+version = "0.2.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+checksum = "95249b50c6c185bee49034bcb378a49dc2b5dff0be90ff6616d31d64febab05d"
 dependencies = [
 "litrs",
 ]
 [[package]]
 name = "errno"
 version = "0.3.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
 dependencies = [
 "libc",
 "windows-sys",
 ]
 [[package]]
 name = "lexopt"
 version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9fa0e2a1fcbe2f6be6c42e342259976206b383122fc152e872795338b5a3f3a7"
 [[package]]
 name = "libc"
-version = "0.2.108"
+version = "0.2.174"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119"
+checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
 [[package]]
 name = "linux-raw-sys"
 version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
 [[package]]
 name = "litrs"
 version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5"
 [[package]]
 name = "lock_api"
 version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
 dependencies = [
 "autocfg",
 "scopeguard",
 ]
 [[package]]
 name = "num-traits"
-version = "0.2.14"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
 "autocfg",
 ]
 [[package]]
-name = "pbr"
+name = "parking_lot"
-version = "1.0.4"
+version = "0.12.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2"
+checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13"
 dependencies = [
 "lock_api",
 "parking_lot_core",
 ]
 [[package]]
 name = "parking_lot_core"
 version = "0.9.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5"
 dependencies = [
 "cfg-if",
 "libc",
 "redox_syscall",
 "smallvec",
 "windows-targets 0.52.6",
 ]
 [[package]]
 name = "pbr"
 version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed5827dfa0d69b6c92493d6c38e633bbaa5937c153d0d7c28bf12313f8c6d514"
 dependencies = [
 "crossbeam-channel",
 "libc",
 "time",
 "winapi",
 ]
 [[package]]
-name = "pico-args"
+name = "proc-macro2"
-version = "0.4.2"
+version = "1.0.95"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db8bcd96cb740d03149cbad5518db9fd87126a10ab519c011893b1754134c468"
+checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "quote"
 version = "1.0.40"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "redox_syscall"
 version = "0.5.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6"
 dependencies = [
 "bitflags",
 ]
 [[package]]
 name = "rustix"
 version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8"
 dependencies = [
 "bitflags",
 "errno",
 "libc",
 "linux-raw-sys",
 "windows-sys",
 ]
 [[package]]
 name = "sacabase"
@@ -105,32 +218,72 @@ dependencies = [
 ]
 [[package]]
-name = "time"
+name = "scopeguard"
-version = "0.1.44"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 [[package]]
 name = "shlex"
 version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 [[package]]
 name = "smallvec"
 version = "1.15.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
 [[package]]
 name = "syn"
 version = "2.0.104"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40"
 dependencies = [
- "libc",
+ "proc-macro2",
- "wasi",
+ "quote",
- "winapi",
+ "unicode-ident",
 ]
 [[package]]
 name = "thiserror"
 version = "2.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
 dependencies = [
 "thiserror-impl",
 ]
 [[package]]
 name = "thiserror-impl"
 version = "2.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
 [[package]]
 name = "upkr"
-version = "0.1.0"
+version = "0.2.3"
 dependencies = [
 "anyhow",
 "cdivsufsort",
 "crossterm",
 "lexopt",
 "pbr",
- "pico-args",
+ "thiserror",
 ]
 [[package]]
 name = "wasi"
 version = "0.10.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -152,3 +305,140 @@ name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 [[package]]
 name = "windows-sys"
 version = "0.60.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
 dependencies = [
 "windows-targets 0.53.2",
 ]
 [[package]]
 name = "windows-targets"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
 dependencies = [
 "windows_aarch64_gnullvm 0.52.6",
 "windows_aarch64_msvc 0.52.6",
 "windows_i686_gnu 0.52.6",
 "windows_i686_gnullvm 0.52.6",
 "windows_i686_msvc 0.52.6",
 "windows_x86_64_gnu 0.52.6",
 "windows_x86_64_gnullvm 0.52.6",
 "windows_x86_64_msvc 0.52.6",
 ]
 [[package]]
 name = "windows-targets"
 version = "0.53.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef"
 dependencies = [
 "windows_aarch64_gnullvm 0.53.0",
 "windows_aarch64_msvc 0.53.0",
 "windows_i686_gnu 0.53.0",
 "windows_i686_gnullvm 0.53.0",
 "windows_i686_msvc 0.53.0",
 "windows_x86_64_gnu 0.53.0",
 "windows_x86_64_gnullvm 0.53.0",
 "windows_x86_64_msvc 0.53.0",
 ]
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 [[package]]
 name = "windows_i686_gnu"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 [[package]]
 name = "windows_i686_msvc"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,12 +1,21 @@
 [package]
 name = "upkr"
-version = "0.1.0"
+version = "0.2.3"
-edition = "2021"
+edition = "2024"
 description = "Simple LZ packer with relatively small unpackers"
 license = "Unlicense"
 repository = "https://github.com/exoticorn/upkr"
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[profile.release]
 strip = "debuginfo"
 [features]
 terminal = ["crossterm", "pbr"]
 [dependencies]
 cdivsufsort = "2"
-pico-args = "0.4"
+lexopt = "0.3.1"
 anyhow = "1"
-pbr = "1"
+thiserror = "2.0.12"
 pbr = { version = "1", optional = true }
 crossterm = { version = "0.29.0", default-features = false, optional = true }
--- a/README.md
+++ b/README.md
@@ -2,13 +2,83 @@
 Upkr is a simple general purpose lz packer designed to be used in the [MicroW8](https://github.com/exoticorn/microw8) platform.
 The compressed format is loosely based on [Shrinkler](https://github.com/askeksa/Shrinkler) with the main difference being that
-Upkr doesn't differnetiate between literals at odd or even addresses and that I went with rANS/rABS instead of a range coder.
+Upkr doesn't differentiate between literals at odd or even addresses (by default) and that I went with rANS/rABS instead of a range coder.
-At this point, Upkr should still be considered unstable - the compressed format is not very likely to change but I still want
+Compression rate is on par with Shrinkler.
-to keep that option open a little longer.
+
 The differences compared to Shrinkler also make it interesting on 8-bit platforms. The z80 unpacker included in the release
 is both about twice as fast and smaller than the Shrinkler unpacker.
 ## Inspirations:
 * Ferris' blog about his [C64 intro packer](https://yupferris.github.io/blog/2020/08/31/c64-4k-intro-packer-deep-dive.html)
 * [Shrinkler](https://github.com/askeksa/Shrinkler)
 * Ryg's [sample rANS implementation](https://github.com/rygorous/ryg_rans)
 ## Unpackers
 The release includes a reference c unpacker, as well as some optimized asm unpackers (arm and riscv). The unpackers in
 c_unpacker and asm_unpackers unpack the default upkr compressed format. The z80_unpacker
 is based on some variations to the compressed format. (Use `upkr --z80` to select those variations.)
 The 16 bit dos unpacker also uses some variations. (`upkr --x86`)
 ### More unpackers outside this repository
 * [Atari Lynx](https://github.com/42Bastian/new_bll/blob/master/demos/depacker/unupkr.asm)
 * [Atari Jaguar](https://github.com/42Bastian/new_bjl/blob/main/exp/depacker/unupkr.js)
 * [8080, R800](https://github.com/ivagorRetrocomp/DeUpkr)
 * [6502](https://github.com/pfusik/upkr6502)
 ## Usage
 ```
  upkr [-l level(0-9)] [config options] <infile> [<outfile>]
  upkr -u [config options] <infile> [<outfile>]
  upkr --heatmap [config options] <infile> [<outfile>]
  upkr --margin [config options] <infile>
 -l, --level N       compression level 0-9
 -0, ..., -9         short form for setting compression level
 -u, --unpack        unpack infile
 --heatmap           calculate heatmap from compressed file
   --raw-cost        report raw cost of literals in heatmap
                     (the cost of literals is spread across all matches
                     that reference the literal by default.)
   --hexdump         print heatmap as colored hexdump
 --margin            calculate margin for overlapped unpacking of a packed file
 When no infile is given, or the infile is '-', read from stdin.
 When no outfile is given and reading from stdin, or when outfile is '-', write to stdout.
 Config presets for specific unpackers:
 --z80               --big-endian-bitstream --invert-bit-encoding --simplified-prob-update -9
 --x86               --bitstream --invert-is-match-bit --invert-continue-value-bit --invert-new-offset-bit
 --x86b              --bitstream --invert-continue-value-bit --no-repeated-offsets -9
 Config options (need to match when packing/unpacking):
 -b, --bitstream     bitstream mode
 -p, --parity N      use N (2/4) parity contexts
 -r, --reverse       reverse input & output
 Config options to tailor output to specific optimized unpackers:
 --invert-is-match-bit
 --invert-new-offset-bit
 --invert-continue-value-bit
 --invert-bit-encoding
 --simplified-prob-update
 --big-endian-bitstream   (implies --bitstream)
 --no-repeated-offsets
 --eof-in-length
 --max-offset N
 --max-length N
 ```
 ## Heatmap
 By default, the `--heatmap` flag writes out the heatmap data as a binary file. The heatmap file is
 the same size as the unpacked data. Each byte can be interpreted like this:
 ```
 is_literal = byte & 1; // whether the byte was encoded as a literal (as opposed to a match)
 size_in_bits = 2.0 ** (((byte >> 1) - 64) / 8.0); // the size this byte takes up in the compressed data
 ```
--- a/asm_unpackers/.gitignore
+++ b/asm_unpackers/.gitignore
@@ -0,0 +1 @@
 /build/
--- a/asm_unpackers/Makefile
+++ b/asm_unpackers/Makefile
@@ -0,0 +1,71 @@
 # Build, test and inspection rules for the assembly unpackers.
 # Every test_* target unpacks test_data.upk and compares the result
 # byte-for-byte with test_data.bin.
 #
 # The targets below are commands, not files; declaring them phony keeps them
 # working even if a file with the same name ever appears in this directory.
 .PHONY: test_riscv64 disas-riscv64 disas-riscv32 test_armv6m test_arm32 test_c sizes
 # --- RISC-V -----------------------------------------------------------------
 # Test executable: the shared C driver linked with the assembly unpacker.
 build/unpack_riscv64: ../c_unpacker/main.c unpack_riscv.S
	mkdir -p build
	riscv64-linux-gnu-gcc -g -static -o $@ $^
 test_riscv64: build/unpack_riscv64
	qemu-riscv64 $< test_data.upk /tmp/out.bin
	cmp test_data.bin /tmp/out.bin
 # Object file and raw .text image of the 64 bit build.
 build/unpack_riscv64.o: unpack_riscv.S
	mkdir -p build
	riscv64-linux-gnu-gcc -c -o $@ $<
 build/unpack_riscv64.bin: build/unpack_riscv64.o
	riscv64-linux-gnu-objcopy -O binary --only-section=.text $< $@
 disas-riscv64: build/unpack_riscv64.o
	riscv64-linux-gnu-objdump -d $<
 # 32 bit build (-march=rv32imc).
 build/unpack_riscv32.o: unpack_riscv.S
	mkdir -p build
	riscv64-linux-gnu-gcc -march=rv32imc -mabi=ilp32 -c -o $@ $<
 build/unpack_riscv32.bin: build/unpack_riscv32.o
	riscv64-linux-gnu-objcopy -O binary --only-section=.text $< $@
 # 32 bit build (-march=rv32im).
 build/unpack_riscv32nc.o: unpack_riscv.S
	mkdir -p build
	riscv64-linux-gnu-gcc -march=rv32im -mabi=ilp32 -c -o $@ $<
 build/unpack_riscv32nc.bin: build/unpack_riscv32nc.o
	riscv64-linux-gnu-objcopy -O binary --only-section=.text $< $@
 disas-riscv32: build/unpack_riscv32.o
	riscv64-linux-gnu-objdump -d $<
 # --- ARMv6-M ----------------------------------------------------------------
 build/unpack_armv6m: ../c_unpacker/main.c unpack_armv6m.S
	mkdir -p build
	arm-linux-gnueabihf-gcc -g -static -o $@ $^
 test_armv6m: build/unpack_armv6m
	qemu-arm $< test_data.upk /tmp/out.bin
	cmp test_data.bin /tmp/out.bin
 build/unpack_armv6m.bin: unpack_armv6m.S
	mkdir -p build
	arm-none-eabi-gcc -march=armv6-m -c -o build/unpack_armv6m.o $<
	arm-none-eabi-objcopy -O binary --only-section=.text build/unpack_armv6m.o $@
 # --- ARM32 ------------------------------------------------------------------
 build/unpack_arm32: ../c_unpacker/main.c unpack_arm32.S
	mkdir -p build
	arm-linux-gnueabihf-gcc -g -static -o $@ $^
 test_arm32: build/unpack_arm32
	qemu-arm $< test_data.upk /tmp/out.bin
	cmp test_data.bin /tmp/out.bin
 build/unpack_arm32.bin: unpack_arm32.S
	mkdir -p build
	arm-none-eabi-gcc -c -o build/unpack_arm32.o $<
	arm-none-eabi-objcopy -O binary --only-section=.text build/unpack_arm32.o $@
 # --- Reference C unpacker, built natively -----------------------------------
 build/unpack_c: ../c_unpacker/main.c ../c_unpacker/unpack.c
	mkdir -p build
	gcc -g -o $@ $^
 test_c: build/unpack_c
	$< test_data.upk /tmp/out.bin
	cmp test_data.bin /tmp/out.bin
 # List the sizes of the raw unpacker images.
 sizes: build/unpack_armv6m.bin build/unpack_riscv64.bin build/unpack_riscv32.bin build/unpack_arm32.bin
	ls -l build/*.bin
--- a/asm_unpackers/test_data.bin
+++ b/asm_unpackers/test_data.bin
@@ -0,0 +1,99 @@
 typedef unsigned char u8;
 typedef unsigned short u16;
 typedef unsigned long u32;
 u8* upkr_data_ptr;
 u8 upkr_probs[1 + 255 + 1 + 2*32 + 2*32]; 
 #ifdef UPKR_BITSTREAM
 u16 upkr_state;
 u8 upkr_current_byte;
 int upkr_bits_left;
 #else
 u32 upkr_state;
 #endif
 int upkr_decode_bit(int context_index) {
 #ifdef UPKR_BITSTREAM
    while(upkr_state < 32768) {
        if(upkr_bits_left == 0) {
            upkr_current_byte = *upkr_data_ptr++;
            upkr_bits_left = 8;
        }
        upkr_state = (upkr_state << 1) + (upkr_current_byte & 1);
        upkr_current_byte >>= 1;
        --upkr_bits_left;
    }
 #else
    while(upkr_state < 4096) {
        upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
    }
 #endif
    int prob = upkr_probs[context_index];
    int bit = (upkr_state & 255) < prob ? 1 : 0;
    int tmp = prob;
    if(!bit) {
        tmp = 256 - tmp;
    }
    upkr_state = tmp * (upkr_state >> 8) + (upkr_state & 255);
    tmp += (256 - tmp + 8) >> 4;
    if(!bit) {
        upkr_state -= prob;
        tmp = 256 - tmp;
    }
    upkr_probs[context_index] = tmp;
    return bit;
 }
 int upkr_decode_length(int context_index) {
    int length = 0;
    int bit_pos = 0;
    while(upkr_decode_bit(context_index)) {
        length |= upkr_decode_bit(context_index + 1) << bit_pos++;
        context_index += 2;
    }
    return length | (1 << bit_pos);
 }
 void* upkr_unpack(void* destination, void* compressed_data) {
    upkr_data_ptr = (u8*)compressed_data;
    upkr_state = 0;
 #ifdef UPKR_BITSTREAM
    upkr_bits_left = 0;
 #endif
    for(int i = 0; i < sizeof(upkr_probs); ++i)
        upkr_probs[i] = 128;
    u8* write_ptr = (u8*)destination;
    int prev_was_match = 0;
    int offset = 0;
    for(;;) {
        if(upkr_decode_bit(0)) {
            if(prev_was_match || upkr_decode_bit(256)) {
                offset = upkr_decode_length(257) - 1;
                if(offset == 0) {
                    break;
                }
            }
            int length = upkr_decode_length(257 + 64);
            while(length--) {
                *write_ptr = write_ptr[-offset];
                ++write_ptr;
            }
            prev_was_match = 1;
        } else {
            int byte = 1;
            while(byte < 256) {
                int bit = upkr_decode_bit(byte);
                byte = (byte << 1) + bit;
            }
            *write_ptr++ = byte;
            prev_was_match = 0;
        }
    }
    return write_ptr;
 }
--- a/asm_unpackers/test_data.upk
+++ b/asm_unpackers/test_data.upk
--- a/asm_unpackers/unpack_arm32.S
+++ b/asm_unpackers/unpack_arm32.S
@@ -0,0 +1,100 @@
 .arm
 .section .text
 .global upkr_unpack
 .type upkr_unpack, %function
 // r0 .. out_ptr (returned)
 // r1 .. in_ptr (returned)
 // r2 .. state
 // r3 .. offset
 // r4 .. prev_was_literal / decode_length ret
 // r5 .. context index
 // r6 .. decode_length temp
 // r7 .. probs ptr
 // r8-r11 .. decode_bit temp
 // r12 .. decode_length return address
 // Entry point: unpacks the compressed stream at r1 to the buffer at r0.
 // The probability bytes live on the stack *below* sp and are addressed as
 // [sp, -context]; sp itself is never lowered. Context 0 is [sp], the slot
 // written by the r3 push.
 // NOTE(review): memory below sp is only safe if nothing asynchronous (signal
 // or interrupt handler) uses this stack while unpacking - confirm for the target.
 // NOTE(review): the clear loop initialises contexts 0..383, one fewer than the
 // 1 + 255 + 1 + 2*32 + 2*32 entries of the C unpacker's table - presumably the
 // last context is never reached; confirm.
 upkr_unpack:
    push { r3-r11, lr }
    mov r2, #384                // loop counter; it ends at 0, which is also the initial state
    mov r3, #128                // initial value of every probability
 .Lclear:
    subs r2, r2, #1
    strb r3, [sp, -r2]          // probs[r2] = 128 for r2 = 383 .. 0
    bne .Lclear
 .Lloop:
    mov r5, #0                  // context 0: literal/match flag
    bl upkr_decode_bit          // the decoded bit comes back in the carry flag
    bcc .Ldata                  // 0 -> literal
 .Lmatch:
    mov r5, #256                // context 256: "new offset" flag
    rsbs r6, r4, #0             // carry set only when r4 == 0, i.e. the previous symbol was a match
    blcc upkr_decode_bit        // after a literal: read the flag; after a match a new offset is implied
    bcc .Lskip_offset           // flag was 0 -> reuse the previous offset
    bl upkr_decode_length       // r4 = -(offset + 1)
    adds r3, r4, #1             // r3 = -offset
    popeq { r3-r11, pc }        // offset 0 marks the end of the stream
 .Lskip_offset:
    mov r5, #256+64             // upkr_decode_length pre-increments, so the first context used is 257+64
    bl upkr_decode_length       // r4 = -length
 .Lcopy_loop:
    ldrb r5, [r0, r3]           // fetch the byte `offset` bytes behind the write position
 .Lstore:
    strb r5, [r0], #1           // shared store: match byte or finished literal (low 8 bits of r5)
    adds r4, r4, #1             // count the negative length up towards 0
    blt .Lcopy_loop
    b .Lloop                    // r4 is 0 after a match and 2 after a literal
 .Ldata:
    mov r5, #1                  // marker bit; the literal is complete once it reaches bit 8
 .Ldata_loop:
    bl upkr_decode_bit          // context = bits decoded so far (r5)
    adc r5, r5, r5              // shift the new bit in from carry
    movs r4, r5, lsr #8         // r4 becomes 1 (non-zero) when all 8 bits are in
    beq .Ldata_loop
    b .Lstore
 // Decodes a variable-length number, starting with context r5 + 1.
 // Returns the *negated* value in r4. r5 advances by one per decoded bit.
 // Uses r6 and r12 in addition to the registers upkr_decode_bit touches.
 .type upkr_decode_length, %function
 upkr_decode_length:
    mov r12, lr                 // keep the return address; the nested bl calls overwrite lr
    mov r4, #0                  // negated sum of the value bits
    mvn r6, #0                  // r6 = -1 = -(1 << bit position)
 .Lbit_loop:
    bl upkr_decode_bit_inc      // continue flag
    addcc r4, r4, r6            // flag 0: add the implicit top bit ...
    movcc pc, r12               // ... and return
    bl upkr_decode_bit_inc      // value bit
    addcs r4, r4, r6
    mov r6, r6, lsl #1          // move on to the next bit position
    b .Lbit_loop
 // Decodes one bit with the probability stored for context r5 and returns it
 // in the carry flag. While the state (r2) is below 4096 it is refilled one
 // byte at a time from [r1]. State and probability are updated in place.
 // upkr_decode_bit_inc is a second entry point that increments r5 first.
 // Scratch registers: r8, r9.
 .type upkr_decode_bit, %function
 upkr_decode_bit_inc:
    add r5, r5, #1
 upkr_decode_bit:
    cmp r2, #4096
    ldrltb r8, [r1], #1
    orrlt r2, r8, r2, lsl#8     // state = (state << 8) | next input byte
    blt upkr_decode_bit         // loop back to the cmp (the two lt instructions do not touch flags)
    ldrb r8, [sp, -r5]          // r8 = prob
    and r9, r2, #255
    add r9, r9, #1
    cmp r8, r9                  // carry set <=> (state & 255) < prob <=> bit = 1
    rsbcs r8, r8, #256          // bit 1: continue with 256 - prob
    mvn r9, r2, lsr#8           // r9 = -(state >> 8) - 1
    addcs r9, r9, #1            // bit 1: r9 = -(state >> 8)
    mla r2, r8, r9, r2          // bit 1: state = prob * (state >> 8) + (state & 255)
                                // bit 0: state -= prob * ((state >> 8) + 1)
    add r9, r8, #8
    sub r8, r8, r9, lsr#4       // r8 -= (r8 + 8) >> 4
    rsbcs r8, r8, #256          // bit 1: convert back, so prob has moved towards 256
    strb r8, [sp, -r5]
    mov pc, r14                 // nothing after the cmp sets flags, so carry still holds the bit
--- a/asm_unpackers/unpack_armv6m.S
+++ b/asm_unpackers/unpack_armv6m.S
@@ -0,0 +1,162 @@
 // armv6-m upkr unpacker by yrlf
 // some optimizations by exoticorn
 .syntax unified
 .thumb
 .section .text
 #define ALIGNUP(n, align) (((n) + (align) - 1) & ~((align) - 1))
 #define PROB_LEN (1 + 255 + 1 + 2*32 + 2*32)
 #define FRAME_SIZE ALIGNUP(PROB_LEN, 4)
 // auto upkr_unpack(uint8_t * out, uint8_t * in) -> tuple<uint8_t *, uint8_t *>
 .global upkr_unpack
 .type upkr_unpack, %function
 // r0 .. out_ptr (returned)
 // r1 .. in_ptr (returned)
 // r2 .. state
 // r3 .. offset
 // r4 .. prev_was_literal / decode_length ret
 // r5 .. subroutine arg (preserved)
 // r6 .. decode_bit ret
 // r7 .. probs ptr
 // Entry point. Reserves FRAME_SIZE bytes of stack for the probability table
 // (r7 points at it), sets every entry to 128 and then decodes literals and
 // matches until an offset of 0 is read.
 upkr_unpack:
    push { r4, r5, r6, r7, lr }
    sub sp, sp, #FRAME_SIZE
    mov r7, sp
    movs r2, #255               // r2 = PROB_LEN, built in two steps
    adds r2, r2, #(PROB_LEN - 255)
    movs r3, #128
 .Lclear:
    subs r2, r2, #1
    strb r3, [r7, r2]           // probs[r2] = 128; the loop leaves r2 = 0, the initial state
    bne .Lclear
 .Lloop:
    movs r5, #0                 // context 0: literal/match flag
    bl upkr_decode_bit          // returns with Z set when the bit is 0
    beq .Ldata
 .Lmatch:
    // r6 = 1
    lsls r5, r6, #8             // r5 = 256: context of the "new offset" flag
    cmp r4, #0
    beq 1f                      // r4 == 0: previous symbol was a match, a new offset is implied
    bl upkr_decode_bit
    beq 2f                      // flag 0: reuse the previous offset
 1:
    bl upkr_decode_length       // r4 = -(offset + 1), r5 is preserved
    adds r3, r4, #1             // r3 = -offset
    beq .Lend                   // offset 0 marks the end of the stream
 2:
    adds r5, r5, #64            // 256 + 64; upkr_decode_length pre-increments the context
    bl upkr_decode_length       // r4 = -length
 .Lcopy_loop:
    ldrb r5, [r0, r3]           // fetch the byte `offset` bytes behind the write position
 .Lstore:
    strb r5, [r0]               // shared store: match byte or finished literal (low 8 bits of r5)
    adds r0, r0, #1
    adds r4, r4, #1             // count the negative length up towards 0
    blt .Lcopy_loop
    b .Lloop                    // r4 is 0 after a match and 2 after a literal
 .Ldata:
    movs r5, #1                 // marker bit; the literal is complete once it reaches bit 8
 .Ldata_loop:
    bl upkr_decode_bit          // context = bits decoded so far (r5)
    adcs r5, r5, r5             // shift the new bit in from carry
    lsrs r4, r5, #8             // r4 becomes 1 (non-zero) when all 8 bits are in
    beq .Ldata_loop
    b .Lstore
 .Lend:
    add sp, sp, #FRAME_SIZE
    pop { r4, r5, r6, r7, pc }
 // Decodes a variable-length number, starting with context r5 + 1, and
 // returns its negation in r4. r0, r5 and r6 are restored before returning.
 .type upkr_decode_length, %function
 // r0 .. -length tmp (saved)
 // r1 ..
 // r2 ..
 // r3 ..
 // r4 .. -length (returned)
 // r5 .. context index (saved)
 // r6 .. (saved)
 // r7 ..
 upkr_decode_length:
    push { r0, r5, r6, lr }
    movs r0, #0                 // negated sum of the value bits
    subs r4, r0, #1             // r4 = -1 = -(1 << bit position)
 .Lbit_loop:
    adds r5, r5, #1
    bl upkr_decode_bit          // continue flag
    beq 1f
    adds r5, r5, #1
    bl upkr_decode_bit          // value bit
    beq 2f
    adds r0, r0, r4             // bit set: add -(1 << bit position)
 2:
    lsls r4, r4, #1             // move on to the next bit position
    b .Lbit_loop
 1:
    adds r4, r4, r0             // implicit top bit + collected bits
    pop { r0, r5, r6, pc }
 // Decodes one bit with the probability stored at probs[r5].
 // The bit is returned in r6; the final lsrs also leaves Z set for a 0 bit and
 // carry equal to the bit, which the callers branch on / shift in.
 // upkr_fill_state shifts one more input byte into the state and falls through;
 // it is re-entered while the state is below 4096.
 .type upkr_decode_bit, %function
 // r0 .. tmp / prob (saved)
 // r1 .. in_ptr (modified)
 // r2 .. state (modified)
 // r3 .. scratch (saved)
 // r4 ..
 // r5 .. context index (preserved)
 // r6 .. bit (returned)
 // r7 .. probs ptr (preserved)
 upkr_fill_state:
    lsls r2, r2, #8
    ldrb r6, [r1]
    adds r1, r1, #1
    orrs r2, r2, r6             // state = (state << 8) | next input byte
 upkr_decode_bit:
    lsrs r6, r2, #12
    beq upkr_fill_state         // refill while state < 4096
    push { r0, r1, r3, lr }     // pushed after the refill, so the advanced in_ptr is what gets restored
    ldrb r0, [r7, r5]           // r0 = prob
    lsrs r3, r2, #8             // r3 = state >> 8
    uxtb r1, r2                 // r1 = state & 255
    subs r6, r1, r0             // negative <=> (state & 255) < prob <=> bit = 1
    blt 1f
    subs r1, r2, r0             // bit 0: r1 = state - prob
    rsbs r0, r0, #0             //        r0 = -prob
 1:
    muls r3, r3, r0
    adds r2, r1, r3             // bit 1: state = (state & 255) + prob * (state >> 8)
                                // bit 0: state = state - prob - prob * (state >> 8)
    rsbs r3, r0, #0
    uxtb r3, r3                 // bit 1: 256 - prob, bit 0: prob
    lsrs r3, r3, #4             // carry = bit 3, which adcs adds as the rounding term
    adcs r0, r0, r3             // r0 += (r3 + 8) >> 4 in effect
    cmp r6, #0
    blt 1f
    rsbs r0, r0, #0             // bit 0: undo the negation -> prob - ((prob + 8) >> 4)
 1:
    strb r0, [r7, r5]
    lsrs r6, r6, #31            // r6 = sign bit of (state & 255) - prob = decoded bit
    pop { r0, r1, r3, pc }
--- a/asm_unpackers/unpack_jagrisc.js
+++ b/asm_unpackers/unpack_jagrisc.js
@@ -0,0 +1,190 @@
 ;;; -*-asm-*-
 ;;; upkr unpacker for Atari Jaguar RISC.
 ;;; lyxass syntax
 ; input:
 ;;; R20 : packed buffer
 ;;; R21 : output buffer
 ;;; r30 : return address
 ;;;
 ;;; Register usage (destroyed!)
 ;;; r0-r17,r20,r21
 ;;;
 DST		REG 21
 SRC		REG 20
 	REGTOP 16
 LR_save		REG 99
 LR_save2	REG 99
 GETBIT		REG 99
 GETLENGTH	REG 99
 LITERAL		REG 99
 LOOP		REG 99
 index		REG 99
 bit_pos		REG 99
 state		REG 99
 prev_was_match	REG 99
 offset		REG 99
 prob		reg 99
 byte		REG 99
 PROBS		reg 99
 tmp2		reg 2
 tmp1		REG 1
 tmp0		REG 0
 	REGMAP
 upkr_probs	equ $200
 SIZEOF_PROBS	EQU 1+255+1+2*32+2*32
 ;;; Unpack entry point. Sets the probability bytes at upkr_probs to 128,
 ;;; caches the addresses of getbit, getlength, .loop and .literal in
 ;;; registers and then decodes literals and matches until an offset of 0 is
 ;;; read, at which point it returns through LR_save.
 ;;; The instruction written after each jr/jump is relied on to execute as
 ;;; part of the branch (delay slot), e.g. the LR adjustments below.
 unupkr::
	move	LR,LR_save		; LR is reused for the internal calls below
	moveq	#0,tmp0
	movei	#upkr_probs,PROBS
	bset	#7,tmp0			; tmp0 = 128, the initial probability value
	movei	#SIZEOF_PROBS,tmp2
	move	PROBS,tmp1
.init	storeb	tmp0,(tmp1)
	subq	#1,tmp2
	jr	pl,.init
	addq	#1,tmp1
	moveq	#0,offset
	moveq	#0,state
	movei	#getlength,GETLENGTH
	movei	#getbit,GETBIT
.looppc	move	PC,LOOP
	addq	#.loop-.looppc,LOOP	; LOOP = address of .loop
	move	pc,LITERAL
	jr	.start
	addq	#6,LITERAL		; presumably LITERAL = address of .literal (three 2-byte instructions on)
.literal
	moveq	#1,byte			; marker bit; the literal is complete once bit 8 is set
	move	pc,LR
	jr	.into
	addq	#6,LR		; LR = .getbit
.getbit
	addc	byte,byte		; shift in the bit returned by getbit in carry
.into
	btst	#8,byte
	jump	eq,(GETBIT)		; bit 8 still clear: decode another bit
	move	byte,index		; context = bits decoded so far
	storeb	byte,(DST)
	addq	#1,DST
.start
	moveq	#0,prev_was_match
.loop
	moveq	#0,index		; context 0: literal/match flag
	BL	(GETBIT)
	jump	cc,(LITERAL)		; carry clear: literal
	addq	#14,LR
	cmpq	#1,prev_was_match
	jr	eq,.newoff		; directly after a match a new offset always follows
	shlq	#8,r0
	jump	(GETBIT)
	move	r0,index
	jr	cc,.oldoff		; flag 0: reuse the previous offset
	shlq	#8,r0
.newoff
	addq	#1,r0		; r0 = 257
	BL	(GETLENGTH)
	subq	#1,r0
	jump	eq,(LR_save)		; offset 0 marks the end of the stream: return to the caller
	move	r0,offset
.oldoff
	movei	#257+64,r0
	BL	(GETLENGTH)
	move	DST,r1
	sub	offset,r1		; r1 = DST - offset, the source of the match
.cpymatch1
	loadb	(r1),r2
	subq	#1,r0
	addqt	#1,r1
	storeb	r2,(DST)
	jr	ne,.cpymatch1
	addq	#1,DST
	jump	(LOOP)
	moveq	#1,prev_was_match
 ;;; Decode a variable-length number (used for offsets and lengths).
 ;;; In:  r0 = first context index, LR = return address.
 ;;; Out: r0 = decoded value; returns through LR_save2.
 ;;; Alternates between a "continue" flag and a value bit, both read with
 ;;; getbit, which advances index by one on every call. bit_pos counts down
 ;;; from 0 so that "sh" shifts left (see the comment at the subq below).
 getlength:
	move	LR,LR_save2
	moveq	#0,byte			; collected value bits
	move	r0,index
	moveq	#0,bit_pos
	move	pc,LR
	jump	(GETBIT)
	addq	#6,LR			; getbit returns to .gl
.gl
	jr	cc,.exit		; continue flag 0: finish
	addq	#8,LR		; => return to "sh ..."
	jump	(GETBIT)
	nop
	sh	bit_pos,r0		; move the value bit (r0) to its position
	subq	#1,bit_pos	; sh < 0 => shift left!
	or	r0,byte
	jump	(GETBIT)
	subq	#8,LR			; next continue flag returns to .gl again
.exit
	moveq	#1,r0
	sh	bit_pos,r0		; implicit top bit
	jump	(LR_save2)
	or	byte,r0
;;; Decode one bit with the probability byte at PROBS + index.
;;; In:  index = context, LR = return address.
;;; Out: r0 = decoded bit; callers branch on the carry flag left by the final
;;;      shrq. index has been incremented by one.
;;; Uses r1, r2 and prob as scratch; updates state and the probability byte.
;;; .newbyte shifts one more input byte from (SRC) into state and falls into
;;; getbit again; it is taken while state >> 12 is zero.
.newbyte:
	loadb	(SRC),r2
	shlq	#8,state
	addq	#1,SRC
	or	r2,state
 getbit
	move	state,r2
	move	PROBS,r1
	add	index,r1		; r1 = &probs[index]
	shrq	#12,r2
	loadb	(r1),prob
	jr	eq,.newbyte
	move	state,r2
	move	state,r0
	shlq	#24,r2
	shrq	#8,r0		; sh
	shrq	#24,r2		; sl
	cmp	prob,r2
	addqt	#1,index
	jr	cs,.one			; sl < prob: the bit is 1
	mult	prob,r0			; r0 = prob * sh (needed on both paths)
	;; state -= ((state >> 8) + 1)*prob
	;; prob -= (prob+8)>>4
	move	prob,r2
	add	prob,r0
	addq	#8,r2
	sub	r0,state
	shrq	#4,r2
	moveq	#0,r0
	jr	.ret
	sub	r2,prob
.one
	;; state = (state >> 8)*prob+(state & 0xff)
	;; prob += (256 + 8 - prob) >> 4
	move	r2,state
	movei	#256+8,r2
	add	r0,state
	sub	prob,r2		; 256-prob+8
	shrq	#4,r2
	add	r2,prob
	moveq	#3,r0
.ret
	storeb	prob,(r1)
	jump	(LR)
	shrq	#1,r0		; r0 = bit (3 >> 1 = 1 or 0 >> 1 = 0); presumably C = bit shifted out = decoded bit
--- a/asm_unpackers/unpack_jagrisc_fast.js
+++ b/asm_unpackers/unpack_jagrisc_fast.js
@@ -0,0 +1,217 @@
 ;;; -*-asm-*-
 ;;; ukpr unpacker for Atari Jaguar RISC. (quick version)
 ;;; lyxass syntax
 ; input:
 ;;; R20 : packed buffer
 ;;; R21 : output buffer
 ;;; r30 : return address
 ;;;
 ;;; Register usage (destroyed!)
 ;;; r0-r17,r20,r21
 ;;;
 DST		REG 21
 SRC		REG 20
 	REGTOP 17
 LR_save		REG 99
 LR_save2	REG 99
 GETBIT		REG 99
 GETLENGTH	REG 99
 LITERAL		REG 99
 LOOP		REG 99
 index		REG 99
 bit_pos		REG 99
 state		REG 99
 prev_was_match	REG 99
 offset		REG 99
 prob		reg 99
 byte		REG 99
 ndata		reg 99
 PROBS		reg 99
 tmp2		reg 2
 tmp1		REG 1
 tmp0		REG 0
 	REGMAP
 upkr_probs	equ $200
 SIZEOF_PROBS	EQU 1+255+1+2*32+2*32
 unupkr::
 	move	LR,LR_save
 	movei	#$80808080,tmp0
 	movei	#upkr_probs,PROBS
 	movei	#SIZEOF_PROBS,tmp2
 	move	PROBS,tmp1
 .init	store	tmp0,(tmp1)
 	subq	#4,tmp2
 	jr	pl,.init
 	addq	#4,tmp1
 	loadb	(SRC),ndata
 	addq	#1,SRC
 	moveq	#0,offset
 	moveq	#0,state
 	movei	#getlength,GETLENGTH
 	movei	#getbit,GETBIT
 .looppc	move	PC,LOOP
 	addq	#.loop-.looppc,LOOP
 	move	pc,LITERAL
 	jr	.start
 	addq	#6,LITERAL
 .literal
 	moveq	#1,byte
 	move	pc,LR
 	jr	.into
 	addq	#6,LR		; LR = .getbit
 .getbit
 	addc	byte,byte
 .into
 	btst	#8,byte
 	jump	eq,(GETBIT)
 	move	byte,index
 	storeb	byte,(DST)
 	addq	#1,DST
 .start
 	moveq	#0,prev_was_match
 .loop
 	moveq	#0,index
 	BL	(GETBIT)
 	jump	cc,(LITERAL)
 	addq	#14,LR
 	cmpq	#1,prev_was_match
 	jr	eq,.newoff
 	shlq	#8,r0
 	jump	(GETBIT)
 	move	r0,index
 	jr	cc,.oldoff
 	shlq	#8,r0
 .newoff
 	addq	#1,r0		; r0 = 257
 	BL	(GETLENGTH)
 	subq	#1,r0
 	move	r0,offset
 	jump	eq,(LR_save)
 	nop
 .oldoff
 	movei	#257+64,r0
 	BL	(GETLENGTH)
 	move	DST,r2
 	move	DST,r1
 	or	offset,r2
 	btst	#0,r2
 	moveq	#1,prev_was_match
 	jr	ne,.cpymatch1
 	sub	offset,r1
 .cpymatch2
 	loadw	(r1),r2
 	addqt	#2,r1
 	subq	#2,r0
 	storew	r2,(DST)
 	jump	eq,(LOOP)
 	addqt	#2,DST
 	jr	pl,.cpymatch2
 	nop
 	jump	(LOOP)
 	subq	#1,DST
 .cpymatch1
 	loadb	(r1),r2
 	subq	#1,r0
 	addqt	#1,r1
 	storeb	r2,(DST)
 	jr	ne,.cpymatch1
 	addq	#1,DST
 	jump	(LOOP)
 //->	nop
 getlength:
 	move	LR,LR_save2
 	moveq	#0,byte
 	move	r0,index
 	moveq	#0,bit_pos
 	move	pc,LR
 	jump	(GETBIT)
 	addq	#6,LR
 .gl
 	jr	cc,.exit
 	addq	#8,LR		; => return to "sh ..."
 	jump	(GETBIT)
 	nop
 	sh	bit_pos,r0
 	subq	#1,bit_pos	; sh < 0 => shift left!
 	or	r0,byte
 	jump	(GETBIT)
 	subq	#8,LR
 .exit
 	moveq	#1,r0
 	sh	bit_pos,r0
 	jump	(LR_save2)
 	or	byte,r0
 ;; getbit: decode one bit from the rANS stream (quick version).
 ;; in:   index = context number (byte offset into the PROBS table)
 ;;       LR    = return address
 ;; out:  r0 = carry = decoded bit, index = index + 1
 ;; uses: r1, r2, prob; updates state, SRC, ndata and probs[index]
 ;; .newbyte shifts the prefetched byte (ndata) into state, prefetches the
 ;; next source byte and repeats until (state >> 12) != 0.
 .newbyte:
 	move	ndata,r2
 	shlq	#8,state
 	loadb	(SRC),ndata
 	or	r2,state
 	addq	#1,SRC
 	move	state,r2
 	shrq	#12,r2
 	jr	ne,.done
 	move	state,r2
 	jr	.newbyte
 getbit
 	move	state,r2
 	move	PROBS,r1
 	add	index,r1		; r1 = &probs[index]
 	shrq	#12,r2			; Z set => state needs another byte
 	loadb	(r1),prob
 	jr	eq,.newbyte
 	move	state,r2
 .done
 	move	state,r0
 	shlq	#24,r2
 	shrq	#8,r0		; sh
 	shrq	#24,r2		; sl
 	cmp	prob,r2			; carry set if sl < prob => bit = 1
 	addqt	#1,index
 	jr	cs,.one
 	mult	prob,r0			; r0 = sh*prob (needed on both paths)
 	;; state -= ((state >> 8) + 1)*prob
 	;; prob -= (prob+8)>>4
 	move	prob,r2
 	add	prob,r0
 	addq	#8,r2
 	sub	r0,state
 	shrq	#4,r2
 	moveq	#0,r0
 	sub	r2,prob
 	shrq	#1,r0		; C = 0, r0 = 0
 	jump	(LR)
 	storeb	prob,(r1)
 .one
 	;; state = (state >> 8)*prob+(state & 0xff)
 	;; prob += (256 + 8 - prob) >> 4
 	move	r2,state
 	movei	#256+8,r2
 	add	r0,state
 	sub	prob,r2		; 256-prob+8
 	shrq	#4,r2
 	add	r2,prob
 	moveq	#3,r0
 	storeb	prob,(r1)
 	jump	(LR)
 	shrq	#1,r0		; 3 >> 1 => C = 1, r0 = 1
--- a/asm_unpackers/unpack_riscv.S
+++ b/asm_unpackers/unpack_riscv.S
@@ -0,0 +1,131 @@
 .section .text
 // x9 prev was literal
 // x10 out ptr
 // x11 in ptr
 // x12 offset
 // x13 state
 // x14 context index
 .global upkr_unpack
 .type upkr_unpack, %function
 upkr_unpack:
 	mv t4, ra
 	mv x17, x8
 	mv t6, x9
 	li x9, 256 + 128
 	mv x13, x9
 1:
 	sub x8, sp, x13
 	sb x9, 0(x8)
 	addi x13, x13, -1
 	bnez x13, 1b
 .Lmainloop:
 	li x14, 0
 	jal upkr_decode_bit
 	beqz x15, .Lliteral
 	slli x14, x14, 8
 	beqz x9, .Lread_offset_inc_x14
 	jal upkr_decode_bit
 	bnez x15, .Lread_offset
 .Lfinished_offset:
 	addi x14, x14, 64
 	jalr ra // jal upkr_decode_number
 1:
 	add x14, x10, t0
 	lbu x14, (x14)
 .Lstore_byte:
 	sb x14, (x10)
 	addi x10, x10, 1
 	addi x9, x9, 1
 	blt x9, x0, 1b
 	j .Lmainloop
 .Lliteral:
 	jal upkr_decode_bit
 	addi x14, x14, -1
 	slli x14, x14, 1
 	add x14, x14, x15
 	srli x9, x14, 8
 	beqz x9, .Lliteral
 	j .Lstore_byte
 .Lread_offset_inc_x14:
 	addi x14, x14, 1
 .Lread_offset:
 	jalr ra // jal upkr_decode_number
 	addi t0, x9, 1
 	bnez t0, .Lfinished_offset
 .Ldone:
 	mv x8, x17
 	mv x9, t6
 	jr t4
 upkr_load_byte:
 	lbu x15, 0(x11)
 	addi x11, x11, 1
 	slli x13, x13, 8
 	add x13, x13, x15
 // x8 prob array ptr
 // x11 in ptr
 // x13 state
 // x14 context index
 // return:
 //   x14 context index + 1
 //   x15 decoded bit
 upkr_decode_bit:
 	srli x15, x13, 12
 	beqz x15, upkr_load_byte
 	addi x14, x14, 1
 	sub t2, sp, x14
 	lbu x12, (t2)
 	andi x8, x13, 255
 	sltu x15, x8, x12
 	beqz x15, 1f
 	xori x12, x12, 255
 	addi x12, x12, 1
 1:
 	srli x8, x13, 8
 	addi x8, x8, 1
 	sub x8, x8, x15
 	mul x8, x8, x12
 	sub x13, x13, x8
 	addi x8, x12, 8
 	srli x8, x8, 4
 	sub x12, x12, x8
 	beqz x15, 1f
 	sub x12, x0, x12
 1:
 	sb x12, (t2)
 	jalr ra
 // upkr_decode_number: decode one offset/length value. The value is read as
 // pairs of bits: a "more bits follow" flag and, if set, the next data bit
 // (lowest bit first). When the flag is 0 an implicit top bit is added.
 // in:
 //   x14 context index
 // return:
 //   x9  negative decoded number (i.e. -value)
 //   x14 restored to its value on entry
 // clobbers t1, t3, t5, x15 and everything upkr_decode_bit clobbers
 upkr_decode_number:
 	mv t3, ra		// the calls below overwrite ra
 	mv t5, x14		// upkr_decode_bit advances x14
 	li x9, 0
 	li t1, -1		// t1 = -(weight of the current bit position)
 1:
 	jal upkr_decode_bit
 	beqz x15, 1f		// flag bit 0: no more data bits
 	jal upkr_decode_bit
 	beqz x15, 2f
 	add x9, x9, t1		// data bit set: subtract its weight
 2:
 	add t1, t1, t1		// move on to the next higher bit position
 	j 1b
 1:
 	add x9, x9, t1		// implicit top bit
 	mv x14, t5
 	jr t3
--- a/c_library/.gitignore
+++ b/c_library/.gitignore
@@ -0,0 +1,2 @@
 /target/
 /upkr
--- a/c_library/Cargo.lock
+++ b/c_library/Cargo.lock
@@ -0,0 +1,127 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "anyhow"
 version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
 [[package]]
 name = "autocfg"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 [[package]]
 name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
 [[package]]
 name = "cdivsufsort"
 version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c"
 dependencies = [
 "cc",
 "sacabase",
 ]
 [[package]]
 name = "lexopt"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "478ee9e62aaeaf5b140bd4138753d1f109765488581444218d3ddda43234f3e8"
 [[package]]
 name = "num-traits"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
 "autocfg",
 ]
 [[package]]
 name = "proc-macro2"
 version = "1.0.51"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "quote"
 version = "1.0.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "sacabase"
 version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84"
 dependencies = [
 "num-traits",
 ]
 [[package]]
 name = "syn"
 version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "thiserror"
 version = "1.0.39"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c"
 dependencies = [
 "thiserror-impl",
 ]
 [[package]]
 name = "thiserror-impl"
 version = "1.0.39"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
 [[package]]
 name = "upkr"
 version = "0.2.1"
 dependencies = [
 "anyhow",
 "cdivsufsort",
 "lexopt",
 "thiserror",
 ]
 [[package]]
 name = "upkr_c"
 version = "0.0.1"
 dependencies = [
 "upkr",
 ]
--- a/c_library/Cargo.toml
+++ b/c_library/Cargo.toml
@@ -0,0 +1,17 @@
 # Small wrapper crate exposing upkr through a C API as a static library.
 [package]
 name = "upkr_c"
 version = "0.0.1"
 edition = "2021"
 [lib]
 # The library is named "upkr" (not "upkr_c") so that the build output is
 # libupkr.a, which is what the Makefile links against.
 name = "upkr"
 crate-type = ["staticlib"]
 [profile.release]
 # Release builds favour a small binary.
 opt-level = "s"
 strip = "debuginfo"
 lto = true
 panic = "abort"
 [dependencies]
 # Uses the upkr crate from the parent directory with its default features off.
 upkr = { path="..", default-features=false }
--- a/c_library/Makefile
+++ b/c_library/Makefile
@@ -0,0 +1,8 @@
 # Build the example command line tool and link it against the Rust static library.
 upkr: upkr.c upkr.h target/release/libupkr.a
 	gcc -O2 -Ltarget/release -o upkr upkr.c -lupkr -lm
 	strip upkr
 # The library depends on the phony "cargo" target, so "cargo build" is run on
 # every make invocation and cargo itself decides whether a rebuild is needed.
 target/release/libupkr.a: cargo
 	cargo build --release
 .PHONY: cargo
--- a/c_library/Readme.md
+++ b/c_library/Readme.md
@@ -0,0 +1,11 @@
 This is a simple example of compiling upkr to a library that can be linked in a
 c program. It consists of a small rust crate which implements the c api and
 compiles to a static library and a matching c header file. As is, the rust
 crate offers two simple functions to compress/uncompress data with the default
 upkr config.
 The provided makefile will only work on linux. Building the example upkr.c on
 other platforms is left as an exercise for the reader ;)
 On Windows you might have to make sure to install and use the correct rust
 toolchain version (mingw vs. msvc) to match your c compiler.
--- a/c_library/src/lib.rs
+++ b/c_library/src/lib.rs
@@ -0,0 +1,42 @@
 use std::ffi::c_int;
 // the upkr config to use, this can be modified to use other configs
 fn config() -> upkr::Config {
    upkr::Config::default()
 }
 /// Compresses `input_size` bytes at `input_buffer` into `output_buffer`.
 ///
 /// At most `output_buffer_size` bytes are written. `compression_level` is
 /// clamped to the range 0..=9.
 ///
 /// Returns the size of the compressed data, even if it did not fit into the
 /// output buffer; the caller can then retry with a buffer of that size.
 ///
 /// A null pointer (or a size of 0) is treated as an empty buffer. Non-null
 /// pointers must be valid for the given number of bytes.
 #[no_mangle]
 pub extern "C" fn upkr_compress(
    output_buffer: *mut u8,
    output_buffer_size: usize,
    input_buffer: *const u8,
    input_size: usize,
    compression_level: c_int,
 ) -> usize {
    // `slice::from_raw_parts` requires a non-null pointer even for a length
    // of 0, but C callers commonly pass NULL for empty buffers.
    let input: &[u8] = if input_buffer.is_null() || input_size == 0 {
        &[]
    } else {
        unsafe { std::slice::from_raw_parts(input_buffer, input_size) }
    };
    let packed_data = upkr::pack(input, compression_level.clamp(0, 9) as u8, &config(), None);
    if !output_buffer.is_null() && output_buffer_size > 0 {
        let output = unsafe { std::slice::from_raw_parts_mut(output_buffer, output_buffer_size) };
        let copy_size = packed_data.len().min(output.len());
        output[..copy_size].copy_from_slice(&packed_data[..copy_size]);
    }
    packed_data.len()
 }
 /// Uncompresses `input_size` bytes at `input_buffer` into `output_buffer`.
 ///
 /// At most `output_buffer_size` bytes are written.
 ///
 /// Returns
 /// * `>= 0`: the size of the uncompressed data, even if it did not fit into
 ///   the output buffer (the caller can retry with a buffer of that size),
 /// * `-1`: the input data is corrupt; a message is printed to stderr.
 ///
 /// A null pointer (or a size of 0) is treated as an empty buffer. Non-null
 /// pointers must be valid for the given number of bytes.
 #[no_mangle]
 pub extern "C" fn upkr_uncompress(output_buffer: *mut u8, output_buffer_size: usize, input_buffer: *const u8, input_size: usize) -> isize {
    // `slice::from_raw_parts(_mut)` requires non-null pointers even for a
    // length of 0, but C callers commonly pass NULL for empty buffers.
    let output: &mut [u8] = if output_buffer.is_null() || output_buffer_size == 0 {
        &mut []
    } else {
        unsafe { std::slice::from_raw_parts_mut(output_buffer, output_buffer_size) }
    };
    let input: &[u8] = if input_buffer.is_null() || input_size == 0 {
        &[]
    } else {
        unsafe { std::slice::from_raw_parts(input_buffer, input_size) }
    };
    match upkr::unpack(input, &config(), output.len()) {
        Ok(unpacked_data) => {
            // Bound the copy so an unexpected length can never panic across
            // the FFI boundary; the full size is still reported.
            let copy_size = unpacked_data.len().min(output.len());
            output[..copy_size].copy_from_slice(&unpacked_data[..copy_size]);
            unpacked_data.len() as isize
        }
        Err(upkr::UnpackError::OverSize { size, .. }) => size as isize,
        Err(other) => {
            eprintln!("[upkr] compressed data corrupt: {}", other);
            -1
        }
    }
 }
--- a/c_library/upkr.c
+++ b/c_library/upkr.c
@@ -0,0 +1,99 @@
 #include "upkr.h"
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 // Command line front end for the upkr C library.
 //
 //   upkr [compress] [-0 .. -9] <file> [<out-file>]
 //   upkr [uncompress] <file> [<out-file>]
 //
 // Reads the whole input file into memory, runs upkr_compress() or
 // upkr_uncompress() on it (growing the output buffer and retrying when the
 // library reports that the result did not fit) and writes the result to
 // <out-file>, which defaults to <file>.upk / <file>.unp.
 // Returns 0 on success and 1 on any error.
 int main(int argc, char** argv) {
    if(argc < 2) {
        fprintf(stdout, "Usage:\n  upkr [compress] [-0 .. -9] <file> [<out-file>]\n  upkr [uncompress] <file> [<out-file>]\n");
        return 1;
    }
    int argi = 1;
    int uncompress = 0;
    int compression_level = 4;
    if(strcmp(argv[argi], "compress") == 0) {
        ++argi;
    } else if(strcmp(argv[argi], "uncompress") == 0) {
        uncompress = 1;
        ++argi;
    }
    if(argi < argc && argv[argi][0] == '-') {
        compression_level = atoi(argv[argi] + 1);
        ++argi;
    }
    if(argi == argc) {
        fprintf(stdout, "input filename missing\n");
        return 1;
    }
    const char* input_name = argv[argi++];
    char* output_name;
    if(argi < argc) {
        output_name = argv[argi];
    } else {
        // 4 characters for the extension plus the terminating NUL
        output_name = malloc(strlen(input_name) + 5);
        if(output_name == 0) {
            fprintf(stdout, "out of memory\n");
            return 1;
        }
        strcpy(output_name, input_name);
        strcat(output_name, uncompress ? ".unp" : ".upk");
    }
    FILE* file = fopen(input_name, "rb");
    if(file == 0) {
        fprintf(stdout, "failed to open input file '%s'\n", input_name);
        return 1;
    }
    long input_size = -1;
    if(fseek(file, 0, SEEK_END) == 0) {
        input_size = ftell(file);
    }
    if(input_size < 0) {
        fprintf(stdout, "error reading input file\n");
        return 1;
    }
    rewind(file);
    // malloc(0) may return NULL, so always request at least one byte
    char* input_buffer = (char*)malloc(input_size > 0 ? (size_t)input_size : 1);
    if(input_buffer == 0) {
        fprintf(stdout, "out of memory\n");
        return 1;
    }
    long offset = 0;
    while(offset < input_size) {
        size_t read_size = fread(input_buffer + offset, 1, input_size - offset, file);
        if(read_size == 0) {
            fprintf(stdout, "error reading input file\n");
            return 1;
        }
        offset += (long)read_size;
    }
    fclose(file);
    // Initial guess only; the loop below grows the buffer if it is too small.
    // The lower bound keeps the buffer non-empty for tiny or empty inputs.
    long output_buffer_size = input_size * 8;
    if(output_buffer_size < 64) {
        output_buffer_size = 64;
    }
    long output_size;
    char* output_buffer = (char*)malloc(output_buffer_size);
    if(output_buffer == 0) {
        fprintf(stdout, "out of memory\n");
        return 1;
    }
    for(;;) {
        if(uncompress) {
            output_size = upkr_uncompress(output_buffer, output_buffer_size, input_buffer, input_size);
        } else {
            output_size = upkr_compress(output_buffer, output_buffer_size, input_buffer, input_size, compression_level);
        }
        if(output_size < 0) {
            // corrupt input, the library already printed a message
            return 1;
        }
        if(output_size <= output_buffer_size) {
            break;
        }
        // The result did not fit: retry with exactly the reported size.
        char* grown_buffer = (char*)realloc(output_buffer, output_size);
        if(grown_buffer == 0) {
            fprintf(stdout, "out of memory\n");
            return 1;
        }
        output_buffer = grown_buffer;
        output_buffer_size = output_size;
    }
    file = fopen(output_name, "wb");
    if(file == 0) {
        fprintf(stdout, "failed to open output file '%s'\n", output_name);
        return 1;
    }
    offset = 0;
    while(offset < output_size) {
        size_t written_size = fwrite(output_buffer + offset, 1, output_size - offset, file);
        if(written_size == 0) {
            fprintf(stdout, "error writing output file\n");
            return 1;
        }
        offset += (long)written_size;
    }
    // fclose flushes buffered data, so a failure here is a write error too
    if(fclose(file) != 0) {
        fprintf(stdout, "error writing output file\n");
        return 1;
    }
    free(output_buffer);
    free(input_buffer);
    return 0;
 }
--- a/c_library/upkr.h
+++ b/c_library/upkr.h
@@ -0,0 +1,25 @@
 // C API of the upkr static library (see src/lib.rs for the implementation).
 #ifndef UPKR_H_INCLUDED
 #define UPKR_H_INCLUDED
 #include <stddef.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 // input_buffer/input_size: input data to compress
 // output_buffer/output_buffer_size: buffer to compress into
 // compression_level: 0-9
 // returns the size of the compressed data, even if it didn't fit into the output buffer
 size_t upkr_compress(void* output_buffer, size_t output_buffer_size, void* input_buffer, size_t input_size, int compression_level);
 // input_buffer/input_size: compressed data
 // output_buffer/output_buffer_size: buffer to uncompress into
 // return value:
 //  >= 0 : size of uncompressed data, even if it didn't fit into the output buffer
 //  < 0  : input data corrupt, unable to decompress
 ptrdiff_t upkr_uncompress(void* output_buffer, size_t output_buffer_size, void* input_buffer, size_t input_size);
 #ifdef __cplusplus
 }
 #endif
 #endif // UPKR_H_INCLUDED
--- a/c_unpacker/decode_bit_alt.c
+++ b/c_unpacker/decode_bit_alt.c
@@ -0,0 +1,33 @@
 // Alternative implementation of upkr_decode_bit.
 // Decodes one bit from the rANS stream using the probability stored in
 // upkr_probs[context_index] and then adapts that probability. Instead of
 // separate update formulas for 0 and 1 bits, prob is mirrored around 256
 // when the bit is 1 so that both cases share the same arithmetic.
 // Returns the decoded bit (0 or 1).
 // NOTE(review): uses upkr_state, upkr_probs, upkr_data_ptr (and the bitstream
 // variables), none of which are declared here - presumably this file is
 // meant to replace the function of the same name in unpack.c; confirm.
 int upkr_decode_bit(int context_index) {
 #ifdef UPKR_BITSTREAM
    // shift in single bits until the rANS state is >= 32768
    while(upkr_state < 32768) {
        if(upkr_bits_left == 0) {
            upkr_current_byte = *upkr_data_ptr++;
            upkr_bits_left = 8;
        }
        upkr_state = (upkr_state << 1) + (upkr_current_byte & 1);
        upkr_current_byte >>= 1;
        --upkr_bits_left;
    }
 #else
    // shift in whole bytes until the rANS state is >= 4096
    while(upkr_state < 4096) {
        upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
    }
 #endif
    int prob = upkr_probs[context_index];
    int bit = (upkr_state & 255) < prob ? 1 : 0;
    if(bit) {
        // mirror: from here on prob is the probability of the decoded bit value
        prob = 256 - prob;
    }
    // bit == 1: state = (state & 255) + (256 - prob) * (state >> 8), with the mirrored prob
    // bit == 0: state -= prob * ((state >> 8) + 1)
    upkr_state -= prob * ((upkr_state >> 8) + (bit ^ 1));
    // move the (possibly mirrored) probability 1/16th (rounded) towards 0
    prob -= (prob + 8) >> 4;
    if(bit) {
        // undo the mirroring; assumes upkr_probs holds 8-bit values so that
        // storing -prob wraps to 256 - prob - TODO confirm element type
        prob = -prob;
    }
    upkr_probs[context_index] = prob;
    return bit;
 }
--- a/c_unpacker/main.c
+++ b/c_unpacker/main.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
-int upkr_unpack(void* destination, void* compressed_data);
+void* upkr_unpack(void* destination, void* compressed_data);
 int main(int argn, char** argv) {
  void* input_buffer = malloc(1024*1024);
@@ -13,7 +13,8 @@ int main(int argn, char** argv) {
  printf("Compressed size: %d\n", in_size);
-  int out_size = upkr_unpack(output_buffer, input_buffer);
+  void* end_ptr = upkr_unpack(output_buffer, input_buffer);
  int out_size = (char*)end_ptr - (char*)output_buffer;
  printf("Uncompressed size: %d\n", out_size);
--- a/c_unpacker/unpack.c
+++ b/c_unpacker/unpack.c
@@ -1,3 +1,56 @@
 /*
    A simple C unpacker for upkr compressed data.
    This implements two variants, selected by the UPKR_BITSTREAM define:
    - normal: faster and smaller on modern hardware as whole bytes are shifted into
              the rANS state at a time, but requires 20bits for the state
    - bitstream: only single bits are shifted into the rANS state at a time
                 which allows the state to always fit in 16bits which is a boon
                 on very old CPUs.
    The encoder and decoder need to be configured to use the same variant.
    upkr compressed data is a rANS byte-/bit-stream encoding a series of literal
    byte values and back-references as probability encoded bits.
    upkr_decode_bit reads one bit from the rANS stream, taking a probability context
    as parameter. The probability context is a byte estimating the probability of
    a bit encoded in this context being set. It is updated by upkr_decode_bit
    after each decoded bit to reflect the observed past frequencies of on/off bits.
    There are a number of different contexts used in the compressed format. The order in the
    upkr_probs array is arbitrary, the only requirement for the unpacker is that all bits
    that shared the same context while encoding also share the same context while decoding.
    The contexts are:
    - is match
    - has offset
    - literal bit N (0-7) with already decoded highest bits of literal == M (255 total)
    - offset bit N (one less than max offset bits)
    - has offset bit N (max offset bits)
    - length bit N (one less than max length bits)
    - has length bit N (max length bits)
    Literal bytes are encoded from highest to lowest bit, with the bit position and
    the already decoded bits as context.
    Offset and Length are encoded in an interlaced variant of Elias gamma coding. They
    are encoded from lowest to highest bits. For each bit, first one bit is read in the
    "has offset/length bit N" context. If this is set, offset/length bit N is read in its
    context and the decoding continues with the next bit. If the "has bit N" is read as
    false, a fixed 1 bit is added as the top bit at this position.
    The highlevel decode loop then looks like this:
    loop:
        if read_bit(IS_MATCH):
            if prev_was_match || read_bit(HAS_OFFSET):
                offset = read_length_or_offset(OFFSET) - 1
                if offset == 0:
                    break
            length = read_length_or_offset(LENGTH)
            copy_bytes_from_offset(length, offset)
        else:
            read_and_push(literal)
 */
 typedef unsigned char u8;
 typedef unsigned short u16;
 typedef unsigned long u32;
@@ -14,6 +67,7 @@ u32 upkr_state;
 int upkr_decode_bit(int context_index) {
 #ifdef UPKR_BITSTREAM
    // shift in single bits until rANS state is >= 32768
    while(upkr_state < 32768) {
        if(upkr_bits_left == 0) {
            upkr_current_byte = *upkr_data_ptr++;
@@ -24,6 +78,7 @@ int upkr_decode_bit(int context_index) {
        --upkr_bits_left;
    }
 #else
    // shift in a full byte until rANS state is >= 4096
    while(upkr_state < 4096) {
        upkr_state = (upkr_state << 8) | *upkr_data_ptr++;
    }
@@ -32,13 +87,16 @@ int upkr_decode_bit(int context_index) {
    int prob = upkr_probs[context_index];
    int bit = (upkr_state & 255) < prob ? 1 : 0;
    // rANS state and context probability update
    // for the latter, add 1/16th (rounded) of difference from either 0 or 256
    if(bit) {
        upkr_state = prob * (upkr_state >> 8) + (upkr_state & 255);
-        upkr_probs[context_index] = prob + ((256 - prob + 8) >> 4);
+        prob += (256 - prob + 8) >> 4;
    } else {
        upkr_state = (256 - prob) * (upkr_state >> 8) + (upkr_state & 255) - prob;
-        upkr_probs[context_index] = prob - ((prob + 8) >> 4);
+        prob -= (prob + 8) >> 4;
    }
    upkr_probs[context_index] = prob;
    return bit;
 }
@@ -53,12 +111,13 @@ int upkr_decode_length(int context_index) {
    return length | (1 << bit_pos);
 }
-int upkr_unpack(void* destination, void* compressed_data) {
+void* upkr_unpack(void* destination, void* compressed_data) {
    upkr_data_ptr = (u8*)compressed_data;
    upkr_state = 0;
 #ifdef UPKR_BITSTREAM
    upkr_bits_left = 0;
 #endif
    // all contexts are initialized to 128 = equal probability of 0 and 1
    for(int i = 0; i < sizeof(upkr_probs); ++i)
        upkr_probs[i] = 128;
@@ -67,10 +126,13 @@ int upkr_unpack(void* destination, void* compressed_data) {
    int prev_was_match = 0;
    int offset = 0;
    for(;;) {
        // is match
        if(upkr_decode_bit(0)) {
            // has offset
            if(prev_was_match || upkr_decode_bit(256)) {
                offset = upkr_decode_length(257) - 1;
                if(offset == 0) {
                    // a 0 offset signals the end of the compressed data
                    break;
                }
            }
@@ -81,6 +143,9 @@ int upkr_unpack(void* destination, void* compressed_data) {
            }
            prev_was_match = 1;
        } else {
            // byte contains the previously read bits and indicates the number of
            // read bits by the set top bit. Therefore it can be directly used as the
            // context index. The set top bit ends up at bit position 8 and is not stored.
            int byte = 1;
            while(byte < 256) {
                int bit = upkr_decode_bit(byte);
@@ -91,5 +156,5 @@ int upkr_unpack(void* destination, void* compressed_data) {
        }
    }
-    return write_ptr - (u8*)destination;
+    return write_ptr;
 }
--- a/dos_unpacker/readme.txt
+++ b/dos_unpacker/readme.txt
@@ -0,0 +1,13 @@
 16 bit DOS executable stubs
 ---------------------------
 by pestis and TomCat
 unpack_x86_16_DOS.asm:
  maximum compatibility, relocates unpacked code to normal start address
 unpack_x86_16_DOS_no_relocation.asm:
  saves some bytes by not relocating, unpacked code needs to be assembled to
  start at 0x3FFE
 unpack_x86_16_DOS_no_repeated_offset.asm:
  removes support for repeated offsets, potentially at the cost of some compression ratio.
  most likely only a win in very narrow circumstances around the 1kb mark
--- a/dos_unpacker/unpack_x86_16_DOS.asm
+++ b/dos_unpacker/unpack_x86_16_DOS.asm
@@ -0,0 +1,160 @@
 ; Contributions from pestis, TomCat and exoticorn
 ;
 ; This is the 16-bit DOS x86 decompression stub for upkr, which is designed for
 ; maximum compatibility: it relocates the compressed data so it can be
 ; decompressed starting at the normal .COM starting address. In other words,
 ; many of the already existing .COM files should be compressable using this
 ; stub.
 ;
 ; How to use:
 ;   1) Pack your intro using upkr into data.bin with the --x86 command line
 ;      argument:
 ;
 ;           $ upkr --x86 intro.com data.bin
 ;
 ;   2) Compile this .asm file using nasm (or any compatible assembler):
 ;
 ;           $ nasm unpack_x86_16_DOS.asm -fbin -o intropck.com
 ;
 ; The packed size of the intro+stub is limited by max_len (see below) bytes.
 ;
 ; In specific cases, the unpacker stub can be further optimized to save a byte
 ; or two:
 ;   1) You can remove CLC before RET, if you don't mind carry being set upon
 ;      program entry
 ;   2) You can also move PUSHA before PUSH SI and put POPA as the first
 ;      operation of the compressed code.
 max_len     equ 16384
 prog_start  equ (0x100+max_len+510+relocation-upkr_unpack)
 probs       equ (((prog_start+max_len+510)+255)/256)*256
 org 0x100
 ; This will be loaded at 0x100, but relocates the code and data to prog_start
 relocation:
    push    si                  ; si = 0x100 at DOS start, so save it for later ret
    pusha                       ; pusha to recall all registers before starting intro
    push    si                  ; for pop di to start writing the output
    mov     di, prog_start      ; the depacker & data are relocated from 0x100 to prog_start
    mov     ch, max_len/512
    rep     movsw
    jmp     si                  ; jump to relocated upkr_unpack
 ; upkr_unpack unpacks the code to 0x100 and runs it when done.
 upkr_unpack:
    xchg    ax, bp              ; position in input bitstream (bp) = 0
    cwd                         ; upkr_state (dx) = 0;
    xchg    ax, cx              ; cx = 0x9XX
    mov     al, 128             ; for(int i = 0; i < sizeof(upkr_probs); ++i) upkr_probs[i] = 128;
    rep     stosb
    pop     di                  ; u8* write_ptr = (u8*)destination;
 .mainloop:
    mov     bx, probs
    call    upkr_decode_bit
    jc      .else               ; if(upkr_decode_bit(0)) {
    mov     bh, (probs+256)/256
    jcxz    .skip_call
    call    upkr_decode_bit
    jc      .skipoffset
 .skip_call:
    stc
    call    upkr_decode_number  ; offset = upkr_decode_length(258) - 1;
    loop    .notdone            ; if(offset == 0)
    popa
    clc
    ret
 .notdone:
    mov     si, di
 .sub:
    dec     si
    loop    .sub
 .skipoffset:
    mov     bl, 128             ; int length = upkr_decode_length(384);
    call    upkr_decode_number
    rep     movsb               ; *write_ptr = write_ptr[-offset];
    jmp     .mainloop
 .byteloop:
    call    upkr_decode_bit     ; int bit = upkr_decode_bit(byte);
 .else:
    adc     bl, bl              ; byte = (byte << 1) + bit;
    jnc     .byteloop
    xchg    ax, bx
    stosb
    inc     si
    mov     cl, 1
    jmp     .mainloop           ;  prev_was_match = 0;
 ; upkr_decode_bit decodes one bit from the rANS entropy encoded bit stream.
 ; parameters:
 ;    bx = memory address of the context probability
 ;    dx = decoder state
 ;    bp = bit position in input stream
 ; returns:
 ;    dx = new decoder state
 ;    bp = new bit position in input stream
 ;    carry = bit
 ; trashes ax
 upkr_load_bit:
    bt      [compressed_data-relocation+prog_start], bp
    inc     bp
    adc     dx, dx
 upkr_decode_bit:
    inc     dx              ; inc dx, dec dx is used to test the top (sign) bit of dx
    dec     dx
    jns     upkr_load_bit
    movzx   ax, byte [bx]   ; u16 prob = upkr_probs[context_index]
    neg     byte [bx]
    push    ax              ; save prob, tmp = prob
    cmp     dl, al          ; int bit = (upkr_state & 255) < prob ? 1 : 0; (carry = bit)
    pushf                   ; save bit flags
    jc      .bit            ; (skip if bit)
    xchg    [bx], al        ;   tmp = 256 - tmp;
 .bit:
    shr     byte [bx], 4    ; upkr_probs[context_index] = tmp + (256 - tmp + 8) >> 4;
    adc     [bx], al
    mul     dh              ; upkr_state = tmp * (upkr_state >> 8) + (upkr_state & 255);
    mov     dh, 0
    add     dx, ax
    popf
    pop     ax
    jc      .bit2           ; (skip if bit)
    neg     byte [bx]       ;   tmp = 256 - tmp;
    sub     dx, ax          ;   upkr_state -= prob; note that this will also leave carry always unset, which is what we want
 .bit2:
    ret                     ; return the bit in carry
 ; upkr_decode_number loads a variable length encoded number (up to 16 bits) from
 ; the compressed stream. Only numbers 1..65535 can be encoded. If the encoded
 ; number has 4 bits and is 1ABC, it is encoded using a kind of an "interleaved
 ; elias code": 0A0B0C1. The 1 in the end implies that no more bits are coming.
 ; parameters:
 ;   cx = must be 0
 ;   bx = memory address of the context probability
 ;   dx = decoder state
 ;   bp = bit position in input stream
 ;   carry = must be 1
 ; returns:
 ;   cx = length
 ;   dx = new decoder state
 ;   bp = new bit position in input stream
 ;   carry = 1
 ; trashes bl, ax
 upkr_decode_number_loop:
    inc     bx
    call    upkr_decode_bit
 upkr_decode_number:
    rcr     cx, 1
    inc     bx
    call    upkr_decode_bit
    jnc     upkr_decode_number_loop     ; 0 = there's more bits coming, 1 = no more bits
 .loop2:
    rcr     cx, 1
    jnc     .loop2
    ret
 compressed_data:
   incbin   "data.bin"
--- a/dos_unpacker/unpack_x86_16_DOS_no_relocation.asm
+++ b/dos_unpacker/unpack_x86_16_DOS_no_relocation.asm
@@ -0,0 +1,151 @@
 ; Contributions from pestis, TomCat and exoticorn
 ;
 ; This is the 16-bit DOS x86 decompression stub for upkr, which decompresses the
 ; code starting at address 0x3FFE (or whatever is defined by the entrypoint
 ; below). Thus, the packed code needs to be assembled with org 0x3FFE to work.
 ;
 ; How to use:
 ;   1) Put POPA as the first instruction of your compiled code and use org
 ;      0x3FFE
 ;   2) Pack your intro using upkr into data.bin with the --x86 command line
 ;      argument:
 ;
 ;           $ upkr --x86 intro.com data.bin
 ;
 ;   3) Compile this .asm file using nasm (or any compatible assembler) e.g.
 ;
 ;           $ nasm unpack_x86_16_DOS_no_relocation.asm -fbin -o intropck.com
 ;
 ; In specific cases, the unpacker stub can be further optimized to save a byte
 ; or two:
 ;   1) If your stub+compressed code is 2k or smaller, you can save 1 byte by
 ;      putting probs at 0x900 and initializing DI with SALC; XCHG AX, DI instead
 ;      of MOV DI, probs
 ;   2) If you remove the PUSHA (and POPA in the compressed code), then you can
 ;      assume the registers as follows: AX = 0x00XX, BX = probs + 0x1XX, CX = 0
 ;      DX = (trash), SI = DI = right after your program, SP = as it was when the
 ;      program started, flags = carry set
 ;
 ; Note that even with the PUSHA / POPA, carry will be set (!) unlike normal dos
 ; program.
 entry       equ 0x3FFE
 probs       equ entry - 0x1FE   ; must be aligned to 256
 org 0x100
 ; This is loaded at 0x100 and unpacks straight to entry; nothing is relocated
 upkr_unpack:
    pusha
    xchg    ax, bp              ; position in bitstream = 0
    cwd                         ; upkr_state = 0;
    mov     di, probs
    mov     ax, 0x8080          ; for(int i = 0; i < sizeof(upkr_probs); ++i) upkr_probs[i] = 128;
    rep     stosw
    push    di
 .mainloop:
    mov     bx, probs
    call    upkr_decode_bit
    jc      .else               ; if(upkr_decode_bit(0)) {
    mov     bh, (probs+256)/256
    jcxz    .skip_call          ; if(prev_was_match || upkr_decode_bit(257)) {
    call    upkr_decode_bit
    jc      .skipoffset
 .skip_call:
    stc
    call    upkr_decode_number  ;  offset = upkr_decode_number(258) - 1;
    mov     si, di
    loop    .sub                ; if(offset == 0)
    ret
 .sub:
    dec     si
    loop    .sub
 .skipoffset:
    mov     bl, 128             ; int length = upkr_decode_number(384);
    call    upkr_decode_number
    rep     movsb               ; *write_ptr = write_ptr[-offset];
    jmp     .mainloop
 .byteloop:
    call    upkr_decode_bit     ; int bit = upkr_decode_bit(byte);
 .else:
    adc     bl, bl              ; byte = (byte << 1) + bit;
    jnc     .byteloop
    xchg    ax, bx
    stosb
    inc     si
    mov     cl, 1
    jmp     .mainloop           ;  prev_was_match = 0;
 ; upkr_decode_bit decodes one bit from the rANS entropy encoded bit stream.
 ; parameters:
 ;    bx = memory address of the context probability
 ;    dx = decoder state
 ;    bp = bit position in input stream
 ; returns:
 ;    dx = new decoder state
 ;    bp = new bit position in input stream
 ;    carry = bit
 ; trashes ax
 upkr_load_bit:
    bt      [compressed_data], bp
    inc     bp
    adc     dx, dx
 upkr_decode_bit:
    inc     dx
    dec     dx              ; inc dx, dec dx is used to test the top (sign) bit of dx
    jns     upkr_load_bit
    movzx   ax, byte [bx]   ; u16 prob = upkr_probs[context_index]
    neg     byte [bx]
    push    ax              ; save prob, tmp = prob
    cmp     dl, al          ; int bit = (upkr_state & 255) < prob ? 1 : 0; (carry = bit)
    pushf                   ; save bit flags
    jc      .bit            ; (skip if bit)
    xchg    [bx], al        ;   tmp = 256 - tmp;
 .bit:
    shr     byte [bx], 4    ; upkr_probs[context_index] = tmp + (256 - tmp + 8) >> 4;
    adc     [bx], al
    mul     dh              ; upkr_state = tmp * (upkr_state >> 8) + (upkr_state & 255);
    mov     dh, 0
    add     dx, ax
    popf
    pop     ax
    jc      .bit2           ; (skip if bit)
    neg     byte [bx]       ;   tmp = 256 - tmp;
    sub     dx, ax          ;    upkr_state -= prob; note that this will also leave carry always unset, which is what we want
 .bit2:
    ret                     ; flags = bit
 ; upkr_decode_number loads a variable length encoded number (up to 16 bits) from
 ; the compressed stream. Only numbers 1..65535 can be encoded. If the encoded
 ; number has 4 bits and is 1ABC, it is encoded using a kind of an "interleaved
 ; elias code": 0A0B0C1. The 1 in the end implies that no more bits are coming.
 ; parameters:
 ;   cx = must be 0
 ;   bx = memory address of the context probability
 ;   dx = decoder state
 ;   bp = bit position in input stream
 ;   carry = must be 1
 ; returns:
 ;   cx = length
 ;   dx = new decoder state
 ;   bp = new bit position in input stream
 ;   carry = 1
 ; trashes bl, ax
 upkr_decode_number_loop:
    inc     bx
    call    upkr_decode_bit
 upkr_decode_number:
    rcr     cx, 1
    inc     bx
    call    upkr_decode_bit
    jnc     upkr_decode_number_loop ; while(upkr_decode_bit(context_index)) {
 .loop2:
    rcr     cx, 1
    jnc     .loop2
    ret
 compressed_data:
    incbin  "data.bin"
--- a/dos_unpacker/unpack_x86_16_DOS_no_repeated_offset.asm
+++ b/dos_unpacker/unpack_x86_16_DOS_no_repeated_offset.asm
@@ -0,0 +1,154 @@
 ; Contributions from pestis, TomCat and exoticorn
 ;
 ; This is the 16-bit DOS x86 decompression stub for upkr, which is designed for
 ; the --no-repeated-offsets option of upkr. The decompression stub is slightly
 ; smaller, but the compressed data might be bigger, so you have to test if
 ; --no-repeated-offsets pays off in the end. This stub relocates the compressed
 ; data so it can be decompressed starting at the normal .COM starting address.
 ;
 ; How to use:
 ;   1) Pack your intro using upkr into data.bin with the --x86b command line
 ;      argument: (notice the --x86b, not --x86!)
 ;
 ;           $ upkr --x86b intro.com data.bin
 ;
 ;   2) Compile this .asm file using nasm (or any compatible assembler):
 ;
 ;           $ nasm unpack_x86_16_DOS_no_repeated_offset.asm -fbin -o intropck.com
 ;
 ; The packed size of the intro+stub is limited by max_len (see below) bytes.
 ;
 ; In specific cases, the unpacker stub can be further optimized to save a byte
 ; or two:
 ;   1) You can remove CLC before RET, if you don't mind carry being set upon
 ;      program entry
 ;   2) You can also move PUSHA before PUSH SI and put POPA as the first
 ;      operation of the compressed code.
 max_len     equ 16384
 prog_start  equ (0x100+max_len+510+relocation-upkr_unpack)
 probs       equ (((prog_start+max_len+510)+255)/256)*256
 org 0x100
 ; This will be loaded at 0x100, but relocates the code and data to prog_start
 ; so that the unpacked program can be written starting at 0x100.
 relocation:
    push    si                  ; si = 0x100 at DOS start, so save it for later ret
    pusha                       ; pusha to recall all registers before starting intro
    push    si                  ; for pop di to start writing the output
    mov     di, prog_start      ; the depacker & data are relocated from 0x100 to prog_start
    mov     ch, max_len/512     ; cx = (max_len/512)*256 + cl words to copy
                                ; NOTE(review): for the jmp si below to land on the relocated
                                ; upkr_unpack (at 0x100+max_len+510), cl must be 0xFF here,
                                ; i.e. 2*cx = max_len+510 -- presumably the DOS start value of cx; confirm
    rep     movsw               ; copy cx words from si (0x100) to di (prog_start); leaves cx = 0
    jmp     si                  ; jump to relocated upkr_unpack
 ; upkr_unpack unpacks the code to 0x100 and runs it when done.
 ; Register usage in the main loop:
 ;   di = write pointer into the unpacked output
 ;   bx = address of the current context probability
 ;   dx = decoder state, bp = bit position in the compressed stream
 ;   cx = 0 between operations (decoded numbers are returned in cx)
 ; NOTE(review): the setup below relies on the register values left by the
 ; relocation code and DOS (ax = 0, bp non-negative and > 0x0200) -- confirm.
 upkr_unpack:
    xchg    ax, bp              ; position in bitstream = 0
    cwd                         ; upkr_state = 0;
    xchg    cx, ax              ; cx > 0x0200
    mov     al, 128             ; for(int i = 0; i < sizeof(upkr_probs); ++i) upkr_probs[i] = 128;
    rep     stosb               ; fills cx bytes starting at di (the end of the relocated copy)
    pop     di                  ; u8* write_ptr = (u8*)destination;
 .mainloop:
    mov     bx, probs           ; context 0: literal/match flag (probs is 256-byte aligned, so bl = 0)
    call    upkr_decode_bit
    jnc     .else               ; if(upkr_decode_bit(0)) {
    inc     bh                  ; switch to the offset contexts at probs+256 (inc keeps carry = 1)
    call    upkr_decode_number  ;  offset = upkr_decode_number(258) - 1;
    loop    .notdone            ; if(offset == 0)
    popa                        ;   end of stream: restore the registers saved by relocation
    clc
    ret                         ;   return to the address pushed first by relocation (0x100)
 .notdone:
    mov     si, di
 .sub:
    dec     si                  ; si = di - offset, one step per loop iteration (cx = offset)
    loop    .sub                ; leaves cx = 0; dec and loop keep carry = 1 for upkr_decode_number
    mov     bl, 128             ; int length = upkr_decode_number(384);
    call    upkr_decode_number
    rep     movsb               ; *write_ptr = write_ptr[-offset];
    jmp     .mainloop
 .else:
    inc     bx                  ; bl = 1: partial byte with a leading marker bit, doubles as context index
 .byteloop:
    call    upkr_decode_bit     ; int bit = upkr_decode_bit(byte);
    adc     bl, bl              ; byte = (byte << 1) + bit;
    jnc     .byteloop           ; the marker bit leaves bl after 8 decoded bits and sets carry
    xchg    ax, bx              ; al = decoded byte
    stosb
    jmp     .mainloop           ;  prev_was_match = 0;
 ; upkr_decode_bit decodes one bit from the rANS entropy encoded bit stream.
 ; Entry point is upkr_decode_bit; upkr_load_bit is the refill loop that shifts
 ; bits from the stream into the state until the top bit of dx is set.
 ; parameters:
 ;    bx = memory address of the context probability
 ;    dx = decoder state
 ;    bp = bit position in input stream
 ; returns:
 ;    dx = new decoder state
 ;    bp = new bit position in input stream
 ;    carry = bit
 ; trashes ax
 upkr_load_bit:
    bt      [compressed_data-relocation+prog_start], bp ; carry = bit number bp of the relocated compressed data
    inc     bp
    adc     dx, dx          ; upkr_state = (upkr_state << 1) + loaded bit
 upkr_decode_bit:
    inc     dx
    dec     dx              ; or whatever other test for the top bit there is
    jns     upkr_load_bit   ; top bit clear: the state needs more bits
    movzx   ax, byte [bx]   ; u16 prob = upkr_probs[context_index]
    neg     byte [bx]       ; probs entry temporarily holds 256 - prob
    push    ax              ; save prob, tmp = prob
    cmp     dl, al          ; int bit = (upkr_state & 255) < prob ? 1 : 0; (carry = bit)
    pushf                   ; save bit flags
    jc      .bit            ; (skip if bit)
    xchg    [bx], al        ;   tmp = 256 - tmp;
 .bit:
    shr     byte [bx], 4    ; upkr_probs[context_index] = tmp + (256 - tmp + 8) >> 4;
    adc     [bx], al        ; add tmp plus the rounding carry shifted out by the shr above
    mul     dh              ; upkr_state = tmp * (upkr_state >> 8) + (upkr_state & 255);
    mov     dh, 0
    add     dx, ax
    popf                    ; carry = bit again
    pop     ax              ; ax = original prob
    jc      .bit2           ; (skip if bit)
    neg     byte [bx]       ;   tmp = 256 - tmp;
    sub     dx, ax          ;   upkr_state -= prob; note that this will also leave carry always unset, which is what we want
 .bit2:
    ret                     ; flags = bit
 ; upkr_decode_number loads a variable length encoded number (up to 16 bits) from
 ; the compressed stream. Only numbers 1..65535 can be encoded. If the encoded
 ; number has 4 bits and is 1ABC, it is encoded using a kind of an "interleaved
 ; elias code": 0A0B0C1. The 1 in the end implies that no more bits are coming.
 ;
 ; How it works: the carry that must be set on entry is rotated into the top of
 ; cx as a sentinel bit. Every following rcr pushes one more bit in from the top
 ; and moves the sentinel one position down; when the sentinel finally drops out
 ; of bit 0 into carry, the number is right-aligned in cx and carry is 1 again.
 ; parameters:
 ;   cx = must be 0
 ;   bx = memory address of the context probability
 ;   dx = decoder state
 ;   bp = bit position in input stream
 ;   carry = must be 1
 ; returns:
 ;   cx = length
 ;   dx = new decoder state
 ;   bp = new bit position in input stream
 ;   carry = 1
 ; trashes bl, ax
 upkr_decode_number_loop:
    inc     bx                          ; next context (inc does not modify carry)
    call    upkr_decode_bit             ; carry = next data bit of the number
 upkr_decode_number:
    rcr     cx, 1                       ; rotate carry into the top of cx: the sentinel on entry, a data bit afterwards
    inc     bx                          ; next context, used for the "more bits?" flag
    call    upkr_decode_bit
    jnc     upkr_decode_number_loop     ; 0 = there's more bits coming, 1 = no more bits
 .loop2:
    rcr     cx, 1                       ; first pass rotates in the implicit leading 1 (carry is set here), later passes rotate in zeros
    jnc     .loop2                      ; keep rotating until the sentinel drops out of bit 0 into carry
    ret
 compressed_data:
    incbin "data.bin"
--- a/fuzz/.gitignore
+++ b/fuzz/.gitignore
@@ -0,0 +1,3 @@
 target
 corpus
 artifacts
--- a/fuzz/Cargo.lock
+++ b/fuzz/Cargo.lock
@@ -0,0 +1,247 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "anyhow"
 version = "1.0.65"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "98161a4e3e2184da77bb14f02184cdd111e83bbbcc9979dfee3c44b9a85f5602"
 [[package]]
 name = "arbitrary"
 version = "1.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f44124848854b941eafdb34f05b3bcf59472f643c7e151eba7c2b69daa469ed5"
 [[package]]
 name = "autocfg"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 [[package]]
 name = "cc"
 version = "1.0.73"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11"
 dependencies = [
 "jobserver",
 ]
 [[package]]
 name = "cdivsufsort"
 version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edefce019197609da416762da75bb000bbd2224b2d89a7e722c2296cbff79b8c"
 dependencies = [
 "cc",
 "sacabase",
 ]
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 [[package]]
 name = "crossbeam-channel"
 version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
 dependencies = [
 "cfg-if",
 "crossbeam-utils",
 ]
 [[package]]
 name = "crossbeam-utils"
 version = "0.8.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
 dependencies = [
 "cfg-if",
 "once_cell",
 ]
 [[package]]
 name = "jobserver"
 version = "0.1.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "068b1ee6743e4d11fb9c6a1e6064b3693a1b600e7f5f5988047d98b3dc9fb90b"
 dependencies = [
 "libc",
 ]
 [[package]]
 name = "lexopt"
 version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "478ee9e62aaeaf5b140bd4138753d1f109765488581444218d3ddda43234f3e8"
 [[package]]
 name = "libc"
 version = "0.2.133"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966"
 [[package]]
 name = "libfuzzer-sys"
 version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ae185684fe19814afd066da15a7cc41e126886c21282934225d9fc847582da58"
 dependencies = [
 "arbitrary",
 "cc",
 "once_cell",
 ]
 [[package]]
 name = "num-traits"
 version = "0.2.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
 dependencies = [
 "autocfg",
 ]
 [[package]]
 name = "once_cell"
 version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1"
 [[package]]
 name = "pbr"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ff5751d87f7c00ae6403eb1fcbba229b9c76c9a30de8c1cf87182177b168cea2"
 dependencies = [
 "crossbeam-channel",
 "libc",
 "time",
 "winapi",
 ]
 [[package]]
 name = "proc-macro2"
 version = "1.0.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bd7356a8122b6c4a24a82b278680c73357984ca2fc79a0f9fa6dea7dced7c58"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "quote"
 version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "sacabase"
 version = "2.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9883fc3d6ce3d78bb54d908602f8bc1f7b5f983afe601dabe083009d86267a84"
 dependencies = [
 "num-traits",
 ]
 [[package]]
 name = "syn"
 version = "1.0.101"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "thiserror"
 version = "1.0.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0a99cb8c4b9a8ef0e7907cd3b617cc8dc04d571c4e73c8ae403d80ac160bb122"
 dependencies = [
 "thiserror-impl",
 ]
 [[package]]
 name = "thiserror-impl"
 version = "1.0.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3a891860d3c8d66fec8e73ddb3765f90082374dbaaa833407b904a94f1a7eb43"
 dependencies = [
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "time"
 version = "0.1.44"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
 dependencies = [
 "libc",
 "wasi",
 "winapi",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
 [[package]]
 name = "upkr"
 version = "0.2.0-pre3"
 dependencies = [
 "anyhow",
 "cdivsufsort",
 "lexopt",
 "pbr",
 "thiserror",
 ]
 [[package]]
 name = "upkr-fuzz"
 version = "0.0.0"
 dependencies = [
 "libfuzzer-sys",
 "upkr",
 ]
 [[package]]
 name = "wasi"
 version = "0.10.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
 [[package]]
 name = "winapi"
 version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
 dependencies = [
 "winapi-i686-pc-windows-gnu",
 "winapi-x86_64-pc-windows-gnu",
 ]
 [[package]]
 name = "winapi-i686-pc-windows-gnu"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
 [[package]]
 name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -0,0 +1,31 @@
 [package]
 name = "upkr-fuzz"
 version = "0.0.0"
 authors = ["Automatically generated"]
 publish = false
 edition = "2018"
 [package.metadata]
 cargo-fuzz = true
 [dependencies]
 libfuzzer-sys = "0.4"
 [dependencies.upkr]
 path = ".."
 # Prevent this from interfering with workspaces
 [workspace]
 members = ["."]
 [[bin]]
 name = "all_configs"
 path = "fuzz_targets/all_configs.rs"
 test = false
 doc = false
 [[bin]]
 name = "unpack"
 path = "fuzz_targets/unpack.rs"
 test = false
 doc = false
--- a/fuzz/fuzz_targets/all_configs.rs
+++ b/fuzz/fuzz_targets/all_configs.rs
@@ -0,0 +1,29 @@
 #![no_main]
 use libfuzzer_sys::fuzz_target;
 // Round-trip fuzz target: packs the input with a fuzzer-chosen format variant
 // and compression level, unpacks the result with the same config and checks
 // that the original bytes come back.
 //
 // When the input is longer than two bytes, its first two bytes are consumed as
 // flag bytes and the rest is the payload:
 //   flags1 bit 0..7: use_bitstream, parity_contexts (1 or 2), invert_bit_encoding,
 //                    is_match_bit, new_offset_bit, continue_value_bit,
 //                    bitstream_is_big_endian, simplified_prob_update
 //   flags2 bit 0:    eof_in_length
 //   flags2 bit 1:    max_offset (unlimited or 32)
 //   flags2 bit 2:    max_length (unlimited or 5)
 //   flags2 bit 3..4: compression level (0-3)
 //   flags2 bit 5:    no_repeated_offsets
 // Shorter inputs are packed unchanged with the default config at level 1.
 fuzz_target!(|data: &[u8]| {
    let mut config = upkr::Config::default();
    let mut level = 1;
    let mut data = data;
    if data.len() > 2 {
        let flags1 = data[0];
        let flags2 = data[1];
        // strip the two flag bytes; only the remainder is compressed
        data = &data[2..];
        config.use_bitstream = (flags1 & 1) != 0;
        config.parity_contexts = if (flags1 & 2) == 0 { 1 } else { 2 };
        config.invert_bit_encoding = (flags1 & 4) != 0;
        config.is_match_bit = (flags1 & 8) != 0;
        config.new_offset_bit = (flags1 & 16) != 0;
        config.continue_value_bit = (flags1 & 32) != 0;
        config.bitstream_is_big_endian = (flags1 & 64) != 0;
        config.simplified_prob_update = (flags1 & 128) != 0;
        config.no_repeated_offsets = (flags2 & 32) != 0;
        config.eof_in_length = (flags2 & 1) != 0;
        // small limits so that the clamping code paths in the packers are exercised
        config.max_offset = if (flags2 & 2) == 0 { usize::MAX } else { 32 };
        config.max_length = if (flags2 & 4) == 0 { usize::MAX } else { 5 };
        level = (flags2 >> 3) & 3;
    }
    let packed = upkr::pack(data, level, &config, None);
    // third argument is presumably the maximum unpacked size -- confirm against upkr::unpack
    let unpacked = upkr::unpack(&packed, &config, 1024 * 1024).unwrap();
    assert!(unpacked == data);
 });
--- a/fuzz/fuzz_targets/unpack.rs
+++ b/fuzz/fuzz_targets/unpack.rs
@@ -0,0 +1,6 @@
 #![no_main]
 use libfuzzer_sys::fuzz_target;
 // Fuzz target for the decompressor: feeds arbitrary bytes to `upkr::unpack`
 // with the default config. The result is discarded, so returned errors are
 // acceptable; only panics and crashes count as findings.
 fuzz_target!(|data: &[u8]| {
    // third argument is presumably the maximum unpacked size -- confirm against upkr::unpack
    let _ = upkr::unpack(data, &upkr::Config::default(), 64 * 1024);
 });
--- a/release/.gitignore
+++ b/release/.gitignore
@@ -0,0 +1,5 @@
 *.zip
 *.tgz
 upkr-linux/
 upkr-windows/
 upkr-windows-32/
--- a/release/Makefile
+++ b/release/Makefile
@@ -0,0 +1,51 @@
 # Builds the release archives for Linux, 64-bit Windows and 32-bit Windows.
 # The version string used in the archive names is whatever the freshly built
 # binary prints for --version.
 VERSION := $(shell cargo run --release -- --version)
 # Default target: remove old outputs first, then build all three archives.
 all: clean upkr-linux-$(VERSION).tgz upkr-windows-$(VERSION).zip upkr-windows-32-$(VERSION).zip
 # Removes the staging directories and any previously built archives.
 clean:
 	rm -rf upkr-linux
 	rm -f upkr-linux*.tgz
 	rm -rf upkr-windows
 	rm -rf upkr-windows-32
 	rm -f upkr-windows*.zip
 # Each archive rule copies the README and the unpacker sources (as committed in
 # HEAD) next to the binary in the staging directory and packs the directory.
 # NOTE(review): only c_unpacker, z80_unpacker and asm_unpackers are archived;
 # dos_unpacker is not -- confirm whether that is intentional.
 upkr-linux-$(VERSION).tgz: upkr-linux/upkr PHONY
 	cp ../README.md upkr-linux
 	cd .. && git archive HEAD c_unpacker | tar -xC release/upkr-linux
 	cd .. && git archive HEAD z80_unpacker | tar -xC release/upkr-linux
 	cd .. && git archive HEAD asm_unpackers | tar -xC release/upkr-linux
 	tar czf $@ upkr-linux
 upkr-windows-$(VERSION).zip: upkr-windows/upkr.exe PHONY
 	cp ../README.md upkr-windows/
 	cd .. && git archive HEAD c_unpacker | tar -xC release/upkr-windows
 	cd .. && git archive HEAD z80_unpacker | tar -xC release/upkr-windows
 	cd .. && git archive HEAD asm_unpackers | tar -xC release/upkr-windows
 	zip -r -9 $@ upkr-windows
 upkr-windows-32-$(VERSION).zip: upkr-windows-32/upkr.exe PHONY
 	cp ../README.md upkr-windows-32/
 	cd .. && git archive HEAD c_unpacker | tar -xC release/upkr-windows-32
 	cd .. && git archive HEAD z80_unpacker | tar -xC release/upkr-windows-32
 	cd .. && git archive HEAD asm_unpackers | tar -xC release/upkr-windows-32
 	zip -r -9 $@ upkr-windows-32
 # Binary rules: cross-compile with the `terminal` feature enabled, copy the
 # result into the staging directory and strip it.
 upkr-linux/upkr:
 	cargo build --target x86_64-unknown-linux-musl --release -F terminal
 	mkdir -p upkr-linux
 	cp ../target/x86_64-unknown-linux-musl/release/upkr upkr-linux/
 	strip upkr-linux/upkr
 upkr-windows/upkr.exe:
 	cargo build --target x86_64-pc-windows-gnu --release -F terminal
 	mkdir -p upkr-windows
 	cp ../target/x86_64-pc-windows-gnu/release/upkr.exe upkr-windows/
 	x86_64-w64-mingw32-strip upkr-windows/upkr.exe
 upkr-windows-32/upkr.exe:
 	cargo build --target i686-pc-windows-gnu --release -F terminal
 	mkdir -p upkr-windows-32
 	cp ../target/i686-pc-windows-gnu/release/upkr.exe upkr-windows-32/
 	i686-w64-mingw32-strip upkr-windows-32/upkr.exe
 # Target without recipe or prerequisites; listing it as a prerequisite makes
 # the archive rules above run every time.
 PHONY:
--- a/src/context_state.rs
+++ b/src/context_state.rs
@@ -1,4 +1,7 @@
-use crate::rans::{PROB_BITS, ONE_PROB};
+use crate::{
    Config,
    rans::{ONE_PROB, PROB_BITS},
 };
 const INIT_PROB: u16 = 1 << (PROB_BITS - 1);
 const UPDATE_RATE: u32 = 4;
@@ -7,6 +10,8 @@ const UPDATE_ADD: u32 = 8;
 #[derive(Clone)]
 pub struct ContextState {
    contexts: Vec<u8>,
    invert_bit_encoding: bool,
    simplified_prob_update: bool,
 }
 pub struct Context<'a> {
@@ -15,9 +20,11 @@ pub struct Context<'a> {
 }
 impl ContextState {
-    pub fn new(size: usize) -> ContextState {
+    pub fn new(size: usize, config: &Config) -> ContextState {
        ContextState {
            contexts: vec![INIT_PROB as u8; size],
            invert_bit_encoding: config.invert_bit_encoding,
            simplified_prob_update: config.simplified_prob_update,
        }
    }
@@ -33,7 +40,16 @@ impl<'a> Context<'a> {
    pub fn update(&mut self, bit: bool) {
        let old = self.state.contexts[self.index];
-        self.state.contexts[self.index] = if bit {
+
        self.state.contexts[self.index] = if self.state.simplified_prob_update {
            let offset = if bit ^ self.state.invert_bit_encoding {
                ONE_PROB as i32 >> UPDATE_RATE
            } else {
                0
            };
            (offset + old as i32 - ((old as i32 + UPDATE_ADD as i32) >> UPDATE_RATE)) as u8
        } else if bit ^ self.state.invert_bit_encoding {
            old + ((ONE_PROB - old as u32 + UPDATE_ADD) >> UPDATE_RATE) as u8
        } else {
            old - ((old as u32 + UPDATE_ADD) >> UPDATE_RATE) as u8
--- a/src/greedy_packer.rs
+++ b/src/greedy_packer.rs
@@ -1,16 +1,16 @@
 use crate::lz;
 use crate::match_finder::MatchFinder;
 use crate::rans::RansCoder;
 use crate::ProgressCallback;
 use crate::{lz, Config};
 pub fn pack(
    data: &[u8],
-    use_bitstream: bool,
+    config: &Config,
    mut progress_callback: Option<ProgressCallback>,
 ) -> Vec<u8> {
    let mut match_finder = MatchFinder::new(data);
-    let mut rans_coder = RansCoder::new(use_bitstream);
+    let mut rans_coder = RansCoder::new(config);
-    let mut state = lz::CoderState::new();
+    let mut state = lz::CoderState::new(config);
    let mut pos = 0;
    while pos < data.len() {
@@ -19,15 +19,16 @@ pub fn pack(
        }
        let mut encoded_match = false;
        if let Some(m) = match_finder.matches(pos).next() {
-            let max_offset = 1 << (m.length * 3 - 1).min(31);
+            let max_offset = config.max_offset.min(1 << (m.length * 3 - 1).min(31));
            let offset = pos - m.pos;
-            if offset < max_offset {
+            if offset < max_offset && m.length >= config.min_length() {
                let length = m.length.min(config.max_length);
                lz::Op::Match {
                    offset: offset as u32,
-                    len: m.length as u32,
+                    len: length as u32,
                }
-                .encode(&mut rans_coder, &mut state);
+                .encode(&mut rans_coder, &mut state, config);
-                pos += m.length;
+                pos += length;
                encoded_match = true;
            }
        }
@@ -39,13 +40,14 @@ pub fn pack(
                    .iter()
                    .zip(data[(pos - offset)..].iter())
                    .take_while(|(a, b)| a == b)
-                    .count();
+                    .count()
-                if length > 0 {
+                    .min(config.max_length);
                if length >= config.min_length() {
                    lz::Op::Match {
                        offset: offset as u32,
                        len: length as u32,
                    }
-                    .encode(&mut rans_coder, &mut state);
+                    .encode(&mut rans_coder, &mut state, config);
                    pos += length;
                    encoded_match = true;
                }
@@ -53,11 +55,11 @@ pub fn pack(
        }
        if !encoded_match {
-            lz::Op::Literal(data[pos]).encode(&mut rans_coder, &mut state);
+            lz::Op::Literal(data[pos]).encode(&mut rans_coder, &mut state, config);
            pos += 1;
        }
    }
-    lz::encode_eof(&mut rans_coder, &mut state);
+    lz::encode_eof(&mut rans_coder, &mut state, config);
    rans_coder.finish()
 }
--- a/src/heatmap.rs
+++ b/src/heatmap.rs
@@ -0,0 +1,213 @@
 /// Heatmap information about a compressed block of data.
 ///
 /// For each byte in the uncompressed data, the heatmap provides two pieces of information:
 /// 1. whether this byte was encoded as a literal or as part of a match
 /// 2. how many (fractional) bits were spent on encoding this byte
 ///
 /// For the sake of the heatmap, the cost of literals are spread out across all matches
 /// that reference the literal.
 ///
 /// If the `terminal` feature is enabled, there is a function to write out the
 /// heatmap as a colored hexdump.
 pub struct Heatmap {
    // The uncompressed bytes, in the order they were added.
    data: Vec<u8>,
    // Cost in bits per byte; `finish` redistributes literal cost to referencing matches.
    cost: Vec<f32>,
    // Snapshot of `cost` taken by `finish` before the redistribution.
    raw_cost: Vec<f32>,
    // For each byte, the index of the literal it was (transitively) copied from;
    // a literal refers to itself.
    literal_index: Vec<usize>,
 }
 impl Heatmap {
    /// Creates an empty heatmap.
    pub(crate) fn new() -> Heatmap {
        Heatmap {
            data: Vec::new(),
            cost: Vec::new(),
            raw_cost: Vec::new(),
            literal_index: Vec::new(),
        }
    }
    /// Appends a literal `byte` which took `cost` bits to encode.
    pub(crate) fn add_literal(&mut self, byte: u8, cost: f32) {
        self.data.push(byte);
        self.cost.push(cost);
        // a literal is its own origin
        self.literal_index.push(self.literal_index.len());
    }
    /// Appends a match of `length` bytes copied from `offset` bytes back.
    /// The `cost` of the whole match is spread evenly across its bytes.
    ///
    /// Panics if `offset` is zero or larger than the number of bytes added so far.
    pub(crate) fn add_match(&mut self, offset: usize, length: usize, mut cost: f32) {
        cost /= length as f32;
        for _ in 0..length {
            // byte by byte, so that matches overlapping their own output work
            self.data.push(self.data[self.data.len() - offset]);
            self.literal_index
                .push(self.literal_index[self.literal_index.len() - offset]);
            self.cost.push(cost);
        }
    }
    /// Finalizes the heatmap: stores the raw costs and then moves part of each
    /// literal's cost to the match bytes that were copied from it.
    pub(crate) fn finish(&mut self) {
        self.raw_cost = self.cost.clone();
        // number of bytes (including the literal itself) originating from each literal
        let mut ref_count = vec![0usize; self.literal_index.len()];
        for &index in &self.literal_index {
            ref_count[index] += 1;
        }
        let mut shifted = vec![];
        for (&index, &cost) in self.literal_index.iter().zip(self.cost.iter()) {
            let delta = (self.cost[index] - cost) / ref_count[index] as f32;
            shifted.push(delta);
            // `index` never exceeds the current position, so `shifted[index]` exists
            shifted[index] -= delta;
        }
        for (cost, delta) in self.cost.iter_mut().zip(shifted) {
            *cost += delta;
        }
    }
    /// Reverses the heatmap, so that index `i` afterwards describes the byte that
    /// was at index `len() - 1 - i` before.
    pub fn reverse(&mut self) {
        self.data.reverse();
        self.cost.reverse();
        // keep the raw costs aligned with the reversed data as well
        self.raw_cost.reverse();
        self.literal_index.reverse();
        // the stored indices still refer to the old order: mirror them, too
        let last = self.literal_index.len().saturating_sub(1);
        for index in self.literal_index.iter_mut() {
            *index = last - *index;
        }
    }
    /// The number of (uncompressed) bytes of data in this heatmap
    pub fn len(&self) -> usize {
        self.cost.len()
    }
    /// Returns whether the heatmap data is empty
    pub fn is_empty(&self) -> bool {
        self.cost.is_empty()
    }
    /// Returns whether the byte at `index` was encoded as a literal
    pub fn is_literal(&self, index: usize) -> bool {
        self.literal_index[index] == index
    }
    /// Returns the cost of encoding the byte at `index` in (fractional) bits.
    /// The cost of literal bytes is spread across the matches that reference it.
    /// See `raw_cost` for the raw encoding cost of each byte.
    pub fn cost(&self, index: usize) -> f32 {
        self.cost[index]
    }
    /// Returns the raw cost of encoding the byte at `index` in (fractional) bits
    pub fn raw_cost(&self, index: usize) -> f32 {
        self.raw_cost[index]
    }
    /// Returns the uncompressed data byte at `index`
    pub fn byte(&self, index: usize) -> u8 {
        self.data[index]
    }
    #[cfg(feature = "crossterm")]
    /// Print the heatmap as a colored hexdump
    pub fn print_as_hex(&self) -> std::io::Result<()> {
        self.print_as_hex_internal(false)
    }
    #[cfg(feature = "crossterm")]
    /// Print the heatmap as a colored hexdump, based on `raw_cost`.
    pub fn print_as_hex_raw_cost(&self) -> std::io::Result<()> {
        self.print_as_hex_internal(true)
    }
    #[cfg(feature = "crossterm")]
    /// Writes the hexdump to stdout: one row per line with the row offset, the
    /// bytes in hex and the bytes as ASCII. The background color encodes the cost
    /// (`raw_cost` if `report_raw_cost` is set), literals are underlined.
    fn print_as_hex_internal(&self, report_raw_cost: bool) -> std::io::Result<()> {
        use crossterm::{
            QueueableCommand,
            style::{Attribute, Color, Print, SetAttribute, SetBackgroundColor},
        };
        use std::io::{Write, stdout};
        // Queues the background color and underline attribute for the byte at `index`.
        fn set_color(
            mut out: impl QueueableCommand,
            heatmap: &Heatmap,
            index: usize,
            num_colors: u16,
            report_raw_cost: bool,
        ) -> std::io::Result<()> {
            let cost = if report_raw_cost {
                heatmap.raw_cost(index)
            } else {
                heatmap.cost(index)
            };
            if num_colors < 256 {
                // coarse scale: one color step per halving of the cost
                let colors = [
                    Color::Red,
                    Color::Yellow,
                    Color::Green,
                    Color::Cyan,
                    Color::Blue,
                    Color::DarkBlue,
                    Color::Black,
                ];
                let color_index = (3. - cost.log2())
                    .round()
                    .max(0.)
                    .min((colors.len() - 1) as f32) as usize;
                out.queue(SetBackgroundColor(colors[color_index]))?;
            } else {
                // finer scale using ANSI 256-color palette entries
                let colors = [
                    196, 166, 136, 106, 76, 46, 41, 36, 31, 26, 21, 20, 19, 18, 17, 16,
                ];
                let color_index = ((3. - cost.log2()) * 2.5)
                    .round()
                    .max(0.)
                    .min((colors.len() - 1) as f32) as usize;
                out.queue(SetBackgroundColor(Color::AnsiValue(colors[color_index])))?;
            }
            out.queue(SetAttribute(if heatmap.is_literal(index) {
                Attribute::Underlined
            } else {
                Attribute::NoUnderline
            }))?;
            Ok(())
        }
        let num_colors = crossterm::style::available_color_count();
        let term_width = crossterm::terminal::size()?.0.min(120) as usize;
        // 8 columns for the offset and gaps, 4 per byte (3 hex + 1 ascii). Always
        // print at least one byte per row: a very narrow (or zero-sized) terminal
        // would otherwise underflow the subtraction or make `step_by(0)` panic.
        let bytes_per_row = (term_width.saturating_sub(8) / 4).max(1);
        for row_start in (0..self.data.len()).step_by(bytes_per_row) {
            let row_range = row_start..self.data.len().min(row_start + bytes_per_row);
            let mut stdout = stdout();
            stdout.queue(Print(&format!("{:04x}  ", row_start)))?;
            for i in row_range.clone() {
                set_color(&mut stdout, self, i, num_colors, report_raw_cost)?;
                stdout.queue(Print(&format!("{:02x} ", self.data[i])))?;
            }
            // pad a short last row so that the ascii column stays aligned
            let num_spaces = 1 + (bytes_per_row - (row_range.end - row_range.start)) * 3;
            let gap = " ".repeat(num_spaces);
            stdout
                .queue(SetAttribute(Attribute::Reset))?
                .queue(Print(&gap))?;
            for i in row_range.clone() {
                set_color(&mut stdout, self, i, num_colors, report_raw_cost)?;
                let byte = self.data[i];
                if (32..127).contains(&byte) {
                    stdout.queue(Print(format!("{}", byte as char)))?;
                } else {
                    stdout.queue(Print("."))?;
                }
            }
            stdout
                .queue(SetAttribute(Attribute::Reset))?
                .queue(Print("\n"))?;
            stdout.flush()?;
        }
        Ok(())
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,27 +1,142 @@
 #![deny(missing_docs)]
 //! Compression and decompression of the upkr format and variants.
 //!
 //! Upkr is a compression format initially designed for the MicroW8 fantasy console,
 //! with design goals being a competitive compression ratio, reasonably fast
 //! decompression, low memory overhead and very small decompression code
 //! when hand-optimized in assembler. (An optimized DOS executable decompressor is <140 bytes.)
 mod context_state;
 mod greedy_packer;
 mod heatmap;
 mod lz;
 mod match_finder;
 mod parsing_packer;
 mod rans;
-pub use lz::unpack;
+pub use heatmap::Heatmap;
 pub use lz::{calculate_margin, create_heatmap, unpack, UnpackError};
 /// The type of a callback function to be given to the `pack` function.
 ///
 /// It will be periodically called with the number of bytes of the input already processed.
 pub type ProgressCallback<'a> = &'a mut dyn FnMut(usize);
 /// A configuration of which compression format variation to use.
 ///
 /// Use `Config::default()` for the standard upkr format.
 ///
 /// Compression format variants exist to help with micro-optimizations in uncompression
 /// code on specific platforms.
 #[derive(Debug)]
 pub struct Config {
    /// Shift in bits from a bitstream into the rANS state, rather than whole bytes.
    /// This decreases the size of the rANS state to 16 bits which is very useful on
    /// 8 bit platforms.
    pub use_bitstream: bool,
    /// The number of parity contexts (usually 1, 2 or 4). This can improve compression
    /// on data that consists of regular groups of 2 or 4 bytes. One example is 32bit ARM
    /// code, where each instruction is 4 bytes, so `parity_contexts = 4` improves compression
    /// quite a bit. Defaults to `1`.
    pub parity_contexts: usize,
    /// Invert the encoding of bits in the rANS coder. `bit = state_lo >= prob` instead of
    /// `bit = state_lo < prob`.
    pub invert_bit_encoding: bool,
    /// The boolean value which encodes a match. Defaults to `true`.
    pub is_match_bit: bool,
    /// The boolean value which encodes a new offset (rather than re-using the previous offset).
    /// Defaults to `true`.
    pub new_offset_bit: bool,
    /// The boolean value which encodes that there are more bits coming for length/offset values.
    /// Defaults to `true`.
    pub continue_value_bit: bool,
    /// Reverses the bits in the bitstream.
    pub bitstream_is_big_endian: bool,
    /// A slightly less accurate, but slightly simpler variation of the prob update in the
    /// rANS coder. Used for the z80 uncompressor.
    pub simplified_prob_update: bool,
    /// Disables support for re-using the last offset in the compression format.
    /// This might save a few bytes when working with very small data.
    pub no_repeated_offsets: bool,
    /// Standard upkr encodes the EOF marker in the offset. This encodes it in the match length
    /// instead.
    pub eof_in_length: bool,
    /// The maximum match offset value to encode when compressing.
    /// Defaults to `usize::MAX`, i.e. no limit.
    pub max_offset: usize,
    /// The maximum match length value to encode when compressing.
    /// Defaults to `usize::MAX`, i.e. no limit.
    pub max_length: usize,
 }
 // The default configuration selects the standard upkr format: byte-wise rANS
 // input, a single parity context, no inverted bits, repeated offsets enabled,
 // EOF encoded in the offset and no limit on match offsets or lengths.
 impl Default for Config {
    fn default() -> Config {
        Config {
            use_bitstream: false,
            parity_contexts: 1,
            invert_bit_encoding: false,
            is_match_bit: true,
            new_offset_bit: true,
            continue_value_bit: true,
            bitstream_is_big_endian: false,
            simplified_prob_update: false,
            no_repeated_offsets: false,
            eof_in_length: false,
            max_offset: usize::MAX,
            max_length: usize::MAX,
        }
    }
 }
 impl Config {
    /// The shortest match length the packers may emit for this format variant:
    /// `2` when `eof_in_length` is set, `1` otherwise.
    /// (Presumably length 1 is then reserved for the EOF marker -- confirm in `lz`.)
    fn min_length(&self) -> usize {
        1 + usize::from(self.eof_in_length)
    }
 }
 /// Compresses the given data.
 ///
 /// # Arguments
 /// - `data`: The data to compress
 /// - `level`: The compression level (0-9). Increasing the level by one roughly halves the
 ///   compression speed.
 /// - `config`: The compression format variant to use.
 /// - `progress_callback`: An optional callback which will periodically be called with
 ///   the number of bytes already processed.
 ///
 /// # Example
 /// ```rust
 /// let compressed_data = upkr::pack(b"Hello, World! Yellow world!", 0, &upkr::Config::default(), None);
 /// assert!(compressed_data.len() < 27);
 /// ```
 pub fn pack(
    data: &[u8],
    level: u8,
-    use_bitstream: bool,
+    config: &Config,
    progress_callback: Option<ProgressCallback>,
 ) -> Vec<u8> {
    if level == 0 {
-        greedy_packer::pack(data, use_bitstream, progress_callback)
+        greedy_packer::pack(data, config, progress_callback)
    } else {
-        parsing_packer::pack(data, level, use_bitstream, progress_callback)
+        parsing_packer::pack(data, level, config, progress_callback)
    }
 }
 /// Estimate the exact (fractional) size of upkr compressed data.
 ///
 /// Note that this currently does NOT work for the bitstream variant.
 pub fn compressed_size(mut data: &[u8]) -> f32 {
    let mut state = 0;
    while state < 4096 {
--- a/src/lz.rs
+++ b/src/lz.rs
@@ -1,5 +1,8 @@
 use crate::Config;
 use crate::context_state::ContextState;
 use crate::heatmap::Heatmap;
 use crate::rans::{EntropyCoder, RansDecoder};
 use thiserror::Error;
 #[derive(Copy, Clone, Debug)]
 pub enum Op {
@@ -8,42 +11,73 @@ pub enum Op {
 }
 impl Op {
-    pub fn encode(&self, coder: &mut dyn EntropyCoder, state: &mut CoderState) {
+    pub fn encode(&self, coder: &mut dyn EntropyCoder, state: &mut CoderState, config: &Config) {
-        match self {
+        let literal_base = state.pos % state.parity_contexts * 256;
-            &Op::Literal(lit) => {
+        match *self {
-                encode_bit(coder, state, 0, false);
+            Op::Literal(lit) => {
                encode_bit(coder, state, literal_base, !config.is_match_bit);
                let mut context_index = 1;
                for i in (0..8).rev() {
                    let bit = (lit >> i) & 1 != 0;
-                    encode_bit(coder, state, context_index, bit);
+                    encode_bit(coder, state, literal_base + context_index, bit);
                    context_index = (context_index << 1) | bit as usize;
                }
                state.prev_was_match = false;
                state.pos += 1;
            }
-            &Op::Match { offset, len } => {
+            Op::Match { offset, len } => {
-                encode_bit(coder, state, 0, true);
+                encode_bit(coder, state, literal_base, config.is_match_bit);
-                if !state.prev_was_match {
+                let mut new_offset = true;
-                    encode_bit(coder, state, 256, offset != state.last_offset);
+                if !state.prev_was_match && !config.no_repeated_offsets {
-                } else {
+                    new_offset = offset != state.last_offset;
-                    assert!(offset != state.last_offset);
+                    encode_bit(
                        coder,
                        state,
                        256 * state.parity_contexts,
                        new_offset == config.new_offset_bit,
                    );
                }
-                if offset != state.last_offset {
+                assert!(offset as usize <= config.max_offset);
-                    encode_length(coder, state, 257, offset + 1);
+                if new_offset {
                    encode_length(
                        coder,
                        state,
                        256 * state.parity_contexts + 1,
                        offset + if config.eof_in_length { 0 } else { 1 },
                        config,
                    );
                    state.last_offset = offset;
                }
-                encode_length(coder, state, 257 + 64, len);
+                assert!(len as usize >= config.min_length() && len as usize <= config.max_length);
                encode_length(coder, state, 256 * state.parity_contexts + 65, len, config);
                state.prev_was_match = true;
                state.pos += len as usize;
            }
        }
    }
 }
-pub fn encode_eof(coder: &mut dyn EntropyCoder, state: &mut CoderState) {
+pub fn encode_eof(coder: &mut dyn EntropyCoder, state: &mut CoderState, config: &Config) {
-    encode_bit(coder, state, 0, true);
+    encode_bit(
-    if !state.prev_was_match {
+        coder,
-        encode_bit(coder, state, 256, true);
+        state,
        state.pos % state.parity_contexts * 256,
        config.is_match_bit,
    );
    if !state.prev_was_match && !config.no_repeated_offsets {
        encode_bit(
            coder,
            state,
            256 * state.parity_contexts,
            config.new_offset_bit ^ config.eof_in_length,
        );
    }
    if !config.eof_in_length || state.prev_was_match || config.no_repeated_offsets {
        encode_length(coder, state, 256 * state.parity_contexts + 1, 1, config);
    }
    if config.eof_in_length {
        encode_length(coder, state, 256 * state.parity_contexts + 65, 1, config);
    }
    encode_length(coder, state, 257, 1);
 }
 fn encode_bit(
@@ -60,17 +94,18 @@ fn encode_length(
    state: &mut CoderState,
    context_start: usize,
    mut value: u32,
    config: &Config,
 ) {
    assert!(value >= 1);
    let mut context_index = context_start;
    while value >= 2 {
-        encode_bit(coder, state, context_index, true);
+        encode_bit(coder, state, context_index, config.continue_value_bit);
        encode_bit(coder, state, context_index + 1, value & 1 != 0);
        context_index += 2;
        value >>= 1;
    }
-    encode_bit(coder, state, context_index, false);
+    encode_bit(coder, state, context_index, !config.continue_value_bit);
 }
 #[derive(Clone)]
@@ -78,14 +113,18 @@ pub struct CoderState {
    contexts: ContextState,
    last_offset: u32,
    prev_was_match: bool,
    pos: usize,
    parity_contexts: usize,
 }
 impl CoderState {
-    pub fn new() -> CoderState {
+    pub fn new(config: &Config) -> CoderState {
        CoderState {
-            contexts: ContextState::new(1 + 255 + 1 + 64 + 64),
+            contexts: ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64, config),
            last_offset: 0,
            prev_was_match: false,
            pos: 0,
            parity_contexts: config.parity_contexts,
        }
    }
@@ -94,55 +133,201 @@ impl CoderState {
    }
 }
-pub fn unpack(packed_data: &[u8], use_bitstream: bool) -> Vec<u8> {
+/// The error type for the uncompressing related functions
-    let mut decoder = RansDecoder::new(packed_data, use_bitstream);
+#[derive(Error, Debug)]
-    let mut contexts = ContextState::new(1 + 255 + 1 + 64 + 64);
+pub enum UnpackError {
    /// a match offset pointing beyond the start of the unpacked data was encountered
    #[error("match offset out of range: {offset} > {position}")]
    OffsetOutOfRange {
        /// the match offset
        offset: usize,
        /// the current position in the uncompressed stream
        position: usize,
    },
    /// The passed size limit was exceeded
    #[error("Unpacked data over size limit: {size} > {limit}")]
    OverSize {
        /// the size of the uncompressed data
        size: usize,
        /// the size limit passed into the function
        limit: usize,
    },
    /// The end of the packed data was reached without an encoded EOF marker
    #[error("Unexpected end of input data")]
    UnexpectedEOF {
        #[from]
        /// the underlying EOF error in the rANS decoder
        source: crate::rans::UnexpectedEOF,
    },
    /// An offset or length value was found that exceeded 32bit
    #[error("Overflow while reading value")]
    ValueOverflow,
 }
 /// Uncompress a piece of compressed data
 ///
 /// Returns either the uncompressed data, or an `UnpackError`
 ///
 /// # Parameters
 ///
 /// - `packed_data`: the compressed data
 /// - `config`: the exact compression format config used to compress the data
 /// - `max_size`: the maximum size of uncompressed data to return. When this is exceeded,
 ///   `UnpackError::OverSize` is returned
 pub fn unpack(
    packed_data: &[u8],
    config: &Config,
    max_size: usize,
 ) -> Result<Vec<u8>, UnpackError> {
    let mut result = vec![];
-    let mut offset = 0;
+    let _ = unpack_internal(Some(&mut result), None, packed_data, config, max_size)?;
    Ok(result)
 }
 /// Computes the minimum margin required for overlapped (in-place) unpacking.
 ///
 /// The packed data is decoded without storing any output. The returned value is the
 /// minimum distance needed between the end of the compressed data and the end of the
 /// uncompressed data when both buffers overlap to save RAM.
 ///
 /// # Parameters
 ///
 /// - `packed_data`: the compressed data
 /// - `config`: the exact compression format config used to compress the data
 ///
 /// Returns an `UnpackError` if the packed data cannot be decoded.
 pub fn calculate_margin(packed_data: &[u8], config: &Config) -> Result<isize, UnpackError> {
    // No output is produced here, so no size limit is applied.
    let no_size_limit = usize::MAX;
    unpack_internal(None, None, packed_data, config, no_size_limit)
 }
 /// Builds a `Heatmap` by decoding a piece of compressed data.
 ///
 /// # Parameters
 ///
 /// - `packed_data`: the compressed data
 /// - `config`: the exact compression format config used to compress the data
 /// - `max_size`: the maximum size of the heatmap to return. When this is exceeded,
 ///   `UnpackError::OverSize` is returned
 pub fn create_heatmap(
    packed_data: &[u8],
    config: &Config,
    max_size: usize,
 ) -> Result<Heatmap, UnpackError> {
    let mut map = Heatmap::new();
    // The margin returned by the decoder is of no interest here; only the filled
    // heatmap is handed back.
    match unpack_internal(None, Some(&mut map), packed_data, config, max_size) {
        Ok(_margin) => Ok(map),
        Err(err) => Err(err),
    }
 }
 /// Shared decoding loop behind `unpack`, `calculate_margin` and `create_heatmap`.
 ///
 /// Decodes `packed_data` until the encoded end-of-stream marker is found.
 ///
 /// - `result`: when given, decoded bytes are appended to it, but it is never grown
 ///   beyond `max_size` bytes.
 /// - `heatmap`: when given, every decoded literal and match is recorded in it together
 ///   with its cost, and `finish()` is called on it once the stream has ended.
 /// - `config`: the format variant; it has to match the one used for packing.
 /// - `max_size`: limit for the total unpacked size.
 ///
 /// Returns the margin value that `calculate_margin` reports to its caller.
 ///
 /// # Errors
 ///
 /// - `UnpackError::OffsetOutOfRange` if a match refers to data before the start of the
 ///   output.
 /// - `UnpackError::ValueOverflow` if an offset or length needs 32 or more payload bits.
 /// - `UnpackError::OverSize` if the total unpacked size exceeds `max_size`. This is
 ///   only checked after the whole stream has been decoded.
 /// - Errors from the rANS decoder are propagated through `?`.
 fn unpack_internal(
    mut result: Option<&mut Vec<u8>>,
    mut heatmap: Option<&mut Heatmap>,
    packed_data: &[u8],
    config: &Config,
    max_size: usize,
 ) -> Result<isize, UnpackError> {
    let mut decoder = RansDecoder::new(packed_data, config)?;
    // Context layout: 256 literal contexts per parity context, then one context for the
    // "new offset" flag, then 64 contexts for offset values and 64 for length values.
    let mut contexts = ContextState::new((1 + 255) * config.parity_contexts + 1 + 64 + 64, config);
    // No offset has been decoded yet. usize::MAX can never pass the `offset > position`
    // range check below, so a repeated-offset match at the very start is rejected.
    let mut offset = usize::MAX;
    // Number of unpacked bytes so far; keeps counting even when `result` is capped.
    let mut position = 0usize;
    let mut prev_was_match = false;
    let mut margin = 0isize;
    /// Decodes one variable-length value starting at `context_index`.
    ///
    /// While the continue flag equals `config.continue_value_bit`, one payload bit is
    /// read (least significant bit first). An implicit leading 1 bit is added on top,
    /// so the smallest possible value is 1.
    fn decode_length(
        decoder: &mut RansDecoder,
        contexts: &mut ContextState,
        mut context_index: usize,
-    ) -> usize {
+        config: &Config,
    ) -> Result<usize, UnpackError> {
        let mut length = 0;
        let mut bit_pos = 0;
-        while decoder.decode_with_context(&mut contexts.context_mut(context_index)) {
+        while decoder.decode_with_context(&mut contexts.context_mut(context_index))?
-            length |= (decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))
+            == config.continue_value_bit
        {
            length |= (decoder.decode_with_context(&mut contexts.context_mut(context_index + 1))?
                as usize)
                << bit_pos;
            bit_pos += 1;
            // Reject values that would need 32 or more payload bits.
            if bit_pos >= 32 {
                return Err(UnpackError::ValueOverflow);
            }
            // Each payload bit uses its own pair of contexts (continue flag + value bit).
            context_index += 2;
        }
-        length | (1 << bit_pos)
+        Ok(length | (1 << bit_pos))
    }
    loop {
        // Per op: snapshot the decoder (used below to compute the op's cost for the
        // heatmap), track the largest lead of the output position over the input
        // position, and select the literal context bank from the output position.
-        if decoder.decode_with_context(&mut contexts.context_mut(0)) {
+        let prev_decoder = decoder.clone();
-            if prev_was_match || decoder.decode_with_context(&mut contexts.context_mut(256)) {
+        margin = margin.max(position as isize - decoder.pos() as isize);
-                offset = decode_length(&mut decoder, &mut contexts, 257) - 1;
+        let literal_base = position % config.parity_contexts * 256;
        if decoder.decode_with_context(&mut contexts.context_mut(literal_base))?
            == config.is_match_bit
        {
            // A new offset is read unconditionally when repeated offsets are disabled
            // or directly after a match; otherwise a flag bit decides.
            if config.no_repeated_offsets
                || prev_was_match
                || decoder
                    .decode_with_context(&mut contexts.context_mut(256 * config.parity_contexts))?
                    == config.new_offset_bit
            {
                offset = decode_length(
                    &mut decoder,
                    &mut contexts,
                    256 * config.parity_contexts + 1,
                    config,
                )? - if config.eof_in_length { 0 } else { 1 };
                // Without `eof_in_length`, an encoded offset value of 1 (0 after the
                // adjustment above) is the end-of-stream marker.
                if offset == 0 {
                    break;
                }
            }
-            let length = decode_length(&mut decoder, &mut contexts, 257 + 64);
+            let length = decode_length(
-            for _ in 0..length {
+                &mut decoder,
-                result.push(result[result.len() - offset]);
+                &mut contexts,
                256 * config.parity_contexts + 65,
                config,
            )?;
            // With `eof_in_length`, a length of 1 is the end-of-stream marker.
            if config.eof_in_length && length == 1 {
                break;
            }
            if offset > position {
                return Err(UnpackError::OffsetOutOfRange { offset, position });
            }
            if let Some(ref mut heatmap) = heatmap {
                heatmap.add_match(offset, length, decoder.cost(&prev_decoder));
            }
            if let Some(ref mut result) = result {
                // Copy one byte at a time so that a match may read bytes it has just
                // written (offset < length). Stop storing once `max_size` is reached.
                for _ in 0..length {
                    if result.len() < max_size {
                        result.push(result[result.len() - offset]);
                    } else {
                        break;
                    }
                }
            }
            position += length;
            prev_was_match = true;
        } else {
            // Literal: 8 bits, most significant first. The bits decoded so far select
            // the context for the next bit.
            let mut context_index = 1;
            let mut byte = 0;
            for i in (0..8).rev() {
-                let bit = decoder.decode_with_context(&mut contexts.context_mut(context_index));
+                let bit = decoder
                    .decode_with_context(&mut contexts.context_mut(literal_base + context_index))?;
                context_index = (context_index << 1) | bit as usize;
                byte |= (bit as u8) << i;
            }
            if let Some(ref mut heatmap) = heatmap {
                heatmap.add_literal(byte, decoder.cost(&prev_decoder));
            }
            if let Some(ref mut result) = result {
                if result.len() < max_size {
                    result.push(byte);
                }
            }
            position += 1;
            prev_was_match = false;
        }
    }
-    result
+    if let Some(heatmap) = heatmap {
        heatmap.finish();
    }
    // NOTE(review): the size limit is enforced only here, after the whole stream has
    // been decoded; the heatmap is not capped while decoding.
    if position > max_size {
        return Err(UnpackError::OverSize {
            size: position,
            limit: max_size,
        });
    }
    Ok(margin + decoder.pos() as isize - position as isize)
 }
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,64 +1,302 @@
-use anyhow::{bail, Result};
+use anyhow::Result;
 use std::ffi::OsStr;
 use std::io::prelude::*;
 use std::process;
 use std::{fs::File, path::PathBuf};
 fn main() -> Result<()> {
-    let mut args = pico_args::Arguments::from_env();
+    let mut config = upkr::Config::default();
    let mut reverse = false;
    let mut unpack = false;
    let mut calculate_margin = false;
    let mut create_heatmap = false;
    let mut report_raw_cost = false;
    #[allow(unused_mut)]
    let mut do_hexdump = false;
    let mut level = 2;
    let mut infile: Option<PathBuf> = None;
    let mut outfile: Option<PathBuf> = None;
    let mut max_unpacked_size = 512 * 1024 * 1024;
-    match args.subcommand()?.as_ref().map(|s| s.as_str()) {
+    let mut parser = lexopt::Parser::from_env();
-        None => print_help(),
+    while let Some(arg) = parser.next()? {
-        Some("pack") => {
+        use lexopt::prelude::*;
-            let level = args.opt_value_from_str(["-l", "--level"])?.unwrap_or(2u8);
+        match arg {
-            let use_bitstream = args.contains(["-b", "--bitstream"]);
+            Short('b') | Long("bitstream") => config.use_bitstream = true,
            Short('p') | Long("parity") => config.parity_contexts = parser.value()?.parse()?,
            Short('r') | Long("reverse") => reverse = true,
            Long("invert-is-match-bit") => config.is_match_bit = false,
            Long("invert-new-offset-bit") => config.new_offset_bit = false,
            Long("invert-continue-value-bit") => config.continue_value_bit = false,
            Long("invert-bit-encoding") => config.invert_bit_encoding = true,
            Long("simplified-prob-update") => config.simplified_prob_update = true,
            Long("big-endian-bitstream") => {
                config.use_bitstream = true;
                config.bitstream_is_big_endian = true;
            }
            Long("no-repeated-offsets") => config.no_repeated_offsets = true,
            Long("eof-in-length") => config.eof_in_length = true,
-            let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
+            Long("max-offset") => config.max_offset = parser.value()?.parse()?,
-            let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
+            Long("max-length") => config.max_length = parser.value()?.parse()?,
-            let mut data = vec![];
+            Long("z80") => {
-            File::open(infile)?.read_to_end(&mut data)?;
+                config.use_bitstream = true;
                config.bitstream_is_big_endian = true;
                config.invert_bit_encoding = true;
                config.simplified_prob_update = true;
                level = 9;
            }
            Long("x86") => {
                config.use_bitstream = true;
                config.continue_value_bit = false;
                config.is_match_bit = false;
                config.new_offset_bit = false;
            }
            Long("x86b") => {
                config.use_bitstream = true;
                config.continue_value_bit = false;
                config.no_repeated_offsets = true;
                level = 9;
            }
-            let mut pb = pbr::ProgressBar::new(data.len() as u64);
+            Short('u') | Long("unpack") | Short('d') | Long("decompress") => unpack = true,
            Long("margin") => calculate_margin = true,
            Long("heatmap") => create_heatmap = true,
            Long("raw-cost") => report_raw_cost = true,
            #[cfg(feature = "crossterm")]
            Long("hexdump") => do_hexdump = true,
            Short('l') | Long("level") => level = parser.value()?.parse()?,
            Short(n) if n.is_ascii_digit() => level = n as u8 - b'0',
            Short('h') | Long("help") => print_help(0),
            Long("version") => {
                println!("{}", env!("CARGO_PKG_VERSION"));
                process::exit(0);
            }
            Long("max-unpacked-size") => max_unpacked_size = parser.value()?.parse()?,
            Value(val) if infile.is_none() => infile = Some(val.into()),
            Value(val) if outfile.is_none() => outfile = Some(val.into()),
            _ => return Err(arg.unexpected().into()),
        }
    }
    let infile = IoTarget::from_filename(infile);
    let outfile = |tpe: OutFileType| infile.output(tpe, &outfile);
    if config.parity_contexts != 1 && config.parity_contexts != 2 && config.parity_contexts != 4 {
        eprintln!("--parity has to be 1, 2, or 4");
        process::exit(1);
    }
    if !unpack && !calculate_margin && !create_heatmap {
        let mut data = infile.read()?;
        if reverse {
            data.reverse();
        }
        #[cfg(feature = "terminal")]
        let mut packed_data = {
            let mut pb = pbr::ProgressBar::on(std::io::stderr(), data.len() as u64);
            pb.set_units(pbr::Units::Bytes);
            let packed_data = upkr::pack(
                &data,
                level,
-                use_bitstream,
+                &config,
                Some(&mut |pos| {
                    pb.set(pos as u64);
                }),
            );
            pb.finish();
            eprintln!();
            packed_data
        };
        #[cfg(not(feature = "terminal"))]
        let mut packed_data = upkr::pack(&data, level, &config, None);
-            println!(
+        if reverse {
            packed_data.reverse();
        }
        eprintln!(
            "Compressed {} bytes to {} bytes ({}%)",
            data.len(),
            packed_data.len(),
            packed_data.len() as f32 * 100. / data.len() as f32
        );
-            File::create(outfile)?.write_all(&packed_data)?;
+        outfile(OutFileType::Packed).write(&packed_data)?;
    } else {
        let mut data = infile.read()?;
        if reverse {
            data.reverse();
        }
-        Some("unpack") => {
+        if unpack {
-            let use_bitstream = args.contains(["-b", "--bitstream"]);
+            let mut unpacked_data = upkr::unpack(&data, &config, max_unpacked_size)?;
-
+            if reverse {
-            let infile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
+                unpacked_data.reverse();
            let outfile = args.free_from_os_str::<PathBuf, bool>(|s| Ok(s.into()))?;
            let mut data = vec![];
            File::open(infile)?.read_to_end(&mut data)?;
            let packed_data = upkr::unpack(&data, use_bitstream);
            File::create(outfile)?.write_all(&packed_data)?;
            }
-        Some(other) => {
+            outfile(OutFileType::Unpacked).write(&unpacked_data)?;
-            bail!("Unknown subcommand '{}'", other);
+        }
        if create_heatmap {
            let mut heatmap = upkr::create_heatmap(&data, &config, max_unpacked_size)?;
            if reverse {
                heatmap.reverse();
            }
            match do_hexdump {
                #[cfg(feature = "crossterm")]
                true => {
                    if report_raw_cost {
                        heatmap.print_as_hex_raw_cost()?
                    } else {
                        heatmap.print_as_hex()?
                    }
                }
                _ => {
                    let mut heatmap_bin = Vec::with_capacity(heatmap.len());
                    for i in 0..heatmap.len() {
                        let cost = if report_raw_cost {
                            heatmap.raw_cost(i)
                        } else {
                            heatmap.cost(i)
                        };
                        let cost = (cost.log2() * 8. + 64.).round().clamp(0., 127.) as u8;
                        heatmap_bin.push((cost << 1) | heatmap.is_literal(i) as u8);
                    }
                    outfile(OutFileType::Heatmap).write(&heatmap_bin)?;
                }
            }
        }
        if calculate_margin {
            println!("{}", upkr::calculate_margin(&data, &config)?);
        }
    }
    Ok(())
 }
-fn print_help() {
+enum OutFileType {
-    eprintln!("Usage:");
+    Packed,
-    eprintln!("  upkr pack [-l level(0-9)] <infile> <outfile>");
+    Unpacked,
-    eprintln!("  upkr unpack <infile> <outfile>");
+    Heatmap,
-    std::process::exit(1);
+}
 /// Source or destination of the data processed by the command line tool.
 enum IoTarget {
    /// Standard input when reading, standard output when writing.
    StdInOut,
    /// The file at the given path.
    File(PathBuf),
 }
 impl IoTarget {
    /// Maps an optional file name to a target: a missing name or `-` selects
    /// stdin/stdout, anything else is treated as a file path.
    fn from_filename(filename: Option<PathBuf>) -> IoTarget {
        match filename {
            Some(path) if path.as_os_str() != "-" => IoTarget::File(path),
            _ => IoTarget::StdInOut,
        }
    }
    /// Reads the complete contents of the target into memory.
    fn read(&self) -> Result<Vec<u8>> {
        let mut contents = Vec::new();
        match self {
            IoTarget::StdInOut => {
                std::io::stdin().read_to_end(&mut contents)?;
            }
            IoTarget::File(path) => {
                File::open(path)?.read_to_end(&mut contents)?;
            }
        }
        Ok(contents)
    }
    /// Writes `data` to the target; a file target is created or truncated first.
    fn write(&self, data: &[u8]) -> Result<()> {
        match self {
            IoTarget::StdInOut => {
                std::io::stdout().write_all(data)?;
            }
            IoTarget::File(path) => {
                File::create(path)?.write_all(data)?;
            }
        }
        Ok(())
    }
    /// Determines the output target that belongs to this input target.
    ///
    /// An explicitly given `outname` always wins (with `-` meaning stdout). Otherwise
    /// stdin maps to stdout, and an input file maps to a derived file name:
    /// - `Packed`: `.upk` is appended to the file name,
    /// - `Unpacked`: a `upk` extension is removed, any other extension is replaced by
    ///   `bin`,
    /// - `Heatmap`: the extension is replaced by `heatmap`.
    fn output(&self, tpe: OutFileType, outname: &Option<PathBuf>) -> IoTarget {
        if let Some(explicit) = outname {
            return IoTarget::from_filename(Some(explicit.clone()));
        }
        let source = match self {
            IoTarget::StdInOut => return IoTarget::StdInOut,
            IoTarget::File(path) => path,
        };
        let mut derived = source.clone();
        match tpe {
            OutFileType::Packed => {
                let mut packed_name = derived
                    .file_name()
                    .map(OsStr::to_os_string)
                    .unwrap_or_default();
                packed_name.push(".upk");
                derived.set_file_name(packed_name);
            }
            OutFileType::Unpacked => {
                let has_upk_ext = derived.extension() == Some(OsStr::new("upk"));
                derived.set_extension(if has_upk_ext { "" } else { "bin" });
            }
            OutFileType::Heatmap => {
                derived.set_extension("heatmap");
            }
        }
        IoTarget::File(derived)
    }
 }
 /// Prints the command line usage to stderr and terminates the process.
 ///
 /// `exit_code` is passed to `process::exit`, so this function never returns.
 ///
 /// The text lists every option accepted by the argument parser in `main`, including
 /// `-u/--unpack`, `--max-unpacked-size`, `-h/--help` and `--version`.
 fn print_help(exit_code: i32) -> ! {
    eprintln!("Usage:");
    eprintln!("  upkr [-l level(0-9)] [config options] <infile> [<outfile>]");
    eprintln!("  upkr -u [config options] <infile> [<outfile>]");
    eprintln!("  upkr --heatmap [config options] <infile> [<outfile>]");
    eprintln!("  upkr --margin [config options] <infile>");
    eprintln!();
    eprintln!(" -l, --level N       compression level 0-9");
    eprintln!(" -0, ..., -9         short form for setting compression level");
    eprintln!(" -u, --unpack        decompress infile");
    eprintln!(" -d, --decompress    decompress infile");
    eprintln!(" --max-unpacked-size N   fail if the unpacked data or heatmap exceeds N bytes");
    eprintln!(" --heatmap           calculate heatmap from compressed file");
    eprintln!("   --raw-cost        report raw cost of literals in heatmap");
    // Only advertised when the option is actually compiled in.
    #[cfg(feature = "crossterm")]
    eprintln!("   --hexdump         print heatmap as colored hexdump");
    eprintln!(" --margin            calculate margin for overlapped unpacking of a packed file");
    eprintln!(" -h, --help          print this help and exit");
    eprintln!(" --version           print the version and exit");
    eprintln!();
    eprintln!("When no infile is given, or the infile is '-', read from stdin.");
    eprintln!(
        "When no outfile is given and reading from stdin, or when outfile is '-', write to stdout."
    );
    eprintln!();
    eprintln!("Version: {}", env!("CARGO_PKG_VERSION"));
    eprintln!();
    eprintln!("Config presets for specific unpackers:");
    eprintln!(
        " --z80               --big-endian-bitstream --invert-bit-encoding --simplified-prob-update -9"
    );
    eprintln!(
        " --x86               --bitstream --invert-is-match-bit --invert-continue-value-bit --invert-new-offset-bit"
    );
    eprintln!(
        " --x86b              --bitstream --invert-continue-value-bit --no-repeated-offsets -9"
    );
    eprintln!();
    eprintln!("Config options (need to match when packing/unpacking):");
    eprintln!(" -b, --bitstream     bitstream mode");
    eprintln!(" -p, --parity N      use N (2/4) parity contexts");
    eprintln!(" -r, --reverse       reverse input & output");
    eprintln!();
    eprintln!("Config options to tailor output to specific optimized unpackers:");
    eprintln!(" --invert-is-match-bit");
    eprintln!(" --invert-new-offset-bit");
    eprintln!(" --invert-continue-value-bit");
    eprintln!(" --invert-bit-encoding");
    eprintln!(" --simplified-prob-update");
    eprintln!(" --big-endian-bitstream   (implies --bitstream)");
    eprintln!(" --no-repeated-offsets");
    eprintln!(" --eof-in-length");
    eprintln!(" --max-offset N");
    eprintln!(" --max-length N");
    process::exit(exit_code);
 }
--- a/src/parsing_packer.rs
+++ b/src/parsing_packer.rs
@@ -4,21 +4,26 @@ use std::rc::Rc;
 use crate::match_finder::MatchFinder;
 use crate::rans::{CostCounter, RansCoder};
-use crate::{lz, ProgressCallback};
+use crate::{ProgressCallback, lz};
-pub fn pack(data: &[u8], level: u8, use_bitstream: bool, progress_cb: Option<ProgressCallback>) -> Vec<u8> {
+pub fn pack(
-    let mut parse = parse(data, Config::from_level(level), progress_cb);
+    data: &[u8],
    level: u8,
    config: &crate::Config,
    progress_cb: Option<ProgressCallback>,
 ) -> Vec<u8> {
    let mut parse = parse(data, Config::from_level(level), config, progress_cb);
    let mut ops = vec![];
    while let Some(link) = parse {
        ops.push(link.op);
        parse = link.prev.clone();
    }
-    let mut state = lz::CoderState::new();
+    let mut state = lz::CoderState::new(config);
-    let mut coder = RansCoder::new(use_bitstream);
+    let mut coder = RansCoder::new(config);
    for op in ops.into_iter().rev() {
-        op.encode(&mut coder, &mut state);
+        op.encode(&mut coder, &mut state, config);
    }
-    lz::encode_eof(&mut coder, &mut state);
+    lz::encode_eof(&mut coder, &mut state, config);
    coder.finish()
 }
@@ -38,6 +43,7 @@ type Arrivals = HashMap<usize, Vec<Arrival>>;
 fn parse(
    data: &[u8],
    config: Config,
    encoding_config: &crate::Config,
    mut progress_cb: Option<ProgressCallback>,
 ) -> Option<Rc<Parse>> {
    let mut match_finder = MatchFinder::new(data)
@@ -62,7 +68,7 @@ fn parse(
        });
        let mut seen_offsets = HashSet::new();
        let mut remaining = Vec::new();
-        for arr in mem::replace(vec, Vec::new()) {
+        for arr in mem::take(vec) {
            if seen_offsets.insert(arr.state.last_offset()) {
                if vec.len() < max_arrivals {
                    vec.push(arr);
@@ -99,17 +105,22 @@ fn parse(
        cost_counter: &mut CostCounter,
        pos: usize,
        offset: usize,
-        length: usize,
+        mut length: usize,
        arrival: &Arrival,
        max_arrivals: usize,
        config: &crate::Config,
    ) {
        if length < config.min_length() {
            return;
        }
        length = length.min(config.max_length);
        cost_counter.reset();
        let mut state = arrival.state.clone();
        let op = lz::Op::Match {
            offset: offset as u32,
            len: length as u32,
        };
-        op.encode(cost_counter, &mut state);
+        op.encode(cost_counter, &mut state, config);
        add_arrival(
            arrivals,
            pos + length,
@@ -129,13 +140,13 @@ fn parse(
        0,
        Arrival {
            parse: None,
-            state: lz::CoderState::new(),
+            state: lz::CoderState::new(encoding_config),
            cost: 0.0,
        },
        max_arrivals,
    );
-    let cost_counter = &mut CostCounter::new();
+    let cost_counter = &mut CostCounter::new(encoding_config);
    let mut best_per_offset = HashMap::new();
    for pos in 0..data.len() {
        let match_length = |offset: usize| {
@@ -176,6 +187,7 @@ fn parse(
            for m in match_finder.matches(pos) {
                closest_match = Some(closest_match.unwrap_or(0).max(m.pos));
                let offset = pos - m.pos;
                if offset <= encoding_config.max_offset {
                    found_last_offset |= offset as u32 == arrival.state.last_offset();
                    add_match(
                        &mut arrivals,
@@ -185,11 +197,13 @@ fn parse(
                        m.length,
                        &arrival,
                        max_arrivals,
                        encoding_config,
                    );
                    if m.length >= config.greedy_size {
                        break 'arrival_loop;
                    }
                }
            }
            let mut near_matches_left = config.num_near_matches;
            let mut match_pos = last_seen[data[pos] as usize];
@@ -198,6 +212,9 @@ fn parse(
                && closest_match.iter().all(|p| *p < match_pos)
            {
                let offset = pos - match_pos;
                if offset > encoding_config.max_offset {
                    break;
                }
                let length = match_length(offset);
                assert!(length > 0);
                add_match(
@@ -208,6 +225,7 @@ fn parse(
                    length,
                    &arrival,
                    max_arrivals,
                    encoding_config,
                );
                found_last_offset |= offset as u32 == arrival.state.last_offset();
                if offset < near_matches.len() {
@@ -228,6 +246,7 @@ fn parse(
                        length,
                        &arrival,
                        max_arrivals,
                        encoding_config,
                    );
                }
            }
@@ -235,7 +254,7 @@ fn parse(
            cost_counter.reset();
            let mut state = arrival.state;
            let op = lz::Op::Literal(data[pos]);
-            op.encode(cost_counter, &mut state);
+            op.encode(cost_counter, &mut state, encoding_config);
            add_arrival(
                &mut arrivals,
                pos + 1,
--- a/src/rans.rs
+++ b/src/rans.rs
@@ -1,4 +1,5 @@
-use crate::context_state::Context;
+use crate::{context_state::Context, Config};
 use thiserror::Error;
 pub const PROB_BITS: u32 = 8;
 pub const ONE_PROB: u32 = 1 << PROB_BITS;
@@ -15,20 +16,25 @@ pub trait EntropyCoder {
 pub struct RansCoder {
    bits: Vec<u16>,
    use_bitstream: bool,
    bitstream_is_big_endian: bool,
    invert_bit_encoding: bool,
 }
 impl EntropyCoder for RansCoder {
    fn encode_bit(&mut self, bit: bool, prob: u16) {
        assert!(prob < 32768);
-        self.bits.push(prob | ((bit as u16) << 15));
+        self.bits
            .push(prob | (((bit ^ self.invert_bit_encoding) as u16) << 15));
    }
 }
 impl RansCoder {
-    pub fn new(use_bitstream: bool) -> RansCoder {
+    pub fn new(config: &Config) -> RansCoder {
        RansCoder {
            bits: Vec::new(),
-            use_bitstream,
+            use_bitstream: config.use_bitstream,
            bitstream_is_big_endian: config.bitstream_is_big_endian,
            invert_bit_encoding: config.invert_bit_encoding,
        }
    }
@@ -38,8 +44,20 @@ impl RansCoder {
        let mut state = 1 << l_bits;
        let mut byte = 0u8;
-        let mut bit = 8;
+        let mut bit = if self.bitstream_is_big_endian { 0 } else { 8 };
        let mut flush_state: Box<dyn FnMut(&mut u32)> = if self.use_bitstream {
            if self.bitstream_is_big_endian {
                Box::new(|state: &mut u32| {
                    byte |= ((*state & 1) as u8) << bit;
                    bit += 1;
                    if bit == 8 {
                        buffer.push(byte);
                        byte = 0;
                        bit = 0;
                    }
                    *state >>= 1;
                })
            } else {
                Box::new(|state: &mut u32| {
                    bit -= 1;
                    byte |= ((*state & 1) as u8) << bit;
@@ -50,6 +68,7 @@ impl RansCoder {
                    }
                    *state >>= 1;
                })
            }
        } else {
            Box::new(|state: &mut u32| {
                buffer.push(*state as u8);
@@ -91,10 +110,11 @@ impl RansCoder {
 pub struct CostCounter {
    cost: f64,
    log2_table: Vec<f64>,
    invert_bit_encoding: bool,
 }
 impl CostCounter {
-    pub fn new() -> CostCounter {
+    pub fn new(config: &Config) -> CostCounter {
        let log2_table = (0..ONE_PROB)
            .map(|prob| {
                let inv_prob = ONE_PROB as f64 / prob as f64;
@@ -104,6 +124,7 @@ impl CostCounter {
        CostCounter {
            cost: 0.0,
            log2_table,
            invert_bit_encoding: config.invert_bit_encoding,
        }
    }
@@ -118,7 +139,7 @@ impl CostCounter {
 impl EntropyCoder for CostCounter {
    fn encode_bit(&mut self, bit: bool, prob: u16) {
-        let prob = if bit {
+        let prob = if bit ^ self.invert_bit_encoding {
            prob as u32
        } else {
            ONE_PROB - prob as u32
@@ -127,52 +148,86 @@ impl EntropyCoder for CostCounter {
    }
 }
 #[derive(Clone)]
 pub struct RansDecoder<'a> {
    data: &'a [u8],
    pos: usize,
    state: u32,
    use_bitstream: bool,
    byte: u8,
    bits_left: u8,
    invert_bit_encoding: bool,
    bitstream_is_big_endian: bool,
 }
 const PROB_MASK: u32 = ONE_PROB - 1;
 #[derive(Debug, Error)]
 #[error("Unexpected end of input")]
 pub struct UnexpectedEOF;
 impl<'a> RansDecoder<'a> {
-    pub fn new(data: &'a [u8], use_bitstream: bool) -> RansDecoder<'a> {
+    pub fn new(data: &'a [u8], config: &Config) -> Result<RansDecoder<'a>, UnexpectedEOF> {
-        RansDecoder {
+        let mut decoder = RansDecoder {
            data,
            pos: 0,
            state: 0,
-            use_bitstream,
+            use_bitstream: config.use_bitstream,
            byte: 0,
            bits_left: 0,
-        }
+            invert_bit_encoding: config.invert_bit_encoding,
            bitstream_is_big_endian: config.bitstream_is_big_endian,
        };
        decoder.refill()?;
        Ok(decoder)
    }
-    pub fn decode_with_context(&mut self, context: &mut Context) -> bool {
+    pub fn pos(&self) -> usize {
-        let bit = self.decode_bit(context.prob());
+        self.pos
    }
    pub fn decode_with_context(&mut self, context: &mut Context) -> Result<bool, UnexpectedEOF> {
        let bit = self.decode_bit(context.prob())?;
        context.update(bit);
-        bit
+        Ok(bit)
    }
-    pub fn decode_bit(&mut self, prob: u16) -> bool {
+    fn refill(&mut self) -> Result<(), UnexpectedEOF> {
        let prob = prob as u32;
        if self.use_bitstream {
            while self.state < 32768 {
                if self.bits_left == 0 {
-                    self.byte = self.data[0];
+                    if self.pos >= self.data.len() {
-                    self.data = &self.data[1..];
+                        return Err(UnexpectedEOF);
                    }
                    self.byte = self.data[self.pos];
                    self.pos += 1;
                    self.bits_left = 8;
                }
                if self.bitstream_is_big_endian {
                    self.state = (self.state << 1) | (self.byte >> 7) as u32;
                    self.byte <<= 1;
                } else {
                    self.state = (self.state << 1) | (self.byte & 1) as u32;
                    self.byte >>= 1;
                }
                self.bits_left -= 1;
            }
        } else {
            while self.state < 4096 {
-                self.state = (self.state << 8) | self.data[0] as u32;
+                if self.pos >= self.data.len() {
-                self.data = &self.data[1..];
+                    return Err(UnexpectedEOF);
                }
                self.state = (self.state << 8) | self.data[self.pos] as u32;
                self.pos += 1;
            }
        }
        Ok(())
    }
    pub fn decode_bit(&mut self, prob: u16) -> Result<bool, UnexpectedEOF> {
        self.refill()?;
        let prob = prob as u32;
        let bit = (self.state & PROB_MASK) < prob;
@@ -183,6 +238,11 @@ impl<'a> RansDecoder<'a> {
        };
        self.state = prob * (self.state >> PROB_BITS) + (self.state & PROB_MASK) - start;
-        bit
+        Ok(bit ^ self.invert_bit_encoding)
    }
    pub fn cost(&self, prev: &RansDecoder) -> f32 {
        f32::log2(prev.state as f32) - f32::log2(self.state as f32)
            + (self.pos - prev.pos) as f32 * 8.
    }
 }
--- a/z80_unpacker/.gitignore
+++ b/z80_unpacker/.gitignore
@@ -0,0 +1,3 @@
 *.bin
 *.tap
 *.lst
--- a/z80_unpacker/Makefile
+++ b/z80_unpacker/Makefile
@@ -0,0 +1,11 @@
 # `all` and `clean` name actions, not files: declare them phony so a stray file
 # called "all" or "clean" can never make make consider them up to date
 .PHONY: all clean
 # default goal: syntax-check build of the unpacker + the runnable slideshow snapshot
 all: unpack.bin example/example.sna
 # binary is positioned from ORG 0, not usable, just assembling to verify the syntax
 unpack.bin: unpack.asm
 	sjasmplus --msg=war --lst --lstlab=sort --raw=unpack.bin unpack.asm
 # the example INCLUDEs ../unpack.asm, hence the dependency on both sources
 example/example.sna: unpack.asm example/example.asm
 	cd example && sjasmplus --msg=war --lst --lstlab=sort example.asm
 # remove everything the two rules above generate
 clean:
 	$(RM) unpack.bin unpack.lst example/example.sna example/example.lst
--- a/z80_unpacker/example/example.asm
+++ b/z80_unpacker/example/example.asm
@@ -0,0 +1,100 @@
 ;; Example using upkr depacker for screens slideshow
    OPT --syntax=abf
    DEVICE ZXSPECTRUM48,$8FFF
    ORG     $9000
  ;; forward example data
 compressed_scr_files.fwd:               ; border color byte + upkr-packed .scr file
    DB      1
    INCBIN  "screens/Grongy - ZX Spectrum (2022).scr.upk"
    DB      7
    INCBIN  "screens/Schafft - Poison (2017).scr.upk"
    DB      0
    INCBIN  "screens/diver - Mercenary 4. The Heaven's Devil (2014) (Forever 2014 Olympic Edition, 1).scr.upk"
    DB      6
    INCBIN  "screens/diver - Back to Bjork (2015).scr.upk"
 .e:
  ;; backward example data (unpacker goes from the end of the data!)
 compressed_scr_files.rwd.e: EQU $-1     ; the final IX will point one byte ahead of "$" here
    INCBIN  "screens.reversed/diver - Back to Bjork (2015).scr.upk"
    DB      6
    INCBIN  "screens.reversed/diver - Mercenary 4. The Heaven's Devil (2014) (Forever 2014 Olympic Edition, 1).scr.upk"
    DB      0
    INCBIN  "screens.reversed/Schafft - Poison (2017).scr.upk"
    DB      7
    INCBIN  "screens.reversed/Grongy - ZX Spectrum (2022).scr.upk"
 compressed_scr_files.rwd:               ; border color byte + upkr-packed .scr file (backward)
    DB      1
 start:
    di
 ;     OPT --zxnext
 ;     nextreg 7,3                       ; ZX Next: switch to 28Mhz
  ;;; FORWARD packed/unpacked data demo
    ld      ix,compressed_scr_files.fwd
 .slideshow_loop.fwd:
  ; set BORDER for next image
    ld      a,(ix)
    inc     ix
    out     (254),a
  ; call unpack of next image directly into VRAM
    ld      de,$4000                    ; target VRAM
    exx
  ; IX = packed data, DE' = destination ($4000)
  ; returned IX will point right after the packed data
    call    fwd.upkr.unpack
  ; do some busy loop with CPU to delay between images
    call    delay
  ; check if all images were displayed, loop around from first one then
    ld      a,ixl
    cp      low compressed_scr_files.fwd.e
    jr      nz,.slideshow_loop.fwd
  ;;; BACKWARD packed/unpacked data demo
    ld      ix,compressed_scr_files.rwd
 .slideshow_loop.rwd:
  ; set BORDER for next image
    ld      a,(ix)
    dec     ix
    out     (254),a
  ; call unpack of next image directly into VRAM
    ld      de,$5AFF                    ; target VRAM
    exx
  ; IX = packed data, DE' = destination
  ; returned IX will point right ahead of the packed data
    call    rwd.upkr.unpack
  ; do some busy loop with CPU to delay between images
    call    delay
  ; check if all images were displayed, loop around from first one then
    ld      a,ixl
    cp      low compressed_scr_files.rwd.e
    jr      nz,.slideshow_loop.rwd
    jr      start
 ; Busy-wait used between slides; takes no input and returns nothing.
 ; Modifies BC and flags. IX and the word on top of the stack are swapped back and
 ; forth and end up unchanged, assuming the `.8` prefix below repeats the
 ; instruction 8 times (an even number of swaps) - NOTE(review): confirm against sjasmplus docs.
 delay:
    ld      bc,$AA00            ; B = $AA outer passes, C = 0 so the first inner pass runs 256 times
 .delay:
    .8 ex      (sp),ix
    dec     c
    jr      nz,.delay           ; inner loop: until C wraps back to zero
    djnz    .delay              ; outer loop: C is 0 again here, so each pass is another 256 inner rounds
    ret
  ; include the depacker library, optionally putting probs array buffer near end of RAM
    DEFINE  UPKR_PROBS_ORIGIN $FA00   ; if not defined, array will be put after unpack code
    MODULE fwd
        INCLUDE "../unpack.asm"
    ENDMODULE
    MODULE rwd
        DEFINE BACKWARDS_UNPACK         ; defined to build backwards unpack
                ; initial IX points at last byte of compressed data
                ; initial DE' points at last byte of unpacked data
        INCLUDE "../unpack.asm"
    ENDMODULE
    SAVESNA "example.sna",start
--- a/z80_unpacker/example/example.sna
+++ b/z80_unpacker/example/example.sna
--- a/z80_unpacker/example/screens.reversed/Grongy
+++ b/z80_unpacker/example/screens.reversed/Grongy
--- a/z80_unpacker/example/screens.reversed/Schafft
+++ b/z80_unpacker/example/screens.reversed/Schafft
--- a/z80_unpacker/example/screens.reversed/diver
+++ b/z80_unpacker/example/screens.reversed/diver
--- a/z80_unpacker/example/screens.reversed/diver
+++ b/z80_unpacker/example/screens.reversed/diver
--- a/z80_unpacker/example/screens/Grongy
+++ b/z80_unpacker/example/screens/Grongy
--- a/z80_unpacker/example/screens/Grongy
+++ b/z80_unpacker/example/screens/Grongy
--- a/z80_unpacker/example/screens/Schafft
+++ b/z80_unpacker/example/screens/Schafft
--- a/z80_unpacker/example/screens/Schafft
+++ b/z80_unpacker/example/screens/Schafft
--- a/z80_unpacker/example/screens/diver
+++ b/z80_unpacker/example/screens/diver
--- a/z80_unpacker/example/screens/diver
+++ b/z80_unpacker/example/screens/diver
--- a/z80_unpacker/example/screens/diver
+++ b/z80_unpacker/example/screens/diver
--- a/z80_unpacker/example/screens/diver
+++ b/z80_unpacker/example/screens/diver
--- a/z80_unpacker/readme.txt
+++ b/z80_unpacker/readme.txt
@@ -0,0 +1,32 @@
 Z80 asm implementation of C unpacker, code-size focused (not performance).
 **ONLY BITSTREAM** variant is currently supported, make sure to use "-b" in packer.
 The project is expected to evolve further, including possible changes to the binary format; this is
 the initial version of the Z80 unpacker, written to explore if/how it works and how it can be improved.
 (Copy the full packer+depacker source into your project if you plan to use it, as future revisions
 may be incompatible with files you produce with the current version.)
 Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus
 Backward direction unpacker added as compile-time option, see example for both forward/backward
 depacker in action.
 The packed/unpacked data overlap has to be tested case by case: in the worst case the packed data
 may need even more than 7 bytes to unpack the final byte, but usually 1-4 bytes suffice.
 TODO:
 - build bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom was used to do initial tests)
 - maybe try to beat double-loop `decode_number` with different encoding format
 - (@ped7g) Z80N version of unpacker for ZX Next devs
 - (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? (and overall decide how to organise+merge this upstream into main repo)
 - (@exoticorn) add to packer output with possible packed/unpacked region overlap
 DONE:
 * review non-bitstream variant, if it's feasible to try to implement it with Z80
    - Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update
 * review first implementation to identify weak spots where the implementation can be shorter+faster
 with acceptable small changes to the format
    - Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change
    - Ped7g: the decode_number double-loop is surprisingly resilient, especially in terms of code size I failed to beat it, speed wise only negligible gains
--- a/z80_unpacker/unpack.asm
+++ b/z80_unpacker/unpack.asm
@@ -0,0 +1,381 @@
 ;; https://github.com/exoticorn/upkr/blob/z80/c_unpacker/unpack.c - original C implementation
 ;; C source in comments ahead of asm - the C macros are removed to keep only bitstream variant
 ;;
 ;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed")
 ;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus
 ;;
 ;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes),
 ;; otherwise it will be positioned after the unpacker code (256 aligned)
 ;;
 ;; public API:
 ;;
 ;;     upkr.unpack
 ;;         IN: IX = packed data, DE' (shadow DE) = destination
 ;;         OUT: IX = after packed data
 ;;         modifies: all registers except IY, requires 10 bytes of stack space
 ;;
 ;     DEFINE BACKWARDS_UNPACK         ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--)
            ; initial IX points at last byte of compressed data
            ; initial DE' points at last byte of unpacked data
 ;     DEFINE UPKR_UNPACK_SPEED        ; uncomment to get larger but faster unpack routine
 ; code size hint: if you put probs array just ahead of BASIC entry point, you will get BC
 ; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B)
    OPT push reset --syntax=abf
    MODULE upkr
 NUMBER_BITS     EQU     16+15       ; context-bits per offset/length (16+15 for 16bit offsets/pointers)
    ; numbers (offsets/lengths) are encoded like: 1a1b1c1d1e0 = 0000'0000'001e'dbca
 /*
 u8* upkr_data_ptr;
 u8 upkr_probs[1 + 255 + 1 + 2*32 + 2*32];
 u16 upkr_state;
 u8 upkr_current_byte;
 int upkr_bits_left;
 int upkr_unpack(void* destination, void* compressed_data) {
    upkr_data_ptr = (u8*)compressed_data;
    upkr_state = 0;
    upkr_bits_left = 0;
    for(int i = 0; i < sizeof(upkr_probs); ++i)
        upkr_probs[i] = 128;
    u8* write_ptr = (u8*)destination;
    int prev_was_match = 0;
    int offset = 0;
    for(;;) {
        if(upkr_decode_bit(0)) {
            if(prev_was_match || upkr_decode_bit(256)) {
                offset = upkr_decode_length(257) - 1;
                if(offset == 0) {
                    break;
                }
            }
            int length = upkr_decode_length(257 + 64);
            while(length--) {
                *write_ptr = write_ptr[-offset];
                ++write_ptr;
            }
            prev_was_match = 1;
        } else {
            int byte = 1;
            while(byte < 256) {
                int bit = upkr_decode_bit(byte);
                byte = (byte << 1) + bit;
            }
            *write_ptr++ = byte;
            prev_was_match = 0;
        }
    }
    return write_ptr - (u8*)destination;
 }
 */
 ; IN: IX = compressed_data, DE' = destination
 ; Decompresses until an offset of 0 is decoded; the `ret z` in .copy_chunk is the
 ; only exit of this routine. Builds either the forward or the backward variant
 ; depending on BACKWARDS_UNPACK (pointers are incremented resp. decremented).
 unpack:
  ; ** reset probs to 0x80, also reset HL (state) to zero, and set BC to probs+context 0
    ld      hl,probs.c>>1
    ld      bc,probs.e
    ld      a,$80
 .reset_probs:
    dec     bc
    ld      (bc),a              ; will overwrite one extra byte after the array because of odd length
    dec     bc
    ld      (bc),a
    dec     l
    jr      nz,.reset_probs
    exa
    ; BC = probs (context_index 0), state HL = 0, A' = 0x80 (no source bits left in upkr_current_byte)
  ; ** main loop to decompress data
    ; D = prev_was_match = uninitialised, literal is expected first => will reset D to "false"
    ; values for false/true of prev_was_match are: false = high(probs), true = 1 + high(probs)
 .decompress_data:
    ld      c,0
    call    decode_bit          ; if(upkr_decode_bit(0))
    jr      c,.copy_chunk
  ; * extract byte from compressed data (literal)
    inc     c                   ; C = byte = 1 (and also context_index)
 .decode_byte:
    call    decode_bit          ; bit = upkr_decode_bit(byte);
    rl      c                   ; byte = (byte << 1) + bit;
    jr      nc,.decode_byte     ; while(byte < 256)
    ld      a,c
    exx
    ld      (de),a              ; *write_ptr++ = byte;
    IFNDEF BACKWARDS_UNPACK : inc de : ELSE : dec de : ENDIF
    exx
    ld      d,b                 ; prev_was_match = false
    jr      .decompress_data
  ; * copy chunk of already decompressed data (match)
 .copy_chunk:
    ld      a,b
    inc     b                   ; context_index = 256
        ;             if(prev_was_match || upkr_decode_bit(256)) {
        ;                 offset = upkr_decode_length(257) - 1;
        ;                 if (0 == offset) break;
        ;             }
    cp      d                   ; CF = prev_was_match
    call    nc,decode_bit       ; if not prev_was_match, then upkr_decode_bit(256)
    jr      nc,.keep_offset     ; if neither, keep old offset
    call    decode_number       ; context_index is already 257-1 as needed by decode_number
    dec     de                  ; offset = upkr_decode_length(257) - 1;
    ld      a,d
    or      e
    ret     z                   ; if(offset == 0) break
    ; the offset is kept inside the code itself: this store targets the `.offset` label below,
    ; presumably the 16-bit immediate of the `ld bc,0` / `ld hl,0` instruction (self-modifying
    ; code via the `label+*:` form) - NOTE(review): confirm label semantics in sjasmplus docs
    ld      (.offset),de
 .keep_offset:
        ;             int length = upkr_decode_length(257 + 64);
        ;             while(length--) {
        ;                 *write_ptr = write_ptr[-offset];
        ;                 ++write_ptr;
        ;             }
        ;             prev_was_match = 1;
    ld      c,low(257 + NUMBER_BITS - 1)    ; context_index to second "number" set for lengths decoding
    call    decode_number       ; length = upkr_decode_length(257 + 64);
    push    de
    exx
    IFNDEF BACKWARDS_UNPACK
        ; forward unpack (write_ptr++, upkr_data_ptr++)
        ld      h,d             ; DE = write_ptr
        ld      l,e
 .offset+*:  ld  bc,0
        sbc     hl,bc           ; CF=0 from decode_number ; HL = write_ptr - offset
        pop     bc              ; BC = length
        ldir
    ELSE
        ; backward unpack (write_ptr--, upkr_data_ptr--)
 .offset+*:  ld  hl,0
        add     hl,de           ; HL = write_ptr + offset
        pop     bc              ; BC = length
        lddr
    ENDIF
    exx
    ld      d,b                 ; prev_was_match = true
    ; B is 1 + high(probs) here (from the `inc b` in .copy_chunk), so the DJNZ decrement restores
    ; B = high(probs); the jump is taken as long as high(probs) is non-zero
    djnz    .decompress_data    ; adjust context_index back to 0..255 range, go to main loop
 /*
 int upkr_decode_bit(int context_index) {
    while(upkr_state < 32768) {
        if(upkr_bits_left == 0) {
            upkr_current_byte = *upkr_data_ptr++;
            upkr_bits_left = 8;
        }
        upkr_state = (upkr_state << 1) + (upkr_current_byte >> 7);
        upkr_current_byte <<= 1;
        --upkr_bits_left;
    }
    int prob = upkr_probs[context_index];
    int bit = (upkr_state & 255) >= prob ? 1 : 0;
    int prob_offset = 16;
    int state_offset = 0;
    int state_scale = prob;
    if(bit) {
        state_offset = -prob;
        state_scale = 256 - prob;
        prob_offset = 0;
    }
    upkr_state = state_offset + state_scale * (upkr_state >> 8) + (upkr_state & 255);
    upkr_probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
    return bit;
 }
 */
 ; Decodes one bit with the adaptive probability stored at (BC) and updates that probability.
 ; Two entry points: inc_c_decode_bit increments C (low byte of the context pointer) first,
 ; decode_bit uses BC as given.
 ; OUT: CF = decoded bit, HL = new state, IX moved past any consumed source bytes,
 ;      A' = remaining bits of the current source byte (with stop bit)
 ; preserves BC (as seen after the optional `inc c`) and DE; modifies A and flags;
 ; uses three pushes (6 bytes) of stack on top of the return address
 inc_c_decode_bit:
  ; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number)
    inc     c
 decode_bit:
  ; HL = upkr_state
  ; IX = upkr_data_ptr
  ; BC = probs+context_index
  ; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
  ; preserves DE
  ; ** while (state < 32768) - initial check
    push    de
    bit     7,h
    jr      nz,.state_b15_set
    exa
  ; ** while body
 .state_b15_zero:
  ; HL = upkr_state
  ; IX = upkr_data_ptr
  ; A = upkr_current_byte (init to 0x80 at start, not 0x00)
    add     a,a                     ; upkr_current_byte <<= 1; // and testing if(upkr_bits_left == 0)
    jr      nz,.has_bit             ; CF=data, ZF=0 -> some bits + stop bit still available
  ; CF=1 (by stop bit)
    ld      a,(ix)
    IFNDEF BACKWARDS_UNPACK : inc ix : ELSE : dec ix : ENDIF    ; upkr_current_byte = *upkr_data_ptr++;
    adc     a,a                     ; CF=data, b0=1 as new stop bit
 .has_bit:
    adc     hl,hl                   ; upkr_state = (upkr_state << 1) + (upkr_current_byte >> 7);
    jp      p,.state_b15_zero       ; while (state < 32768)
    exa
  ; ** set "bit"
 .state_b15_set:
    ld      a,(bc)                  ; A = upkr_probs[context_index]
    dec     a                       ; prob is in ~7..249 range, never zero, safe to -1
    cp      l                       ; CF = bit = prob-1 < (upkr_state & 255) <=> prob <= (upkr_state & 255)
    inc     a
  ; ** adjust state
    push    bc
    ld      c,l                     ; C = (upkr_state & 255); (preserving the value)
    push    af
    jr      nc,.bit_is_0
    neg                             ; A = -prob == (256-prob), CF=1 preserved
 .bit_is_0:
    ld      d,0
    ld      e,a                     ; DE = state_scale ; prob || (256-prob)
    ld      l,d                     ; H:L = (upkr_state>>8) : 0
  IFNDEF UPKR_UNPACK_SPEED
    ;; looped MUL for minimum unpack size
    ld      b,8                     ; counter
 .mulLoop:
    add     hl,hl
    jr      nc,.mul0
    add     hl,de
 .mul0:
    djnz    .mulLoop                ; until HL = state_scale * (upkr_state>>8), also BC becomes (upkr_state & 255)
  ELSE
    ;;; unrolled MUL for better performance, +25 bytes unpack size
    ld      b,d
    DUP     8
        add     hl,hl
        jr      nc,0_f
        add     hl,de
 0:
    EDUP
  ENDIF
    add     hl,bc                   ; HL = state_scale * (upkr_state >> 8) + (upkr_state & 255)
    pop     af                      ; restore prob and CF=bit
    jr      nc,.bit_is_0_2
    dec     d                       ; DE = -prob (also D = bit ? $FF : $00)
    add     hl,de                   ; HL += -prob
    ; ^ this always preserves CF=1, because (state>>8) >= 128, state_scale: 7..250, prob: 7..250,
    ; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1
 .bit_is_0_2:
 ; *** adjust probs[context_index]
    rra                             ; + (bit<<4) ; part of -prob_offset, needs another -16
    and     $FC                     ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0
    rra
    rra
    rra                             ; A = (bit<<4) + (prob>>4), CF=(prob & 8)
    adc     a,-16                   ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16
    ld      e,a
    pop     bc
    ld      a,(bc)                  ; A = prob (cheaper + shorter to re-read again from memory)
    sub     e                       ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4)
    ld      (bc),a                  ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
    add     a,d                     ; restore CF = bit (D = bit ? $FF : $00 && A > 0)
    pop     de
    ret
 /*
 int upkr_decode_length(int context_index) {
    int length = 0;
    int bit_pos = 0;
    while(upkr_decode_bit(context_index)) {
        length |= upkr_decode_bit(context_index + 1) << bit_pos++;
        context_index += 2;
    }
    return length | (1 << bit_pos);
 }
 */
 ; Decodes one interleaved "continue bit / data bit" number (see upkr_decode_length above).
 ; DE works as a 16-bit shift register: every new bit enters at bit 15 and the value is
 ; right-aligned at the end with the help of a stop bit, so no separate bit_pos counter is needed.
 decode_number:
  ; HL = upkr_state
  ; IX = upkr_data_ptr
  ; BC = probs+context_index-1
  ; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
  ; return length in DE, CF=0
    ld      de,$FFFF            ; length = 0 with positional-stop-bit
    or      a                   ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE
 .loop:
  ; first pass: CF=0, so no bit is decoded and the rotate turns DE into $7FFF - that single
  ; zero is the stop bit, everything below it is filler ones
    call    c,inc_c_decode_bit  ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init
    rr      d
    rr      e                   ; DE = length = (length >> 1) | (bit << 15);
    call    inc_c_decode_bit    ; context_index += 2
    jr      c,.loop
 .fix_bit_pos:
  ; first pass here: CF=0 (continue bit was 0) is flipped to 1 and shifted in as the top bit of
  ; the number; on later passes the shifted-out filler ones are flipped to 0 and shifted in,
  ; until the zero stop bit itself drops out of bit 0 (CF=0) and the value is right-aligned
    ccf                         ; NC will become this final `| (1 << bit_pos)` bit
    rr      d
    rr      e
    jr      c,.fix_bit_pos      ; until stop bit is reached (all bits did land to correct position)
    ret                         ; return with CF=0 (important for unpack routine)
    DISPLAY "upkr.unpack total size: ",/D,$-unpack
    ; reserve space for probs array without emitting any machine code (using only EQU)
    IFDEF UPKR_PROBS_ORIGIN     ; if specific address is defined by user, move probs array there
 probs:      EQU ((UPKR_PROBS_ORIGIN) + 255) & -$100     ; probs array aligned to 256
    ELSE
 probs:      EQU ($ + 255) & -$100                       ; probs array aligned to 256
    ENDIF
 .real_c:    EQU 1 + 255 + 1 + 2*NUMBER_BITS             ; real size of probs array
 .c:         EQU (.real_c + 1) & -2                      ; padding to even size (required by init code)
 .e:         EQU probs + .c
    DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c
 /*
 archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to
 do `number|=(1<<bit_pos);` type of logic in single loop.
 */
 ; decode_number:
 ;     exx
 ;     ld      bc,1
 ;     ld      l,b
 ;     ld      h,b                 ; HL = 0
 ; .loop
 ;     exx
 ;     inc     c
 ;     call    decode_bit
 ;     jr      nc,.done
 ;     inc     c
 ;     call    decode_bit
 ;     exx
 ;     jr      nc,.b0
 ;     add     hl,bc
 ; .b0:
 ;     sla     c
 ;     rl      b
 ;     jr      .loop
 ; .done:
 ;     exx
 ;     add     hl,bc
 ;     push    hl
 ;     exx
 ;     pop     de
 ;     ret
 /*
 archived: possible LUT variant of updating probs value, requires 512-aligned 512B table (not tested)
 */
 ; code is replacing decode_bit from "; *** adjust probs[context_index]", followed by `ld (bc),a : add a,d ...`
 ;     ld      c,a
 ;     ld      a,high(probs_update_table)/2    ; must be 512 aligned
 ;     rla
 ;     ld      b,a
 ;     ld      a,(bc)
 ;     pop     bc
 ; -------------------------------------------
 ; probs_update_table: EQU probs-512
 ; -------------------------------------------
 ; table generator is not obvious and probably not short either, 20+ bytes almost for sure, maybe even 30-40
    ENDMODULE
    OPT pop
Author	SHA1	Message	Date
Dennis Ranke	a9e56d9d50	add missing fields to Cargo.toml	2025-07-16 21:16:44 +02:00
Dennis Ranke	1e4beedfee	prepare to publish crate / apply clippy fixes	2025-07-16 21:05:57 +02:00
Dennis Ranke	0c5ba4e32c	add highlevel description of compressed format in unpack.c	2024-05-25 22:02:47 +02:00
Piotr Fusik	f33bcb2396	mention 6502 unpacker	2024-05-15 10:56:25 +02:00
42Bastian	6d8e13d7e8	add Jaguar RISC versions	2024-04-10 23:53:46 +02:00
Dennis Ranke	88cbda559c	add 32bit windows build to releases	2024-03-04 13:07:40 +01:00
Dennis Ranke	f647449497	add --raw-cost and --hexdump flags to help text	2023-08-25 19:42:59 +02:00
Dennis Ranke	7186188118	add --raw-cost option for heatmap reporting	2023-08-25 19:34:17 +02:00
Dennis Ranke	588c1452b9	strip release executables	2023-08-25 00:03:51 +02:00
Dennis Ranke	1e0c8bb5a7	update readme, change --unpack to --decompress, put progress bar on stderr	2023-08-24 22:59:27 +02:00
Dennis Ranke	e11622202b	add support to read/write from/to stdin/stdout	2023-08-24 00:00:20 +02:00
Dennis Ranke	795e6c3090	add basic example for compiling upkr to a c library	2023-03-12 13:45:23 +01:00
Dennis Ranke	080db40d00	deny missing docs	2023-01-27 20:26:36 +01:00
5684185+vsariola@users.noreply.github.com	f502bf4e28	optimize dos stub size to save 2 bytes (discovered by qkumba)	2023-01-26 22:04:59 +01:00
Dennis Ranke	2eb8f230ba	add documentation, make pbr optional as well	2022-10-26 23:40:41 +02:00
Dennis Ranke	4eab36b9d9	add some api documentation	2022-10-25 23:33:32 +02:00
Dennis Ranke	7cec54f62b	make crossterm dependency optional	2022-10-25 22:40:56 +02:00
Dennis Ranke	7fa6be6ff4	implement printing heatmap as hexdump	2022-10-24 23:34:07 +02:00
Dennis Ranke	cab51e06ff	implement heatmap calculation	2022-10-23 23:06:09 +02:00
Dennis Ranke	c4fce626da	some clean up - move dos unpacker, fix arm32 unpacker formatting	2022-10-19 22:32:57 +02:00
exoticorn	0d7cda06bb	Merge pull request #6 from vsariola/dev/x86 implement three versions of a decompression stub for 16-bit x86 DOS	2022-10-19 22:11:04 +02:00
5684185+vsariola@users.noreply.github.com	140678ae20	implement three versions of a decompression stub for 16-bit x86 DOS	2022-10-17 15:50:09 +03:00
Dennis Ranke	d7bdc8c1c7	add --version flag to output upkr version	2022-10-17 01:03:01 +02:00
Dennis Ranke	887722a66b	prepare for 0.2.0 release	2022-10-17 00:45:52 +02:00
Dennis Ranke	39c95598f2	more arm32 optimizations, now 228b	2022-10-05 13:54:04 +02:00
Dennis Ranke	3e31b37c1c	first version of arm32 unpacker: 240 bytes	2022-10-04 22:44:28 +02:00
Dennis Ranke	83c023de45	more rv optimizations, rv32imc now 204b	2022-10-03 15:38:43 +02:00
Dennis Ranke	a46eb0e7f5	some more optimizations to the rv unpacker	2022-10-03 08:47:12 +02:00
Dennis Ranke	32cd8e5b6c	add --x86b preset	2022-09-30 04:23:53 +02:00
Dennis Ranke	90fa31ce1a	strip debuginfo in release build	2022-09-28 09:19:55 +02:00
Dennis Ranke	31fb91c629	Merge branch 'ped7g-z80_ped7g'	2022-09-27 22:34:19 +02:00
Dennis Ranke	e429f252a5	Merge branch 'z80'	2022-09-27 22:28:12 +02:00
Dennis Ranke	f6642f07c9	more config options, unpack error handling, fuzzing	2022-09-27 17:16:05 +02:00
Dennis Ranke	8715dede0e	add --eof-in-length option	2022-09-26 23:41:17 +02:00
Dennis Ranke	b12c8f8d93	add parameter to print out margin for overlapped unpacking	2022-09-25 23:44:03 +02:00
Dennis Ranke	af5fe898bf	add --no-repeated-offsets to help	2022-09-25 16:24:24 +02:00
Dennis Ranke	331857a711	add option to disable repeated offsets	2022-09-25 16:23:11 +02:00
Dennis Ranke	12e6f95fe8	add remaining encoding config options + presets for x86 and z80	2022-09-24 22:00:50 +02:00
Dennis Ranke	23872b3222	implement encoding options	2022-09-24 20:52:39 +02:00
Dennis Ranke	ced6cc8c32	some more risc-v optimizations	2022-09-24 08:45:14 +02:00
Dennis Ranke	8c9e4311b9	first (poorly optimized) risc-v unpacker	2022-09-23 22:40:47 +02:00
Dennis Ranke	31c31bdcfb	clean up command line interface	2022-09-21 22:45:06 +02:00
Dennis Ranke	8f33ae0b1e	add reverse compression option	2022-09-21 21:37:30 +02:00
Dennis Ranke	f5fc9bd005	implement optional parity contexts	2022-09-20 23:24:19 +02:00
Dennis Ranke	cc41feb5cd	alternative way to write state/prob update	2022-09-19 18:33:02 +02:00
Peter Helcmanovsky (Ped)	8a32e1384c	z80_unpacker: readme.txt and comment update	2022-09-19 15:19:39 +02:00
Peter Helcmanovsky (Ped)	9913dcf4bb	z80_unpacker: comment with possible LUT variant of updating probs value missing 512 byte table generator, which doesn't look trivial to do (especially in terms of code size). Not tested, but looks as decent speed up.	2022-09-19 14:31:00 +02:00
Peter Helcmanovsky (Ped)	a8fd3dc573	z80_unpacker: optimisation: -1B in decode_number (fwd 170B / rev 167B) slightly slower code, ROM unpack is back to ~22.6s	2022-09-19 13:20:44 +02:00
Peter Helcmanovsky (Ped)	e1f9fa143a	z80_unpacker: comment with caller size optimisation tip	2022-09-19 11:58:32 +02:00
Peter Helcmanovsky (Ped)	db1c7d2d14	z80_unpacker: optimisation: -1B in decode_number (fwd 171B / rev 168B)	2022-09-19 11:49:53 +02:00
Peter Helcmanovsky (Ped)	c1ffd0e7ed	z80_unpacker: attempt for faster `decode_number` (+6B, ~1% faster) => not good archived in comments for future reference	2022-09-19 11:42:56 +02:00
Peter Helcmanovsky (Ped)	00d084105a	z80_unpacker: optimisation: -2B in backward unpack (fwd 172B / rev 169B) backward was already -1B, so now the total difference is -3B.	2022-09-19 01:31:22 +02:00
Peter Helcmanovsky (Ped)	8e5298caee	z80_unpacker: optimisation: -1B in decode_number = 172B (but +4T per length)	2022-09-19 01:09:21 +02:00
Peter Helcmanovsky (Ped)	1fb29f3a1b	z80_unpacker: optimisation: -1B and -1T in decode_bit = 173B	2022-09-18 23:44:18 +02:00
Dennis Ranke	c8924456aa	-r reverses both input and output	2022-09-18 23:38:41 +02:00
exoticorn	7b0e22f459	Merge pull request #3 from ped7g/z80_ped7g backward unpacker + example extended	2022-09-18 23:24:28 +02:00
Dennis Ranke	5c7aee046a	optimize decode_bit some more -> 166b	2022-09-18 23:11:26 +02:00
Peter Helcmanovsky (Ped)	165f593a11	z80_unpacker: (codestyle) whitespace + temporary label rename	2022-09-18 23:04:37 +02:00
Peter Helcmanovsky (Ped)	d4bce4bf7c	z80_unpacker: optimisation: -3B and ~-10T in decode_bit = 174B unpack zx48.rom is now ~22.6s (from 23.0s) (performance version is now 199 bytes, zx48.rom unpack 19.4s -> 19.0s)	2022-09-18 22:54:10 +02:00
Dennis Ranke	612084a5bf	decode_length returns negative value -> 172b	2022-09-18 22:36:31 +02:00
exoticorn	ad731c2e75	Merge pull request #4 from Ferdi265/master unpack_armv6m: update comment headers and remove unneeded pushed register	2022-09-18 18:51:37 +02:00
Ferdinand Bachmann	52f9778c0f	unpack_armv6m: update comment headers and remove unneeded pushed register	2022-09-18 18:49:20 +02:00
Dennis Ranke	49a611e8ba	some more optimizations -> 176 bytes	2022-09-18 17:17:37 +02:00
Dennis Ranke	2f820316e3	change prob_index update to save two instructions -> 184b	2022-09-18 16:27:21 +02:00
Dennis Ranke	5bc3f88564	invert was_match -> 188 bytes	2022-09-18 15:58:31 +02:00
Dennis Ranke	434769b591	simple dev setup for asm unpackers	2022-09-18 15:40:23 +02:00
Peter Helcmanovsky (Ped)	b13fa05413	z80_unpacker: add backward variant of unpacker + example extended	2022-09-18 00:23:14 +02:00
Peter Helcmanovsky (Ped)	3c773aca8d	z80_unpacker: add performance variant of depacker	2022-09-16 03:38:03 +02:00
exoticorn	a5406deb30	Merge pull request #2 from ped7g/z80_ped7g Z80 ped7g - few more optimisations for current variant of packer	2022-09-16 00:26:55 +02:00
Peter Helcmanovsky (Ped)	9211544cb9	z80_unpacker: add resulting snapshot file to example	2022-09-15 18:37:06 +02:00
Peter Helcmanovsky (Ped)	3fa9e0fa12	z80_unpacker: optimisations: 0B, -13T in decode_bit (stays 177B)	2022-09-15 18:22:33 +02:00
Peter Helcmanovsky (Ped)	aa3fad4d80	z80_unpacker: optimisations: -3B and ~-24T in decode_bit = 177B	2022-09-15 18:22:32 +02:00
Peter Helcmanovsky (Ped)	6624940ed9	z80_unpacker: optimisations: -2B and -27T in decode_bit = 180B	2022-09-15 18:22:32 +02:00
Peter Helcmanovsky (Ped)	c3a9773e5c	z80_unpacker: optimisations: -1B in unpack implementation = 182B	2022-09-15 18:22:31 +02:00
Peter Helcmanovsky (Ped)	a75a35efb2	z80_unpacker: probs context-size for offset/length numbers as EQU	2022-09-15 18:22:27 +02:00
Dennis Ranke	540a91d1ba	forgot to add back -l 9	2022-09-15 00:18:30 +02:00
Dennis Ranke	e7aaf1491a	add old-prob-update to compare script, add reverse option	2022-09-14 23:51:38 +02:00
Dennis Ranke	a1dabaf7f9	add simple script to compare compression of variants	2022-09-14 23:41:14 +02:00
Dennis Ranke	75e375fb1f	Merge branch 'ped7g-z80_ped7g' into z80	2022-09-14 09:03:28 +02:00
Peter Helcmanovsky (Ped)	c7ea11bce3	z80_unpacker: optimisations: -2B in unpack implementation = 183B	2022-09-14 01:44:04 +02:00
Peter Helcmanovsky (Ped)	02d20867ee	z80_unpacker: optimisations: -2B in unpack implementation = 185B	2022-09-14 01:01:56 +02:00
Peter Helcmanovsky (Ped)	511ddefc08	z80_unpacker: optimisations: -4T per offset/length bit decoded making the 256-alignment of probs array even more baked-in, but there was no real chance to get rid of that any way	2022-09-14 00:01:51 +02:00
Peter Helcmanovsky (Ped)	d30baaa91f	z80_unpacker: optimisations: -1B by keeping write_ptr in DE'	2022-09-13 23:57:59 +02:00
Peter Helcmanovsky (Ped)	919a892ef0	z80_unpacker: optimisations: -1B by decode_length returning CF=0	2022-09-13 23:25:03 +02:00
Peter Helcmanovsky (Ped)	ea5c0b1b15	z80_unpacker: optimisations: shorter `>>4` in probs update	2022-09-13 23:15:18 +02:00
Peter Helcmanovsky (Ped)	a19ec2abb7	z80_unpacker: optimisations: remove .offset init first offset is mandatory in packed data	2022-09-13 22:53:15 +02:00
Peter Helcmanovsky (Ped)	7b051113e1	z80_unpacker: initial working version with screen-slideshow example	2022-09-13 22:12:03 +02:00
Dennis Ranke	f1f1c64a76	implement simplified prob update, update unpack.c	2022-09-10 12:01:42 +02:00
Dennis Ranke	36cb6d77b5	BE bitstream, flip bit encoding	2022-09-10 11:31:09 +02:00
Dennis Ranke	629c5fce7d	optimize c_unpacker state update a bit, add -b flag to --help	2022-09-09 19:10:31 +02:00