Merge pull request #5 from ped7g/z80_ped7g

Z80 ped7g - further optimisations
z80_unpacker: readme.txt and comment update
2026-01-20 19:46:42 +01:00 · 2022-09-27 22:35:44 +02:00 · 2022-09-19 15:19:39 +02:00 · 2022-09-19 14:31:00 +02:00 · 2022-09-19 13:20:44 +02:00 · 2022-09-19 11:58:32 +02:00
9 changed files with 210 additions and 59 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -23,7 +23,7 @@ fn main() -> Result<()> {

            let mut pb = pbr::ProgressBar::new(data.len() as u64);
            pb.set_units(pbr::Units::Bytes);
-            let packed_data = upkr::pack(
+            let mut packed_data = upkr::pack(
                &data,
                level,
                use_bitstream,
@@ -33,6 +33,10 @@ fn main() -> Result<()> {
            );
            pb.finish();

+            if reverse {
+                packed_data.reverse();
+            }
+
            println!(
                "Compressed {} bytes to {} bytes ({}%)",
                data.len(),
@@ -50,6 +54,9 @@ fn main() -> Result<()> {

            let mut data = vec![];
            File::open(infile)?.read_to_end(&mut data)?;
+            if reverse {
+                data.reverse();
+            }
            let mut unpacked_data = upkr::unpack(&data, use_bitstream);
            if reverse {
                unpacked_data.reverse();
--- a/z80_unpacker/example/example.asm
+++ b/z80_unpacker/example/example.asm
@@ -3,7 +3,8 @@
    DEVICE ZXSPECTRUM48,$8FFF

    ORG     $9000
-compressed_scr_files:       ; border color byte + upkr-packed .scr file
+  ;; forward example data
+compressed_scr_files.fwd:               ; border color byte + upkr-packed .scr file
    DB      1
    INCBIN  "screens/Grongy - ZX Spectrum (2022).scr.upk"
    DB      7
@@ -13,37 +14,87 @@ compressed_scr_files:       ; border color byte + upkr-packed .scr file
    DB      6
    INCBIN  "screens/diver - Back to Bjork (2015).scr.upk"
 .e:
+  ;; backward example data (unpacker goes from the end of the data!)
+compressed_scr_files.rwd.e: EQU $-1     ; the final IX will point one byte ahead of "$" here
+    INCBIN  "screens.reversed/diver - Back to Bjork (2015).scr.upk"
+    DB      6
+    INCBIN  "screens.reversed/diver - Mercenary 4. The Heaven's Devil (2014) (Forever 2014 Olympic Edition, 1).scr.upk"
+    DB      0
+    INCBIN  "screens.reversed/Schafft - Poison (2017).scr.upk"
+    DB      7
+    INCBIN  "screens.reversed/Grongy - ZX Spectrum (2022).scr.upk"
+compressed_scr_files.rwd:               ; border color byte + upkr-packed .scr file (backward)
+    DB      1

 start:
    di
 ;     OPT --zxnext
-;     nextreg 7,3                 ; ZX Next: switch to 28Mhz
-    ld      ix,compressed_scr_files
-.slideshow_loop
+;     nextreg 7,3                       ; ZX Next: switch to 28Mhz
+
+  ;;; FORWARD packed/unpacked data demo
+    ld      ix,compressed_scr_files.fwd
+.slideshow_loop.fwd:
  ; set BORDER for next image
-    ldi     a,(ix)              ; fake: ld a,(ix) : inc ix
+    ld      a,(ix)
+    inc     ix
    out     (254),a
  ; call unpack of next image directly into VRAM
-    ld      de,$4000            ; target VRAM
+    ld      de,$4000                    ; target VRAM
    exx
  ; IX = packed data, DE' = destination ($4000)
  ; returned IX will point right after the packed data
-    call    upkr.unpack
+    call    fwd.upkr.unpack
  ; do some busy loop with CPU to delay between images
+    call    delay
+  ; check if all images were displayed, loop around from first one then
+    ld      a,ixl
+    cp      low compressed_scr_files.fwd.e
+    jr      nz,.slideshow_loop.fwd
+
+  ;;; BACKWARD packed/unpacked data demo
+    ld      ix,compressed_scr_files.rwd
+.slideshow_loop.rwd:
+  ; set BORDER for next image
+    ld      a,(ix)
+    dec     ix
+    out     (254),a
+  ; call unpack of next image directly into VRAM
+    ld      de,$5AFF                    ; target VRAM
+    exx
+  ; IX = packed data, DE' = destination
+  ; returned IX will point right ahead of the packed data
+    call    rwd.upkr.unpack
+  ; do some busy loop with CPU to delay between images
+    call    delay
+  ; check if all images were displayed, loop around from first one then
+    ld      a,ixl
+    cp      low compressed_scr_files.rwd.e
+    jr      nz,.slideshow_loop.rwd
+
+    jr      start
+
+delay:
    ld      bc,$AA00
 .delay:
    .8 ex      (sp),ix
    dec     c
    jr      nz,.delay
    djnz    .delay
-  ; check if all images were displayed, loop around from first one then
-    ld      a,ixl
-    cp      low compressed_scr_files.e
-    jr      z,start
-    jr      .slideshow_loop
+    ret

  ; include the depacker library, optionally putting probs array buffer near end of RAM
    DEFINE  UPKR_PROBS_ORIGIN $FA00   ; if not defined, array will be put after unpack code
-    INCLUDE "../unpack.asm"
+
+    MODULE fwd
+        INCLUDE "../unpack.asm"
+    ENDMODULE
+
+    MODULE rwd
+        DEFINE BACKWARDS_UNPACK         ; defined to build backwards unpack
+                ; initial IX points at last byte of compressed data
+                ; initial DE' points at last byte of unpacked data
+
+        INCLUDE "../unpack.asm"
+    ENDMODULE

    SAVESNA "example.sna",start
--- a/z80_unpacker/example/example.sna
+++ b/z80_unpacker/example/example.sna
--- a/z80_unpacker/example/screens.reversed/Grongy
+++ b/z80_unpacker/example/screens.reversed/Grongy
--- a/z80_unpacker/example/screens.reversed/Schafft
+++ b/z80_unpacker/example/screens.reversed/Schafft
--- a/z80_unpacker/example/screens.reversed/diver
+++ b/z80_unpacker/example/screens.reversed/diver
--- a/z80_unpacker/example/screens.reversed/diver
+++ b/z80_unpacker/example/screens.reversed/diver
--- a/z80_unpacker/readme.txt
+++ b/z80_unpacker/readme.txt
@@ -10,10 +10,23 @@ may be incompatible with files you will produce with current version)

 Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus

+A backward-direction unpacker was added as a compile-time option; see the example for both the
+forward and backward depacker in action.
+
+The packed/unpacked data overlap has to be tested case by case; in the worst case the packed data
+may need even more than 7 bytes to unpack the final byte, but usually 1-4 bytes suffice.
+
 TODO:
- build base corpus of test data to benchmark future changes in algorithm/format
- review first implementation to identify weak spots where the implementation can be shorter+faster
-with acceptable small changes to the format
- review non-bitstream variant, if it's feasible to try to implement it with Z80
+- build bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom was used to do initial tests)
+- maybe try to beat double-loop `decode_number` with different encoding format
 - (@ped7g) Z80N version of unpacker for ZX Next devs
 - (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? (and overall decide how to organise+merge this upstream into main repo)
+- (@exoticorn) make the packer output the possible packed/unpacked region overlap
+
+DONE:
+* review non-bitstream variant, if it's feasible to try to implement it with Z80
+    - Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update
+* review first implementation to identify weak spots where the implementation can be shorter+faster
+with acceptable small changes to the format
+    - Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change
+    - Ped7g: the decode_number double-loop is surprisingly resilient; I failed to beat it, especially in terms of code size, and speed-wise there were only negligible gains
--- a/z80_unpacker/unpack.asm
+++ b/z80_unpacker/unpack.asm
@@ -4,7 +4,7 @@
 ;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed")
 ;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus
 ;;
-;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes),
+;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes),
 ;; otherwise it will be positioned after the unpacker code (256 aligned)
 ;;
 ;; public API:
@@ -15,6 +15,15 @@
 ;;         modifies: all registers except IY, requires 10 bytes of stack space
 ;;

+;     DEFINE BACKWARDS_UNPACK         ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--)
+            ; initial IX points at last byte of compressed data
+            ; initial DE' points at last byte of unpacked data
+
+;     DEFINE UPKR_UNPACK_SPEED        ; uncomment to get larger but faster unpack routine
+
+; code size hint: if you put probs array just ahead of BASIC entry point, you will get BC
+; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B)
+
    OPT push reset --syntax=abf
    MODULE upkr

@@ -100,7 +109,7 @@ unpack:
    ld      a,c
    exx
    ld      (de),a              ; *write_ptr++ = byte;
-    inc     de
+    IFNDEF BACKWARDS_UNPACK : inc de : ELSE : dec de : ENDIF
    exx
    ld      d,b                 ; prev_was_match = false
    jr      .decompress_data
@@ -116,8 +125,7 @@ unpack:
    cp      d                   ; CF = prev_was_match
    call    nc,decode_bit       ; if not prev_was_match, then upkr_decode_bit(256)
    jr      nc,.keep_offset     ; if neither, keep old offset
-    inc     c                   ; context_index to first "number" set for offsets decoding (257)
-    call    decode_number
+    call    decode_number       ; context_index is already 257-1 as needed by decode_number
    dec     de                  ; offset = upkr_decode_length(257) - 1;
    ld      a,d
    or      e
@@ -130,16 +138,25 @@ unpack:
        ;                 ++write_ptr;
        ;             }
        ;             prev_was_match = 1;
-    ld      c,low(257 + NUMBER_BITS)    ; context_index to second "number" set for lengths decoding
+    ld      c,low(257 + NUMBER_BITS - 1)    ; context_index to second "number" set for lengths decoding
    call    decode_number       ; length = upkr_decode_length(257 + 64);
    push    de
    exx
-    ld      h,d                 ; DE = write_ptr
-    ld      l,e
-.offset+*:  ld      bc,0
-    sbc     hl,bc               ; CF=0 from decode_number ; HL = write_ptr - offset
-    pop     bc                  ; BC = length
-    ldir
+    IFNDEF BACKWARDS_UNPACK
+        ; forward unpack (write_ptr++, upkr_data_ptr++)
+        ld      h,d             ; DE = write_ptr
+        ld      l,e
+.offset+*:  ld  bc,0
+        sbc     hl,bc           ; CF=0 from decode_number ; HL = write_ptr - offset
+        pop     bc              ; BC = length
+        ldir
+    ELSE
+        ; backward unpack (write_ptr--, upkr_data_ptr--)
+.offset+*:  ld  hl,0
+        add     hl,de           ; HL = write_ptr + offset
+        pop     bc              ; BC = length
+        lddr
+    ENDIF
    exx
    ld      d,b                 ; prev_was_match = true
    djnz    .decompress_data    ; adjust context_index back to 0..255 range, go to main loop
@@ -173,6 +190,9 @@ int upkr_decode_bit(int context_index) {
    return bit;
 }
 */
+inc_c_decode_bit:
+  ; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number)
+    inc     c
 decode_bit:
  ; HL = upkr_state
  ; IX = upkr_data_ptr
@@ -193,7 +213,7 @@ decode_bit:
    jr      nz,.has_bit             ; CF=data, ZF=0 -> some bits + stop bit still available
  ; CF=1 (by stop bit)
    ld      a,(ix)
-    inc     ix                      ; upkr_current_byte = *upkr_data_ptr++;
+    IFNDEF BACKWARDS_UNPACK : inc ix : ELSE : dec ix : ENDIF    ; upkr_current_byte = *upkr_data_ptr++;
    adc     a,a                     ; CF=data, b0=1 as new stop bit
 .has_bit:
    adc     hl,hl                   ; upkr_state = (upkr_state << 1) + (upkr_current_byte >> 7);
@@ -215,6 +235,10 @@ decode_bit:
    ld      d,0
    ld      e,a                     ; DE = state_scale ; prob || (256-prob)
    ld      l,d                     ; H:L = (upkr_state>>8) : 0
+
+  IFNDEF UPKR_UNPACK_SPEED
+
+    ;; looped MUL for minimum unpack size
    ld      b,8                     ; counter
 .mulLoop:
    add     hl,hl
@@ -222,28 +246,41 @@ decode_bit:
    add     hl,de
 .mul0:
    djnz    .mulLoop                ; until HL = state_scale * (upkr_state>>8), also BC becomes (upkr_state & 255)
+
+  ELSE
+
+    ;;; unrolled MUL for better performance, +25 bytes unpack size
+    ld      b,d
+    DUP     8
+        add     hl,hl
+        jr      nc,0_f
+        add     hl,de
+0:
+    EDUP
+
+  ENDIF
+
    add     hl,bc                   ; HL = state_scale * (upkr_state >> 8) + (upkr_state & 255)
-    pop     af
-    ld      d,-16                   ; D = -prob_offset (-16 0xF0 when bit = 0)
+    pop     af                      ; restore prob and CF=bit
    jr      nc,.bit_is_0_2
-    ld      d,b                     ; D = -prob_offset (0 when bit = 1) (also does fix following ADD)
-    dec     h
-    add     hl,de                   ; HL += -prob (HL += (256 - prob) - 256)
-.bit_is_0_2:                        ; HL = state_offset + state_scale * (upkr_state >> 8) + (upkr_state & 255) ; new upkr_state
+    dec     d                       ; DE = -prob (also D = bit ? $FF : $00)
+    add     hl,de                   ; HL += -prob
+    ; ^ this always preserves CF=1, because (state>>8) >= 128, state_scale: 7..250, prob: 7..250,
+    ; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1
+.bit_is_0_2:
 ; *** adjust probs[context_index]
-    ld      e,a                     ; D:E = -prob_offset:prob, A = prob
-    and     $F8
+    rra                             ; + (bit<<4) ; part of -prob_offset, needs another -16
+    and     $FC                     ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0
    rra
    rra
-    rra
-    rra
-    adc     a,d                     ; A = -prob_offset + ((prob + 8) >> 4)
-    neg
-    add     a,e                     ; A = prob_offset + prob - ((prob + 8) >> 4)
+    rra                             ; A = (bit<<4) + (prob>>4), CF=(prob & 8)
+    adc     a,-16                   ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16
+    ld      e,a
    pop     bc
-    ld      (bc),a                  ; update probs[context_index]
-    add     a,d                     ; bit=0: A = 23..249, D = 240 -> CF=1 || bit=1: D=0 -> CF=0
-    ccf                             ; resulting CF = bit restored
+    ld      a,(bc)                  ; A = prob (cheaper + shorter to re-read again from memory)
+    sub     e                       ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4)
+    ld      (bc),a                  ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
+    add     a,d                     ; restore CF = bit (D = bit ? $FF : $00 && A > 0)
    pop     de
    ret

@@ -261,19 +298,16 @@ int upkr_decode_length(int context_index) {
 decode_number:
  ; HL = upkr_state
  ; IX = upkr_data_ptr
-  ; BC = probs+context_index
+  ; BC = probs+context_index-1
  ; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
  ; return length in DE, CF=0
-    ld      de,$7FFF            ; length = 0 with positional-stop-bit
-    jr      .loop_entry
+    ld      de,$FFFF            ; length = 0 with positional-stop-bit
+    or      a                   ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE
 .loop:
-    inc     c                   ; context_index + 1
-    call    decode_bit
+    call    c,inc_c_decode_bit  ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init
    rr      d
    rr      e                   ; DE = length = (length >> 1) | (bit << 15);
-    inc     c                   ; context_index += 2
-.loop_entry:
-    call    decode_bit
+    call    inc_c_decode_bit    ; context_index += 2
    jr      c,.loop
 .fix_bit_pos:
    ccf                         ; NC will become this final `| (1 << bit_pos)` bit
@@ -287,15 +321,61 @@ decode_number:
    ; reserve space for probs array without emitting any machine code (using only EQU)

    IFDEF UPKR_PROBS_ORIGIN     ; if specific address is defined by user, move probs array there
-    ORG UPKR_PROBS_ORIGIN
+probs:      EQU ((UPKR_PROBS_ORIGIN) + 255) & -$100     ; probs array aligned to 256
+    ELSE
+probs:      EQU ($ + 255) & -$100                       ; probs array aligned to 256
    ENDIF
-
-probs:      EQU ($+255) & -$100                 ; probs array aligned to 256
-.real_c:    EQU 1 + 255 + 1 + 2*NUMBER_BITS     ; real size of probs array
-.c:         EQU (.real_c + 1) & -2              ; padding to even size (required by init code)
+.real_c:    EQU 1 + 255 + 1 + 2*NUMBER_BITS             ; real size of probs array
+.c:         EQU (.real_c + 1) & -2                      ; padding to even size (required by init code)
 .e:         EQU probs + .c

    DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c

+/*
+ archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to
+ do `number|=(1<<bit_pos);` type of logic in single loop.
+*/
+; decode_number:
+;     exx
+;     ld      bc,1
+;     ld      l,b
+;     ld      h,b                 ; HL = 0
+; .loop
+;     exx
+;     inc     c
+;     call    decode_bit
+;     jr      nc,.done
+;     inc     c
+;     call    decode_bit
+;     exx
+;     jr      nc,.b0
+;     add     hl,bc
+; .b0:
+;     sla     c
+;     rl      b
+;     jr      .loop
+; .done:
+;     exx
+;     add     hl,bc
+;     push    hl
+;     exx
+;     pop     de
+;     ret
+
+/*
+ archived: possible LUT variant of updating probs value, requires 512-aligned 512B table (not tested)
+*/
+; code is replacing decode_bit from "; *** adjust probs[context_index]", followed by `ld (bc),a : add a,d ...`
+;     ld      c,a
+;     ld      a,high(probs_update_table)/2    ; must be 512 aligned
+;     rla
+;     ld      b,a
+;     ld      a,(bc)
+;     pop     bc
+; -------------------------------------------
+; probs_update_table: EQU probs-512
+; -------------------------------------------
+; table generator is not obvious and probably not short either, 20+ bytes almost for sure, maybe even 30-40
+
    ENDMODULE
    OPT pop
Author	SHA1	Message	Date
exoticorn	48727040b3	Merge pull request #5 from ped7g/z80_ped7g Z80 ped7g - further optimisations	2022-09-27 22:35:44 +02:00
Peter Helcmanovsky (Ped)	8a32e1384c	z80_unpacker: readme.txt and comment update	2022-09-19 15:19:39 +02:00
Peter Helcmanovsky (Ped)	9913dcf4bb	z80_unpacker: comment with possible LUT variant of updating probs value missing 512 byte table generator, which doesn't look trivial to do (especially in terms of code size). Not tested, but looks as decent speed up.	2022-09-19 14:31:00 +02:00
Peter Helcmanovsky (Ped)	a8fd3dc573	z80_unpacker: optimisation: -1B in decode_number (fwd 170B / rev 167B) slightly slower code, ROM unpack is back to ~22.6s	2022-09-19 13:20:44 +02:00
Peter Helcmanovsky (Ped)	e1f9fa143a	z80_unpacker: comment with caller size optimisation tip	2022-09-19 11:58:32 +02:00
Peter Helcmanovsky (Ped)	db1c7d2d14	z80_unpacker: optimisation: -1B in decode_number (fwd 171B / rev 168B)	2022-09-19 11:49:53 +02:00
Peter Helcmanovsky (Ped)	c1ffd0e7ed	z80_unpacker: attempt for faster `decode_number` (+6B, ~1% faster) => not good archived in comments for future reference	2022-09-19 11:42:56 +02:00
Peter Helcmanovsky (Ped)	00d084105a	z80_unpacker: optimisation: -2B in backward unpack (fwd 172B / rev 169B) backward was already -1B, so now the total difference is -3B.	2022-09-19 01:31:22 +02:00
Peter Helcmanovsky (Ped)	8e5298caee	z80_unpacker: optimisation: -1B in decode_number = 172B (but +4T per length)	2022-09-19 01:09:21 +02:00
Peter Helcmanovsky (Ped)	1fb29f3a1b	z80_unpacker: optimisation: -1B and -1T in decode_bit = 173B	2022-09-18 23:44:18 +02:00
Dennis Ranke	c8924456aa	-r reverses both input and output	2022-09-18 23:38:41 +02:00
exoticorn	7b0e22f459	Merge pull request #3 from ped7g/z80_ped7g backward unpacker + example extended	2022-09-18 23:24:28 +02:00
Peter Helcmanovsky (Ped)	165f593a11	z80_unpacker: (codestyle) whitespace + temporary label rename	2022-09-18 23:04:37 +02:00
Peter Helcmanovsky (Ped)	d4bce4bf7c	z80_unpacker: optimisation: -3B and ~-10T in decode_bit = 174B unpack zx48.rom is now ~22.6s (from 23.0s) (performance version is now 199 bytes, zx48.rom unpack 19.4s -> 19.0s)	2022-09-18 22:54:10 +02:00
Peter Helcmanovsky (Ped)	b13fa05413	z80_unpacker: add backward variant of unpacker + example extended	2022-09-18 00:23:14 +02:00
Peter Helcmanovsky (Ped)	3c773aca8d	z80_unpacker: add performance variant of depacker	2022-09-16 03:38:03 +02:00