diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index 515f5d6..5a56975 100644 Binary files a/z80_unpacker/example/example.sna and b/z80_unpacker/example/example.sna differ diff --git a/z80_unpacker/readme.txt b/z80_unpacker/readme.txt index b7fff1b..4d93ac9 100644 --- a/z80_unpacker/readme.txt +++ b/z80_unpacker/readme.txt @@ -10,10 +10,23 @@ may be incompatible with files you will produce with current version) Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus +Backward direction unpacker added as a compile-time option; see the example for both forward/backward +depacker in action. + +The packed/unpacked data overlap has to be tested per case; in the worst case the packed data +may need even more than 7 bytes to unpack the final byte, but usually 1-4 bytes suffice. + TODO: -- build base corpus of test data to benchmark future changes in algorithm/format -- review first implementation to identify weak spots where the implementation can be shorter+faster -with acceptable small changes to the format -- review non-bitstream variant, if it's feasible to try to implement it with Z80 +- build a bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom were used for the initial tests) +- maybe try to beat double-loop `decode_number` with a different encoding format - (@ped7g) Z80N version of unpacker for ZX Next devs - (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? 
(and overall decide how to organise+merge this upstream into main repo) +- (@exoticorn) make the packer output the possible packed/unpacked region overlap + +DONE: +* review non-bitstream variant, if it's feasible to try to implement it with Z80 + - Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update +* review first implementation to identify weak spots where the implementation can be shorter+faster +with acceptable small changes to the format + - Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change + - Ped7g: the decode_number double-loop is surprisingly resilient, especially in terms of code size, where I failed to beat it; speed-wise only negligible gains diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index 26e47b7..637b048 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -4,7 +4,7 @@ ;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed") ;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus ;; -;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes), +;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes), ;; otherwise it will be positioned after the unpacker code (256 aligned) ;; ;; public API: @@ -15,12 +15,15 @@ ;; modifies: all registers except IY, requires 10 bytes of stack space ;; -; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker +; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--) ; initial IX points at last byte of compressed data ; initial DE' points at last byte of unpacked data ; DEFINE UPKR_UNPACK_SPEED ; uncomment to get larger but faster unpack routine +; code size hint: if you put the probs array just ahead of the BASIC entry point, you will 
get BC +; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B) + OPT push reset --syntax=abf MODULE upkr @@ -122,8 +125,7 @@ unpack: cp d ; CF = prev_was_match call nc,decode_bit ; if not prev_was_match, then upkr_decode_bit(256) jr nc,.keep_offset ; if neither, keep old offset - inc c ; context_index to first "number" set for offsets decoding (257) - call decode_number + call decode_number ; context_index is already 257-1 as needed by decode_number dec de ; offset = upkr_decode_length(257) - 1; ld a,d or e @@ -136,20 +138,25 @@ unpack: ; ++write_ptr; ; } ; prev_was_match = 1; - ld c,low(257 + NUMBER_BITS) ; context_index to second "number" set for lengths decoding + ld c,low(257 + NUMBER_BITS - 1) ; context_index to second "number" set for lengths decoding call decode_number ; length = upkr_decode_length(257 + 64); push de exx - ld h,d ; DE = write_ptr - ld l,e -.offset+*: ld bc,0 IFNDEF BACKWARDS_UNPACK - sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset + ; forward unpack (write_ptr++, upkr_data_ptr++) + ld h,d ; DE = write_ptr + ld l,e +.offset+*: ld bc,0 + sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset + pop bc ; BC = length + ldir ELSE - add hl,bc ; HL = write_ptr + offset + ; backward unpack (write_ptr--, upkr_data_ptr--) +.offset+*: ld hl,0 + add hl,de ; HL = write_ptr + offset + pop bc ; BC = length + lddr ENDIF - pop bc ; BC = length - IFNDEF BACKWARDS_UNPACK : ldir : ELSE : lddr : ENDIF exx ld d,b ; prev_was_match = true djnz .decompress_data ; adjust context_index back to 0..255 range, go to main loop @@ -183,6 +190,9 @@ int upkr_decode_bit(int context_index) { return bit; } */ +inc_c_decode_bit: + ; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number) + inc c decode_bit: ; HL = upkr_state ; IX = upkr_data_ptr @@ -259,16 +269,16 @@ decode_bit: ; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1 .bit_is_0_2: ; *** adjust probs[context_index] 
- ld e,a ; preserve prob rra ; + (bit<<4) ; part of -prob_offset, needs another -16 and $FC ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0 rra rra rra ; A = (bit<<4) + (prob>>4), CF=(prob & 8) adc a,-16 ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16 - sub e ; A = (bit<<4) - 16 + ((prob + 8)>>4) - prob ; = ((prob + 8)>>4) - prob_offset - prob - neg ; A = prob_offset + prob - ((prob + 8)>>4) + ld e,a pop bc + ld a,(bc) ; A = prob (cheaper + shorter to re-read again from memory) + sub e ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4) ld (bc),a ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4); add a,d ; restore CF = bit (D = bit ? $FF : $00 && A > 0) pop de @@ -288,19 +298,16 @@ int upkr_decode_length(int context_index) { decode_number: ; HL = upkr_state ; IX = upkr_data_ptr - ; BC = probs+context_index + ; BC = probs+context_index-1 ; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00) ; return length in DE, CF=0 - ld de,$7FFF ; length = 0 with positional-stop-bit - jr .loop_entry + ld de,$FFFF ; length = 0 with positional-stop-bit + or a ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE .loop: - inc c ; context_index + 1 - call decode_bit + call c,inc_c_decode_bit ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init rr d rr e ; DE = length = (length >> 1) | (bit << 15); - inc c ; context_index += 2 -.loop_entry: - call decode_bit + call inc_c_decode_bit ; context_index += 2 jr c,.loop .fix_bit_pos: ccf ; NC will become this final `| (1 << bit_pos)` bit @@ -324,5 +331,51 @@ probs: EQU ($ + 255) & -$100 ; probs array aligned to DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c +/* + archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to + do `number|=(1<