From 1fb29f3a1b4e55f440446860a81e12a3cbbfad85 Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Sun, 18 Sep 2022 23:41:51 +0200 Subject: [PATCH 1/9] z80_unpacker: optimisation: -1B and -1T in decode_bit = 173B --- z80_unpacker/example/example.sna | Bin 49179 -> 49179 bytes z80_unpacker/unpack.asm | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index 515f5d68690562b02a92e501cbcea636569d53df..f9d8ac6a21f3660ab798d06a222eb4ffeaf3405a 100644 GIT binary patch delta 149 zcmbQ;z&yKwd4u>u#`BXU7KZU{c_>}A`NZ4c^oM*3XSYnQS?J6o|Ll*vy!^Qj@dvpk zPd>BINJs9Gg#gc4xksl34SrVM6q3FfAn~$-W!DwCM+Q7RXA>SlOb~elG$CQK)*>Y) ol}D2oE|gP1Hi)U|;z_~(^%9(5gQQMN1L@7Zi=N9&F515m0NL(DH~;_u delta 151 zcmbQ;z&yKwd4u>u#tV}r7KZU{eJEYE`NZ4c^oM*3XSYtSS?J6gFaPY1yuAFm50l@z z9GrZ5p^=XKBMSkZv+|El3mW{ayeTAoGeF{H1 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1 .bit_is_0_2: ; *** adjust probs[context_index] - ld e,a ; preserve prob rra ; + (bit<<4) ; part of -prob_offset, needs another -16 and $FC ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0 rra rra rra ; A = (bit<<4) + (prob>>4), CF=(prob & 8) adc a,-16 ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16 - sub e ; A = (bit<<4) - 16 + ((prob + 8)>>4) - prob ; = ((prob + 8)>>4) - prob_offset - prob - neg ; A = prob_offset + prob - ((prob + 8)>>4) + ld e,a pop bc + ld a,(bc) ; A = prob (cheaper + shorter to re-read again from memory) + sub e ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4) ld (bc),a ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4); add a,d ; restore CF = bit (D = bit ? 
$FF : $00 && A > 0) pop de From 8e5298caee30769740557fc3785f6596341c0537 Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 01:09:21 +0200 Subject: [PATCH 2/9] z80_unpacker: optimisation: -1B in decode_number = 172B (but +4T per length) --- z80_unpacker/example/example.sna | Bin 49179 -> 49179 bytes z80_unpacker/unpack.asm | 9 ++++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index f9d8ac6a21f3660ab798d06a222eb4ffeaf3405a..0bb80da49a0c00415f7f501071bc24f56ac9cb94 100644 GIT binary patch delta 222 zcmbQ;z&yKwd4u>u#&eS;7FzfhKC}?vIa~Pfw4lMy%9}#MHv=SIR4qiQ7qaeW1uE2J;j^VCy|J$rj<&zgI zH0I#|8z^;J8c0vRzfjI!<`KvenMY7dq+yoGJOWx0_XuK%&?BHFagU&ua2|ZS18Bz~ h6g#v*b|^iXyl|lr$P%D|U`u2+cQ1M=Gr4I0N&rZMXd3_k delta 224 zcmbQ;z&yKwd4u>u#`BXU7Fq-pJ+u(uIa~Dbw4lMy%9}#MHv=SIRPrMCIf5@i*~0L6)dIn!I445y%#xiC|mgHg_(1DKoib|8fAh4QNmR diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index cb458e4..e925d07 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -122,8 +122,7 @@ unpack: cp d ; CF = prev_was_match call nc,decode_bit ; if not prev_was_match, then upkr_decode_bit(256) jr nc,.keep_offset ; if neither, keep old offset - inc c ; context_index to first "number" set for offsets decoding (257) - call decode_number + call decode_number ; context_index is already 257-1 as needed by decode_number dec de ; offset = upkr_decode_length(257) - 1; ld a,d or e @@ -136,7 +135,7 @@ unpack: ; ++write_ptr; ; } ; prev_was_match = 1; - ld c,low(257 + NUMBER_BITS) ; context_index to second "number" set for lengths decoding + ld c,low(257 + NUMBER_BITS - 1) ; context_index to second "number" set for lengths decoding call decode_number ; length = upkr_decode_length(257 + 64); push de exx @@ -288,7 +287,7 @@ int upkr_decode_length(int context_index) { decode_number: ; HL = upkr_state ; IX = upkr_data_ptr - ; BC = probs+context_index + ; BC = probs+context_index-1 ; 
A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00) ; return length in DE, CF=0 ld de,$7FFF ; length = 0 with positional-stop-bit @@ -298,8 +297,8 @@ decode_number: call decode_bit rr d rr e ; DE = length = (length >> 1) | (bit << 15); - inc c ; context_index += 2 .loop_entry: + inc c ; context_index += 2 call decode_bit jr c,.loop .fix_bit_pos: From 00d084105ac1ab4266dd466b65a8487c4bafd6cf Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 01:31:22 +0200 Subject: [PATCH 3/9] z80_unpacker: optimisation: -2B in backward unpack (fwd 172B / rev 169B) backward was already -1B, so now the total difference is -3B. --- z80_unpacker/example/example.sna | Bin 49179 -> 49179 bytes z80_unpacker/unpack.asm | 17 ++++++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index 0bb80da49a0c00415f7f501071bc24f56ac9cb94..608c82a0cd10aa8ef47756414fd76847ca067304 100644 GIT binary patch delta 110 zcmV-!0FnQjfCHO=1F-y|O&a1j5Dd*4;>!^*`FYtA8`)48>39UX)EeS23(Z{O8+xz9000@m?YP-c5X9BXd>{}Ahad*segNGokL_ymAmWptp*0B_;>#M# Q8w|}Fvx%bX9FuatsTpcC3jhEB delta 112 zcmV-$0FVEhfCHO=1F-y|PaNVn5Dd*6;>!^*`FYtA8`)48>39UX)Ewe43(a2Q8+xz=YXJZN3Bm2S*-#L|)ysS!5D14L2Hk!D-7AmnYVsoDlc1qB2^`|f S8p|6D%^b6bqU#)!bib!|lQg&h diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index e925d07..324f032 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -139,16 +139,19 @@ unpack: call decode_number ; length = upkr_decode_length(257 + 64); push de exx - ld h,d ; DE = write_ptr - ld l,e -.offset+*: ld bc,0 IFNDEF BACKWARDS_UNPACK - sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset + ld h,d ; DE = write_ptr + ld l,e +.offset+*: ld bc,0 + sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset + pop bc ; BC = length + ldir ELSE - add hl,bc ; HL = write_ptr + offset +.offset+*: ld hl,0 + add hl,de ; HL = write_ptr + offset + pop bc ; BC = length + lddr 
ENDIF - pop bc ; BC = length - IFNDEF BACKWARDS_UNPACK : ldir : ELSE : lddr : ENDIF exx ld d,b ; prev_was_match = true djnz .decompress_data ; adjust context_index back to 0..255 range, go to main loop From c1ffd0e7ed1825bef171d50d81b996fff059c16e Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 11:42:56 +0200 Subject: [PATCH 4/9] z80_unpacker: attempt for faster `decode_number` (+6B, ~1% faster) => not good archived in comments for future reference --- z80_unpacker/unpack.asm | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index 324f032..a2d1fc9 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -15,7 +15,7 @@ ;; modifies: all registers except IY, requires 10 bytes of stack space ;; -; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker +; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--) ; initial IX points at last byte of compressed data ; initial DE' points at last byte of unpacked data @@ -140,6 +140,7 @@ unpack: push de exx IFNDEF BACKWARDS_UNPACK + ; forward unpack (write_ptr++, upkr_data_ptr++) ld h,d ; DE = write_ptr ld l,e .offset+*: ld bc,0 @@ -147,6 +148,7 @@ unpack: pop bc ; BC = length ldir ELSE + ; backward unpack (write_ptr--, upkr_data_ptr--) .offset+*: ld hl,0 add hl,de ; HL = write_ptr + offset pop bc ; BC = length @@ -311,6 +313,37 @@ decode_number: jr c,.fix_bit_pos ; until stop bit is reached (all bits did land to correct position) ret ; return with CF=0 (important for unpack routine) +/* + archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to + do `number|=(1< Date: Mon, 19 Sep 2022 11:49:53 +0200 Subject: [PATCH 5/9] z80_unpacker: optimisation: -1B in decode_number (fwd 171B / rev 168B) --- z80_unpacker/example/example.sna | Bin 49179 -> 49179 bytes z80_unpacker/unpack.asm | 9 +++++---- 2 files 
changed, 5 insertions(+), 4 deletions(-) diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index 608c82a0cd10aa8ef47756414fd76847ca067304..e3e730a6542a30c1ad427059d407fb98c5464bff 100644 GIT binary patch delta 167 zcmbQ;z&yKwd4u>u#4)R<*U85kt(XPODw~pbia{t?`PZg8r zFEr+5KU?_lwA5*7AZ77o^23D=qWq75Cd53tdQ*{sLGs|+9YF0zcqW@Jas#TBd<0f2 S`N-nS=B`DrWF{BwUkL!@3s7YM delta 165 zcmbQ;z&yKwd4u>u#&eS;7FzfhKC}?vIa~Pfw4lMy%9}#MHv=SIR4qiQ7qaeW1uE2J;j^VCy|J$rj<&zgI zH0I|38z^;J8bn)sn*3m)1EauX#YMuCO&7U|17)Qiod(NFJ+k;@4`N#UJh@Tvm-t45 L7c7%Y_Adtj`%g_n diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index a2d1fc9..4cd5829 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -187,6 +187,9 @@ int upkr_decode_bit(int context_index) { return bit; } */ +inc_c_decode_bit: + ; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number) + inc c decode_bit: ; HL = upkr_state ; IX = upkr_data_ptr @@ -298,13 +301,11 @@ decode_number: ld de,$7FFF ; length = 0 with positional-stop-bit jr .loop_entry .loop: - inc c ; context_index + 1 - call decode_bit + call inc_c_decode_bit ; context_index + 1 rr d rr e ; DE = length = (length >> 1) | (bit << 15); .loop_entry: - inc c ; context_index += 2 - call decode_bit + call inc_c_decode_bit ; context_index += 2 jr c,.loop .fix_bit_pos: ccf ; NC will become this final `| (1 << bit_pos)` bit From e1f9fa143a905943be1397159604cd3e6699a9a5 Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 11:58:32 +0200 Subject: [PATCH 6/9] z80_unpacker: comment with caller size optimisation tip --- z80_unpacker/unpack.asm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index 4cd5829..b3c48ea 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -21,6 +21,9 @@ ; DEFINE UPKR_UNPACK_SPEED ; uncomment to get larger but faster unpack routine +; code size hint: if you put probs array just ahead of BASIC 
entry point, you will get BC +; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B) + OPT push reset --syntax=abf MODULE upkr From a8fd3dc5732a8ffe2a22948194dfea982a9ca643 Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 13:20:44 +0200 Subject: [PATCH 7/9] z80_unpacker: optimisation: -1B in decode_number (fwd 170B / rev 167B) slightly slower code, ROM unpack is back to ~22.6s --- z80_unpacker/example/example.sna | Bin 49179 -> 49179 bytes z80_unpacker/unpack.asm | 7 +++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna index e3e730a6542a30c1ad427059d407fb98c5464bff..5a56975f125af16b3553eae3b10735738c65bc0f 100644 GIT binary patch delta 113 zcmV-%0FM8gfCHO=1F#pN0nC#ap}7P9x7?G}p(ZOC;y4ft%^Bj$5it3A*%BMsP#EcW z1iI82;xG%%TjCpfv&ijJ4&n|U&0CWuq9Fnw;*+7FFc$x}+!*4^8p|8a7~(keKg$}+ T8#wvNu_5{wu`uielXSnQujDop delta 115 zcmbQ;z&yKwd4u>u#9-S66_*r>VNcv`g#LEhn zU00+Y8E~JCc_dx6`NZ2`{zrWBXJaO-E>dJtcr> 1) | (bit << 15); -.loop_entry: call inc_c_decode_bit ; context_index += 2 jr c,.loop .fix_bit_pos: From 9913dcf4bb77e5ef5a4a9591a915a063f07a8af6 Mon Sep 17 00:00:00 2001 From: "Peter Helcmanovsky (Ped)" Date: Mon, 19 Sep 2022 14:31:00 +0200 Subject: [PATCH 8/9] z80_unpacker: comment with possible LUT variant of updating probs value missing 512 byte table generator, which doesn't look trivial to do (especially in terms of code size). Not tested, but looks as decent speed up. 
--- z80_unpacker/unpack.asm | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index 300579d..e0218e9 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -316,6 +316,21 @@ decode_number: jr c,.fix_bit_pos ; until stop bit is reached (all bits did land to correct position) ret ; return with CF=0 (important for unpack routine) + DISPLAY "upkr.unpack total size: ",/D,$-unpack + + ; reserve space for probs array without emitting any machine code (using only EQU) + + IFDEF UPKR_PROBS_ORIGIN ; if specific address is defined by user, move probs array there +probs: EQU ((UPKR_PROBS_ORIGIN) + 255) & -$100 ; probs array aligned to 256 + ELSE +probs: EQU ($ + 255) & -$100 ; probs array aligned to 256 + ENDIF +.real_c: EQU 1 + 255 + 1 + 2*NUMBER_BITS ; real size of probs array +.c: EQU (.real_c + 1) & -2 ; padding to even size (required by init code) +.e: EQU probs + .c + + DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c + /* archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to do `number|=(1< Date: Mon, 19 Sep 2022 15:19:39 +0200 Subject: [PATCH 9/9] z80_unpacker: readme.txt and comment update --- z80_unpacker/readme.txt | 21 +++++++++++++++++---- z80_unpacker/unpack.asm | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/z80_unpacker/readme.txt b/z80_unpacker/readme.txt index b7fff1b..4d93ac9 100644 --- a/z80_unpacker/readme.txt +++ b/z80_unpacker/readme.txt @@ -10,10 +10,23 @@ may be incompatible with files you will produce with current version) Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus +Backward direction unpacker added as compile-time option, see example for both forward/backward +depacker in action. 
+ +The packed/unpacked data overlap has to be tested case by case; in the worst case the packed data +may need even more than 7 bytes to unpack the final byte, but usually 1-4 bytes suffice. + TODO: -- build base corpus of test data to benchmark future changes in algorithm/format -- review first implementation to identify weak spots where the implementation can be shorter+faster -with acceptable small changes to the format -- review non-bitstream variant, if it's feasible to try to implement it with Z80 +- build a bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom were used for the initial tests) +- maybe try to beat the double-loop `decode_number` with a different encoding format - (@ped7g) Z80N version of unpacker for ZX Next devs - (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? (and overall decide how to organise+merge this upstream into main repo) +- (@exoticorn) add to packer output with possible packed/unpacked region overlap + +DONE: +* review non-bitstream variant, if it's feasible to try to implement it with Z80 + - Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update +* review first implementation to identify weak spots where the implementation can be shorter+faster +with acceptable small changes to the format + - Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change + - Ped7g: the decode_number double-loop is surprisingly resilient, especially in terms of code size I failed to beat it, speed wise only negligible gains diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm index e0218e9..637b048 100644 --- a/z80_unpacker/unpack.asm +++ b/z80_unpacker/unpack.asm @@ -4,7 +4,7 @@ ;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed
same as upkr project ("unlicensed") ;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus ;; -;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes), +;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes), ;; otherwise it will be positioned after the unpacker code (256 aligned) ;; ;; public API: