10 Commits

Author SHA1 Message Date
48727040b3 Merge pull request #5 from ped7g/z80_ped7g
Z80 ped7g - further optimisations
2022-09-27 22:35:44 +02:00
Peter Helcmanovsky (Ped)
8a32e1384c z80_unpacker: readme.txt and comment update 2022-09-19 15:19:39 +02:00
Peter Helcmanovsky (Ped)
9913dcf4bb z80_unpacker: comment with possible LUT variant of updating probs value
missing 512 byte table generator, which doesn't look trivial to do
(especially in terms of code size).
Not tested, but looks as decent speed up.
2022-09-19 14:31:00 +02:00
Peter Helcmanovsky (Ped)
a8fd3dc573 z80_unpacker: optimisation: -1B in decode_number (fwd 170B / rev 167B)
slightly slower code, ROM unpack is back to ~22.6s
2022-09-19 13:20:44 +02:00
Peter Helcmanovsky (Ped)
e1f9fa143a z80_unpacker: comment with caller size optimisation tip 2022-09-19 11:58:32 +02:00
Peter Helcmanovsky (Ped)
db1c7d2d14 z80_unpacker: optimisation: -1B in decode_number (fwd 171B / rev 168B) 2022-09-19 11:49:53 +02:00
Peter Helcmanovsky (Ped)
c1ffd0e7ed z80_unpacker: attempt for faster decode_number (+6B, ~1% faster) => not good
archived in comments for future reference
2022-09-19 11:42:56 +02:00
Peter Helcmanovsky (Ped)
00d084105a z80_unpacker: optimisation: -2B in backward unpack (fwd 172B / rev 169B)
backward was already -1B, so now the total difference is -3B.
2022-09-19 01:31:22 +02:00
Peter Helcmanovsky (Ped)
8e5298caee z80_unpacker: optimisation: -1B in decode_number = 172B (but +4T per length) 2022-09-19 01:09:21 +02:00
Peter Helcmanovsky (Ped)
1fb29f3a1b z80_unpacker: optimisation: -1B and -1T in decode_bit = 173B 2022-09-18 23:44:18 +02:00
3 changed files with 93 additions and 27 deletions

Binary file not shown.

View File

@@ -10,10 +10,23 @@ may be incompatible with files you will produce with current version)
Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus
Backward direction unpacker added as compile-time option, see example for both forward/backward
depacker in action.
The packed/unpacked data overlap has to be tested case by case: in the worst case the packed data
may need more than 7 bytes to unpack the final byte, but usually 1-4 bytes suffice.
TODO:
- build base corpus of test data to benchmark future changes in algorithm/format
- review first implementation to identify weak spots where the implementation can be shorter+faster
with acceptable small changes to the format
- review non-bitstream variant, if it's feasible to try to implement it with Z80
- build bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom was used to do initial tests)
- maybe try to beat double-loop `decode_number` with different encoding format
- (@ped7g) Z80N version of unpacker for ZX Next devs
- (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? (and overall decide how to organise+merge this upstream into main repo)
- (@exoticorn) add to packer output with possible packed/unpacked region overlap
DONE:
* review non-bitstream variant, if it's feasible to try to implement it with Z80
- Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update
* review first implementation to identify weak spots where the implementation can be shorter+faster
with acceptable small changes to the format
- Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change
- Ped7g: the decode_number double-loop is surprisingly resilient: I failed to beat it in terms of code size, and speed-wise there are only negligible gains

View File

@@ -4,7 +4,7 @@
;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed")
;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus
;;
;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes),
;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes),
;; otherwise it will be positioned after the unpacker code (256 aligned)
;;
;; public API:
@@ -15,12 +15,15 @@
;; modifies: all registers except IY, requires 10 bytes of stack space
;;
; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker
; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--)
; initial IX points at last byte of compressed data
; initial DE' points at last byte of unpacked data
; DEFINE UPKR_UNPACK_SPEED ; uncomment to get larger but faster unpack routine
; code size hint: if you put the probs array just ahead of the BASIC entry point, BC will be
; initialised to probs.e by the BASIC `USR` command and you can remove its initialisation from unpack init (-3B)
OPT push reset --syntax=abf
MODULE upkr
@@ -122,8 +125,7 @@ unpack:
cp d ; CF = prev_was_match
call nc,decode_bit ; if not prev_was_match, then upkr_decode_bit(256)
jr nc,.keep_offset ; if neither, keep old offset
inc c ; context_index to first "number" set for offsets decoding (257)
call decode_number
call decode_number ; context_index is already 257-1 as needed by decode_number
dec de ; offset = upkr_decode_length(257) - 1;
ld a,d
or e
@@ -136,20 +138,25 @@ unpack:
; ++write_ptr;
; }
; prev_was_match = 1;
ld c,low(257 + NUMBER_BITS) ; context_index to second "number" set for lengths decoding
ld c,low(257 + NUMBER_BITS - 1) ; context_index to second "number" set for lengths decoding
call decode_number ; length = upkr_decode_length(257 + 64);
push de
exx
ld h,d ; DE = write_ptr
ld l,e
.offset+*: ld bc,0
IFNDEF BACKWARDS_UNPACK
sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset
; forward unpack (write_ptr++, upkr_data_ptr++)
ld h,d ; DE = write_ptr
ld l,e
.offset+*: ld bc,0
sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset
pop bc ; BC = length
ldir
ELSE
add hl,bc ; HL = write_ptr + offset
; backward unpack (write_ptr--, upkr_data_ptr--)
.offset+*: ld hl,0
add hl,de ; HL = write_ptr + offset
pop bc ; BC = length
lddr
ENDIF
pop bc ; BC = length
IFNDEF BACKWARDS_UNPACK : ldir : ELSE : lddr : ENDIF
exx
ld d,b ; prev_was_match = true
djnz .decompress_data ; adjust context_index back to 0..255 range, go to main loop
@@ -183,6 +190,9 @@ int upkr_decode_bit(int context_index) {
return bit;
}
*/
inc_c_decode_bit:
; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number)
; Alternate entry point: bumps only register C, then falls through into decode_bit
; right below, so decode_number can issue one `call inc_c_decode_bit` in place of
; a separate `inc c` followed by `call decode_bit`.
inc c
decode_bit:
; HL = upkr_state
; IX = upkr_data_ptr
@@ -259,16 +269,16 @@ decode_bit:
; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1
.bit_is_0_2:
; *** adjust probs[context_index]
ld e,a ; preserve prob
rra ; + (bit<<4) ; part of -prob_offset, needs another -16
and $FC ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0
rra
rra
rra ; A = (bit<<4) + (prob>>4), CF=(prob & 8)
adc a,-16 ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16
sub e ; A = (bit<<4) - 16 + ((prob + 8)>>4) - prob ; = ((prob + 8)>>4) - prob_offset - prob
neg ; A = prob_offset + prob - ((prob + 8)>>4)
ld e,a
pop bc
ld a,(bc) ; A = prob (cheaper + shorter to re-read again from memory)
sub e ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4)
ld (bc),a ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
add a,d ; restore CF = bit (D = bit ? $FF : $00 && A > 0)
pop de
@@ -288,19 +298,16 @@ int upkr_decode_length(int context_index) {
decode_number:
; HL = upkr_state
; IX = upkr_data_ptr
; BC = probs+context_index
; BC = probs+context_index-1
; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
; return length in DE, CF=0
ld de,$7FFF ; length = 0 with positional-stop-bit
jr .loop_entry
ld de,$FFFF ; length = 0 with positional-stop-bit
or a ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE
.loop:
inc c ; context_index + 1
call decode_bit
call c,inc_c_decode_bit ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init
rr d
rr e ; DE = length = (length >> 1) | (bit << 15);
inc c ; context_index += 2
.loop_entry:
call decode_bit
call inc_c_decode_bit ; context_index += 2
jr c,.loop
.fix_bit_pos:
ccf ; NC will become this final `| (1 << bit_pos)` bit
@@ -324,5 +331,51 @@ probs: EQU ($ + 255) & -$100 ; probs array aligned to
DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c
/*
archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to
do `number|=(1<<bit_pos);` type of logic in single loop.
*/
; decode_number:
; exx
; ld bc,1
; ld l,b
; ld h,b ; HL = 0
; .loop
; exx
; inc c
; call decode_bit
; jr nc,.done
; inc c
; call decode_bit
; exx
; jr nc,.b0
; add hl,bc
; .b0:
; sla c
; rl b
; jr .loop
; .done:
; exx
; add hl,bc
; push hl
; exx
; pop de
; ret
/*
archived: possible LUT variant of updating probs value, requires 512-aligned 512B table (not tested)
*/
; code is replacing decode_bit from "; *** adjust probs[context_index]", followed by `ld (bc),a : add a,d ...`
; ld c,a
; ld a,high(probs_update_table)/2 ; must be 512 aligned
; rla
; ld b,a
; ld a,(bc)
; pop bc
; -------------------------------------------
; probs_update_table: EQU probs-512
; -------------------------------------------
; table generator is not obvious and probably not short either, 20+ bytes almost for sure, maybe even 30-40
ENDMODULE
OPT pop