mirror of
https://github.com/exoticorn/upkr.git
synced 2026-01-20 19:46:42 +01:00
missing 512 byte table generator, which doesn't look trivial to do (especially in terms of code size). Not tested, but looks as decent speed up.
382 lines
14 KiB
NASM
382 lines
14 KiB
NASM
;; https://github.com/exoticorn/upkr/blob/z80/c_unpacker/unpack.c - original C implementation
|
|
;; C source in comments ahead of asm - the C macros are removed to keep only bitstream variant
|
|
;;
|
|
;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed")
|
|
;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus
|
|
;;
|
|
;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes),
|
|
;; otherwise it will be positioned after the unpacker code (256 aligned)
|
|
;;
|
|
;; public API:
|
|
;;
|
|
;; upkr.unpack
|
|
;; IN: IX = packed data, DE' (shadow DE) = destination
|
|
;; OUT: IX = after packed data
|
|
;; modifies: all registers except IY, requires 10 bytes of stack space
|
|
;;
|
|
|
|
; DEFINE BACKWARDS_UNPACK ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--)
|
|
; initial IX points at last byte of compressed data
|
|
; initial DE' points at last byte of unpacked data
|
|
|
|
; DEFINE UPKR_UNPACK_SPEED ; uncomment to get larger but faster unpack routine
|
|
|
|
; code size hint: if you put probs array just ahead of BASIC entry point, you will get BC
|
|
; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B)
|
|
|
|
OPT push reset --syntax=abf
|
|
MODULE upkr
|
|
|
|
NUMBER_BITS EQU 16+15 ; context-bits per offset/length (16+15 for 16bit offsets/pointers)
|
|
; numbers (offsets/lengths) are encoded like: 1a1b1c1d1e0 = 0000'0000'001e'dbca
|
|
|
|
/*
|
|
u8* upkr_data_ptr;
|
|
u8 upkr_probs[1 + 255 + 1 + 2*32 + 2*32];
|
|
u16 upkr_state;
|
|
u8 upkr_current_byte;
|
|
int upkr_bits_left;
|
|
|
|
int upkr_unpack(void* destination, void* compressed_data) {
|
|
upkr_data_ptr = (u8*)compressed_data;
|
|
upkr_state = 0;
|
|
upkr_bits_left = 0;
|
|
for(int i = 0; i < sizeof(upkr_probs); ++i)
|
|
upkr_probs[i] = 128;
|
|
|
|
u8* write_ptr = (u8*)destination;
|
|
|
|
int prev_was_match = 0;
|
|
int offset = 0;
|
|
for(;;) {
|
|
if(upkr_decode_bit(0)) {
|
|
if(prev_was_match || upkr_decode_bit(256)) {
|
|
offset = upkr_decode_length(257) - 1;
|
|
if(offset == 0) {
|
|
break;
|
|
}
|
|
}
|
|
int length = upkr_decode_length(257 + 64);
|
|
while(length--) {
|
|
*write_ptr = write_ptr[-offset];
|
|
++write_ptr;
|
|
}
|
|
prev_was_match = 1;
|
|
} else {
|
|
int byte = 1;
|
|
while(byte < 256) {
|
|
int bit = upkr_decode_bit(byte);
|
|
byte = (byte << 1) + bit;
|
|
}
|
|
*write_ptr++ = byte;
|
|
prev_was_match = 0;
|
|
}
|
|
}
|
|
|
|
return write_ptr - (u8*)destination;
|
|
}
|
|
*/
|
|
; IN: IX = compressed_data, DE' = destination
|
|
unpack:
|
|
; ** reset probs to 0x80, also reset HL (state) to zero, and set BC to probs+context 0
|
|
ld hl,probs.c>>1
|
|
ld bc,probs.e
|
|
ld a,$80
|
|
.reset_probs:
|
|
dec bc
|
|
ld (bc),a ; will overwrite one extra byte after the array because of odd length
|
|
dec bc
|
|
ld (bc),a
|
|
dec l
|
|
jr nz,.reset_probs
|
|
exa
|
|
; BC = probs (context_index 0), state HL = 0, A' = 0x80 (no source bits left in upkr_current_byte)
|
|
|
|
; ** main loop to decompress data
|
|
; D = prev_was_match = uninitialised, literal is expected first => will reset D to "false"
|
|
; values for false/true of prev_was_match are: false = high(probs), true = 1 + high(probs)
|
|
.decompress_data:
|
|
ld c,0
|
|
call decode_bit ; if(upkr_decode_bit(0))
|
|
jr c,.copy_chunk
|
|
|
|
; * extract byte from compressed data (literal)
|
|
inc c ; C = byte = 1 (and also context_index)
|
|
.decode_byte:
|
|
call decode_bit ; bit = upkr_decode_bit(byte);
|
|
rl c ; byte = (byte << 1) + bit;
|
|
jr nc,.decode_byte ; while(byte < 256)
|
|
ld a,c
|
|
exx
|
|
ld (de),a ; *write_ptr++ = byte;
|
|
IFNDEF BACKWARDS_UNPACK : inc de : ELSE : dec de : ENDIF
|
|
exx
|
|
ld d,b ; prev_was_match = false
|
|
jr .decompress_data
|
|
|
|
; * copy chunk of already decompressed data (match)
|
|
.copy_chunk:
|
|
ld a,b
|
|
inc b ; context_index = 256
|
|
; if(prev_was_match || upkr_decode_bit(256)) {
|
|
; offset = upkr_decode_length(257) - 1;
|
|
; if (0 == offset) break;
|
|
; }
|
|
cp d ; CF = prev_was_match
|
|
call nc,decode_bit ; if not prev_was_match, then upkr_decode_bit(256)
|
|
jr nc,.keep_offset ; if neither, keep old offset
|
|
call decode_number ; context_index is already 257-1 as needed by decode_number
|
|
dec de ; offset = upkr_decode_length(257) - 1;
|
|
ld a,d
|
|
or e
|
|
ret z ; if(offset == 0) break
|
|
ld (.offset),de
|
|
.keep_offset:
|
|
; int length = upkr_decode_length(257 + 64);
|
|
; while(length--) {
|
|
; *write_ptr = write_ptr[-offset];
|
|
; ++write_ptr;
|
|
; }
|
|
; prev_was_match = 1;
|
|
ld c,low(257 + NUMBER_BITS - 1) ; context_index to second "number" set for lengths decoding
|
|
call decode_number ; length = upkr_decode_length(257 + 64);
|
|
push de
|
|
exx
|
|
IFNDEF BACKWARDS_UNPACK
|
|
; forward unpack (write_ptr++, upkr_data_ptr++)
|
|
ld h,d ; DE = write_ptr
|
|
ld l,e
|
|
.offset+*: ld bc,0
|
|
sbc hl,bc ; CF=0 from decode_number ; HL = write_ptr - offset
|
|
pop bc ; BC = length
|
|
ldir
|
|
ELSE
|
|
; backward unpack (write_ptr--, upkr_data_ptr--)
|
|
.offset+*: ld hl,0
|
|
add hl,de ; HL = write_ptr + offset
|
|
pop bc ; BC = length
|
|
lddr
|
|
ENDIF
|
|
exx
|
|
ld d,b ; prev_was_match = true
|
|
djnz .decompress_data ; adjust context_index back to 0..255 range, go to main loop
|
|
|
|
/*
|
|
int upkr_decode_bit(int context_index) {
|
|
while(upkr_state < 32768) {
|
|
if(upkr_bits_left == 0) {
|
|
upkr_current_byte = *upkr_data_ptr++;
|
|
upkr_bits_left = 8;
|
|
}
|
|
upkr_state = (upkr_state << 1) + (upkr_current_byte >> 7);
|
|
upkr_current_byte <<= 1;
|
|
--upkr_bits_left;
|
|
}
|
|
|
|
int prob = upkr_probs[context_index];
|
|
int bit = (upkr_state & 255) >= prob ? 1 : 0;
|
|
|
|
int prob_offset = 16;
|
|
int state_offset = 0;
|
|
int state_scale = prob;
|
|
if(bit) {
|
|
state_offset = -prob;
|
|
state_scale = 256 - prob;
|
|
prob_offset = 0;
|
|
}
|
|
upkr_state = state_offset + state_scale * (upkr_state >> 8) + (upkr_state & 255);
|
|
upkr_probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
|
|
|
|
return bit;
|
|
}
|
|
*/
|
|
inc_c_decode_bit:
|
|
; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number)
|
|
inc c
|
|
decode_bit:
|
|
; HL = upkr_state
|
|
; IX = upkr_data_ptr
|
|
; BC = probs+context_index
|
|
; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
|
|
; preserves DE
|
|
; ** while (state < 32768) - initial check
|
|
push de
|
|
bit 7,h
|
|
jr nz,.state_b15_set
|
|
exa
|
|
; ** while body
|
|
.state_b15_zero:
|
|
; HL = upkr_state
|
|
; IX = upkr_data_ptr
|
|
; A = upkr_current_byte (init to 0x80 at start, not 0x00)
|
|
add a,a ; upkr_current_byte <<= 1; // and testing if(upkr_bits_left == 0)
|
|
jr nz,.has_bit ; CF=data, ZF=0 -> some bits + stop bit still available
|
|
; CF=1 (by stop bit)
|
|
ld a,(ix)
|
|
IFNDEF BACKWARDS_UNPACK : inc ix : ELSE : dec ix : ENDIF ; upkr_current_byte = *upkr_data_ptr++;
|
|
adc a,a ; CF=data, b0=1 as new stop bit
|
|
.has_bit:
|
|
adc hl,hl ; upkr_state = (upkr_state << 1) + (upkr_current_byte >> 7);
|
|
jp p,.state_b15_zero ; while (state < 32768)
|
|
exa
|
|
; ** set "bit"
|
|
.state_b15_set:
|
|
ld a,(bc) ; A = upkr_probs[context_index]
|
|
dec a ; prob is in ~7..249 range, never zero, safe to -1
|
|
cp l ; CF = bit = prob-1 < (upkr_state & 255) <=> prob <= (upkr_state & 255)
|
|
inc a
|
|
; ** adjust state
|
|
push bc
|
|
ld c,l ; C = (upkr_state & 255); (preserving the value)
|
|
push af
|
|
jr nc,.bit_is_0
|
|
neg ; A = -prob == (256-prob), CF=1 preserved
|
|
.bit_is_0:
|
|
ld d,0
|
|
ld e,a ; DE = state_scale ; prob || (256-prob)
|
|
ld l,d ; H:L = (upkr_state>>8) : 0
|
|
|
|
IFNDEF UPKR_UNPACK_SPEED
|
|
|
|
;; looped MUL for minimum unpack size
|
|
ld b,8 ; counter
|
|
.mulLoop:
|
|
add hl,hl
|
|
jr nc,.mul0
|
|
add hl,de
|
|
.mul0:
|
|
djnz .mulLoop ; until HL = state_scale * (upkr_state>>8), also BC becomes (upkr_state & 255)
|
|
|
|
ELSE
|
|
|
|
;;; unrolled MUL for better performance, +25 bytes unpack size
|
|
ld b,d
|
|
DUP 8
|
|
add hl,hl
|
|
jr nc,0_f
|
|
add hl,de
|
|
0:
|
|
EDUP
|
|
|
|
ENDIF
|
|
|
|
add hl,bc ; HL = state_scale * (upkr_state >> 8) + (upkr_state & 255)
|
|
pop af ; restore prob and CF=bit
|
|
jr nc,.bit_is_0_2
|
|
dec d ; DE = -prob (also D = bit ? $FF : $00)
|
|
add hl,de ; HL += -prob
|
|
; ^ this always preserves CF=1, because (state>>8) >= 128, state_scale: 7..250, prob: 7..250,
|
|
; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1
|
|
.bit_is_0_2:
|
|
; *** adjust probs[context_index]
|
|
rra ; + (bit<<4) ; part of -prob_offset, needs another -16
|
|
and $FC ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0
|
|
rra
|
|
rra
|
|
rra ; A = (bit<<4) + (prob>>4), CF=(prob & 8)
|
|
adc a,-16 ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16
|
|
ld e,a
|
|
pop bc
|
|
ld a,(bc) ; A = prob (cheaper + shorter to re-read again from memory)
|
|
sub e ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4)
|
|
ld (bc),a ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
|
|
add a,d ; restore CF = bit (D = bit ? $FF : $00 && A > 0)
|
|
pop de
|
|
ret
|
|
|
|
/*
|
|
int upkr_decode_length(int context_index) {
|
|
int length = 0;
|
|
int bit_pos = 0;
|
|
while(upkr_decode_bit(context_index)) {
|
|
length |= upkr_decode_bit(context_index + 1) << bit_pos++;
|
|
context_index += 2;
|
|
}
|
|
return length | (1 << bit_pos);
|
|
}
|
|
*/
|
|
decode_number:
|
|
; HL = upkr_state
|
|
; IX = upkr_data_ptr
|
|
; BC = probs+context_index-1
|
|
; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
|
|
; return length in DE, CF=0
|
|
ld de,$FFFF ; length = 0 with positional-stop-bit
|
|
or a ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE
|
|
.loop:
|
|
call c,inc_c_decode_bit ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init
|
|
rr d
|
|
rr e ; DE = length = (length >> 1) | (bit << 15);
|
|
call inc_c_decode_bit ; context_index += 2
|
|
jr c,.loop
|
|
.fix_bit_pos:
|
|
ccf ; NC will become this final `| (1 << bit_pos)` bit
|
|
rr d
|
|
rr e
|
|
jr c,.fix_bit_pos ; until stop bit is reached (all bits did land to correct position)
|
|
ret ; return with CF=0 (important for unpack routine)
|
|
|
|
DISPLAY "upkr.unpack total size: ",/D,$-unpack
|
|
|
|
; reserve space for probs array without emitting any machine code (using only EQU)
|
|
|
|
IFDEF UPKR_PROBS_ORIGIN ; if specific address is defined by user, move probs array there
|
|
probs: EQU ((UPKR_PROBS_ORIGIN) + 255) & -$100 ; probs array aligned to 256
|
|
ELSE
|
|
probs: EQU ($ + 255) & -$100 ; probs array aligned to 256
|
|
ENDIF
|
|
.real_c: EQU 1 + 255 + 1 + 2*NUMBER_BITS ; real size of probs array
|
|
.c: EQU (.real_c + 1) & -2 ; padding to even size (required by init code)
|
|
.e: EQU probs + .c
|
|
|
|
DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c
|
|
|
|
/*
|
|
archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to
|
|
do `number|=(1<<bit_pos);` type of logic in single loop.
|
|
*/
|
|
; decode_number:
|
|
; exx
|
|
; ld bc,1
|
|
; ld l,b
|
|
; ld h,b ; HL = 0
|
|
; .loop
|
|
; exx
|
|
; inc c
|
|
; call decode_bit
|
|
; jr nc,.done
|
|
; inc c
|
|
; call decode_bit
|
|
; exx
|
|
; jr nc,.b0
|
|
; add hl,bc
|
|
; .b0:
|
|
; sla c
|
|
; rl b
|
|
; jr .loop
|
|
; .done:
|
|
; exx
|
|
; add hl,bc
|
|
; push hl
|
|
; exx
|
|
; pop de
|
|
; ret
|
|
|
|
/*
|
|
archived: possible LUT variant of updating probs value, requires 512-aligned 512B table (not tested)
|
|
*/
|
|
; code is replacing decode_bit from "; *** adjust probs[context_index]", followed by `ld (bc),a : add a,d ...`
|
|
; ld c,a
|
|
; ld a,high(probs_update_table)/2 ; must be 512 aligned
|
|
; rla
|
|
; ld b,a
|
|
; ld a,(bc)
|
|
; pop bc
|
|
; -------------------------------------------
|
|
; probs_update_table: EQU probs-512
|
|
; -------------------------------------------
|
|
; table generator is not obvious and probably not short either, 20+ bytes almost for sure, maybe even 30-40
|
|
|
|
ENDMODULE
|
|
OPT pop
|