diff --git a/z80_unpacker/example/example.sna b/z80_unpacker/example/example.sna
index 515f5d6..5a56975 100644
Binary files a/z80_unpacker/example/example.sna and b/z80_unpacker/example/example.sna differ
diff --git a/z80_unpacker/readme.txt b/z80_unpacker/readme.txt
index b7fff1b..4d93ac9 100644
--- a/z80_unpacker/readme.txt
+++ b/z80_unpacker/readme.txt
@@ -10,10 +10,23 @@ may be incompatible with files you will produce with current version)
 
 Asm syntax is z00m's sjasmplus: https://github.com/z00m128/sjasmplus
 
+Backward direction unpacker added as compile-time option, see example for both forward/backward
+depacker in action.
+
+The packed/unpacked data-overlap has to be tested per-case, in worst case the packed data
+may need even more than 7 bytes to unpack final byte, but usually 1-4 bytes may suffice.
+
 TODO:
-- build base corpus of test data to benchmark future changes in algorithm/format
-- review first implementation to identify weak spots where the implementation can be shorter+faster
-with acceptable small changes to the format
-- review non-bitstream variant, if it's feasible to try to implement it with Z80
+- build bigger corpus of test data to benchmark future changes in algorithm/format (example and zx48.rom was used to do initial tests)
+- maybe try to beat double-loop `decode_number` with different encoding format
 - (@ped7g) Z80N version of unpacker for ZX Next devs
 - (@exoticorn) add Z80 specific packer (to avoid confusion with original MicroW8 variant), and land it all to master branch, maybe in "z80" directory or something? (and overall decide how to organise+merge this upstream into main repo)
+- (@exoticorn) add to packer output with possible packed/unpacked region overlap
+
+DONE:
+* review non-bitstream variant, if it's feasible to try to implement it with Z80
+    - Ped7g: IMHO nope, the 12b x 8b MUL code would probably quickly cancel any gains from the simpler state update
+* review first implementation to identify weak spots where the implementation can be shorter+faster
+with acceptable small changes to the format
+    - Ped7g: the decode_bit settled down and now doesn't feel so confused and redundant, the code seems pretty on point to me, no obvious simplification from format change
+    - Ped7g: the decode_number double-loop is surprisingly resilient, especially in terms of code size I failed to beat it, speed wise only negligible gains
diff --git a/z80_unpacker/unpack.asm b/z80_unpacker/unpack.asm
index 26e47b7..637b048 100644
--- a/z80_unpacker/unpack.asm
+++ b/z80_unpacker/unpack.asm
@@ -4,7 +4,7 @@
 ;; initial version by Peter "Ped" Helcmanovsky (C) 2022, licensed same as upkr project ("unlicensed")
 ;; to assemble use z00m's sjasmplus: https://github.com/z00m128/sjasmplus
 ;;
-;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (386 bytes),
+;; you can define UPKR_PROBS_ORIGIN to specific 256 byte aligned address for probs array (320 bytes),
 ;; otherwise it will be positioned after the unpacker code (256 aligned)
 ;;
 ;; public API:
@@ -15,12 +15,15 @@
 ;;         modifies: all registers except IY, requires 10 bytes of stack space
 ;;
 
-;     DEFINE BACKWARDS_UNPACK         ; uncomment to build backwards depacker
+;     DEFINE BACKWARDS_UNPACK         ; uncomment to build backwards depacker (write_ptr--, upkr_data_ptr--)
             ; initial IX points at last byte of compressed data
             ; initial DE' points at last byte of unpacked data
 
 ;     DEFINE UPKR_UNPACK_SPEED        ; uncomment to get larger but faster unpack routine
 
+; code size hint: if you put probs array just ahead of BASIC entry point, you will get BC
+; initialised to probs.e by BASIC `USR` command and you can remove it from unpack init (-3B)
+
     OPT push reset --syntax=abf
     MODULE upkr
 
@@ -122,8 +125,7 @@ unpack:
     cp      d                   ; CF = prev_was_match
     call    nc,decode_bit       ; if not prev_was_match, then upkr_decode_bit(256)
     jr      nc,.keep_offset     ; if neither, keep old offset
-    inc     c                   ; context_index to first "number" set for offsets decoding (257)
-    call    decode_number
+    call    decode_number       ; context_index is already 257-1 as needed by decode_number
     dec     de                  ; offset = upkr_decode_length(257) - 1;
     ld      a,d
     or      e
@@ -136,20 +138,25 @@ unpack:
         ;                 ++write_ptr;
         ;             }
         ;             prev_was_match = 1;
-    ld      c,low(257 + NUMBER_BITS)    ; context_index to second "number" set for lengths decoding
+    ld      c,low(257 + NUMBER_BITS - 1)    ; context_index to second "number" set for lengths decoding
     call    decode_number       ; length = upkr_decode_length(257 + 64);
     push    de
     exx
-    ld      h,d                 ; DE = write_ptr
-    ld      l,e
-.offset+*:  ld      bc,0
     IFNDEF BACKWARDS_UNPACK
-    sbc     hl,bc               ; CF=0 from decode_number ; HL = write_ptr - offset
+        ; forward unpack (write_ptr++, upkr_data_ptr++)
+        ld      h,d             ; DE = write_ptr
+        ld      l,e
+.offset+*:  ld  bc,0
+        sbc     hl,bc           ; CF=0 from decode_number ; HL = write_ptr - offset
+        pop     bc              ; BC = length
+        ldir
     ELSE
-    add     hl,bc               ; HL = write_ptr + offset
+        ; backward unpack (write_ptr--, upkr_data_ptr--)
+.offset+*:  ld  hl,0
+        add     hl,de           ; HL = write_ptr + offset
+        pop     bc              ; BC = length
+        lddr
     ENDIF
-    pop     bc                  ; BC = length
-    IFNDEF BACKWARDS_UNPACK : ldir : ELSE : lddr : ENDIF
     exx
     ld      d,b                 ; prev_was_match = true
     djnz    .decompress_data    ; adjust context_index back to 0..255 range, go to main loop
@@ -183,6 +190,9 @@ int upkr_decode_bit(int context_index) {
     return bit;
 }
 */
+inc_c_decode_bit:
+  ; ++low(context_index) before decode_bit (to get -1B by two calls in decode_number)
+    inc     c
 decode_bit:
   ; HL = upkr_state
   ; IX = upkr_data_ptr
@@ -259,16 +269,16 @@ decode_bit:
     ; so 7*128 > 250 and thus edge case `ADD hl=(7*128+0),de=(-250)` => CF=1
 .bit_is_0_2:
  ; *** adjust probs[context_index]
-    ld      e,a                     ; preserve prob
     rra                             ; + (bit<<4) ; part of -prob_offset, needs another -16
     and     $FC                     ; clear/keep correct bits to get desired (prob>>4) + extras, CF=0
     rra
     rra
     rra                             ; A = (bit<<4) + (prob>>4), CF=(prob & 8)
     adc     a,-16                   ; A = (bit<<4) - 16 + ((prob + 8)>>4) ; -prob_offset = (bit<<4) - 16
-    sub     e                       ; A = (bit<<4) - 16 + ((prob + 8)>>4) - prob ; = ((prob + 8)>>4) - prob_offset - prob
-    neg                             ; A = prob_offset + prob - ((prob + 8)>>4)
+    ld      e,a
     pop     bc
+    ld      a,(bc)                  ; A = prob (cheaper + shorter to re-read again from memory)
+    sub     e                       ; A = 16 - (bit<<4) + prob - ((prob + 8)>>4) ; = prob_offset + prob - ((prob + 8)>>4)
     ld      (bc),a                  ; probs[context_index] = prob_offset + prob - ((prob + 8) >> 4);
     add     a,d                     ; restore CF = bit (D = bit ? $FF : $00 && A > 0)
     pop     de
@@ -288,19 +298,16 @@ int upkr_decode_length(int context_index) {
 decode_number:
   ; HL = upkr_state
   ; IX = upkr_data_ptr
-  ; BC = probs+context_index
+  ; BC = probs+context_index-1
   ; A' = upkr_current_byte (!!! init to 0x80 at start, not 0x00)
   ; return length in DE, CF=0
-    ld      de,$7FFF            ; length = 0 with positional-stop-bit
-    jr      .loop_entry
+    ld      de,$FFFF            ; length = 0 with positional-stop-bit
+    or      a                   ; CF=0 to skip getting data bit and use only `rr d : rr e` to fix init DE
 .loop:
-    inc     c                   ; context_index + 1
-    call    decode_bit
+    call    c,inc_c_decode_bit  ; get data bit, context_index + 1 / if CF=0 just add stop bit into DE init
     rr      d
     rr      e                   ; DE = length = (length >> 1) | (bit << 15);
-    inc     c                   ; context_index += 2
-.loop_entry:
-    call    decode_bit
+    call    inc_c_decode_bit    ; context_index += 2
     jr      c,.loop
 .fix_bit_pos:
     ccf                         ; NC will become this final `| (1 << bit_pos)` bit
@@ -324,5 +331,51 @@ probs:      EQU ($ + 255) & -$100                       ; probs array aligned to
 
     DISPLAY "upkr.unpack probs array placed at: ",/A,probs,",\tsize: ",/A,probs.c
 
+/*
+ archived: negligibly faster but +6B longer decode_number variant using HL' and BC' to
+ do `number|=(1<<bit_pos);` type of logic in single loop.
+*/
+; decode_number:
+;     exx
+;     ld      bc,1
+;     ld      l,b
+;     ld      h,b                 ; HL = 0
+; .loop
+;     exx
+;     inc     c
+;     call    decode_bit
+;     jr      nc,.done
+;     inc     c
+;     call    decode_bit
+;     exx
+;     jr      nc,.b0
+;     add     hl,bc
+; .b0:
+;     sla     c
+;     rl      b
+;     jr      .loop
+; .done:
+;     exx
+;     add     hl,bc
+;     push    hl
+;     exx
+;     pop     de
+;     ret
+
+/*
+ archived: possible LUT variant of updating probs value, requires 512-aligned 512B table (not tested)
+*/
+; code is replacing decode_bit from "; *** adjust probs[context_index]", followed by `ld (bc),a : add a,d ...`
+;     ld      c,a
+;     ld      a,high(probs_update_table)/2    ; must be 512 aligned
+;     rla
+;     ld      b,a
+;     ld      a,(bc)
+;     pop     bc
+; -------------------------------------------
+; probs_update_table: EQU probs-512
+; -------------------------------------------
+; table generator is not obvious and probably not short either, 20+ bytes almost for sure, maybe even 30-40
+
     ENDMODULE
     OPT pop