diff --git a/Docs/Полное описание команд микропроцессора Z80.doc b/Docs/Полное описание команд микропроцессора Z80.doc
new file mode 100644
index 0000000..3a03798
Binary files /dev/null and b/Docs/Полное описание команд микропроцессора Z80.doc differ
diff --git a/constants/SP2000.inc b/constants/SP2000.inc
index de95dd7..34654e7 100644
--- a/constants/SP2000.inc
+++ b/constants/SP2000.inc
@@ -351,6 +351,7 @@ FastRAM:
 .ON EQU #FB ; 祭 IN A,(FastRAM.ON) - ਡ ﬨ
 .OFF EQU #7B ; ⪫祭 IN A,(FastRAM.OFF) - ਡ ﬨ
 .SLOT0 EQU #5C ; ४祭 ࠭ FastRAM.ON 㫥 (bit0..1) - ਡ ﬨ
+; ४祭 ࠭ ࠡ⠥ ⮫쪮 SYS_PORT.ROM.
 ;!TODO ⠢ ⮢ ⥭ #FB ७ SLOT0
 ; ⮡ 祭 ॣ ࠭﫠 ࠭ 0 ( 祭 )
 ;;
diff --git a/math.asm b/math.asm
new file mode 100644
index 0000000..06d74eb
--- /dev/null
+++ b/math.asm
@@ -0,0 +1,4968 @@
+;
+;Trigonometry (section title; the original non-ASCII text was mis-encoded)
+;
+arctan_88:
+;Arctangent of a signed 8.8 fixed-point number.
+;Input:
+; D.E
+;Output: atan(D.E)->D.E
+;Notes:
+; 201 and 402 are 256*pi/4 and 256*pi/2, i.e. atan(1) and pi/2 in 8.8.
+; For a magnitude above 1 the routine works on the reciprocal and then
+; returns 402-atan(1/x).
+    push de                 ;keep the original D so the sign can be restored at the end
+    ld a,d
+    or a
+    jp p,$+5                ;skip the 2-byte NEG when D is already non-negative
+    neg
+    ld d,a
+    dec a
+    jr nz,.checkneedinv
+    inc e : dec e : jr nz,.checkneedinv
+;magnitude is exactly 1.0: answer is +-atan(1)
+    pop af : rla : ld de,201 : ret nc : ld de,-201 : ret
+.checkneedinv:
+    inc a
+    call nz,.DEgt1_Inv      ;integer part non-zero: replace D.E by its reciprocal
+;0.E is the value to atan
+    ld hl,.adjustatan       ;every RET below continues at .adjustatan
+    push hl
+    ld a,e
+    cp 46 : ret c
+    dec a : cp 42h : ret c
+    dec a : cp 4Eh : ret c
+    dec a : cp 57h : ret c
+    dec a : cp 5Eh : ret c
+    dec a : cp 64h : ret c
+    dec a : cp 6Ah : ret c
+    dec a : cp 6Fh : ret c
+    sub 6Fh : ld e,a
+    ld hl,.LUT
+    add hl,de               ;D is 0 here, so DE is the table index
+    ld a,(hl)
+    ret
+.adjustatan:
+    ld e,a
+    pop bc                  ;B = original D (sign and integer part)
+    ld a,b
+    or a
+    jp p,$+5
+    neg
+    jr z,$+9                ;integer part was 0: no reciprocal was taken, skip the 402-x step
+    ld hl,402
+    or a
+    sbc hl,de
+    ex de,hl
+    rl b                    ;carry = original sign
+    ret nc
+;negate DE for a negative input
+    xor a
+    sub e
+    ld e,a
+    sbc a,a
+    sub d
+    ld d,a
+    ret
+
+.DEgt1_Inv:
+;Works if DE>1
+;Restoring division of 256 by DE, 8 quotient bits; returns D=0, E=quotient
+    ld hl,256
+    ld b,8
+.InvLoop:
+    add hl,hl
+    sbc hl,de
+    jr nc,$+3
+    add hl,de
+    adc a,a
+    djnz .InvLoop
+    cpl
+    ld e,a
+    ld d,b                  ;B is 0 after DJNZ
+    ret
+;Every row needs its own DB: without the directive the continuation rows
+;are not assembled as data.
+;            0    1    2    3    4    5    6    7    8    9
+.LUT:   DB #6F, #6F, #70, #71, #72, #73, #73, #74, #75, #76 ; 0
+        DB #77, #77, #78, #79, #7A, #7B, #7B, #7C, #7D, #7E ; 1
+        DB #7F, #7F, #80, #81, #82, #82, #83, #84, #85, #85 ; 2
+        DB #86, #87, #88, #88, #89, #8A, #8B, #8B, #8C, #8D ; 3
+        DB #8E, #8E, #8F, #90, #90, #91, #92, #93, #93, #94 ; 4
+        DB #95, #95, #96, #97, #97, #98, #99, #9A, #9A, #9B ; 5
+        DB #9C, #9C, #9D, #9E, #9E, #9F, #A0, #A0, #A1, #A2 ; 6
+        DB #A2, #A3, #A3, #A4, #A5, #A5, #A6, #A7, #A7, #A8 ; 7
+        DB #A9, #A9, #AA, #AA, #AB, #AC, #AC, #AD, #AD, #AE ; 8
+        DB #AF, #AF, #B0, #B0, #B1, #B2, #B2, #B3, #B3, #B4 ; 9
+        DB #B5, #B5, #B6, #B6, #B7, #B7, #B8, #B9, #B9, #BA ; 10
+        DB #BA, #BB, #BB, #BC, #BC, #BD, #BE, #BE, #BF, #BF ; 11
+        DB #C0, #C0, #C1, #C1, #C2, #C2, #C3, #C3, #C4, #C4 ; 12
+        DB #C5, #C6, #C6, #C7, #C7, #C8, #C8, #C9           ; 13
+;;
+
+
+;;
+atan8:
+;computes 256*atan(A/256)->A
+;56 bytes including the LUT
+;min: 246cc
+;max: 271cc
+;avg: 258.5cc
+    rlca
+    rlca
+    rlca
+    ld d,a
+    and 7
+    ld hl,.LUT
+    add a,l
+    ld l,a
+    if (.LUT & 255) > 248 ;this section not included in size/speed totals
+    jr nc,$+3 ;can add three bytes, 12cc to max, 11cc to min, and 11.5cc to avg
+    inc h
+    endif
+    ld c,(hl)
+    inc hl
+    ld a,(hl)
+    sub c
+    ld e,0
+    ex de,hl
+    ld d,l
+    ld e,a
+    sla h : jr nc,$+3 : ld l,e
+    add hl,hl : jr nc,$+3 : add hl,de
+    add hl,hl : jr nc,$+3 : add hl,de
+    add hl,hl : jr nc,$+3 : add hl,de
+    add hl,hl : jr nc,$+3 : add hl,de
+    add hl,hl
+    add hl,hl
+    add hl,hl
+;   add hl,hl ;used in rounding...
+ ld a,h +; rra ;but doesn't seem to improve the error + adc a,c + ret +.LUT: DB 0,32,63,92,119,143,165,184,201 +;; + + +;; +atanE: +;returns H=256*arctan(E/256) +;min: 496cc +;max: 539cc +;avg: 517.5cc +;multiply E by 201 + ld d,0 + ld h,d + ld l,e + add hl,hl + add hl,de + add hl,hl + add hl,hl + add hl,hl + add hl,de + add hl,hl + add hl,hl + add hl,hl + add hl,de + ld b,h + ld c,l + +;E*(256-E) + xor a + ld d,a + sub e + ld h,a + ld l,d + sla h : jr nc,$+3 : ld l,e + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de + add hl,hl : jr nc,$+3 : add hl,de +;.HL*70 + ld d,h + ld e,l + xor a + add hl,hl + add hl,hl : rla ;rla needed for the case when input = 128 :( + add hl,hl : rla + add hl,hl : rla + add hl,de : adc a,0 + add hl,hl : rla + add hl,de : adc a,0 + add hl,hl : rla + ld l,h + ld h,a + add hl,bc + ret +;; + + +; +;ۂ⠭ +; +;written by calc84maniac +;comment from calc84maniac: +; To clarify why I did a cpl/scf/adc instead of a cpl/inc/add or neg/add, +; is that it handles the case of A=0 properly. Typically, SUB N and +; ADD A,-N give opposite carry outputs, but SUB 0 and ADD A,-0 both reset the +; carry flag. On the other hand, SCF : ADC A,255 will set the carry flag like +; we want it to. 
+; BC=BC-A +BC_Minus_A: + cpl + scf + adc a,c + ld c,a + ret c + dec b + ret +;; + + +;; +;via calc84maniac +;"Optimized routine for HL=A-HL (the negate HL optimization can be derived from this by setting A=0 first)" +A_Minus_HL: + sub l + ld l,a + sbc a,a + sub h + ld h,a + ret +;; + + +; +;ۊ७ +; +;Adapted from Axe +;Inputs: A.C +;Output: D.E contains the squareroot +;speed: 1482+12{0,17} +;min: 1482cc +;max: 1686cc +;avg: 1584cc +;35 bytes +sqrtfixed_88: + ld b,12 + ld de,0 + ld h,d + ld l,e +.Loop: sub #40 + sbc hl,de + jr nc,.Skip + add a,#40 + adc hl,de +.Skip: ccf + rl e + rl d + sla c + rla + adc hl,hl + sla c + rla + adc hl,hl + djnz .Loop + ret +;; + + +;; +;returns HL as the sqrt, DE as the remainder +;33 bytes +;min: 928cc +;max: 1120cc +;avg: 1024cc +;928+8{24,0} +sqrtDE: + ld b,#80 + xor a + ld h,a + ld l,a +.sqrt_loop: + srl b + rra + ld c,a + add hl,bc + ex de,hl + sbc hl,de + jr nc,.next + add hl,de + ex de,hl + or a + sbc hl,bc + DB #DA ;start of jp c,** which is 10cc to skip the next two bytes. 
+.next:
+    ex de,hl
+    add hl,bc
+    srl h
+    rr l
+    srl b
+    rra
+    jr nc,.sqrt_loop
+    ret
+;;
+
+
+;;
+;Written by Zeda
+;Square root of an unsigned 8.8 fixed-point number.
+;Input: A.E ==> D.E
+;Output: DE is the sqrt, AHL is the remainder
+;Destroys: B,C
+;Speed: 690+6{0,13}+{0,3+{0,18}}+{0,38}+sqrtA
+;min: 855cc
+;max: 1003cc
+;avg: 924.5cc
+;152 bytes
+;(the figures above predate the `ld c,e` below: add 4cc and 1 byte)
+;NOTE(review): an "Adapted from Axe" routine earlier in this file also
+; defines the global label sqrtfixed_88; only one of the two can be assembled.
+sqrtfixed_88:
+    ld c,e                  ;sqrtA loads DE with a constant, so park the fraction byte in C
+    call sqrtA              ;D = integer sqrt of A, A = remainder
+    ld l,a
+    ld a,c                  ;A = fractional input byte, shifted into HL two bits per step
+    ld h,0
+    ld e,d
+    ld d,h
+
+    sla e
+    rl d
+
+;Each step: SLL doubles DE and sets bit 0 (trial value), two input bits
+;are shifted into HL, then DE is subtracted. On borrow the subtraction is
+;undone and bit 0 is cleared; `DB #FE` (CP n) swallows the following INC E.
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.next
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.next2
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next2:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.next3
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next3:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.next4
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next4:
+    inc e
+
+;Now we have four more iterations
+;The first two are no problem
+    sll e : rl d
+    add hl,hl
+    add hl,hl
+    sbc hl,de
+    jr nc,.next5
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next5:
+    inc e
+
+    sll e : rl d
+    add hl,hl
+    add hl,hl
+    sbc hl,de
+    jr nc,.next6
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.next6:
+    inc e
+
+.iter11:
+;On the next iteration, HL might temporarily overflow by 1 bit
+    sll e : rl d ;sla e : rl d : inc e
+    add hl,hl
+    add hl,hl
+    jr c,.iter11_br0
+;
+    sbc hl,de
+    jr nc,.next7
+    add hl,de
+    dec e
+    jr .iter12
+.iter11_br0:
+    or a
+    sbc hl,de
+.next7:
+    inc e
+
+;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
+.iter12:
+    ld b,a ;A is 0, so B is 0
+    add hl,hl
+    add hl,hl
+    rla
+;AHL - (DE+DE+1)
+    sbc hl,de : sbc a,b
+    inc e
+    or a
+    sbc hl,de : sbc a,b
+    ret p
+    add hl,de
+    adc a,b
+    dec e
+    add hl,de
+    adc a,b
+    ret
+
+;;
+
+
+;;
+; FASTEST
+;Written by Zeda
+;Input: A
+;Output: 
D is the squareroot, A is the remainder (input-D^2) +;Destroys: E +;speed: 118+{0,6}+{0,7}+{0,7}+{0,3} +;min: 118cc +;max: 141cc +;avg: 129.5cc +;38 bytes +sqrtA: + ld de,5040h + sub e + jr nc,.skip1 + add a,e + ld d,10h +.skip1: +; ------ + cp d + jr c,.skip2 + sub d + set 5,d +.skip2: +; ------ + res 4,d + srl d + set 2,d + cp d + jr c,.skip3 + sub D + set 3,d +.skip3: + srl d +; ------ + inc a + sub d + jr nc,.skip4 + dec d + add a,d +.skip4: + srl d + ret +;; + + +;; +;Input: HLDE +;Output: DE is the sqrt, AHL is the remainder +;speed: 238+{0,1}+{0,44}+sqrtHL+3*.sub_2+.iter15 +;min: 1260 +;max: 1506 +;avg: 1377.75 +sqrt32: + push de + call sqrtHL + pop bc + add a,a + ld e,a + jr nc,.skip + inc d +.skip: + ld a,b + call .sub_2 + call .sub_2 +;Now we have four more iterations +;The first two are no problem + ld a,c + call .sub_2 + +;On the next iteration, HL might temporarily overflow by 1 bit + call .iter15 + +;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways +.iter16: + add a,a + ld b,a ;either 0x00 or 0x80 + adc hl,hl + rla + adc hl,hl + rla +;AHL - (DE+DE+1) + sbc hl,de : sbc a,b + inc e + or a + sbc hl,de : sbc a,b + ret p + add hl,de + adc a,b + dec e + add hl,de + adc a,b + ret + +.sub_2: +;min: 185cc +;max: 231cc +;avg: 208cc + call .iter17 +.iter17: +;min: 84cc +;max: 107cc +;avg: 95.5cc + sll e : rl d + add a,a : adc hl,hl + add a,a : adc hl,hl + + sbc hl,de + inc e + ret nc + dec e + add hl,de + dec e + ret + +.iter15: +;91+{8,0+{0,23}} +;min: 91cc +;max: 114cc +;avg: 100.75cc + sll e : rl d ;sla e : rl d : inc e + add a,a + adc hl,hl + add a,a + adc hl,hl ;This might overflow! 
+    jr c,.iter15_br0
+;
+    sbc hl,de
+    inc e
+    ret nc
+    dec e
+    add hl,de
+    dec e
+    ret
+.iter15_br0:
+    or a
+    sbc hl,de
+    inc e
+    ret
+;;
+
+
+;;
+;Inputs:
+; L is the value to find the square root of
+;Outputs:
+; C is the result
+; B,L are 0
+; DE is not changed
+; H is how far away it is from the next smallest perfect square
+; L is 0
+; z flag set if it was a perfect square
+;Destroyed:
+; A
+;287+7x, x is the number of bits in the result
+;min: 287
+;max: 315
+;19 bytes
+SqrtL:
+    ld bc,#400
+    ld h,c
+.Loop:
+    add hl,hl
+    add hl,hl
+    rl c
+    ld a,c
+    rla
+    sub a,h
+    jr nc,$+5
+    inc c
+    cpl
+    ld h,a
+    djnz .Loop
+    ret
+;;
+
+
+;;
+;Square root of the 32-bit value HL:IX (HL is the upper word).
+;Input: HLIX
+;Output: DE is the sqrt, AHL is the remainder
+;speed: 751+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL
+;min: 1103
+;max: 1237
+;avg: 1165.5
+;166 bytes
+;Requires: sqrtHL returning A = sqrt, HL = remainder, D = 0.
+;NOTE(review): this file defines sqrtHL more than once; only a variant
+; that returns the remainder in HL satisfies this contract.
+sqrtHLIX:
+;The callee is the global routine (as in sqrt32); a local `.sqrtHL`
+;does not exist in this scope.
+    call sqrtHL ;expects returns A as sqrt, HL as remainder, D = 0
+    add a,a
+    ld e,a
+    rl d
+
+;Each step: SLL doubles DE and sets bit 0 (trial value), two input bits
+;are shifted into HL, then DE is subtracted. On borrow the subtraction is
+;undone and bit 0 is cleared; `DB #FE` (CP n) swallows the following INC E.
+    ld a,ixh
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip1
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip1:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip2
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip2:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip3
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip3:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip4
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip4:
+    inc e
+
+;Now we have four more iterations
+;The first two are no problem
+    ld a,ixl
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip5
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip5:
+    inc e
+
+    sll e : rl d
+    add a,a : adc hl,hl
+    add a,a : adc hl,hl
+    sbc hl,de
+    jr nc,.skip6
+    add hl,de
+    dec e
+    DB #FE ;start of `cp *`
+.skip6:
+    inc e
+
+.iter15:
+;On the next iteration, HL might temporarily overflow by 1 bit
+    sll e : rl d ;sla e : rl d : 
inc e + add a,a + adc hl,hl + add a,a + adc hl,hl ;This might overflow! + jr c,.iter15_br0 +; + sbc hl,de + jr nc,.skip7 + add hl,de + dec e + jr .iter16 +.iter15_br0: + or a + sbc hl,de +.skip7: + inc e + +;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways +.iter16: + add a,a + ld b,a ;either 0x00 or 0x80 + adc hl,hl + rla + adc hl,hl + rla +;AHL - (DE+DE+1) + sbc hl,de : sbc a,b + inc e + or a + sbc hl,de : sbc a,b + ret p + add hl,de + adc a,b + dec e + add hl,de + adc a,b + ret +;; + + +;; +; very fastest 16-bit isqrt by Zeda Thomas +;Feel free to use for whatever :) +;Input: HL +;Output: A is the integer square root of HL +;Destroys: HL,DE (D is actually 0) +;min: 343cc +;max: 380cc +;avg: 361.5cc +;88 bytes +sqrtHL: + ld de,05040h + ld a,h + sub e + jr nc,.sq7 + add a,e + ld d,16 +.sq7: +; ---------- + cp d + jr c,.sq6 + sub d + set 5,d +.sq6: +; ---------- + res 4,d + srl d + set 2,d + cp d + jr c,.sq5 + sub d + set 3,d +.sq5: + srl d +; ---------- + inc a + sub d + jr nc,.sq4 + dec d + add a,d + dec d ; <-- this resets the low bit of D, so `srl d` resets carry. +.sq4: + srl d + ld h,a +; ---------- + ld a,e + sbc hl,de + jr nc,.sq3 + add hl,de +.sq3: + ccf + rra + srl d + rra +; ---------- + ld e,a + sbc hl,de + jr c,.sq2 + or #20 + db 254 ; <-- start of `cp *` which is 7cc to skip the next byte. +.sq2: + add hl,de + xor #18 + srl d + rra +; ---------- + ld e,a + sbc hl,de + jr c,.sq1 + or 8 + db 254 ; <-- start of `cp *` which is 7cc to skip the next byte. 
+.sq1: + add hl,de + xor 6 + srl d + rra +; ---------- + ld e,a + sbc hl,de +;This code would restore the square root +; jr nc,.sq0 +; add hl,de ; | 12cc or 18cc +; .sq0: + sbc a,255 + srl d + rra + ret +;; + + +;; +; FASTEST +;written by Zeda +;returns A as the sqrt, HL as the remainder, D = 0 +;min: 352cc +;max: 391cc +;avg: 371.5cc +sqrtHL: + ld de,#5040 + ld a,h + sub e + jr nc,.sq7 + add a,e + ld d,16 +.sq7: +; ---------- + cp d + jr c,.sq6 + sub d + set 5,d +.sq6: +; ---------- + res 4,d + srl d + set 2,d + cp d + jr c,.sq5 + sub d + set 3,d +.sq5: + srl d +; ---------- + inc a + sub d + jr nc,.sq4 + dec d + add a,d + dec d ; <-- this resets the low bit of D, so `srl d` resets carry. +.sq4: + srl d + ld h,a +; ---------- + ld a,e + sbc hl,de + jr nc,.sq3 + add hl,de +.sq3: + ccf + rra + srl d + rra +; ---------- + ld e,a + sbc hl,de + jr c,.sq2 + or #20 + db 254 ; <-- start of `cp *` which is 7cc to skip the next byte. +.sq2: + add hl,de + xor #18 + srl d + rra +; ---------- + ld e,a + sbc hl,de + jr c,.sq1 + or 8 + db 254 ; <-- start of `cp *` which is 7cc to skip the next byte. +.sq1: + add hl,de + xor 6 + srl d + rra +; ---------- + ld e,a + sbc hl,de + jr nc,.sq + add hl,de + srl d + rra + ret +.sq: + inc a + srl d + rra + ret + +;; + + +;; +;Adapted from Axe +;Input: HL +;Output: D is the square root, cH is the remainder (c being the c flag), A is 0, B is 0, L is 0 +;speed: 758+8{0,6} +;min: 758cc +;max: 806cc +;avg: 782cc +;26 bytes +sqrtHL: +;p_Sqrt: + ld a,l + ld l,h + ld de,#0040 + ld h,d + ld b,8 + or a +.Loop: + sbc hl,de + jr nc,.Skip + add hl,de +.Skip: + ccf + rl d + add a,a + adc hl,hl + add a,a + adc hl,hl + djnz .Loop + ret +;; + + +; +;RND +; +;Inputs: (seed1), (seed2), and (seed3) are 16-bit seeds. (seed1) and (seed2) can't both be 0. 
+;Outputs: HL is the pseudorandom number +;Destroys: A,DE,BC +;cycle: 281,474,976,645,120 +;It would take about 185 years at 15MHz to repeat +;min: 258cc (236cc if using ENABLE_SMC) +;max: 288cc (266cc if using ENABLE_SMC) +;avg: 273cc (251cc if using ENABLE_SMC) +;63 bytes (62 bytes if using ENABLE_SMC) +xsp32: + ifdef ENABLE_SMC +.seed1 equ $+1 + ld hl,12345 +.seed2 equ $+1 + ld de,6789 + else + ld hl,(.seed1) + ld de,(.seed2) + endif + +;first, XOR it with itself, shifted left 23 bits +;low bit of d needs to be shifted in + ld a,h + rra + ld a,l + rra + jr nc,.skip1 + rl e + ccf + rr e +.skip1: + xor d + ld d,a + +;XOR it with itself, shifted right 15 bits + ld a,h + rla + ld a,e + rla + xor l + ld l,a + + ld a,e + rla + ld a,d + rla + jr nc,.skip2 + rr e + ccf + rl e +.skip2: + xor h + ld h,a + +;XOR it with itself, shifted left 17 bits +;HL<<1 + ld (.seed1),hl + add hl,hl + ld a,h + xor d + ld h,a + + ld a,l + xor e + ld l,a + ld (.seed2),hl + ex de,hl + + ifdef ENABLE_SMC +.seed3 equ $+1 + ld hl,33333 + else + ld hl,(.seed3) + endif + + inc hl + inc h + ld (.seed3),hl + add hl,de + ret +;; + + +;; +;32-bit xorshift +;seed^=seed<<23 +;seed^=seed>>15 +;seed^=seed<<17 +;min: 209cc (193cc if using ENABLE_SMC) +;max: 239cc (223cc if using ENABLE_SMC) +;avg: 224cc (208cc if using ENABLE_SMC) +;53 bytes (52 bytes if using ENABLE_SMC) +xs32: + ifdef ENABLE_SMC +.seed1 equ $+1 + ld hl,12345 +.seed2 equ $+1 + ld de,6789 + else + ld hl,(.seed1) + ld de,(.seed2) + endif + +;first, XOR it with itself, shifted left 23 bits +;low bit of d needs to be shifted in + ld a,h + rra + ld a,l + rra + jr nc,.skip1 + rl e + ccf + rr e +.skip1: + xor d + ld d,a + +;XOR it with itself, shifted right 15 bits + ld a,h + rla + ld a,e + rla + xor l + ld l,a + + ld a,e + rla + ld a,d + rla + jr nc,.skip2 + rr e + ccf + rl e +.skip2: + xor h + ld h,a + +;XOR it with itself, shifted left 17 bits +;HL<<1 + ld (.seed1),hl + add hl,hl + ld a,h + xor d + ld h,a + + ld a,l + xor e + ld l,a + ld 
(.seed2),hl + ret +;; + + +;; +;You may use this routine, just be sure to credit John Metcalf! +;Written by John Metcalf +; http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html +; +; Annotated by Zeda Thomas, fixed typo (86 cycles==> 82 cycles) +;Note: uses ENABLE_SMC (Self Modifying Code) +; 16-bit xorshift pseudorandom number generator +; 20 bytes, 82 cycles (excluding ret) +; returns hl = pseudorandom number +; corrupts a +xrnd: + ld hl,1 ; Init the seed, must not be 0 + ld a,h ;\ + rra ; | Get the top bits of xs<<7 and xor with the top byte of HL + ld a,l ; | abcdefgh ijklmnop + rra ; | ^hijklmno 00000000 + xor h ; | Note that we still need to xor the 'p' with the top byte of l + ld h,a ;/ + ld a,l ;\ + rra ; | we get 'p' in the carry flag, now shift that in when we do xs>>9 + ld a,h ; | abcdefgh ijklmnop (new value) + rra ; | ^00000000 pabcdefg + xor l ; | the 'p' is leftover from the first step, so now Step 1 and 2 are done + ld l,a ;/ + xor h ;\ Finally, xor the bottom byte with the top byte for step 3 + ld h,a ;/ + ld (xrnd+1),hl ; write back the new value as the next seed + ret +;; + + +;; +;This code snippet is 9 bytes and 43cc +;Inputs: +; HL is the input seed and must be non-zero +;Outputs: +; A is the 8-bit pseudo-random number +; HL is the new seed value (will be non-zero) +rng8_very_very_fast: + add hl,hl + sbc a,a + and %0010'1101 + xor l + ld l,a + ld a,r + add a,h + ret +;------------------------------------------------------------------------------- +;Technical details: +; The concept behind this routine is to combine an LFSR (poor RNG) with a +; counter. The counter improves the RNG quality, while also extending the period +; length. +; For this routine, I took advantage of the Z80's built-in counter, the `r` +; register. This means that we don't need to store the counter anywhere, and it +; is pretty fast to access! 
+; Some caveats:
+;   * r is a 7-bit counter
+;   * r will increment some number of times between runs of the RNG. In most
+;     cases, this will be constant, but if it increments an even number each
+;     time, then the bottom bit is always the same, weakening the effect of
+;     the counter. In the worst case, it increments a multiple of 128 times,
+;     effectively making your RNG just as good/bad as the LFSR. Ideally, you
+;     want `r` to increment an odd number of times between runs.
+;   * In the best case, the bottom 7 bits have 50/50 chance of being 0 or 1.
+;     The top bit is 1 with probability 1/2 + 1/(2^17-2) ~ .5000076295
+;   * In the event that your main loop waits for user input between calls,
+;     then congratulations, you might have a True RNG :)
+;-------------------------------------------------------------------------------
+;;
+
+
+;;
+;Tested and passes all CAcert tests
+;Uses a very simple 32-bit LCG and 32-bit LFSR
+;it has a period of 18,446,744,069,414,584,320
+;roughly 18.4 quintillion.
+;LFSR taps: 0,2,6,7 = 11000101
+;291cc
+;Thanks to Runer112 for his help on optimizing the LCG and suggesting to try the much simpler LCG. On their own, the two are terrible, but together they are great.
+;58 bytes
+rand32:
+.seed1_0 equ $+1
+    ld hl,12345
+.seed1_1 equ $+1
+    ld de,6789
+    ld b,h
+    ld c,l
+    add hl,hl : rl e : rl d
+    add hl,hl : rl e : rl d
+    inc l
+    add hl,bc
+    ld (.seed1_0),hl
+    ld hl,(.seed1_1)
+    adc hl,de
+    ld (.seed1_1),hl
+    ex de,hl
+;;lfsr
+.seed2_0 equ $+1
+    ld hl,9876
+.seed2_1 equ $+1
+    ld bc,54321
+    add hl,hl : rl c : rl b
+    ld (.seed2_1),bc
+    sbc a,a
+    and %1100'0101
+    xor l
+    ld l,a
+    ld (.seed2_0),hl
+    ex de,hl
+    add hl,bc
+    ret
+;;
+
+
+;;
+;24-bit LCG (5*x+1 on A:HL) combined with a 24-bit LFSR.
+;Output: HL is the pseudo-random number
+;Destroys: A,BC,D
+;;216cc with ENABLE_SMC (the earlier 219cc figure was for `adc a,0`)
+rand24:
+    ifdef ENABLE_SMC
+.seed1_0 equ $+1
+    ld hl,12345
+.seed1_1 equ $+1
+    ld a,67
+    else
+    ld hl,(.seed1_0)
+    ld a,(.seed1_1)
+    endif
+;D:BC = x, A:HL becomes 4*x+1, then x is added back for 5*x+1
+    ld b,h
+    ld c,l
+    ld d,a
+    add hl,hl : rla
+    add hl,hl : rla
+    inc l                   ;the low two bits are 0 after the shifts, so this cannot carry
+;The top byte must receive the saved top byte of x plus the carry.
+;`adc a,0` left D unused, so the top byte carried no state of its own.
+    add hl,bc : adc a,d
+    ld (.seed1_0),hl
+    ld (.seed1_1),a
+    ld c,b
+    ld b,a
+    ifdef ENABLE_SMC
+.seed2_0 equ $+1
+    ld hl,65432
+.seed2_1 equ $+1
+    ld a,10
+    else
+    ld hl,(.seed2_0)
+    ld a,(.seed2_1)
+    endif
+    add hl,hl
+    rla
+    ld (.seed2_1),a
+    sbc a,a                 ;A = #FF if a bit was shifted out, else 0
+    and %1000'0111
+    xor l
+    ld l,a
+    ld (.seed2_0),hl
+    add hl,bc
+    ret
+
+;;
+
+
+;;
+;You may use this routine, just be sure to credit John Metcalf for the
+;xorshift16 part of this routine!
+
+; This routine is a fast Pseudo Random Number Generator
+;for the Z80. It combines a 16-bit LCG and 16-bit xorshift.
+;The xorshift routine was written by John Metcalf +;and posted here: +; http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html + +;174cc (or 186cc if not using ENABLE_SMC) +;34 bytes +;cycle length: 4,294,901,760 (almost 4.3 billion) + +; For the first seed, we use an LCG, 1+5*seed1 ==> seed1 +rand16: + ifdef ENABLE_SMC +.seed1 equ $+1 + ld hl,9999 + else + ld hl,(.seed1) + endif + ld b,h + ld c,l + add hl,hl + add hl,hl + inc l + add hl,bc + ld (.seed1),hl + +; For the second seed, we apply an xorshift +; seed2^(seed2<<7) ==> seed2 +; seed2^(seed2>>9) ==> seed2 +; seed2^(seed2<<8) ==> seed2 +; This code was originally made by John Metcalf and posted here: +; http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html +; (My modifications are only in naming and compiler directives.) + + ifdef ENABLE_SMC +.seed2 equ $+1 + ld hl,9999 + else + ld hl,(.seed2) + endif + ld a,h + rra + ld a,l + rra + xor h + ld h,a + ld a,l + rra + ld a,h + rra + xor l + ld l,a + xor h + ld h,a + ld (.seed2),hl + add hl,bc + ret +;; + + +;; +;collaboration by Zeda with Runer112 +;160cc or 148cc if using ENABLE_SMC +;26 bytes +;cycle: 4,294,901,760 (almost 4.3 billion) +rand16: + ifdef ENABLE_SMC +.seed1 equ $+1 + ld hl,9999 + else + ld hl,(.seed1) + endif + ld b,h + ld c,l + add hl,hl + add hl,hl + inc l + add hl,bc + ld (.seed1),hl + ifdef ENABLE_SMC +.seed2 equ $+1 + ld hl,9999 + else + ld hl,(.seed2) + endif + add hl,hl + sbc a,a + and %00101101 + xor l + ld l,a + ld (.seed2),hl + add hl,bc + ret +;; + + +;; +;Returns A on [0,4] +;Destroys: All +;Notes: +; This is a non-standard approach to generating random integers on [0,4]. +; If you have a truly random number generator that generates bits (0 or 1) +; with equal probability, then standard approaches will still cause a slight +; bias. ("Standard": "rand mod 5" or int(5*rand)). For example, suppose we +; generate a 4-bit number. 
Then "rand mod 5" will cause 0 to be chosen +; 4/16 times, while 1, 2, 3, and 4 will be chosen 3/16 times (on average). +; A similar problem exists with int(5*rand). One way to mitigate this issue +; is just generating infintely many bits, but apparently that is impractical, +; so I came up with a compromise. +; +; My approach basically looks at the binary expansion of 1/5, 2/5, 3/5, and 4/5. +; 1/5 = .0011001100110011... +; 2/5 = .0110011001100110... +; 3/5 = .1001100110011001... +; 4/5 = .1100110011001100... +; +; So if I generate random bits and I get .001100, then a 0, then I know +; that no matter what all of the rest of the bits are, the number is less than +; 1/5, and so int(5*rand) is 0. +; +; By applying similar logic to the rest of the values, I can guarantee a uniform +; distribution on [0,4]. But there are four cases where this process might +; continue forever, specifically the cases that are like ...00110011...., but +; lucky for us, this happens 4/inf= 0% of the time. In fact, on average it +; takes 3 to 4 bits before the algorithm can assert which value to return. +; +; The one caveat is that on the Z80, we generally don't have truly random +; numbers :| On the otherhand, it is easy enough to generate pseudo-random +; bits with equal probability :) +rand5: + call rand + ld a,h + and #C0 + push af ;save the original value + ld c,a +.start: + push bc + call rand + pop bc + ld b,15 ;I set this to 15 because I like to guarantee a bit is available for rand10. +.loop: + ld a,h + xor c + jp p,.end + add hl,hl + sla c + jr c,$+4 + set 6,c + djnz .loop + jr .start +.end: + pop af + rlca + rlca + sla h + adc a,0 + ret +;; + + +;; +;Returns A as a random integer on [0,9] +;Destroys: All +rand10: + call rand5 + sla h + rla + ret +;; + + +;; +;Generates a random TI float at HL +rand_TI_Float: + push hl + ; call rand_init + ld de,#8000 ;D is exponent, E is type. 
E is used in .zero +.get_rand_exponent_loop: +;decrement exponent + dec d + +;if the exponent is -100, underflow to 0. +;I don't think this is possible with this RNG, or even likely to ever happen +;before the universe's heat death with a true RNG, but better to be safe? + ld a,d + cp 28 + jp z,.zero + +;save the exponent + push de + +;Generate a uniform random digit on [0,9] as a candidate for our first digit. + call rand10 + +;restore the exponent+type + pop de + or a +;if A is 0, we'll decrement the exponent and find a new candidate for the first +;digit. This is because we need our float to be "normalized" (top digit non-zero) +;This also preserves the uniform distribution for values. + jr z,.get_rand_exponent_loop + + pop hl + ld (hl),e + inc hl + ld (hl),d + inc hl + +;write the first digit + ld (hl),a + ld b,13 +.math_rand_loop: +;now generate subsequent digits + push bc + rr b + jr c,$+3 + inc hl + push hl + +;generate the next digit + call rand10 + pop hl + rld + pop bc + djnz .math_rand_loop + ret + +.zero: + pop hl + ld b,9 + ld (hl),e ; E is 0 + inc hl + djnz $-2 + ret +;; + + +;; +; Output is in HL +; This rand routine combines Patrik Rak's fantastic 32-bit xorshift +; (https://gist.github.com/raxoft/c074743ea3f926db0037) with a simple lcg for +; extra smoothing. +; It has a period of 281,474,976,645,120 (2^48-2^16) and uses 48 bits of state. 
+; 42 bytes
+; 210cc
+rand:
+    ld hl,(.seed0)
+    ld b,h
+    ld c,l
+    add hl,hl
+    add hl,hl
+    inc l
+    add hl,bc
+    ld (.seed0),hl
+; xorshift
+    ld hl,(.seed1) ; yw -> zt
+    ld de,(.seed1+2) ; xz -> yw
+    ld (.seed1+2),hl ; x = y, z = w
+    ld a,l ; w = w ^ ( w << 3 )
+    add a,a
+    add a,a
+    add a,a
+    xor l
+    ld l,a
+    ld a,d ; t = x ^ (x << 1)
+    add a,a
+    xor d
+    ld h,a
+    rra ; t = t ^ (t >> 1) ^ w
+    xor h
+    xor l
+    ld h,e ; y = z
+    ld l,a ; w = t
+    ld (.seed1),hl
+; Mix the xorshift and the lcg
+    add hl,bc
+    ret
+;;
+
+
+;;
+; need to make sure seed1 is non-zero
+;Checks the four bytes of rand's xorshift state; if all are zero the last
+;byte is decremented so the state becomes non-zero.
+;Destroys: A,HL
+randinit:
+;A bare `.seed1` here would resolve to randinit.seed1, which is not the
+;state that `rand` reads, so the label is qualified.
+    ld hl,rand.seed1
+    ld a,(hl)
+    inc hl
+    or (hl)
+    inc hl
+    or (hl)
+    inc hl
+    or (hl)
+    ret nz
+    dec (hl)
+    ret
+;;
+
+
+;;
+;64-bit Galois LFSR over the 8 bytes at (.seed), shifted left by one bit.
+;;Output: A is an 8-bit pseudo-random number.
+;        (the low byte of the updated state)
+;Destroys: HL
+lfsr64:
+    ld hl,.seed
+    sla (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl) : inc hl
+    rl (hl)
+;Load the result before the early return so that A is defined on both
+;paths; LD does not touch the carry produced by the last RL.
+    ld a,(.seed)
+    ret nc
+    xor %00011011
+    ld (.seed),a
+    ret
+;;
+
+
+;;
+;16-bit Galois LFSR.
+;Output: HL is the new state, which is also stored back to (.seed)
+;Destroys: A
+;13 bytes
+;72cc (66cc if using SMC)
+;period is 65535
+LFSR:
+    ifdef ENABLE_SMC
+.seed equ $+1
+    ld hl,9797
+    else
+    ld hl,(.seed)
+    endif
+    add hl,hl
+    sbc a,a                 ;A = #FF if a bit was shifted out, else 0
+    and %00101101
+    xor l
+    ld l,a
+    ld (.seed),hl           ;write back to the same local label the state was loaded from
+    ret
+;;
+
+
+;;
+;Input:
+; (seed) has the seed value of the RNG
+;Output:
+; (seed) is updated, HL is the result
+;Destroys:
+; A,DE,BC
+;Timing:
+; if seed>0 231cc or 232cc, condition dependent
+; if seed=0 91cc
+; if ENABLE_SMC defined subtract 6cc
+; (these figures predate the #8000 fix below: add about 5cc for seed>0,
+;  7cc for seed=0)
+;Size: 44 bytes (46 with the #8000 fix)
+;Notes:
+; Uses the Lehmer RNG used by the Sinclair ZX81
+; 75x mod 65537 -> x
+lehmer:
+    ifndef ENABLE_SMC
+    ld hl,(.seed)
+    else
+.seed equ $+1
+    ld hl,0
+    endif
+;multiply by 75
+    ld c,l
+    ld b,h
+    xor a
+    adc hl,hl
+;ADC HL,HL sets Z for HL=0 and also for HL=#8000 (the 16-bit result is 0
+;with carry set). Only a zero result WITHOUT carry is the seed=0 case.
+    jr nz,.nonzero
+    jr nc,.special
+.nonzero:
+    ld d,a : rla            ;D = 0; the carry from the doubling goes into A
+    add hl,hl : rla
+    add hl,hl : rla : add hl,bc : adc a,d
+    add hl,hl : rla
+    add hl,hl : rla : add hl,bc : adc a,d
+    add hl,hl : rla : add hl,bc
+;modulo 65537, see note below on how this works
+    ld e,a
+    sbc hl,de ;No need to reset the c flag since it is already
+    jr nc,$+3
+    inc hl
+    ld (.seed),hl
+    ret
+.special:
+;In the case that HL=0, this should be interpreted as 65536 = -1 mod 65537, so return -75 mod 65537 = -74 mod 65536 in HL
+    ld hl,-74
+    ld (.seed),hl
+    ret
+;mod by 2^16 + 1 (a prime)
+;current form is A*2^16+HL
+;need:
+; (A*2^16+HL) mod (2^16+1)
+;add 0 as +1-1
+; (A*(2^16+1-1)+HL) mod (2^16+1)
+;distribute
+; (A*(2^16+1)-A+HL) mod (2^16+1)
+;A*(2^16+1) mod 2^16+1 = 0, so remove
+; (-A+HL) mod (2^16+1)
+;Oh hey, that's easy! :P
+;I use this trick everywhere, you should, too.
+;;
+
+
+;
+;Squaring (section title; the original non-ASCII text was mis-encoded)
+;
+;A*A->A
+;Destroys: HL
+;76cc or 79cc or 82cc
+;Avg: 79cc
+;51 bytes
+sqrA:
+    add a,a
+    add a,a
+    jr nc,$+4
+    neg
+    rrca
+    rrca
+    ld l,a
+    srl l
+    ld h,.LUT/256           ;only the high byte is loaded, hence the alignment rule below
+    jr c,$+4
+    neg
+    add a,(hl)
+    ret
+;!FIXIT
+;MUST BE ALIGNED to a 256-byte boundary.
+;Can use:
+; #if 0!=$&255
+; .fill 256-($&255),0
+; #endif
+;(in sjasmplus, `ALIGN 256` placed before .LUT has the same effect)
+.LUT:   DB #00, #06, #14, #2A, #48, #6E, #9C, #D2
+        DB #10, #56, #A4, #FA, #58, #BE, #2C, #A2
+        DB #20, #A6, #34, #CA, #68, #0E, #BC, #72
+        DB #30, #F6, #C4, #9A, #78, #5E, #4C, #42
+
+;The table is the local label .LUT; no `sqrLUT` is defined by this routine.
+        ASSERT (low .LUT) = 0, "sqrLUT MUST BE ALIGNED to a 256-byte boundary!"
+;;
+
+
+;;
+;Input: L
+;Output: L*L->A
+;147 t-states
+;36 bytes
+L_sqrd:
+    ld b,l
+    ;First iteration, get the lowest 3 bits of -x^2
+    sla l
+    rrc b
+    sbc a,a
+    or l
+    ld c,a
+    ;second iteration, get the next 2 bits of -x^2
+    rrc b
+    sbc a,a
+    xor l
+    and #F8
+    add a,c
+    ld c,a
+    ;third iteration, get the next 2 bits of -x^2
+    sla l
+    rrc b
+    sbc a,a
+    xor l
+    and #E0
+    add a,c
+    ld c,a
+    ;fourth iteration, get the eight bit of x^2
+    sla l
+    rrc b
+    sbc a,a
+    xor l
+    and #80
+    sub c
+    ret
+;;
+
+
+;
+;Multiplication (section title; the original non-ASCII text was mis-encoded)
+;
+;Signed 16.16 fixed-point multiply built on the 32-bit integer multiply mul32.
+;This requires the following routines:
+; mul32
+; Inputs: DEHL, BCIX
+; Output: stored at z32_0, little-endian
+;Multiplies DE.HL by BC.IX, stores the result in DE.HL
+;Both operands are signed; the product saturates to 0x7FFFFFFF (before the
+;sign is re-applied) when the top 16 bits of the 64-bit product are non-zero.
+;Destroys: A,BC,IX
+mulfixed16_16:
+; First, find out if the output is positive or negative
+    ld a,d
+    xor b
+    push af ;sign bit is the result sign bit
+; Now make sure the inputs are positive
+    xor b ;A now has the value of D, since I XORed it with B twice (cancelling)
+    jp p,.skip1 ;if Positive, don't negate
+;DEHL = -DEHL
+    xor a
+    sub l
+    ld l,a
+    ld a,0
+    sbc a,h
+    ld h,a
+    ld a,0
+    sbc a,e
+    ld e,a
+    sbc a,a
+    sub d
+    ld d,a
+.skip1:
+    bit 7,b
+    jr z,.skip2
+;BCIX = -BCIX
+    xor a
+    sub ixl
+    ld ixl,a
+    ld a,0
+    sbc a,ixh
+    ld ixh,a
+    ld a,0
+    sbc a,c
+    ld c,a
+    sbc a,a
+    sub b
+    ld b,a
+.skip2:
+; Now we multiply
+    call mul32
+;The 8-byte product buffer is referenced by its qualified name: a bare
+;`.z32_0` would resolve to mulfixed16_16.z32_0, which is not defined. The
+;only definition visible in this file is the one under mul64.
+;NOTE(review): confirm that mul32 writes its result to mul64.z32_0.
+;We should check for overflow. If the upper two bytes are non-zero, we will set the result to 0x7FFFFFFF
+    ld hl,(mul64.z32_0+6)
+    ld a,h
+    or l
+;Get the middle four bytes and put them in DEHL
+    ld hl,(mul64.z32_0+2)   ;16-bit loads leave the Z flag from `or l` intact
+    ld de,(mul64.z32_0+4)
+;Maybe we need to set the result to 0x7FFFFFFF
+    jr z,.skip3
+    ld de,#7FFF
+    ld h,e
+    ld l,e
+.skip3:
+; Now we need to restore the sign
+    pop af
+    ret p ;don't need to do anything, result is already positive
+;DEHL = -DEHL
+    xor a
+    ld b,a
+    sub l
+    ld l,a
+    ld a,b
+    sbc a,h
+    ld h,a
+    ld a,b
+    sbc a,e
+    ld e,a
+    sbc a,a
+    sub d
+    ld d,a
+    ret
+;;
+
+
+;;
+;Signed 16.16 fixed-point multiply built on the 32-bit integer multiply mul32.
+;This requires the following routines:
+; mul32
+; Inputs: DEHL, BCIX
+; Output: stored at z32_0, little-endian
+;Multiplies DE.HL by BC.IX, stores the result in DE.HL
+;Both operands are signed; the product saturates to 0x7FFFFFFF (before the
+;sign is re-applied) when the top 16 bits of the 64-bit product are non-zero.
+;Destroys: A,BC,IX
+;NOTE(review): this is a second definition of the global label
+; mulfixed16_16 (an identical lower-case copy precedes it); only one of the
+; two can be assembled.
+mulfixed16_16:  ; First, find out if the output is positive or negative
+    LD A,D
+    XOR B
+    PUSH AF ;sign bit is the result sign bit
+    ; Now make sure the inputs are positive
+    XOR B ;A now has the value of D, since I XORed it with B twice (cancelling)
+    JP P,.skip1 ;if Positive, don't negate
+    ;DEHL = -DEHL
+    XOR A
+    SUB L
+    LD L,A
+    LD A,0
+    SBC A,H
+    LD H,A
+    LD A,0
+    SBC A,E
+    LD E,A
+    SBC A,A
+    SUB D
+    LD D,A
+.skip1: BIT 7,B
+    JR Z,.skip2
+    ;BCIX = -BCIX
+    XOR A
+    SUB IXL
+    LD IXL,A
+    LD A,0
+    SBC A,IXH
+    LD IXH,A
+    LD A,0
+    SBC A,C
+    LD C,A
+    SBC A,A
+    SUB B
+    LD B,A
+.skip2: ; Now we multiply
+    CALL mul32
+    ;The 8-byte product buffer is referenced by its qualified name: a bare
+    ;`.z32_0` would resolve to mulfixed16_16.z32_0, which is not defined. The
+    ;only definition visible in this file is the one under mul64.
+    ;NOTE(review): confirm that mul32 writes its result to mul64.z32_0.
+    ;We should check for overflow. If the upper two bytes are non-zero, we will set the result to 0x7FFFFFFF
+    LD HL,(mul64.z32_0+6)
+    LD A,H
+    OR L
+    ;Get the middle four bytes and put them in DEHL
+    LD HL,(mul64.z32_0+2)   ;16-bit loads leave the Z flag from `OR L` intact
+    LD DE,(mul64.z32_0+4)
+    ;Maybe we need to set the result to 0x7FFFFFFF
+    JR Z,.skip3
+    LD DE,#7FFF
+    LD H,E
+    LD L,E
+.skip3: ; Now we need to restore the sign
+    POP AF
+    RET P ;don't need to do anything, result is already positive
+    ;DEHL = -DEHL
+    XOR A
+    LD B,A
+    SUB L
+    LD L,A
+    LD A,B
+    SBC A,H
+    LD H,A
+    LD A,B
+    SBC A,E
+    LD E,A
+    SBC A,A
+    SUB D
+    LD D,A
+    RET
+;;
+
+
+;;
+;Requires:
+; mul16
+; Inputs: BC,DE
+; Output: DEHL
+;Multiplies 4.12 fixed point numbers.
;Inputs: HL is the first fixed-point multiplicand
;        DE is the second fixed-point multiplicand
;Output: HL is the fixed-point output
;Overflow is stored as 0x7.FFF or 0x8.001 depending on positive or negative
;Destroys: A, BC, DE, flags
mulfixed4_12:
; First, find out if the output is positive or negative
	ld a,h
	xor d
	push af		;sign bit is the result sign bit
; Now make sure the inputs are positive
	xor d		;A now has the value of H, since I XORed it with D twice (cancelling)
	jp p,.skip1	;if Positive, don't negate
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
.skip1:
	bit 7,d
	jr z,.skip2
	xor a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a
.skip2:
; Now we need to put HL in BC to use mul16 (mul16 multiplies BC by DE)
	ld b,h
	ld c,l
	call mul16
;The result doesn't need the top 4 bits or bottom 12 bits.
;We'll hold onto the top 4 bits to check overflow, though.
;Currently we need to shift DEH left by 4 bits and keep DE, or right by 12 bits and keep HL.
	ld a,h		;we'll actually be moving the discarded bits into A
	and #F0
	ex de,hl
	rla : adc hl,hl
	rla : adc hl,hl
	rla : adc hl,hl
	rla : adc hl,hl
	adc a,a		;A = the 4 bits shifted out of the top, Z set if they are all 0
;if A is non-zero, we have overflow
	jr z,.skip3
	ld hl,#7FFF
.skip3:
; Now we need to restore the sign
	pop af
	ret p		;don't need to do anything, result is already positive
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	ret
;;


;;
;Multiplies H.L by D.E (signed 8.8 fixed point), stores the rounded result in H.L
;Requires mul16 (BC*DE --> DEHL). Overflow clamps the magnitude to #7FFF.
;Destroys: A, BC, DE, flags
mulfixed_88:
; First, find out if the output is positive or negative
	ld a,h
	xor d
	push af		;sign bit is the result sign bit

; Now make sure the inputs are positive
	xor d		;A now has the value of H, since I XORed it with D twice (cancelling)
	jp p,.skip1	;if Positive, don't negate
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
.skip1:
	bit 7,d
	jr z,.skip2
	xor a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a
.skip2:
; Now we need to put HL in BC to use mul16
	ld b,h
	ld c,l
	call mul16

;Need to round, so get the top bit of L
	sla l

;Get the middle two bytes, EH, and put them in HL
	ld l,h
	ld h,e

	ld a,d
	ld de,0
	adc hl,de	;add the rounding bit

;check for overflow!
;We should check for overflow. If A>0, we will set HL to 0x7FFF
	adc a,e
	jr c,$+4	;carry: skip the jr z and take the clamp
	jr z,.skip3
	ld hl,#7FFF
.skip3:

; Now we need to restore the sign
	pop af
	ret p		;don't need to do anything, result is already positive
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	ret

;;


;;
;This multiplies two 64-bit integers and returns a 128-bit result.
;This requires the following routines:
;	mul32	;!TEST
;		Inputs: DEHL, BCIX
;		Output: stored at mul32.z32_0, little-endian
;
; Defined:
;	inp64_1 is where the first 64-bit multiplicand is located, little-endian
;	inp64_2 is where the second 64-bit multiplicand is located, little-endian
;	out128 is where the 128-bit result is stored
;
;multiplies the 64-bit integers at inp64_1 and inp64_2
;stores the 128-bit (16-byte) result at out128
;The 8 bytes at inp64_1 are used as scratch and are overwritten.
;
;min: 1740+3*min(mul32)
;     5631cc
;max: 1901+3*max(mul32)
;     10013cc
;avg: 1797+3*avg(mul32) + 9572881/2^24
;     ~8720.733cc
;uses 16 bytes at out128 plus mul32's own 8-byte buffer
;
;Fixes relative to the previous revision:
;  * the partial products are read from mul32.z32_0, which is where mul32 stores
;    them (the old local alias pointed at out128+16, which mul32 never writes);
;  * every reference to the result buffer uses the local .z64_0/.z64_2 equates
;    (some were written without the dot and named undefined globals).
;NOTE(review): the operands are addressed as .inp64_1/.inp64_2, i.e. the local
;  labels mul64.inp64_1/mul64.inp64_2, while the header above describes plain
;  inp64_1/inp64_2. Their definitions are not in this part of the file - confirm.
mul64:
.z64_0	EQU out128
.z64_2	EQU .z64_0+8

;z2 = high dword * high dword
	ld de,(.inp64_1+6)
	ld hl,(.inp64_1+4)
	ld bc,(.inp64_2+6)
	ld ix,(.inp64_2+4)
	call mul32
	;copy the 8 bytes at mul32.z32_0 to z64_2
	ld hl,mul32.z32_0
	ld de,.z64_2
	call .mov8

;z0 = low dword * low dword
	ld de,(.inp64_1+2)
	ld hl,(.inp64_1)
	ld bc,(.inp64_2+2)
	ld ix,(.inp64_2)
	call mul32
	;copy the 8 bytes at mul32.z32_0 to z64_0
	ld hl,mul32.z32_0
	ld de,.z64_0
	call .mov8

;now I need to subtract the 32-bit digits from each other
	xor a
	ld hl,(.inp64_1)
	ld bc,(.inp64_1+4)
	sbc hl,bc
	ex de,hl
	ld hl,(.inp64_1+2)
	ld bc,(.inp64_1+6)
	sbc hl,bc
	jr nc,.skip1
;negative: negate HLDE; the final borrow becomes the sign bit
	ld b,a : sub e : ld e,a
	ld a,b : sbc a,d : ld d,a
	ld a,b : sbc a,l : ld l,a
	ld a,b : sbc a,h : ld h,a
	ld a,b
.skip1:
	rla		;A = 1 if the first difference was negative, carry is now 0
	push hl		;top word
	push de

	ld hl,(.inp64_2)
	ld bc,(.inp64_2+4)
	sbc hl,bc
	ex de,hl
	ld hl,(.inp64_2+2)
	ld bc,(.inp64_2+6)
	sbc hl,bc
	jr nc,.skip2
	ld c,a
	xor a
	ld b,a
	sub e : ld e,a
	ld a,b : sbc a,d : ld d,a
	ld a,b : sbc a,l : ld l,a
	ld a,b : sbc a,h : ld h,a
	ld a,c
	inc a
.skip2:
	ex de,hl
	pop ix
	pop bc
	push af
	call mul32
	pop af		;holds the sign in the low bit

	rra
	jp c,.add
;need to perform z0+z2-result
	xor a
	ld hl,(.z64_0)
	ld de,(.z64_2)
	add hl,de
	ld (.inp64_1),hl
	ld hl,(.z64_0+2)
	ld de,(.z64_2+2)
	adc hl,de
	ld (.inp64_1+2),hl
	ld hl,(.z64_0+4)
	ld de,(.z64_2+4)
	adc hl,de
	ld (.inp64_1+4),hl
	ld hl,(.z64_0+6)
	ld de,(.z64_2+6)
	adc hl,de
	ld (.inp64_1+6),hl
	rla
;now need to subtract
	ld hl,(.inp64_1)
	ld de,(mul32.z32_0)
	sbc hl,de
	ld (.inp64_1),hl
	ld hl,(.inp64_1+2)
	ld de,(mul32.z32_0+2)
	sbc hl,de
	ld (.inp64_1+2),hl
	ld hl,(.inp64_1+4)
	ld de,(mul32.z32_0+4)
	sbc hl,de
	ld (.inp64_1+4),hl
	ld hl,(.inp64_1+6)
	ld de,(mul32.z32_0+6)
	sbc hl,de
	ld (.inp64_1+6),hl
	sbc a,0
.final:
;now need to add it back in, 4 bytes up from the bottom of the result
	ld hl,(.z64_0+4)
	ld de,(.inp64_1)
	add hl,de
	ld (.z64_0+4),hl
	ld hl,(.z64_0+6)
	ld de,(.inp64_1+2)
	adc hl,de
	ld (.z64_0+6),hl
	ld hl,(.z64_0+8)
	ld de,(.inp64_1+4)
	adc hl,de
	ld (.z64_0+8),hl
	ld hl,(.z64_0+10)
	ld de,(.inp64_1+6)
	adc hl,de
	ld (.z64_0+10),hl
	ld hl,.z64_0+12
	adc a,(hl)
	ld (hl),a
	ret nc
;ripple the carry through the top three bytes
	inc hl : inc (hl) : ret nz
	inc hl : inc (hl) : ret nz
	inc hl : inc (hl) : ret
.add:
;add to the current result
;z0+z2+result
	xor a
	ld hl,(.z64_0)
	ld de,(.z64_2)
	add hl,de
	ld (.inp64_1),hl
	ld hl,(.z64_0+2)
	ld de,(.z64_2+2)
	adc hl,de
	ld (.inp64_1+2),hl
	ld hl,(.z64_0+4)
	ld de,(.z64_2+4)
	adc hl,de
	ld (.inp64_1+4),hl
	ld hl,(.z64_0+6)
	ld de,(.z64_2+6)
	adc hl,de
	ld (.inp64_1+6),hl
	rla
;now add the product of the differences
	ld hl,(.inp64_1)
	ld de,(mul32.z32_0)
	add hl,de
	ld (.inp64_1),hl
	ld hl,(.inp64_1+2)
	ld de,(mul32.z32_0+2)
	adc hl,de
	ld (.inp64_1+2),hl
	ld hl,(.inp64_1+4)
	ld de,(mul32.z32_0+4)
	adc hl,de
	ld (.inp64_1+4),hl
	ld hl,(.inp64_1+6)
	ld de,(mul32.z32_0+6)
	adc hl,de
	ld (.inp64_1+6),hl
	adc a,0
	jp .final

;copies 8 bytes from (HL) to (DE)
.mov8:	LDI
	LDI
	LDI
	LDI
	LDI
	LDI
	LDI
	LDI
	RET
;;


;;
;Requires:
;	mul16	;!TEST
;		Inputs: BC,DE
;		Output: DEHL
;max: 703cc + 3*mul16
;     2704cc
;min: 655cc + 3*mul16
;     1297cc
;avg: 673.25cc+3*mul16
;     2307.911cc
;DEHL * BCIX ==> .z32_0
;The 64-bit product is stored little-endian in the 8 bytes .z32_0/.z32_2 below.
;Three mul16 calls are used: high*high, low*low and the product of the
;absolute word differences, which is added to or subtracted from z0+z2
;depending on the signs of the differences.
mul32:
	push de
	push bc
	push hl
	push ix
	call mul16	;DEHL = high word * high word
	ld (.z32_2),hl
	ld (.z32_2+2),de

	pop de
	pop bc
	push de
	call mul16	;DEHL = low word * low word
	ld (.z32_0),hl
	ld (.z32_0+2),de

	pop de		;low word
	pop hl
	xor a
	sbc hl,de
	jr nc,.skip1
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	xor a
	inc a
.skip1:
	ex de,hl
	pop hl
	sbc hl,bc
	jr nc,.skip2
	ld b,a
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	ld a,b
	inc a
.skip2:
	ld b,h
	ld c,l
	push af
	call mul16
	pop af		;holds the sign in the low bit
	rra
	jr c,.add
;need to perform z0+z2-result
	push de
	push hl
	xor a
	ld hl,(.z32_0)
	ld bc,(.z32_2)
	add hl,bc
	ex de,hl
	ld hl,(.z32_0+2)
	ld bc,(.z32_2+2)
	adc hl,bc
	rla
;now need to subtract
	ex de,hl
	pop bc
	sbc hl,bc
	ex de,hl
	pop bc
	sbc hl,bc
	sbc a,0
;A:HL:DE is the result, need to add to z32_0+2
.final:
	ld bc,(.z32_0+2)
	ex de,hl
	add hl,bc
	ld (.z32_0+2),hl
	ld hl,(.z32_2)
	adc hl,de
	ld (.z32_2),hl
	ld hl,.z32_2+2	;was "z32_2+2": the dot is required, z32_2 is a local label
	adc a,(hl)
	ld (hl),a
	ret nc
	inc hl
	inc (hl)
	ret
.add:
;add to the current result
	xor a
	ld bc,(.z32_0)
	add hl,bc
	ex de,hl
	ld bc,(.z32_0+2)
	adc hl,bc
	rla
	ex de,hl
	ld bc,(.z32_2)
	add hl,bc
	ex de,hl
	ld bc,(.z32_2+2)
	adc hl,bc
	adc a,0
	jp .final
	;
.z32_0:	DS 4
.z32_2:	DS 4
;;


;;
;BDE*CHL -> HLBCDE
;155 bytes
;402+3*C_times_BDE
;fastest:1201cc
;slowest:1753cc
;avg: 1464.9033203125cc (1464+925/1024)
;min: 825cc
;max: 1926cc
;avg: 1449.63839751681cc
;The helper is called as C_times_BDE, the spelling under which it is defined
;(labels are case-sensitive; the calls used to say C_Times_BDE).
mul24:
	push bc
	ld c,l
	push hl
	call C_times_BDE	;L * BDE
	ld (.var48),hl
	ld l,a
	ld h,c
	ld (.var48+2),hl

	pop hl
	ld c,h
	call C_times_BDE	;H * BDE, added one byte up
	push bc
	ld bc,(.var48+1)
	add hl,bc
	ld (.var48+1),hl
	pop bc
	ld b,c
	ld c,a
	ld hl,(.var48+3)
	ld h,0
	adc hl,bc
	ld (.var48+3),hl

	pop bc
	call C_times_BDE	;C * BDE, added two bytes up
	ld de,(.var48+2)
	add hl,de
	ld (.var48+2),hl
	ld d,c
	ld e,a
	ld b,h
	ld c,l
	ld hl,(.var48+4)
	ld h,0
	adc hl,de
	ld de,(.var48)
	ret

.var48:	DS 6
;;


;;
;This was made by Runer112
;Tested by jacobly
;BC*DE --> DEHL
; ~544.887cc as calculated in jacobly's test
;min: 214cc  (DE = 1)
;max: 667cc
;avg: 544.4507883cc   however, deferring to jacobly's result as mine may have math issues ?
;177 bytes
;BC is not modified. The first chain of tests skips the leading zero bits of
;DE and jumps into the unrolled shift-and-add code at the first set bit.
mul16:	LD A,D
	LD D,0
	LD H,B
	LD L,C
	ADD A,A : JR C,.Bit14
	ADD A,A : JR C,.Bit13
	ADD A,A : JR C,.Bit12
	ADD A,A : JR C,.Bit11
	ADD A,A : JR C,.Bit10
	ADD A,A : JR C,.Bit9
	ADD A,A : JR C,.Bit8
	ADD A,A : JR C,.Bit7
	LD A,E
	AND %11111110
	ADD A,A : JR C,.Bit6
	ADD A,A : JR C,.Bit5
	ADD A,A : JR C,.Bit4
	ADD A,A : JR C,.Bit3
	ADD A,A : JR C,.Bit2
	ADD A,A : JR C,.Bit1
	ADD A,A : JR C,.Bit0
	RR E
	RET C
	LD H,D
	LD L,E
	RET
	;
.Bit14:	ADD HL,HL : ADC A,A : JR NC,.Bit13 : ADD HL,BC : ADC A,D
.Bit13:	ADD HL,HL : ADC A,A : JR NC,.Bit12 : ADD HL,BC : ADC A,D
.Bit12:	ADD HL,HL : ADC A,A : JR NC,.Bit11 : ADD HL,BC : ADC A,D
.Bit11:	ADD HL,HL : ADC A,A : JR NC,.Bit10 : ADD HL,BC : ADC A,D
.Bit10:	ADD HL,HL : ADC A,A : JR NC,.Bit9 : ADD HL,BC : ADC A,D
.Bit9:	ADD HL,HL : ADC A,A : JR NC,.Bit8 : ADD HL,BC : ADC A,D
.Bit8:	ADD HL,HL : ADC A,A : JR NC,.Bit7 : ADD HL,BC : ADC A,D
.Bit7:	LD D,A
	LD A,E
	AND %11111110
	ADD HL,HL : ADC A,A : JR NC,.Bit6 : ADD HL,BC : ADC A,0
.Bit6:	ADD HL,HL : ADC A,A : JR NC,.Bit5 : ADD HL,BC : ADC A,0
.Bit5:	ADD HL,HL : ADC A,A : JR NC,.Bit4 : ADD HL,BC : ADC A,0
.Bit4:	ADD HL,HL : ADC A,A : JR NC,.Bit3 : ADD HL,BC : ADC A,0
.Bit3:	ADD HL,HL : ADC A,A : JR NC,.Bit2 : ADD HL,BC : ADC A,0
.Bit2:	ADD HL,HL : ADC A,A : JR NC,.Bit1 : ADD HL,BC : ADC A,0
.Bit1:	ADD HL,HL : ADC A,A : JR NC,.Bit0 : ADD HL,BC : ADC A,0
.Bit0:	ADD HL,HL
	ADC A,A
	JR C,.FunkyCarry
	RR E
	LD E,A
	RET NC
	ADD HL,BC
	RET NC
	INC E
	RET NZ
	INC D
	RET
	;
.FunkyCarry:	INC D
	RR E
	LD E,A
	RET NC
	ADD HL,BC
	RET NC
	INC E
	RET
;;


;;
;Inputs: H,E
;Outputs: HL is the product, D is 0
;Destroys: A
;187+6{0,6}+{0,15}
;min: 187cc
;max: 238cc
;avg: 212.5cc
;35 bytes
H_Times_E:
	ld d,0
	sla h
	sbc a,a		;A = #FF if the top bit of H was set, else 0
	and e
	ld l,a		;L = E or 0
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : ret nc : add hl,de
	ret
;;


;;
H_Times_E_No_A:
;Inputs: H,E
;Outputs: HL is the product, D is 0
;Same as H_Times_E, but A is not used
;190+6{0,6}+{0,15}+{0,1}
;min: 190cc
;max: 242
;avg: 216
;36 bytes
	ld d,0
	ld l,d
	sla h : jr nc,$+3 : ld l,e
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : jr nc,$+3 : add hl,de
	add hl,hl : ret nc : add hl,de
	ret
;;


;;
;HL = HL*128 (low 16 bits), done as a one-bit right shift and a byte move
;Destroys: A
HL_Times_128:
	xor a
	rr h
	rr l
	rra		;A = bit 0 of the old L, in bit 7
	ld h,l
	ld l,a
	ret
;;


;;
;NOTE: This is a set of in-line routines!
;Three alternative ways to multiply HL by 12. They used to follow each other
;under one label with a single RET, so a call ran all three in sequence.
;Each variant now has its own entry point and RET. The byte/cycle figures are
;for the in-line code and do not include the RET.
;
;   Input: HL
;   Output: BC is the input, HL is 12 times the input
;   6 bytes, 52cc
HL_Times_12:
	ld b,h
	ld c,l
	add hl,hl	; hl*2
	add hl,bc	; hl*3
	add hl,hl	; hl*6
	add hl,hl	; hl*12
	ret
;Destroys only register E and F
;   Input: HL <= 85,
;   8 bytes, 46cc
HL_Times_12_small_v1:
	ld e,a		; keep A
	ld a,l
	add a,a		; hl*2
	add a,l		; hl*3
	ld l,a
	ld a,e
	add hl,hl	; hl*6
	add hl,hl	; hl*12
	ret
;Destroys only register E and F
;   Input: HL <= 85,
;   7 bytes, 55cc
HL_Times_12_small_v2:
	ld e,l
	add hl,hl	; hl*2
	add hl,de	; hl*3+d*256
	ld h,0		; hl*3
	add hl,hl	; hl*6
	add hl,hl	; hl*12
	ret
;;


;;
;Inputs:
;     DEBC is a 32-bit multiplicand
;     A is an 8-bit multiplicand
;Outputs:
;     AHLIX is the 40-bit result
;     carry reset
;     z set if top 8 bits are 0
;     sign flag set as expected
;===============================================================
;503+8{0,41}
;min: 503cc
;max: 831cc
;avg: 667cc
;29 bytes
;The nested calls below fall through into each other, so the code at .iter1
;runs 8 times in total, once per bit of A.
DEBC_Times_A:
	ld hl,0
	ld ix,0
	call .iter3
.iter3:
;231+4{0,41}
	call .iter2
.iter2:
;107+2{0,41}
	call .iter1
.iter1:
;45+{0,41}
	add ix,ix
	adc hl,hl
	adc a,a		;next multiplier bit into carry
	ret nc
	add ix,bc
	adc hl,de
	adc a,0
	ret
;;


;;
;Inputs:
;     DE and A are factors
;Outputs:
;     A is not changed
;     B is 0
;     C is not changed
;     DE is not changed
;     HL is the product
;Time:
;     342+6x
;13 bytes
DE_Times_A:
	ld b,8
	ld hl,0
.loop:
	add hl,hl
	rlca		;8 rotations leave A as it was
	jr nc,$+3
	add hl,de
	djnz .loop
	ret
;;


;;
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19} avg=258.5 *1/2
;A:64~127  203+5{0,10}+{0,19} avg=237.5 *1/4
;A:32~63   187+4{0,10}+{0,19} avg=216.5 *1/8
;A:16~31   171+3{0,10}+{0,19} avg=195.5 *1/16
;A:8~15    155+2{0,10}+{0,19} avg=174.5 *1/32
;A:4~7     139+{0,10}+{0,19}  avg=153.5 *1/64
;A:2~3     123+{0,19}         avg=132.5 *1/128
;A:1       107cc              avg=107   *1/256
;A:0       119cc              avg=119   *1/256
;overall avg: 237.671875cc
DE_Times_A_v1:
	ld c,0
	ld h,d
	ld l,e
;find the highest set bit of A and enter the unrolled code there
	add a,a : jr c,.mul_07
	rla : jr c,.mul_06
	rla : jr c,.mul_05
	rla : jr c,.mul_04
	rla : jr c,.mul_03
	rla : jr c,.mul_02
	rla : jr c,.mul_01
	rla
	ret c		;A was 1: HL = DE
	ld h,a		;A was 0: result is 0
	ld l,a
	ret
.mul_07:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_06:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_05:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_04:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_03:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_02:
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_01:
	add hl,hl : rla : ret nc : add hl,de : adc a,c
	ret
;;


;;
DE_Times_A_v2:
;DE*A ==> AHL
;B is set to 0, DE is not changed
	ld hl,0
	ld b,h
	add a,a : jr nc,$+5 : ld h,d : ld l,e
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
	add hl,hl : rla : ret nc : add hl,de : adc a,b
	ret
;;


;;
;C*BDE => CAHL
;C = 0     157
;C = 1     141
;141+
;C>=128    135+6{0,33+{0,1}}+{0,20+{0,8}}
;C>=64     115+5{0,33+{0,1}}+{0,20+{0,8}}
;C>=32     95+4{0,33+{0,1}}+{0,20+{0,8}}
;C>=16     75+3{0,33+{0,1}}+{0,20+{0,8}}
;C>=8      55+2{0,33+{0,1}}+{0,20+{0,8}}
;C>=4      35+{0,33+{0,1}}+{0,20+{0,8}}
;C>=2      15+{0,20+{0,8}}
;min: 141cc
;max: 508cc
;avg: 349.21279907227cc
;B and DE are not changed.
;C_Times_BDE is an alias in the capitalisation used by the other *_Times_*
;routines; labels are case-sensitive, so both spellings are provided.
C_Times_BDE:
C_times_BDE:
	ld a,b
	ld h,d
	ld l,e
	sla c : jr c,.mul8_24_1
	sla c : jr c,.mul8_24_2
	sla c : jr c,.mul8_24_3
	sla c : jr c,.mul8_24_4
	sla c : jr c,.mul8_24_5
	sla c : jr c,.mul8_24_6
	sla c : jr c,.mul8_24_7
	sla c : ret c	;C was 1: the result is BDE itself, C is now 0
	ld a,c		;C was 0: the result is 0
	ld h,c
	ld l,c
	ret
.mul8_24_1:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_2:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_3:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_4:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_5:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_6:
	add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_7:
	add hl,hl : rla : rl c : ret nc : add hl,de : adc a,b : ret nc : inc c
	ret
;;


;
; Division (section banner; the original non-ASCII text was mis-encoded)
;
;Signed division CHL/DE by Zeda, inspired by code from matrefeytontias.
;signed CHL/DE
;signed CHL/DE ==> CHL, |remainder| is DE
;Destroys: A, B (0 on exit), IX, flags
;Fix: the test before the DE negation was inverted ("jr z"), which left DE
;positive. The loop subtracts by adding DE, so DE must be negative there.
sdiv24_16:
;Get the sign of the result
	ld a,c
	xor d
	push af

;Make CHL positive
	xor d		;A = C again
	jp p,.skip1
	xor a
	sub l
	ld l,a
	ld a,0
	sbc a,h
	ld h,a
	sbc a,a
	sub c
	ld c,a
.skip1:

;make DE negative (A holds the current value of C on both paths above)
	bit 7,d
	jr nz,.skip2	;already negative
	xor a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a
	ld a,c		;restore A = top byte of the numerator
.skip2:

	ld b,24
	push hl
	pop ix		;A:IX is the 24-bit numerator
	ld hl,0		;HL accumulates the remainder

.loop:
	add ix,ix
	rla
	adc hl,hl
	add hl,de	;carry set when the remainder >= |DE|
	jr c,.skip3
	sbc hl,de	;undo
	DB #DA		;start of `jp c,**`; never taken here, it just swallows the 2-byte `inc ixl`
.skip3:
	inc ixl		;set the quotient bit
	djnz .loop
	ld c,a
	ex de,hl	;DE is remainder

	push ix
	pop hl

;restore sign
	pop af
	ret p
	xor a
	sub l
	ld l,a
	ld a,b		;B is 0 after the djnz
	sbc a,h
	ld h,a
	sbc a,a
	sub c
	ld c,a
	ret
;;


;;
;Adapted from Axe
;Signed wrapper around div16: both HL and DE are made positive, div16 is
;called, then HL is negated if the operand signs differed.
;div16 is not defined in this part of the file.
p_SDiv:
	ld a,h
	xor d
	push af		;sign of the result
	xor d		;A = H again
	jp p,.Skip1
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
.Skip1:
	bit 7,d
	jr z,.Skip2
	xor a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a
.Skip2:
	call div16	;normal routine division
	pop af
	ret p
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	ret
;;


;;
;Input: HLDE is numerator, C<129 is the divisor.
;Output: HLDE is quotient, A is remainder, C is negated
;1021+4{0,15}
;min: 1021cc
;max: 1081cc
;avg: 1051cc
;87 bytes
HLDE_Div_C:
	xor a
	sub c
	ld c,a
;falls through
;;
;Note: -C<129
;1009+4{0,15}
;min: 1009cc
;max: 1069cc
;avg: 1039cc
;84 bytes
;Destroys: B
HLDE_Div_negC:
	xor a
	call .div
	ld b,h

	ld h,l
	call .div
	ld l,h

	ld h,d
	call .div
	ld d,h

	ld h,e
	call .div
	ld e,h

;each byte is one bit behind: the last quotient bit of every .div call is
;still in carry, so shift the whole 32-bit result left once
	ld h,b
	rl e
	rl d
	adc hl,hl
	ret

;216+7{0,1}+{0,8}
;min: 216cc
;max: 231cc
;avg: 224.5cc
.div:
	DUP 7
	rl h : rla : add a,c : jr c,$+3 : sub c
	EDUP
	rl h : rla : add a,c : ret c : sub c
	ret
;;


;;
;Written by calc84maniac, based on a routine from Zeda
;===============================================================
;===============================================================
;Performs HL/BC
;Speed: 1168 to 1318 cycles depending on how many set bits in the result
;    add 19 if HL is negative
;    add 19 if BC is positive
;    add another 28 if only one is negative
;Size:  54 bytes
;    **31 bytes larger than the regular HL_Div_BC
;Inputs:
;     HL is the numerator
;     BC is the denominator
;Outputs:
;     HL is the quotient
;     DE is the remainder
;     BC = -abs(BC)
;===============================================================
HL_SDiv_BC:
	ld a,h
	xor b
	push af		;sign of the result
.absHL:
	add hl,hl	;sign bit into carry; this is also the first shift of the division
	jr nc,.negabsBC
	xor a : sub l : ld l,a
	sbc a,a : sub h : ld h,a
.negabsBC:
	bit 7,b
	jr nz,$+8	;already negative: skip the 6 bytes of negation
	xor a : sub c : ld c,a
	sbc a,a : sub b : ld b,a
	ex de,hl
	xor a
	ld h,a
	ld l,a
	ld a,15		;15 iterations, one shift was done above
.Div_Loop_1:
	rl e : rl d
	adc hl,hl
	add hl,bc
	jr c,$+4
	sbc hl,bc
	dec a
	jr nz,.Div_Loop_1
	ex de,hl
	adc hl,hl	;last quotient bit
	pop af : ret p
	xor a : sub l : ld l,a
	sbc a,a : sub h : ld h,a
	ret
;;


;;
;Inputs:
;     HL is the numerator
;     C<128 is the denominator
;Outputs:
;     A is twice the remainder of the unrounded value
;     B is 0
;     C is not changed
;     DE is not changed
;     HL is the rounded quotient
;     c flag set means no rounding was performed
;            reset means the value was rounded
HL_Div_C_round:
	ld b,16
	xor a
	add hl,hl	;<-- the djnz below loops back to here
	rla
	cp c
	jr c,$+4
	inc l
	sub c
	djnz $-7
	add a,a
	cp c
	ret c
	inc hl
	ret
;;


;;
;I'm not positive on the timing.
;min: 203
;max: 308
;avg: 236.125
;NOTE(review): from the code, this looks like HL/B with the quotient left in L
;and the remainder in A; the original header does not say - confirm.
HL_Div_B:
	add hl,hl
	ld a,h
	jr c,.div16_8_2_0
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_1
.div16_8_1_1:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_2
.div16_8_1_2:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_3
.div16_8_1_3:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_4
.div16_8_1_4:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_5
.div16_8_1_5:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_6
.div16_8_1_6:
	cp b
	jr c,$+4
	sub b : inc l
	sla l : rla
	jr c,.div16_8_2_7
.div16_8_1_7:
	cp b : ret c : sub b : inc l
	ret

;entered when the shift carried out of A
.div16_8_2_0:
	sub b : rl l : rla : jr nc,.div16_8_1_1
.div16_8_2_1:
	sub b : rl l : rla : jr nc,.div16_8_1_2
.div16_8_2_2:
	sub b : rl l : rla : jr nc,.div16_8_1_3
.div16_8_2_3:
	sub b : rl l : rla : jr nc,.div16_8_1_4
.div16_8_2_4:
	sub b : rl l : rla : jr nc,.div16_8_1_5
.div16_8_2_5:
	sub b : rl l : rla : jr nc,.div16_8_1_6
.div16_8_2_6:
	sub b : rl l : rla : jr nc,.div16_8_1_7
.div16_8_2_7:
	sub b : inc l
	ret
;;


;;
HL_Div_384:
;223cc
	;(HL+HL*5*17*2)/256
;NOTE(review): the input pushed at the start is popped into DE near the end but
;DE is not used afterwards, and which registers hold the result is not stated - confirm.
	push hl
	ld b,h
	ld c,l
	xor a
	add hl,hl : rl a
	add hl,hl : rl a
	add hl,bc : adc a,0
	ld d,a
	ld b,h
	ld c,l
	add hl,hl : rl a
	add hl,hl : rl a
	add hl,hl : rl a
	add hl,hl : rl a
	add hl,bc : adc a,d
	add hl,hl : rla
	pop de
	add hl,hl : rl a
	sla l
	adc a,0
	ret
;;


;;
;;270cc or 280cc
;NOTE(review): several register copies below are overwritten before they are
;read (e.g. "ld d,a" directly followed by "ld d,h"). The code is kept
;instruction for instruction; verify the results before relying on it.
HL_Div_7_round:
	xor a
	ld d,h
	ld e,l
	ld b,a
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,b
	ld d,h
	ld e,l
	ld c,a
	add hl,hl : rla
	add hl,hl : rla
	ld d,h
	ld e,l
	ld c,a
	add hl,hl : rla
	add hl,hl : rla
	ld d,a
	ld d,h
	ld e,l
	ld c,a
	ld l,a
	ld h,b
	add hl,hl
	add hl,hl
	add hl,hl
	add hl,hl
	add hl,de
	adc a,b
	sla l
	ld l,h
	ld h,a
	ret nc
	inc hl
	ret
;AH/16
;;


;;
;210cc or 220cc
HL_Div_5_round:
	xor a
	ld d,h : ld e,l : ld b,a
	add hl,hl : rla
	add hl,de : adc a,b
	ld d,h : ld e,l : ld c,a
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,c
	ld d,a : ld e,h
	add hl,de : adc a,b
	ld d,a : ld e,h
	add a,l
	ex de,hl
	rla : rla : and 3 : rra
	adc a,b
	add a,l
	ld l,a
	ret nc
	inc h
	ret
;;


;;
;HL/5
;HL/4+HL*3*17*257
;234cc to 245cc
;NOTE(review): the last instructions leave their result in DE - confirm the
;output register against a caller.
HL_Div_5:
	xor a
	ld b,h
	ld c,l
	ld d,a
	add hl,hl : rla
	add hl,bc : adc a,d	;3
	add hl,hl : rla		;6
	add hl,hl : rla		;12
	add hl,hl : rla		;24
	add hl,bc : adc a,d	;25
	add hl,hl : rla		;50
	add hl,bc : adc a,d	;51
;AHL0+AHL+BC/2
;AHL*257/256 =AHL+A
	srl b : rr c
	srl b : rr c
	ld d,a
	ld a,b
	add a,l
	ld b,a
	ld e,h
	jr nc,$+3
	inc de
	add hl,bc
	ld a,d
	add a,e
	ld e,a
	ret nc
	inc d
	ret
;;


;;
;HL/3, rounded --> HL
;Computes HL*5*17*257/65536 and rounds on the top discarded bit.
;204cc or 214cc
;Destroys: A, B, DE
;Fixes:
;  * "adc a,bas" was a typo for "adc a,b" and does not assemble;
;  * the carry out of the first "add hl,de" (the *5 step) was dropped, so the
;    17th bit of 5*HL was lost for larger inputs (e.g. HL=#3FFF); it is now
;    added into A;
;  * a "ld d,h : ld e,l" pair that was overwritten by the next line is removed.
HL_Div_3_round:
	xor a : ld d,h : ld e,l
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,0		;A:HL = 5*input
	ld d,h : ld e,l : ld b,a
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,b		;A:HL = 85*input
	ld b,a
	ld d,a : ld e,h : add hl,de	;A:HL += A:HL>>8  (the *257 step)
	adc a,0
	sla l				;rounding bit into carry
	ld l,h
	ld h,a
	ret nc
	inc hl
	ret
;;


;;
;HL/3 --> HL (the final "ex de,hl" moves the value built in DE into HL)
;209cc to 219cc
HL_Div_3:
	xor a
	ld b,a
	ld d,h
	ld e,l
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,b
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,b
	add hl,hl : rla
	add hl,hl : rla
	add hl,de : adc a,b
;AHL+(AHL+(DE>>1))/256
	srl d : rr e
;AHL+(AHL+DE)/256
;AH.L+A.HL+.DE
	ld b,h
	ld c,l
;AB.C+A.HL+.DE
	add hl,de
;AB.C+A.HL+carry
	ld d,a
;DB.C+A.H+carry
	adc a,b
	ld e,a
	jr nc,$+3
	inc d
;DE.C+0.H+carry
	ld a,h
	add a,c
	ex de,hl
	ret nc
	inc hl
	ret
;;


;;
;Input: HL
;Output: HL is the input divided by 3
;Destroys: B,C,E,A
;217cc
;Renamed from HL_Div_3: that label is already defined by the routine above,
;and a global label cannot be defined twice.
HL_Div_3_v2:
;increment HL, putting overflow in A
	ld bc,1
	ld a,b
	add hl,bc
	adc a,b
;We want a difference of a factor of 2 shifts
	ld b,h
	ld c,l
	ld e,a
	add hl,hl : rla
	add hl,hl : rla
	add hl,bc : adc a,e
;We want a difference of a factor of 4 shifts
	ld b,h
	ld c,l
	ld e,a
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,hl : rla
	add hl,bc : adc a,e
	ld b,a
	ld c,h
	add hl,bc
	adc a,0
	ld l,h
	ld h,a
;now HL is our result
	ret
;;


;;
;EHL/D: a 24-step shift-and-subtract loop. The quotient bits are shifted into
;EHL and the remainder is accumulated in A; B is 0 on exit.
;1360+24({0,3+{0,3}})
;min: 1360cc
;max: 1504cc
;avg: 1414cc
;17 bytes
EHL_Div_D:
	xor a
	ld b,24
.loop:
	add hl,hl
	rl e
	rla
	jr c,$+5	;if D is guaranteed <129, can omit this
	cp d
	jr c,$+4
	sub d
	inc l
	djnz .loop
	ret
;;


;;
;Inputs:
;     DE,BC are 8.8 Fixed Point numbers
;Outputs:
;     HL is the 8.8 Fixed Point result (rounded to the least significant bit)
;     On overflow (including DE=0) HL is #7FFF, or #8001 if the signs differ
;if DE is 0 : 122cc or 136cc if BC is negative
;if |BC|>=128*|DE| : 152cc or 166cc if BC is negative
;Otherwise:
;min: 1164cc
;max: 1377cc
;avg: 1258.5cc
;Fix: the overflow handler and the 8-bit division helper are local labels
;(.div_fixed88_overflow, .div_fixed88_sub); three references were written
;without the leading dot and pointed at undefined global names.
BC_Div_DE_88:
; First, find out if the output is positive or negative
	ld a,b
	xor d
	push af		;sign bit is the result sign bit

; Now make sure the inputs are positive
	xor d		;A now has the value of B, since I XORed it with D twice (cancelling)
	jp p,.skip1	;if Positive, don't negate
	xor a
	sub c
	ld c,a
	sbc a,a
	sub b
	ld b,a
.skip1:

;now make DE negative to optimize the remainder comparison
	ld a,d
	or d
	jp m,.skip2
	xor a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a
.skip2:

;if DE is 0, we can call it an overflow
;A is the current value of D
	or e
	jr z,.div_fixed88_overflow

;The accumulator gets set to B if no overflow.
;We can use H=0 to save a few cc in the meantime
	ld h,0

;if B+DE>=0, then we'll have overflow
	ld a,b
	add a,e
	ld a,d
	adc a,h
	jr c,.div_fixed88_overflow

;Now we can load the accumulator/remainder with B
;H is already 0
	ld l,b

	ld a,c
	call .div_fixed88_sub
	ld c,a

	ld a,b		;A is now 0
	call .div_fixed88_sub

; if 2HL+DE>=0, increment result to round.
	add hl,hl
	add hl,de
	ld h,c
	ld l,a
	jr nc,$+3
	inc hl

;Now check if H is overflowed
	bit 7,h
	jr nz,.div_fixed88_overflow

	pop af
	ret p
	xor a
	sub l
	ld l,a
	sbc a,a
	sub h
	ld h,a
	ret

.div_fixed88_overflow:
	ld hl,#7FFF
	pop af
	ret p
	inc hl		;#8000
	inc l		;#8001
	ret

.div_fixed88_sub:
;min: 456cc
;max: 536cc
;avg: 496cc
	ld b,8
.loop:
	rla
	adc hl,hl
	add hl,de
	jr c,$+4
	sbc hl,de
	djnz .loop
	adc a,a
	ret
;;


;;
;HLIX/BC -> HLIX remainder DE
;BC is negated on exit.
;174+4*.sub8
;min: 2186cc
;max: 2794cc
;avg: 2466cc
;61 bytes
div32_16:
	ex de,hl	; 4
; Negate BC to allow add instead of sbc
	xor a		; 4
; Need to set HL to 0 anyways, so save 2cc and a byte
	ld h,a		; 4
	ld l,a		; 4
	sub c		; 4
	ld c,a		; 4
	sbc a,a		; 4
	sub b		; 4
	ld b,a		; 4


	ld a,d		; 4
	call .sub8	; 17
	rla		; 4
	ld d,a		; 4

	ld a,e		; 4
	call .sub8	; 17
	rla		; 4
	ld e,a		; 4

	ld a,ixh	; 8
	call .sub8	; 17
	rla		; 4
	ld ixh,a	; 8

	ld a,ixl	; 8
	call .sub8	; 17
	rla		; 4
	ld ixl,a	; 8

	ex de,hl	; 4
	ret		; 10

.sub8:
;119+8*.sub
;min: 503cc
;max: 655cc
;avg: 573cc
;the nested calls fall through into each other, so .sub runs 8 times
	call .iter1
.iter1:
;17+2(17+2(.sub)))
	call .iter2
.iter2:
;17+2(.sub)
	call .sub
.sub:
;48+{8,0+{0,19}}
;min: 48cc
;max: 67cc
;avg: 56.75cc
	rla		; 4
	adc hl,hl	; 15
	jr c,.skip	;12/7
	add hl,bc	; 11
	ret c		;11/5
	sbc hl,bc	; 15
	ret		; 10
.skip:
	add hl,bc	; 11
	scf		; 4
	ret		; 10
;;


;;
;HL/9 --> A, HL<2304
;Destroys: DE, HL
div9:
	inc hl
	ld d,h
	ld e,l
	add hl,hl
	add hl,de
	add hl,hl
	add hl,de
	ld e,0
	ld d,l
	ld a,h
	add hl,hl
	add hl,hl
	add hl,de
	adc a,e
	add hl,hl
	rla
	add hl,hl
	rla
	ret
;;


;;
;Made by Zeda Thomas, use it for whatever, and please optimize this!
;Slight Warning: This passed a handful of tests, but if you find a bug,
;please report it. I still actively maintain these (as of January 2020).
;Inputs:
;     HLIX/BCDE
;Outputs:
;     HLIX is the quotient
;     BCDE is the remainder
;RAM:
;     uses 8 bytes of RAM:
;         4 bytes at temp32_0
;         4 bytes at temp32_1
;
;min: 5240cc
;max: 6264cc
;avg: 5752cc
;113 bytes
;NOTE(review): the code addresses the first buffer as the local label
;.temp32_0 but the second as the global temp32_1. Neither is defined in this
;part of the file; confirm which form the definitions use and make both
;references match.
div_32_32:
; Back up HLIX
	ld (.temp32_0),ix
	ld (.temp32_0+2),hl


;negate BCDE
	xor a
	ld l,a : sbc a,e : ld e,a
	ld a,l : sbc a,d : ld d,a
	ld a,l : sbc a,c : ld c,a
	ld a,l : sbc a,b : ld b,a

	ld a,h		;top byte of the numerator
;set HLIX to 0
	ld h,l
	ld ix,0
	call .sub
	ld (.temp32_0+3),a

	ld a,(.temp32_0+2)
	call .sub
	ld (.temp32_0+2),a

	ld a,(.temp32_0+1)
	call .sub
	ld (.temp32_0+1),a

	ld a,(.temp32_0+0)
	call .sub
	ld (.temp32_0),a

;HLIX holds the remainder: move it to BCDE and load the quotient
	push ix
	pop de
	ld b,h
	ld c,l
	ld ix,(.temp32_0)
	ld hl,(.temp32_0+2)
	ret



.sub:
;min: 1223cc
;max: 1479cc
;avg: 1351cc
;the nested calls fall through into each other, so .iter3 runs 8 times

	call .iter1
.iter1:
	call .iter2
.iter2:
	call .iter3
.iter3:
;min: 138cc
;max: 170cc
;avg: 154cc
;HLIX*2
	add ix,ix
	adc hl,hl

;rotate in the bit
	add a,a
	jr nc,.skip1
	inc ix
.skip1:

;save HLIX in case we need to restore
	ld (temp32_1),ix
	ld (temp32_1+2),hl

;check if HLIX>=-BCDE
; ==> HLIX+BCDE >= 0
	add ix,de
	adc hl,bc
	jr c,.skip2

;we need to restore
	ld ix,(temp32_1)
	ld hl,(temp32_1+2)
	ret
.skip2:
	inc a		;set the quotient bit
	ret
;;


;;
;Created by calc84maniac
;NOTE from Zeda: C should <=128, the original forgot to mention this.
;Inputs: dehl=32-bit dividend, c<=128 is the divisor  (Or is it the other way around?)
;Outputs: dehl=32-bit quotient, a=remainder, c=unchanged, b=0
;min: 1936cc
;max: 2032cc
;avg: 1984cc
;Size: 17 bytes
DEHL_Div_C:
.div32bit:
	ld b,32
	xor a
.divloop:
	add hl,hl
	rl e
	rl d
	rla
	cp c
	jr c,.divlbl
	inc l
	sub c
.divlbl:
	djnz .divloop
	ret
;;


;;
;Inputs:
;     DEHL
;Outputs:
;     DEHL is the quotient
;     A is the remainder
;     B is 0 (the djnz counter has run out)
;     C is #F6 (-10)
;1300cc~1329cc
;49 bytes
DEHL_Div_10_v1:
	xor a
	ld bc,05F6h	;B = 5 iterations left for D, C = -10
;the quotient of the first three bits is always 0, so only shift
	rl d : rla
	rl d : rla
	rl d : rla
	rl d : rla : add a,c : jr c,$+3 : sub c : djnz $-7
	ld b,8
	rl e : rla : add a,c : jr c,$+3 : sub c : djnz $-7
	ld b,8
	rl h : rla : add a,c : jr c,$+3 : sub c : djnz $-7
	ld b,8
	rl l : rla : add a,c : jr c,$+3 : sub c : djnz $-7
;the last quotient bit is still in carry: shift the whole result once more
	adc hl,hl
	rl e
	rl d
	ret
;;


;;
;Inputs:
;     DEHL
;Outputs:
;     DEHL is the quotient
;     A is the remainder
;     B is the remainder
;     C is 10
;about 919cc~948cc (the figures of the previous revision plus the 7cc mask)
;Each step leaves the INVERTED quotient bit in carry (set when nothing was
;subtracted); the next "rl" shifts it into the byte being processed, so every
;byte ends up one bit behind and complemented.
;Fix: the final fix-up used "rra : ccf", which rotates the wrong way and never
;complements the bytes. It is now "rla : cpl" per byte (CPL leaves carry alone,
;so the bit shifted out feeds the next byte). The top three bits of D come
;from the three subtract-free steps and are masked to 0.
DEHL_Div_10_v2:
	xor a
	ld c,10
	DUP 3
	rl d : rla
	EDUP
	DUP 5
	rl d : rla : sub c : jr nc,$+3 : add a,c
	EDUP
	DUP 8
	rl e : rla : sub c : jr nc,$+3 : add a,c
	EDUP
	DUP 8
	rl h : rla : sub c : jr nc,$+3 : add a,c
	EDUP
	DUP 8
	rl l : rla : sub c : jr nc,$+3 : add a,c
	EDUP
	ld b,a		;keep the remainder
	ld a,l : rla : cpl : ld l,a
	ld a,h : rla : cpl : ld h,a
	ld a,e : rla : cpl : ld e,a
	ld a,d : rla : cpl : and %00011111 : ld d,a
	ld a,b
	ret
;;


;;
;Inputs:
;     C is the numerator
;     D is the denominator
;Outputs:
;     A is the remainder
;     B is 0
;     C is the result of C/D
;     D,E,H,L are not changed
C_Div_D:
	ld b,8
	xor a
.loop:
	sla c
	rla
	cp d
	jr c,.skip1
	inc c
	sub d
.skip1:
	djnz .loop
	ret
;;


;;
;Input:
;     HL points to the bignum (1 byte size prefix (0 -> 1 byte, 1 -> 2 bytes, n-1 -> n bytes), n subsequent bytes)
;     The bytes are processed in increasing address order with the remainder
;     carried forward, i.e. most significant byte first.
;Output:
;     bignum is divided in-place, not renormalized
;     A is the remainder
;     BC is 100
;     HL points to the last byte of the bignum
;Destroys: E
;Fixes:
;  * the quotient of the first byte was counted in H, destroying the pointer in
;    HL; it is now counted in E;
;  * each byte was overwritten with A (the running remainder) instead of the
;    quotient byte that had just been computed in E.
bignum_div_100:
	ld c,100
bignum_div_C:
;Note: C<128
	ld b,(hl)	;number of bytes after the first one
	inc hl
	ld a,(hl)
;first byte: divide by repeated subtraction, E counts the quotient
	ld e,-1
	inc e : sub c : jr nc,$-2
	add a,c		;undo the last subtraction, A = remainder
	ld (hl),e
	inc b
	dec b
	ret z
.loop:
	inc hl
	ld e,(hl)
;shift the byte into the remainder bit by bit; quotient bits collect in E
	DUP 8
	sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
	EDUP
	ld (hl),e
	djnz .loop
	ret
;;


;;
;BC/DE ==> BC, remainder in HL
BC_Div_DE:
	ld hl,0
	ld a,b
	ld b,16
.loop:
	;shift the bits from BC into HL
	sla c
	rla
	adc hl,hl
	sbc hl,de
	jr nc,.inc_acc
	add hl,de
	db #FE		;this begins the instruction `cp *`, so it eats the next byte.
.inc_acc:
	inc c
	djnz .loop
	ld b,a
	ret
;;


;;
BC_Div_DE_faster:
;BC/DE ==> BC, remainder in HL
;NOTE: BC/0 returns 0 as the quotient.
;DE is negated on exit.
;min: 738cc
;max: 898cc
;avg: 818cc
;144 bytes
;DE = -DE so that "add hl,de" subtracts and its carry is the quotient bit
	xor a
	ld h,a
	ld l,a
	sub e
	ld e,a
	sbc a,a
	sub d
	ld d,a

	ld a,b
	DUP 8
	rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
	EDUP
	rla		;last quotient bit of the high byte
	ld b,a

	ld a,c
	DUP 8
	rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
	EDUP
	rla
	ld c,a

	ret
;;


;;
;BC/DE ==> BC, remainder in HL
;NOTE: BC/0 returns 0 as the quotient.
;min: 773cc
;max: 933cc
;avg: 853cc
;82 bytes
;
;DE is negated (and left negated), so `add hl,de` is the trial subtraction
;and its carry is the quotient bit. The 8-step core `.sub` runs twice: once
;via `call` for the high byte, once by falling into it for the low byte.
BC_Div_DE_fast:
    xor a
    ld h,a
    ld l,a
    sub e                   ;DE = -DE
    ld e,a
    sbc a,a
    sub d
    ld d,a

    ld a,b                  ;A = high dividend byte
    ld b,c                  ;park the low byte in B
    call .sub               ;C = high quotient byte
    ld a,b                  ;A = low dividend byte
    ld b,c                  ;B = high quotient byte, then fall through

.sub:
;min: 354cc
;max: 434cc
;avg: 394cc
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla                     ;collect the last quotient bit
    ld c,a
    ret
;;


;;
;BC/DE ==> BC, remainder in HL
;Two-stage division that skips 16-bit work where it can:
;  * D != 0: the high quotient byte is 0 and B is already the partial
;    remainder, so only the low byte is divided (16-bit remainder in HL).
;  * D == 0: the high byte is first divided by E with an 8-bit remainder
;    (.smalldiv), then the low byte goes through the common part.
;Every step leaves carry = NOT(quotient bit); the next rotate picks it up, so
;the collected byte is complemented at the end.
;FIX: both 8-step sequences lacked the closing rotate (`rla` / `rl b`), so
;the last quotient bit stayed in carry and a stale carry sat in bit 7
;(BC=1, DE=1 returned #8000). The ninth rotate also pushes out whatever carry
;entered the first one.
;NOTE(review): `.smalldiv` keeps the remainder in 8 bits, so it presumably
;needs E <= 128 - confirm.
;NOTE(review): an earlier routine in this file is also labelled BC_Div_DE;
;unless one of them is excluded by conditional assembly outside this view,
;one has to be renamed.
BC_Div_DE:
    ld hl,0
    inc d
    dec d
    jr z,.smalldiv
    ld l,b                  ;remainder so far = B (B < 256 <= DE)
    ld b,h                  ;high quotient byte = 0
.nextpart:
    ld a,c
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla                     ;pull in the last inverted quotient bit
    cpl
    ld c,a
    ret
.smalldiv:
    xor a
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b                    ;pull in the last inverted quotient bit
    ld l,a                  ;HL = remainder of the high byte (H is 0)
    ld a,b
    cpl
    ld b,a
    jp .nextpart
;;


;;
;Divides a 48-bit integer by 100, where A holds the upper 8 bits and L holds the next 8, followed by DE and IX
;Result is in HLDEIX, A is the remainder
;Destroys: BC (C = divisor)
;The number is consumed a byte at a time through AL_Div_C.rotate; the two
;upper quotient words wait on the stack while DE is reused for the IX word.
;FIX: the epilogue was `ex (sp),ix : pop hl`, which returned the untouched
;low dividend word in HL, put the top quotient word in IX and lost the low
;quotient word. It is now `ex (sp),hl : pop ix`.
ALDEIX_div_100:
    ld c,100
ALDEIX_Div_C:
;Note: C<128
    call AL_Div_C           ;HL = quotient bits 47..32
    push hl
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    call AL_Div_C.rotate    ;HL = quotient bits 31..16
    push hl
    push ix
    pop de                  ;DE = low dividend word
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    call AL_Div_C.rotate    ;HL = quotient bits 15..0
    pop de                  ;DE = quotient bits 31..16
    ex (sp),hl              ;HL = quotient bits 47..32, low word -> stack
    pop ix                  ;IX = quotient bits 15..0
    ret
;;


;;
;Divides a 32-bit integer by 100, where A holds the upper 8 bits and L holds the next 8, followed by DE
;Result is in DEHL, A is the remainder
ALDE_div_100:
    ld c,100
ALDE_Div_C:
;Note: C<128
    call AL_Div_C           ;HL = upper quotient word
    push hl
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    pop de                  ;DE = upper quotient word
    jp AL_Div_C.rotate      ;tail call finishes the lowest byte
;;


;;
AL_div_100:
;Divides a 16-bit integer by 100, where A holds the upper 8 bits and L holds the lower
;Result is in HL, A is the remainder
;min:256
;max:329
;avg:305.5625cc
    ld c,100
AL_Div_C:
;Note: C<128
;The upper byte is divided by repeated subtraction (H counts), then `.rotate`
;does eight restoring steps on L. `.rotate` is also an entry point for the
;wider routines: it divides (A:L) by C for A < C and leaves H alone.
    ld h,-1
    inc h : sub c : jr nc,$-2
    add a,c
.rotate:
    ;`sub c` rather than `sub a,c`: same operation, and the one-byte encoding
    ;the `$+4` skips rely on is then independent of how the assembler parses
    ;the two-operand spelling.
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : jr c,$+4 : sub c : inc l
    sla l : rla : cp c : ret c : sub c : inc l
    ret
;;


;;
HL_mod_3:
;destroys HL, returns HL mod 3 in A
;112+{0,2} + {0,8} + {0,1}
;min: 112
;max: 123
;avg: 117.5

; HL mod 3 == (H*256+L) mod 3 == (H*1+L) mod 3 == (H+L) mod 3
;So add the upper and lower byte
    ld a,h
    add a,l

;If adding caused an overflow, add (256 mod 3) == 1 to A.
    adc a,0                 ;this second addition cannot overflow again
;Falls through into A_mod_3.
;;
;destroys HL, returns A mod 3 in A
;97+{0,2} + {0,8} + {0,1}
;min: 97
;max: 108
;avg: 102.5
;A mod 3 is equal to adding the upper and lower nibble of A mod 3
;For example, if A=16u+l, then A mod 3 == 16u+l mod 3 == u+l
A_mod_3:
;So add the upper and lower nibble
    ld l,a                  ;save a copy of a
    add a,a
    add a,a
    add a,a
    add a,a
    add a,l

; If there was overflow, again, add 1. However, our number is shifted up by 4,
; so we need to add 1<<4 == 16
    jr nc,$+4
    add a,16

; Now our number is in the upper 4 bits of A. We need to add the top 2 bits to
; the preceding 2 bits

    ld l,a
    add a,a
    add a,a

; Note that now we might have some garbage bits in the middle 4 bits of A,
; overlapping two garbage bits in L. We'll need to clear out bits to avoid
; issues. It is convenient to use a mask of %11000000
    ld h,%11000000
    and h
    add a,l

;Now if there was overflow, add 1<<6 == #40. H "happens" to be -#40, so we can
;do this by subtracting h
    jr nc,$+3
    sub h

;Now finally, mask out all but those upper two bits
    and h

; At this point, we can stop if we only need to test divisibility
; If the parity is even, then we have to do (0 mod 3) or (3 mod 3), both of
; which are 0, indicating divisibility by 3. If we have odd parity, then the
; upper two bits are 10 or 01, both of which are not 0 mod 3.
; basically, pe==divisible, po==not divisible.
;
; But, to get full modulo, shift those upper two bits into the lower two bits
    rlca
    rlca
    ret po                  ;rlca leaves the parity flag from `and h` intact
; And make sure to set A to 0 if it was 0 or 3 :)
    xor a
    ret
;;


;;
;Inputs: HL
;Outputs: pe if HL was divisible by 3, else po.
;Destroys: HL
;103+{0,2}+{0,1}
;min: 103
;max: 106
;avg: 104.5
;Folds H and L into one byte (256 mod 3 == 1, so the carry is added back as
;1) and falls through into A_divisible_by_3.
HL_divisible_by_3:
    ld a,h
    add a,l
    adc a,0
;;
;Inputs: A
;Outputs: pe if A was divisible by 3, po if A was not divisible by 3
;Destroys: HL
;88+{0,2}+{0,1}
;min: 88
;max: 91
;avg: 89.5
;Adds the two nibbles of A (kept in the upper nibble), then the two bit pairs
;of that sum (kept in the top two bits), adding each end-around carry back.
;The top two bits end up as 00 or 11 (even parity) exactly when A is a
;multiple of 3.
A_divisible_by_3:
    ld h,#C0                ;mask for the top two bits; also equals -#40
    ld l,a                  ;save a copy of a
    add a,a
    add a,a
    add a,a
    add a,a
    add a,l                 ;upper nibble = high nibble + low nibble
    jr nc,$+4
    add a,16                ;end-around carry, scaled by 16
    ld l,a
    add a,a
    add a,a
    and h
    add a,l                 ;top two bits = sum of the nibble's bit pairs
    jr nc,$+3
    sub h                   ;end-around carry: + #40
    and h                   ;sets the parity flag on the two result bits
    ret
;;


;;
;;


;
;Logarithms (section title; the original non-ASCII text was garbled -
;presumably "Logarithms")
;
;Input: H.L needs to be on (0,128.0)
;Output: H.L if c flag set
;   returns nc if input is negative (HL not modified)
;Error:
;   The error on the outputs is as follows:
;       20592 inputs are exact
;       12075 inputs are off by 1/256
;       100 inputs are off by 2/256
;   So all 32767 inputs are within 2/256, with average error being <1/683 which is smaller than 1/256.
;Size: 177 bytes
;Speed: average speed is less than 1250 t-states
;
;Structure, as far as the code shows:
;  * HL == 0 returns with H = #80.
;  * HL == 2.0 returns the constant 177 in L.
;  * 1.0 <= HL < 2.0 is evaluated inline: an unrolled 8-bit division
;    followed by an unrolled 8x16 multiplication.
;  * everything else is scaled by a power of two into that range
;    (.normalizeln), lognat is called again, and exponent*355/2 is added.
lognat:
    ld a,h : or l : jr nz,$+5
    ld h,80h : ret
    dec h
    dec h
    jr nz,$+9
    inc l : dec l
    jr nz,.normalizeln
    ld l,177
    ret
    inc h
    jr nz,.normalizeln_2
    ld b,h
    ld c,l
    ld e,l
    ld d,8
    add hl,hl
    add hl,hl
    add hl,de
    ex de,hl
    ;call .HL_Div_DE
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : adc a,a
    ld h,a : ld l,b
    sla h : jr c,$+3 : ld l,c
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    rl l
    ld a,h
    adc a,b
    ld h,b
    ld l,a
    scf
    ret

; .HL_Div_DE:
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;   add hl,hl : sbc hl,de : adc a,a : ret

.normalizeln:
    inc h
.normalizeln_2:
    xor a
    inc h : ret m           ;negative input: return with carry clear
    ld d,a : ld e,a         ;DE = exponent counter
    ld a,l
    jr z,.toosmall
    inc e : srl h : rra : jr nz,$-4
    rla : rl h
    dec e
.stepin:
    ld l,a
    push de
    call lognat
    pop de
    ;now multiply DE by 355, then divide by 2 (rounding)
    ld b,d : ld c,e : ld a,d
    ex de,hl
    add hl,hl
    add hl,hl               ;4
    add hl,bc               ;5
    add hl,hl               ;10
    add hl,bc               ;11
    add hl,hl               ;22
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,bc
    add hl,hl
    add hl,bc
    sra h : rr l
    adc hl,de
    scf
    ret
.toosmall:
    dec d
    dec e : add a,a : jr nc,$-2
    inc h
    jp .stepin
;;


;;
;Input: HL is a fixed point number
;Output: lg(H.L)->H.L
;Speed: Avg: 340
;Table-driven log2. The value is shifted until it fits in A with its lowest
;set bit removed (B counts the shifts), A indexes the 128-word table, and the
;shift count is added to the looked-up word (BC is added with C = 0, so B
;lands in the integer byte).
;Returns early, HL unchanged, when bit 7 of H is set, and #8000 for HL == 0.
lg_88:
    ld de,.LUT
    ld b,0
    ld a,h
    or a
    ret m
    ld a,l
    jr z,$+8
    inc b : srl h : rra : jr nz,$-4
    or a : jr nz,$+6
    ld hl,8000h : ret
    rra : inc b : jr nc,$-2
    ;A is the element to look up in the LUT
    ld l,a
    ld c,h
    dec b
    add hl,hl
    add hl,de
    ld e,(hl)
    inc hl
    ld d,(hl)
    ex de,hl
    add hl,bc
    ret
;             0      1      2      3      4      5      6      7      8      9
.LUT:   DW  #F800, #F996, #FA52, #FACF, #FB2C, #FB76, #FBB3, #FBE8, #FC16, #FC3F    ; 0
        DW  #FC64, #FC86, #FCA5, #FCC1, #FCDC, #FCF4, #FD0B, #FD21, #FD36, #FD49    ; 1
        DW  #FD5C, #FD6D, #FD7E, #FD8E, #FD9D, #FDAC, #FDBA, #FDC8, #FDD5, #FDE2    ; 2
        DW  #FDEE, #FDFA, #FE06, #FE11, #FE1C, #FE26, #FE31, #FE3B, #FE44, #FE4E    ; 3
        DW  #FE57, #FE60, #FE69, #FE71, #FE7A, #FE82, #FE8A, #FE92, #FE9A, #FEA1    ; 4
        DW  #FEA9, #FEB0, #FEB7, #FEBE, #FEC5, #FECB, #FED2, #FED8, #FEDF, #FEE5    ; 5
        DW  #FEEB, #FEF1, #FEF7, #FEFD, #FF03, #FF09, #FF0E, #FF14, #FF19, #FF1E    ; 6
        DW  #FF24, #FF29, #FF2E, #FF33, #FF38, #FF3D, #FF42, #FF47, #FF4B, #FF50    ; 7
        DW  #FF55, #FF59, #FF5E, #FF62, #FF67, #FF6B, #FF6F, #FF74, #FF78, #FF7C    ; 8
        DW  #FF80, #FF84, #FF88, #FF8C, #FF90, #FF94, #FF98, #FF9B, #FF9F, #FFA3    ; 9
        DW  #FFA7, #FFAA, #FFAE, #FFB2, #FFB5, #FFB9, #FFBC, #FFC0, #FFC3, #FFC6    ; 10
        DW  #FFCA, #FFCD, #FFD0, #FFD4, #FFD7, #FFDA, #FFDD, #FFE0, #FFE4, #FFE7    ; 11
        DW  #FFEA, #FFED, #FFF0, #FFF3, #FFF6, #FFF9, #FFFC, #FFFF                  ; 12
;;


;;
;Inputs:
;   HL is an unsigned 8.8 fixed point number.
;Outputs:
;   HL is the signed 8.8 fixed point value of log base 2 of the input.
;Example:
;   pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
;averages about 39 t-states slower than original
;62 bytes
;
;The input is first normalised to 1.E (D = 1) while H.L collects the integer
;part of the result; then eight fraction bits are produced by squaring 1.E
;and renormalising whenever the square reaches 2.
;FIX: the branch taken when `srl d` reaches zero targeted `logloop-1`, a
;label that no longer exists (the loop is `.loop`). The intended target, the
;`inc d` just before the loop, now has its own label.
Log_2_88_size:
    ex de,hl
    ld hl,0
    ld a,d
    ld c,8                  ;eight fraction bits to produce
    or a
    jr z,.DE_lessthan_1
    srl d
    jr z,.enter             ;D was 1: the value is already 1.E
    inc l                   ;integer part of the result += 1
    rr e
    jr $-7                  ;back to `srl d`
.DE_lessthan_1:
    ld a,e
    dec hl
    or a
    ret z                   ;input 0: return -1 (#FFFF)
    inc l
    dec l                   ;one step down per left shift ...
    add a,a
    jr nc,$-2               ;... until the leading 1 falls out of A
    ld e,a

.enter:
    inc d                   ;D = 1, the implicit leading one
.loop:
    add hl,hl               ;make room for the next result bit
    push hl
    ld h,d
    ld l,e
    ld a,e
    ld b,8

    ;square DE: A:HL accumulates DE*E on top of the initial DE
    add hl,hl
    rla
    jr nc,$+5
    add hl,de
    adc a,0
    djnz $-7

    ld e,h
    ld d,a
    pop hl
    rr a                    ;this is NOT supposed to be rra, we need the z flag affected
    jr z,$+7                ;square < 2: result bit 0
    srl d                   ;square >= 2: halve it ...
    rr e
    inc l                   ;... and the result bit is 1
    dec c
    jr nz,.loop
    ret
;;


;;

;Input: HL is a fixed point number
;Output: ln(H.L)->H.L
;Speed: Avg: 340+(325 worst case)
;ln(x) = lg(x)*ln(2), with ln(2) approximated by 355/512.
ln_88_fixed:
    call lg_88
    ;signed multiply HL by 355, then drop 9 bits. The visible code truncates;
    ;no rounding term is added.
    ld de,0                 ;E becomes #FF if the operand was negative
    bit 7,h
    jr z,$+9
    dec e : xor a : sub l : ld l,a      ;HL = -HL
    sbc a,a : sub h : ld h,a
    ld b,h
    ld c,l
    xor a                   ;A:HL is the 24-bit product
    add hl,hl
    add hl,hl : rla         ;4
    add hl,bc : adc a,d     ;5
    add hl,hl : rla         ;10
    add hl,bc : adc a,d     ;11
    add hl,hl : rla         ;22
    add hl,hl : rla         ;44
    add hl,hl : rla         ;88
    add hl,hl : rla         ;176
    add hl,bc : adc a,d     ;177
    add hl,hl : rla         ;354
    add hl,bc : adc a,d     ;355
    sra a : rr h            ;>> 1 ...
    ld l,h                  ;... and >> 8
    ld h,a
    inc e
    ret nz                  ;operand was non-negative
    xor a : sub l : ld l,a  ;restore the sign
    sbc a,a : sub h : ld h,a
    ret
;;


;
;(section title; the original non-ASCII text was garbled)
;
;Power of two.
;Inputs:
;   HL is the 8.8 fixed point number 'x' for 2^x
;Outputs:
;   DEHL is the 24.8 fixed point result. If there was overflow exceeding 2^24, then this value is set to the max.
;Requires:
;   FPSqrtHL (defined elsewhere; called with HL, result used from HL)
;The fractional part is handled bit by bit from the lowest set bit upward,
;starting at 2.0: take the square root, and double whenever the next bit of
;the fraction is set. The result is then doubled once per integer step.
;FIX: with a zero fraction the start value was `ld hl,1`, i.e. 1/256 in 8.8.
;That contradicts both the documented 24.8 result and the `2*256` scale of
;the fractional path. It is now 256 (1.0).
power_2:
    ld a,l
    or a
    push hl                 ;save H for later, H is the integer part of the power
    ld hl,256               ;2^0 == 1.0
    jr z,.integer
    scf                     ;rotate a marker bit into A; it acts as the loop counter
;skip the trailing zero bits of the fraction
    rra
    jr nc,$-1
    ld hl,2*256
.loop:
    push af
    call FPSqrtHL           ;returns in HL
    pop af
    srl a
    jr z,.integer           ;only the marker was left: fraction done
    jr nc,.loop
    add hl,hl               ;this fraction bit is set
    jp .loop
.integer:
    pop bc
;Now b is the integer part for 2^x that we need to multiply HL by.
    ld de,0
    ld a,b
    or a
    ret z

    add hl,hl
    rl e : rl d : jr c,.wayoverflow
    djnz $-7
    ret
.wayoverflow:
    ld hl,-1                ;saturate DEHL to #FFFFFFFF
    ld d,h
    ld e,l
    ret
;;


;;
;Written by Zeda

; Requires                  ;!TEST
;   mul16                   ;BC*DE ==> DEHL
;   DEHL_Div_BC             ;DEHL/BC ==> DEHL
;"n choose r", defined as n!/(r!(n-r)!)
;Computes "HL choose DE"
;Inputs: HL,DE
;Outputs:
;   HL is the result
;       "HL choose DE"
;   carry flag reset means overflow
;Destroys:
;   A,BC,DE,IX
;Notes:
;   Overflow is returned as 0
;   Overflow happens if HL choose DE exceeds 65535
;   This algorithm is constructed in such a way that intermediate
;   operations won't erroneously trigger overflow.
;66 bytes
ncr_HL_DE:
    ld bc,1
    or a
    sbc hl,de
    jr c,.oob               ;DE > HL: result 0
    jr z,.exit              ;DE == HL: result 1
    sbc hl,de
    add hl,de
    jr c,$+3
    ex de,hl                ;keep the smaller of r and n-r in HL
    ld a,h
    or l
    push hl
    pop ix                  ;IX = loop bound
.exit:
    ld h,b
    ld l,c
    scf
    ret z
.loop:
    push bc : push de
    push hl : push bc
    ld b,h
    ld c,l
    call mul16              ;BC*DE ==> DEHL
    pop bc
    call DEHL_Div_BC        ;result in DEHL
    ld a,d
    or e
    pop bc
    pop de
    jr nz,.overflow
    add hl,bc
    jr c,.overflow
    pop bc
    inc bc
    ld a,b
    cp ixh
    jr c,.loop
    ld a,ixl
    cp c
    jr nc,.loop
    ret
.overflow:
    pop bc
    xor a
    ld b,a
.oob:
    ld h,b
    ld l,b
    ret
;;


;;
;Inputs: DE,HL
;Outputs: c flag set if HL is not divisible by DE, else c flag is reset.
;   HL is 0 if true.
;Destroys: A, DE (common factors of 2 are shifted out of DE)
;See below for a note on the motivation and development of this algorithm.
;FIX: the subtract/halve loop used to shift a copy of L taken *before*
;`sbc hl,de`, and it subtracted before making HL odd, so e.g. 9 / 3 was
;reported as not divisible. The loop now works on HL itself and strips the
;factors of 2 first. HL == 0 (divisible by every non-zero DE) is answered
;up front.
isDivisible:
    ld a,d : or e : ccf : ret z         ;remove this if DE is always guaranteed non-zero
;steps 1 and 2
    ld a,e : or l : rra : jr c,.step2   ;\
    srl d : rr e : rr h : rr l          ; | (E is even here, so `rr h` shifts in a 0)
    ld a,e : or l : rra : jr nc,$-11    ; |Remove these if DE is always guaranteed odd at input.
.step2:                                 ; |
;step 3
    ld a,e : rra : ccf : ret c          ;/
;steps 4 and 5
    ld a,h
    or l
    ret z                   ;HL == 0: divisible (`or` left carry clear)
    bit 0,l
    jr nz,.loop             ;already odd
.halve:
    srl h
    rr l
    bit 0,l
    jr z,.halve
.loop:
    or a                    ;clear carry for sbc
    sbc hl,de
    ret c                   ;went below zero: not divisible
    ret z                   ;hit zero exactly: divisible (carry clear)
    jr .halve               ;odd - odd is even: at least one halving
;Motivation and Development
;   I often find myself in a situation where I need to find the factors of a number, but I have no technology around to aid me. This means I need to use... mental arithmetic!
;   I've been doing this for 15 years, so I have refined my mental process quite a bit.
;   It is still a trial division algorithm, but with a very obfuscated "division" technique.
;   We don't need to do 1131/7 to see if it is divisible by 7, we just need to see if 7 divides 1131 and this is what my algorithm does.
;   Interestingly, testing divisibility at the algorithmic level is a little faster than division. Not by much, but it is also non-negligible.
;The Algorithm
;   The core algorithm is designed around checking that (A mod B == 0) is true or false.
;   We also make the assumption that B is odd and by extension, non-zero.
;   The case where B is non-zero and even will be discussed later.
;
;   Since B is odd, 2 does not divide B. This means that if A is even:
;       (A mod B == 0) if and only if (A/2 mod B)==0.
;   We also know by the definition of divisibility that
;       (A mod B) == (A+c*B mod B)
;   where c is any integer. Combining all this, we have an algorithm:
;
;   1] Remove all factors of 2 from A
;   2] With A now odd, do A=A-B
;       If the result is zero, that means (A mod B == 0)
;       If the result underflows (becomes "negative", or on the Z80, sets the carry flag), it means that A was somewhere on [1,B-1], so it is not divisible by B.
;   3] Continue back at 1.
;
;   Now suppose B is allowed to be non-zero and even. Then B is of the form d*2^k where d is odd.
;   This just means there are some factors of 2 that can be removed from B until it is odd.
;   The only way A is divisible by B, is if it has the same number or more of factors of 2 as B.
;   If we factor out common factors of 2 and find B is still even, then A is not divisible by B.
;   Otherwise we have an odd number and only need to check the new (A mod d)
;   for which we can use the "odd algorithm" above.
;   So putting it all together:
;
;   1] If B==0, return FALSE.
;   2] Remove common factors of 2 from A and B.
;   3] If B is even, return FALSE.
;   4] Remove all factors of 2 from A.
;   5] Subtract B from A (A=A-B).
;       If the result is zero, return TRUE.
;       If the result is "negative" (setting the carry flag on many processors), return FALSE.
;   6] Repeat at 4]
;
;   The overhead steps are 1] to 3].
;   The iterated steps are 4] and 5].
;   Because 5 always produces an even number, when it then performs step 4, it always divides by at least one factor of 2.
;   This means the algorithm takes at most 1+ceil(log2(A))-floor(log2(B)) iterations.
;   For example, if A is a 37-bit number and B is a 13-bit number, this takes at most 38-13 = 25 iterations.
;   However, in practice it is usually slightly less.
;Example Time:
;   Say I wanted to test if 1337 is divisible by 17.
;   Since 17 is odd, we can proceed.
;   1337 is odd, so no factors of 2 to remove.
;   1337-17 == 1320.
;   1320/2 == 660
;   660/2 == 330
;   330/2 == 165
;   165-17 == 148
;   148/2 == 74
;   74/2 == 37
;   37-17 == 20
;   20/2 == 10
;   10/2 == 5
;   5-17 = -12
;
;   So 1337 is not divisible by 17.
;Now test divisibility by 7:
;1337 => 1330
;=>665
;=>658
;=>329
;=>322
;=>161
;=>154
;=>77
;=>70
;=>35
;=>28
;=>14
;=>7
;=>0
;
;   So 1337 is divisible by 7.
;;


;;
;Adds two little-endian 16-digit BCD integers (8 bytes)
;Input:
;   HL points to one BCD integer
;   DE points to another BCD integer
;Output:
;   The sum is written over the integer at DE (every store is `ld (de),a`);
;   the integer at HL is only read.
;   HL and DE point to the last (most significant) byte of their integers.
;   Carry is the decimal carry out of the top byte.
;46 bytes, 284cc
addBCD_16:
    ld a,(de) : add a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a : inc hl : inc de
    ld a,(de) : adc a,(hl) : daa : ld (de),a
    ret
;;


;;
;gcd(HL,DE)->HL
;Output:
;   B=0
;   HL is the GCD of the inputs
;Destroys:
;   A,DE
;   DE is guaranteed 0 unless the output is 0 (which only happens if one of the inputs is 0).
;Uses the binary GCD algorithm
;65 bytes
;FIX: the branch closing the common-factor loop had been garbled into
;`.c,gcd_cofactor_2_loop`; it is `jr nc,.cofactor_2_loop` again.
;FIX: B was incremented both at `.cofactor_2_loop` and at
;`.test_cofactor_of_2`, i.e. twice per shared factor of 2, so the result was
;shifted left twice as far as it should be (gcd(4,6) gave 4). B is now
;1 + (number of shared factors of 2), which is what the closing `dec b` /
;`djnz` expects.
gcdHL_DE:
;B is our cofactor-of-2 counter
    ld b,0
;If HL=0, return 0
    ld a,h : or l : ret z
;If DE=0, return 0
    ex de,hl
    ld a,h : or l : ret z
    inc b                   ;B = 1 + shared factors of 2
    jr .test_cofactor_of_2
.cofactor_2_loop:
    inc b
    srl h : rr l
    srl d : rr e
.test_cofactor_of_2:
    ld a,e
    or l
    rra
    jr nc,.cofactor_2_loop  ;both still even

.remove_factors_of_2_op2:
    ;shift right until a 1 drops out, then put that 1 back: HL becomes odd
    srl h : rr l : jr nc,.remove_factors_of_2_op2
    adc hl,hl
    jr .swap_ops

.swap_ops_negate:
;At this point, HL needs to be negated and swapped with DE
    xor a : sub l : ld l,a : sbc a,a : sub h : ld h,a
.swap_ops:
    ex de,hl
.remove_factors_of_2_op1:
    srl h : rr l : jr nc,.remove_factors_of_2_op1
    adc hl,hl               ;HL odd again, carry clear for the sbc
    sbc hl,de
    jr c,.swap_ops_negate
    jp nz,.remove_factors_of_2_op1

;DE is the GCD, need to shift it left B-1 times.
    ex de,hl
    dec b
    ret z
    add hl,hl : djnz $-1
    ret
;;


;;
;Converts the byte in A to two hexadecimal ASCII characters stored at (HL).
; Example: A=#31 stores the bytes #33,#31 (the characters "31") at HL.
; 112 T
; Writes two upper-case hexadecimal ASCII digits for A to (HL) and (HL+1).
; On return HL points at the second digit, A holds that digit and B holds
; the original byte.
; Each nibble n (0..15) is converted with the decimal-adjust idiom
; `add a,#90 : daa : adc a,#40 : daa`, which yields '0'..'9' or 'A'..'F'.
ByteToStrHEX:
    ld b,a                  ; keep the byte for the low digit
    rrca                    ; swap the nibbles ...
    rrca
    rrca
    rrca
    and #0F                 ; ... and keep the former high nibble
    add a,#90
    daa
    adc a,#40
    daa
    ld (hl),a               ; first character: high nibble
    inc hl
    ld a,b
    and #0F
    add a,#90
    daa
    adc a,#40
    daa
    ld (hl),a               ; second character: low nibble
    ret
;;


;
;Comparisons (section title; the original non-ASCII text was garbled -
;presumably "Comparison")
;
;These code snippets are for 16-bit comparisons.
;"I learned these from calc84maniac"
;"These have similar flags to that of the `cp` instruction. At the very least,
; you get the zero and carry flag identical."

;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;
;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;
;Inputs:
;   HL, DE
;Outputs:
;   z flag is set if HL=DE, else nz
;   c flag is set if HL