;
; Section: arctangent routines (the original section title was mis-encoded)
;
arctan_88:
;Input:
;   D.E
;Output: atan(D.E)->D.E
;How it works (as implemented below):
;   * The sign is taken from the saved input and re-applied at the end.
;   * An input of exactly +-1.0 returns +-201 directly.
;   * If the integer byte is non-zero, the value is replaced by 256/DE
;     (.DEgt1_Inv) and .adjustatan later returns 402 minus the table result.
;   * The fraction byte is mapped through eight small-offset ranges and, from
;     E=119 upward, through .LUT.
;NOTE(review): only the integer byte D is negated when forming the magnitude;
;E is left untouched. Confirm this matches the intended 8.8 sign convention.
    push de                 ;keep the original input, its sign is needed later
    ld a,d
    or a
    jp p,$+5                ;skip the `neg` when D is already non-negative
    neg
    ld d,a
    dec a
    jr nz,.checkneedinv
    inc e : dec e : jr nz,.checkneedinv
;D=1 and E=0 here: return +-201 according to the sign of the saved input
    pop af : rla : ld de,201 : ret nc : ld de,-201 : ret
.checkneedinv:
    inc a                   ;A is the magnitude of D again
    call nz,.DEgt1_Inv
;0.E is the value to atan
    ld hl,.adjustatan
    push hl                 ;every `ret` below continues at .adjustatan
    ld a,e
    cp 46 : ret c
    dec a : cp 42h : ret c
    dec a : cp 4Eh : ret c
    dec a : cp 57h : ret c
    dec a : cp 5Eh : ret c
    dec a : cp 64h : ret c
    dec a : cp 6Ah : ret c
    dec a : cp 6Fh : ret c
    sub 6Fh : ld e,a
    ld hl,.LUT
    add hl,de               ;D is 0 on both paths that reach this point
    ld a,(hl)
    ret
.adjustatan:
    ld e,a
    pop bc                  ;BC = original input
    ld a,b
    or a
    jp p,$+5                ;skip the `neg`
    neg
    jr z,$+9                ;integer part was 0: no reciprocal was taken, skip the 402-DE step
    ld hl,402
    or a
    sbc hl,de
    ex de,hl
    rl b                    ;carry = sign of the original input
    ret nc
;negate DE
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ret
.DEgt1_Inv:
;Works if DE>1
;Returns E = 8-bit quotient of 256*256/DE and D = 0
    ld hl,256
    ld b,8
.InvLoop:
    add hl,hl
    sbc hl,de
    jr nc,$+3
    add hl,de
    adc a,a                 ;collects the inverted quotient bits
    djnz .InvLoop
    cpl
    ld e,a
    ld d,b                  ;B is 0 after the djnz loop
    ret
;Fix: every table row now carries its own DB; previously only row 0 did.
;        0    1    2    3    4    5    6    7    8    9
.LUT:
    DB #6F, #6F, #70, #71, #72, #73, #73, #74, #75, #76   ; 0
    DB #77, #77, #78, #79, #7A, #7B, #7B, #7C, #7D, #7E   ; 1
    DB #7F, #7F, #80, #81, #82, #82, #83, #84, #85, #85   ; 2
    DB #86, #87, #88, #88, #89, #8A, #8B, #8B, #8C, #8D   ; 3
    DB #8E, #8E, #8F, #90, #90, #91, #92, #93, #93, #94   ; 4
    DB #95, #95, #96, #97, #97, #98, #99, #9A, #9A, #9B   ; 5
    DB #9C, #9C, #9D, #9E, #9E, #9F, #A0, #A0, #A1, #A2   ; 6
    DB #A2, #A3, #A3, #A4, #A5, #A5, #A6, #A7, #A7, #A8   ; 7
    DB #A9, #A9, #AA, #AA, #AB, #AC, #AC, #AD, #AD, #AE   ; 8
    DB #AF, #AF, #B0, #B0, #B1, #B2, #B2, #B3, #B3, #B4   ; 9
    DB #B5, #B5, #B6, #B6, #B7, #B7, #B8, #B9, #B9, #BA   ; 10
    DB #BA, #BB, #BB, #BC, #BC, #BD, #BE, #BE, #BF, #BF   ; 11
    DB #C0, #C0, #C1, #C1, #C2, #C2, #C3, #C3, #C4, #C4   ; 12
    DB #C5, #C6, #C6, #C7, #C7, #C8, #C8, #C9             ; 13
;;
;;
atan8:
;computes 256*atan(A/256)->A
;56 bytes including the LUT
;min: 246cc
;max: 271cc
;avg: 258.5cc
;The top 3 bits of A select a table entry, the low 5 bits interpolate
;linearly towards the next entry.
    rlca
    rlca
    rlca
    ld d,a                  ;D = rotated input: fraction in bits 7..3, index in bits 2..0
    and 7
    ld hl,.LUT
    add a,l
    ld l,a
  if (.LUT & 255) > 248     ;this section not included in size/speed totals
    jr nc,$+3               ;can add three bytes, 12cc to max, 11cc to min, and 11.5cc to avg
    inc h
  endif
    ld c,(hl)               ;C = table value at the index
    inc hl
    ld a,(hl)
    sub c                   ;A = step to the next table value
    ld e,0
    ex de,hl
    ld d,l                  ;D = 0
    ld e,a                  ;DE = step, H = rotated input, L = 0
;multiply the 5 fraction bits by the step
    sla h : jr nc,$+3 : ld l,e
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl
    add hl,hl
    add hl,hl
;    add hl,hl              ;used in rounding...
    ld a,h
;    rra                    ;but doesn't seem to improve the error
;NOTE(review): with the rounding step commented out, the carry consumed by the
;`adc` below is the bit shifted out by the last `add hl,hl`, i.e. bit 5 of the
;original input. Confirm this is intended.
    adc a,c
    ret
.LUT:
    DB 0,32,63,92,119,143,165,184,201
;;
;;
atanE:
;returns H=256*arctan(E/256)
;min: 496cc
;max: 539cc
;avg: 517.5cc
;Computes (201*E + 70*(E*(256-E) taken as 0.16 fixed point)) / 256.
;multiply E by 201
    ld d,0
    ld h,d
    ld l,e
    add hl,hl
    add hl,de               ;3E
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,de               ;25E
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,de               ;201E
    ld b,h
    ld c,l
;E*(256-E)
    xor a
    ld d,a
    sub e
    ld h,a                  ;H = 256-E (mod 256), DE = E
    ld l,d
    sla h : jr nc,$+3 : ld l,e
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
;.HL*70
    ld d,h
    ld e,l
    xor a
    add hl,hl
    add hl,hl : rla         ;rla needed for the case when input = 128 :(
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,0
    add hl,hl : rla
    add hl,de : adc a,0
    add hl,hl : rla
    ld l,h
    ld h,a                  ;HL = top 16 bits of the 24-bit product in A:HL
    add hl,bc
    ret
;;
;
; Section: 16-bit subtraction helpers (the original section title was mis-encoded)
;
;written by calc84maniac
;comment from calc84maniac:
; To clarify why I did a cpl/scf/adc instead of a cpl/inc/add or neg/add,
; is that it handles the case of A=0 properly. Typically, SUB N and
; ADD A,-N give opposite carry outputs, but SUB 0 and ADD A,-0 both reset the
; carry flag. On the other hand, SCF : ADC A,255 will set the carry flag like
; we want it to.
; BC=BC-A
BC_Minus_A:
    cpl
    scf
    adc a,c                 ;A = C-A; carry is set when no borrow from B is needed
    ld c,a
    ret c
    dec b
    ret
;;
;;
;via calc84maniac
;"Optimized routine for HL=A-HL (the negate HL optimization can be derived from this by setting A=0 first)"
A_Minus_HL:
    sub l
    ld l,a
    sbc a,a                 ;A = 0 or -1 depending on the borrow
    sub h
    ld h,a
    ret
;;
;
; Section: square roots (the original section title was mis-encoded)
;
;Adapted from Axe
;Inputs: A.C
;Output: D.E contains the squareroot
;speed: 1482+12{0,17}
;min: 1482cc
;max: 1686cc
;avg: 1584cc
;35 bytes
sqrtfixed_88:
    ld b,12
    ld de,0
    ld h,d
    ld l,e
.Loop:
    sub #40
    sbc hl,de
    jr nc,.Skip
    add a,#40               ;trial subtraction failed: restore
    adc hl,de
.Skip:
    ccf
    rl e                    ;shift the new result bit into DE
    rl d
    sla c
    rla
    adc hl,hl
    sla c
    rla
    adc hl,hl
    djnz .Loop
    ret
;;
;;
;returns HL as the sqrt, DE as the remainder
;33 bytes
;min: 928cc
;max: 1120cc
;avg: 1024cc
;928+8{24,0}
sqrtDE:
    ld b,#80
    xor a
    ld h,a
    ld l,a
.sqrt_loop:
    srl b
    rra
    ld c,a
    add hl,bc
    ex de,hl
    sbc hl,de
    jr nc,.next
    add hl,de
    ex de,hl
    or a
    sbc hl,bc
    DB #DA                  ;start of jp c,** which is 10cc to skip the next two bytes.
.next:
    ex de,hl
    add hl,bc
    srl h
    rr l
    srl b
    rra
    jr nc,.sqrt_loop
    ret
;;
;;
;Written by Zeda
;Input: A.E ==> D.E
;Output: DE is the sqrt, AHL is the remainder
;Speed: 690+6{0,13}+{0,3+{0,18}}+{0,38}+sqrtA
;min: 855cc
;max: 1003cc
;avg: 924.5cc
;152 bytes
;Fix: the input fraction byte E is now saved across `call sqrtA`. The sqrtA
;in this file starts with `ld de,5040h` and documents "Destroys: E", so the
;previous `ld a,e` after the call did not see the input. The push/pop/ld adds
;25cc and 3 bytes to the figures quoted above. No additional register is
;destroyed (HL was already overwritten here).
sqrtfixed_88:
    push de                 ;save the input fraction byte
    call sqrtA              ;D = sqrt of the integer byte, A = remainder
    pop hl
    ld e,l                  ;E = input fraction byte again
    ld l,a
    ld a,e
    ld h,0
    ld e,d
    ld d,h
    sla e
    rl d
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.next
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.next2
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next2:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.next3
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next3:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.next4
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next4:
    inc e
;Now we have four more iterations
;The first two are no problem
    sll e : rl d
    add hl,hl
    add hl,hl
    sbc hl,de
    jr nc,.next5
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next5:
    inc e
    sll e : rl d
    add hl,hl
    add hl,hl
    sbc hl,de
    jr nc,.next6
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.next6:
    inc e
.iter11:
;On the next iteration, HL might temporarily overflow by 1 bit
    sll e : rl d            ;sla e : rl d : inc e
    add hl,hl
    add hl,hl
    jr c,.iter11_br0
;
    sbc hl,de
    jr nc,.next7
    add hl,de
    dec e
    jr .iter12
.iter11_br0:
    or a
    sbc hl,de
.next7:
    inc e
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
.iter12:
    ld b,a                  ;A is 0, so B is 0
    add hl,hl
    add hl,hl
    rla
;AHL - (DE+DE+1)
    sbc hl,de : sbc a,b
    inc e
    or a
    sbc hl,de : sbc a,b
    ret p
    add hl,de
    adc a,b
    dec e
    add hl,de
    adc a,b
    ret
;;
;;
; FASTEST
;Written by Zeda
;Input: A
;Output: D is the squareroot, A is the remainder (input-D^2)
;Destroys: E
;speed: 118+{0,6}+{0,7}+{0,7}+{0,3}
;min: 118cc
;max: 141cc
;avg: 129.5cc
;38 bytes
sqrtA:
    ld de,5040h
    sub e
    jr nc,.skip1
    add a,e
    ld d,10h
.skip1:
; ------
    cp d
    jr c,.skip2
    sub d
    set 5,d
.skip2:
; ------
    res 4,d
    srl d
    set 2,d
    cp d
    jr c,.skip3
    sub d
    set 3,d
.skip3:
    srl d
; ------
    inc a
    sub d
    jr nc,.skip4
    dec d
    add a,d
.skip4:
    srl d
    ret
;;
;;
;Input: HLDE
;Output: DE is the sqrt, AHL is the remainder
;speed: 238+{0,1}+{0,44}+sqrtHL+3*.sub_2+.iter15
;min: 1260
;max: 1506
;avg: 1377.75
sqrt32:
    push de                 ;the low word is needed after sqrtHL
    call sqrtHL
    pop bc
    add a,a
    ld e,a
    jr nc,.skip
    inc d
.skip:
    ld a,b
    call .sub_2
    call .sub_2
;Now we have four more iterations
;The first two are no problem
    ld a,c
    call .sub_2
;On the next iteration, HL might temporarily overflow by 1 bit
    call .iter15
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
.iter16:
    add a,a
    ld b,a                  ;either 0x00 or 0x80
    adc hl,hl
    rla
    adc hl,hl
    rla
;AHL - (DE+DE+1)
    sbc hl,de : sbc a,b
    inc e
    or a
    sbc hl,de : sbc a,b
    ret p
    add hl,de
    adc a,b
    dec e
    add hl,de
    adc a,b
    ret
.sub_2:
;min: 185cc
;max: 231cc
;avg: 208cc
;runs .iter17 twice: once through the call, once by falling through
    call .iter17
.iter17:
;min: 84cc
;max: 107cc
;avg: 95.5cc
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    inc e
    ret nc
    dec e
    add hl,de
    dec e
    ret
.iter15:
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc
    sll e : rl d            ;sla e : rl d : inc e
    add a,a
    adc hl,hl
    add a,a
    adc hl,hl               ;This might overflow!
    jr c,.iter15_br0
;
    sbc hl,de
    inc e
    ret nc
    dec e
    add hl,de
    dec e
    ret
.iter15_br0:
    or a
    sbc hl,de
    inc e
    ret
;;
;;
;Inputs:
;     L is the value to find the square root of
;Outputs:
;     C is the result
;     B,L are 0
;     DE is not changed
;     H is how far away it is from the next smallest perfect square
;     L is 0
;     z flag set if it was a perfect square
;Destroyed:
;     A
;287+7x, x is the number of bits in the result
;min: 287
;max: 315
;19 bytes
SqrtL:
    ld bc,#400              ;B = 4 iterations, C = 0
    ld h,c
.Loop:
    add hl,hl
    add hl,hl
    rl c
    ld a,c
    rla
    sub a,h
    jr nc,$+5               ;skip the three one-byte instructions below
    inc c
    cpl
    ld h,a
    djnz .Loop
    ret
;;
;;
;Input: HLIX
;Output: DE is the sqrt, AHL is the remainder
;speed: 751+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL
;min: 1103
;max: 1237
;avg: 1165.5
;166 bytes
;Fix: the first instruction used to call `.sqrtHL`, a local label that is not
;defined inside this routine. It now calls the global sqrtHL, as sqrt32 does.
sqrtHLIX:
    call sqrtHL             ;expects returns A as sqrt, HL as remainder, D = 0
    add a,a
    ld e,a
    rl d
    ld a,ixh
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip1
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip1:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip2
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip2:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip3
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip3:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip4
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip4:
    inc e
;Now we have four more iterations
;The first two are no problem
    ld a,ixl
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip5
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip5:
    inc e
    sll e : rl d
    add a,a : adc hl,hl
    add a,a : adc hl,hl
    sbc hl,de
    jr nc,.skip6
    add hl,de
    dec e
    DB #FE                  ;start of `cp *`
.skip6:
    inc e
.iter15:
;On the next iteration, HL might temporarily overflow by 1 bit
    sll e : rl d            ;sla e : rl d : inc e
    add a,a
    adc hl,hl
    add a,a
    adc hl,hl               ;This might overflow!
    jr c,.iter15_br0
;
    sbc hl,de
    jr nc,.skip7
    add hl,de
    dec e
    jr .iter16
.iter15_br0:
    or a
    sbc hl,de
.skip7:
    inc e
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
.iter16:
    add a,a
    ld b,a                  ;either 0x00 or 0x80
    adc hl,hl
    rla
    adc hl,hl
    rla
;AHL - (DE+DE+1)
    sbc hl,de : sbc a,b
    inc e
    or a
    sbc hl,de : sbc a,b
    ret p
    add hl,de
    adc a,b
    dec e
    add hl,de
    adc a,b
    ret
;;
;;
; very fastest 16-bit isqrt by Zeda Thomas
;Feel free to use for whatever :)
;Input: HL
;Output: A is the integer square root of HL
;Destroys: HL,DE (D is actually 0)
;min: 343cc
;max: 380cc
;avg: 361.5cc
;88 bytes
sqrtHL:
    ld de,05040h
    ld a,h
    sub e
    jr nc,.sq7
    add a,e
    ld d,16
.sq7:
; ----------
    cp d
    jr c,.sq6
    sub d
    set 5,d
.sq6:
; ----------
    res 4,d
    srl d
    set 2,d
    cp d
    jr c,.sq5
    sub d
    set 3,d
.sq5:
    srl d
; ----------
    inc a
    sub d
    jr nc,.sq4
    dec d
    add a,d
    dec d                   ; <-- this resets the low bit of D, so `srl d` resets carry.
.sq4:
    srl d
    ld h,a
; ----------
    ld a,e
    sbc hl,de
    jr nc,.sq3
    add hl,de
.sq3:
    ccf
    rra
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
    jr c,.sq2
    or #20
    db 254                  ; <-- start of `cp *` which is 7cc to skip the next byte.
.sq2:
    add hl,de
    xor #18
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
    jr c,.sq1
    or 8
    db 254                  ; <-- start of `cp *` which is 7cc to skip the next byte.
.sq1:
    add hl,de
    xor 6
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
;This code would restore the square root
;    jr nc,.sq0
;    add hl,de              ; | 12cc or 18cc
; .sq0:
    sbc a,255
    srl d
    rra
    ret
;;
;;
; FASTEST
;written by Zeda
;returns A as the sqrt, HL as the remainder, D = 0
;min: 352cc
;max: 391cc
;avg: 371.5cc
sqrtHL:
    ld de,#5040
    ld a,h
    sub e
    jr nc,.sq7
    add a,e
    ld d,16
.sq7:
; ----------
    cp d
    jr c,.sq6
    sub d
    set 5,d
.sq6:
; ----------
    res 4,d
    srl d
    set 2,d
    cp d
    jr c,.sq5
    sub d
    set 3,d
.sq5:
    srl d
; ----------
    inc a
    sub d
    jr nc,.sq4
    dec d
    add a,d
    dec d                   ; <-- this resets the low bit of D, so `srl d` resets carry.
.sq4:
    srl d
    ld h,a
; ----------
    ld a,e
    sbc hl,de
    jr nc,.sq3
    add hl,de
.sq3:
    ccf
    rra
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
    jr c,.sq2
    or #20
    db 254                  ; <-- start of `cp *` which is 7cc to skip the next byte.
.sq2:
    add hl,de
    xor #18
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
    jr c,.sq1
    or 8
    db 254                  ; <-- start of `cp *` which is 7cc to skip the next byte.
.sq1:
    add hl,de
    xor 6
    srl d
    rra
; ----------
    ld e,a
    sbc hl,de
    jr nc,.sq
    add hl,de               ;last trial subtraction failed: restore the remainder
    srl d
    rra
    ret
.sq:
    inc a
    srl d
    rra
    ret
;;
;;
;Adapted from Axe
;Input: HL
;Output: D is the square root, cH is the remainder (c being the c flag), A is 0, B is 0, L is 0
;speed: 758+8{0,6}
;min: 758cc
;max: 806cc
;avg: 782cc
;26 bytes
sqrtHL:
;p_Sqrt:
    ld a,l
    ld l,h
    ld de,#0040
    ld h,d
    ld b,8
    or a
.Loop:
    sbc hl,de
    jr nc,.Skip
    add hl,de
.Skip:
    ccf
    rl d
    add a,a
    adc hl,hl
    add a,a
    adc hl,hl
    djnz .Loop
    ret
;;
;
;RND
;
;Inputs: (seed1), (seed2), and (seed3) are 16-bit seeds. (seed1) and (seed2) can't both be 0.
;Outputs: HL is the pseudorandom number
;Destroys: A,DE,BC
;cycle: 281,474,976,645,120
;It would take about 185 years at 15MHz to repeat
;min: 258cc (236cc if using ENABLE_SMC)
;max: 288cc (266cc if using ENABLE_SMC)
;avg: 273cc (251cc if using ENABLE_SMC)
;63 bytes (62 bytes if using ENABLE_SMC)
;A 32-bit xorshift (seed1/seed2) whose result is added to a 16-bit counter
;(seed3) that is stepped with `inc hl : inc h` on every call.
xsp32:
  ifdef ENABLE_SMC
.seed1 equ $+1
    ld hl,12345
.seed2 equ $+1
    ld de,6789
  else
    ld hl,(.seed1)
    ld de,(.seed2)
  endif
;first, XOR it with itself, shifted left 23 bits
;low bit of d needs to be shifted in
    ld a,h
    rra
    ld a,l
    rra
    jr nc,.skip1
    rl e
    ccf
    rr e
.skip1:
    xor d
    ld d,a
;XOR it with itself, shifted right 15 bits
    ld a,h
    rla
    ld a,e
    rla
    xor l
    ld l,a
    ld a,e
    rla
    ld a,d
    rla
    jr nc,.skip2
    rr e
    ccf
    rl e
.skip2:
    xor h
    ld h,a
;XOR it with itself, shifted left 17 bits
;HL<<1
    ld (.seed1),hl
    add hl,hl
    ld a,h
    xor d
    ld h,a
    ld a,l
    xor e
    ld l,a
    ld (.seed2),hl
    ex de,hl
  ifdef ENABLE_SMC
.seed3 equ $+1
    ld hl,33333
  else
    ld hl,(.seed3)
  endif
    inc hl
    inc h
    ld (.seed3),hl
    add hl,de
    ret
;;
;;
;32-bit xorshift
;seed^=seed<<23
;seed^=seed>>15
;seed^=seed<<17
;min: 209cc (193cc if using ENABLE_SMC)
;max: 239cc (223cc if using ENABLE_SMC)
;avg: 224cc (208cc if using ENABLE_SMC)
;53 bytes (52 bytes if using ENABLE_SMC)
xs32:
  ifdef ENABLE_SMC
.seed1 equ $+1
    ld hl,12345
.seed2 equ $+1
    ld de,6789
  else
    ld hl,(.seed1)
    ld de,(.seed2)
  endif
;first, XOR it with itself, shifted left 23 bits
;low bit of d needs to be shifted in
    ld a,h
    rra
    ld a,l
    rra
    jr nc,.skip1
    rl e
    ccf
    rr e
.skip1:
    xor d
    ld d,a
;XOR it with itself, shifted right 15 bits
    ld a,h
    rla
    ld a,e
    rla
    xor l
    ld l,a
    ld a,e
    rla
    ld a,d
    rla
    jr nc,.skip2
    rr e
    ccf
    rl e
.skip2:
    xor h
    ld h,a
;XOR it with itself, shifted left 17 bits
;HL<<1
    ld (.seed1),hl
    add hl,hl
    ld a,h
    xor d
    ld h,a
    ld a,l
    xor e
    ld l,a
    ld (.seed2),hl
    ret
;;
;;
;You may use this routine, just be sure to credit John Metcalf!
;Written by John Metcalf
;   http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html
;
; Annotated by Zeda Thomas, fixed typo (86 cycles==> 82 cycles)
;Note: uses ENABLE_SMC (Self Modifying Code)
; 16-bit xorshift pseudorandom number generator
; 20 bytes, 82 cycles (excluding ret)
; returns   hl = pseudorandom number
; corrupts   a
xrnd:
    ld hl,1             ; Init the seed, must not be 0
    ld a,h              ;\
    rra                 ; | Get the top bits of xs<<7 and xor with the top byte of HL
    ld a,l              ; |     abcdefgh ijklmnop
    rra                 ; |    ^hijklmno 00000000
    xor h               ; | Note that we still need to xor the 'p' with the top byte of l
    ld h,a              ;/
    ld a,l              ;\
    rra                 ; | we get 'p' in the carry flag, now shift that in when we do xs>>9
    ld a,h              ; |     abcdefgh ijklmnop (new value)
    rra                 ; |    ^00000000 pabcdefg
    xor l               ; | the 'p' is leftover from the first step, so now Step 1 and 2 are done
    ld l,a              ;/
    xor h               ;\ Finally, xor the bottom byte with the top byte for step 3
    ld h,a              ;/
    ld (xrnd+1),hl      ; write back the new value as the next seed
    ret
;;
;;
;This code snippet is 9 bytes and 43cc
;Inputs:
;   HL is the input seed and must be non-zero
;Outputs:
;   A is the 8-bit pseudo-random number
;   HL is the new seed value (will be non-zero)
rng8_very_very_fast:
    add hl,hl
    sbc a,a             ;A = #FF if a bit was shifted out, else 0
    and %0010'1101
    xor l
    ld l,a
    ld a,r
    add a,h
    ret
;-------------------------------------------------------------------------------
;Technical details:
;   The concept behind this routine is to combine an LFSR (poor RNG) with a
; counter. The counter improves the RNG quality, while also extending the period
; length.
;   For this routine, I took advantage of the Z80's built-in counter, the `r`
; register. This means that we don't need to store the counter anywhere, and it
; is pretty fast to access!
;   Some caveats:
;     * r is a 7-bit counter
;     * r will increment some number of times between runs of the RNG. In most
;       cases, this will be constant, but if it increments an even number each
;       time, then the bottom bit is always the same, weakening the effect of
;       the counter. In the worst case, it increments a multiple of 128 times,
;       effectively making your RNG just as good/bad as the LFSR. Ideally, you
;       want `r` to increment an odd number of times between runs.
;     * In the best case, the bottom 7 bits have 50/50 chance of being 0 or 1.
;       The top bit is 1 with probability 1/2 + 1/(2^17-2) ~ .5000076295
;     * In the event that your main loop waits for user input between calls,
;       then congratulations, you might have a True RNG :)
;-------------------------------------------------------------------------------
;;
;;
;Tested and passes all CAcert tests
;Uses a very simple 32-bit LCG and 32-bit LFSR
;it has a period of 18,446,744,069,414,584,320
;roughly 18.4 quintillion.
;LFSR taps: 0,2,6,7  = 11000101
;291cc
;Thanks to Runer112 for his help on optimizing the LCG and suggesting to try the much simpler LCG. On their own, the two are terrible, but together they are great.
;58 bytes
;Always self-modifying: the four seed words live inside the instructions below.
rand32:
.seed1_0 equ $+1
    ld hl,12345
.seed1_1 equ $+1
    ld de,6789
;LCG: seed1 = 5*seed1+1 (32-bit)
    ld b,h
    ld c,l
    add hl,hl : rl e : rl d
    add hl,hl : rl e : rl d
    inc l
    add hl,bc
    ld (.seed1_0),hl
    ld hl,(.seed1_1)        ;the original upper word
    adc hl,de
    ld (.seed1_1),hl
    ex de,hl
;;lfsr
.seed2_0 equ $+1
    ld hl,9876
.seed2_1 equ $+1
    ld bc,54321
    add hl,hl : rl c : rl b
    ld (.seed2_1),bc
    sbc a,a
    and %1100'0101
    xor l
    ld l,a
    ld (.seed2_0),hl
    ex de,hl
    add hl,bc
    ret
;;
;;
;;219cc
;Combines a 24-bit LCG (seed1 = 5*seed1+1) with a 24-bit LFSR (seed2).
;Fix: the carry-in for the top LCG byte is now `adc a,d` instead of `adc a,0`.
;D holds the original top byte (`ld d,a`), which was otherwise never used, so
;the top byte was only multiplied by 4. rand32 performs the same step with
;`adc hl,de`. `adc a,d` is 3cc and one byte shorter than `adc a,0`.
rand24:
  ifdef ENABLE_SMC
.seed1_0 equ $+1
    ld hl,12345
.seed1_1 equ $+1
    ld a,67
  else
    ld hl,(.seed1_0)
    ld a,(.seed1_1)
  endif
    ld b,h
    ld c,l
    ld d,a                  ;D = original top byte of seed1
    add hl,hl : rla
    add hl,hl : rla
    inc l
    add hl,bc : adc a,d
    ld (.seed1_0),hl
    ld (.seed1_1),a
    ld c,b
    ld b,a
  ifdef ENABLE_SMC
.seed2_0 equ $+1
    ld hl,65432
.seed2_1 equ $+1
    ld a,10
  else
    ld hl,(.seed2_0)
    ld a,(.seed2_1)
  endif
    add hl,hl
    rla
    ld (.seed2_1),a
    sbc a,a
    and %1000'0111
    xor l
    ld l,a
    ld (.seed2_0),hl
    add hl,bc
    ret
;;
;;
;You may use this routine, just be sure to credit John Metcalf for the
;xorshift16 part of this routine!
;   This routine is a fast Pseudo Random Number Generator
;for the Z80. It combines a 16-bit LCG and 16-bit xorshift.
;The xorshift routine was written by John Metcalf
;and posted here:
;   http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html
;174cc (or 186cc if not using ENABLE_SMC)
;34 bytes
;cycle length: 4,294,901,760 (almost 4.3 billion)
; For the first seed, we use an LCG, 1+5*seed1 ==> seed1
rand16:
  ifdef ENABLE_SMC
.seed1 equ $+1
    ld hl,9999
  else
    ld hl,(.seed1)
  endif
    ld b,h
    ld c,l
    add hl,hl
    add hl,hl
    inc l
    add hl,bc
    ld (.seed1),hl
; For the second seed, we apply an xorshift
;   seed2^(seed2<<7) ==> seed2
;   seed2^(seed2>>9) ==> seed2
;   seed2^(seed2<<8) ==> seed2
; This code was originally made by John Metcalf and posted here:
;   http://www.retroprogramming.com/2017/07/xorshift-pseudorandom-numbers-in-z80.html
;   (My modifications are only in naming and compiler directives.)
  ifdef ENABLE_SMC
.seed2 equ $+1
    ld hl,9999
  else
    ld hl,(.seed2)
  endif
    ld a,h
    rra
    ld a,l
    rra
    xor h
    ld h,a
    ld a,l
    rra
    ld a,h
    rra
    xor l
    ld l,a
    xor h
    ld h,a
    ld (.seed2),hl
    add hl,bc
    ret
;;
;;
;collaboration by Zeda with Runer112
;160cc or 148cc if using ENABLE_SMC
;26 bytes
;cycle: 4,294,901,760 (almost 4.3 billion)
;16-bit LCG (seed1 = 5*seed1+1) added to a 16-bit LFSR (seed2).
rand16:
  ifdef ENABLE_SMC
.seed1 equ $+1
    ld hl,9999
  else
    ld hl,(.seed1)
  endif
    ld b,h
    ld c,l
    add hl,hl
    add hl,hl
    inc l
    add hl,bc
    ld (.seed1),hl
  ifdef ENABLE_SMC
.seed2 equ $+1
    ld hl,9999
  else
    ld hl,(.seed2)
  endif
    add hl,hl
    sbc a,a
    and %00101101
    xor l
    ld l,a
    ld (.seed2),hl
    add hl,bc
    ret
;;
;;
;Returns A on [0,4]
;Destroys: All
;Notes:
;   This is a non-standard approach to generating random integers on [0,4].
; If you have a truly random number generator that generates bits (0 or 1)
; with equal probability, then standard approaches will still cause a slight
; bias. ("Standard": "rand mod 5" or int(5*rand)). For example, suppose we
; generate a 4-bit number. Then "rand mod 5" will cause 0 to be chosen
; 4/16 times, while 1, 2, 3, and 4 will be chosen 3/16 times (on average).
; A similar problem exists with int(5*rand). One way to mitigate this issue
; is just generating infinitely many bits, but apparently that is impractical,
; so I came up with a compromise.
;
; My approach basically looks at the binary expansion of 1/5, 2/5, 3/5, and 4/5.
;     1/5 = .0011001100110011...
;     2/5 = .0110011001100110...
;     3/5 = .1001100110011001...
;     4/5 = .1100110011001100...
;
; So if I generate random bits and I get .001100, then a 0, then I know
; that no matter what all of the rest of the bits are, the number is less than
; 1/5, and so int(5*rand) is 0.
;
; By applying similar logic to the rest of the values, I can guarantee a uniform
; distribution on [0,4]. But there are four cases where this process might
; continue forever, specifically the cases that are like ...00110011...., but
; lucky for us, this happens 4/inf= 0% of the time. In fact, on average it
; takes 3 to 4 bits before the algorithm can assert which value to return.
;
; The one caveat is that on the Z80, we generally don't have truly random
; numbers :| On the other hand, it is easy enough to generate pseudo-random
; bits with equal probability :)
rand5:
    call rand
    ld a,h
    and #C0
    push af                 ;save the original value
    ld c,a
.start:
    push bc
    call rand
    pop bc
    ld b,15                 ;I set this to 15 because I like to guarantee a bit is available for rand10.
.loop:
    ld a,h
    xor c
    jp p,.end
    add hl,hl
    sla c
    jr c,$+4                ;skip the `set 6,c`
    set 6,c
    djnz .loop
    jr .start
.end:
    pop af
    rlca
    rlca
    sla h
    adc a,0
    ret
;;
;;
;Returns A as a random integer on [0,9]
;Destroys: All
;Doubles the rand5 result and shifts in the next bit of H.
rand10:
    call rand5
    sla h
    rla
    ret
;;
;;
;Generates a random TI float at HL
rand_TI_Float:
    push hl
;    call rand_init
    ld de,#8000             ;D is exponent, E is type. E is used in .zero
.get_rand_exponent_loop:
;decrement exponent
    dec d
;if the exponent is -100, underflow to 0.
;I don't think this is possible with this RNG, or even likely to ever happen
;before the universe's heat death with a true RNG, but better to be safe?
    ld a,d
    cp 28
    jp z,.zero
;save the exponent
    push de
;Generate a uniform random digit on [0,9] as a candidate for our first digit.
    call rand10
;restore the exponent+type
    pop de
    or a
;if A is 0, we'll decrement the exponent and find a new candidate for the first
;digit. This is because we need our float to be "normalized" (top digit non-zero)
;This also preserves the uniform distribution for values.
    jr z,.get_rand_exponent_loop
    pop hl
    ld (hl),e
    inc hl
    ld (hl),d
    inc hl
;write the first digit
    ld (hl),a
    ld b,13
.math_rand_loop:
;now generate subsequent digits
    push bc
    rr b                    ;carry = low bit of the counter
    jr c,$+3                ;odd count: stay on the current byte
    inc hl
    push hl
;generate the next digit
    call rand10
    pop hl
    rld                     ;shift the new digit into the byte at (HL)
    pop bc
    djnz .math_rand_loop
    ret
.zero:
    pop hl
    ld b,9
    ld (hl),e               ; E is 0
    inc hl
    djnz $-2                ;loops back to the `ld (hl),e`
    ret
;;
;;
; Output is in HL
; This rand routine combines Patrik Rak's fantastic 32-bit xorshift
; (https://gist.github.com/raxoft/c074743ea3f926db0037) with a simple lcg for
; extra smoothing.
; It has a period of 281,474,976,645,120 (2^48-2^16) and uses 48 bits of state.
; 42 bytes
; 210cc
rand:
;lcg: seed0 = 5*seed0+1
    ld hl,(.seed0)
    ld b,h
    ld c,l
    add hl,hl
    add hl,hl
    inc l
    add hl,bc
    ld (.seed0),hl
; xorshift
    ld hl,(.seed1)          ; yw -> zt
    ld de,(.seed1+2)        ; xz -> yw
    ld (.seed1+2),hl        ; x = y, z = w
    ld a,l                  ; w = w ^ ( w << 3 )
    add a,a
    add a,a
    add a,a
    xor l
    ld l,a
    ld a,d                  ; t = x ^ (x << 1)
    add a,a
    xor d
    ld h,a
    rra                     ; t = t ^ (t >> 1) ^ w
    xor h
    xor l
    ld h,e                  ; y = z
    ld l,a                  ; w = t
    ld (.seed1),hl
; Mix the xorshift and the lcg
    add hl,bc
    ret
;;
;;
; need to make sure seed1 is non-zero
;If all four bytes at seed1 are zero, the last one is decremented to #FF.
;NOTE(review): `.seed1` is written here as a local label of randinit, while
;rand reads `.seed1` in its own scope. Confirm both resolve to the same storage.
randinit:
    ld hl,.seed1
    ld a,(hl)
    inc hl
    or (hl)
    inc hl
    or (hl)
    inc hl
    or (hl)
    ret nz
    dec (hl)
    ret
;;
;;
;;Output: A is an 8-bit pseudo-random number.
;Shifts the 8-byte seed left by one bit; when a bit falls out of the top, the
;low byte is XORed with %00011011.
;Fix: A is now loaded with the low seed byte before `ret nc`, so A is defined
;on both exit paths (it used to be left unchanged when no bit was shifted out).
;`ld a,(nn)` does not change the flags, so the branch is unaffected.
lfsr64:
    ld hl,.seed
    sla (hl) : inc hl
    rl (hl) : inc hl
    rl (hl) : inc hl
    rl (hl) : inc hl
    rl (hl) : inc hl
    rl (hl) : inc hl
    rl (hl) : inc hl
    rl (hl)
    ld a,(.seed)
    ret nc
    xor %00011011
    ld (.seed),a
    ret
;;
;;
;13 bytes
;72cc (66cc if using SMC)
;period is 65535
;Output: HL is the new seed value.
;Fix: the result is now stored back to `.seed`, the same label it is loaded
;from; the store used to go to `seed`.
LFSR:
  ifdef ENABLE_SMC
.seed equ $+1
    ld hl,9797
  else
    ld hl,(.seed)
  endif
    add hl,hl
    sbc a,a
    and %00101101
    xor l
    ld l,a
    ld (.seed),hl
    ret
;;
;;
;Input:
;   (seed) has the seed value of the RNG
;Output:
;   (seed) is updated, HL is the result
;Destroys:
;   A,DE,BC
;Timing:
;   if seed>0     231cc or 232cc, condition dependent
;   if seed=0     91cc
;   if ENABLE_SMC defined subtract 6cc
;Size: 44 bytes
;Notes:
;   Uses the Lehmer RNG used by the Sinclair ZX81
;   75x mod 65537 -> x
lehmer:
  ifndef ENABLE_SMC
    ld hl,(.seed)
  else
.seed equ $+1
    ld hl,0
  endif
;multiply by 75
    ld c,l
    ld b,h
    xor a
    adc hl,hl               ;2x; also sets z when the seed was 0
    jr z,.special
    ld d,a : rla            ;D = 0
    add hl,hl : rla         ;4x
    add hl,hl : rla : add hl,bc : adc a,d   ;9x
    add hl,hl : rla         ;18x
    add hl,hl : rla : add hl,bc : adc a,d   ;37x
    add hl,hl : rla : add hl,bc             ;75x, the carry of this add is still pending
;modulo 65537, see note below on how this works
    ld e,a
;The pending carry belongs in A; `sbc` subtracts it together with A.
    sbc hl,de               ;No need to reset the c flag since it is already
    jr nc,$+3
    inc hl
    ld (.seed),hl
    ret
.special:
;In the case that HL=0, this should be interpreted as 65536 = -1 mod 65537, so return -75 mod 65537 = -74 mod 65536 in HL
    ld hl,-74
    ld (.seed),hl
    ret
;mod by 2^16 + 1 (a prime)
;current form is A*2^16+HL
;need:
;   (A*2^16+HL) mod (2^16+1)
;add 0 as +1-1
;   (A*(2^16+1-1)+HL) mod (2^16+1)
;distribute
;   (A*(2^16+1)-A+HL) mod (2^16+1)
;A*(2^16+1) mod 2^16+1 = 0, so remove
;   (-A+HL) mod (2^16+1)
;Oh hey, that's easy! :P
;I use this trick everywhere, you should, too.
;;
;
; Section: squaring (the original section title was mis-encoded)
;
;A*A->A
;Destroys: HL
;76cc or 79cc or 82cc
;Avg: 79cc
;51 bytes
sqrA:
    add a,a
    add a,a
    jr nc,$+4               ;skip the `neg`
    neg
    rrca
    rrca
    ld l,a
    srl l
    ld h,.LUT/256           ;only the high byte of the table address is used
    jr c,$+4                ;skip the `neg`
    neg
    add a,(hl)
    ret
;!FIXIT
;MUST BE ALIGNED to a 256-byte boundary.
;Can use:
;   #if 0!=$&255
;   .fill 256-($&255),0
;   #endif
.LUT:
    DB #00, #06, #14, #2A, #48, #6E, #9C, #D2
    DB #10, #56, #A4, #FA, #58, #BE, #2C, #A2
    DB #20, #A6, #34, #CA, #68, #0E, #BC, #72
    DB #30, #F6, #C4, #9A, #78, #5E, #4C, #42
;Fix: the assertion now tests `.LUT`, the label the code above indexes.
    ASSERT (low .LUT) = 0, "sqrLUT MUST BE ALIGNED to a 256-byte boundary!"
;;
;;
;Input: L
;Output: L*L->A
;147 t-states
;36 bytes
L_sqrd:
    ld b,l
;First iteration, get the lowest 3 bits of -x^2
    sla l
    rrc b
    sbc a,a
    or l
    ld c,a
;second iteration, get the next 2 bits of -x^2
    rrc b
    sbc a,a
    xor l
    and #F8
    add a,c
    ld c,a
;third iteration, get the next 2 bits of -x^2
    sla l
    rrc b
    sbc a,a
    xor l
    and #E0
    add a,c
    ld c,a
;fourth iteration, get the eight bit of x^2
    sla l
    rrc b
    sbc a,a
    xor l
    and #80
    sub c
    ret
;;
;
; Section: multiplication (the original section title was mis-encoded)
;
;Signed 16.16 fixed-point multiply built on the 32-bit mul32 routine.
;This requires the following routines:
;   mul32
;       Inputs: DEHL, BCIX
;       Output: stored at z32_0, little-endian
;Multiplies DE.HL by BC.IX, stores the result in DE.HL
mulfixed16_16:
; The sign of the product is the XOR of the two operand signs; keep it on the stack
    ld a,d
    xor b
    push af
; Work on magnitudes: negate DEHL when it is negative
    xor b                   ;XORing with B a second time gives D back
    jp p,.lhs_positive
    xor a
    sub l
    ld l,a
    ld a,0                  ;`ld` leaves the borrow intact
    sbc a,h
    ld h,a
    ld a,0
    sbc a,e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.lhs_positive:
; Same for BCIX
    bit 7,b
    jr z,.rhs_positive
    xor a
    sub ixl
    ld ixl,a
    ld a,0
    sbc a,ixh
    ld ixh,a
    ld a,0
    sbc a,c
    ld c,a
    sbc a,a
    sub b
    ld b,a
.rhs_positive:
; Unsigned 32x32 multiply
    call mul32
; Overflow test: any bit set in the top two bytes of the 64-bit product
    ld hl,(.z32_0+6)
    ld a,h
    or l
; The result is the middle four bytes of the product
    ld hl,(.z32_0+2)
    ld de,(.z32_0+4)
; On overflow the result becomes 0x7FFFFFFF
    jr z,.in_range
    ld de,#7FFF
    ld h,e
    ld l,e
.in_range:
; Re-apply the saved sign
    pop af
    ret p                   ;positive product: nothing left to do
    xor a
    ld b,a
    sub l
    ld l,a
    ld a,b
    sbc a,h
    ld h,a
    ld a,b
    sbc a,e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ret
;;
;;
;Signed 16.16 fixed-point multiply (second copy of the routine above).
;This requires the following routines:
;   mul32
;       Inputs: DEHL, BCIX
;       Output: stored at z32_0, little-endian
;Multiplies DE.HL by BC.IX, stores the result in DE.HL
mulfixed16_16:
; The sign of the product is the XOR of the two operand signs; keep it on the stack
    ld a,d
    xor b
    push af
; Work on magnitudes: negate DEHL when it is negative
    xor b                   ;XORing with B a second time gives D back
    jp p,.lhs_positive
    xor a
    sub l
    ld l,a
    ld a,0                  ;`ld` leaves the borrow intact
    sbc a,h
    ld h,a
    ld a,0
    sbc a,e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.lhs_positive:
; Same for BCIX
    bit 7,b
    jr z,.rhs_positive
    xor a
    sub ixl
    ld ixl,a
    ld a,0
    sbc a,ixh
    ld ixh,a
    ld a,0
    sbc a,c
    ld c,a
    sbc a,a
    sub b
    ld b,a
.rhs_positive:
; Unsigned 32x32 multiply
    call mul32
; Overflow test: any bit set in the top two bytes of the 64-bit product
    ld hl,(.z32_0+6)
    ld a,h
    or l
; The result is the middle four bytes of the product
    ld hl,(.z32_0+2)
    ld de,(.z32_0+4)
; On overflow the result becomes 0x7FFFFFFF
    jr z,.in_range
    ld de,#7FFF
    ld h,e
    ld l,e
.in_range:
; Re-apply the saved sign
    pop af
    ret p                   ;positive product: nothing left to do
    xor a
    ld b,a
    sub l
    ld l,a
    ld a,b
    sbc a,h
    ld h,a
    ld a,b
    sbc a,e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ret
;;
;;
;Requires:
;   mul16
;       Inputs: BC,DE
;       Output: DEHL
;Multiplies 4.12 fixed point numbers.
;Inputs: HL is the first fixed-point multiplicand
;        DE is the second fixed-point multiplicand
;Output: HL is the fixed-point output
;Overflow is stored as 0x7.FFF or 0x8.001 depending on positive or negative
mulfixed4_12:
; The sign of the product is the XOR of the two operand signs; keep it on the stack
    ld a,h
    xor d
    push af
; Work on magnitudes: negate HL when it is negative
    xor d                   ;XORing with D a second time gives H back
    jp p,.lhs_positive
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
.lhs_positive:
; Same for DE
    bit 7,d
    jr z,.rhs_positive
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.rhs_positive:
; mul16 takes BC and DE, so move HL into BC
    ld b,h
    ld c,l
    call mul16
;The result doesn't need the top 4 bits or bottom 12 bits.
;We'll hold onto the top 4 bits to check overflow, though.
;DEH is shifted left by 4 bits and the 16 bits that end up in DE are kept.
    ld a,h                  ;the bits shifted out of the top are collected in A
    and #F0
    ex de,hl
    rla : adc hl,hl
    rla : adc hl,hl
    rla : adc hl,hl
    rla : adc hl,hl
    adc a,a
;A non-zero means overflow
    jr z,.in_range
    ld hl,#7FFF
.in_range:
; Re-apply the saved sign
    pop af
    ret p                   ;positive product: nothing left to do
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    ret
;;
;;
;Multiplies H.L by D.E, stores the result in H.L
;Requires mul16 (called with BC and DE; the code below reads its result from DEHL).
mulfixed_88:
; The sign of the product is the XOR of the two operand signs; keep it on the stack
    ld a,h
    xor d
    push af
; Work on magnitudes: negate HL when it is negative
    xor d                   ;XORing with D a second time gives H back
    jp p,.lhs_positive
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
.lhs_positive:
; Same for DE
    bit 7,d
    jr z,.rhs_positive
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.rhs_positive:
; mul16 takes BC and DE, so move HL into BC
    ld b,h
    ld c,l
    call mul16
;Rounding: the top bit of L goes into the carry
    sla l
;The middle two bytes, EH, become the result in HL
    ld l,h
    ld h,e
    ld a,d
    ld de,0
    adc hl,de               ;adds the rounding bit
;Overflow test: the top byte plus any carry out of the rounding must be 0,
;otherwise HL is set to 0x7FFF
    adc a,e
    jr c,.saturate
    jr z,.in_range
.saturate:
    ld hl,#7FFF
.in_range:
; Re-apply the saved sign
    pop af
    ret p                   ;positive product: nothing left to do
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    ret
;;
;;
;This multiplies two 64-bit integers and returns a 128-bit result.
;This requires the following routines:
;   mul32   ;!TEST
;       Inputs: DEHL, BCIX
;       Output: stored at z32_0, little-endian
;
; Defined:
;   inp64_1 is where the first 64-bit multiplicand is located, little-endian
;   inp64_2 is where the second 64-bit multiplicand is located, little-endian
;   out128 is where the 128-bit result is stored
;       Uses 8 additional bytes after out128
;multiplies the 64-bit integers at inp64_1 and inp64_2
;stores the 128-bit (16-byte) result at out128
;
;The routine is a Karatsuba-style product: hi*hi and lo*lo are computed with
;mul32, then |lo1-hi1|*|lo2-hi2| with the sign kept in bit 0 of A, and the
;middle term z0+z2-/+that product is added in at byte offset 4 of the result.
;The 8 bytes at .inp64_1 are overwritten: they are used as scratch for the
;middle term.
;
;NOTE(review): every reference to the result buffer now uses the local
;  .z64_0/.z64_2 names defined below (some references used undotted names that
;  are not defined in this routine).
;NOTE(review): .inp64_1/.inp64_2 are referenced as local names although the
;  header describes inp64_1/inp64_2; confirm how the caller defines them.
;NOTE(review): this routine reads mul32's result from its own .z32_0
;  (= out128+16) while mul32 below stores into its own local .z32_0 buffer;
;  confirm that both names resolve to the same address in the build.
;
;min: 1740+3*min(mul32)     ; 5631cc
;max: 1901+3*max(mul32)     ; 10013cc
;avg: 1797+3*avg(mul32) + 9572881/2^24    ; ~8720.733cc
;uses 24 bytes at out128
mul64:
.z64_0 EQU out128
.z64_2 EQU .z64_0+8
.z32_0 EQU .z64_2+8
;z2 = high dword * high dword
    ld de,(.inp64_1+6)
    ld hl,(.inp64_1+4)
    ld bc,(.inp64_2+6)
    ld ix,(.inp64_2+4)
    call mul32
;copy the 8 bytes at z32_0 to z64_2
    ld hl,.z32_0
    ld de,.z64_2
    call .mov8
;z0 = low dword * low dword
    ld de,(.inp64_1+2)
    ld hl,(.inp64_1)
    ld bc,(.inp64_2+2)
    ld ix,(.inp64_2)
    call mul32
;copy the 8 bytes at z32_0 to z64_0
    ld hl,.z32_0
    ld de,.z64_0
    call .mov8
;now I need to subtract the 32-bit digits from each other
    xor a
    ld hl,(.inp64_1)
    ld bc,(.inp64_1+4)
    sbc hl,bc
    ex de,hl
    ld hl,(.inp64_1+2)
    ld bc,(.inp64_1+6)
    sbc hl,bc
    jr nc,.skip1
;difference was negative: negate HLDE (A is 0 here)
    ld b,a : sub e : ld e,a
    ld a,b : sbc a,d : ld d,a
    ld a,b : sbc a,l : ld l,a
    ld a,b : sbc a,h : ld h,a
    ld a,b
.skip1:
    rla ;bit 0 of A = 1 if the first difference was negated; carry is left clear
    push hl ;top byte
    push de
    ld hl,(.inp64_2)
    ld bc,(.inp64_2+4)
    sbc hl,bc
    ex de,hl
    ld hl,(.inp64_2+2)
    ld bc,(.inp64_2+6)
    sbc hl,bc
    jr nc,.skip2
;second difference was negative: negate HLDE and toggle the sign bit
    ld c,a
    xor a
    ld b,a
    sub e : ld e,a
    ld a,b : sbc a,d : ld d,a
    ld a,b : sbc a,l : ld l,a
    ld a,b : sbc a,h : ld h,a
    ld a,c
    inc a
.skip2:
    ex de,hl ;DEHL = |second difference|
    pop ix
    pop bc ;BCIX = |first difference|
    push af
    call mul32
    pop af ;holds the sign in the low bit
    rra
    jp c,.add
;need to perform z0+z2-result
    xor a
    ld hl,(.z64_0)
    ld de,(.z64_2)
    add hl,de
    ld (.inp64_1),hl
    ld hl,(.z64_0+2)
    ld de,(.z64_2+2)
    adc hl,de
    ld (.inp64_1+2),hl
    ld hl,(.z64_0+4)
    ld de,(.z64_2+4)
    adc hl,de
    ld (.inp64_1+4),hl
    ld hl,(.z64_0+6)
    ld de,(.z64_2+6)
    adc hl,de
    ld (.inp64_1+6),hl
    rla ;A = carry out of z0+z2
;now need to subtract
    ld hl,(.inp64_1)
    ld de,(.z32_0)
    sbc hl,de
    ld (.inp64_1),hl
    ld hl,(.inp64_1+2)
    ld de,(.z32_0+2)
    sbc hl,de
    ld (.inp64_1+2),hl
    ld hl,(.inp64_1+4)
    ld de,(.z32_0+4)
    sbc hl,de
    ld (.inp64_1+4),hl
    ld hl,(.inp64_1+6)
    ld de,(.z32_0+6)
    sbc hl,de
    ld (.inp64_1+6),hl
    sbc a,0
.final:
;now need to add it back in
;A:(.inp64_1 .. +7) is the middle term, added at byte offset 4 of the result
    ld hl,(.z64_0+4)
    ld de,(.inp64_1)
    add hl,de
    ld (.z64_0+4),hl
    ld hl,(.z64_0+6)
    ld de,(.inp64_1+2)
    adc hl,de
    ld (.z64_0+6),hl
    ld hl,(.z64_0+8)
    ld de,(.inp64_1+4)
    adc hl,de
    ld (.z64_0+8),hl
    ld hl,(.z64_0+10)
    ld de,(.inp64_1+6)
    adc hl,de
    ld (.z64_0+10),hl
    ld hl,.z64_0+12
    adc a,(hl)
    ld (hl),a
    ret nc
;propagate the carry through the remaining three bytes
    inc hl : inc (hl) : ret nz
    inc hl : inc (hl) : ret nz
    inc hl : inc (hl) : ret
.add:
;add to the current result
;z0+z2+result
    xor a
    ld hl,(.z64_0)
    ld de,(.z64_2)
    add hl,de
    ld (.inp64_1),hl
    ld hl,(.z64_0+2)
    ld de,(.z64_2+2)
    adc hl,de
    ld (.inp64_1+2),hl
    ld hl,(.z64_0+4)
    ld de,(.z64_2+4)
    adc hl,de
    ld (.inp64_1+4),hl
    ld hl,(.z64_0+6)
    ld de,(.z64_2+6)
    adc hl,de
    ld (.inp64_1+6),hl
    rla ;A = carry out of z0+z2
;now need to add the product of the differences
    ld hl,(.inp64_1)
    ld de,(.z32_0)
    add hl,de
    ld (.inp64_1),hl
    ld hl,(.inp64_1+2)
    ld de,(.z32_0+2)
    adc hl,de
    ld (.inp64_1+2),hl
    ld hl,(.inp64_1+4)
    ld de,(.z32_0+4)
    adc hl,de
    ld (.inp64_1+4),hl
    ld hl,(.inp64_1+6)
    ld de,(.z32_0+6)
    adc hl,de
    ld (.inp64_1+6),hl
    adc a,0
    jp .final
.mov8:
;copies 8 bytes from (HL) to (DE)
    LDI
    LDI
    LDI
    LDI
    LDI
    LDI
    LDI
    LDI
    RET
;;
;;
;Requires:
;   mul16   ;!TEST
;       Inputs: BC,DE
;       Output: DEHL
;max: 703cc + 3*mul16   ; 2704cc
;min: 655cc + 3*mul16   ; 1297cc
;avg: 673.25cc+3*mul16  ; 2307.911cc
;DEHL * BCIX ==> .z32_0
;The 64-bit product is stored little-endian in the 8 bytes .z32_0 .. .z32_2+3.
;Same scheme as mul64: two mul16 calls for hi*hi and lo*lo, a third for the
;product of the absolute differences, with the sign kept in bit 0 of A.
;This relies on mul16 leaving BC unchanged (BC is read again after the call).
mul32:
    push de
    push bc
    push hl
    push ix
    call mul16 ;DEHL
    ld (.z32_2),hl
    ld (.z32_2+2),de
    pop de
    pop bc
    push de
    call mul16 ;DEHL
    ld (.z32_0),hl
    ld (.z32_0+2),de
    pop de ;low word
    pop hl
    xor a
    sbc hl,de
    jr nc,.skip1
;negate HL, remember the sign in A
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    xor a
    inc a
.skip1:
    ex de,hl
    pop hl
    sbc hl,bc
    jr nc,.skip2
;negate HL, toggle the sign in A
    ld b,a
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    ld a,b
    inc a
.skip2:
    ld b,h
    ld c,l
    push af
    call mul16
    pop af ;holds the sign in the low bit
    rra
    jr c,.add
;need to perform z0+z2-result
    push de
    push hl
    xor a
    ld hl,(.z32_0)
    ld bc,(.z32_2)
    add hl,bc
    ex de,hl
    ld hl,(.z32_0+2)
    ld bc,(.z32_2+2)
    adc hl,bc
    rla
;now need to subtract
    ex de,hl
    pop bc
    sbc hl,bc
    ex de,hl
    pop bc
    sbc hl,bc
    sbc a,0
;A:HL:DE is the result, need to add to z32_0+2
.final:
    ld bc,(.z32_0+2)
    ex de,hl
    add hl,bc
    ld (.z32_0+2),hl
    ld hl,(.z32_2)
    adc hl,de
    ld (.z32_2),hl
    ld hl,.z32_2+2 ;local buffer, same name as every other reference in this routine
    adc a,(hl)
    ld (hl),a
    ret nc
    inc hl
    inc (hl)
    ret
.add:
;add to the current result
    xor a
    ld bc,(.z32_0)
    add hl,bc
    ex de,hl
    ld bc,(.z32_0+2)
    adc hl,bc
    rla
    ex de,hl
    ld bc,(.z32_2)
    add hl,bc
    ex de,hl
    ld bc,(.z32_2+2)
    adc hl,bc
    adc a,0
    jp .final
;
.z32_0: DS 4
.z32_2: DS 4
;;
;;
;BDE*CHL -> HLBCDE
;155 bytes
;402+3*C_times_BDE
;fastest:1201cc
;slowest:1753cc
;avg: 1464.9033203125cc  (1464+925/1024)
;min: 825cc
;max: 1926cc
;avg: 1449.63839751681cc
;Three 8x24 partial products (L*BDE, H*BDE, C*BDE) are accumulated in the
;6-byte buffer .var48 at byte offsets 0, 1 and 2.
;The helper is called under the name it is defined with in this file,
;C_times_BDE (C*BDE => CAHL).
mul24:
    push bc
    ld c,l
    push hl
    call C_times_BDE
    ld (.var48),hl
    ld l,a
    ld h,c
    ld (.var48+2),hl
    pop hl
    ld c,h
    call C_times_BDE
    push bc
    ld bc,(.var48+1)
    add hl,bc
    ld (.var48+1),hl
    pop bc
    ld b,c
    ld c,a
    ld hl,(.var48+3)
    ld h,0
    adc hl,bc
    ld (.var48+3),hl
    pop bc
    call C_times_BDE
    ld de,(.var48+2)
    add hl,de
    ld (.var48+2),hl
    ld d,c
    ld e,a
    ld b,h
    ld c,l
    ld hl,(.var48+4)
    ld h,0
    adc hl,de
    ld de,(.var48)
    ret
.var48: DS 6
;;
;;
;This was made by Runer112
;Tested by jacobly
;BC*DE --> DEHL
; ~544.887cc as calculated in jacobly's test
;min: 214cc  (DE = 1)
;max: 667cc
;avg: 544.4507883cc   however, deferring to jacobly's result as mine may have math issues ?
;177 bytes
;Unrolled shift-and-add multiply. The multiplier DE is scanned from bit 15
;down; the first set bit selects the entry point (.Bit14 .. .Bit0) so that
;leading zero bits cost no add. A carries the bits above HL and is moved to
;D/E as the result grows. BC is only read, never written.
mul16:
    LD A,D
    LD D,0
    LD H,B
    LD L,C
    ADD A,A : JR C,.Bit14
    ADD A,A : JR C,.Bit13
    ADD A,A : JR C,.Bit12
    ADD A,A : JR C,.Bit11
    ADD A,A : JR C,.Bit10
    ADD A,A : JR C,.Bit9
    ADD A,A : JR C,.Bit8
    ADD A,A : JR C,.Bit7
;the high byte of DE was zero: continue with the low byte (bit 0 handled last)
    LD A,E
    AND %11111110
    ADD A,A : JR C,.Bit6
    ADD A,A : JR C,.Bit5
    ADD A,A : JR C,.Bit4
    ADD A,A : JR C,.Bit3
    ADD A,A : JR C,.Bit2
    ADD A,A : JR C,.Bit1
    ADD A,A : JR C,.Bit0
;DE was 0 or 1
    RR E
    RET C
    LD H,D
    LD L,E
    RET
;
.Bit14:
    ADD HL,HL : ADC A,A : JR NC,.Bit13 : ADD HL,BC : ADC A,D
.Bit13:
    ADD HL,HL : ADC A,A : JR NC,.Bit12 : ADD HL,BC : ADC A,D
.Bit12:
    ADD HL,HL : ADC A,A : JR NC,.Bit11 : ADD HL,BC : ADC A,D
.Bit11:
    ADD HL,HL : ADC A,A : JR NC,.Bit10 : ADD HL,BC : ADC A,D
.Bit10:
    ADD HL,HL : ADC A,A : JR NC,.Bit9 : ADD HL,BC : ADC A,D
.Bit9:
    ADD HL,HL : ADC A,A : JR NC,.Bit8 : ADD HL,BC : ADC A,D
.Bit8:
    ADD HL,HL : ADC A,A : JR NC,.Bit7 : ADD HL,BC : ADC A,D
.Bit7:
    LD D,A
    LD A,E
    AND %11111110
    ADD HL,HL : ADC A,A : JR NC,.Bit6 : ADD HL,BC : ADC A,0
.Bit6:
    ADD HL,HL : ADC A,A : JR NC,.Bit5 : ADD HL,BC : ADC A,0
.Bit5:
    ADD HL,HL : ADC A,A : JR NC,.Bit4 : ADD HL,BC : ADC A,0
.Bit4:
    ADD HL,HL : ADC A,A : JR NC,.Bit3 : ADD HL,BC : ADC A,0
.Bit3:
    ADD HL,HL : ADC A,A : JR NC,.Bit2 : ADD HL,BC : ADC A,0
.Bit2:
    ADD HL,HL : ADC A,A : JR NC,.Bit1 : ADD HL,BC : ADC A,0
.Bit1:
    ADD HL,HL : ADC A,A : JR NC,.Bit0 : ADD HL,BC : ADC A,0
.Bit0:
    ADD HL,HL
    ADC A,A
    JR C,.FunkyCarry
    RR E ;bit 0 of the multiplier goes to carry
    LD E,A
    RET NC
    ADD HL,BC
    RET NC
    INC E
    RET NZ
    INC D
    RET
;
.FunkyCarry:
    INC D
    RR E
    LD E,A
    RET NC
    ADD HL,BC
    RET NC
    INC E
    RET
;;
;;
;Inputs: H,E
;Outputs: HL is the product, D is 0
;Destroys: A
;187+6{0,6}+{0,15}
;min: 187cc
;max: 238cc
;avg: 212.5cc
;35 bytes
H_Times_E:
    ld d,0
    sla h
    sbc a,a ;A = #FF if the top bit of H was set, else 0
    and e
    ld l,a ;L = E or 0: first partial product without a branch
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : ret nc : add hl,de
    ret
;;
;;
H_Times_E_No_A:
;Inputs: H,E
;Outputs: HL is the product, D is 0
;Same algorithm as H_Times_E, but the first step uses a branch so that A is
;never written.
;190+6{0,6}+{0,15}+{0,1}
;min: 190cc
;max: 242
;avg: 216
;36 bytes
    ld d,0
    ld l,d
    sla h : jr nc,$+3 : ld l,e
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : jr nc,$+3 : add hl,de
    add hl,hl : ret nc : add hl,de
    ret
;;
;;
;HL*128 (mod 65536) --> HL, A = new L
;Shifts HL right by one bit through A, then moves the bytes up by one.
HL_Times_128:
    xor a
    rr h
    rr l
    rra
    ld h,l
    ld l,a
    ret
;;
;;
;NOTE: This is a set of in-line routines!
;The three snippets below are alternatives that follow each other without a
;return in between; only the last one is followed by RET.
; Input: HL
; Output: BC is the input, HL is 12 times the input
; 6 bytes, 52cc
HL_Times_12
    ld b,h
    ld c,l
    add hl,hl
    add hl,bc
    add hl,hl
    add hl,hl
;Destroys only register E and F
; Input: HL <= 85,
; 8 bytes, 46cc
    ld e,a
    ld a,l
    add a,a ; hl*2
    add a,l ; hl*3
    ld l,a
    ld a,e
    add hl,hl ; hl*6
    add hl,hl ; hl*12
;Destroys only register E and F
; Input: HL <= 85,
; 7 bytes, 55cc
    ld e,l
    add hl,hl ; hl*2
    add hl,de ; hl*3+d*256
    ld h,0 ; hl*3
    add hl,hl ; hl*6
    add hl,hl ; hl*12
    RET
;;
;;
;Inputs:
;   DEBC is a 32-bit multiplicand
;   A is an 8-bit multiplicand
;Outputs:
;   AHLIX is the 40-bit result
;   carry reset
;   z set if top 8 bits are 0
;   sign flag set as expected
;===============================================================
;503+8{0,41}
;min: 503cc
;max: 831cc
;avg: 667cc
;29 bytes
;The nested calls make .iter1 run 8 times: each "call" runs the code below it
;once and then falls into it a second time when the callee returns.
DEBC_Times_A:
    ld hl,0
    ld ix,0
    call .iter3
.iter3: ;231+4{0,41}
    call .iter2
.iter2: ;107+2{0,41}
    call .iter1
.iter1: ;45+{0,41}
    add ix,ix
    adc hl,hl
    adc a,a
    ret nc
    add ix,bc
    adc hl,de
    adc a,0
    ret
;;
;;
;Inputs:
;   DE and A are factors
;Outputs:
;   A is not changed
;   B is 0
;   C is not changed
;   DE is not changed
;   HL is the product
;Time:
;   342+6x
;13 bytes
DE_Times_A:
    ld b,8
    ld hl,0
.loop:
    add hl,hl
    rlca ;8 rotations bring A back to its original value
    jr nc,$+3
    add hl,de
    djnz .loop
    ret
;;
;;
;Input: DE,A
;Output: A:HL is the product, C=0, B,DE unaffected, z flag set if result is zero, c flag set if A is input as 1, else nc.
;A:128~255 219+6{0,10}+{0,19}  avg=258.5  *1/2
;A:64~127  203+5{0,10}+{0,19}  avg=237.5  *1/4
;A:32~63   187+4{0,10}+{0,19}  avg=216.5  *1/8
;A:16~31   171+3{0,10}+{0,19}  avg=195.5  *1/16
;A:8~15    155+2{0,10}+{0,19}  avg=174.5  *1/32
;A:4~7     139+{0,10}+{0,19}   avg=153.5  *1/64
;A:2~3     123+{0,19}          avg=132.5  *1/128
;A:1       107cc               avg=107    *1/256
;A:0       119cc               avg=119    *1/256
;overall avg: 237.671875cc
;The multiplier A is shifted left until its first set bit falls out; that bit
;selects the entry point, so leading zero bits cost no add. From then on A
;doubles as the top byte of the product (rla takes the carry out of HL).
DE_Times_A_v1:
    ld c,0
    ld h,d
    ld l,e
    add a,a : jr c,.mul_07
    rla : jr c,.mul_06
    rla : jr c,.mul_05
    rla : jr c,.mul_04
    rla : jr c,.mul_03
    rla : jr c,.mul_02
    rla : jr c,.mul_01
    rla
    ret c ;A was 1: HL already holds DE
    ld h,a ;A was 0: result is 0
    ld l,a
    ret
.mul_07:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_06:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_05:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_04:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_03:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_02:
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,c
.mul_01:
    add hl,hl : rla : ret nc : add hl,de : adc a,c
    ret
;;
;;
DE_Times_A_v2:
;DE*A ==> AHL
;B is set to 0 and used as the zero operand for the carry adds.
    ld hl,0
    ld b,h
    add a,a : jr nc,$+5 : ld h,d : ld l,e
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : jr nc,$+4 : add hl,de : adc a,b
    add hl,hl : rla : ret nc : add hl,de : adc a,b
    ret
;;
;;
;C*BDE => CAHL
;B and DE are only read.
;C = 0     157
;C = 1     141
;141+
;C>=128    135+6{0,33+{0,1}}+{0,20+{0,8}}
;C>=64     115+5{0,33+{0,1}}+{0,20+{0,8}}
;C>=32     95+4{0,33+{0,1}}+{0,20+{0,8}}
;C>=16     75+3{0,33+{0,1}}+{0,20+{0,8}}
;C>=8      55+2{0,33+{0,1}}+{0,20+{0,8}}
;C>=4      35+{0,33+{0,1}}+{0,20+{0,8}}
;C>=2      15+{0,20+{0,8}}
;min: 141cc
;max: 508cc
;avg: 349.21279907227cc
C_times_BDE:
    ld a,b
    ld h,d
    ld l,e
    sla c : jr c,.mul8_24_1
    sla c : jr c,.mul8_24_2
    sla c : jr c,.mul8_24_3
    sla c : jr c,.mul8_24_4
    sla c : jr c,.mul8_24_5
    sla c : jr c,.mul8_24_6
    sla c : jr c,.mul8_24_7
    sla c : ret c ;C was 1: AHL already holds BDE and C is now 0
    ld a,c ;C was 0: result is 0
    ld h,c
    ld l,c
    ret
.mul8_24_1:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_2:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_3:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_4:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_5:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_6:
    add hl,hl : rla : rl c : jr nc,$+7 : add hl,de : adc a,b : jr nc,$+3 : inc c
.mul8_24_7:
    add hl,hl : rla : rl c : ret nc : add hl,de : adc a,b : ret nc : inc c
    ret
;;
;
;[section banner -- the original text is mis-encoded]
;
;Signed division CHL/DE by Zeda, inspired by code from matrefeytontias.
;signed CHL/DE
;signed CHL/DE ==> CHL, |remainder| is DE
;The sign of the result is C xor D. CHL is made positive and DE is made
;NEGATIVE, because the loop tests "remainder >= |DE|" by adding DE and looking
;at the carry. The divisor is therefore negated only when it is not already
;negative.
;Destroys: A, B (0 on exit), IX
sdiv24_16:
;Get the sign of the result
    ld a,c
    xor d
    push af
;Make CHL positive
    xor d ;A = C again
    jp p,.skip1
    xor a
    sub l
    ld l,a
    ld a,0
    sbc a,h
    ld h,a
    sbc a,a
    sub c
    ld c,a
.skip1:
;make DE negative
    bit 7,d
    jr nz,.skip2 ;already negative, nothing to do
;setting DE negative
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ld a,c ;A must hold the top byte of the numerator when the loop starts
.skip2:
    ld b,24
    push hl
    pop ix ;A:IX = numerator, HL = running remainder
    ld hl,0
.loop:
    add ix,ix
    rla
    adc hl,hl
    add hl,de ;carry set <=> remainder >= |DE|
    jr c,.skip3
    sbc hl,de ;restore the remainder (carry is clear here and stays clear)
    DB #DA ;start of `jp c,**`: never taken, it only swallows the 2-byte `inc ixl`
.skip3:
    inc ixl ;set the quotient bit
    djnz .loop
    ld c,a
    ex de,hl ;DE is remainder
    push ix
    pop hl
;restore sign
    pop af
    ret p
    xor a
    sub l
    ld l,a
    ld a,b ;B is 0 after the loop
    sbc a,h
    ld h,a
    sbc a,a
    sub c
    ld c,a
    ret
;;
;;
;Adapted from Axe
;Signed wrapper around div16: both operands are made positive, div16 is
;called, and HL is negated afterwards if the signs of H and D differed.
p_SDiv:
    ld a,h
    xor d
    push af
    xor d
    jp p,.Skip1
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
.Skip1:
    bit 7,d
    jr z,.Skip2
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.Skip2:
    call div16 ;normal routine division
    pop af
    ret p
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    ret
;;
;;
;Input: HLDE is numerator, C<129 is the divisor.
;Output: HLDE is quotient, A is remainder, C is negated
;1021+4{0,15}
;min: 1021cc
;max: 1081cc
;avg: 1051cc
;87 bytes
HLDE_Div_C:
    xor a
    sub c
    ld c,a
;falls through into HLDE_Div_negC
;;
;Note: -C<129
;1009+4{0,15}
;min: 1009cc
;max: 1069cc
;avg: 1039cc
;84 bytes
;Each call to .div leaves the last quotient bit of its byte in the carry, so
;every byte is one bit behind; the rl/rl/adc at the end shifts the whole
;32-bit quotient left once to bring that last bit in.
HLDE_Div_negC:
    xor a
    call .div
    ld b,h
    ld h,l
    call .div
    ld l,h
    ld h,d
    call .div
    ld d,h
    ld h,e
    call .div
    ld e,h
    ld h,b
    rl e
    rl d
    adc hl,hl
    ret
;216+7{0,1}+{0,8}
;min: 216cc
;max: 231cc
;avg: 224.5cc
.div:
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : jr c,$+3 : sub c
    rl h : rla : add a,c : ret c : sub c
    ret
;;
;;
;Written by calc84maniac, based on a routine from Zeda
;===============================================================
;===============================================================
;Performs HL/BC
;Speed: 1168 to 1318 cycles depending on how many set bits in the result
;   add 19 if HL is negative
;   add 19 if BC is positive
;   add another 28 if only one is negative
;Size: 54 bytes
;   **31 bytes larger than the regular HL_Div_BC
;Inputs:
;   HL is the numerator
;   BC is the denominator
;Outputs:
;   HL is the quotient
;   DE is the remainder
;   BC = -abs(BC)
;===============================================================
HL_SDiv_BC:
    ld a,h
    xor b
    push af ;sign of the result
.absHL:
    add hl,hl ;sign bit to carry; HL now holds 2*HL
    jr nc,.negabsBC
    xor a : sub l : ld l,a
    sbc a,a : sub h : ld h,a
.negabsBC:
    bit 7,b
    jr nz,$+8 ;already negative: skip the six negation bytes
    xor a : sub c : ld c,a
    sbc a,a : sub b : ld b,a
    ex de,hl
    xor a
    ld h,a
    ld l,a
    ld a,15
.Div_Loop_1:
    rl e : rl d
    adc hl,hl
    add hl,bc
    jr c,$+4
    sbc hl,bc
    dec a
    jr nz,.Div_Loop_1
    ex de,hl
    adc hl,hl ;bring in the last quotient bit
    pop af : ret p
    xor a : sub l : ld l,a
    sbc a,a : sub h : ld h,a
    ret
;;
;;
;Inputs:
;   HL is the numerator
;   C<128 is the denominator
;Outputs:
;   A is twice the remainder of the unrounded value
;   B is 0
;   C is not changed
;   DE is not changed
;   HL is the rounded quotient
;   c flag set means no rounding was performed
;          reset means the value was rounded
HL_Div_C_round:
    ld b,16
    xor a
    add hl,hl
    rla
    cp c
    jr c,$+4
    inc l
    sub c
    djnz $-7
    add a,a
    cp c
    ret c
    inc hl
    ret
;;
;;
;I'm not positive on the timing.
;min: 203
;max: 308
;avg: 236.125
;HL/B: quotient bits are collected in L, the running remainder is in A.
;The .div16_8_1_n entries handle a remainder that fits in 8 bits; the
;.div16_8_2_n entries are taken when the shift carried out of A.
;NOTE(review): outputs are not stated in the original header; from the code
;the quotient byte ends up in L and the remainder in A -- confirm.
HL_Div_B:
    add hl,hl
    ld a,h
    jr c,.div16_8_2_0
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_1
.div16_8_1_1:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_2
.div16_8_1_2:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_3
.div16_8_1_3:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_4
.div16_8_1_4:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_5
.div16_8_1_5:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_6
.div16_8_1_6:
    cp b
    jr c,$+4
    sub b : inc l
    sla l : rla
    jr c,.div16_8_2_7
.div16_8_1_7:
    cp b : ret c : sub b : inc l
    ret
.div16_8_2_0:
    sub b : rl l : rla : jr nc,.div16_8_1_1
.div16_8_2_1:
    sub b : rl l : rla : jr nc,.div16_8_1_2
.div16_8_2_2:
    sub b : rl l : rla : jr nc,.div16_8_1_3
.div16_8_2_3:
    sub b : rl l : rla : jr nc,.div16_8_1_4
.div16_8_2_4:
    sub b : rl l : rla : jr nc,.div16_8_1_5
.div16_8_2_5:
    sub b : rl l : rla : jr nc,.div16_8_1_6
.div16_8_2_6:
    sub b : rl l : rla : jr nc,.div16_8_1_7
.div16_8_2_7:
    sub b : inc l
    ret
;;
;;
HL_Div_384: ;223cc
;(HL+HL*5*17*2)/256
;NOTE(review): the original HL is pushed and popped into DE but DE is not used
;afterwards, and the outputs are not documented -- this routine looks
;unfinished; confirm before relying on it.
    push hl
    ld b,h
    ld c,l
    xor a
    add hl,hl : rl a
    add hl,hl : rl a
    add hl,bc : adc a,0
    ld d,a
    ld b,h
    ld c,l
    add hl,hl : rl a
    add hl,hl : rl a
    add hl,hl : rl a
    add hl,hl : rl a
    add hl,bc : adc a,d
    add hl,hl : rla
    pop de
    add hl,hl : rl a
    sla l
    adc a,0
    ret
;;
;;
;;270cc or 280cc
;NOTE(review): several register loads below are overwritten before they are
;read (e.g. "ld d,a" directly followed by "ld d,h"); the routine is kept
;exactly as found -- verify its results before use.
HL_Div_7_round:
    xor a
    ld d,h
    ld e,l
    ld b,a
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,b
    ld d,h
    ld e,l
    ld c,a
    add hl,hl : rla
    add hl,hl : rla
    ld d,h
    ld e,l
    ld c,a
    add hl,hl : rla
    add hl,hl : rla
    ld d,a
    ld d,h
    ld e,l
    ld c,a
    ld l,a
    ld h,b
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,de
    adc a,b
    sla l
    ld l,h
    ld h,a
    ret nc
    inc hl
    ret
;AH/16
;;
;;
;210cc or 220cc
HL_Div_5_round:
    xor a
    ld d,h : ld e,l : ld b,a
    add hl,hl : rla
    add hl,de : adc a,b
    ld d,h : ld e,l : ld c,a
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,c
    ld d,a : ld e,h
    add hl,de : adc a,b
    ld d,a : ld e,h
    add a,l
    ex de,hl
    rla : rla : and 3 : rra
    adc a,b
    add a,l
    ld l,a
    ret nc
    inc h
    ret
;;
;;
;HL/5
;HL/4+HL*3*17*257
;234cc to 245cc
HL_Div_5:
    xor a
    ld b,h
    ld c,l
    ld d,a
    add hl,hl : rla
    add hl,bc : adc a,d ;3
    add hl,hl : rla ;6
    add hl,hl : rla ;12
    add hl,hl : rla ;24
    add hl,bc : adc a,d ;25
    add hl,hl : rla ;50
    add hl,bc : adc a,d ;51
;AHL0+AHL+BC/2
;AHL*257/256 =AHL+A
    srl b : rr c
    srl b : rr c
    ld d,a
    ld a,b
    add a,l
    ld b,a
    ld e,h
    jr nc,$+3
    inc de
    add hl,bc
    ld a,d
    add a,e
    ld e,a
    ret nc
    inc d
    ret
;;
;;
;HL/3, rounded --> HL
;Computes A:HL = HL*85 (x5, then x17), adds A:HL/256 to it (x257/256) and
;returns the top 16 bits of the 24-bit value, incremented when the top bit of
;the discarded low byte is set. 85*257/65536 = 21845/65536 ~ 1/3.
;The carry out of the x5 step is added into A ("adc a,0"); without it the
;result is wrong as soon as HL*5 no longer fits in 16 bits (HL >= 13108).
;Destroys: A, B, DE
;about 7cc more than the originally quoted 205cc or 215cc (one extra adc a,0)
HL_Div_3_round:
    xor a : ld d,h : ld e,l
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,0 ;A:HL = 5*HL
    ld d,h : ld e,l : ld b,a
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,b ;A:HL = 85*HL
    ld d,h : ld e,l : ld b,a
    ld d,a : ld e,h : add hl,de ;A:HL += A:HL/256
    adc a,0
    sla l ;rounding bit to carry
    ld l,h
    ld h,a
    ret nc
    inc hl
    ret
;;
;;
;HL/3 --> DE
;209cc to 219cc
;NOTE(review): the label HL_Div_3 is defined twice in this file (here and
;directly below); only one of the two variants can be assembled at a time.
;NOTE(review): the final "ex de,hl" moves the result into HL, although the
;header above says DE -- confirm which is intended.
HL_Div_3:
    xor a
    ld b,a
    ld d,h
    ld e,l
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,b
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,b
    add hl,hl : rla
    add hl,hl : rla
    add hl,de : adc a,b
;AHL+(AHL+(DE>>1))/256
    srl d : rr e
;AHL+(AHL+DE)/256
;AH.L+A.HL+.DE
    ld b,h
    ld c,l
;AB.C+A.HL+.DE
    add hl,de
;AB.C+A.HL+carry
    ld d,a
;DB.C+A.H+carry
    adc a,b
    ld e,a
    jr nc,$+3
    inc d
;DE.C+0.H+carry
    ld a,h
    add a,c
    ex de,hl
    ret nc
    inc hl
    ret
;;
;;
;Input: HL
;Output: HL is the input divided by 3
;Destroys: B,C,E,A
;217cc
HL_Div_3:
;increment HL, putting overflow in A
    ld bc,1
    ld a,b
    add hl,bc
    adc a,b
;We want a difference of a factor of 2 shifts
    ld b,h
    ld c,l
    ld e,a
    add hl,hl : rla
    add hl,hl : rla
    add hl,bc : adc a,e
;We want a difference of a factor of 4 shifts
    ld b,h
    ld c,l
    ld e,a
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,hl : rla
    add hl,bc : adc a,e
    ld b,a
    ld c,h
    add hl,bc
    adc a,0
    ld l,h
    ld h,a
;now HL is our result
    ret
;;
;;
;1360+24({0,3+{0,3}})
;min:
;     1360cc   (continues the ';min:' timing line that precedes this point)
;max: 1504cc
;avg: 1414cc
;17 bytes
;EHL/D: 24-bit restoring division, quotient bits are set in L as EHL shifts
;left, the remainder is built in A.
;NOTE(review): outputs are not stated in the original header; from the code
;EHL holds the quotient, A the remainder and B is 0 on exit -- confirm.
EHL_Div_D:
    xor a
    ld b,24
.loop:
    add hl,hl
    rl e
    rla
    jr c,$+5 ;if D is guaranteed <129, can omit this
    cp d
    jr c,$+4
    sub d
    inc l
    djnz .loop
    ret
;;
;;
;Inputs:
;   DE,BC are 8.8 Fixed Point numbers
;Outputs:
;   HL is the 8.8 Fixed Point result (rounded to the least significant bit)
;   On overflow (DE = 0, or a quotient that does not fit) HL is #7FFF for a
;   positive result and #8001 for a negative one.
;if DE is 0 : 122cc or 136cc if BC is negative
;if |BC|>=128*|DE| : 152cc or 166cc if BC is negative
;Otherwise:
;min: 1164cc
;max: 1377cc
;avg: 1258.5cc
;The helper labels .div_fixed88_overflow and .div_fixed88_sub are local to
;this routine and are referenced with the same local names throughout.
BC_Div_DE_88:
; First, find out if the output is positive or negative
    ld a,b
    xor d
    push af ;sign bit is the result sign bit
; Now make sure the inputs are positive
    xor d ;A now has the value of B, since I XORed it with D twice (cancelling)
    jp p,.skip1 ;if Positive, don't negate
    xor a
    sub c
    ld c,a
    sbc a,a
    sub b
    ld b,a
.skip1:
;now make DE negative to optimize the remainder comparison
    ld a,d
    or d
    jp m,.skip2
    xor a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
.skip2:
;if DE is 0, we can call it an overflow
;A is the current value of D
    or e
    jr z,.div_fixed88_overflow
;The accumulator gets set to B if no overflow.
;We can use H=0 to save a few cc in the meantime
    ld h,0
;if B+DE>=0, then we'll have overflow
    ld a,b
    add a,e
    ld a,d
    adc a,h
    jr c,.div_fixed88_overflow
;Now we can load the accumulator/remainder with B
;H is already 0
    ld l,b
    ld a,c
    call .div_fixed88_sub
    ld c,a
    ld a,b ;A is now 0
    call .div_fixed88_sub
; if 2HL+DE>=0, increment result to round.
    add hl,hl
    add hl,de
    ld h,c
    ld l,a
    jr nc,$+3
    inc hl
;Now check if H is overflowed
    bit 7,h
    jr nz,.div_fixed88_overflow
    pop af
    ret p
    xor a
    sub l
    ld l,a
    sbc a,a
    sub h
    ld h,a
    ret
.div_fixed88_overflow:
    ld hl,#7FFF
    pop af
    ret p
    inc hl ;#8000
    inc l ;#8001
    ret
.div_fixed88_sub:
;min: 456cc
;max: 536cc
;avg: 496cc
;8 division steps: numerator bits leave A at the top while quotient bits
;enter at the bottom; the trailing adc a,a brings in the last quotient bit.
    ld b,8
.loop:
    rla
    adc hl,hl
    add hl,de
    jr c,$+4
    sbc hl,de
    djnz .loop
    adc a,a
    ret
;;
;;
;HLIX/BC -> HLIX remainder DE
;174+4*.sub8
;min: 2186cc
;max: 2794cc
;avg: 2466cc
;61 bytes
div32_16:
    ex de,hl ; 4
; Negate BC to allow add instead of sbc
    xor a ; 4
; Need to set HL to 0 anyways, so save 2cc and a byte
    ld h,a ; 4
    ld l,a ; 4
    sub c ; 4
    ld c,a ; 4
    sbc a,a ; 4
    sub b ; 4
    ld b,a ; 4
    ld a,d ; 4
    call .sub8 ; 17
    rla ; 4
    ld d,a ; 4
    ld a,e ; 4
    call .sub8 ; 17
    rla ; 4
    ld e,a ; 4
    ld a,ixh ; 8
    call .sub8 ; 17
    rla ; 4
    ld ixh,a ; 8
    ld a,ixl ; 8
    call .sub8 ; 17
    rla ; 4
    ld ixl,a ; 8
    ex de,hl ; 4
    ret ; 10
.sub8:
;119+8*.sub
;min: 503cc
;max: 655cc
;avg: 573cc
;the nested calls run .sub eight times
    call .iter1
.iter1:
;17+2(17+2(.sub)))
    call .iter2
.iter2:
;17+2(.sub)
    call .sub
.sub:
;48+{8,0+{0,19}}
;min: 48cc
;max: 67cc
;avg: 56.75cc
    rla ; 4
    adc hl,hl ; 15
    jr c,.skip ;12/7
    add hl,bc ; 11
    ret c ;11/5
    sbc hl,bc ; 15
    ret ; 10
.skip:
    add hl,bc ; 11
    scf ; 4
    ret ; 10
;;
;;
;HL/9 --> A, HL<2304
div9:
    inc hl
    ld d,h
    ld e,l
    add hl,hl
    add hl,de
    add hl,hl
    add hl,de
    ld e,0
    ld d,l
    ld a,h
    add hl,hl
    add hl,hl
    add hl,de
    adc a,e
    add hl,hl
    rla
    add hl,hl
    rla
    ret
;;
;;
;Made by Zeda Thomas, use it for whatever, and please optimize this!
;Slight Warning: This passed a handful of tests, but if you find a bug,
;please report it. I still actively maintain these (as of January 2020).
;Inputs:
;   HLIX/BCDE
;Outputs:
;   HLIX is the quotient
;   BCDE is the remainder
;RAM:
;   uses 8 bytes of RAM:
;     4 bytes at temp32_0
;     4 bytes at temp32_1
;
;NOTE(review): the code refers to the first buffer as the local name
;  .temp32_0 and to the second as the global name temp32_1; confirm that both
;  resolve to the intended RAM in the build.
;
;min: 5240cc
;max: 6264cc
;avg: 5752cc
;113 bytes
div_32_32:
; Back up HLIX
    ld (.temp32_0),ix
    ld (.temp32_0+2),hl
;negate BCDE
    xor a
    ld l,a : sbc a,e : ld e,a
    ld a,l : sbc a,d : ld d,a
    ld a,l : sbc a,c : ld c,a
    ld a,l : sbc a,b : ld b,a
    ld a,h ;top byte of the numerator
;set HLIX to 0
    ld h,l
    ld ix,0
;one numerator byte per call, most significant first; the quotient byte
;replaces the numerator byte in the buffer
    call .sub
    ld (.temp32_0+3),a
    ld a,(.temp32_0+2)
    call .sub
    ld (.temp32_0+2),a
    ld a,(.temp32_0+1)
    call .sub
    ld (.temp32_0+1),a
    ld a,(.temp32_0+0)
    call .sub
    ld (.temp32_0),a
;remainder HLIX --> BCDE, quotient --> HLIX
    push ix
    pop de
    ld b,h
    ld c,l
    ld ix,(.temp32_0)
    ld hl,(.temp32_0+2)
    ret
.sub:
;min: 1223cc
;max: 1479cc
;avg: 1351cc
;the nested calls run .iter3 eight times
    call .iter1
.iter1:
    call .iter2
.iter2:
    call .iter3
.iter3:
;min: 138cc
;max: 170cc
;avg: 154cc
;HLIX*2
    add ix,ix
    adc hl,hl
;rotate in the bit
    add a,a
    jr nc,.skip1
    inc ix
.skip1:
;save HLIX in case we need to restore
    ld (temp32_1),ix
    ld (temp32_1+2),hl
;check if HLIX>=-BCDE
; ==> HLIX+BCDE >= 0
    add ix,de
    adc hl,bc
    jr c,.skip2
;we need to restore
    ld ix,(temp32_1)
    ld hl,(temp32_1+2)
    ret
.skip2:
    inc a
    ret
;;
;;
;Created by calc84maniac
;NOTE from Zeda: C should <=128, the original forgot to mention this.
;Inputs: dehl=32-bit dividend (DE is the high word, HL the low word), c<=128 is the divisor
;Outputs: dehl=32-bit quotient, a=remainder, c=unchanged, b=0
;min: 1936cc
;max: 2032cc
;avg: 1984cc
;Size: 17 bytes
DEHL_Div_C:
.div32bit:
    ld b,32
    xor a
.divloop:
    add hl,hl
    rl e
    rl d
    rla
    cp c
    jr c,.divlbl
    inc l
    sub c
.divlbl:
    djnz .divloop
    ret
;;
;;
;Inputs:
;   DEHL
;Outputs:
;   DEHL is the quotient
;   A is the remainder
;   B is 0
;   C is -10 (0F6h)
;1300cc~1329cc
;49 bytes
;The first three bits of D cannot produce a quotient bit (the remainder is
;still below 8), so they are shifted in without a test. Each tested step
;leaves its quotient bit in the carry for the next rl, so the quotient is one
;bit behind until the final adc/rl/rl.
DEHL_Div_10_v1:
    xor a
    ld bc,05F6h ;B = 5 remaining steps for D, C = -10
    rl d : rla
    rl d : rla
    rl d : rla
    rl d : rla : add a,c : jr c,$+3 : sub c : djnz $-7
    ld b,8
    rl e : rla : add a,c : jr c,$+3 : sub c : djnz $-7
    ld b,8
    rl h : rla : add a,c : jr c,$+3 : sub c : djnz $-7
    ld b,8
    rl l : rla : add a,c : jr c,$+3 : sub c : djnz $-7
    adc hl,hl
    rl e
    rl d
    ret
;;
;;
;Inputs:
;   DEHL
;Outputs:
;   DEHL is the quotient
;   A is the remainder
;   B is the remainder
;   C is 10
;about 7cc more than the originally quoted 912cc~941cc (one extra and)
;Fully unrolled. Each step leaves the INVERTED quotient bit in the carry
;(carry set <=> remainder < 10), which the next rl shifts in, so at the end
;the 32 bits are inverted and one position behind. The epilogue therefore
;rotates every byte LEFT through the carry (bringing in the last bit) and
;complements it. The top three bits of D come from the three untested lead-in
;shifts (they were shifted in as 0 and would be complemented to 1), so they
;are masked off; the true quotient bits there are always 0.
DEHL_Div_10_v2:
    xor a
    ld c,10
    rl d : rla
    rl d : rla
    rl d : rla
    rl d : rla : sub c : jr nc,$+3 : add a,c
    rl d : rla : sub c : jr nc,$+3 : add a,c
    rl d : rla : sub c : jr nc,$+3 : add a,c
    rl d : rla : sub c : jr nc,$+3 : add a,c
    rl d : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl e : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl h : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    rl l : rla : sub c : jr nc,$+3 : add a,c
    ld b,a ;save the remainder
    ld a,l : rla : cpl : ld l,a ;cpl and ld leave the carry alone
    ld a,h : rla : cpl : ld h,a
    ld a,e : rla : cpl : ld e,a
    ld a,d : rla : cpl : and %00011111 : ld d,a
    ld a,b
    ret
;;
;;
;Inputs:
;   C is the numerator
;   D is the denominator
;Outputs:
;   A is the remainder
;   B is 0
;   C is the result of C/D
;   D,E,H,L are not changed
C_Div_D:
    ld b,8
    xor a
.loop:
    sla c
    rla
    cp d
    jr c,.skip1
    inc c
    sub d
.skip1:
    djnz .loop
    ret
;;
;;
;Input:
;   HL points to the bignum (1 byte size prefix (0 -> 1 byte, 1 -> 2 bytes, n-1 -> n bytes), n subsequent bytes)
;   The bytes are processed in increasing address order with the remainder
;   carried forward, i.e. the first byte after the prefix is the most
;   significant one.
;Output:
;   bignum is divided in-place, not renormalized
;   A is the remainder
;   BC is 100
;   HL points to the last byte of the bignum
;Destroys: D, E
bignum_div_100:
    ld c,100
bignum_div_C:
;Note: C<128
    ld b,(hl) ;number of bytes that follow the first one
    inc hl
    ld a,(hl)
;first byte: divide by repeated subtraction, counting in D so that the
;pointer in HL stays intact
    ld d,-1
    inc d : sub c : jr nc,$-2
    add a,c ;undo the last subtraction: A = remainder
    ld (hl),d ;store the quotient byte
    inc b
    dec b
    ret z ;single-byte bignum
.loop:
    inc hl
    ld e,(hl)
;8 restoring division steps: E shifts out numerator bits and collects
;quotient bits, A is the running remainder (< C < 128, so it never overflows)
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    sla e : rla : cp c : jr c,$+4 : sub a,c : inc e
    ld (hl),e ;store the quotient byte, keep the remainder in A
    djnz .loop
    ret
;;
;;
;BC/DE ==> BC, remainder in HL
BC_Div_DE:
    ld hl,0
    ld a,b
    ld b,16
.loop:
;shift the bits from BC into HL
    sla c
    rla
    adc hl,hl
    sbc hl,de
    jr nc,.inc_acc
    add hl,de
    db #FE ;this begins the instruction `cp *`, so it eats the next byte.
.inc_acc:
    inc c
    djnz .loop
    ld b,a
    ret
;;
;;
BC_Div_DE_faster:
;BC/DE ==> BC, remainder in HL
;NOTE: BC/0 returns 0 as the quotient.
;min: 738cc
;max: 898cc
;avg: 818cc
;144 bytes
;(body of BC_Div_DE_faster) DE is negated so that each step can test
;"remainder >= DE" with an add. Every step leaves its quotient bit in the
;carry for the next rla; the extra rla after each group of 8 steps brings in
;the last bit and discards the bit that entered first.
    xor a
    ld h,a
    ld l,a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ld a,b
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla
    ld b,a
    ld a,c
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla
    ld c,a
    ret
;;
;;
;BC/DE ==> BC, remainder in HL
;NOTE: BC/0 returns 0 as the quotient.
;min: 773cc
;max: 933cc
;avg: 853cc
;82 bytes
;Same algorithm as BC_Div_DE_faster, but the 8-step group exists only once:
;it is called for the high byte and then fallen into for the low byte.
BC_Div_DE_fast:
    xor a
    ld h,a
    ld l,a
    sub e
    ld e,a
    sbc a,a
    sub d
    ld d,a
    ld a,b
    ld b,c ;keep the low numerator byte in B during the first pass
    call .sub
    ld a,b ;low numerator byte
    ld b,c ;B = high quotient byte
.sub:
;min: 354cc
;max: 434cc
;avg: 394cc
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla : adc hl,hl : add hl,de : jr c,$+4 : sbc hl,de
    rla
    ld c,a
    ret
;;
;;
;BC/DE ==> BC, remainder in HL
;Variant that divides the high byte with 8-bit arithmetic when D is 0.
;Each step leaves the INVERTED quotient bit in the carry (carry set <=> the
;subtraction was undone), and the next rotate shifts it in. After 8 steps the
;byte is therefore one bit behind, so a final rla is needed before the cpl:
;it brings in the last bit and discards the bit that entered first.
;NOTE(review): this label is also defined earlier in the file; only one
;  BC_Div_DE variant can be assembled at a time.
;NOTE(review): in .smalldiv the 8-bit remainder is shifted with rla and the
;  carry out of A is not examined, so E above 128 may not be handled --
;  confirm the intended range.
BC_Div_DE:
    ld hl,0
    inc d
    dec d
    jr z,.smalldiv
;D is non-zero: the high quotient byte is 0 and B is the starting remainder
    ld l,b
    ld b,h
.nextpart:
    ld a,c
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla : adc hl,hl : sbc hl,de : jr nc,$+3 : add hl,de
    rla ;bring in the last (inverted) quotient bit
    cpl
    ld c,a
    ret
.smalldiv:
    xor a
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    rl b : rla : sub e : jr nc,$+3 : add a,e
    ld l,a ;remainder of the high byte (ld leaves the carry alone)
    ld a,b
    rla ;bring in the last (inverted) quotient bit
    cpl
    ld b,a
    jp .nextpart
;;
;;
;Divides a 48-bit integer by 100, where A holds the upper 8 bits and L holds the next 8, followed by DE and IX
;Result is in HLDEIX, A is the remainder
;The three 16-bit quotient words are produced from high to low; the first two
;are kept on the stack and the last one is in HL when the final shuffle runs.
ALDEIX_div_100:
    ld c,100
ALDEIX_Div_C:
;Note: C<128
    call AL_Div_C
    push hl ;high quotient word
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    call AL_Div_C.rotate
    push hl ;middle quotient word
    push ix
    pop de
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    call AL_Div_C.rotate ;HL = low quotient word
    pop de ;DE = middle quotient word
    ex (sp),hl ;HL = high quotient word, low word goes to the stack
    pop ix ;IX = low quotient word
    ret
;;
;;
;Divides a 32-bit integer by 100, where A holds the upper 8 bits and L holds the next 8, followed by DE
;Result is in DEHL, A is the remainder
ALDE_div_100:
    ld c,100
ALDE_Div_C:
;Note: C<128
    call AL_Div_C
    push hl
    ld l,d
    call AL_Div_C.rotate
    ld h,l
    ld l,e
    pop de ;DE = high quotient word
    jp AL_Div_C.rotate ;tail call: divides the last byte and returns
;;
;;
AL_div_100:
;Divides a 16-bit integer by 100, where A holds the upper 8 bits and L holds the lower
;Result is in HL, A is the remainder
;min:256
;max:329
;avg:305.5625cc
    ld c,100
AL_Div_C:
;Note: C<128
;high byte: repeated subtraction, H counts the quotient
    ld h,-1
    inc h : sub c : jr nc,$-2
    add a,c
.rotate:
;divides the byte in L with A as the incoming remainder (8 restoring steps);
;only A and L are changed, so the callers above can keep results in H, DE, IX
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : jr c,$+4 : sub a,c : inc l
    sla l : rla : cp c : ret c : sub a,c : inc l
    ret
;;
;;
HL_mod_3:
;destroys HL, returns HL mod 3 in A
;112+{0,2} + {0,8} + {0,1}
;min: 112
;max: 123
;avg: 117.5
; HL mod 3 == (H*256+L) mod 3 == (H*1+L) mod 3 == (H+L) mod 3
;So add the upper and lower byte
    ld a,h
    add a,l
;If adding caused an overflow, well add (256 mod 3) == 1 to A.
    adc a,0
;We don't need to worry about overflow here :)
;falls through into A_mod_3
;;
;destroys HL, returns A mod 3 in A
;97+{0,2} + {0,8} + {0,1}
;min: 97
;max: 108
;avg: 102.5
;A mod 3 is equal to adding the upper and lower nibble of A mod 3
;For example, if A=16u+l, then A mod 3 == 16u+l mod 3 == u+l
A_mod_3:
;So add the upper and lower nibble
    ld l,a ;save a copy of a
    add a,a
    add a,a
    add a,a
    add a,a
    add a,l
; If there was overflow, again, add 1. However, our number is shifted up by 4,
; so we need to add 1<<4 == 16
    jr nc,$+4
    add a,16
; Now our number is in the upper 4 bits of A. We need to add the top 2 bits to
; the preceding 2 bits
    ld l,a
    add a,a
    add a,a
; Note that now we might have some garbage bits in the middle 4 bits of A,
; overlapping two garbage bits in L. We'll need to clear out bits to avoid
; issues. It is convenient to use a mask of %11000000
    ld h,%11000000
    and h
    add a,l
;Now if there was overflow, add 1<<6 == #40. H "happens" to be -#40, so we can
;do this by subtracting h
    jr nc,$+3
    sub h
;Now finally, mask out all but those upper two bits
    and h
; At this point, we can stop if we only need to test divisibility
; If the parity is even, then we have to do (0 mod 3) or (3 mod 3), both of
; which are 0, indicating divisibility by 3. If we have odd parity, then the
; upper two bits are 10 or 01, both of which are not 0 mod 3.
; basically, pe==divisible, po==not divisible.
;
; But, to get full modulo, shift those upper two bits into the lower two bits
    rlca
    rlca
    ret po ;the parity flag is still the one set by "and h"
; And make sure to set A to 0 if it was 0 or 3 :)
    xor a
    ret
;;
;;
;Inputs: HL
;Outputs: pe if HL was divisible by 3, else po.
;Destroys: HL
;103+{0,2}+{0,1}
;min: 103
;max: 106
;avg: 104.5
HL_divisible_by_3:
    ld a,h
    add a,l
    adc a,0
;falls through into A_divisible_by_3
;;
;Inputs: A
;Outputs: pe if A was divisible by 3, po if A was not divisible by 3
;Destroys: HL
;88+{0,2}+{0,1}
;min: 88
;max: 91
;avg: 89.5
A_divisible_by_3:
    ld h,#C0
    ld l,a ;save a copy of a
    add a,a
    add a,a
    add a,a
    add a,a
    add a,l
    jr nc,$+4
    add a,16
    ld l,a
    add a,a
    add a,a
    and h
    add a,l
    jr nc,$+3
    sub h
    and h
    ret
;;
;;
;;
;
;[section banner -- the original text is mis-encoded]
;
;Input: H.L needs to be on (0,128.0)
;Output: H.L if c flag set
;   returns nc if input is negative (HL not modified)
;Error:
;   The error on the outputs is as follows:
;       20592 inputs are exact
;       12075 inputs are off by 1/256
;       100 inputs are off by 2/256
;   So all 32767 inputs are within 2/256, with average error being <1/683 which is smaller than 1/256.
;Size: 177 bytes
;Speed: average speed is less than 1250 t-states
;Structure, as visible in the code below:
;  - HL = 0 returns with H = 80h.
;  - H.L = 2.0 exactly returns L = 177 (H is 0 at that point).
;  - 1 <= H.L < 2 is evaluated in-line: an 8-step division followed by an
;    8-step multiply by BC (the input minus 1).
;  - every other input is shifted into the in-line range, counting the shifts
;    in DE (.normalizeln / .toosmall); lognat is then called recursively and
;    DE*355/2 is added to the result.
lognat:
    ld a,h : or l : jr nz,$+5
    ld h,80h : ret
    dec h
    dec h
    jr nz,$+9 ;H was not 2: continue at the "inc h" below
    inc l : dec l
    jr nz,.normalizeln
    ld l,177
    ret
    inc h
    jr nz,.normalizeln_2 ;H was not 1
;here H is 0 and L is the fraction
    ld b,h
    ld c,l
    ld e,l
    ld d,8
    add hl,hl
    add hl,hl
    add hl,de
    ex de,hl
;call .HL_Div_DE
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
    add hl,hl : sbc hl,de : adc a,a
;multiply by BC; the bits collected in A are used with inverted sense
;(an add happens when the shifted-out bit is 0)
    ld h,a : ld l,b
    sla h : jr c,$+3 : ld l,c
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    add hl,hl : jr c,$+3 : add hl,bc
    rl l
    ld a,h
    adc a,b
    ld h,b
    ld l,a
    scf
    ret
; .HL_Div_DE:
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : jr nc,$+3 : add hl,de : adc a,a
;    add hl,hl : sbc hl,de : adc a,a : ret
.normalizeln:
    inc h
.normalizeln_2:
    xor a
    inc h : ret m ;returns with carry clear (from the xor a)
    ld d,a : ld e,a
    ld a,l
    jr z,.toosmall
;input is 2.0 or more: shift right until H is 0, counting in E
    inc e : srl h : rra : jr nz,$-4
    rla : rl h
    dec e
.stepin:
    ld l,a
    push de
    call lognat
    pop de
;now multiply DE by 355, then divide by 2 (rounding)
    ld b,d : ld c,e : ld a,d
    ex de,hl
    add hl,hl
    add hl,hl ;4
    add hl,bc ;5
    add hl,hl ;10
    add hl,bc ;11
    add hl,hl ;22
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,bc
    add hl,hl
    add hl,bc
    sra h : rr l
    adc hl,de
    scf
    ret
.toosmall:
;input is below 1.0: shift left until the top bit of A falls out, counting down in DE
    dec d
    dec e : add a,a : jr nc,$-2
    inc h
    jp .stepin
;;
;;
;Input: HL is a fixed point number
;Output: lg(H.L)->H.L
;Speed: Avg: 340
;Details:
;   The input is reduced to odd*2^k; the odd factor selects a table entry and
;   the shift count is added to the integer part of that entry.
;   When H is non-zero the value is first shifted right until it fits in 8 bits,
;   so the bits shifted out of L are dropped.
;   Returns immediately with HL unchanged when bit 7 of H is set.
;   Returns HL = 8000h when the reduced value is 0.
;Destroys: A, BC, DE
lg_88:
    ld de,.LUT
    ld b,0                      ;B counts shifts
    ld a,h
    or a
    ret m                       ;bit 7 of H set: nothing is computed
    ld a,l
    jr z,.no_integer_part       ;Z still reflects H (`ld` leaves the flags alone)
.drop_integer_bits:
    inc b
    srl h
    rra                         ;shift H:A right by one
    jr nz,.drop_integer_bits    ;`rra` leaves Z alone, so this tests H
.no_integer_part:
    or a
    jr nz,.find_lowest_set_bit
    ld hl,8000h                 ;nothing left to take a logarithm of
    ret
.find_lowest_set_bit:
    rra
    inc b
    jr nc,.find_lowest_set_bit  ;stop once the lowest set bit has been shifted out
;A is the element to look up in the LUT
    ld l,a
    ld c,h                      ;H = 0 here, so BC = B*256
    dec b
    add hl,hl                   ;two bytes per table entry
    add hl,de
    ld e,(hl)
    inc hl
    ld d,(hl)
    ex de,hl                    ;HL = table entry
    add hl,bc                   ;add the shift count to the integer part
    ret
;128 signed 8.8 entries; entry i appears to hold lg((2i+1)/256)
;(entry 0 = F800h = -8.0, entry 1 = F996h ~ lg(3/256)).
;        0      1      2      3      4      5      6      7      8      9
.LUT:
    DW #F800, #F996, #FA52, #FACF, #FB2C, #FB76, #FBB3, #FBE8, #FC16, #FC3F ; 0
    DW #FC64, #FC86, #FCA5, #FCC1, #FCDC, #FCF4, #FD0B, #FD21, #FD36, #FD49 ; 1
    DW #FD5C, #FD6D, #FD7E, #FD8E, #FD9D, #FDAC, #FDBA, #FDC8, #FDD5, #FDE2 ; 2
    DW #FDEE, #FDFA, #FE06, #FE11, #FE1C, #FE26, #FE31, #FE3B, #FE44, #FE4E ; 3
    DW #FE57, #FE60, #FE69, #FE71, #FE7A, #FE82, #FE8A, #FE92, #FE9A, #FEA1 ; 4
    DW #FEA9, #FEB0, #FEB7, #FEBE, #FEC5, #FECB, #FED2, #FED8, #FEDF, #FEE5 ; 5
    DW #FEEB, #FEF1, #FEF7, #FEFD, #FF03, #FF09, #FF0E, #FF14, #FF19, #FF1E ; 6
    DW #FF24, #FF29, #FF2E, #FF33, #FF38, #FF3D, #FF42, #FF47, #FF4B, #FF50 ; 7
    DW #FF55, #FF59, #FF5E, #FF62, #FF67, #FF6B, #FF6F, #FF74, #FF78, #FF7C ; 8
    DW #FF80, #FF84, #FF88, #FF8C, #FF90, #FF94, #FF98, #FF9B, #FF9F, #FFA3 ; 9
    DW #FFA7, #FFAA, #FFAE, #FFB2, #FFB5, #FFB9, #FFBC, #FFC0, #FFC3, #FFC6 ; 10
    DW #FFCA, #FFCD, #FFD0, #FFD4, #FFD7, #FFDA, #FFDD, #FFE0, #FFE4, #FFE7 ; 11
    DW #FFEA, #FFED, #FFF0, #FFF3, #FFF6, #FFF9, #FFFC, #FFFF               ; 12
;;
;;
;Inputs:
;     HL is an unsigned 8.8 fixed point number.
;Outputs:
;     HL is the signed 8.8 fixed point value of log base 2 of the input.
;Example:
;     pass HL = 3.0, returns 1.58203125 (actual is ~1.584962501...)
;averages about 39 t-states slower than original
;62 bytes
;Method:
;   The input is normalised to a mantissa 1.0 <= DE < 2.0 while L collects the
;   integer part of the logarithm. The mantissa is then squared eight times;
;   each time the square reaches 2.0 the next fraction bit is 1 and the
;   mantissa is halved.
;   An input of 0 returns HL = FFFFh.
;Destroys: A, BC, DE
Log_2_88_size:
    ex de,hl
    ld hl,0
    ld a,d
    ld c,8                      ;eight fraction bits to produce
    or a
    jr z,.DE_lessthan_1
.halve:
    srl d
    jr z,.normalized            ;D was 1: DE is already 1.x (E is left untouched)
    inc l                       ;one more integer bit in the result
    rr e
    jr .halve
.DE_lessthan_1:
    ld a,e
    dec hl                      ;HL = FFFFh
    or a
    ret z                       ;input 0
    inc l                       ;HL = FF00h; L counts down below
.double:
    dec l
    add a,a
    jr nc,.double               ;until the leading 1 has been shifted out
    ld e,a
.normalized:
    inc d                       ;D = 1: DE = 1.E
.loop:
    add hl,hl                   ;make room for the next fraction bit
    push hl
    ld h,d
    ld l,e
    ld a,e
    ld b,8
;A:HL = DE*DE (valid because D = 1): shift E:D:E left 8 times, adding DE for
;each 1 shifted out of A.
.square:
    add hl,hl
    rla
    jr nc,.square_next
    add hl,de
    adc a,0
.square_next:
    djnz .square
    ld e,h
    ld d,a                      ;DE = top 16 bits of the square (8.8)
    pop hl
    rr a    ;this is NOT supposed to be rra, we need the z flag affected
    jr z,.next_bit              ;square is below 2.0: this fraction bit is 0
    srl d
    rr e                        ;halve the mantissa back into [1.0, 2.0)
    inc l                       ;this fraction bit is 1
.next_bit:
    dec c
    jr nz,.loop
    ret
;;
;;
;Input: HL is a fixed point number
;Output: ln(H.L)->H.L
;Speed: Avg: 340+(325 worst case)
;Method:
;   ln(x) = lg(x)*ln(2); ln(2) is taken as 355/512. The multiplication is done
;   on the magnitude, the 24-bit product is shifted right by 9 (truncating),
;   and the sign is restored afterwards.
;   NOTE(review): lg_88's early exits (bit 7 of H set on input, 8000h for a
;   zero input) are not detected here and get scaled like ordinary results.
;Destroys: A, BC, DE
ln_88_fixed:
    call lg_88
    ld de,0                     ;D = 0 for the `adc a,d` below, E = sign marker
    bit 7,h
    jr z,.magnitude
    dec e                       ;E = FFh marks a negative lg
    xor a : sub l : ld l,a
    sbc a,a : sub h : ld h,a    ;HL = -HL
.magnitude:
    ld b,h
    ld c,l
    xor a                       ;A:HL accumulates the 24-bit product
    add hl,hl                   ;2
    add hl,hl : rla             ;4
    add hl,bc : adc a,d         ;5
    add hl,hl : rla             ;10
    add hl,bc : adc a,d         ;11
    add hl,hl : rla             ;22
    add hl,hl : rla             ;44
    add hl,hl : rla             ;88
    add hl,hl : rla             ;176
    add hl,bc : adc a,d         ;177
    add hl,hl : rla             ;354
    add hl,bc : adc a,d         ;355
    sra a : rr h                ;A:H = product/512
    ld l,h
    ld h,a
    inc e
    ret nz                      ;E was 0: positive result
    xor a : sub l : ld l,a
    sbc a,a : sub h : ld h,a    ;negate the result
    ret
;;
;
; (section heading - the original text is mis-encoded in this file; the routine below computes 2^x)
;
;Inputs:
;     HL is the 8.8 fixed point number 'x' for 2^x
;Outputs:
;     DEHL is the 24.8 fixed point result. If there was overflow exceeding 2^24, then this value is set to the max.
;Requires:
;     FPSqrtHL (defined elsewhere; presumably an 8.8 square root, HL -> HL)
;Destroys: A, BC
power_2:
    ld a,l
    or a
    push hl                     ;save H for later, H is the integer part of the power
    ld hl,256                   ;2^0 = 1.0 in 8.8 (was 1, which is 1/256 and is inconsistent
                                ;with the 2*256 start value of the fractional path)
    jr z,.integer               ;no fraction bits
    scf                         ;set the carry flag so that a bit is rotated into a. This will act as our counter.
;Shift right until the lowest set fraction bit falls out; the bit rotated in
;above marks where the fraction ends.
.find_low_bit:
    rra
    jr nc,.find_low_bit
    ld hl,2*256                 ;2.0
;Process the remaining fraction bits from least to most significant:
;HL = sqrt(HL), doubled first when the bit is 1.
.loop:
    push af
    call FPSqrtHL               ;returns in HL
    pop af
    srl a
    jr z,.integer               ;only the marker bit was left
    jr nc,.loop
    add hl,hl
    jp .loop
.integer:
    pop bc
;Now b is the integer part for 2^x that we need to multiply HL by.
    ld de,0
    ld a,b
    or a
    ret z
.scale:
    add hl,hl
    rl e : rl d : jr c,.wayoverflow
    djnz .scale
    ret
.wayoverflow:
    ld hl,-1
    ld d,h
    ld e,l                      ;DEHL = FFFFFFFFh
    ret
;;
;;
;Written by Zeda
; Requires
;!TEST
;    mul16          ;BC*DE ==> DEHL
;    DEHL_Div_BC    ;DEHL/BC ==> DEHL
;"n choose r", defined as n!/(r!(n-r)!)
;Computes "HL choose DE"
;Inputs: HL,DE
;Outputs:
;   HL is the result
;       "HL choose DE"
;   carry flag reset means overflow
;   (when DE is larger than HL the result is 0 and the carry flag is set)
;Destroys:
;   A,BC,DE,IX
;Notes:
;   Overflow is returned as 0
;   Overflow happens if HL choose DE exceeds 65535
;   This algorithm is constructed in such a way that intermediate
;   operations won't erroneously trigger overflow.
;   With k = min(r, n-r) and m = n-k the result is built up as
;   C(m+i, i) = C(m+i-1, i-1)*m/i + C(m+i-1, i-1)   for i = 1..k
;66 bytes
ncr_HL_DE:
    ld bc,1
    or a
    sbc hl,de                   ;HL = n-r
    jr c,.oob                   ;r is larger than n: result 0 (carry stays set)
    jr z,.exit                  ;r = n: result 1
    sbc hl,de                   ;borrows exactly when n-r is smaller than r
    add hl,de                   ;HL = n-r again; carry out of the add mirrors that borrow
    jr c,.have_min
    ex de,hl                    ;make HL the smaller of r and n-r
.have_min:
    ld a,h
    or l                        ;Z when k = 0
    push hl
    pop ix                      ;IX = k, the number of factors
.exit:
    ld h,b
    ld l,c                      ;HL = 1
    scf
    ret z                       ;k = 0 (or r = n): the answer is 1
;Loop state: HL = running result, BC = i, DE = m
.loop:
    push bc
    push de
    push hl
    push bc
    ld b,h
    ld c,l
    call mul16                  ;BC*DE ==> DEHL
    pop bc                      ;BC = i
    call DEHL_Div_BC            ;result in DEHL
    ld a,d
    or e                        ;upper half non-zero: too big (flags survive the pops)
    pop bc                      ;BC = previous result
    pop de                      ;DE = m
    jr nz,.overflow
    add hl,bc
    jr c,.overflow
    pop bc                      ;BC = i
    inc bc
    ld a,b
    cp ixh
    jr c,.loop
    ld a,ixl
    cp c
    jr nc,.loop                 ;loop while i has not passed k
    ret                         ;carry is set here (the `cp c` borrowed)
.overflow:
    pop bc                      ;drop the saved i
    xor a                       ;carry reset = overflow
    ld b,a
.oob:
    ld h,b
    ld l,b                      ;HL = 0
    ret
;;
;;
;Inputs: DE,HL
;Outputs: c flag set if HL is not divisible by DE, else c flag is reset.
;         HL is 0 if true.
;         HL = 0 on input counts as divisible by any non-zero DE.
;Destroys: A, DE (DE loses the factors of 2 it shares with HL), HL
;See below for a note on the motivation and development of this algorithm.
;
;The earlier code loaded A from L once, before the loop, and never refreshed it
;after `sbc hl,de`, so the halving step shifted a stale low byte (21/7 and
;1337/7 both ended up with garbage in HL). It also subtracted before removing
;the factors of 2 from HL. This version follows steps 1] to 6] of the
;description below literally.
isDivisible:
;step 1: nothing is divisible by 0
    ld a,d
    or e
    scf
    ret z               ;remove these four if DE is always guaranteed non-zero
;step 2: remove the factors of 2 that HL and DE have in common
.common_twos:                   ;\
    ld a,e                      ; |
    or l                        ; |
    rra                         ; |carry = 1 as soon as either value is odd
    jr c,.divisor_parity        ; |
    srl d                       ; |
    rr e                        ; |
    srl h                       ; |
    rr l                        ; |
    jr .common_twos             ; |Remove these if DE is always guaranteed odd at input.
;step 3: an even divisor cannot divide an odd number
.divisor_parity:                ; |
    bit 0,e                     ; |
    scf                         ; |does not change Z
    ret z                       ;/
;zero is divisible by every non-zero divisor (and has no lowest set bit to find below)
    ld a,h
    or l                        ;resets carry
    ret z
;step 4: remove all factors of 2 from HL (HL is non-zero here)
.make_odd:
    bit 0,l
    jr nz,.subtract
    srl h
    rr l
    jr .make_odd
;step 5: subtract the odd divisor from the odd remainder
.subtract:
    or a
    sbc hl,de
    ret c               ;HL was on [1,DE-1]: not divisible
    ret z               ;reached 0: divisible, carry is reset
    jr .make_odd        ;step 6: odd-odd is even and non-zero, so back to step 4
;Motivation and Development
;   I often find myself in a situation where I need to find the factors of a number, but I have no technology around to aid me. This means I need to use... mental arithmetic!
;   I've been doing this for 15 years, so I have refined my mental process quite a bit.
;   It is still a trial division algorithm, but with a very obfuscated "division" technique.
;   We don't need to do 1131/7 to see if it is divisible by 7, we just need to see if 7 divides 1131 and this is what my algorithm does.
;   Interestingly, testing divisibility at the algorithmic level is a little faster than division. Not by much, but it is also non-negligible.
;The Algorithm
;   The core algorithm is designed around checking that (A mod B == 0) is true or false.
;   We also make the assumption that B is odd and by extension, non-zero.
;   The case where B is non-zero and even will be discussed later.
;
;   Since B is odd, 2 does not divide B. This means that if A is even:
;       (A mod B == 0) if and only if (A/2 mod B)==0.
;   We also know by the definition of divisibility that
;       (A mod B) == (A+c*B mod B)
;   where c is any integer. Combining all this, we have an algorithm:
;
;   1] Remove all factors of 2 from A
;   2] With A now odd, do A=A-B
;       If the result is zero, that means (A mod B == 0)
;       If the result underflows (becomes "negative", or on the Z80, sets the carry flag), it means that A was somewhere on [1,B-1], so it is not divisible by B.
;   3] Continue back at 1.
;
;   Now suppose B is allowed to be non-zero and even. Then B is of the form d*2^k where d is odd.
;   This just means there are some factors of 2 that can be removed from B until it is odd.
;   The only way A is divisible by B, is if it has the same number or more of factors of 2 as B.
;   If we factor out common factors of 2 and find B is still even, then A is not divisible by B.
;   Otherwise we have an odd number and only need to check the new (A mod d)
;   for which we can use the "odd algorithm" above.
;   So putting it all together:
;
;   1] If B==0, return FALSE.
;   2] Remove common factors of 2 from A and B.
;   3] If B is even, return FALSE.
;   4] Remove all factors of 2 from A.
;   5] Subtract B from A (A=A-B).
;       If the result is zero, return TRUE.
;       If the result is "negative" (setting the carry flag on many processors), return FALSE.
;   6] Repeat at 4]
;
;   The overhead steps are 1] to 3].
;   The iterated steps are 4] and 5].
;   Because 5 always produces an even number, when it then performs step 4, it always divides by at least one factor of 2.
;   This means the algorithm takes at most 1+ceil(log2(A))-floor(log2(B)) iterations.
;   For example, if A is a 37-bit number and B is a 13-bit number, this takes at most 38-13 = 25 iterations.
;   However, in practice it is usually slightly less.
;Example Time:
;   Say I wanted to test if 1337 is divisible by 17.
;   Since 17 is odd, we can proceed.
;   1337 is odd, so no factors of 2 to remove.
;   1337-17 == 1320.
;   1320/2 == 660
;   660/2 == 330
;   330/2 == 165
;   165-17 == 148
;   148/2 == 74
;   74/2 == 37
;   37-17 == 20
;   20/2 == 10
;   10/2 == 5
;   5-17 = -12
;
;   So 1337 is not divisible by 17.
;Now test divisibility by 7:
;1337 => 1330
;=>665
;=>658
;=>329
;=>322
;=>161
;=>154
;=>77
;=>70
;=>35
;=>28
;=>14
;=>7
;=>0
;
;   So 1337 is divisible by 7.
;;
;;
;Adds two little-endian, 16-digit packed BCD integers (8 bytes each)
;Input:
;   HL points to one BCD integer
;   DE points to another BCD integer
;Output:
;   The sum is written over the integer at DE (every byte is stored with
;   `ld (de),a`); the integer at HL is only read.
;   NOTE(review): this header used to say the sum is written at HL. The code
;   stores through DE - confirm which operand callers expect to be overwritten.
;   HL and DE point to the last (most significant) byte of their integers.
;   The carry flag is the decimal carry out of the last byte.
;Destroys: A
;46 bytes, 284cc
addBCD_16:
;byte 0: plain add, no carry in
    ld a,(de)
    add a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 1: from here on the decimal carry left by `daa` is added in
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 2
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 3
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 4
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 5
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 6
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    inc hl
    inc de
;byte 7: the pointers are left on this byte
    ld a,(de)
    adc a,(hl)
    daa
    ld (de),a
    ret
;;
;;
;gcd(HL,DE)->HL
;Output:
;   B=0
;   HL is the GCD of the inputs
;Destroys:
;   A,DE
;     DE is guaranteed 0 unless the output is 0 (which only happens if one of the inputs is 0).
;Uses the binary GCD algorithm
;64 bytes
;Changes from the earlier listing:
;   * the loop-back branch was the unassemblable text `.c,gcd_cofactor_2_loop`;
;     it is `jr nc,.cofactor_2_loop` (both values even: strip a shared factor of 2)
;   * B was incremented twice per shared factor of 2, so the result was shifted
;     left twice as often as needed (gcd(4,6) came out as 4); B is now
;     incremented once per pass through .test_cofactor_of_2
gcdHL_DE:
;B is our cofactor-of-2 counter
    ld b,0
;If HL=0, return 0
    ld a,h : or l : ret z
;If DE=0, return 0
    ex de,hl
    ld a,h : or l : jr nz,.test_cofactor_of_2
    ret
.cofactor_2_loop:
    srl h : rr l
    srl d : rr e
.test_cofactor_of_2:
    inc b                       ;B = 1 + number of shared factors of 2 removed so far
    ld a,e
    or l
    rra                         ;carry = 1 once at least one of HL, DE is odd
    jr nc,.cofactor_2_loop
.remove_factors_of_2_op2:
    srl h : rr l : jr nc,.remove_factors_of_2_op2
    adc hl,hl                   ;put back the 1 that was shifted out: HL is odd
    jr .swap_ops
.swap_ops_negate:
;At this point, HL needs to be negated and swapped with DE
    xor a : sub l : ld l,a : sbc a,a : sub h : ld h,a
.swap_ops:
    ex de,hl
.remove_factors_of_2_op1:
    srl h : rr l : jr nc,.remove_factors_of_2_op1
    adc hl,hl                   ;HL is odd again; this also leaves carry reset for the sbc
    sbc hl,de
    jr c,.swap_ops_negate       ;HL was the smaller one
    jp nz,.remove_factors_of_2_op1
;DE is the GCD, need to shift it left B-1 times.
    ex de,hl
    dec b
    ret z
.restore_cofactor:
    add hl,hl
    djnz .restore_cofactor
    ret
;;
;;
;Converts the byte in A to two ASCII hexadecimal characters and stores them at
;(HL) and (HL+1), high nibble first.
;   e.g. if A=#31, the bytes #33,#31 are stored at HL.
;Each nibble is converted with the ADD #90 / DAA / ADC #40 / DAA sequence,
;which yields '0'-'9' and 'A'-'F'.
;On return HL points to the second character. Destroys A and B.
;(The original non-English comments are mis-encoded in this file; the text
;above is a reconstruction from the code and the readable fragments.)
;   112 T
ByteToStrHEX:
    LD B,A                      ;keep the byte for the low nibble
    AND #F0
    RRCA
    RRCA
    RRCA
    RRCA                        ;A = high nibble
    ADD A,#90
    DAA
    ADC A,#40
    DAA
    LD (HL),A
    INC HL
    LD A,B
    AND #0F                     ;A = low nibble
    ADD A,#90
    DAA
    ADC A,#40
    DAA
    LD (HL),A
    RET
;;
;
; (section heading - the original text is mis-encoded in this file)
;
;These code snippets are for 16-bit comparisons.
;"I learned these from calc84maniac"
;"These have similar flags to that of the `cp` instruction. At the very least,
; you get the zero and carry flag identical."
;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;
;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;:;;
;Inputs:
;     HL, DE
;Outputs:
;     z flag is set if HL=DE, else nz
;     c flag is set if HL