# Listing metadata from the code viewer: 382 lines, 10 KiB, Python.
# Assembles two multiplication routines ("multiply 7x7" and "multiply 8x8")
# built on lookup tables of floored quarter squares, using the instruction
# helpers imported from the project-local `asm` module.
import math
import sys

from asm import (
    AC,
    C,
    X,
    Y,
    adda,
    align,
    anda,
    bge,
    blt,
    bne,
    bra,
    disableListing,
    enableListing,
    end,
    fillers,
    hi,
    jmp,
    label,
    ld,
    lo,
    nop,
    ora,
    pc,
    st,
    suba,
    writeRomFiles,
    xora,
    zpByte,
)

# The listing is only switched on when this file is run as a script
# (it is switched off again just before the ROM files are written).
if __name__ == "__main__":
    enableListing()

# Zero-page variables shared by every routine below.
# Inputs
a, b = zpByte(), zpByte()
# Output: two bytes. The code below keeps the low byte of the product in
# [result] and the high byte in [result + 1].
result = zpByte(2)
# Used for storage
tmp = zpByte()
# Used for the continuation address after lookup.
continuation = zpByte()
# Used for the action to take with the high byte.
high_byte_action = zpByte()
# The following code implements a lookup table of floored quarter squares,
# for values up to 255.
# This is supposed to enable a fast multiplication for 7-bit numbers.

# First the high-bytes.
# The table is shifted down by 32 places, as the first 32 high-bytes are all
# zero (floor(31 ** 2 / 4) = 240 still fits in one byte, floor(32 ** 2 / 4) = 256
# does not).
# This allows us to have code later in the page which we can branch back to:
# the 224 entries emitted here leave the remainder of the 256-word page free.
align(0x100, size=0x100)
label("Quarter-squares lookup table")
for i in range(32, 256):
    # One `ld` per index; its operand is the high byte of floor(i ** 2 / 4).
    val = math.floor(i**2 / 4)
    ld(hi(val))
    # Listing comment showing the full 16-bit value and its high byte.
    C(f"${val:04x} = {val} = floor({i} ** 2 / 4); ${val:04x} >> 8 = ${val >> 8:02x}")
# We jump back here after looking up the low-byte of the result.
# The caller's return address (low byte) is in [continuation]; the high byte
# is that of the page holding "multiply 7x7".
label("low-byte return point")
ld(hi("multiply 7x7"), Y)
jmp(Y, [continuation])
# Counted as part of the return (cost 3), so this presumably executes in the
# jmp's delay slot -- confirm against the target CPU. It leaves Y pointing at
# this page again, ready for the next lookup.
ld(hi(pc()), Y)  # Make it easy to get back here!
cost_of_low_byte_return = 3

# Alternative entry point for an index that was computed as a - b.
label("table entry.possibly-negative")
# AC is negative, if b > a. Find absolute value
# Both paths take four instructions (numbered on the right):
#   >= 0: suba(1) then adda(1) leaves AC unchanged;
#   <  0: xora(0xFF) then adda(1) is a two's-complement negation.
# Execution then falls through into "table entry".
blt(pc() + 3)  # 1
bra(pc() + 3)  # 2
suba(1)  # 3; if >= 0
xora(0xFF)  # 3;  if < 0
adda(1)  # 4
cost_of_absolute = 4
label("table entry")
# Calculate an index into the high-byte table.
# This is basically a matter of subtracting 32, and jumping in if the result >= 0.
# But values of 160 or more still have the sign-bit set after subtraction,
# despite being >= 32.
# So the sign bit is tested *before* subtracting: any index with it set (>= 128)
# takes the second path below, which jumps into the table unconditionally.
# Every path costs six instructions (numbered on the right) and ends by
# branching through [high_byte_action] with the high byte in AC.
st([tmp])  # 1; keep the raw index, it is reloaded for the low-byte lookup
blt(pc() + 5)  # 2
suba(32)  # 3
bge(AC)  # 4; index >= 32: jump to the table entry
bra([high_byte_action])  # 5
ld(0)  # 6; index < 32: the high byte is zero
bra(AC)  # 4; index >= 128: always a valid table offset after the subtraction
bra([high_byte_action])  # 5
cost_of_high_byte_table_entry = 6
# Some space here for other code?
fillers(until=251)
# We jump back here after looking up the high-byte of the result.
# The three labels are successive entry points into one fall-through
# sequence; which one is used is selected through [high_byte_action].
# Counting is in reverse, i.e. each number is the cost from that entry point
# to the end of the sequence.
label("high-byte action.invert-and-add")
xora(0xFF)  # 4
label("high-byte action.add")
adda([result + 1])  # 3
label("high-byte action.store")
st([result + 1])  # 2
ld([tmp])  # 1; reload the index saved at "table entry" for the low-byte lookup
# The next instruction emitted must be the last one on this page.
assert pc() & 0xFF == 0xFF, pc()
cost_of_high_byte_invert_and_add = 4
cost_of_high_byte_add = 3
cost_of_high_byte_store = 2
label("low-byte table entry")
# With the index in the accumulator, and the current page in the Y register,
# We jump to the right entry in the low-byte table, replace AC with the result,
# and jump back out to 'low-byte return point' defined above,
# using the double-jump trick.
# This exploits the fact the following is the last instruction on the page,
# and the high byte of the PC has already incremented,
# so the bne will take us to an address in the next page.
# We use the Y register to return to the current page.
# The table has two zeros at the start, meaning that we can replace the first
# with the jump back.
# Index 0 does not take the bne: it runs the jmp at offset 0 directly and
# picks up the ld(0) at offset 1 on the way out.
bne(AC)  # 1
align(0x100, size=0x100)
jmp(Y, "low-byte return point")  # 2
ld(0)  # 3
cost_of_low_byte_table_entry = 3
C("0 = floor(0 ** 2 / 4) and floor(1 ** 2 / 4)")
# Entries for indices 2..255, one `ld` each, placed at page offset == index.
for i in range(2, 256):
    val = math.floor(i**2 / 4)
    # NOTE(review): val can exceed 255 here; this relies on ld() keeping only
    # the low byte of its operand -- confirm against the asm module.
    ld(val)
    C(f"${val:04x} = {val} = floor({i} ** 2 / 4)")
# Code copied from the main ROM. This provides a lookup table for right-shifts
# Entry ix holds ix >> n, where n is one more than the number of trailing
# one-bits of ix; for an even ix that is simply ix >> 1.
align(0x100, size=0x100)
label("shiftTable")
shiftTable = pc()

# Only 255 entries: the last word of the page is needed for the exit branch.
for ix in range(255):
    for n in range(1, 9):  # Find first zero
        if ~ix & (1 << (n - 1)):
            break
    # Bit pattern for the listing: 'x' marks the bits that are shifted out.
    pattern = ["x" if i < n else "1" if ix & (1 << i) else "0" for i in range(8)]
    ld(ix >> n)
    C("0b%s >> %d" % ("".join(reversed(pattern)), n))

# The exit branch must sit at offset 0xFF so that it lands in the next page.
assert pc() & 255 == 255
bra([continuation])  # Jumps back into next page
align(0x100, size=0x100)
# First word of the next page; counted as the last cycle of the shift, so it
# presumably fills the delay slot of the bra above -- confirm.
nop()  #
label("multiply 7x7")
# The formula is floor(((a + b) ** 2) / 4) - floor(((a - b) ** 2) / 4)
# Inputs are read from [a] and [b]; the 16-bit product ends up in
# [result] (low byte) and [result + 1] (high byte).

# First lookup: index a + b. Its high byte is stored straight into
# [result + 1]; its low byte comes back in AC at ".after-first-lookup".
ld(".after-first-lookup")  # 1
st([continuation])
ld(hi("Quarter-squares lookup table"), Y)
ld("high-byte action.store")
st([high_byte_action])  # 5
ld([a])
jmp(Y, "table entry")  # 7
# Counted as cycle 8 of this sequence even though it follows the jmp, so it
# presumably executes in the jmp's delay slot -- confirm.
adda([b])  # 8
cost_to_first_lookup = 8

# Running cycle total up to the point where ".after-first-lookup" is reached.
cost_after_first_lookup = (
    cost_to_first_lookup
    + cost_of_high_byte_table_entry
    + cost_of_high_byte_store
    + cost_of_low_byte_table_entry
    + cost_of_low_byte_return
)
label(".after-first-lookup")
# On return we have the low-byte in the accumulator
# We can safely add one to it without causing an overflow,
# because 255 does not appear in the low-byte table.
# This is part of the following subtraction: x - y is computed as
# x + ~y + 1, and this is the "+ 1". The inversions are the xora(0xFF)
# at "high-byte action.invert-and-add" and at ".after-second-lookup".
adda(1)  # 1
st([result])
# Second lookup: index |a - b|, high byte inverted and added to [result + 1].
ld(".after-second-lookup")
st([continuation])
ld("high-byte action.invert-and-add")  # 5
st([high_byte_action])
ld([a])
jmp(Y, "table entry.possibly-negative")  # 8
suba([b])  # 9
cost_between_lookups = 9

# Running cycle total up to the point where ".after-second-lookup" is reached.
cost_after_second_lookup = (
    cost_after_first_lookup
    + cost_between_lookups
    + cost_of_absolute
    + cost_of_high_byte_table_entry
    + cost_of_high_byte_invert_and_add
    + cost_of_low_byte_table_entry
    + cost_of_low_byte_return
)
label(".after-second-lookup")
# AC holds the low byte of the second quarter square; invert it to finish
# the low-byte subtraction started at the first lookup.
xora(0xFF)  # 1
# We need to add this to the result
# But we may have a carry
adda([result])
st([tmp])
# Work out the carry from the sign bits. The suba below runs on both paths
# and recovers the inverted operand (sum - [result]):
#   sum has MSB clear -> carry if either operand had its MSB set (ora);
#   sum has MSB set   -> carry only if both operands had it set (anda).
# Both paths take the same number of cycles.
blt(pc() + 4)  # 4
suba([result])  # 5
bra(pc() + 4)  # 6
ora([result])  # 7
bra(pc() + 2)  # 6
anda([result])  # 7
# X becomes 0x00 (no carry) or 0x80 (carry).
anda(0b1000_0000, X)  # 8
ld([tmp])
st([result])  # 10
# Reads the carry value from memory at address X; presumably address 0x00
# holds 0 and address 0x80 holds 1 -- TODO confirm against the memory map.
ld([X])
adda([result + 1])
st([result + 1])  # 13
cost_of_final_add = 13

cost_of_7bit_multiply = cost_after_second_lookup + cost_of_final_add
C(f"Total cost: {cost_of_7bit_multiply} cycles")
label("done")
nop()
label("multiply 8x8")
# Extend the 7bit x 7bit multiplication to 8bit x 8bit
#
# The logic goes as follows.
# Let A and B be the low seven bits of a and b, e.g.
# A = a₆2⁶ + a₅2⁵ + a₄2⁴ + a₃2³ + a₂2² + a₁2¹ + a₀2⁰
# B = b₆2⁶ + b₅2⁵ + b₄2⁴ + b₃2³ + b₂2² + b₁2¹ + b₀2⁰
# Then we could think of 8bit x 8bit multiply as
# (a₇2⁷ + A)(b₇2⁷ + B)
# Multiplying out the brackets gives
# a₇2⁷b₇2⁷ + a₇2⁷B + b₇2⁷A + AB
# (and AB is the result we already know how to calculate)
# Simplifying a bit, we get
# a₇b₇2¹⁴ + 2⁷(a₇B + b₇A) + AB
# Since a₇ and b₇ are one or zero, multiplying by them is like an if.
# We can consider four cases,
# if a₇ is 1 and b₇ is 1:
#   2¹⁴ + 2⁷(B + A) + AB
# if a₇ is 1 and b₇ is 0:
#   2⁷B + AB
# if a₇ is 0 and b₇ is 1:
#   2⁷A + AB
# if a₇ is 0 and b₇ is 0:
#   AB
# Multiplication by 2⁷(=128) is a left-shift by seven, which
# can be broken down as moving the LSB of the low-byte to the
# MSB of the low-byte, and setting all of the other bits to zero,
# The other bits of the low byte can be right-shifted by one,
# and moved to the high byte.

# Test which of the branches we are on.
# a XOR b has its MSB set exactly when the two MSBs differ.
ld([a])  # 1
xora([b])
blt(".one MSB set")  # 3
ld([a])  # 4

# Both MSBs equal
# a >= 0 here means both MSBs are clear, so the 7-bit routine is enough.
# The anda is counted on that path too (cost + 6); "multiply 7x7" begins by
# loading AC, so the masked value is simply discarded there.
bge("multiply 7x7")  # 5
anda(0b0111_1111)  # 6
cost_of_8bit_multiply__both_msbs_low = cost_of_7bit_multiply + 6

# Both MSBs set
st([a])  # 7; a = A
ld(".after right-shift")
st([continuation])
ld(2**14 >> 8)  # 10; Write the high-byte for later addition.
st([result + 1])
ld([b])
anda(0b0111_1111)
st([b])  # b = B
# AC = A + B, the value that ". << 7" (which follows) multiplies by 128.
adda([a])  # 15
cost_of_both_msbs_set = 15
label(". << 7")
# Multiply AC by 128 as a 16-bit value: bit 0 of AC becomes bit 7 of the low
# byte (stored in [result]); AC >> 1 becomes the high byte, which is left in
# AC when control reaches the address in [continuation].
st([tmp])  # 1
anda(0b0000_0001)  # Clear all but the bottom bit
adda(0b0111_1111)  # Carries bottom bit to top bit
anda(0b1000_0000)  # Clears all but the top bit
st([result])  # 5
ld([tmp])
# With bit 0 cleared the index is even, and the shift table entry for an
# even index is index >> 1.
anda(0b1111_1110)  # Calculate index to right-shift-table
ld(hi("shiftTable"), Y)
jmp(Y, AC)  # 9
# Sends execution on to offset 0xFF of the shift-table page, where the
# bra [continuation] lives, once the table entry has been loaded.
bra(0xFF)  # 10
# 11 ld (a + b) >> 1
# 12 bra [continuation]
# 13 nop
cost_of_right_shift = 13
cost_after_right_shift = cost_of_both_msbs_set + cost_of_right_shift
label(".one MSB set")
# Reached from "multiply 8x8" with [a] in AC when exactly one of a and b has
# its MSB set. Cycle numbers continue from that entry sequence.
bge(".b has msb set")  # 5
ld(0b0111_1111)  # 6; mask used by whichever branch is taken
# a has msb set
anda([a])  # 7
st([a])
# The continuation is one instruction past ".after right-shift": that skips
# its leading adda([result + 1]), as no 2^14 term has been written to
# [result + 1] on this path.
ld(lo(".after right-shift") + 1)
st([continuation])  # 10
bra(". << 7")  # 11
ld([b])  # 12; the term to shift is 2^7 * B
label(".b has msb set")
anda([b])  # 7
st([b])
ld(lo(".after right-shift") + 1)
st([continuation])  # 10
bra(". << 7")  # 11
ld([a])  # 12; the term to shift is 2^7 * A
cost_of_one_msb_set = 12
# The extra 1 accounts for the skipped adda at ".after right-shift".
one_msb_cost_saving = cost_of_both_msbs_set - cost_of_one_msb_set + 1
label(".after right-shift")
# AC holds the high byte produced by ". << 7".
# When both MSBs were set, [result + 1] already holds the 2^14 term and the
# adda combines the two; the one-MSB paths enter at the st instead.
adda([result + 1])  # 1
st([result + 1])
# First lookup for A * B: index a + b, with its high byte *added* to the
# partial result rather than stored over it.
ld(".after-first-lookup-8bit")
st([continuation])
ld(hi("Quarter-squares lookup table"), Y)  # 5
ld("high-byte action.add")
st([high_byte_action])
ld([a])
jmp(Y, "table entry")  # 9
adda([b])  # 10
cost_of_after_right_shift = 10
# Running cycle total (worst case: both MSBs set) up to the point where
# ".after-first-lookup-8bit" is reached.
cost_after_first_lookup__8bit = (
    cost_after_right_shift
    + cost_of_after_right_shift
    + cost_of_high_byte_table_entry
    + cost_of_high_byte_add
    + cost_of_low_byte_table_entry
    + cost_of_low_byte_return
)
label(".after-first-lookup-8bit")
# On return we have the low-byte in the accumulator
# We already either have 0 or 128 in the low byte of the result.
# Adding will cause an overflow into the high byte iff both
# have the MSB set, in which case the result will definitely
# not have the MSB set.
# So: sum negative -> X = 0 (no carry); otherwise X = [result], which is
# 0x80 exactly when a carry is possible and 0x00 when it is not.
adda([result])  # 1
blt(pc() + 3)  # 2
bra(pc() + 3)  # 3
ld([result], X)  # 4; May be a carry
ld(0, X)  # 4; Definitely no carry
# We can safely add one to the result without causing a further overflow,
# because 255 does not appear in the low-byte table.
# (When [result] held 128 the sum is table value + 128, so this also needs
# 127 to be absent from the table. It is: every entry is m ** 2 or k * (k + 1)
# modulo 256, and neither can be congruent to 3 modulo 4.)
# This is part of the subtraction.
adda(1)  # 5
st([result])
# Do the carry
# Reads the carry value from memory at address X; presumably address 0x00
# holds 0 and address 0x80 holds 1 -- TODO confirm against the memory map.
ld([X])
adda([result + 1])
st([result + 1])
# Second lookup: identical to the 7-bit routine from here on, and it returns
# to the 7-bit routine's ".after-second-lookup" for the final add.
ld(".after-second-lookup")  # 10
st([continuation])
ld("high-byte action.invert-and-add")
st([high_byte_action])
ld([a])
jmp(Y, "table entry.possibly-negative")  # 15
suba([b])  # 16
cost_between_lookups__8bit = 16

# Running cycle total (worst case) up to ".after-second-lookup".
cost_after_second_lookup__8bit = (
    cost_after_first_lookup__8bit
    + cost_between_lookups__8bit
    + cost_of_absolute
    + cost_of_high_byte_table_entry
    + cost_of_high_byte_invert_and_add
    + cost_of_low_byte_table_entry
    + cost_of_low_byte_return
)
# Cycle-count summary, emitted as listing comments.
# The worst case is the path on which both inputs have their MSB set.
cost_of_8bit_multiply = cost_after_second_lookup__8bit + cost_of_final_add
no_msb_cost_saving = cost_of_8bit_multiply - cost_of_8bit_multiply__both_msbs_low
C(f"Worst case cost: {cost_of_8bit_multiply}")
C(f"Saving when both values < 128: {no_msb_cost_saving}")
C(f"Saving when only one value < 128: {one_msb_cost_saving}")
end()

# When run as a script, write the ROM files; the script's own path
# (sys.argv[0]) is what gets passed to writeRomFiles.
if __name__ == "__main__":
    disableListing()
    writeRomFiles(sys.argv[0])