mirror of
https://github.com/holub/mame
synced 2025-04-22 00:11:58 +03:00
276 lines
6.1 KiB
NASM
276 lines
6.1 KiB
NASM
; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
|
|
; 2022-04-17 : Igor Pavlov : Public domain
|
|
|
|
include 7zAsm.asm
|
|
|
|
MY_ASM_START
|
|
|
|
; .data
|
|
; public K
|
|
|
|
; we can use external SHA256_K_ARRAY defined in Sha256.c
|
|
; but we must guarantee that SHA256_K_ARRAY is aligned for 16-bytes
|
|
|
|
COMMENT @
|
|
ifdef x64
|
|
K_CONST equ SHA256_K_ARRAY
|
|
else
|
|
K_CONST equ _SHA256_K_ARRAY
|
|
endif
|
|
EXTRN K_CONST:xmmword
|
|
@
|
|
|
|
CONST SEGMENT
|
|
|
|
align 16
|
|
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
|
|
|
|
; COMMENT @
|
|
align 16
|
|
K_CONST \
|
|
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
|
|
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
|
|
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
|
|
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
|
|
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
|
|
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
|
|
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
|
|
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
|
|
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
|
|
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
|
|
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
|
|
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
|
|
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
|
|
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
|
|
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
|
|
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
|
|
; @
|
|
|
|
CONST ENDS
|
|
|
|
; _TEXT$SHA256OPT SEGMENT 'CODE'
|
|
|
|
ifndef x64
|
|
.686
|
|
.xmm
|
|
endif
|
|
|
|
; jwasm-based assemblers for linux and linker from new versions of binutils
|
|
; can generate incorrect code for load [ARRAY + offset] instructions.
|
|
; 22.00: we load K_CONST offset to (rTable) register to avoid jwasm+binutils problem
|
|
rTable equ r0
|
|
; rTable equ K_CONST
|
|
|
|
ifdef x64
|
|
rNum equ REG_ABI_PARAM_2
|
|
if (IS_LINUX eq 0)
|
|
LOCAL_SIZE equ (16 * 2)
|
|
endif
|
|
else
|
|
rNum equ r3
|
|
LOCAL_SIZE equ (16 * 1)
|
|
endif
|
|
|
|
rState equ REG_ABI_PARAM_0
|
|
rData equ REG_ABI_PARAM_1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MY_SHA_INSTR macro cmd, a1, a2
|
|
db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
|
|
endm
|
|
|
|
cmd_sha256rnds2 equ 0cbH
|
|
cmd_sha256msg1 equ 0ccH
|
|
cmd_sha256msg2 equ 0cdH
|
|
|
|
MY_sha256rnds2 macro a1, a2
|
|
MY_SHA_INSTR cmd_sha256rnds2, a1, a2
|
|
endm
|
|
|
|
MY_sha256msg1 macro a1, a2
|
|
MY_SHA_INSTR cmd_sha256msg1, a1, a2
|
|
endm
|
|
|
|
MY_sha256msg2 macro a1, a2
|
|
MY_SHA_INSTR cmd_sha256msg2, a1, a2
|
|
endm
|
|
|
|
MY_PROLOG macro
|
|
ifdef x64
|
|
if (IS_LINUX eq 0)
|
|
movdqa [r4 + 8], xmm6
|
|
movdqa [r4 + 8 + 16], xmm7
|
|
sub r4, LOCAL_SIZE + 8
|
|
movdqa [r4 ], xmm8
|
|
movdqa [r4 + 16], xmm9
|
|
endif
|
|
else ; x86
|
|
push r3
|
|
push r5
|
|
mov r5, r4
|
|
NUM_PUSH_REGS equ 2
|
|
PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))
|
|
if (IS_CDECL gt 0)
|
|
mov rState, [r4 + PARAM_OFFSET]
|
|
mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]
|
|
mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]
|
|
else ; fastcall
|
|
mov rNum, [r4 + PARAM_OFFSET]
|
|
endif
|
|
and r4, -16
|
|
sub r4, LOCAL_SIZE
|
|
endif
|
|
endm
|
|
|
|
MY_EPILOG macro
|
|
ifdef x64
|
|
if (IS_LINUX eq 0)
|
|
movdqa xmm8, [r4]
|
|
movdqa xmm9, [r4 + 16]
|
|
add r4, LOCAL_SIZE + 8
|
|
movdqa xmm6, [r4 + 8]
|
|
movdqa xmm7, [r4 + 8 + 16]
|
|
endif
|
|
else ; x86
|
|
mov r4, r5
|
|
pop r5
|
|
pop r3
|
|
endif
|
|
MY_ENDP
|
|
endm
|
|
|
|
|
|
msg equ xmm0
|
|
tmp equ xmm0
|
|
state0_N equ 2
|
|
state1_N equ 3
|
|
w_regs equ 4
|
|
|
|
|
|
state1_save equ xmm1
|
|
state0 equ @CatStr(xmm, %state0_N)
|
|
state1 equ @CatStr(xmm, %state1_N)
|
|
|
|
|
|
ifdef x64
|
|
state0_save equ xmm8
|
|
mask2 equ xmm9
|
|
else
|
|
state0_save equ [r4]
|
|
mask2 equ xmm0
|
|
endif
|
|
|
|
LOAD_MASK macro
|
|
movdqa mask2, XMMWORD PTR Reverse_Endian_Mask
|
|
endm
|
|
|
|
LOAD_W macro k:req
|
|
movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
|
|
pshufb @CatStr(xmm, %(w_regs + k)), mask2
|
|
endm
|
|
|
|
|
|
; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
|
|
pre1 equ 3
|
|
pre2 equ 2
|
|
|
|
|
|
|
|
RND4 macro k
|
|
movdqa msg, xmmword ptr [rTable + (k) * 16]
|
|
paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
|
|
MY_sha256rnds2 state0_N, state1_N
|
|
pshufd msg, msg, 0eH
|
|
|
|
if (k GE (4 - pre1)) AND (k LT (16 - pre1))
|
|
; w4[0] = msg1(w4[-4], w4[-3])
|
|
MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
|
|
endif
|
|
|
|
MY_sha256rnds2 state1_N, state0_N
|
|
|
|
if (k GE (4 - pre2)) AND (k LT (16 - pre2))
|
|
movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
|
|
palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
|
|
paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
|
|
; w4[0] = msg2(w4[0], w4[-1])
|
|
MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
|
|
endif
|
|
endm
|
|
|
|
|
|
|
|
|
|
|
|
REVERSE_STATE macro
|
|
; state0 ; dcba
|
|
; state1 ; hgfe
|
|
pshufd tmp, state0, 01bH ; abcd
|
|
pshufd state0, state1, 01bH ; efgh
|
|
movdqa state1, state0 ; efgh
|
|
punpcklqdq state0, tmp ; cdgh
|
|
punpckhqdq state1, tmp ; abef
|
|
endm
|
|
|
|
|
|
MY_PROC Sha256_UpdateBlocks_HW, 3
|
|
MY_PROLOG
|
|
|
|
lea rTable, [K_CONST]
|
|
|
|
cmp rNum, 0
|
|
je end_c
|
|
|
|
movdqu state0, [rState] ; dcba
|
|
movdqu state1, [rState + 16] ; hgfe
|
|
|
|
REVERSE_STATE
|
|
|
|
ifdef x64
|
|
LOAD_MASK
|
|
endif
|
|
|
|
align 16
|
|
nextBlock:
|
|
movdqa state0_save, state0
|
|
movdqa state1_save, state1
|
|
|
|
ifndef x64
|
|
LOAD_MASK
|
|
endif
|
|
|
|
LOAD_W 0
|
|
LOAD_W 1
|
|
LOAD_W 2
|
|
LOAD_W 3
|
|
|
|
|
|
k = 0
|
|
rept 16
|
|
RND4 k
|
|
k = k + 1
|
|
endm
|
|
|
|
paddd state0, state0_save
|
|
paddd state1, state1_save
|
|
|
|
add rData, 64
|
|
sub rNum, 1
|
|
jnz nextBlock
|
|
|
|
REVERSE_STATE
|
|
|
|
movdqu [rState], state0
|
|
movdqu [rState + 16], state1
|
|
|
|
end_c:
|
|
MY_EPILOG
|
|
|
|
; _TEXT$SHA256OPT ENDS
|
|
|
|
end
|