From 54eaeea942f135fb7853f3caff091032e250e72f Mon Sep 17 00:00:00 2001 From: Ryan Holtz Date: Tue, 25 Mar 2014 10:59:11 +0000 Subject: [PATCH] (nw) Temporarily split all RSP functions into SIMD and scalar versions so that they can more easily be run alongside each other for the purposes of regression testing. Currently SIMD ops introduce some lighting issues, but only when a light and model are axis-aligned, which makes it remarkably difficult to track down based solely on comprehensive logging, so this will eventually allow for running the scalar version of the opcode after the SIMD one in order to deduce exactly which SIMD opcode is divergent. --- src/emu/cpu/rsp/rsp.h | 3 +- src/emu/cpu/rsp/rspdrc.c | 3723 ++++++++++++++++++++++++++++---------- 2 files changed, 2722 insertions(+), 1004 deletions(-) diff --git a/src/emu/cpu/rsp/rsp.h b/src/emu/cpu/rsp/rsp.h index 8029742dd43..accbdf4ae8a 100644 --- a/src/emu/cpu/rsp/rsp.h +++ b/src/emu/cpu/rsp/rsp.h @@ -16,7 +16,8 @@ #ifndef __RSP_H__ #define __RSP_H__ -#define USE_SIMD (0) +#define USE_SIMD (1) +#define SIMUL_SIMD (1) #if USE_SIMD #include diff --git a/src/emu/cpu/rsp/rspdrc.c b/src/emu/cpu/rsp/rspdrc.c index a78b5aa814b..58561139457 100644 --- a/src/emu/cpu/rsp/rspdrc.c +++ b/src/emu/cpu/rsp/rspdrc.c @@ -153,31 +153,59 @@ static void cfunc_ctc2(void *param); //static void cfunc_lwc2(void *param); static void cfunc_sp_set_status_cb(void *param); -static void cfunc_rsp_lbv(void *param); -static void cfunc_rsp_lsv(void *param); -static void cfunc_rsp_llv(void *param); -static void cfunc_rsp_ldv(void *param); -static void cfunc_rsp_lqv(void *param); -static void cfunc_rsp_lrv(void *param); -static void cfunc_rsp_lpv(void *param); -static void cfunc_rsp_luv(void *param); -static void cfunc_rsp_lhv(void *param); -static void cfunc_rsp_lfv(void *param); -static void cfunc_rsp_lwv(void *param); -static void cfunc_rsp_ltv(void *param); +#if USE_SIMD +static void cfunc_rsp_lbv_simd(void *param); +static void 
cfunc_rsp_lsv_simd(void *param); +static void cfunc_rsp_llv_simd(void *param); +static void cfunc_rsp_ldv_simd(void *param); +static void cfunc_rsp_lqv_simd(void *param); +static void cfunc_rsp_lrv_simd(void *param); +static void cfunc_rsp_lpv_simd(void *param); +static void cfunc_rsp_luv_simd(void *param); +static void cfunc_rsp_lhv_simd(void *param); +static void cfunc_rsp_lfv_simd(void *param); +static void cfunc_rsp_lwv_simd(void *param); +static void cfunc_rsp_ltv_simd(void *param); -static void cfunc_rsp_sbv(void *param); -static void cfunc_rsp_ssv(void *param); -static void cfunc_rsp_slv(void *param); -static void cfunc_rsp_sdv(void *param); -static void cfunc_rsp_sqv(void *param); -static void cfunc_rsp_srv(void *param); -static void cfunc_rsp_spv(void *param); -static void cfunc_rsp_suv(void *param); -static void cfunc_rsp_shv(void *param); -static void cfunc_rsp_sfv(void *param); -static void cfunc_rsp_swv(void *param); -static void cfunc_rsp_stv(void *param); +static void cfunc_rsp_sbv_simd(void *param); +static void cfunc_rsp_ssv_simd(void *param); +static void cfunc_rsp_slv_simd(void *param); +static void cfunc_rsp_sdv_simd(void *param); +static void cfunc_rsp_sqv_simd(void *param); +static void cfunc_rsp_srv_simd(void *param); +static void cfunc_rsp_spv_simd(void *param); +static void cfunc_rsp_suv_simd(void *param); +static void cfunc_rsp_shv_simd(void *param); +static void cfunc_rsp_sfv_simd(void *param); +static void cfunc_rsp_swv_simd(void *param); +static void cfunc_rsp_stv_simd(void *param); +#elif (!USE_SIMD || SIMUL_SIMD) +static void cfunc_rsp_lbv_scalar(void *param); +static void cfunc_rsp_lsv_scalar(void *param); +static void cfunc_rsp_llv_scalar(void *param); +static void cfunc_rsp_ldv_scalar(void *param); +static void cfunc_rsp_lqv_scalar(void *param); +static void cfunc_rsp_lrv_scalar(void *param); +static void cfunc_rsp_lpv_scalar(void *param); +static void cfunc_rsp_luv_scalar(void *param); +static void cfunc_rsp_lhv_scalar(void 
*param); +static void cfunc_rsp_lfv_scalar(void *param); +static void cfunc_rsp_lwv_scalar(void *param); +static void cfunc_rsp_ltv_scalar(void *param); + +static void cfunc_rsp_sbv_scalar(void *param); +static void cfunc_rsp_ssv_scalar(void *param); +static void cfunc_rsp_slv_scalar(void *param); +static void cfunc_rsp_sdv_scalar(void *param); +static void cfunc_rsp_sqv_scalar(void *param); +static void cfunc_rsp_srv_scalar(void *param); +static void cfunc_rsp_spv_scalar(void *param); +static void cfunc_rsp_suv_scalar(void *param); +static void cfunc_rsp_shv_scalar(void *param); +static void cfunc_rsp_sfv_scalar(void *param); +static void cfunc_rsp_swv_scalar(void *param); +static void cfunc_rsp_stv_scalar(void *param); +#endif static void static_generate_entry_point(rsp_state *rsp); static void static_generate_nocode_handler(rsp_state *rsp); @@ -245,34 +273,35 @@ static void log_add_disasm_comment(rsp_state *rsp, drcuml_block *block, UINT32 p #define VEC_EL_2(x,z) (vector_elements_2[(x)][(z)]) #define ACCUM(x) rsp->accum[x].q + #if USE_SIMD -INLINE UINT16 ACCUM_H(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_H(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_h, out, x); return out; } -INLINE UINT16 ACCUM_M(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_M(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_m, out, x); return out; } -INLINE UINT16 ACCUM_L(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_L(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_l, out, x); return out; } -#define SET_ACCUM_H(v, x) SIMD_INSERT16(rsp->accum_h, v, x); -#define SET_ACCUM_M(v, x) SIMD_INSERT16(rsp->accum_m, v, x); -#define SET_ACCUM_L(v, x) SIMD_INSERT16(rsp->accum_l, v, x); +#define VEC_SET_ACCUM_H(v, x) SIMD_INSERT16(rsp->accum_h, v, x); +#define VEC_SET_ACCUM_M(v, x) SIMD_INSERT16(rsp->accum_m, v, x); +#define VEC_SET_ACCUM_L(v, x) SIMD_INSERT16(rsp->accum_l, v, x); -#define SCALAR_GET_VS1(out, i) 
SIMD_EXTRACT16(rsp->xv[VS1REG], out, i) -#define SCALAR_GET_VS2(out, i) SIMD_EXTRACT16(rsp->xv[VS2REG], out, VEC_EL_2(EL, i)) +#define SCALAR_GET_VS1(out, i) SIMD_EXTRACT16(rsp->xv[VS1REG], out, i); +#define SCALAR_GET_VS2(out, i) SIMD_EXTRACT16(rsp->xv[VS2REG], out, VEC_EL_2(EL, i)); #else @@ -909,7 +938,17 @@ static CPU_RESET( rsp ) rsp->nextpc = ~0; } -static void cfunc_rsp_lbv(void *param) +#if USE_SIMD +// LBV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00000 | IIII | Offset | +// -------------------------------------------------- +// +// Load 1 byte to vector byte index + +static void cfunc_rsp_lbv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -923,27 +962,49 @@ static void cfunc_rsp_lbv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00000 | IIII | Offset | - // -------------------------------------------------- - // - // Load 1 byte to vector byte index ea = (base) ? rsp->r[base] + offset : offset; -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (index >> 1)); element &= 0xff00 >> ((1-(index & 1)) * 8); element |= READ8(rsp, ea) << ((1-(index & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (index >> 1)); -#else - VREG_B(dest, index) = READ8(rsp, ea); -#endif } -static void cfunc_rsp_lsv(void *param) +#else + +static void cfunc_rsp_lbv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? 
rsp->r[base] + offset : offset; + VREG_B(dest, index) = READ8(rsp, ea); +} +#endif + +#if USE_SIMD +// LSV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00001 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 2 bytes starting from vector byte index + +static void cfunc_rsp_lsv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -955,31 +1016,56 @@ static void cfunc_rsp_lsv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00001 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 2 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); int end = index + 2; for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_llv(void *param) +#else + +static void cfunc_rsp_lsv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xe; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 2) : (offset * 2); + int end = index + 2; + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LLV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00010 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 4 bytes starting from vector byte index + +static void cfunc_rsp_llv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -992,12 +1078,6 @@ static void cfunc_rsp_llv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00010 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 4 bytes starting from vector byte index ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); @@ -1005,20 +1085,54 @@ static void cfunc_rsp_llv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_ldv(void *param) +#else + +static void cfunc_rsp_llv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xc; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? 
rsp->r[base] + (offset * 4) : (offset * 4); + + int end = index + 4; + + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LDV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00011 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 8 bytes starting from vector byte index + +static void cfunc_rsp_ldv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1031,12 +1145,6 @@ static void cfunc_rsp_ldv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00011 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 8 bytes starting from vector byte index ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); @@ -1044,37 +1152,64 @@ static void cfunc_rsp_ldv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lqv(void *param) +#else + +static void cfunc_rsp_ldv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0x8; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + + int end = index + 8; + + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LQV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00100 | IIII | Offset | +// -------------------------------------------------- +// +// Loads up to 16 bytes starting from vector byte index + +static void cfunc_rsp_lqv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; int dest = (op >> 16) & 0x1f; int base = (op >> 21) & 0x1f; - //int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf; int offset = (op & 0x7f); if (offset & 0x40) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00100 | IIII | Offset | - // -------------------------------------------------- - // - // Loads up to 16 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); @@ -1083,20 +1218,53 @@ static void cfunc_rsp_lqv(void *param) for (int i = 0; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lrv(void *param) +#else + +static void cfunc_rsp_lqv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + + int end = 16 - (ea & 0xf); + if (end > 16) end = 16; + + for (int i = 0; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LRV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00101 | IIII | Offset | +// -------------------------------------------------- +// +// Loads up to 16 bytes starting from right side until 16-byte boundary + +static void cfunc_rsp_lrv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1108,12 +1276,6 @@ static void cfunc_rsp_lrv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00101 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from right side until 16-byte boundary UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); @@ -1122,20 +1284,54 @@ static void cfunc_rsp_lrv(void *param) for (int i = index; i < 16; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1-(i & 1)) * 8); element |= READ8(rsp, ea) << ((1-(i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lpv(void *param) +#else + +static void cfunc_rsp_lrv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + + index = 16 - ((ea & 0xf) - index); + ea &= ~0xf; + + for (int i = index; i < 16; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LPV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00110 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the upper 8 bits of each element + +static void cfunc_rsp_lpv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1147,26 +1343,18 @@ static void cfunc_rsp_lpv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00110 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the upper 8 bits of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); for (int i = 0; i < 8; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; -#endif } } -static void cfunc_rsp_luv(void *param) +#else + +static void cfunc_rsp_lpv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1178,26 +1366,82 @@ static void cfunc_rsp_luv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00111 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of each element UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); for (int i = 0; i < 8; i++) { -#if USE_SIMD - SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; -#endif + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; } } +#endif -static void cfunc_rsp_lhv(void *param) +#if USE_SIMD +// LUV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00111 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of each element + +static void cfunc_rsp_luv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); + + for (int i = 0; i < 8; i++) + { + SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i); + } +} + +#else + +static void cfunc_rsp_luv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + + for (int i = 0; i < 8; i++) + { + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; + } +} +#endif + +#if USE_SIMD +// LHV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01000 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of each element, with 2-byte stride + +static void cfunc_rsp_lhv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1209,26 +1453,49 @@ static void cfunc_rsp_lhv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01000 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of each element, with 2-byte stride UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); for (int i = 0; i < 8; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; -#endif } } -static void cfunc_rsp_lfv(void *param) +#else + +static void cfunc_rsp_lhv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + + for (int i = 0; i < 8; i++) + { + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; + } +} +#endif + +#if USE_SIMD +// LFV +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01001 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride + +static void cfunc_rsp_lfv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1240,12 +1507,6 @@ static void cfunc_rsp_lfv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01001 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); @@ -1255,16 +1516,14 @@ static void cfunc_rsp_lfv(void *param) for (int i = index >> 1; i < end; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea) << 7; -#endif ea += 4; } } -static void cfunc_rsp_lwv(void *param) +#else + +static void cfunc_rsp_lfv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1276,37 +1535,33 @@ static void cfunc_rsp_lwv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01010 | IIII | Offset | - // -------------------------------------------------- - // - // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 - // after byte index 15 UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); - int end = (16 - index) + 16; -#if USE_SIMD - UINT8 val[16]; -#endif - for (int i = (16 - index); i < end; i++) + // not sure what happens if 16-byte boundary is crossed... + + int end = (index >> 1) + 4; + + for (int i = index >> 1; i < end; i++) { -#if USE_SIMD - val[i & 0xf] = READ8(rsp, ea); -#else - VREG_B(dest, i & 0xf) = READ8(rsp, ea); -#endif + W_VREG_S(dest, i) = READ8(rsp, ea) << 7; ea += 4; } +} +#endif #if USE_SIMD - rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8], - val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]); -#endif -} +// LWV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01010 | IIII | Offset | +// -------------------------------------------------- +// +// Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 +// after byte index 15 -static void cfunc_rsp_ltv(void *param) +static void cfunc_rsp_lwv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1314,13 +1569,69 @@ static void cfunc_rsp_ltv(void *param) int base = (op >> 21) & 0x1f; int index = (op >> 7) & 0xf; int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01011 | IIII | Offset | - // -------------------------------------------------- - // - // Loads one element to maximum of 8 vectors, while incrementing element index + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + int end = (16 - index) + 16; + + UINT8 val[16]; + for (int i = (16 - index); i < end; i++) + { + val[i & 0xf] = READ8(rsp, ea); + ea += 4; + } + + rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8], + val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]); +} + +#else + +static void cfunc_rsp_lwv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int end = (16 - index) + 16; + + for (int i = (16 - index); i < end; i++) + { + VREG_B(dest, i & 0xf) = READ8(rsp, ea); + ea += 4; + } +} +#endif + +#if USE_SIMD +// LTV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01011 | IIII | Offset | +// -------------------------------------------------- +// +// Loads one element to maximum of 8 vectors, while incrementing element index + +static void cfunc_rsp_ltv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); // FIXME: has a small problem with odd indices @@ -1338,19 +1649,122 @@ static void cfunc_rsp_ltv(void *param) ea = ((ea + 8) & ~0xf) + (index & 1); for (int i = vs; i < ve; i++) { - element = ((8 - (index >> 1) + (i - vs)) << 1); -#if USE_SIMD + element = (8 - (index >> 1) + (i - vs)) << 1; UINT16 value = (READ8(rsp, ea) << 8) | READ8(rsp, ea + 1); SIMD_INSERT16(rsp->xv[i], value, (element >> 1)); -#else - VREG_B(i, (element & 0xf)) = READ8(rsp, ea); - VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1); -#endif - ea += 2; } } +#else + 
+static void cfunc_rsp_ltv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + + // FIXME: has a small problem with odd indices + + int vs = dest; + int ve = dest + 8; + if (ve > 32) + { + ve = 32; + } + + int element = 7 - (index >> 1); + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + + ea = ((ea + 8) & ~0xf) + (index & 1); + for (int i = vs; i < ve; i++) + { + element = (8 - (index >> 1) + (i - vs)) << 1; + VREG_B(i, (element & 0xf)) = READ8(rsp, ea); + VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1); + ea += 2; + } +} +#endif + +#if USE_SIMD +static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ + //int loopdest; + UINT32 op = desc->opptr.l[0]; + //int dest = (op >> 16) & 0x1f; + //int base = (op >> 21) & 0x1f; + //int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + //int skip; + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + switch ((op >> 11) & 0x1f) + { + case 0x00: /* LBV */ + //UML_ADD(block, I0, R32(RSREG), offset); + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lbv_simd, rsp); + return TRUE; + case 0x01: /* LSV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lsv_simd, rsp); + return TRUE; + case 0x02: /* LLV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_llv_simd, rsp); + return TRUE; + case 0x03: /* LDV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ldv_simd, rsp); + return TRUE; + case 0x04: /* LQV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov 
[arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lqv_simd, rsp); + return TRUE; + case 0x05: /* LRV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lrv_simd, rsp); + return TRUE; + case 0x06: /* LPV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lpv_simd, rsp); + return TRUE; + case 0x07: /* LUV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_luv_simd, rsp); + return TRUE; + case 0x08: /* LHV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lhv_simd, rsp); + return TRUE; + case 0x09: /* LFV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lfv_simd, rsp); + return TRUE; + case 0x0a: /* LWV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lwv_simd, rsp); + return TRUE; + case 0x0b: /* LTV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ltv_simd, rsp); + return TRUE; + + default: + return FALSE; + } +} + +#else + static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { //int loopdest; @@ -1370,59 +1784,70 @@ static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *co case 0x00: /* LBV */ //UML_ADD(block, I0, R32(RSREG), offset); UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lbv, rsp); + UML_CALLC(block, cfunc_rsp_lbv_scalar, rsp); return TRUE; case 0x01: /* LSV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lsv, rsp); + 
UML_CALLC(block, cfunc_rsp_lsv_scalar, rsp); return TRUE; case 0x02: /* LLV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_llv, rsp); + UML_CALLC(block, cfunc_rsp_llv_scalar, rsp); return TRUE; case 0x03: /* LDV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ldv, rsp); + UML_CALLC(block, cfunc_rsp_ldv_scalar, rsp); return TRUE; case 0x04: /* LQV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lqv, rsp); + UML_CALLC(block, cfunc_rsp_lqv_scalar, rsp); return TRUE; case 0x05: /* LRV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lrv, rsp); + UML_CALLC(block, cfunc_rsp_lrv_scalar, rsp); return TRUE; case 0x06: /* LPV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lpv, rsp); + UML_CALLC(block, cfunc_rsp_lpv_scalar, rsp); return TRUE; case 0x07: /* LUV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_luv, rsp); + UML_CALLC(block, cfunc_rsp_luv_scalar, rsp); return TRUE; case 0x08: /* LHV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lhv, rsp); + UML_CALLC(block, cfunc_rsp_lhv_scalar, rsp); return TRUE; case 0x09: /* LFV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lfv, rsp); + UML_CALLC(block, cfunc_rsp_lfv_scalar, rsp); return TRUE; case 0x0a: /* LWV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lwv, rsp); + UML_CALLC(block, cfunc_rsp_lwv_scalar, rsp); return TRUE; case 0x0b: /* LTV */ UML_MOV(block, 
mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ltv, rsp); + UML_CALLC(block, cfunc_rsp_ltv_scalar, rsp); return TRUE; default: return FALSE; } } +#endif -static void cfunc_rsp_sbv(void *param) +#if USE_SIMD +// SBV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00000 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 1 byte from vector byte index + +static void cfunc_rsp_sbv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1435,25 +1860,16 @@ static void cfunc_rsp_sbv(void *param) offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00000 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 1 byte from vector byte index - UINT32 ea = (base) ? rsp->r[base] + offset : offset; -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); value >>= (1-(index & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, index)); -#endif } -static void cfunc_rsp_ssv(void *param) +#else + +static void cfunc_rsp_sbv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1466,31 +1882,79 @@ static void cfunc_rsp_ssv(void *param) offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00001 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 2 bytes starting from vector byte index + UINT32 ea = (base) ? 
rsp->r[base] + offset : offset; + WRITE8(rsp, ea, VREG_B(dest, index)); +} +#endif + +#if USE_SIMD +// SSV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00001 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 2 bytes starting from vector byte index + +static void cfunc_rsp_ssv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); WRITE8(rsp, ea, (UINT8)(value >> 8)); WRITE8(rsp, ea+1, (UINT8)(value & 0x00ff)); +} + #else + +static void cfunc_rsp_ssv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 2) : (offset * 2); + int end = index + 2; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_slv(void *param) +#if USE_SIMD +// SLV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00010 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 4 bytes starting from vector byte index + +static void cfunc_rsp_slv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1502,16 +1966,9 @@ static void cfunc_rsp_slv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00010 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 4 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); -#if USE_SIMD UINT16 value0, value1; index >>= 1; SIMD_EXTRACT16(rsp->xv[dest], value0, index); @@ -1520,17 +1977,45 @@ static void cfunc_rsp_slv(void *param) WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff)); WRITE8(rsp, ea+2, (UINT8)(value1 >> 8)); WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff)); +} + #else + +static void cfunc_rsp_slv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 4) : (offset * 4); + int end = index + 4; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_sdv(void *param) +#if USE_SIMD +// SDV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00011 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 8 bytes starting from vector byte index + +static void cfunc_rsp_sdv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1542,15 +2027,8 @@ static void cfunc_rsp_sdv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00011 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 8 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); -#if USE_SIMD UINT16 value0, value1, value2, value3; index >>= 1; SIMD_EXTRACT16(rsp->xv[dest], value0, index); @@ -1565,17 +2043,44 @@ static void cfunc_rsp_sdv(void *param) WRITE8(rsp, ea+5, (UINT8)(value2 & 0x00ff)); WRITE8(rsp, ea+6, (UINT8)(value3 >> 8)); WRITE8(rsp, ea+7, (UINT8)(value3 & 0x00ff)); +} + #else + +static void cfunc_rsp_sdv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0x8; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_sqv(void *param) +#if USE_SIMD +// SQV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00100 | IIII | Offset | +// -------------------------------------------------- +// +// Stores up to 16 bytes starting from vector byte index until 16-byte boundary + +static void cfunc_rsp_sqv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1587,30 +2092,55 @@ static void cfunc_rsp_sqv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00100 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from vector byte index until 16-byte boundary UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); int end = index + (16 - (ea & 0xf)); for (int i=index; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (i >> 1)); value >>= (1-(i & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, i & 0xf)); -#endif ea++; } } -static void cfunc_rsp_srv(void *param) +#else + +static void cfunc_rsp_sqv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + int end = index + (16 - (ea & 0xf)); + for (int i=index; i < end; i++) + { + WRITE8(rsp, ea, VREG_B(dest, i & 0xf)); + ea++; + } +} +#endif + +#if USE_SIMD +// SRV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00101 | IIII | Offset | +// -------------------------------------------------- +// +// Stores up to 16 bytes starting from right side until 16-byte boundary + +static void cfunc_rsp_srv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1622,12 +2152,6 @@ static void cfunc_rsp_srv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00101 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from right side until 16-byte boundary UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); @@ -1637,20 +2161,55 @@ static void cfunc_rsp_srv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT32 bi = (i + o) & 0xf; UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (bi >> 1)); value >>= (1-(bi & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf))); -#endif ea++; } } -static void cfunc_rsp_spv(void *param) +#else + +static void cfunc_rsp_srv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + + int end = index + (ea & 0xf); + int o = (16 - (ea & 0xf)) & 0xf; + ea &= ~0xf; + + for (int i = index; i < end; i++) + { + WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf))); + ea++; + } +} +#endif + +#if USE_SIMD +// SPV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00110 | IIII | Offset | +// -------------------------------------------------- +// +// Stores upper 8 bits of each element + +static void cfunc_rsp_spv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1662,12 +2221,6 @@ static void cfunc_rsp_spv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00110 | IIII | Offset | - // -------------------------------------------------- - // - // Stores upper 8 bits of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); int end = index + 8; @@ -1675,30 +2228,23 @@ static void cfunc_rsp_spv(void *param) { if ((i & 0xf) < 8) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); WRITE8(rsp, ea, (UINT8)(value >> 8)); -#else - WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1)); -#endif } else { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); - value >>= 7; - WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); -#endif + WRITE8(rsp, ea, (UINT8)(value >> 7)); } ea++; } } -static void cfunc_rsp_suv(void *param) +#else + +static void cfunc_rsp_spv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1710,12 +2256,6 @@ static void cfunc_rsp_suv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00111 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 
bits 14-7 of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); int end = index + 8; @@ -1723,30 +2263,103 @@ static void cfunc_rsp_suv(void *param) { if ((i & 0xf) < 8) { -#if USE_SIMD - UINT16 value; - SIMD_EXTRACT16(rsp->xv[dest], value, i); - value >>= 7; - WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); -#endif + WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1)); } else { -#if USE_SIMD - UINT16 value; - SIMD_EXTRACT16(rsp->xv[dest], value, i); - WRITE8(rsp, ea, (UINT8)value >> 8); -#else - WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1))); -#endif + WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); } ea++; } } +#endif -static void cfunc_rsp_shv(void *param) +#if USE_SIMD +// SUV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00111 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of each element + +static void cfunc_rsp_suv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; + for (int i=index; i < end; i++) + { + if ((i & 0xf) < 8) + { + UINT16 value; + SIMD_EXTRACT16(rsp->xv[dest], value, i); + WRITE8(rsp, ea, (UINT8)(value >> 7)); + } + else + { + UINT16 value; + SIMD_EXTRACT16(rsp->xv[dest], value, i); + WRITE8(rsp, ea, (UINT8)(value >> 8)); + } + ea++; + } +} + +#else + +static void cfunc_rsp_suv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; + for (int i=index; i < end; i++) + { + if ((i & 0xf) < 8) + { + WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); + } + else + { + WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1))); + } + ea++; + } +} +#endif + +#if USE_SIMD +// SHV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01000 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of each element, with 2-byte stride + +static void cfunc_rsp_shv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1758,31 +2371,21 @@ static void cfunc_rsp_shv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01000 | IIII | Offset | - // -------------------------------------------------- - // - // Stores bits 14-7 of each element, with 2-byte stride UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); for (int i=0; i < 8; i++) { int element = index + (i << 1); -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, element >> 1); WRITE8(rsp, ea, (value >> 7) & 0x00ff); -#else - UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) | - (VREG_B(dest, ((element + 1) & 0xf)) >> 7); - WRITE8(rsp, ea, d); -#endif ea += 2; } } -static void cfunc_rsp_sfv(void *param) +#else + +static void cfunc_rsp_shv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1794,14 +2397,41 @@ static void cfunc_rsp_sfv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01001 | IIII | Offset | - // -------------------------------------------------- - // - // Stores bits 14-7 of upper or lower quad, with 4-byte stride - if (index & 0x7) printf("RSP: SFV: index = %d at %08X\n", index, rsp->ppc); + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + for (int i=0; i < 8; i++) + { + int element = index + (i << 1); + UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) | + (VREG_B(dest, ((element + 1) & 0xf)) >> 7); + WRITE8(rsp, ea, d); + ea += 2; + } +} +#endif + +#if USE_SIMD +// SFV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01001 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of upper or lower quad, with 4-byte stride + +static void cfunc_rsp_sfv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); int eaoffset = ea & 0xf; @@ -1811,18 +2441,54 @@ static void cfunc_rsp_sfv(void *param) for (int i = index>>1; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); WRITE8(rsp, ea + (eaoffset & 0xf), (value >> 7) & 0x00ff); -#else - WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); -#endif eaoffset += 4; } } -static void cfunc_rsp_swv(void *param) +#else + +static void cfunc_rsp_sfv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = ea & 0xf; + ea &= ~0xf; + + int end = (index >> 1) + 4; + + for (int i = index>>1; i < end; i++) + { + WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); + eaoffset += 4; + } +} +#endif + +#if USE_SIMD +// SWV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01010 | IIII | Offset | +// -------------------------------------------------- +// +// Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 +// after byte index 15 + +static void cfunc_rsp_swv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1834,13 +2500,6 @@ static void cfunc_rsp_swv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01010 | IIII | Offset | - // -------------------------------------------------- - // - // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 - // after byte index 15 UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); int eaoffset = ea & 0xf; @@ -1849,18 +2508,52 @@ static void cfunc_rsp_swv(void *param) int end = index + 16; for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i >> 1); WRITE8(rsp, ea + (eaoffset & 0xf), (value >> ((1-(i & 1)) * 8)) & 0xff); -#else - WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); -#endif eaoffset++; } } -static void cfunc_rsp_stv(void *param) +#else + +static void cfunc_rsp_swv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = ea & 0xf; + ea &= ~0xf; + + int end = index + 16; + for (int i = index; i < end; i++) + { + WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); + eaoffset++; + } +} +#endif + +#if USE_SIMD +// STV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01011 | IIII | Offset | +// -------------------------------------------------- +// +// Stores one element from maximum of 8 vectors, while incrementing element index + +static void cfunc_rsp_stv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1873,12 +2566,6 @@ static void cfunc_rsp_stv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01011 | IIII | Offset | - // -------------------------------------------------- - // - // Stores one element from maximum of 8 vectors, while incrementing element index int vs = dest; int ve = dest + 8; @@ -1895,18 +2582,129 @@ static void cfunc_rsp_stv(void *param) for (int i = vs; i < ve; i++) { -#if USE_SIMD UINT16 value; 
SIMD_EXTRACT16(rsp->xv[dest], value, element); WRITE16(rsp, ea + (eaoffset & 0xf), value); -#else - WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7)); -#endif eaoffset += 2; element++; } } +#else + +static void cfunc_rsp_stv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + int vs = dest; + int ve = dest + 8; + if (ve > 32) + { + ve = 32; + } + + int element = 8 - (index >> 1); + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = (ea & 0xf) + (element * 2); + ea &= ~0xf; + + for (int i = vs; i < ve; i++) + { + WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7)); + eaoffset += 2; + element++; + } +} + +#endif + +#if USE_SIMD +static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ +// int loopdest; + UINT32 op = desc->opptr.l[0]; + //int dest = (op >> 16) & 0x1f; + //int base = (op >> 21) & 0x1f; + //int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + //int skip; + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + switch ((op >> 11) & 0x1f) + { + case 0x00: /* SBV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sbv_simd, rsp); + return TRUE; + case 0x01: /* SSV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ssv_simd, rsp); + return TRUE; + case 0x02: /* SLV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_slv_simd, rsp); + return TRUE; + case 0x03: /* SDV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sdv_simd, rsp); + 
return TRUE; + case 0x04: /* SQV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sqv_simd, rsp); + return TRUE; + case 0x05: /* SRV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_srv_simd, rsp); + return TRUE; + case 0x06: /* SPV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_spv_simd, rsp); + return TRUE; + case 0x07: /* SUV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_suv_simd, rsp); + return TRUE; + case 0x08: /* SHV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_shv_simd, rsp); + return TRUE; + case 0x09: /* SFV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sfv_simd, rsp); + return TRUE; + case 0x0a: /* SWV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_swv_simd, rsp); + return TRUE; + case 0x0b: /* STV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_stv_simd, rsp); + return TRUE; + + default: + unimplemented_opcode(rsp, op); + return FALSE; + } + + return TRUE; +} + +#else + static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { // int loopdest; @@ -1925,51 +2723,51 @@ static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *co { case 0x00: /* SBV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sbv, rsp); + UML_CALLC(block, cfunc_rsp_sbv_scalar, rsp); return TRUE; case 0x01: /* SSV */ UML_MOV(block, 
mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ssv, rsp); + UML_CALLC(block, cfunc_rsp_ssv_scalar, rsp); return TRUE; case 0x02: /* SLV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_slv, rsp); + UML_CALLC(block, cfunc_rsp_slv_scalar, rsp); return TRUE; case 0x03: /* SDV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sdv, rsp); + UML_CALLC(block, cfunc_rsp_sdv_scalar, rsp); return TRUE; case 0x04: /* SQV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sqv, rsp); + UML_CALLC(block, cfunc_rsp_sqv_scalar, rsp); return TRUE; case 0x05: /* SRV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_srv, rsp); + UML_CALLC(block, cfunc_rsp_srv_scalar, rsp); return TRUE; case 0x06: /* SPV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_spv, rsp); + UML_CALLC(block, cfunc_rsp_spv_scalar, rsp); return TRUE; case 0x07: /* SUV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_suv, rsp); + UML_CALLC(block, cfunc_rsp_suv_scalar, rsp); return TRUE; case 0x08: /* SHV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_shv, rsp); + UML_CALLC(block, cfunc_rsp_shv_scalar, rsp); return TRUE; case 0x09: /* SFV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sfv, rsp); + UML_CALLC(block, cfunc_rsp_sfv_scalar, rsp); return TRUE; case 0x0a: /* SWV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_swv, 
rsp); + UML_CALLC(block, cfunc_rsp_swv_scalar, rsp); return TRUE; case 0x0b: /* STV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_stv, rsp); + UML_CALLC(block, cfunc_rsp_stv_scalar, rsp); return TRUE; default: @@ -1979,9 +2777,62 @@ static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *co return TRUE; } +#endif INLINE UINT16 SATURATE_ACCUM(rsp_state *rsp, int accum, int slice, UINT16 negative, UINT16 positive) { +#if USE_SIMD + if ((INT16)VEC_ACCUM_H(rsp, accum) < 0) + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0xffff) + { + return negative; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) >= 0) + { + return negative; + } + else + { + if (slice == 0) + { + return VEC_ACCUM_L(rsp, accum); + } + else if (slice == 1) + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + } + else + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0) + { + return positive; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) < 0) + { + return positive; + } + else + { + if (slice == 0) + { + return VEC_ACCUM_L(rsp, accum); + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + } +#else if ((INT16)ACCUM_H(rsp, accum) < 0) { if ((UINT16)(ACCUM_H(rsp, accum)) != 0xffff) @@ -2032,7 +2883,7 @@ INLINE UINT16 SATURATE_ACCUM(rsp_state *rsp, int accum, int slice, UINT16 negati } } } - +#endif return 0; } @@ -2076,6 +2927,44 @@ INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 // Return positive if H>0 || (H==0 && M<0) // Return medium slice if H==0xffff && M<0 // Return medium slice if H==0 && M>=0 +#if USE_SIMD + if ((INT16)VEC_ACCUM_H(rsp, accum) < 0) + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0xffff) + { + return negative; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) >= 0) + { + return negative; + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + else + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0) + { + return positive; + } + else + { 
+ if ((INT16)VEC_ACCUM_M(rsp, accum) < 0) + { + return positive; + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } +#else if ((INT16)ACCUM_H(rsp, accum) < 0) { if ((UINT16)(ACCUM_H(rsp, accum)) != 0xffff) @@ -2112,7 +3001,7 @@ INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 } } } - +#endif return 0; } @@ -2276,17 +3165,58 @@ INLINE __m128i RSPClampLowToVal(__m128i vaccLow, __m128i vaccMid, __m128i vaccHi return _mm_or_si128(negVal, posVal); } #endif -INLINE void cfunc_rsp_vmulf(void *param) + +#if USE_SIMD +// VMULF +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000000 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer * 2 + +INLINE void cfunc_rsp_vmulf_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + UINT16 w1, w2; + SCALAR_GET_VS1(w1, i); + SCALAR_GET_VS2(w2, i); + INT32 s1 = (INT32)(INT16)w1; + INT32 s2 = (INT32)(INT16)w2; + + if (s1 == -32768 && s2 == -32768) + { + // overflow + VEC_SET_ACCUM_H(0, i); + VEC_SET_ACCUM_M(-32768, i); + VEC_SET_ACCUM_L(-32768, i); + vres[i] = 0x7fff; + } + else + { + INT64 r = s1 * s2 * 2; + r += 0x8000; // rounding ? + VEC_SET_ACCUM_H((r < 0) ? 
0xffff : 0, i); + VEC_SET_ACCUM_M((INT16)(r >> 16), i); + VEC_SET_ACCUM_L((UINT16)(r), i); + vres[i] = VEC_ACCUM_M(rsp, i); + } + } + WRITEBACK_RESULT(); +} + +#else + +INLINE void cfunc_rsp_vmulf_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000000 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer * 2 INT16 vres[8]; for (int i = 0; i < 8; i++) @@ -2317,16 +3247,60 @@ INLINE void cfunc_rsp_vmulf(void *param) } WRITEBACK_RESULT(); } +#endif -INLINE void cfunc_rsp_vmulu(void *param) +#if USE_SIMD +// VMULU +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 | +// ------------------------------------------------------ +// + +INLINE void cfunc_rsp_vmulu_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + UINT16 w1, w2; + SCALAR_GET_VS1(w1, i); + SCALAR_GET_VS2(w2, i); + INT32 s1 = (INT32)(INT16)w1; + INT32 s2 = (INT32)(INT16)w2; + + INT64 r = s1 * s2 * 2; + r += 0x8000; // rounding ? 
+ + VEC_SET_ACCUM_H((UINT16)(r >> 32), i); + VEC_SET_ACCUM_M((UINT16)(r >> 16), i); + VEC_SET_ACCUM_L((UINT16)(r), i); + + if (r < 0) + { + vres[i] = 0; + } + else if (((INT16)(VEC_ACCUM_H(rsp, i)) ^ (INT16)(VEC_ACCUM_M(rsp, i))) < 0) + { + vres[i] = -1; + } + else + { + vres[i] = VEC_ACCUM_M(rsp, i); + } + } + WRITEBACK_RESULT(); +} + +#else + +INLINE void cfunc_rsp_vmulu_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 | - // ------------------------------------------------------ - // INT16 vres[8]; for (int i = 0; i < 8; i++) @@ -2359,23 +3333,25 @@ INLINE void cfunc_rsp_vmulu(void *param) } WRITEBACK_RESULT(); } +#endif -INLINE void cfunc_rsp_vmudl(void *param) +#if USE_SIMD +// VMUDL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001101 | +// ------------------------------------------------------ +// +// Multiplies signed integer by unsigned fraction +// The result is stored into accumulator +// The low slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmudl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001101 | - // ------------------------------------------------------ - // - // Multiplies signed integer by unsigned fraction - // The result is added into accumulator - // The middle slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsReg = rsp->xv[VS1REG]; __m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -2389,9 +3365,15 @@ INLINE void cfunc_rsp_vmudl(void *param) rsp->accum_m = _mm_setzero_si128(); rsp->accum_h =
_mm_setzero_si128(); +} #else +INLINE void cfunc_rsp_vmudl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2410,25 +3392,26 @@ INLINE void cfunc_rsp_vmudl(void *param) vres[i] = ACCUM_L(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudm(void *param) +#if USE_SIMD +// VMUDM +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000101 | +// ------------------------------------------------------ +// +// Multiplies signed integer by unsigned fraction +// The result is stored into accumulator +// The middle slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmudm_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000101 | - // ------------------------------------------------------ - // - // Multiplies signed integer by unsigned fraction - // The result is stored into accumulator - // The middle slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi; __m128i vsReg = rsp->xv[VS1REG]; @@ -2447,9 +3430,15 @@ INLINE void cfunc_rsp_vmudm(void *param) loProduct = _mm_cmplt_epi32(loProduct, _mm_setzero_si128()); hiProduct = _mm_cmplt_epi32(hiProduct, _mm_setzero_si128()); rsp->accum_h = _mm_packs_epi32(loProduct, hiProduct); +} #else +INLINE void cfunc_rsp_vmudm_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2468,25 +3457,26 @@ INLINE void cfunc_rsp_vmudm(void *param) vres[i] = ACCUM_M(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudn(void *param) +#if USE_SIMD +// VMUDN +// +// 31 25 24 20 
15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000110 | +// ------------------------------------------------------ +// +// Multiplies unsigned fraction by signed integer +// The result is stored into accumulator +// The low slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmudn_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000110 | - // ------------------------------------------------------ - // - // Multiplies unsigned fraction by signed integer - // The result is stored into accumulator - // The low slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi; __m128i vsReg = rsp->xv[VS1REG]; @@ -2502,9 +3492,15 @@ INLINE void cfunc_rsp_vmudn(void *param) rsp->xv[VDREG] = rsp->accum_l = RSPPackLo32to16(loProduct, hiProduct); rsp->accum_m = RSPPackHi32to16(loProduct, hiProduct); rsp->accum_h = _mm_cmplt_epi16(rsp->accum_m, _mm_setzero_si128()); +} #else +INLINE void cfunc_rsp_vmudn_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -2523,25 +3519,26 @@ INLINE void cfunc_rsp_vmudn(void *param) vres[i] = (UINT16)(r); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudh(void *param) +#if USE_SIMD +// VMUDH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer +// The result is stored into highest 32 bits of accumulator, the low slice is zero +// The highest 32 bits of accumulator is saturated into destination element + 
+INLINE void cfunc_rsp_vmudh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer - // The result is stored into highest 32 bits of accumulator, the low slice is zero - // The highest 32 bits of accumulator is saturated into destination element - -#if USE_SIMD - __m128i vaccLow, vaccHigh; __m128i unpackLo, unpackHi; @@ -2559,9 +3556,15 @@ INLINE void cfunc_rsp_vmudh(void *param) rsp->accum_l = _mm_setzero_si128(); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else +INLINE void cfunc_rsp_vmudh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2582,16 +3585,23 @@ INLINE void cfunc_rsp_vmudh(void *param) vres[i] = (INT16)(r); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmacf(void *param) +#if USE_SIMD +// VMACF +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001000 | +// ------------------------------------------------------ +// + +INLINE void cfunc_rsp_vmacf_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD - __m128i loProduct, hiProduct, unpackLo, unpackHi; __m128i vaccHigh; __m128i vdReg, vdRegLo, vdRegHi; @@ -2642,9 +3652,15 @@ INLINE void cfunc_rsp_vmacf(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else +INLINE void cfunc_rsp_vmacf_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; 
for (int i = 0; i < 8; i++) { @@ -2671,22 +3687,23 @@ INLINE void cfunc_rsp_vmacf(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmacu(void *param) +#if USE_SIMD +// VMACU +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 | +// ------------------------------------------------------ +// + +INLINE void cfunc_rsp_vmacu_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 | - // ------------------------------------------------------ - // - -#if USE_SIMD - __m128i loProduct, hiProduct, unpackLo, unpackHi; __m128i vaccHigh; __m128i vdReg, vdRegLo, vdRegHi; @@ -2736,8 +3753,15 @@ INLINE void cfunc_rsp_vmacu(void *param) /* Clamp the accumulator and write it all out. 
*/ rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} + #else +INLINE void cfunc_rsp_vmacu_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2780,25 +3804,26 @@ INLINE void cfunc_rsp_vmacu(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadl(void *param) +#if USE_SIMD +// VMADL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 | +// ------------------------------------------------------ +// +// Multiplies unsigned fraction by unsigned fraction +// Adds the higher 16 bits of the 32-bit result to accumulator +// The low slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmadl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 | - // ------------------------------------------------------ - // - // Multiplies unsigned fraction by unsigned fraction - // Adds the higher 16 bits of the 32-bit result to accumulator - // The low slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vaccHigh; __m128i unpackHi, loProduct, hiProduct; __m128i vdReg, vdRegLo, vdRegHi; @@ -2833,8 +3858,15 @@ INLINE void cfunc_rsp_vmadl(void *param) rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); rsp->xv[VDREG] = RSPClampLowToVal(vdReg, rsp->accum_m, rsp->accum_h); +} #else + +INLINE void cfunc_rsp_vmadl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2855,15 +3887,18 @@ INLINE void cfunc_rsp_vmadl(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 0, 
0x0000, 0xffff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadm(void *param) +#if USE_SIMD +// VMADM +// + +INLINE void cfunc_rsp_vmadm_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD __m128i vaccLow, vaccHigh, loProduct, hiProduct; __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi, vdRegLo, vdRegHi; @@ -2908,8 +3943,15 @@ INLINE void cfunc_rsp_vmadm(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else + +INLINE void cfunc_rsp_vmadm_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2934,15 +3976,18 @@ INLINE void cfunc_rsp_vmadm(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadn(void *param) +#if USE_SIMD +// VMADN +// + +INLINE void cfunc_rsp_vmadn_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD __m128i vaccLow, vaccHigh, loProduct, hiProduct; __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi, vdRegLo, vdRegHi; @@ -2989,7 +4034,15 @@ INLINE void cfunc_rsp_vmadn(void *param) rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); rsp->xv[VDREG] = RSPClampLowToVal(rsp->accum_l, rsp->accum_m, rsp->accum_h); +} + #else + +INLINE void cfunc_rsp_vmadn_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3013,25 +4066,26 @@ INLINE void cfunc_rsp_vmadn(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadh(void *param) +#if USE_SIMD +// VMADH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE 
| SSSSS | TTTTT | DDDDD | 001111 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer +// The result is added into highest 32 bits of accumulator, the low slice is zero +// The highest 32 bits of accumulator is saturated into destination element + +INLINE void cfunc_rsp_vmadh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001111 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer - // The result is added into highest 32 bits of accumulator, the low slice is zero - // The highest 32 bits of accumulator is saturated into destination element - -#if USE_SIMD - __m128i vsReg = rsp->xv[VS1REG]; __m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3051,8 +4105,15 @@ INLINE void cfunc_rsp_vmadh(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else + +INLINE void cfunc_rsp_vmadh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3072,23 +4133,23 @@ INLINE void cfunc_rsp_vmadh(void *param) vres[i] = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vadd(void *param) +#if USE_SIMD +// VADD +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 | +// ------------------------------------------------------ +// +// Adds two vector registers and carry flag, the result is saturated to 32767 + +INLINE void cfunc_rsp_vadd_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 
25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 | - // ------------------------------------------------------ - // - // Adds two vector registers and carry flag, the result is saturated to 32767 - -#if USE_SIMD - __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i carry = _mm_and_si128(rsp->xvflag[CARRY], vec_flagmask); rsp->accum_l = _mm_add_epi16(_mm_add_epi16(rsp->xv[VS1REG], shuffled), carry); @@ -3102,7 +4163,15 @@ INLINE void cfunc_rsp_vadd(void *param) rsp->xvflag[ZERO] = _mm_setzero_si128(); rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vadd_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -3122,24 +4191,25 @@ INLINE void cfunc_rsp_vadd(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsub(void *param) +#if USE_SIMD +// VSUB +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 | +// ------------------------------------------------------ +// +// Subtracts two vector registers and carry flag, the result is saturated to -32768 +// TODO: check VS2REG == VDREG + +INLINE void cfunc_rsp_vsub_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 | - // ------------------------------------------------------ - // - // Subtracts two vector registers and carry flag, the result is saturated to -32768 - - // TODO: check VS2REG == VDREG - -#if USE_SIMD __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i carry = _mm_and_si128(rsp->xvflag[CARRY], vec_flagmask); 
__m128i unsat = _mm_sub_epi16(_mm_sub_epi16(rsp->xv[VS1REG], shuffled), carry); @@ -3154,7 +4224,15 @@ INLINE void cfunc_rsp_vsub(void *param) rsp->xvflag[ZERO] = _mm_setzero_si128(); rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vsub_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3175,23 +4253,24 @@ INLINE void cfunc_rsp_vsub(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vabs(void *param) +#if USE_SIMD +// VABS +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 | +// ------------------------------------------------------ +// +// Changes the sign of source register 2 if source register 1 is negative and stores the result to destination register + +INLINE void cfunc_rsp_vabs_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 | - // ------------------------------------------------------ - // - // Changes the sign of source register 2 if source register 1 is negative and stores - // the result to destination register - -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i negs2 = _mm_sub_epi16(_mm_setzero_si128(), shuf2); __m128i s2_n32768 = _mm_cmpeq_epi16(shuf2, vec_n32768); @@ -3201,7 +4280,15 @@ INLINE void cfunc_rsp_vabs(void *param) __m128i result_n32768 = _mm_and_si128(s1_lz, _mm_and_si128(vec_32767, s2_n32768)); __m128i result_negs2 = _mm_and_si128(s1_lz, _mm_and_si128(negs2, _mm_xor_si128(s2_n32768, vec_neg1))); rsp->xv[VDREG] = rsp->accum_l = _mm_or_si128(result_gz, _mm_or_si128(result_n32768, result_negs2)); +} + #else + +INLINE void 
cfunc_rsp_vabs_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3232,27 +4319,28 @@ INLINE void cfunc_rsp_vabs(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vaddc(void *param) +#if USE_SIMD +// VADDC +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 | +// ------------------------------------------------------ +// +// Adds two vector registers, the carry out is stored into carry register +// TODO: check VS2REG = VDREG + +INLINE void cfunc_rsp_vaddc_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 | - // ------------------------------------------------------ - // - // Adds two vector registers, the carry out is stored into carry register - - // TODO: check VS2REG = VDREG - CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); @@ -3266,7 +4354,18 @@ INLINE void cfunc_rsp_vaddc(void *param) rsp->xvflag[CARRY] = _mm_or_si128(over6420, _mm_srli_epi32(over7531, 16)); rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); +} + #else + +INLINE void cfunc_rsp_vaddc_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -3286,27 +4385,29 @@ INLINE void cfunc_rsp_vaddc(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsubc(void *param) +#if USE_SIMD +// VSUBC +// +// 31 25 
24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 | +// ------------------------------------------------------ +// +// Subtracts two vector registers, the carry out is stored into carry register +// TODO: check VS2REG = VDREG + +INLINE void cfunc_rsp_vsubc_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 | - // ------------------------------------------------------ - // - // Subtracts two vector registers, the carry out is stored into carry register - - // TODO: check VS2REG = VDREG CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); @@ -3326,7 +4427,19 @@ INLINE void cfunc_rsp_vsubc(void *param) rsp->xvflag[ZERO] = _mm_or_si128(zero6420, _mm_srli_epi32(zero7531, 16)); rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); +} + #else + +INLINE void cfunc_rsp_vsubc_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3350,78 +4463,99 @@ INLINE void cfunc_rsp_vsubc(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsaw(void *param) +#if USE_SIMD +// VSAW +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 011101 | +// ------------------------------------------------------ +// +// Stores high, middle or low slice of accumulator to destination vector + +INLINE void cfunc_rsp_vsaw_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op 
= rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 011101 | - // ------------------------------------------------------ - // - // Stores high, middle or low slice of accumulator to destination vector - switch (EL) { case 0x08: // VSAWH { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_h; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_H(rsp, i); - } -#endif break; } case 0x09: // VSAWM { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_m; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_M(rsp, i); - } -#endif break; } case 0x0a: // VSAWL { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_l; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_L(rsp, i); - } -#endif break; } default: fatalerror("RSP: VSAW: el = %d\n", EL); } } -INLINE void cfunc_rsp_vlt(void *param) +#else + +INLINE void cfunc_rsp_vsaw_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100000 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are less than VS2 - // Moves the element in VS2 to destination vector + switch (EL) + { + case 0x08: // VSAWH + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_H(rsp, i); + } + break; + } + case 0x09: // VSAWM + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_M(rsp, i); + } + break; + } + case 0x0a: // VSAWL + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_L(rsp, i); + } + break; + } + default: fatalerror("RSP: VSAW: el = %d\n", EL); + } +} +#endif #if USE_SIMD +// VLT +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100000 | +// 
------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are less than VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vlt_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3435,7 +4569,15 @@ INLINE void cfunc_rsp_vlt(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vlt_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3473,23 +4615,25 @@ INLINE void cfunc_rsp_vlt(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_veq(void *param) +#if USE_SIMD +// VEQ +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_veq_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3502,7 +4646,15 @@ 
INLINE void cfunc_rsp_veq(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_veq_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3529,23 +4681,25 @@ INLINE void cfunc_rsp_veq(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vne(void *param) +#if USE_SIMD +// VNE +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are not equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vne_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are not equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3557,7 +4711,15 @@ INLINE void cfunc_rsp_vne(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vne_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3584,24 +4746,25 @@ INLINE void 
cfunc_rsp_vne(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vge(void *param) +#if USE_SIMD +// VGE +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100011 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are greater or equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vge_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100011 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are greater or equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3614,7 +4777,15 @@ INLINE void cfunc_rsp_vge(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vge_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3640,218 +4811,117 @@ INLINE void cfunc_rsp_vge(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vcl(void *param) +#if USE_SIMD +// VCL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 | +// ------------------------------------------------------ +// +// Vector clip low + +INLINE void 
cfunc_rsp_vcl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; INT16 vres[8]; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 | - // ------------------------------------------------------ - // - // Vector clip low + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); -#if 0//USE_SIMD - __m128i flag0_07 = _mm_set_epi16(CARRY_FLAG(0), CARRY_FLAG(1), CARRY_FLAG(2), CARRY_FLAG(3), - CARRY_FLAG(4), CARRY_FLAG(5), CARRY_FLAG(6), CARRY_FLAG(7)); - __m128i flag0_815 = _mm_set_epi16(ZERO_FLAG(0), ZERO_FLAG(1), ZERO_FLAG(2), ZERO_FLAG(3), - ZERO_FLAG(4), ZERO_FLAG(5), ZERO_FLAG(6), ZERO_FLAG(7)); - __m128i flag1_07 = _mm_set_epi16(COMPARE_FLAG(0), COMPARE_FLAG(1), COMPARE_FLAG(2), COMPARE_FLAG(3), - COMPARE_FLAG(4), COMPARE_FLAG(5), COMPARE_FLAG(6), COMPARE_FLAG(7)); - __m128i flag1_815 = _mm_set_epi16((rsp->flag[1] >> 8) & 1, (rsp->flag[1] >> 9) & 1, (rsp->flag[1] >> 10) & 1, (rsp->flag[1] >> 11) & 1, - (rsp->flag[1] >> 12) & 1, (rsp->flag[1] >> 13) & 1, (rsp->flag[1] >> 14) & 1, (rsp->flag[1] >> 15) & 1); - __m128i flag2_07 = _mm_set_epi16(rsp->flag[2][0], rsp->flag[2][1], rsp->flag[2][2], rsp->flag[2][3], - rsp->flag[2][4], rsp->flag[2][5], rsp->flag[2][6], rsp->flag[2][7]); - __m128i n0_07 = _mm_xor_si128(flag0_07, vec_neg1); - __m128i n0_815 = _mm_xor_si128(flag0_815, vec_neg1); - __m128i n1_07 = _mm_xor_si128(flag1_07, vec_neg1); - __m128i n1_815 = _mm_xor_si128(flag1_815, vec_neg1); - __m128i n2_07 = _mm_xor_si128(flag2_07, vec_neg1); + if (CARRY_FLAG(rsp, i) != 0) + { + if (ZERO_FLAG(rsp, i) != 0) + { + if (COMPARE_FLAG(rsp, i) != 0) + { + VEC_SET_ACCUM_L(-(UINT16)s2, i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else//ZERO_FLAG(rsp, i)==0 + { + if (CLIP1_FLAG(rsp, i) != 0) + { + if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) > 0x10000) + {//proper fix for Harvest Moon 64, r4 + 
VEC_SET_ACCUM_L(s1, i); + CLEAR_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(-((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + } + else + { + if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) != 0) + { + VEC_SET_ACCUM_L(s1, i); + CLEAR_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(-((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + } + } + } + else//CARRY_FLAG(rsp, i)==0 + { + if (ZERO_FLAG(rsp, i) != 0) + { + if (CLIP2_FLAG(rsp, i) != 0) + { + VEC_SET_ACCUM_L(s2, i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else + { + if (((INT32)(UINT16)s1 - (INT32)(UINT16)s2) >= 0) + { + VEC_SET_ACCUM_L(s2, i); + SET_CLIP2_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + CLEAR_CLIP2_FLAG(i); + } + } + } + vres[i] = VEC_ACCUM_L(rsp, i); + } + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + CLEAR_CLIP1_FLAGS(); + WRITEBACK_RESULT(); +} - __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); - __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); - __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); - __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask); - __m128i shuf6420 = _mm_srli_epi32(shuf2, 16); - __m128i sub7531 = _mm_sub_epi32(vec7531, shuf7531); - __m128i sub6420 = _mm_sub_epi32(vec6420, shuf6420); - __m128i subh7531 = _mm_and_si128(sub7531, vec_himask); - __m128i subh6420 = _mm_and_si128(sub6420, vec_himask); - __m128i sub_gez = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(subh6420, _mm_setzero_si128()), 16), _mm_cmpeq_epi16(subh7531, _mm_setzero_si128())); - __m128i sub_lz = _mm_xor_si128(sub_gez, vec_neg1); - - __m128i sum7531 = _mm_add_epi32(vec7531, shuf7531); - __m128i sum6420 = _mm_add_epi32(vec6420, shuf6420); - __m128i suml7531 = _mm_and_si128(sum7531, vec_lomask); - __m128i suml6420 = _mm_and_si128(sum6420, vec_lomask); - __m128i sumh7531 = _mm_and_si128(sum7531, vec_himask); - __m128i sumh6420 = _mm_and_si128(sum6420, vec_himask); - __m128i suml_z = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(suml6420, 
_mm_setzero_si128()), 16), _mm_cmpeq_epi16(suml7531, _mm_setzero_si128())); - __m128i sumh_1 = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(sumh6420, vec_hibit), 16), _mm_cmpeq_epi16(sumh7531, vec_hibit)); - __m128i sumh_z = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(sumh6420, _mm_setzero_si128()), 16), _mm_cmpeq_epi16(sumh7531, _mm_setzero_si128())); - __m128i sum_z = _mm_and_si128(suml_z, sumh_z); - __m128i sum_nz = _mm_xor_si128(sum_z, vec_neg1); - __m128i sum_le0x10000 = _mm_or_si128(_mm_and_si128(suml_z, sumh_1), sumh_z); - __m128i sum_g0x10000 = _mm_xor_si128(sum_le0x10000, vec_neg1); - - __m128i f0a_and_nf0b = _mm_and_si128(flag0_07, n0_815); - __m128i nf0a_and_nf0b = _mm_and_si128( n0_07, n0_815); - - // accum set to -s2 if flag0[0-7] && flag0[8-15] && flag1[0-7] - // accum set to -s2 if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) > 0x10000 - // accum set to -s2 if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) == 0 - __m128i accum_ns2 = _mm_and_si128(_mm_and_si128(flag0_07, flag0_815), flag1_07); - accum_ns2 = _mm_or_si128(accum_ns2, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_g0x10000)); - accum_ns2 = _mm_or_si128(accum_ns2, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_z)); - - // accum set to s2 if !flag0[0-7] && flag0[8-15] && flag1[8-15] - // accum set to s2 if !flag0[0-7] && !flag0[8-15] && (s1 - s2) >= 0 - __m128i accum_s2 = _mm_and_si128(n0_07, _mm_and_si128(flag0_815, flag1_815)); - accum_s2 = _mm_or_si128(accum_s2, _mm_and_si128(_mm_and_si128(n0_07, n0_815), sub_gez)); - - // flag1[8-15] set if !flag0[0-7] && !flag0[8-15] && (s1 - s2) >= 0 - __m128i new_f1b_s = _mm_and_si128(_mm_and_si128(nf0a_and_nf0b, sub_gez), vec_flagmask); - UINT16 flag1_set = 0; - flag1_set |= _mm_extract_epi16(new_f1b_s, 0) << 8; - flag1_set |= _mm_extract_epi16(new_f1b_s, 1) << 9; - flag1_set |= _mm_extract_epi16(new_f1b_s, 2) << 10; - flag1_set |= _mm_extract_epi16(new_f1b_s, 3) << 11; - flag1_set |= _mm_extract_epi16(new_f1b_s, 4) 
<< 12; - flag1_set |= _mm_extract_epi16(new_f1b_s, 5) << 13; - flag1_set |= _mm_extract_epi16(new_f1b_s, 6) << 14; - flag1_set |= _mm_extract_epi16(new_f1b_s, 7) << 15; - - // flag1[8-15]unset if !flag0[0-7] && !flag0[8-15] && (s1 - s2) < 0 - __m128i new_f1b_u = _mm_xor_si128(vec_neg1, _mm_and_si128(nf0a_and_nf0b, sub_lz)); - new_f1b_u = _mm_and_si128(new_f1b_u, vec_flagmask); - UINT16 flag1_unset = 0; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 0) << 8; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 1) << 9; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 2) << 10; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 3) << 11; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 4) << 12; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 5) << 13; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 6) << 14; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 7) << 15; - - // flag1[0-7] set if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) <= 0x10000 - // flag1[0-7] set if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) == 0 - __m128i new_f1a_s = _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_le0x10000); - new_f1a_s = _mm_or_si128(new_f1a_u, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_z)); - new_f1a_s = _mm_and_si128(new_f1a_s, vec_flagmask); - flag1_set |= _mm_extract_epi16(new_f1a_s, 0) << 0; - flag1_set |= _mm_extract_epi16(new_f1a_s, 1) << 1; - flag1_set |= _mm_extract_epi16(new_f1a_s, 2) << 2; - flag1_set |= _mm_extract_epi16(new_f1a_s, 3) << 3; - flag1_set |= _mm_extract_epi16(new_f1a_s, 4) << 4; - flag1_set |= _mm_extract_epi16(new_f1a_s, 5) << 5; - flag1_set |= _mm_extract_epi16(new_f1a_s, 6) << 6; - flag1_set |= _mm_extract_epi16(new_f1a_s, 7) << 7; - - // flag1[0-7] unset if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) > 0x10000 - // flag1[0-7] unset if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) != 0 - __m128i new_f1a_u = _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_g0x10000); - new_f1a_u = _mm_or_si128(new_f1a_u, 
_mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_nz)); - new_f1a_u = _mm_and_si128(new_f1a_u, vec_flagmask); - flag1_unset |= _mm_extract_epi16(new_f1a_u, 0) << 0; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 1) << 1; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 2) << 2; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 3) << 3; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 4) << 4; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 5) << 5; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 6) << 6; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 7) << 7; - - rsp->flag[1] &= ~flag1_unset; - rsp->flag[1] |= flag1_set; - - // accum set to s1 if flag0[0-7] && flag0[8-15] && !flag1[0-7] - // accum set to s1 if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) <= 0x10000 - // accum set to s1 if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) != 0 - // accum set to s1 if !flag0[0-7] && flag0[8-15] && !flag1[8-15] - // accum set to s1 if !flag0[0-7] && !flag0[8-15] && (s1 - s2) < 0 - __m128i accum_s1 = _mm_and_si128(flag0_07, _mm_and_si128(flag0_815, n1_07)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_le0x10000)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_nz)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(n0_07, flag0_815), n1_815)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(nf0a_and_nf0b, sub_lz)); - //__m128i zms2 = _mm_sub_epi16(_mm_setzero_si128(), shuf2); - - /* - __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); - __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf); - __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf); - __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf); - __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1); - - __m128i s2_lz = _mm_cmplt_epi16(shuf, _mm_setzero_si128()); - __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, _mm_setzero_si128()); - __m128i s1s2_xor_gez = 
_mm_xor_si128(s1s2_xor_lz, vec_neg1); - __m128i s1s2_plus_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_plus_s2, _mm_setzero_si128()), vec_neg1); - __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, _mm_setzero_si128()); - __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1); - __m128i s1s2_plus_n1 = _mm_cmpeq_epi16(s1_plus_s2, vec_neg1); - __m128i s1s2_sub_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_sub_s2, _mm_setzero_si128()), vec_neg1); - __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, _mm_setzero_si128()); - __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1); - __m128i s1_nens2 = _mm_xor_si128(_mm_cmpeq_epi16(rsp->xv[VS1REG], s2_neg), vec_neg1); - - __m128i ext_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_n1), vec_flagmask); - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 0) << 0; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 1) << 1; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 2) << 2; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 3) << 3; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 4) << 4; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 5) << 5; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 6) << 6; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 7) << 7; - - __m128i carry_mask = _mm_and_si128(s1s2_xor_lz, vec_flagmask); - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 0) << 0; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 1) << 1; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 2) << 2; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 3) << 3; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 4) << 4; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 5) << 5; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 6) << 6; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 7) << 7; - - __m128i z0_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_nz), s1_nens2); - __m128i z1_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_nz), s1_nens2); - __m128i z_mask = _mm_and_si128(_mm_or_si128(z0_mask, z1_mask), vec_flagmask); - z_mask = 
_mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi32(z_mask, 15)), vec_shiftmask2); - z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi64(z_mask, 30)), vec_shiftmask4); - z_mask = _mm_or_si128(z_mask, _mm_srli_si128(z_mask, 7)); - z_mask = _mm_or_si128(z_mask, _mm_srli_epi16(z_mask, 4)); - rsp->flag[0] |= (_mm_extract_epi16(z_mask, 0) << 8) & 0x00ff00; - - __m128i f0_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)), vec_flagmask); - __m128i f8_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)), vec_flagmask); - f0_mask = _mm_and_si128(f0_mask, vec_flagmask); - f8_mask = _mm_and_si128(f8_mask, vec_flagmask); - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7; - - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15;*/ #else + +INLINE void cfunc_rsp_vcl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; + for (int i = 0; i < 8; i++) { INT16 s1, s2; @@ -3934,28 +5004,31 @@ INLINE void cfunc_rsp_vcl(void *param) CLEAR_CARRY_FLAGS(); CLEAR_CLIP1_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vch(void *param) +#if 
USE_SIMD +// VCH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 | +// ------------------------------------------------------ +// +// Vector clip high + +INLINE void cfunc_rsp_vch_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 | - // ------------------------------------------------------ - // - // Vector clip high - CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); CLEAR_ZERO_FLAGS(); CLEAR_CLIP2_FLAGS(); -#if 0//USE_SIMD +#if 0 // Compare flag // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) @@ -4046,8 +5119,83 @@ INLINE void cfunc_rsp_vch(void *param) rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15; +#endif + INT16 vres[8]; + UINT32 vce = 0; + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); + + if ((s1 ^ s2) < 0) + { + vce = (s1 + s2 == -1); + SET_CARRY_FLAG(i); + if (s2 < 0) + { + SET_CLIP2_FLAG(i); + } + + if ((s1 + s2) <= 0) + { + SET_COMPARE_FLAG(i); + vres[i] = -((UINT16)s2); + } + else + { + vres[i] = s1; + } + + if ((s1 + s2) != 0 && s1 != ~s2) + { + SET_ZERO_FLAG(i); + } + }//sign + else + { + vce = 0; + if (s2 < 0) + { + SET_COMPARE_FLAG(i); + } + if ((s1 - s2) >= 0) + { + SET_CLIP2_FLAG(i); + vres[i] = s2; + } + else + { + vres[i] = s1; + } + + if ((s1 - s2) != 0 && s1 != ~s2) + { + SET_ZERO_FLAG(i); + } + } + if (vce) + { + SET_CLIP1_FLAG(i); + } + VEC_SET_ACCUM_L(vres[i], i); + } + WRITEBACK_RESULT(); +} + #else +INLINE void cfunc_rsp_vch_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + 
CLEAR_CARRY_FLAGS(); + CLEAR_COMPARE_FLAGS(); + CLEAR_CLIP1_FLAGS(); + CLEAR_ZERO_FLAGS(); + CLEAR_CLIP2_FLAGS(); + INT16 vres[8]; UINT32 vce = 0; for (int i = 0; i < 8; i++) @@ -4109,28 +5257,31 @@ INLINE void cfunc_rsp_vch(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vcr(void *param) +#if USE_SIMD +// VCR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 | +// ------------------------------------------------------ +// +// Vector clip reverse + +INLINE void cfunc_rsp_vcr_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 | - // ------------------------------------------------------ - // - // Vector clip reverse - CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); CLEAR_ZERO_FLAGS(); CLEAR_CLIP2_FLAGS(); -#if 0//USE_SIMD +#if 0 // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) @@ -4163,7 +5314,65 @@ INLINE void cfunc_rsp_vcr(void *param) rsp->xvflag[COMPARE] = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)); rsp->xvflag[CLIP2] = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)); +#endif + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); + + if ((INT16)(s1 ^ s2) < 0) + { + if (s2 < 0) + { + SET_CLIP2_FLAG(i); + } + if ((s1 + s2) <= 0) + { + VEC_SET_ACCUM_L(~((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else + { + if (s2 < 0) + { + SET_COMPARE_FLAG(i); + } + if ((s1 - s2) >= 0) + { + VEC_SET_ACCUM_L(s2, i); + SET_CLIP2_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); 
+ } + } + + vres[i] = VEC_ACCUM_L(rsp, i); + } + WRITEBACK_RESULT(); +} + #else + +INLINE void cfunc_rsp_vcr_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + CLEAR_CARRY_FLAGS(); + CLEAR_COMPARE_FLAGS(); + CLEAR_CLIP1_FLAGS(); + CLEAR_ZERO_FLAGS(); + CLEAR_CLIP2_FLAGS(); + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4207,29 +5416,39 @@ INLINE void cfunc_rsp_vcr(void *param) vres[i] = ACCUM_L(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmrg(void *param) +#if USE_SIMD +// VMRG +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 | +// ------------------------------------------------------ +// +// Merges two vectors according to compare flags + +INLINE void cfunc_rsp_vmrg_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 | - // ------------------------------------------------------ - // - // Merges two vectors according to compare flags - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i s2mask = _mm_cmpeq_epi16(rsp->xvflag[COMPARE], _mm_setzero_si128()); __m128i s1mask = _mm_xor_si128(s2mask, vec_neg1); __m128i result = _mm_and_si128(rsp->xv[VS1REG], s1mask); rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, s2mask)); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vmrg_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4248,26 +5467,36 @@ INLINE void cfunc_rsp_vmrg(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vand(void *param) +#if USE_SIMD +// VAND +// +// 31 25 24 20 15 10 5 0 +// 
------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 | +// ------------------------------------------------------ +// +// Bitwise AND of two vector registers + +INLINE void cfunc_rsp_vand_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 | - // ------------------------------------------------------ - // - // Bitwise AND of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_and_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vand_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4278,26 +5507,36 @@ INLINE void cfunc_rsp_vand(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnand(void *param) +#if USE_SIMD +// VNAND +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 | +// ------------------------------------------------------ +// +// Bitwise NOT AND of two vector registers + +INLINE void cfunc_rsp_vnand_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 | - // ------------------------------------------------------ - // - // Bitwise NOT AND of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_and_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void 
cfunc_rsp_vnand_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4308,26 +5547,36 @@ INLINE void cfunc_rsp_vnand(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vor(void *param) +#if USE_SIMD +// VOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 | +// ------------------------------------------------------ +// +// Bitwise OR of two vector registers + +INLINE void cfunc_rsp_vor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 | - // ------------------------------------------------------ - // - // Bitwise OR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_or_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4338,26 +5587,36 @@ INLINE void cfunc_rsp_vor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnor(void *param) +#if USE_SIMD +// VNOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101011 | +// ------------------------------------------------------ +// +// Bitwise NOT OR of two vector registers + +INLINE void cfunc_rsp_vnor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS 
| TTTTT | DDDDD | 101011 | - // ------------------------------------------------------ - // - // Bitwise NOT OR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_or_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vnor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4368,26 +5627,36 @@ INLINE void cfunc_rsp_vnor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vxor(void *param) +#if USE_SIMD +// VXOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 | +// ------------------------------------------------------ +// +// Bitwise XOR of two vector registers + +INLINE void cfunc_rsp_vxor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 | - // ------------------------------------------------------ - // - // Bitwise XOR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vxor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4398,26 +5667,36 @@ INLINE void cfunc_rsp_vxor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnxor(void *param) +#if USE_SIMD +// VNXOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS 
| TTTTT | DDDDD | 101101 | +// ------------------------------------------------------ +// +// Bitwise NOT XOR of two vector registers + +INLINE void cfunc_rsp_vnxor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101101 | - // ------------------------------------------------------ - // - // Bitwise NOT XOR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_xor_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vnxor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4428,30 +5707,29 @@ INLINE void cfunc_rsp_vnxor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vrcp(void *param) +#if USE_SIMD +// VRCP +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 | +// ------------------------------------------------------ +// +// Calculates reciprocal + +INLINE void cfunc_rsp_vrcp_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 | - // ------------------------------------------------------ - // - // Calculates reciprocal - INT32 shifter = 0; -#if USE_SIMD UINT16 urec; INT32 rec; SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); rec = (INT16)urec; -#else - INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); -#endif INT32 datainput = (rec < 0) ? 
(-rec) : rec; if (datainput) { @@ -4489,39 +5767,84 @@ INLINE void cfunc_rsp_vrcp(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vrcp_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT32 shifter = 0; + INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); + INT32 datainput = (rec < 0) ? (-rec) : rec; + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + shifter = 0x10; + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> ((~shifter) & 0x1f); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } +#endif -INLINE void cfunc_rsp_vrcpl(void *param) +#if USE_SIMD +// VRCPL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 | +// ------------------------------------------------------ +// +// Calculates reciprocal low part + +INLINE void cfunc_rsp_vrcpl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 | - // ------------------------------------------------------ - // - // Calculates reciprocal low part - INT32 shifter = 0; -#if USE_SIMD UINT16 urec; SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); 
INT32 rec = (urec | rsp->reciprocal_high); -#else - INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high); -#endif INT32 datainput = rec; @@ -4588,37 +5911,114 @@ INLINE void cfunc_rsp_vrcpl(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); -#else - W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; -#endif for (int i = 0; i < 8; i++) { -#if USE_SIMD INT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); -#else - INT16 val = VREG_S(VS2REG, VEC_EL_2(EL, i)); -#endif - SET_ACCUM_L(val, i); + VEC_SET_ACCUM_L(val, i); } } -INLINE void cfunc_rsp_vrcph(void *param) +#else + +INLINE void cfunc_rsp_vrcpl_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110010 | - // ------------------------------------------------------ - // - // Calculates reciprocal high part + INT32 shifter = 0; + INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high); + INT32 datainput = rec; + + if (rec < 0) + { + if (rsp->dp_allowed) + { + if (rec < -32768) + { + datainput = ~datainput; + } + else + { + datainput = -datainput; + } + } + else + { + datainput = -datainput; + } + } + + + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + if (rsp->dp_allowed) + { + shifter = 0; + } + else + { + shifter = 0x10; + } + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> ((~shifter) & 0x1f); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + + W_VREG_S(VDREG, VS1REG & 7) 
= (UINT16)rec; + + for (int i = 0; i < 8; i++) + { + SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); + } +} +#endif #if USE_SIMD +// VRCPH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110010 | +// ------------------------------------------------------ +// +// Calculates reciprocal high part + +INLINE void cfunc_rsp_vrcph_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + UINT16 rcph; SIMD_EXTRACT16(rsp->xv[VS2REG], rcph, EL); rsp->reciprocal_high = rcph << 16; @@ -4627,7 +6027,15 @@ INLINE void cfunc_rsp_vrcph(void *param) rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG); +} + #else + +INLINE void cfunc_rsp_vrcph_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; rsp->dp_allowed = 1; @@ -4637,55 +6045,65 @@ INLINE void cfunc_rsp_vrcph(void *param) } W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); -#endif } +#endif -INLINE void cfunc_rsp_vmov(void *param) +#if USE_SIMD +// VMOV +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110011 | +// ------------------------------------------------------ +// +// Moves element from vector to destination vector + +INLINE void cfunc_rsp_vmov_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110011 | - // ------------------------------------------------------ - // - // Moves element from vector to destination vector - -#if USE_SIMD INT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); SIMD_INSERT16(rsp->xv[VDREG], val, VS1REG); 
rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vmov_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + W_VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7); for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } -INLINE void cfunc_rsp_vrsql(void *param) +#endif + +#if USE_SIMD +// VRSQL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 | +// ------------------------------------------------------ +// +// Calculates reciprocal square-root low part + +INLINE void cfunc_rsp_vrsql_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 | - // ------------------------------------------------------ - // - // Calculates reciprocal square-root low part - INT32 shifter = 0; -#if USE_SIMD UINT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); INT32 rec = rsp->reciprocal_high | val; -#else - INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7); -#endif INT32 datainput = rec; if (rec < 0) @@ -4752,31 +6170,108 @@ INLINE void cfunc_rsp_vrsql(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vrsql_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT32 shifter = 0; + INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7); + INT32 datainput = rec; + + if (rec < 0) + { + if (rsp->dp_allowed) + { + if (rec < -32768) + { + datainput = ~datainput; + } + else + { + datainput = -datainput; + } + } + else + { + datainput = 
-datainput; + } + } + + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + if (rsp->dp_allowed) + { + shifter = 0; + } + else + { + shifter = 0x10; + } + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + address = ((address | 0x200) & 0x3fe) | (shifter & 1); + + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> (((~shifter) & 0x1f) >> 1); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + W_VREG_S(VDREG, VS1REG & 7) = (UINT16)(rec & 0xffff); for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } +#endif -INLINE void cfunc_rsp_vrsqh(void *param) +#if USE_SIMD +// VRSQH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 | +// ------------------------------------------------------ +// +// Calculates reciprocal square-root high part + +INLINE void cfunc_rsp_vrsqh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 | - // ------------------------------------------------------ - // - // Calculates reciprocal square-root high part - -#if USE_SIMD UINT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); rsp->reciprocal_high = val << 16; @@ -4785,7 +6280,15 @@ INLINE void cfunc_rsp_vrsqh(void *param) rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG); // store high part +} + #else + +INLINE void cfunc_rsp_vrsqh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + 
int op = rsp->impstate->arg0; + rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; rsp->dp_allowed = 1; @@ -4795,8 +6298,9 @@ INLINE void cfunc_rsp_vrsqh(void *param) } W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); // store high part -#endif } +#endif + static void cfunc_sp_set_status_cb(void *param) { @@ -5372,6 +6876,8 @@ static void generate_delay_slot_and_branch(rsp_state *rsp, drcuml_block *block, opcode -------------------------------------------------*/ +#if USE_SIMD + static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { UINT32 op = desc->opptr.l[0]; @@ -5385,192 +6891,192 @@ static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_ { case 0x00: /* VMULF */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmulf, rsp); + UML_CALLC(block, cfunc_rsp_vmulf_simd, rsp); return TRUE; case 0x01: /* VMULU */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmulu, rsp); + UML_CALLC(block, cfunc_rsp_vmulu_simd, rsp); return TRUE; case 0x04: /* VMUDL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudl, rsp); + UML_CALLC(block, cfunc_rsp_vmudl_simd, rsp); return TRUE; case 0x05: /* VMUDM */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudm, rsp); + UML_CALLC(block, cfunc_rsp_vmudm_simd, rsp); return TRUE; case 0x06: /* VMUDN */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudn, rsp); + UML_CALLC(block, cfunc_rsp_vmudn_simd, rsp); return TRUE; case 0x07: /* VMUDH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudh, rsp); + 
UML_CALLC(block, cfunc_rsp_vmudh_simd, rsp); return TRUE; case 0x08: /* VMACF */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmacf, rsp); + UML_CALLC(block, cfunc_rsp_vmacf_simd, rsp); return TRUE; case 0x09: /* VMACU */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmacu, rsp); + UML_CALLC(block, cfunc_rsp_vmacu_simd, rsp); return TRUE; case 0x0c: /* VMADL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadl, rsp); + UML_CALLC(block, cfunc_rsp_vmadl_simd, rsp); return TRUE; case 0x0d: /* VMADM */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadm, rsp); + UML_CALLC(block, cfunc_rsp_vmadm_simd, rsp); return TRUE; case 0x0e: /* VMADN */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadn, rsp); + UML_CALLC(block, cfunc_rsp_vmadn_simd, rsp); return TRUE; case 0x0f: /* VMADH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadh, rsp); + UML_CALLC(block, cfunc_rsp_vmadh_simd, rsp); return TRUE; case 0x10: /* VADD */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vadd, rsp); + UML_CALLC(block, cfunc_rsp_vadd_simd, rsp); return TRUE; case 0x11: /* VSUB */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsub, rsp); + UML_CALLC(block, cfunc_rsp_vsub_simd, rsp); return TRUE; case 0x13: /* VABS */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vabs, rsp); + UML_CALLC(block, cfunc_rsp_vabs_simd, rsp); return TRUE; case 0x14: /* VADDC */ 
UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vaddc, rsp); + UML_CALLC(block, cfunc_rsp_vaddc_simd, rsp); return TRUE; case 0x15: /* VSUBC */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsubc, rsp); + UML_CALLC(block, cfunc_rsp_vsubc_simd, rsp); return TRUE; case 0x1d: /* VSAW */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsaw, rsp); + UML_CALLC(block, cfunc_rsp_vsaw_simd, rsp); return TRUE; case 0x20: /* VLT */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vlt, rsp); + UML_CALLC(block, cfunc_rsp_vlt_simd, rsp); return TRUE; case 0x21: /* VEQ */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_veq, rsp); + UML_CALLC(block, cfunc_rsp_veq_simd, rsp); return TRUE; case 0x22: /* VNE */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vne, rsp); + UML_CALLC(block, cfunc_rsp_vne_simd, rsp); return TRUE; case 0x23: /* VGE */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vge, rsp); + UML_CALLC(block, cfunc_rsp_vge_simd, rsp); return TRUE; case 0x24: /* VCL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vcl, rsp); + UML_CALLC(block, cfunc_rsp_vcl_simd, rsp); return TRUE; case 0x25: /* VCH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vch, rsp); + UML_CALLC(block, cfunc_rsp_vch_simd, rsp); return TRUE; case 0x26: /* VCR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, 
cfunc_rsp_vcr, rsp); + UML_CALLC(block, cfunc_rsp_vcr_simd, rsp); return TRUE; case 0x27: /* VMRG */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmrg, rsp); + UML_CALLC(block, cfunc_rsp_vmrg_simd, rsp); return TRUE; case 0x28: /* VAND */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vand, rsp); + UML_CALLC(block, cfunc_rsp_vand_simd, rsp); return TRUE; case 0x29: /* VNAND */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnand, rsp); + UML_CALLC(block, cfunc_rsp_vnand_simd, rsp); return TRUE; case 0x2a: /* VOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vor, rsp); + UML_CALLC(block, cfunc_rsp_vor_simd, rsp); return TRUE; case 0x2b: /* VNOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnor, rsp); + UML_CALLC(block, cfunc_rsp_vnor_simd, rsp); return TRUE; case 0x2c: /* VXOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vxor, rsp); + UML_CALLC(block, cfunc_rsp_vxor_simd, rsp); return TRUE; case 0x2d: /* VNXOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnxor, rsp); + UML_CALLC(block, cfunc_rsp_vnxor_simd, rsp); return TRUE; case 0x30: /* VRCP */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcp, rsp); + UML_CALLC(block, cfunc_rsp_vrcp_simd, rsp); return TRUE; case 0x31: /* VRCPL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcpl, rsp); + UML_CALLC(block, cfunc_rsp_vrcpl_simd, rsp); return TRUE; case 0x32: /* 
VRCPH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcph, rsp); + UML_CALLC(block, cfunc_rsp_vrcph_simd, rsp); return TRUE; case 0x33: /* VMOV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmov, rsp); + UML_CALLC(block, cfunc_rsp_vmov_simd, rsp); return TRUE; case 0x35: /* VRSQL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrsql, rsp); + UML_CALLC(block, cfunc_rsp_vrsql_simd, rsp); return TRUE; case 0x36: /* VRSQH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrsqh, rsp); + UML_CALLC(block, cfunc_rsp_vrsqh_simd, rsp); return TRUE; default: @@ -5580,6 +7086,217 @@ static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_ } } +#else + +static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ + UINT32 op = desc->opptr.l[0]; + // Opcode legend: + // E = VS2 element type + // S = VS1, Source vector 1 + // T = VS2, Source vector 2 + // D = Destination vector + + switch (op & 0x3f) + { + case 0x00: /* VMULF */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmulf_scalar, rsp); + return TRUE; + + case 0x01: /* VMULU */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmulu_scalar, rsp); + return TRUE; + + case 0x04: /* VMUDL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudl_scalar, rsp); + return TRUE; + + case 0x05: /* VMUDM */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudm_scalar, rsp); + 
return TRUE; + + case 0x06: /* VMUDN */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudn_scalar, rsp); + return TRUE; + + case 0x07: /* VMUDH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudh_scalar, rsp); + return TRUE; + + case 0x08: /* VMACF */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmacf_scalar, rsp); + return TRUE; + + case 0x09: /* VMACU */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmacu_scalar, rsp); + return TRUE; + + case 0x0c: /* VMADL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadl_scalar, rsp); + return TRUE; + + case 0x0d: /* VMADM */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadm_scalar, rsp); + return TRUE; + + case 0x0e: /* VMADN */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadn_scalar, rsp); + return TRUE; + + case 0x0f: /* VMADH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadh_scalar, rsp); + return TRUE; + + case 0x10: /* VADD */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vadd_scalar, rsp); + return TRUE; + + case 0x11: /* VSUB */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsub_scalar, rsp); + return TRUE; + + case 0x13: /* VABS */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vabs_scalar, rsp); + 
return TRUE; + + case 0x14: /* VADDC */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vaddc_scalar, rsp); + return TRUE; + + case 0x15: /* VSUBC */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsubc_scalar, rsp); + return TRUE; + + case 0x1d: /* VSAW */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsaw_scalar, rsp); + return TRUE; + + case 0x20: /* VLT */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vlt_scalar, rsp); + return TRUE; + + case 0x21: /* VEQ */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_veq_scalar, rsp); + return TRUE; + + case 0x22: /* VNE */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vne_scalar, rsp); + return TRUE; + + case 0x23: /* VGE */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vge_scalar, rsp); + return TRUE; + + case 0x24: /* VCL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vcl_scalar, rsp); + return TRUE; + + case 0x25: /* VCH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vch_scalar, rsp); + return TRUE; + + case 0x26: /* VCR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vcr_scalar, rsp); + return TRUE; + + case 0x27: /* VMRG */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmrg_scalar, rsp); + return TRUE; + + case 0x28: 
/* VAND */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vand_scalar, rsp); + return TRUE; + + case 0x29: /* VNAND */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnand_scalar, rsp); + return TRUE; + + case 0x2a: /* VOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vor_scalar, rsp); + return TRUE; + + case 0x2b: /* VNOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnor_scalar, rsp); + return TRUE; + + case 0x2c: /* VXOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vxor_scalar, rsp); + return TRUE; + + case 0x2d: /* VNXOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnxor_scalar, rsp); + return TRUE; + + case 0x30: /* VRCP */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcp_scalar, rsp); + return TRUE; + + case 0x31: /* VRCPL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcpl_scalar, rsp); + return TRUE; + + case 0x32: /* VRCPH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcph_scalar, rsp); + return TRUE; + + case 0x33: /* VMOV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmov_scalar, rsp); + return TRUE; + + case 0x35: /* VRSQL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrsql_scalar, rsp); + return TRUE; + + case 0x36: /* VRSQH */ 
+ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrsqh_scalar, rsp); + return TRUE; + + default: + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_unimplemented_opcode, rsp); + return FALSE; + } +} +#endif + static int generate_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { int in_delay_slot = ((desc->flags & OPFLAG_IN_DELAY_SLOT) != 0);