diff --git a/src/emu/cpu/rsp/rsp.h b/src/emu/cpu/rsp/rsp.h index 8029742dd43..accbdf4ae8a 100644 --- a/src/emu/cpu/rsp/rsp.h +++ b/src/emu/cpu/rsp/rsp.h @@ -16,7 +16,8 @@ #ifndef __RSP_H__ #define __RSP_H__ -#define USE_SIMD (0) +#define USE_SIMD (1) +#define SIMUL_SIMD (1) #if USE_SIMD #include diff --git a/src/emu/cpu/rsp/rspdrc.c b/src/emu/cpu/rsp/rspdrc.c index a78b5aa814b..58561139457 100644 --- a/src/emu/cpu/rsp/rspdrc.c +++ b/src/emu/cpu/rsp/rspdrc.c @@ -153,31 +153,59 @@ static void cfunc_ctc2(void *param); //static void cfunc_lwc2(void *param); static void cfunc_sp_set_status_cb(void *param); -static void cfunc_rsp_lbv(void *param); -static void cfunc_rsp_lsv(void *param); -static void cfunc_rsp_llv(void *param); -static void cfunc_rsp_ldv(void *param); -static void cfunc_rsp_lqv(void *param); -static void cfunc_rsp_lrv(void *param); -static void cfunc_rsp_lpv(void *param); -static void cfunc_rsp_luv(void *param); -static void cfunc_rsp_lhv(void *param); -static void cfunc_rsp_lfv(void *param); -static void cfunc_rsp_lwv(void *param); -static void cfunc_rsp_ltv(void *param); +#if USE_SIMD +static void cfunc_rsp_lbv_simd(void *param); +static void cfunc_rsp_lsv_simd(void *param); +static void cfunc_rsp_llv_simd(void *param); +static void cfunc_rsp_ldv_simd(void *param); +static void cfunc_rsp_lqv_simd(void *param); +static void cfunc_rsp_lrv_simd(void *param); +static void cfunc_rsp_lpv_simd(void *param); +static void cfunc_rsp_luv_simd(void *param); +static void cfunc_rsp_lhv_simd(void *param); +static void cfunc_rsp_lfv_simd(void *param); +static void cfunc_rsp_lwv_simd(void *param); +static void cfunc_rsp_ltv_simd(void *param); -static void cfunc_rsp_sbv(void *param); -static void cfunc_rsp_ssv(void *param); -static void cfunc_rsp_slv(void *param); -static void cfunc_rsp_sdv(void *param); -static void cfunc_rsp_sqv(void *param); -static void cfunc_rsp_srv(void *param); -static void cfunc_rsp_spv(void *param); -static void cfunc_rsp_suv(void *param); 
-static void cfunc_rsp_shv(void *param); -static void cfunc_rsp_sfv(void *param); -static void cfunc_rsp_swv(void *param); -static void cfunc_rsp_stv(void *param); +static void cfunc_rsp_sbv_simd(void *param); +static void cfunc_rsp_ssv_simd(void *param); +static void cfunc_rsp_slv_simd(void *param); +static void cfunc_rsp_sdv_simd(void *param); +static void cfunc_rsp_sqv_simd(void *param); +static void cfunc_rsp_srv_simd(void *param); +static void cfunc_rsp_spv_simd(void *param); +static void cfunc_rsp_suv_simd(void *param); +static void cfunc_rsp_shv_simd(void *param); +static void cfunc_rsp_sfv_simd(void *param); +static void cfunc_rsp_swv_simd(void *param); +static void cfunc_rsp_stv_simd(void *param); +#elif (!USE_SIMD || SIMUL_SIMD) +static void cfunc_rsp_lbv_scalar(void *param); +static void cfunc_rsp_lsv_scalar(void *param); +static void cfunc_rsp_llv_scalar(void *param); +static void cfunc_rsp_ldv_scalar(void *param); +static void cfunc_rsp_lqv_scalar(void *param); +static void cfunc_rsp_lrv_scalar(void *param); +static void cfunc_rsp_lpv_scalar(void *param); +static void cfunc_rsp_luv_scalar(void *param); +static void cfunc_rsp_lhv_scalar(void *param); +static void cfunc_rsp_lfv_scalar(void *param); +static void cfunc_rsp_lwv_scalar(void *param); +static void cfunc_rsp_ltv_scalar(void *param); + +static void cfunc_rsp_sbv_scalar(void *param); +static void cfunc_rsp_ssv_scalar(void *param); +static void cfunc_rsp_slv_scalar(void *param); +static void cfunc_rsp_sdv_scalar(void *param); +static void cfunc_rsp_sqv_scalar(void *param); +static void cfunc_rsp_srv_scalar(void *param); +static void cfunc_rsp_spv_scalar(void *param); +static void cfunc_rsp_suv_scalar(void *param); +static void cfunc_rsp_shv_scalar(void *param); +static void cfunc_rsp_sfv_scalar(void *param); +static void cfunc_rsp_swv_scalar(void *param); +static void cfunc_rsp_stv_scalar(void *param); +#endif static void static_generate_entry_point(rsp_state *rsp); static void 
static_generate_nocode_handler(rsp_state *rsp); @@ -245,34 +273,35 @@ static void log_add_disasm_comment(rsp_state *rsp, drcuml_block *block, UINT32 p #define VEC_EL_2(x,z) (vector_elements_2[(x)][(z)]) #define ACCUM(x) rsp->accum[x].q + #if USE_SIMD -INLINE UINT16 ACCUM_H(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_H(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_h, out, x); return out; } -INLINE UINT16 ACCUM_M(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_M(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_m, out, x); return out; } -INLINE UINT16 ACCUM_L(const rsp_state *rsp, int x) +INLINE UINT16 VEC_ACCUM_L(const rsp_state *rsp, int x) { UINT16 out; SIMD_EXTRACT16(rsp->accum_l, out, x); return out; } -#define SET_ACCUM_H(v, x) SIMD_INSERT16(rsp->accum_h, v, x); -#define SET_ACCUM_M(v, x) SIMD_INSERT16(rsp->accum_m, v, x); -#define SET_ACCUM_L(v, x) SIMD_INSERT16(rsp->accum_l, v, x); +#define VEC_SET_ACCUM_H(v, x) SIMD_INSERT16(rsp->accum_h, v, x); +#define VEC_SET_ACCUM_M(v, x) SIMD_INSERT16(rsp->accum_m, v, x); +#define VEC_SET_ACCUM_L(v, x) SIMD_INSERT16(rsp->accum_l, v, x); -#define SCALAR_GET_VS1(out, i) SIMD_EXTRACT16(rsp->xv[VS1REG], out, i) -#define SCALAR_GET_VS2(out, i) SIMD_EXTRACT16(rsp->xv[VS2REG], out, VEC_EL_2(EL, i)) +#define SCALAR_GET_VS1(out, i) SIMD_EXTRACT16(rsp->xv[VS1REG], out, i); +#define SCALAR_GET_VS2(out, i) SIMD_EXTRACT16(rsp->xv[VS2REG], out, VEC_EL_2(EL, i)); #else @@ -909,7 +938,17 @@ static CPU_RESET( rsp ) rsp->nextpc = ~0; } -static void cfunc_rsp_lbv(void *param) +#if USE_SIMD +// LBV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00000 | IIII | Offset | +// -------------------------------------------------- +// +// Load 1 byte to vector byte index + +static void cfunc_rsp_lbv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -923,27 +962,49 @@ static void 
cfunc_rsp_lbv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00000 | IIII | Offset | - // -------------------------------------------------- - // - // Load 1 byte to vector byte index ea = (base) ? rsp->r[base] + offset : offset; -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (index >> 1)); element &= 0xff00 >> ((1-(index & 1)) * 8); element |= READ8(rsp, ea) << ((1-(index & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (index >> 1)); -#else - VREG_B(dest, index) = READ8(rsp, ea); -#endif } -static void cfunc_rsp_lsv(void *param) +#else + +static void cfunc_rsp_lbv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? rsp->r[base] + offset : offset; + VREG_B(dest, index) = READ8(rsp, ea); +} +#endif + +#if USE_SIMD +// LSV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00001 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 2 bytes starting from vector byte index + +static void cfunc_rsp_lsv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -955,31 +1016,56 @@ static void cfunc_rsp_lsv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00001 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 2 bytes starting from vector byte index UINT32 ea = (base) ? 
rsp->r[base] + (offset * 2) : (offset * 2); int end = index + 2; for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_llv(void *param) +#else + +static void cfunc_rsp_lsv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xe; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); + int end = index + 2; + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LLV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00010 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 4 bytes starting from vector byte index + +static void cfunc_rsp_llv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -992,12 +1078,6 @@ static void cfunc_rsp_llv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00010 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 4 bytes starting from vector byte index ea = (base) ? 
rsp->r[base] + (offset * 4) : (offset * 4); @@ -1005,20 +1085,54 @@ static void cfunc_rsp_llv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_ldv(void *param) +#else + +static void cfunc_rsp_llv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xc; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); + + int end = index + 4; + + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LDV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00011 | IIII | Offset | +// -------------------------------------------------- +// +// Loads 8 bytes starting from vector byte index + +static void cfunc_rsp_ldv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1031,12 +1145,6 @@ static void cfunc_rsp_ldv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00011 | IIII | Offset | - // -------------------------------------------------- - // - // Loads 8 bytes starting from vector byte index ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); @@ -1044,37 +1152,64 @@ static void cfunc_rsp_ldv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lqv(void *param) +#else + +static void cfunc_rsp_ldv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + UINT32 ea = 0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0x8; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); + + int end = index + 8; + + for (int i = index; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LQV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00100 | IIII | Offset | +// -------------------------------------------------- +// +// Loads up to 16 bytes starting from vector byte index + +static void cfunc_rsp_lqv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; int dest = (op >> 16) & 0x1f; int base = (op >> 21) & 0x1f; - //int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf; int offset = (op & 0x7f); if (offset & 0x40) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00100 | IIII | Offset | - // -------------------------------------------------- - // - // Loads up to 16 bytes starting from vector byte index UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); @@ -1083,20 +1218,53 @@ static void cfunc_rsp_lqv(void *param) for (int i = 0; i < end; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1 - (i & 1)) * 8); element |= READ8(rsp, ea) << ((1 - (i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lrv(void *param) +#else + +static void cfunc_rsp_lqv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + + int end = 16 - (ea & 0xf); + if (end > 16) end = 16; + + for (int i = 0; i < end; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LRV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00101 | IIII | Offset | +// -------------------------------------------------- +// +// Loads up to 16 bytes starting from right side until 16-byte boundary + +static void cfunc_rsp_lrv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1108,12 +1276,6 @@ static void cfunc_rsp_lrv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00101 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from right side until 16-byte boundary UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); @@ -1122,20 +1284,54 @@ static void cfunc_rsp_lrv(void *param) for (int i = index; i < 16; i++) { -#if USE_SIMD UINT16 element; SIMD_EXTRACT16(rsp->xv[dest], element, (i >> 1)); element &= 0xff00 >> ((1-(i & 1)) * 8); element |= READ8(rsp, ea) << ((1-(i & 1)) * 8); SIMD_INSERT16(rsp->xv[dest], element, (i >> 1)); -#else - VREG_B(dest, i) = READ8(rsp, ea); -#endif ea++; } } -static void cfunc_rsp_lpv(void *param) +#else + +static void cfunc_rsp_lrv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + + index = 16 - ((ea & 0xf) - index); + ea &= ~0xf; + + for (int i = index; i < 16; i++) + { + VREG_B(dest, i) = READ8(rsp, ea); + ea++; + } +} +#endif + +#if USE_SIMD +// LPV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00110 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the upper 8 bits of each element + +static void cfunc_rsp_lpv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1147,26 +1343,18 @@ static void cfunc_rsp_lpv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00110 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the upper 8 bits of each element UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); for (int i = 0; i < 8; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; -#endif } } -static void cfunc_rsp_luv(void *param) +#else + +static void cfunc_rsp_lpv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1178,26 +1366,82 @@ static void cfunc_rsp_luv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 00111 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); for (int i = 0; i < 8; i++) { -#if USE_SIMD - SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; -#endif + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 8; } } +#endif -static void cfunc_rsp_lhv(void *param) +#if USE_SIMD +// LUV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 00111 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of each element + +static void cfunc_rsp_luv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + + for (int i = 0; i < 8; i++) + { + SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7, i); + } +} + +#else + +static void cfunc_rsp_luv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); + + for (int i = 0; i < 8; i++) + { + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + i) & 0xf)) << 7; + } +} +#endif + +#if USE_SIMD +// LHV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01000 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of each element, with 2-byte stride + +static void cfunc_rsp_lhv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1209,26 +1453,49 @@ static void cfunc_rsp_lhv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01000 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of each element, with 2-byte stride UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); for (int i = 0; i < 8; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; -#endif } } -static void cfunc_rsp_lfv(void *param) +#else + +static void cfunc_rsp_lhv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + + for (int i = 0; i < 8; i++) + { + W_VREG_S(dest, i) = READ8(rsp, ea + (((16-index) + (i<<1)) & 0xf)) << 7; + } +} +#endif + +#if USE_SIMD +// LFV +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01001 | IIII | Offset | +// -------------------------------------------------- +// +// Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride + +static void cfunc_rsp_lfv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1240,12 +1507,6 @@ static void cfunc_rsp_lfv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01001 | IIII | Offset | - // -------------------------------------------------- - // - // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); @@ -1255,16 +1516,14 @@ static void cfunc_rsp_lfv(void *param) for (int i = index >> 1; i < end; i++) { -#if USE_SIMD SIMD_INSERT16(rsp->xv[dest], READ8(rsp, ea) << 7, i); -#else - W_VREG_S(dest, i) = READ8(rsp, ea) << 7; -#endif ea += 4; } } -static void cfunc_rsp_lwv(void *param) +#else + +static void cfunc_rsp_lfv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1276,37 +1535,33 @@ static void cfunc_rsp_lwv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01010 | IIII | Offset | - // -------------------------------------------------- - // - // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 - // after byte index 15 UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); - int end = (16 - index) + 16; -#if USE_SIMD - UINT8 val[16]; -#endif - for (int i = (16 - index); i < end; i++) + // not sure what happens if 16-byte boundary is crossed... 
+ + int end = (index >> 1) + 4; + + for (int i = index >> 1; i < end; i++) { -#if USE_SIMD - val[i & 0xf] = READ8(rsp, ea); -#else - VREG_B(dest, i & 0xf) = READ8(rsp, ea); -#endif + W_VREG_S(dest, i) = READ8(rsp, ea) << 7; ea += 4; } +} +#endif #if USE_SIMD - rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8], - val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]); -#endif -} +// LWV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01010 | IIII | Offset | +// -------------------------------------------------- +// +// Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 +// after byte index 15 -static void cfunc_rsp_ltv(void *param) +static void cfunc_rsp_lwv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1314,13 +1569,69 @@ static void cfunc_rsp_ltv(void *param) int base = (op >> 21) & 0x1f; int index = (op >> 7) & 0xf; int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 110010 | BBBBB | TTTTT | 01011 | IIII | Offset | - // -------------------------------------------------- - // - // Loads one element to maximum of 8 vectors, while incrementing element index + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + int end = (16 - index) + 16; + + UINT8 val[16]; + for (int i = (16 - index); i < end; i++) + { + val[i & 0xf] = READ8(rsp, ea); + ea += 4; + } + + rsp->xv[dest] = _mm_set_epi8(val[15], val[14], val[13], val[12], val[11], val[10], val[ 9], val[ 8], + val[ 7], val[ 6], val[ 5], val[ 4], val[ 3], val[ 2], val[ 1], val[ 0]); +} + +#else + +static void cfunc_rsp_lwv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int end = (16 - index) + 16; + + for (int i = (16 - index); i < end; i++) + { + VREG_B(dest, i & 0xf) = READ8(rsp, ea); + ea += 4; + } +} +#endif + +#if USE_SIMD +// LTV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 110010 | BBBBB | TTTTT | 01011 | IIII | Offset | +// -------------------------------------------------- +// +// Loads one element to maximum of 8 vectors, while incrementing element index + +static void cfunc_rsp_ltv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); // FIXME: has a small problem with odd indices @@ -1338,19 +1649,122 @@ static void cfunc_rsp_ltv(void *param) ea = ((ea + 8) & ~0xf) + (index & 1); for (int i = vs; i < ve; i++) { - element = ((8 - (index >> 1) + (i - vs)) << 1); -#if USE_SIMD + element = (8 - (index >> 1) + (i - vs)) << 1; UINT16 value = (READ8(rsp, ea) << 8) | READ8(rsp, ea + 1); SIMD_INSERT16(rsp->xv[i], value, (element >> 1)); -#else - VREG_B(i, (element & 0xf)) = READ8(rsp, ea); - VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1); -#endif - ea += 2; } } +#else + 
+static void cfunc_rsp_ltv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + + // FIXME: has a small problem with odd indices + + int vs = dest; + int ve = dest + 8; + if (ve > 32) + { + ve = 32; + } + + int element = 7 - (index >> 1); + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + + ea = ((ea + 8) & ~0xf) + (index & 1); + for (int i = vs; i < ve; i++) + { + element = (8 - (index >> 1) + (i - vs)) << 1; + VREG_B(i, (element & 0xf)) = READ8(rsp, ea); + VREG_B(i, ((element + 1) & 0xf)) = READ8(rsp, ea + 1); + ea += 2; + } +} +#endif + +#if USE_SIMD +static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ + //int loopdest; + UINT32 op = desc->opptr.l[0]; + //int dest = (op >> 16) & 0x1f; + //int base = (op >> 21) & 0x1f; + //int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + //int skip; + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + switch ((op >> 11) & 0x1f) + { + case 0x00: /* LBV */ + //UML_ADD(block, I0, R32(RSREG), offset); + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lbv_simd, rsp); + return TRUE; + case 0x01: /* LSV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lsv_simd, rsp); + return TRUE; + case 0x02: /* LLV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_llv_simd, rsp); + return TRUE; + case 0x03: /* LDV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ldv_simd, rsp); + return TRUE; + case 0x04: /* LQV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov 
[arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lqv_simd, rsp); + return TRUE; + case 0x05: /* LRV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lrv_simd, rsp); + return TRUE; + case 0x06: /* LPV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lpv_simd, rsp); + return TRUE; + case 0x07: /* LUV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_luv_simd, rsp); + return TRUE; + case 0x08: /* LHV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lhv_simd, rsp); + return TRUE; + case 0x09: /* LFV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lfv_simd, rsp); + return TRUE; + case 0x0a: /* LWV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_lwv_simd, rsp); + return TRUE; + case 0x0b: /* LTV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ltv_simd, rsp); + return TRUE; + + default: + return FALSE; + } +} + +#else + static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { //int loopdest; @@ -1370,59 +1784,70 @@ static int generate_lwc2(rsp_state *rsp, drcuml_block *block, compiler_state *co case 0x00: /* LBV */ //UML_ADD(block, I0, R32(RSREG), offset); UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lbv, rsp); + UML_CALLC(block, cfunc_rsp_lbv_scalar, rsp); return TRUE; case 0x01: /* LSV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lsv, rsp); + 
UML_CALLC(block, cfunc_rsp_lsv_scalar, rsp); return TRUE; case 0x02: /* LLV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_llv, rsp); + UML_CALLC(block, cfunc_rsp_llv_scalar, rsp); return TRUE; case 0x03: /* LDV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ldv, rsp); + UML_CALLC(block, cfunc_rsp_ldv_scalar, rsp); return TRUE; case 0x04: /* LQV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lqv, rsp); + UML_CALLC(block, cfunc_rsp_lqv_scalar, rsp); return TRUE; case 0x05: /* LRV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lrv, rsp); + UML_CALLC(block, cfunc_rsp_lrv_scalar, rsp); return TRUE; case 0x06: /* LPV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lpv, rsp); + UML_CALLC(block, cfunc_rsp_lpv_scalar, rsp); return TRUE; case 0x07: /* LUV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_luv, rsp); + UML_CALLC(block, cfunc_rsp_luv_scalar, rsp); return TRUE; case 0x08: /* LHV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lhv, rsp); + UML_CALLC(block, cfunc_rsp_lhv_scalar, rsp); return TRUE; case 0x09: /* LFV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lfv, rsp); + UML_CALLC(block, cfunc_rsp_lfv_scalar, rsp); return TRUE; case 0x0a: /* LWV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_lwv, rsp); + UML_CALLC(block, cfunc_rsp_lwv_scalar, rsp); return TRUE; case 0x0b: /* LTV */ UML_MOV(block, 
mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ltv, rsp); + UML_CALLC(block, cfunc_rsp_ltv_scalar, rsp); return TRUE; default: return FALSE; } } +#endif -static void cfunc_rsp_sbv(void *param) +#if USE_SIMD +// SBV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00000 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 1 byte from vector byte index + +static void cfunc_rsp_sbv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1435,25 +1860,16 @@ static void cfunc_rsp_sbv(void *param) offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00000 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 1 byte from vector byte index - UINT32 ea = (base) ? rsp->r[base] + offset : offset; -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); value >>= (1-(index & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, index)); -#endif } -static void cfunc_rsp_ssv(void *param) +#else + +static void cfunc_rsp_sbv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1466,31 +1882,79 @@ static void cfunc_rsp_ssv(void *param) offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00001 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 2 bytes starting from vector byte index + UINT32 ea = (base) ? 
rsp->r[base] + offset : offset; + WRITE8(rsp, ea, VREG_B(dest, index)); +} +#endif + +#if USE_SIMD +// SSV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00001 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 2 bytes starting from vector byte index + +static void cfunc_rsp_ssv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } UINT32 ea = (base) ? rsp->r[base] + (offset * 2) : (offset * 2); -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (index >> 1)); WRITE8(rsp, ea, (UINT8)(value >> 8)); WRITE8(rsp, ea+1, (UINT8)(value & 0x00ff)); +} + #else + +static void cfunc_rsp_ssv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 2) : (offset * 2); + int end = index + 2; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_slv(void *param) +#if USE_SIMD +// SLV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00010 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 4 bytes starting from vector byte index + +static void cfunc_rsp_slv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1502,16 +1966,9 @@ static void cfunc_rsp_slv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00010 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 4 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 4) : (offset * 4); -#if USE_SIMD UINT16 value0, value1; index >>= 1; SIMD_EXTRACT16(rsp->xv[dest], value0, index); @@ -1520,17 +1977,45 @@ static void cfunc_rsp_slv(void *param) WRITE8(rsp, ea+1, (UINT8)(value0 & 0x00ff)); WRITE8(rsp, ea+2, (UINT8)(value1 >> 8)); WRITE8(rsp, ea+3, (UINT8)(value1 & 0x00ff)); +} + #else + +static void cfunc_rsp_slv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 4) : (offset * 4); + int end = index + 4; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_sdv(void *param) +#if USE_SIMD +// SDV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00011 | IIII | Offset | +// -------------------------------------------------- +// +// Stores 8 bytes starting from vector byte index + +static void cfunc_rsp_sdv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1542,15 +2027,8 @@ static void cfunc_rsp_sdv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00011 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 8 bytes starting from vector byte index UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); -#if USE_SIMD UINT16 value0, value1, value2, value3; index >>= 1; SIMD_EXTRACT16(rsp->xv[dest], value0, index); @@ -1565,17 +2043,44 @@ static void cfunc_rsp_sdv(void *param) WRITE8(rsp, ea+5, (UINT8)(value2 & 0x00ff)); WRITE8(rsp, ea+6, (UINT8)(value3 >> 8)); WRITE8(rsp, ea+7, (UINT8)(value3 & 0x00ff)); +} + #else + +static void cfunc_rsp_sdv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0x8; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; for (int i = index; i < end; i++) { WRITE8(rsp, ea, VREG_B(dest, i)); ea++; } -#endif } +#endif -static void cfunc_rsp_sqv(void *param) +#if USE_SIMD +// SQV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00100 | IIII | Offset | +// -------------------------------------------------- +// +// Stores up to 16 bytes starting from vector byte index until 16-byte boundary + +static void cfunc_rsp_sqv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1587,30 +2092,55 @@ static void cfunc_rsp_sqv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00100 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from vector byte index until 16-byte boundary UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); int end = index + (16 - (ea & 0xf)); for (int i=index; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (i >> 1)); value >>= (1-(i & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, i & 0xf)); -#endif ea++; } } -static void cfunc_rsp_srv(void *param) +#else + +static void cfunc_rsp_sqv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + int end = index + (16 - (ea & 0xf)); + for (int i=index; i < end; i++) + { + WRITE8(rsp, ea, VREG_B(dest, i & 0xf)); + ea++; + } +} +#endif + +#if USE_SIMD +// SRV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00101 | IIII | Offset | +// -------------------------------------------------- +// +// Stores up to 16 bytes starting from right side until 16-byte boundary + +static void cfunc_rsp_srv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1622,12 +2152,6 @@ static void cfunc_rsp_srv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00101 | IIII | Offset | - // -------------------------------------------------- - // - // Stores up to 16 bytes starting from right side until 16-byte boundary UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); @@ -1637,20 +2161,55 @@ static void cfunc_rsp_srv(void *param) for (int i = index; i < end; i++) { -#if USE_SIMD UINT32 bi = (i + o) & 0xf; UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, (bi >> 1)); value >>= (1-(bi & 1)) * 8; WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf))); -#endif ea++; } } -static void cfunc_rsp_spv(void *param) +#else + +static void cfunc_rsp_srv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); + + int end = index + (ea & 0xf); + int o = (16 - (ea & 0xf)) & 0xf; + ea &= ~0xf; + + for (int i = index; i < end; i++) + { + WRITE8(rsp, ea, VREG_B(dest, ((i + o) & 0xf))); + ea++; + } +} +#endif + +#if USE_SIMD +// SPV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00110 | IIII | Offset | +// -------------------------------------------------- +// +// Stores upper 8 bits of each element + +static void cfunc_rsp_spv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1662,12 +2221,6 @@ static void cfunc_rsp_spv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00110 | IIII | Offset | - // -------------------------------------------------- - // - // Stores upper 8 bits of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); int end = index + 8; @@ -1675,30 +2228,23 @@ static void cfunc_rsp_spv(void *param) { if ((i & 0xf) < 8) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); WRITE8(rsp, ea, (UINT8)(value >> 8)); -#else - WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1)); -#endif } else { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); - value >>= 7; - WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); -#endif + WRITE8(rsp, ea, (UINT8)(value >> 7)); } ea++; } } -static void cfunc_rsp_suv(void *param) +#else + +static void cfunc_rsp_spv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1710,12 +2256,6 @@ static void cfunc_rsp_suv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 00111 | IIII | Offset | - // -------------------------------------------------- - // - // Stores 
bits 14-7 of each element UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); int end = index + 8; @@ -1723,30 +2263,103 @@ static void cfunc_rsp_suv(void *param) { if ((i & 0xf) < 8) { -#if USE_SIMD - UINT16 value; - SIMD_EXTRACT16(rsp->xv[dest], value, i); - value >>= 7; - WRITE8(rsp, ea, (UINT8)value); -#else - WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); -#endif + WRITE8(rsp, ea, VREG_B(dest, (i & 0xf) << 1)); } else { -#if USE_SIMD - UINT16 value; - SIMD_EXTRACT16(rsp->xv[dest], value, i); - WRITE8(rsp, ea, (UINT8)value >> 8); -#else - WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1))); -#endif + WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); } ea++; } } +#endif -static void cfunc_rsp_shv(void *param) +#if USE_SIMD +// SUV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 00111 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of each element + +static void cfunc_rsp_suv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? 
rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; + for (int i=index; i < end; i++) + { + if ((i & 0xf) < 8) + { + UINT16 value; + SIMD_EXTRACT16(rsp->xv[dest], value, i); + WRITE8(rsp, ea, (UINT8)(value >> 7)); + } + else + { + UINT16 value; + SIMD_EXTRACT16(rsp->xv[dest], value, i); + WRITE8(rsp, ea, (UINT8)(value >> 8)); + } + ea++; + } +} + +#else + +static void cfunc_rsp_suv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 8) : (offset * 8); + int end = index + 8; + for (int i=index; i < end; i++) + { + if ((i & 0xf) < 8) + { + WRITE8(rsp, ea, VREG_S(dest, (i & 0x7)) >> 7); + } + else + { + WRITE8(rsp, ea, VREG_B(dest, ((i & 0x7) << 1))); + } + ea++; + } +} +#endif + +#if USE_SIMD +// SHV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01000 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of each element, with 2-byte stride + +static void cfunc_rsp_shv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1758,31 +2371,21 @@ static void cfunc_rsp_shv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01000 | IIII | Offset | - // -------------------------------------------------- - // - // Stores bits 14-7 of each element, with 2-byte stride UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); for (int i=0; i < 8; i++) { int element = index + (i << 1); -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, element >> 1); WRITE8(rsp, ea, (value >> 7) & 0x00ff); -#else - UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) | - (VREG_B(dest, ((element + 1) & 0xf)) >> 7); - WRITE8(rsp, ea, d); -#endif ea += 2; } } -static void cfunc_rsp_sfv(void *param) +#else + +static void cfunc_rsp_shv_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1794,14 +2397,41 @@ static void cfunc_rsp_sfv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01001 | IIII | Offset | - // -------------------------------------------------- - // - // Stores bits 14-7 of upper or lower quad, with 4-byte stride - if (index & 0x7) printf("RSP: SFV: index = %d at %08X\n", index, rsp->ppc); + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + for (int i=0; i < 8; i++) + { + int element = index + (i << 1); + UINT8 d = (VREG_B(dest, (element & 0xf)) << 1) | + (VREG_B(dest, ((element + 1) & 0xf)) >> 7); + WRITE8(rsp, ea, d); + ea += 2; + } +} +#endif + +#if USE_SIMD +// SFV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01001 | IIII | Offset | +// -------------------------------------------------- +// +// Stores bits 14-7 of upper or lower quad, with 4-byte stride + +static void cfunc_rsp_sfv_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); int eaoffset = ea & 0xf; @@ -1811,18 +2441,54 @@ static void cfunc_rsp_sfv(void *param) for (int i = index>>1; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i); WRITE8(rsp, ea + (eaoffset & 0xf), (value >> 7) & 0x00ff); -#else - WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); -#endif eaoffset += 4; } } -static void cfunc_rsp_swv(void *param) +#else + +static void cfunc_rsp_sfv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = ea & 0xf; + ea &= ~0xf; + + int end = (index >> 1) + 4; + + for (int i = index>>1; i < end; i++) + { + WRITE8(rsp, ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); + eaoffset += 4; + } +} +#endif + +#if USE_SIMD +// SWV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01010 | IIII | Offset | +// -------------------------------------------------- +// +// Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 +// after byte index 15 + +static void cfunc_rsp_swv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1834,13 +2500,6 @@ static void cfunc_rsp_swv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01010 | IIII | Offset | - // -------------------------------------------------- - // - // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 - // after byte index 15 UINT32 ea = (base) ? 
rsp->r[base] + (offset * 16) : (offset * 16); int eaoffset = ea & 0xf; @@ -1849,18 +2508,52 @@ static void cfunc_rsp_swv(void *param) int end = index + 16; for (int i = index; i < end; i++) { -#if USE_SIMD UINT16 value; SIMD_EXTRACT16(rsp->xv[dest], value, i >> 1); WRITE8(rsp, ea + (eaoffset & 0xf), (value >> ((1-(i & 1)) * 8)) & 0xff); -#else - WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); -#endif eaoffset++; } } -static void cfunc_rsp_stv(void *param) +#else + +static void cfunc_rsp_swv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = ea & 0xf; + ea &= ~0xf; + + int end = index + 16; + for (int i = index; i < end; i++) + { + WRITE8(rsp, ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); + eaoffset++; + } +} +#endif + +#if USE_SIMD +// STV +// +// 31 25 20 15 10 6 0 +// -------------------------------------------------- +// | 111010 | BBBBB | TTTTT | 01011 | IIII | Offset | +// -------------------------------------------------- +// +// Stores one element from maximum of 8 vectors, while incrementing element index + +static void cfunc_rsp_stv_simd(void *param) { rsp_state *rsp = (rsp_state*)param; UINT32 op = rsp->impstate->arg0; @@ -1873,12 +2566,6 @@ static void cfunc_rsp_stv(void *param) { offset |= 0xffffffc0; } - // 31 25 20 15 10 6 0 - // -------------------------------------------------- - // | 111010 | BBBBB | TTTTT | 01011 | IIII | Offset | - // -------------------------------------------------- - // - // Stores one element from maximum of 8 vectors, while incrementing element index int vs = dest; int ve = dest + 8; @@ -1895,18 +2582,129 @@ static void cfunc_rsp_stv(void *param) for (int i = vs; i < ve; i++) { -#if USE_SIMD UINT16 value; 
SIMD_EXTRACT16(rsp->xv[dest], value, element); WRITE16(rsp, ea + (eaoffset & 0xf), value); -#else - WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7)); -#endif eaoffset += 2; element++; } } +#else + +static void cfunc_rsp_stv_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + UINT32 op = rsp->impstate->arg0; + int dest = (op >> 16) & 0x1f; + int base = (op >> 21) & 0x1f; + int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + int vs = dest; + int ve = dest + 8; + if (ve > 32) + { + ve = 32; + } + + int element = 8 - (index >> 1); + + UINT32 ea = (base) ? rsp->r[base] + (offset * 16) : (offset * 16); + int eaoffset = (ea & 0xf) + (element * 2); + ea &= ~0xf; + + for (int i = vs; i < ve; i++) + { + WRITE16(rsp, ea + (eaoffset & 0xf), VREG_S(i, element & 0x7)); + eaoffset += 2; + element++; + } +} + +#endif + +#if USE_SIMD +static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ +// int loopdest; + UINT32 op = desc->opptr.l[0]; + //int dest = (op >> 16) & 0x1f; + //int base = (op >> 21) & 0x1f; + //int index = (op >> 7) & 0xf; + int offset = (op & 0x7f); + //int skip; + if (offset & 0x40) + { + offset |= 0xffffffc0; + } + + switch ((op >> 11) & 0x1f) + { + case 0x00: /* SBV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sbv_simd, rsp); + return TRUE; + case 0x01: /* SSV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_ssv_simd, rsp); + return TRUE; + case 0x02: /* SLV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_slv_simd, rsp); + return TRUE; + case 0x03: /* SDV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sdv_simd, rsp); + 
return TRUE; + case 0x04: /* SQV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sqv_simd, rsp); + return TRUE; + case 0x05: /* SRV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_srv_simd, rsp); + return TRUE; + case 0x06: /* SPV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_spv_simd, rsp); + return TRUE; + case 0x07: /* SUV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_suv_simd, rsp); + return TRUE; + case 0x08: /* SHV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_shv_simd, rsp); + return TRUE; + case 0x09: /* SFV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_sfv_simd, rsp); + return TRUE; + case 0x0a: /* SWV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_swv_simd, rsp); + return TRUE; + case 0x0b: /* STV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_stv_simd, rsp); + return TRUE; + + default: + unimplemented_opcode(rsp, op); + return FALSE; + } + + return TRUE; +} + +#else + static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { // int loopdest; @@ -1925,51 +2723,51 @@ static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *co { case 0x00: /* SBV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sbv, rsp); + UML_CALLC(block, cfunc_rsp_sbv_scalar, rsp); return TRUE; case 0x01: /* SSV */ UML_MOV(block, 
mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_ssv, rsp); + UML_CALLC(block, cfunc_rsp_ssv_scalar, rsp); return TRUE; case 0x02: /* SLV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_slv, rsp); + UML_CALLC(block, cfunc_rsp_slv_scalar, rsp); return TRUE; case 0x03: /* SDV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sdv, rsp); + UML_CALLC(block, cfunc_rsp_sdv_scalar, rsp); return TRUE; case 0x04: /* SQV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sqv, rsp); + UML_CALLC(block, cfunc_rsp_sqv_scalar, rsp); return TRUE; case 0x05: /* SRV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_srv, rsp); + UML_CALLC(block, cfunc_rsp_srv_scalar, rsp); return TRUE; case 0x06: /* SPV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_spv, rsp); + UML_CALLC(block, cfunc_rsp_spv_scalar, rsp); return TRUE; case 0x07: /* SUV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_suv, rsp); + UML_CALLC(block, cfunc_rsp_suv_scalar, rsp); return TRUE; case 0x08: /* SHV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_shv, rsp); + UML_CALLC(block, cfunc_rsp_shv_scalar, rsp); return TRUE; case 0x09: /* SFV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_sfv, rsp); + UML_CALLC(block, cfunc_rsp_sfv_scalar, rsp); return TRUE; case 0x0a: /* SWV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_swv, 
rsp); + UML_CALLC(block, cfunc_rsp_swv_scalar, rsp); return TRUE; case 0x0b: /* STV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_stv, rsp); + UML_CALLC(block, cfunc_rsp_stv_scalar, rsp); return TRUE; default: @@ -1979,9 +2777,62 @@ static int generate_swc2(rsp_state *rsp, drcuml_block *block, compiler_state *co return TRUE; } +#endif INLINE UINT16 SATURATE_ACCUM(rsp_state *rsp, int accum, int slice, UINT16 negative, UINT16 positive) { +#if USE_SIMD + if ((INT16)VEC_ACCUM_H(rsp, accum) < 0) + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0xffff) + { + return negative; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) >= 0) + { + return negative; + } + else + { + if (slice == 0) + { + return VEC_ACCUM_L(rsp, accum); + } + else if (slice == 1) + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + } + else + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0) + { + return positive; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) < 0) + { + return positive; + } + else + { + if (slice == 0) + { + return VEC_ACCUM_L(rsp, accum); + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + } +#else if ((INT16)ACCUM_H(rsp, accum) < 0) { if ((UINT16)(ACCUM_H(rsp, accum)) != 0xffff) @@ -2032,7 +2883,7 @@ INLINE UINT16 SATURATE_ACCUM(rsp_state *rsp, int accum, int slice, UINT16 negati } } } - +#endif return 0; } @@ -2076,6 +2927,44 @@ INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 // Return positive if H>0 || (H==0 && M<0) // Return medium slice if H==0xffff && M<0 // Return medium slice if H==0 && M>=0 +#if USE_SIMD + if ((INT16)VEC_ACCUM_H(rsp, accum) < 0) + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0xffff) + { + return negative; + } + else + { + if ((INT16)VEC_ACCUM_M(rsp, accum) >= 0) + { + return negative; + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } + else + { + if ((UINT16)(VEC_ACCUM_H(rsp, accum)) != 0) + { + return positive; + } + else + { 
+ if ((INT16)VEC_ACCUM_M(rsp, accum) < 0) + { + return positive; + } + else + { + return VEC_ACCUM_M(rsp, accum); + } + } + } +#else if ((INT16)ACCUM_H(rsp, accum) < 0) { if ((UINT16)(ACCUM_H(rsp, accum)) != 0xffff) @@ -2112,7 +3001,7 @@ INLINE UINT16 SATURATE_ACCUM1(rsp_state *rsp, int accum, UINT16 negative, UINT16 } } } - +#endif return 0; } @@ -2276,17 +3165,58 @@ INLINE __m128i RSPClampLowToVal(__m128i vaccLow, __m128i vaccMid, __m128i vaccHi return _mm_or_si128(negVal, posVal); } #endif -INLINE void cfunc_rsp_vmulf(void *param) + +#if USE_SIMD +// VMULF +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000000 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer * 2 + +INLINE void cfunc_rsp_vmulf_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + UINT16 w1, w2; + SCALAR_GET_VS1(w1, i); + SCALAR_GET_VS2(w2, i); + INT32 s1 = (INT32)(INT16)w1; + INT32 s2 = (INT32)(INT16)w2; + + if (s1 == -32768 && s2 == -32768) + { + // overflow + VEC_SET_ACCUM_H(0, i); + VEC_SET_ACCUM_M(-32768, i); + VEC_SET_ACCUM_L(-32768, i); + vres[i] = 0x7fff; + } + else + { + INT64 r = s1 * s2 * 2; + r += 0x8000; // rounding ? + VEC_SET_ACCUM_H((r < 0) ? 
0xffff : 0, i); + VEC_SET_ACCUM_M((INT16)(r >> 16), i); + VEC_SET_ACCUM_L((UINT16)(r), i); + vres[i] = VEC_ACCUM_M(rsp, i); + } + } + WRITEBACK_RESULT(); +} + +#else + +INLINE void cfunc_rsp_vmulf_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000000 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer * 2 INT16 vres[8]; for (int i = 0; i < 8; i++) @@ -2317,16 +3247,60 @@ INLINE void cfunc_rsp_vmulf(void *param) } WRITEBACK_RESULT(); } +#endif -INLINE void cfunc_rsp_vmulu(void *param) +#if USE_SIMD +// VMULU +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 | +// ------------------------------------------------------ +// Signed multiply * 2 plus 0x8000; destination is clamped: negative result -> 0, H/M sign mismatch -> 0xffff, else accumulator M + +INLINE void cfunc_rsp_vmulu_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + UINT16 w1, w2; + SCALAR_GET_VS1(w1, i); + SCALAR_GET_VS2(w2, i); + INT32 s1 = (INT32)(INT16)w1; + INT32 s2 = (INT32)(INT16)w2; + + INT64 r = (INT64)s1 * s2 * 2; // widen before multiplying: -32768 * -32768 * 2 does not fit in INT32 (signed overflow is UB) + r += 0x8000; // rounding ? 
+ + VEC_SET_ACCUM_H((UINT16)(r >> 32), i); + VEC_SET_ACCUM_M((UINT16)(r >> 16), i); + VEC_SET_ACCUM_L((UINT16)(r), i); + + if (r < 0) + { + vres[i] = 0; + } + else if (((INT16)(VEC_ACCUM_H(rsp, i)) ^ (INT16)(VEC_ACCUM_M(rsp, i))) < 0) + { + vres[i] = -1; + } + else + { + vres[i] = VEC_ACCUM_M(rsp, i); + } + } + WRITEBACK_RESULT(); +} + +#else + +INLINE void cfunc_rsp_vmulu_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000001 | - // ------------------------------------------------------ - // INT16 vres[8]; for (int i = 0; i < 8; i++) @@ -2359,23 +3333,25 @@ INLINE void cfunc_rsp_vmulu(void *param) } WRITEBACK_RESULT(); } +#endif -INLINE void cfunc_rsp_vmudl(void *param) +#if USE_SIMD +// VMUDL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001101 | +// ------------------------------------------------------ +// NOTE(review): the funct bits above break the 0001xx pattern of the VMUDM/VMUDN/VMUDH headers; VMUDL is presumably 000100 - confirm +// Multiplies signed integer by unsigned fraction +// The result is stored into accumulator; the middle and high slices are cleared +// The low slice of accumulator is stored into destination element (the scalar path writes ACCUM_L) + +INLINE void cfunc_rsp_vmudl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001101 | - // ------------------------------------------------------ - // - // Multiplies signed integer by unsigned fraction - // The result is added into accumulator - // The middle slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsReg = rsp->xv[VS1REG]; __m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -2389,9 +3365,15 @@ INLINE void cfunc_rsp_vmudl(void *param) rsp->accum_m = _mm_setzero_si128(); rsp->accum_h = 
_mm_setzero_si128(); +} #else +INLINE void cfunc_rsp_vmudl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2410,25 +3392,26 @@ INLINE void cfunc_rsp_vmudl(void *param) vres[i] = ACCUM_L(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudm(void *param) +#if USE_SIMD +// VMUDM +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000101 | +// ------------------------------------------------------ +// +// Multiplies signed integer by unsigned fraction +// The result is stored into accumulator +// The middle slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmudm_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000101 | - // ------------------------------------------------------ - // - // Multiplies signed integer by unsigned fraction - // The result is stored into accumulator - // The middle slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi; __m128i vsReg = rsp->xv[VS1REG]; @@ -2447,9 +3430,15 @@ INLINE void cfunc_rsp_vmudm(void *param) loProduct = _mm_cmplt_epi32(loProduct, _mm_setzero_si128()); hiProduct = _mm_cmplt_epi32(hiProduct, _mm_setzero_si128()); rsp->accum_h = _mm_packs_epi32(loProduct, hiProduct); +} #else +INLINE void cfunc_rsp_vmudm_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2468,25 +3457,26 @@ INLINE void cfunc_rsp_vmudm(void *param) vres[i] = ACCUM_M(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudn(void *param) +#if USE_SIMD +// VMUDN +// +// 31 25 24 20 
15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000110 | +// ------------------------------------------------------ +// +// Multiplies unsigned fraction by signed integer +// The result is stored into accumulator +// The low slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmudn_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000110 | - // ------------------------------------------------------ - // - // Multiplies unsigned fraction by signed integer - // The result is stored into accumulator - // The low slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi; __m128i vsReg = rsp->xv[VS1REG]; @@ -2502,9 +3492,15 @@ INLINE void cfunc_rsp_vmudn(void *param) rsp->xv[VDREG] = rsp->accum_l = RSPPackLo32to16(loProduct, hiProduct); rsp->accum_m = RSPPackHi32to16(loProduct, hiProduct); rsp->accum_h = _mm_cmplt_epi16(rsp->accum_m, _mm_setzero_si128()); +} #else +INLINE void cfunc_rsp_vmudn_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -2523,25 +3519,26 @@ INLINE void cfunc_rsp_vmudn(void *param) vres[i] = (UINT16)(r); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmudh(void *param) +#if USE_SIMD +// VMUDH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer +// The result is stored into highest 32 bits of accumulator, the low slice is zero +// The highest 32 bits of accumulator is saturated into destination element + 
+INLINE void cfunc_rsp_vmudh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 000111 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer - // The result is stored into highest 32 bits of accumulator, the low slice is zero - // The highest 32 bits of accumulator is saturated into destination element - -#if USE_SIMD - __m128i vaccLow, vaccHigh; __m128i unpackLo, unpackHi; @@ -2559,9 +3556,15 @@ INLINE void cfunc_rsp_vmudh(void *param) rsp->accum_l = _mm_setzero_si128(); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else +INLINE void cfunc_rsp_vmudh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2582,16 +3585,23 @@ INLINE void cfunc_rsp_vmudh(void *param) vres[i] = (INT16)(r); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmacf(void *param) +#if USE_SIMD +// VMACF +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001000 | +// ------------------------------------------------------ +// + +INLINE void cfunc_rsp_vmacf_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD - __m128i loProduct, hiProduct, unpackLo, unpackHi; __m128i vaccHigh; __m128i vdReg, vdRegLo, vdRegHi; @@ -2642,9 +3652,15 @@ INLINE void cfunc_rsp_vmacf(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else +INLINE void cfunc_rsp_vmacf_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; 
for (int i = 0; i < 8; i++) { @@ -2671,22 +3687,23 @@ INLINE void cfunc_rsp_vmacf(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmacu(void *param) +#if USE_SIMD +// VMACU +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 | +// ------------------------------------------------------ +// + +INLINE void cfunc_rsp_vmacu_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001001 | - // ------------------------------------------------------ - // - -#if USE_SIMD - __m128i loProduct, hiProduct, unpackLo, unpackHi; __m128i vaccHigh; __m128i vdReg, vdRegLo, vdRegHi; @@ -2736,8 +3753,15 @@ INLINE void cfunc_rsp_vmacu(void *param) /* Clamp the accumulator and write it all out. 
*/ rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} + #else +INLINE void cfunc_rsp_vmacu_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2780,25 +3804,26 @@ INLINE void cfunc_rsp_vmacu(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadl(void *param) +#if USE_SIMD +// VMADL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 | +// ------------------------------------------------------ +// +// Multiplies unsigned fraction by unsigned fraction +// Adds the higher 16 bits of the 32-bit result to accumulator +// The low slice of accumulator is stored into destination element + +INLINE void cfunc_rsp_vmadl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001100 | - // ------------------------------------------------------ - // - // Multiplies unsigned fraction by unsigned fraction - // Adds the higher 16 bits of the 32-bit result to accumulator - // The low slice of accumulator is stored into destination element - -#if USE_SIMD - __m128i vaccHigh; __m128i unpackHi, loProduct, hiProduct; __m128i vdReg, vdRegLo, vdRegHi; @@ -2833,8 +3858,15 @@ INLINE void cfunc_rsp_vmadl(void *param) rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); rsp->xv[VDREG] = RSPClampLowToVal(vdReg, rsp->accum_m, rsp->accum_h); +} #else + +INLINE void cfunc_rsp_vmadl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2855,15 +3887,18 @@ INLINE void cfunc_rsp_vmadl(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 0, 
0x0000, 0xffff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadm(void *param) +#if USE_SIMD +// VMADM +// + +INLINE void cfunc_rsp_vmadm_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD __m128i vaccLow, vaccHigh, loProduct, hiProduct; __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi, vdRegLo, vdRegHi; @@ -2908,8 +3943,15 @@ INLINE void cfunc_rsp_vmadm(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else + +INLINE void cfunc_rsp_vmadm_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -2934,15 +3976,18 @@ INLINE void cfunc_rsp_vmadm(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadn(void *param) +#if USE_SIMD +// VMADN +// + +INLINE void cfunc_rsp_vmadn_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; -#if USE_SIMD __m128i vaccLow, vaccHigh, loProduct, hiProduct; __m128i vsRegLo, vsRegHi, vtRegLo, vtRegHi, vdRegLo, vdRegHi; @@ -2989,7 +4034,15 @@ INLINE void cfunc_rsp_vmadn(void *param) rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); rsp->xv[VDREG] = RSPClampLowToVal(rsp->accum_l, rsp->accum_m, rsp->accum_h); +} + #else + +INLINE void cfunc_rsp_vmadn_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3013,25 +4066,26 @@ INLINE void cfunc_rsp_vmadn(void *param) vres[i] = SATURATE_ACCUM(rsp, i, 0, 0x0000, 0xffff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmadh(void *param) +#if USE_SIMD +// VMADH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE 
| SSSSS | TTTTT | DDDDD | 001111 | +// ------------------------------------------------------ +// +// Multiplies signed integer by signed integer +// The result is added into highest 32 bits of accumulator, the low slice is zero +// The highest 32 bits of accumulator is saturated into destination element + +INLINE void cfunc_rsp_vmadh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 001111 | - // ------------------------------------------------------ - // - // Multiplies signed integer by signed integer - // The result is added into highest 32 bits of accumulator, the low slice is zero - // The highest 32 bits of accumulator is saturated into destination element - -#if USE_SIMD - __m128i vsReg = rsp->xv[VS1REG]; __m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3051,8 +4105,15 @@ INLINE void cfunc_rsp_vmadh(void *param) rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh); rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh); rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh); +} #else + +INLINE void cfunc_rsp_vmadh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3072,23 +4133,23 @@ INLINE void cfunc_rsp_vmadh(void *param) vres[i] = SATURATE_ACCUM1(rsp, i, 0x8000, 0x7fff); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vadd(void *param) +#if USE_SIMD +// VADD +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 | +// ------------------------------------------------------ +// +// Adds two vector registers and carry flag, the result is saturated to 32767 + +INLINE void cfunc_rsp_vadd_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 
25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010000 | - // ------------------------------------------------------ - // - // Adds two vector registers and carry flag, the result is saturated to 32767 - -#if USE_SIMD - __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i carry = _mm_and_si128(rsp->xvflag[CARRY], vec_flagmask); rsp->accum_l = _mm_add_epi16(_mm_add_epi16(rsp->xv[VS1REG], shuffled), carry); @@ -3102,7 +4163,15 @@ INLINE void cfunc_rsp_vadd(void *param) rsp->xvflag[ZERO] = _mm_setzero_si128(); rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vadd_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -3122,24 +4191,25 @@ INLINE void cfunc_rsp_vadd(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsub(void *param) +#if USE_SIMD +// VSUB +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 | +// ------------------------------------------------------ +// +// Subtracts two vector registers and carry flag, the result is saturated to -32768 +// TODO: check VS2REG == VDREG + +INLINE void cfunc_rsp_vsub_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010001 | - // ------------------------------------------------------ - // - // Subtracts two vector registers and carry flag, the result is saturated to -32768 - - // TODO: check VS2REG == VDREG - -#if USE_SIMD __m128i shuffled = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i carry = _mm_and_si128(rsp->xvflag[CARRY], vec_flagmask); 
__m128i unsat = _mm_sub_epi16(_mm_sub_epi16(rsp->xv[VS1REG], shuffled), carry); @@ -3154,7 +4224,15 @@ INLINE void cfunc_rsp_vsub(void *param) rsp->xvflag[ZERO] = _mm_setzero_si128(); rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vsub_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3175,23 +4253,24 @@ INLINE void cfunc_rsp_vsub(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vabs(void *param) +#if USE_SIMD +// VABS +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 | +// ------------------------------------------------------ +// +// Changes the sign of source register 2 if source register 1 is negative and stores the result to destination register + +INLINE void cfunc_rsp_vabs_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010011 | - // ------------------------------------------------------ - // - // Changes the sign of source register 2 if source register 1 is negative and stores - // the result to destination register - -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i negs2 = _mm_sub_epi16(_mm_setzero_si128(), shuf2); __m128i s2_n32768 = _mm_cmpeq_epi16(shuf2, vec_n32768); @@ -3201,7 +4280,15 @@ INLINE void cfunc_rsp_vabs(void *param) __m128i result_n32768 = _mm_and_si128(s1_lz, _mm_and_si128(vec_32767, s2_n32768)); __m128i result_negs2 = _mm_and_si128(s1_lz, _mm_and_si128(negs2, _mm_xor_si128(s2_n32768, vec_neg1))); rsp->xv[VDREG] = rsp->accum_l = _mm_or_si128(result_gz, _mm_or_si128(result_n32768, result_negs2)); +} + #else + +INLINE void 
cfunc_rsp_vabs_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3232,27 +4319,28 @@ INLINE void cfunc_rsp_vabs(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vaddc(void *param) +#if USE_SIMD +// VADDC +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 | +// ------------------------------------------------------ +// +// Adds two vector registers, the carry out is stored into carry register +// TODO: check VS2REG = VDREG + +INLINE void cfunc_rsp_vaddc_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010100 | - // ------------------------------------------------------ - // - // Adds two vector registers, the carry out is stored into carry register - - // TODO: check VS2REG = VDREG - CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); @@ -3266,7 +4354,18 @@ INLINE void cfunc_rsp_vaddc(void *param) rsp->xvflag[CARRY] = _mm_or_si128(over6420, _mm_srli_epi32(over7531, 16)); rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); +} + #else + +INLINE void cfunc_rsp_vaddc_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + INT16 vres[8] = { 0 }; for (int i = 0; i < 8; i++) { @@ -3286,27 +4385,29 @@ INLINE void cfunc_rsp_vaddc(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsubc(void *param) +#if USE_SIMD +// VSUBC +// +// 31 25 
24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 | +// ------------------------------------------------------ +// +// Subtracts two vector registers, the carry out is stored into carry register +// TODO: check VS2REG = VDREG + +INLINE void cfunc_rsp_vsubc_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 010101 | - // ------------------------------------------------------ - // - // Subtracts two vector registers, the carry out is stored into carry register - - // TODO: check VS2REG = VDREG CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); -#if USE_SIMD __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); @@ -3326,7 +4427,19 @@ INLINE void cfunc_rsp_vsubc(void *param) rsp->xvflag[ZERO] = _mm_or_si128(zero6420, _mm_srli_epi32(zero7531, 16)); rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(_mm_slli_epi32(sum6420, 16), sum7531); +} + #else + +INLINE void cfunc_rsp_vsubc_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -3350,78 +4463,99 @@ INLINE void cfunc_rsp_vsubc(void *param) } } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vsaw(void *param) +#if USE_SIMD +// VSAW +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 011101 | +// ------------------------------------------------------ +// +// Stores high, middle or low slice of accumulator to destination vector + +INLINE void cfunc_rsp_vsaw_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op 
= rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 011101 | - // ------------------------------------------------------ - // - // Stores high, middle or low slice of accumulator to destination vector - switch (EL) { case 0x08: // VSAWH { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_h; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_H(rsp, i); - } -#endif break; } case 0x09: // VSAWM { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_m; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_M(rsp, i); - } -#endif break; } case 0x0a: // VSAWL { -#if USE_SIMD rsp->xv[VDREG] = rsp->accum_l; -#else - for (int i = 0; i < 8; i++) - { - W_VREG_S(VDREG, i) = ACCUM_L(rsp, i); - } -#endif break; } default: fatalerror("RSP: VSAW: el = %d\n", EL); } } -INLINE void cfunc_rsp_vlt(void *param) +#else + +INLINE void cfunc_rsp_vsaw_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100000 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are less than VS2 - // Moves the element in VS2 to destination vector + switch (EL) + { + case 0x08: // VSAWH + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_H(rsp, i); + } + break; + } + case 0x09: // VSAWM + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_M(rsp, i); + } + break; + } + case 0x0a: // VSAWL + { + for (int i = 0; i < 8; i++) + { + W_VREG_S(VDREG, i) = ACCUM_L(rsp, i); + } + break; + } + default: fatalerror("RSP: VSAW: el = %d\n", EL); + } +} +#endif #if USE_SIMD +// VLT +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100000 | +// 
------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are less than VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vlt_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3435,7 +4569,15 @@ INLINE void cfunc_rsp_vlt(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vlt_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3473,23 +4615,25 @@ INLINE void cfunc_rsp_vlt(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_veq(void *param) +#if USE_SIMD +// VEQ +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_veq_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100001 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3502,7 +4646,15 @@ 
INLINE void cfunc_rsp_veq(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_veq_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3529,23 +4681,25 @@ INLINE void cfunc_rsp_veq(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vne(void *param) +#if USE_SIMD +// VNE +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are not equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vne_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100010 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are not equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3557,7 +4711,15 @@ INLINE void cfunc_rsp_vne(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vne_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3584,24 +4746,25 @@ INLINE void 
cfunc_rsp_vne(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vge(void *param) +#if USE_SIMD +// VGE +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100011 | +// ------------------------------------------------------ +// +// Sets compare flags if elements in VS1 are greater or equal with VS2 +// Moves the element in VS2 to destination vector + +INLINE void cfunc_rsp_vge_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - //int i; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100011 | - // ------------------------------------------------------ - // - // Sets compare flags if elements in VS1 are greater or equal with VS2 - // Moves the element in VS2 to destination vector - -#if USE_SIMD rsp->xvflag[COMPARE] = rsp->xvflag[CLIP2] = _mm_setzero_si128(); __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); @@ -3614,7 +4777,15 @@ INLINE void cfunc_rsp_vge(void *param) rsp->accum_l = rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, _mm_xor_si128(rsp->xvflag[COMPARE], vec_neg1))); rsp->xvflag[ZERO] = rsp->xvflag[CARRY] = _mm_setzero_si128(); +} + #else + +INLINE void cfunc_rsp_vge_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -3640,218 +4811,117 @@ INLINE void cfunc_rsp_vge(void *param) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vcl(void *param) +#if USE_SIMD +// VCL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 | +// ------------------------------------------------------ +// +// Vector clip low + +INLINE void 
cfunc_rsp_vcl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; INT16 vres[8]; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100100 | - // ------------------------------------------------------ - // - // Vector clip low + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); -#if 0//USE_SIMD - __m128i flag0_07 = _mm_set_epi16(CARRY_FLAG(0), CARRY_FLAG(1), CARRY_FLAG(2), CARRY_FLAG(3), - CARRY_FLAG(4), CARRY_FLAG(5), CARRY_FLAG(6), CARRY_FLAG(7)); - __m128i flag0_815 = _mm_set_epi16(ZERO_FLAG(0), ZERO_FLAG(1), ZERO_FLAG(2), ZERO_FLAG(3), - ZERO_FLAG(4), ZERO_FLAG(5), ZERO_FLAG(6), ZERO_FLAG(7)); - __m128i flag1_07 = _mm_set_epi16(COMPARE_FLAG(0), COMPARE_FLAG(1), COMPARE_FLAG(2), COMPARE_FLAG(3), - COMPARE_FLAG(4), COMPARE_FLAG(5), COMPARE_FLAG(6), COMPARE_FLAG(7)); - __m128i flag1_815 = _mm_set_epi16((rsp->flag[1] >> 8) & 1, (rsp->flag[1] >> 9) & 1, (rsp->flag[1] >> 10) & 1, (rsp->flag[1] >> 11) & 1, - (rsp->flag[1] >> 12) & 1, (rsp->flag[1] >> 13) & 1, (rsp->flag[1] >> 14) & 1, (rsp->flag[1] >> 15) & 1); - __m128i flag2_07 = _mm_set_epi16(rsp->flag[2][0], rsp->flag[2][1], rsp->flag[2][2], rsp->flag[2][3], - rsp->flag[2][4], rsp->flag[2][5], rsp->flag[2][6], rsp->flag[2][7]); - __m128i n0_07 = _mm_xor_si128(flag0_07, vec_neg1); - __m128i n0_815 = _mm_xor_si128(flag0_815, vec_neg1); - __m128i n1_07 = _mm_xor_si128(flag1_07, vec_neg1); - __m128i n1_815 = _mm_xor_si128(flag1_815, vec_neg1); - __m128i n2_07 = _mm_xor_si128(flag2_07, vec_neg1); + if (CARRY_FLAG(rsp, i) != 0) + { + if (ZERO_FLAG(rsp, i) != 0) + { + if (COMPARE_FLAG(rsp, i) != 0) + { + VEC_SET_ACCUM_L(-(UINT16)s2, i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else//ZERO_FLAG(rsp, i)==0 + { + if (CLIP1_FLAG(rsp, i) != 0) + { + if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) > 0x10000) + {//proper fix for Harvest Moon 64, r4 + 
VEC_SET_ACCUM_L(s1, i); + CLEAR_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(-((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + } + else + { + if (((UINT32)(UINT16)(s1) + (UINT32)(UINT16)(s2)) != 0) + { + VEC_SET_ACCUM_L(s1, i); + CLEAR_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(-((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + } + } + } + else//CARRY_FLAG(rsp, i)==0 + { + if (ZERO_FLAG(rsp, i) != 0) + { + if (CLIP2_FLAG(rsp, i) != 0) + { + VEC_SET_ACCUM_L(s2, i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else + { + if (((INT32)(UINT16)s1 - (INT32)(UINT16)s2) >= 0) + { + VEC_SET_ACCUM_L(s2, i); + SET_CLIP2_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + CLEAR_CLIP2_FLAG(i); + } + } + } + vres[i] = VEC_ACCUM_L(rsp, i); + } + CLEAR_ZERO_FLAGS(); + CLEAR_CARRY_FLAGS(); + CLEAR_CLIP1_FLAGS(); + WRITEBACK_RESULT(); +} - __m128i shuf2 = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); - __m128i vec7531 = _mm_and_si128(rsp->xv[VS1REG], vec_lomask); - __m128i vec6420 = _mm_srli_epi32(rsp->xv[VS1REG], 16); - __m128i shuf7531 = _mm_and_si128(shuf2, vec_lomask); - __m128i shuf6420 = _mm_srli_epi32(shuf2, 16); - __m128i sub7531 = _mm_sub_epi32(vec7531, shuf7531); - __m128i sub6420 = _mm_sub_epi32(vec6420, shuf6420); - __m128i subh7531 = _mm_and_si128(sub7531, vec_himask); - __m128i subh6420 = _mm_and_si128(sub6420, vec_himask); - __m128i sub_gez = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(subh6420, _mm_setzero_si128()), 16), _mm_cmpeq_epi16(subh7531, _mm_setzero_si128())); - __m128i sub_lz = _mm_xor_si128(sub_gez, vec_neg1); - - __m128i sum7531 = _mm_add_epi32(vec7531, shuf7531); - __m128i sum6420 = _mm_add_epi32(vec6420, shuf6420); - __m128i suml7531 = _mm_and_si128(sum7531, vec_lomask); - __m128i suml6420 = _mm_and_si128(sum6420, vec_lomask); - __m128i sumh7531 = _mm_and_si128(sum7531, vec_himask); - __m128i sumh6420 = _mm_and_si128(sum6420, vec_himask); - __m128i suml_z = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(suml6420, 
_mm_setzero_si128()), 16), _mm_cmpeq_epi16(suml7531, _mm_setzero_si128())); - __m128i sumh_1 = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(sumh6420, vec_hibit), 16), _mm_cmpeq_epi16(sumh7531, vec_hibit)); - __m128i sumh_z = _mm_or_si128(_mm_slli_epi32(_mm_cmpeq_epi16(sumh6420, _mm_setzero_si128()), 16), _mm_cmpeq_epi16(sumh7531, _mm_setzero_si128())); - __m128i sum_z = _mm_and_si128(suml_z, sumh_z); - __m128i sum_nz = _mm_xor_si128(sum_z, vec_neg1); - __m128i sum_le0x10000 = _mm_or_si128(_mm_and_si128(suml_z, sumh_1), sumh_z); - __m128i sum_g0x10000 = _mm_xor_si128(sum_le0x10000, vec_neg1); - - __m128i f0a_and_nf0b = _mm_and_si128(flag0_07, n0_815); - __m128i nf0a_and_nf0b = _mm_and_si128( n0_07, n0_815); - - // accum set to -s2 if flag0[0-7] && flag0[8-15] && flag1[0-7] - // accum set to -s2 if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) > 0x10000 - // accum set to -s2 if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) == 0 - __m128i accum_ns2 = _mm_and_si128(_mm_and_si128(flag0_07, flag0_815), flag1_07); - accum_ns2 = _mm_or_si128(accum_ns2, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_g0x10000)); - accum_ns2 = _mm_or_si128(accum_ns2, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_z)); - - // accum set to s2 if !flag0[0-7] && flag0[8-15] && flag1[8-15] - // accum set to s2 if !flag0[0-7] && !flag0[8-15] && (s1 - s2) >= 0 - __m128i accum_s2 = _mm_and_si128(n0_07, _mm_and_si128(flag0_815, flag1_815)); - accum_s2 = _mm_or_si128(accum_s2, _mm_and_si128(_mm_and_si128(n0_07, n0_815), sub_gez)); - - // flag1[8-15] set if !flag0[0-7] && !flag0[8-15] && (s1 - s2) >= 0 - __m128i new_f1b_s = _mm_and_si128(_mm_and_si128(nf0a_and_nf0b, sub_gez), vec_flagmask); - UINT16 flag1_set = 0; - flag1_set |= _mm_extract_epi16(new_f1b_s, 0) << 8; - flag1_set |= _mm_extract_epi16(new_f1b_s, 1) << 9; - flag1_set |= _mm_extract_epi16(new_f1b_s, 2) << 10; - flag1_set |= _mm_extract_epi16(new_f1b_s, 3) << 11; - flag1_set |= _mm_extract_epi16(new_f1b_s, 4) 
<< 12; - flag1_set |= _mm_extract_epi16(new_f1b_s, 5) << 13; - flag1_set |= _mm_extract_epi16(new_f1b_s, 6) << 14; - flag1_set |= _mm_extract_epi16(new_f1b_s, 7) << 15; - - // flag1[8-15]unset if !flag0[0-7] && !flag0[8-15] && (s1 - s2) < 0 - __m128i new_f1b_u = _mm_xor_si128(vec_neg1, _mm_and_si128(nf0a_and_nf0b, sub_lz)); - new_f1b_u = _mm_and_si128(new_f1b_u, vec_flagmask); - UINT16 flag1_unset = 0; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 0) << 8; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 1) << 9; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 2) << 10; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 3) << 11; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 4) << 12; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 5) << 13; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 6) << 14; - flag1_unset |= _mm_extract_epi16(new_f1b_u, 7) << 15; - - // flag1[0-7] set if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) <= 0x10000 - // flag1[0-7] set if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) == 0 - __m128i new_f1a_s = _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_le0x10000); - new_f1a_s = _mm_or_si128(new_f1a_u, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_z)); - new_f1a_s = _mm_and_si128(new_f1a_s, vec_flagmask); - flag1_set |= _mm_extract_epi16(new_f1a_s, 0) << 0; - flag1_set |= _mm_extract_epi16(new_f1a_s, 1) << 1; - flag1_set |= _mm_extract_epi16(new_f1a_s, 2) << 2; - flag1_set |= _mm_extract_epi16(new_f1a_s, 3) << 3; - flag1_set |= _mm_extract_epi16(new_f1a_s, 4) << 4; - flag1_set |= _mm_extract_epi16(new_f1a_s, 5) << 5; - flag1_set |= _mm_extract_epi16(new_f1a_s, 6) << 6; - flag1_set |= _mm_extract_epi16(new_f1a_s, 7) << 7; - - // flag1[0-7] unset if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) > 0x10000 - // flag1[0-7] unset if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) != 0 - __m128i new_f1a_u = _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_g0x10000); - new_f1a_u = _mm_or_si128(new_f1a_u, 
_mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_nz)); - new_f1a_u = _mm_and_si128(new_f1a_u, vec_flagmask); - flag1_unset |= _mm_extract_epi16(new_f1a_u, 0) << 0; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 1) << 1; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 2) << 2; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 3) << 3; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 4) << 4; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 5) << 5; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 6) << 6; - flag1_unset |= _mm_extract_epi16(new_f1a_u, 7) << 7; - - rsp->flag[1] &= ~flag1_unset; - rsp->flag[1] |= flag1_set; - - // accum set to s1 if flag0[0-7] && flag0[8-15] && !flag1[0-7] - // accum set to s1 if flag0[0-7] && !flag0[8-15] && flag2[0-7] && (s1 + s2) <= 0x10000 - // accum set to s1 if flag0[0-7] && !flag0[8-15] && !flag2[0-7] && (s1 + s2) != 0 - // accum set to s1 if !flag0[0-7] && flag0[8-15] && !flag1[8-15] - // accum set to s1 if !flag0[0-7] && !flag0[8-15] && (s1 - s2) < 0 - __m128i accum_s1 = _mm_and_si128(flag0_07, _mm_and_si128(flag0_815, n1_07)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, flag2_07), sum_le0x10000)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(f0a_and_nf0b, n2_07), sum_nz)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(_mm_and_si128(n0_07, flag0_815), n1_815)); - accum_s1 = _mm_or_si128(accum_s1, _mm_and_si128(nf0a_and_nf0b, sub_lz)); - //__m128i zms2 = _mm_sub_epi16(_mm_setzero_si128(), shuf2); - - /* - __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); - __m128i s1_xor_s2 = _mm_xor_si128(rsp->xv[VS1REG], shuf); - __m128i s1_plus_s2 = _mm_add_epi16(rsp->xv[VS1REG], shuf); - __m128i s1_sub_s2 = _mm_sub_epi16(rsp->xv[VS1REG], shuf); - __m128i s2_neg = _mm_xor_si128(shuf, vec_neg1); - - __m128i s2_lz = _mm_cmplt_epi16(shuf, _mm_setzero_si128()); - __m128i s1s2_xor_lz = _mm_cmplt_epi16(s1_xor_s2, _mm_setzero_si128()); - __m128i s1s2_xor_gez = 
_mm_xor_si128(s1s2_xor_lz, vec_neg1); - __m128i s1s2_plus_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_plus_s2, _mm_setzero_si128()), vec_neg1); - __m128i s1s2_plus_gz = _mm_cmpgt_epi16(s1_plus_s2, _mm_setzero_si128()); - __m128i s1s2_plus_lez = _mm_xor_si128(s1s2_plus_gz, vec_neg1); - __m128i s1s2_plus_n1 = _mm_cmpeq_epi16(s1_plus_s2, vec_neg1); - __m128i s1s2_sub_nz = _mm_xor_si128(_mm_cmpeq_epi16(s1_sub_s2, _mm_setzero_si128()), vec_neg1); - __m128i s1s2_sub_lz = _mm_cmplt_epi16(s1_sub_s2, _mm_setzero_si128()); - __m128i s1s2_sub_gez = _mm_xor_si128(s1s2_sub_lz, vec_neg1); - __m128i s1_nens2 = _mm_xor_si128(_mm_cmpeq_epi16(rsp->xv[VS1REG], s2_neg), vec_neg1); - - __m128i ext_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_n1), vec_flagmask); - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 0) << 0; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 1) << 1; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 2) << 2; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 3) << 3; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 4) << 4; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 5) << 5; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 6) << 6; - rsp->flag[2] |= _mm_extract_epi16(ext_mask, 7) << 7; - - __m128i carry_mask = _mm_and_si128(s1s2_xor_lz, vec_flagmask); - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 0) << 0; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 1) << 1; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 2) << 2; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 3) << 3; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 4) << 4; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 5) << 5; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 6) << 6; - rsp->flag[0] |= _mm_extract_epi16(carry_mask, 7) << 7; - - __m128i z0_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_nz), s1_nens2); - __m128i z1_mask = _mm_and_si128(_mm_and_si128(s1s2_xor_lz, s1s2_plus_nz), s1_nens2); - __m128i z_mask = _mm_and_si128(_mm_or_si128(z0_mask, z1_mask), vec_flagmask); - z_mask = 
_mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi32(z_mask, 15)), vec_shiftmask2); - z_mask = _mm_and_si128(_mm_or_si128(z_mask, _mm_srli_epi64(z_mask, 30)), vec_shiftmask4); - z_mask = _mm_or_si128(z_mask, _mm_srli_si128(z_mask, 7)); - z_mask = _mm_or_si128(z_mask, _mm_srli_epi16(z_mask, 4)); - rsp->flag[0] |= (_mm_extract_epi16(z_mask, 0) << 8) & 0x00ff00; - - __m128i f0_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)), vec_flagmask); - __m128i f8_mask = _mm_and_si128(_mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)), vec_flagmask); - f0_mask = _mm_and_si128(f0_mask, vec_flagmask); - f8_mask = _mm_and_si128(f8_mask, vec_flagmask); - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 0) << 0; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 1) << 1; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 2) << 2; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 3) << 3; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 4) << 4; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 5) << 5; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 6) << 6; - rsp->flag[1] |= _mm_extract_epi16(f0_mask, 7) << 7; - - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 0) << 8; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 1) << 9; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 2) << 10; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 3) << 11; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 4) << 12; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; - rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15;*/ #else + +INLINE void cfunc_rsp_vcl_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; + for (int i = 0; i < 8; i++) { INT16 s1, s2; @@ -3934,28 +5004,31 @@ INLINE void cfunc_rsp_vcl(void *param) CLEAR_CARRY_FLAGS(); CLEAR_CLIP1_FLAGS(); WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vch(void *param) +#if 
USE_SIMD +// VCH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 | +// ------------------------------------------------------ +// +// Vector clip high + +INLINE void cfunc_rsp_vch_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100101 | - // ------------------------------------------------------ - // - // Vector clip high - CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); CLEAR_ZERO_FLAGS(); CLEAR_CLIP2_FLAGS(); -#if 0//USE_SIMD +#if 0 // Compare flag // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) @@ -4046,8 +5119,83 @@ INLINE void cfunc_rsp_vch(void *param) rsp->flag[1] |= _mm_extract_epi16(f8_mask, 5) << 13; rsp->flag[1] |= _mm_extract_epi16(f8_mask, 6) << 14; rsp->flag[1] |= _mm_extract_epi16(f8_mask, 7) << 15; +#endif + INT16 vres[8]; + UINT32 vce = 0; + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); + + if ((s1 ^ s2) < 0) + { + vce = (s1 + s2 == -1); + SET_CARRY_FLAG(i); + if (s2 < 0) + { + SET_CLIP2_FLAG(i); + } + + if ((s1 + s2) <= 0) + { + SET_COMPARE_FLAG(i); + vres[i] = -((UINT16)s2); + } + else + { + vres[i] = s1; + } + + if ((s1 + s2) != 0 && s1 != ~s2) + { + SET_ZERO_FLAG(i); + } + }//sign + else + { + vce = 0; + if (s2 < 0) + { + SET_COMPARE_FLAG(i); + } + if ((s1 - s2) >= 0) + { + SET_CLIP2_FLAG(i); + vres[i] = s2; + } + else + { + vres[i] = s1; + } + + if ((s1 - s2) != 0 && s1 != ~s2) + { + SET_ZERO_FLAG(i); + } + } + if (vce) + { + SET_CLIP1_FLAG(i); + } + VEC_SET_ACCUM_L(vres[i], i); + } + WRITEBACK_RESULT(); +} + #else +INLINE void cfunc_rsp_vch_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + 
CLEAR_CARRY_FLAGS(); + CLEAR_COMPARE_FLAGS(); + CLEAR_CLIP1_FLAGS(); + CLEAR_ZERO_FLAGS(); + CLEAR_CLIP2_FLAGS(); + INT16 vres[8]; UINT32 vce = 0; for (int i = 0; i < 8; i++) @@ -4109,28 +5257,31 @@ INLINE void cfunc_rsp_vch(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vcr(void *param) +#if USE_SIMD +// VCR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 | +// ------------------------------------------------------ +// +// Vector clip reverse + +INLINE void cfunc_rsp_vcr_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100110 | - // ------------------------------------------------------ - // - // Vector clip reverse - CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); CLEAR_ZERO_FLAGS(); CLEAR_CLIP2_FLAGS(); -#if 0//USE_SIMD +#if 0 // flag[1] bit [0- 7] set if (s1 ^ s2) < 0 && (s1 + s2) <= 0) // flag[1] bit [0- 7] set if (s1 ^ s2) >= 0 && (s2 < 0) @@ -4163,7 +5314,65 @@ INLINE void cfunc_rsp_vcr(void *param) rsp->xvflag[COMPARE] = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s2_lz), _mm_and_si128(s1s2_xor_lz, s1s2_plus_lez)); rsp->xvflag[CLIP2] = _mm_or_si128(_mm_and_si128(s1s2_xor_gez, s1s2_sub_gez), _mm_and_si128(s1s2_xor_lz, s2_lz)); +#endif + INT16 vres[8]; + for (int i = 0; i < 8; i++) + { + INT16 s1, s2; + SCALAR_GET_VS1(s1, i); + SCALAR_GET_VS2(s2, i); + + if ((INT16)(s1 ^ s2) < 0) + { + if (s2 < 0) + { + SET_CLIP2_FLAG(i); + } + if ((s1 + s2) <= 0) + { + VEC_SET_ACCUM_L(~((UINT16)s2), i); + SET_COMPARE_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); + } + } + else + { + if (s2 < 0) + { + SET_COMPARE_FLAG(i); + } + if ((s1 - s2) >= 0) + { + VEC_SET_ACCUM_L(s2, i); + SET_CLIP2_FLAG(i); + } + else + { + VEC_SET_ACCUM_L(s1, i); 
+ } + } + + vres[i] = VEC_ACCUM_L(rsp, i); + } + WRITEBACK_RESULT(); +} + #else + +INLINE void cfunc_rsp_vcr_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + CLEAR_CARRY_FLAGS(); + CLEAR_COMPARE_FLAGS(); + CLEAR_CLIP1_FLAGS(); + CLEAR_ZERO_FLAGS(); + CLEAR_CLIP2_FLAGS(); + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4207,29 +5416,39 @@ INLINE void cfunc_rsp_vcr(void *param) vres[i] = ACCUM_L(rsp, i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vmrg(void *param) +#if USE_SIMD +// VMRG +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 | +// ------------------------------------------------------ +// +// Merges two vectors according to compare flags + +INLINE void cfunc_rsp_vmrg_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 100111 | - // ------------------------------------------------------ - // - // Merges two vectors according to compare flags - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); __m128i s2mask = _mm_cmpeq_epi16(rsp->xvflag[COMPARE], _mm_setzero_si128()); __m128i s1mask = _mm_xor_si128(s2mask, vec_neg1); __m128i result = _mm_and_si128(rsp->xv[VS1REG], s1mask); rsp->xv[VDREG] = _mm_or_si128(result, _mm_and_si128(shuf, s2mask)); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vmrg_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4248,26 +5467,36 @@ INLINE void cfunc_rsp_vmrg(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vand(void *param) +#if USE_SIMD +// VAND +// +// 31 25 24 20 15 10 5 0 +// 
------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 | +// ------------------------------------------------------ +// +// Bitwise AND of two vector registers + +INLINE void cfunc_rsp_vand_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101000 | - // ------------------------------------------------------ - // - // Bitwise AND of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_and_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vand_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4278,26 +5507,36 @@ INLINE void cfunc_rsp_vand(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnand(void *param) +#if USE_SIMD +// VNAND +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 | +// ------------------------------------------------------ +// +// Bitwise NOT AND of two vector registers + +INLINE void cfunc_rsp_vnand_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101001 | - // ------------------------------------------------------ - // - // Bitwise NOT AND of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_and_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void 
cfunc_rsp_vnand_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4308,26 +5547,36 @@ INLINE void cfunc_rsp_vnand(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vor(void *param) +#if USE_SIMD +// VOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 | +// ------------------------------------------------------ +// +// Bitwise OR of two vector registers + +INLINE void cfunc_rsp_vor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101010 | - // ------------------------------------------------------ - // - // Bitwise OR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_or_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4338,26 +5587,36 @@ INLINE void cfunc_rsp_vor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnor(void *param) +#if USE_SIMD +// VNOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101011 | +// ------------------------------------------------------ +// +// Bitwise NOT OR of two vector registers + +INLINE void cfunc_rsp_vnor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS 
| TTTTT | DDDDD | 101011 | - // ------------------------------------------------------ - // - // Bitwise NOT OR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_or_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vnor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4368,26 +5627,36 @@ INLINE void cfunc_rsp_vnor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vxor(void *param) +#if USE_SIMD +// VXOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 | +// ------------------------------------------------------ +// +// Bitwise XOR of two vector registers + +INLINE void cfunc_rsp_vxor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101100 | - // ------------------------------------------------------ - // - // Bitwise XOR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(rsp->xv[VS1REG], shuf); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vxor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4398,26 +5667,36 @@ INLINE void cfunc_rsp_vxor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vnxor(void *param) +#if USE_SIMD +// VNXOR +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS 
| TTTTT | DDDDD | 101101 | +// ------------------------------------------------------ +// +// Bitwise NOT XOR of two vector registers + +INLINE void cfunc_rsp_vnxor_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | TTTTT | DDDDD | 101101 | - // ------------------------------------------------------ - // - // Bitwise NOT XOR of two vector registers - -#if USE_SIMD __m128i shuf = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); rsp->xv[VDREG] = _mm_xor_si128(_mm_xor_si128(rsp->xv[VS1REG], shuf), vec_neg1); rsp->accum_l = rsp->xv[VDREG]; +} + #else + +INLINE void cfunc_rsp_vnxor_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + INT16 vres[8]; for (int i = 0; i < 8; i++) { @@ -4428,30 +5707,29 @@ INLINE void cfunc_rsp_vnxor(void *param) SET_ACCUM_L(vres[i], i); } WRITEBACK_RESULT(); -#endif } +#endif -INLINE void cfunc_rsp_vrcp(void *param) +#if USE_SIMD +// VRCP +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 | +// ------------------------------------------------------ +// +// Calculates reciprocal + +INLINE void cfunc_rsp_vrcp_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110000 | - // ------------------------------------------------------ - // - // Calculates reciprocal - INT32 shifter = 0; -#if USE_SIMD UINT16 urec; INT32 rec; SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); rec = (INT16)urec; -#else - INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); -#endif INT32 datainput = (rec < 0) ? 
(-rec) : rec; if (datainput) { @@ -4489,39 +5767,84 @@ INLINE void cfunc_rsp_vrcp(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vrcp_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT32 shifter = 0; + INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); + INT32 datainput = (rec < 0) ? (-rec) : rec; + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + shifter = 0x10; + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> ((~shifter) & 0x1f); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } +#endif -INLINE void cfunc_rsp_vrcpl(void *param) +#if USE_SIMD +// VRCPL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 | +// ------------------------------------------------------ +// +// Calculates reciprocal low part + +INLINE void cfunc_rsp_vrcpl_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110001 | - // ------------------------------------------------------ - // - // Calculates reciprocal low part - INT32 shifter = 0; -#if USE_SIMD UINT16 urec; SIMD_EXTRACT16(rsp->xv[VS2REG], urec, EL); 
INT32 rec = (urec | rsp->reciprocal_high); -#else - INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high); -#endif INT32 datainput = rec; @@ -4588,37 +5911,114 @@ INLINE void cfunc_rsp_vrcpl(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); -#else - W_VREG_S(VDREG, VS1REG & 7) = (UINT16)rec; -#endif for (int i = 0; i < 8; i++) { -#if USE_SIMD INT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, VEC_EL_2(EL, i)); -#else - INT16 val = VREG_S(VS2REG, VEC_EL_2(EL, i)); -#endif - SET_ACCUM_L(val, i); + VEC_SET_ACCUM_L(val, i); } } -INLINE void cfunc_rsp_vrcph(void *param) +#else + +INLINE void cfunc_rsp_vrcpl_scalar(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110010 | - // ------------------------------------------------------ - // - // Calculates reciprocal high part + INT32 shifter = 0; + INT32 rec = ((UINT16)(VREG_S(VS2REG, EL & 7)) | rsp->reciprocal_high); + INT32 datainput = rec; + + if (rec < 0) + { + if (rsp->dp_allowed) + { + if (rec < -32768) + { + datainput = ~datainput; + } + else + { + datainput = -datainput; + } + } + else + { + datainput = -datainput; + } + } + + + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + if (rsp->dp_allowed) + { + shifter = 0; + } + else + { + shifter = 0x10; + } + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> ((~shifter) & 0x1f); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + + W_VREG_S(VDREG, VS1REG & 7) 
= (UINT16)rec; + + for (int i = 0; i < 8; i++) + { + SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); + } +} +#endif #if USE_SIMD +// VRCPH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110010 | +// ------------------------------------------------------ +// +// Calculates reciprocal high part + +INLINE void cfunc_rsp_vrcph_simd(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + UINT16 rcph; SIMD_EXTRACT16(rsp->xv[VS2REG], rcph, EL); rsp->reciprocal_high = rcph << 16; @@ -4627,7 +6027,15 @@ INLINE void cfunc_rsp_vrcph(void *param) rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG); +} + #else + +INLINE void cfunc_rsp_vrcph_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; rsp->dp_allowed = 1; @@ -4637,55 +6045,65 @@ INLINE void cfunc_rsp_vrcph(void *param) } W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); -#endif } +#endif -INLINE void cfunc_rsp_vmov(void *param) +#if USE_SIMD +// VMOV +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110011 | +// ------------------------------------------------------ +// +// Moves element from vector to destination vector + +INLINE void cfunc_rsp_vmov_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110011 | - // ------------------------------------------------------ - // - // Moves element from vector to destination vector - -#if USE_SIMD INT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); SIMD_INSERT16(rsp->xv[VDREG], val, VS1REG); 
rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vmov_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + W_VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7); for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } -INLINE void cfunc_rsp_vrsql(void *param) +#endif + +#if USE_SIMD +// VRSQL +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 | +// ------------------------------------------------------ +// +// Calculates reciprocal square-root low part + +INLINE void cfunc_rsp_vrsql_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110101 | - // ------------------------------------------------------ - // - // Calculates reciprocal square-root low part - INT32 shifter = 0; -#if USE_SIMD UINT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); INT32 rec = rsp->reciprocal_high | val; -#else - INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7); -#endif INT32 datainput = rec; if (rec < 0) @@ -4752,31 +6170,108 @@ INLINE void cfunc_rsp_vrsql(void *param) rsp->reciprocal_res = rec; rsp->dp_allowed = 0; -#if USE_SIMD SIMD_INSERT16(rsp->xv[VDREG], (UINT16)rec, VS1REG); rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); +} + #else + +INLINE void cfunc_rsp_vrsql_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + int op = rsp->impstate->arg0; + + INT32 shifter = 0; + INT32 rec = rsp->reciprocal_high | (UINT16)VREG_S(VS2REG, EL & 7); + INT32 datainput = rec; + + if (rec < 0) + { + if (rsp->dp_allowed) + { + if (rec < -32768) + { + datainput = ~datainput; + } + else + { + datainput = -datainput; + } + } + else + { + datainput = 
-datainput; + } + } + + if (datainput) + { + for (int i = 0; i < 32; i++) + { + if (datainput & (1 << ((~i) & 0x1f))) + { + shifter = i; + break; + } + } + } + else + { + if (rsp->dp_allowed) + { + shifter = 0; + } + else + { + shifter = 0x10; + } + } + + INT32 address = ((datainput << shifter) & 0x7fc00000) >> 22; + address = ((address | 0x200) & 0x3fe) | (shifter & 1); + + INT32 fetchval = rsp_divtable[address]; + INT32 temp = (0x40000000 | (fetchval << 14)) >> (((~shifter) & 0x1f) >> 1); + if (rec < 0) + { + temp = ~temp; + } + if (!rec) + { + temp = 0x7fffffff; + } + else if (rec == 0xffff8000) + { + temp = 0xffff0000; + } + rec = temp; + + rsp->reciprocal_res = rec; + rsp->dp_allowed = 0; + W_VREG_S(VDREG, VS1REG & 7) = (UINT16)(rec & 0xffff); for (int i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } -#endif } +#endif -INLINE void cfunc_rsp_vrsqh(void *param) +#if USE_SIMD +// VRSQH +// +// 31 25 24 20 15 10 5 0 +// ------------------------------------------------------ +// | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 | +// ------------------------------------------------------ +// +// Calculates reciprocal square-root high part + +INLINE void cfunc_rsp_vrsqh_simd(void *param) { rsp_state *rsp = (rsp_state*)param; int op = rsp->impstate->arg0; - // 31 25 24 20 15 10 5 0 - // ------------------------------------------------------ - // | 010010 | 1 | EEEE | SSSSS | ?FFFF | DDDDD | 110110 | - // ------------------------------------------------------ - // - // Calculates reciprocal square-root high part - -#if USE_SIMD UINT16 val; SIMD_EXTRACT16(rsp->xv[VS2REG], val, EL); rsp->reciprocal_high = val << 16; @@ -4785,7 +6280,15 @@ INLINE void cfunc_rsp_vrsqh(void *param) rsp->accum_l = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]); SIMD_INSERT16(rsp->xv[VDREG], (INT16)(rsp->reciprocal_res >> 16), VS1REG); // store high part +} + #else + +INLINE void cfunc_rsp_vrsqh_scalar(void *param) +{ + rsp_state *rsp = (rsp_state*)param; + 
int op = rsp->impstate->arg0; + rsp->reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; rsp->dp_allowed = 1; @@ -4795,8 +6298,9 @@ INLINE void cfunc_rsp_vrsqh(void *param) } W_VREG_S(VDREG, VS1REG & 7) = (INT16)(rsp->reciprocal_res >> 16); // store high part -#endif } +#endif + static void cfunc_sp_set_status_cb(void *param) { @@ -5372,6 +6876,8 @@ static void generate_delay_slot_and_branch(rsp_state *rsp, drcuml_block *block, opcode -------------------------------------------------*/ +#if USE_SIMD + static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { UINT32 op = desc->opptr.l[0]; @@ -5385,192 +6891,192 @@ static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_ { case 0x00: /* VMULF */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmulf, rsp); + UML_CALLC(block, cfunc_rsp_vmulf_simd, rsp); return TRUE; case 0x01: /* VMULU */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmulu, rsp); + UML_CALLC(block, cfunc_rsp_vmulu_simd, rsp); return TRUE; case 0x04: /* VMUDL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudl, rsp); + UML_CALLC(block, cfunc_rsp_vmudl_simd, rsp); return TRUE; case 0x05: /* VMUDM */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudm, rsp); + UML_CALLC(block, cfunc_rsp_vmudm_simd, rsp); return TRUE; case 0x06: /* VMUDN */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudn, rsp); + UML_CALLC(block, cfunc_rsp_vmudn_simd, rsp); return TRUE; case 0x07: /* VMUDH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmudh, rsp); + 
UML_CALLC(block, cfunc_rsp_vmudh_simd, rsp); return TRUE; case 0x08: /* VMACF */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmacf, rsp); + UML_CALLC(block, cfunc_rsp_vmacf_simd, rsp); return TRUE; case 0x09: /* VMACU */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmacu, rsp); + UML_CALLC(block, cfunc_rsp_vmacu_simd, rsp); return TRUE; case 0x0c: /* VMADL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadl, rsp); + UML_CALLC(block, cfunc_rsp_vmadl_simd, rsp); return TRUE; case 0x0d: /* VMADM */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadm, rsp); + UML_CALLC(block, cfunc_rsp_vmadm_simd, rsp); return TRUE; case 0x0e: /* VMADN */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadn, rsp); + UML_CALLC(block, cfunc_rsp_vmadn_simd, rsp); return TRUE; case 0x0f: /* VMADH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmadh, rsp); + UML_CALLC(block, cfunc_rsp_vmadh_simd, rsp); return TRUE; case 0x10: /* VADD */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vadd, rsp); + UML_CALLC(block, cfunc_rsp_vadd_simd, rsp); return TRUE; case 0x11: /* VSUB */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsub, rsp); + UML_CALLC(block, cfunc_rsp_vsub_simd, rsp); return TRUE; case 0x13: /* VABS */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vabs, rsp); + UML_CALLC(block, cfunc_rsp_vabs_simd, rsp); return TRUE; case 0x14: /* VADDC */ 
UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vaddc, rsp); + UML_CALLC(block, cfunc_rsp_vaddc_simd, rsp); return TRUE; case 0x15: /* VSUBC */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsubc, rsp); + UML_CALLC(block, cfunc_rsp_vsubc_simd, rsp); return TRUE; case 0x1d: /* VSAW */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vsaw, rsp); + UML_CALLC(block, cfunc_rsp_vsaw_simd, rsp); return TRUE; case 0x20: /* VLT */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vlt, rsp); + UML_CALLC(block, cfunc_rsp_vlt_simd, rsp); return TRUE; case 0x21: /* VEQ */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_veq, rsp); + UML_CALLC(block, cfunc_rsp_veq_simd, rsp); return TRUE; case 0x22: /* VNE */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vne, rsp); + UML_CALLC(block, cfunc_rsp_vne_simd, rsp); return TRUE; case 0x23: /* VGE */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vge, rsp); + UML_CALLC(block, cfunc_rsp_vge_simd, rsp); return TRUE; case 0x24: /* VCL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vcl, rsp); + UML_CALLC(block, cfunc_rsp_vcl_simd, rsp); return TRUE; case 0x25: /* VCH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vch, rsp); + UML_CALLC(block, cfunc_rsp_vch_simd, rsp); return TRUE; case 0x26: /* VCR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, 
cfunc_rsp_vcr, rsp); + UML_CALLC(block, cfunc_rsp_vcr_simd, rsp); return TRUE; case 0x27: /* VMRG */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmrg, rsp); + UML_CALLC(block, cfunc_rsp_vmrg_simd, rsp); return TRUE; case 0x28: /* VAND */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vand, rsp); + UML_CALLC(block, cfunc_rsp_vand_simd, rsp); return TRUE; case 0x29: /* VNAND */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnand, rsp); + UML_CALLC(block, cfunc_rsp_vnand_simd, rsp); return TRUE; case 0x2a: /* VOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vor, rsp); + UML_CALLC(block, cfunc_rsp_vor_simd, rsp); return TRUE; case 0x2b: /* VNOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnor, rsp); + UML_CALLC(block, cfunc_rsp_vnor_simd, rsp); return TRUE; case 0x2c: /* VXOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vxor, rsp); + UML_CALLC(block, cfunc_rsp_vxor_simd, rsp); return TRUE; case 0x2d: /* VNXOR */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vnxor, rsp); + UML_CALLC(block, cfunc_rsp_vnxor_simd, rsp); return TRUE; case 0x30: /* VRCP */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcp, rsp); + UML_CALLC(block, cfunc_rsp_vrcp_simd, rsp); return TRUE; case 0x31: /* VRCPL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcpl, rsp); + UML_CALLC(block, cfunc_rsp_vrcpl_simd, rsp); return TRUE; case 0x32: /* 
VRCPH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrcph, rsp); + UML_CALLC(block, cfunc_rsp_vrcph_simd, rsp); return TRUE; case 0x33: /* VMOV */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vmov, rsp); + UML_CALLC(block, cfunc_rsp_vmov_simd, rsp); return TRUE; case 0x35: /* VRSQL */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrsql, rsp); + UML_CALLC(block, cfunc_rsp_vrsql_simd, rsp); return TRUE; case 0x36: /* VRSQH */ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l - UML_CALLC(block, cfunc_rsp_vrsqh, rsp); + UML_CALLC(block, cfunc_rsp_vrsqh_simd, rsp); return TRUE; default: @@ -5580,6 +7086,217 @@ static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_ } } +#else + +static int generate_vector_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) +{ + UINT32 op = desc->opptr.l[0]; + // Opcode legend: + // E = VS2 element type + // S = VS1, Source vector 1 + // T = VS2, Source vector 2 + // D = Destination vector + + switch (op & 0x3f) + { + case 0x00: /* VMULF */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmulf_scalar, rsp); + return TRUE; + + case 0x01: /* VMULU */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmulu_scalar, rsp); + return TRUE; + + case 0x04: /* VMUDL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudl_scalar, rsp); + return TRUE; + + case 0x05: /* VMUDM */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudm_scalar, rsp); + 
return TRUE; + + case 0x06: /* VMUDN */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudn_scalar, rsp); + return TRUE; + + case 0x07: /* VMUDH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmudh_scalar, rsp); + return TRUE; + + case 0x08: /* VMACF */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmacf_scalar, rsp); + return TRUE; + + case 0x09: /* VMACU */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmacu_scalar, rsp); + return TRUE; + + case 0x0c: /* VMADL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadl_scalar, rsp); + return TRUE; + + case 0x0d: /* VMADM */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadm_scalar, rsp); + return TRUE; + + case 0x0e: /* VMADN */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadn_scalar, rsp); + return TRUE; + + case 0x0f: /* VMADH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmadh_scalar, rsp); + return TRUE; + + case 0x10: /* VADD */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vadd_scalar, rsp); + return TRUE; + + case 0x11: /* VSUB */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsub_scalar, rsp); + return TRUE; + + case 0x13: /* VABS */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vabs_scalar, rsp); + 
return TRUE; + + case 0x14: /* VADDC */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vaddc_scalar, rsp); + return TRUE; + + case 0x15: /* VSUBC */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsubc_scalar, rsp); + return TRUE; + + case 0x1d: /* VSAW */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vsaw_scalar, rsp); + return TRUE; + + case 0x20: /* VLT */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vlt_scalar, rsp); + return TRUE; + + case 0x21: /* VEQ */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_veq_scalar, rsp); + return TRUE; + + case 0x22: /* VNE */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vne_scalar, rsp); + return TRUE; + + case 0x23: /* VGE */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vge_scalar, rsp); + return TRUE; + + case 0x24: /* VCL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vcl_scalar, rsp); + return TRUE; + + case 0x25: /* VCH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vch_scalar, rsp); + return TRUE; + + case 0x26: /* VCR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vcr_scalar, rsp); + return TRUE; + + case 0x27: /* VMRG */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmrg_scalar, rsp); + return TRUE; + + case 0x28: 
/* VAND */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vand_scalar, rsp); + return TRUE; + + case 0x29: /* VNAND */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnand_scalar, rsp); + return TRUE; + + case 0x2a: /* VOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vor_scalar, rsp); + return TRUE; + + case 0x2b: /* VNOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnor_scalar, rsp); + return TRUE; + + case 0x2c: /* VXOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vxor_scalar, rsp); + return TRUE; + + case 0x2d: /* VNXOR */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vnxor_scalar, rsp); + return TRUE; + + case 0x30: /* VRCP */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcp_scalar, rsp); + return TRUE; + + case 0x31: /* VRCPL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcpl_scalar, rsp); + return TRUE; + + case 0x32: /* VRCPH */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrcph_scalar, rsp); + return TRUE; + + case 0x33: /* VMOV */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vmov_scalar, rsp); + return TRUE; + + case 0x35: /* VRSQL */ + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrsql_scalar, rsp); + return TRUE; + + case 0x36: /* VRSQH */ 
+ UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_rsp_vrsqh_scalar, rsp); + return TRUE; + + default: + UML_MOV(block, mem(&rsp->impstate->arg0), desc->opptr.l[0]); // mov [arg0],desc->opptr.l + UML_CALLC(block, cfunc_unimplemented_opcode, rsp); + return FALSE; + } +} +#endif + static int generate_opcode(rsp_state *rsp, drcuml_block *block, compiler_state *compiler, const opcode_desc *desc) { int in_delay_slot = ((desc->flags & OPFLAG_IN_DELAY_SLOT) != 0);