From ae67c6b0ade9e416b70b5f3d499d08d8541566ff Mon Sep 17 00:00:00 2001
From: "therealmogminer@gmail.com"
Date: Tue, 30 Jun 2015 21:15:17 +0200
Subject: [PATCH] nw, more RSP opts ported from CEN64

---
 src/emu/cpu/rsp/rsp.h    |   2 +
 src/emu/cpu/rsp/rspcp2.c | 439 ++++++++++++++++++++++++++++++++++++++-
 src/emu/cpu/rsp/rspcp2.h |  32 ++-
 src/emu/cpu/rsp/vldst.h  |  67 ++++++
 4 files changed, 534 insertions(+), 6 deletions(-)
 create mode 100644 src/emu/cpu/rsp/vldst.h

diff --git a/src/emu/cpu/rsp/rsp.h b/src/emu/cpu/rsp/rsp.h
index fce4635a359..6bc588c2f01 100644
--- a/src/emu/cpu/rsp/rsp.h
+++ b/src/emu/cpu/rsp/rsp.h
@@ -163,6 +163,8 @@ public:
 	void ccfunc_sp_set_status_cb();
 	void ccfunc_unimplemented();
 
+	UINT8* get_dmem() { return m_dmem8; }
+
 protected:
 	// device-level overrides
 	virtual void device_start();
diff --git a/src/emu/cpu/rsp/rspcp2.c b/src/emu/cpu/rsp/rspcp2.c
index df55952ab4c..0d00e635d69 100644
--- a/src/emu/cpu/rsp/rspcp2.c
+++ b/src/emu/cpu/rsp/rspcp2.c
@@ -178,10 +178,58 @@ const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
 		{ 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08 },
 		{ 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0c0b, 0x0809, 0x0e0f },
 		{ 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e }
+	},
+	{ // qr_lut
+		{ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00 }
+	},
+	{ // bdls_lut - mask to denote which part of the vector to load/store.
+		{ 0x00ff, 0x0000, 0x0000, 0x0000 }, // B
+		{ 0xffff, 0x0000, 0x0000, 0x0000 }, // S
+		{ 0xffff, 0xffff, 0x0000, 0x0000 }, // L
+		{ 0xffff, 0xffff, 0xffff, 0xffff } // D
 	}
 };
 
 #ifndef __SSSE3__
+// TODO: Highly optimizable. More of a stopgap measure.
+static inline rsp_vec_t sse2_pshufb(rsp_vec_t v, const UINT16 *keys)
+{
+	UINT8 dest[16];
+	UINT8 temp[16];
+
+	_mm_storeu_si128((rsp_vec_t *) temp, v);
+
+	for (UINT32 j = 0; j < 8; j++)
+	{
+		UINT16 key = keys[j];
+		UINT8 key_hi = key >> 8;
+		UINT8 key_lo = key >> 0;
+
+		dest[(j << 1) + 1] = key_hi == 0x80 ? 0x00 : temp[key_hi];
+		dest[(j << 1) + 0] = key_lo == 0x80 ? 0x00 : temp[key_lo];
+	}
+
+	return _mm_loadu_si128((rsp_vec_t *) dest);
+}
+
 rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 element)
 {
 	if (element >= 8) // element => 0w ... 7w
@@ -234,6 +282,292 @@ rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 eleme
 	return _mm_shuffle_epi8(operand, key);
 }
 #endif
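A note on the key format consumed by sse2_pshufb above: each UINT16 key drives one 16-bit destination lane, the high byte picking the source byte for the odd destination index and the low byte for the even one, with 0x80 meaning "write zero" -- the same convention as SSSE3's _mm_shuffle_epi8. A standalone scalar sketch of that decoding (the byteswap key pattern here is illustrative, not copied from the b2l tables):

#include <stdio.h>

int main(void)
{
	unsigned char temp[16], dest[16];
	for (int i = 0; i < 16; i++)
		temp[i] = (unsigned char) i;

	// Keys that swap the two bytes of each 16-bit lane.
	const unsigned short keys[8] = { 0x0001, 0x0203, 0x0405, 0x0607,
	                                 0x0809, 0x0a0b, 0x0c0d, 0x0e0f };

	for (int j = 0; j < 8; j++)
	{
		unsigned char key_hi = keys[j] >> 8;
		unsigned char key_lo = keys[j] & 0xff;
		dest[(j << 1) + 1] = (key_hi == 0x80) ? 0x00 : temp[key_hi];
		dest[(j << 1) + 0] = (key_lo == 0x80) ? 0x00 : temp[key_lo];
	}

	for (int i = 0; i < 16; i++)
		printf("%02x ", dest[i]); // prints: 01 00 03 02 05 04 ...
	printf("\n");
	return 0;
}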
+//
+// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
+// little-endian vector. Start at vector element offset, discarding any
+// wraparound as necessary.
+//
+// TODO: Reverse-engineer what happens when loads to vector elements must
+// wraparound. Do we just discard the data, as below, or does the
+// data effectively get rotated around the edge of the vector?
+//
+void rsp_cop2::vec_load_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
+{
+	UINT32 offset = addr & 0x7;
+	UINT32 ror = offset - element;
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	rsp_vec_t data;
+	if (offset) {
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
+		rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
+		data = _mm_unpacklo_epi64(data, temp);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+	}
+
+	// Shift the DQM up to the point where we mux in the data.
+#ifndef __SSSE3__
+	dqm = sse2_pshufb(dqm, m_vec_helpers.sll_b2l_keys[element]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_b2l_keys[element]));
+	dqm = _mm_shuffle_epi8(dqm, ekey);
+#endif
+
+	// Align the data to the DQM so we can mask it in.
+#ifndef __SSSE3__
+	data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+#else
+	ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
+	data = _mm_shuffle_epi8(data, ekey);
+#endif
+
+	// Mask and mux in the data.
+#ifdef __SSE4_1__
+	reg = _mm_blendv_epi8(reg, data, dqm);
+#else
+	data = _mm_and_si128(dqm, data);
+	reg = _mm_andnot_si128(dqm, reg);
+	reg = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) regp, reg);
+}
+
+//
+// SSSE3+ accelerated loads for group II.
+//
+// TODO: Reverse-engineer what happens when loads to vector elements must
+// wraparound. Do we just discard the data, as below, or does the
+// data effectively get rotated around the edge of the vector?
+//
+// TODO: Reverse-engineer what happens when element != 0.
+//
+void rsp_cop2::vec_load_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	UINT32 offset = addr & 0x7;
+	rsp_vec_t data;
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	if (offset) {
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+		UINT64 datalow, datahigh;
+
+		memcpy(&datalow, m_rsp.get_dmem() + aligned_addr_lo, sizeof(datalow));
+		memcpy(&datahigh, m_rsp.get_dmem() + aligned_addr_hi, sizeof(datahigh));
+
+		// TODO: Test for endian issues?
+		datahigh >>= ((8 - offset) << 3);
+		datalow <<= (offset << 3);
+		datalow = datahigh | datalow;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) &datalow);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+	}
+
+	// "Unpack" the data.
+	data = _mm_unpacklo_epi8(_mm_setzero_si128(), data);
+
+	if (request_type != RSP_MEM_REQUEST_PACK)
+	{
+		data = _mm_srli_epi16(data, 1);
+	}
+
+	_mm_store_si128((rsp_vec_t *) regp, data);
+}
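The "unpack" in vec_load_group2 is worth spelling out: _mm_unpacklo_epi8 with zero as the first operand interleaves zeros below each fetched byte, leaving the byte in bits 15..8 of its 16-bit element (b << 8), which is what LPV (RSP_MEM_REQUEST_PACK) wants; the extra right shift by one moves it to bits 14..7 (b << 7) for the other request types. A standalone scalar model of that arithmetic (names here are illustrative, not part of the patch):

#include <stdint.h>
#include <assert.h>

// One element of the group II load path.
static uint16_t unpack_element(uint8_t b, int is_pack)
{
	uint16_t lane = (uint16_t) (b << 8); // _mm_unpacklo_epi8(zero, data)
	if (!is_pack)
		lane >>= 1;                      // _mm_srli_epi16(data, 1)
	return lane;
}

int main(void)
{
	assert(unpack_element(0x80, 1) == 0x8000); // LPV: byte -> bits 15..8
	assert(unpack_element(0x80, 0) == 0x4000); // LUV: byte -> bits 14..7
	return 0;
}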
+
+//
+// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
+// little-endian vector. Stop loading at quadword boundaries.
+//
+// TODO: Reverse-engineer what happens when loads from vector elements
+// must wraparound (i.e., the address offset is small, starting
+// element is large).
+//
+void rsp_cop2::vec_load_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
+{
+	UINT32 aligned_addr = addr & 0xFF0;
+	UINT32 offset = addr & 0xF;
+
+	rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
+
+	UINT32 ror;
+	if (request_type == RSP_MEM_REQUEST_QUAD)
+	{
+		ror = 16 - element + offset;
+	}
+	else
+	{
+		// TODO: How is this adjusted for LRV when e != 0?
+		dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
+		ror = 16 - offset;
+	}
+
+#ifndef __SSSE3__
+	data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+	dqm = sse2_pshufb(dqm, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+#else
+	rsp_vec_t dkey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
+	data = _mm_shuffle_epi8(data, dkey);
+	dqm = _mm_shuffle_epi8(dqm, dkey);
+#endif
+
+	// Mask and mux in the data.
+#ifdef __SSE4_1__
+	data = _mm_blendv_epi8(reg, data, dqm);
+#else
+	data = _mm_and_si128(dqm, data);
+	reg = _mm_andnot_si128(dqm, reg);
+	data = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) regp, data);
+}
+
+//
+// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
+// vector back to big-endian. Start at vector element offset, wrapping
+// around the edge of the vector as necessary.
+//
+// TODO: Reverse-engineer what happens when stores from vector elements
+// must wraparound. Do we just stop storing the data, or do we
+// continue storing from the front of the vector, as below?
+//
+void rsp_cop2::vec_store_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
+{
+	UINT32 offset = addr & 0x7;
+	UINT32 ror = element - offset;
+
+	// Shift the DQM up to the point where we mux in the data.
+#ifndef __SSSE3__
+	dqm = sse2_pshufb(dqm, m_vec_helpers.sll_l2b_keys[offset]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_l2b_keys[offset]));
+	dqm = _mm_shuffle_epi8(dqm, ekey);
+#endif
+
+	// Rotate the reg to align with the DQM.
+#ifndef __SSSE3__
+	reg = sse2_pshufb(reg, m_vec_helpers.ror_l2b_keys[ror & 0xF]);
+#else
+	ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_l2b_keys[ror & 0xF]));
+	reg = _mm_shuffle_epi8(reg, ekey);
+#endif
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	rsp_vec_t data;
+	if (offset)
+	{
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
+		rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
+		data = _mm_unpacklo_epi64(data, temp);
+
+		// Mask and mux in the data.
+#ifdef __SSE4_1__
+		data = _mm_blendv_epi8(data, reg, dqm);
+#else
+		data = _mm_andnot_si128(dqm, data);
+		reg = _mm_and_si128(dqm, reg);
+		data = _mm_or_si128(data, reg);
+#endif
+
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo), data);
+
+		data = _mm_srli_si128(data, 8);
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi), data);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+
+		// Mask and mux in the data.
+#ifdef __SSE4_1__
+		data = _mm_blendv_epi8(data, reg, dqm);
+#else
+		data = _mm_andnot_si128(dqm, data);
+		reg = _mm_and_si128(dqm, reg);
+		data = _mm_or_si128(data, reg);
+#endif
+
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), data);
+	}
+}
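All of these helpers converge on the same "mask and mux" step: where a DQM byte is 0xff the freshly rotated data wins, and where it is 0x00 the existing value survives. With SSE4.1 that is a single _mm_blendv_epi8, which keys off the top bit of each mask byte; the SSE2 fallback builds the identical result from and/andnot/or, which is safe because the DQM tables only ever hold 0x00 or 0xff bytes. A standalone sketch of the equivalence (function and variable names are illustrative, not part of the patch):

#include <emmintrin.h>
#include <assert.h>

// SSE2 stand-in for _mm_blendv_epi8(a, b, mask) under an all-or-nothing mask:
// take b where the mask byte is 0xff, keep a where it is 0x00.
static __m128i blendv_sse2(__m128i a, __m128i b, __m128i mask)
{
	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

int main(void)
{
	unsigned char av[16], bv[16], mv[16], out[16];
	for (int i = 0; i < 16; i++)
	{
		av[i] = (unsigned char) i;          // "reg" bytes
		bv[i] = (unsigned char) (0xf0 | i); // "data" bytes
		mv[i] = (i & 1) ? 0xff : 0x00;      // DQM-style mask
	}

	__m128i r = blendv_sse2(_mm_loadu_si128((__m128i *) av),
			_mm_loadu_si128((__m128i *) bv),
			_mm_loadu_si128((__m128i *) mv));
	_mm_storeu_si128((__m128i *) out, r);

	for (int i = 0; i < 16; i++)
		assert(out[i] == ((i & 1) ? bv[i] : av[i]));
	return 0;
}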
+
+//
+// SSSE3+ accelerated stores for group II. Byteswap 2-byte little-endian
+// vector back to big-endian. Start at vector element offset, wrapping
+// around the edge of the vector as necessary.
+//
+// TODO: Reverse-engineer what happens when stores from vector elements
+// must wraparound. Do we just stop storing the data, or do we
+// continue storing from the front of the vector, as below?
+//
+// TODO: Reverse-engineer what happens when element != 0.
+//
+void rsp_cop2::vec_store_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	// "Pack" the data.
+	if (request_type != RSP_MEM_REQUEST_PACK)
+	{
+		reg = _mm_slli_epi16(reg, 1);
+	}
+
+	reg = _mm_srai_epi16(reg, 8);
+	reg = _mm_packs_epi16(reg, reg);
+
+	// TODO: Always store in 8-byte chunks to emulate wraparound.
+	_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), reg);
+}
+
+//
+// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
+// vector back to big-endian. Stop storing at quadword boundaries.
+//
+void rsp_cop2::vec_store_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	UINT32 aligned_addr = addr & 0xFF0;
+	UINT32 offset = addr & 0xF;
+	UINT32 rol = offset;
+
+	rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
+
+	if (request_type == RSP_MEM_REQUEST_QUAD)
+	{
+		rol -= element;
+	}
+	else
+	{
+		// TODO: How is this adjusted for SRV when e != 0?
+		dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
+	}
+
+#ifndef __SSSE3__
+	reg = sse2_pshufb(reg, m_vec_helpers.rol_l2b_keys[rol & 0xF]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.rol_l2b_keys[rol & 0xF]));
+	reg = _mm_shuffle_epi8(reg, ekey);
+#endif
+
+	// Mask and mux out the data, write.
+#ifdef __SSE4_1__
+	data = _mm_blendv_epi8(data, reg, dqm);
+#else
+	reg = _mm_and_si128(dqm, reg);
+	data = _mm_andnot_si128(dqm, data);
+	data = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr), data);
+}
 #endif
 
 extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@@ -498,14 +832,16 @@ void rsp_cop2::state_string_export(const int index, std::string &str)
 
 void rsp_cop2::handle_lwc2(UINT32 op)
 {
+	int base = (op >> 21) & 0x1f;
+#if !USE_SIMD
 	int i, end;
 	UINT32 ea;
 	int dest = (op >> 16) & 0x1f;
-	int base = (op >> 21) & 0x1f;
 	int index = (op >> 7) & 0xf;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 		offset |= 0xffffffc0;
+#endif
 
 	switch ((op >> 11) & 0x1f)
 	{
@@ -518,8 +854,12 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Load 1 byte to vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset;
 			VREG_B(dest, index) = m_rsp.READ8(ea);
+#endif
 			break;
 		}
 		case 0x01: /* LSV */
@@ -531,6 +871,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Loads 2 bytes starting from vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2);
 
 			end = index + 2;
@@ -540,6 +883,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 				VREG_B(dest, i) = m_rsp.READ8(ea);
 				ea++;
 			}
+#endif
 			break;
 		}
 		case 0x02: /* LLV */
@@ -551,6 +895,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Loads 4 bytes starting from vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ?
m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4); end = index + 4; @@ -560,6 +907,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x03: /* LDV */ @@ -571,6 +919,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads 8 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -580,6 +931,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x04: /* LQV */ @@ -591,6 +943,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads up to 16 bytes starting from vector byte index +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = index + (16 - (ea & 0xf)); @@ -601,6 +956,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x05: /* LRV */ @@ -612,6 +968,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Stores up to 16 bytes starting from right side until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); index = 16 - ((ea & 0xf) - index); @@ -623,6 +982,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x06: /* LPV */ @@ -634,12 +994,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the upper 8 bits of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 8; } +#endif break; } case 0x07: /* LUV */ @@ -651,12 +1015,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 7; } +#endif break; } case 0x08: /* LHV */ @@ -668,12 +1036,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of each element, with 2-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + (i<<1)) & 0xf)) << 7; } +#endif break; } case 0x09: /* LFV */ @@ -685,6 +1057,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); // not sure what happens if 16-byte boundary is crossed... @@ -696,6 +1071,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_S(dest, i) = m_rsp.READ8(ea) << 7; ea += 4; } +#endif break; } case 0x0a: /* LWV */ @@ -708,6 +1084,8 @@ void rsp_cop2::handle_lwc2(UINT32 op) // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 // after byte index 15 +#if USE_SIMD +#else ea = (base) ? 
m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = (16 - index) + 16; @@ -717,6 +1095,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i & 0xf) = m_rsp.READ8(ea); ea += 4; } +#endif break; } case 0x0b: /* LTV */ @@ -730,6 +1109,8 @@ void rsp_cop2::handle_lwc2(UINT32 op) // FIXME: has a small problem with odd indices +#if USE_SIMD +#else int element; int vs = dest; int ve = dest + 8; @@ -751,6 +1132,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) ea += 2; } +#endif break; } @@ -769,15 +1151,17 @@ void rsp_cop2::handle_lwc2(UINT32 op) void rsp_cop2::handle_swc2(UINT32 op) { + int base = (op >> 21) & 0x1f; +#if !USE_SIMD int i, end; int eaoffset; UINT32 ea; int dest = (op >> 16) & 0x1f; - int base = (op >> 21) & 0x1f; int index = (op >> 7) & 0xf; int offset = (op & 0x7f); if (offset & 0x40) offset |= 0xffffffc0; +#endif switch ((op >> 11) & 0x1f) { @@ -790,8 +1174,12 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 1 byte from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset; m_rsp.WRITE8(ea, VREG_B(dest, index)); +#endif break; } case 0x01: /* SSV */ @@ -803,6 +1191,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 2 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2); end = index + 2; @@ -812,6 +1203,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x02: /* SLV */ @@ -823,6 +1215,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 4 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4); end = index + 4; @@ -832,6 +1227,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x03: /* SDV */ @@ -843,6 +1239,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 8 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -852,6 +1251,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x04: /* SQV */ @@ -863,6 +1263,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores up to 16 bytes starting from vector byte index until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = index + (16 - (ea & 0xf)); @@ -872,6 +1275,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i & 0xf)); ea++; } +#endif break; } case 0x05: /* SRV */ @@ -883,6 +1287,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores up to 16 bytes starting from right side until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else int o; ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); @@ -895,6 +1302,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, ((i + o) & 0xf))); ea++; } +#endif break; } case 0x06: /* SPV */ @@ -906,6 +1314,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores upper 8 bits of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? 
m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -921,6 +1332,7 @@ void rsp_cop2::handle_swc2(UINT32 op) } ea++; } +#endif break; } case 0x07: /* SUV */ @@ -932,6 +1344,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores bits 14-7 of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -947,6 +1362,7 @@ void rsp_cop2::handle_swc2(UINT32 op) } ea++; } +#endif break; } case 0x08: /* SHV */ @@ -958,6 +1374,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores bits 14-7 of each element, with 2-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); for (i=0; i < 8; i++) @@ -968,6 +1387,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, d); ea += 2; } +#endif break; } case 0x09: /* SFV */ @@ -981,6 +1401,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // FIXME: only works for index 0 and index 8 +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); eaoffset = ea & 0xf; @@ -993,6 +1416,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); eaoffset += 4; } +#endif break; } case 0x0a: /* SWV */ @@ -1005,6 +1429,8 @@ void rsp_cop2::handle_swc2(UINT32 op) // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 // after byte index 15 +#if USE_SIMD +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); eaoffset = ea & 0xf; @@ -1017,6 +1443,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); eaoffset++; } +#endif break; } case 0x0b: /* STV */ @@ -1028,6 +1455,8 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores one element from maximum of 8 vectors, while incrementing element index +#if USE_SIMD +#else int element; int vs = dest; int ve = dest + 8; @@ -1047,6 +1476,7 @@ void rsp_cop2::handle_swc2(UINT32 op) eaoffset += 2; element++; } +#endif break; } @@ -1981,16 +2411,21 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Stores high, middle or low slice of accumulator to destination vector #if USE_SIMD + UINT16 *acc = m_acc.s; switch (EL) { case 8: + m_v[VDREG].v = read_acc_hi(acc); break; case 9: + m_v[VDREG].v = read_acc_mid(acc); break; case 10: + m_v[VDREG].v = read_acc_lo(acc); break; default: + m_v[VDREG].v = _mm_setzero_si128(); break; } #else diff --git a/src/emu/cpu/rsp/rspcp2.h b/src/emu/cpu/rsp/rspcp2.h index 2e09cd587f4..77bbd468233 100644 --- a/src/emu/cpu/rsp/rspcp2.h +++ b/src/emu/cpu/rsp/rspcp2.h @@ -199,6 +199,23 @@ protected: RSP_ACC_HI = 0, }; + enum rsp_mem_request_type { + RSP_MEM_REQUEST_NONE, + RSP_MEM_REQUEST_INT_MEM, + RSP_MEM_REQUEST_VECTOR, + RSP_MEM_REQUEST_FOURTH, + RSP_MEM_REQUEST_HALF, + RSP_MEM_REQUEST_PACK, + RSP_MEM_REQUEST_QUAD, + RSP_MEM_REQUEST_REST, + RSP_MEM_REQUEST_UPACK + }; + + union aligned_rsp_1vect_t { + rsp_vec_t __align; + UINT16 s[8]; + }; + union aligned_rsp_2vect_t { rsp_vec_t __align[2]; UINT16 s[16]; @@ -209,6 +226,7 @@ protected: UINT16 s[24]; }; + aligned_rsp_1vect_t m_vdqm; aligned_rsp_2vect_t m_flags[3]; aligned_rsp_3vect_t m_acc; UINT32 m_dp_flag; @@ -225,11 +243,16 @@ protected: const UINT16 ror_b2l_keys[16][8]; const UINT16 rol_l2b_keys[16][8]; const UINT16 ror_l2b_keys[16][8]; + const UINT16 qr_lut[16][8]; + const UINT16 
bdls_lut[4][4]; } vec_helpers_t; static const vec_helpers_t m_vec_helpers; rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element); + static inline UINT32 sign_extend_6(INT32 i) { + return (i << (32 - 7)) >> (32 - 7); + } static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src) { return _mm_load_si128((rsp_vec_t*) src); @@ -319,11 +342,11 @@ protected: } void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); + void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); + void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); #include "clamp.h" #include "vabs.h" @@ -349,6 +372,7 @@ protected: #include "vsub.h" #include "vsubc.h" #include "vxor.h" +#include "vldst.h" #endif private: diff --git a/src/emu/cpu/rsp/vldst.h b/src/emu/cpu/rsp/vldst.h new file mode 100644 index 00000000000..eaef0ccb512 --- /dev/null +++ b/src/emu/cpu/rsp/vldst.h @@ -0,0 +1,67 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. 
Stachecki, Ryan Holtz
+
+// LBV, LDV, LLV, LSV, SBV, SDV, SLV, SSV
+inline void vec_lbdlsv_sbdlsv(UINT32 iw, UINT32 rs)
+{
+	const UINT32 shift_and_idx = (iw >> 11) & 0x3;
+	rsp_vec_t dqm = _mm_loadl_epi64((rsp_vec_t *) (m_vec_helpers.bdls_lut[shift_and_idx]));
+
+	const UINT32 addr = rs + (sign_extend_6(iw) << shift_and_idx);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
+	}
+	else
+	{
+		vec_load_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
+	}
+}
+
+// LFV, LHV, LPV, LUV, SFV, SHV, SPV, SUV
+inline void vec_lfhpuv_sfhpuv(UINT32 iw, UINT32 rs)
+{
+	static const enum rsp_mem_request_type fhpu_type_lut[4] = {
+		RSP_MEM_REQUEST_PACK,
+		RSP_MEM_REQUEST_UPACK,
+		RSP_MEM_REQUEST_HALF,
+		RSP_MEM_REQUEST_FOURTH
+	};
+
+	const UINT32 addr = rs + (sign_extend_6(iw) << 3);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	rsp_mem_request_type request_type = fhpu_type_lut[((iw >> 11) & 0x1f) - 6];
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
+	}
+	else
+	{
+		vec_load_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
+	}
+}
+
+// LQV, LRV, SQV, SRV
+inline void vec_lqrv_sqrv(UINT32 iw, UINT32 rs)
+{
+	const UINT32 addr = rs + (sign_extend_6(iw) << 4);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	memcpy(m_vdqm.s, m_vec_helpers.qr_lut[addr & 0xf], sizeof(m_vdqm.s));
+
+	rsp_mem_request_type request_type = ((iw >> 11) & 0x1) ? RSP_MEM_REQUEST_REST : RSP_MEM_REQUEST_QUAD;
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
+	}
+	else
+	{
+		vec_load_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
+	}
+}
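On the field decoding in vldst.h: the offset is the low 7 bits of the instruction word, sign-extended (the name sign_extend_6 refers to the sign bit sitting at bit 6) and then scaled by the access size; and because LWC2 is primary opcode 0b110010 while SWC2 is 0b111010, instruction bit 29 alone separates stores from loads, which is what (iw >> 29) & 0x1 tests. A standalone sketch of that decoding, assuming the standard MIPS LWC2/SWC2 encodings (the instruction word built here is illustrative):

#include <stdint.h>
#include <assert.h>

// Matches sign_extend_6() in rspcp2.h: keep the low 7 bits, signed.
static int32_t sign_extend_7bit_field(uint32_t iw)
{
	return ((int32_t) (iw << (32 - 7))) >> (32 - 7);
}

int main(void)
{
	// SSV with offset field 0x7f (-1): SWC2 (0x3a), subop 0x01 (S).
	uint32_t iw = (0x3au << 26) | (0x01u << 11) | 0x7fu;

	assert(((iw >> 29) & 0x1) == 1);                 // bit 29 set: a store
	assert(sign_extend_7bit_field(iw) == -1);        // 7-bit offset is signed
	assert((sign_extend_7bit_field(iw) << 1) == -2); // scaled by size (S = 2)
	return 0;
}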