nw, more RSP opts ported from CEN64

This commit is contained in:
therealmogminer@gmail.com 2015-06-30 21:15:17 +02:00
parent b6385b1c6e
commit ae67c6b0ad
4 changed files with 534 additions and 6 deletions

View File

@ -163,6 +163,8 @@ public:
void ccfunc_sp_set_status_cb();
void ccfunc_unimplemented();
UINT8* get_dmem() { return m_dmem8; }
protected:
// device-level overrides
virtual void device_start();

View File

@ -178,10 +178,58 @@ const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
{ 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08 },
{ 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0c0b, 0x0809, 0x0e0f },
{ 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e }
},
{ // qr_lut
{ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff },
{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00 }
},
{ // bdls_lut - mask to denote which part of the vector to load/store.
{ 0x00ff, 0x0000, 0x0000, 0x0000 }, // B
{ 0xffff, 0x0000, 0x0000, 0x0000 }, // S
{ 0xffff, 0xffff, 0x0000, 0x0000 }, // L
{ 0xffff, 0xffff, 0xffff, 0xffff } // D
}
};
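// Indexing sketch for the two new tables (illustrative, not part of this
// commit; 'addr' and 'size_idx' are hypothetical values). The low four
// address bits select a qr_lut byte mask -- used directly by the quad forms
// and inverted for the rest forms in the group IV helpers below -- while
// bdls_lut selects the access width for the byte/short/long/double forms:
//
//   rsp_vec_t qmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.qr_lut[addr & 0xF]);
//   rsp_vec_t bmask = _mm_loadl_epi64((rsp_vec_t *) m_vec_helpers.bdls_lut[size_idx]);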
#ifndef __SSSE3__
// TODO: Not highly optimized; more of a stopgap measure.
static inline rsp_vec_t sse2_pshufb(rsp_vec_t v, const UINT16 *keys)
{
UINT8 dest[16];
UINT8 temp[16];
_mm_storeu_si128((rsp_vec_t *) temp, v);
for (UINT32 j = 0; j < 8; j++)
{
UINT16 key = keys[j];
UINT8 key_hi = key >> 8;
UINT8 key_lo = key >> 0;
dest[(j << 1) + 1] = key_hi == 0x80 ? 0x00 : temp[key_hi];
dest[(j << 1) + 0] = key_lo == 0x80 ? 0x00 : temp[key_lo];
}
return _mm_loadu_si128((rsp_vec_t *) dest);
}
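// Usage sketch for the fallback above (illustrative, not part of this commit;
// sse2_pshufb_swap16_example is a hypothetical helper). Each 16-bit key packs
// two byte-source indices, and an index of 0x80 -- the only MSB-set value the
// key tables use -- writes a zero byte instead of copying one, mirroring real
// PSHUFB. This key row byteswaps every 16-bit lane of its input:
static inline rsp_vec_t sse2_pshufb_swap16_example(rsp_vec_t v)
{
static const UINT16 swap16_keys[8] = {
0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0a0b, 0x0c0d, 0x0e0f
};
// dest[2j] = src[2j + 1] and dest[2j + 1] = src[2j] for each lane j
return sse2_pshufb(v, swap16_keys);
}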
rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 element)
{
if (element >= 8) // element => 0w ... 7w
@ -234,6 +282,292 @@ rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 eleme
return _mm_shuffle_epi8(operand, key);
}
#endif
//
// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
// little-endian vector. Start at vector element offset, discarding any
// wraparound as necessary.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wrap around. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
void rsp_cop2::vec_load_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
{
UINT32 offset = addr & 0x7;
UINT32 ror = offset - element;
// Always load in 8-byte chunks to emulate wraparound.
rsp_vec_t data;
if (offset) {
UINT32 aligned_addr_lo = addr & ~0x7;
UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
data = _mm_unpacklo_epi64(data, temp);
}
else
{
data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
}
// Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
dqm = sse2_pshufb(dqm, m_vec_helpers.sll_b2l_keys[element]);
#else
rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_b2l_keys[element]));
dqm = _mm_shuffle_epi8(dqm, ekey);
#endif
// Align the data to the DQM so we can mask it in.
#ifndef __SSSE3__
data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
#else
ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
data = _mm_shuffle_epi8(data, ekey);
#endif
// Mask and mux in the data.
#ifdef __SSE4_1__
reg = _mm_blendv_epi8(reg, data, dqm);
#else
data = _mm_and_si128(dqm, data);
reg = _mm_andnot_si128(dqm, reg);
reg = _mm_or_si128(data, reg);
#endif
_mm_store_si128((rsp_vec_t *) regp, reg);
}
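// Scalar reference for the mask/mux idiom above (illustrative, not part of
// this commit; scalar_blendv_reference is a hypothetical helper). With the
// all-0x00/all-0xFF byte masks used here, the SSE2 and/andnot/or fallback
// computes the same result as _mm_blendv_epi8, which tests only the mask's
// top bit per byte:
static inline void scalar_blendv_reference(UINT8 *out, const UINT8 *dqm8, const UINT8 *data8, const UINT8 *reg8)
{
for (UINT32 i = 0; i < 16; i++)
{
// New byte where the mask is 0xFF, existing byte where it is 0x00.
out[i] = (dqm8[i] & data8[i]) | (~dqm8[i] & reg8[i]);
}
}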
//
// SSSE3+ accelerated loads for group II.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wrap around. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_cop2::vec_load_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
{
UINT32 offset = addr & 0x7;
rsp_vec_t data;
// Always load in 8-byte chunks to emulate wraparound.
if (offset) {
UINT32 aligned_addr_lo = addr & ~0x7;
UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
UINT64 datalow, datahigh;
memcpy(&datalow, m_rsp.get_dmem() + aligned_addr_lo, sizeof(datalow));
memcpy(&datahigh, m_rsp.get_dmem() + aligned_addr_hi, sizeof(datahigh));
// TODO: Test for endian issues?
datahigh >>= ((8 - offset) << 3);
datalow <<= (offset << 3);
datalow = datahigh | datalow;
data = _mm_loadl_epi64((rsp_vec_t *) &datalow);
}
else
{
data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
}
// "Unpack" the data.
data = _mm_unpacklo_epi8(_mm_setzero_si128(), data);
if (request_type != RSP_MEM_REQUEST_PACK)
{
data = _mm_srli_epi16(data, 1);
}
_mm_store_si128((rsp_vec_t *) regp, data);
}
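// Scalar reference for the unpack above (illustrative, not part of this
// commit; group2_load_lane_reference is a hypothetical helper). Interleaving
// zeroes below the data leaves each source byte in bits 15..8 of its 16-bit
// lane (the LPV/"pack" case); every other group II request then shifts right
// once, leaving the byte in bits 14..7 (the LUV case):
static inline UINT16 group2_load_lane_reference(UINT8 value, bool is_pack)
{
UINT16 lane = (UINT16) (value << 8); // RSP_MEM_REQUEST_PACK result
return is_pack ? lane : lane >> 1; // others: value << 7
}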
//
// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
// little-endian vector. Stop loading at quadword boundaries.
//
// TODO: Reverse-engineer what happens when loads from vector elements
// must wrap around (i.e., the address offset is small, starting
// element is large).
//
void rsp_cop2::vec_load_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
{
UINT32 aligned_addr = addr & 0xFF0;
UINT32 offset = addr & 0xF;
rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
UINT32 ror;
if (request_type == RSP_MEM_REQUEST_QUAD)
{
ror = 16 - element + offset;
}
else
{
// TODO: How is this adjusted for LRV when e != 0?
dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
ror = 16 - offset;
}
#ifndef __SSSE3__
data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
dqm = sse2_pshufb(dqm, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
#else
rsp_vec_t dkey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
data = _mm_shuffle_epi8(data, dkey);
dqm = _mm_shuffle_epi8(dqm, dkey);
#endif
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(reg, data, dqm);
#else
data = _mm_and_si128(dqm, data);
reg = _mm_andnot_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
_mm_store_si128((rsp_vec_t *) regp, data);
}
//
// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wrap around. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
void rsp_cop2::vec_store_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
{
UINT32 offset = addr & 0x7;
UINT32 ror = element - offset;
// Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
dqm = sse2_pshufb(dqm, m_vec_helpers.sll_l2b_keys[offset]);
#else
rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_l2b_keys[offset]));
dqm = _mm_shuffle_epi8(dqm, ekey);
#endif
// Rotate the reg to align with the DQM.
#ifndef __SSSE3__
reg = sse2_pshufb(reg, m_vec_helpers.ror_l2b_keys[ror & 0xF]);
#else
ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_l2b_keys[ror & 0xF]));
reg = _mm_shuffle_epi8(reg, ekey);
#endif
// Always load in 8-byte chunks to emulate wraparound.
rsp_vec_t data;
if (offset)
{
UINT32 aligned_addr_lo = addr & ~0x7;
UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
data = _mm_unpacklo_epi64(data, temp);
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
data = _mm_andnot_si128(dqm, data);
reg = _mm_and_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo), data);
data = _mm_srli_si128(data, 8);
_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi), data);
}
else
{
data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
// Mask and mux in the data.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
data = _mm_andnot_si128(dqm, data);
reg = _mm_and_si128(dqm, reg);
data = _mm_or_si128(data, reg);
#endif
_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), data);
}
}
//
// SSSE3+ accelerated stores for group II. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wrap around. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_cop2::vec_store_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
{
// "Pack" the data.
if (request_type != RSP_MEM_REQUEST_PACK)
{
reg = _mm_slli_epi16(reg, 1);
}
reg = _mm_srai_epi16(reg, 8);
reg = _mm_packs_epi16(reg, reg);
// TODO: Always store in 8-byte chunks to emulate wraparound.
_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), reg);
}
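// Scalar reference for the "pack" above (illustrative, not part of this
// commit; group2_store_byte_reference is a hypothetical helper). PACK-style
// requests (SPV) store bits 15..8 of each lane; the others pre-shift left so
// bits 14..7 land in the stored byte. After the arithmetic shift the value
// already fits in a signed byte, so the saturating pack is a plain narrowing:
static inline INT8 group2_store_byte_reference(INT16 lane, bool is_pack)
{
if (!is_pack)
{
lane <<= 1; // SUV and friends: expose bits 14..7
}
return (INT8) (lane >> 8);
}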
//
// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
// vector back to big-endian. Stop storing at quadword boundaries.
//
void rsp_cop2::vec_store_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
{
UINT32 aligned_addr = addr & 0xFF0;
UINT32 offset = addr & 0xF;
UINT32 rol = offset;
rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
if (request_type == RSP_MEM_REQUEST_QUAD)
{
rol -= element;
}
else
{
// TODO: How is this adjusted for SRV when e != 0?
dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
}
#ifndef __SSSE3__
reg = sse2_pshufb(reg, m_vec_helpers.rol_l2b_keys[rol & 0xF]);
#else
rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.rol_l2b_keys[rol & 0xF]));
reg = _mm_shuffle_epi8(reg, ekey);
#endif
// Mask and mux out the data, write.
#ifdef __SSE4_1__
data = _mm_blendv_epi8(data, reg, dqm);
#else
reg = _mm_and_si128(dqm, reg);
data = _mm_andnot_si128(dqm, data);
data = _mm_or_si128(data, reg);
#endif
_mm_store_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr), data);
}
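// Note on the rotation above (illustrative, not part of this commit): 'rol'
// starts at the address offset and, for the quad form (SQV), backs off by
// the starting element so that vector byte 'element' lands at 'addr'; the
// rest form (SRV) keeps rol == offset and instead inverts the qr_lut-derived
// mask, so the bytes on the other side of the quadword boundary are written.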
#endif
extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@ -498,14 +832,16 @@ void rsp_cop2::state_string_export(const int index, std::string &str)
void rsp_cop2::handle_lwc2(UINT32 op)
{
int base = (op >> 21) & 0x1f;
#if !USE_SIMD
int i, end;
UINT32 ea;
int dest = (op >> 16) & 0x1f;
int index = (op >> 7) & 0xf;
int offset = (op & 0x7f);
if (offset & 0x40)
offset |= 0xffffffc0;
#endif
switch ((op >> 11) & 0x1f)
{
@ -518,8 +854,12 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Load 1 byte to vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset;
VREG_B(dest, index) = m_rsp.READ8(ea);
#endif
break;
}
case 0x01: /* LSV */
@ -531,6 +871,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads 2 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2);
end = index + 2;
@ -540,6 +883,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i) = m_rsp.READ8(ea);
ea++;
}
#endif
break;
}
case 0x02: /* LLV */
@ -551,6 +895,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads 4 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4);
end = index + 4;
@ -560,6 +907,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i) = m_rsp.READ8(ea);
ea++;
}
#endif
break;
}
case 0x03: /* LDV */
@ -571,6 +919,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads 8 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
end = index + 8;
@ -580,6 +931,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i) = m_rsp.READ8(ea);
ea++;
}
#endif
break;
}
case 0x04: /* LQV */
@ -591,6 +943,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads up to 16 bytes starting from vector byte index
#if USE_SIMD
vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
end = index + (16 - (ea & 0xf));
@ -601,6 +956,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i) = m_rsp.READ8(ea);
ea++;
}
#endif
break;
}
case 0x05: /* LRV */
@ -612,6 +968,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Stores up to 16 bytes starting from right side until 16-byte boundary
#if USE_SIMD
vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
index = 16 - ((ea & 0xf) - index);
@ -623,6 +982,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i) = m_rsp.READ8(ea);
ea++;
}
#endif
break;
}
case 0x06: /* LPV */
@ -634,12 +994,16 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads a byte as the upper 8 bits of each element
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
for (i=0; i < 8; i++)
{
VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 8;
}
#endif
break;
}
case 0x07: /* LUV */
@ -651,12 +1015,16 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads a byte as the bits 14-7 of each element
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
for (i=0; i < 8; i++)
{
VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 7;
}
#endif
break;
}
case 0x08: /* LHV */
@ -668,12 +1036,16 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads a byte as the bits 14-7 of each element, with 2-byte stride
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
for (i=0; i < 8; i++)
{
VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + (i<<1)) & 0xf)) << 7;
}
#endif
break;
}
case 0x09: /* LFV */
@ -685,6 +1057,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
//
// Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
// not sure what happens if 16-byte boundary is crossed...
@ -696,6 +1071,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_S(dest, i) = m_rsp.READ8(ea) << 7;
ea += 4;
}
#endif
break;
}
case 0x0a: /* LWV */
@ -708,6 +1084,8 @@ void rsp_cop2::handle_lwc2(UINT32 op)
// Loads the full 128-bit vector starting from vector byte index and wrapping to index 0
// after byte index 15
#if USE_SIMD
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
end = (16 - index) + 16;
@ -717,6 +1095,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
VREG_B(dest, i & 0xf) = m_rsp.READ8(ea);
ea += 4;
}
#endif
break;
}
case 0x0b: /* LTV */
@ -730,6 +1109,8 @@ void rsp_cop2::handle_lwc2(UINT32 op)
// FIXME: has a small problem with odd indices
#if USE_SIMD
#else
int element;
int vs = dest;
int ve = dest + 8;
@ -751,6 +1132,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
ea += 2;
}
#endif
break;
}
@ -769,15 +1151,17 @@ void rsp_cop2::handle_lwc2(UINT32 op)
void rsp_cop2::handle_swc2(UINT32 op)
{
int base = (op >> 21) & 0x1f;
#if !USE_SIMD
int i, end;
int eaoffset;
UINT32 ea;
int dest = (op >> 16) & 0x1f;
int index = (op >> 7) & 0xf;
int offset = (op & 0x7f);
if (offset & 0x40)
offset |= 0xffffffc0;
#endif
switch ((op >> 11) & 0x1f)
{
@ -790,8 +1174,12 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores 1 byte from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset;
m_rsp.WRITE8(ea, VREG_B(dest, index));
#endif
break;
}
case 0x01: /* SSV */
@ -803,6 +1191,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores 2 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2);
end = index + 2;
@ -812,6 +1203,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, VREG_B(dest, i));
ea++;
}
#endif
break;
}
case 0x02: /* SLV */
@ -823,6 +1215,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores 4 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4);
end = index + 4;
@ -832,6 +1227,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, VREG_B(dest, i));
ea++;
}
#endif
break;
}
case 0x03: /* SDV */
@ -843,6 +1239,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores 8 bytes starting from vector byte index
#if USE_SIMD
vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
end = index + 8;
@ -852,6 +1251,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, VREG_B(dest, i));
ea++;
}
#endif
break;
}
case 0x04: /* SQV */
@ -863,6 +1263,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores up to 16 bytes starting from vector byte index until 16-byte boundary
#if USE_SIMD
vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
end = index + (16 - (ea & 0xf));
@ -872,6 +1275,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, VREG_B(dest, i & 0xf));
ea++;
}
#endif
break;
}
case 0x05: /* SRV */
@ -883,6 +1287,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores up to 16 bytes starting from right side until 16-byte boundary
#if USE_SIMD
vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]);
#else
int o;
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
@ -895,6 +1302,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, VREG_B(dest, ((i + o) & 0xf)));
ea++;
}
#endif
break;
}
case 0x06: /* SPV */
@ -906,6 +1314,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores upper 8 bits of each element
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
end = index + 8;
@ -921,6 +1332,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
}
ea++;
}
#endif
break;
}
case 0x07: /* SUV */
@ -932,6 +1344,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores bits 14-7 of each element
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8);
end = index + 8;
@ -947,6 +1362,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
}
ea++;
}
#endif
break;
}
case 0x08: /* SHV */
@ -958,6 +1374,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores bits 14-7 of each element, with 2-byte stride
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
for (i=0; i < 8; i++)
@ -968,6 +1387,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea, d);
ea += 2;
}
#endif
break;
}
case 0x09: /* SFV */
@ -981,6 +1401,9 @@ void rsp_cop2::handle_swc2(UINT32 op)
// FIXME: only works for index 0 and index 8
#if USE_SIMD
vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]);
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
eaoffset = ea & 0xf;
@ -993,6 +1416,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7);
eaoffset += 4;
}
#endif
break;
}
case 0x0a: /* SWV */
@ -1005,6 +1429,8 @@ void rsp_cop2::handle_swc2(UINT32 op)
// Stores the full 128-bit vector starting from vector byte index and wrapping to index 0
// after byte index 15
#if USE_SIMD
#else
ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16);
eaoffset = ea & 0xf;
@ -1017,6 +1443,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf));
eaoffset++;
}
#endif
break;
}
case 0x0b: /* STV */
@ -1028,6 +1455,8 @@ void rsp_cop2::handle_swc2(UINT32 op)
//
// Stores one element from maximum of 8 vectors, while incrementing element index
#if USE_SIMD
#else
int element;
int vs = dest;
int ve = dest + 8;
@ -1047,6 +1476,7 @@ void rsp_cop2::handle_swc2(UINT32 op)
eaoffset += 2;
element++;
}
#endif
break;
}
@ -1981,16 +2411,21 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Stores high, middle or low slice of accumulator to destination vector
#if USE_SIMD
UINT16 *acc = m_acc.s;
switch (EL)
{
case 8:
m_v[VDREG].v = read_acc_hi(acc);
break;
case 9:
m_v[VDREG].v = read_acc_mid(acc);
break;
case 10:
m_v[VDREG].v = read_acc_lo(acc);
break;
default:
m_v[VDREG].v = _mm_setzero_si128();
break;
}
#else

View File

@ -199,6 +199,23 @@ protected:
RSP_ACC_HI = 0,
};
enum rsp_mem_request_type {
RSP_MEM_REQUEST_NONE,
RSP_MEM_REQUEST_INT_MEM,
RSP_MEM_REQUEST_VECTOR,
RSP_MEM_REQUEST_FOURTH,
RSP_MEM_REQUEST_HALF,
RSP_MEM_REQUEST_PACK,
RSP_MEM_REQUEST_QUAD,
RSP_MEM_REQUEST_REST,
RSP_MEM_REQUEST_UPACK
};
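// Usage note (illustrative, not part of this commit): of these, the new
// vldst.h helpers pass PACK/UPACK/HALF/FOURTH for the LPV/LUV/LHV/LFV forms
// and QUAD/REST for the LQV/LRV forms; the remaining values are carried over
// from the CEN64 request set this enum was ported from.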
union aligned_rsp_1vect_t {
rsp_vec_t __align;
UINT16 s[8];
};
union aligned_rsp_2vect_t {
rsp_vec_t __align[2];
UINT16 s[16];
@ -209,6 +226,7 @@ protected:
UINT16 s[24];
};
aligned_rsp_1vect_t m_vdqm;
aligned_rsp_2vect_t m_flags[3];
aligned_rsp_3vect_t m_acc;
UINT32 m_dp_flag;
@ -225,11 +243,16 @@ protected:
const UINT16 ror_b2l_keys[16][8];
const UINT16 rol_l2b_keys[16][8];
const UINT16 ror_l2b_keys[16][8];
const UINT16 qr_lut[16][8];
const UINT16 bdls_lut[4][4];
} vec_helpers_t;
static const vec_helpers_t m_vec_helpers;
rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element);
static inline UINT32 sign_extend_6(INT32 i) {
return (i << (32 - 7)) >> (32 - 7);
}
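// Sanity note (illustrative, not part of this commit): the offset field
// occupies the low seven bits of the instruction word with its sign at bit 6,
// hence the (32 - 7) shifts. E.g. sign_extend_6(0x3f) == 0x3f while
// sign_extend_6(0x40) == 0xffffffc0, matching the scalar code's
// 'offset |= 0xffffffc0' path.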
static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src)
{
return _mm_load_si128((rsp_vec_t*) src);
@ -319,11 +342,11 @@ protected:
}
void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type);
#include "clamp.h"
#include "vabs.h"
@ -349,6 +372,7 @@ protected:
#include "vsub.h"
#include "vsubc.h"
#include "vxor.h"
#include "vldst.h"
#endif
private:

src/emu/cpu/rsp/vldst.h (new file, 67 lines)
View File

@ -0,0 +1,67 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki, Ryan Holtz
// LBV, LDV, LLV, LSV, SBV, SDV, SLV, SSV
inline void vec_lbdlsv_sbdlsv(UINT32 iw, UINT32 rs)
{
const UINT32 shift_and_idx = (iw >> 11) & 0x3;
rsp_vec_t dqm = _mm_loadl_epi64((rsp_vec_t *) (m_vec_helpers.bdls_lut[shift_and_idx]));
const UINT32 addr = rs + (sign_extend_6(iw) << shift_and_idx);
const UINT32 element = (iw >> 21) & 0xf;
UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
if ((iw >> 29) & 0x1)
{
vec_store_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
}
else
{
vec_load_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
}
}
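// Worked decode (illustrative, not part of this commit; register and offset
// values are assumed): for an LDV such as 'ldv $v3[0], 0x10(r4)', bits 13..11
// of the word give 3 (doubleword), so shift_and_idx = 3 selects the 8-byte
// bdls_lut mask and scales the signed offset field by 8:
// addr = r4 + (2 << 3) = r4 + 0x10, element = 0.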
// LPV, LUV, SPV, SUV
inline void vec_lfhpuv_sfhpuv(UINT32 iw, UINT32 rs)
{
static const enum rsp_mem_request_type fhpu_type_lut[4] = {
RSP_MEM_REQUEST_PACK,
RSP_MEM_REQUEST_UPACK,
RSP_MEM_REQUEST_HALF,
RSP_MEM_REQUEST_FOURTH
};
const UINT32 addr = rs + (sign_extend_6(iw) << 3);
const UINT32 element = (iw >> 21) & 0xf;
UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
rsp_mem_request_type request_type = fhpu_type_lut[((iw >> 11) & 0x1f) - 6];
if ((iw >> 29) & 0x1)
{
vec_store_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
}
else
{
vec_load_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
}
}
// LQV, LRV, SQV, SRV
inline void vec_lqrv_sqrv(UINT32 iw, UINT32 rs)
{
const UINT32 addr = rs + (sign_extend_6(iw) << 4);
const UINT32 element = (iw >> 21) & 0xf;
UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
memcpy(m_vdqm.s, m_vec_helpers.qr_lut[addr & 0xf], sizeof(m_vdqm.s));
rsp_mem_request_type request_type = ((iw >> 11) & 0x1) ? RSP_MEM_REQUEST_REST : RSP_MEM_REQUEST_QUAD;
if ((iw >> 29) & 0x1)
{
vec_store_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
}
else
{
vec_load_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
}
}
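// Decode note (illustrative, not part of this commit): bit 11 separates the
// quad and rest forms -- LQV/SQV carry funct 0x04 (bit clear, so
// RSP_MEM_REQUEST_QUAD) while LRV/SRV carry funct 0x05 (bit set, so
// RSP_MEM_REQUEST_REST) -- and bit 29 separates SWC2 stores (opcode 0x3a)
// from LWC2 loads (opcode 0x32) in all three dispatchers above.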