From ae67c6b0ade9e416b70b5f3d499d08d8541566ff Mon Sep 17 00:00:00 2001
From: "therealmogminer@gmail.com"
Date: Tue, 30 Jun 2015 21:15:17 +0200
Subject: [PATCH] nw, more RSP opts ported from CEN64

---
 src/emu/cpu/rsp/rsp.h    |   2 +
 src/emu/cpu/rsp/rspcp2.c | 439 ++++++++++++++++++++++++++++++++++++++-
 src/emu/cpu/rsp/rspcp2.h |  32 ++-
 src/emu/cpu/rsp/vldst.h  |  67 ++++++
 4 files changed, 534 insertions(+), 6 deletions(-)
 create mode 100644 src/emu/cpu/rsp/vldst.h

diff --git a/src/emu/cpu/rsp/rsp.h b/src/emu/cpu/rsp/rsp.h
index fce4635a359..6bc588c2f01 100644
--- a/src/emu/cpu/rsp/rsp.h
+++ b/src/emu/cpu/rsp/rsp.h
@@ -163,6 +163,8 @@ public:
 	void ccfunc_sp_set_status_cb();
 	void ccfunc_unimplemented();
 
+	UINT8* get_dmem() { return m_dmem8; }
+
 protected:
 	// device-level overrides
 	virtual void device_start();
diff --git a/src/emu/cpu/rsp/rspcp2.c b/src/emu/cpu/rsp/rspcp2.c
index df55952ab4c..0d00e635d69 100644
--- a/src/emu/cpu/rsp/rspcp2.c
+++ b/src/emu/cpu/rsp/rspcp2.c
@@ -178,10 +178,58 @@ const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
 		{ 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08 },
 		{ 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0c0b, 0x0809, 0x0e0f },
 		{ 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e }
+	},
+	{ // qr_lut
+		{ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff, 0xffff },
+
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff },
+		{ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xff00 }
+	},
+	{ // bdls_lut - mask to denote which part of the vector to load/store.
+		{ 0x00ff, 0x0000, 0x0000, 0x0000 }, // B
+		{ 0xffff, 0x0000, 0x0000, 0x0000 }, // S
+		{ 0xffff, 0xffff, 0x0000, 0x0000 }, // L
+		{ 0xffff, 0xffff, 0xffff, 0xffff } // D
 	}
 };
 
 #ifndef __SSSE3__
+// TODO: Highly optimizable. More of a stopgap measure.
+static inline rsp_vec_t sse2_pshufb(rsp_vec_t v, const UINT16 *keys)
+{
+	UINT8 dest[16];
+	UINT8 temp[16];
+
+	_mm_storeu_si128((rsp_vec_t *) temp, v);
+
+	for (UINT32 j = 0; j < 8; j++)
+	{
+		UINT16 key = keys[j];
+		UINT8 key_hi = key >> 8;
+		UINT8 key_lo = key >> 0;
+
+		dest[(j << 1) + 1] = key_hi == 0x80 ? 0x00 : temp[key_hi];
+		dest[(j << 1) + 0] = key_lo == 0x80 ? 0x00 : temp[key_lo];
+	}
+
+	return _mm_loadu_si128((rsp_vec_t *) dest);
+}
+
 rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 element)
 {
 	if (element >= 8) // element => 0w ... 7w
@@ -234,6 +282,292 @@ rsp_vec_t rsp_cop2::vec_load_and_shuffle_operand(const UINT16* src, UINT32 eleme
 	return _mm_shuffle_epi8(operand, key);
 }
 #endif
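A note on the key format consumed by sse2_pshufb above: each UINT16 key drives one 16-bit destination lane, the high byte picking the source byte for the odd destination index and the low byte for the even one, with 0x80 meaning "write zero" -- the same convention as SSSE3's _mm_shuffle_epi8. A standalone scalar sketch of that decoding (the byteswap key pattern here is illustrative, not copied from the b2l tables):

#include <stdio.h>

int main(void)
{
	unsigned char temp[16], dest[16];
	for (int i = 0; i < 16; i++)
		temp[i] = (unsigned char) i;

	// Keys that swap the two bytes of each 16-bit lane.
	const unsigned short keys[8] = { 0x0001, 0x0203, 0x0405, 0x0607,
	                                 0x0809, 0x0a0b, 0x0c0d, 0x0e0f };

	for (int j = 0; j < 8; j++)
	{
		unsigned char key_hi = keys[j] >> 8;
		unsigned char key_lo = keys[j] & 0xff;
		dest[(j << 1) + 1] = (key_hi == 0x80) ? 0x00 : temp[key_hi];
		dest[(j << 1) + 0] = (key_lo == 0x80) ? 0x00 : temp[key_lo];
	}

	for (int i = 0; i < 16; i++)
		printf("%02x ", dest[i]); // prints: 01 00 03 02 05 04 ...
	printf("\n");
	return 0;
}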
+//
+// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
+// little-endian vector. Start at vector element offset, discarding any
+// wraparound as necessary.
+//
+// TODO: Reverse-engineer what happens when loads to vector elements must
+// wraparound. Do we just discard the data, as below, or does the
+// data effectively get rotated around the edge of the vector?
+//
+void rsp_cop2::vec_load_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
+{
+	UINT32 offset = addr & 0x7;
+	UINT32 ror = offset - element;
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	rsp_vec_t data;
+	if (offset) {
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
+		rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
+		data = _mm_unpacklo_epi64(data, temp);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+	}
+
+	// Shift the DQM up to the point where we mux in the data.
+#ifndef __SSSE3__
+	dqm = sse2_pshufb(dqm, m_vec_helpers.sll_b2l_keys[element]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_b2l_keys[element]));
+	dqm = _mm_shuffle_epi8(dqm, ekey);
+#endif
+
+	// Align the data to the DQM so we can mask it in.
+#ifndef __SSSE3__
+	data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+#else
+	ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
+	data = _mm_shuffle_epi8(data, ekey);
+#endif
+
+	// Mask and mux in the data.
+#ifdef __SSE4_1__
+	reg = _mm_blendv_epi8(reg, data, dqm);
+#else
+	data = _mm_and_si128(dqm, data);
+	reg = _mm_andnot_si128(dqm, reg);
+	reg = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) regp, reg);
+}
+
+//
+// SSSE3+ accelerated loads for group II.
+//
+// TODO: Reverse-engineer what happens when loads to vector elements must
+// wraparound. Do we just discard the data, as below, or does the
+// data effectively get rotated around the edge of the vector?
+//
+// TODO: Reverse-engineer what happens when element != 0.
+//
+void rsp_cop2::vec_load_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	UINT32 offset = addr & 0x7;
+	rsp_vec_t data;
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	if (offset) {
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+		UINT64 datalow, datahigh;
+
+		memcpy(&datalow, m_rsp.get_dmem() + aligned_addr_lo, sizeof(datalow));
+		memcpy(&datahigh, m_rsp.get_dmem() + aligned_addr_hi, sizeof(datahigh));
+
+		// TODO: Test for endian issues?
+		datahigh >>= ((8 - offset) << 3);
+		datalow <<= (offset << 3);
+		datalow = datahigh | datalow;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) &datalow);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+	}
+
+	// "Unpack" the data.
+	data = _mm_unpacklo_epi8(_mm_setzero_si128(), data);
+
+	if (request_type != RSP_MEM_REQUEST_PACK)
+	{
+		data = _mm_srli_epi16(data, 1);
+	}
+
+	_mm_store_si128((rsp_vec_t *) regp, data);
+}
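The "unpack" in vec_load_group2 is worth spelling out: _mm_unpacklo_epi8 with zero as the first operand interleaves zeros below each fetched byte, leaving the byte in bits 15..8 of its 16-bit element (b << 8), which is what LPV (RSP_MEM_REQUEST_PACK) wants; the extra right shift by one moves it to bits 14..7 (b << 7) for the other request types. A standalone scalar model of that arithmetic (names here are illustrative, not part of the patch):

#include <stdint.h>
#include <assert.h>

// One element of the group II load path.
static uint16_t unpack_element(uint8_t b, int is_pack)
{
	uint16_t lane = (uint16_t) (b << 8); // _mm_unpacklo_epi8(zero, data)
	if (!is_pack)
		lane >>= 1;                      // _mm_srli_epi16(data, 1)
	return lane;
}

int main(void)
{
	assert(unpack_element(0x80, 1) == 0x8000); // LPV: byte -> bits 15..8
	assert(unpack_element(0x80, 0) == 0x4000); // LUV: byte -> bits 14..7
	return 0;
}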
+
+//
+// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
+// little-endian vector. Stop loading at quadword boundaries.
+//
+// TODO: Reverse-engineer what happens when loads from vector elements
+// must wraparound (i.e., the address offset is small, starting
+// element is large).
+//
+void rsp_cop2::vec_load_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type)
+{
+	UINT32 aligned_addr = addr & 0xFF0;
+	UINT32 offset = addr & 0xF;
+
+	rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
+
+	UINT32 ror;
+	if (request_type == RSP_MEM_REQUEST_QUAD)
+	{
+		ror = 16 - element + offset;
+	}
+	else
+	{
+		// TODO: How is this adjusted for LRV when e != 0?
+		dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
+		ror = 16 - offset;
+	}
+
+#ifndef __SSSE3__
+	data = sse2_pshufb(data, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+	dqm = sse2_pshufb(dqm, m_vec_helpers.ror_b2l_keys[ror & 0xF]);
+#else
+	rsp_vec_t dkey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_b2l_keys[ror & 0xF]));
+	data = _mm_shuffle_epi8(data, dkey);
+	dqm = _mm_shuffle_epi8(dqm, dkey);
+#endif
+
+	// Mask and mux in the data.
+#ifdef __SSE4_1__
+	data = _mm_blendv_epi8(reg, data, dqm);
+#else
+	data = _mm_and_si128(dqm, data);
+	reg = _mm_andnot_si128(dqm, reg);
+	data = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) regp, data);
+}
+
+//
+// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
+// vector back to big-endian. Start at vector element offset, wrapping
+// around the edge of the vector as necessary.
+//
+// TODO: Reverse-engineer what happens when stores from vector elements
+// must wraparound. Do we just stop storing the data, or do we
+// continue storing from the front of the vector, as below?
+//
+void rsp_cop2::vec_store_group1(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm)
+{
+	UINT32 offset = addr & 0x7;
+	UINT32 ror = element - offset;
+
+	// Shift the DQM up to the point where we mux in the data.
+#ifndef __SSSE3__
+	dqm = sse2_pshufb(dqm, m_vec_helpers.sll_l2b_keys[offset]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.sll_l2b_keys[offset]));
+	dqm = _mm_shuffle_epi8(dqm, ekey);
+#endif
+
+	// Rotate the reg to align with the DQM.
+#ifndef __SSSE3__
+	reg = sse2_pshufb(reg, m_vec_helpers.ror_l2b_keys[ror & 0xF]);
+#else
+	ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.ror_l2b_keys[ror & 0xF]));
+	reg = _mm_shuffle_epi8(reg, ekey);
+#endif
+
+	// Always load in 8-byte chunks to emulate wraparound.
+	rsp_vec_t data;
+	if (offset)
+	{
+		UINT32 aligned_addr_lo = addr & ~0x7;
+		UINT32 aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
+
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo));
+		rsp_vec_t temp = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi));
+		data = _mm_unpacklo_epi64(data, temp);
+
+		// Mask and mux in the data.
+#ifdef __SSE4_1__
+		data = _mm_blendv_epi8(data, reg, dqm);
+#else
+		data = _mm_andnot_si128(dqm, data);
+		reg = _mm_and_si128(dqm, reg);
+		data = _mm_or_si128(data, reg);
+#endif
+
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_lo), data);
+
+		data = _mm_srli_si128(data, 8);
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr_hi), data);
+	}
+	else
+	{
+		data = _mm_loadl_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr));
+
+		// Mask and mux in the data.
+#ifdef __SSE4_1__
+		data = _mm_blendv_epi8(data, reg, dqm);
+#else
+		data = _mm_andnot_si128(dqm, data);
+		reg = _mm_and_si128(dqm, reg);
+		data = _mm_or_si128(data, reg);
+#endif
+
+		_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), data);
+	}
+}
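All of these helpers converge on the same "mask and mux" step: where a DQM byte is 0xff the freshly rotated data wins, and where it is 0x00 the existing value survives. With SSE4.1 that is a single _mm_blendv_epi8, which keys off the top bit of each mask byte; the SSE2 fallback builds the identical result from and/andnot/or, which is safe because the DQM tables only ever hold 0x00 or 0xff bytes. A standalone sketch of the equivalence (function and variable names are illustrative, not part of the patch):

#include <emmintrin.h>
#include <assert.h>

// SSE2 stand-in for _mm_blendv_epi8(a, b, mask) under an all-or-nothing mask:
// take b where the mask byte is 0xff, keep a where it is 0x00.
static __m128i blendv_sse2(__m128i a, __m128i b, __m128i mask)
{
	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}

int main(void)
{
	unsigned char av[16], bv[16], mv[16], out[16];
	for (int i = 0; i < 16; i++)
	{
		av[i] = (unsigned char) i;          // "reg" bytes
		bv[i] = (unsigned char) (0xf0 | i); // "data" bytes
		mv[i] = (i & 1) ? 0xff : 0x00;      // DQM-style mask
	}

	__m128i r = blendv_sse2(_mm_loadu_si128((__m128i *) av),
			_mm_loadu_si128((__m128i *) bv),
			_mm_loadu_si128((__m128i *) mv));
	_mm_storeu_si128((__m128i *) out, r);

	for (int i = 0; i < 16; i++)
		assert(out[i] == ((i & 1) ? bv[i] : av[i]));
	return 0;
}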
+
+//
+// SSSE3+ accelerated stores for group II. Byteswap 2-byte little-endian
+// vector back to big-endian. Start at vector element offset, wrapping
+// around the edge of the vector as necessary.
+//
+// TODO: Reverse-engineer what happens when stores from vector elements
+// must wraparound. Do we just stop storing the data, or do we
+// continue storing from the front of the vector, as below?
+//
+// TODO: Reverse-engineer what happens when element != 0.
+//
+void rsp_cop2::vec_store_group2(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	// "Pack" the data.
+	if (request_type != RSP_MEM_REQUEST_PACK)
+	{
+		reg = _mm_slli_epi16(reg, 1);
+	}
+
+	reg = _mm_srai_epi16(reg, 8);
+	reg = _mm_packs_epi16(reg, reg);
+
+	// TODO: Always store in 8-byte chunks to emulate wraparound.
+	_mm_storel_epi64((rsp_vec_t *) (m_rsp.get_dmem() + addr), reg);
+}
+
+//
+// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
+// vector back to big-endian. Stop storing at quadword boundaries.
+//
+void rsp_cop2::vec_store_group4(UINT32 addr, UINT32 element, UINT16 *regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type) {
+	UINT32 aligned_addr = addr & 0xFF0;
+	UINT32 offset = addr & 0xF;
+	UINT32 rol = offset;
+
+	rsp_vec_t data = _mm_load_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr));
+
+	if (request_type == RSP_MEM_REQUEST_QUAD)
+	{
+		rol -= element;
+	}
+	else
+	{
+		// TODO: How is this adjusted for SRV when e != 0?
+		dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
+	}
+
+#ifndef __SSSE3__
+	reg = sse2_pshufb(reg, m_vec_helpers.rol_l2b_keys[rol & 0xF]);
+#else
+	rsp_vec_t ekey = _mm_load_si128((rsp_vec_t *) (m_vec_helpers.rol_l2b_keys[rol & 0xF]));
+	reg = _mm_shuffle_epi8(reg, ekey);
+#endif
+
+	// Mask and mux out the data, write.
+#ifdef __SSE4_1__
+	data = _mm_blendv_epi8(data, reg, dqm);
+#else
+	reg = _mm_and_si128(dqm, reg);
+	data = _mm_andnot_si128(dqm, data);
+	data = _mm_or_si128(data, reg);
+#endif
+
+	_mm_store_si128((rsp_vec_t *) (m_rsp.get_dmem() + aligned_addr), data);
+}
 #endif
 
 extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@@ -498,14 +832,16 @@ void rsp_cop2::state_string_export(const int index, std::string &str)
 
 void rsp_cop2::handle_lwc2(UINT32 op)
 {
+	int base = (op >> 21) & 0x1f;
+#if !USE_SIMD
 	int i, end;
 	UINT32 ea;
 	int dest = (op >> 16) & 0x1f;
-	int base = (op >> 21) & 0x1f;
 	int index = (op >> 7) & 0xf;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 		offset |= 0xffffffc0;
+#endif
 
 	switch ((op >> 11) & 0x1f)
 	{
@@ -518,8 +854,12 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Load 1 byte to vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset;
 			VREG_B(dest, index) = m_rsp.READ8(ea);
+#endif
 			break;
 		}
 		case 0x01: /* LSV */
@@ -531,6 +871,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Loads 2 bytes starting from vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2);
 
 			end = index + 2;
@@ -540,6 +883,7 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 				VREG_B(dest, i) = m_rsp.READ8(ea);
 				ea++;
 			}
+#endif
 			break;
 		}
 		case 0x02: /* LLV */
@@ -551,6 +895,9 @@ void rsp_cop2::handle_lwc2(UINT32 op)
 			//
 			// Loads 4 bytes starting from vector byte index
 
+#if USE_SIMD
+			vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]);
+#else
 			ea = (base) ?
m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4); end = index + 4; @@ -560,6 +907,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x03: /* LDV */ @@ -571,6 +919,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads 8 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -580,6 +931,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x04: /* LQV */ @@ -591,6 +943,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads up to 16 bytes starting from vector byte index +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = index + (16 - (ea & 0xf)); @@ -601,6 +956,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x05: /* LRV */ @@ -612,6 +968,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Stores up to 16 bytes starting from right side until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); index = 16 - ((ea & 0xf) - index); @@ -623,6 +982,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i) = m_rsp.READ8(ea); ea++; } +#endif break; } case 0x06: /* LPV */ @@ -634,12 +994,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the upper 8 bits of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 8; } +#endif break; } case 0x07: /* LUV */ @@ -651,12 +1015,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + i) & 0xf)) << 7; } +#endif break; } case 0x08: /* LHV */ @@ -668,12 +1036,16 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of each element, with 2-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); for (i=0; i < 8; i++) { VREG_S(dest, i) = m_rsp.READ8(ea + (((16-index) + (i<<1)) & 0xf)) << 7; } +#endif break; } case 0x09: /* LFV */ @@ -685,6 +1057,9 @@ void rsp_cop2::handle_lwc2(UINT32 op) // // Loads a byte as the bits 14-7 of upper or lower quad, with 4-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); // not sure what happens if 16-byte boundary is crossed... @@ -696,6 +1071,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_S(dest, i) = m_rsp.READ8(ea) << 7; ea += 4; } +#endif break; } case 0x0a: /* LWV */ @@ -708,6 +1084,8 @@ void rsp_cop2::handle_lwc2(UINT32 op) // Loads the full 128-bit vector starting from vector byte index and wrapping to index 0 // after byte index 15 +#if USE_SIMD +#else ea = (base) ? 
m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = (16 - index) + 16; @@ -717,6 +1095,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) VREG_B(dest, i & 0xf) = m_rsp.READ8(ea); ea += 4; } +#endif break; } case 0x0b: /* LTV */ @@ -730,6 +1109,8 @@ void rsp_cop2::handle_lwc2(UINT32 op) // FIXME: has a small problem with odd indices +#if USE_SIMD +#else int element; int vs = dest; int ve = dest + 8; @@ -751,6 +1132,7 @@ void rsp_cop2::handle_lwc2(UINT32 op) ea += 2; } +#endif break; } @@ -769,15 +1151,17 @@ void rsp_cop2::handle_lwc2(UINT32 op) void rsp_cop2::handle_swc2(UINT32 op) { + int base = (op >> 21) & 0x1f; +#if !USE_SIMD int i, end; int eaoffset; UINT32 ea; int dest = (op >> 16) & 0x1f; - int base = (op >> 21) & 0x1f; int index = (op >> 7) & 0xf; int offset = (op & 0x7f); if (offset & 0x40) offset |= 0xffffffc0; +#endif switch ((op >> 11) & 0x1f) { @@ -790,8 +1174,12 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 1 byte from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + offset : offset; m_rsp.WRITE8(ea, VREG_B(dest, index)); +#endif break; } case 0x01: /* SSV */ @@ -803,6 +1191,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 2 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 2) : (offset * 2); end = index + 2; @@ -812,6 +1203,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x02: /* SLV */ @@ -823,6 +1215,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 4 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 4) : (offset * 4); end = index + 4; @@ -832,6 +1227,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x03: /* SDV */ @@ -843,6 +1239,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores 8 bytes starting from vector byte index +#if USE_SIMD + vec_lbdlsv_sbdlsv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -852,6 +1251,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i)); ea++; } +#endif break; } case 0x04: /* SQV */ @@ -863,6 +1263,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores up to 16 bytes starting from vector byte index until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); end = index + (16 - (ea & 0xf)); @@ -872,6 +1275,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, i & 0xf)); ea++; } +#endif break; } case 0x05: /* SRV */ @@ -883,6 +1287,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores up to 16 bytes starting from right side until 16-byte boundary +#if USE_SIMD + vec_lqrv_sqrv(op, m_rsp.m_rsp_state->r[base]); +#else int o; ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); @@ -895,6 +1302,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, VREG_B(dest, ((i + o) & 0xf))); ea++; } +#endif break; } case 0x06: /* SPV */ @@ -906,6 +1314,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores upper 8 bits of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? 
m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -921,6 +1332,7 @@ void rsp_cop2::handle_swc2(UINT32 op) } ea++; } +#endif break; } case 0x07: /* SUV */ @@ -932,6 +1344,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores bits 14-7 of each element +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 8) : (offset * 8); end = index + 8; @@ -947,6 +1362,7 @@ void rsp_cop2::handle_swc2(UINT32 op) } ea++; } +#endif break; } case 0x08: /* SHV */ @@ -958,6 +1374,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores bits 14-7 of each element, with 2-byte stride +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); for (i=0; i < 8; i++) @@ -968,6 +1387,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea, d); ea += 2; } +#endif break; } case 0x09: /* SFV */ @@ -981,6 +1401,9 @@ void rsp_cop2::handle_swc2(UINT32 op) // FIXME: only works for index 0 and index 8 +#if USE_SIMD + vec_lfhpuv_sfhpuv(op, m_rsp.m_rsp_state->r[base]); +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); eaoffset = ea & 0xf; @@ -993,6 +1416,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_S(dest, i) >> 7); eaoffset += 4; } +#endif break; } case 0x0a: /* SWV */ @@ -1005,6 +1429,8 @@ void rsp_cop2::handle_swc2(UINT32 op) // Stores the full 128-bit vector starting from vector byte index and wrapping to index 0 // after byte index 15 +#if USE_SIMD +#else ea = (base) ? m_rsp.m_rsp_state->r[base] + (offset * 16) : (offset * 16); eaoffset = ea & 0xf; @@ -1017,6 +1443,7 @@ void rsp_cop2::handle_swc2(UINT32 op) m_rsp.WRITE8(ea + (eaoffset & 0xf), VREG_B(dest, i & 0xf)); eaoffset++; } +#endif break; } case 0x0b: /* STV */ @@ -1028,6 +1455,8 @@ void rsp_cop2::handle_swc2(UINT32 op) // // Stores one element from maximum of 8 vectors, while incrementing element index +#if USE_SIMD +#else int element; int vs = dest; int ve = dest + 8; @@ -1047,6 +1476,7 @@ void rsp_cop2::handle_swc2(UINT32 op) eaoffset += 2; element++; } +#endif break; } @@ -1981,16 +2411,21 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Stores high, middle or low slice of accumulator to destination vector #if USE_SIMD + UINT16 *acc = m_acc.s; switch (EL) { case 8: + m_v[VDREG].v = read_acc_hi(acc); break; case 9: + m_v[VDREG].v = read_acc_mid(acc); break; case 10: + m_v[VDREG].v = read_acc_lo(acc); break; default: + m_v[VDREG].v = _mm_setzero_si128(); break; } #else diff --git a/src/emu/cpu/rsp/rspcp2.h b/src/emu/cpu/rsp/rspcp2.h index 2e09cd587f4..77bbd468233 100644 --- a/src/emu/cpu/rsp/rspcp2.h +++ b/src/emu/cpu/rsp/rspcp2.h @@ -199,6 +199,23 @@ protected: RSP_ACC_HI = 0, }; + enum rsp_mem_request_type { + RSP_MEM_REQUEST_NONE, + RSP_MEM_REQUEST_INT_MEM, + RSP_MEM_REQUEST_VECTOR, + RSP_MEM_REQUEST_FOURTH, + RSP_MEM_REQUEST_HALF, + RSP_MEM_REQUEST_PACK, + RSP_MEM_REQUEST_QUAD, + RSP_MEM_REQUEST_REST, + RSP_MEM_REQUEST_UPACK + }; + + union aligned_rsp_1vect_t { + rsp_vec_t __align; + UINT16 s[8]; + }; + union aligned_rsp_2vect_t { rsp_vec_t __align[2]; UINT16 s[16]; @@ -209,6 +226,7 @@ protected: UINT16 s[24]; }; + aligned_rsp_1vect_t m_vdqm; aligned_rsp_2vect_t m_flags[3]; aligned_rsp_3vect_t m_acc; UINT32 m_dp_flag; @@ -225,11 +243,16 @@ protected: const UINT16 ror_b2l_keys[16][8]; const UINT16 rol_l2b_keys[16][8]; const UINT16 ror_l2b_keys[16][8]; + const UINT16 qr_lut[16][8]; + const UINT16 
bdls_lut[4][4]; } vec_helpers_t; static const vec_helpers_t m_vec_helpers; rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element); + static inline UINT32 sign_extend_6(INT32 i) { + return (i << (32 - 7)) >> (32 - 7); + } static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src) { return _mm_load_si128((rsp_vec_t*) src); @@ -319,11 +342,11 @@ protected: } void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); + void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); - void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); + void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm, rsp_mem_request_type request_type); #include "clamp.h" #include "vabs.h" @@ -349,6 +372,7 @@ protected: #include "vsub.h" #include "vsubc.h" #include "vxor.h" +#include "vldst.h" #endif private: diff --git a/src/emu/cpu/rsp/vldst.h b/src/emu/cpu/rsp/vldst.h new file mode 100644 index 00000000000..eaef0ccb512 --- /dev/null +++ b/src/emu/cpu/rsp/vldst.h @@ -0,0 +1,67 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. 
Stachecki, Ryan Holtz
+
+// LBV, LDV, LLV, LSV, SBV, SDV, SLV, SSV
+inline void vec_lbdlsv_sbdlsv(UINT32 iw, UINT32 rs)
+{
+	const UINT32 shift_and_idx = (iw >> 11) & 0x3;
+	rsp_vec_t dqm = _mm_loadl_epi64((rsp_vec_t *) (m_vec_helpers.bdls_lut[shift_and_idx]));
+
+	const UINT32 addr = rs + (sign_extend_6(iw) << shift_and_idx);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
+	}
+	else
+	{
+		vec_load_group1(addr, element, regp, vec_load_unshuffled_operand(regp), dqm);
+	}
+}
+
+// LFV, LHV, LPV, LUV, SFV, SHV, SPV, SUV
+inline void vec_lfhpuv_sfhpuv(UINT32 iw, UINT32 rs)
+{
+	static const enum rsp_mem_request_type fhpu_type_lut[4] = {
+		RSP_MEM_REQUEST_PACK,
+		RSP_MEM_REQUEST_UPACK,
+		RSP_MEM_REQUEST_HALF,
+		RSP_MEM_REQUEST_FOURTH
+	};
+
+	const UINT32 addr = rs + (sign_extend_6(iw) << 3);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	rsp_mem_request_type request_type = fhpu_type_lut[((iw >> 11) & 0x1f) - 6];
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
+	}
+	else
+	{
+		vec_load_group2(addr, element, regp, vec_load_unshuffled_operand(regp), _mm_setzero_si128(), request_type);
+	}
+}
+
+// LQV, LRV, SQV, SRV
+inline void vec_lqrv_sqrv(UINT32 iw, UINT32 rs)
+{
+	const UINT32 addr = rs + (sign_extend_6(iw) << 4);
+	const UINT32 element = (iw >> 7) & 0xf;
+	UINT16* regp = m_v[(iw >> 16) & 0x1f].s;
+
+	memcpy(m_vdqm.s, m_vec_helpers.qr_lut[addr & 0xf], sizeof(m_vdqm.s));
+
+	rsp_mem_request_type request_type = ((iw >> 11) & 0x1) ? RSP_MEM_REQUEST_REST : RSP_MEM_REQUEST_QUAD;
+	if ((iw >> 29) & 0x1)
+	{
+		vec_store_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
+	}
+	else
+	{
+		vec_load_group4(addr, element, regp, vec_load_unshuffled_operand(regp), vec_load_unshuffled_operand(m_vdqm.s), request_type);
+	}
+}
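On the field decoding in vldst.h: the offset is the low 7 bits of the instruction word, sign-extended (the name sign_extend_6 refers to the sign bit sitting at bit 6) and then scaled by the access size; and because LWC2 is primary opcode 0b110010 while SWC2 is 0b111010, instruction bit 29 alone separates stores from loads, which is what (iw >> 29) & 0x1 tests. A standalone sketch of that decoding, assuming the standard MIPS LWC2/SWC2 encodings (the instruction word built here is illustrative):

#include <stdint.h>
#include <assert.h>

// Matches sign_extend_6() in rspcp2.h: keep the low 7 bits, signed.
static int32_t sign_extend_7bit_field(uint32_t iw)
{
	return ((int32_t) (iw << (32 - 7))) >> (32 - 7);
}

int main(void)
{
	// SSV with offset field 0x7f (-1): SWC2 (0x3a), subop 0x01 (S).
	uint32_t iw = (0x3au << 26) | (0x01u << 11) | 0x7fu;

	assert(((iw >> 29) & 0x1) == 1);                 // bit 29 set: a store
	assert(sign_extend_7bit_field(iw) == -1);        // 7-bit offset is signed
	assert((sign_extend_7bit_field(iw) << 1) == -2); // scaled by size (S = 2)
	return 0;
}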