nw, start merging in RSP vector ops from CEN64

therealmogminer@gmail.com 2015-06-25 19:19:25 +02:00
parent 9c6f6114c7
commit 3c5cd12782
31 changed files with 1267 additions and 10 deletions

37
src/emu/cpu/rsp/clamp.h Normal file

@ -0,0 +1,37 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t sclamp_acc_to_mid(rsp_vec_t acc_mid, rsp_vec_t acc_hi)
{
return _mm_packs_epi32(
_mm_unpacklo_epi16(acc_mid, acc_hi),
_mm_unpackhi_epi16(acc_mid, acc_hi)
);
}
static inline rsp_vec_t uclamp_acc(rsp_vec_t val, rsp_vec_t acc_mid, rsp_vec_t acc_hi, rsp_vec_t zero)
{
rsp_vec_t hi_negative = _mm_srai_epi16(acc_hi, 15);
rsp_vec_t mid_negative = _mm_srai_epi16(acc_mid, 15);
// We don't have to clamp if the HI part of the
// accumulator is sign-extended down to the MD part.
rsp_vec_t hi_sign_check = _mm_cmpeq_epi16(hi_negative, acc_hi);
rsp_vec_t mid_sign_check = _mm_cmpeq_epi16(hi_negative, mid_negative);
rsp_vec_t clamp_mask = _mm_and_si128(mid_sign_check, hi_sign_check);
// Generate the value in the event we need to clamp.
// * hi_negative, mid_sign => xxxx
// * hi_negative, !mid_sign => 0000
// * !hi_negative, mid_sign => FFFF
// * !hi_negative, !mid_sign => xxxx
rsp_vec_t clamped_val = _mm_cmpeq_epi16(hi_negative, zero);
#ifndef __SSE4_1__
clamped_val = _mm_andnot_si128(clamp_mask, clamped_val);
val = _mm_and_si128(clamp_mask, val);
return _mm_or_si128(val, clamped_val);
#else
return _mm_blendv_epi8(clamped_val, val, clamp_mask);
#endif
}
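Not part of the diff: a scalar model of one 16-bit lane, to make the intent of the two clamp helpers above explicit (lane layout assumed from the intrinsics: acc_hi:acc_mid is treated as a signed 32-bit value).

// Illustrative only -- per-lane equivalent of sclamp_acc_to_mid()
static INT16 sclamp_lane(INT16 acc_mid, INT16 acc_hi)
{
	INT32 acc = ((INT32) acc_hi << 16) | (UINT16) acc_mid;    // what the unpack/packs pair builds
	if (acc > 32767) return 32767;                             // signed saturation of _mm_packs_epi32
	if (acc < -32768) return -32768;
	return (INT16) acc;
}

// Illustrative only -- per-lane equivalent of uclamp_acc()
static UINT16 uclamp_lane(UINT16 val, INT16 acc_mid, INT16 acc_hi)
{
	INT16 mid_sign = (acc_mid < 0) ? -1 : 0;
	if (acc_hi == mid_sign)                                    // hi is a sign-extension of mid: keep val
		return val;
	return (acc_hi < 0) ? 0x0000 : 0xffff;                     // otherwise clamp by the sign of hi
}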


@ -14,6 +14,7 @@
#ifndef __RSP_H__
#define __RSP_H__
#include "emu.h"
#include "cpu/drcfe.h"
#include "cpu/drcuml.h"


@ -1,5 +1,5 @@
// license:BSD-3-Clause
// copyright-holders:Ryan Holtz
// copyright-holders:Ryan Holtz,Tyler J. Stachecki
/***************************************************************************
rspcp2.c
@ -11,13 +11,176 @@
#include "emu.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspcp2.h"
#include "cpu/drcfe.h"
#include "cpu/drcuml.h"
#include "cpu/drcumlsh.h"
using namespace uml;
#if USE_SIMD
#include <emmintrin.h>
const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
{ 0 },
{ // logic_mask
{ 0, 0, 0, 0, 0, 0, 0, 0 },
{ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }
},
{ // vrsq_mask_table
{ ~0, 0, 0, 0, 0, 0, 0, 0 },
{ 0, ~0, 0, 0, 0, 0, 0, 0 },
{ 0, 0, ~0, 0, 0, 0, 0, 0 },
{ 0, 0, 0, ~0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, ~0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, ~0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, ~0, 0 },
{ 0, 0, 0, 0, 0, 0, 0, ~0 }
},
{ // shuffle_keys
/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
/* 0q */{0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c},
/* 1q */{0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e},
/* 0h */{0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
/* 1h */{0x0302, 0x0302, 0x0302, 0x0302, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
/* 2h */{0x0504, 0x0504, 0x0504, 0x0504, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
/* 3h */{0x0706, 0x0706, 0x0706, 0x0706, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e},
/* 0w */{0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
/* 1w */{0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
/* 2w */{0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
/* 3w */{0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
/* 4w */{0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
/* 5w */{0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
/* 6w */{0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
/* 7w */{0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e}
},
{ // sll_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c},
{0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
{0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
{0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708},
{0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607},
{0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080}
},
{ // sll_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e},
{0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a},
{0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380}
},
{ // srl_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e},
{0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f},
{0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080},
{0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080},
{0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080},
{0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080}
},
{ // ror_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e},
{0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f},
{0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102},
{0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203},
{0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506},
{0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 0x0607},
{0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
{0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
{0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c}
},
{ // rol_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e},
{0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a},
{0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506},
{0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
{0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
{0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
{0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c}
},
{ // ror_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c},
{0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
{0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700},
{0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
{0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04},
{0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e}
}
};
#endif
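The key tables above pack two byte indices into each 16-bit entry, with 0x80 selecting a zero byte, which is the operand layout of the SSSE3 byte shuffle. The body of vec_load_and_shuffle_operand() is not part of this commit, so the following is only an assumed usage sketch (shuffle_with_key and key_row are illustrative names, and SSSE3 availability is assumed):

static inline rsp_vec_t shuffle_with_key(rsp_vec_t operand, const UINT16 *key_row)
{
	rsp_vec_t key = _mm_load_si128((const rsp_vec_t *) key_row);
	return _mm_shuffle_epi8(operand, key);   // each key byte picks a source byte; 0x80 yields zero
}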
extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@ -100,6 +263,7 @@ extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
VREG_S(VDREG, 7) = m_vres[7]; \
}
#if !USE_SIMD
static const int vector_elements_2[16][8] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7 }, // none
@ -119,6 +283,7 @@ static const int vector_elements_2[16][8] =
{ 6, 6, 6, 6, 6, 6, 6, 6 }, // 6
{ 7, 7, 7, 7, 7, 7, 7, 7 }, // 7
};
#endif
rsp_cop2::rsp_cop2(rsp_device &rsp, running_machine &machine)
: m_rsp(rsp)
@ -905,7 +1070,9 @@ UINT16 rsp_cop2::SATURATE_ACCUM(int accum, int slice, UINT16 negative, UINT16 po
void rsp_cop2::handle_vector_ops(UINT32 op)
{
#if !USE_SIMD
int i;
#endif
// Opcode legend:
// E = VS2 element type
@ -924,6 +1091,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Multiplies signed integer by signed integer * 2
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -948,8 +1117,9 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x01: /* VMULU */
@ -960,6 +1130,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -986,6 +1158,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1000,6 +1173,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Stores the higher 16 bits of the 32-bit result to accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
@ -1013,6 +1188,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1027,6 +1203,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into accumulator
// The middle slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1040,6 +1218,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_M(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1055,6 +1234,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended
@ -1068,6 +1249,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1082,6 +1264,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into highest 32 bits of accumulator, the low slice is zero
// The highest 32 bits of accumulator is saturated into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1097,6 +1281,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = (INT16)(r);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1110,6 +1295,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Multiplies signed integer by signed integer * 2
// The result is added to accumulator
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1131,6 +1318,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x09: /* VMACU */
@ -1141,6 +1329,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1177,6 +1367,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1191,6 +1382,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Adds the higher 16 bits of the 32-bit result to accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
@ -1206,6 +1399,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 0, 0x0000, 0xffff);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1220,6 +1414,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into accumulator
// The middle slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1237,6 +1433,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1251,6 +1448,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended
@ -1271,6 +1470,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1285,6 +1485,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into highest 32 bits of accumulator, the low slice is zero
// The highest 32 bits of accumulator is saturated into destination element
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1301,6 +1503,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1315,6 +1518,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG == VDREG
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1330,6 +1535,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1344,6 +1550,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG == VDREG
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1360,6 +1568,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1373,6 +1582,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Changes the sign of source register 2 if source register 1 is negative and stores
// the result to destination register
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT16 s1 = (INT16)VREG_S(VS1REG, i);
@ -1401,6 +1612,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1415,6 +1627,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG = VDREG
#if USE_SIMD
#else
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
@ -1433,6 +1647,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1447,6 +1662,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG = VDREG
#if USE_SIMD
#else
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
@ -1469,6 +1686,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1481,6 +1699,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Stores high, middle or low slice of accumulator to destination vector
#if USE_SIMD
#else
switch (EL)
{
case 0x08: // VSAWH
@ -1511,6 +1731,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
printf("RSP: VSAW: el = %d\n", EL);//??? ???
exit(0);
}
#endif
break;
}
@ -1524,6 +1745,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are less than VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1559,6 +1782,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1572,6 +1796,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1595,6 +1821,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1608,6 +1835,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are not equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1632,6 +1861,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1645,6 +1875,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are greater or equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1669,6 +1901,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1681,6 +1914,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip low
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT16 s1 = VREG_S(VS1REG, i);
@ -1763,6 +1998,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CLIP1_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1775,6 +2011,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip high
#if USE_SIMD
#else
CLEAR_CARRY_FLAGS();
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP1_FLAGS();
@ -1847,6 +2085,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1859,6 +2098,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip reverse
#if USE_SIMD
#else
CLEAR_CARRY_FLAGS();
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP1_FLAGS();
@ -1906,6 +2147,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1918,6 +2160,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Merges two vectors according to compare flags
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
if (COMPARE_FLAG(i) != 0)
@ -1932,6 +2176,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x28: /* VAND */
@ -1943,12 +2188,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise AND of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x29: /* VNAND */
@ -1960,12 +2208,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT AND of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2a: /* VOR */
@ -1977,12 +2228,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise OR of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2b: /* VNOR */
@ -1994,12 +2248,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT OR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2c: /* VXOR */
@ -2011,12 +2268,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise XOR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2d: /* VNXOR */
@ -2028,12 +2288,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT XOR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -2045,6 +2308,9 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
// Calculates reciprocal
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7));
@ -2093,6 +2359,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
#endif
break;
}
@ -2105,6 +2372,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal low part
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)VREG_S(VS2REG, EL & 7);
@ -2169,6 +2438,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2181,6 +2451,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal high part
#if USE_SIMD
#else
m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
m_dp_allowed = 1;
@ -2191,6 +2463,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16);
#endif
break;
}
@ -2203,11 +2476,14 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Moves element from vector to destination vector
#if USE_SIMD
#else
VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7);
for (i = 0; i < 8; i++)
{
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2220,6 +2496,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7));
@ -2269,6 +2547,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2281,6 +2560,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root low part
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)VREG_S(VS2REG, EL & 7);
INT32 datainput = rec;
@ -2348,6 +2629,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2360,6 +2642,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root high part
#if USE_SIMD
#else
m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
m_dp_allowed = 1;
@ -2369,6 +2653,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16); // store high part
#endif
break;
}


@ -1,5 +1,5 @@
// license:BSD-3-Clause
// copyright-holders:Ryan Holtz
// copyright-holders:Ryan Holtz,Tyler J. Stachecki
/***************************************************************************
rspcp2.h
@ -15,6 +15,37 @@
#include "cpu/drcuml.h"
#include "rsp.h"
#include "rspdiv.h"
#define SIMD_OFF (1)
#if (defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__))
#define SSE_AVAILABLE (1)
#else
#define SSE_AVAILABLE (0)
#endif
#if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (SSE_AVAILABLE || defined(_MSC_VER)) && defined(PTR64) && !SIMD_OFF
#define USE_SIMD (1)
#else
#define USE_SIMD (0)
#endif
#if USE_SIMD
#ifdef __SSE4_2__
#include <nmmintrin.h>
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#elif defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
typedef __m128i rsp_vec_t;
#endif
union VECTOR_REG
{
@ -22,6 +53,9 @@ union VECTOR_REG
UINT32 l[4];
INT16 s[8];
UINT8 b[16];
#if USE_SIMD
rsp_vec_t v;
#endif
};
union ACCUMULATOR_REG
@ -152,10 +186,148 @@ protected:
UINT32 m_reciprocal_high;
INT32 m_dp_allowed;
#if USE_SIMD
typedef struct
{
rsp_vec_t dummy_for_alignment;
const UINT16 logic_mask[2][8];
const UINT16 vrsq_mask_table[8][8];
const UINT16 shuffle_keys[16][8];
const UINT16 sll_b2l_keys[16][8];
const UINT16 sll_l2b_keys[16][8];
const UINT16 srl_b2l_keys[16][8];
const UINT16 ror_b2l_keys[16][8];
const UINT16 rol_l2b_keys[16][8];
const UINT16 ror_l2b_keys[16][8];
} vec_helpers_t;
static const vec_helpers_t m_vec_helpers;
rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element);
static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src)
{
return _mm_load_si128((rsp_vec_t*) src);
}
static inline void vec_write_operand(UINT16* dest, rsp_vec_t src)
{
_mm_store_si128((rsp_vec_t*) dest, src);
}
static inline rsp_vec_t read_acc_lo(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t) * 2);
}
static inline rsp_vec_t read_acc_mid(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_acc_hi(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc);
}
static inline rsp_vec_t read_vcc_lo(const UINT16 *vcc)
{
return vec_load_unshuffled_operand(vcc + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_vcc_hi(const UINT16 *vcc)
{
return vec_load_unshuffled_operand(vcc);
}
static inline rsp_vec_t read_vco_lo(const UINT16 *vco)
{
return vec_load_unshuffled_operand(vco + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_vco_hi(const UINT16 *vco)
{
return vec_load_unshuffled_operand(vco);
}
static inline rsp_vec_t read_vce(const UINT16 *vce)
{
return vec_load_unshuffled_operand(vce + sizeof(rsp_vec_t));
}
static inline void write_acc_lo(UINT16 *acc, rsp_vec_t acc_lo)
{
return vec_write_operand(acc + sizeof(rsp_vec_t) * 2, acc_lo);
}
static inline void write_acc_mid(UINT16 *acc, rsp_vec_t acc_mid)
{
return vec_write_operand(acc + sizeof(rsp_vec_t), acc_mid);
}
static inline void write_acc_hi(UINT16 *acc, rsp_vec_t acc_hi)
{
return vec_write_operand(acc, acc_hi);
}
static inline void write_vcc_lo(UINT16 *vcc, rsp_vec_t vcc_lo)
{
return vec_write_operand(vcc + sizeof(rsp_vec_t), vcc_lo);
}
static inline void write_vcc_hi(UINT16 *vcc, rsp_vec_t vcc_hi)
{
return vec_write_operand(vcc, vcc_hi);
}
static inline void write_vco_lo(UINT16 *vcc, rsp_vec_t vco_lo)
{
return vec_write_operand(vcc + sizeof(rsp_vec_t), vco_lo);
}
static inline void write_vco_hi(UINT16 *vcc, rsp_vec_t vco_hi)
{
return vec_write_operand(vcc, vco_hi);
}
static inline void write_vce(UINT16 *vce, rsp_vec_t vce_r)
{
return vec_write_operand(vce, vce_r);
}
static inline INT16 get_flags(const UINT16 *flags)
{
return (INT16)_mm_movemask_epi8(
_mm_packs_epi16(
_mm_load_si128((rsp_vec_t*) (flags + sizeof(rsp_vec_t))),
_mm_load_si128((rsp_vec_t*) flags)
)
);
}
static inline rsp_vec_t vec_zero()
{
return _mm_setzero_si128();
}
void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
#include "clamp.h"
#include "vabs.h"
#include "vadd.h"
#include "vaddc.h"
#include "vand.h"
#include "vch.h"
#include "vcmp.h"
#include "vcl.h"
#include "vcr.h"
#include "vmac.h"
#include "vmrg.h"
#include "vmul.h"
#include "vmulh.h"
#include "vmull.h"
#include "vmulm.h"
#include "vmuln.h"
#include "vor.h"
#include "vsub.h"
#include "vsubc.h"
#include "vxor.h"
#endif
private:
void handle_lwc2(UINT32 op);
void handle_swc2(UINT32 op);
void handle_vector_ops(UINT32 op);
UINT32 m_div_in;
UINT32 m_div_out;
};
#endif /* __RSPCP2_H__ */


@ -11,7 +11,6 @@
#include "emu.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspcp2.h"
#include "rspcp2d.h"
#include "cpu/drcfe.h"


@ -31,6 +31,7 @@ class rsp_cop2_drc : public rsp_cop2
virtual void state_string_export(const int index, std::string &str);
void cfunc_unimplemented_opcode();
public:
virtual void lbv();
virtual void lsv();


@ -21,7 +21,6 @@
#include "emu.h"
#include "debugger.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspfe.h"
#include "rspcp2.h"
#include "cpu/drcfe.h"

15
src/emu/cpu/rsp/vabs.h Normal file

@ -0,0 +1,15 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vabs(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo)
{
rsp_vec_t vs_zero = _mm_cmpeq_epi16(vs, zero);
rsp_vec_t sign_lt = _mm_srai_epi16(vs, 15);
rsp_vec_t vd = _mm_andnot_si128(vs_zero, vt);
// Careful: if VT = 0x8000 and VS is negative,
// acc_lo will be 0x8000 but vd will be 0x7FFF.
vd = _mm_xor_si128(vd, sign_lt);
*acc_lo = _mm_sub_epi16(vd, sign_lt);
return _mm_subs_epi16(vd, sign_lt);
}
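Worked lane for the edge case called out above (vs negative, vt = 0x8000): sign_lt = 0xffff, so vd = 0x8000 ^ 0xffff = 0x7fff; the wrapping subtract then gives acc_lo = 0x7fff - (-1) = 0x8000, while the saturating subtract leaves the returned result clamped at 0x7fff.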

17
src/emu/cpu/rsp/vadd.h Normal file

@ -0,0 +1,17 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vadd(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
{
// acc_lo uses unsaturated arithmetic.
rsp_vec_t vd = _mm_add_epi16(vs, vt);
*acc_lo = _mm_sub_epi16(vd, carry);
// VD is the signed sum of the two sources and the carry. Since we
// have to saturate the sum of all three, we have to be clever.
rsp_vec_t minimum = _mm_min_epi16(vs, vt);
rsp_vec_t maximum = _mm_max_epi16(vs, vt);
minimum = _mm_subs_epi16(minimum, carry);
return _mm_adds_epi16(minimum, maximum);
}
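For reference (not part of the patch), the saturated value the min/max trick above is computing, written as a scalar lane under the usual VADD definition (result = signed 16-bit clamp of vs + vt + carry; carry is 0 or 1 here, while the vector code carries it as 0x0000/0xffff and folds it in with a saturating subtract):

static INT16 vadd_lane(INT16 vs, INT16 vt, int carry)
{
	INT32 sum = (INT32) vs + vt + carry;          // full-precision sum of all three
	if (sum > 32767) return 32767;                // _mm_adds_epi16 saturation
	if (sum < -32768) return -32768;
	return (INT16) sum;
}

Adding the carry to the smaller operand first is what lets a single 16-bit saturating add reproduce this result.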

13
src/emu/cpu/rsp/vaddc.h Normal file

@ -0,0 +1,13 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vaddc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *sn)
{
rsp_vec_t sat_sum = _mm_adds_epu16(vs, vt);
rsp_vec_t unsat_sum = _mm_add_epi16(vs, vt);
*sn = _mm_cmpeq_epi16(sat_sum, unsat_sum);
*sn = _mm_cmpeq_epi16(*sn, zero);
return unsat_sum;
}

9
src/emu/cpu/rsp/vand.h Normal file

@ -0,0 +1,9 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vand_vnand(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt) {
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_and_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}

57
src/emu/cpu/rsp/vch.h Normal file

@ -0,0 +1,57 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vch(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t *eq, rsp_vec_t *sign, rsp_vec_t *vce) {
// sign = (vs ^ vt) < 0
*sign = _mm_xor_si128(vs, vt);
*sign = _mm_cmplt_epi16(*sign, zero);
// sign_negvt = sign ? -vt : vt
rsp_vec_t sign_negvt = _mm_xor_si128(vt, *sign);
sign_negvt = _mm_sub_epi16(sign_negvt, *sign);
// Compute diff, diff_zero:
rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt);
rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute le/ge:
rsp_vec_t vt_neg = _mm_cmplt_epi16(vt, zero);
rsp_vec_t diff_lez = _mm_cmpgt_epi16(diff, zero);
rsp_vec_t diff_gez = _mm_or_si128(diff_lez, diff_zero);
diff_lez = _mm_cmpeq_epi16(zero, diff_lez);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(diff_gez, vt_neg, *sign);
*le = _mm_blendv_epi8(vt_neg, diff_lez, *sign);
#else
*ge = _mm_and_si128(*sign, vt_neg);
diff_gez = _mm_andnot_si128(*sign, diff_gez);
*ge = _mm_or_si128(*ge, diff_gez);
*le = _mm_and_si128(*sign, diff_lez);
diff_lez = _mm_andnot_si128(*sign, vt_neg);
*le = _mm_or_si128(*le, diff_lez);
#endif
// Compute vce:
*vce = _mm_cmpeq_epi16(diff, *sign);
*vce = _mm_and_si128(*vce, *sign);
// Compute !eq:
*eq = _mm_or_si128(diff_zero, *vce);
*eq = _mm_cmpeq_epi16(*eq, zero);
// Compute result:
#ifdef __SSE4_1__
rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, *sign);
return _mm_blendv_epi8(vs, sign_negvt, diff_sel_mask);
#else
diff_lez = _mm_and_si128(*sign, *le);
diff_gez = _mm_andnot_si128(*sign, *ge);
rsp_vec_t diff_sel_mask = _mm_or_si128(diff_lez, diff_gez);
diff_lez = _mm_and_si128(diff_sel_mask, sign_negvt);
diff_gez = _mm_andnot_si128(diff_sel_mask, vs);
return _mm_or_si128(diff_lez, diff_gez);
#endif
}

65
src/emu/cpu/rsp/vcl.h Normal file

@ -0,0 +1,65 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vcl(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign, rsp_vec_t vce)
{
// sign_negvt = sign ? -vt : vt
rsp_vec_t sign_negvt = _mm_xor_si128(vt, sign);
sign_negvt = _mm_sub_epi16(sign_negvt, sign);
// Compute diff, diff_zero, ncarry, and nvce:
// Note: diff = sign ? (vs + vt) : (vs - vt).
rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt);
rsp_vec_t ncarry = _mm_adds_epu16(vs, vt);
ncarry = _mm_cmpeq_epi16(diff, ncarry);
rsp_vec_t nvce = _mm_cmpeq_epi16(vce, zero);
rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute results for if (sign && ne):
rsp_vec_t le_case1 = _mm_and_si128(diff_zero, ncarry);
le_case1 = _mm_and_si128(nvce, le_case1);
rsp_vec_t le_case2 = _mm_or_si128(diff_zero, ncarry);
le_case2 = _mm_and_si128(vce, le_case2);
rsp_vec_t le_eq = _mm_or_si128(le_case1, le_case2);
// Compute results for if (!sign && ne):
rsp_vec_t ge_eq = _mm_subs_epu16(vt, vs);
ge_eq = _mm_cmpeq_epi16(ge_eq, zero);
// Blend everything together. Caveat: we don't update
// the results of ge/le if ne is false, so be careful.
rsp_vec_t do_le = _mm_andnot_si128(eq, sign);
#ifdef __SSE4_1__
*le = _mm_blendv_epi8(*le, le_eq, do_le);
#else
le_eq = _mm_and_si128(do_le, le_eq);
*le = _mm_andnot_si128(do_le, *le);
*le = _mm_or_si128(le_eq, *le);
#endif
rsp_vec_t do_ge = _mm_or_si128(sign, eq);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(ge_eq, *ge, do_ge);
#else
*ge = _mm_and_si128(do_ge, *ge);
ge_eq = _mm_andnot_si128(do_ge, ge_eq);
*ge = _mm_or_si128(ge_eq, *ge);
#endif
// Mux the result based on the value of sign.
#ifdef __SSE4_1__
rsp_vec_t mux_mask = _mm_blendv_epi8(*ge, *le, sign);
#else
do_le = _mm_and_si128(sign, *le);
do_ge = _mm_andnot_si128(sign, *ge);
rsp_vec_t mux_mask = _mm_or_si128(do_le, do_ge);
#endif
#ifdef __SSE4_1__
return _mm_blendv_epi8(vs, sign_negvt, mux_mask);
#else
sign_negvt = _mm_and_si128(mux_mask, sign_negvt);
vs = _mm_andnot_si128(mux_mask, vs);
return _mm_or_si128(sign_negvt, vs);
#endif
}

49
src/emu/cpu/rsp/vcmp.h Normal file

@ -0,0 +1,49 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_veq_vge_vlt_vne(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign)
{
rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt);
if (iw & 0x2) // VNE & VGE
{
if (iw & 0x1) // VGE
{
rsp_vec_t gt = _mm_cmpgt_epi16(vs, vt);
rsp_vec_t equalsign = _mm_and_si128(eq, sign);
equal = _mm_andnot_si128(equalsign, equal);
*le = _mm_or_si128(gt, equal);
}
else // VNE
{
rsp_vec_t nequal = _mm_cmpeq_epi16(equal, zero);
*le = _mm_and_si128(eq, equal);
*le = _mm_or_si128(*le, nequal);
}
}
else // VEQ & VLT
{
if (iw & 0x1) // VEQ
{
*le = _mm_andnot_si128(eq, equal);
}
else // VLT
{
rsp_vec_t lt = _mm_cmplt_epi16(vs, vt);
equal = _mm_and_si128(eq, equal);
equal = _mm_and_si128(sign, equal);
*le = _mm_or_si128(lt, equal);
}
}
#ifdef __SSE4_1__
return _mm_blendv_epi8(vt, vs, *le);
#else
vs = _mm_and_si128(*le, vs);
vt = _mm_andnot_si128(*le, vt);
return _mm_or_si128(vs, vt);
#endif
}

35
src/emu/cpu/rsp/vcr.h Normal file

@ -0,0 +1,35 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vcr(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le) {
// sign = (vs ^ vt) < 0
rsp_vec_t sign = _mm_xor_si128(vs, vt);
sign = _mm_srai_epi16(sign, 15);
// Compute le
rsp_vec_t diff_lez = _mm_and_si128(vs, sign);
diff_lez = _mm_add_epi16(diff_lez, vt);
*le = _mm_srai_epi16(diff_lez, 15);
// Compute ge
rsp_vec_t diff_gez = _mm_or_si128(vs, sign);
diff_gez = _mm_min_epi16(diff_gez, vt);
*ge = _mm_cmpeq_epi16(diff_gez, vt);
// sign_notvt = sn ? ~vt : vt
rsp_vec_t sign_notvt = _mm_xor_si128(vt, sign);
// Compute result:
#ifdef __SSE4_1__
rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, sign);
return _mm_blendv_epi8(vs, sign_notvt, diff_sel_mask);
#else
rsp_vec_t diff_sel_mask = _mm_sub_epi16(*le, *ge);
diff_sel_mask = _mm_and_si128(diff_sel_mask, sign);
diff_sel_mask = _mm_add_epi16(diff_sel_mask, *ge);
zero = _mm_sub_epi16(sign_notvt, vs);
zero = _mm_and_si128(zero, diff_sel_mask);
return _mm_add_epi16(zero, vs);
#endif
}

12
src/emu/cpu/rsp/vdivh.h Normal file

@ -0,0 +1,12 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vdivh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
m_div_in = m_v[src].s[e & 0x7];
// Write out the upper part of the result.
m_v[dest].s[de & 0x7] = m_div_out;
return vec_load_unshuffled_operand(m_v[dest].s);
}

57
src/emu/cpu/rsp/vmac.h Normal file

@ -0,0 +1,57 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmacf_vmacu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_mid, rsp_vec_t *acc_hi)
{
// Get the product and shift it over
// being sure to save the carries.
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
rsp_vec_t mid = _mm_slli_epi16(hi, 1);
rsp_vec_t carry = _mm_srli_epi16(lo, 15);
hi = _mm_srai_epi16(hi, 15);
mid = _mm_or_si128(mid, carry);
lo = _mm_slli_epi16(lo, 1);
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Add in the carry. If the middle portion is
// already 0xFFFF and we have a carry, we have
// to carry it all the way up to hi.
mid = _mm_sub_epi16(mid, overflow_mask);
carry = _mm_cmpeq_epi16(mid, zero);
carry = _mm_and_si128(carry, overflow_mask);
hi = _mm_sub_epi16(hi, carry);
// Accumulate the middle portion.
overflow_mask = _mm_adds_epu16(*acc_mid, mid);
*acc_mid = _mm_add_epi16(*acc_mid, mid);
overflow_mask = _mm_cmpeq_epi16(*acc_mid, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, hi);
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
if (iw & 0x1) // VMACU
{
rsp_vec_t overflow_hi_mask = _mm_srai_epi16(*acc_hi, 15);
rsp_vec_t overflow_mid_mask = _mm_srai_epi16(*acc_mid, 15);
mid = _mm_or_si128(overflow_mid_mask, *acc_mid);
overflow_mask = _mm_cmpgt_epi16(*acc_hi, zero);
mid = _mm_andnot_si128(overflow_hi_mask, mid);
return _mm_or_si128(overflow_mask, mid);
}
else // VMACF
{
return sclamp_acc_to_mid(*acc_mid, *acc_hi);
}
}
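The overflow_mask sequence above is the recurring SSE2 idiom in these helpers for detecting an unsigned 16-bit carry: the saturating and wrapping sums differ exactly in the lanes that overflowed. A scalar sketch of the same test (illustrative only; the vector code keeps the mask as 0x0000/0xffff and subtracts it to add 1 into the next accumulator slice):

static int lane_carried(UINT16 acc, UINT16 addend)
{
	UINT32 full = (UINT32) acc + addend;
	UINT16 wrapped = (UINT16) full;                                // _mm_add_epi16
	UINT16 saturated = (full > 0xffff) ? 0xffff : (UINT16) full;   // _mm_adds_epu16
	return wrapped != saturated;                                   // the two _mm_cmpeq_epi16 steps
}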

9
src/emu/cpu/rsp/vmov.h Normal file

@ -0,0 +1,9 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vmov(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT and write out the upper part of the result.
m_v[dest].s[de & 0x7] = m_v[src].s[e & 0x7];
return vec_load_unshuffled_operand(m_v[dest].s);
}

13
src/emu/cpu/rsp/vmrg.h Normal file

@ -0,0 +1,13 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmrg(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t le)
{
#ifdef __SSE4_1__
return _mm_blendv_epi8(vt, vs, le);
#else
vs = _mm_and_si128(le, vs);
vt = _mm_andnot_si128(le, vt);
return _mm_or_si128(vs, vt);
#endif
}
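The SSE2 fallback above (and the same pattern in clamp.h, vch.h, vcl.h and vcmp.h) emulates the SSE4.1 byte blend for masks whose lanes are all-zeros or all-ones: _mm_blendv_epi8(a, b, mask) == (b & mask) | (a & ~mask) in that case. As a generic helper it would look like this (vec_select is an illustrative name, not part of the commit):

static inline rsp_vec_t vec_select(rsp_vec_t a, rsp_vec_t b, rsp_vec_t mask)
{
	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}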

11
src/emu/cpu/rsp/vmudh.h Normal file

@ -0,0 +1,11 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t rsp_vmudh(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
*acc_md = _mm_mullo_epi16(vs, vt);
*acc_hi = _mm_mulhi_epi16(vs, vt);
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}

39
src/emu/cpu/rsp/vmul.h Normal file

@ -0,0 +1,39 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
//
// TODO: CHECK ME.
//
static inline rsp_vec_t vec_vmulf_vmulu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t round = _mm_cmpeq_epi16(zero, zero);
rsp_vec_t sign1 = _mm_srli_epi16(lo, 15);
lo = _mm_add_epi16(lo, lo);
round = _mm_slli_epi16(round, 15);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
rsp_vec_t sign2 = _mm_srli_epi16(lo, 15);
*acc_lo = _mm_add_epi16(round, lo);
sign1 = _mm_add_epi16(sign1, sign2);
hi = _mm_slli_epi16(hi, 1);
rsp_vec_t eq = _mm_cmpeq_epi16(vs, vt);
rsp_vec_t neq = eq;
*acc_md = _mm_add_epi16(hi, sign1);
rsp_vec_t neg = _mm_srai_epi16(*acc_md, 15);
if (iw & 0x1) // VMULU
{
*acc_hi = _mm_andnot_si128(eq, neg);
hi =_mm_or_si128(*acc_md, neg);
return _mm_andnot_si128(*acc_hi, hi);
}
else // VMULF
{
eq = _mm_and_si128(eq, neg);
*acc_hi = _mm_andnot_si128(neq, neg);
return _mm_add_epi16(*acc_md, eq);
}
}
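Given the TODO above, a scalar reference for the VMULF half may help when checking it; this assumes the usual definition (48-bit accumulator = s1 * s2 * 2 + 0x8000, result = signed 16-bit clamp of the accumulator's middle slice) rather than anything taken from this commit:

static UINT16 vmulf_lane(INT16 s1, INT16 s2)
{
	INT64 acc = (INT64) s1 * s2 * 2 + 0x8000;     // 0x8000 is the rounding bias
	INT32 mid = (INT32) (acc >> 16);              // acc_hi:acc_md slice
	if (mid > 32767) return 0x7fff;               // only hit for s1 = s2 = -32768
	if (mid < -32768) return 0x8000;
	return (UINT16) mid;
}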

31
src/emu/cpu/rsp/vmulh.h Normal file

@ -0,0 +1,31 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadh_vmudh(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
if (iw & 0x8) // VMADH
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_md, lo);
*acc_md = _mm_add_epi16(*acc_md, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
hi = _mm_sub_epi16(hi, overflow_mask);
*acc_hi = _mm_add_epi16(*acc_hi, hi);
}
else // VMUDH
{
*acc_lo = zero;
*acc_md = lo;
*acc_hi = hi;
}
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}

44
src/emu/cpu/rsp/vmull.h Normal file

@ -0,0 +1,44 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadl_vmudl(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
if (iw & 0x8) // VMADL
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, hi);
*acc_lo = _mm_add_epi16(*acc_lo, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
hi = _mm_sub_epi16(zero, overflow_mask);
// Check for overflow of the upper sum.
//
// TODO: Since hi can only be {0,1}, we should
// be able to generalize this for performance.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
// Since the product was unsigned, only worry about
// positive overflow (i.e.: borrowing not possible).
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
}
else // VMUDL
{
*acc_lo = hi;
*acc_md = zero;
*acc_hi = zero;
return hi;
}
}

56
src/emu/cpu/rsp/vmulm.h Normal file

@ -0,0 +1,56 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadm_vmudm(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is signed vs * unsigned vt.
// However, we have no such instructions to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vs was negative, take the upper 16-bits of the product
// and subtract vt.
rsp_vec_t sign = _mm_srai_epi16(vs, 15);
vt = _mm_and_si128(vt, sign);
hi = _mm_sub_epi16(hi, vt);
if (iw & 0x8) // VMADM
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}
else // VMUDM
{
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return hi;
}
}
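The fix-up used here (and with the roles of vs and vt swapped in vmuln.h below) rests on a simple identity: for a 16-bit lane, signed(x) = unsigned(x) - 0x10000 when x is negative, so only the upper 16 bits of the unsigned product need a correction. A scalar check (illustrative only):

static UINT16 mixed_mul_hi(INT16 vs, UINT16 vt)
{
	UINT32 uhi = ((UINT32) (UINT16) vs * vt) >> 16;   // _mm_mulhi_epu16
	if (vs < 0)
		uhi -= vt;                                    // subtract vt from the high half only
	return (UINT16) uhi;                              // equals the upper 16 bits of (INT32) vs * vt
}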

55
src/emu/cpu/rsp/vmuln.h Normal file

@ -0,0 +1,55 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadn_vmudn(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is unsigned vs * signed vt.
// However, we have no such instructions to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vt was negative, take the upper 16-bits of the product
// and subtract vs.
rsp_vec_t sign = _mm_srai_epi16(vt, 15);
vs = _mm_and_si128(vs, sign);
hi = _mm_sub_epi16(hi, vs);
if (iw & 0x8) // VMADN
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
}
else // VMUDN
{
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return lo;
}
}

10
src/emu/cpu/rsp/vor.h Normal file

@ -0,0 +1,10 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vor_vnor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt)
{
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_or_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}

60
src/emu/cpu/rsp/vrcpsq.h Normal file

@ -0,0 +1,60 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vrcp_vrsq(UINT32 iw, INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
INT16 vt = m_v[src].s[e & 0x7];
UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
UINT32 sp_input = vt;
INT32 input = (dp) ? dp_input : sp_input;
INT32 input_mask = input >> 31;
INT32 data = input ^ input_mask;
if (input > -32768)
{
data -= input_mask;
}
// Handle edge cases.
INT32 result;
if (data == 0)
{
result = 0x7fffffff;
}
else if (input == -32768)
{
result = 0xffff0000;
}
else // Main case: compute the reciprocal.
{
UINT32 shift = count_leading_zeros(data);
UINT32 idx = (((UINT64) data << shift) & 0x7FC00000) >> 22;
if (iw & 0x4) // VRSQ
{
idx = (idx | 0x200) & 0x3FE | (shift % 2);
result = rsp_divtable[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
}
else // VRCP
{
result = rsp_divtable[idx];
result = ((0x10000 | result) << 14) >> (31 - shift);
}
result = result ^ input_mask;
}
// Write out the results.
m_div_out = result >> 16;
m_v[dest].s[de & 0x7] = result;
return vec_load_unshuffled_operand(m_v[dest].s);
}

65
src/emu/cpu/rsp/vrsq.h Normal file

@ -0,0 +1,65 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vrsq(INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
INT16 vt = m_v[src].s[e & 0x7];
UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
UINT32 sp_input = vt;
INT32 input = (dp) ? dp_input : sp_input;
INT32 input_mask = input >> 31;
INT32 data = input ^ input_mask;
if (input > -32768)
{
data -= input_mask;
}
// Handle edge cases.
INT32 result;
if (data == 0)
{
result = 0x7fffffff;
}
else if (input == -32768)
{
result = 0xffff0000U;
}
else // Main case: compute the reciprocal.
{
UINT32 shift = count_leading_zeros(data);
UINT32 idx = (((UINT64) data << shift) & 0x7FC00000U) >> 22;
idx = (idx | 0x200) & 0x3FE | (shift % 2);
result = rsp_reciprocal_rom[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
result = result ^ input_mask;
}
// Write out the results.
m_div_out = result >> 16;
m_v[dest].s[de & 0x7] = result;
return vec_load_unshuffled_operand(m_v[dest].s);
}
rsp_vec_t vec_vrsqh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
INT16 elements[8];
// Get the element from VT.
memcpy(elements, &m_v[src], sizeof(rsp_vec_t));
m_div_in = elements[e];
// Write out the upper part of the result.
rsp_vec_t vd_mask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.vrsq_mask_table[de]);
rsp_vec_t vd = _mm_load_si128((rsp_vec_t *) &m_v[dest]);
vd = _mm_andnot_si128(vd_mask, vd);
rsp_vec_t b_result = _mm_set1_epi16(m_div_out);
b_result = _mm_and_si128(vd_mask, b_result);
return _mm_or_si128(b_result, vd);
}

17
src/emu/cpu/rsp/vsub.h Normal file

@ -0,0 +1,17 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t rsp_vsub(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
{
// acc_lo uses unsaturated arithmetic.
rsp_vec_t unsat_diff = _mm_sub_epi16(vt, carry);
rsp_vec_t sat_diff = _mm_subs_epi16(vt, carry);
*acc_lo = _mm_sub_epi16(vs, unsat_diff);
rsp_vec_t vd = _mm_subs_epi16(vs, sat_diff);
// VD is the signed diff of the two sources and the carry. Since we
// have to saturate the diff of all three, we have to be clever.
rsp_vec_t overflow = _mm_cmpgt_epi16(sat_diff, unsat_diff);
return _mm_adds_epi16(vd, overflow);
}

14
src/emu/cpu/rsp/vsubc.h Normal file

@ -0,0 +1,14 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vsubc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *eq, rsp_vec_t *sn)
{
rsp_vec_t sat_udiff = _mm_subs_epu16(vs, vt);
rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt);
rsp_vec_t sat_udiff_zero = _mm_cmpeq_epi16(sat_udiff, zero);
*eq = _mm_cmpeq_epi16(equal, zero);
*sn = _mm_andnot_si128(equal, sat_udiff_zero);
return _mm_sub_epi16(vs, vt);
}

10
src/emu/cpu/rsp/vxor.h Normal file

@ -0,0 +1,10 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vxor_vnxor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt)
{
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_xor_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}