- Added USE_SIMD flag to RSP headers and began converting some opcodes to use

SSE* intrinsics. Current plan is to target SSSE3-capable hardware (Core2 and up), with the resulting speedup theorized to be on the order of 5-10x when conversion is complete, though this applies only to situations where the emulation is heavily bottlenecked by the RSP. [MooglyGuy]
2025-07-13 13:35:41 +03:00 · 2013-06-09 06:08:14 +00:00 · 2013-06-09 06:08:14 +00:00 · 63a76589d1
commit 63a76589d1
parent 3e31aadb63
2 changed files with 78 additions and 4 deletions
--- a/src/emu/cpu/rsp/rsp.h
+++ b/src/emu/cpu/rsp/rsp.h
@ -16,6 +16,12 @@
 #ifndef __RSP_H__
 #define __RSP_H__

+#define USE_SIMD		(1)
+
+#if USE_SIMD
+#include <tmmintrin.h>
+#endif
+
 #define USE_RSPDRC

 /***************************************************************************
@ -159,6 +165,11 @@ struct rsp_state
 	UINT32 pc;
 	UINT32 r[35];
 	VECTOR_REG v[32];
+#if USE_SIMD
+	// Mirror of v[] for now, to be used in parallel as
+	// more vector ops are transitioned over
+	__m128i xv[32];
+#endif
 	UINT16 flag[4];
 	UINT32 sr;
 	UINT32 step_count;
--- a/src/emu/cpu/rsp/rspdrc.c
+++ b/src/emu/cpu/rsp/rspdrc.c
@ -18,6 +18,8 @@

 ***************************************************************************/

+#include <tmmintrin.h>
+
 #include "emu.h"
 #include "debugger.h"
 #include "rsp.h"
@ -753,6 +755,28 @@ static void cfunc_rsp_lbv(void *param)

 	ea = (base) ? rsp->r[base] + offset : offset;
 	VREG_B(dest, index) = READ8(rsp, ea);
+
+	// SSE
+#if USE_SIMD
+	// Better solutions for this situation welcome. Need to be able to insert a byte at an arbitrary
+	// byte index in the __m128. Current method amounts to:
+	//     final_vec = (in_vec &~ discard_mask) | insert_value
+	// Naturally, SSE4.1 adds the highly-useful PINSRB opcode. As the name implies, it's an
+	// arbitrary byte-insert-into-m128, but do we want to require SSE4.1? Maybe just have an ifdef
+	// and use the more optimal one if available.
+	const __m128i neg1 = _mm_set_epi16(0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff);
+
+	__m128i insert_vec = _mm_setzero_si128();
+	INT16 insert_value = READ8(rsp, ea) << ((1 - (index & 1)) << 2);
+	_mm_insert_epi16 (insert_vec, insert_value, index >> 1);
+
+	__m128i discard_mask = _mm_setzero_si128();
+	INT16 discard_element = 0x00ff << ((1 - (index & 1)) << 2);
+	_mm_insert_epi16 (discard_mask, discard_element, index >> 1);
+	_mm_xor_si128 (discard_mask, neg1);
+	_mm_and_si128 (rsp->xv[dest], discard_mask);
+	_mm_or_si128 (rsp->xv[dest], insert_vec);
+#endif
 }

 static void cfunc_rsp_lsv(void *param)
@ -762,7 +786,7 @@ static void cfunc_rsp_lsv(void *param)
 	UINT32 ea = 0;
 	int dest = (op >> 16) & 0x1f;
 	int base = (op >> 21) & 0x1f;
-	int index = (op >> 7) & 0xf;
+	int index = (op >> 7) & 0xe;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 	{
@ -784,6 +808,12 @@ static void cfunc_rsp_lsv(void *param)
 		VREG_B(dest, i) = READ8(rsp, ea);
 		ea++;
 	}
+
+	// SSE
+#if USE_SIMD
+	INT16 insert_value = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
+	_mm_insert_epi16 (rsp->xv[dest], insert_value, index >> 1);
+#endif
 }

 static void cfunc_rsp_llv(void *param)
@ -793,7 +823,7 @@ static void cfunc_rsp_llv(void *param)
 	UINT32 ea = 0;
 	int dest = (op >> 16) & 0x1f;
 	int base = (op >> 21) & 0x1f;
-	int index = (op >> 7) & 0xf;
+	int index = (op >> 7) & 0xc;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 	{
@ -815,6 +845,14 @@ static void cfunc_rsp_llv(void *param)
 		VREG_B(dest, i) = READ8(rsp, ea);
 		ea++;
 	}
+
+	// SSE
+#if USE_SIMD
+	INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
+	INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
+	_mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1));
+	_mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1);
+#endif
 }

 static void cfunc_rsp_ldv(void *param)
@ -824,7 +862,7 @@ static void cfunc_rsp_ldv(void *param)
 	UINT32 ea = 0;
 	int dest = (op >> 16) & 0x1f;
 	int base = (op >> 21) & 0x1f;
-	int index = (op >> 7) & 0xf;
+	int index = (op >> 7) & 0x8;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 	{
@ -846,6 +884,17 @@ static void cfunc_rsp_ldv(void *param)
 		VREG_B(dest, i) = READ8(rsp, ea);
 		ea++;
 	}
+
+#if USE_SIMD
+	INT16 insert_value0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
+	INT16 insert_value1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
+	INT16 insert_value2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5);
+	INT16 insert_value3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7);
+	_mm_insert_epi16 (rsp->xv[dest], insert_value0, (index >> 1));
+	_mm_insert_epi16 (rsp->xv[dest], insert_value1, (index >> 1) + 1);
+	_mm_insert_epi16 (rsp->xv[dest], insert_value2, (index >> 1) + 2);
+	_mm_insert_epi16 (rsp->xv[dest], insert_value3, (index >> 1) + 3);
+#endif
 }

 static void cfunc_rsp_lqv(void *param)
@ -857,7 +906,7 @@ static void cfunc_rsp_lqv(void *param)
 	UINT32 ea = 0;
 	int dest = (op >> 16) & 0x1f;
 	int base = (op >> 21) & 0x1f;
-	int index = (op >> 7) & 0xf;
+	int index = 0; // Just a test, it goes right back the way it was if something breaks //(op >> 7) & 0xf;
 	int offset = (op & 0x7f);
 	if (offset & 0x40)
 	{
@ -880,6 +929,20 @@ static void cfunc_rsp_lqv(void *param)
 		VREG_B(dest, i) = READ8(rsp, ea);
 		ea++;
 	}
+
+	// SSE
+#if USE_SIMD
+	INT16 val0 = READ8(rsp, ea) << 8 | READ8(rsp, ea + 1);
+	INT16 val1 = READ8(rsp, ea + 2) << 8 | READ8(rsp, ea + 3);
+	INT16 val2 = READ8(rsp, ea + 4) << 8 | READ8(rsp, ea + 5);
+	INT16 val3 = READ8(rsp, ea + 6) << 8 | READ8(rsp, ea + 7);
+	INT16 val4 = READ8(rsp, ea + 8) << 8 | READ8(rsp, ea + 9);
+	INT16 val5 = READ8(rsp, ea + 10) << 8 | READ8(rsp, ea + 11);
+	INT16 val6 = READ8(rsp, ea + 12) << 8 | READ8(rsp, ea + 13);
+	INT16 val7 = READ8(rsp, ea + 14) << 8 | READ8(rsp, ea + 15);
+
+	rsp->xv[dest] = _mm_set_epi16(val0, val1, val2, val3, val4, val5, val6, val7);
+#endif
 }

 static void cfunc_rsp_lrv(void *param)