rgbsse: Optimize some sse routines. (nw) (#2700)

* rgbsse: Optimize some sse routines. (nw) * rgbsse: Create a generic getter instead of having individual color operation. (nw) * rgbsse: Allow up to 12 bits for scaling factors. (nw)
2025-10-07 09:25:34 +03:00 · 2017-10-09 08:56:21 -06:00 · 2017-10-09 08:56:21 -06:00 · 620f8550dd
commit 620f8550dd
parent eb628f0296
7 changed files with 114 additions and 56 deletions
--- a/src/devices/video/vooddefs.h
+++ b/src/devices/video/vooddefs.h
@ -1361,6 +1361,7 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
 			/* if fog_mult is zero, we subtract the incoming color */
 			if (!FOGMODE_FOG_MULT(fogModeReg))
 			{
 				// Need to check this, manual states 9 bits
 				fogColorLocal.sub(color);
 				//fog.rgb -= color.rgb;
 				//fr -= (RR);
@ -1423,9 +1424,11 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
 			//fg = (fg * fogblend) >> 8;
 			//fb = (fb * fogblend) >> 8;
 			/* if fog_mult is 0, we add this to the original color */
 			fogColorLocal.scale_imm_and_clamp((int16_t)fogblend);
 			if (FOGMODE_FOG_MULT(fogModeReg) == 0)
 			{
-				fogColorLocal.scale_imm_add_and_clamp(fogblend, color);
+				fogColorLocal.add(color);
 				fogColorLocal.clamp_to_uint8();
 				//color += fog;
 				//(RR) += fr;
 				//(GG) += fg;
@ -1435,7 +1438,6 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
 			/* otherwise this just becomes the new color */
 			else
 			{
 				fogColorLocal.scale_imm_and_clamp(fogblend);
 				//color = fog;
 				//(RR) = fr;
 				//(GG) = fg;
--- a/src/emu/validity.cpp
+++ b/src/emu/validity.cpp
@ -770,6 +770,38 @@ void validity_checker::validate_rgb()
 	rgb.mul_imm_rgba(actual_a, actual_r, actual_g, actual_b);
 	check_expected("rgbaint_t::mul_imm_rgba");
 	// test select alpha element multiplication
 	expected_a *= actual_a = random_i32();
 	expected_r *= actual_a;
 	expected_g *= actual_a;
 	expected_b *= actual_a;
 	rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_alpha32());
 	check_expected("rgbaint_t::mul(select_alpha32)");
 	// test select red element multiplication
 	expected_a *= actual_r = random_i32();
 	expected_r *= actual_r;
 	expected_g *= actual_r;
 	expected_b *= actual_r;
 	rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_red32());
 	check_expected("rgbaint_t::mul(select_red32)");
 	// test select green element multiplication
 	expected_a *= actual_g = random_i32();
 	expected_r *= actual_g;
 	expected_g *= actual_g;
 	expected_b *= actual_g;
 	rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_green32());
 	check_expected("rgbaint_t::mul(select_green32)");
 	// test select blue element multiplication
 	expected_a *= actual_b = random_i32();
 	expected_r *= actual_b;
 	expected_g *= actual_b;
 	expected_b *= actual_b;
 	rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_blue32());
 	check_expected("rgbaint_t::mul(select_blue32)");
 	// test RGB and not
 	expected_a &= ~(actual_a = random_i32());
 	expected_r &= ~(actual_r = random_i32());
--- a/src/emu/video/rgbgen.cpp
+++ b/src/emu/video/rgbgen.cpp
@ -77,26 +77,6 @@ void rgbaint_t::scale_and_clamp(const rgbaint_t& scale)
 }
 // For each of the four channels: multiply by `scale`, shift right by 8,
 // extend bit 23 of the result into the top byte, add the matching channel of
 // `other`, and finally clamp the sum into the 0..255 range.
 void rgbaint_t::scale_imm_add_and_clamp(s32 scale, const rgbaint_t& other)
 {
 	// The addend is bound by reference so that it is read only after the
 	// channel has been scaled, exactly as in a straight-line implementation
 	// (this matters if `other` refers to this very object).
 	const auto scale_add_clamp = [scale] (s32 &chan, const s32 &addend)
 	{
 		chan = (chan * scale) >> 8;
 		// propagate bit 23 into bits 24..31
 		chan |= (chan & 0x00800000) ? 0xff000000 : 0;
 		chan += addend;
 		// a single unsigned compare catches both "negative" and "> 255"
 		if (u32(chan) > 255)
 			chan = (chan < 0) ? 0 : 255;
 	};

 	scale_add_clamp(m_a, other.m_a);
 	scale_add_clamp(m_r, other.m_r);
 	scale_add_clamp(m_g, other.m_g);
 	scale_add_clamp(m_b, other.m_b);
 }
 void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
 {
 	m_a = (m_a * scale.m_a) >> 8;
--- a/src/emu/video/rgbgen.h
+++ b/src/emu/video/rgbgen.h
@ -64,6 +64,12 @@ public:
 	s32 get_g32() const { return m_g; }
 	s32 get_b32() const { return m_b; }
 	// These selects return an rgbaint_t with all fields set to the element chosen (a, r, g, or b)
 	// Returns a new rgbaint_t whose four fields all hold this object's alpha value.
 	rgbaint_t select_alpha32() const
 	{
 		const s32 alpha = get_a32();
 		return rgbaint_t(alpha, alpha, alpha, alpha);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's red value.
 	rgbaint_t select_red32() const
 	{
 		const s32 red = get_r32();
 		return rgbaint_t(red, red, red, red);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's green value.
 	rgbaint_t select_green32() const
 	{
 		const s32 green = get_g32();
 		return rgbaint_t(green, green, green, green);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's blue value.
 	rgbaint_t select_blue32() const
 	{
 		const s32 blue = get_b32();
 		return rgbaint_t(blue, blue, blue, blue);
 	}
 	inline void add(const rgbaint_t& color)
 	{
 		add_imm_rgba(color.m_a, color.m_r, color.m_g, color.m_b);
@ -304,7 +310,6 @@ public:
 	void scale_imm_and_clamp(const s32 scale);
 	void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2);
 	void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other);
 	void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other);
 	void cmpeq(const rgbaint_t& value) { cmpeq_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); }
 	void cmpgt(const rgbaint_t& value) { cmpgt_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); }
--- a/src/emu/video/rgbsse.h
+++ b/src/emu/video/rgbsse.h
@ -67,7 +67,7 @@ public:
 	// 8-bit channel getters. Alpha, red and green read 16-bit words 6, 4 and 2
 	// of m_value (the low word of 32-bit lanes 3, 2 and 1) and truncate to u8.
 	// Blue lives in lane 0; the replacement line reads the low 32 bits with
 	// _mm_cvtsi128_si32 instead of extracting word 0, and truncates to u8 as before.
 	u8 get_a() const { return u8(unsigned(_mm_extract_epi16(m_value, 6))); }
 	u8 get_r() const { return u8(unsigned(_mm_extract_epi16(m_value, 4))); }
 	u8 get_g() const { return u8(unsigned(_mm_extract_epi16(m_value, 2))); }
-	u8 get_b() const { return u8(unsigned(_mm_extract_epi16(m_value, 0))); }
+	u8 get_b() const { return u8(unsigned(_mm_cvtsi128_si32(m_value))); }
 #ifdef __SSE4_1__
 	s32 get_a32() const { return _mm_extract_epi32(m_value, 3); }
@ -75,12 +75,18 @@ public:
 	s32 get_g32() const { return _mm_extract_epi32(m_value, 1); }
 	s32 get_b32() const { return _mm_extract_epi32(m_value, 0); }
 #else
-	s32 get_a32() const { return (_mm_extract_epi16(m_value, 7) << 16) | _mm_extract_epi16(m_value, 6); }
+	s32 get_a32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 3)))); }
-	s32 get_r32() const { return (_mm_extract_epi16(m_value, 5) << 16) | _mm_extract_epi16(m_value, 4); }
+	s32 get_r32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 2)))); }
-	s32 get_g32() const { return (_mm_extract_epi16(m_value, 3) << 16) | _mm_extract_epi16(m_value, 2); }
+	s32 get_g32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 1)))); }
-	s32 get_b32() const { return (_mm_extract_epi16(m_value, 1) << 16) | _mm_extract_epi16(m_value, 0); }
+	s32 get_b32() const { return (_mm_cvtsi128_si32(m_value)); }
 #endif
 	// These selects return an rgbaint_t with all fields set to the element chosen (a, r, g, or b)
 	// Broadcast 32-bit lane 3 (alpha) of m_value into all four lanes.
 	rgbaint_t select_alpha32() const
 	{
 		return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(3, 3, 3, 3)));
 	}
 	// Broadcast 32-bit lane 2 (red) of m_value into all four lanes.
 	rgbaint_t select_red32() const
 	{
 		return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(2, 2, 2, 2)));
 	}
 	// Broadcast 32-bit lane 1 (green) of m_value into all four lanes.
 	rgbaint_t select_green32() const
 	{
 		return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(1, 1, 1, 1)));
 	}
 	// Broadcast 32-bit lane 0 (blue) of m_value into all four lanes.
 	rgbaint_t select_blue32() const
 	{
 		return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 0)));
 	}
 	inline void add(const rgbaint_t& color2)
 	{
 		m_value = _mm_add_epi32(m_value, color2.m_value);
@ -283,37 +289,71 @@ public:
 	void scale_and_clamp(const rgbaint_t& scale);
-	inline void scale_imm_and_clamp(const s32 scale)
+	// Leave this here in case Model3 blows up...
 	//inline void scale_imm_and_clamp(const s32 scale)
 	//{
 	//	mul_imm(scale);
 	//	sra_imm(8);
 	//	clamp_to_uint8();
 	//}
 	// This version needs values to be 12 bits or less
 	// Multiplies every channel by `scale`, drops the low 8 bits of the product
 	// and saturates the result to 0..255, leaving each channel as a 32-bit value.
 	//
 	// The replacement ('+') implementation works in 16-bit lanes: the scale and
 	// the saturating-packed channels are each shifted left by 4, so the high
 	// half returned by _mm_mulhi_epi16 equals (channel * scale) >> 8.  Because
 	// each operand is shifted left by 4 inside a 16-bit lane, inputs that do not
 	// fit in 12 signed bits lose their top bits.  Note that `scale` is now
 	// received as s16, so a wider argument is narrowed at the call.
 	//
 	// The removed ('-') lines are the previous mul_imm/sra_imm/clamp_to_uint8
 	// body and the deleted scale_imm_add_and_clamp helper.
 	inline void scale_imm_and_clamp(const s16 scale)
 	{
-		mul_imm(scale);
+		// Set mult a 16 bit inputs to scale
-		sra_imm(8);
+		__m128i immv = _mm_set1_epi16(scale);
-		clamp_to_uint8();
+		// Shift up by 4
-	}
+		immv = _mm_slli_epi16(immv, 4);
-
+		// Pack color into mult b 16 bit inputs
-	inline void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other)
+		m_value = _mm_packs_epi32(m_value, _mm_setzero_si128());
-	{
+		// Shift up by 4
-		mul_imm(scale);
+		m_value = _mm_slli_epi16(m_value, 4);
-		sra_imm(8);
+		// Do the 16 bit multiply, bottom 64 bits will contain 16 bit truncated results
-		add(other);
+		m_value = _mm_mulhi_epi16(m_value, immv);
-		clamp_to_uint8();
+		// Clamp to u8
 		// (unsigned saturation: negative products become 0, products above 255 become 255)
 		m_value = _mm_packus_epi16(m_value, _mm_setzero_si128());
 		// Unpack up to s32
 		// (zero-extension is safe here because every byte is already in 0..255)
 		m_value = _mm_unpacklo_epi8(m_value, _mm_setzero_si128());
 		m_value = _mm_unpacklo_epi16(m_value, _mm_setzero_si128());
 	}
 	// This function needs values to be 12 bits or less
 	// Per channel: ((channel * scale) >> 8) + other, then clamp to 0..255.
 	//
 	// The multiply runs in 16-bit lanes: both operands are saturating-packed to
 	// s16 and shifted left by 4, so the high half of the 32-bit product returned
 	// by _mm_mulhi_epi16 equals (channel * scale) >> 8.  Operands must therefore
 	// fit in 12 signed bits, otherwise the left shift discards their top bits.
 	//
 	// The 16-bit product is signed, so it has to be sign-extended when it is
 	// widened back to 32 bits.  Interleaving with zero would turn a negative
 	// product such as -1 into 65535, which would then clamp to 255 instead of
 	// being added to `other` as a negative contribution.
 	inline void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
 	{
-		mul(scale);
+		// Pack scale into mult a 16 bits
-		sra_imm(8);
+		__m128i tmp1 = _mm_packs_epi32(scale.m_value, _mm_setzero_si128());
 		// Shift up by 4
 		tmp1 = _mm_slli_epi16(tmp1, 4);
 		// Pack color into mult b 16 bit inputs
 		m_value = _mm_packs_epi32(m_value, _mm_setzero_si128());
 		// Shift up by 4
 		m_value = _mm_slli_epi16(m_value, 4);
 		// Do the 16 bit multiply, bottom 64 bits will contain 16 bit truncated results
 		m_value = _mm_mulhi_epi16(m_value, tmp1);
 		// Unpack into 32 bit values, using the replicated sign bit as the high half (sign extension)
 		m_value = _mm_unpacklo_epi16(m_value, _mm_srai_epi16(m_value, 15));
 		add(other);
 		clamp_to_uint8();
 	}
 	// This function needs values to be 12 bits or less
 	// Per channel: ((channel * scale) >> 8) + ((other * scale2) >> 8), then clamp
 	// to 0..255.
 	//
 	// Both multiplies are done at once in 16-bit lanes: the low 64 bits carry
 	// this colour and `scale`, the high 64 bits carry `other` and `scale2`.
 	// Every operand is saturating-packed to s16 and shifted left by 4, so the
 	// high half of each product from _mm_mulhi_epi16 equals (a * b) >> 8.
 	// Operands must fit in 12 signed bits or the left shift discards top bits.
 	//
 	// The 16-bit products are signed and must be sign-extended when widened to
 	// 32 bits.  Interleaving with zero would turn a negative product into a
 	// large positive value and make the final sum clamp to 255.
 	inline void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2)
 	{
-		rgbaint_t color2(other);
+		// Pack both scale values into mult a 16 bits
-		color2.mul(scale2);
+		__m128i tmp1 = _mm_packs_epi32(scale.m_value, scale2.m_value);
-
+		// Shift up by 4
-		mul(scale);
+		tmp1 = _mm_slli_epi16(tmp1, 4);
-		add(color2);
+		// Pack both color values into mult b 16 bit inputs
-		sra_imm(8);
+		m_value = _mm_packs_epi32(m_value, other.m_value);
 		// Shift up by 4
 		m_value = _mm_slli_epi16(m_value, 4);
 		// Do the 16 bit multiply, top and bottom 64 bits will contain 16 bit truncated results
 		tmp1 = _mm_mulhi_epi16(m_value, tmp1);
 		// Replicate each product's sign bit across its lane to use as the high half
 		const __m128i sign = _mm_srai_epi16(tmp1, 15);
 		// Unpack the results (sign-extended to 32 bits)
 		m_value = _mm_unpacklo_epi16(tmp1, sign);
 		tmp1 = _mm_unpackhi_epi16(tmp1, sign);
 		// Add the results
 		m_value = _mm_add_epi32(m_value, tmp1);
 		clamp_to_uint8();
 	}
--- a/src/emu/video/rgbutil.h
+++ b/src/emu/video/rgbutil.h
@ -14,6 +14,7 @@
 // use SSE on 64-bit implementations, where it can be assumed
 #if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64)
 #include "rgbsse.h"
 #elif defined(__ALTIVEC__)
 #include "rgbvmx.h"
--- a/src/emu/video/rgbvmx.h
+++ b/src/emu/video/rgbvmx.h
@ -205,6 +205,12 @@ public:
 		return result;
 	}
 	// These selects return an rgbaint_t with all fields set to the element chosen (a, r, g, or b)
 	// Returns a new rgbaint_t whose four fields all hold this object's alpha value.
 	rgbaint_t select_alpha32() const
 	{
 		const auto alpha = get_a32();
 		return rgbaint_t(alpha, alpha, alpha, alpha);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's red value.
 	rgbaint_t select_red32() const
 	{
 		const auto red = get_r32();
 		return rgbaint_t(red, red, red, red);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's green value.
 	rgbaint_t select_green32() const
 	{
 		const auto green = get_g32();
 		return rgbaint_t(green, green, green, green);
 	}
 	// Returns a new rgbaint_t whose four fields all hold this object's blue value.
 	rgbaint_t select_blue32() const
 	{
 		const auto blue = get_b32();
 		return rgbaint_t(blue, blue, blue, blue);
 	}
 	inline void add(const rgbaint_t& color2)
 	{
 		m_value = vec_add(m_value, color2.m_value);
@ -460,14 +466,6 @@ public:
 	void scale_and_clamp(const rgbaint_t& scale);
 	void scale_imm_and_clamp(const s32 scale);
 	// Runs four in-place steps on this colour, in this exact order:
 	// mul_imm(scale), sra_imm(8), add(other), clamp_to_uint8().
 	// NOTE(review): presumably this yields clamp(((channel * scale) >> 8) + other)
 	// per channel; confirm against the helper definitions, which are not visible here.
 	void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other)
 	{
 		mul_imm(scale);
 		sra_imm(8);
 		// the addend is applied after the shift, so `other` is not scaled
 		add(other);
 		clamp_to_uint8();
 	}
 	void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
 	{
 		mul(scale);