rgbsse: Optimize some sse routines. (nw) (#2700)

* rgbsse: Optimize some sse routines. (nw)
* rgbsse: Create a generic getter instead of having individual color operations. (nw)
* rgbsse: Allow up to 12 bits for scaling factors. (nw)
This commit is contained in:
tedgreen99 2017-10-09 08:56:21 -06:00 committed by GitHub
parent eb628f0296
commit 620f8550dd
7 changed files with 114 additions and 56 deletions

View File

@ -1361,6 +1361,7 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
/* if fog_mult is zero, we subtract the incoming color */ /* if fog_mult is zero, we subtract the incoming color */
if (!FOGMODE_FOG_MULT(fogModeReg)) if (!FOGMODE_FOG_MULT(fogModeReg))
{ {
// Need to check this, manual states 9 bits
fogColorLocal.sub(color); fogColorLocal.sub(color);
//fog.rgb -= color.rgb; //fog.rgb -= color.rgb;
//fr -= (RR); //fr -= (RR);
@ -1423,9 +1424,11 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
//fg = (fg * fogblend) >> 8; //fg = (fg * fogblend) >> 8;
//fb = (fb * fogblend) >> 8; //fb = (fb * fogblend) >> 8;
/* if fog_mult is 0, we add this to the original color */ /* if fog_mult is 0, we add this to the original color */
fogColorLocal.scale_imm_and_clamp((int16_t)fogblend);
if (FOGMODE_FOG_MULT(fogModeReg) == 0) if (FOGMODE_FOG_MULT(fogModeReg) == 0)
{ {
fogColorLocal.scale_imm_add_and_clamp(fogblend, color); fogColorLocal.add(color);
fogColorLocal.clamp_to_uint8();
//color += fog; //color += fog;
//(RR) += fr; //(RR) += fr;
//(GG) += fg; //(GG) += fg;
@ -1435,7 +1438,6 @@ static inline void ATTR_FORCE_INLINE applyFogging(voodoo_device *vd, uint32_t fb
/* otherwise this just becomes the new color */ /* otherwise this just becomes the new color */
else else
{ {
fogColorLocal.scale_imm_and_clamp(fogblend);
//color = fog; //color = fog;
//(RR) = fr; //(RR) = fr;
//(GG) = fg; //(GG) = fg;

View File

@ -770,6 +770,38 @@ void validity_checker::validate_rgb()
rgb.mul_imm_rgba(actual_a, actual_r, actual_g, actual_b); rgb.mul_imm_rgba(actual_a, actual_r, actual_g, actual_b);
check_expected("rgbaint_t::mul_imm_rgba"); check_expected("rgbaint_t::mul_imm_rgba");
// test select alpha element multiplication
expected_a *= actual_a = random_i32();
expected_r *= actual_a;
expected_g *= actual_a;
expected_b *= actual_a;
rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_alpha32());
check_expected("rgbaint_t::mul(select_alpha32)");
// test select red element multiplication
expected_a *= actual_r = random_i32();
expected_r *= actual_r;
expected_g *= actual_r;
expected_b *= actual_r;
rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_red32());
check_expected("rgbaint_t::mul(select_red32)");
// test select green element multiplication
expected_a *= actual_g = random_i32();
expected_r *= actual_g;
expected_g *= actual_g;
expected_b *= actual_g;
rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_green32());
check_expected("rgbaint_t::mul(select_green32)");
// test select blue element multiplication
expected_a *= actual_b = random_i32();
expected_r *= actual_b;
expected_g *= actual_b;
expected_b *= actual_b;
rgb.mul(rgbaint_t(actual_a, actual_r, actual_g, actual_b).select_blue32());
check_expected("rgbaint_t::mul(select_blue32)");
// test RGB and not // test RGB and not
expected_a &= ~(actual_a = random_i32()); expected_a &= ~(actual_a = random_i32());
expected_r &= ~(actual_r = random_i32()); expected_r &= ~(actual_r = random_i32());

View File

@ -77,26 +77,6 @@ void rgbaint_t::scale_and_clamp(const rgbaint_t& scale)
} }
// Scale every channel by an immediate 8.8 fixed-point factor, add the matching
// channel of `other`, and clamp each result into the 0..255 range.
//
// scale: multiplier; the product is shifted right by 8 afterwards
// other: color whose channels are added after scaling
//
// Channels are independent of one another, so each is carried through all
// four steps (scale, sign-extend, add, clamp) before moving to the next.
void rgbaint_t::scale_imm_add_and_clamp(s32 scale, const rgbaint_t& other)
{
	// alpha
	m_a = (m_a * scale) >> 8;
	if (m_a & 0x00800000)
		m_a |= 0xff000000; // propagate bit 23 into the top byte
	m_a += other.m_a;
	if (u32(m_a) > 255)
		m_a = (m_a < 0) ? 0 : 255;

	// red
	m_r = (m_r * scale) >> 8;
	if (m_r & 0x00800000)
		m_r |= 0xff000000;
	m_r += other.m_r;
	if (u32(m_r) > 255)
		m_r = (m_r < 0) ? 0 : 255;

	// green
	m_g = (m_g * scale) >> 8;
	if (m_g & 0x00800000)
		m_g |= 0xff000000;
	m_g += other.m_g;
	if (u32(m_g) > 255)
		m_g = (m_g < 0) ? 0 : 255;

	// blue
	m_b = (m_b * scale) >> 8;
	if (m_b & 0x00800000)
		m_b |= 0xff000000;
	m_b += other.m_b;
	if (u32(m_b) > 255)
		m_b = (m_b < 0) ? 0 : 255;
}
void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other) void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
{ {
m_a = (m_a * scale.m_a) >> 8; m_a = (m_a * scale.m_a) >> 8;

View File

@ -64,6 +64,12 @@ public:
s32 get_g32() const { return m_g; } s32 get_g32() const { return m_g; }
s32 get_b32() const { return m_b; } s32 get_b32() const { return m_b; }
// Element broadcast: each select returns an rgbaint_t with all four fields set to the element chosen (a, r, g, or b)
rgbaint_t select_alpha32() const { const s32 alpha = get_a32(); return rgbaint_t(alpha, alpha, alpha, alpha); }
rgbaint_t select_red32() const { const s32 red = get_r32(); return rgbaint_t(red, red, red, red); }
rgbaint_t select_green32() const { const s32 green = get_g32(); return rgbaint_t(green, green, green, green); }
rgbaint_t select_blue32() const { const s32 blue = get_b32(); return rgbaint_t(blue, blue, blue, blue); }
inline void add(const rgbaint_t& color) inline void add(const rgbaint_t& color)
{ {
add_imm_rgba(color.m_a, color.m_r, color.m_g, color.m_b); add_imm_rgba(color.m_a, color.m_r, color.m_g, color.m_b);
@ -304,7 +310,6 @@ public:
void scale_imm_and_clamp(const s32 scale); void scale_imm_and_clamp(const s32 scale);
void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2); void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2);
void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other); void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other);
void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other);
void cmpeq(const rgbaint_t& value) { cmpeq_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); } void cmpeq(const rgbaint_t& value) { cmpeq_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); }
void cmpgt(const rgbaint_t& value) { cmpgt_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); } void cmpgt(const rgbaint_t& value) { cmpgt_imm_rgba(value.m_a, value.m_r, value.m_g, value.m_b); }

View File

@ -67,7 +67,7 @@ public:
u8 get_a() const { return u8(unsigned(_mm_extract_epi16(m_value, 6))); } u8 get_a() const { return u8(unsigned(_mm_extract_epi16(m_value, 6))); }
u8 get_r() const { return u8(unsigned(_mm_extract_epi16(m_value, 4))); } u8 get_r() const { return u8(unsigned(_mm_extract_epi16(m_value, 4))); }
u8 get_g() const { return u8(unsigned(_mm_extract_epi16(m_value, 2))); } u8 get_g() const { return u8(unsigned(_mm_extract_epi16(m_value, 2))); }
u8 get_b() const { return u8(unsigned(_mm_extract_epi16(m_value, 0))); } u8 get_b() const { return u8(unsigned(_mm_cvtsi128_si32(m_value))); }
#ifdef __SSE4_1__ #ifdef __SSE4_1__
s32 get_a32() const { return _mm_extract_epi32(m_value, 3); } s32 get_a32() const { return _mm_extract_epi32(m_value, 3); }
@ -75,12 +75,18 @@ public:
s32 get_g32() const { return _mm_extract_epi32(m_value, 1); } s32 get_g32() const { return _mm_extract_epi32(m_value, 1); }
s32 get_b32() const { return _mm_extract_epi32(m_value, 0); } s32 get_b32() const { return _mm_extract_epi32(m_value, 0); }
#else #else
s32 get_a32() const { return (_mm_extract_epi16(m_value, 7) << 16) | _mm_extract_epi16(m_value, 6); } s32 get_a32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 3)))); }
s32 get_r32() const { return (_mm_extract_epi16(m_value, 5) << 16) | _mm_extract_epi16(m_value, 4); } s32 get_r32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 2)))); }
s32 get_g32() const { return (_mm_extract_epi16(m_value, 3) << 16) | _mm_extract_epi16(m_value, 2); } s32 get_g32() const { return (_mm_cvtsi128_si32(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 1)))); }
s32 get_b32() const { return (_mm_extract_epi16(m_value, 1) << 16) | _mm_extract_epi16(m_value, 0); } s32 get_b32() const { return (_mm_cvtsi128_si32(m_value)); }
#endif #endif
// Element broadcast: each select returns an rgbaint_t with all four 32-bit lanes set to the element chosen (a, r, g, or b).
// The same lane index is given for every destination slot of the shuffle (3 = a, 2 = r, 1 = g, 0 = b).
rgbaint_t select_alpha32() const { return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(3, 3, 3, 3))); }
rgbaint_t select_red32() const { return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(2, 2, 2, 2))); }
rgbaint_t select_green32() const { return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(1, 1, 1, 1))); }
rgbaint_t select_blue32() const { return rgbaint_t(_mm_shuffle_epi32(m_value, _MM_SHUFFLE(0, 0, 0, 0))); }
inline void add(const rgbaint_t& color2) inline void add(const rgbaint_t& color2)
{ {
m_value = _mm_add_epi32(m_value, color2.m_value); m_value = _mm_add_epi32(m_value, color2.m_value);
@ -283,37 +289,71 @@ public:
void scale_and_clamp(const rgbaint_t& scale); void scale_and_clamp(const rgbaint_t& scale);
inline void scale_imm_and_clamp(const s32 scale) // Leave this here in case Model3 blows up...
//inline void scale_imm_and_clamp(const s32 scale)
//{
// mul_imm(scale);
// sra_imm(8);
// clamp_to_uint8();
//}
// This version needs values to be 12 bits or less
inline void scale_imm_and_clamp(const s16 scale)
{ {
mul_imm(scale); // Set mult a 16 bit inputs to scale
sra_imm(8); __m128i immv = _mm_set1_epi16(scale);
clamp_to_uint8(); // Shift up by 4
} immv = _mm_slli_epi16(immv, 4);
// Pack color into mult b 16 bit inputs
inline void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other) m_value = _mm_packs_epi32(m_value, _mm_setzero_si128());
{ // Shift up by 4
mul_imm(scale); m_value = _mm_slli_epi16(m_value, 4);
sra_imm(8); // Do the 16 bit multiply, bottom 64 bits will contain 16 bit truncated results
add(other); m_value = _mm_mulhi_epi16(m_value, immv);
clamp_to_uint8(); // Clamp to u8
m_value = _mm_packus_epi16(m_value, _mm_setzero_si128());
// Unpack up to s32
m_value = _mm_unpacklo_epi8(m_value, _mm_setzero_si128());
m_value = _mm_unpacklo_epi16(m_value, _mm_setzero_si128());
} }
// This function needs values to be 12 bits or less
inline void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other) inline void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
{ {
mul(scale); // Pack scale into mult a 16 bits
sra_imm(8); __m128i tmp1 = _mm_packs_epi32(scale.m_value, _mm_setzero_si128());
// Shift up by 4
tmp1 = _mm_slli_epi16(tmp1, 4);
// Pack color into mult b 16 bit inputs
m_value = _mm_packs_epi32(m_value, _mm_setzero_si128());
// Shift up by 4
m_value = _mm_slli_epi16(m_value, 4);
// Do the 16 bit multiply, bottom 64 bits will contain 16 bit truncated results
m_value = _mm_mulhi_epi16(m_value, tmp1);
// Unpack into 32 bit values
m_value = _mm_unpacklo_epi16(m_value, _mm_setzero_si128());
add(other); add(other);
clamp_to_uint8(); clamp_to_uint8();
} }
// This function needs values to be 12 bits or less
inline void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2) inline void scale2_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2)
{ {
rgbaint_t color2(other); // Pack both scale values into mult a 16 bits
color2.mul(scale2); __m128i tmp1 = _mm_packs_epi32(scale.m_value, scale2.m_value);
// Shift up by 4
mul(scale); tmp1 = _mm_slli_epi16(tmp1, 4);
add(color2); // Pack both color values into mult b 16 bit inputs
sra_imm(8); m_value = _mm_packs_epi32(m_value, other.m_value);
// Shift up by 4
m_value = _mm_slli_epi16(m_value, 4);
// Do the 16 bit multiply, top and bottom 64 bits will contain 16 bit truncated results
tmp1 = _mm_mulhi_epi16(m_value, tmp1);
// Unpack the results
m_value = _mm_unpacklo_epi16(tmp1, _mm_setzero_si128());
tmp1 = _mm_unpackhi_epi16(tmp1, _mm_setzero_si128());
// Add the results
m_value = _mm_add_epi32(m_value, tmp1);
clamp_to_uint8(); clamp_to_uint8();
} }

View File

@ -14,6 +14,7 @@
// use SSE on 64-bit implementations, where it can be assumed // use SSE on 64-bit implementations, where it can be assumed
#if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64) #if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64)
#include "rgbsse.h" #include "rgbsse.h"
#elif defined(__ALTIVEC__) #elif defined(__ALTIVEC__)
#include "rgbvmx.h" #include "rgbvmx.h"

View File

@ -205,6 +205,12 @@ public:
return result; return result;
} }
// Element broadcast: each select returns an rgbaint_t with all four fields set to the element chosen (a, r, g, or b)
rgbaint_t select_alpha32() const { const auto alpha = get_a32(); return rgbaint_t(alpha, alpha, alpha, alpha); }
rgbaint_t select_red32() const { const auto red = get_r32(); return rgbaint_t(red, red, red, red); }
rgbaint_t select_green32() const { const auto green = get_g32(); return rgbaint_t(green, green, green, green); }
rgbaint_t select_blue32() const { const auto blue = get_b32(); return rgbaint_t(blue, blue, blue, blue); }
inline void add(const rgbaint_t& color2) inline void add(const rgbaint_t& color2)
{ {
m_value = vec_add(m_value, color2.m_value); m_value = vec_add(m_value, color2.m_value);
@ -460,14 +466,6 @@ public:
void scale_and_clamp(const rgbaint_t& scale); void scale_and_clamp(const rgbaint_t& scale);
void scale_imm_and_clamp(const s32 scale); void scale_imm_and_clamp(const s32 scale);
// Scales this color by an immediate factor, adds another color, then clamps.
// Runs four in-place steps in order: mul_imm(scale), sra_imm(8), add(other),
// clamp_to_uint8().
// NOTE(review): sra_imm(8) is presumably an arithmetic shift right by 8, which
// would make `scale` a fixed-point factor with 8 fractional bits -- confirm
// against the sra_imm definition.
void scale_imm_add_and_clamp(const s32 scale, const rgbaint_t& other)
{
// multiply by the immediate scale factor
mul_imm(scale);
// drop the low 8 bits of the product
sra_imm(8);
// add the second color after scaling, before the clamp
add(other);
// final clamp of the summed result
clamp_to_uint8();
}
void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other) void scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
{ {
mul(scale); mul(scale);