rdp-sse (nw) Proper fix for bilinear_filter

This commit is contained in:
therealmogminer@gmail.com 2015-06-23 20:58:21 +02:00
parent 0dcf906e4a
commit 1bef19e984
7 changed files with 91 additions and 84 deletions

View File

@ -109,26 +109,4 @@ void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& oth
if ((UINT16)m_b > 255) { m_b = (m_b < 0) ? 0 : 255; }
}
/*-------------------------------------------------
    bilinear_filter - bilinear filter between
    four pixel values; this code is derived from
    code provided by Michael Herf

    rgb00, rgb01 - the two pixels of the first row
    rgb10, rgb11 - the two pixels of the second row
    u            - 8-bit weight (in 1/256 steps)
                   moving from rgbX0 toward rgbX1
    v            - 8-bit weight (in 1/256 steps)
                   moving from the first row
                   toward the second row

    Returns the blended pixel with its four 8-bit
    channels packed in the same byte positions as
    the inputs.
-------------------------------------------------*/
UINT32 rgbaint_t::bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
	// Two channels are interpolated per multiply. Each one sits in the low
	// byte of a 16-bit half (mask 0x00ff00ff), which leaves 8 bits of
	// headroom above it for the product with the 8-bit weight.
	UINT32 rb0 = (rgb00 & 0x00ff00ff) + ((((rgb01 & 0x00ff00ff) - (rgb00 & 0x00ff00ff)) * u) >> 8);
	UINT32 rb1 = (rgb10 & 0x00ff00ff) + ((((rgb11 & 0x00ff00ff) - (rgb10 & 0x00ff00ff)) * u) >> 8);

	// The other two channels live in bytes 1 and 3. Masking them in place
	// with 0xff00ff00 leaves no headroom: the product of the top byte and the
	// weight runs past bit 31 and is lost. Shift them down into the same lane
	// positions as above before interpolating.
	const UINT32 hi00 = rgb00 >> 8;
	const UINT32 hi01 = rgb01 >> 8;
	const UINT32 hi10 = rgb10 >> 8;
	const UINT32 hi11 = rgb11 >> 8;
	UINT32 ag0 = (hi00 & 0x00ff00ff) + ((((hi01 & 0x00ff00ff) - (hi00 & 0x00ff00ff)) * u) >> 8);
	UINT32 ag1 = (hi10 & 0x00ff00ff) + ((((hi11 & 0x00ff00ff) - (hi10 & 0x00ff00ff)) * u) >> 8);

	// Blend the two row results with v; the masks discard whatever the
	// previous step left in the headroom bytes.
	rb0 = (rb0 & 0x00ff00ff) + ((((rb1 & 0x00ff00ff) - (rb0 & 0x00ff00ff)) * v) >> 8);
	ag0 = (ag0 & 0x00ff00ff) + ((((ag1 & 0x00ff00ff) - (ag0 & 0x00ff00ff)) * v) >> 8);

	// Move the shifted pair back up to bytes 1 and 3 and merge both pairs.
	return ((ag0 << 8) & 0xff00ff00) | (rb0 & 0x00ff00ff);
}
#endif // !defined(__ALTIVEC__)

View File

@ -20,6 +20,7 @@ class rgbaint_t
{
public:
// Default constructor: the body is empty, so nothing is assigned here.
inline rgbaint_t() { }
// Construct from a single 32-bit value by forwarding it to set().
inline rgbaint_t(UINT32 rgba) { set(rgba); }
// Construct from four separate channel values by forwarding them to set() in a, r, g, b order.
inline rgbaint_t(UINT32 a, UINT32 r, UINT32 g, UINT32 b) { set(a, r, g, b); }
// Construct from an rgb_t by forwarding it to set(); the argument is taken by non-const reference.
inline rgbaint_t(rgb_t& rgba) { set(rgba); }
@ -454,7 +455,24 @@ public:
return *this;
}
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v);
// Bilinear blend of four packed 32-bit pixels (technique after Michael Herf).
// rgb00/rgb01 are blended with weight u, rgb10/rgb11 likewise, and the two
// intermediate results are then blended with weight v. Weights are 8-bit
// fractions in 1/256 steps. Two channels are processed per multiply by
// keeping each one in the low byte of a 16-bit half.
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
	const UINT32 lane_mask = 0x00ff00ff;

	// bytes 0 and 2 of every input are already in lane position
	const UINT32 rb00 = rgb00 & lane_mask;
	const UINT32 rb01 = rgb01 & lane_mask;
	const UINT32 rb10 = rgb10 & lane_mask;
	const UINT32 rb11 = rgb11 & lane_mask;

	// bytes 1 and 3 are shifted down so they get the same 8 bits of headroom
	const UINT32 ag00 = (rgb00 >> 8) & lane_mask;
	const UINT32 ag01 = (rgb01 >> 8) & lane_mask;
	const UINT32 ag10 = (rgb10 >> 8) & lane_mask;
	const UINT32 ag11 = (rgb11 >> 8) & lane_mask;

	// first pass: blend within each row using u
	const UINT32 rb_row0 = rb00 + (((rb01 - rb00) * u) >> 8);
	const UINT32 rb_row1 = rb10 + (((rb11 - rb10) * u) >> 8);
	const UINT32 ag_row0 = ag00 + (((ag01 - ag00) * u) >> 8);
	const UINT32 ag_row1 = ag10 + (((ag11 - ag10) * u) >> 8);

	// second pass: blend the two rows using v, masking off headroom residue first
	const UINT32 rb_base = rb_row0 & lane_mask;
	const UINT32 ag_base = ag_row0 & lane_mask;
	const UINT32 rb_out = rb_base + ((((rb_row1 & lane_mask) - rb_base) * v) >> 8);
	const UINT32 ag_out = ag_base + ((((ag_row1 & lane_mask) - ag_base) * v) >> 8);

	// restore the shifted pair to bytes 1 and 3 and combine
	return ((ag_out << 8) & 0xff00ff00) | (rb_out & lane_mask);
}
protected:
UINT32 m_a;

View File

@ -14,7 +14,7 @@
#include "emu.h"
#include <emmintrin.h>
#include "rgbutil.h"
#include "rgbsse.h"
/***************************************************************************
HIGHER LEVEL OPERATIONS
@ -32,6 +32,7 @@ void rgbaint_t::scale_and_clamp(const rgbaint_t& scale)
mul(scale);
shr(8);
min(255);
max(0);
}
void rgbaint_t::scale_imm_and_clamp(const INT32 scale)
@ -39,11 +40,11 @@ void rgbaint_t::scale_imm_and_clamp(const INT32 scale)
mul_imm(scale);
shr(8);
min(255);
max(0);
}
void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other, const rgbaint_t& scale2)
{
mul(scale);
rgbaint_t color2(other);
color2.mul(scale2);
@ -51,6 +52,7 @@ void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& oth
add(color2);
shr(8);
min(255);
max(0);
}
void rgbaint_t::scale_imm_add_and_clamp(const INT32 scale, const rgbaint_t& other)
@ -59,6 +61,7 @@ void rgbaint_t::scale_imm_add_and_clamp(const INT32 scale, const rgbaint_t& othe
add(other);
shr(8);
min(255);
max(0);
}
void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& other)
@ -67,30 +70,7 @@ void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& oth
add(other);
shr(8);
min(255);
}
UINT32 rgbaint_t::bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
__m128i color00 = _mm_cvtsi32_si128(rgb00);
__m128i color01 = _mm_cvtsi32_si128(rgb01);
__m128i color10 = _mm_cvtsi32_si128(rgb10);
__m128i color11 = _mm_cvtsi32_si128(rgb11);
/* interleave color01 and color00 at the byte level */
color01 = _mm_unpacklo_epi8(color01, color00);
color11 = _mm_unpacklo_epi8(color11, color10);
color01 = _mm_unpacklo_epi8(color01, _mm_setzero_si128());
color11 = _mm_unpacklo_epi8(color11, _mm_setzero_si128());
color01 = _mm_madd_epi16(color01, *(__m128i *)&rgbsse_statics.scale_table[u][0]);
color11 = _mm_madd_epi16(color11, *(__m128i *)&rgbsse_statics.scale_table[u][0]);
color01 = _mm_slli_epi32(color01, 15);
color11 = _mm_srli_epi32(color11, 1);
color01 = _mm_max_epi16(color01, color11);
color01 = _mm_madd_epi16(color01, *(__m128i *)&rgbsse_statics.scale_table[v][0]);
color01 = _mm_srli_epi32(color01, 15);
color01 = _mm_packs_epi32(color01, color01);
color01 = _mm_packus_epi16(color01, color01);
return _mm_cvtsi128_si32(color01);
max(0);
}
#endif // defined(__SSE2__) || defined(_MSC_VER)

View File

@ -49,8 +49,12 @@ public:
// Converts the four 32-bit lanes of m_value into one packed 32-bit value
// returned as an rgb_t.
inline rgb_t to_rgba()
{
// Keep only the low byte of every 32-bit lane.
__m128i anded = _mm_and_si128(m_value, _mm_set1_epi32(0x000000ff));
// Narrow 32->16 bits (signed saturation), then 16->8 bits (unsigned
// saturation); the low 32 bits of the result carry one byte per lane.
return _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(anded, anded), _mm_setzero_si128()));
// NOTE(review): a second return follows the first, so as written only the
// statement above can execute. This variant packs m_value directly (relying
// on the saturating packs instead of the mask) -- this looks like two
// revisions of the same line shown together; confirm which one is intended.
return _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(m_value, _mm_setzero_si128()), _mm_setzero_si128()));
}
// Packs the four 32-bit lanes of m_value into a single 32-bit word, one byte
// per lane. The two narrowing steps saturate, so each lane ends up clamped to
// the 0..255 range.
inline UINT32 to_argb8()
{
	const __m128i zero = _mm_setzero_si128();
	// 32 -> 16 bits with signed saturation; upper half filled from zero
	const __m128i as_words = _mm_packs_epi32(m_value, zero);
	// 16 -> 8 bits with unsigned saturation; upper half filled from zero
	const __m128i as_bytes = _mm_packus_epi16(as_words, zero);
	// the four result bytes now sit in the low 32 bits
	return _mm_cvtsi128_si32(as_bytes);
}
inline rgb_t to_rgba_clamp()
@ -296,6 +300,13 @@ public:
m_value = _mm_or_si128(_mm_and_si128(val, mask), _mm_and_si128(m_value, _mm_xor_si128(mask, _mm_set1_epi32(0xffffffff))));
}
// Raises every 32-bit lane of m_value that is below `value` up to `value`;
// lanes that are already at or above it are left untouched. The comparison
// is the signed 32-bit one provided by _mm_cmplt_epi32.
inline void max(const UINT32 value)
{
	const __m128i floor_val = _mm_set1_epi32(value);
	// all-ones in each lane where the current contents are below the floor
	const __m128i below = _mm_cmplt_epi32(m_value, floor_val);
	// take the floor where flagged ...
	const __m128i replaced = _mm_and_si128(floor_val, below);
	// ... and the existing contents everywhere else (~below & m_value)
	const __m128i kept = _mm_andnot_si128(below, m_value);
	m_value = _mm_or_si128(replaced, kept);
}
void blend(const rgbaint_t& other, UINT8 factor);
void scale_and_clamp(const rgbaint_t& scale);
@ -398,7 +409,29 @@ public:
m_value = _mm_insert_epi16(m_value, _mm_extract_epi16(alpha.m_value, 6), 6);
}
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v);
// SSE2 bilinear blend of four packed 32-bit pixels; returns the packed result.
// rgb00/rgb01 are combined using the scale_table entry selected by u,
// rgb10/rgb11 likewise, and the two intermediate results are then combined
// using the entry selected by v.
// NOTE(review): rgbsse_statics.scale_table is defined elsewhere; presumably
// each entry holds interleaved 16-bit weight pairs matching the operand order
// fed to _mm_madd_epi16 below -- confirm against the table's initializer.
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
// Place each pixel in the low 32 bits of its own register (upper bits zero).
__m128i color00 = _mm_cvtsi32_si128(rgb00);
__m128i color01 = _mm_cvtsi32_si128(rgb01);
__m128i color10 = _mm_cvtsi32_si128(rgb10);
__m128i color11 = _mm_cvtsi32_si128(rgb11);
/* interleave color01 and color00 at the byte level */
// (and likewise color11 with color10): each channel byte of rgbX1 ends up
// directly followed by the same channel byte of rgbX0.
color01 = _mm_unpacklo_epi8(color01, color00);
color11 = _mm_unpacklo_epi8(color11, color10);
// Zero-extend every byte to 16 bits so each channel is a (rgbX1, rgbX0) pair
// of 16-bit values occupying one 32-bit lane.
color01 = _mm_unpacklo_epi8(color01, _mm_setzero_si128());
color11 = _mm_unpacklo_epi8(color11, _mm_setzero_si128());
// Multiply each pair by the table entry for u and add the two products,
// giving one 32-bit sum per channel for each row.
color01 = _mm_madd_epi16(color01, *(__m128i *)&rgbsse_statics.scale_table[u][0]);
color11 = _mm_madd_epi16(color11, *(__m128i *)&rgbsse_statics.scale_table[u][0]);
// Re-pair the two rows for the second pass: the first row's sum is shifted so
// its bits 1 and up land in the upper 16-bit half of the lane, while the
// second row's sum is halved and stays in the lower half. The 16-bit max then
// merges the halves into one register.
// NOTE(review): this merge presumably relies on each sum fitting in 16 bits
// so the unused half of each operand never wins the max -- confirm the table's
// value range.
color01 = _mm_slli_epi32(color01, 15);
color11 = _mm_srli_epi32(color11, 1);
color01 = _mm_max_epi16(color01, color11);
// Second pass: weight the (second row, first row) pairs with the entry for v.
color01 = _mm_madd_epi16(color01, *(__m128i *)&rgbsse_statics.scale_table[v][0]);
// Scale the 32-bit sums back down by 15 bits.
color01 = _mm_srli_epi32(color01, 15);
// Narrow 32->16 (signed saturation) then 16->8 (unsigned saturation), padding
// with zero; the four channel bytes end up in the low 32 bits.
color01 = _mm_packs_epi32(color01, _mm_setzero_si128());
color01 = _mm_packus_epi16(color01, _mm_setzero_si128());
return _mm_cvtsi128_si32(color01);
}
protected:
__m128i m_value;

View File

@ -209,33 +209,4 @@ void rgbaint_t::scale_add_and_clamp(const rgbaint_t& scale, const rgbaint_t& oth
min(255);
}
// AltiVec bilinear blend of four packed 32-bit pixels; returns the packed
// result. rgb00/rgb01 are combined using scale_table[u], rgb10/rgb11 likewise,
// and the two intermediate results are then combined using scale_table[v].
// NOTE(review): scale_table is defined elsewhere; presumably each entry holds
// 16-bit weight pairs matching the operand order fed to vec_msum -- confirm.
UINT32 rgbaint_t::bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
const VECS32 zero = vec_splat_s32(0);
// Load each scalar argument into a vector: vec_lde fetches the element and the
// vec_lvsl-derived permute realigns it (presumably to the first element --
// confirm), with zero supplied as the second permute source.
VECS32 color00 = vec_perm((VECS32)vec_lde(0, &rgb00), zero, vec_lvsl(0, &rgb00));
VECS32 color01 = vec_perm((VECS32)vec_lde(0, &rgb01), zero, vec_lvsl(0, &rgb01));
VECS32 color10 = vec_perm((VECS32)vec_lde(0, &rgb10), zero, vec_lvsl(0, &rgb10));
VECS32 color11 = vec_perm((VECS32)vec_lde(0, &rgb11), zero, vec_lvsl(0, &rgb11));
/* interleave color01 and color00 at the byte level */
color01 = vec_mergeh((VECU8)color01, (VECU8)color00);
color11 = vec_mergeh((VECU8)color11, (VECU8)color10);
// Interleave a zero byte ahead of every data byte, widening each channel to a
// 16-bit element.
color01 = vec_mergeh((VECU8)zero, (VECU8)color01);
color11 = vec_mergeh((VECU8)zero, (VECU8)color11);
// Multiply the 16-bit elements by the u weights and sum the products into
// 32-bit elements (the zero vector is the accumulator input).
color01 = vec_msum((VECS16)color01, scale_table[u], zero);
color11 = vec_msum((VECS16)color11, scale_table[u], zero);
// Shift the first row's sums left by 15 and the second row's right by 1, then
// take a 16-bit element-wise max to merge both rows into one vector for the
// second pass.
// NOTE(review): presumably relies on the sums fitting in 16 bits -- confirm.
color01 = vec_sl(color01, vec_splat_u32(15));
color11 = vec_sr(color11, vec_splat_u32(1));
color01 = vec_max((VECS16)color01, (VECS16)color11);
// Second pass with the v weights, then scale the sums back down by 15 bits.
color01 = vec_msum((VECS16)color01, scale_table[v], zero);
color01 = vec_sr(color01, vec_splat_u32(15));
// Narrow 32->16 with saturation, then 16->8 with unsigned saturation.
color01 = vec_packs(color01, color01);
color01 = vec_packsu((VECS16)color01, (VECS16)color01);
// Store a single 32-bit element of the vector into the scalar result.
UINT32 result;
vec_ste((VECU32)color01, 0, &result);
return result;
}
#endif // defined(__ALTIVEC__)

View File

@ -457,7 +457,34 @@ public:
m_value = vec_perm(m_value, alpha.m_value, alpha_perm);
}
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v);
// AltiVec bilinear blend of four packed 32-bit pixels; returns the packed
// result. rgb00/rgb01 are combined using scale_table[u], rgb10/rgb11 likewise,
// and the two intermediate results are then combined using scale_table[v].
// NOTE(review): scale_table is defined elsewhere; presumably each entry holds
// 16-bit weight pairs matching the operand order fed to vec_msum -- confirm.
static UINT32 bilinear_filter(UINT32 rgb00, UINT32 rgb01, UINT32 rgb10, UINT32 rgb11, UINT8 u, UINT8 v)
{
const VECS32 zero = vec_splat_s32(0);
// Load each scalar argument into a vector: vec_lde fetches the element and the
// vec_lvsl-derived permute realigns it (presumably to the first element --
// confirm), with zero supplied as the second permute source.
VECS32 color00 = vec_perm((VECS32)vec_lde(0, &rgb00), zero, vec_lvsl(0, &rgb00));
VECS32 color01 = vec_perm((VECS32)vec_lde(0, &rgb01), zero, vec_lvsl(0, &rgb01));
VECS32 color10 = vec_perm((VECS32)vec_lde(0, &rgb10), zero, vec_lvsl(0, &rgb10));
VECS32 color11 = vec_perm((VECS32)vec_lde(0, &rgb11), zero, vec_lvsl(0, &rgb11));
/* interleave color01 and color00 at the byte level */
color01 = vec_mergeh((VECU8)color01, (VECU8)color00);
color11 = vec_mergeh((VECU8)color11, (VECU8)color10);
// Interleave a zero byte ahead of every data byte, widening each channel to a
// 16-bit element.
color01 = vec_mergeh((VECU8)zero, (VECU8)color01);
color11 = vec_mergeh((VECU8)zero, (VECU8)color11);
// Multiply the 16-bit elements by the u weights and sum the products into
// 32-bit elements (the zero vector is the accumulator input).
color01 = vec_msum((VECS16)color01, scale_table[u], zero);
color11 = vec_msum((VECS16)color11, scale_table[u], zero);
// Shift the first row's sums left by 15 and the second row's right by 1, then
// take a 16-bit element-wise max to merge both rows into one vector for the
// second pass.
// NOTE(review): presumably relies on the sums fitting in 16 bits -- confirm.
color01 = vec_sl(color01, vec_splat_u32(15));
color11 = vec_sr(color11, vec_splat_u32(1));
color01 = vec_max((VECS16)color01, (VECS16)color11);
// Second pass with the v weights, then scale the sums back down by 15 bits.
color01 = vec_msum((VECS16)color01, scale_table[v], zero);
color01 = vec_sr(color01, vec_splat_u32(15));
// Narrow 32->16 with saturation, then 16->8 with unsigned saturation.
color01 = vec_packs(color01, color01);
color01 = vec_packsu((VECS16)color01, (VECS16)color01);
// Store a single 32-bit element of the vector into the scalar result.
UINT32 result;
vec_ste((VECU32)color01, 0, &result);
return result;
}
protected:
typedef vector unsigned char VECU8;

View File

@ -24,7 +24,7 @@ enum
};
// Use old macro style or newer SSE2 optimized functions
#define USE_OLD_RASTER 1
#define USE_OLD_RASTER 0
/* maximum number of TMUs */
#define MAX_TMU 2
@ -2219,7 +2219,7 @@ INLINE UINT32 clampARGB(INT32 iterr, INT32 iterg, INT32 iterb, INT32 itera, UINT
temp.set(colorint);
temp.cmpeq_imm(0x100);
// result.rgb.r = 0xff;
colorint.or_reg(temp);
return colorint.to_rgba();
}
else