mirror of
https://github.com/holub/mame
synced 2025-04-19 23:12:11 +03:00
voodoo: Use SSE routines for texture perspective correction. (nw)
This commit is contained in:
parent
e1f5df74b0
commit
55a81eddff
@ -2641,9 +2641,9 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
int32_t stopx = extent->stopx; \
|
||||
rgbaint_t iterargb, iterargbDelta; \
|
||||
int32_t iterz; \
|
||||
int64_t iterw, iterw0 = 0, iterw1 = 0; \
|
||||
int64_t iters0 = 0, iters1 = 0; \
|
||||
int64_t itert0 = 0, itert1 = 0; \
|
||||
int64_t iterw; \
|
||||
tmu_state::stw_t iterstw0, iterstw1; \
|
||||
tmu_state::stw_t deltastw0, deltastw1; \
|
||||
uint16_t *depth; \
|
||||
uint16_t *dest; \
|
||||
int32_t dx, dy; \
|
||||
@ -2713,15 +2713,19 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
iterw = extra->startw + dy * extra->dwdy + dx * extra->dwdx; \
|
||||
if (TMUS >= 1) \
|
||||
{ \
|
||||
iterw0 = extra->startw0 + dy * extra->dw0dy + dx * extra->dw0dx; \
|
||||
iters0 = extra->starts0 + dy * extra->ds0dy + dx * extra->ds0dx; \
|
||||
itert0 = extra->startt0 + dy * extra->dt0dy + dx * extra->dt0dx; \
|
||||
iterstw0.set( \
|
||||
extra->starts0 + dy * extra->ds0dy + dx * extra->ds0dx, \
|
||||
extra->startt0 + dy * extra->dt0dy + dx * extra->dt0dx, \
|
||||
extra->startw0 + dy * extra->dw0dy + dx * extra->dw0dx); \
|
||||
deltastw0.set(extra->ds0dx, extra->dt0dx, extra->dw0dx); \
|
||||
} \
|
||||
if (TMUS >= 2) \
|
||||
{ \
|
||||
iterw1 = extra->startw1 + dy * extra->dw1dy + dx * extra->dw1dx; \
|
||||
iters1 = extra->starts1 + dy * extra->ds1dy + dx * extra->ds1dx; \
|
||||
itert1 = extra->startt1 + dy * extra->dt1dy + dx * extra->dt1dx; \
|
||||
iterstw1.set( \
|
||||
extra->starts1 + dy * extra->ds1dy + dx * extra->ds1dx, \
|
||||
extra->startt1 + dy * extra->dt1dy + dx * extra->dt1dx, \
|
||||
extra->startw1 + dy * extra->dw1dy + dx * extra->dw1dx); \
|
||||
deltastw1.set(extra->ds1dx, extra->dt1dx, extra->dw1dx); \
|
||||
} \
|
||||
extra->info->hits++; \
|
||||
/* loop in X */ \
|
||||
@ -2743,7 +2747,7 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
int32_t tmp; \
|
||||
const rgbaint_t texelZero(0); \
|
||||
texel = vd->tmu[1].genTexture(x, dither4, TEXMODE1, vd->tmu[1].lookup, extra->lodbase1, \
|
||||
iters1, itert1, iterw1, tmp); \
|
||||
iterstw1, tmp); \
|
||||
texel = vd->tmu[1].combineTexture(TEXMODE1, texel, texelZero, tmp); \
|
||||
} \
|
||||
/* run the texture pipeline on TMU0 to produce a final */ \
|
||||
@ -2756,7 +2760,7 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
int32_t lod0; \
|
||||
rgbaint_t texelT0; \
|
||||
texelT0 = vd->tmu[0].genTexture(x, dither4, TEXMODE0, vd->tmu[0].lookup, extra->lodbase0, \
|
||||
iters0, itert0, iterw0, lod0); \
|
||||
iterstw0, lod0); \
|
||||
texel = vd->tmu[0].combineTexture(TEXMODE0, texelT0, texel, lod0); \
|
||||
} \
|
||||
else \
|
||||
@ -2792,15 +2796,11 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
iterw += extra->dwdx; \
|
||||
if (TMUS >= 1) \
|
||||
{ \
|
||||
iterw0 += extra->dw0dx; \
|
||||
iters0 += extra->ds0dx; \
|
||||
itert0 += extra->dt0dx; \
|
||||
iterstw0.add(deltastw0); \
|
||||
} \
|
||||
if (TMUS >= 2) \
|
||||
{ \
|
||||
iterw1 += extra->dw1dx; \
|
||||
iters1 += extra->ds1dx; \
|
||||
itert1 += extra->dt1dx; \
|
||||
iterstw1.add(deltastw1); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
@ -2812,38 +2812,23 @@ void voodoo_device::raster_##name(void *destbase, int32_t y, const poly_extent *
|
||||
// The maximum error using a 4 bit lookup from the mantissa is 0.0875, which is less than 1/2 lsb (0.125) for 2 bits of fraction.
|
||||
// An offset of +(56 << 8) is added for alignment in multi_reciplog
|
||||
// ******************************************************************************************************************************
|
||||
static inline int32_t ATTR_FORCE_INLINE new_log2(double &value)
|
||||
inline int32_t ATTR_FORCE_INLINE voodoo_device::tmu_state::new_log2(double &value, const int &offset)
|
||||
{
|
||||
static const int32_t new_log2_table[16] = {0 + (56 << 8), 22 + (56 << 8), 44 + (56 << 8), 63 + (56 << 8), 82 + (56 << 8),
|
||||
100 + (56 << 8), 118 + (56 << 8), 134 + (56 << 8), 150 + (56 << 8), 165 + (56 << 8), 179 + (56 << 8), 193 + (56 << 8),
|
||||
207 + (56 << 8), 220 + (56 << 8), 232 + (56 << 8), 244 + (56 << 8)};
|
||||
static const int32_t new_log2_table[16] = {0, 22, 44, 63, 82, 100, 118, 134, 150, 165, 179, 193, 207, 220, 232, 244};
|
||||
uint64_t ival = *((uint64_t *)&value);
|
||||
// Return 0 if negative
|
||||
if (ival & ((uint64_t)1 << 63))
|
||||
return 0;
|
||||
// We zero the result if negative so don't worry about the sign bit
|
||||
int32_t exp = (ival>>52);
|
||||
exp -= 1023+32;
|
||||
exp -= 1023+32-offset;
|
||||
exp <<= 8;
|
||||
uint32_t addr = (uint64_t)(ival>>48) & 0xf;
|
||||
exp += new_log2_table[addr];
|
||||
return exp;
|
||||
}
|
||||
|
||||
// Computes A/C and B/C and returns log2 of 1/C
|
||||
// A, B and C are 16.32 values. The results are 24.8.
|
||||
static inline void ATTR_FORCE_INLINE multi_reciplog(int64_t valueA, int64_t valueB, int64_t valueC, int32_t &log, int32_t &resA, int32_t &resB)
|
||||
{
|
||||
double recip = double(1ULL<<(47-39))/valueC;
|
||||
double resAD = valueA * recip;
|
||||
double resBD = valueB * recip;
|
||||
log = new_log2(recip);
|
||||
resA = resAD;
|
||||
resB = resBD;
|
||||
}
|
||||
|
||||
|
||||
inline rgbaint_t ATTR_FORCE_INLINE voodoo_device::tmu_state::genTexture(int32_t x, const uint8_t *dither4, const uint32_t TEXMODE, rgb_t *LOOKUP, int32_t LODBASE, int64_t ITERS, int64_t ITERT, int64_t ITERW, int32_t &lod)
|
||||
inline rgbaint_t ATTR_FORCE_INLINE voodoo_device::tmu_state::genTexture(int32_t x, const uint8_t *dither4, const uint32_t TEXMODE, rgb_t *LOOKUP, int32_t LODBASE, const stw_t &iterstw, int32_t &lod)
|
||||
{
|
||||
rgbaint_t result;
|
||||
int32_t s, t, ilod;
|
||||
@ -2853,23 +2838,16 @@ inline rgbaint_t ATTR_FORCE_INLINE voodoo_device::tmu_state::genTexture(int32_t
|
||||
if (TEXMODE_ENABLE_PERSPECTIVE(TEXMODE))
|
||||
{
|
||||
int32_t wLog;
|
||||
if (USE_FAST_RECIP) {
|
||||
const int32_t oow = fast_reciplog((ITERW), &wLog);
|
||||
s = ((int64_t)oow * (ITERS)) >> (29+10);
|
||||
t = ((int64_t)oow * (ITERT)) >> (29+10);
|
||||
} else {
|
||||
multi_reciplog(ITERS, ITERT, ITERW, wLog, s, t);
|
||||
}
|
||||
iterstw.calc_stow(s, t, wLog);
|
||||
lod += wLog;
|
||||
}
|
||||
else
|
||||
{
|
||||
s = (ITERS) >> (14+10);
|
||||
t = (ITERT) >> (14+10);
|
||||
iterstw.get_st_shiftr(s, t, (14 + 10));
|
||||
}
|
||||
|
||||
/* clamp W */
|
||||
if (TEXMODE_CLAMP_NEG_W(TEXMODE) && (ITERW) < 0)
|
||||
if (TEXMODE_CLAMP_NEG_W(TEXMODE) && iterstw.is_w_neg())
|
||||
{
|
||||
s = t = 0;
|
||||
}
|
||||
|
@ -1431,8 +1431,8 @@ inline int32_t voodoo_device::tmu_state::prepare()
|
||||
return (-lodbase + (12 << 8)) / 2;
|
||||
#else
|
||||
double tmpTex = texdx;
|
||||
lodbase = new_log2(tmpTex);
|
||||
return (lodbase + (12 << 8) - (56 << 8)) / 2;
|
||||
lodbase = new_log2(tmpTex, 0);
|
||||
return (lodbase + (12 << 8)) / 2;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1593,10 +1593,12 @@ protected:
|
||||
|
||||
struct tmu_state
|
||||
{
|
||||
class stw_t;
|
||||
void recompute_texture_params();
|
||||
void init(uint8_t vdt, tmu_shared_state &share, voodoo_reg *r, void *memory, int tmem);
|
||||
int32_t prepare();
|
||||
rgbaint_t genTexture(int32_t x, const uint8_t *dither4, const uint32_t TEXMODE, rgb_t *LOOKUP, int32_t LODBASE, int64_t ITERS, int64_t ITERT, int64_t ITERW, int32_t &lod);
|
||||
static int32_t new_log2(double &value, const int &offset);
|
||||
rgbaint_t genTexture(int32_t x, const uint8_t *dither4, const uint32_t TEXMODE, rgb_t *LOOKUP, int32_t LODBASE, const stw_t &iterstw, int32_t &lod);
|
||||
rgbaint_t combineTexture(const uint32_t TEXMODE, const rgbaint_t& c_local, const rgbaint_t& c_other, int32_t lod);
|
||||
|
||||
struct ncc_table
|
||||
@ -1958,4 +1960,86 @@ DECLARE_DEVICE_TYPE(VOODOO_2, voodoo_2_device)
|
||||
DECLARE_DEVICE_TYPE(VOODOO_BANSHEE, voodoo_banshee_device)
|
||||
DECLARE_DEVICE_TYPE(VOODOO_3, voodoo_3_device)
|
||||
|
||||
// use SSE on 64-bit implementations, where it can be assumed
|
||||
#if 1 && ((!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64))
|
||||
#include <emmintrin.h>
|
||||
#ifdef __SSE4_1__
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
class voodoo_device::tmu_state::stw_t
|
||||
{
|
||||
public:
|
||||
stw_t() {}
|
||||
stw_t(const stw_t& other) = default;
|
||||
stw_t &operator=(const stw_t& other) = default;
|
||||
|
||||
inline void set(s64 s, s64 t, s64 w) { m_st = _mm_set_pd(s, t); m_w = _mm_set1_pd(w); }
|
||||
inline int is_w_neg() const { return _mm_comilt_sd(m_w, _mm_set1_pd(0.0)); }
|
||||
inline void get_st_shiftr(s32 &s, s32 &t, const s32 &shift) const {
|
||||
s64 tmpS = _mm_cvtsd_si64(_mm_shuffle_pd(m_st, _mm_setzero_pd(), 1));
|
||||
s = tmpS >> shift;
|
||||
s64 tmpT = _mm_cvtsd_si64(m_st);
|
||||
t = tmpT >> shift;
|
||||
}
|
||||
inline void add(const stw_t& other)
|
||||
{
|
||||
m_st = _mm_add_pd(m_st, other.m_st);
|
||||
m_w = _mm_add_pd(m_w, other.m_w);
|
||||
}
|
||||
inline void calc_stow(s32 &sow, s32 &tow, int32_t &oowlog) const
|
||||
{
|
||||
__m128d tmp = _mm_div_pd(m_st, m_w);
|
||||
// Allow for 8 bits of decimal in integer
|
||||
tmp = _mm_mul_pd(tmp, _mm_set1_pd(256.0));
|
||||
__m128i tmp2 = _mm_cvttpd_epi32(tmp);
|
||||
#ifdef __SSE4_1__
|
||||
sow = _mm_extract_epi32(tmp2, 0);
|
||||
tow = _mm_extract_epi32(tmp2, 1);
|
||||
#else
|
||||
sow = _mm_cvtsi128_si32(_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 0, 1)));
|
||||
tow = _mm_cvtsi128_si32(tmp2);
|
||||
#endif
|
||||
double dW = _mm_cvtsd_f64(m_w);
|
||||
oowlog = -new_log2(dW, 0);
|
||||
}
|
||||
private:
|
||||
__m128d m_st;
|
||||
__m128d m_w;
|
||||
};
|
||||
#else
|
||||
class voodoo_device::tmu_state::stw_t
|
||||
{
|
||||
public:
|
||||
stw_t() {}
|
||||
stw_t(const stw_t& other) = default;
|
||||
stw_t &operator=(const stw_t& other) = default;
|
||||
|
||||
inline void set(s64 s, s64 t, s64 w) { m_s = s; m_t = t; m_w = w; }
|
||||
inline int is_w_neg() const { return (m_w < 0) ? 1 : 0; }
|
||||
inline void get_st_shiftr(s32 &s, s32 &t, const s32 &shift) const {
|
||||
s = m_s >> shift;
|
||||
t = m_t >> shift;
|
||||
}
|
||||
inline void add(const stw_t& other)
|
||||
{
|
||||
m_s += other.m_s;
|
||||
m_t += other.m_t;
|
||||
m_w += other.m_w;
|
||||
}
|
||||
// Computes s/w and t/w and returns log2 of 1/w
|
||||
// s, t and c are 16.32 values. The results are 24.8.
|
||||
inline void calc_stow(s32 &sow, s32 &tow, int32_t &oowlog) const
|
||||
{
|
||||
double recip = double(1ULL << (47 - 39)) / m_w;
|
||||
double resAD = m_s * recip;
|
||||
double resBD = m_t * recip;
|
||||
oowlog = new_log2(recip, 56);
|
||||
sow = resAD;
|
||||
tow = resBD;
|
||||
}
|
||||
private:
|
||||
s64 m_s, m_t, m_w;
|
||||
};
|
||||
#endif
|
||||
|
||||
#endif // MAME_VIDEO_VOODOO_H
|
||||
|
Loading…
Reference in New Issue
Block a user