voodoo: Fix LOD calculations. Improve LOD calculation accuracy. Remove stw_helper as any benefit it provides is within the noise.

This commit is contained in:
Aaron Giles 2021-07-10 22:08:36 -07:00
parent 2b39e4c5c8
commit eae7139f40
2 changed files with 115 additions and 203 deletions

View File

@ -16,7 +16,6 @@ namespace voodoo
static constexpr bool LOG_RASTERIZERS = false;
struct static_rasterizer_info
{
voodoo_renderer::rasterizer_mfp mfp;
@ -60,7 +59,7 @@ static const rectangle global_cliprect(-4096, 4095, -4096, 4095);
// point version of the iterated W value
//-------------------------------------------------
static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
{
int exp = count_leading_zeros_64(iterw) - 16;
if (exp < 0)
@ -78,7 +77,7 @@ static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
// manual
//-------------------------------------------------
static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp)
inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp)
{
rgbaint_t result(iterargb);
result.shr_imm(20);
@ -113,7 +112,7 @@ static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb
// described in the Voodoo manual
//-------------------------------------------------
static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp)
inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp)
{
// clamped case is easy
if (fbzcp.rgbzw_clamp() != 0)
@ -135,7 +134,7 @@ static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const
// described in the Voodoo manual
//-------------------------------------------------
static inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp)
inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp)
{
// clamped case is easy
if (fbzcp.rgbzw_clamp() != 0)
@ -151,29 +150,72 @@ static inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const
}
//-------------------------------------------------
//  fast_log2 - computes an approximate log2 of a
//  non-negative double-precision value and
//  returns it as a signed 24.8 fixed-point value;
//  if the double was converted from a fixed-point
//  integer, the number of fractional bits in that
//  integer should be specified by fracbits so
//  that the result is the log2 of the real value.
//  Negative inputs return 0. The fraction comes
//  from a 128-entry table indexed by the top 7
//  bits of the mantissa.
//-------------------------------------------------
inline s32 ATTR_FORCE_INLINE fast_log2(double value, int fracbits = 0)
{
    // negative values return 0
    if (UNEXPECTED(value < 0))
        return 0;

    // reinterpret the double as its raw 64-bit pattern
    union { double d; u64 i; } temp;
    temp.d = value;

    // we only care about the 11-bit exponent and top 7 bits of mantissa,
    // so shift away the low 45 mantissa bits
    u32 ival = temp.i >> 45;

    // extract exponent, unbias, and adjust for fixed-point fraction; the
    // exponent is masked to 11 bits because -0.0 is not caught by the
    // "value < 0" test above yet still carries a set sign bit, which would
    // otherwise be folded into the exponent
    s32 exp = s32((ival >> 7) & 0x7ff) - 1023 - fracbits;

    // use top 7 bits of mantissa to look up fractional log; each entry is
    // the fractional part of log2(1 + n/128) scaled to 8 bits
    static u8 const s_log2_table[128] =
    {
        0, 2, 5, 8, 11, 14, 16, 19, 22, 25, 27, 30, 33, 35, 38, 40,
        43, 46, 48, 51, 53, 56, 58, 61, 63, 65, 68, 70, 73, 75, 77, 80,
        82, 84, 87, 89, 91, 93, 96, 98, 100, 102, 104, 106, 109, 111, 113, 115,
        117, 119, 121, 123, 125, 127, 129, 132, 134, 136, 138, 140, 141, 143, 145, 147,
        149, 151, 153, 155, 157, 159, 161, 162, 164, 166, 168, 170, 172, 173, 175, 177,
        179, 181, 182, 184, 186, 188, 189, 191, 193, 194, 196, 198, 200, 201, 203, 205,
        206, 208, 209, 211, 213, 214, 216, 218, 219, 221, 222, 224, 225, 227, 229, 230,
        232, 233, 235, 236, 238, 239, 241, 242, 244, 245, 247, 248, 250, 251, 253, 254
    };

    // combine the integral and fractional parts; exp is frequently negative
    // (e.g. when fracbits is 64), and left-shifting a negative value is not
    // well-defined before C++20, so scale by 256 instead; the low 8 bits of
    // the scaled exponent are zero, so adding the table entry produces the
    // same bits as ORing it in
    return exp * 256 + s_log2_table[ival & 127];
}
//-------------------------------------------------
// compute_lodbase - compute the base LOD value
//-------------------------------------------------
static inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 dsdx, s64 dsdy, s64 dtdx, s64 dtdy)
inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 dsdx, s64 dsdy, s64 dtdx, s64 dtdy)
{
// compute (ds^2 + dt^2) in both X and Y as 28.36 numbers
dsdx >>= 14;
dsdy >>= 14;
dtdx >>= 14;
dtdy >>= 14;
s64 texdx = dsdx * dsdx + dtdx * dtdx;
s64 texdy = dsdy * dsdy + dtdy * dtdy;
// compute (ds^2 + dt^2) in both X and Y; note that these values are
// each .32, so the square is a .64 fixed point value
double fdsdx = double(dsdx);
double fdsdy = double(dsdy);
double fdtdx = double(dtdx);
double fdtdy = double(dtdy);
double texdx = fdsdx * fdsdx + fdtdx * fdtdx;
double texdy = fdsdy * fdsdy + fdtdy * fdtdy;
// pick whichever is larger and shift off some high bits -> 28.20
s64 maxval = std::max(texdx, texdy) >> 16;
// pick whichever is larger
double maxval = std::max(texdx, texdy);
// use our fast reciprocal/log on this value; it expects input as a
// 16.32 number, and returns the log of the reciprocal, so we have to
// adjust the result: negative to get the log of the original value
// plus 12 to account for the extra exponent, and divided by 2 to
// get the log of the square root of texdx
return (fast_log2(double(maxval), 0) + (12 << 8)) / 2;
// use our fast log2 on this value; pass 64 to indicate how many bits of
// fractional resolution are in the source, and divide by 2 because we
// really want the log of the square root
return fast_log2(maxval, 64) / 2;
}
@ -351,121 +393,6 @@ void dither_helper::init_static()
}
//**************************************************************************
// STW HELPER
//**************************************************************************
// use SSE on 64-bit implementations, where it can be assumed
#if 1 && ((!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64))
#include <emmintrin.h>
#ifdef __SSE4_1__
#include <smmintrin.h>
#endif
// SSE2 implementation of stw_helper: holds the iterated S and T texture
// coordinates packed in one __m128d and the iterated W in a second one, so
// that S and T can be stepped and divided by W with single packed
// instructions.
class stw_helper
{
public:
// members are intentionally left uninitialized; call set() before use
stw_helper() { }
stw_helper(stw_helper const &other) = default;
stw_helper &operator=(stw_helper const &other) = default;
// load S/T/W from 64-bit integer values; S and T are pre-scaled by 256
// (<< 8) before conversion to double, with S placed in the upper lane and
// T in the lower lane; W is replicated into both lanes
void set(s64 s, s64 t, s64 w)
{
m_st = _mm_set_pd(s << 8, t << 8);
m_w = _mm_set1_pd(w);
}
// returns true if the iterated W (low lane) is less than zero
bool is_w_neg() const
{
return _mm_comilt_sd(m_w, _mm_set1_pd(0.0));
}
// return S and T as integers shifted right by 'shift' bits
void get_st_shiftr(s32 &s, s32 &t, s32 shift) const
{
// account for the extra 8-bit pre-scale applied in set()
shift += 8;
// shuffle moves the upper lane (S) down to the low lane for conversion
s = _mm_cvtsd_si64(_mm_shuffle_pd(m_st, _mm_setzero_pd(), 1)) >> shift;
t = _mm_cvtsd_si64(m_st) >> shift;
}
// step S, T and W by the values held in delta
void add(stw_helper const &delta)
{
m_st = _mm_add_pd(m_st, delta.m_st);
m_w = _mm_add_pd(m_w, delta.m_w);
}
// compute S/W and T/W as truncated 32-bit integers, plus the negated
// fast_log2 of W (i.e. an approximate log2 of 1/W)
void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const
{
// packed divide of both S and T by W, then truncate both to 32-bit ints;
// element 1 of the result holds S/W and element 0 holds T/W
__m128d tmp = _mm_div_pd(m_st, m_w);
__m128i tmp2 = _mm_cvttpd_epi32(tmp);
#ifdef __SSE4_1__
sow = _mm_extract_epi32(tmp2, 1);
tow = _mm_extract_epi32(tmp2, 0);
#else
// without SSE4.1, shuffle element 1 down to element 0 to extract it
sow = _mm_cvtsi128_si32(_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 0, 1)));
tow = _mm_cvtsi128_si32(tmp2);
#endif
oowlog = -fast_log2(_mm_cvtsd_f64(m_w), 0);
}
private:
__m128d m_st; // upper lane: S * 256, lower lane: T * 256
__m128d m_w; // W in both lanes
};
#else
class stw_helper
{
public:
stw_helper() {}
stw_helper(stw_helper const &other) = default;
stw_helper &operator=(stw_helper const &other) = default;
void set(s64 s, s64 t, s64 w)
{
m_s = s;
m_t = t;
m_w = w;
}
bool is_w_neg() const
{
return (m_w < 0) ? true : false;
}
void get_st_shiftr(s32 &s, s32 &t, s32 shift) const
{
s = m_s >> shift;
t = m_t >> shift;
}
void add(stw_helper const &other)
{
m_s += other.m_s;
m_t += other.m_t;
m_w += other.m_w;
}
// Computes s/w and t/w and returns log2 of 1/w
// s, t and c are 16.32 values. The results are 24.8.
void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const
{
double recip = double(1ULL << (47 - 39)) / m_w;
sow = s32(m_s * recip);
tow = s32(m_t * recip);
oowlog = fast_log2(recip, 56);
}
private:
s64 m_s, m_t, m_w;
};
#endif
//**************************************************************************
// COLOR SOURCE
//**************************************************************************
@ -978,26 +905,36 @@ inline rgb_t rasterizer_texture::lookup_single_texel(u32 format, u32 texbase, s3
// the S,T coordinates and LOD
//-------------------------------------------------
inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, const stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask)
inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask)
{
lod = lodbase;
// determine the S/T/LOD values for this texture
// determine the S/T/LOD values for this texture; iterated S/T are
// in 32.32 format and we want final S/T in 24.8 format
s32 s, t;
if (texmode.enable_perspective())
{
s32 wlog;
iterstw.calc_stow(s, t, wlog);
lod += wlog;
// iterw is also 32.32, so division would leave no fractional bits;
// the 256 factor is there to produce 8 bits of fraction
double recip = 256.0 / iterw;
s = s32(iters * recip);
t = s32(itert * recip);
// compute the log2 of the non-reciprocal W value; negating it gives
// the log2 of the reciprocal, so we subtract instead of add it
lod -= fast_log2(iterw, 32);
}
else
iterstw.get_st_shiftr(s, t, 14 + 10);
{
// scale the .32 values down to .8 values as doubles to avoid 64-bit
// integers
s = s32(iters * (1.0 / (1 << 24)));
t = s32(itert * (1.0 / (1 << 24)));
}
// clamp W
if (texmode.clamp_neg_w() && iterstw.is_w_neg())
// clamp S/T if the iterated W is negative
if (texmode.clamp_neg_w() && iterw < 0)
s = t = 0;
// clamp the LOD
// clamp the LOD after applying bias and dither
lod += m_lodbias;
if (texmode.enable_lod_dither())
lod += dither.raw_4x4(x) << 4;
@ -1018,8 +955,8 @@ inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_m
rgbaint_t result;
if ((lod == m_lodmin && !texmode.magnification_filter()) || (lod != m_lodmin && !texmode.minification_filter()))
{
// adjust S/T for the LOD and strip off the fractions
ilod += 18 - 10;
// incorporate the fraction shift into ilod
ilod += 8;
s >>= ilod;
t >>= ilod;
@ -2287,8 +2224,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
reg_alpha_mode const alphamode(AlphaMode, poly.raster.alphamode());
reg_fbz_mode const fbzmode(FbzMode, poly.raster.fbzmode());
reg_fog_mode const fogmode(FogMode, poly.raster.fogmode());
stw_helper iterstw0, iterstw1;
stw_helper deltastw0, deltastw1;
double iters0, itert0, iterw0, iters1, itert1, iterw1;
double deltas0, deltat0, deltaw0, deltas1, deltat1, deltaw1;
u32 stipple = poly.stipple;
// determine the screen Y
@ -2359,21 +2296,23 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
s32 lodbase0 = 0;
if (GenericFlags & rasterizer_params::GENERIC_TEX0)
{
iterstw0.set(
poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx,
poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx,
poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx);
deltastw0.set(poly.ds0dx, poly.dt0dx, poly.dw0dx);
deltas0 = double(poly.ds0dx);
deltat0 = double(poly.dt0dx);
deltaw0 = double(poly.dw0dx);
iters0 = double(poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx);
itert0 = double(poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx);
iterw0 = double(poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx);
lodbase0 = compute_lodbase(poly.ds0dx, poly.ds0dy, poly.dt0dx, poly.dt0dy);
}
s32 lodbase1 = 0;
if (GenericFlags & rasterizer_params::GENERIC_TEX1)
{
iterstw1.set(
poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx,
poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx,
poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx);
deltastw1.set(poly.ds1dx, poly.dt1dx, poly.dw1dx);
deltas1 = double(poly.ds1dx);
deltat1 = double(poly.dt1dx);
deltaw1 = double(poly.dw1dx);
iters1 = double(poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx);
itert1 = double(poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx);
iterw1 = double(poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx);
lodbase1 = compute_lodbase(poly.ds1dx, poly.ds1dy, poly.dt1dx, poly.dt1dy);
}
poly.info->scanlines++;
@ -2402,8 +2341,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
rgbaint_t texel(0);
if (GenericFlags & rasterizer_params::GENERIC_TEX1)
{
s32 lod1;
rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iterstw1, lodbase1, lod1, m_bilinear_mask);
s32 lod1 = lodbase1;
rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iters1, itert1, iterw1, lod1, m_bilinear_mask);
if (GenericFlags & rasterizer_params::GENERIC_TEX1_IDENTITY)
texel = texel_t1;
else
@ -2417,8 +2356,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
// we should send the configuration byte
if (!texmode0.seq_8_downld())
{
s32 lod0;
rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iterstw0, lodbase0, lod0, m_bilinear_mask);
s32 lod0 = lodbase0;
rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iters0, itert0, iterw0, lod0, m_bilinear_mask);
if (GenericFlags & rasterizer_params::GENERIC_TEX0_IDENTITY)
texel = texel_t0;
else
@ -2455,9 +2394,17 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
iterz += poly.dzdx;
iterw += iterw_delta;
if (GenericFlags & rasterizer_params::GENERIC_TEX0)
iterstw0.add(deltastw0);
{
iters0 += deltas0;
itert0 += deltat0;
iterw0 += deltaw0;
}
if (GenericFlags & rasterizer_params::GENERIC_TEX1)
iterstw1.add(deltastw1);
{
iters1 += deltas1;
itert1 += deltat1;
iterw1 += deltaw1;
}
}
}

View File

@ -24,7 +24,6 @@ namespace voodoo
struct rasterizer_info;
struct poly_data;
class dither_helper;
class stw_helper;
// base class for our renderer
using voodoo_poly_manager = poly_manager<float, poly_data, 0, POLY_FLAG_NO_CLIPPING>;
@ -354,7 +353,7 @@ public:
rgb_t lookup_single_texel(u32 format, u32 texbase, s32 s, s32 t);
// fetch a texel given coordinates and LOD information
rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, const voodoo::stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask);
rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask);
// texture-specific color combination unit
rgbaint_t combine_texture(voodoo::reg_texture_mode const texmode, rgbaint_t const &c_local, rgbaint_t const &c_other, s32 lod);
@ -613,40 +612,6 @@ private:
std::vector<thread_stats_block> m_thread_stats;
};
//**************************************************************************
// MATH HELPERS
//**************************************************************************
//-------------------------------------------------
//  fast_log2 - computes an approximate log2 of a
//  non-negative double-precision value as a
//  signed 24.8 value; 32 is added to the exponent
//  and 'offset' is subtracted from it before the
//  fraction is attached. Negative inputs return 0.
//-------------------------------------------------
inline s32 fast_log2(double value, int offset)
{
    // negative values return 0
    if (UNEXPECTED(value < 0))
        return 0;

    // reinterpret the double as its raw 64-bit pattern
    union { double d; u64 i; } temp;
    temp.d = value;

    // we only care about the 11-bit exponent and top 4 bits of mantissa,
    // so shift away the low 48 mantissa bits
    u32 ival = temp.i >> 48;

    // exponent in the upper bits, plus an 8-bit log value from 4 bits of
    // mantissa; the exponent is masked to 11 bits because -0.0 is not caught
    // by the "value < 0" test above yet still carries a set sign bit
    // NOTE(review): the +32 presumably compensates for callers passing .32
    // fixed-point values -- confirm against callers
    s32 exp = s32((ival >> 4) & 0x7ff) - 1023 + 32 - offset;

    // the maximum error using a 4 bit lookup from the mantissa is 0.0875, which is
    // less than 1/2 lsb (0.125) for 2 bits of fraction
    static u8 const s_log2_table[16] = { 0, 22, 44, 63, 82, 100, 118, 134, 150, 165, 179, 193, 207, 220, 232, 244 };

    // exp may be negative, and left-shifting a negative value is not
    // well-defined before C++20, so scale by 256 instead; the low 8 bits of
    // the scaled exponent are zero, so adding the table entry produces the
    // same bits as ORing it in
    return exp * 256 + s_log2_table[ival & 15];
}
}
#endif // MAME_VIDEO_VOODOO_RENDER_H