voodoo: Fix LOD calculations. Improve LOD calculation accuracy. Remove stw_helper as any benefit it provides is within the noise.

2025-07-13 13:35:41 +03:00 · 2021-07-10 22:08:36 -07:00 · 2021-07-10 22:08:36 -07:00 · eae7139f40
commit eae7139f40
parent 2b39e4c5c8
2 changed files with 115 additions and 203 deletions
--- a/src/devices/video/voodoo_render.cpp
+++ b/src/devices/video/voodoo_render.cpp
@ -16,7 +16,6 @@ namespace voodoo

 static constexpr bool LOG_RASTERIZERS = false;

-
 struct static_rasterizer_info
 {
 	voodoo_renderer::rasterizer_mfp mfp;
@ -60,7 +59,7 @@ static const rectangle global_cliprect(-4096, 4095, -4096, 4095);
 //  point version of the iterated W value
 //-------------------------------------------------

-static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
+inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
 {
 	int exp = count_leading_zeros_64(iterw) - 16;
 	if (exp < 0)
@ -78,7 +77,7 @@ static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw)
 //  manual
 //-------------------------------------------------

-static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp)
+inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp)
 {
 	rgbaint_t result(iterargb);
 	result.shr_imm(20);
@ -113,7 +112,7 @@ static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb
 //  described in the Voodoo manual
 //-------------------------------------------------

-static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp)
+inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp)
 {
 	// clamped case is easy
 	if (fbzcp.rgbzw_clamp() != 0)
@ -135,7 +134,7 @@ static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const
 //  described in the Voodoo manual
 //-------------------------------------------------

-static inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp)
+inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp)
 {
 	// clamped case is easy
 	if (fbzcp.rgbzw_clamp() != 0)
@ -151,29 +150,72 @@ static inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const
 }


+//-------------------------------------------------
+//  fast_log2 - computes the log2 of a double-
+//  precision value as a 24.8 value; if the double
+//  was converted from a fixed-point integer, the
+//  number of fractional bits should be specified
+//  by fracbits
+//-------------------------------------------------
+
+inline s32 ATTR_FORCE_INLINE fast_log2(double value, int fracbits = 0)
+{
+	// negative values return 0
+	if (UNEXPECTED(value < 0))
+		return 0;
+
+	// convert the value to a raw integer
+	union { double d; u64 i; } temp;
+	temp.d = value;
+
+	// we only care about the 11-bit exponent and top 7 bits of mantissa; the
+	// check above clears the sign for ordinary values (-0.0 and NaN are not caught by it)
+	u32 ival = temp.i >> 45;
+
+	// extract exponent, unbias, and adjust for fixed-point fraction
+	s32 exp = (ival >> 7) - 1023 - fracbits;
+
+	// use top 7 bits of mantissa to look up fractional log
+	static u8 const s_log2_table[128] =
+	{
+		  0,   2,   5,   8,  11,  14,  16,  19,  22,  25,  27,  30,  33,  35,  38,  40,
+		 43,  46,  48,  51,  53,  56,  58,  61,  63,  65,  68,  70,  73,  75,  77,  80,
+		 82,  84,  87,  89,  91,  93,  96,  98, 100, 102, 104, 106, 109, 111, 113, 115,
+		117, 119, 121, 123, 125, 127, 129, 132, 134, 136, 138, 140, 141, 143, 145, 147,
+		149, 151, 153, 155, 157, 159, 161, 162, 164, 166, 168, 170, 172, 173, 175, 177,
+		179, 181, 182, 184, 186, 188, 189, 191, 193, 194, 196, 198, 200, 201, 203, 205,
+		206, 208, 209, 211, 213, 214, 216, 218, 219, 221, 222, 224, 225, 227, 229, 230,
+		232, 233, 235, 236, 238, 239, 241, 242, 244, 245, 247, 248, 250, 251, 253, 254
+	};
+
+	// combine the integral and fractional parts
+	return (exp << 8) | s_log2_table[ival & 127];
+}
+
+
+
 //-------------------------------------------------
 //  compute_lodbase - compute the base LOD value
 //-------------------------------------------------

-static inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 dsdx, s64 dsdy, s64 dtdx, s64 dtdy)
+inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 dsdx, s64 dsdy, s64 dtdx, s64 dtdy)
 {
-	// compute (ds^2 + dt^2) in both X and Y as 28.36 numbers
-	dsdx >>= 14;
-	dsdy >>= 14;
-	dtdx >>= 14;
-	dtdy >>= 14;
-	s64 texdx = dsdx * dsdx + dtdx * dtdx;
-	s64 texdy = dsdy * dsdy + dtdy * dtdy;
+	// compute (ds^2 + dt^2) in both X and Y; note that these values are
+	// each .32, so the square is a .64 fixed point value
+	double fdsdx = double(dsdx);
+	double fdsdy = double(dsdy);
+	double fdtdx = double(dtdx);
+	double fdtdy = double(dtdy);
+	double texdx = fdsdx * fdsdx + fdtdx * fdtdx;
+	double texdy = fdsdy * fdsdy + fdtdy * fdtdy;

-	// pick whichever is larger and shift off some high bits -> 28.20
-	s64 maxval = std::max(texdx, texdy) >> 16;
+	// pick whichever is larger
+	double maxval = std::max(texdx, texdy);

-	// use our fast reciprocal/log on this value; it expects input as a
-	// 16.32 number, and returns the log of the reciprocal, so we have to
-	// adjust the result: negative to get the log of the original value
-	// plus 12 to account for the extra exponent, and divided by 2 to
-	// get the log of the square root of texdx
-	return (fast_log2(double(maxval), 0) + (12 << 8)) / 2;
+	// use our fast log2 on this value; pass 64 to indicate how many bits
+	// of fractional resolution are in the source, and divide by 2 because
+	// we really want the log of the square root
+	return fast_log2(maxval, 64) / 2;
 }


@ -351,121 +393,6 @@ void dither_helper::init_static()
 }


-
-//**************************************************************************
-//  STW HELPER
-//**************************************************************************
-
-// use SSE on 64-bit implementations, where it can be assumed
-#if 1 && ((!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64))
-
-#include <emmintrin.h>
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-class stw_helper
-{
-public:
-	stw_helper() { }
-	stw_helper(stw_helper const &other) = default;
-	stw_helper &operator=(stw_helper const &other) = default;
-
-	void set(s64 s, s64 t, s64 w)
-	{
-		m_st = _mm_set_pd(s << 8, t << 8);
-		m_w = _mm_set1_pd(w);
-	}
-
-	bool is_w_neg() const
-	{
-		return _mm_comilt_sd(m_w, _mm_set1_pd(0.0));
-	}
-
-	void get_st_shiftr(s32 &s, s32 &t, s32 shift) const
-	{
-		shift += 8;
-		s = _mm_cvtsd_si64(_mm_shuffle_pd(m_st, _mm_setzero_pd(), 1)) >> shift;
-		t = _mm_cvtsd_si64(m_st) >> shift;
-	}
-
-	void add(stw_helper const &delta)
-	{
-		m_st = _mm_add_pd(m_st, delta.m_st);
-		m_w = _mm_add_pd(m_w, delta.m_w);
-	}
-
-	void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const
-	{
-		__m128d tmp = _mm_div_pd(m_st, m_w);
-		__m128i tmp2 = _mm_cvttpd_epi32(tmp);
-#ifdef __SSE4_1__
-		sow = _mm_extract_epi32(tmp2, 1);
-		tow = _mm_extract_epi32(tmp2, 0);
-#else
-		sow = _mm_cvtsi128_si32(_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 0, 1)));
-		tow = _mm_cvtsi128_si32(tmp2);
-#endif
-		oowlog = -fast_log2(_mm_cvtsd_f64(m_w), 0);
-	}
-
-private:
-	__m128d m_st;
-	__m128d m_w;
-};
-
-#else
-
-class stw_helper
-{
-public:
-	stw_helper() {}
-	stw_helper(stw_helper const &other) = default;
-	stw_helper &operator=(stw_helper const &other) = default;
-
-	void set(s64 s, s64 t, s64 w)
-	{
-		m_s = s;
-		m_t = t;
-		m_w = w;
-	}
-
-	bool is_w_neg() const
-	{
-		return (m_w < 0) ? true : false;
-	}
-
-	void get_st_shiftr(s32 &s, s32 &t, s32 shift) const
-	{
-		s = m_s >> shift;
-		t = m_t >> shift;
-	}
-
-	void add(stw_helper const &other)
-	{
-		m_s += other.m_s;
-		m_t += other.m_t;
-		m_w += other.m_w;
-	}
-
-	// Computes s/w and t/w and returns log2 of 1/w
-	// s, t and c are 16.32 values.  The results are 24.8.
-	void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const
-	{
-		double recip = double(1ULL << (47 - 39)) / m_w;
-		sow = s32(m_s * recip);
-		tow = s32(m_t * recip);
-		oowlog = fast_log2(recip, 56);
-	}
-
-private:
-	s64 m_s, m_t, m_w;
-};
-
-#endif
-
-
-
 //**************************************************************************
 //  COLOR SOURCE
 //**************************************************************************
@ -978,26 +905,36 @@ inline rgb_t rasterizer_texture::lookup_single_texel(u32 format, u32 texbase, s3
 //  the S,T coordinates and LOD
 //-------------------------------------------------

-inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, const stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask)
+inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask)
 {
-	lod = lodbase;
-
-	// determine the S/T/LOD values for this texture
+	// determine the S/T/LOD values for this texture; iterated S/T are
+	// in 32.32 format and we want final S/T in 24.8 format
 	s32 s, t;
 	if (texmode.enable_perspective())
 	{
-		s32 wlog;
-		iterstw.calc_stow(s, t, wlog);
-		lod += wlog;
+		// iterw is also 32.32, so division would leave no fractional bits;
+		// the 256 factor is there to produce 8 bits of fraction
+		double recip = 256.0 / iterw;
+		s = s32(iters * recip);
+		t = s32(itert * recip);
+
+		// compute the log2 of the non-reciprocal W value; negating it gives
+		// the log2 of the reciprocal, so we subtract instead of add it
+		lod -= fast_log2(iterw, 32);
 	}
 	else
-		iterstw.get_st_shiftr(s, t, 14 + 10);
+	{
+		// scale the .32 values down to .8 values as doubles to avoid 64-bit
+		// integers
+		s = s32(iters * (1.0 / (1 << 24)));
+		t = s32(itert * (1.0 / (1 << 24)));
+	}

-	// clamp W
-	if (texmode.clamp_neg_w() && iterstw.is_w_neg())
+	// clamp S/T if the iterated W is negative
+	if (texmode.clamp_neg_w() && iterw < 0)
 		s = t = 0;

-	// clamp the LOD
+	// clamp the LOD after applying bias and dither
 	lod += m_lodbias;
 	if (texmode.enable_lod_dither())
 		lod += dither.raw_4x4(x) << 4;
@ -1018,8 +955,8 @@ inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_m
 	rgbaint_t result;
 	if ((lod == m_lodmin && !texmode.magnification_filter()) || (lod != m_lodmin && !texmode.minification_filter()))
 	{
-		// adjust S/T for the LOD and strip off the fractions
-		ilod += 18 - 10;
+		// incorporate the fraction shift into ilod
+		ilod += 8;
 		s >>= ilod;
 		t >>= ilod;

@ -2287,8 +2224,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
 	reg_alpha_mode const alphamode(AlphaMode, poly.raster.alphamode());
 	reg_fbz_mode const fbzmode(FbzMode, poly.raster.fbzmode());
 	reg_fog_mode const fogmode(FogMode, poly.raster.fogmode());
-	stw_helper iterstw0, iterstw1;
-	stw_helper deltastw0, deltastw1;
+	double iters0, itert0, iterw0, iters1, itert1, iterw1;
+	double deltas0, deltat0, deltaw0, deltas1, deltat1, deltaw1;
 	u32 stipple = poly.stipple;

 	// determine the screen Y
@ -2359,21 +2296,23 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
 	s32 lodbase0 = 0;
 	if (GenericFlags & rasterizer_params::GENERIC_TEX0)
 	{
-		iterstw0.set(
-			poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx,
-			poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx,
-			poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx);
-		deltastw0.set(poly.ds0dx, poly.dt0dx, poly.dw0dx);
+		deltas0 = double(poly.ds0dx);
+		deltat0 = double(poly.dt0dx);
+		deltaw0 = double(poly.dw0dx);
+		iters0 = double(poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx);
+		itert0 = double(poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx);
+		iterw0 = double(poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx);
 		lodbase0 = compute_lodbase(poly.ds0dx, poly.ds0dy, poly.dt0dx, poly.dt0dy);
 	}
 	s32 lodbase1 = 0;
 	if (GenericFlags & rasterizer_params::GENERIC_TEX1)
 	{
-		iterstw1.set(
-			poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx,
-			poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx,
-			poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx);
-		deltastw1.set(poly.ds1dx, poly.dt1dx, poly.dw1dx);
+		deltas1 = double(poly.ds1dx);
+		deltat1 = double(poly.dt1dx);
+		deltaw1 = double(poly.dw1dx);
+		iters1 = double(poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx);
+		itert1 = double(poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx);
+		iterw1 = double(poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx);
 		lodbase1 = compute_lodbase(poly.ds1dx, poly.ds1dy, poly.dt1dx, poly.dt1dy);
 	}
 	poly.info->scanlines++;
@ -2402,8 +2341,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
 			rgbaint_t texel(0);
 			if (GenericFlags & rasterizer_params::GENERIC_TEX1)
 			{
-				s32 lod1;
-				rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iterstw1, lodbase1, lod1, m_bilinear_mask);
+				s32 lod1 = lodbase1;
+				rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iters1, itert1, iterw1, lod1, m_bilinear_mask);
 				if (GenericFlags & rasterizer_params::GENERIC_TEX1_IDENTITY)
 					texel = texel_t1;
 				else
@ -2417,8 +2356,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
 				// we should send the configuration byte
 				if (!texmode0.seq_8_downld())
 				{
-					s32 lod0;
-					rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iterstw0, lodbase0, lod0, m_bilinear_mask);
+					s32 lod0 = lodbase0;
+					rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iters0, itert0, iterw0, lod0, m_bilinear_mask);
 					if (GenericFlags & rasterizer_params::GENERIC_TEX0_IDENTITY)
 						texel = texel_t0;
 					else
@ -2455,9 +2394,17 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent,
 		iterz += poly.dzdx;
 		iterw += iterw_delta;
 		if (GenericFlags & rasterizer_params::GENERIC_TEX0)
-			iterstw0.add(deltastw0);
+		{
+			iters0 += deltas0;
+			itert0 += deltat0;
+			iterw0 += deltaw0;
+		}
 		if (GenericFlags & rasterizer_params::GENERIC_TEX1)
-			iterstw1.add(deltastw1);
+		{
+			iters1 += deltas1;
+			itert1 += deltat1;
+			iterw1 += deltaw1;
+		}
 	}
 }

--- a/src/devices/video/voodoo_render.h
+++ b/src/devices/video/voodoo_render.h
@ -24,7 +24,6 @@ namespace voodoo
 struct rasterizer_info;
 struct poly_data;
 class dither_helper;
-class stw_helper;

 // base class for our renderer
 using voodoo_poly_manager = poly_manager<float, poly_data, 0, POLY_FLAG_NO_CLIPPING>;
@ -354,7 +353,7 @@ public:
 	rgb_t lookup_single_texel(u32 format, u32 texbase, s32 s, s32 t);

 	// fetch a texel given coordinates and LOD information
-	rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, const voodoo::stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask);
+	rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask);

 	// texture-specific color combination unit
 	rgbaint_t combine_texture(voodoo::reg_texture_mode const texmode, rgbaint_t const &c_local, rgbaint_t const &c_other, s32 lod);
@ -613,40 +612,6 @@ private:
 	std::vector<thread_stats_block> m_thread_stats;
 };

-
-
-//**************************************************************************
-//  MATH HELPERS
-//**************************************************************************
-
-//-------------------------------------------------
-//  fast_log2 - computes the log2 of a double-
-//  precision value as a 24.8 value
-//-------------------------------------------------
-
-inline s32 fast_log2(double value, int offset)
-{
-	// negative values return 0
-	if (UNEXPECTED(value < 0))
-		return 0;
-
-	// convert the value to a raw integer
-	union { double d; u64 i; } temp;
-	temp.d = value;
-
-	// we only care about the 11-bit exponent and top 4 bits of mantissa
-	// (sign is already assured to be 0)
-	u32 ival = temp.i >> 48;
-
-	// exponent in the upper bits, plus an 8-bit log value from 4 bits of mantissa
-	s32 exp = (ival >> 4) - 1023 + 32 - offset;
-
-	// the maximum error using a 4 bit lookup from the mantissa is 0.0875, which is
-	// less than 1/2 lsb (0.125) for 2 bits of fraction
-	static u8 const s_log2_table[16] = { 0, 22, 44, 63, 82, 100, 118, 134, 150, 165, 179, 193, 207, 220, 232, 244 };
-	return (exp << 8) | s_log2_table[ival & 15];
-}
-
 }

 #endif // MAME_VIDEO_VOODOO_RENDER_H