diff --git a/src/devices/video/voodoo_render.cpp b/src/devices/video/voodoo_render.cpp index 643511a2690..c715aed8a9f 100644 --- a/src/devices/video/voodoo_render.cpp +++ b/src/devices/video/voodoo_render.cpp @@ -16,7 +16,6 @@ namespace voodoo static constexpr bool LOG_RASTERIZERS = false; - struct static_rasterizer_info { voodoo_renderer::rasterizer_mfp mfp; @@ -60,7 +59,7 @@ static const rectangle global_cliprect(-4096, 4095, -4096, 4095); // point version of the iterated W value //------------------------------------------------- -static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw) +inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw) { int exp = count_leading_zeros_64(iterw) - 16; if (exp < 0) @@ -78,7 +77,7 @@ static inline s32 ATTR_FORCE_INLINE compute_wfloat(s64 iterw) // manual //------------------------------------------------- -static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp) +inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb, reg_fbz_colorpath const fbzcp) { rgbaint_t result(iterargb); result.shr_imm(20); @@ -113,7 +112,7 @@ static inline rgbaint_t ATTR_FORCE_INLINE clamped_argb(const rgbaint_t &iterargb // described in the Voodoo manual //------------------------------------------------- -static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp) +inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const fbzcp) { // clamped case is easy if (fbzcp.rgbzw_clamp() != 0) @@ -135,7 +134,7 @@ static inline s32 ATTR_FORCE_INLINE clamped_z(s32 iterz, reg_fbz_colorpath const // described in the Voodoo manual //------------------------------------------------- -static inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp) +inline s32 ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const fbzcp) { // clamped case is easy if (fbzcp.rgbzw_clamp() != 0) @@ -151,29 +150,72 @@ static inline s32 
ATTR_FORCE_INLINE clamped_w(s64 iterw, reg_fbz_colorpath const } +//------------------------------------------------- +// fast_log2 - computes the log2 of a double- +// precision value as a 24.8 value; if the double +// was converted from a fixed-point integer, the +// number of fractional bits should be specified +// by fracbits +//------------------------------------------------- + +inline s32 ATTR_FORCE_INLINE fast_log2(double value, int fracbits = 0) +{ + // negative values return 0 + if (UNEXPECTED(value < 0)) + return 0; + + // convert the value to a raw integer + union { double d; u64 i; } temp; + temp.d = value; + + // we only care about the 11-bit exponent and top 7 bits of mantissa + // (sign is already assured to be 0) + u32 ival = temp.i >> 45; + + // extract exponent, unbias, and adjust for fixed-point fraction + s32 exp = (ival >> 7) - 1023 - fracbits; + + // use top 7 bits of mantissa to look up fractional log + static u8 const s_log2_table[128] = + { + 0, 2, 5, 8, 11, 14, 16, 19, 22, 25, 27, 30, 33, 35, 38, 40, + 43, 46, 48, 51, 53, 56, 58, 61, 63, 65, 68, 70, 73, 75, 77, 80, + 82, 84, 87, 89, 91, 93, 96, 98, 100, 102, 104, 106, 109, 111, 113, 115, + 117, 119, 121, 123, 125, 127, 129, 132, 134, 136, 138, 140, 141, 143, 145, 147, + 149, 151, 153, 155, 157, 159, 161, 162, 164, 166, 168, 170, 172, 173, 175, 177, + 179, 181, 182, 184, 186, 188, 189, 191, 193, 194, 196, 198, 200, 201, 203, 205, + 206, 208, 209, 211, 213, 214, 216, 218, 219, 221, 222, 224, 225, 227, 229, 230, + 232, 233, 235, 236, 238, 239, 241, 242, 244, 245, 247, 248, 250, 251, 253, 254 + }; + + // combine the integral and fractional parts + return (exp << 8) | s_log2_table[ival & 127]; +} + + + //------------------------------------------------- // compute_lodbase - compute the base LOD value //------------------------------------------------- -static inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 dsdx, s64 dsdy, s64 dtdx, s64 dtdy) +inline s32 ATTR_FORCE_INLINE compute_lodbase(s64 
dsdx, s64 dsdy, s64 dtdx, s64 dtdy) { - // compute (ds^2 + dt^2) in both X and Y as 28.36 numbers - dsdx >>= 14; - dsdy >>= 14; - dtdx >>= 14; - dtdy >>= 14; - s64 texdx = dsdx * dsdx + dtdx * dtdx; - s64 texdy = dsdy * dsdy + dtdy * dtdy; + // compute (ds^2 + dt^2) in both X and Y; note that these values are + // each .32, so the square is a .64 fixed point value + double fdsdx = double(dsdx); + double fdsdy = double(dsdy); + double fdtdx = double(dtdx); + double fdtdy = double(dtdy); + double texdx = fdsdx * fdsdx + fdtdx * fdtdx; + double texdy = fdsdy * fdsdy + fdtdy * fdtdy; - // pick whichever is larger and shift off some high bits -> 28.20 - s64 maxval = std::max(texdx, texdy) >> 16; + // pick whichever is larger + double maxval = std::max(texdx, texdy); - // use our fast reciprocal/log on this value; it expects input as a - // 16.32 number, and returns the log of the reciprocal, so we have to - // adjust the result: negative to get the log of the original value - // plus 12 to account for the extra exponent, and divided by 2 to - // get the log of the square root of texdx - return (fast_log2(double(maxval), 0) + (12 << 8)) / 2; + // use our fast log2 on this value; the 64 indicates how many bits of + // fractional resolution are in the source, and we divide by 2 because + // we really want the log of the square root + return fast_log2(maxval, 64) / 2; } @@ -351,121 +393,6 @@ void dither_helper::init_static() } - -//************************************************************************** -// STW HELPER -//************************************************************************** - -// use SSE on 64-bit implementations, where it can be assumed -#if 1 && ((!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64)) - -#include -#ifdef __SSE4_1__ -#include -#endif - -class stw_helper -{ -public: - stw_helper() { } - stw_helper(stw_helper const &other) = default; - stw_helper &operator=(stw_helper const 
&other) = default; - - void set(s64 s, s64 t, s64 w) - { - m_st = _mm_set_pd(s << 8, t << 8); - m_w = _mm_set1_pd(w); - } - - bool is_w_neg() const - { - return _mm_comilt_sd(m_w, _mm_set1_pd(0.0)); - } - - void get_st_shiftr(s32 &s, s32 &t, s32 shift) const - { - shift += 8; - s = _mm_cvtsd_si64(_mm_shuffle_pd(m_st, _mm_setzero_pd(), 1)) >> shift; - t = _mm_cvtsd_si64(m_st) >> shift; - } - - void add(stw_helper const &delta) - { - m_st = _mm_add_pd(m_st, delta.m_st); - m_w = _mm_add_pd(m_w, delta.m_w); - } - - void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const - { - __m128d tmp = _mm_div_pd(m_st, m_w); - __m128i tmp2 = _mm_cvttpd_epi32(tmp); -#ifdef __SSE4_1__ - sow = _mm_extract_epi32(tmp2, 1); - tow = _mm_extract_epi32(tmp2, 0); -#else - sow = _mm_cvtsi128_si32(_mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 0, 1))); - tow = _mm_cvtsi128_si32(tmp2); -#endif - oowlog = -fast_log2(_mm_cvtsd_f64(m_w), 0); - } - -private: - __m128d m_st; - __m128d m_w; -}; - -#else - -class stw_helper -{ -public: - stw_helper() {} - stw_helper(stw_helper const &other) = default; - stw_helper &operator=(stw_helper const &other) = default; - - void set(s64 s, s64 t, s64 w) - { - m_s = s; - m_t = t; - m_w = w; - } - - bool is_w_neg() const - { - return (m_w < 0) ? true : false; - } - - void get_st_shiftr(s32 &s, s32 &t, s32 shift) const - { - s = m_s >> shift; - t = m_t >> shift; - } - - void add(stw_helper const &other) - { - m_s += other.m_s; - m_t += other.m_t; - m_w += other.m_w; - } - - // Computes s/w and t/w and returns log2 of 1/w - // s, t and c are 16.32 values. The results are 24.8. 
- void calc_stow(s32 &sow, s32 &tow, s32 &oowlog) const - { - double recip = double(1ULL << (47 - 39)) / m_w; - sow = s32(m_s * recip); - tow = s32(m_t * recip); - oowlog = fast_log2(recip, 56); - } - -private: - s64 m_s, m_t, m_w; -}; - -#endif - - - //************************************************************************** // COLOR SOURCE //************************************************************************** @@ -978,26 +905,36 @@ inline rgb_t rasterizer_texture::lookup_single_texel(u32 format, u32 texbase, s3 // the S,T coordinates and LOD //------------------------------------------------- -inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, const stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask) +inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_mode const texmode, dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask) { - lod = lodbase; - - // determine the S/T/LOD values for this texture + // determine the S/T/LOD values for this texture; iterated S/T are + // in 32.32 format and we want final S/T in 24.8 format s32 s, t; if (texmode.enable_perspective()) { - s32 wlog; - iterstw.calc_stow(s, t, wlog); - lod += wlog; + // iterw is also 32.32, so division would leave no fractional bits; + // the 256 factor is there to produce 8 bits of fraction + double recip = 256.0 / iterw; + s = s32(iters * recip); + t = s32(itert * recip); + + // compute the log2 of the non-reciprocal W value; negating it gives + // the log2 of the reciprocal, so we subtract instead of add it + lod -= fast_log2(iterw, 32); } else - iterstw.get_st_shiftr(s, t, 14 + 10); + { + // scale the .32 values down to .8 values as doubles to avoid 64-bit + // integers + s = s32(iters * (1.0 / (1 << 24))); + t = s32(itert * (1.0 / (1 << 24))); + } - // clamp W - if (texmode.clamp_neg_w() && iterstw.is_w_neg()) + // clamp 
S/T if the iterated W is negative + if (texmode.clamp_neg_w() && iterw < 0) s = t = 0; - // clamp the LOD + // clamp the LOD after applying bias and dither lod += m_lodbias; if (texmode.enable_lod_dither()) lod += dither.raw_4x4(x) << 4; @@ -1018,8 +955,8 @@ inline rgbaint_t ATTR_FORCE_INLINE rasterizer_texture::fetch_texel(reg_texture_m rgbaint_t result; if ((lod == m_lodmin && !texmode.magnification_filter()) || (lod != m_lodmin && !texmode.minification_filter())) { - // adjust S/T for the LOD and strip off the fractions - ilod += 18 - 10; + // incorporate the fraction shift into ilod + ilod += 8; s >>= ilod; t >>= ilod; @@ -2287,8 +2224,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent, reg_alpha_mode const alphamode(AlphaMode, poly.raster.alphamode()); reg_fbz_mode const fbzmode(FbzMode, poly.raster.fbzmode()); reg_fog_mode const fogmode(FogMode, poly.raster.fogmode()); - stw_helper iterstw0, iterstw1; - stw_helper deltastw0, deltastw1; + double iters0, itert0, iterw0, iters1, itert1, iterw1; + double deltas0, deltat0, deltaw0, deltas1, deltat1, deltaw1; u32 stipple = poly.stipple; // determine the screen Y @@ -2359,21 +2296,23 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent, s32 lodbase0 = 0; if (GenericFlags & rasterizer_params::GENERIC_TEX0) { - iterstw0.set( - poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx, - poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx, - poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx); - deltastw0.set(poly.ds0dx, poly.dt0dx, poly.dw0dx); + deltas0 = double(poly.ds0dx); + deltat0 = double(poly.dt0dx); + deltaw0 = double(poly.dw0dx); + iters0 = double(poly.starts0 + dy * poly.ds0dy + dx * poly.ds0dx); + itert0 = double(poly.startt0 + dy * poly.dt0dy + dx * poly.dt0dx); + iterw0 = double(poly.startw0 + dy * poly.dw0dy + dx * poly.dw0dx); lodbase0 = compute_lodbase(poly.ds0dx, poly.ds0dy, poly.dt0dx, poly.dt0dy); } s32 lodbase1 = 0; if (GenericFlags & 
rasterizer_params::GENERIC_TEX1) { - iterstw1.set( - poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx, - poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx, - poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx); - deltastw1.set(poly.ds1dx, poly.dt1dx, poly.dw1dx); + deltas1 = double(poly.ds1dx); + deltat1 = double(poly.dt1dx); + deltaw1 = double(poly.dw1dx); + iters1 = double(poly.starts1 + dy * poly.ds1dy + dx * poly.ds1dx); + itert1 = double(poly.startt1 + dy * poly.dt1dy + dx * poly.dt1dx); + iterw1 = double(poly.startw1 + dy * poly.dw1dy + dx * poly.dw1dx); lodbase1 = compute_lodbase(poly.ds1dx, poly.ds1dy, poly.dt1dx, poly.dt1dy); } poly.info->scanlines++; @@ -2402,8 +2341,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent, rgbaint_t texel(0); if (GenericFlags & rasterizer_params::GENERIC_TEX1) { - s32 lod1; - rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iterstw1, lodbase1, lod1, m_bilinear_mask); + s32 lod1 = lodbase1; + rgbaint_t texel_t1 = poly.tex1->fetch_texel(texmode1, dither, x, iters1, itert1, iterw1, lod1, m_bilinear_mask); if (GenericFlags & rasterizer_params::GENERIC_TEX1_IDENTITY) texel = texel_t1; else @@ -2417,8 +2356,8 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent, // we should send the configuration byte if (!texmode0.seq_8_downld()) { - s32 lod0; - rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iterstw0, lodbase0, lod0, m_bilinear_mask); + s32 lod0 = lodbase0; + rgbaint_t texel_t0 = poly.tex0->fetch_texel(texmode0, dither, x, iters0, itert0, iterw0, lod0, m_bilinear_mask); if (GenericFlags & rasterizer_params::GENERIC_TEX0_IDENTITY) texel = texel_t0; else @@ -2455,9 +2394,17 @@ void voodoo_renderer::rasterizer(s32 y, const voodoo_renderer::extent_t &extent, iterz += poly.dzdx; iterw += iterw_delta; if (GenericFlags & rasterizer_params::GENERIC_TEX0) - iterstw0.add(deltastw0); + { + iters0 += deltas0; + itert0 += deltat0; + iterw0 += 
deltaw0; + } if (GenericFlags & rasterizer_params::GENERIC_TEX1) - iterstw1.add(deltastw1); + { + iters1 += deltas1; + itert1 += deltat1; + iterw1 += deltaw1; + } } } diff --git a/src/devices/video/voodoo_render.h b/src/devices/video/voodoo_render.h index fa3f134a84c..722dc080fdc 100644 --- a/src/devices/video/voodoo_render.h +++ b/src/devices/video/voodoo_render.h @@ -24,7 +24,6 @@ namespace voodoo struct rasterizer_info; struct poly_data; class dither_helper; -class stw_helper; // base class for our renderer using voodoo_poly_manager = poly_manager; @@ -354,7 +353,7 @@ public: rgb_t lookup_single_texel(u32 format, u32 texbase, s32 s, s32 t); // fetch a texel given coordinates and LOD information - rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, const voodoo::stw_helper &iterstw, s32 lodbase, s32 &lod, u8 bilinear_mask); + rgbaint_t fetch_texel(voodoo::reg_texture_mode const texmode, voodoo::dither_helper const &dither, s32 x, double iters, double itert, double iterw, s32 &lod, u8 bilinear_mask); // texture-specific color combination unit rgbaint_t combine_texture(voodoo::reg_texture_mode const texmode, rgbaint_t const &c_local, rgbaint_t const &c_other, s32 lod); @@ -613,40 +612,6 @@ private: std::vector m_thread_stats; }; - - -//************************************************************************** -// MATH HELPERS -//************************************************************************** - -//------------------------------------------------- -// fast_log2 - computes the log2 of a double- -// precision value as a 24.8 value -//------------------------------------------------- - -inline s32 fast_log2(double value, int offset) -{ - // negative values return 0 - if (UNEXPECTED(value < 0)) - return 0; - - // convert the value to a raw integer - union { double d; u64 i; } temp; - temp.d = value; - - // we only care about the 11-bit exponent and top 4 bits of mantissa - // (sign is already assured to be 
0) - u32 ival = temp.i >> 48; - - // exponent in the upper bits, plus an 8-bit log value from 4 bits of mantissa - s32 exp = (ival >> 4) - 1023 + 32 - offset; - - // the maximum error using a 4 bit lookup from the mantissa is 0.0875, which is - // less than 1/2 lsb (0.125) for 2 bits of fraction - static u8 const s_log2_table[16] = { 0, 22, 44, 63, 82, 100, 118, 134, 150, 165, 179, 193, 207, 220, 232, 244 }; - return (exp << 8) | s_log2_table[ival & 15]; -} - } #endif // MAME_VIDEO_VOODOO_RENDER_H