diff --git a/src/emu/video/vooddefs.h b/src/emu/video/vooddefs.h index f20ed902896..bb320bf237e 100644 --- a/src/emu/video/vooddefs.h +++ b/src/emu/video/vooddefs.h @@ -26,6 +26,9 @@ enum // Use old macro style or newer SSE2 optimized functions #define USE_OLD_RASTER 0 +// Use old table lookup versus straight double divide +#define USE_FAST_RECIP 1 + /* maximum number of TMUs */ #define MAX_TMU 2 @@ -2196,11 +2199,8 @@ do } \ while (0) -/* use SSE on 64-bit implementations, where it can be assumed */ -#if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (defined(__SSE2__) || defined(_MSC_VER)) && defined(PTR64) - // NB: This code should no longer be SSE2-specific now that it uses rgbaint_t, consider removing the #define and the #else case. -INLINE UINT32 clampARGB(INT32 iterr, INT32 iterg, INT32 iterb, INT32 itera, UINT32 FBZCP) +INLINE UINT32 ATTR_FORCE_INLINE clampARGB(INT32 iterr, INT32 iterg, INT32 iterb, INT32 itera, UINT32 FBZCP) { rgb_t result; rgbaint_t colorint((INT32) (itera>>12), (INT32) (iterr>>12), (INT32) (iterg>>12), (INT32) (iterb>>12)); @@ -2228,61 +2228,6 @@ INLINE UINT32 clampARGB(INT32 iterr, INT32 iterg, INT32 iterb, INT32 itera, UINT } } -#else - -INLINE UINT32 clampARGB(INT32 iterr, INT32 iterg, INT32 iterb, INT32 itera, UINT32 FBZCP) -{ - rgb_union result; - INT16 r, g, b, a; - r = (INT16)(iterr >> 12); - g = (INT16)(iterg >> 12); - b = (INT16)(iterb >> 12); - a = (INT16)(itera >> 12); - - if (FBZCP_RGBZW_CLAMP(FBZCP) == 0) - { - r &= 0xfff; - result.rgb.r = r; - if (r == 0xfff) - result.rgb.r = 0; - else if (r == 0x100) - result.rgb.r = 0xff; - - g &= 0xfff; - result.rgb.g = g; - if (g == 0xfff) - result.rgb.g = 0; - else if (g == 0x100) - result.rgb.g = 0xff; - - b &= 0xfff; - result.rgb.b = b; - if (b == 0xfff) - result.rgb.b = 0; - else if (b == 0x100) - result.rgb.b = 0xff; - - a &= 0xfff; - result.rgb.a = a; - if (a == 0xfff) - result.rgb.a = 0; - else if (a == 0x100) - result.rgb.a = 0xff; - } - else - { - result.rgb.r = (r < 0) ? 0 : (r > 0xff) ? 0xff : r; - result.rgb.g = (g < 0) ? 0 : (g > 0xff) ? 0xff : g; - result.rgb.b = (b < 0) ? 0 : (b > 0xff) ? 0xff : b; - result.rgb.a = (a < 0) ? 0 : (a > 0xff) ? 0xff : a; - } - return result.u; -} - -#endif - - - #define CLAMPED_Z(ITERZ, FBZCP, RESULT) \ do \ { \ @@ -2399,7 +2344,7 @@ do } \ while (0) -INLINE bool chromaKeyTest(voodoo_state *v, stats_block *stats, UINT32 fbzModeReg, rgb_union color) +INLINE bool ATTR_FORCE_INLINE chromaKeyTest(voodoo_state *v, stats_block *stats, UINT32 fbzModeReg, rgb_union color) { if (FBZMODE_ENABLE_CHROMAKEY(fbzModeReg)) { @@ -2572,7 +2517,7 @@ do } \ while (0) -INLINE bool alphaTest(voodoo_state *v, stats_block *stats, UINT32 alphaModeReg, UINT8 alpha) +INLINE bool ATTR_FORCE_INLINE alphaTest(voodoo_state *v, stats_block *stats, UINT32 alphaModeReg, UINT8 alpha) { if (ALPHAMODE_ALPHATEST(alphaModeReg)) { @@ -2801,7 +2746,7 @@ do } \ while (0) -INLINE void alphaBlend(UINT32 FBZMODE, UINT32 ALPHAMODE, INT32 x, const UINT8 *dither, int dpix, UINT16 *depth, rgb_union preFog, rgb_union &color) +INLINE void ATTR_FORCE_INLINE alphaBlend(UINT32 FBZMODE, UINT32 ALPHAMODE, INT32 x, const UINT8 *dither, int dpix, UINT16 *depth, rgb_union preFog, rgb_union &color) { if (ALPHAMODE_ALPHABLEND(ALPHAMODE)) { @@ -3112,7 +3057,7 @@ do } \ while (0) -INLINE void applyFogging(voodoo_state *v, UINT32 fogModeReg, UINT32 fbzCpReg, INT32 x, const UINT8 *dither4, INT32 fogDepth, rgb_union &color, INT32 iterz, INT64 iterw, rgb_union iterargb) +INLINE void ATTR_FORCE_INLINE applyFogging(voodoo_state *v, UINT32 fogModeReg, UINT32 fbzCpReg, INT32 x, const UINT8 *dither4, INT32 fogDepth, rgb_union &color, INT32 iterz, INT64 iterw, rgb_union iterargb) { if (FOGMODE_ENABLE_FOG(fogModeReg)) { @@ -3251,17 +3196,21 @@ do { \ INT32 blendr, blendg, blendb, blenda; \ INT32 tr, tg, tb, ta; \ - INT32 oow, s, t, lod, ilod; \ + INT32 s, t, lod, ilod; \ INT32 smax, tmax; \ UINT32 texbase; \ rgb_union c_local; \ - \ + \ /* determine the S/T/LOD values for this texture */ \ if (TEXMODE_ENABLE_PERSPECTIVE(TEXMODE)) \ { \ - oow = fast_reciplog((ITERW), &lod); \ - s = ((INT64)oow * (ITERS)) >> 29; \ - t = ((INT64)oow * (ITERT)) >> 29; \ + if (USE_FAST_RECIP) { \ + const INT32 oow = fast_reciplog((ITERW), &lod); \ + s = ((INT64)oow * (ITERS)) >> 29; \ + t = ((INT64)oow * (ITERT)) >> 29; \ + } else { \ + multi_reciplog(ITERS, ITERT, ITERW, lod, s, t); \ + } \ lod += (LODBASE); \ } \ else \ @@ -3753,7 +3702,7 @@ do } \ while (0) -INLINE bool depthTest(UINT16 zaColorReg, stats_block *stats, INT32 destDepth, UINT32 fbzModeReg, INT32 biasdepth) +INLINE bool ATTR_FORCE_INLINE depthTest(UINT16 zaColorReg, stats_block *stats, INT32 destDepth, UINT32 fbzModeReg, INT32 biasdepth) { /* handle depth buffer testing */ if (FBZMODE_ENABLE_DEPTHBUF(fbzModeReg)) @@ -4178,7 +4127,7 @@ do } \ while (0) -INLINE bool combineColor(voodoo_state *VV, stats_block *STATS, UINT32 FBZCOLORPATH, UINT32 FBZMODE, UINT32 ALPHAMODE, +INLINE bool ATTR_FORCE_INLINE combineColor(voodoo_state *VV, stats_block *STATS, UINT32 FBZCOLORPATH, UINT32 FBZMODE, UINT32 ALPHAMODE, rgb_union TEXELARGB, INT32 ITERZ, INT64 ITERW, rgb_union ITERARGB, rgb_union &color) { rgb_union c_other; @@ -4621,7 +4570,39 @@ static void raster_##name(void *destbase, INT32 y, const poly_extent *extent, co } \ } -INLINE UINT32 genTexture(tmu_state *TT, INT32 x, const UINT8 *dither4, const UINT32 TEXMODE, rgb_t *LOOKUP, INT32 LODBASE, INT64 ITERS, INT64 ITERT, INT64 ITERW, INT32 &lod) +// ****************************************************************************************************************************** +// Computes a log2 of a 16.32 value to 2 fractional bits of precision. +// The return value is coded as a 24.8 value. +// The maximum error using a 4 bit lookup from the mantissa is 0.0875, which is less than 1/2 lsb (0.125) for 2 bits of fraction. +// ****************************************************************************************************************************** +INLINE INT32 ATTR_FORCE_INLINE new_log2(double &value) +{ + static const INT32 new_log2_table[16] = {0, 22, 44, 63, 82, 100, 118, 134, 150, 165, 179, 193, 207, 220, 232, 244}; + UINT64 ival = *((UINT64 *)&value); + // We zero the result if negative so don't worry about the sign bit + INT32 exp = (ival>>52); + exp -= 1023+32; + exp <<= 8; + UINT32 addr = (UINT64)(ival>>48) & 0xf; + exp += new_log2_table[addr]; + // Return 0 if negative + return (ival & ((UINT64)1<<63)) ? 0 : exp; +} + +// Computes A/C and B/C and returns log2 of 1/C +// A, B and C are 16.32 values. The results are 14.18. +INLINE void ATTR_FORCE_INLINE multi_reciplog(INT64 valueA, INT64 valueB, INT64 valueC, INT32 &log, INT32 &resA, INT32 &resB) +{ + double recip = 1.0f/valueC; + double resAD = valueA * recip * ((INT64)1<<(47-29)); + double resBD = valueB * recip * ((INT64)1<<(47-29)); + log = new_log2(recip); + resA = resAD; + resB = resBD; +} + + +INLINE UINT32 ATTR_FORCE_INLINE genTexture(tmu_state *TT, INT32 x, const UINT8 *dither4, const UINT32 TEXMODE, rgb_t *LOOKUP, INT32 LODBASE, INT64 ITERS, INT64 ITERT, INT64 ITERW, INT32 &lod) { UINT32 result; INT32 s, t, ilod; @@ -4631,10 +4612,14 @@ INLINE UINT32 genTexture(tmu_state *TT, INT32 x, const UINT8 *dither4, const UIN if (TEXMODE_ENABLE_PERSPECTIVE(TEXMODE)) { INT32 wLog; - const INT32 oow = fast_reciplog((ITERW), &wLog); + if (USE_FAST_RECIP) { + const INT32 oow = fast_reciplog((ITERW), &wLog); + s = ((INT64)oow * (ITERS)) >> 29; + t = ((INT64)oow * (ITERT)) >> 29; + } else { + multi_reciplog(ITERS, ITERT, ITERW, wLog, s, t); + } lod += wLog; - s = ((INT64)oow * (ITERS)) >> 29; - t = ((INT64)oow * (ITERT)) >> 29; } else { @@ -4808,7 +4793,7 @@ INLINE UINT32 genTexture(tmu_state *TT, INT32 x, const UINT8 *dither4, const UIN return result; } -INLINE UINT32 combineTexture(tmu_state *TT, const UINT32 TEXMODE, rgb_union c_local, rgb_union c_other, INT32 lod) +INLINE UINT32 ATTR_FORCE_INLINE combineTexture(tmu_state *TT, const UINT32 TEXMODE, rgb_union c_local, rgb_union c_other, INT32 lod) { UINT32 result; //INT32 blendr, blendg, blendb, blenda;