powervr2.cpp: turn some per-pixel branches into per-polygon branches

In commit ec325ef14f I did something stupid from
a performance standpoint, which was to select the shading function on a
per-pixel basis even though it can only change on a per-polygon basis.  This
commit uses C++ templates and inline functions so that the compiler's
constant folding can eliminate the switch statement that selects shading
functions (thus making the choice a per-polygon one).  I also did the same thing
for the texture and bilinear filtering checks.

This brings back a little less than half of the performance hit on my PC, so
things still aren't as fast as they used to be.

Benchmark (running a full cycle of Crazy Taxi NAOMI's attract) on my PC:

Before ec325ef14f: 30%
After ec325ef14f:  23%
With this commit:  26%

Regarding the other 4% of the drop, that is coming from the pixel shading.
When I hardcode powervr2 to always use shading instruction 0, performance
jumps up to 29.5%, but when I hardcode it to use shading instruction 3
performance falls a bit down to 25%.  Shading instruction 3 is the most
complicated of the four and shading instruction 0 is the simplest, so I think
that proves that the shading instructions themselves are a major bottleneck.
I don't have any ideas about how to make them faster yet.
This commit is contained in:
snickerbockers 2018-07-29 23:51:37 -07:00
parent b1c5b7c40d
commit 01b1422c26
2 changed files with 154 additions and 81 deletions

View File

@ -2301,7 +2301,54 @@ void powervr2_device::computedilated()
dilatechose[(b << 3) + a]=3+(a < b ? a : b); dilatechose[(b << 3) + a]=3+(a < b ? a : b);
} }
/*
 * sample_nontextured: pixel sampler used for polygons that have no texture.
 *
 * ti, u and v are accepted only so the signature matches the other samplers;
 * they are not read here.  The result is bls24(base_color, offset_color) with
 * its top byte (the alpha byte of the packed ARGB value) taken from
 * base_color.
 */
void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl_in[4], float const br_in[4], float const offl_in[4], float const offr_in[4]) inline uint32_t powervr2_device::sample_nontextured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color)
{
return bls24(base_color, offset_color) | (base_color & 0xff000000);
}
/*
 * sample_textured: pixel sampler used for textured polygons.
 *
 * Template parameters:
 *   tsinst   - shading instruction applied to the fetched texel:
 *              0 = decal, 1 = modulate, 2 = decal with alpha,
 *              3 = modulate with alpha.  Any other value returns the texel
 *              unshaded.
 *   bilinear - when true, three neighbouring texels are fetched as well and
 *              combined through bilinear_filter().
 *
 * Both parameters are compile-time constants so the compiler can discard the
 * branches that do not apply to a given instantiation.
 *
 * ti           - texture descriptor; ti->r is the texel read function.
 * u, v         - texture coordinates of the pixel.
 * offset_color - packed offset colour for the pixel.
 * base_color   - packed base colour for the pixel.
 *
 * Returns the shaded 32-bit colour.
 */
template <int tsinst, bool bilinear>
inline uint32_t powervr2_device::sample_textured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color)
{
	uint32_t texel = (this->*(ti->r))(ti, u, v);

	if (bilinear)
	{
		// neighbours are fetched in the order: right, diagonal, below
		const uint32_t texel_right = (this->*(ti->r))(ti, u+1.0f, v);
		const uint32_t texel_diag = (this->*(ti->r))(ti, u+1.0f, v+1.0f);
		const uint32_t texel_below = (this->*(ti->r))(ti, u, v+1.0f);
		texel = bilinear_filter(texel, texel_right, texel_diag, texel_below, u, v);
	}

	switch (tsinst) {
	case 0:
		// decal
		return bls24(texel, offset_color) | (texel & 0xff000000);
	case 1: {
		// modulate
		uint32_t shaded = blc(texel, base_color);
		shaded = bls24(shaded, offset_color);
		return shaded | (texel & 0xff000000);
	}
	case 2: {
		// decal with alpha
		const uint32_t mixed = bls24(bla(texel, texel), blia(base_color, texel));
		return bls24(mixed, offset_color) | (base_color & 0xff000000);
	}
	case 3: {
		// modulate with alpha
		uint32_t shaded = blc(texel, base_color);
		shaded = bls24(shaded, offset_color);
		return shaded | ((((texel >> 24) * (base_color >> 24)) >> 8) << 24);
	}
	}

	// unknown shading instruction: hand back the (possibly filtered) texel
	return texel;
}
template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl_in[4], float const br_in[4], float const offl_in[4], float const offr_in[4])
{ {
int idx; int idx;
int xxl, xxr; int xxl, xxr;
@ -2364,64 +2411,15 @@ void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, flo
while(xxl < xxr) { while(xxl < xxr) {
if((wl >= *wbufline)) { if((wl >= *wbufline)) {
uint32_t c;
float u = ul/wl; float u = ul/wl;
float v = vl/wl; float v = vl/wl;
/*
* TODO: Not sure if blending should be done in
* floating point or fixed point, or if it even matters.
*/
uint32_t c;
uint32_t offset_color = float_argb_to_packed_argb(offl); uint32_t offset_color = float_argb_to_packed_argb(offl);
uint32_t base_color = float_argb_to_packed_argb(bl); uint32_t base_color = float_argb_to_packed_argb(bl);
c = (this->*sample_fn)(ti, u, v, offset_color, base_color);
if (ti->textured) {
c = (this->*(ti->r))(ti, u, v);
// debug dip to turn on/off bilinear filtering, it's slooooow
if (debug_dip_status&0x1)
{
if(ti->filter_mode >= TEX_FILTER_BILINEAR)
{
uint32_t c1 = (this->*(ti->r))(ti, u+1.0f, v);
uint32_t c2 = (this->*(ti->r))(ti, u+1.0f, v+1.0f);
uint32_t c3 = (this->*(ti->r))(ti, u, v+1.0f);
c = bilinear_filter(c, c1, c2, c3, u, v);
}
}
uint32_t tmp;
switch (ti->tsinstruction) {
case 0:
// decal
c = bls24(c, offset_color) | (c & 0xff000000);
break;
case 1:
// modulate
tmp = blc(c, base_color);
tmp = bls24(tmp, offset_color);
tmp |= c & 0xff000000;
c = tmp;
break;
case 2:
// decal with alpha
tmp = bls24(bla(c, c), blia(base_color, c));
c = bls24(tmp, offset_color) | (base_color & 0xff000000);
break;
case 3:
// modulate with alpha
tmp = blc(c, base_color);
tmp = bls24(tmp, offset_color);
tmp |= (((c >> 24) * (base_color >> 24)) >> 8) << 24;
c = tmp;
break;
}
} else {
c = bls24(base_color, offset_color) | (base_color & 0xff000000);
}
if(c & 0xff000000) { if(c & 0xff000000) {
*tdata = ti->blend(c, *tdata);
*wbufline = wl; *wbufline = wl;
*tdata = ti->blend(c, *tdata);
} }
} }
wbufline++; wbufline++;
@ -2438,7 +2436,8 @@ void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, flo
} }
} }
void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti, template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti,
float y0, float y1, float y0, float y1,
float xl, float xr, float xl, float xr,
float ul, float ur, float ul, float ur,
@ -2516,7 +2515,7 @@ void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti,
} }
while(yy0 < yy1) { while(yy0 < yy1) {
render_hline(bitmap, ti, yy0, xl, xr, ul, ur, vl, vr, wl, wr, bl, br, offl, offr); render_hline<sample_fn>(bitmap, ti, yy0, xl, xr, ul, ur, vl, vr, wl, wr, bl, br, offl, offr);
xl += dxldy; xl += dxldy;
xr += dxrdy; xr += dxrdy;
@ -2568,7 +2567,8 @@ void powervr2_device::sort_vertices(const vert *v, int *i0, int *i1, int *i2)
} }
void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2) template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2)
{ {
float dy01, dy02, dy12; float dy01, dy02, dy12;
@ -2686,15 +2686,15 @@ void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const
return; return;
if(v1->x > v0->x) if(v1->x > v0->x)
render_span(bitmap, ti, v1->y, v2->y, v0->x, v1->x, v0->u, v1->u, v0->v, v1->v, v0->w, v1->w, v0->b, v1->b, v0->o, v1->o, dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy); render_span<sample_fn>(bitmap, ti, v1->y, v2->y, v0->x, v1->x, v0->u, v1->u, v0->v, v1->v, v0->w, v1->w, v0->b, v1->b, v0->o, v1->o, dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy);
else else
render_span(bitmap, ti, v1->y, v2->y, v1->x, v0->x, v1->u, v0->u, v1->v, v0->v, v1->w, v0->w, v1->b, v0->b, v1->o, v0->o, dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy); render_span<sample_fn>(bitmap, ti, v1->y, v2->y, v1->x, v0->x, v1->u, v0->u, v1->v, v0->v, v1->w, v0->w, v1->b, v0->b, v1->o, v0->o, dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy);
} else if(!dy12) { } else if(!dy12) {
if(v2->x > v1->x) if(v2->x > v1->x)
render_span(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy); render_span<sample_fn>(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy);
else else
render_span(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy); render_span<sample_fn>(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy);
} else { } else {
float idk_b[4] = { float idk_b[4] = {
@ -2710,17 +2710,17 @@ void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const
v0->o[3] + do02dy[3] * dy01 v0->o[3] + do02dy[3] * dy01
}; };
if(dx01dy < dx02dy) { if(dx01dy < dx02dy) {
render_span(bitmap, ti, v0->y, v1->y, render_span<sample_fn>(bitmap, ti, v0->y, v1->y,
v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o,
dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy); dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy);
render_span(bitmap, ti, v1->y, v2->y, render_span<sample_fn>(bitmap, ti, v1->y, v2->y,
v1->x, v0->x + dx02dy*dy01, v1->u, v0->u + du02dy*dy01, v1->v, v0->v + dv02dy*dy01, v1->w, v0->w + dw02dy*dy01, v1->b, idk_b, v1->o, idk_o, v1->x, v0->x + dx02dy*dy01, v1->u, v0->u + du02dy*dy01, v1->v, v0->v + dv02dy*dy01, v1->w, v0->w + dw02dy*dy01, v1->b, idk_b, v1->o, idk_o,
dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy); dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy);
} else { } else {
render_span(bitmap, ti, v0->y, v1->y, render_span<sample_fn>(bitmap, ti, v0->y, v1->y,
v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o,
dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy); dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy);
render_span(bitmap, ti, v1->y, v2->y, render_span<sample_fn>(bitmap, ti, v1->y, v2->y,
v0->x + dx02dy*dy01, v1->x, v0->u + du02dy*dy01, v1->u, v0->v + dv02dy*dy01, v1->v, v0->w + dw02dy*dy01, v1->w, idk_b, v1->b, idk_o, v1->o, v0->x + dx02dy*dy01, v1->x, v0->u + du02dy*dy01, v1->u, v0->v + dv02dy*dy01, v1->v, v0->w + dw02dy*dy01, v1->w, idk_b, v1->b, idk_o, v1->o,
dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy); dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy);
} }
@ -2732,7 +2732,59 @@ void powervr2_device::render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *
int i0, i1, i2; int i0, i1, i2;
sort_vertices(v, &i0, &i1, &i2); sort_vertices(v, &i0, &i1, &i2);
render_tri_sorted(bitmap, ti, v+i0, v+i1, v+i2);
/*
 * Choose the pixel sampler once per polygon and hand it to render_tri_sorted
 * as a template argument, so the texture / bilinear / shading-instruction
 * checks are not repeated for every pixel.
 */
bool textured = ti->textured;
if (textured) {
	// bit 0 of the debug dip switches bilinear filtering on or off
	bool bilinear = (debug_dip_status & 1) &&
		(ti->filter_mode >= TEX_FILTER_BILINEAR);
	if (bilinear) {
		switch (ti->tsinstruction) {
		case 0:
			render_tri_sorted<&powervr2_device::sample_textured<0,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 1:
			render_tri_sorted<&powervr2_device::sample_textured<1,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 2:
			render_tri_sorted<&powervr2_device::sample_textured<2,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 3:
			render_tri_sorted<&powervr2_device::sample_textured<3,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		default:
			/*
			 * This should be impossible because tsinstruction was previously
			 * AND'd with 3
			 */
			// the format has two conversions, so the function name is passed for %s
			logerror("%s - tsinstruction is 0x%08x\n", __func__, (unsigned)ti->tsinstruction);
			render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		}
	} else {
		switch (ti->tsinstruction) {
		case 0:
			render_tri_sorted<&powervr2_device::sample_textured<0,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 1:
			render_tri_sorted<&powervr2_device::sample_textured<1,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 2:
			render_tri_sorted<&powervr2_device::sample_textured<2,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 3:
			render_tri_sorted<&powervr2_device::sample_textured<3,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		default:
			/*
			 * This should be impossible because tsinstruction was previously
			 * AND'd with 3
			 */
			// the format has two conversions, so the function name is passed for %s
			logerror("%s - tsinstruction is 0x%08x\n", __func__, (unsigned)ti->tsinstruction);
			render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		}
	}
} else {
	// untextured polygons only combine the base and offset colours
	render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
}
} }
void powervr2_device::render_to_accumulation_buffer(bitmap_rgb32 &bitmap,const rectangle &cliprect) void powervr2_device::render_to_accumulation_buffer(bitmap_rgb32 &bitmap,const rectangle &cliprect)

View File

@ -305,6 +305,14 @@ public:
void pvr_scanline_timer(int vpos); void pvr_scanline_timer(int vpos);
uint32_t screen_update(screen_device &screen, bitmap_rgb32 &bitmap, const rectangle &cliprect); uint32_t screen_update(screen_device &screen, bitmap_rgb32 &bitmap, const rectangle &cliprect);
// Pointer to a powervr2_device member function that produces one 32-bit pixel
// colour from (texinfo, u, v, offset_color, base_color).  It is the type of
// the non-type template parameter taken by the render_hline / render_span /
// render_tri_sorted templates.
typedef uint32_t(powervr2_device::*pix_sample_fn)(texinfo*,float,float,uint32_t,uint32_t);
// NOTE(review): this std::function alias is not referenced in the visible
// code, and it shares its name with the template parameter `sample_fn` of the
// render_* templates - confirm whether it is still needed.
typedef std::function<uint32_t(texinfo*,float,float,uint32_t,uint32_t)> sample_fn;
// Sampler for untextured polygons: combines base_color and offset_color and
// does not read ti, u or v.
inline uint32_t sample_nontextured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color);
// Sampler for textured polygons.  tsinst is the shading instruction (0-3) and
// bilinear selects whether neighbouring texels are fetched and filtered; both
// are fixed at compile time.
template <int tsinst, bool bilinear>
inline uint32_t sample_textured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color);
protected: protected:
virtual void device_start() override; virtual void device_start() override;
virtual void device_reset() override; virtual void device_reset() override;
@ -462,8 +470,16 @@ private:
uint32_t tex_r_default(texinfo *t, float x, float y); uint32_t tex_r_default(texinfo *t, float x, float y);
void tex_get_info(texinfo *t); void tex_get_info(texinfo *t);
void render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl[4], float const br[4], float const offl[4], float const offr[4]); template <pix_sample_fn sample_fn>
void render_span(bitmap_rgb32 &bitmap, texinfo *ti, inline void render_hline(bitmap_rgb32 &bitmap, texinfo *ti,
int y, float xl, float xr,
float ul, float ur, float vl, float vr,
float wl, float wr,
float const bl[4], float const br[4],
float const offl[4], float const offr[4]);
template <pix_sample_fn sample_fn>
inline void render_span(bitmap_rgb32 &bitmap, texinfo *ti,
float y0, float y1, float y0, float y1,
float xl, float xr, float xl, float xr,
float ul, float ur, float ul, float ur,
@ -477,8 +493,13 @@ private:
float dwldy, float dwrdy, float dwldy, float dwrdy,
float const dbldy[4], float const dbrdy[4], float const dbldy[4], float const dbrdy[4],
float const doldy[4], float const dordy[4]); float const doldy[4], float const dordy[4]);
// Rasterises one triangle.  render_tri passes v0, v1, v2 in the order produced
// by sort_vertices.  sample_fn is the per-pixel sampler, fixed at compile time
// and forwarded unchanged to render_span.
template <pix_sample_fn sample_fn>
inline void render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti,
const vert *v0,
const vert *v1, const vert *v2);
void sort_vertices(const vert *v, int *i0, int *i1, int *i2); void sort_vertices(const vert *v, int *i0, int *i1, int *i2);
void render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2);
void render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v); void render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v);
void render_to_accumulation_buffer(bitmap_rgb32 &bitmap, const rectangle &cliprect); void render_to_accumulation_buffer(bitmap_rgb32 &bitmap, const rectangle &cliprect);
void pvr_accumulationbuffer_to_framebuffer(address_space &space, int x, int y); void pvr_accumulationbuffer_to_framebuffer(address_space &space, int x, int y);