powervr2.cpp: turn some per-pixel branches into per-polygon branches

In commit ec325ef14f I did something stupid from
a performance standpoint, which was to select the shading function on a
per-pixel basis even though it can only change on a per-polygon basis.  This
commit uses C++ templates and inline functions so that the compiler's
constant folding can eliminate the switch statement that selects shading
functions (thus making the choice a per-polygon one).  I also did the same thing
for the texture and bilinear filtering checks.

This brings back a little less than half of the performance hit on my PC, so
things still aren't as fast as they used to be.

Benchmark (running a full cycle of Crazy Taxi NAOMI's attract) on my PC:

Before ec325ef14f: 30%
After ec325ef14f:  23%
With this commit:  26%

Regarding the other 4% of the drop, that is coming from the pixel shading.
When I hardcode powervr2 to always use shading instruction 0, performance
jumps up to 29.5%, but when I hardcode it to use shading instruction 3
performance falls a bit down to 25%.  Shading instruction 3 is the most
complicated of the four and shading instruction 0 is the simplest, so I think
that proves that the shading instructions themselves are a major bottleneck.
I don't have any ideas about how to make them faster yet.
This commit is contained in:
snickerbockers 2018-07-29 23:51:37 -07:00
parent b1c5b7c40d
commit 01b1422c26
2 changed files with 154 additions and 81 deletions

View File

@ -2301,7 +2301,54 @@ void powervr2_device::computedilated()
dilatechose[(b << 3) + a]=3+(a < b ? a : b); dilatechose[(b << 3) + a]=3+(a < b ? a : b);
} }
void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl_in[4], float const br_in[4], float const offl_in[4], float const offr_in[4]) inline uint32_t powervr2_device::sample_nontextured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color)
{
// Sampler for polygons without a texture: ti, u and v are accepted only so the
// signature matches the other samplers and are not read here.  The result is
// bls24(base_color, offset_color) with the top byte (the alpha byte of the
// packed ARGB value) copied straight from base_color.
return bls24(base_color, offset_color) | (base_color & 0xff000000);
}
/*
 * Per-pixel sampler for textured polygons.
 *
 * tsinst   - shading instruction (0-3), fixed at compile time so the switch
 *            below can be folded away by the compiler.
 * bilinear - when true, three neighbouring texels are fetched as well and
 *            passed through bilinear_filter().
 *
 * Returns the shaded colour for texture coordinates (u, v), combining the
 * texel with base_color and offset_color according to tsinst.
 */
template <int tsinst, bool bilinear>
inline uint32_t powervr2_device::sample_textured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color)
{
	// texel at the requested coordinate, read through the texinfo's reader
	uint32_t texel = (this->*(ti->r))(ti, u, v);

	if (bilinear)
	{
		// neighbours: +1 in u, +1 in both, +1 in v (same fetch order as before)
		const uint32_t texel_u  = (this->*(ti->r))(ti, u+1.0f, v);
		const uint32_t texel_uv = (this->*(ti->r))(ti, u+1.0f, v+1.0f);
		const uint32_t texel_v  = (this->*(ti->r))(ti, u, v+1.0f);
		texel = bilinear_filter(texel, texel_u, texel_uv, texel_v, u, v);
	}

	switch (tsinst) {
	case 0:
	{
		// decal: texel plus offset colour, top byte taken from the texel
		const uint32_t shaded = bls24(texel, offset_color) | (texel & 0xff000000);
		return shaded;
	}
	case 1:
	{
		// modulate: texel combined with base colour, top byte taken from the texel
		uint32_t shaded = blc(texel, base_color);
		shaded = bls24(shaded, offset_color);
		shaded |= texel & 0xff000000;
		return shaded;
	}
	case 2:
	{
		// decal with alpha: top byte taken from the base colour
		const uint32_t mixed = bls24(bla(texel, texel), blia(base_color, texel));
		const uint32_t shaded = bls24(mixed, offset_color) | (base_color & 0xff000000);
		return shaded;
	}
	case 3:
	{
		// modulate with alpha: top bytes of texel and base colour are multiplied
		uint32_t shaded = blc(texel, base_color);
		shaded = bls24(shaded, offset_color);
		shaded |= (((texel >> 24) * (base_color >> 24)) >> 8) << 24;
		return shaded;
	}
	}

	// any other tsinst: hand back the (possibly filtered) texel untouched
	return texel;
}
template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl_in[4], float const br_in[4], float const offl_in[4], float const offr_in[4])
{ {
int idx; int idx;
int xxl, xxr; int xxl, xxr;
@ -2364,64 +2411,15 @@ void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, flo
while(xxl < xxr) { while(xxl < xxr) {
if((wl >= *wbufline)) { if((wl >= *wbufline)) {
uint32_t c;
float u = ul/wl; float u = ul/wl;
float v = vl/wl; float v = vl/wl;
/*
* TODO: Not sure if blending should be done in
* floating point or fixed point, or if it even matters.
*/
uint32_t c;
uint32_t offset_color = float_argb_to_packed_argb(offl); uint32_t offset_color = float_argb_to_packed_argb(offl);
uint32_t base_color = float_argb_to_packed_argb(bl); uint32_t base_color = float_argb_to_packed_argb(bl);
c = (this->*sample_fn)(ti, u, v, offset_color, base_color);
if (ti->textured) {
c = (this->*(ti->r))(ti, u, v);
// debug dip to turn on/off bilinear filtering, it's slooooow
if (debug_dip_status&0x1)
{
if(ti->filter_mode >= TEX_FILTER_BILINEAR)
{
uint32_t c1 = (this->*(ti->r))(ti, u+1.0f, v);
uint32_t c2 = (this->*(ti->r))(ti, u+1.0f, v+1.0f);
uint32_t c3 = (this->*(ti->r))(ti, u, v+1.0f);
c = bilinear_filter(c, c1, c2, c3, u, v);
}
}
uint32_t tmp;
switch (ti->tsinstruction) {
case 0:
// decal
c = bls24(c, offset_color) | (c & 0xff000000);
break;
case 1:
// modulate
tmp = blc(c, base_color);
tmp = bls24(tmp, offset_color);
tmp |= c & 0xff000000;
c = tmp;
break;
case 2:
// decal with alpha
tmp = bls24(bla(c, c), blia(base_color, c));
c = bls24(tmp, offset_color) | (base_color & 0xff000000);
break;
case 3:
// modulate with alpha
tmp = blc(c, base_color);
tmp = bls24(tmp, offset_color);
tmp |= (((c >> 24) * (base_color >> 24)) >> 8) << 24;
c = tmp;
break;
}
} else {
c = bls24(base_color, offset_color) | (base_color & 0xff000000);
}
if(c & 0xff000000) { if(c & 0xff000000) {
*tdata = ti->blend(c, *tdata);
*wbufline = wl; *wbufline = wl;
*tdata = ti->blend(c, *tdata);
} }
} }
wbufline++; wbufline++;
@ -2438,7 +2436,8 @@ void powervr2_device::render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, flo
} }
} }
void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti, template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti,
float y0, float y1, float y0, float y1,
float xl, float xr, float xl, float xr,
float ul, float ur, float ul, float ur,
@ -2516,7 +2515,7 @@ void powervr2_device::render_span(bitmap_rgb32 &bitmap, texinfo *ti,
} }
while(yy0 < yy1) { while(yy0 < yy1) {
render_hline(bitmap, ti, yy0, xl, xr, ul, ur, vl, vr, wl, wr, bl, br, offl, offr); render_hline<sample_fn>(bitmap, ti, yy0, xl, xr, ul, ur, vl, vr, wl, wr, bl, br, offl, offr);
xl += dxldy; xl += dxldy;
xr += dxrdy; xr += dxrdy;
@ -2568,7 +2567,8 @@ void powervr2_device::sort_vertices(const vert *v, int *i0, int *i1, int *i2)
} }
void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2) template <powervr2_device::pix_sample_fn sample_fn>
inline void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2)
{ {
float dy01, dy02, dy12; float dy01, dy02, dy12;
@ -2686,15 +2686,15 @@ void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const
return; return;
if(v1->x > v0->x) if(v1->x > v0->x)
render_span(bitmap, ti, v1->y, v2->y, v0->x, v1->x, v0->u, v1->u, v0->v, v1->v, v0->w, v1->w, v0->b, v1->b, v0->o, v1->o, dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy); render_span<sample_fn>(bitmap, ti, v1->y, v2->y, v0->x, v1->x, v0->u, v1->u, v0->v, v1->v, v0->w, v1->w, v0->b, v1->b, v0->o, v1->o, dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy);
else else
render_span(bitmap, ti, v1->y, v2->y, v1->x, v0->x, v1->u, v0->u, v1->v, v0->v, v1->w, v0->w, v1->b, v0->b, v1->o, v0->o, dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy); render_span<sample_fn>(bitmap, ti, v1->y, v2->y, v1->x, v0->x, v1->u, v0->u, v1->v, v0->v, v1->w, v0->w, v1->b, v0->b, v1->o, v0->o, dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy);
} else if(!dy12) { } else if(!dy12) {
if(v2->x > v1->x) if(v2->x > v1->x)
render_span(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy); render_span<sample_fn>(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy);
else else
render_span(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy); render_span<sample_fn>(bitmap, ti, v0->y, v1->y, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy);
} else { } else {
float idk_b[4] = { float idk_b[4] = {
@ -2710,17 +2710,17 @@ void powervr2_device::render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const
v0->o[3] + do02dy[3] * dy01 v0->o[3] + do02dy[3] * dy01
}; };
if(dx01dy < dx02dy) { if(dx01dy < dx02dy) {
render_span(bitmap, ti, v0->y, v1->y, render_span<sample_fn>(bitmap, ti, v0->y, v1->y,
v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o,
dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy); dx01dy, dx02dy, du01dy, du02dy, dv01dy, dv02dy, dw01dy, dw02dy, db01dy, db02dy, do01dy, do02dy);
render_span(bitmap, ti, v1->y, v2->y, render_span<sample_fn>(bitmap, ti, v1->y, v2->y,
v1->x, v0->x + dx02dy*dy01, v1->u, v0->u + du02dy*dy01, v1->v, v0->v + dv02dy*dy01, v1->w, v0->w + dw02dy*dy01, v1->b, idk_b, v1->o, idk_o, v1->x, v0->x + dx02dy*dy01, v1->u, v0->u + du02dy*dy01, v1->v, v0->v + dv02dy*dy01, v1->w, v0->w + dw02dy*dy01, v1->b, idk_b, v1->o, idk_o,
dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy); dx12dy, dx02dy, du12dy, du02dy, dv12dy, dv02dy, dw12dy, dw02dy, db12dy, db02dy, do12dy, do02dy);
} else { } else {
render_span(bitmap, ti, v0->y, v1->y, render_span<sample_fn>(bitmap, ti, v0->y, v1->y,
v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o, v0->x, v0->x, v0->u, v0->u, v0->v, v0->v, v0->w, v0->w, v0->b, v0->b, v0->o, v0->o,
dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy); dx02dy, dx01dy, du02dy, du01dy, dv02dy, dv01dy, dw02dy, dw01dy, db02dy, db01dy, do02dy, do01dy);
render_span(bitmap, ti, v1->y, v2->y, render_span<sample_fn>(bitmap, ti, v1->y, v2->y,
v0->x + dx02dy*dy01, v1->x, v0->u + du02dy*dy01, v1->u, v0->v + dv02dy*dy01, v1->v, v0->w + dw02dy*dy01, v1->w, idk_b, v1->b, idk_o, v1->o, v0->x + dx02dy*dy01, v1->x, v0->u + du02dy*dy01, v1->u, v0->v + dv02dy*dy01, v1->v, v0->w + dw02dy*dy01, v1->w, idk_b, v1->b, idk_o, v1->o,
dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy); dx02dy, dx12dy, du02dy, du12dy, dv02dy, dv12dy, dw02dy, dw12dy, db02dy, db12dy, do02dy, do12dy);
} }
@ -2732,7 +2732,59 @@ void powervr2_device::render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *
int i0, i1, i2; int i0, i1, i2;
sort_vertices(v, &i0, &i1, &i2); sort_vertices(v, &i0, &i1, &i2);
render_tri_sorted(bitmap, ti, v+i0, v+i1, v+i2);
/*
 * Choose the pixel sampler once for the whole triangle.  The sampler is passed
 * as a template argument so the texture / bilinear / shading-instruction
 * checks are resolved here instead of being repeated for every pixel.
 */
const bool textured = ti->textured;
if (textured) {
	// bilinear filtering is only used while debug dip bit 0 is set
	const bool bilinear = (debug_dip_status & 1) &&
		(ti->filter_mode >= TEX_FILTER_BILINEAR);
	if (bilinear) {
		switch (ti->tsinstruction) {
		case 0:
			render_tri_sorted<&powervr2_device::sample_textured<0,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 1:
			render_tri_sorted<&powervr2_device::sample_textured<1,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 2:
			render_tri_sorted<&powervr2_device::sample_textured<2,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 3:
			render_tri_sorted<&powervr2_device::sample_textured<3,true>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		default:
			/*
			 * This should be impossible because tsinstruction was previously
			 * AND'd with 3
			 */
			// the format string has two conversions, so pass the function name for %s
			logerror("%s - tsinstruction is 0x%08x\n", __func__, (unsigned)ti->tsinstruction);
			render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		}
	} else {
		switch (ti->tsinstruction) {
		case 0:
			render_tri_sorted<&powervr2_device::sample_textured<0,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 1:
			render_tri_sorted<&powervr2_device::sample_textured<1,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 2:
			render_tri_sorted<&powervr2_device::sample_textured<2,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		case 3:
			render_tri_sorted<&powervr2_device::sample_textured<3,false>>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		default:
			/*
			 * This should be impossible because tsinstruction was previously
			 * AND'd with 3
			 */
			// the format string has two conversions, so pass the function name for %s
			logerror("%s - tsinstruction is 0x%08x\n", __func__, (unsigned)ti->tsinstruction);
			render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
			break;
		}
	}
} else {
	render_tri_sorted<&powervr2_device::sample_nontextured>(bitmap, ti, v+i0, v+i1, v+i2);
}
} }
void powervr2_device::render_to_accumulation_buffer(bitmap_rgb32 &bitmap,const rectangle &cliprect) void powervr2_device::render_to_accumulation_buffer(bitmap_rgb32 &bitmap,const rectangle &cliprect)

View File

@ -305,6 +305,14 @@ public:
void pvr_scanline_timer(int vpos); void pvr_scanline_timer(int vpos);
uint32_t screen_update(screen_device &screen, bitmap_rgb32 &bitmap, const rectangle &cliprect); uint32_t screen_update(screen_device &screen, bitmap_rgb32 &bitmap, const rectangle &cliprect);
// Pointer-to-member type for the per-pixel sampling functions declared below.
// Arguments are (texinfo, u, v, offset_color, base_color); the return value is
// the sampled/shaded colour.
typedef uint32_t(powervr2_device::*pix_sample_fn)(texinfo*,float,float,uint32_t,uint32_t);
// std::function alias with the same call signature.
// NOTE(review): no use of this alias is visible in this change, and its name
// matches the template parameter name used by the render_* templates - confirm
// whether it is still needed.
typedef std::function<uint32_t(texinfo*,float,float,uint32_t,uint32_t)> sample_fn;
// Sampler for untextured polygons; it takes ti, u and v only to match pix_sample_fn.
inline uint32_t sample_nontextured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color);
// Sampler for textured polygons.  tsinst is the shading instruction (0-3) and
// bilinear selects bilinear filtering; both are template parameters so each
// combination becomes its own instantiation.
template <int tsinst, bool bilinear>
inline uint32_t sample_textured(texinfo *ti, float u, float v, uint32_t offset_color, uint32_t base_color);
protected: protected:
virtual void device_start() override; virtual void device_start() override;
virtual void device_reset() override; virtual void device_reset() override;
@ -462,23 +470,36 @@ private:
uint32_t tex_r_default(texinfo *t, float x, float y); uint32_t tex_r_default(texinfo *t, float x, float y);
void tex_get_info(texinfo *t); void tex_get_info(texinfo *t);
void render_hline(bitmap_rgb32 &bitmap, texinfo *ti, int y, float xl, float xr, float ul, float ur, float vl, float vr, float wl, float wr, float const bl[4], float const br[4], float const offl[4], float const offr[4]); template <pix_sample_fn sample_fn>
void render_span(bitmap_rgb32 &bitmap, texinfo *ti, inline void render_hline(bitmap_rgb32 &bitmap, texinfo *ti,
float y0, float y1, int y, float xl, float xr,
float xl, float xr, float ul, float ur, float vl, float vr,
float ul, float ur, float wl, float wr,
float vl, float vr, float const bl[4], float const br[4],
float wl, float wr, float const offl[4], float const offr[4]);
float const bl[4], float const br[4],
float const offl[4], float const offr[4], template <pix_sample_fn sample_fn>
float dxldy, float dxrdy, inline void render_span(bitmap_rgb32 &bitmap, texinfo *ti,
float duldy, float durdy, float y0, float y1,
float dvldy, float dvrdy, float xl, float xr,
float dwldy, float dwrdy, float ul, float ur,
float const dbldy[4], float const dbrdy[4], float vl, float vr,
float const doldy[4], float const dordy[4]); float wl, float wr,
float const bl[4], float const br[4],
float const offl[4], float const offr[4],
float dxldy, float dxrdy,
float duldy, float durdy,
float dvldy, float dvrdy,
float dwldy, float dwrdy,
float const dbldy[4], float const dbrdy[4],
float const doldy[4], float const dordy[4]);
template <pix_sample_fn sample_fn>
inline void render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti,
const vert *v0,
const vert *v1, const vert *v2);
void sort_vertices(const vert *v, int *i0, int *i1, int *i2); void sort_vertices(const vert *v, int *i0, int *i1, int *i2);
void render_tri_sorted(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v0, const vert *v1, const vert *v2);
void render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v); void render_tri(bitmap_rgb32 &bitmap, texinfo *ti, const vert *v);
void render_to_accumulation_buffer(bitmap_rgb32 &bitmap, const rectangle &cliprect); void render_to_accumulation_buffer(bitmap_rgb32 &bitmap, const rectangle &cliprect);
void pvr_accumulationbuffer_to_framebuffer(address_space &space, int x, int y); void pvr_accumulationbuffer_to_framebuffer(address_space &space, int x, int y);