From 3c5cd12782d854f766fc4d8f1dea211676a945a2 Mon Sep 17 00:00:00 2001 From: "therealmogminer@gmail.com" Date: Thu, 25 Jun 2015 19:19:25 +0200 Subject: [PATCH] nw, start merging in RSP vector ops from CEN64 --- src/emu/cpu/rsp/clamp.h | 37 +++++ src/emu/cpu/rsp/rsp.h | 1 + src/emu/cpu/rsp/rspcp2.c | 299 +++++++++++++++++++++++++++++++++++++- src/emu/cpu/rsp/rspcp2.h | 174 +++++++++++++++++++++- src/emu/cpu/rsp/rspcp2d.c | 1 - src/emu/cpu/rsp/rspcp2d.h | 1 + src/emu/cpu/rsp/rspdrc.c | 1 - src/emu/cpu/rsp/vabs.h | 15 ++ src/emu/cpu/rsp/vadd.h | 17 +++ src/emu/cpu/rsp/vaddc.h | 13 ++ src/emu/cpu/rsp/vand.h | 9 ++ src/emu/cpu/rsp/vch.h | 57 ++++++++ src/emu/cpu/rsp/vcl.h | 65 +++++++++ src/emu/cpu/rsp/vcmp.h | 49 +++++++ src/emu/cpu/rsp/vcr.h | 35 +++++ src/emu/cpu/rsp/vdivh.h | 12 ++ src/emu/cpu/rsp/vmac.h | 57 ++++++++ src/emu/cpu/rsp/vmov.h | 9 ++ src/emu/cpu/rsp/vmrg.h | 13 ++ src/emu/cpu/rsp/vmudh.h | 11 ++ src/emu/cpu/rsp/vmul.h | 39 +++++ src/emu/cpu/rsp/vmulh.h | 31 ++++ src/emu/cpu/rsp/vmull.h | 44 ++++++ src/emu/cpu/rsp/vmulm.h | 56 +++++++ src/emu/cpu/rsp/vmuln.h | 55 +++++++ src/emu/cpu/rsp/vor.h | 10 ++ src/emu/cpu/rsp/vrcpsq.h | 60 ++++++++ src/emu/cpu/rsp/vrsq.h | 65 +++++++++ src/emu/cpu/rsp/vsub.h | 17 +++ src/emu/cpu/rsp/vsubc.h | 14 ++ src/emu/cpu/rsp/vxor.h | 10 ++ 31 files changed, 1267 insertions(+), 10 deletions(-) create mode 100644 src/emu/cpu/rsp/clamp.h create mode 100644 src/emu/cpu/rsp/vabs.h create mode 100644 src/emu/cpu/rsp/vadd.h create mode 100644 src/emu/cpu/rsp/vaddc.h create mode 100644 src/emu/cpu/rsp/vand.h create mode 100644 src/emu/cpu/rsp/vch.h create mode 100644 src/emu/cpu/rsp/vcl.h create mode 100644 src/emu/cpu/rsp/vcmp.h create mode 100644 src/emu/cpu/rsp/vcr.h create mode 100644 src/emu/cpu/rsp/vdivh.h create mode 100644 src/emu/cpu/rsp/vmac.h create mode 100644 src/emu/cpu/rsp/vmov.h create mode 100644 src/emu/cpu/rsp/vmrg.h create mode 100644 src/emu/cpu/rsp/vmudh.h create mode 100644 src/emu/cpu/rsp/vmul.h create mode 100644 src/emu/cpu/rsp/vmulh.h create mode 100644 src/emu/cpu/rsp/vmull.h create mode 100644 src/emu/cpu/rsp/vmulm.h create mode 100644 src/emu/cpu/rsp/vmuln.h create mode 100644 src/emu/cpu/rsp/vor.h create mode 100644 src/emu/cpu/rsp/vrcpsq.h create mode 100644 src/emu/cpu/rsp/vrsq.h create mode 100644 src/emu/cpu/rsp/vsub.h create mode 100644 src/emu/cpu/rsp/vsubc.h create mode 100644 src/emu/cpu/rsp/vxor.h diff --git a/src/emu/cpu/rsp/clamp.h b/src/emu/cpu/rsp/clamp.h new file mode 100644 index 00000000000..614fd9c9465 --- /dev/null +++ b/src/emu/cpu/rsp/clamp.h @@ -0,0 +1,37 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t sclamp_acc_to_mid(rsp_vec_t acc_mid, rsp_vec_t acc_hi) +{ + return _mm_packs_epi32( + _mm_unpacklo_epi16(acc_mid, acc_hi), + _mm_unpackhi_epi16(acc_mid, acc_hi) + ); +} + +static inline rsp_vec_t uclamp_acc(rsp_vec_t val, rsp_vec_t acc_mid, rsp_vec_t acc_hi, rsp_vec_t zero) +{ + rsp_vec_t hi_negative = _mm_srai_epi16(acc_hi, 15); + rsp_vec_t mid_negative = _mm_srai_epi16(acc_mid, 15); + + // We don't have to clamp if the HI part of the + // accumulator is sign-extended down to the MD part. + rsp_vec_t hi_sign_check = _mm_cmpeq_epi16(hi_negative, acc_hi); + rsp_vec_t mid_sign_check = _mm_cmpeq_epi16(hi_negative, mid_negative); + rsp_vec_t clamp_mask = _mm_and_si128(mid_sign_check, hi_sign_check); + + // Generate the value in the event we need to clamp. 
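+ // (xxxx = don't care: in those rows clamp_mask is set and val passes through.)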
+ // * hi_negative, mid_sign => xxxx
+ // * hi_negative, !mid_sign => 0000
+ // * !hi_negative, mid_sign => FFFF
+ // * !hi_negative, !mid_sign => xxxx
+ rsp_vec_t clamped_val = _mm_cmpeq_epi16(hi_negative, zero);
+
+#ifndef __SSE4_1__
+ val = _mm_and_si128(clamp_mask, val);
+ clamped_val = _mm_andnot_si128(clamp_mask, clamped_val);
+ return _mm_or_si128(val, clamped_val);
+#else
+ return _mm_blendv_epi8(clamped_val, val, clamp_mask);
+#endif
+}
diff --git a/src/emu/cpu/rsp/rsp.h b/src/emu/cpu/rsp/rsp.h
index 34d53f2eb61..fce4635a359 100644
--- a/src/emu/cpu/rsp/rsp.h
+++ b/src/emu/cpu/rsp/rsp.h
@@ -14,6 +14,7 @@
 #ifndef __RSP_H__
 #define __RSP_H__
+#include "emu.h"
 #include "cpu/drcfe.h"
 #include "cpu/drcuml.h"
diff --git a/src/emu/cpu/rsp/rspcp2.c b/src/emu/cpu/rsp/rspcp2.c
index c7d15bec393..8b60e4169f7 100644
--- a/src/emu/cpu/rsp/rspcp2.c
+++ b/src/emu/cpu/rsp/rspcp2.c
@@ -1,5 +1,5 @@
 // license:BSD-3-Clause
-// copyright-holders:Ryan Holtz
+// copyright-holders:Ryan Holtz,Tyler J. Stachecki
 /***************************************************************************
 rspcp2.c
@@ -11,13 +11,176 @@
 #include "emu.h"
 #include "rsp.h"
-#include "rspdiv.h"
 #include "rspcp2.h"
-#include "cpu/drcfe.h"
-#include "cpu/drcuml.h"
-#include "cpu/drcumlsh.h"
-using namespace uml;
+#if USE_SIMD
+#include <emmintrin.h>
+
+const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
+ { 0 },
+ { // logic_mask
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }
+ },
+ { // vrsq_mask_table
+ { ~0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, ~0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, ~0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, ~0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, ~0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, ~0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, ~0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, ~0 }
+ },
+ { // shuffle_keys
+/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
+/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
+
+/* 0q */{0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c},
+/* 1q */{0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e},
+
+/* 0h */{0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
+/* 1h */{0x0302, 0x0302, 0x0302, 0x0302, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
+/* 2h */{0x0504, 0x0504, 0x0504, 0x0504, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
+/* 3h */{0x0706, 0x0706, 0x0706, 0x0706, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e},
+
+/* 0w */{0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
+/* 1w */{0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
+/* 2w */{0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
+/* 3w */{0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
+/* 4w */{0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
+/* 5w */{0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
+/* 6w */{0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
+/* 7w */{0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e}
+ },
+ { // sll_b2l_keys
+ {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
+ {0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c},
+ {0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
+ {0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
+
+ {0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
+ {0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708},
+ {0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607},
+ {0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506},
+
+ 
{0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102}, + + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080} + }, + { // sll_l2b_keys + {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d}, + {0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e}, + {0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f}, + {0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08}, + + {0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809}, + {0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a}, + {0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b}, + {0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04}, + + {0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700}, + + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203}, + {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380} + }, + { // srl_b2l_keys + {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d}, + {0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e}, + {0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f}, + {0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80}, + + {0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080}, + {0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080}, + {0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080}, + {0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080}, + + {0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080}, + + {0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080}, + {0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080} + }, + { // ror_b2l_keys + {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d}, + {0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e}, + {0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f}, + {0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00}, + + {0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001}, + {0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102}, + {0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203}, + {0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304}, + + {0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405}, + {0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506}, + {0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 
0x0607},
+ {0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708},
+
+ {0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
+ {0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
+ {0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
+ {0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c}
+ },
+ { // rol_l2b_keys
+ {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
+ {0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e},
+ {0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
+ {0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08},
+
+ {0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
+ {0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a},
+ {0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
+ {0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04},
+
+ {0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
+ {0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506},
+ {0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
+ {0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400},
+
+ {0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
+ {0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
+ {0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
+ {0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c}
+ },
+ { // ror_l2b_keys
+ {0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
+ {0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c},
+ {0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
+ {0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
+
+ {0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
+ {0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700},
+ {0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
+ {0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506},
+
+ {0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
+ {0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04},
+ {0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
+ {0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a},
+
+ {0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
+ {0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08},
+ {0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
+ {0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e}
+ }
+};
+#endif
 extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@@ -100,6 +263,7 @@ extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
 VREG_S(VDREG, 7) = m_vres[7]; \
 }
+#if !USE_SIMD
 static const int vector_elements_2[16][8] =
 {
 { 0, 1, 2, 3, 4, 5, 6, 7 }, // none
@@ -119,6 +283,7 @@ static const int vector_elements_2[16][8] =
 { 6, 6, 6, 6, 6, 6, 6, 6 }, // 6
 { 7, 7, 7, 7, 7, 7, 7, 7 }, // 7
 };
+#endif
 rsp_cop2::rsp_cop2(rsp_device &rsp, running_machine &machine)
 : m_rsp(rsp)
@@ -905,7 +1070,9 @@ UINT16 rsp_cop2::SATURATE_ACCUM(int accum, int slice, UINT16 negative, UINT16 po
 void rsp_cop2::handle_vector_ops(UINT32 op)
 {
+#if !USE_SIMD
 int i;
+#endif
 // Opcode legend:
 // E = VS2 element type
@@ -924,6 +1091,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
 //
 // Multiplies signed integer by signed integer * 2
+#if USE_SIMD
+#else
 for (i=0; i < 8; i++)
 {
 INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@@ -948,8 +1117,9 @@ void
rsp_cop2::handle_vector_ops(UINT32 op) } } WRITEBACK_RESULT(); - +#endif break; + } case 0x01: /* VMULU */ @@ -960,6 +1130,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // ------------------------------------------------------ // +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -986,6 +1158,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } } WRITEBACK_RESULT(); +#endif break; } @@ -1000,6 +1173,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Stores the higher 16 bits of the 32-bit result to accumulator // The low slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); @@ -1013,6 +1188,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = ACCUM_L(i); } WRITEBACK_RESULT(); +#endif break; } @@ -1027,6 +1203,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is stored into accumulator // The middle slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1040,6 +1218,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = ACCUM_M(i); } WRITEBACK_RESULT(); +#endif break; } @@ -1055,6 +1234,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is stored into accumulator // The low slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended @@ -1068,6 +1249,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = ACCUM_L(i); } WRITEBACK_RESULT(); +#endif break; } @@ -1082,6 +1264,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is stored into highest 32 bits of accumulator, the low slice is zero // The highest 32 bits of accumulator is saturated into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1097,6 +1281,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = (INT16)(r); } WRITEBACK_RESULT(); +#endif break; } @@ -1110,6 +1295,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Multiplies signed integer by signed integer * 2 // The result is added to accumulator +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1131,6 +1318,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); +#endif break; } case 0x09: /* VMACU */ @@ -1141,6 +1329,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // ------------------------------------------------------ // +#if USE_SIMD +#else for (i = 0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1177,6 +1367,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } } WRITEBACK_RESULT(); +#endif break; } @@ -1191,6 +1382,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Adds the higher 16 bits of the 32-bit result to accumulator // The low slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i = 0; i < 8; i++) { UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i); @@ -1206,6 +1399,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = SATURATE_ACCUM(i, 0, 0x0000, 0xffff); } WRITEBACK_RESULT(); +#endif break; } @@ -1220,6 +1414,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is added into accumulator // The middle slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ 
-1237,6 +1433,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff); } WRITEBACK_RESULT(); +#endif break; } @@ -1251,6 +1448,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is added into accumulator // The low slice of accumulator is stored into destination element +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended @@ -1271,6 +1470,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } WRITEBACK_RESULT(); +#endif break; } @@ -1285,6 +1485,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // The result is added into highest 32 bits of accumulator, the low slice is zero // The highest 32 bits of accumulator is saturated into destination element +#if USE_SIMD +#else for (i = 0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1301,6 +1503,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } WRITEBACK_RESULT(); +#endif break; } @@ -1315,6 +1518,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // TODO: check VS2REG == VDREG +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1330,6 +1535,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1344,6 +1550,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // TODO: check VS2REG == VDREG +#if USE_SIMD +#else for (i = 0; i < 8; i++) { INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i); @@ -1360,6 +1568,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1373,6 +1582,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Changes the sign of source register 2 if source register 1 is negative and stores // the result to destination register +#if USE_SIMD +#else for (i=0; i < 8; i++) { INT16 s1 = (INT16)VREG_S(VS1REG, i); @@ -1401,6 +1612,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } @@ -1415,6 +1627,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // TODO: check VS2REG = VDREG +#if USE_SIMD +#else CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); @@ -1433,6 +1647,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } } WRITEBACK_RESULT(); +#endif break; } @@ -1447,6 +1662,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // TODO: check VS2REG = VDREG +#if USE_SIMD +#else CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); @@ -1469,6 +1686,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } } WRITEBACK_RESULT(); +#endif break; } @@ -1481,6 +1699,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Stores high, middle or low slice of accumulator to destination vector +#if USE_SIMD +#else switch (EL) { case 0x08: // VSAWH @@ -1511,6 +1731,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) printf("RSP: VSAW: el = %d\n", EL);//??? ??? 
exit(0); } +#endif break; } @@ -1524,6 +1745,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Sets compare flags if elements in VS1 are less than VS2 // Moves the element in VS2 to destination vector +#if USE_SIMD +#else CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -1559,6 +1782,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_CARRY_FLAGS(); CLEAR_ZERO_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1572,6 +1796,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Sets compare flags if elements in VS1 are equal with VS2 // Moves the element in VS2 to destination vector +#if USE_SIMD +#else CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -1595,6 +1821,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_ZERO_FLAGS(); CLEAR_CARRY_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1608,6 +1835,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Sets compare flags if elements in VS1 are not equal with VS2 // Moves the element in VS2 to destination vector +#if USE_SIMD +#else CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -1632,6 +1861,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_CARRY_FLAGS(); CLEAR_ZERO_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1645,6 +1875,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // Sets compare flags if elements in VS1 are greater or equal with VS2 // Moves the element in VS2 to destination vector +#if USE_SIMD +#else CLEAR_COMPARE_FLAGS(); CLEAR_CLIP2_FLAGS(); @@ -1669,6 +1901,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_CARRY_FLAGS(); CLEAR_ZERO_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1681,6 +1914,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Vector clip low +#if USE_SIMD +#else for (i = 0; i < 8; i++) { INT16 s1 = VREG_S(VS1REG, i); @@ -1763,6 +1998,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) CLEAR_ZERO_FLAGS(); CLEAR_CLIP1_FLAGS(); WRITEBACK_RESULT(); +#endif break; } @@ -1775,6 +2011,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Vector clip high +#if USE_SIMD +#else CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); @@ -1847,6 +2085,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } @@ -1859,6 +2098,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Vector clip reverse +#if USE_SIMD +#else CLEAR_CARRY_FLAGS(); CLEAR_COMPARE_FLAGS(); CLEAR_CLIP1_FLAGS(); @@ -1906,6 +2147,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) m_vres[i] = ACCUM_L(i); } WRITEBACK_RESULT(); +#endif break; } @@ -1918,6 +2160,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Merges two vectors according to compare flags +#if USE_SIMD +#else for (i = 0; i < 8; i++) { if (COMPARE_FLAG(i) != 0) @@ -1932,6 +2176,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x28: /* VAND */ @@ -1943,12 +2188,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Bitwise AND of two vector registers +#if USE_SIMD +#else for (i = 0; i < 8; i++) { m_vres[i] = VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i)); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x29: /* VNAND */ @@ -1960,12 +2208,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Bitwise NOT AND of two vector registers +#if USE_SIMD +#else for (i = 0; i < 8; i++) { m_vres[i] = ~((VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i)))); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x2a: /* VOR */ @@ -1977,12 +2228,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // 
Bitwise OR of two vector registers +#if USE_SIMD +#else for (i = 0; i < 8; i++) { m_vres[i] = VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i)); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x2b: /* VNOR */ @@ -1994,12 +2248,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Bitwise NOT OR of two vector registers +#if USE_SIMD +#else for (i=0; i < 8; i++) { m_vres[i] = ~((VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i)))); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x2c: /* VXOR */ @@ -2011,12 +2268,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Bitwise XOR of two vector registers +#if USE_SIMD +#else for (i=0; i < 8; i++) { m_vres[i] = VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i)); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } case 0x2d: /* VNXOR */ @@ -2028,12 +2288,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Bitwise NOT XOR of two vector registers +#if USE_SIMD +#else for (i=0; i < 8; i++) { m_vres[i] = ~((VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i)))); SET_ACCUM_L(m_vres[i], i); } WRITEBACK_RESULT(); +#endif break; } @@ -2045,6 +2308,9 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // ------------------------------------------------------ // // Calculates reciprocal + +#if USE_SIMD +#else INT32 shifter = 0; INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); @@ -2093,6 +2359,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } +#endif break; } @@ -2105,6 +2372,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Calculates reciprocal low part +#if USE_SIMD +#else INT32 shifter = 0; INT32 rec = (INT16)VREG_S(VS2REG, EL & 7); @@ -2169,6 +2438,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } +#endif break; } @@ -2181,6 +2451,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Calculates reciprocal high part +#if USE_SIMD +#else m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; m_dp_allowed = 1; @@ -2191,6 +2463,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16); +#endif break; } @@ -2203,11 +2476,14 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Moves element from vector to destination vector +#if USE_SIMD +#else VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7); for (i = 0; i < 8; i++) { SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } +#endif break; } @@ -2220,6 +2496,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Calculates reciprocal square-root +#if USE_SIMD +#else INT32 shifter = 0; INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7)); @@ -2269,6 +2547,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } +#endif break; } @@ -2281,6 +2560,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Calculates reciprocal square-root low part +#if USE_SIMD +#else INT32 shifter = 0; INT32 rec = (INT16)VREG_S(VS2REG, EL & 7); INT32 datainput = rec; @@ -2348,6 +2629,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i); } +#endif break; } @@ -2360,6 +2642,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op) // // Calculates reciprocal square-root high part +#if USE_SIMD +#else m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16; m_dp_allowed = 1; @@ -2369,6 +2653,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op) } VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16); // store high part +#endif break; } diff --git a/src/emu/cpu/rsp/rspcp2.h b/src/emu/cpu/rsp/rspcp2.h index 
babfab2d873..73b8f94da54 100644
--- a/src/emu/cpu/rsp/rspcp2.h
+++ b/src/emu/cpu/rsp/rspcp2.h
@@ -1,5 +1,5 @@
 // license:BSD-3-Clause
-// copyright-holders:Ryan Holtz
+// copyright-holders:Ryan Holtz,Tyler J. Stachecki
 /***************************************************************************
 rspcp2.h
@@ -15,6 +15,37 @@
 #include "cpu/drcuml.h"
 #include "rsp.h"
+#include "rspdiv.h"
+
+#define SIMD_OFF (1)
+
+#if (defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__))
+#define SSE_AVAILABLE (1)
+#else
+#define SSE_AVAILABLE (0)
+#endif
+
+#if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (SSE_AVAILABLE || defined(_MSC_VER)) && defined(PTR64) && !SIMD_OFF
+#define USE_SIMD (1)
+#else
+#define USE_SIMD (0)
+#endif
+
+#if USE_SIMD
+#ifdef __SSE4_2__
+#include <nmmintrin.h>
+#elif defined(__SSE4_1__)
+#include <smmintrin.h>
+#elif defined(__SSSE3__)
+#include <tmmintrin.h>
+#elif defined(__SSE3__)
+#include <pmmintrin.h>
+#else
+#include <emmintrin.h>
+#endif
+
+typedef __m128i rsp_vec_t;
+#endif
 union VECTOR_REG
 {
@@ -22,6 +53,9 @@ union VECTOR_REG
 UINT32 l[4];
 INT16 s[8];
 UINT8 b[16];
+#if USE_SIMD
+ rsp_vec_t v;
+#endif
 };
 union ACCUMULATOR_REG
@@ -152,10 +186,148 @@ protected:
 UINT32 m_reciprocal_high;
 INT32 m_dp_allowed;
+#if USE_SIMD
+ typedef struct
+ {
+ rsp_vec_t dummy_for_alignment;
+ const UINT16 logic_mask[2][8];
+ const UINT16 vrsq_mask_table[8][8];
+ const UINT16 shuffle_keys[16][8];
+ const UINT16 sll_b2l_keys[16][8];
+ const UINT16 sll_l2b_keys[16][8];
+ const UINT16 srl_b2l_keys[16][8];
+ const UINT16 ror_b2l_keys[16][8];
+ const UINT16 rol_l2b_keys[16][8];
+ const UINT16 ror_l2b_keys[16][8];
+ } vec_helpers_t;
+
+ static const vec_helpers_t m_vec_helpers;
+
+ rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element);
+ static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src)
+ {
+ return _mm_load_si128((rsp_vec_t*) src);
+ }
+ static inline void vec_write_operand(UINT16* dest, rsp_vec_t src)
+ {
+ _mm_store_si128((rsp_vec_t*) dest, src);
+ }
+ static inline rsp_vec_t read_acc_lo(const UINT16 *acc)
+ {
+ return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t) * 2);
+ }
+ static inline rsp_vec_t read_acc_mid(const UINT16 *acc)
+ {
+ return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t));
+ }
+ static inline rsp_vec_t read_acc_hi(const UINT16 *acc)
+ {
+ return vec_load_unshuffled_operand(acc);
+ }
+ static inline rsp_vec_t read_vcc_lo(const UINT16 *vcc)
+ {
+ return vec_load_unshuffled_operand(vcc + sizeof(rsp_vec_t));
+ }
+ static inline rsp_vec_t read_vcc_hi(const UINT16 *vcc)
+ {
+ return vec_load_unshuffled_operand(vcc);
+ }
+ static inline rsp_vec_t read_vco_lo(const UINT16 *vco)
+ {
+ return vec_load_unshuffled_operand(vco + sizeof(rsp_vec_t));
+ }
+ static inline rsp_vec_t read_vco_hi(const UINT16 *vco)
+ {
+ return vec_load_unshuffled_operand(vco);
+ }
+ static inline rsp_vec_t read_vce(const UINT16 *vce)
+ {
+ return vec_load_unshuffled_operand(vce + sizeof(rsp_vec_t));
+ }
+ static inline void write_acc_lo(UINT16 *acc, rsp_vec_t acc_lo)
+ {
+ return vec_write_operand(acc + sizeof(rsp_vec_t) * 2, acc_lo);
+ }
+ static inline void write_acc_mid(UINT16 *acc, rsp_vec_t acc_mid)
+ {
+ return vec_write_operand(acc + sizeof(rsp_vec_t), acc_mid);
+ }
+ static inline void write_acc_hi(UINT16 *acc, rsp_vec_t acc_hi)
+ {
+ return vec_write_operand(acc, acc_hi);
+ }
+ static inline void write_vcc_lo(UINT16 *vcc, rsp_vec_t vcc_lo)
+ {
+ return vec_write_operand(vcc + sizeof(rsp_vec_t), vcc_lo);
+ }
+ static inline void write_vcc_hi(UINT16 *vcc, rsp_vec_t
vcc_hi) + { + return vec_write_operand(vcc, vcc_hi); + } + static inline void write_vco_lo(UINT16 *vcc, rsp_vec_t vco_lo) + { + return vec_write_operand(vcc + sizeof(rsp_vec_t), vco_lo); + } + static inline void write_vco_hi(UINT16 *vcc, rsp_vec_t vco_hi) + { + return vec_write_operand(vcc, vco_hi); + } + static inline void write_vce(UINT16 *vce, rsp_vec_t vce_r) + { + return vec_write_operand(vce, vce_r); + } + + static inline INT16 get_flags(const UINT16 *flags) + { + return (INT16)_mm_movemask_epi8( + _mm_packs_epi16( + _mm_load_si128((rsp_vec_t*) (flags + sizeof(rsp_vec_t))), + _mm_load_si128((rsp_vec_t*) flags) + ) + ); + } + + static inline rsp_vec_t vec_zero() + { + return _mm_setzero_si128(); + } + + void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm); + +#include "clamp.h" +#include "vabs.h" +#include "vadd.h" +#include "vaddc.h" +#include "vand.h" +#include "vch.h" +#include "vcmp.h" +#include "vcl.h" +#include "vcr.h" +#include "vmac.h" +#include "vmrg.h" +#include "vmul.h" +#include "vmulh.h" +#include "vmull.h" +#include "vmulm.h" +#include "vmuln.h" +#include "vor.h" +#include "vsub.h" +#include "vsubc.h" +#include "vxor.h" +#endif + private: void handle_lwc2(UINT32 op); void handle_swc2(UINT32 op); void handle_vector_ops(UINT32 op); + + UINT32 m_div_in; + UINT32 m_div_out; }; #endif /* __RSPCP2_H__ */ diff --git a/src/emu/cpu/rsp/rspcp2d.c b/src/emu/cpu/rsp/rspcp2d.c index 779fea5819c..d7844be717c 100644 --- a/src/emu/cpu/rsp/rspcp2d.c +++ b/src/emu/cpu/rsp/rspcp2d.c @@ -11,7 +11,6 @@ #include "emu.h" #include "rsp.h" -#include "rspdiv.h" #include "rspcp2.h" #include "rspcp2d.h" #include "cpu/drcfe.h" diff --git a/src/emu/cpu/rsp/rspcp2d.h b/src/emu/cpu/rsp/rspcp2d.h index d56e04e7532..831773d5c66 100644 --- a/src/emu/cpu/rsp/rspcp2d.h +++ b/src/emu/cpu/rsp/rspcp2d.h @@ -31,6 +31,7 @@ class rsp_cop2_drc : public rsp_cop2 virtual void state_string_export(const int index, std::string &str); void cfunc_unimplemented_opcode(); + public: virtual void lbv(); virtual void lsv(); diff --git a/src/emu/cpu/rsp/rspdrc.c b/src/emu/cpu/rsp/rspdrc.c index a1fabc19638..c33b0d95c36 100644 --- a/src/emu/cpu/rsp/rspdrc.c +++ b/src/emu/cpu/rsp/rspdrc.c @@ -21,7 +21,6 @@ #include "emu.h" #include "debugger.h" #include "rsp.h" -#include "rspdiv.h" #include "rspfe.h" #include "rspcp2.h" #include "cpu/drcfe.h" diff --git a/src/emu/cpu/rsp/vabs.h b/src/emu/cpu/rsp/vabs.h new file mode 100644 index 00000000000..b6af4e5ef5d --- /dev/null +++ b/src/emu/cpu/rsp/vabs.h @@ -0,0 +1,15 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vabs(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo) +{ + rsp_vec_t vs_zero = _mm_cmpeq_epi16(vs, zero); + rsp_vec_t sign_lt = _mm_srai_epi16(vs, 15); + rsp_vec_t vd = _mm_andnot_si128(vs_zero, vt); + + // Careful: if VT = 0x8000 and VS is negative, + // acc_lo will be 0x8000 but vd will be 0x7FFF. 
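+ // e.g. VS = 0xFFFF (-1), VT = 0x8000: vd = 0x8000 ^ 0xFFFF = 0x7FFF, so the
+ // unsaturated subtract yields acc_lo = 0x8000 while the saturated one stays 0x7FFF.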
+ vd = _mm_xor_si128(vd, sign_lt);
+ *acc_lo = _mm_sub_epi16(vd, sign_lt);
+ return _mm_subs_epi16(vd, sign_lt);
+}
diff --git a/src/emu/cpu/rsp/vadd.h b/src/emu/cpu/rsp/vadd.h
new file mode 100644
index 00000000000..70735059513
--- /dev/null
+++ b/src/emu/cpu/rsp/vadd.h
@@ -0,0 +1,17 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vadd(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
+{
+ // acc_lo uses unsaturated arithmetic.
+ rsp_vec_t vd = _mm_add_epi16(vs, vt);
+ *acc_lo = _mm_sub_epi16(vd, carry);
+
+ // VD is the signed sum of the two sources and the carry. Since we
+ // have to saturate the sum of all three, we have to be clever.
+ rsp_vec_t minimum = _mm_min_epi16(vs, vt);
+ rsp_vec_t maximum = _mm_max_epi16(vs, vt);
+ minimum = _mm_subs_epi16(minimum, carry);
+ return _mm_adds_epi16(minimum, maximum);
+}
+
diff --git a/src/emu/cpu/rsp/vaddc.h b/src/emu/cpu/rsp/vaddc.h
new file mode 100644
index 00000000000..586058b2114
--- /dev/null
+++ b/src/emu/cpu/rsp/vaddc.h
@@ -0,0 +1,13 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vaddc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *sn)
+{
+ rsp_vec_t sat_sum = _mm_adds_epu16(vs, vt);
+ rsp_vec_t unsat_sum = _mm_add_epi16(vs, vt);
+
+ *sn = _mm_cmpeq_epi16(sat_sum, unsat_sum);
+ *sn = _mm_cmpeq_epi16(*sn, zero);
+
+ return unsat_sum;
+}
diff --git a/src/emu/cpu/rsp/vand.h b/src/emu/cpu/rsp/vand.h
new file mode 100644
index 00000000000..ecf2259d352
--- /dev/null
+++ b/src/emu/cpu/rsp/vand.h
@@ -0,0 +1,9 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vand_vnand(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt) {
+ rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
+
+ rsp_vec_t vd = _mm_and_si128(vs, vt);
+ return _mm_xor_si128(vd, vmask);
+}
diff --git a/src/emu/cpu/rsp/vch.h b/src/emu/cpu/rsp/vch.h
new file mode 100644
index 00000000000..3cd75731c29
--- /dev/null
+++ b/src/emu/cpu/rsp/vch.h
@@ -0,0 +1,57 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vch(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t *eq, rsp_vec_t *sign, rsp_vec_t *vce) {
+ // sign = (vs ^ vt) < 0
+ *sign = _mm_xor_si128(vs, vt);
+ *sign = _mm_cmplt_epi16(*sign, zero);
+
+ // sign_negvt = sign ?
-vt : vt + rsp_vec_t sign_negvt = _mm_xor_si128(vt, *sign); + sign_negvt = _mm_sub_epi16(sign_negvt, *sign); + + // Compute diff, diff_zero: + rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt); + rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero); + + // Compute le/ge: + rsp_vec_t vt_neg = _mm_cmplt_epi16(vt, zero); + rsp_vec_t diff_lez = _mm_cmpgt_epi16(diff, zero); + rsp_vec_t diff_gez = _mm_or_si128(diff_lez, diff_zero); + diff_lez = _mm_cmpeq_epi16(zero, diff_lez); + +#ifdef __SSE4_1__ + *ge = _mm_blendv_epi8(diff_gez, vt_neg, *sign); + *le = _mm_blendv_epi8(vt_neg, diff_lez, *sign); +#else + *ge = _mm_and_si128(*sign, vt_neg); + diff_gez = _mm_andnot_si128(*sign, diff_gez); + *ge = _mm_or_si128(*ge, diff_gez); + + *le = _mm_and_si128(*sign, diff_lez); + diff_lez = _mm_andnot_si128(*sign, vt_neg); + *le = _mm_or_si128(*le, diff_lez); +#endif + + // Compute vce: + *vce = _mm_cmpeq_epi16(diff, *sign); + *vce = _mm_and_si128(*vce, *sign); + + // Compute !eq: + *eq = _mm_or_si128(diff_zero, *vce); + *eq = _mm_cmpeq_epi16(*eq, zero); + + // Compute result: +#ifdef __SSE4_1__ + rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, *sign); + return _mm_blendv_epi8(vs, sign_negvt, diff_sel_mask); +#else + diff_lez = _mm_and_si128(*sign, *le); + diff_gez = _mm_andnot_si128(*sign, *ge); + rsp_vec_t diff_sel_mask = _mm_or_si128(diff_lez, diff_gez); + + diff_lez = _mm_and_si128(diff_sel_mask, sign_negvt); + diff_gez = _mm_andnot_si128(diff_sel_mask, vs); + return _mm_or_si128(diff_lez, diff_gez); +#endif +} diff --git a/src/emu/cpu/rsp/vcl.h b/src/emu/cpu/rsp/vcl.h new file mode 100644 index 00000000000..7eed532fad2 --- /dev/null +++ b/src/emu/cpu/rsp/vcl.h @@ -0,0 +1,65 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vcl(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign, rsp_vec_t vce) +{ + // sign_negvt = sign ? -vt : vt + rsp_vec_t sign_negvt = _mm_xor_si128(vt, sign); + sign_negvt = _mm_sub_epi16(sign_negvt, sign); + + // Compute diff, diff_zero, ncarry, and nvce: + // Note: diff = sign ? (vs + vt) : (vs - vt). + rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt); + rsp_vec_t ncarry = _mm_adds_epu16(vs, vt); + ncarry = _mm_cmpeq_epi16(diff, ncarry); + rsp_vec_t nvce = _mm_cmpeq_epi16(vce, zero); + rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero); + + // Compute results for if (sign && ne): + rsp_vec_t le_case1 = _mm_and_si128(diff_zero, ncarry); + le_case1 = _mm_and_si128(nvce, le_case1); + rsp_vec_t le_case2 = _mm_or_si128(diff_zero, ncarry); + le_case2 = _mm_and_si128(vce, le_case2); + rsp_vec_t le_eq = _mm_or_si128(le_case1, le_case2); + + // Compute results for if (!sign && ne): + rsp_vec_t ge_eq = _mm_subs_epu16(vt, vs); + ge_eq = _mm_cmpeq_epi16(ge_eq, zero); + + // Blend everything together. Caveat: we don't update + // the results of ge/le if ne is false, so be careful. + rsp_vec_t do_le = _mm_andnot_si128(eq, sign); +#ifdef __SSE4_1__ + *le = _mm_blendv_epi8(*le, le_eq, do_le); +#else + le_eq = _mm_and_si128(do_le, le_eq); + *le = _mm_andnot_si128(do_le, *le); + *le = _mm_or_si128(le_eq, *le); +#endif + + rsp_vec_t do_ge = _mm_or_si128(sign, eq); +#ifdef __SSE4_1__ + *ge = _mm_blendv_epi8(ge_eq, *ge, do_ge); +#else + *ge = _mm_and_si128(do_ge, *ge); + ge_eq = _mm_andnot_si128(do_ge, ge_eq); + *ge = _mm_or_si128(ge_eq, *ge); +#endif + + // Mux the result based on the value of sign. 
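+ // (mux_mask = sign ? *le : *ge, the same selection pattern as in vec_vch.)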
+#ifdef __SSE4_1__ + rsp_vec_t mux_mask = _mm_blendv_epi8(*ge, *le, sign); +#else + do_le = _mm_and_si128(sign, *le); + do_ge = _mm_andnot_si128(sign, *ge); + rsp_vec_t mux_mask = _mm_or_si128(do_le, do_ge); +#endif + +#ifdef __SSE4_1__ + return _mm_blendv_epi8(vs, sign_negvt, mux_mask); +#else + sign_negvt = _mm_and_si128(mux_mask, sign_negvt); + vs = _mm_andnot_si128(mux_mask, vs); + return _mm_or_si128(sign_negvt, vs); +#endif +} diff --git a/src/emu/cpu/rsp/vcmp.h b/src/emu/cpu/rsp/vcmp.h new file mode 100644 index 00000000000..b883f44ff6c --- /dev/null +++ b/src/emu/cpu/rsp/vcmp.h @@ -0,0 +1,49 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_veq_vge_vlt_vne(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign) +{ + rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt); + + if (iw & 0x2) // VNE & VGE + { + if (iw & 0x1) // VGE + { + rsp_vec_t gt = _mm_cmpgt_epi16(vs, vt); + rsp_vec_t equalsign = _mm_and_si128(eq, sign); + + equal = _mm_andnot_si128(equalsign, equal); + *le = _mm_or_si128(gt, equal); + } + else // VNE + { + rsp_vec_t nequal = _mm_cmpeq_epi16(equal, zero); + + *le = _mm_and_si128(eq, equal); + *le = _mm_or_si128(*le, nequal); + } + } + else // VEQ & VLT + { + if (iw & 0x1) // VEQ + { + *le = _mm_andnot_si128(eq, equal); + } + else // VLT + { + rsp_vec_t lt = _mm_cmplt_epi16(vs, vt); + + equal = _mm_and_si128(eq, equal); + equal = _mm_and_si128(sign, equal); + *le = _mm_or_si128(lt, equal); + } + } + +#ifdef __SSE4_1__ + return _mm_blendv_epi8(vt, vs, *le); +#else + vs = _mm_and_si128(*le, vs); + vt = _mm_andnot_si128(*le, vt); + return _mm_or_si128(vs, vt); +#endif +} diff --git a/src/emu/cpu/rsp/vcr.h b/src/emu/cpu/rsp/vcr.h new file mode 100644 index 00000000000..adea422041e --- /dev/null +++ b/src/emu/cpu/rsp/vcr.h @@ -0,0 +1,35 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vcr(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le) { + // sign = (vs ^ vt) < 0 + rsp_vec_t sign = _mm_xor_si128(vs, vt); + sign = _mm_srai_epi16(sign, 15); + + // Compute le + rsp_vec_t diff_lez = _mm_and_si128(vs, sign); + diff_lez = _mm_add_epi16(diff_lez, vt); + *le = _mm_srai_epi16(diff_lez, 15); + + // Compute ge + rsp_vec_t diff_gez = _mm_or_si128(vs, sign); + diff_gez = _mm_min_epi16(diff_gez, vt); + *ge = _mm_cmpeq_epi16(diff_gez, vt); + + // sign_notvt = sn ? ~vt : vt + rsp_vec_t sign_notvt = _mm_xor_si128(vt, sign); + + // Compute result: +#ifdef __SSE4_1__ + rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, sign); + return _mm_blendv_epi8(vs, sign_notvt, diff_sel_mask); +#else + rsp_vec_t diff_sel_mask = _mm_sub_epi16(*le, *ge); + diff_sel_mask = _mm_and_si128(diff_sel_mask, sign); + diff_sel_mask = _mm_add_epi16(diff_sel_mask, *ge); + + zero = _mm_sub_epi16(sign_notvt, vs); + zero = _mm_and_si128(zero, diff_sel_mask); + return _mm_add_epi16(zero, vs); +#endif +} diff --git a/src/emu/cpu/rsp/vdivh.h b/src/emu/cpu/rsp/vdivh.h new file mode 100644 index 00000000000..91a4d5a99e4 --- /dev/null +++ b/src/emu/cpu/rsp/vdivh.h @@ -0,0 +1,12 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +rsp_vec_t vec_vdivh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de) +{ + // Get the element from VT. + m_div_in = m_v[src].s[e & 0x7]; + + // Write out the upper part of the result. 
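+ // (m_div_out holds the high half of the last VRCP/VRSQ result; the
+ // m_div_in value latched above feeds the next double-precision VRCPL/VRSQL.)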
+ m_v[dest].s[de & 0x7] = m_div_out;
+ return vec_load_unshuffled_operand(m_v[dest].s);
+}
diff --git a/src/emu/cpu/rsp/vmac.h b/src/emu/cpu/rsp/vmac.h
new file mode 100644
index 00000000000..bf2baabc1a0
--- /dev/null
+++ b/src/emu/cpu/rsp/vmac.h
@@ -0,0 +1,57 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vmacf_vmacu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_mid, rsp_vec_t *acc_hi)
+{
+ // Get the product and shift it over
+ // being sure to save the carries.
+ rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
+ rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
+
+ rsp_vec_t mid = _mm_slli_epi16(hi, 1);
+ rsp_vec_t carry = _mm_srli_epi16(lo, 15);
+ hi = _mm_srai_epi16(hi, 15);
+ mid = _mm_or_si128(mid, carry);
+ lo = _mm_slli_epi16(lo, 1);
+
+ // Tricky part: start accumulating everything.
+ // Get/keep the carry as we'll add it in later.
+ rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
+ *acc_lo = _mm_add_epi16(*acc_lo, lo);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // Add in the carry. If the middle portion is
+ // already 0xFFFF and we have a carry, we have
+ // to carry it all the way up to hi.
+ mid = _mm_sub_epi16(mid, overflow_mask);
+ carry = _mm_cmpeq_epi16(mid, zero);
+ carry = _mm_and_si128(carry, overflow_mask);
+ hi = _mm_sub_epi16(hi, carry);
+
+ // Accumulate the middle portion.
+ overflow_mask = _mm_adds_epu16(*acc_mid, mid);
+ *acc_mid = _mm_add_epi16(*acc_mid, mid);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_mid, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // Finish up the accumulation of the... accumulator.
+ *acc_hi = _mm_add_epi16(*acc_hi, hi);
+ *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
+
+ if (iw & 0x1) // VMACU
+ {
+ rsp_vec_t overflow_hi_mask = _mm_srai_epi16(*acc_hi, 15);
+ rsp_vec_t overflow_mid_mask = _mm_srai_epi16(*acc_mid, 15);
+ mid = _mm_or_si128(overflow_mid_mask, *acc_mid);
+ overflow_mask = _mm_cmpgt_epi16(*acc_hi, zero);
+ mid = _mm_andnot_si128(overflow_hi_mask, mid);
+ return _mm_or_si128(overflow_mask, mid);
+ }
+ else // VMACF
+ {
+ return sclamp_acc_to_mid(*acc_mid, *acc_hi);
+ }
+}
diff --git a/src/emu/cpu/rsp/vmov.h b/src/emu/cpu/rsp/vmov.h
new file mode 100644
index 00000000000..4dd4196ca4f
--- /dev/null
+++ b/src/emu/cpu/rsp/vmov.h
@@ -0,0 +1,9 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+rsp_vec_t vec_vmov(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
+{
+ // Get the element from VT and write out the upper part of the result.
+ m_v[dest].s[de & 0x7] = m_v[src].s[e & 0x7];
+ return vec_load_unshuffled_operand(m_v[dest].s);
+}
diff --git a/src/emu/cpu/rsp/vmrg.h b/src/emu/cpu/rsp/vmrg.h
new file mode 100644
index 00000000000..3415577e035
--- /dev/null
+++ b/src/emu/cpu/rsp/vmrg.h
@@ -0,0 +1,13 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vmrg(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t le)
+{
+#ifdef __SSE4_1__
+ return _mm_blendv_epi8(vt, vs, le);
+#else
+ vs = _mm_and_si128(le, vs);
+ vt = _mm_andnot_si128(le, vt);
+ return _mm_or_si128(vs, vt);
+#endif
+}
diff --git a/src/emu/cpu/rsp/vmudh.h b/src/emu/cpu/rsp/vmudh.h
new file mode 100644
index 00000000000..f1d62dcafd8
--- /dev/null
+++ b/src/emu/cpu/rsp/vmudh.h
@@ -0,0 +1,11 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J.
Stachecki,Ryan Holtz + +static inline rsp_vec_t rsp_vmudh(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t *acc_md, rsp_vec_t *acc_hi) +{ + *acc_md = _mm_mullo_epi16(vs, vt); + *acc_hi = _mm_mulhi_epi16(vs, vt); + + return sclamp_acc_to_mid(*acc_md, *acc_hi); +} + diff --git a/src/emu/cpu/rsp/vmul.h b/src/emu/cpu/rsp/vmul.h new file mode 100644 index 00000000000..5521dac5b71 --- /dev/null +++ b/src/emu/cpu/rsp/vmul.h @@ -0,0 +1,39 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +// +// TODO: CHECK ME. +// + +static inline rsp_vec_t vec_vmulf_vmulu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi) +{ + rsp_vec_t lo = _mm_mullo_epi16(vs, vt); + rsp_vec_t round = _mm_cmpeq_epi16(zero, zero); + rsp_vec_t sign1 = _mm_srli_epi16(lo, 15); + lo = _mm_add_epi16(lo, lo); + round = _mm_slli_epi16(round, 15); + rsp_vec_t hi = _mm_mulhi_epi16(vs, vt); + rsp_vec_t sign2 = _mm_srli_epi16(lo, 15); + *acc_lo = _mm_add_epi16(round, lo); + sign1 = _mm_add_epi16(sign1, sign2); + + hi = _mm_slli_epi16(hi, 1); + rsp_vec_t eq = _mm_cmpeq_epi16(vs, vt); + rsp_vec_t neq = eq; + *acc_md = _mm_add_epi16(hi, sign1); + + rsp_vec_t neg = _mm_srai_epi16(*acc_md, 15); + + if (iw & 0x1) // VMULU + { + *acc_hi = _mm_andnot_si128(eq, neg); + hi =_mm_or_si128(*acc_md, neg); + return _mm_andnot_si128(*acc_hi, hi); + } + else // VMULF + { + eq = _mm_and_si128(eq, neg); + *acc_hi = _mm_andnot_si128(neq, neg); + return _mm_add_epi16(*acc_md, eq); + } +} diff --git a/src/emu/cpu/rsp/vmulh.h b/src/emu/cpu/rsp/vmulh.h new file mode 100644 index 00000000000..ee8babc481a --- /dev/null +++ b/src/emu/cpu/rsp/vmulh.h @@ -0,0 +1,31 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vmadh_vmudh(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi) +{ + rsp_vec_t lo = _mm_mullo_epi16(vs, vt); + rsp_vec_t hi = _mm_mulhi_epi16(vs, vt); + + if (iw & 0x8) // VMADH + { + // Tricky part: start accumulating everything. + // Get/keep the carry as we'll add it in later. + rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_md, lo); + *acc_md = _mm_add_epi16(*acc_md, lo); + + overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask); + overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); + + hi = _mm_sub_epi16(hi, overflow_mask); + *acc_hi = _mm_add_epi16(*acc_hi, hi); + } + else // VMUDH + { + *acc_lo = zero; + *acc_md = lo; + *acc_hi = hi; + } + + return sclamp_acc_to_mid(*acc_md, *acc_hi); +} + diff --git a/src/emu/cpu/rsp/vmull.h b/src/emu/cpu/rsp/vmull.h new file mode 100644 index 00000000000..152a9678796 --- /dev/null +++ b/src/emu/cpu/rsp/vmull.h @@ -0,0 +1,44 @@ +// license:BSD-3-Clause +// copyright-holders:Tyler J. Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vmadl_vmudl(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi) +{ + rsp_vec_t hi = _mm_mulhi_epu16(vs, vt); + + if (iw & 0x8) // VMADL + { + // Tricky part: start accumulating everything. + // Get/keep the carry as we'll add it in later. + rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, hi); + *acc_lo = _mm_add_epi16(*acc_lo, hi); + + overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask); + overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); + hi = _mm_sub_epi16(zero, overflow_mask); + + // Check for overflow of the upper sum. 
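+ // (At this point each lane of hi is 0 or 1, since 0 - 0xFFFF == 1.)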
+ //
+ // TODO: Since hi can only be {0,1}, we should
+ // be able to generalize this for performance.
+ overflow_mask = _mm_adds_epu16(*acc_md, hi);
+ *acc_md = _mm_add_epi16(*acc_md, hi);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // Finish up the accumulation of the... accumulator.
+ // Since the product was unsigned, only worry about
+ // positive overflow (i.e.: borrowing not possible).
+ *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
+
+ return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
+ }
+ else // VMUDL
+ {
+ *acc_lo = hi;
+ *acc_md = zero;
+ *acc_hi = zero;
+
+ return hi;
+ }
+}
diff --git a/src/emu/cpu/rsp/vmulm.h b/src/emu/cpu/rsp/vmulm.h
new file mode 100644
index 00000000000..93cd349da36
--- /dev/null
+++ b/src/emu/cpu/rsp/vmulm.h
@@ -0,0 +1,56 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vmadm_vmudm(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
+{
+ rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
+ rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
+
+ // What we really want to do is signed vs * unsigned vt.
+ // However, we have no such instructions to do so.
+ //
+ // There's a trick to "fix" an unsigned product, though:
+ // If vs was negative, take the upper 16-bits of the product
+ // and subtract vt.
+ rsp_vec_t sign = _mm_srai_epi16(vs, 15);
+ vt = _mm_and_si128(vt, sign);
+ hi = _mm_sub_epi16(hi, vt);
+
+ if (iw & 0x8) // VMADM
+ {
+ // Tricky part: start accumulating everything.
+ // Get/keep the carry as we'll add it in later.
+ rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
+ *acc_lo = _mm_add_epi16(*acc_lo, lo);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // This is REALLY clever. Since the product results from
+ // two 16-bit components, one positive and one negative,
+ // we don't have to worry about carrying the 1 (we can
+ // only borrow) past 32-bits. So we can just add it here.
+ hi = _mm_sub_epi16(hi, overflow_mask);
+
+ // Check for overflow of the upper sum.
+ overflow_mask = _mm_adds_epu16(*acc_md, hi);
+ *acc_md = _mm_add_epi16(*acc_md, hi);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // Finish up the accumulation of the... accumulator.
+ *acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
+ *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
+
+ return sclamp_acc_to_mid(*acc_md, *acc_hi);
+ }
+ else // VMUDM
+ {
+ *acc_lo = lo;
+ *acc_md = hi;
+ *acc_hi = _mm_srai_epi16(hi, 15);
+
+ return hi;
+ }
+}
diff --git a/src/emu/cpu/rsp/vmuln.h b/src/emu/cpu/rsp/vmuln.h
new file mode 100644
index 00000000000..ee3e4f3556b
--- /dev/null
+++ b/src/emu/cpu/rsp/vmuln.h
@@ -0,0 +1,55 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vmadn_vmudn(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
+{
+ rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
+ rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
+
+ // What we really want to do is unsigned vs * signed vt.
+ // However, we have no such instructions to do so.
+ //
+ // There's a trick to "fix" an unsigned product, though:
+ // If vt was negative, take the upper 16-bits of the product
+ // and subtract vs.
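+ // (Example: vs = 0x0002, vt = 0xFFFF (-1): _mm_mulhi_epu16 gives 0x0001,
+ // and 0x0001 - 0x0002 = 0xFFFF, the correct high half of -2.)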
+ rsp_vec_t sign = _mm_srai_epi16(vt, 15);
+ vs = _mm_and_si128(vs, sign);
+ hi = _mm_sub_epi16(hi, vs);
+
+ if (iw & 0x8) // VMADN
+ {
+ // Tricky part: start accumulating everything.
+ // Get/keep the carry as we'll add it in later.
+ rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
+ *acc_lo = _mm_add_epi16(*acc_lo, lo);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // This is REALLY clever. Since the product results from
+ // two 16-bit components, one positive and one negative,
+ // we don't have to worry about carrying the 1 (we can
+ // only borrow) past 32-bits. So we can just add it here.
+ hi = _mm_sub_epi16(hi, overflow_mask);
+
+ // Check for overflow of the upper sum.
+ overflow_mask = _mm_adds_epu16(*acc_md, hi);
+ *acc_md = _mm_add_epi16(*acc_md, hi);
+
+ overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
+ overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
+
+ // Finish up the accumulation of the... accumulator.
+ *acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
+ *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
+ return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
+ }
+ else // VMUDN
+ {
+ *acc_lo = lo;
+ *acc_md = hi;
+ *acc_hi = _mm_srai_epi16(hi, 15);
+
+ return lo;
+ }
+}
diff --git a/src/emu/cpu/rsp/vor.h b/src/emu/cpu/rsp/vor.h
new file mode 100644
index 00000000000..4b4f1fac506
--- /dev/null
+++ b/src/emu/cpu/rsp/vor.h
@@ -0,0 +1,10 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vor_vnor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt)
+{
+ rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
+
+ rsp_vec_t vd = _mm_or_si128(vs, vt);
+ return _mm_xor_si128(vd, vmask);
+}
diff --git a/src/emu/cpu/rsp/vrcpsq.h b/src/emu/cpu/rsp/vrcpsq.h
new file mode 100644
index 00000000000..76454e896a3
--- /dev/null
+++ b/src/emu/cpu/rsp/vrcpsq.h
@@ -0,0 +1,60 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vrcp_vrsq(UINT32 iw, INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
+{
+ // Get the element from VT.
+ INT16 vt = m_v[src].s[e & 0x7];
+
+ UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
+ UINT32 sp_input = vt;
+
+ INT32 input = (dp) ? dp_input : sp_input;
+ INT32 input_mask = input >> 31;
+ INT32 data = input ^ input_mask;
+
+ if (input > -32768)
+ {
+ data -= input_mask;
+ }
+
+ // Handle edge cases.
+ INT32 result;
+ if (data == 0)
+ {
+ result = 0x7fffffff;
+ }
+ else if (input == -32768)
+ {
+ result = 0xffff0000;
+ }
+ else // Main case: compute the reciprocal.
+ {
+ UINT32 shift = count_leading_zeros(data);
+ UINT32 idx = (((UINT64) data << shift) & 0x7FC00000) >> 22;
+
+ if (iw & 0x4) // VRSQ
+ {
+ idx = ((idx | 0x200) & 0x3FE) | (shift % 2);
+ result = rsp_divtable[idx];
+
+ result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
+ }
+ else // VRCP
+ {
+ result = rsp_divtable[idx];
+
+ result = ((0x10000 | result) << 14) >> (31 - shift);
+ }
+
+ result = result ^ input_mask;
+ }
+
+ // Write out the results.
+ m_div_out = result >> 16;
+ m_v[dest].s[de & 0x7] = result;
+
+ return vec_load_unshuffled_operand(m_v[dest].s);
+}
diff --git a/src/emu/cpu/rsp/vrsq.h b/src/emu/cpu/rsp/vrsq.h
new file mode 100644
index 00000000000..008db8b8725
--- /dev/null
+++ b/src/emu/cpu/rsp/vrsq.h
@@ -0,0 +1,65 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+rsp_vec_t vec_vrsq(INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
+{
+ // Get the element from VT.
+ INT16 vt = m_v[src].s[e & 0x7];
+
+ UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
+ UINT32 sp_input = vt;
+
+ INT32 input = (dp) ? dp_input : sp_input;
+ INT32 input_mask = input >> 31;
+ INT32 data = input ^ input_mask;
+
+ if (input > -32768)
+ {
+ data -= input_mask;
+ }
+
+ // Handle edge cases.
+ INT32 result;
+ if (data == 0)
+ {
+ result = 0x7fffffff;
+ }
+ else if (input == -32768)
+ {
+ result = 0xffff0000;
+ }
+ else // Main case: compute the reciprocal.
+ {
+ UINT32 shift = count_leading_zeros(data);
+
+ UINT32 idx = (((UINT64) data << shift) & 0x7FC00000U) >> 22;
+ idx = ((idx | 0x200) & 0x3FE) | (shift % 2);
+ result = rsp_divtable[idx];
+
+ result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
+ result = result ^ input_mask;
+ }
+
+ // Write out the results.
+ m_div_out = result >> 16;
+ m_v[dest].s[de & 0x7] = result;
+
+ return vec_load_unshuffled_operand(m_v[dest].s);
+}
+
+rsp_vec_t vec_vrsqh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
+{
+ INT16 elements[8];
+
+ // Get the element from VT.
+ memcpy(elements, &m_v[src], sizeof(rsp_vec_t));
+ m_div_in = elements[e];
+
+ // Write out the upper part of the result.
+ rsp_vec_t vd_mask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.vrsq_mask_table[de]);
+ rsp_vec_t vd = _mm_load_si128((rsp_vec_t *) &m_v[dest]);
+ vd = _mm_andnot_si128(vd_mask, vd);
+
+ rsp_vec_t b_result = _mm_set1_epi16(m_div_out);
+ b_result = _mm_and_si128(vd_mask, b_result);
+ return _mm_or_si128(b_result, vd);
+}
diff --git a/src/emu/cpu/rsp/vsub.h b/src/emu/cpu/rsp/vsub.h
new file mode 100644
index 00000000000..5a1ce8ef264
--- /dev/null
+++ b/src/emu/cpu/rsp/vsub.h
@@ -0,0 +1,17 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t rsp_vsub(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
+{
+ // acc_lo uses unsaturated arithmetic.
+ rsp_vec_t unsat_diff = _mm_sub_epi16(vt, carry);
+ rsp_vec_t sat_diff = _mm_subs_epi16(vt, carry);
+
+ *acc_lo = _mm_sub_epi16(vs, unsat_diff);
+ rsp_vec_t vd = _mm_subs_epi16(vs, sat_diff);
+
+ // VD is the signed diff of the two sources and the carry. Since we
+ // have to saturate the diff of all three, we have to be clever.
+ rsp_vec_t overflow = _mm_cmpgt_epi16(sat_diff, unsat_diff);
+ return _mm_adds_epi16(vd, overflow);
+}
diff --git a/src/emu/cpu/rsp/vsubc.h b/src/emu/cpu/rsp/vsubc.h
new file mode 100644
index 00000000000..63986eac76a
--- /dev/null
+++ b/src/emu/cpu/rsp/vsubc.h
@@ -0,0 +1,14 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J. Stachecki,Ryan Holtz
+
+static inline rsp_vec_t vec_vsubc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *eq, rsp_vec_t *sn)
+{
+ rsp_vec_t sat_udiff = _mm_subs_epu16(vs, vt);
+ rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt);
+ rsp_vec_t sat_udiff_zero = _mm_cmpeq_epi16(sat_udiff, zero);
+
+ *eq = _mm_cmpeq_epi16(equal, zero);
+ *sn = _mm_andnot_si128(equal, sat_udiff_zero);
+
+ return _mm_sub_epi16(vs, vt);
+}
diff --git a/src/emu/cpu/rsp/vxor.h b/src/emu/cpu/rsp/vxor.h
new file mode 100644
index 00000000000..0397f6aa84d
--- /dev/null
+++ b/src/emu/cpu/rsp/vxor.h
@@ -0,0 +1,10 @@
+// license:BSD-3-Clause
+// copyright-holders:Tyler J.
Stachecki,Ryan Holtz + +static inline rsp_vec_t vec_vxor_vnxor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt) +{ + rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]); + + rsp_vec_t vd = _mm_xor_si128(vs, vt); + return _mm_xor_si128(vd, vmask); +}
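
A note on the clamping helper used throughout these files: sclamp_acc_to_mid() works because _mm_packs_epi32 saturates each 32-bit lane to a signed 16-bit value, so interleaving the mid/hi accumulator slices into 32-bit lanes gives the saturation for free. The following standalone sketch is not part of the patch; scalar_clamp() and the test harness are names invented here for illustration. Build with any SSE2-capable compiler, e.g. gcc -msse2.

/*
 * Standalone sanity check for the sclamp_acc_to_mid() pattern above.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

typedef __m128i rsp_vec_t;

/* Same pack-based saturation as clamp.h: interleave the mid/hi slices
 * into signed 32-bit lanes, then saturate each lane to signed 16 bits. */
static inline rsp_vec_t sclamp_acc_to_mid(rsp_vec_t acc_mid, rsp_vec_t acc_hi)
{
    return _mm_packs_epi32(
        _mm_unpacklo_epi16(acc_mid, acc_hi),
        _mm_unpackhi_epi16(acc_mid, acc_hi)
    );
}

/* Scalar reference: clamp the 32-bit value (hi:mid) to [-32768, 32767]. */
static int16_t scalar_clamp(int16_t mid, int16_t hi)
{
    int32_t v = ((int32_t) hi << 16) | (uint16_t) mid;
    if (v > 32767) return 32767;
    if (v < -32768) return -32768;
    return (int16_t) v;
}

int main(void)
{
    int16_t mid[8] = { 0x1234, -1, 0, 0x7fff, -0x8000, 0x00ff, -2, 1 };
    int16_t hi[8]  = { 0,      -1, 1, 0,      -1,      0x7fff, -0x8000, 0 };
    int16_t out[8];
    int i;

    rsp_vec_t v = sclamp_acc_to_mid(
        _mm_loadu_si128((const rsp_vec_t *) mid),
        _mm_loadu_si128((const rsp_vec_t *) hi));
    _mm_storeu_si128((rsp_vec_t *) out, v);

    for (i = 0; i < 8; i++)
        printf("lane %d: simd=%6d scalar=%6d\n", i, out[i], scalar_clamp(mid[i], hi[i]));

    return 0;
}

The same trick is why the 48-bit accumulator never needs its low slice inspected when producing a signed-clamped result: the mid/hi pair alone determines the saturated 16-bit output.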