nw, start merging in RSP vector ops from CEN64

therealmogminer@gmail.com 2015-06-25 19:19:25 +02:00
parent 9c6f6114c7
commit 3c5cd12782
31 changed files with 1267 additions and 10 deletions

37
src/emu/cpu/rsp/clamp.h Normal file

@ -0,0 +1,37 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t sclamp_acc_to_mid(rsp_vec_t acc_mid, rsp_vec_t acc_hi)
{
return _mm_packs_epi32(
_mm_unpacklo_epi16(acc_mid, acc_hi),
_mm_unpackhi_epi16(acc_mid, acc_hi)
);
}
static inline rsp_vec_t uclamp_acc(rsp_vec_t val, rsp_vec_t acc_mid, rsp_vec_t acc_hi, rsp_vec_t zero)
{
rsp_vec_t hi_negative = _mm_srai_epi16(acc_hi, 15);
rsp_vec_t mid_negative = _mm_srai_epi16(acc_mid, 15);
// We don't have to clamp if the HI part of the
// accumulator is sign-extended down to the MD part.
rsp_vec_t hi_sign_check = _mm_cmpeq_epi16(hi_negative, acc_hi);
rsp_vec_t mid_sign_check = _mm_cmpeq_epi16(hi_negative, mid_negative);
rsp_vec_t clamp_mask = _mm_and_si128(mid_sign_check, hi_sign_check);
// Generate the value in the event we need to clamp.
// * hi_negative, mid_sign => xxxx
// * hi_negative, !mid_sign => 0000
// * !hi_negative, mid_sign => FFFF
// * !hi_negative, !mid_sign => xxxx
rsp_vec_t clamped_val = _mm_cmpeq_epi16(hi_negative, zero);
#ifndef __SSE4_1__
clamped_val = _mm_andnot_si128(clamp_mask, clamped_val);
val = _mm_and_si128(clamp_mask, val);
return _mm_or_si128(val, clamped_val);
#else
return _mm_blendv_epi8(clamped_val, val, clamp_mask);
#endif
}
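Not part of the diff: a scalar model of one 16-bit lane, to make the intent of the two clamp helpers above explicit (lane layout assumed from the intrinsics: acc_hi:acc_mid is treated as a signed 32-bit value).

// Illustrative only -- per-lane equivalent of sclamp_acc_to_mid()
static INT16 sclamp_lane(INT16 acc_mid, INT16 acc_hi)
{
	INT32 acc = ((INT32) acc_hi << 16) | (UINT16) acc_mid;    // what the unpack/packs pair builds
	if (acc > 32767) return 32767;                             // signed saturation of _mm_packs_epi32
	if (acc < -32768) return -32768;
	return (INT16) acc;
}

// Illustrative only -- per-lane equivalent of uclamp_acc()
static UINT16 uclamp_lane(UINT16 val, INT16 acc_mid, INT16 acc_hi)
{
	INT16 mid_sign = (acc_mid < 0) ? -1 : 0;
	if (acc_hi == mid_sign)                                    // hi is a sign-extension of mid: keep val
		return val;
	return (acc_hi < 0) ? 0x0000 : 0xffff;                     // otherwise clamp by the sign of hi
}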


@ -14,6 +14,7 @@
#ifndef __RSP_H__
#define __RSP_H__
#include "emu.h"
#include "cpu/drcfe.h"
#include "cpu/drcuml.h"


@ -1,5 +1,5 @@
// license:BSD-3-Clause
// copyright-holders:Ryan Holtz
// copyright-holders:Ryan Holtz,Tyler J. Stachecki
/***************************************************************************
rspcp2.c
@ -11,13 +11,176 @@
#include "emu.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspcp2.h"
#include "cpu/drcfe.h"
#include "cpu/drcuml.h"
#include "cpu/drcumlsh.h"
using namespace uml;
#if USE_SIMD
#include <emmintrin.h>
const rsp_cop2::vec_helpers_t rsp_cop2::m_vec_helpers = {
{ 0 },
{ // logic_mask
{ 0, 0, 0, 0, 0, 0, 0, 0 },
{ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }
},
{ // vrsq_mask_table
{ ~0, 0, 0, 0, 0, 0, 0, 0 },
{ 0, ~0, 0, 0, 0, 0, 0, 0 },
{ 0, 0, ~0, 0, 0, 0, 0, 0 },
{ 0, 0, 0, ~0, 0, 0, 0, 0 },
{ 0, 0, 0, 0, ~0, 0, 0, 0 },
{ 0, 0, 0, 0, 0, ~0, 0, 0 },
{ 0, 0, 0, 0, 0, 0, ~0, 0 },
{ 0, 0, 0, 0, 0, 0, 0, ~0 }
},
{ // shuffle_keys
/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
/* -- */{0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0b0a, 0x0d0c, 0x0f0e},
/* 0q */{0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c},
/* 1q */{0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e},
/* 0h */{0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
/* 1h */{0x0302, 0x0302, 0x0302, 0x0302, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
/* 2h */{0x0504, 0x0504, 0x0504, 0x0504, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
/* 3h */{0x0706, 0x0706, 0x0706, 0x0706, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e},
/* 0w */{0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
/* 1w */{0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
/* 2w */{0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
/* 3w */{0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
/* 4w */{0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
/* 5w */{0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a, 0x0b0a},
/* 6w */{0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c, 0x0d0c},
/* 7w */{0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e, 0x0f0e}
},
{ // sll_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c},
{0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
{0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
{0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304, 0x090a, 0x0708},
{0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203, 0x0809, 0x0607},
{0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102, 0x0708, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000, 0x0506, 0x0304},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080, 0x0405, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080, 0x0304, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0102, 0x8000},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x8080},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x8080}
},
{ // sll_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e},
{0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a},
{0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506, 0x0b04},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102, 0x0700},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0380}
},
{ // srl_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e},
{0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f},
{0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080},
{0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080},
{0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080},
{0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0b0c, 0x090a, 0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0c0d, 0x0a0b, 0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0d0e, 0x0b0c, 0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0e0f, 0x0c0d, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0f80, 0x0d0e, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x8080, 0x0e0f, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x8080, 0x0f80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080}
},
{ // ror_b2l_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e},
{0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f},
{0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0708, 0x0506, 0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102},
{0x0809, 0x0607, 0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203},
{0x090a, 0x0708, 0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x0b0c, 0x090a, 0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506},
{0x0c0d, 0x0a0b, 0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 0x0607},
{0x0d0e, 0x0b0c, 0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0f00, 0x0d0e, 0x0304, 0x0102, 0x0708, 0x0506, 0x0b0c, 0x090a},
{0x0001, 0x0e0f, 0x0405, 0x0203, 0x0809, 0x0607, 0x0c0d, 0x0a0b},
{0x0102, 0x0f00, 0x0506, 0x0304, 0x090a, 0x0708, 0x0d0e, 0x0b0c}
},
{ // rol_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e},
{0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04, 0x090a},
{0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506, 0x0b04},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400, 0x0506},
{0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
{0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0400},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
{0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
{0x0102, 0x0400, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c}
},
{ // ror_l2b_keys
{0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d},
{0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c},
{0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203},
{0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102},
{0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001},
{0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700},
{0x0405, 0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607},
{0x0b04, 0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506},
{0x0a0b, 0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405},
{0x090a, 0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04},
{0x0809, 0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b},
{0x0f08, 0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a},
{0x0e0f, 0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809},
{0x0d0e, 0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08},
{0x0c0d, 0x0203, 0x0001, 0x0607, 0x0405, 0x0a0b, 0x0809, 0x0e0f},
{0x030c, 0x0102, 0x0700, 0x0506, 0x0b04, 0x090a, 0x0f08, 0x0d0e}
}
};
#endif
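The key tables above pack two byte indices into each 16-bit entry, with 0x80 selecting a zero byte, which is the operand layout of the SSSE3 byte shuffle. The body of vec_load_and_shuffle_operand() is not part of this commit, so the following is only an assumed usage sketch (shuffle_with_key and key_row are illustrative names, and SSSE3 availability is assumed):

static inline rsp_vec_t shuffle_with_key(rsp_vec_t operand, const UINT16 *key_row)
{
	rsp_vec_t key = _mm_load_si128((const rsp_vec_t *) key_row);
	return _mm_shuffle_epi8(operand, key);   // each key byte picks a source byte; 0x80 yields zero
}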
extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
@ -100,6 +263,7 @@ extern offs_t rsp_dasm_one(char *buffer, offs_t pc, UINT32 op);
VREG_S(VDREG, 7) = m_vres[7]; \
}
#if !USE_SIMD
static const int vector_elements_2[16][8] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7 }, // none
@ -119,6 +283,7 @@ static const int vector_elements_2[16][8] =
{ 6, 6, 6, 6, 6, 6, 6, 6 }, // 6
{ 7, 7, 7, 7, 7, 7, 7, 7 }, // 7
};
#endif
rsp_cop2::rsp_cop2(rsp_device &rsp, running_machine &machine)
: m_rsp(rsp)
@ -905,7 +1070,9 @@ UINT16 rsp_cop2::SATURATE_ACCUM(int accum, int slice, UINT16 negative, UINT16 po
void rsp_cop2::handle_vector_ops(UINT32 op)
{
#if !USE_SIMD
int i;
#endif
// Opcode legend:
// E = VS2 element type
@ -924,6 +1091,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Multiplies signed integer by signed integer * 2
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -948,8 +1117,9 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x01: /* VMULU */
@ -960,6 +1130,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -986,6 +1158,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1000,6 +1173,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Stores the higher 16 bits of the 32-bit result to accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
@ -1013,6 +1188,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1027,6 +1203,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into accumulator
// The middle slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1040,6 +1218,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_M(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1055,6 +1234,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended
@ -1068,6 +1249,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1082,6 +1264,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is stored into highest 32 bits of accumulator, the low slice is zero
// The highest 32 bits of accumulator is saturated into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1097,6 +1281,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = (INT16)(r);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1110,6 +1295,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Multiplies signed integer by signed integer * 2
// The result is added to accumulator
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1131,6 +1318,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x09: /* VMACU */
@ -1141,6 +1329,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1177,6 +1367,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1191,6 +1382,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Adds the higher 16 bits of the 32-bit result to accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
UINT32 s1 = (UINT32)(UINT16)VREG_S(VS1REG, i);
@ -1206,6 +1399,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 0, 0x0000, 0xffff);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1220,6 +1414,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into accumulator
// The middle slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
UINT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1237,6 +1433,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = SATURATE_ACCUM(i, 1, 0x8000, 0x7fff);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1251,6 +1448,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into accumulator
// The low slice of accumulator is stored into destination element
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (UINT16)VREG_S(VS1REG, i); // not sign-extended
@ -1271,6 +1470,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1285,6 +1485,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// The result is added into highest 32 bits of accumulator, the low slice is zero
// The highest 32 bits of accumulator is saturated into destination element
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1301,6 +1503,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1315,6 +1518,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG == VDREG
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1330,6 +1535,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1344,6 +1550,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG == VDREG
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT32 s1 = (INT32)(INT16)VREG_S(VS1REG, i);
@ -1360,6 +1568,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1373,6 +1582,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Changes the sign of source register 2 if source register 1 is negative and stores
// the result to destination register
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
INT16 s1 = (INT16)VREG_S(VS1REG, i);
@ -1401,6 +1612,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1415,6 +1627,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG = VDREG
#if USE_SIMD
#else
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
@ -1433,6 +1647,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1447,6 +1662,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// TODO: check VS2REG = VDREG
#if USE_SIMD
#else
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
@ -1469,6 +1686,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1481,6 +1699,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Stores high, middle or low slice of accumulator to destination vector
#if USE_SIMD
#else
switch (EL)
{
case 0x08: // VSAWH
@ -1511,6 +1731,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
printf("RSP: VSAW: el = %d\n", EL);//??? ???
exit(0);
}
#endif
break;
}
@ -1524,6 +1745,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are less than VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1559,6 +1782,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1572,6 +1796,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1595,6 +1821,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CARRY_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1608,6 +1835,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are not equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1632,6 +1861,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1645,6 +1875,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// Sets compare flags if elements in VS1 are greater or equal with VS2
// Moves the element in VS2 to destination vector
#if USE_SIMD
#else
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP2_FLAGS();
@ -1669,6 +1901,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_CARRY_FLAGS();
CLEAR_ZERO_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1681,6 +1914,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip low
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
INT16 s1 = VREG_S(VS1REG, i);
@ -1763,6 +1998,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
CLEAR_ZERO_FLAGS();
CLEAR_CLIP1_FLAGS();
WRITEBACK_RESULT();
#endif
break;
}
@ -1775,6 +2011,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip high
#if USE_SIMD
#else
CLEAR_CARRY_FLAGS();
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP1_FLAGS();
@ -1847,6 +2085,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1859,6 +2098,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Vector clip reverse
#if USE_SIMD
#else
CLEAR_CARRY_FLAGS();
CLEAR_COMPARE_FLAGS();
CLEAR_CLIP1_FLAGS();
@ -1906,6 +2147,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
m_vres[i] = ACCUM_L(i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -1918,6 +2160,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Merges two vectors according to compare flags
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
if (COMPARE_FLAG(i) != 0)
@ -1932,6 +2176,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x28: /* VAND */
@ -1943,12 +2188,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise AND of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x29: /* VNAND */
@ -1960,12 +2208,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT AND of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) & VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2a: /* VOR */
@ -1977,12 +2228,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise OR of two vector registers
#if USE_SIMD
#else
for (i = 0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2b: /* VNOR */
@ -1994,12 +2248,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT OR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) | VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2c: /* VXOR */
@ -2011,12 +2268,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise XOR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
case 0x2d: /* VNXOR */
@ -2028,12 +2288,15 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Bitwise NOT XOR of two vector registers
#if USE_SIMD
#else
for (i=0; i < 8; i++)
{
m_vres[i] = ~((VREG_S(VS1REG, i) ^ VREG_S(VS2REG, VEC_EL_2(EL, i))));
SET_ACCUM_L(m_vres[i], i);
}
WRITEBACK_RESULT();
#endif
break;
}
@ -2045,6 +2308,9 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
// ------------------------------------------------------
//
// Calculates reciprocal
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7));
@ -2093,6 +2359,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
#endif
break;
}
@ -2105,6 +2372,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal low part
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)VREG_S(VS2REG, EL & 7);
@ -2169,6 +2438,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2181,6 +2451,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal high part
#if USE_SIMD
#else
m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
m_dp_allowed = 1;
@ -2191,6 +2463,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16);
#endif
break;
}
@ -2203,11 +2476,14 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Moves element from vector to destination vector
#if USE_SIMD
#else
VREG_S(VDREG, VS1REG & 7) = VREG_S(VS2REG, EL & 7);
for (i = 0; i < 8; i++)
{
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2220,6 +2496,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)(VREG_S(VS2REG, EL & 7));
@ -2269,6 +2547,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2281,6 +2560,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root low part
#if USE_SIMD
#else
INT32 shifter = 0;
INT32 rec = (INT16)VREG_S(VS2REG, EL & 7);
INT32 datainput = rec;
@ -2348,6 +2629,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
SET_ACCUM_L(VREG_S(VS2REG, VEC_EL_2(EL, i)), i);
}
#endif
break;
}
@ -2360,6 +2642,8 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
//
// Calculates reciprocal square-root high part
#if USE_SIMD
#else
m_reciprocal_high = (VREG_S(VS2REG, EL & 7)) << 16;
m_dp_allowed = 1;
@ -2369,6 +2653,7 @@ void rsp_cop2::handle_vector_ops(UINT32 op)
}
VREG_S(VDREG, VS1REG & 7) = (INT16)(m_reciprocal_res >> 16); // store high part
#endif
break;
}


@ -1,5 +1,5 @@
// license:BSD-3-Clause
// copyright-holders:Ryan Holtz
// copyright-holders:Ryan Holtz,Tyler J. Stachecki
/***************************************************************************
rspcp2.h
@ -15,6 +15,37 @@
#include "cpu/drcuml.h"
#include "rsp.h"
#include "rspdiv.h"
#define SIMD_OFF (1)
#if (defined(__SSE2__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__SSE4_1__) || defined(__SSE4_2__))
#define SSE_AVAILABLE (1)
#else
#define SSE_AVAILABLE (0)
#endif
#if (!defined(MAME_DEBUG) || defined(__OPTIMIZE__)) && (SSE_AVAILABLE || defined(_MSC_VER)) && defined(PTR64) && !SIMD_OFF
#define USE_SIMD (1)
#else
#define USE_SIMD (0)
#endif
#if USE_SIMD
#ifdef __SSE4_2__
#include <nmmintrin.h>
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#elif defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE3__)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif
typedef __m128i rsp_vec_t;
#endif
union VECTOR_REG
{
@ -22,6 +53,9 @@ union VECTOR_REG
UINT32 l[4];
INT16 s[8];
UINT8 b[16];
#if USE_SIMD
rsp_vec_t v;
#endif
};
union ACCUMULATOR_REG
@ -152,10 +186,148 @@ protected:
UINT32 m_reciprocal_high;
INT32 m_dp_allowed;
#if USE_SIMD
typedef struct
{
rsp_vec_t dummy_for_alignment;
const UINT16 logic_mask[2][8];
const UINT16 vrsq_mask_table[8][8];
const UINT16 shuffle_keys[16][8];
const UINT16 sll_b2l_keys[16][8];
const UINT16 sll_l2b_keys[16][8];
const UINT16 srl_b2l_keys[16][8];
const UINT16 ror_b2l_keys[16][8];
const UINT16 rol_l2b_keys[16][8];
const UINT16 ror_l2b_keys[16][8];
} vec_helpers_t;
static const vec_helpers_t m_vec_helpers;
rsp_vec_t vec_load_and_shuffle_operand(const UINT16* src, UINT32 element);
static inline rsp_vec_t vec_load_unshuffled_operand(const UINT16* src)
{
return _mm_load_si128((rsp_vec_t*) src);
}
static inline void vec_write_operand(UINT16* dest, rsp_vec_t src)
{
_mm_store_si128((rsp_vec_t*) dest, src);
}
static inline rsp_vec_t read_acc_lo(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t) * 2);
}
static inline rsp_vec_t read_acc_mid(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_acc_hi(const UINT16 *acc)
{
return vec_load_unshuffled_operand(acc);
}
static inline rsp_vec_t read_vcc_lo(const UINT16 *vcc)
{
return vec_load_unshuffled_operand(vcc + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_vcc_hi(const UINT16 *vcc)
{
return vec_load_unshuffled_operand(vcc);
}
static inline rsp_vec_t read_vco_lo(const UINT16 *vco)
{
return vec_load_unshuffled_operand(vco + sizeof(rsp_vec_t));
}
static inline rsp_vec_t read_vco_hi(const UINT16 *vco)
{
return vec_load_unshuffled_operand(vco);
}
static inline rsp_vec_t read_vce(const UINT16 *vce)
{
return vec_load_unshuffled_operand(vce + sizeof(rsp_vec_t));
}
static inline void write_acc_lo(UINT16 *acc, rsp_vec_t acc_lo)
{
return vec_write_operand(acc + sizeof(rsp_vec_t) * 2, acc_lo);
}
static inline void write_acc_mid(UINT16 *acc, rsp_vec_t acc_mid)
{
return vec_write_operand(acc + sizeof(rsp_vec_t), acc_mid);
}
static inline void write_acc_hi(UINT16 *acc, rsp_vec_t acc_hi)
{
return vec_write_operand(acc, acc_hi);
}
static inline void write_vcc_lo(UINT16 *vcc, rsp_vec_t vcc_lo)
{
return vec_write_operand(vcc + sizeof(rsp_vec_t), vcc_lo);
}
static inline void write_vcc_hi(UINT16 *vcc, rsp_vec_t vcc_hi)
{
return vec_write_operand(vcc, vcc_hi);
}
static inline void write_vco_lo(UINT16 *vcc, rsp_vec_t vco_lo)
{
return vec_write_operand(vcc + sizeof(rsp_vec_t), vco_lo);
}
static inline void write_vco_hi(UINT16 *vcc, rsp_vec_t vco_hi)
{
return vec_write_operand(vcc, vco_hi);
}
static inline void write_vce(UINT16 *vce, rsp_vec_t vce_r)
{
return vec_write_operand(vce, vce_r);
}
static inline INT16 get_flags(const UINT16 *flags)
{
return (INT16)_mm_movemask_epi8(
_mm_packs_epi16(
_mm_load_si128((rsp_vec_t*) (flags + sizeof(rsp_vec_t))),
_mm_load_si128((rsp_vec_t*) flags)
)
);
}
static inline rsp_vec_t vec_zero()
{
return _mm_setzero_si128();
}
void vec_load_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_load_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_load_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group1(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group2(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
void vec_store_group4(UINT32 addr, UINT32 element, UINT16* regp, rsp_vec_t reg, rsp_vec_t dqm);
#include "clamp.h"
#include "vabs.h"
#include "vadd.h"
#include "vaddc.h"
#include "vand.h"
#include "vch.h"
#include "vcmp.h"
#include "vcl.h"
#include "vcr.h"
#include "vmac.h"
#include "vmrg.h"
#include "vmul.h"
#include "vmulh.h"
#include "vmull.h"
#include "vmulm.h"
#include "vmuln.h"
#include "vor.h"
#include "vsub.h"
#include "vsubc.h"
#include "vxor.h"
#endif
private:
void handle_lwc2(UINT32 op);
void handle_swc2(UINT32 op);
void handle_vector_ops(UINT32 op);
UINT32 m_div_in;
UINT32 m_div_out;
};
#endif /* __RSPCP2_H__ */


@ -11,7 +11,6 @@
#include "emu.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspcp2.h"
#include "rspcp2d.h"
#include "cpu/drcfe.h"


@ -31,6 +31,7 @@ class rsp_cop2_drc : public rsp_cop2
virtual void state_string_export(const int index, std::string &str);
void cfunc_unimplemented_opcode();
public:
virtual void lbv();
virtual void lsv();


@ -21,7 +21,6 @@
#include "emu.h"
#include "debugger.h"
#include "rsp.h"
#include "rspdiv.h"
#include "rspfe.h"
#include "rspcp2.h"
#include "cpu/drcfe.h"

15
src/emu/cpu/rsp/vabs.h Normal file

@ -0,0 +1,15 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vabs(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo)
{
rsp_vec_t vs_zero = _mm_cmpeq_epi16(vs, zero);
rsp_vec_t sign_lt = _mm_srai_epi16(vs, 15);
rsp_vec_t vd = _mm_andnot_si128(vs_zero, vt);
// Careful: if VT = 0x8000 and VS is negative,
// acc_lo will be 0x8000 but vd will be 0x7FFF.
vd = _mm_xor_si128(vd, sign_lt);
*acc_lo = _mm_sub_epi16(vd, sign_lt);
return _mm_subs_epi16(vd, sign_lt);
}
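Worked lane for the edge case called out above (vs negative, vt = 0x8000): sign_lt = 0xffff, so vd = 0x8000 ^ 0xffff = 0x7fff; the wrapping subtract then gives acc_lo = 0x7fff - (-1) = 0x8000, while the saturating subtract leaves the returned result clamped at 0x7fff.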

17
src/emu/cpu/rsp/vadd.h Normal file

@ -0,0 +1,17 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vadd(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
{
// acc_lo uses unsaturated arithmetic.
rsp_vec_t vd = _mm_add_epi16(vs, vt);
*acc_lo = _mm_sub_epi16(vd, carry);
// VD is the signed sum of the two sources and the carry. Since we
// have to saturate the sum of all three, we have to be clever.
rsp_vec_t minimum = _mm_min_epi16(vs, vt);
rsp_vec_t maximum = _mm_max_epi16(vs, vt);
minimum = _mm_subs_epi16(minimum, carry);
return _mm_adds_epi16(minimum, maximum);
}
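For reference (not part of the patch), the saturated value the min/max trick above is computing, written as a scalar lane under the usual VADD definition (result = signed 16-bit clamp of vs + vt + carry; carry is 0 or 1 here, while the vector code carries it as 0x0000/0xffff and folds it in with a saturating subtract):

static INT16 vadd_lane(INT16 vs, INT16 vt, int carry)
{
	INT32 sum = (INT32) vs + vt + carry;          // full-precision sum of all three
	if (sum > 32767) return 32767;                // _mm_adds_epi16 saturation
	if (sum < -32768) return -32768;
	return (INT16) sum;
}

Adding the carry to the smaller operand first is what lets a single 16-bit saturating add reproduce this result.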

13
src/emu/cpu/rsp/vaddc.h Normal file

@ -0,0 +1,13 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vaddc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *sn)
{
rsp_vec_t sat_sum = _mm_adds_epu16(vs, vt);
rsp_vec_t unsat_sum = _mm_add_epi16(vs, vt);
*sn = _mm_cmpeq_epi16(sat_sum, unsat_sum);
*sn = _mm_cmpeq_epi16(*sn, zero);
return unsat_sum;
}

9
src/emu/cpu/rsp/vand.h Normal file

@ -0,0 +1,9 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vand_vnand(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt) {
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_and_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}

57
src/emu/cpu/rsp/vch.h Normal file

@ -0,0 +1,57 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vch(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t *eq, rsp_vec_t *sign, rsp_vec_t *vce) {
// sign = (vs ^ vt) < 0
*sign = _mm_xor_si128(vs, vt);
*sign = _mm_cmplt_epi16(*sign, zero);
// sign_negvt = sign ? -vt : vt
rsp_vec_t sign_negvt = _mm_xor_si128(vt, *sign);
sign_negvt = _mm_sub_epi16(sign_negvt, *sign);
// Compute diff, diff_zero:
rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt);
rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute le/ge:
rsp_vec_t vt_neg = _mm_cmplt_epi16(vt, zero);
rsp_vec_t diff_lez = _mm_cmpgt_epi16(diff, zero);
rsp_vec_t diff_gez = _mm_or_si128(diff_lez, diff_zero);
diff_lez = _mm_cmpeq_epi16(zero, diff_lez);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(diff_gez, vt_neg, *sign);
*le = _mm_blendv_epi8(vt_neg, diff_lez, *sign);
#else
*ge = _mm_and_si128(*sign, vt_neg);
diff_gez = _mm_andnot_si128(*sign, diff_gez);
*ge = _mm_or_si128(*ge, diff_gez);
*le = _mm_and_si128(*sign, diff_lez);
diff_lez = _mm_andnot_si128(*sign, vt_neg);
*le = _mm_or_si128(*le, diff_lez);
#endif
// Compute vce:
*vce = _mm_cmpeq_epi16(diff, *sign);
*vce = _mm_and_si128(*vce, *sign);
// Compute !eq:
*eq = _mm_or_si128(diff_zero, *vce);
*eq = _mm_cmpeq_epi16(*eq, zero);
// Compute result:
#ifdef __SSE4_1__
rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, *sign);
return _mm_blendv_epi8(vs, sign_negvt, diff_sel_mask);
#else
diff_lez = _mm_and_si128(*sign, *le);
diff_gez = _mm_andnot_si128(*sign, *ge);
rsp_vec_t diff_sel_mask = _mm_or_si128(diff_lez, diff_gez);
diff_lez = _mm_and_si128(diff_sel_mask, sign_negvt);
diff_gez = _mm_andnot_si128(diff_sel_mask, vs);
return _mm_or_si128(diff_lez, diff_gez);
#endif
}

65
src/emu/cpu/rsp/vcl.h Normal file

@ -0,0 +1,65 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vcl(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign, rsp_vec_t vce)
{
// sign_negvt = sign ? -vt : vt
rsp_vec_t sign_negvt = _mm_xor_si128(vt, sign);
sign_negvt = _mm_sub_epi16(sign_negvt, sign);
// Compute diff, diff_zero, ncarry, and nvce:
// Note: diff = sign ? (vs + vt) : (vs - vt).
rsp_vec_t diff = _mm_sub_epi16(vs, sign_negvt);
rsp_vec_t ncarry = _mm_adds_epu16(vs, vt);
ncarry = _mm_cmpeq_epi16(diff, ncarry);
rsp_vec_t nvce = _mm_cmpeq_epi16(vce, zero);
rsp_vec_t diff_zero = _mm_cmpeq_epi16(diff, zero);
// Compute results for if (sign && ne):
rsp_vec_t le_case1 = _mm_and_si128(diff_zero, ncarry);
le_case1 = _mm_and_si128(nvce, le_case1);
rsp_vec_t le_case2 = _mm_or_si128(diff_zero, ncarry);
le_case2 = _mm_and_si128(vce, le_case2);
rsp_vec_t le_eq = _mm_or_si128(le_case1, le_case2);
// Compute results for if (!sign && ne):
rsp_vec_t ge_eq = _mm_subs_epu16(vt, vs);
ge_eq = _mm_cmpeq_epi16(ge_eq, zero);
// Blend everything together. Caveat: we don't update
// the results of ge/le if ne is false, so be careful.
rsp_vec_t do_le = _mm_andnot_si128(eq, sign);
#ifdef __SSE4_1__
*le = _mm_blendv_epi8(*le, le_eq, do_le);
#else
le_eq = _mm_and_si128(do_le, le_eq);
*le = _mm_andnot_si128(do_le, *le);
*le = _mm_or_si128(le_eq, *le);
#endif
rsp_vec_t do_ge = _mm_or_si128(sign, eq);
#ifdef __SSE4_1__
*ge = _mm_blendv_epi8(ge_eq, *ge, do_ge);
#else
*ge = _mm_and_si128(do_ge, *ge);
ge_eq = _mm_andnot_si128(do_ge, ge_eq);
*ge = _mm_or_si128(ge_eq, *ge);
#endif
// Mux the result based on the value of sign.
#ifdef __SSE4_1__
rsp_vec_t mux_mask = _mm_blendv_epi8(*ge, *le, sign);
#else
do_le = _mm_and_si128(sign, *le);
do_ge = _mm_andnot_si128(sign, *ge);
rsp_vec_t mux_mask = _mm_or_si128(do_le, do_ge);
#endif
#ifdef __SSE4_1__
return _mm_blendv_epi8(vs, sign_negvt, mux_mask);
#else
sign_negvt = _mm_and_si128(mux_mask, sign_negvt);
vs = _mm_andnot_si128(mux_mask, vs);
return _mm_or_si128(sign_negvt, vs);
#endif
}

49
src/emu/cpu/rsp/vcmp.h Normal file

@ -0,0 +1,49 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_veq_vge_vlt_vne(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *le, rsp_vec_t eq, rsp_vec_t sign)
{
rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt);
if (iw & 0x2) // VNE & VGE
{
if (iw & 0x1) // VGE
{
rsp_vec_t gt = _mm_cmpgt_epi16(vs, vt);
rsp_vec_t equalsign = _mm_and_si128(eq, sign);
equal = _mm_andnot_si128(equalsign, equal);
*le = _mm_or_si128(gt, equal);
}
else // VNE
{
rsp_vec_t nequal = _mm_cmpeq_epi16(equal, zero);
*le = _mm_and_si128(eq, equal);
*le = _mm_or_si128(*le, nequal);
}
}
else // VEQ & VLT
{
if (iw & 0x1) // VEQ
{
*le = _mm_andnot_si128(eq, equal);
}
else // VLT
{
rsp_vec_t lt = _mm_cmplt_epi16(vs, vt);
equal = _mm_and_si128(eq, equal);
equal = _mm_and_si128(sign, equal);
*le = _mm_or_si128(lt, equal);
}
}
#ifdef __SSE4_1__
return _mm_blendv_epi8(vt, vs, *le);
#else
vs = _mm_and_si128(*le, vs);
vt = _mm_andnot_si128(*le, vt);
return _mm_or_si128(vs, vt);
#endif
}

35
src/emu/cpu/rsp/vcr.h Normal file

@ -0,0 +1,35 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vcr(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *ge, rsp_vec_t *le) {
// sign = (vs ^ vt) < 0
rsp_vec_t sign = _mm_xor_si128(vs, vt);
sign = _mm_srai_epi16(sign, 15);
// Compute le
rsp_vec_t diff_lez = _mm_and_si128(vs, sign);
diff_lez = _mm_add_epi16(diff_lez, vt);
*le = _mm_srai_epi16(diff_lez, 15);
// Compute ge
rsp_vec_t diff_gez = _mm_or_si128(vs, sign);
diff_gez = _mm_min_epi16(diff_gez, vt);
*ge = _mm_cmpeq_epi16(diff_gez, vt);
// sign_notvt = sn ? ~vt : vt
rsp_vec_t sign_notvt = _mm_xor_si128(vt, sign);
// Compute result:
#ifdef __SSE4_1__
rsp_vec_t diff_sel_mask = _mm_blendv_epi8(*ge, *le, sign);
return _mm_blendv_epi8(vs, sign_notvt, diff_sel_mask);
#else
rsp_vec_t diff_sel_mask = _mm_sub_epi16(*le, *ge);
diff_sel_mask = _mm_and_si128(diff_sel_mask, sign);
diff_sel_mask = _mm_add_epi16(diff_sel_mask, *ge);
zero = _mm_sub_epi16(sign_notvt, vs);
zero = _mm_and_si128(zero, diff_sel_mask);
return _mm_add_epi16(zero, vs);
#endif
}

12
src/emu/cpu/rsp/vdivh.h Normal file

@ -0,0 +1,12 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vdivh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
m_div_in = m_v[src].s[e & 0x7];
// Write out the upper part of the result.
m_v[dest].s[de & 0x7] = m_div_out;
return vec_load_unshuffled_operand(m_v[dest].s);
}

57
src/emu/cpu/rsp/vmac.h Normal file

@ -0,0 +1,57 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmacf_vmacu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_mid, rsp_vec_t *acc_hi)
{
// Get the product and shift it over
// being sure to save the carries.
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
rsp_vec_t mid = _mm_slli_epi16(hi, 1);
rsp_vec_t carry = _mm_srli_epi16(lo, 15);
hi = _mm_srai_epi16(hi, 15);
mid = _mm_or_si128(mid, carry);
lo = _mm_slli_epi16(lo, 1);
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Add in the carry. If the middle portion is
// already 0xFFFF and we have a carry, we have
// to carry it all the way up to hi.
mid = _mm_sub_epi16(mid, overflow_mask);
carry = _mm_cmpeq_epi16(mid, zero);
carry = _mm_and_si128(carry, overflow_mask);
hi = _mm_sub_epi16(hi, carry);
// Accumulate the middle portion.
overflow_mask = _mm_adds_epu16(*acc_mid, mid);
*acc_mid = _mm_add_epi16(*acc_mid, mid);
overflow_mask = _mm_cmpeq_epi16(*acc_mid, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, hi);
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
if (iw & 0x1) // VMACU
{
rsp_vec_t overflow_hi_mask = _mm_srai_epi16(*acc_hi, 15);
rsp_vec_t overflow_mid_mask = _mm_srai_epi16(*acc_mid, 15);
mid = _mm_or_si128(overflow_mid_mask, *acc_mid);
overflow_mask = _mm_cmpgt_epi16(*acc_hi, zero);
mid = _mm_andnot_si128(overflow_hi_mask, mid);
return _mm_or_si128(overflow_mask, mid);
}
else // VMACF
{
return sclamp_acc_to_mid(*acc_mid, *acc_hi);
}
}
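The overflow_mask sequence above is the recurring SSE2 idiom in these helpers for detecting an unsigned 16-bit carry: the saturating and wrapping sums differ exactly in the lanes that overflowed. A scalar sketch of the same test (illustrative only; the vector code keeps the mask as 0x0000/0xffff and subtracts it to add 1 into the next accumulator slice):

static int lane_carried(UINT16 acc, UINT16 addend)
{
	UINT32 full = (UINT32) acc + addend;
	UINT16 wrapped = (UINT16) full;                                // _mm_add_epi16
	UINT16 saturated = (full > 0xffff) ? 0xffff : (UINT16) full;   // _mm_adds_epu16
	return wrapped != saturated;                                   // the two _mm_cmpeq_epi16 steps
}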

9
src/emu/cpu/rsp/vmov.h Normal file

@ -0,0 +1,9 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vmov(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT and write out the upper part of the result.
m_v[dest].s[de & 0x7] = m_v[src].s[e & 0x7];
return vec_load_unshuffled_operand(m_v[dest].s);
}

13
src/emu/cpu/rsp/vmrg.h Normal file

@ -0,0 +1,13 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmrg(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t le)
{
#ifdef __SSE4_1__
return _mm_blendv_epi8(vt, vs, le);
#else
vs = _mm_and_si128(le, vs);
vt = _mm_andnot_si128(le, vt);
return _mm_or_si128(vs, vt);
#endif
}
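The SSE2 fallback above (and the same pattern in clamp.h, vch.h, vcl.h and vcmp.h) emulates the SSE4.1 byte blend for masks whose lanes are all-zeros or all-ones: _mm_blendv_epi8(a, b, mask) == (b & mask) | (a & ~mask) in that case. As a generic helper it would look like this (vec_select is an illustrative name, not part of the commit):

static inline rsp_vec_t vec_select(rsp_vec_t a, rsp_vec_t b, rsp_vec_t mask)
{
	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}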

11
src/emu/cpu/rsp/vmudh.h Normal file

@ -0,0 +1,11 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t rsp_vmudh(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
*acc_md = _mm_mullo_epi16(vs, vt);
*acc_hi = _mm_mulhi_epi16(vs, vt);
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}

39
src/emu/cpu/rsp/vmul.h Normal file

@ -0,0 +1,39 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
//
// TODO: CHECK ME.
//
static inline rsp_vec_t vec_vmulf_vmulu(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t round = _mm_cmpeq_epi16(zero, zero);
rsp_vec_t sign1 = _mm_srli_epi16(lo, 15);
lo = _mm_add_epi16(lo, lo);
round = _mm_slli_epi16(round, 15);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
rsp_vec_t sign2 = _mm_srli_epi16(lo, 15);
*acc_lo = _mm_add_epi16(round, lo);
sign1 = _mm_add_epi16(sign1, sign2);
hi = _mm_slli_epi16(hi, 1);
rsp_vec_t eq = _mm_cmpeq_epi16(vs, vt);
rsp_vec_t neq = eq;
*acc_md = _mm_add_epi16(hi, sign1);
rsp_vec_t neg = _mm_srai_epi16(*acc_md, 15);
if (iw & 0x1) // VMULU
{
*acc_hi = _mm_andnot_si128(eq, neg);
hi =_mm_or_si128(*acc_md, neg);
return _mm_andnot_si128(*acc_hi, hi);
}
else // VMULF
{
eq = _mm_and_si128(eq, neg);
*acc_hi = _mm_andnot_si128(neq, neg);
return _mm_add_epi16(*acc_md, eq);
}
}
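Given the TODO above, a scalar reference for the VMULF half may help when checking it; this assumes the usual definition (48-bit accumulator = s1 * s2 * 2 + 0x8000, result = signed 16-bit clamp of the accumulator's middle slice) rather than anything taken from this commit:

static UINT16 vmulf_lane(INT16 s1, INT16 s2)
{
	INT64 acc = (INT64) s1 * s2 * 2 + 0x8000;     // 0x8000 is the rounding bias
	INT32 mid = (INT32) (acc >> 16);              // acc_hi:acc_md slice
	if (mid > 32767) return 0x7fff;               // only hit for s1 = s2 = -32768
	if (mid < -32768) return 0x8000;
	return (UINT16) mid;
}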

31
src/emu/cpu/rsp/vmulh.h Normal file

@ -0,0 +1,31 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadh_vmudh(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epi16(vs, vt);
if (iw & 0x8) // VMADH
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_md, lo);
*acc_md = _mm_add_epi16(*acc_md, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
hi = _mm_sub_epi16(hi, overflow_mask);
*acc_hi = _mm_add_epi16(*acc_hi, hi);
}
else // VMUDH
{
*acc_lo = zero;
*acc_md = lo;
*acc_hi = hi;
}
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}

44
src/emu/cpu/rsp/vmull.h Normal file

@ -0,0 +1,44 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadl_vmudl(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
if (iw & 0x8) // VMADL
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, hi);
*acc_lo = _mm_add_epi16(*acc_lo, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
hi = _mm_sub_epi16(zero, overflow_mask);
// Check for overflow of the upper sum.
//
// TODO: Since hi can only be {0,1}, we should
// be able to generalize this for performance.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
// Since the product was unsigned, only worry about
// positive overflow (i.e.: borrowing not possible).
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
}
else // VMUDL
{
*acc_lo = hi;
*acc_md = zero;
*acc_hi = zero;
return hi;
}
}

56
src/emu/cpu/rsp/vmulm.h Normal file

@ -0,0 +1,56 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadm_vmudm(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is signed vs * unsigned vt.
// However, we have no such instructions to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vs was negative, take the upper 16-bits of the product
// and subtract vt.
rsp_vec_t sign = _mm_srai_epi16(vs, 15);
vt = _mm_and_si128(vt, sign);
hi = _mm_sub_epi16(hi, vt);
if (iw & 0x8) // VMADM
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return sclamp_acc_to_mid(*acc_md, *acc_hi);
}
else // VMUDM
{
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return hi;
}
}
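The fix-up used here (and with the roles of vs and vt swapped in vmuln.h below) rests on a simple identity: for a 16-bit lane, signed(x) = unsigned(x) - 0x10000 when x is negative, so only the upper 16 bits of the unsigned product need a correction. A scalar check (illustrative only):

static UINT16 mixed_mul_hi(INT16 vs, UINT16 vt)
{
	UINT32 uhi = ((UINT32) (UINT16) vs * vt) >> 16;   // _mm_mulhi_epu16
	if (vs < 0)
		uhi -= vt;                                    // subtract vt from the high half only
	return (UINT16) uhi;                              // equals the upper 16 bits of (INT32) vs * vt
}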

55
src/emu/cpu/rsp/vmuln.h Normal file

@ -0,0 +1,55 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vmadn_vmudn(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *acc_lo, rsp_vec_t *acc_md, rsp_vec_t *acc_hi)
{
rsp_vec_t lo = _mm_mullo_epi16(vs, vt);
rsp_vec_t hi = _mm_mulhi_epu16(vs, vt);
// What we really want to do is unsigned vs * signed vt.
// However, we have no such instructions to do so.
//
// There's a trick to "fix" an unsigned product, though:
// If vt was negative, take the upper 16-bits of the product
// and subtract vs.
rsp_vec_t sign = _mm_srai_epi16(vt, 15);
vs = _mm_and_si128(vs, sign);
hi = _mm_sub_epi16(hi, vs);
if (iw & 0x8) // VMADN
{
// Tricky part: start accumulating everything.
// Get/keep the carry as we'll add it in later.
rsp_vec_t overflow_mask = _mm_adds_epu16(*acc_lo, lo);
*acc_lo = _mm_add_epi16(*acc_lo, lo);
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// This is REALLY clever. Since the product results from
// two 16-bit components, one positive and one negative,
// we don't have to worry about carrying the 1 (we can
// only borrow) past 32-bits. So we can just add it here.
hi = _mm_sub_epi16(hi, overflow_mask);
// Check for overflow of the upper sum.
overflow_mask = _mm_adds_epu16(*acc_md, hi);
*acc_md = _mm_add_epi16(*acc_md, hi);
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
// Finish up the accumulation of the... accumulator.
*acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
return uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
}
else // VMUDN
{
*acc_lo = lo;
*acc_md = hi;
*acc_hi = _mm_srai_epi16(hi, 15);
return lo;
}
}

10
src/emu/cpu/rsp/vor.h Normal file

@ -0,0 +1,10 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vor_vnor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt)
{
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_or_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}

60
src/emu/cpu/rsp/vrcpsq.h Normal file

@ -0,0 +1,60 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vrcp_vrsq(UINT32 iw, INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
INT16 vt = m_v[src].s[e & 0x7];
UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
UINT32 sp_input = vt;
INT32 input = (dp) ? dp_input : sp_input;
INT32 input_mask = input >> 31;
INT32 data = input ^ input_mask;
if (input > -32768)
{
data -= input_mask;
}
// Handle edge cases.
INT32 result;
if (data == 0)
{
result = 0x7fffffff;
}
else if (input == -32768)
{
result = 0xffff0000;
}
else // Main case: compute the reciprocal.
{
UINT32 shift = count_leading_zeros(data);
UINT32 idx = (((UINT64) data << shift) & 0x7FC00000) >> 22;
if (iw & 0x4) // VRSQ
{
idx = (idx | 0x200) & 0x3FE | (shift % 2);
result = rsp_divtable[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
}
else // VRCP
{
result = rsp_divtable[idx];
result = ((0x10000 | result) << 14) >> (31 - shift);
}
result = result ^ input_mask;
}
// Write out the results.
m_div_out = result >> 16;
m_v[dest].s[de & 0x7] = result;
return vec_load_unshuffled_operand(m_v[dest].s);
}

65
src/emu/cpu/rsp/vrsq.h Normal file

@ -0,0 +1,65 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
rsp_vec_t vec_vrsq(INT32 dp, UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
// Get the element from VT.
INT16 vt = m_v[src].s[e & 0x7];
UINT32 dp_input = ((UINT32) m_div_in << 16) | (UINT16) vt;
UINT32 sp_input = vt;
INT32 input = (dp) ? dp_input : sp_input;
INT32 input_mask = input >> 31;
INT32 data = input ^ input_mask;
if (input > -32768)
{
data -= input_mask;
}
// Handle edge cases.
INT32 result;
if (data == 0)
{
result = 0x7fffffff;
}
else if (input == -32768)
{
result = 0xffff0000U;
}
else // Main case: compute the reciprocal.
{
UINT32 shift = count_leading_zeros(data);
UINT32 idx = (((UINT64) data << shift) & 0x7FC00000U) >> 22;
idx = (idx | 0x200) & 0x3FE | (shift % 2);
result = rsp_reciprocal_rom[idx];
result = ((0x10000 | result) << 14) >> ((31 - shift) >> 1);
result = result ^ input_mask;
}
// Write out the results.
m_div_out = result >> 16;
m_v[dest].s[de & 0x7] = result;
return vec_load_unshuffled_operand(m_v[dest].s);
}
rsp_vec_t vec_vrsqh(UINT32 src, UINT32 e, UINT32 dest, UINT32 de)
{
INT16 elements[8];
// Get the element from VT.
memcpy(elements, &m_v[src], sizeof(rsp_vec_t));
m_div_in = elements[e];
// Write out the upper part of the result.
rsp_vec_t vd_mask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.vrsq_mask_table[de]);
rsp_vec_t vd = _mm_load_si128((rsp_vec_t *) &m_v[dest]);
vd = _mm_andnot_si128(vd_mask, vd);
rsp_vec_t b_result = _mm_set1_epi16(m_div_out);
b_result = _mm_and_si128(vd_mask, b_result);
return _mm_or_si128(b_result, vd);
}

17
src/emu/cpu/rsp/vsub.h Normal file

@ -0,0 +1,17 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t rsp_vsub(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t carry, rsp_vec_t *acc_lo)
{
// acc_lo uses unsaturated arithmetic.
rsp_vec_t unsat_diff = _mm_sub_epi16(vt, carry);
rsp_vec_t sat_diff = _mm_subs_epi16(vt, carry);
*acc_lo = _mm_sub_epi16(vs, unsat_diff);
rsp_vec_t vd = _mm_subs_epi16(vs, sat_diff);
// VD is the signed diff of the two sources and the carry. Since we
// have to saturate the diff of all three, we have to be clever.
rsp_vec_t overflow = _mm_cmpgt_epi16(sat_diff, unsat_diff);
return _mm_adds_epi16(vd, overflow);
}

14
src/emu/cpu/rsp/vsubc.h Normal file

@ -0,0 +1,14 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vsubc(rsp_vec_t vs, rsp_vec_t vt, rsp_vec_t zero, rsp_vec_t *eq, rsp_vec_t *sn)
{
rsp_vec_t sat_udiff = _mm_subs_epu16(vs, vt);
rsp_vec_t equal = _mm_cmpeq_epi16(vs, vt);
rsp_vec_t sat_udiff_zero = _mm_cmpeq_epi16(sat_udiff, zero);
*eq = _mm_cmpeq_epi16(equal, zero);
*sn = _mm_andnot_si128(equal, sat_udiff_zero);
return _mm_sub_epi16(vs, vt);
}

10
src/emu/cpu/rsp/vxor.h Normal file

@ -0,0 +1,10 @@
// license:BSD-3-Clause
// copyright-holders:Tyler J. Stachecki,Ryan Holtz
static inline rsp_vec_t vec_vxor_vnxor(UINT32 iw, rsp_vec_t vs, rsp_vec_t vt)
{
rsp_vec_t vmask = _mm_load_si128((rsp_vec_t *) m_vec_helpers.logic_mask[iw & 0x1]);
rsp_vec_t vd = _mm_xor_si128(vs, vt);
return _mm_xor_si128(vd, vmask);
}