i386: SSE opcode improvements [Samuele Zannoli]

- add opcodes MOVHLPS, MOVLHPS
- safer implementation of PACKUSWB, PACKSSDW, SHUFPS, UNPCKLPS, UNPCKHPS

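The safer implementation is needed in cases where the source and destination registers are the same: the previous code wrote elements of the destination while it was still being read as the source, so later reads saw already-overwritten values. The operands are now copied into temporaries before the destination is written.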
This commit is contained in:
yz70s 2015-01-25 22:07:51 +01:00
parent f2a9d4e90a
commit 35947ff256

View File

@ -2249,26 +2249,30 @@ void i386_device::mmx_packuswb_r64_rm64() // Opcode 0f 67
MMXPROLOG(); MMXPROLOG();
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
MMX_REG ds, sd;
int s,d; int s,d;
s=modrm & 0x7; s=modrm & 0x7;
d=(modrm >> 3) & 0x7; d=(modrm >> 3) & 0x7;
MMX(d).b[0]=SaturatedSignedWordToUnsignedByte(MMX(d).s[0]); ds.q = MMX(d).q;
MMX(d).b[1]=SaturatedSignedWordToUnsignedByte(MMX(d).s[1]); sd.q = MMX(s).q;
MMX(d).b[2]=SaturatedSignedWordToUnsignedByte(MMX(d).s[2]); MMX(d).b[0]=SaturatedSignedWordToUnsignedByte(ds.s[0]);
MMX(d).b[3]=SaturatedSignedWordToUnsignedByte(MMX(d).s[3]); MMX(d).b[1]=SaturatedSignedWordToUnsignedByte(ds.s[1]);
MMX(d).b[4]=SaturatedSignedWordToUnsignedByte(MMX(s).s[0]); MMX(d).b[2]=SaturatedSignedWordToUnsignedByte(ds.s[2]);
MMX(d).b[5]=SaturatedSignedWordToUnsignedByte(MMX(s).s[1]); MMX(d).b[3]=SaturatedSignedWordToUnsignedByte(ds.s[3]);
MMX(d).b[6]=SaturatedSignedWordToUnsignedByte(MMX(s).s[2]); MMX(d).b[4]=SaturatedSignedWordToUnsignedByte(sd.s[0]);
MMX(d).b[7]=SaturatedSignedWordToUnsignedByte(MMX(s).s[3]); MMX(d).b[5]=SaturatedSignedWordToUnsignedByte(sd.s[1]);
MMX(d).b[6]=SaturatedSignedWordToUnsignedByte(sd.s[2]);
MMX(d).b[7]=SaturatedSignedWordToUnsignedByte(sd.s[3]);
} else { } else {
MMX_REG s; MMX_REG s,t;
int d=(modrm >> 3) & 0x7; int d=(modrm >> 3) & 0x7;
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READMMX(ea, s); READMMX(ea, s);
MMX(d).b[0]=SaturatedSignedWordToUnsignedByte(MMX(d).s[0]); t.q = MMX(d).q;
MMX(d).b[1]=SaturatedSignedWordToUnsignedByte(MMX(d).s[1]); MMX(d).b[0]=SaturatedSignedWordToUnsignedByte(t.s[0]);
MMX(d).b[2]=SaturatedSignedWordToUnsignedByte(MMX(d).s[2]); MMX(d).b[1]=SaturatedSignedWordToUnsignedByte(t.s[1]);
MMX(d).b[3]=SaturatedSignedWordToUnsignedByte(MMX(d).s[3]); MMX(d).b[2]=SaturatedSignedWordToUnsignedByte(t.s[2]);
MMX(d).b[3]=SaturatedSignedWordToUnsignedByte(t.s[3]);
MMX(d).b[4]=SaturatedSignedWordToUnsignedByte(s.s[0]); MMX(d).b[4]=SaturatedSignedWordToUnsignedByte(s.s[0]);
MMX(d).b[5]=SaturatedSignedWordToUnsignedByte(s.s[1]); MMX(d).b[5]=SaturatedSignedWordToUnsignedByte(s.s[1]);
MMX(d).b[6]=SaturatedSignedWordToUnsignedByte(s.s[2]); MMX(d).b[6]=SaturatedSignedWordToUnsignedByte(s.s[2]);
@ -2362,21 +2366,30 @@ void i386_device::mmx_packssdw_r64_rm64() // Opcode 0f 6b
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
int s,d; int s,d;
INT32 t1, t2, t3, t4;
s=modrm & 0x7; s=modrm & 0x7;
d=(modrm >> 3) & 0x7; d=(modrm >> 3) & 0x7;
MMX(d).s[0]=SaturatedSignedDwordToSignedWord(MMX(d).i[0]); t1 = MMX(d).i[0];
MMX(d).s[1]=SaturatedSignedDwordToSignedWord(MMX(d).i[1]); t2 = MMX(d).i[1];
MMX(d).s[2]=SaturatedSignedDwordToSignedWord(MMX(s).i[0]); t3 = MMX(s).i[0];
MMX(d).s[3]=SaturatedSignedDwordToSignedWord(MMX(s).i[1]); t4 = MMX(s).i[1];
} else { MMX(d).s[0] = SaturatedSignedDwordToSignedWord(t1);
MMX(d).s[1] = SaturatedSignedDwordToSignedWord(t2);
MMX(d).s[2] = SaturatedSignedDwordToSignedWord(t3);
MMX(d).s[3] = SaturatedSignedDwordToSignedWord(t4);
}
else {
MMX_REG s; MMX_REG s;
INT32 t1, t2;
int d=(modrm >> 3) & 0x7; int d=(modrm >> 3) & 0x7;
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READMMX(ea, s); READMMX(ea, s);
MMX(d).s[0]=SaturatedSignedDwordToSignedWord(MMX(d).i[0]); t1 = MMX(d).i[0];
MMX(d).s[1]=SaturatedSignedDwordToSignedWord(MMX(d).i[1]); t2 = MMX(d).i[1];
MMX(d).s[2]=SaturatedSignedDwordToSignedWord(s.i[0]); MMX(d).s[0] = SaturatedSignedDwordToSignedWord(t1);
MMX(d).s[3]=SaturatedSignedDwordToSignedWord(s.i[1]); MMX(d).s[1] = SaturatedSignedDwordToSignedWord(t2);
MMX(d).s[2] = SaturatedSignedDwordToSignedWord(s.i[0]);
MMX(d).s[3] = SaturatedSignedDwordToSignedWord(s.i[1]);
} }
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
} }
@ -2711,9 +2724,11 @@ void i386_device::sse_movlps_r128_m64() // Opcode 0f 12
{ {
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
// unsupported by cpu // MOVHLPS opcode
XMM((modrm >> 3) & 0x7).q[0] = XMM(modrm & 0x7).q[1];
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
} else { } else {
// MOVLPS opcode
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READXMM_LO64(ea, XMM((modrm >> 3) & 0x7)); READXMM_LO64(ea, XMM((modrm >> 3) & 0x7));
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
@ -2737,9 +2752,11 @@ void i386_device::sse_movhps_r128_m64() // Opcode 0f 16
{ {
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
// unsupported by cpu // MOVLHPS opcode
XMM((modrm >> 3) & 0x7).q[1] = XMM(modrm & 0x7).q[0];
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
} else { } else {
// MOVHPS opcode
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READXMM_HI64(ea, XMM((modrm >> 3) & 0x7)); READXMM_HI64(ea, XMM((modrm >> 3) & 0x7));
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
@ -3367,7 +3384,7 @@ void i386_device::sse_ucomiss_r128_r128m32() // Opcode 0f 2e
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
} }
void i386_device::sse_shufps() // Opcode 0f 67 void i386_device::sse_shufps() // Opcode 0f c6
{ {
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
UINT8 sel = FETCH(); UINT8 sel = FETCH();
@ -3380,20 +3397,24 @@ void i386_device::sse_shufps() // Opcode 0f 67
s=modrm & 0x7; s=modrm & 0x7;
d=(modrm >> 3) & 0x7; d=(modrm >> 3) & 0x7;
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
UINT32 t; UINT32 t1,t2,t3,t4;
t=XMM(d).d[m1]; t1=XMM(d).d[m1];
XMM(d).d[1]=XMM(d).d[m2]; t2=XMM(d).d[m2];
XMM(d).d[0]=t; t3=XMM(s).d[m3];
XMM(d).d[2]=XMM(s).d[m3]; t4=XMM(s).d[m4];
XMM(d).d[3]=XMM(s).d[m4]; XMM(d).d[0]=t1;
XMM(d).d[1]=t2;
XMM(d).d[2]=t3;
XMM(d).d[3]=t4;
} else { } else {
UINT32 t; UINT32 t1,t2;
XMM_REG src; XMM_REG src;
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READXMM(ea, src); READXMM(ea, src);
t=XMM(d).d[m1]; t1=XMM(d).d[m1];
XMM(d).d[1]=XMM(d).d[m2]; t2=XMM(d).d[m2];
XMM(d).d[0]=t; XMM(d).d[0]=t1;
XMM(d).d[1]=t2;
XMM(d).d[2]=src.d[m3]; XMM(d).d[2]=src.d[m3];
XMM(d).d[3]=src.d[m4]; XMM(d).d[3]=src.d[m4];
} }
@ -3404,19 +3425,25 @@ void i386_device::sse_unpcklps_r128_rm128() // Opcode 0f 14
{ {
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
int s,d; int s,d;
UINT32 t1, t2, t3, t4;
s=modrm & 0x7; s=modrm & 0x7;
d=(modrm >> 3) & 0x7; d=(modrm >> 3) & 0x7;
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
XMM(d).d[3]=XMM(s).d[1]; t1 = XMM(s).d[1];
XMM(d).d[2]=XMM(d).d[1]; t2 = XMM(d).d[1];
XMM(d).d[1]=XMM(s).d[0]; t3 = XMM(s).d[0];
//XMM(d).d[0]=XMM(d).d[0]; t4 = XMM(d).d[0];
XMM(d).d[3]=t1;
XMM(d).d[2]=t2;
XMM(d).d[1]=t3;
XMM(d).d[0]=t4;
} else { } else {
XMM_REG src; XMM_REG src;
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READXMM(ea, src); READXMM(ea, src);
t2 = XMM(d).d[1];
XMM(d).d[3]=src.d[1]; XMM(d).d[3]=src.d[1];
XMM(d).d[2]=XMM(d).d[1]; XMM(d).d[2]=t2;
XMM(d).d[1]=src.d[0]; XMM(d).d[1]=src.d[0];
} }
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count
@ -3426,20 +3453,27 @@ void i386_device::sse_unpckhps_r128_rm128() // Opcode 0f 15
{ {
UINT8 modrm = FETCH(); UINT8 modrm = FETCH();
int s,d; int s,d;
UINT32 t1, t2, t3, t4;
s=modrm & 0x7; s=modrm & 0x7;
d=(modrm >> 3) & 0x7; d=(modrm >> 3) & 0x7;
if( modrm >= 0xc0 ) { if( modrm >= 0xc0 ) {
XMM(d).d[0]=XMM(d).d[2]; t1 = XMM(d).d[2];
XMM(d).d[1]=XMM(s).d[2]; t2 = XMM(s).d[2];
XMM(d).d[2]=XMM(d).d[3]; t3 = XMM(d).d[3];
XMM(d).d[3]=XMM(s).d[3]; t4 = XMM(s).d[3];
XMM(d).d[0]=t1;
XMM(d).d[1]=t2;
XMM(d).d[2]=t3;
XMM(d).d[3]=t4;
} else { } else {
XMM_REG src; XMM_REG src;
UINT32 ea = GetEA(modrm, 0); UINT32 ea = GetEA(modrm, 0);
READXMM(ea, src); READXMM(ea, src);
XMM(d).d[0]=XMM(d).d[2]; t1 = XMM(d).d[2];
t3 = XMM(d).d[3];
XMM(d).d[0]=t1;
XMM(d).d[1]=src.d[2]; XMM(d).d[1]=src.d[2];
XMM(d).d[2]=XMM(d).d[3]; XMM(d).d[2]=t3;
XMM(d).d[3]=src.d[3]; XMM(d).d[3]=src.d[3];
} }
CYCLES(1); // TODO: correct cycle count CYCLES(1); // TODO: correct cycle count