diff --git a/src/emu/cpu/tms32082/dis32082.c b/src/emu/cpu/tms32082/dis32082.c
index 0eec74e684c..9ef481d9f07 100644
--- a/src/emu/cpu/tms32082/dis32082.c
+++ b/src/emu/cpu/tms32082/dis32082.c
@@ -38,7 +38,7 @@ static const char *FLOATOP_PRECISION[4] =
 
 static const char *ACC_SEL[4] =
 {
-	"a0", "a1", "a2", "a3"
+	"A0", "A1", "A2", "A3"		// accumulator names for the disassembler; presumably indexed by accumulator number - confirm at the use site
 };
 
 static char *output;
@@ -94,7 +94,7 @@ static char* get_creg_name(UINT32 reg)
 		case 0x4000:    sprintf(buffer, "IN0P"); break;
 		case 0x4001:    sprintf(buffer, "IN1P"); break;
 		case 0x4002:    sprintf(buffer, "OUTP"); break;
-		default:        sprintf(buffer, "CR %04X\n", reg);
+		default:        sprintf(buffer, "CR %04X", reg);
 	}
 
 	return buffer;
@@ -171,9 +171,9 @@ static char* format_vector_op(UINT32 op, UINT32 imm32)
 
 	// align the line end
 	int len = strlen(buffer);
-	if (len < 27)
+	if (len < 29)
 	{
-		for (int i=0; i < (27-len); i++)
+		for (int i=0; i < (29-len); i++)
 		{
 			b += sprintf(b, " ");
 		}
diff --git a/src/emu/cpu/tms32082/mp_ops.c b/src/emu/cpu/tms32082/mp_ops.c
index bd315d66553..4a12725953e 100644
--- a/src/emu/cpu/tms32082/mp_ops.c
+++ b/src/emu/cpu/tms32082/mp_ops.c
@@ -17,6 +17,7 @@
 #define OP_PD() ((m_ir >> 9) & 0x3)
 #define OP_P1() ((m_ir >> 5) & 0x3)
 #define OP_P2() ((m_ir >> 7) & 0x3)
+#define OP_ACC() (((m_ir >> 15) & 0x2) | ((m_ir >> 11) & 1))	// 2-bit accumulator select: IR bit 16 -> bit 1, IR bit 11 -> bit 0
 
 #define ROTATE_L(x, r) ((x << r) | (x >> (32-r)))
 #define ROTATE_R(x, r) ((x >> r) | (x << (32-r)))
@@ -130,6 +131,57 @@ UINT32 tms32082_mp_device::calculate_cmp(UINT32 src1, UINT32 src2)
 	return flags;
 }
 
+void tms32082_mp_device::vector_loadstore()
+{
+	// Load/store half of a vector instruction. The operation selector is IR
+	// bits 10:9 (upper two bits) joined with IR bit 6 (lowest bit). Every
+	// transfer advances the pointer register it used by the access size.
+	const int dest = OP_RD();
+	const int pair = dest >> 1;		// 64-bit register pair that overlaps dest
+	const int selector = ((m_ir >> 8) & 0x6) | ((m_ir >> 6) & 0x1);
+
+	switch (selector)
+	{
+		// stores go through OUTP
+		case 0x01:			// vst.s
+			m_program->write_dword(m_outp, m_reg[dest]);
+			m_outp += 4;
+			break;
+
+		case 0x03:			// vst.d
+			m_program->write_qword(m_outp, m_fpair[pair]);
+			m_outp += 8;
+			break;
+
+		// 32-bit loads through IN0P / IN1P
+		case 0x04:			// vld0.s
+			m_reg[dest] = m_program->read_dword(m_in0p);
+			m_in0p += 4;
+			break;
+
+		case 0x05:			// vld1.s
+			m_reg[dest] = m_program->read_dword(m_in1p);
+			m_in1p += 4;
+			break;
+
+		// 64-bit loads through IN0P / IN1P
+		case 0x06:			// vld0.d
+			m_fpair[pair] = m_program->read_qword(m_in0p);
+			m_in0p += 8;
+			break;
+
+		case 0x07:			// vld1.d
+			m_fpair[pair] = m_program->read_qword(m_in1p);
+			m_in1p += 8;
+			break;
+
+		// every other selector value is reported through fatalerror()
+		default:
+			fatalerror("vector_loadstore(): ls bits = %02X\n", selector);
+			break;
+	}
+}
+
 void tms32082_mp_device::execute_short_imm()
 {
 	switch ((m_ir >> 15) & 0x7f)
@@ -137,7 +189,7 @@ void tms32082_mp_device::execute_short_imm()
 		case 0x04:          // rdcr
 		{
 			int rd = OP_RD();
-			INT32 imm = OP_SIMM15();
+			UINT32 imm = OP_UIMM15();
 
 			UINT32 r = read_creg(imm);
 
@@ -150,7 +202,7 @@ void tms32082_mp_device::execute_short_imm()
 		{
 			int rd = OP_RD();
 			int rs = OP_RS();
-			INT32 imm = OP_SIMM15();
+			UINT32 imm = OP_UIMM15();
 
 			UINT32 r = read_creg(imm);
 			if (rd)
@@ -998,7 +1050,188 @@ void tms32082_mp_device::execute_reg_long_imm()
 			break;
 		}
 
-		case 0xe4:			// fmpy
+		case 0xc4:
+		case 0xd4:
+		case 0xc5:
+		case 0xd5:			// vmpy
+		{
+			int p1 = m_ir & (1 << 5);		// nonzero: src1 is read as a double-precision register pair
+			int pd = m_ir & (1 << 7);		// nonzero: the product is stored as a double-precision pair
+			int ls_bit1 = m_ir & (1 << 10);
+			int ls_bit2 = m_ir & (1 << 6);
+			int rd = OP_RS();				// multiply in place: the RS field is both second operand and destination
+			int src1 = OP_SRC1();
+
+			double source = has_imm ? (double)u2f(imm32) : (p1 ? u2d(m_fpair[src1 >> 1]) : (double)u2f(m_reg[src1]));
+
+			if (rd)		// register field 0 is never written
+			{
+				if (pd)
+				{
+					double res = source * u2d(m_fpair[rd >> 1]);
+					m_fpair[rd >> 1] = d2u(res);
+				}
+				else
+				{
+					float res = (float)(source) * u2f(m_reg[rd]);
+					m_reg[rd] = f2u(res);
+				}
+			}
+
+			// parallel load/store op (present when either IR bit 10 or IR bit 6 is set)
+			if (!(ls_bit1 == 0 && ls_bit2 == 0))
+			{
+				vector_loadstore();
+			}
+			break;
+		}
+
+		case 0xcc:
+		case 0xdc:
+		case 0xcd:
+		case 0xdd:			// vmac
+		{
+			int acc = OP_ACC();
+			int z = m_ir & (1 << 8);		// nonzero: use 0.0 instead of the accumulator's current value
+			int pd = m_ir & (1 << 9);		// nonzero: a register destination is a double-precision pair
+			int ls_bit1 = m_ir & (1 << 10);
+			int ls_bit2 = m_ir & (1 << 6);
+			int rd = OP_RD();
+
+			float src1 = u2f(m_reg[OP_SRC1()]);
+			float src2 = u2f(m_reg[OP_RS()]);
+
+			float res = (src1 * src2) + (z ? 0.0f : (float)m_facc[acc]);	// read the floating-point view, matching the m_facc writes below
+
+			// parallel load/store op
+			if (!(ls_bit1 == 0 && ls_bit2 == 0))
+			{
+				vector_loadstore();
+
+				// if the opcode has load/store, dest is always accumulator
+				m_facc[acc] = (double)res;
+			}
+			else
+			{
+				if (rd)
+				{
+					if (pd)
+						m_fpair[rd >> 1] = d2u(res);
+					else
+						m_reg[rd] = f2u((float)res);
+				}
+				else
+				{
+					// write to accumulator
+					m_facc[acc] = (double)res;
+				}
+			}
+			break;
+		}
+
+		case 0xce:
+		case 0xde:
+		case 0xcf:
+		case 0xdf:			// vmsc
+		{
+			int acc = OP_ACC();
+			int z = m_ir & (1 << 8);		// nonzero: use 0.0 instead of the accumulator's current value
+			int pd = m_ir & (1 << 9);		// nonzero: a register destination is a double-precision pair
+			int ls_bit1 = m_ir & (1 << 10);
+			int ls_bit2 = m_ir & (1 << 6);
+			int rd = OP_RD();
+
+			float src1 = u2f(m_reg[OP_SRC1()]);
+			float src2 = u2f(m_reg[OP_RS()]);
+
+			float res = (z ? 0.0f : (float)m_facc[acc]) - (src1 * src2);	// read the floating-point view, matching the m_facc writes below
+
+			// parallel load/store op
+			if (!(ls_bit1 == 0 && ls_bit2 == 0))
+			{
+				vector_loadstore();
+
+				// if the opcode has load/store, dest is always accumulator
+				m_facc[acc] = (double)res;
+			}
+			else
+			{
+				if (rd)
+				{
+					if (pd)
+						m_fpair[rd >> 1] = d2u(res);
+					else
+						m_reg[rd] = f2u((float)res);
+				}
+				else
+				{
+					// write to accumulator
+					m_facc[acc] = (double)res;
+				}
+			}
+			break;
+		}
+
+		case 0xe2:
+		case 0xe3:			// fsub
+		{
+			const int dst = OP_RD();
+			const int subtrahend_reg = OP_RS();
+			const int minuend_reg = OP_SRC1();
+			const int precision = (m_ir >> 5) & 0x3f;
+
+			// a zero destination field skips the operation entirely
+			if (!dst)
+				break;
+
+			// result = minuend - subtrahend; the minuend comes from src1 (or the
+			// long immediate in the SP cases), the subtrahend always from rs
+			switch (precision)
+			{
+				case 0x00:			// SP - SP -> SP
+				{
+					const float a = u2f(has_imm ? imm32 : m_reg[minuend_reg]);
+					const float b = u2f(m_reg[subtrahend_reg]);
+					m_reg[dst] = f2u(a - b);
+					break;
+				}
+				case 0x10:			// SP - SP -> DP
+				{
+					const float a = u2f(has_imm ? imm32 : m_reg[minuend_reg]);
+					const float b = u2f(m_reg[subtrahend_reg]);
+					m_fpair[dst >> 1] = d2u((double)(a - b));
+					break;
+				}
+				case 0x14:			// SP - DP -> DP
+				{
+					const float a = u2f(has_imm ? imm32 : m_reg[minuend_reg]);
+					const double b = u2d(m_fpair[subtrahend_reg >> 1]);
+					m_fpair[dst >> 1] = d2u((double)(a - b));
+					break;
+				}
+				case 0x11:			// DP - SP -> DP
+				{
+					const double a = u2d(m_fpair[minuend_reg >> 1]);
+					const float b = u2f(m_reg[subtrahend_reg]);
+					m_fpair[dst >> 1] = d2u((double)(a - b));
+					break;
+				}
+				case 0x15:			// DP - DP -> DP
+				{
+					const double a = u2d(m_fpair[minuend_reg >> 1]);
+					const double b = u2d(m_fpair[subtrahend_reg >> 1]);
+					m_fpair[dst >> 1] = d2u((double)(a - b));
+					break;
+				}
+				// every other precision encoding is reported through fatalerror()
+				default:
+					fatalerror("fsub: invalid precision combination %02X\n", precision);
+			}
+			break;
+		}
+
+		case 0xe4:
+		case 0xe5:			// fmpy
 		{
 			int rd = OP_RD();
 			int rs = OP_RS();
@@ -1065,6 +1298,27 @@ void tms32082_mp_device::execute_reg_long_imm()
 			break;
 		}
 
+		case 0xee:
+		case 0xef:			// fsqrt
+		{
+			const int dst = OP_RD();
+			const int src = OP_SRC1();
+			const bool src_is_double = (m_ir & (1 << 5)) != 0;
+			const bool dst_is_double = (m_ir & (1 << 9)) != 0;
+
+			double operand;			// always widened to double before the root is taken
+			if (has_imm)
+				operand = (double)u2f(imm32);
+			else
+				operand = src_is_double ? u2d(m_fpair[src >> 1]) : (double)u2f(m_reg[src]);
+
+			if (dst && dst_is_double)
+				m_fpair[dst >> 1] = d2u(sqrt(operand));
+			else if (dst)
+				m_reg[dst] = f2u((float)sqrt(operand));
+			break;
+		}
+
 		case 0xf2:			// rmo
 		{
 			UINT32 source = m_reg[OP_RS()];
diff --git a/src/emu/cpu/tms32082/tms32082.c b/src/emu/cpu/tms32082/tms32082.c
index 446bca69731..0a76b055db1 100644
--- a/src/emu/cpu/tms32082/tms32082.c
+++ b/src/emu/cpu/tms32082/tms32082.c
@@ -173,6 +173,10 @@ void tms32082_mp_device::device_start()
 	state_add(MP_ACC2, "acc2", m_acc[2]).formatstr("%016X");
 	state_add(MP_ACC3, "acc3", m_acc[3]).formatstr("%016X");
 
+	state_add(MP_IN0P, "in0p", m_in0p).formatstr("%08X");
+	state_add(MP_IN1P, "in1p", m_in1p).formatstr("%08X");
+	state_add(MP_OUTP, "outp", m_outp).formatstr("%08X");
+
 	state_add(STATE_GENPC, "curpc", m_pc).noshow();
 
 	m_param_ram = auto_alloc_array(machine(), UINT32, 0x800);
@@ -216,6 +220,15 @@ UINT32 tms32082_mp_device::read_creg(int reg)
 		case 0xa:           // PPERROR
 			return 0xe0000;
 
+		case 0x4000:		// IN0P
+			return m_in0p;
+
+		case 0x4001:		// IN1P
+			return m_in1p;
+
+		case 0x4002:		// OUTP
+			return m_outp;
+
 		default:
 			printf("read_creg(): %08X\n", reg);
 			break;
@@ -225,7 +238,24 @@ UINT32 tms32082_mp_device::read_creg(int reg)
 
 void tms32082_mp_device::write_creg(int reg, UINT32 data)
 {
-	printf("write_creg(): %08X, %08X\n", reg, data);
+	switch (reg)		// dispatch on the control register number
+	{
+		case 0x4000:		// IN0P: address pointer used and advanced by vld0 loads
+			m_in0p = data;
+			break;
+
+		case 0x4001:		// IN1P: address pointer used and advanced by vld1 loads
+			m_in1p = data;
+			break;
+
+		case 0x4002:		// OUTP: address pointer used and advanced by vst stores
+			m_outp = data;
+			break;
+
+		default:		// any other register: the write is only logged, the value is not stored
+			printf("write_creg(): %08X, %08X\n", reg, data);
+			break;
+	}
 }
 
 UINT32 tms32082_mp_device::fetch()
diff --git a/src/emu/cpu/tms32082/tms32082.h b/src/emu/cpu/tms32082/tms32082.h
index c3b3d1216dd..d8f709d5e84 100644
--- a/src/emu/cpu/tms32082/tms32082.h
+++ b/src/emu/cpu/tms32082/tms32082.h
@@ -48,6 +48,9 @@ public:
 		MP_ACC1,
 		MP_ACC2,
 		MP_ACC3,
+		MP_IN0P,
+		MP_IN1P,
+		MP_OUTP
 	};
 
 	DECLARE_READ32_MEMBER(mp_param_r);
@@ -90,15 +93,22 @@ protected:
 
 	UINT32 m_pc;
 	UINT32 m_fetchpc;
-	//UINT32 m_reg[32];
 	union
 	{
 		UINT32 m_reg[32];
 		UINT64 m_fpair[16];
 	};
-	UINT64 m_acc[4];
+	union
+	{
+		UINT64 m_acc[4];
+		double m_facc[4];
+	};
 	UINT32 m_ir;
 
+	UINT32 m_in0p;
+	UINT32 m_in1p;
+	UINT32 m_outp;
+
 	UINT32 *m_param_ram;
 
 	int m_icount;
@@ -115,6 +125,7 @@ protected:
 	void write_creg(int reg, UINT32 data);
 	bool test_condition(int condition, UINT32 value);
 	UINT32 calculate_cmp(UINT32 src1, UINT32 src2);
+	void vector_loadstore();
 };
 
 extern const device_type TMS32082_MP;