diff --git a/src/mame/includes/xbox_nv2a.h b/src/mame/includes/xbox_nv2a.h
index 90c8fff55e7..a64863e603c 100644
--- a/src/mame/includes/xbox_nv2a.h
+++ b/src/mame/includes/xbox_nv2a.h
@@ -11,8 +11,6 @@
 #include "machine/pic8259.h"
 #include "video/poly.h"
 
-#include <mutex>
-
 class vertex_program_disassembler {
 	static char const *const srctypes[];
 	static char const *const scaops[];
@@ -126,7 +124,7 @@ public:
 	} c_constant[192];
 	union temp {
 		float fv[4];
-	} r_temp[32];
+	} r_register[32];
 	// output vertex
 	vertex_nv *output;
 	// instructions
@@ -134,9 +132,9 @@ public:
 		unsigned int i[4];
 		int modified;
 		struct decoded {
-			int SwizzleA[4], SignA, ParameterTypeA, TempIndexA;
-			int SwizzleB[4], SignB, ParameterTypeB, TempIndexB;
-			int SwizzleC[4], SignC, ParameterTypeC, TempIndexC;
+			int SwizzleA[4], NegateA, ParameterTypeA, TempIndexA;
+			int SwizzleB[4], NegateB, ParameterTypeB, TempIndexB;
+			int SwizzleC[4], NegateC, ParameterTypeC, TempIndexC;
 			VectorialOperation VecOperation;
 			ScalarOperation ScaOperation;
 			int OutputWriteMask, MultiplexerControl;
@@ -165,11 +163,165 @@ private:
 	void generate_input(float t[4], int sign, int type, int temp, int swizzle[4]);
 	void compute_vectorial_operation(float t[4], int instruction, float par[3 * 4]);
 	void compute_scalar_operation(float t[4], int instruction, float par[3 * 4]);
+	void assign_output(int index, float t[4], int mask);
+	void assign_register(int index, float t[4], int mask);
+	void assign_constant(int index, float t[4], int mask);
 
 	int ip;
 	int a0x;
 };
 
+struct Combiner {
+	enum class InputRegister {
+		Zero = 0,
+		Color0,
+		Color1,
+		FogColor,
+		PrimaryColor,
+		SecondaryColor,
+		Texture0Color = 8,
+		Texture1Color,
+		Texture2Color,
+		Texture3Color,
+		Spare0,
+		Spare1,
+		SumClamp,
+		EF
+	};
+	enum class MapFunction {
+		UnsignedIdentity = 0,
+		UnsignedInvert,
+		ExpandNormal,
+		ExpandNegate,
+		HalfBiasNormal,
+		HalfBiasNegate,
+		SignedIdentyty,
+		SignedNegate
+	};
+	struct {
+		struct {
+			float A[4]; // 0=R 1=G 2=B 3=A
+			float B[4];
+			float C[4];
+			float D[4];
+			float E[4];
+			float F[4];
+			float G;
+			float EF[4];
+			float sumclamp[4];
+		} variables;
+		struct {
+			float RGBop1[4]; // 0=R 1=G 2=B
+			float RGBop2[4];
+			float RGBop3[4];
+			float Aop1;
+			float Aop2;
+			float Aop3;
+		} functions;
+		struct {
+			float primarycolor[4]; // rw
+			float secondarycolor[4];
+			float texture0color[4];
+			float texture1color[4];
+			float texture2color[4];
+			float texture3color[4];
+			float color0[4];
+			float color1[4];
+			float spare0[4];
+			float spare1[4];
+			float fogcolor[4]; // ro
+			float zero[4];
+		} registers;
+		float output[4];
+	} work[WORK_MAX_THREADS];
+	struct {
+		struct {
+			float constantcolor0[4];
+			float constantcolor1[4];
+			struct {
+				InputRegister A_input;
+				int A_component;
+				MapFunction A_mapping;
+				InputRegister B_input;
+				int B_component;
+				MapFunction B_mapping;
+				InputRegister C_input;
+				int C_component;
+				MapFunction C_mapping;
+				InputRegister D_input;
+				int D_component;
+				MapFunction D_mapping;
+			} mapin_alpha;
+			struct {
+				InputRegister A_input;
+				int A_component;
+				MapFunction A_mapping;
+				InputRegister B_input;
+				int B_component;
+				MapFunction B_mapping;
+				InputRegister C_input;
+				int C_component;
+				MapFunction C_mapping;
+				InputRegister D_input;
+				int D_component;
+				MapFunction D_mapping;
+			} mapin_rgb;
+			struct {
+				InputRegister CD_output;
+				InputRegister AB_output;
+				InputRegister SUM_output;
+				int CD_dotproduct;
+				int AB_dotproduct;
+				int muxsum;
+				int bias;
+				int scale;
+			} mapout_alpha;
+			struct {
+				InputRegister CD_output;
+				InputRegister AB_output;
+				InputRegister SUM_output;
+				int CD_dotproduct;
+				int AB_dotproduct;
+				int muxsum;
+				int bias;
+				int scale;
+			} mapout_rgb;
+		} stage[8];
+		struct {
+			float constantcolor0[4];
+			float constantcolor1[4];
+			int color_sum_clamp;
+			struct {
+				InputRegister G_input;
+				int G_component;
+				MapFunction G_mapping;
+			} mapin_alpha;
+			struct {
+				InputRegister A_input;
+				int A_component;
+				MapFunction A_mapping;
+				InputRegister B_input;
+				int B_component;
+				MapFunction B_mapping;
+				InputRegister C_input;
+				int C_component;
+				MapFunction C_mapping;
+				InputRegister D_input;
+				int D_component;
+				MapFunction D_mapping;
+				InputRegister E_input;
+				int E_component;
+				MapFunction E_mapping;
+				InputRegister F_input;
+				int F_component;
+				MapFunction F_mapping;
+			} mapin_rgb;
+		} final;
+		int stages;
+	} setup;
+	int used;
+};
+
 class nv2a_renderer; // forward declaration
 struct nvidia_object_data
 {
@@ -196,10 +348,10 @@ objects have methods used to do drawing
 most methods set parameters, others actually draw
 */
 
-class nv2a_rasterizer : public poly_manager<double, nvidia_object_data, 13, 8192>
+class nv2a_rasterizer : public poly_manager<double, nvidia_object_data, 26, 8192>
 {
 public:
-	nv2a_rasterizer(running_machine& machine) : poly_manager<double, nvidia_object_data, 13, 8192>(machine)
+	nv2a_rasterizer(running_machine &machine) : poly_manager<double, nvidia_object_data, 26, 8192>(machine)
 	{
 	}
 };
@@ -209,18 +361,32 @@ class nv2a_renderer
 public:
 	enum class VERTEX_PARAMETER {
 		PARAM_COLOR_B = 0,
-		PARAM_COLOR_G = 1,
-		PARAM_COLOR_R = 2,
-		PARAM_COLOR_A = 3,
-		PARAM_TEXTURE0_U = 4,
-		PARAM_TEXTURE0_V = 5,
-		PARAM_TEXTURE1_U = 6,
-		PARAM_TEXTURE1_V = 7,
-		PARAM_TEXTURE2_U = 8,
-		PARAM_TEXTURE2_V = 9,
-		PARAM_TEXTURE3_U = 10,
-		PARAM_TEXTURE3_V = 11,
-		PARAM_Z = 12
+		PARAM_COLOR_G,
+		PARAM_COLOR_R,
+		PARAM_COLOR_A,
+		PARAM_TEXTURE0_S,
+		PARAM_TEXTURE0_T,
+		PARAM_TEXTURE0_R,
+		PARAM_TEXTURE0_Q,
+		PARAM_TEXTURE1_S,
+		PARAM_TEXTURE1_T,
+		PARAM_TEXTURE1_R,
+		PARAM_TEXTURE1_Q,
+		PARAM_TEXTURE2_S,
+		PARAM_TEXTURE2_T,
+		PARAM_TEXTURE2_R,
+		PARAM_TEXTURE2_Q,
+		PARAM_TEXTURE3_S,
+		PARAM_TEXTURE3_T,
+		PARAM_TEXTURE3_R,
+		PARAM_TEXTURE3_Q,
+		PARAM_SECONDARY_COLOR_B,
+		PARAM_SECONDARY_COLOR_G,
+		PARAM_SECONDARY_COLOR_R,
+		PARAM_SECONDARY_COLOR_A,
+		PARAM_Z,
+		PARAM_1W,
+		ALL
 	};
 	enum class NV2A_BEGIN_END {
 		STOP = 0,
@@ -413,6 +579,7 @@ public:
 		objectdata->data = this;
 		combiner.used = 0;
 		primitives_total_count = 0;
+		primitives_batches_count = 0;
 		indexesleft_count = 0;
 		triangles_bfculled = 0;
 		vertex_pipeline = 4;
@@ -440,6 +607,14 @@ public:
 		blend_function_source = NV2A_BLEND_FACTOR::ONE;
 		logical_operation_enabled = false;
 		logical_operation = NV2A_LOGIC_OP::COPY;
+		fog_color = 0;
+		for (int n = 0; n < 4; n++) {
+			texture[n].enabled = 0;
+			texture[n].mode = 0;
+			texture[n].addrmodes = 1;
+			texture[n].addrmodet = 1;
+			texture[n].addrmoder = 1;
+		}
 		for (int n = 0; n < 8; n++)
 			clippingwindows[n].set(0, 0, 640, 480);
 		limits_rendertarget.set(0, 0, 640, 480);
@@ -463,6 +638,7 @@ public:
 		rendertarget = nullptr;
 		depthbuffer = nullptr;
 		displayedtarget = nullptr;
+		old_rendertarget = nullptr;
 		puller_waiting = 0;
 		debug_grab_texttype = -1;
 		debug_grab_textfile = nullptr;
@@ -497,29 +673,29 @@ public:
 	void surface_2d_blit();
 	uint32_t texture_get_texel(int number, int x, int y);
 	uint8_t *read_pixel(int x, int y, int32_t c[4]);
-	void write_pixel(int x, int y, uint32_t color, int depth);
-	void combiner_initialize_registers(uint32_t argb8[6]);
-	void combiner_initialize_stage(int stage_number);
-	void combiner_initialize_final();
-	void combiner_map_input(int stage_number); // map combiner registers to variables A..D
-	void combiner_map_output(int stage_number); // map combiner calculation results to combiner registers
-	void combiner_map_final_input(); // map final combiner registers to variables A..F
-	void combiner_final_output(); // generate final combiner output
-	float combiner_map_input_select(int code, int index); // get component index in register code
-	float *combiner_map_input_select3(int code); // get pointer to register code
-	float *combiner_map_output_select3(int code); // get pointer to register code for output
-	float combiner_map_input_function(int code, float value); // apply input mapping function code to value
-	void combiner_map_input_function3(int code, float *data); // apply input mapping function code to data
-	void combiner_function_AB(float result[4]);
-	void combiner_function_AdotB(float result[4]);
-	void combiner_function_CD(float result[4]);
-	void combiner_function_CdotD(float result[4]);
-	void combiner_function_ABmuxCD(float result[4]);
-	void combiner_function_ABsumCD(float result[4]);
-	void combiner_compute_rgb_outputs(int index);
-	void combiner_compute_a_outputs(int index);
-	void combiner_argb8_float(uint32_t color, float reg[4]);
-	uint32_t combiner_float_argb8(float reg[4]);
+	void write_pixel(int x, int y, uint32_t color, int z);
+	void combiner_initialize_registers(int id, float rgba[6][4]);
+	void combiner_initialize_stage(int id, int stage_number);
+	void combiner_initialize_final(int id);
+	void combiner_map_stage_input(int id, int stage_number); // map combiner registers to variables A..D
+	void combiner_map_stage_output(int id, int stage_number); // map combiner calculation results to combiner registers
+	void combiner_map_final_input(int id); // map final combiner registers to variables A..F
+	void combiner_final_output(int id); // generate final combiner output
+	float combiner_map_input_select(int id, Combiner::InputRegister code, int index); // get component index in register selected as input
+	float *combiner_map_input_select_array(int id, Combiner::InputRegister code); // get pointer to register selected as input
+	float *combiner_map_output_select_array(int id, Combiner::InputRegister code); // get pointer to register selected as output
+	float combiner_map_input_function(Combiner::MapFunction code, float value); // apply input mapping function code to value
+	void combiner_map_input_function_array(Combiner::MapFunction code, float *data); // apply input mapping function code to data
+	void combiner_function_AB(int id, float result[4]); // calculate values for possible outputs
+	void combiner_function_AdotB(int id, float result[4]);
+	void combiner_function_CD(int id, float result[4]);
+	void combiner_function_CdotD(int id, float result[4]);
+	void combiner_function_ABmuxCD(int id, float result[4]);
+	void combiner_function_ABsumCD(int id, float result[4]);
+	void combiner_compute_rgb_outputs(int id, int index);
+	void combiner_compute_alpha_outputs(int id, int index);
+	void combiner_argb8_float(uint32_t color, float reg[4]); // convert from color to float array
+	uint32_t combiner_float_argb8(float reg[4]); // convert from float array to color
 	uint32_t dilate0(uint32_t value, int bits);
 	uint32_t dilate1(uint32_t value, int bits);
 	void computedilated(void);
@@ -542,8 +718,8 @@ public:
 	void convert_vertices(vertex_nv *source, nv2avertex_t *destination, int count);
 	void assemble_primitive(vertex_nv *source, int count);
 	int clip_triangle_w(nv2avertex_t *vi[3], nv2avertex_t *vo);
-	uint32_t render_triangle_clipping(const rectangle &cliprect, int paramcount, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3);
-	uint32_t render_triangle_culling(const rectangle &cliprect, int paramcount, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3);
+	uint32_t render_triangle_clipping(const rectangle &cliprect, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3);
+	uint32_t render_triangle_culling(const rectangle &cliprect, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3);
 	void clear_render_target(int what, uint32_t value);
 	void clear_depth_buffer(int what, uint32_t value);
 	inline uint8_t *direct_access_ptr(offs_t address);
@@ -590,21 +766,20 @@ public:
 	uint32_t *rendertarget;
 	uint32_t *depthbuffer;
 	uint32_t *displayedtarget;
+	uint32_t *old_rendertarget;
 	struct {
 		uint32_t address[16];
 		int type[16];
 		int stride[16];
-		NV2A_VTXBUF_TYPE kind[16];
-		int size[16];
 		int words[16];
-		int offset[16];
+		int offset[16 + 1];
 		int enabled; // bitmask
 	} vertexbuffer;
 	struct {
 		int enabled;
-		int sizeu;
-		int sizev;
-		int sizew;
+		int sizes;
+		int sizet;
+		int sizer;
 		int dilate;
 		NV2A_TEX_FORMAT format;
 		bool rectangle;
@@ -612,6 +787,7 @@ public:
 		void *buffer;
 		int dma0;
 		int dma1;
+		int mode;
 		int cubic;
 		int noborder;
 		int dims;
@@ -623,11 +799,15 @@ public:
 		int mipmapminlod;
 		int rectheight;
 		int rectwidth;
+		int addrmodes;
+		int addrmodet;
+		int addrmoder;
 	} texture[4];
 	uint32_t triangles_bfculled;
 	NV2A_BEGIN_END primitive_type;
 	uint32_t primitives_count;
 	uint32_t primitives_total_count;
+	uint32_t primitives_batches_count;
 	int indexesleft_count;
 	int indexesleft_first;
 	uint32_t vertex_indexes[1024]; // vertex indices sent by the software to the 3d accelerator
@@ -638,134 +818,32 @@ public:
 	nv2avertex_t vertex_xy[1024+2]; // vertex attributes computed by the 3d accelerator
 	vertex_nv persistvertexattr; // persistent vertex attributes
 	nv2a_rasterizer::render_delegate render_spans_callback;
-
-	struct {
-		float variable_A[4]; // 0=R 1=G 2=B 3=A
-		float variable_B[4];
-		float variable_C[4];
-		float variable_D[4];
-		float variable_E[4];
-		float variable_F[4];
-		float variable_G;
-		float variable_EF[4];
-		float variable_sumclamp[4];
-		float function_RGBop1[4]; // 0=R 1=G 2=B
-		float function_RGBop2[4];
-		float function_RGBop3[4];
-		float function_Aop1;
-		float function_Aop2;
-		float function_Aop3;
-		float register_primarycolor[4]; // rw
-		float register_secondarycolor[4];
-		float register_texture0color[4];
-		float register_texture1color[4];
-		float register_texture2color[4];
-		float register_texture3color[4];
-		float register_color0[4];
-		float register_color1[4];
-		float register_spare0[4];
-		float register_spare1[4];
-		float register_fogcolor[4]; // ro
-		float register_zero[4];
-		float output[4];
-		struct {
-			float register_constantcolor0[4];
-			float register_constantcolor1[4];
-			int mapin_aA_input;
-			int mapin_aA_component;
-			int mapin_aA_mapping;
-			int mapin_aB_input;
-			int mapin_aB_component;
-			int mapin_aB_mapping;
-			int mapin_aC_input;
-			int mapin_aC_component;
-			int mapin_aC_mapping;
-			int mapin_aD_input;
-			int mapin_aD_component;
-			int mapin_aD_mapping;
-			int mapin_rgbA_input;
-			int mapin_rgbA_component;
-			int mapin_rgbA_mapping;
-			int mapin_rgbB_input;
-			int mapin_rgbB_component;
-			int mapin_rgbB_mapping;
-			int mapin_rgbC_input;
-			int mapin_rgbC_component;
-			int mapin_rgbC_mapping;
-			int mapin_rgbD_input;
-			int mapin_rgbD_component;
-			int mapin_rgbD_mapping;
-			int mapout_aCD_output;
-			int mapout_aAB_output;
-			int mapout_aSUM_output;
-			int mapout_aCD_dotproduct;
-			int mapout_aAB_dotproduct;
-			int mapout_a_muxsum;
-			int mapout_a_bias;
-			int mapout_a_scale;
-			int mapout_rgbCD_output;
-			int mapout_rgbAB_output;
-			int mapout_rgbSUM_output;
-			int mapout_rgbCD_dotproduct;
-			int mapout_rgbAB_dotproduct;
-			int mapout_rgb_muxsum;
-			int mapout_rgb_bias;
-			int mapout_rgb_scale;
-		} stage[8];
-		struct {
-			float register_constantcolor0[4];
-			float register_constantcolor1[4];
-			int color_sum_clamp;
-			int mapin_rgbA_input;
-			int mapin_rgbA_component;
-			int mapin_rgbA_mapping;
-			int mapin_rgbB_input;
-			int mapin_rgbB_component;
-			int mapin_rgbB_mapping;
-			int mapin_rgbC_input;
-			int mapin_rgbC_component;
-			int mapin_rgbC_mapping;
-			int mapin_rgbD_input;
-			int mapin_rgbD_component;
-			int mapin_rgbD_mapping;
-			int mapin_rgbE_input;
-			int mapin_rgbE_component;
-			int mapin_rgbE_mapping;
-			int mapin_rgbF_input;
-			int mapin_rgbF_component;
-			int mapin_rgbF_mapping;
-			int mapin_aG_input;
-			int mapin_aG_component;
-			int mapin_aG_mapping;
-		} final;
-		int stages;
-		int used;
-		std::mutex lock;
-	} combiner;
+	Combiner combiner;
 	uint32_t color_mask;
 	bool backface_culling_enabled;
 	NV2A_GL_FRONT_FACE backface_culling_winding;
 	NV2A_GL_CULL_FACE backface_culling_culled;
 	bool alpha_test_enabled;
+	bool depth_test_enabled;
+	bool stencil_test_enabled;
+	bool depth_write_enabled;
+	bool blending_enabled;
+	bool logical_operation_enabled;
 	NV2A_COMPARISON_OP alpha_func;
 	int alpha_reference;
-	bool depth_test_enabled;
 	NV2A_COMPARISON_OP depth_function;
-	bool depth_write_enabled;
-	bool stencil_test_enabled;
 	NV2A_COMPARISON_OP stencil_func;
 	int stencil_ref;
 	int stencil_mask;
 	NV2A_STENCIL_OP stencil_op_fail;
 	NV2A_STENCIL_OP stencil_op_zfail;
 	NV2A_STENCIL_OP stencil_op_zpass;
-	bool blending_enabled;
 	NV2A_BLEND_EQUATION blend_equation;
 	NV2A_BLEND_FACTOR blend_function_source;
 	NV2A_BLEND_FACTOR blend_function_destination;
 	uint32_t blend_color;
-	bool logical_operation_enabled;
 	NV2A_LOGIC_OP logical_operation;
+	uint32_t fog_color;
 	struct {
 		float modelview[4][4];
 		float modelview_inverse[4][4];
@@ -787,13 +865,17 @@ public:
 
 	struct {
 		int format;
-		int pitch_source;
-		int pitch_destination;
-		uint32_t source_address;
-		uint32_t destination_address;
+		uint32_t pitch_source;
+		uint32_t pitch_destination;
+		offs_t source_address;
+		offs_t destination_address;
 		int op;
 		int width;
 		int heigth;
+		uint32_t sourcex;
+		uint32_t sourcey;
+		uint32_t destinationx;
+		uint32_t destinationy;
 	} bitblit;
 	emu_timer *puller_timer;
 	int puller_waiting;
diff --git a/src/mame/video/xbox_nv2a.cpp b/src/mame/video/xbox_nv2a.cpp
index 2d1132b13bd..9de440a9d87 100644
--- a/src/mame/video/xbox_nv2a.cpp
+++ b/src/mame/video/xbox_nv2a.cpp
@@ -5,6 +5,7 @@
 #include "bitmap.h"
 #include "includes/xbox_nv2a.h"
 #include <bitset>
+#include <cfloat>
 
 //#define LOG_NV2A
 #define DEBUG_CHECKS // enable for debugging
@@ -462,21 +463,21 @@ void vertex_program_simulator::decode_instruction(int address)
 	instruction *i;
 
 	i = &op[address];
-	i->d.SignA = i->i[1] & (1 << 8);
+	i->d.NegateA = i->i[1] & (1 << 8);
 	i->d.ParameterTypeA = (i->i[2] >> 26) & 3;
 	i->d.TempIndexA = (i->i[2] >> 28) & 15;
 	i->d.SwizzleA[0] = (i->i[1] >> 6) & 3;
 	i->d.SwizzleA[1] = (i->i[1] >> 4) & 3;
 	i->d.SwizzleA[2] = (i->i[1] >> 2) & 3;
 	i->d.SwizzleA[3] = (i->i[1] >> 0) & 3;
-	i->d.SignB = i->i[2] & (1 << 25);
+	i->d.NegateB = i->i[2] & (1 << 25);
 	i->d.ParameterTypeB = (i->i[2] >> 11) & 3;
 	i->d.TempIndexB = (i->i[2] >> 13) & 15;
 	i->d.SwizzleB[0] = (i->i[2] >> 23) & 3;
 	i->d.SwizzleB[1] = (i->i[2] >> 21) & 3;
 	i->d.SwizzleB[2] = (i->i[2] >> 19) & 3;
 	i->d.SwizzleB[3] = (i->i[2] >> 17) & 3;
-	i->d.SignC = i->i[2] & (1 << 10);
+	i->d.NegateC = i->i[2] & (1 << 10);
 	i->d.ParameterTypeC = (i->i[3] >> 28) & 3;
 	i->d.TempIndexC = ((i->i[2] & 3) << 2) + (i->i[3] >> 30);
 	i->d.SwizzleC[0] = (i->i[2] >> 8) & 3;
@@ -519,6 +520,7 @@ int vertex_program_simulator::step()
 			*pp = '\n';
 			pp++;
 			*pp = 0;
+			printf("%s", disbuffer);
 		}
 	}
 #endif
@@ -528,83 +530,58 @@ int vertex_program_simulator::step()
 	d = &(op[ip].d);
 	// prepare inputs
 	//  input A
-	generate_input(&tmp[0], d->SignA, d->ParameterTypeA, d->TempIndexA, d->SwizzleA);
+	generate_input(&tmp[0], d->NegateA, d->ParameterTypeA, d->TempIndexA, d->SwizzleA);
 	//  input B
-	generate_input(&tmp[4], d->SignB, d->ParameterTypeB, d->TempIndexB, d->SwizzleB);
+	generate_input(&tmp[4], d->NegateB, d->ParameterTypeB, d->TempIndexB, d->SwizzleB);
 	//  input C
-	generate_input(&tmp[8], d->SignC, d->ParameterTypeC, d->TempIndexC, d->SwizzleC);
+	generate_input(&tmp[8], d->NegateC, d->ParameterTypeC, d->TempIndexC, d->SwizzleC);
 	// compute 2 instructions
 	//  vectorial
 	compute_vectorial_operation(tmpv, d->VecOperation, tmp);
 	//  scalar
 	compute_scalar_operation(tmps, d->ScaOperation, tmp);
 	// assign destinations
-	if (d->VecOperation > 0) {
-		if (d->VecOperation == 13)
+	if (d->VecOperation != VecNOP) {
+		if (d->VecOperation == VecARL)
 			//o[4] = 1;
 			a0x = (int)tmpv[0];
 		else {
 			if (d->VecTempWriteMask != 0) { // assign to Rn
 				//o[0] = 1;
-				int wm = d->VecTempWriteMask;
-				for (p1 = 0; p1 < 4; p1++) {
-					if (wm & 8)
-						r_temp[d->VecTempIndex].fv[p1] = tmpv[p1];
-					wm = wm << 1;
-				}
+				assign_register(d->VecTempIndex, tmpv, d->VecTempWriteMask);
 			}
 			if ((d->OutputWriteMask != 0) && (d->MultiplexerControl == 0)) {
 				//o[1] = 1;
 				if (d->OutputSelect) { // assign to output
-					int wm = d->OutputWriteMask;
-					for (p1 = 0; p1 < 4; p1++) {
-						if (wm & 8)
-							output->attribute[d->OutputIndex].fv[p1] = tmpv[p1];
-						wm = wm << 1;
-					}
+					assign_output(d->OutputIndex, tmpv, d->OutputWriteMask);
 					// remeber, output position == r12
 					if (d->OutputIndex == 0)
 						for (p1 = 0; p1 < 4; p1++) {
-							r_temp[12].fv[p1] = output->attribute[d->OutputIndex].fv[p1];
+							r_register[12].fv[p1] = output->attribute[d->OutputIndex].fv[p1];
 						}
 				}
 				else { // assign to constant
-					int wm = d->OutputWriteMask;
-					for (p1 = 0; p1 < 4; p1++) {
-						if (wm & 8)
-							c_constant[d->OutputIndex].fv[p1] = tmpv[p1];
-						wm = wm << 1;
-					}
+					assign_constant(d->OutputIndex, tmpv, d->OutputWriteMask);
 				}
 			}
 		}
 	}
-	if (d->ScaOperation > 0) {
+	if (d->ScaOperation != ScaNOP) {
 		if (d->ScaTempWriteMask != 0) { // assign to Rn
 			//o[3] = 1;
-			if (d->VecOperation > 0)
+			if (d->VecOperation != VecNOP)
 				p2 = 1;
 			else
 				p2 = d->VecTempIndex;
-			int wm = d->ScaTempWriteMask;
-			for (p1 = 0; p1 < 4; p1++) {
-				if (wm & 8)
-					r_temp[p2].fv[p1] = tmps[p1];
-				wm = wm << 1;
-			}
+			assign_register(p2, tmps, d->ScaTempWriteMask);
 		}
 		if ((d->OutputWriteMask != 0) && (d->MultiplexerControl != 0)) { // assign to output
 			//o[2] = 1;
-			int wm = d->OutputWriteMask;
-			for (p1 = 0; p1 < 4; p1++) {
-				if (wm & 8)
-					output->attribute[d->OutputIndex].fv[p1] = tmps[p1];
-				wm = wm << 1;
-			}
+			assign_output(d->OutputIndex, tmps, d->OutputWriteMask);
 			// remeber, output position == r12
 			if (d->OutputIndex == 0) {
 				for (p1 = 0; p1 < 4; p1++) {
-					r_temp[12].fv[p1] = output->attribute[d->OutputIndex].fv[p1];
+					r_register[12].fv[p1] = output->attribute[d->OutputIndex].fv[p1];
 				}
 			}
 		}
@@ -630,26 +607,37 @@ void vertex_program_simulator::jump(int address)
 
 void vertex_program_simulator::process(int address, vertex_nv *in, vertex_nv *out, int count)
 {
-#if 1 // useful while debugging to see what is being executed
+#if 0 // useful while debugging to see what is being executed
 	static int debugvps = 0;
 	if (debugvps) {
+		FILE *f;
 		char *pp;
 		vertex_program_disassembler vdis;
 		char disbuffer[128];
 
-		jump(address);
 		debugvps--;
-		for (int t = 0; t < 128; t++) {
-			pp = disbuffer;
-			while (vdis.disassemble(op[ip + t].i, pp) != 0) {
-				pp = pp + strlen(pp);
-				*pp = '\n';
-				pp++;
-				*pp = 0;
+		if ((f = fopen("vertexshader_debug.txt", "wb")) != nullptr) {
+			jump(address);
+			fprintf(f, "SHADER:\n");
+			for (int t = 0; t < 128; t++) {
+				pp = disbuffer;
+				while (vdis.disassemble(op[ip + t].i, pp) != 0) {
+					pp = pp + strlen(pp);
+					*pp = '\n';
+					pp++;
+					*pp = 0;
+				}
+				fprintf(f, "%08X %08X %08X %s", op[ip + t].i[1], op[ip + t].i[2], op[ip + t].i[3], disbuffer);
+				if (op[ip + t].i[3] & 1)
+					break;
 			}
-			printf("%08X %08X %08X %s", op[ip + t].i[1], op[ip + t].i[2], op[ip + t].i[3], disbuffer);
-			if (op[ip + t].i[3] & 1)
-				break;
+			fprintf(f, "INPUTS:\n");
+			for (int t = 0; t < 16; t++)
+				fprintf(f, "v%d %f %f %f %f\n", t, in->attribute[t].fv[0], in->attribute[t].fv[1], in->attribute[t].fv[2], in->attribute[t].fv[3]);
+			fprintf(f, "CONSTANTS:\n");
+			for (int t = 0; t < 192; t++)
+				fprintf(f, "c[%d] %f %f %f %f\n", t - 96, c_constant[t].fv[0], c_constant[t].fv[1], c_constant[t].fv[2], c_constant[t].fv[3]);
+			fclose(f);
 		}
 	}
 #endif
@@ -679,7 +667,7 @@ void vertex_program_simulator::initialize_outputs()
 
 void vertex_program_simulator::initialize_temps()
 {
-	for (auto & elem : r_temp) {
+	for (auto & elem : r_register) {
 		for (int m = 0; m < 4; m++)
 			elem.fv[m] = 0;
 	}
@@ -700,26 +688,26 @@ void vertex_program_simulator::generate_input(float t[4], int sign, int type, in
 	if (sign)
 		sgn = -1;
 	if (type == 1) {
-		t[0] = sgn*r_temp[temp].fv[swizzle[0]];
-		t[1] = sgn*r_temp[temp].fv[swizzle[1]];
-		t[2] = sgn*r_temp[temp].fv[swizzle[2]];
-		t[3] = sgn*r_temp[temp].fv[swizzle[3]];
+		t[0] = sgn * r_register[temp].fv[swizzle[0]];
+		t[1] = sgn * r_register[temp].fv[swizzle[1]];
+		t[2] = sgn * r_register[temp].fv[swizzle[2]];
+		t[3] = sgn * r_register[temp].fv[swizzle[3]];
 	}
 	else if (type == 2) {
 		int InputIndex = op[ip].d.InputIndex;
-		t[0] = sgn*input->attribute[InputIndex].fv[swizzle[0]];
-		t[1] = sgn*input->attribute[InputIndex].fv[swizzle[1]];
-		t[2] = sgn*input->attribute[InputIndex].fv[swizzle[2]];
-		t[3] = sgn*input->attribute[InputIndex].fv[swizzle[3]];
+		t[0] = sgn * input->attribute[InputIndex].fv[swizzle[0]];
+		t[1] = sgn * input->attribute[InputIndex].fv[swizzle[1]];
+		t[2] = sgn * input->attribute[InputIndex].fv[swizzle[2]];
+		t[3] = sgn * input->attribute[InputIndex].fv[swizzle[3]];
 	}
 	else if (type == 3) {
 		int SourceConstantIndex = op[ip].d.SourceConstantIndex;
 		if (op[ip].d.Usea0x)
 			SourceConstantIndex = SourceConstantIndex + a0x;
-		t[0] = sgn*c_constant[SourceConstantIndex].fv[swizzle[0]];
-		t[1] = sgn*c_constant[SourceConstantIndex].fv[swizzle[1]];
-		t[2] = sgn*c_constant[SourceConstantIndex].fv[swizzle[2]];
-		t[3] = sgn*c_constant[SourceConstantIndex].fv[swizzle[3]];
+		t[0] = sgn * c_constant[SourceConstantIndex].fv[swizzle[0]];
+		t[1] = sgn * c_constant[SourceConstantIndex].fv[swizzle[1]];
+		t[2] = sgn * c_constant[SourceConstantIndex].fv[swizzle[2]];
+		t[3] = sgn * c_constant[SourceConstantIndex].fv[swizzle[3]];
 	}
 }
 
@@ -879,6 +867,33 @@ void vertex_program_simulator::compute_scalar_operation(float t_out[4], int inst
 	}
 }
 
+void vertex_program_simulator::assign_output(int index, float t[4], int mask)
+{
+	for (int p1 = 0; p1 < 4; p1++) {
+		if (mask & 8)
+			output->attribute[index].fv[p1] = t[p1];
+		mask = mask << 1;
+	}
+}
+
+void vertex_program_simulator::assign_register(int index, float t[4], int mask)
+{
+	for (int p1 = 0; p1 < 4; p1++) {
+		if (mask & 8)
+			r_register[index].fv[p1] = t[p1];
+		mask = mask << 1;
+	}
+}
+
+void vertex_program_simulator::assign_constant(int index, float t[4], int mask)
+{
+	for (int p1 = 0; p1 < 4; p1++) {
+		if (mask & 8)
+			c_constant[index].fv[p1] = t[p1];
+		mask = mask << 1;
+	}
+}
+
 /*
  * Graphics
  */
@@ -997,7 +1012,7 @@ void nv2a_renderer::geforce_read_dma_object(uint32_t handle, uint32_t &offset, u
 	offset = dma_frame + dma_adjust;
 }
 
-/*void myline(bitmap_rgb32 &bmp,float x1,float y1,float x2,float y2)
+/*void debug(uint32_t *bmp, int width, int eight, float x1, float y1, float x2, float y2, uint32_t color)
 {
 int xx1,yy1,xx2,yy2;
 
@@ -1011,16 +1026,17 @@ int xx1,yy1,xx2,yy2;
             yy1=yy2;
             yy2=t;
         }
-        for (int y=yy1;y <= yy2;y++)
-            *((uint32_t *)bmp.raw_pixptr(y,xx1))= -1;
-        } else if (yy1 == yy2) {
+        for (int y=yy1;y <= yy2;y++) {
+            *(bmp+y*width+xx1) = color;
+		}
+    } else if (yy1 == yy2) {
         if (xx1 > xx2) {
             int t=xx1;
             xx1=xx2;
             xx2=t;
         }
         for (int x=xx1;x <= xx2;x++)
-            *((uint32_t *)bmp.raw_pixptr(yy1,x))= -1;
+            *(bmp+yy1*width+x) = color;
     }
 }*/
 
@@ -1073,22 +1089,41 @@ uint32_t nv2a_renderer::texture_get_texel(int number, int x, int y)
 	int cr, cg, cb;
 	int sizeu, sizev;
 
-	// force to [0,size-1]
 	if (texture[number].rectangle == false) {
-		sizeu = texture[number].sizeu;
-		sizev = texture[number].sizev;
-		x = (unsigned int)x & (sizeu - 1);
-		y = (unsigned int)y & (sizev - 1);
-	} else
-	{
+		sizeu = texture[number].sizes;
+		sizev = texture[number].sizet;
+	}
+	else {
 		sizeu = texture[number].rectwidth;
 		sizev = texture[number].rectheight;
+	}
+	switch (texture[number].addrmodes) {
+	case 1: // wrap
+	default:
 		x = x % sizeu;
 		if (x < 0)
 			x = sizeu + x;
+		break;
+	case 3: // clamp
+		if (x < 0)
+			x = 0;
+		if (x >= sizeu)
+			x = sizeu - 1;
+		break;
+	}
+	switch (texture[number].addrmodet) {
+	case 1: // wrap
+	default:
 		y = y % sizev;
 		if (y < 0)
 			y = sizev + y;
+		break;
+	case 3: // clamp
+		if (y < 0)
+			y = 0;
+		if (y >= sizev)
+			y = sizev - 1;
+		break;
 	}
 	switch (texture[number].format) {
 	case NV2A_TEX_FORMAT::A8R8G8B8:
@@ -1329,21 +1364,21 @@ inline uint8_t *nv2a_renderer::read_pixel(int x, int y, int32_t c[4])
 	return nullptr;
 }
 
-void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
+void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int z)
 {
 	uint8_t *addr;
 	uint32_t *daddr32;
 	uint16_t *daddr16;
-	uint32_t deptsten;
+	uint32_t depthandstencil;
 	int32_t c[4], fb[4], s[4], d[4], cc[4];
-	uint32_t dep, sten, stenc, stenv;
+	uint32_t depth, stencil, stenc, stenv;
 	uint32_t udepth;
 	bool stencil_passed;
 	bool depth_passed;
 
-	if ((depth > 0xffffff) || (depth < 0) || (x < 0))
+	if ((z > 0xffffff) || (z < 0) || (x < 0))
 		return;
-	udepth = (uint32_t)depth;
+	udepth = (uint32_t)z;
 	fb[3] = fb[2] = fb[1] = fb[0] = 0;
 	addr = nullptr;
 	if (color_mask != 0)
@@ -1357,9 +1392,9 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		}
 #endif
 		daddr32 = depthbuffer + (pitch_depthbuffer / 4) * y + x;
-		deptsten = *daddr32;
-		dep = deptsten >> 8;
-		sten = deptsten & 255;
+		depthandstencil = *daddr32;
+		depth = depthandstencil >> 8;
+		stencil = depthandstencil & 255;
 		daddr16 = nullptr;
 	}
 	else if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z16) {
@@ -1371,16 +1406,16 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		}
 #endif
 		daddr16 = (uint16_t *)depthbuffer + (pitch_depthbuffer / 2) * y + x;
-		deptsten = *daddr16;
-		dep = (deptsten << 8) | 0xff;
-		sten = 0;
+		depthandstencil = *daddr16;
+		depth = (depthandstencil << 8) | 0xff;
+		stencil = 0;
 		daddr32 = nullptr;
 	}
 	else {
 		daddr32 = nullptr;
 		daddr16 = nullptr;
-		dep = 0xffffff;
-		sten = 0;
+		depth = 0xffffff;
+		stencil = 0;
 	}
 	c[3] = color >> 24;
 	c[2] = (color >> 16) & 255;
@@ -1429,7 +1464,7 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 	stencil_passed = true;
 	if (stencil_test_enabled) {
 		stenc=stencil_mask & stencil_ref;
-		stenv=stencil_mask & sten;
+		stenv=stencil_mask & stencil;
 		switch (stencil_func) {
 		case NV2A_COMPARISON_OP::NEVER:
 			stencil_passed = false;
@@ -1465,45 +1500,45 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		if (stencil_passed == false) {
 			switch (stencil_op_fail) {
 			case NV2A_STENCIL_OP::ZEROOP:
-				sten = 0;
+				stencil = 0;
 				break;
 			case NV2A_STENCIL_OP::INVERTOP:
-				sten = sten ^ 255;
+				stencil = stencil ^ 255;
 				break;
 			case NV2A_STENCIL_OP::KEEP:
 			default:
 				break;
 			case NV2A_STENCIL_OP::REPLACE:
-				sten = stencil_ref;
+				stencil = stencil_ref;
 				break;
 			case NV2A_STENCIL_OP::INCR:
-				if (sten < 255)
-					sten++;
+				if (stencil < 255)
+					stencil++;
 				break;
 			case NV2A_STENCIL_OP::DECR:
-				if (sten > 0)
-					sten--;
+				if (stencil > 0)
+					stencil--;
 				break;
 			case NV2A_STENCIL_OP::INCR_WRAP:
-				if (sten < 255)
-					sten++;
+				if (stencil < 255)
+					stencil++;
 				else
-					sten = 0;
+					stencil = 0;
 				break;
 			case NV2A_STENCIL_OP::DECR_WRAP:
-				if (sten > 0)
-					sten--;
+				if (stencil > 0)
+					stencil--;
 				else
-					sten = 255;
+					stencil = 255;
 				break;
 			}
 			if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z24S8) {
-				deptsten = (dep << 8) | sten;
-				*daddr32 = deptsten;
+				depthandstencil = (depth << 8) | stencil;
+				*daddr32 = depthandstencil;
 			}
 			else if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z16) {
-				deptsten = dep >> 8;
-				*daddr16 = (uint16_t)deptsten;
+				depthandstencil = depth >> 8;
+				*daddr16 = (uint16_t)depthandstencil;
 			}
 			return;
 		}
@@ -1516,27 +1551,27 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 				depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::LESS:
-				if (udepth >= dep)
+				if (udepth >= depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::EQUAL:
-				if (udepth != dep)
+				if (udepth != depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::LEQUAL:
-				if (udepth > dep)
+				if (udepth > depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::GREATER:
-				if (udepth <= dep)
+				if (udepth <= depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::NOTEQUAL:
-				if (udepth == dep)
+				if (udepth == depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::GEQUAL:
-				if (udepth < dep)
+				if (udepth < depth)
 					depth_passed = false;
 				break;
 			case NV2A_COMPARISON_OP::ALWAYS:
@@ -1546,80 +1581,80 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		if (depth_passed == false) {
 			switch (stencil_op_zfail) {
 			case NV2A_STENCIL_OP::ZEROOP:
-				sten = 0;
+				stencil = 0;
 				break;
 			case NV2A_STENCIL_OP::INVERTOP:
-				sten = sten ^ 255;
+				stencil = stencil ^ 255;
 				break;
 			case NV2A_STENCIL_OP::KEEP:
 			default:
 				break;
 			case NV2A_STENCIL_OP::REPLACE:
-				sten = stencil_ref;
+				stencil = stencil_ref;
 				break;
 			case NV2A_STENCIL_OP::INCR:
-				if (sten < 255)
-					sten++;
+				if (stencil < 255)
+					stencil++;
 				break;
 			case NV2A_STENCIL_OP::DECR:
-				if (sten > 0)
-					sten--;
+				if (stencil > 0)
+					stencil--;
 				break;
 			case NV2A_STENCIL_OP::INCR_WRAP:
-				if (sten < 255)
-					sten++;
+				if (stencil < 255)
+					stencil++;
 				else
-					sten = 0;
+					stencil = 0;
 				break;
 			case NV2A_STENCIL_OP::DECR_WRAP:
-				if (sten > 0)
-					sten--;
+				if (stencil > 0)
+					stencil--;
 				else
-					sten = 255;
+					stencil = 255;
 				break;
 			}
 			if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z24S8) {
-				deptsten = (dep << 8) | sten;
-				*daddr32 = deptsten;
+				depthandstencil = (depth << 8) | stencil;
+				*daddr32 = depthandstencil;
 			}
 			else if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z16) {
-				deptsten = dep >> 8;
-				*daddr16 = (uint16_t)deptsten;
+				depthandstencil = depth >> 8;
+				*daddr16 = (uint16_t)depthandstencil;
 			}
 			return;
 		}
 		switch (stencil_op_zpass) {
 		case NV2A_STENCIL_OP::ZEROOP:
-			sten = 0;
+			stencil = 0;
 			break;
 		case NV2A_STENCIL_OP::INVERTOP:
-			sten = sten ^ 255;
+			stencil = stencil ^ 255;
 			break;
 		case NV2A_STENCIL_OP::KEEP:
 		default:
 			break;
 		case NV2A_STENCIL_OP::REPLACE:
-			sten = stencil_ref;
+			stencil = stencil_ref;
 			break;
 		case NV2A_STENCIL_OP::INCR:
-			if (sten < 255)
-				sten++;
+			if (stencil < 255)
+				stencil++;
 			break;
 		case NV2A_STENCIL_OP::DECR:
-			if (sten > 0)
-				sten--;
+			if (stencil > 0)
+				stencil--;
 			break;
 		case NV2A_STENCIL_OP::INCR_WRAP:
-			if (sten < 255)
-				sten++;
+			if (stencil < 255)
+				stencil++;
 			else
-				sten = 0;
+				stencil = 0;
 			break;
 		case NV2A_STENCIL_OP::DECR_WRAP:
-			if (sten > 0)
-				sten--;
+			if (stencil > 0)
+				stencil--;
 			else
-				sten = 255;
+				stencil = 255;
 			break;
 		}
 	}
@@ -1913,6 +1948,11 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		ct = ((uint32_t)c[3] << 24) | ((uint32_t)c[2] << 16) | ((uint32_t)c[1] << 8) | (uint32_t)c[0];
 		ft = ((uint32_t)fb[3] << 24) | ((uint32_t)fb[2] << 16) | ((uint32_t)fb[1] << 8) | (uint32_t)fb[0];
 		w = (ft & ~color_mask) | (ct & color_mask);
+
+/* for debugging
+		if (w == 0x94737d7b)
+			x++;
+*/
 		switch (colorformat_rendertarget) {
 		case NV2A_COLOR_FORMAT::R5G6B5:
 			w = ((w >> 8) & 0xf800) + ((w >> 5) & 0x7e0) + ((w >> 3) & 0x1f);
@@ -1933,14 +1973,14 @@ void nv2a_renderer::write_pixel(int x, int y, uint32_t color, int depth)
 		}
 	}
 	if (depth_write_enabled)
-		dep = udepth;
+		depth = udepth;
 	if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z24S8) {
-		deptsten = (dep << 8) | sten;
-		*daddr32 = deptsten;
+		depthandstencil = (depth << 8) | stencil;
+		*daddr32 = depthandstencil;
 	}
 	else if (depthformat_rendertarget == NV2A_RT_DEPTH_FORMAT::Z16) {
-		deptsten = dep >> 8;
-		*daddr16 = (uint16_t)deptsten;
+		depthandstencil = depth >> 8;
+		*daddr16 = (uint16_t)depthandstencil;
 	}
 }
 
@@ -1958,17 +1998,36 @@ void nv2a_renderer::render_color(int32_t scanline, const nv2a_rasterizer::extent
 		x = x - (extent.stopx - lx - 1);
 	x--;
 	while (x >= 0) {
+		double zf;
 		uint32_t a8r8g8b8;
 		int z;
 		int ca, cr, cg, cb;
 		int xp = extent.startx + x; // x coordinate of current pixel
 
-		cb = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].dpdx))*255.0f;
-		cg = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].dpdx))*255.0f;
-		cr = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].dpdx))*255.0f;
-		ca = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].dpdx))*255.0f;
-		a8r8g8b8 = (ca << 24) + (cr << 16) + (cg << 8) + cb; // pixel color obtained by interpolating the colors of the vertices
-		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
+		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
+		zf = (extent.param[(int)VERTEX_PARAMETER::PARAM_1W].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_1W].dpdx);
+		zf = 1.0f / zf;
+		cb = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].dpdx)) * zf * 255.0f;
+		cg = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].dpdx)) * zf * 255.0f;
+		cr = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].dpdx)) * zf * 255.0f;
+		ca = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].dpdx)) * zf * 255.0f;
+		if (cb > 255)
+			cb = 255;
+		if (cb < 0)
+			cb = 0;
+		if (cg > 255)
+			cg = 255;
+		if (cg < 0)
+			cg = 0;
+		if (cr > 255)
+			cr = 255;
+		if (cr < 0)
+			cr = 0;
+		if (ca > 255)
+			ca = 255;
+		if (ca < 0)
+			ca = 0;
+		a8r8g8b8 = (ca << 24) | (cr << 16) | (cg << 8) | cb; // pixel color obtained by interpolating the colors of the vertices
 		write_pixel(xp, scanline, a8r8g8b8, z);
 		x--;
 	}
@@ -1992,20 +2051,29 @@ void nv2a_renderer::render_texture_simple(int32_t scanline, const nv2a_rasterize
 		x = x - (extent.stopx - lx - 1);
 	x--;
 	while (x >= 0) {
-		int up, vp;
+		float zf;
+		double spf, tpf;
+		//double rpf, qpf; // disabled to remove "set but not used" warning
+		int sp, tp;
 		int z;
 		int xp = extent.startx + x; // x coordinate of current pixel
 
+		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
+		zf = (extent.param[(int)VERTEX_PARAMETER::PARAM_1W].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_1W].dpdx);
+		zf = 1.0f / zf;
+		spf = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S].dpdx) * zf;
+		tpf = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T].dpdx) * zf;
+		//rpf = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R].dpdx) * zf;
+		//qpf = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q].dpdx) * zf;
 		if (objectdata.data->texture[0].rectangle == false) {
-			up = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U].dpdx)*(double)(objectdata.data->texture[0].sizeu - 1); // x coordinate of texel in texture
-			vp = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V].dpdx)*(double)(objectdata.data->texture[0].sizev - 1); // y coordinate of texel in texture
-		} else
-		{
-			up = extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U].dpdx; // x coordinate of texel in texture
-			vp = extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V].dpdx; // y coordinate of texel in texture
+			sp = spf * (double)(objectdata.data->texture[0].sizes - 1); // x coordinate of texel in texture
+			tp = tpf * (double)(objectdata.data->texture[0].sizet - 1); // y coordinate of texel in texture
 		}
-		a8r8g8b8 = texture_get_texel(0, up, vp);
-		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
+		else {
+			sp = spf;
+			tp = tpf;
+		}
+		a8r8g8b8 = texture_get_texel(0, sp, tp);
 		write_pixel(xp, scanline, a8r8g8b8, z);
 		x--;
 	}
@@ -2014,8 +2082,8 @@ void nv2a_renderer::render_texture_simple(int32_t scanline, const nv2a_rasterize
 void nv2a_renderer::render_register_combiners(int32_t scanline, const nv2a_rasterizer::extent_t &extent, const nvidia_object_data &objectdata, int threadid)
 {
 	int x, lx, xp;
-	int up, vp;
-	int ca, cr, cg, cb;
+	int tc[4];
+	float colorf[7][4];
 	uint32_t color[6];
 	uint32_t a8r8g8b8;
 	int z;
@@ -2032,53 +2100,71 @@ void nv2a_renderer::render_register_combiners(int32_t scanline, const nv2a_raste
 	if (extent.stopx > lx)
 		x = x - (extent.stopx - lx - 1);
 	x--;
-	std::lock_guard<std::mutex> lock(combiner.lock); // needed since multithreading is not supported yet
 	while (x >= 0) {
+		float zf;
+
 		xp = extent.startx + x;
 		// 1: fetch data
 		// 1.1: interpolated color from vertices
-		cb = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].dpdx))*255.0f;
-		cg = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].dpdx))*255.0f;
-		cr = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].dpdx))*255.0f;
-		ca = ((extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].dpdx))*255.0f;
-		color[0] = (ca << 24) + (cr << 16) + (cg << 8) + cb; // pixel color obtained by interpolating the colors of the vertices
-		color[1] = 0; // lighting not yet
-		// 1.2: color for each of the 4 possible textures
+		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
+		zf = (extent.param[(int)VERTEX_PARAMETER::PARAM_1W].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_1W].dpdx);
+		zf = 1.0f / zf;
+		colorf[0][0] = (extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_R].dpdx) * zf;
+		colorf[0][1] = (extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_G].dpdx) * zf;
+		colorf[0][2] = (extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_B].dpdx) * zf;
+		colorf[0][3] = (extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_COLOR_A].dpdx) * zf;
+		colorf[1][0] = (extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_R].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_R].dpdx) * zf;
+		colorf[1][1] = (extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_G].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_G].dpdx) * zf;
+		colorf[1][2] = (extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_B].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_B].dpdx) * zf;
+		colorf[1][3] = (extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_A].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_A].dpdx) * zf;
+		// 1.2: coordinates for each of the 4 possible textures
 		for (n = 0; n < 4; n++) {
-			if (texture[n].enabled) {
+			colorf[n + 2][0] = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S + n * 4].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S + n * 4].dpdx) * zf;
+			colorf[n + 2][1] = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T + n * 4].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T + n * 4].dpdx) * zf;
+			colorf[n + 2][2] = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R + n * 4].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R + n * 4].dpdx) * zf;
+			colorf[n + 2][3] = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q + n * 4].start + (double)x * extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q + n * 4].dpdx) * zf;
+		}
+		// 1.3: fog
+		combiner_argb8_float(fog_color, colorf[6]);
+		colorf[6][3] = 1.0f; // should it be from the ofog output of the vertex shader ?
+		// 1.4: colors from textures
+		for (n = 0; n < 4; n++) {
+			if (texture[n].mode == 1) {
 				if (texture[n].rectangle == false) {
-					up = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + n * 2].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + n * 2].dpdx)*(double)(objectdata.data->texture[n].sizeu - 1);
-					vp = (extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + n * 2].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + n * 2].dpdx)*(double)(objectdata.data->texture[n].sizev - 1);
-				} else
-				{
-					up = extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + n * 2].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + n * 2].dpdx;
-					vp = extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + n * 2].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + n * 2].dpdx;
+					tc[0] = colorf[n + 2][0] * (float)(objectdata.data->texture[n].sizes - 1);
+					tc[1] = colorf[n + 2][1] * (float)(objectdata.data->texture[n].sizet - 1);
 				}
-				color[n + 2] = texture_get_texel(n, up, vp);
+				else {
+					tc[0] = colorf[n + 2][0];
+					tc[1] = colorf[n + 2][1];
+				}
+				a8r8g8b8 = texture_get_texel(n, tc[0], tc[1]);
+				combiner_argb8_float(a8r8g8b8, colorf[n + 2]);
 			}
+			else if (texture[n].mode != 4)
+				combiner_argb8_float(0xff000000, colorf[n + 2]);
 		}
 		// 2: compute
 		// 2.1: initialize
-		combiner_initialize_registers(color);
+		combiner_initialize_registers(threadid, colorf);
 		// 2.2: general cmbiner stages
-		for (n = 0; n < combiner.stages; n++) {
+		for (n = 0; n < combiner.setup.stages; n++) {
 			// 2.2.1 initialize
-			combiner_initialize_stage(n);
+			combiner_initialize_stage(threadid, n);
 			// 2.2.2 map inputs
-			combiner_map_input(n);
+			combiner_map_stage_input(threadid, n);
 			// 2.2.3 compute possible outputs
-			combiner_compute_rgb_outputs(n);
-			combiner_compute_a_outputs(n);
+			combiner_compute_rgb_outputs(threadid, n);
+			combiner_compute_alpha_outputs(threadid, n);
 			// 2.2.4 map outputs to registers
-			combiner_map_output(n);
+			combiner_map_stage_output(threadid, n);
 		}
 		// 2.3: final cmbiner stage
-		combiner_initialize_final();
-		combiner_map_final_input();
-		combiner_final_output();
-		a8r8g8b8 = combiner_float_argb8(combiner.output);
+		combiner_initialize_final(threadid);
+		combiner_map_final_input(threadid);
+		combiner_final_output(threadid);
+		a8r8g8b8 = combiner_float_argb8(combiner.work[threadid].output);
 		// 3: write pixel
-		z = (extent.param[(int)VERTEX_PARAMETER::PARAM_Z].start + (double)x*extent.param[(int)VERTEX_PARAMETER::PARAM_Z].dpdx);
 		write_pixel(xp, scanline, a8r8g8b8, z);
 		x--;
 	}
@@ -2226,143 +2312,80 @@ void dumpcombiners(uint32_t *m)
 
 void nv2a_renderer::extract_packed_float(uint32_t data, float &first, float &second, float &third)
 {
-	int32_t p1, p2, p3;
-	int32_t e1, e2, e3;
-	int32_t m1, m2, m3;
-	float scale, decimal;
-	union
-	{
-		float f;
-		uint32_t i;
-	} i2f;
+	float f1, f2, f3;
+	int p1, p2, p3;
 
-	// convert r11g11b10f to 3 float values
-	// each 32 bit words contains 2 11 bit float values and one 10 bit float value
-	p1 = data & 0b11111111111;
-	p2 = (data >> 11) & 0b11111111111;
-	p3 = (data >> 22) & 0b1111111111;
-	// 11 bit values have 6 bits of mantissa and 5 of exponent, 10 bit values have 5 bits of mantissa and 5 of exponent
-	m1 = p1 & 0b111111;
-	e1 = (p1 >> 6) & 0b11111;
-	m2 = p2 & 0b111111;
-	e2 = (p2 >> 6) & 0b11111;
-	m3 = p3 & 0b11111;
-	e3 = (p3 >> 5) & 0b11111;
-	// the following is based on routine UF11toF32 in appendix G of the "OpenGL Programming Guide 8th edition" book
-	if (e1 == 0) {
-		if (m1 != 0) {
-			scale = 1.0 / (1 << 20);
-			first = scale * m1;
-		}
-		else
-			first = 0;
-	}
-	else if (e1 == 31) {
-		i2f.i = 0x7f800000 | m1;
-		first = i2f.f;
-	}
-	else {
-		e1 -= 15;
-		if (e1 < 0) {
-			scale = 1.0 / (1 << -e1);
-		}
-		else {
-			scale = 1 << e1;
-		}
-		decimal = 1.0 + (float)m1 / 64;
-		first = scale * decimal;
-	}
-	if (e2 == 0) {
-		if (m2 != 0) {
-			scale = 1.0 / (1 << 20);
-			second = scale * m2;
-		}
-		else
-			second = 0;
-	}
-	else if (e2 == 31) {
-		i2f.i = 0x7f800000 | m2;
-		second = i2f.f;
-	}
-	else {
-		e2 -= 15;
-		if (e2 < 0) {
-			scale = 1.0 / (1 << -e2);
-		}
-		else {
-			scale = 1 << e2;
-		}
-		decimal = 1.0 + (float)m2 / 64;
-		second = scale * decimal;
-	}
-	if (e3 == 0) {
-		if (m3 != 0) {
-			scale = 1.0 / (1 << 20);
-			third = scale * m3;
-		}
-		else
-			third = 0;
-	}
-	else if (e3 == 31) {
-		i2f.i = 0x7f800000 | m3;
-		third = i2f.f;
-	}
-	else {
-		e3 -= 15;
-		if (e3 < 0) {
-			scale = 1.0 / (1 << -e3);
-		}
-		else {
-			scale = 1 << e3;
-		}
-		decimal = 1.0 + (float)m3 / 32;
-		third = scale * decimal;
-	}
+	p1 = data & 0x7ff;
+	if (p1 & 0x400)
+		f1 = (float)(p1 - 0x800) / 1023.0;
+	else
+		f1 = (float)p1 / 1023.0;
+	p2 = (data >> 11) & 0x7ff;
+	if (p2 & 0x400)
+		f2 = (float)(p2 - 0x800) / 1023.0;
+	else
+		f2 = (float)p2 / 1023.0;
+	p3 = (data >> 22) & 0x3ff;
+	if (p3 & 0x200)
+		f3 = (float)(p3 - 0x400) / 511.0;
+	else
+		f3 = (float)p3 / 511.0;
+	first = f1;
+	second = f2;
+	third = f3;
 }
 
-
 void nv2a_renderer::read_vertex(address_space &space, offs_t address, vertex_nv &vertex, int attrib)
 {
 	uint32_t u;
-	int c, d, l;
 
-	l = vertexbuffer.size[attrib];
-	switch (vertexbuffer.kind[attrib]) {
-	case NV2A_VTXBUF_TYPE::FLOAT:
-		vertex.attribute[attrib].fv[0] = 0;
+	switch (vertexbuffer.type[attrib])
+	{
+	case 0x02: // none
+		return;
+	case 0x12: // float1
+		vertex.attribute[attrib].iv[0] = space.read_dword(address + 0);
 		vertex.attribute[attrib].fv[1] = 0;
 		vertex.attribute[attrib].fv[2] = 0;
 		vertex.attribute[attrib].fv[3] = 1.0;
-		for (c = d = 0; c < l; c++) {
-			vertex.attribute[attrib].iv[c] = space.read_dword(address + d);
-			d = d + 4;
-		}
 		break;
-	case NV2A_VTXBUF_TYPE::UBYTE_OGL:
-		u = space.read_dword(address + 0);
-		for (c = l-1; c >= 0; c--) {
-			vertex.attribute[attrib].fv[c] = (u & 0xff) / 255.0;
-			u = u >> 8;
-		}
-		break;
-	case  NV2A_VTXBUF_TYPE::UBYTE_D3D:
-		u = space.read_dword(address + 0);
-		for (c = 0; c < l; c++) {
-			vertex.attribute[attrib].fv[c] = (u & 0xff) / 255.0;
-			u = u >> 8;
-		}
-		break;
-	case NV2A_VTXBUF_TYPE::FLOAT_PACKED: // 3 floating point numbers packed into 32 bits
+	case 0x16: // normpacked3
 		u = space.read_dword(address + 0);
 		extract_packed_float(u, vertex.attribute[attrib].fv[0], vertex.attribute[attrib].fv[1], vertex.attribute[attrib].fv[2]);
 		vertex.attribute[attrib].fv[3] = 1.0;
 		break;
-	default:
-		vertex.attribute[attrib].fv[0] = 0;
-		vertex.attribute[attrib].fv[1] = 0;
+	case 0x22: // float2
+		vertex.attribute[attrib].iv[0] = space.read_dword(address + 0);
+		vertex.attribute[attrib].iv[1] = space.read_dword(address + 4);
 		vertex.attribute[attrib].fv[2] = 0;
 		vertex.attribute[attrib].fv[3] = 1.0;
-		machine().logerror("Unsupported vertex type in read_vertex !\n");
+		break;
+	case 0x32: // float3
+		vertex.attribute[attrib].iv[0] = space.read_dword(address + 0);
+		vertex.attribute[attrib].iv[1] = space.read_dword(address + 4);
+		vertex.attribute[attrib].iv[2] = space.read_dword(address + 8);
+		vertex.attribute[attrib].fv[3] = 1.0;
+		break;
+	case 0x40: // d3dcolor
+		u = space.read_dword(address + 0);
+		// aarrggbb -> (rr, gg, bb, aa)
+		vertex.attribute[attrib].fv[2] = (u & 0xff) / 255.0;
+		u = u >> 8;
+		vertex.attribute[attrib].fv[1] = (u & 0xff) / 255.0;
+		u = u >> 8;
+		vertex.attribute[attrib].fv[0] = (u & 0xff) / 255.0;
+		u = u >> 8;
+		vertex.attribute[attrib].fv[3] = (u & 0xff) / 255.0;
+		break;
+	case 0x42: // float4
+		vertex.attribute[attrib].iv[0] = space.read_dword(address + 0);
+		vertex.attribute[attrib].iv[1] = space.read_dword(address + 4);
+		vertex.attribute[attrib].iv[2] = space.read_dword(address + 8);
+		vertex.attribute[attrib].iv[3] = space.read_dword(address + 12);
+		break;
+	default:
+		machine().logerror("Yet unsupported vertex data type %x\n\r", vertexbuffer.type[attrib]);
+		return;
 	}
 }
 
@@ -2421,7 +2444,7 @@ int nv2a_renderer::read_vertices_0x1818(address_space &space, vertex_nv *destina
 #ifdef MAME_DEBUG
 	memset(destination, 0, sizeof(vertex_nv)*limit);
 #endif
-	vwords = vertexbuffer.words[15] + vertexbuffer.offset[15];
+	vwords = vertexbuffer.offset[16];
 	for (m = 0; m < limit; m++) {
 		memcpy(&destination[m], &persistvertexattr, sizeof(persistvertexattr));
 		b = vertexbuffer.enabled;
@@ -2476,6 +2499,7 @@ void nv2a_renderer::convert_vertices(vertex_nv *source, nv2avertex_t *destinatio
 	vertex_nv vert[4];
 	int m, u;
 	float v[4];
+	double c;
 
 	// take each vertex with its attributes and obtain data for drawing
 	// should use either the vertex program or transformation matrices
@@ -2491,12 +2515,25 @@ void nv2a_renderer::convert_vertices(vertex_nv *source, nv2avertex_t *destinatio
 			destination[m].w = v[3];
 			destination[m].x = (v[0] / v[3]) * supersample_factor_x; // source[m].attribute[0].fv[0];
 			destination[m].y = (v[1] / v[3]) * supersample_factor_y; // source[m].attribute[0].fv[1];
+			c = v[3];
+			if (c == 0)
+				c = FLT_MIN;
+			c = 1.0f / c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_1W] = c;
 			destination[m].p[(int)VERTEX_PARAMETER::PARAM_Z] = v[2] / v[3];
-			for (u = (int)VERTEX_PARAMETER::PARAM_COLOR_B; u <= (int)VERTEX_PARAMETER::PARAM_COLOR_A; u++) // 0=b 1=g 2=r 3=a
-				destination[m].p[u] = source[m].attribute[3].fv[u];
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_R] = source[m].attribute[3].fv[0] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_G] = source[m].attribute[3].fv[1] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_B] = source[m].attribute[3].fv[2] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_A] = source[m].attribute[3].fv[3] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_R] = source[m].attribute[4].fv[0] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_G] = source[m].attribute[4].fv[1] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_B] = source[m].attribute[4].fv[2] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_A] = source[m].attribute[4].fv[3] * c;
 			for (u = 0; u < 4; u++) {
-				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + u * 2] = source[m].attribute[9 + u].fv[0];
-				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + u * 2] = source[m].attribute[9 + u].fv[1];
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S + u * 4] = source[m].attribute[9 + u].fv[0] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T + u * 4] = source[m].attribute[9 + u].fv[1] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R + u * 4] = source[m].attribute[9 + u].fv[2] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q + u * 4] = source[m].attribute[9 + u].fv[3] * c;
 			}
 		}
 	}
@@ -2510,13 +2547,26 @@ void nv2a_renderer::convert_vertices(vertex_nv *source, nv2avertex_t *destinatio
 			destination[m].w = vert[m].attribute[0].fv[3];
 			destination[m].x = (vert[m].attribute[0].fv[0] - 0.53125) * supersample_factor_x;
 			destination[m].y = (vert[m].attribute[0].fv[1] - 0.53125) * supersample_factor_y;
-			for (u = (int)VERTEX_PARAMETER::PARAM_COLOR_B; u <= (int)VERTEX_PARAMETER::PARAM_COLOR_A; u++) // 0=b 1=g 2=r 3=a
-				destination[m].p[u] = vert[m].attribute[3].fv[u];
-			for (u = 0; u < 4; u++) {
-				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_U + u * 2] = vert[m].attribute[9 + u].fv[0];
-				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_V + u * 2] = vert[m].attribute[9 + u].fv[1];
-			}
+			c = vert[m].attribute[0].fv[3];
+			if (c == 0)
+				c = FLT_MIN;
+			c = 1.0f / c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_1W] = c;
 			destination[m].p[(int)VERTEX_PARAMETER::PARAM_Z] = vert[m].attribute[0].fv[2];
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_R] = vert[m].attribute[3].fv[0] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_G] = vert[m].attribute[3].fv[1] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_B] = vert[m].attribute[3].fv[2] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_COLOR_A] = vert[m].attribute[3].fv[3] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_R] = vert[m].attribute[4].fv[0] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_G] = vert[m].attribute[4].fv[1] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_B] = vert[m].attribute[4].fv[2] * c;
+			destination[m].p[(int)VERTEX_PARAMETER::PARAM_SECONDARY_COLOR_A] = vert[m].attribute[4].fv[3] * c;
+			for (u = 0; u < 4; u++) {
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_S + u * 4] = vert[m].attribute[9 + u].fv[0] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_T + u * 4] = vert[m].attribute[9 + u].fv[1] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_R + u * 4] = vert[m].attribute[9 + u].fv[2] * c;
+				destination[m].p[(int)VERTEX_PARAMETER::PARAM_TEXTURE0_Q + u * 4] = vert[m].attribute[9 + u].fv[3] * c;
+			}
 		}
 	}
 }
@@ -2734,19 +2784,23 @@ void nv2a_renderer::clear_depth_buffer(int what, uint32_t value)
 		}
 }
 
-uint32_t nv2a_renderer::render_triangle_culling(const rectangle &cliprect, int paramcount, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3)
+uint32_t nv2a_renderer::render_triangle_culling(const rectangle &cliprect, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3)
 {
 	float areax2;
 	NV2A_GL_CULL_FACE face = NV2A_GL_CULL_FACE::FRONT;
 
 	if (backface_culling_enabled == false)
-		return rasterizer.render_triangle(cliprect, render_spans_callback, paramcount, _v1, _v2, _v3);
+		return rasterizer.render_triangle(cliprect, render_spans_callback, (int)VERTEX_PARAMETER::ALL, _v1, _v2, _v3);
 	if (backface_culling_culled == NV2A_GL_CULL_FACE::FRONT_AND_BACK)
 	{
 		triangles_bfculled++;
 		return 0;
 	}
 	areax2 = _v1.x*(_v2.y - _v3.y) + _v2.x*(_v3.y - _v1.y) + _v3.x*(_v1.y - _v2.y);
+	if (areax2 == 0.0f) {
+		triangles_bfculled++;
+		return 0;
+	}
 	if (backface_culling_winding == NV2A_GL_FRONT_FACE::CCW)
 	{
 		if (-areax2 <= 0)
@@ -2762,10 +2816,10 @@ uint32_t nv2a_renderer::render_triangle_culling(const rectangle &cliprect, int p
 	}
 	if (face == NV2A_GL_CULL_FACE::FRONT)
 		if (backface_culling_culled == NV2A_GL_CULL_FACE::BACK)
-			return rasterizer.render_triangle(cliprect, render_spans_callback, paramcount, _v1, _v2, _v3);
+			return rasterizer.render_triangle(cliprect, render_spans_callback, (int)VERTEX_PARAMETER::ALL, _v1, _v2, _v3);
 	if (face == NV2A_GL_CULL_FACE::BACK)
 		if (backface_culling_culled == NV2A_GL_CULL_FACE::FRONT)
-			return rasterizer.render_triangle(cliprect, render_spans_callback, paramcount, _v1, _v2, _v3);
+			return rasterizer.render_triangle(cliprect, render_spans_callback, (int)VERTEX_PARAMETER::ALL, _v1, _v2, _v3);
 	triangles_bfculled++;
 	return 0;
 }
@@ -2812,14 +2866,14 @@ int nv2a_renderer::clip_triangle_w(nv2avertex_t *vi[3], nv2avertex_t *vo)
 	return idx;
 }
 
-uint32_t nv2a_renderer::render_triangle_clipping(const rectangle &cliprect, int paramcount, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3)
+uint32_t nv2a_renderer::render_triangle_clipping(const rectangle &cliprect, nv2avertex_t &_v1, nv2avertex_t &_v2, nv2avertex_t &_v3)
 {
 	nv2avertex_t *vi[3];
 	nv2avertex_t vo[8];
 	int nv;
 
 	if ((_v1.w > 0) && (_v2.w > 0) && (_v3.w > 0))
-		return render_triangle_culling(cliprect, paramcount, _v1, _v2, _v3);
+		return render_triangle_culling(cliprect, _v1, _v2, _v3);
 	if (enable_clipping_w == false)
 		return 0;
 	if ((_v1.w <= 0) && (_v2.w <= 0) && (_v3.w <= 0))
@@ -2877,7 +2931,7 @@ uint32_t nv2a_renderer::render_triangle_clipping(const rectangle &cliprect, int
 		}
 	}
 	for (int n = 1; n <= (nv - 2); n++)
-		render_triangle_culling(cliprect, paramcount, vo[0], vo[n], vo[n + 1]);
+		render_triangle_culling(cliprect, vo[0], vo[n], vo[n + 1]);
 	return 0;
 }
 
@@ -2892,8 +2946,8 @@ void nv2a_renderer::assemble_primitive(vertex_nv *source, int count)
 			if (vertex_accumulated == 4) {
 				primitives_count++;
 				vertex_accumulated = 0;
-				render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[vertex_count], vertex_xy[vertex_count + 1], vertex_xy[vertex_count + 2]);
-				render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[vertex_count], vertex_xy[vertex_count + 2], vertex_xy[vertex_count + 3]);
+				render_triangle_clipping(limits_rendertarget, vertex_xy[vertex_count], vertex_xy[vertex_count + 1], vertex_xy[vertex_count + 2]);
+				render_triangle_clipping(limits_rendertarget, vertex_xy[vertex_count], vertex_xy[vertex_count + 2], vertex_xy[vertex_count + 3]);
 				vertex_count = (vertex_count + 4) & 1023;
 				rasterizer.wait();
 			}
@@ -2904,7 +2958,7 @@ void nv2a_renderer::assemble_primitive(vertex_nv *source, int count)
 			if (vertex_accumulated == 3) {
 				primitives_count++;
 				vertex_accumulated = 0;
-				render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[vertex_count], vertex_xy[(vertex_count + 1) & 1023], vertex_xy[(vertex_count + 2) & 1023]); // 4 rgba, 4 texture units 2 uv
+				render_triangle_clipping(limits_rendertarget, vertex_xy[vertex_count], vertex_xy[(vertex_count + 1) & 1023], vertex_xy[(vertex_count + 2) & 1023]); // 4 rgba, 4 texture units 2 uv
 				vertex_count = (vertex_count + 3) & 1023;
 				rasterizer.wait();
 			}
@@ -2927,7 +2981,7 @@ void nv2a_renderer::assemble_primitive(vertex_nv *source, int count)
 				// if software sends the vertices 0 1 2 3 4 5 6
 				// hardware will draw triangles made by (0,1,2) (0,2,3) (0,3,4) (0,4,5) (0,5,6)
 				convert_vertices(source, vertex_xy + vertex_count, 1);
-				render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[1024], vertex_xy[(vertex_count - 1) & 1023], vertex_xy[vertex_count]);
+				render_triangle_clipping(limits_rendertarget, vertex_xy[1024], vertex_xy[(vertex_count - 1) & 1023], vertex_xy[vertex_count]);
 				vertex_count = (vertex_count + 1) & 1023;
 				rasterizer.wait();
 			}
@@ -2951,9 +3005,9 @@ void nv2a_renderer::assemble_primitive(vertex_nv *source, int count)
 				// hardware will draw triangles made by (0,1,2) (1,3,2) (2,3,4) (3,5,4) (4,5,6)
 				convert_vertices(source, vertex_xy + vertex_count, 1);
 				if ((vertex_count & 1) == 0)
-					render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[(vertex_count - 2) & 1023], vertex_xy[(vertex_count - 1) & 1023], vertex_xy[vertex_count]);
+					render_triangle_clipping(limits_rendertarget, vertex_xy[(vertex_count - 2) & 1023], vertex_xy[(vertex_count - 1) & 1023], vertex_xy[vertex_count]);
 				else
-					render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[(vertex_count - 2) & 1023], vertex_xy[vertex_count], vertex_xy[(vertex_count - 1) & 1023]);
+					render_triangle_clipping(limits_rendertarget, vertex_xy[(vertex_count - 2) & 1023], vertex_xy[vertex_count], vertex_xy[(vertex_count - 1) & 1023]);
 				vertex_count = (vertex_count + 1) & 1023;
 				rasterizer.wait();
 			}
@@ -2977,9 +3031,10 @@ void nv2a_renderer::assemble_primitive(vertex_nv *source, int count)
 				if (vertex_accumulated == 4)
 				{
 					primitives_count++;
-					// quad is made of vertices vertex count +0 +1 +3 +2
-					render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[vertex_count + 0], vertex_xy[vertex_count + 1], vertex_xy[(vertex_count + 3) & 1023]);
-					render_triangle_clipping(limits_rendertarget, 5 + 4 * 2, vertex_xy[(vertex_count + 3) & 1023], vertex_xy[(vertex_count + 2) & 1023], vertex_xy[vertex_count + 0]);
+					// if software sends the vertices 0 1 2 3 4 5 6 7
+					// hardware will draw triangles made by (0,1,2) (2,1,3) (2,3,4) (4,3,5) (4,5,6) (6,5,7)
+					render_triangle_clipping(limits_rendertarget, vertex_xy[vertex_count + 0], vertex_xy[vertex_count + 1], vertex_xy[(vertex_count + 2) & 1023]);
+					render_triangle_clipping(limits_rendertarget, vertex_xy[(vertex_count + 2) & 1023], vertex_xy[(vertex_count + 1) & 1023], vertex_xy[vertex_count + 3]);
 					vertex_accumulated = 2;
 					vertex_count = (vertex_count + 2) & 1023;
 					rasterizer.wait();
@@ -3070,7 +3125,10 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		indexesleft_first = 0;
 		primitives_count = 0;
 		primitive_type = (NV2A_BEGIN_END)data;
-		if (data != 0) {
+		if (data == 0)
+			primitives_batches_count++;
+		else
+		{
 			if (((channel[chanel][subchannel].object.method[0x1e60 / 4] & 7) > 0) && (combiner.used != 0))
 				render_spans_callback = nv2a_rasterizer::render_delegate(&nv2a_renderer::render_register_combiners, this);
 			else if (texture[0].enabled)
@@ -3129,6 +3187,8 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		}
 	}
 	if (maddress == 0x1818) {
+		if (countlen == 0)
+			machine().logerror("Method 0x1818 with 0 vertices\n");
 		// vertices are taken from the next words, not from a vertex buffer
 		// first send primitive type with 17fc
 		// then countlen number of dwords with 1818
@@ -3187,10 +3247,11 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		uint8_t d3 = (data >> 16) & 255;
 		uint8_t d4 = data >> 24;
 
-		persistvertexattr.attribute[attr].fv[0] = (float)d1;
-		persistvertexattr.attribute[attr].fv[1] = (float)d2;
-		persistvertexattr.attribute[attr].fv[2] = (float)d3;
-		persistvertexattr.attribute[attr].fv[3] = (float)d4;
+		// if sending color dword is aabbggrr
+		persistvertexattr.attribute[attr].fv[0] = (float)d1 / 255.0;
+		persistvertexattr.attribute[attr].fv[1] = (float)d2 / 255.0;
+		persistvertexattr.attribute[attr].fv[2] = (float)d3 / 255.0;
+		persistvertexattr.attribute[attr].fv[3] = (float)d4 / 255.0;
 		if (attr == 0)
 			assemble_primitive(&persistvertexattr, 1);
 	}
@@ -3228,6 +3289,10 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		if (comp == 3)
 			assemble_primitive(&persistvertexattr, 1);
 	}
+	else if ((maddress >= 0x1500) && (maddress < 0x1590))
+	{
+		machine().logerror("Yet unsupported method %x\n\r", maddress);
+	}
 	if ((maddress >= 0x1720) && (maddress < 0x1760)) {
 		int bit = maddress / 4 - 0x1720 / 4;
 
@@ -3241,32 +3306,39 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 
 		vertexbuffer.type[bit] = data & 255;
 		vertexbuffer.stride[bit] = (data >> 8) & 255;
-		vertexbuffer.kind[bit] = (NV2A_VTXBUF_TYPE)(vertexbuffer.type[bit] & 15);
-		vertexbuffer.size[bit] = (vertexbuffer.type[bit] >> 4) & 15;
-		switch (vertexbuffer.kind[bit]) {
-		case NV2A_VTXBUF_TYPE::UBYTE_D3D:
-			vertexbuffer.words[bit] = (vertexbuffer.size[bit] * 1) >> 2;
+		switch (vertexbuffer.type[bit])
+		{
+		case 0x02: // none
+			vertexbuffer.words[bit] = 0;
 			break;
-		case NV2A_VTXBUF_TYPE::FLOAT:
-			vertexbuffer.words[bit] = (vertexbuffer.size[bit] * 4) >> 2;
+		case 0x12: // float1
+			vertexbuffer.words[bit] = 1;
 			break;
-		case NV2A_VTXBUF_TYPE::UBYTE_OGL:
-			vertexbuffer.words[bit] = (vertexbuffer.size[bit] * 1) >> 2;
+		case 0x16: // normpacked3
+			vertexbuffer.words[bit] = 1;
 			break;
-		case NV2A_VTXBUF_TYPE::USHORT:
-			vertexbuffer.words[bit] = (vertexbuffer.size[bit] * 2) >> 2;
+		case 0x22: // float2
+			vertexbuffer.words[bit] = 2;
 			break;
-		case NV2A_VTXBUF_TYPE::FLOAT_PACKED:
-			vertexbuffer.words[bit] = (vertexbuffer.size[bit] * 4) >> 2;
+		case 0x32: // float3
+			vertexbuffer.words[bit] = 3;
+			break;
+		case 0x40: // d3dcolor
+			vertexbuffer.words[bit] = 1;
+			break;
+		case 0x42: // float4
+			vertexbuffer.words[bit] = 4;
 			break;
 		default:
+			machine().logerror("Yet unsupported vertex data type %x\n\r", vertexbuffer.type[bit]);
 			vertexbuffer.words[bit] = 0;
 		}
-		if (vertexbuffer.size[bit] > 0)
+		if (vertexbuffer.words[bit] > 0)
 			vertexbuffer.enabled |= (1 << bit);
 		else
 			vertexbuffer.enabled &= ~(1 << bit);
-		for (int n = bit + 1; n < 16; n++) {
+		vertexbuffer.offset[0] = 0;
+		for (int n = bit + 1; n <= 16; n++) {
 			if ((vertexbuffer.enabled & (1 << (n - 1))) != 0)
 				vertexbuffer.offset[n] = vertexbuffer.offset[n - 1] + vertexbuffer.words[n - 1];
 			else
@@ -3392,6 +3464,10 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		compute_supersample_factors(supersample_factor_x, supersample_factor_y);
 		compute_limits_rendertarget(chanel, subchannel);
 		compute_size_rendertarget(chanel, subchannel);
+/* for debugging
+		if (limits_rendertarget.max_x == 1023)
+			type_rendertarget = NV2A_RT_TYPE::LINEAR;
+*/
 		switch (colorformat_rendertarget) {
 		case NV2A_COLOR_FORMAT::R5G6B5:
 			bytespixel_rendertarget = 2;
@@ -3459,7 +3535,9 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		pgraph[0x186c / 4] = data;
 	}
 	if (maddress == 0x0210) {
-		// framebuffer offset ?
+		// framebuffer offset
+		old_rendertarget = rendertarget;
+		// To see it with the image watch extension: @mem(0x000002d2263af060, UINT8, 4, 640, 480, 2560)
 		rendertarget = (uint32_t *)direct_access_ptr(data);
 #ifdef LOG_NV2A
 		printf("Render target at %08X\n\r", data);
@@ -3599,9 +3677,9 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 			basesizeu = (data >> 20) & 15;
 			basesizev = (data >> 24) & 15;
 			basesizew = (data >> 28) & 15;
-			texture[unit].sizeu = 1 << basesizeu;
-			texture[unit].sizev = 1 << basesizev;
-			texture[unit].sizew = 1 << basesizew;
+			texture[unit].sizes = 1 << basesizeu;
+			texture[unit].sizet = 1 << basesizev;
+			texture[unit].sizer = 1 << basesizew;
 			texture[unit].dilate = dilatechose[(basesizeu << 4) + basesizev];
 			texture[unit].format = (NV2A_TEX_FORMAT)format;
 			switch (texture[unit].format)
@@ -3630,7 +3708,7 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 				debug_grab_texttype = -1;
 				f = fopen(debug_grab_textfile, "wb");
 				if (f) {
-					written = (int)fwrite(texture[unit].buffer, texture[unit].sizeu * texture[unit].sizev * 4, 1, f);
+					written = (int)fwrite(texture[unit].buffer, texture[unit].sizes * texture[unit].sizet * 4, 1, f);
 					fclose(f);
 					machine().logerror("Written %d bytes of texture to specified file\n", written);
 				}
@@ -3638,22 +3716,18 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 					machine().logerror("Unable to save texture to specified file\n");
 			}
 		}
-		/*if (maddress == 0x1b08) {
-		    (data >> 0) & 15;
-		    (data >> 4) & 15;
-		    (data >> 8) & 15;
-		    (data >> 12) & 15;
-		    (data >> 16) & 15;
-		    (data >> 20) & 15;
-		    (data >> 24) & 255;
-		}*/
+		if (maddress == 0x1b08) {
+			texture[unit].addrmodes = (data >> 0) & 15;
+			texture[unit].addrmodet = (data >> 8) & 15;
+			texture[unit].addrmoder = (data >> 16) & 15;
+		}
 		if (maddress == 0x1b0c) {
 			texture[unit].colorkey = (data >> 0) & 3;
 			texture[unit].imagefield = (data >> 3) & 1;
 			texture[unit].aniso = (data >> 4) & 3;
 			texture[unit].mipmapmaxlod = (data >> 6) & 0xfff;
 			texture[unit].mipmapminlod = (data >> 18) & 0xfff;
-			// enable texture
+			// enable texture ?
 			texture[unit].enabled = (data >> 30) & 3;
 		}
 		if (maddress == 0x1b10) {
@@ -3665,6 +3739,12 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 		}
 		countlen--;
 	}
+	if (maddress == 0x1e70) {
+		texture[0].mode = data & 31;
+		texture[1].mode = (data >> 5) & 31;
+		texture[2].mode = (data >> 10) & 31;
+		texture[3].mode = (data >> 15) & 31;
+	}
 	// projection matrix
 	if ((maddress >= 0x0440) && (maddress < 0x0480)) {
 		maddress = (maddress - 0x0440) / 4;
@@ -3809,127 +3889,130 @@ int nv2a_renderer::execute_method_3d(address_space& space, uint32_t chanel, uint
 	if (maddress == 0x1e90) {
 		machine().logerror("Received explicit method to run vertex program\n");
 	}
-	// Register combiners
-	if (maddress == 0x1e60) {
-		combiner.stages = data & 15;
-		countlen--;
+	if (maddress == 0x02a8) {
+		fog_color = data;
 	}
+	// Register combiners
 	if (maddress == 0x0288) {
-		combiner.final.mapin_rgbD_input = data & 15;
-		combiner.final.mapin_rgbD_component = (data >> 4) & 1;
-		combiner.final.mapin_rgbD_mapping = (data >> 5) & 7;
-		combiner.final.mapin_rgbC_input = (data >> 8) & 15;
-		combiner.final.mapin_rgbC_component = (data >> 12) & 1;
-		combiner.final.mapin_rgbC_mapping = (data >> 13) & 7;
-		combiner.final.mapin_rgbB_input = (data >> 16) & 15;
-		combiner.final.mapin_rgbB_component = (data >> 20) & 1;
-		combiner.final.mapin_rgbB_mapping = (data >> 21) & 7;
-		combiner.final.mapin_rgbA_input = (data >> 24) & 15;
-		combiner.final.mapin_rgbA_component = (data >> 28) & 1;
-		combiner.final.mapin_rgbA_mapping = (data >> 29) & 7;
+		combiner.setup.final.mapin_rgb.D_input = (Combiner::InputRegister)(data & 15);
+		combiner.setup.final.mapin_rgb.D_component = (data >> 4) & 1;
+		combiner.setup.final.mapin_rgb.D_mapping = (Combiner::MapFunction)((data >> 5) & 7);
+		combiner.setup.final.mapin_rgb.C_input = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.final.mapin_rgb.C_component = (data >> 12) & 1;
+		combiner.setup.final.mapin_rgb.C_mapping = (Combiner::MapFunction)((data >> 13) & 7);
+		combiner.setup.final.mapin_rgb.B_input = (Combiner::InputRegister)((data >> 16) & 15);
+		combiner.setup.final.mapin_rgb.B_component = (data >> 20) & 1;
+		combiner.setup.final.mapin_rgb.B_mapping = (Combiner::MapFunction)((data >> 21) & 7);
+		combiner.setup.final.mapin_rgb.A_input = (Combiner::InputRegister)((data >> 24) & 15);
+		combiner.setup.final.mapin_rgb.A_component = (data >> 28) & 1;
+		combiner.setup.final.mapin_rgb.A_mapping = (Combiner::MapFunction)((data >> 29) & 7);
 		countlen--;
 	}
 	if (maddress == 0x028c) {
-		combiner.final.color_sum_clamp = (data >> 7) & 1;
-		combiner.final.mapin_aG_input = (data >> 8) & 15;
-		combiner.final.mapin_aG_component = (data >> 12) & 1;
-		combiner.final.mapin_aG_mapping = (data >> 13) & 7;
-		combiner.final.mapin_rgbF_input = (data >> 16) & 15;
-		combiner.final.mapin_rgbF_component = (data >> 20) & 1;
-		combiner.final.mapin_rgbF_mapping = (data >> 21) & 7;
-		combiner.final.mapin_rgbE_input = (data >> 24) & 15;
-		combiner.final.mapin_rgbE_component = (data >> 28) & 1;
-		combiner.final.mapin_rgbE_mapping = (data >> 29) & 7;
-		countlen--;
-	}
-	if (maddress == 0x1e20) {
-		combiner_argb8_float(data, combiner.final.register_constantcolor0);
-		countlen--;
-	}
-	if (maddress == 0x1e24) {
-		combiner_argb8_float(data, combiner.final.register_constantcolor1);
+		combiner.setup.final.color_sum_clamp = (data >> 7) & 1;
+		combiner.setup.final.mapin_alpha.G_input = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.final.mapin_alpha.G_component = (data >> 12) & 1;
+		combiner.setup.final.mapin_alpha.G_mapping = (Combiner::MapFunction)((data >> 13) & 7);
+		combiner.setup.final.mapin_rgb.F_input = (Combiner::InputRegister)((data >> 16) & 15);
+		combiner.setup.final.mapin_rgb.F_component = (data >> 20) & 1;
+		combiner.setup.final.mapin_rgb.F_mapping = (Combiner::MapFunction)((data >> 21) & 7);
+		combiner.setup.final.mapin_rgb.E_input = (Combiner::InputRegister)((data >> 24) & 15);
+		combiner.setup.final.mapin_rgb.E_component = (data >> 28) & 1;
+		combiner.setup.final.mapin_rgb.E_mapping = (Combiner::MapFunction)((data >> 29) & 7);
 		countlen--;
 	}
 	if ((maddress >= 0x0260) && (maddress < 0x0280)) {
 		int n;
 
 		n = (maddress - 0x0260) >> 2;
-		combiner.stage[n].mapin_aD_input = data & 15;
-		combiner.stage[n].mapin_aD_component = (data >> 4) & 1;
-		combiner.stage[n].mapin_aD_mapping = (data >> 5) & 7;
-		combiner.stage[n].mapin_aC_input = (data >> 8) & 15;
-		combiner.stage[n].mapin_aC_component = (data >> 12) & 1;
-		combiner.stage[n].mapin_aC_mapping = (data >> 13) & 7;
-		combiner.stage[n].mapin_aB_input = (data >> 16) & 15;
-		combiner.stage[n].mapin_aB_component = (data >> 20) & 1;
-		combiner.stage[n].mapin_aB_mapping = (data >> 21) & 7;
-		combiner.stage[n].mapin_aA_input = (data >> 24) & 15;
-		combiner.stage[n].mapin_aA_component = (data >> 28) & 1;
-		combiner.stage[n].mapin_aA_mapping = (data >> 29) & 7;
+		combiner.setup.stage[n].mapin_alpha.D_input = (Combiner::InputRegister)(data & 15);
+		combiner.setup.stage[n].mapin_alpha.D_component = (data >> 4) & 1;
+		combiner.setup.stage[n].mapin_alpha.D_mapping = (Combiner::MapFunction)((data >> 5) & 7);
+		combiner.setup.stage[n].mapin_alpha.C_input = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.stage[n].mapin_alpha.C_component = (data >> 12) & 1;
+		combiner.setup.stage[n].mapin_alpha.C_mapping = (Combiner::MapFunction)((data >> 13) & 7);
+		combiner.setup.stage[n].mapin_alpha.B_input = (Combiner::InputRegister)((data >> 16) & 15);
+		combiner.setup.stage[n].mapin_alpha.B_component = (data >> 20) & 1;
+		combiner.setup.stage[n].mapin_alpha.B_mapping = (Combiner::MapFunction)((data >> 21) & 7);
+		combiner.setup.stage[n].mapin_alpha.A_input = (Combiner::InputRegister)((data >> 24) & 15);
+		combiner.setup.stage[n].mapin_alpha.A_component = (data >> 28) & 1;
+		combiner.setup.stage[n].mapin_alpha.A_mapping = (Combiner::MapFunction)((data >> 29) & 7);
 		countlen--;
 	}
 	if ((maddress >= 0x0ac0) && (maddress < 0x0ae0)) {
 		int n;
 
 		n = (maddress - 0x0ac0) >> 2;
-		combiner.stage[n].mapin_rgbD_input = data & 15;
-		combiner.stage[n].mapin_rgbD_component = (data >> 4) & 1;
-		combiner.stage[n].mapin_rgbD_mapping = (data >> 5) & 7;
-		combiner.stage[n].mapin_rgbC_input = (data >> 8) & 15;
-		combiner.stage[n].mapin_rgbC_component = (data >> 12) & 1;
-		combiner.stage[n].mapin_rgbC_mapping = (data >> 13) & 7;
-		combiner.stage[n].mapin_rgbB_input = (data >> 16) & 15;
-		combiner.stage[n].mapin_rgbB_component = (data >> 20) & 1;
-		combiner.stage[n].mapin_rgbB_mapping = (data >> 21) & 7;
-		combiner.stage[n].mapin_rgbA_input = (data >> 24) & 15;
-		combiner.stage[n].mapin_rgbA_component = (data >> 28) & 1;
-		combiner.stage[n].mapin_rgbA_mapping = (data >> 29) & 7;
+		combiner.setup.stage[n].mapin_rgb.D_input = (Combiner::InputRegister)(data & 15);
+		combiner.setup.stage[n].mapin_rgb.D_component = (data >> 4) & 1;
+		combiner.setup.stage[n].mapin_rgb.D_mapping = (Combiner::MapFunction)((data >> 5) & 7);
+		combiner.setup.stage[n].mapin_rgb.C_input = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.stage[n].mapin_rgb.C_component = (data >> 12) & 1;
+		combiner.setup.stage[n].mapin_rgb.C_mapping = (Combiner::MapFunction)((data >> 13) & 7);
+		combiner.setup.stage[n].mapin_rgb.B_input = (Combiner::InputRegister)((data >> 16) & 15);
+		combiner.setup.stage[n].mapin_rgb.B_component = (data >> 20) & 1;
+		combiner.setup.stage[n].mapin_rgb.B_mapping = (Combiner::MapFunction)((data >> 21) & 7);
+		combiner.setup.stage[n].mapin_rgb.A_input = (Combiner::InputRegister)((data >> 24) & 15);
+		combiner.setup.stage[n].mapin_rgb.A_component = (data >> 28) & 1;
+		combiner.setup.stage[n].mapin_rgb.A_mapping = (Combiner::MapFunction)((data >> 29) & 7);
 		countlen--;
 	}
 	if ((maddress >= 0x0a60) && (maddress < 0x0a80)) {
 		int n;
 
 		n = (maddress - 0x0a60) >> 2;
-		combiner_argb8_float(data, combiner.stage[n].register_constantcolor0);
+		combiner_argb8_float(data, combiner.setup.stage[n].constantcolor0);
 		countlen--;
 	}
 	if ((maddress >= 0x0a80) && (maddress < 0x0aa0)) {
 		int n;
 
 		n = (maddress - 0x0a80) >> 2;
-		combiner_argb8_float(data, combiner.stage[n].register_constantcolor1);
+		combiner_argb8_float(data, combiner.setup.stage[n].constantcolor1);
 		countlen--;
 	}
 	if ((maddress >= 0x0aa0) && (maddress < 0x0ac0)) {
 		int n;
 
 		n = (maddress - 0x0aa0) >> 2;
-		combiner.stage[n].mapout_aCD_output = data & 15;
-		combiner.stage[n].mapout_aAB_output = (data >> 4) & 15;
-		combiner.stage[n].mapout_aSUM_output = (data >> 8) & 15;
-		combiner.stage[n].mapout_aCD_dotproduct = (data >> 12) & 1;
-		combiner.stage[n].mapout_aAB_dotproduct = (data >> 13) & 1;
-		combiner.stage[n].mapout_a_muxsum = (data >> 14) & 1;
-		combiner.stage[n].mapout_a_bias = (data >> 15) & 1;
-		combiner.stage[n].mapout_a_scale = (data >> 16) & 3;
+		combiner.setup.stage[n].mapout_alpha.CD_output = (Combiner::InputRegister)(data & 15);
+		combiner.setup.stage[n].mapout_alpha.AB_output = (Combiner::InputRegister)((data >> 4) & 15);
+		combiner.setup.stage[n].mapout_alpha.SUM_output = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.stage[n].mapout_alpha.CD_dotproduct = (data >> 12) & 1;
+		combiner.setup.stage[n].mapout_alpha.AB_dotproduct = (data >> 13) & 1;
+		combiner.setup.stage[n].mapout_alpha.muxsum = (data >> 14) & 1;
+		combiner.setup.stage[n].mapout_alpha.bias = (data >> 15) & 1;
+		combiner.setup.stage[n].mapout_alpha.scale = (data >> 16) & 3;
 		//combiner.=(data >> 27) & 7;
 		countlen--;
 	}
+	if (maddress == 0x1e20) {
+		combiner_argb8_float(data, combiner.setup.final.constantcolor0);
+		countlen--;
+	}
+	if (maddress == 0x1e24) {
+		combiner_argb8_float(data, combiner.setup.final.constantcolor1);
+		countlen--;
+	}
 	if ((maddress >= 0x1e40) && (maddress < 0x1e60)) {
 		int n;
 
 		n = (maddress - 0x1e40) >> 2;
-		combiner.stage[n].mapout_rgbCD_output = data & 15;
-		combiner.stage[n].mapout_rgbAB_output = (data >> 4) & 15;
-		combiner.stage[n].mapout_rgbSUM_output = (data >> 8) & 15;
-		combiner.stage[n].mapout_rgbCD_dotproduct = (data >> 12) & 1;
-		combiner.stage[n].mapout_rgbAB_dotproduct = (data >> 13) & 1;
-		combiner.stage[n].mapout_rgb_muxsum = (data >> 14) & 1;
-		combiner.stage[n].mapout_rgb_bias = (data >> 15) & 1;
-		combiner.stage[n].mapout_rgb_scale = (data >> 16) & 3;
+		combiner.setup.stage[n].mapout_rgb.CD_output = (Combiner::InputRegister)(data & 15);
+		combiner.setup.stage[n].mapout_rgb.AB_output = (Combiner::InputRegister)((data >> 4) & 15);
+		combiner.setup.stage[n].mapout_rgb.SUM_output = (Combiner::InputRegister)((data >> 8) & 15);
+		combiner.setup.stage[n].mapout_rgb.CD_dotproduct = (data >> 12) & 1;
+		combiner.setup.stage[n].mapout_rgb.AB_dotproduct = (data >> 13) & 1;
+		combiner.setup.stage[n].mapout_rgb.muxsum = (data >> 14) & 1;
+		combiner.setup.stage[n].mapout_rgb.bias = (data >> 15) & 1;
+		combiner.setup.stage[n].mapout_rgb.scale = (data >> 16) & 3;
 		//combiner.=(data >> 27) & 7;
 		countlen--;
 	}
+	if (maddress == 0x1e60) {
+		combiner.setup.stages = data & 15;
+		countlen--;
+	}
 	return 0;
 }
 
@@ -3987,20 +4070,14 @@ int nv2a_renderer::execute_method_blit(address_space &space, uint32_t chanel, ui
 #endif
 		bitblit.op = data;
 	}
-#if 0
 	if (method == 0x0300) {
-		int x, y;
-
-		x = data & 0xffff;
-		y = data >> 16;
+		bitblit.sourcex = data & 0xffff;
+		bitblit.sourcey = data >> 16;
 	}
 	if (method == 0x0304) {
-		int x, y;
-
-		x = data & 0xffff;
-		y = data >> 16;
+		bitblit.destinationx = data & 0xffff;
+		bitblit.destinationy = data >> 16;
 	}
-#endif
 	if (method == 0x0308) {
 		bitblit.width = data & 0xffff;
 		bitblit.heigth = data >> 16;
@@ -4019,8 +4096,8 @@ void nv2a_renderer::surface_2d_blit()
 		machine().logerror("Unsupported format %d in surface_2d_blit\n", bitblit.format);
 		return;
 	}
-	srcrow = (uint32_t *)(basemempointer + bitblit.source_address);
-	destrow = (uint32_t*)(basemempointer + bitblit.destination_address);
+	srcrow = (uint32_t *)direct_access_ptr(bitblit.source_address + bitblit.pitch_source * bitblit.sourcey + bitblit.sourcex * 4);
+	destrow = (uint32_t *)direct_access_ptr(bitblit.destination_address + bitblit.pitch_destination * bitblit.destinationy + bitblit.destinationx * 4);
 	for (y = 0; y < bitblit.heigth; y++) {
 		src = srcrow;
 		dest = destrow;
@@ -4072,9 +4149,9 @@ void nv2a_renderer::debug_grab_vertex_program_slot(int slot, uint32_t *instructi
 
 void nv2a_renderer::combiner_argb8_float(uint32_t color, float reg[4])
 {
-	reg[0] = (float)(color & 0xff) / 255.0f;
+	reg[2] = (float)(color & 0xff) / 255.0f;
 	reg[1] = (float)((color >> 8) & 0xff) / 255.0f;
-	reg[2] = (float)((color >> 16) & 0xff) / 255.0f;
+	reg[0] = (float)((color >> 16) & 0xff) / 255.0f;
 	reg[3] = (float)((color >> 24) & 0xff) / 255.0f;
 }
 
@@ -4083,91 +4160,91 @@ uint32_t nv2a_renderer::combiner_float_argb8(float reg[4])
 	uint32_t r, g, b, a;
 
 	a = reg[3] * 255.0f;
-	r = reg[2] * 255.0f;
+	b = reg[2] * 255.0f;
 	g = reg[1] * 255.0f;
-	b = reg[0] * 255.0f;
+	r = reg[0] * 255.0f;
 	return (a << 24) | (r << 16) | (g << 8) | b;
 }
 
-float nv2a_renderer::combiner_map_input_select(int code, int index)
+float nv2a_renderer::combiner_map_input_select(int id, Combiner::InputRegister code, int index)
 {
-	switch (code) {
+	switch ((int)code) {
 	case 0:
 	default:
-		return combiner.register_zero[index];
+		return combiner.work[id].registers.zero[index];
 	case 1:
-		return combiner.register_color0[index];
+		return combiner.work[id].registers.color0[index];
 	case 2:
-		return combiner.register_color1[index];
+		return combiner.work[id].registers.color1[index];
 	case 3:
-		return combiner.register_fogcolor[index];
+		return combiner.work[id].registers.fogcolor[index];
 	case 4:
-		return combiner.register_primarycolor[index];
+		return combiner.work[id].registers.primarycolor[index];
 	case 5:
-		return combiner.register_secondarycolor[index];
+		return combiner.work[id].registers.secondarycolor[index];
 	case 8:
-		return combiner.register_texture0color[index];
+		return combiner.work[id].registers.texture0color[index];
 	case 9:
-		return combiner.register_texture1color[index];
+		return combiner.work[id].registers.texture1color[index];
 	case 10:
-		return combiner.register_texture2color[index];
+		return combiner.work[id].registers.texture2color[index];
 	case 11:
-		return combiner.register_texture3color[index];
+		return combiner.work[id].registers.texture3color[index];
 	case 12:
-		return combiner.register_spare0[index];
+		return combiner.work[id].registers.spare0[index];
 	case 13:
-		return combiner.register_spare1[index];
+		return combiner.work[id].registers.spare1[index];
 	case 14:
-		return combiner.variable_sumclamp[index];
+		return combiner.work[id].variables.sumclamp[index];
 	case 15:
-		return combiner.variable_EF[index];
+		return combiner.work[id].variables.EF[index];
 	}
 
 	// never executed
 	//return 0;
 }
 
-float *nv2a_renderer::combiner_map_input_select3(int code)
+float *nv2a_renderer::combiner_map_input_select_array(int id, Combiner::InputRegister code)
 {
-	switch (code) {
+	switch ((int)code) {
 	case 0:
 	default:
-		return combiner.register_zero;
+		return combiner.work[id].registers.zero;
 	case 1:
-		return combiner.register_color0;
+		return combiner.work[id].registers.color0;
 	case 2:
-		return combiner.register_color1;
+		return combiner.work[id].registers.color1;
 	case 3:
-		return combiner.register_fogcolor;
+		return combiner.work[id].registers.fogcolor;
 	case 4:
-		return combiner.register_primarycolor;
+		return combiner.work[id].registers.primarycolor;
 	case 5:
-		return combiner.register_secondarycolor;
+		return combiner.work[id].registers.secondarycolor;
 	case 8:
-		return combiner.register_texture0color;
+		return combiner.work[id].registers.texture0color;
 	case 9:
-		return combiner.register_texture1color;
+		return combiner.work[id].registers.texture1color;
 	case 10:
-		return combiner.register_texture2color;
+		return combiner.work[id].registers.texture2color;
 	case 11:
-		return combiner.register_texture3color;
+		return combiner.work[id].registers.texture3color;
 	case 12:
-		return combiner.register_spare0;
+		return combiner.work[id].registers.spare0;
 	case 13:
-		return combiner.register_spare1;
+		return combiner.work[id].registers.spare1;
 	case 14:
-		return combiner.variable_sumclamp;
+		return combiner.work[id].variables.sumclamp;
 	case 15:
-		return combiner.variable_EF;
+		return combiner.work[id].variables.EF;
 	}
 
 	// never executed
 	//return 0;
 }
 
-float *nv2a_renderer::combiner_map_output_select3(int code)
+float *nv2a_renderer::combiner_map_output_select_array(int id, Combiner::InputRegister code)
 {
-	switch (code) {
+	switch ((int)code) {
 	case 0:
 		return nullptr;
 	case 1:
@@ -4177,21 +4254,21 @@ float *nv2a_renderer::combiner_map_output_select3(int code)
 	case 3:
 		return nullptr;
 	case 4:
-		return combiner.register_primarycolor;
+		return combiner.work[id].registers.primarycolor;
 	case 5:
-		return combiner.register_secondarycolor;
+		return combiner.work[id].registers.secondarycolor;
 	case 8:
-		return combiner.register_texture0color;
+		return combiner.work[id].registers.texture0color;
 	case 9:
-		return combiner.register_texture1color;
+		return combiner.work[id].registers.texture1color;
 	case 10:
-		return combiner.register_texture2color;
+		return combiner.work[id].registers.texture2color;
 	case 11:
-		return combiner.register_texture3color;
+		return combiner.work[id].registers.texture3color;
 	case 12:
-		return combiner.register_spare0;
+		return combiner.work[id].registers.spare0;
 	case 13:
-		return combiner.register_spare1;
+		return combiner.work[id].registers.spare1;
 	case 14:
 		return nullptr;
 	case 15:
@@ -4200,27 +4277,24 @@ float *nv2a_renderer::combiner_map_output_select3(int code)
 	}
 }
 
-float nv2a_renderer::combiner_map_input_function(int code, float value)
+float nv2a_renderer::combiner_map_input_function(Combiner::MapFunction code, float value)
 {
-	float t;
-
-	switch (code) {
-	case 0:
+	switch ((int)code) {
+	case 0: // unsigned identity
 		return std::max(0.0f, value);
-	case 1:
-		t = std::max(value, 0.0f);
-		return 1.0f - std::min(t, 1.0f);
-	case 2:
+	case 1: // unsigned invert
+		return 1.0f - std::min(std::max(value, 0.0f), 1.0f);
+	case 2: // expand normal
 		return 2.0f * std::max(0.0f, value) - 1.0f;
-	case 3:
+	case 3: // expand negate
 		return -2.0f * std::max(0.0f, value) + 1.0f;
-	case 4:
+	case 4: // half bias normal
 		return std::max(0.0f, value) - 0.5f;
-	case 5:
+	case 5: // half bias negate
 		return -std::max(0.0f, value) + 0.5f;
-	case 6:
+	case 6: // signed identyty
 		return value;
-	case 7:
+	case 7: // signed negate
 	default:
 		return -value;
 	}
@@ -4229,23 +4303,18 @@ float nv2a_renderer::combiner_map_input_function(int code, float value)
 	//return 0;
 }
 
-void nv2a_renderer::combiner_map_input_function3(int code, float *data)
+void nv2a_renderer::combiner_map_input_function_array(Combiner::MapFunction code, float *data)
 {
-	float t;
-
-	switch (code) {
+	switch ((int)code) {
 	case 0:
 		data[0] = std::max(0.0f, data[0]);
 		data[1] = std::max(0.0f, data[1]);
 		data[2] = std::max(0.0f, data[2]);
 		break;
 	case 1:
-		t = std::max(data[0], 0.0f);
-		data[0] = 1.0f - std::min(t, 1.0f);
-		t = std::max(data[1], 0.0f);
-		data[1] = 1.0f - std::min(t, 1.0f);
-		t = std::max(data[2], 0.0f);
-		data[2] = 1.0f - std::min(t, 1.0f);
+		data[0] = 1.0f - std::min(std::max(data[0], 0.0f), 1.0f);
+		data[1] = 1.0f - std::min(std::max(data[1], 0.0f), 1.0f);
+		data[2] = 1.0f - std::min(std::max(data[2], 0.0f), 1.0f);
 		break;
 	case 2:
 		data[0] = 2.0f * std::max(0.0f, data[0]) - 1.0f;
@@ -4278,368 +4347,386 @@ void nv2a_renderer::combiner_map_input_function3(int code, float *data)
 	}
 }
 
-void nv2a_renderer::combiner_initialize_registers(uint32_t argb8[6])
+void nv2a_renderer::combiner_initialize_registers(int id, float rgba[6][4])
 {
-	combiner_argb8_float(argb8[0], combiner.register_primarycolor);
-	combiner_argb8_float(argb8[1], combiner.register_secondarycolor);
-	combiner_argb8_float(argb8[2], combiner.register_texture0color);
-	combiner_argb8_float(argb8[3], combiner.register_texture1color);
-	combiner_argb8_float(argb8[4], combiner.register_texture2color);
-	combiner_argb8_float(argb8[5], combiner.register_texture3color);
-	combiner.register_spare0[3] = combiner.register_texture0color[3];
-	combiner.register_zero[0] = combiner.register_zero[1] = combiner.register_zero[2] = combiner.register_zero[3] = 0;
+	for (int n = 0; n < 4; n++) {
+		combiner.work[id].registers.primarycolor[n] = rgba[0][n];
+		combiner.work[id].registers.secondarycolor[n] = rgba[1][n];
+		combiner.work[id].registers.texture0color[n] = rgba[2][n];
+		combiner.work[id].registers.texture1color[n] = rgba[3][n];
+		combiner.work[id].registers.texture2color[n] = rgba[4][n];
+		combiner.work[id].registers.texture3color[n] = rgba[5][n];
+		combiner.work[id].registers.fogcolor[n] = rgba[6][n];
+	}
+	combiner.work[id].registers.spare0[3] = combiner.work[id].registers.texture0color[3]; // alpha of spare 0 must be the alpha of the pixel from texture 0
+	combiner.work[id].registers.zero[0] = combiner.work[id].registers.zero[1] = combiner.work[id].registers.zero[2] = combiner.work[id].registers.zero[3] = 0;
 }
 
-void nv2a_renderer::combiner_initialize_stage(int stage_number)
+void nv2a_renderer::combiner_initialize_stage(int id, int stage_number)
 {
 	int n = stage_number;
 
 	// put register_constantcolor0 in register_color0
-	combiner.register_color0[0] = combiner.stage[n].register_constantcolor0[0];
-	combiner.register_color0[1] = combiner.stage[n].register_constantcolor0[1];
-	combiner.register_color0[2] = combiner.stage[n].register_constantcolor0[2];
-	combiner.register_color0[3] = combiner.stage[n].register_constantcolor0[3];
+	combiner.work[id].registers.color0[0] = combiner.setup.stage[n].constantcolor0[0];
+	combiner.work[id].registers.color0[1] = combiner.setup.stage[n].constantcolor0[1];
+	combiner.work[id].registers.color0[2] = combiner.setup.stage[n].constantcolor0[2];
+	combiner.work[id].registers.color0[3] = combiner.setup.stage[n].constantcolor0[3];
 	// put register_constantcolor1 in register_color1
-	combiner.register_color1[0] = combiner.stage[n].register_constantcolor1[0];
-	combiner.register_color1[1] = combiner.stage[n].register_constantcolor1[1];
-	combiner.register_color1[2] = combiner.stage[n].register_constantcolor1[2];
-	combiner.register_color1[3] = combiner.stage[n].register_constantcolor1[3];
+	combiner.work[id].registers.color1[0] = combiner.setup.stage[n].constantcolor1[0];
+	combiner.work[id].registers.color1[1] = combiner.setup.stage[n].constantcolor1[1];
+	combiner.work[id].registers.color1[2] = combiner.setup.stage[n].constantcolor1[2];
+	combiner.work[id].registers.color1[3] = combiner.setup.stage[n].constantcolor1[3];
 }
 
-void nv2a_renderer::combiner_initialize_final()
+void nv2a_renderer::combiner_initialize_final(int id)
 {
 	// put register_constantcolor0 in register_color0
-	combiner.register_color0[0] = combiner.final.register_constantcolor0[0];
-	combiner.register_color0[1] = combiner.final.register_constantcolor0[1];
-	combiner.register_color0[2] = combiner.final.register_constantcolor0[2];
-	combiner.register_color0[3] = combiner.final.register_constantcolor0[3];
+	combiner.work[id].registers.color0[0] = combiner.setup.final.constantcolor0[0];
+	combiner.work[id].registers.color0[1] = combiner.setup.final.constantcolor0[1];
+	combiner.work[id].registers.color0[2] = combiner.setup.final.constantcolor0[2];
+	combiner.work[id].registers.color0[3] = combiner.setup.final.constantcolor0[3];
 	// put register_constantcolor1 in register_color1
-	combiner.register_color1[0] = combiner.final.register_constantcolor1[0];
-	combiner.register_color1[1] = combiner.final.register_constantcolor1[1];
-	combiner.register_color1[2] = combiner.final.register_constantcolor1[2];
-	combiner.register_color1[3] = combiner.final.register_constantcolor1[3];
+	combiner.work[id].registers.color1[0] = combiner.setup.final.constantcolor1[0];
+	combiner.work[id].registers.color1[1] = combiner.setup.final.constantcolor1[1];
+	combiner.work[id].registers.color1[2] = combiner.setup.final.constantcolor1[2];
+	combiner.work[id].registers.color1[3] = combiner.setup.final.constantcolor1[3];
 }
 
-void nv2a_renderer::combiner_map_input(int stage_number)
+void nv2a_renderer::combiner_map_stage_input(int id, int stage_number)
 {
 	int n = stage_number;
 	int c, d, i;
 	float v, *pv;
 
+	// rgb portion
 	// A
-	v = combiner_map_input_select(combiner.stage[n].mapin_aA_input, 2 + combiner.stage[n].mapin_aA_component);
-	combiner.variable_A[3] = combiner_map_input_function(combiner.stage[n].mapin_aA_mapping, v);
+	// get pointer to rgb components of selected input register
+	pv = combiner_map_input_select_array(id, combiner.setup.stage[n].mapin_rgb.A_input);
+	c = combiner.setup.stage[n].mapin_rgb.A_component * 3;
+	i = combiner.setup.stage[n].mapin_rgb.A_component ^ 1;
+	// copy components to A
+	for (d = 0; d < 3; d++) {
+		combiner.work[id].variables.A[d] = pv[c];
+		c += i;
+	}
+	// apply mapping function
+	combiner_map_input_function_array(combiner.setup.stage[n].mapin_rgb.A_mapping, combiner.work[id].variables.A);
 	// B
-	v = combiner_map_input_select(combiner.stage[n].mapin_aB_input, 2 + combiner.stage[n].mapin_aB_component);
-	combiner.variable_B[3] = combiner_map_input_function(combiner.stage[n].mapin_aB_mapping, v);
+	pv = combiner_map_input_select_array(id, combiner.setup.stage[n].mapin_rgb.B_input);
+	c = combiner.setup.stage[n].mapin_rgb.B_component * 3;
+	i = combiner.setup.stage[n].mapin_rgb.B_component ^ 1;
+	for (d = 0; d < 3; d++) {
+		combiner.work[id].variables.B[d] = pv[c];
+		c += i;
+	}
+	combiner_map_input_function_array(combiner.setup.stage[n].mapin_rgb.B_mapping, combiner.work[id].variables.B);
 	// C
-	v = combiner_map_input_select(combiner.stage[n].mapin_aC_input, 2 + combiner.stage[n].mapin_aC_component);
-	combiner.variable_C[3] = combiner_map_input_function(combiner.stage[n].mapin_aC_mapping, v);
+	pv = combiner_map_input_select_array(id, combiner.setup.stage[n].mapin_rgb.C_input);
+	c = combiner.setup.stage[n].mapin_rgb.C_component * 3;
+	i = combiner.setup.stage[n].mapin_rgb.C_component ^ 1;
+	for (d = 0; d < 3; d++) {
+		combiner.work[id].variables.C[d] = pv[c];
+		c += i;
+	}
+	combiner_map_input_function_array(combiner.setup.stage[n].mapin_rgb.C_mapping, combiner.work[id].variables.C);
 	// D
-	v = combiner_map_input_select(combiner.stage[n].mapin_aD_input, 2 + combiner.stage[n].mapin_aD_component);
-	combiner.variable_D[3] = combiner_map_input_function(combiner.stage[n].mapin_aD_mapping, v);
+	pv = combiner_map_input_select_array(id, combiner.setup.stage[n].mapin_rgb.D_input);
+	c = combiner.setup.stage[n].mapin_rgb.D_component * 3;
+	i = combiner.setup.stage[n].mapin_rgb.D_component ^ 1;
+	for (d = 0; d < 3; d++) {
+		combiner.work[id].variables.D[d] = pv[c];
+		c += i;
+	}
+	combiner_map_input_function_array(combiner.setup.stage[n].mapin_rgb.D_mapping, combiner.work[id].variables.D);
 
+	// alpha portion
 	// A
-	pv = combiner_map_input_select3(combiner.stage[n].mapin_rgbA_input);
-	c = combiner.stage[n].mapin_rgbA_component * 3;
-	i = ~combiner.stage[n].mapin_rgbA_component & 1;
-	for (d = 0; d < 3; d++) {
-		combiner.variable_A[d] = pv[c];
-		c = c + i;
-	}
-	combiner_map_input_function3(combiner.stage[n].mapin_rgbA_mapping, combiner.variable_A);
+	// get component (blue or alpha) from selected input
+	v = combiner_map_input_select(id, combiner.setup.stage[n].mapin_alpha.A_input, 2 + combiner.setup.stage[n].mapin_alpha.A_component);
+	// copy component to A
+	combiner.work[id].variables.A[3] = combiner_map_input_function(combiner.setup.stage[n].mapin_alpha.A_mapping, v);
 	// B
-	pv = combiner_map_input_select3(combiner.stage[n].mapin_rgbB_input);
-	c = combiner.stage[n].mapin_rgbB_component * 3;
-	i = ~combiner.stage[n].mapin_rgbB_component & 1;
-	for (d = 0; d < 3; d++) {
-		combiner.variable_B[d] = pv[c];
-		c = c + i;
-	}
-	combiner_map_input_function3(combiner.stage[n].mapin_rgbB_mapping, combiner.variable_B);
+	v = combiner_map_input_select(id, combiner.setup.stage[n].mapin_alpha.B_input, 2 + combiner.setup.stage[n].mapin_alpha.B_component);
+	combiner.work[id].variables.B[3] = combiner_map_input_function(combiner.setup.stage[n].mapin_alpha.B_mapping, v);
 	// C
-	pv = combiner_map_input_select3(combiner.stage[n].mapin_rgbC_input);
-	c = combiner.stage[n].mapin_rgbC_component * 3;
-	i = ~combiner.stage[n].mapin_rgbC_component & 1;
-	for (d = 0; d < 3; d++) {
-		combiner.variable_C[d] = pv[c];
-		c = c + i;
-	}
-	combiner_map_input_function3(combiner.stage[n].mapin_rgbC_mapping, combiner.variable_C);
+	v = combiner_map_input_select(id, combiner.setup.stage[n].mapin_alpha.C_input, 2 + combiner.setup.stage[n].mapin_alpha.C_component);
+	combiner.work[id].variables.C[3] = combiner_map_input_function(combiner.setup.stage[n].mapin_alpha.C_mapping, v);
 	// D
-	pv = combiner_map_input_select3(combiner.stage[n].mapin_rgbD_input);
-	c = combiner.stage[n].mapin_rgbD_component * 3;
-	i = ~combiner.stage[n].mapin_rgbD_component & 1;
-	for (d = 0; d < 3; d++) {
-		combiner.variable_D[d] = pv[c];
-		c = c + i;
-	}
-	combiner_map_input_function3(combiner.stage[n].mapin_rgbD_mapping, combiner.variable_D);
+	v = combiner_map_input_select(id, combiner.setup.stage[n].mapin_alpha.D_input, 2 + combiner.setup.stage[n].mapin_alpha.D_component);
+	combiner.work[id].variables.D[3] = combiner_map_input_function(combiner.setup.stage[n].mapin_alpha.D_mapping, v);
 }
 
-void nv2a_renderer::combiner_map_output(int stage_number)
+void nv2a_renderer::combiner_map_stage_output(int id, int stage_number)
 {
 	int n = stage_number;
 	float *f;
 
 	// rgb
-	f = combiner_map_output_select3(combiner.stage[n].mapout_rgbAB_output);
+	f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_rgb.AB_output);
 	if (f) {
-		f[0] = combiner.function_RGBop1[0];
-		f[1] = combiner.function_RGBop1[1];
-		f[2] = combiner.function_RGBop1[2];
+		f[0] = combiner.work[id].functions.RGBop1[0];
+		f[1] = combiner.work[id].functions.RGBop1[1];
+		f[2] = combiner.work[id].functions.RGBop1[2];
 	}
-	f = combiner_map_output_select3(combiner.stage[n].mapout_rgbCD_output);
+	f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_rgb.CD_output);
 	if (f) {
-		f[0] = combiner.function_RGBop2[0];
-		f[1] = combiner.function_RGBop2[1];
-		f[2] = combiner.function_RGBop2[2];
+		f[0] = combiner.work[id].functions.RGBop2[0];
+		f[1] = combiner.work[id].functions.RGBop2[1];
+		f[2] = combiner.work[id].functions.RGBop2[2];
 	}
-	if ((combiner.stage[n].mapout_rgbAB_dotproduct | combiner.stage[n].mapout_rgbCD_dotproduct) == 0) {
-		f = combiner_map_output_select3(combiner.stage[n].mapout_rgbSUM_output);
+	if ((combiner.setup.stage[n].mapout_rgb.AB_dotproduct | combiner.setup.stage[n].mapout_rgb.CD_dotproduct) == 0) {
+		f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_rgb.SUM_output);
 		if (f) {
-			f[0] = combiner.function_RGBop3[0];
-			f[1] = combiner.function_RGBop3[1];
-			f[2] = combiner.function_RGBop3[2];
+			f[0] = combiner.work[id].functions.RGBop3[0];
+			f[1] = combiner.work[id].functions.RGBop3[1];
+			f[2] = combiner.work[id].functions.RGBop3[2];
 		}
 	}
-	// a
-	f = combiner_map_output_select3(combiner.stage[n].mapout_aAB_output);
+	// alpha
+	f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_alpha.AB_output);
 	if (f)
-		f[3] = combiner.function_Aop1;
-	f = combiner_map_output_select3(combiner.stage[n].mapout_aCD_output);
+		f[3] = combiner.work[id].functions.Aop1;
+	f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_alpha.CD_output);
 	if (f)
-		f[3] = combiner.function_Aop2;
-	f = combiner_map_output_select3(combiner.stage[n].mapout_aSUM_output);
+		f[3] = combiner.work[id].functions.Aop2;
+	f = combiner_map_output_select_array(id, combiner.setup.stage[n].mapout_alpha.SUM_output);
 	if (f)
-		f[3] = combiner.function_Aop3;
+		f[3] = combiner.work[id].functions.Aop3;
 }
 
-void nv2a_renderer::combiner_map_final_input()
+void nv2a_renderer::combiner_map_final_input(int id)
 {
-	int i, c, d;
+	int c, d, i;
 	float *pv;
 
 	// E
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbE_input);
-	c = combiner.final.mapin_rgbE_component * 3;
-	i = ~combiner.final.mapin_rgbE_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.E_input);
+	c = combiner.setup.final.mapin_rgb.E_component * 3;
+	i = combiner.setup.final.mapin_rgb.E_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_E[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.E[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbE_mapping, combiner.variable_E);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.E_mapping, combiner.work[id].variables.E);
 	// F
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbF_input);
-	c = combiner.final.mapin_rgbF_component * 3;
-	i = ~combiner.final.mapin_rgbF_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.F_input);
+	c = combiner.setup.final.mapin_rgb.F_component * 3;
+	i = combiner.setup.final.mapin_rgb.F_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_F[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.F[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbF_mapping, combiner.variable_F);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.F_mapping, combiner.work[id].variables.F);
 	// EF
-	combiner.variable_EF[0] = combiner.variable_E[0] * combiner.variable_F[0];
-	combiner.variable_EF[1] = combiner.variable_E[1] * combiner.variable_F[1];
-	combiner.variable_EF[2] = combiner.variable_E[2] * combiner.variable_F[2];
+	combiner.work[id].variables.EF[0] = combiner.work[id].variables.E[0] * combiner.work[id].variables.F[0];
+	combiner.work[id].variables.EF[1] = combiner.work[id].variables.E[1] * combiner.work[id].variables.F[1];
+	combiner.work[id].variables.EF[2] = combiner.work[id].variables.E[2] * combiner.work[id].variables.F[2];
 	// sumclamp
-	combiner.variable_sumclamp[0] = std::max(0.0f, combiner.register_spare0[0]) + std::max(0.0f, combiner.register_secondarycolor[0]);
-	combiner.variable_sumclamp[1] = std::max(0.0f, combiner.register_spare0[1]) + std::max(0.0f, combiner.register_secondarycolor[1]);
-	combiner.variable_sumclamp[2] = std::max(0.0f, combiner.register_spare0[2]) + std::max(0.0f, combiner.register_secondarycolor[2]);
-	if (combiner.final.color_sum_clamp != 0) {
-		combiner.variable_sumclamp[0] = std::min(combiner.variable_sumclamp[0], 1.0f);
-		combiner.variable_sumclamp[1] = std::min(combiner.variable_sumclamp[1], 1.0f);
-		combiner.variable_sumclamp[2] = std::min(combiner.variable_sumclamp[2], 1.0f);
+	combiner.work[id].variables.sumclamp[0] = std::max(0.0f, combiner.work[id].registers.spare0[0]) + std::max(0.0f, combiner.work[id].registers.secondarycolor[0]);
+	combiner.work[id].variables.sumclamp[1] = std::max(0.0f, combiner.work[id].registers.spare0[1]) + std::max(0.0f, combiner.work[id].registers.secondarycolor[1]);
+	combiner.work[id].variables.sumclamp[2] = std::max(0.0f, combiner.work[id].registers.spare0[2]) + std::max(0.0f, combiner.work[id].registers.secondarycolor[2]);
+	if (combiner.setup.final.color_sum_clamp != 0) {
+		combiner.work[id].variables.sumclamp[0] = std::min(combiner.work[id].variables.sumclamp[0], 1.0f);
+		combiner.work[id].variables.sumclamp[1] = std::min(combiner.work[id].variables.sumclamp[1], 1.0f);
+		combiner.work[id].variables.sumclamp[2] = std::min(combiner.work[id].variables.sumclamp[2], 1.0f);
 	}
 	// A
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbA_input);
-	c = combiner.final.mapin_rgbA_component * 3;
-	i = ~combiner.final.mapin_rgbA_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.A_input);
+	c = combiner.setup.final.mapin_rgb.A_component * 3;
+	i = combiner.setup.final.mapin_rgb.A_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_A[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.A[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbA_mapping, combiner.variable_A);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.A_mapping, combiner.work[id].variables.A);
 	// B
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbB_input);
-	c = combiner.final.mapin_rgbB_component * 3;
-	i = ~combiner.final.mapin_rgbB_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.B_input);
+	c = combiner.setup.final.mapin_rgb.B_component * 3;
+	i = combiner.setup.final.mapin_rgb.B_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_B[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.B[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbB_mapping, combiner.variable_B);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.B_mapping, combiner.work[id].variables.B);
 	// C
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbC_input);
-	c = combiner.final.mapin_rgbC_component * 3;
-	i = ~combiner.final.mapin_rgbC_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.C_input);
+	c = combiner.setup.final.mapin_rgb.C_component * 3;
+	i = combiner.setup.final.mapin_rgb.C_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_C[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.C[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbC_mapping, combiner.variable_C);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.C_mapping, combiner.work[id].variables.C);
 	// D
-	pv = combiner_map_input_select3(combiner.final.mapin_rgbD_input);
-	c = combiner.final.mapin_rgbD_component * 3;
-	i = ~combiner.final.mapin_rgbD_component & 1;
+	pv = combiner_map_input_select_array(id, combiner.setup.final.mapin_rgb.D_input);
+	c = combiner.setup.final.mapin_rgb.D_component * 3;
+	i = combiner.setup.final.mapin_rgb.D_component ^ 1;
 	for (d = 0; d < 3; d++) {
-		combiner.variable_D[d] = pv[c];
-		c = c + i;
+		combiner.work[id].variables.D[d] = pv[c];
+		c += i;
 	}
-	combiner_map_input_function3(combiner.final.mapin_rgbD_mapping, combiner.variable_D);
+	combiner_map_input_function_array(combiner.setup.final.mapin_rgb.D_mapping, combiner.work[id].variables.D);
 	// G
-	combiner.variable_G = combiner_map_input_select(combiner.final.mapin_aG_input, 2 + combiner.final.mapin_aG_component);
+	combiner.work[id].variables.G = combiner_map_input_select(id, combiner.setup.final.mapin_alpha.G_input, 2 + combiner.setup.final.mapin_alpha.G_component);
 }
 
-void nv2a_renderer::combiner_final_output()
+void nv2a_renderer::combiner_final_output(int id)
 {
 	// rgb
-	combiner.output[0] = combiner.variable_A[0] * combiner.variable_B[0] + (1.0f - combiner.variable_A[0])*combiner.variable_C[0] + combiner.variable_D[0];
-	combiner.output[1] = combiner.variable_A[1] * combiner.variable_B[1] + (1.0f - combiner.variable_A[1])*combiner.variable_C[1] + combiner.variable_D[1];
-	combiner.output[2] = combiner.variable_A[2] * combiner.variable_B[2] + (1.0f - combiner.variable_A[2])*combiner.variable_C[2] + combiner.variable_D[2];
-	combiner.output[0] = std::min(combiner.output[0], 1.0f);
-	combiner.output[1] = std::min(combiner.output[1], 1.0f);
-	combiner.output[2] = std::min(combiner.output[2], 1.0f);
+	combiner.work[id].output[0] = combiner.work[id].variables.A[0] * combiner.work[id].variables.B[0] + (1.0f - combiner.work[id].variables.A[0])*combiner.work[id].variables.C[0] + combiner.work[id].variables.D[0];
+	combiner.work[id].output[1] = combiner.work[id].variables.A[1] * combiner.work[id].variables.B[1] + (1.0f - combiner.work[id].variables.A[1])*combiner.work[id].variables.C[1] + combiner.work[id].variables.D[1];
+	combiner.work[id].output[2] = combiner.work[id].variables.A[2] * combiner.work[id].variables.B[2] + (1.0f - combiner.work[id].variables.A[2])*combiner.work[id].variables.C[2] + combiner.work[id].variables.D[2];
+	combiner.work[id].output[0] = std::min(combiner.work[id].output[0], 2.0f);
+	combiner.work[id].output[1] = std::min(combiner.work[id].output[1], 2.0f);
+	combiner.work[id].output[2] = std::min(combiner.work[id].output[2], 2.0f);
 	// a
-	combiner.output[3] = combiner_map_input_function(combiner.final.mapin_aG_mapping, combiner.variable_G);
+	combiner.work[id].output[3] = combiner_map_input_function(combiner.setup.final.mapin_alpha.G_mapping, combiner.work[id].variables.G);
 }
 
-void nv2a_renderer::combiner_function_AB(float result[4])
+void nv2a_renderer::combiner_function_AB(int id, float result[4])
 {
-	result[0] = combiner.variable_A[0] * combiner.variable_B[0];
-	result[1] = combiner.variable_A[1] * combiner.variable_B[1];
-	result[2] = combiner.variable_A[2] * combiner.variable_B[2];
+	result[0] = combiner.work[id].variables.A[0] * combiner.work[id].variables.B[0];
+	result[1] = combiner.work[id].variables.A[1] * combiner.work[id].variables.B[1];
+	result[2] = combiner.work[id].variables.A[2] * combiner.work[id].variables.B[2];
 }
 
-void nv2a_renderer::combiner_function_AdotB(float result[4])
+void nv2a_renderer::combiner_function_AdotB(int id, float result[4])
 {
-	result[0] = combiner.variable_A[0] * combiner.variable_B[0] + combiner.variable_A[1] * combiner.variable_B[1] + combiner.variable_A[2] * combiner.variable_B[2];
+	result[0] = combiner.work[id].variables.A[0] * combiner.work[id].variables.B[0] + combiner.work[id].variables.A[1] * combiner.work[id].variables.B[1] + combiner.work[id].variables.A[2] * combiner.work[id].variables.B[2];
 	result[1] = result[0];
 	result[2] = result[0];
 }
 
-void nv2a_renderer::combiner_function_CD(float result[4])
+void nv2a_renderer::combiner_function_CD(int id, float result[4])
 {
-	result[0] = combiner.variable_C[0] * combiner.variable_D[0];
-	result[1] = combiner.variable_C[1] * combiner.variable_D[1];
-	result[2] = combiner.variable_C[2] * combiner.variable_D[2];
+	result[0] = combiner.work[id].variables.C[0] * combiner.work[id].variables.D[0];
+	result[1] = combiner.work[id].variables.C[1] * combiner.work[id].variables.D[1];
+	result[2] = combiner.work[id].variables.C[2] * combiner.work[id].variables.D[2];
 }
 
-void nv2a_renderer::combiner_function_CdotD(float result[4])
+void nv2a_renderer::combiner_function_CdotD(int id, float result[4])
 {
-	result[0] = combiner.variable_C[0] * combiner.variable_D[0] + combiner.variable_C[1] * combiner.variable_D[1] + combiner.variable_C[2] * combiner.variable_D[2];
+	result[0] = combiner.work[id].variables.C[0] * combiner.work[id].variables.D[0] + combiner.work[id].variables.C[1] * combiner.work[id].variables.D[1] + combiner.work[id].variables.C[2] * combiner.work[id].variables.D[2];
 	result[1] = result[0];
 	result[2] = result[0];
 }
 
-void nv2a_renderer::combiner_function_ABmuxCD(float result[4])
+void nv2a_renderer::combiner_function_ABmuxCD(int id, float result[4])
 {
-	if (combiner.register_spare0[3] >= 0.5f)
-		combiner_function_AB(result);
+	if (combiner.work[id].registers.spare0[3] >= 0.5f)
+		combiner_function_AB(id, result);
 	else
-		combiner_function_CD(result);
+		combiner_function_CD(id, result);
 }
 
-void nv2a_renderer::combiner_function_ABsumCD(float result[4])
+void nv2a_renderer::combiner_function_ABsumCD(int id, float result[4])
 {
-	result[0] = combiner.variable_A[0] * combiner.variable_B[0] + combiner.variable_C[0] * combiner.variable_D[0];
-	result[1] = combiner.variable_A[1] * combiner.variable_B[1] + combiner.variable_C[1] * combiner.variable_D[1];
-	result[2] = combiner.variable_A[2] * combiner.variable_B[2] + combiner.variable_C[2] * combiner.variable_D[2];
+	result[0] = combiner.work[id].variables.A[0] * combiner.work[id].variables.B[0] + combiner.work[id].variables.C[0] * combiner.work[id].variables.D[0];
+	result[1] = combiner.work[id].variables.A[1] * combiner.work[id].variables.B[1] + combiner.work[id].variables.C[1] * combiner.work[id].variables.D[1];
+	result[2] = combiner.work[id].variables.A[2] * combiner.work[id].variables.B[2] + combiner.work[id].variables.C[2] * combiner.work[id].variables.D[2];
 }
 
-void nv2a_renderer::combiner_compute_rgb_outputs(int stage_number)
+void nv2a_renderer::combiner_compute_rgb_outputs(int id, int stage_number)
 {
 	int n = stage_number;
 	int m;
-	float biasrgb, scalergb;
+	float bias, scale;
 
-	if (combiner.stage[n].mapout_rgb_bias)
-		biasrgb = -0.5;
+	// select bias and scale
+	if (combiner.setup.stage[n].mapout_rgb.bias)
+		bias = -0.5;
 	else
-		biasrgb = 0;
-	switch (combiner.stage[n].mapout_rgb_scale) {
+		bias = 0;
+	switch (combiner.setup.stage[n].mapout_rgb.scale) {
 	case 0:
 	default:
-		scalergb = 1.0;
+		scale = 1.0;
 		break;
 	case 1:
-		scalergb = 2.0;
+		scale = 2.0;
 		break;
 	case 2:
-		scalergb = 4.0;
+		scale = 4.0;
 		break;
 	case 3:
-		scalergb = 0.5;
+		scale = 0.5;
 		break;
 	}
-	if (combiner.stage[n].mapout_rgbAB_dotproduct) {
+	// first
+	if (combiner.setup.stage[n].mapout_rgb.AB_dotproduct) {
 		m = 1;
-		combiner_function_AdotB(combiner.function_RGBop1);
+		combiner_function_AdotB(id, combiner.work[id].functions.RGBop1);
 	}
 	else {
 		m = 0;
-		combiner_function_AB(combiner.function_RGBop1);
+		combiner_function_AB(id, combiner.work[id].functions.RGBop1);
 	}
-	combiner.function_RGBop1[0] = std::max(std::min((combiner.function_RGBop1[0] + biasrgb) * scalergb, 1.0f), -1.0f);
-	combiner.function_RGBop1[1] = std::max(std::min((combiner.function_RGBop1[1] + biasrgb) * scalergb, 1.0f), -1.0f);
-	combiner.function_RGBop1[2] = std::max(std::min((combiner.function_RGBop1[2] + biasrgb) * scalergb, 1.0f), -1.0f);
-	if (combiner.stage[n].mapout_rgbCD_dotproduct) {
+	combiner.work[id].functions.RGBop1[0] = std::max(std::min((combiner.work[id].functions.RGBop1[0] + bias) * scale, 1.0f), -1.0f);
+	combiner.work[id].functions.RGBop1[1] = std::max(std::min((combiner.work[id].functions.RGBop1[1] + bias) * scale, 1.0f), -1.0f);
+	combiner.work[id].functions.RGBop1[2] = std::max(std::min((combiner.work[id].functions.RGBop1[2] + bias) * scale, 1.0f), -1.0f);
+	// second
+	if (combiner.setup.stage[n].mapout_rgb.CD_dotproduct) {
 		m = m | 1;
-		combiner_function_CdotD(combiner.function_RGBop2);
+		combiner_function_CdotD(id, combiner.work[id].functions.RGBop2);
 	}
 	else
-		combiner_function_CD(combiner.function_RGBop2);
-	combiner.function_RGBop2[0] = std::max(std::min((combiner.function_RGBop2[0] + biasrgb) * scalergb, 1.0f), -1.0f);
-	combiner.function_RGBop2[1] = std::max(std::min((combiner.function_RGBop2[1] + biasrgb) * scalergb, 1.0f), -1.0f);
-	combiner.function_RGBop2[2] = std::max(std::min((combiner.function_RGBop2[2] + biasrgb) * scalergb, 1.0f), -1.0f);
+		combiner_function_CD(id, combiner.work[id].functions.RGBop2);
+	combiner.work[id].functions.RGBop2[0] = std::max(std::min((combiner.work[id].functions.RGBop2[0] + bias) * scale, 1.0f), -1.0f);
+	combiner.work[id].functions.RGBop2[1] = std::max(std::min((combiner.work[id].functions.RGBop2[1] + bias) * scale, 1.0f), -1.0f);
+	combiner.work[id].functions.RGBop2[2] = std::max(std::min((combiner.work[id].functions.RGBop2[2] + bias) * scale, 1.0f), -1.0f);
+	// third
 	if (m == 0) {
-		if (combiner.stage[n].mapout_rgb_muxsum)
-			combiner_function_ABmuxCD(combiner.function_RGBop3);
+		if (combiner.setup.stage[n].mapout_rgb.muxsum)
+			combiner_function_ABmuxCD(id, combiner.work[id].functions.RGBop3);
 		else
-			combiner_function_ABsumCD(combiner.function_RGBop3);
-		combiner.function_RGBop3[0] = std::max(std::min((combiner.function_RGBop3[0] + biasrgb) * scalergb, 1.0f), -1.0f);
-		combiner.function_RGBop3[1] = std::max(std::min((combiner.function_RGBop3[1] + biasrgb) * scalergb, 1.0f), -1.0f);
-		combiner.function_RGBop3[2] = std::max(std::min((combiner.function_RGBop3[2] + biasrgb) * scalergb, 1.0f), -1.0f);
+			combiner_function_ABsumCD(id, combiner.work[id].functions.RGBop3);
+		combiner.work[id].functions.RGBop3[0] = std::max(std::min((combiner.work[id].functions.RGBop3[0] + bias) * scale, 1.0f), -1.0f);
+		combiner.work[id].functions.RGBop3[1] = std::max(std::min((combiner.work[id].functions.RGBop3[1] + bias) * scale, 1.0f), -1.0f);
+		combiner.work[id].functions.RGBop3[2] = std::max(std::min((combiner.work[id].functions.RGBop3[2] + bias) * scale, 1.0f), -1.0f);
 	}
 }
 
-void nv2a_renderer::combiner_compute_a_outputs(int stage_number)
+void nv2a_renderer::combiner_compute_alpha_outputs(int id, int stage_number)
 {
 	int n = stage_number;
-	float biasa, scalea;
+	float bias, scale;
 
-	if (combiner.stage[n].mapout_a_bias)
-		biasa = -0.5;
+	// select bias and scale
+	if (combiner.setup.stage[n].mapout_alpha.bias)
+		bias = -0.5;
 	else
-		biasa = 0;
-	switch (combiner.stage[n].mapout_a_scale) {
+		bias = 0;
+	switch (combiner.setup.stage[n].mapout_alpha.scale) {
 	case 0:
 	default:
-		scalea = 1.0;
+		scale = 1.0;
 		break;
 	case 1:
-		scalea = 2.0;
+		scale = 2.0;
 		break;
 	case 2:
-		scalea = 4.0;
+		scale = 4.0;
 		break;
 	case 3:
-		scalea = 0.5;
+		scale = 0.5;
 		break;
 	}
-	combiner.function_Aop1 = combiner.variable_A[3] * combiner.variable_B[3];
-	combiner.function_Aop1 = std::max(std::min((combiner.function_Aop1 + biasa) * scalea, 1.0f), -1.0f);
-	combiner.function_Aop2 = combiner.variable_C[3] * combiner.variable_D[3];
-	combiner.function_Aop2 = std::max(std::min((combiner.function_Aop2 + biasa) * scalea, 1.0f), -1.0f);
-	if (combiner.stage[n].mapout_a_muxsum) {
-		if (combiner.register_spare0[3] >= 0.5f)
-			combiner.function_Aop3 = combiner.variable_A[3] * combiner.variable_B[3];
+	// first
+	combiner.work[id].functions.Aop1 = combiner.work[id].variables.A[3] * combiner.work[id].variables.B[3];
+	combiner.work[id].functions.Aop1 = std::max(std::min((combiner.work[id].functions.Aop1 + bias) * scale, 1.0f), -1.0f);
+	// second
+	combiner.work[id].functions.Aop2 = combiner.work[id].variables.C[3] * combiner.work[id].variables.D[3];
+	combiner.work[id].functions.Aop2 = std::max(std::min((combiner.work[id].functions.Aop2 + bias) * scale, 1.0f), -1.0f);
+	// third
+	if (combiner.setup.stage[n].mapout_alpha.muxsum) {
+		if (combiner.work[id].registers.spare0[3] >= 0.5f)
+			combiner.work[id].functions.Aop3 = combiner.work[id].variables.A[3] * combiner.work[id].variables.B[3];
 		else
-			combiner.function_Aop3 = combiner.variable_C[3] * combiner.variable_D[3];
+			combiner.work[id].functions.Aop3 = combiner.work[id].variables.C[3] * combiner.work[id].variables.D[3];
 	}
 	else
-		combiner.function_Aop3 = combiner.variable_A[3] * combiner.variable_B[3] + combiner.variable_C[3] * combiner.variable_D[3];
-	combiner.function_Aop3 = std::max(std::min((combiner.function_Aop3 + biasa) * scalea, 1.0f), -1.0f);
+		combiner.work[id].functions.Aop3 = combiner.work[id].variables.A[3] * combiner.work[id].variables.B[3] + combiner.work[id].variables.C[3] * combiner.work[id].variables.D[3];
+	combiner.work[id].functions.Aop3 = std::max(std::min((combiner.work[id].functions.Aop3 + bias) * scale, 1.0f), -1.0f);
 }
 
 WRITE_LINE_MEMBER(nv2a_renderer::vblank_callback)