mirror of
				https://github.com/thunderbrewhq/thunderbrew
				synced 2025-11-01 00:36:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			418 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			418 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| //-------------------------------------------------------------------------------------
 | |
| // DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library
 | |
| //
 | |
| // Copyright (c) Microsoft Corporation.
 | |
| // Licensed under the MIT License.
 | |
| //
 | |
| // http://go.microsoft.com/fwlink/?LinkID=615560
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __arm__ || __aarch64__
 | |
| #error SSE4 not supported on ARM platform
 | |
| #endif
 | |
| 
 | |
| #include <smmintrin.h>
 | |
| 
 | |
| #include <DirectXMath.h>
 | |
| 
 | |
| namespace DirectX
 | |
| {
 | |
| 
 | |
| namespace SSE4
 | |
| {
 | |
| 
 | |
// Runtime check for SSE4.1 support via the CPUID instruction.
//
// Returns true when CPUID leaf 1 is available and reports the SSE4.1 feature
// bit; returns false otherwise. SSE4.2 is deliberately not tested because
// none of the routines in this header use SSE4.2 instructions.
//
// Expected to return true on AMD Bulldozer, Intel Core 2 ("Penryn"), and
// Intel Core i7 ("Nehalem") or later processors.
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
inline bool XMVerifySSE4Support()
{
    // Mask applied to the third CPUID output register (ECX) of leaf 1.
    constexpr int c_SSE41FeatureBit = 0x80000;

    // Output registers in EAX, EBX, ECX, EDX order.
    int regs[4] = { -1 };

    // Leaf 0: regs[0] receives the highest supported standard leaf.
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
    __cpuid(0, regs[0], regs[1], regs[2], regs[3]);
#else
    __cpuid(regs, 0);
#endif

    if ( regs[0] >= 1 )
    {
        // Leaf 1: processor feature flags.
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
        __cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#else
        __cpuid(regs, 1);
#endif

        return ( regs[2] & c_SSE41FeatureBit ) != 0;
    }

    // Leaf 1 is not available, so the feature flags cannot be queried.
    return false;
}
 | |
| 
 | |
| 
 | |
| //-------------------------------------------------------------------------------------
 | |
| // Vector
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| #ifdef __clang__
 | |
| #pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
 | |
| #endif
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( y != nullptr );
 | |
|     *reinterpret_cast<int*>(y) = _mm_extract_ps( V, 1 );
 | |
| }
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( z != nullptr );
 | |
|     *reinterpret_cast<int*>(z) = _mm_extract_ps( V, 2 );
 | |
| }
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( w != nullptr );
 | |
|     *reinterpret_cast<int*>(w) = _mm_extract_ps( V, 3 );
 | |
| }
 | |
| 
 | |
| inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
 | |
| {
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     return static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
 | |
| }
 | |
| 
 | |
| inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
 | |
| {
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     return static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
 | |
| }
 | |
| 
 | |
| inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
 | |
| {
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     return static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
 | |
| }
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( y != nullptr );
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     *y = static_cast<uint32_t>( _mm_extract_epi32( V1, 1 ) );
 | |
| }
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( z != nullptr );
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     *z = static_cast<uint32_t>( _mm_extract_epi32( V1, 2 ) );
 | |
| }
 | |
| 
 | |
| inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
 | |
| {
 | |
|     assert( w != nullptr );
 | |
|     __m128i V1 = _mm_castps_si128( V );
 | |
|     *w = static_cast<uint32_t>( _mm_extract_epi32( V1, 3 ) );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
 | |
| {
 | |
|     XMVECTOR vResult = _mm_set_ss(y);
 | |
|     vResult = _mm_insert_ps( V, vResult, 0x10 );
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
 | |
| {
 | |
|     XMVECTOR vResult = _mm_set_ss(z);
 | |
|     vResult = _mm_insert_ps( V, vResult, 0x20 );
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
 | |
| {
 | |
|     XMVECTOR vResult = _mm_set_ss(w);
 | |
|     vResult = _mm_insert_ps( V, vResult, 0x30 );
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
 | |
| {
 | |
|     __m128i vResult = _mm_castps_si128( V );
 | |
|     vResult = _mm_insert_epi32( vResult, static_cast<int>(y), 1 );
 | |
|     return _mm_castsi128_ps( vResult );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
 | |
| {
 | |
|     __m128i vResult = _mm_castps_si128( V );
 | |
|     vResult = _mm_insert_epi32( vResult, static_cast<int>(z), 2 );
 | |
|     return _mm_castsi128_ps( vResult );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
 | |
| {
 | |
|     __m128i vResult = _mm_castps_si128( V );
 | |
|     vResult = _mm_insert_epi32( vResult, static_cast<int>(w), 3 );
 | |
|     return _mm_castsi128_ps( vResult );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
 | |
| {
 | |
|     return _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
 | |
| {
 | |
|     return _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
 | |
| {
 | |
|     return _mm_floor_ps( V );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
 | |
| {
 | |
|     return _mm_ceil_ps( V );
 | |
| }
 | |
| 
 | |
| 
 | |
| //-------------------------------------------------------------------------------------
 | |
| // Vector2
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 )
 | |
| {
 | |
|     return _mm_dp_ps( V1, V2, 0x3f );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
 | |
| {
 | |
|     return SSE4::XMVector2Dot(V, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
 | |
|     return _mm_rsqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
 | |
|     XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
 | |
|     return _mm_div_ps( g_XMOne, vLengthSq );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x3f );
 | |
|     XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
 | |
|     return _mm_mul_ps(vResult, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x3f );
 | |
|     // Prepare for the division
 | |
|     XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
 | |
|     // Create zero with a single instruction
 | |
|     XMVECTOR vZeroMask = _mm_setzero_ps();
 | |
|     // Test for a divide by zero (Must be FP to detect -0.0)
 | |
|     vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
 | |
|     // Failsafe on zero (Or epsilon) length planes
 | |
|     // If the length is infinity, set the elements to zero
 | |
|     vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
 | |
|     // Reciprocal mul to perform the normalization
 | |
|     vResult = _mm_div_ps(V,vResult);
 | |
|     // Any that are infinity, set to zero
 | |
|     vResult = _mm_and_ps(vResult,vZeroMask);
 | |
|     // Select qnan or result based on infinite length
 | |
|     XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
 | |
|     XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
 | |
|     vResult = _mm_or_ps(vTemp1,vTemp2);
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| 
 | |
| //-------------------------------------------------------------------------------------
 | |
| // Vector3
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 )
 | |
| {
 | |
|     return _mm_dp_ps( V1, V2, 0x7f );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
 | |
| {
 | |
|     return SSE4::XMVector3Dot(V, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
 | |
|     return _mm_rsqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
 | |
|     XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
 | |
|     return _mm_div_ps( g_XMOne, vLengthSq );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0x7f );
 | |
|     XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
 | |
|     return _mm_mul_ps(vResult, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0x7f );
 | |
|     // Prepare for the division
 | |
|     XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
 | |
|     // Create zero with a single instruction
 | |
|     XMVECTOR vZeroMask = _mm_setzero_ps();
 | |
|     // Test for a divide by zero (Must be FP to detect -0.0)
 | |
|     vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
 | |
|     // Failsafe on zero (Or epsilon) length planes
 | |
|     // If the length is infinity, set the elements to zero
 | |
|     vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
 | |
|     // Divide to perform the normalization
 | |
|     vResult = _mm_div_ps(V,vResult);
 | |
|     // Any that are infinity, set to zero
 | |
|     vResult = _mm_and_ps(vResult,vZeroMask);
 | |
|     // Select qnan or result based on infinite length
 | |
|     XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
 | |
|     XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
 | |
|     vResult = _mm_or_ps(vTemp1,vTemp2);
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| 
 | |
| //-------------------------------------------------------------------------------------
 | |
| // Vector4
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 )
 | |
| {
 | |
|     return _mm_dp_ps( V1, V2, 0xff );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
 | |
| {
 | |
|     return SSE4::XMVector4Dot(V, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
 | |
|     return _mm_rsqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
 | |
|     XMVECTOR vLengthSq = _mm_sqrt_ps( vTemp );
 | |
|     return _mm_div_ps( g_XMOne, vLengthSq );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
 | |
|     return _mm_sqrt_ps( vTemp );
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( V, V, 0xff );
 | |
|     XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
 | |
|     return _mm_mul_ps(vResult, V);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V )
 | |
| {
 | |
|     XMVECTOR vLengthSq = _mm_dp_ps( V, V, 0xff );
 | |
|     // Prepare for the division
 | |
|     XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
 | |
|     // Create zero with a single instruction
 | |
|     XMVECTOR vZeroMask = _mm_setzero_ps();
 | |
|     // Test for a divide by zero (Must be FP to detect -0.0)
 | |
|     vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
 | |
|     // Failsafe on zero (Or epsilon) length planes
 | |
|     // If the length is infinity, set the elements to zero
 | |
|     vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
 | |
|     // Divide to perform the normalization
 | |
|     vResult = _mm_div_ps(V,vResult);
 | |
|     // Any that are infinity, set to zero
 | |
|     vResult = _mm_and_ps(vResult,vZeroMask);
 | |
|     // Select qnan or result based on infinite length
 | |
|     XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
 | |
|     XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
 | |
|     vResult = _mm_or_ps(vTemp1,vTemp2);
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| 
 | |
| //-------------------------------------------------------------------------------------
 | |
| // Plane
 | |
| //-------------------------------------------------------------------------------------
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P )
 | |
| {
 | |
|     XMVECTOR vTemp = _mm_dp_ps( P, P, 0x7f );
 | |
|     XMVECTOR vResult = _mm_rsqrt_ps( vTemp );
 | |
|     return _mm_mul_ps(vResult, P);
 | |
| }
 | |
| 
 | |
| inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P )
 | |
| {
 | |
|     XMVECTOR vLengthSq = _mm_dp_ps( P, P, 0x7f );
 | |
|     // Prepare for the division
 | |
|     XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
 | |
|     // Failsafe on zero (Or epsilon) length planes
 | |
|     // If the length is infinity, set the elements to zero
 | |
|     vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
 | |
|     // Reciprocal mul to perform the normalization
 | |
|     vResult = _mm_div_ps(P,vResult);
 | |
|     // Any that are infinity, set to zero
 | |
|     vResult = _mm_and_ps(vResult,vLengthSq);
 | |
|     return vResult;
 | |
| }
 | |
| 
 | |
| } // namespace SSE4
 | |
| 
 | |
| } // namespace DirectX
 | 
