//
// SSE primitives: run-time CPU feature checks (CPUID based) and small helpers
// that move pairs of scalars into and out of a v2pd vector.
//
#ifndef __SSE_PRIM__H__
#define __SSE_PRIM__H__

#ifdef SSE_OPTIMIZATIONS

#include "ssetypes.h"

#if defined( _MSC_VER )

#include <intrin.h>     // __cpuid

/*
    Checking for the DAZ Flag in the MXCSR Register

    The denormals-are-zero flag in the MXCSR register is available in most of
    the Pentium 4 processors and in the Intel Xeon processor, with the
    exception of some early steppings. To check for the presence of the DAZ
    flag in the MXCSR register, do the following:

    1. Establish a 512-byte FXSAVE area in memory.
    2. Clear the FXSAVE area to all 0s.
    3. Execute the FXSAVE instruction, using the address of the first byte of
       the cleared FXSAVE area as a source operand. See "FXSAVE - Save x87 FPU,
       MMX, SSE, and SSE2 State" in Chapter 3 of the Intel(R) 64 and IA-32
       Architectures Software Developer's Manual, Volume 2A, for a description
       of the FXSAVE instruction and the layout of the FXSAVE image.
    4. Check the value in the MXCSR_MASK field in the FXSAVE image (bytes 28
       through 31).
       - If the value of the MXCSR_MASK field is 00000000H, the DAZ flag and
         denormals-are-zero mode are not supported.
       - If the value of the MXCSR_MASK field is non-zero and bit 6 is set,
         the DAZ flag and denormals-are-zero mode are supported.

    If the DAZ flag is not supported, then it is a reserved bit and attempting
    to write a 1 to it will cause a general-protection exception (#GP). See
    Section 11.6.6, "Guidelines for Writing to the MXCSR Register," for general
    guidelines for preventing general-protection exceptions when writing to
    the MXCSR register.

    11.6.4 Initialization of SSE/SSE2 Extensions

    The SSE and SSE2 state is contained in the XMM and MXCSR registers. Upon a
    hardware reset of the processor, this state is initialized as follows (see
    Table 11-2):

    - All SIMD floating-point exceptions are masked (bits 7 through 12 of the
      MXCSR register are set to 1).
    - All SIMD floating-point exception flags are cleared (bits 0 through 5 of
      the MXCSR register are set to 0).
    - The rounding control is set to round-nearest (bits 13 and 14 of the
      MXCSR register are set to 00B).
    - The flush-to-zero mode is disabled (bit 15 of the MXCSR register is set
      to 0).
    - The denormals-are-zeros mode is disabled (bit 6 of the MXCSR register is
      set to 0). If the denormals-are-zeros mode is not supported, this bit is
      reserved and will be set to 0 on initialization.
    - Each of the XMM registers is cleared (set to all zeros).

    To read:
      HT Technology
      Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
      FTZ and DAZ flags in the MXCSR
      oprof
*/

//
// CPUID leaf 01H feature bits used by the *_available() checks below.
// The bit positions are the ones quoted in the comments on each check.
//
enum
{
    SSE_PRIM_EDX_SSE2   = 1u << 26,     // CPUID.01H:EDX.SSE2[bit 26]
    SSE_PRIM_ECX_SSE3   = 1u << 0,      // CPUID.01H:ECX.SSE3[bit 0]
    SSE_PRIM_ECX_SSSE3  = 1u << 9,      // CPUID.01H:ECX.SSSE3[bit 9]
    SSE_PRIM_ECX_SSE4_1 = 1u << 19,     // CPUID.01H:ECX.SSE4_1[bit 19]
    SSE_PRIM_ECX_SSE4_2 = 1u << 20      // CPUID.01H:ECX.SSE4_2[bit 20]
};

//
// Private helper: returns true when every bit of ecx_mask is set in
// CPUID.01H:ECX and every bit of edx_mask is set in CPUID.01H:EDX.
// Returns false if the processor does not report leaf 01H at all.
//
// NOTE(review): the EFLAGS.ID probe for CPUID support itself (step 1 of the
// procedure quoted below) is not performed; this assumes __cpuid can be
// executed on every CPU this build targets -- confirm for very old x86.
//
inline bool sse_prim_cpu_has( unsigned int ecx_mask, unsigned int edx_mask )
{
    int regs[4] = { 0, 0, 0, 0 };       // EAX, EBX, ECX, EDX

    __cpuid( regs, 0 );                 // EAX = highest supported basic leaf
    if ( static_cast<unsigned int>( regs[0] ) < 1u )
    {
        return false;
    }

    __cpuid( regs, 1 );
    const unsigned int ecx = static_cast<unsigned int>( regs[2] );
    const unsigned int edx = static_cast<unsigned int>( regs[3] );

    return ( ecx & ecx_mask ) == ecx_mask
        && ( edx & edx_mask ) == edx_mask;
}

/*
    1. Check that the processor supports the CPUID instruction. Bit 21 of the
       EFLAGS register can be used to check processor's support of the CPUID
       instruction.
    2. Check that the processor supports the SSE and/or SSE2 extensions (true
       if CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
*/
// Returns true when CPUID reports SSE2 (EDX bit 26).
inline bool sse2_available()
{
    return sse_prim_cpu_has( 0u, SSE_PRIM_EDX_SSE2 );
}

/*
    Check that the processor supports the SIMD and x87 SSE3 extensions
    (if CPUID.01H:ECX.SSE3[bit 0] = 1).
*/
// Returns true when CPUID reports SSE3 (ECX bit 0).
inline bool sse3_available()
{
    return sse_prim_cpu_has( SSE_PRIM_ECX_SSE3, 0u );
}

/*
    Before an application attempts to use the SSSE3 extensions, the
    application should follow the steps illustrated in Section 11.6.2,
    "Checking for SSE/SSE2 Support." Next, use the additional step provided
    below:
    - Check that the processor supports SSSE3
      (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
*/
// Returns true when CPUID reports SSE2 and SSSE3.
inline bool ssse3_available()
{
    return sse_prim_cpu_has( SSE_PRIM_ECX_SSSE3, SSE_PRIM_EDX_SSE2 );
}

/*
    12.12.2 Checking for SSE4.1 Support

    Before an application attempts to use SSE4.1 instructions, the application
    should follow the steps illustrated in Section 11.6.2, "Checking for
    SSE/SSE2 Support." Next, use the additional step provided below:

    Check that the processor supports SSE4.1
    (if CPUID.01H:ECX.SSE4_1[bit 19] = 1), SSE3
    (if CPUID.01H:ECX.SSE3[bit 0] = 1), and SSSE3
    (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
*/
// Returns true when CPUID reports SSE2, SSE3, SSSE3 and SSE4.1.
inline bool sse4_1_available()
{
    return sse_prim_cpu_has(
        SSE_PRIM_ECX_SSE4_1 | SSE_PRIM_ECX_SSE3 | SSE_PRIM_ECX_SSSE3,
        SSE_PRIM_EDX_SSE2 );
}

/*
    Before an application attempts to use the following SSE4.2 instructions:
    PCMPESTRI/PCMPESTRM/PCMPISTRI/PCMPISTRM, PCMPGTQ; the application should
    follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2
    Support." Next, use the additional step provided below:

    Check that the processor supports SSE4.2
    (if CPUID.01H:ECX.SSE4_2[bit 20] = 1), SSE4.1
    (if CPUID.01H:ECX.SSE4_1[bit 19] = 1), and SSSE3
    (if CPUID.01H:ECX.SSSE3[bit 9] = 1).

    Before an application attempts to use the CRC32 instruction, it must check
    that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).

    Before an application attempts to use the POPCNT instruction, it must
    check that the processor supports SSE4.2
    (if CPUID.01H:ECX.POPCNT[bit 23] = 1).
*/
// Returns true when CPUID reports SSE2, SSSE3, SSE4.1 and SSE4.2.
// The POPCNT bit mentioned above is not examined here.
inline bool sse4_2_available()
{
    return sse_prim_cpu_has(
        SSE_PRIM_ECX_SSE4_2 | SSE_PRIM_ECX_SSE4_1 | SSE_PRIM_ECX_SSSE3,
        SSE_PRIM_EDX_SSE2 );
}

#if 1

//
// Make vector.
//
// Each get_v2pd overload reads p[0] and p[1], converts them to double and
// returns them as a v2pd (presumably declared in ssetypes.h). Element 0 is
// stored first in memory and therefore lands in the low lane of the unaligned
// load. p itself does not need to be 16-byte aligned.
//

// Builds a v2pd from two consecutive shorts.
__forceinline v2pd get_v2pd( const short* p )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

// Builds a v2pd from two consecutive int32 values.
__forceinline v2pd get_v2pd( const int32* p )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

// Builds a v2pd from two consecutive int64 values.
__forceinline v2pd get_v2pd( const int64* p )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

// Builds a v2pd directly from two consecutive doubles (no conversion).
__forceinline v2pd get_v2pd( const double* p )
{
    return _mm_loadu_pd( p );   // movupd reg, p[0]
}

//
// Make regular types.
//
// Each set_v2pd overload stores the two doubles held in reg to p[0] and p[1],
// converting with a plain cast to the destination type. No rounding mode
// selection or range check is applied by these helpers. p itself does not
// need to be 16-byte aligned.
//

// Writes both lanes of reg to p[0..1] as shorts.
__forceinline void set_v2pd( short* p, v2pd reg )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    _mm_storeu_pd( tmp, reg );
    p[0] = short( tmp[0] );
    p[1] = short( tmp[1] );
}

// Writes both lanes of reg to p[0..1] as int32 values.
__forceinline void set_v2pd( int32* p, v2pd reg )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    _mm_storeu_pd( tmp, reg );
    p[0] = int32( tmp[0] );
    p[1] = int32( tmp[1] );
}

// Writes both lanes of reg to p[0..1] as int64 values.
__forceinline void set_v2pd( int64* p, v2pd reg )
{
    __declspec(align(16)) double tmp[2];    // 16-byte aligned scratch buffer
    _mm_storeu_pd( tmp, reg );
    p[0] = int64( tmp[0] );
    p[1] = int64( tmp[1] );
}

// Writes both lanes of reg to p[0..1] unchanged.
__forceinline void set_v2pd( double* p, v2pd reg )
{
    _mm_storeu_pd( p, reg );    // movupd [p], reg
}

#endif

#elif defined( __GNUC__ )

// No GCC implementation is provided in this header.

#endif

#endif//SSE_OPTIMIZATIONS

#endif//__SSE_PRIM__H__