//
// SSE primitive helpers: runtime CPU feature detection and v2pd load/store wrappers.
//
#ifndef __SSE_PRIM__H__
#define __SSE_PRIM__H__

#ifdef SSE_OPTIMIZATIONS

#include "ssetypes.h"

#if defined( _MSC_VER )

#include <intrin.h> // __cpuid, used by the sse*_available() checks below

/*
Checking for the DAZ Flag in the MXCSR Register
The denormals-are-zero flag in the MXCSR register is available in most of the
Pentium 4 processors and in the Intel Xeon processor, with the exception of some
early steppings. To check for the presence of the DAZ flag in the MXCSR register, do
the following:
1. Establish a 512-byte FXSAVE area in memory.
2. Clear the FXSAVE area to all 0s.
3. Execute the FXSAVE instruction, using the address of the first byte of the cleared
FXSAVE area as a source operand. See “FXSAVE—Save x87 FPU, MMX, SSE, and
SSE2 State” in Chapter 3 of the Intel® 64 and IA-32 Architectures Software
Developer’s Manual, Volume 2A, for a description of the FXSAVE instruction and
the layout of the FXSAVE image.
4. Check the value in the MXCSR_MASK field in the FXSAVE image (bytes 28
through 31).
— If the value of the MXCSR_MASK field is 00000000H, the DAZ flag and
denormals-are-zero mode are not supported.
— If the value of the MXCSR_MASK field is non-zero and bit 6 is set, the DAZ
flag and denormals-are-zero mode are supported.
If the DAZ flag is not supported, then it is a reserved bit and attempting to write a 1
to it will cause a general-protection exception (#GP). See Section 11.6.6, “Guidelines
for Writing to the MXCSR Register,” for general guidelines for preventing general-protection
exceptions when writing to the MXCSR register.

11.6.4 Initialization of SSE/SSE2 Extensions
The SSE and SSE2 state is contained in the XMM and MXCSR registers. Upon a hardware
reset of the processor, this state is initialized as follows (see Table 11-2):
• All SIMD floating-point exceptions are masked (bits 7 through 12 of the MXCSR
register are set to 1).
• All SIMD floating-point exception flags are cleared (bits 0 through 5 of the MXCSR
register are set to 0).
• The rounding control is set to round-nearest (bits 13 and 14 of the MXCSR
register are set to 00B).
• The flush-to-zero mode is disabled (bit 15 of the MXCSR register is set to 0).
• The denormals-are-zeros mode is disabled (bit 6 of the MXCSR register is set to
0). If the denormals-are-zeros mode is not supported, this bit is reserved and will
be set to 0 on initialization.
• Each of the XMM registers is cleared (set to all zeros).

To read:
- HT Technology
- Intel® 64 and IA-32 Architectures Optimization Reference Manual
- FTZ and DAZ flags in the MXCSR
- oprof
*/
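
// The comment above spells out the FXSAVE-based procedure for detecting whether the
// DAZ bit exists. What follows is a minimal sketch of that procedure; it assumes the
// _fxsave intrinsic is available from <intrin.h> (recent MSVC toolchains expose it,
// older ones would need inline assembly instead). daz_supported() is our name, not
// the manual's.
inline bool daz_supported()
{
    __declspec(align(16)) unsigned char fxsave_image[512] = { 0 };   // steps 1-2: 512-byte area, zeroed
    _fxsave( fxsave_image );                                         // step 3: save x87/MMX/SSE state
    const unsigned int mxcsr_mask =
        *reinterpret_cast<const unsigned int*>( fxsave_image + 28 ); // step 4: MXCSR_MASK, bytes 28-31
    if( mxcsr_mask == 0 )
        return false;                        // all-zero mask: DAZ not supported
    return ( mxcsr_mask & ( 1 << 6 ) ) != 0; // bit 6 set: DAZ mode supported
}

// Complementary sketch, using the MXCSR bit positions quoted above: enable
// flush-to-zero (bit 15) and denormals-are-zero (bit 6). enable_ftz_daz() is likewise
// a hypothetical helper; only set DAZ when daz_supported() reports it, since writing
// a reserved DAZ bit raises #GP.
inline void enable_ftz_daz()
{
    unsigned int csr = _mm_getcsr(); // read MXCSR
    csr |= ( 1u << 15 );             // FTZ: flush-to-zero
    csr |= ( 1u << 6 );              // DAZ: denormals-are-zero
    _mm_setcsr( csr );
}
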
/*
1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
register can be used to check the processor’s support for the CPUID instruction.
2. Check that the processor supports the SSE and/or SSE2 extensions (true if
CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
*/
inline bool sse2_available()
{
    // CPUID support (EFLAGS bit 21) is taken for granted here: every processor
    // that could have SSE2 also has CPUID.
    int info[4];
    __cpuid( info, 1 );                    // CPUID leaf 01H
    return ( info[3] & ( 1 << 26 ) ) != 0; // EDX.SSE2[bit 26]
}

/*
Check that the processor supports the SIMD and x87 SSE3 extensions (if
CPUID.01H:ECX.SSE3[bit 0] = 1).
*/
inline bool sse3_available()
{
    int info[4];
    __cpuid( info, 1 );                   // CPUID leaf 01H
    return ( info[2] & ( 1 << 0 ) ) != 0; // ECX.SSE3[bit 0]
}

/*
Before an application attempts to use the SSSE3 extensions, the application should
follow the steps illustrated in Section 11.6.2, “Checking for SSE/SSE2 Support.”
Next, use the additional step provided below:
• Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
*/
inline bool ssse3_available()
{
    int info[4];
    __cpuid( info, 1 );                   // CPUID leaf 01H
    return ( info[2] & ( 1 << 9 ) ) != 0; // ECX.SSSE3[bit 9]
}

/*
12.12.2 Checking for SSE4.1 Support
Before an application attempts to use SSE4.1 instructions, the application should
follow the steps illustrated in Section 11.6.2, “Checking for SSE/SSE2 Support.”
Next, use the additional step provided below:
Check that the processor supports SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1),
SSE3 (if CPUID.01H:ECX.SSE3[bit 0] = 1), and SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
*/
inline bool sse4_1_available()
{
    int info[4];
    __cpuid( info, 1 );                    // CPUID leaf 01H
    return ( info[2] & ( 1 << 19 ) ) != 0  // ECX.SSE4_1[bit 19]
        && ( info[2] & ( 1 <<  0 ) ) != 0  // ECX.SSE3[bit 0]
        && ( info[2] & ( 1 <<  9 ) ) != 0; // ECX.SSSE3[bit 9]
}

/*
Before an application attempts to use the following SSE4.2 instructions:
PCMPESTRI/PCMPESTRM/PCMPISTRI/PCMPISTRM, PCMPGTQ; the application should
follow the steps illustrated in Section 11.6.2, “Checking for SSE/SSE2 Support.”
Next, use the additional step provided below:
Check that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1),
SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1), and SSSE3 (if
CPUID.01H:ECX.SSSE3[bit 9] = 1).
Before an application attempts to use the CRC32 instruction, it must check that the
processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
Before an application attempts to use the POPCNT instruction, it must check that the
processor supports SSE4.2 (if CPUID.01H:ECX.POPCNT[bit 23] = 1).
*/
inline bool sse4_2_available()
{
    int info[4];
    __cpuid( info, 1 );                    // CPUID leaf 01H
    return ( info[2] & ( 1 << 20 ) ) != 0  // ECX.SSE4_2[bit 20]
        && ( info[2] & ( 1 << 19 ) ) != 0  // ECX.SSE4_1[bit 19]
        && ( info[2] & ( 1 <<  9 ) ) != 0; // ECX.SSSE3[bit 9]
}
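
// Typical call-site pattern for the checks above (illustrative only; the kernel
// functions named here are hypothetical and not part of this header):
//
//     if( sse2_available() )
//         run_sse2_kernel( data, count );
//     else
//         run_scalar_kernel( data, count );
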
#if 1
//
// Make vector.
//

__forceinline v2pd get_v2pd( const short* p )
{
    double tmp[2];
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

__forceinline v2pd get_v2pd( const int32* p )
{
    __declspec(align(16)) double tmp[2];
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

__forceinline v2pd get_v2pd( const int64* p )
{
    double tmp[2];
    tmp[0] = double( p[0] );
    tmp[1] = double( p[1] );
    return _mm_loadu_pd( tmp );
}

__forceinline v2pd get_v2pd( const double* p )
{
    return _mm_loadu_pd( p );
    // movupd reg, p[0]
}

//
// Make regular types.
//

__forceinline void set_v2pd( short* p, register v2pd reg )
{
    double tmp[2];
    _mm_storeu_pd( tmp, reg );
    p[0] = short( tmp[0] );
    p[1] = short( tmp[1] );
}

__forceinline void set_v2pd( int32* p, register v2pd reg )
{
    __declspec(align(16)) double tmp[2];
    _mm_storeu_pd( tmp, reg );
    p[0] = int32( tmp[0] );
    p[1] = int32( tmp[1] );
}

__forceinline void set_v2pd( int64* p, register v2pd reg )
{
    double tmp[2];
    _mm_storeu_pd( tmp, reg );
    p[0] = int64( tmp[0] );
    p[1] = int64( tmp[1] );
}

__forceinline void set_v2pd( double* p, register v2pd reg )
{
    _mm_storeu_pd( p, reg );
    // movupd [p], reg
}
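
//
// Illustrative usage sketch (not part of the original interface): widen a pair of
// shorts to doubles, scale both lanes, and narrow back. Assumes v2pd from
// "ssetypes.h" is the SSE2 __m128d type; scale_pair() is a hypothetical helper.
//
__forceinline void scale_pair( const short* src, short* dst, double factor )
{
    v2pd v = get_v2pd( src );                   // { src[0], src[1] } widened to doubles
    v = _mm_mul_pd( v, _mm_set1_pd( factor ) ); // multiply both lanes by factor
    set_v2pd( dst, v );                         // truncate back to short
}
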
#endif

#elif defined( __GNUC__ )

#endif

#endif//SSE_OPTIMIZATIONS
#endif//__SSE_PRIM__H__