initial check in
This commit is contained in:
198
testTransform/sseprim.h
Normal file
198
testTransform/sseprim.h
Normal file
@@ -0,0 +1,198 @@
|
||||
//
|
||||
//
|
||||
//
|
||||
#ifndef __SSE_PRIM__H__
|
||||
#define __SSE_PRIM__H__
|
||||
|
||||
#ifdef SSE_OPTIMIZATIONS
|
||||
|
||||
#include "ssetypes.h"

#if defined( _MSC_VER )
#include <intrin.h>   // __cpuid
#endif
|
||||
|
||||
#if defined( _MSC_VER )
|
||||
|
||||
/*
|
||||
Checking for the DAZ Flag in the MXCSR Register
|
||||
The denormals-are-zero flag in the MXCSR register is available in most of the
|
||||
Pentium 4 processors and in the Intel Xeon processor, with the exception of some
|
||||
early steppings. To check for the presence of the DAZ flag in the MXCSR register, do
|
||||
the following:
|
||||
1. Establish a 512-byte FXSAVE area in memory.
|
||||
2. Clear the FXSAVE area to all 0s.
|
||||
3. Execute the FXSAVE instruction, using the address of the first byte of the cleared
|
||||
FXSAVE area as a source operand. See "FXSAVE - Save x87 FPU, MMX, SSE, and
|
||||
SSE2 State" in Chapter 3 of the Intel(R) 64 and IA-32 Architectures Software
|
||||
Developer's Manual, Volume 2A, for a description of the FXSAVE instruction and
|
||||
the layout of the FXSAVE image.
|
||||
4. Check the value in the MXCSR_MASK field in the FXSAVE image (bytes 28
|
||||
through 31).
|
||||
- If the value of the MXCSR_MASK field is 00000000H, the DAZ flag and
|
||||
denormals-are-zero mode are not supported.
|
||||
Vol. 1 11-29
|
||||
PROGRAMMING WITH STREAMING SIMD EXTENSIONS 2 (SSE2)
|
||||
- If the value of the MXCSR_MASK field is non-zero and bit 6 is set, the DAZ
|
||||
flag and denormals-are-zero mode are supported.
|
||||
If the DAZ flag is not supported, then it is a reserved bit and attempting to write a 1
|
||||
to it will cause a general-protection exception (#GP). See Section 11.6.6, "Guidelines
|
||||
for Writing to the MXCSR Register," for general guidelines for preventing general-protection
|
||||
exceptions when writing to the MXCSR register.
|
||||
|
||||
11.6.4 Initialization of SSE/SSE2 Extensions
|
||||
The SSE and SSE2 state is contained in the XMM and MXCSR registers. Upon a hardware
|
||||
reset of the processor, this state is initialized as follows (see Table 11-2):
|
||||
- All SIMD floating-point exceptions are masked (bits 7 through 12 of the MXCSR
|
||||
register is set to 1).
|
||||
- All SIMD floating-point exception flags are cleared (bits 0 through 5 of the MXCSR
|
||||
register is set to 0).
|
||||
- The rounding control is set to round-nearest (bits 13 and 14 of the MXCSR
|
||||
register are set to 00B).
|
||||
- The flush-to-zero mode is disabled (bit 15 of the MXCSR register is set to 0).
|
||||
- The denormals-are-zeros mode is disabled (bit 6 of the MXCSR register is set to
|
||||
0). If the denormals-are-zeros mode is not supported, this bit is reserved and will
|
||||
be set to 0 on initialization.
|
||||
- Each of the XMM registers is cleared (set to all zeros).
|
||||
|
||||
to read
|
||||
|
||||
HT Technology
|
||||
|
||||
Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
|
||||
|
||||
FTZ and DAZ flags in the MXCSR
|
||||
oprof
|
||||
*/
|
||||
|
||||
/*
|
||||
1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
|
||||
register can be used to check the processor's support for the CPUID instruction.
|
||||
2. Check that the processor supports the SSE and/or SSE2 extensions (true if
|
||||
CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
|
||||
*/
|
||||
// Reports whether the processor advertises SSE2, i.e. whether
// CPUID.01H:EDX.SSE2[bit 26] is set.
//
// The body used to be empty, so the function fell off its end without
// returning a value. It now queries CPUID through the MSVC __cpuid intrinsic
// (declared in <intrin.h>).
//
// Returns true when CPUID leaf 1 exists and EDX bit 26 is set, else false.
inline bool sse2_available()
{
    int regs[4] = { 0, 0, 0, 0 };   // filled by __cpuid as EAX, EBX, ECX, EDX

    // Leaf 0 reports the highest supported standard leaf in EAX; leaf 1
    // must exist before its feature flags can be trusted.
    __cpuid( regs, 0 );
    if ( regs[0] < 1 )
        return false;

    __cpuid( regs, 1 );

    const int edx_sse2 = 1 << 26;   // CPUID.01H:EDX.SSE2
    return ( regs[3] & edx_sse2 ) != 0;
}
|
||||
|
||||
/*
|
||||
Check that the processor supports the SIMD and x87 SSE3 extensions (if
|
||||
CPUID.01H:ECX.SSE3[bit 0] = 1).
|
||||
*/
|
||||
// Reports whether the processor advertises SSE3, i.e. whether
// CPUID.01H:ECX.SSE3[bit 0] is set.
//
// The body used to be empty, so the function fell off its end without
// returning a value. It now queries CPUID through the MSVC __cpuid intrinsic
// (declared in <intrin.h>).
//
// Returns true when CPUID leaf 1 exists and ECX bit 0 is set, else false.
inline bool sse3_available()
{
    int regs[4] = { 0, 0, 0, 0 };   // filled by __cpuid as EAX, EBX, ECX, EDX

    // Leaf 0 reports the highest supported standard leaf in EAX.
    __cpuid( regs, 0 );
    if ( regs[0] < 1 )
        return false;

    __cpuid( regs, 1 );

    const int ecx_sse3 = 1 << 0;    // CPUID.01H:ECX.SSE3
    return ( regs[2] & ecx_sse3 ) != 0;
}
|
||||
|
||||
/*
|
||||
Before an application attempts to use the SSSE3 extensions, the application should
|
||||
follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
|
||||
Next, use the additional step provided below:
|
||||
- Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
|
||||
*/
|
||||
// Reports whether the processor advertises SSSE3.
//
// Checks the SSE/SSE2 prerequisites (CPUID.01H:EDX bits 25 and 26) and then
// CPUID.01H:ECX.SSSE3[bit 9].
//
// The body used to be empty, so the function fell off its end without
// returning a value. It now queries CPUID through the MSVC __cpuid intrinsic
// (declared in <intrin.h>) and does not depend on any other helper in this
// header.
//
// Returns true only when every bit listed above is set.
inline bool ssse3_available()
{
    int regs[4] = { 0, 0, 0, 0 };   // filled by __cpuid as EAX, EBX, ECX, EDX

    // Leaf 0 reports the highest supported standard leaf in EAX.
    __cpuid( regs, 0 );
    if ( regs[0] < 1 )
        return false;

    __cpuid( regs, 1 );

    const int edx_required = ( 1 << 25 ) | ( 1 << 26 );   // SSE, SSE2
    const int ecx_required = ( 1 << 9 );                  // SSSE3

    return ( regs[3] & edx_required ) == edx_required
        && ( regs[2] & ecx_required ) == ecx_required;
}
|
||||
|
||||
/*
|
||||
12.12.2 Checking for SSE4.1 Support
|
||||
Before an application attempts to use SSE4.1 instructions, the application should
|
||||
follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
|
||||
Next, use the additional step provided below:
|
||||
Check that the processor supports SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1),
|
||||
SSE3 (if CPUID.01H:ECX.SSE3[bit 0] = 1), and SSSE3 (if CPUID.01H:ECX.SSSE3[bit
|
||||
9] = 1).
|
||||
*/
|
||||
// Reports whether the processor advertises SSE4.1.
//
// Checks the SSE/SSE2 prerequisites (CPUID.01H:EDX bits 25 and 26) and then
// CPUID.01H:ECX bits 19 (SSE4.1), 0 (SSE3) and 9 (SSSE3).
//
// The body used to be empty, so the function fell off its end without
// returning a value. It now queries CPUID through the MSVC __cpuid intrinsic
// (declared in <intrin.h>) and does not depend on any other helper in this
// header.
//
// Returns true only when every bit listed above is set.
inline bool sse4_1_available()
{
    int regs[4] = { 0, 0, 0, 0 };   // filled by __cpuid as EAX, EBX, ECX, EDX

    // Leaf 0 reports the highest supported standard leaf in EAX.
    __cpuid( regs, 0 );
    if ( regs[0] < 1 )
        return false;

    __cpuid( regs, 1 );

    const int edx_required = ( 1 << 25 ) | ( 1 << 26 );             // SSE, SSE2
    const int ecx_required = ( 1 << 19 ) | ( 1 << 9 ) | ( 1 << 0 ); // SSE4.1, SSSE3, SSE3

    return ( regs[3] & edx_required ) == edx_required
        && ( regs[2] & ecx_required ) == ecx_required;
}
|
||||
|
||||
/*
|
||||
Before an application attempts to use the following SSE4.2 instructions:
|
||||
PCMPESTRI/PCMPESTRM/PCMPISTRI/PCMPISTRM, PCMPGTQ; the application should
|
||||
follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
|
||||
Next, use the additional step provided below:
|
||||
Check that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1),
|
||||
SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1), and SSSE3 (if
|
||||
CPUID.01H:ECX.SSSE3[bit 9] = 1).
|
||||
Before an application attempts to use the CRC32 instruction, it must check that the
|
||||
processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
|
||||
Before an application attempts to use the POPCNT instruction, it must check that the
|
||||
processor supports POPCNT (if CPUID.01H:ECX.POPCNT[bit 23] = 1).
|
||||
*/
|
||||
// Reports whether the processor advertises SSE4.2.
//
// Checks the SSE/SSE2 prerequisites (CPUID.01H:EDX bits 25 and 26) and then
// CPUID.01H:ECX bits 20 (SSE4.2), 19 (SSE4.1) and 9 (SSSE3). The POPCNT
// flag (ECX bit 23) is a separate feature bit and is not tested here.
//
// The body used to be empty, so the function fell off its end without
// returning a value. It now queries CPUID through the MSVC __cpuid intrinsic
// (declared in <intrin.h>) and does not depend on any other helper in this
// header.
//
// Returns true only when every bit listed above is set.
inline bool sse4_2_available()
{
    int regs[4] = { 0, 0, 0, 0 };   // filled by __cpuid as EAX, EBX, ECX, EDX

    // Leaf 0 reports the highest supported standard leaf in EAX.
    __cpuid( regs, 0 );
    if ( regs[0] < 1 )
        return false;

    __cpuid( regs, 1 );

    const int edx_required = ( 1 << 25 ) | ( 1 << 26 );              // SSE, SSE2
    const int ecx_required = ( 1 << 20 ) | ( 1 << 19 ) | ( 1 << 9 ); // SSE4.2, SSE4.1, SSSE3

    return ( regs[3] & edx_required ) == edx_required
        && ( regs[2] & ecx_required ) == ecx_required;
}
|
||||
|
||||
#if 1
|
||||
//
|
||||
// Make vector.
|
||||
//
|
||||
|
||||
// Builds a v2pd from two consecutive shorts. p[0] and p[1] are converted to
// double, staged in a local two-element array, and read back with the
// unaligned-load intrinsic.
//   p : pointer to at least two readable short values.
__forceinline v2pd get_v2pd( const short* p )
{
    double lanes[2] = { static_cast<double>( p[0] ),
                        static_cast<double>( p[1] ) };
    return _mm_loadu_pd( lanes );
}
|
||||
|
||||
// Builds a v2pd from two consecutive int32 values. p[0] and p[1] are
// converted to double, staged in a 16-byte-aligned local array, and read
// back with the unaligned-load intrinsic.
//   p : pointer to at least two readable int32 values.
__forceinline v2pd get_v2pd( const int32* p )
{
    __declspec(align(16)) double lanes[2] = { static_cast<double>( p[0] ),
                                              static_cast<double>( p[1] ) };
    return _mm_loadu_pd( lanes );
}
|
||||
|
||||
// Builds a v2pd from two consecutive int64 values. p[0] and p[1] are
// converted to double, staged in a local two-element array, and read back
// with the unaligned-load intrinsic.
//   p : pointer to at least two readable int64 values.
__forceinline v2pd get_v2pd( const int64* p )
{
    double lanes[2] = { static_cast<double>( p[0] ),
                        static_cast<double>( p[1] ) };
    return _mm_loadu_pd( lanes );
}
|
||||
|
||||
// Builds a v2pd directly from two consecutive doubles; no conversion or
// staging buffer is involved.
//   p : pointer to at least two readable doubles (unaligned load is used).
__forceinline v2pd get_v2pd( const double* p )
{
    // movupd reg, p[0]
    v2pd loaded = _mm_loadu_pd( p );
    return loaded;
}
|
||||
|
||||
//
|
||||
// Make regular types.
|
||||
//
|
||||
|
||||
// Writes the two doubles held in reg to p[0] and p[1] as shorts. The vector
// is first stored to a local double array with the unaligned-store
// intrinsic, then each element is converted to short.
//   p   : destination; must have room for two short values.
//   reg : vector to unpack.
// The `register` storage class was dropped from the parameter: it was
// deprecated in C++11 and removed in C++17, and removing it does not change
// the function's type or its callers.
// NOTE(review): no range check is done before the double -> short
// conversion; confirm callers only pass values representable as short.
__forceinline void set_v2pd( short* p, v2pd reg )
{
    double lanes[2];
    _mm_storeu_pd( lanes, reg );
    p[0] = static_cast<short>( lanes[0] );
    p[1] = static_cast<short>( lanes[1] );
}
|
||||
|
||||
// Writes the two doubles held in reg to p[0] and p[1] as int32 values. The
// vector is first stored to a 16-byte-aligned local double array with the
// unaligned-store intrinsic, then each element is converted to int32.
//   p   : destination; must have room for two int32 values.
//   reg : vector to unpack.
// The `register` storage class was dropped from the parameter: it was
// deprecated in C++11 and removed in C++17, and removing it does not change
// the function's type or its callers.
// NOTE(review): no range check is done before the double -> int32
// conversion; confirm callers only pass representable values.
__forceinline void set_v2pd( int32* p, v2pd reg )
{
    __declspec(align(16)) double lanes[2];
    _mm_storeu_pd( lanes, reg );
    p[0] = static_cast<int32>( lanes[0] );
    p[1] = static_cast<int32>( lanes[1] );
}
|
||||
|
||||
// Writes the two doubles held in reg to p[0] and p[1] as int64 values. The
// vector is first stored to a local double array with the unaligned-store
// intrinsic, then each element is converted to int64.
//   p   : destination; must have room for two int64 values.
//   reg : vector to unpack.
// The `register` storage class was dropped from the parameter: it was
// deprecated in C++11 and removed in C++17, and removing it does not change
// the function's type or its callers.
// NOTE(review): no range check is done before the double -> int64
// conversion; confirm callers only pass representable values.
__forceinline void set_v2pd( int64* p, v2pd reg )
{
    double lanes[2];
    _mm_storeu_pd( lanes, reg );
    p[0] = static_cast<int64>( lanes[0] );
    p[1] = static_cast<int64>( lanes[1] );
}
|
||||
|
||||
// Stores the two doubles held in reg to p[0] and p[1] with the
// unaligned-store intrinsic; no conversion is involved.
//   p   : destination; must have room for two doubles.
//   reg : vector to store.
// The `register` storage class was dropped from the parameter: it was
// deprecated in C++11 and removed in C++17, and removing it does not change
// the function's type or its callers.
__forceinline void set_v2pd( double* p, v2pd reg )
{
    // movupd [p], reg
    _mm_storeu_pd( p, reg );
}
|
||||
|
||||
#endif
|
||||
|
||||
#elif defined( __GNUC__ )
|
||||
|
||||
#endif
|
||||
|
||||
#endif//SSE_OPTIMIZATIONS
|
||||
#endif//__SSE_PRIM__H__
|
||||
Reference in New Issue
Block a user