initial check in

2012-12-06 21:43:03 +04:00
commit 4bc273824d
179 changed files with 29415 additions and 0 deletions
--- a/testTransform/sseprim.h
+++ b/testTransform/sseprim.h
@@ -0,0 +1,198 @@
+//
+//
+//
+#ifndef __SSE_PRIM__H__
+#define __SSE_PRIM__H__
+
+#ifdef SSE_OPTIMIZATIONS
+
+#include "ssetypes.h"
+
+#if defined( _MSC_VER )
+#include <intrin.h>	// __cpuid
+#endif
+
+#if defined( _MSC_VER )
+
+/*
+Checking for the DAZ Flag in the MXCSR Register
+The denormals-are-zero flag in the MXCSR register is available in most of the
+Pentium 4 processors and in the Intel Xeon processor, with the exception of some
+early steppings. To check for the presence of the DAZ flag in the MXCSR register, do
+the following:
+1. Establish a 512-byte FXSAVE area in memory.
+2. Clear the FXSAVE area to all 0s.
+3. Execute the FXSAVE instruction, using the address of the first byte of the cleared
+FXSAVE area as a source operand. See "FXSAVE - Save x87 FPU, MMX, SSE, and
+SSE2 State" in Chapter 3 of the Intel(R) 64 and IA-32 Architectures Software
+Developer's Manual, Volume 2A, for a description of the FXSAVE instruction and
+the layout of the FXSAVE image.
+4. Check the value in the MXCSR_MASK field in the FXSAVE image (bytes 28
+through 31).
+- If the value of the MXCSR_MASK field is 00000000H, the DAZ flag and
+denormals-are-zero mode are not supported.
+Vol. 1 11-29
+PROGRAMMING WITH STREAMING SIMD EXTENSIONS 2 (SSE2)
+- If the value of the MXCSR_MASK field is non-zero and bit 6 is set, the DAZ
+flag and denormals-are-zero mode are supported.
+If the DAZ flag is not supported, then it is a reserved bit and attempting to write a 1
+to it will cause a general-protection exception (#GP). See Section 11.6.6, "Guidelines
+for Writing to the MXCSR Register," for general guidelines for preventing general-protection
+exceptions when writing to the MXCSR register.
+
+11.6.4 Initialization of SSE/SSE2 Extensions
+The SSE and SSE2 state is contained in the XMM and MXCSR registers. Upon a hardware
+reset of the processor, this state is initialized as follows (see Table 11-2):
+- All SIMD floating-point exceptions are masked (bits 7 through 12 of the MXCSR
+register are set to 1).
+- All SIMD floating-point exception flags are cleared (bits 0 through 5 of the MXCSR
+register are set to 0).
+- The rounding control is set to round-nearest (bits 13 and 14 of the MXCSR
+register are set to 00B).
+- The flush-to-zero mode is disabled (bit 15 of the MXCSR register is set to 0).
+- The denormals-are-zeros mode is disabled (bit 6 of the MXCSR register is set to
+0). If the denormals-are-zeros mode is not supported, this bit is reserved and will
+be set to 0 on initialization.
+- Each of the XMM registers is cleared (set to all zeros).
+
+to read
+
+HT Technology
+
+Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
+
+FTZ and DAZ flags in the MXCSR
+oprof
+*/
+
+/*
+Reports whether the processor supports both the SSE and SSE2 extensions.
+
+Documented procedure (Section 11.6.2, "Checking for SSE/SSE2 Support"):
+1. Check that the processor supports the CPUID instruction. Bit 21 of the EFLAGS
+register can be used to check the processor's support for the CPUID instruction.
+2. Check that the processor supports the SSE and/or SSE2 extensions (true if
+CPUID.01H:EDX.SSE[bit 25] = 1 and/or CPUID.01H:EDX.SSE2[bit 26] = 1).
+
+Step 1 is not performed here: __cpuid is executed unconditionally, so the CPUID
+instruction itself is assumed to be present.
+
+Returns true only when both EDX bits are set; returns false otherwise, including
+when CPUID leaf 01H is not reported as available.
+*/
+inline bool sse2_available()
+{
+	int regs[4] = { 0, 0, 0, 0 };	// filled by __cpuid in the order EAX, EBX, ECX, EDX
+
+	__cpuid( regs, 0 );		// leaf 0: EAX = highest supported standard leaf
+	if( regs[0] < 1 )
+		return false;		// leaf 1 (feature flags) cannot be queried
+
+	__cpuid( regs, 1 );		// leaf 1: feature flags in ECX and EDX
+	const int edx_required = ( 1 << 25 ) | ( 1 << 26 );	// SSE, SSE2
+	return ( regs[3] & edx_required ) == edx_required;
+}
+
+/*
+Reports whether the processor supports the SSE3 extensions.
+
+Check that the processor supports the SIMD and x87 SSE3 extensions (if
+CPUID.01H:ECX.SSE3[bit 0] = 1).
+
+__cpuid is executed unconditionally, so the CPUID instruction itself is assumed
+to be present.
+
+Returns true when ECX bit 0 of CPUID leaf 01H is set; returns false otherwise,
+including when leaf 01H is not reported as available.
+*/
+inline bool sse3_available()
+{
+	int regs[4] = { 0, 0, 0, 0 };	// filled by __cpuid in the order EAX, EBX, ECX, EDX
+
+	__cpuid( regs, 0 );		// leaf 0: EAX = highest supported standard leaf
+	if( regs[0] < 1 )
+		return false;		// leaf 1 (feature flags) cannot be queried
+
+	__cpuid( regs, 1 );		// leaf 1: feature flags in ECX and EDX
+	const int ecx_sse3 = 1 << 0;
+	return ( regs[2] & ecx_sse3 ) != 0;
+}
+
+/*
+Reports whether the processor supports the SSSE3 extensions.
+
+Before an application attempts to use the SSSE3 extensions, the application should
+follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
+Next, use the additional step provided below:
+- Check that the processor supports SSSE3 (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
+
+The SSE/SSE2 step is folded in here by also requiring CPUID.01H:EDX bits 25 and 26.
+__cpuid is executed unconditionally, so the CPUID instruction itself is assumed
+to be present.
+
+Returns true only when every required bit is set; returns false otherwise,
+including when CPUID leaf 01H is not reported as available.
+*/
+inline bool ssse3_available()
+{
+	int regs[4] = { 0, 0, 0, 0 };	// filled by __cpuid in the order EAX, EBX, ECX, EDX
+
+	__cpuid( regs, 0 );		// leaf 0: EAX = highest supported standard leaf
+	if( regs[0] < 1 )
+		return false;		// leaf 1 (feature flags) cannot be queried
+
+	__cpuid( regs, 1 );		// leaf 1: feature flags in ECX and EDX
+	const int edx_required = ( 1 << 25 ) | ( 1 << 26 );	// SSE, SSE2
+	const int ecx_required = ( 1 << 9 );			// SSSE3
+	return ( regs[3] & edx_required ) == edx_required
+		&& ( regs[2] & ecx_required ) == ecx_required;
+}
+
+/*
+Reports whether the processor supports the SSE4.1 extensions.
+
+12.12.2 Checking for SSE4.1 Support
+Before an application attempts to use SSE4.1 instructions, the application should
+follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
+Next, use the additional step provided below:
+Check that the processor supports SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1),
+SSE3 (if CPUID.01H:ECX.SSE3[bit 0] = 1), and SSSE3 (if CPUID.01H:ECX.SSSE3[bit
+9] = 1).
+
+The SSE/SSE2 step is folded in here by also requiring CPUID.01H:EDX bits 25 and 26.
+__cpuid is executed unconditionally, so the CPUID instruction itself is assumed
+to be present.
+
+Returns true only when every required bit is set; returns false otherwise,
+including when CPUID leaf 01H is not reported as available.
+*/
+inline bool sse4_1_available()
+{
+	int regs[4] = { 0, 0, 0, 0 };	// filled by __cpuid in the order EAX, EBX, ECX, EDX
+
+	__cpuid( regs, 0 );		// leaf 0: EAX = highest supported standard leaf
+	if( regs[0] < 1 )
+		return false;		// leaf 1 (feature flags) cannot be queried
+
+	__cpuid( regs, 1 );		// leaf 1: feature flags in ECX and EDX
+	const int edx_required = ( 1 << 25 ) | ( 1 << 26 );		// SSE, SSE2
+	const int ecx_required = ( 1 << 19 ) | ( 1 << 9 ) | ( 1 << 0 );	// SSE4.1, SSSE3, SSE3
+	return ( regs[3] & edx_required ) == edx_required
+		&& ( regs[2] & ecx_required ) == ecx_required;
+}
+
+/*
+Reports whether the processor supports the SSE4.2 extensions.
+
+Before an application attempts to use the following SSE4.2 instructions:
+PCMPESTRI/PCMPESTRM/PCMPISTRI/PCMPISTRM, PCMPGTQ; the application should
+follow the steps illustrated in Section 11.6.2, "Checking for SSE/SSE2 Support."
+Next, use the additional step provided below:
+Check that the processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1),
+SSE4.1 (if CPUID.01H:ECX.SSE4_1[bit 19] = 1), and SSSE3 (if
+CPUID.01H:ECX.SSSE3[bit 9] = 1).
+Before an application attempts to use the CRC32 instruction, it must check that the
+processor supports SSE4.2 (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
+Before an application attempts to use the POPCNT instruction, it must check that the
+processor supports POPCNT (if CPUID.01H:ECX.POPCNT[bit 23] = 1).
+
+This function does not test the POPCNT bit. The SSE/SSE2 step is folded in here
+by also requiring CPUID.01H:EDX bits 25 and 26. __cpuid is executed
+unconditionally, so the CPUID instruction itself is assumed to be present.
+
+Returns true only when every required bit is set; returns false otherwise,
+including when CPUID leaf 01H is not reported as available.
+*/
+inline bool sse4_2_available()
+{
+	int regs[4] = { 0, 0, 0, 0 };	// filled by __cpuid in the order EAX, EBX, ECX, EDX
+
+	__cpuid( regs, 0 );		// leaf 0: EAX = highest supported standard leaf
+	if( regs[0] < 1 )
+		return false;		// leaf 1 (feature flags) cannot be queried
+
+	__cpuid( regs, 1 );		// leaf 1: feature flags in ECX and EDX
+	const int edx_required = ( 1 << 25 ) | ( 1 << 26 );		// SSE, SSE2
+	const int ecx_required = ( 1 << 20 ) | ( 1 << 19 ) | ( 1 << 9 );	// SSE4.2, SSE4.1, SSSE3
+	return ( regs[3] & edx_required ) == edx_required
+		&& ( regs[2] & ecx_required ) == ecx_required;
+}
+
+#if 1
+//
+//	Make vector.
+//
+
+// Converts the two shorts p[0] and p[1] to double and returns them as a v2pd,
+// with p[0] in the first element and p[1] in the second.
+__forceinline v2pd get_v2pd( const short* p )
+{
+	// Widen into a stack buffer first. The buffer carries no alignment
+	// request, so it is read back with the unaligned load.
+	const double widened[2] = { static_cast<double>( p[0] ), static_cast<double>( p[1] ) };
+	return _mm_loadu_pd( widened );
+}
+
+// Converts the two int32 values p[0] and p[1] to double and returns them as a
+// v2pd, with p[0] in the first element and p[1] in the second.
+__forceinline v2pd get_v2pd( const int32* p )
+{
+	// The staging buffer is declared 16-byte aligned, but it is still read
+	// back with the unaligned load.
+	__declspec(align(16)) const double widened[2] = { static_cast<double>( p[0] ), static_cast<double>( p[1] ) };
+	return _mm_loadu_pd( widened );
+}
+
+// Converts the two int64 values p[0] and p[1] to double and returns them as a
+// v2pd, with p[0] in the first element and p[1] in the second.
+// NOTE(review): if int64 is a 64-bit integer, values beyond the 53-bit double
+// mantissa are rounded by the conversion - confirm this is acceptable to callers.
+__forceinline v2pd get_v2pd( const int64* p )
+{
+	// Widen into a stack buffer first. The buffer carries no alignment
+	// request, so it is read back with the unaligned load.
+	const double widened[2] = { static_cast<double>( p[0] ), static_cast<double>( p[1] ) };
+	return _mm_loadu_pd( widened );
+}
+
+// Loads the two doubles p[0] and p[1] into a v2pd with an unaligned load,
+// so p does not have to be 16-byte aligned.
+__forceinline v2pd get_v2pd( const double* p )
+{
+	// Intended instruction: movupd reg, p[0]
+	const v2pd loaded = _mm_loadu_pd( p );
+	return loaded;
+}
+
+//
+//	Make regular types.
+//
+
+// Writes the two doubles held in reg to p[0] and p[1], converting each to short.
+// The conversion is a plain double-to-short cast: the fractional part is
+// discarded, and values that do not fit in a short are neither clamped nor checked.
+__forceinline void set_v2pd( short* p, v2pd reg )
+{
+	double lanes[2];
+	_mm_storeu_pd( lanes, reg );	// unaligned store: lanes[] carries no alignment request
+	p[0] = static_cast<short>( lanes[0] );
+	p[1] = static_cast<short>( lanes[1] );
+}
+
+// Writes the two doubles held in reg to p[0] and p[1], converting each to int32.
+// The conversion is a plain double-to-int32 cast: the fractional part is
+// discarded, and values that do not fit in an int32 are neither clamped nor checked.
+__forceinline void set_v2pd( int32* p, v2pd reg )
+{
+	// The staging buffer is declared 16-byte aligned, but it is still
+	// written with the unaligned store.
+	__declspec(align(16)) double lanes[2];
+	_mm_storeu_pd( lanes, reg );
+	p[0] = static_cast<int32>( lanes[0] );
+	p[1] = static_cast<int32>( lanes[1] );
+}
+
+// Writes the two doubles held in reg to p[0] and p[1], converting each to int64.
+// The conversion is a plain double-to-int64 cast: the fractional part is
+// discarded, and values that do not fit in an int64 are neither clamped nor checked.
+__forceinline void set_v2pd( int64* p, v2pd reg )
+{
+	double lanes[2];
+	_mm_storeu_pd( lanes, reg );	// unaligned store: lanes[] carries no alignment request
+	p[0] = static_cast<int64>( lanes[0] );
+	p[1] = static_cast<int64>( lanes[1] );
+}
+
+// Stores the two doubles held in reg to p[0] and p[1] with an unaligned store,
+// so p does not have to be 16-byte aligned.
+__forceinline void set_v2pd( double* p, v2pd reg )
+{
+	// Intended instruction: movupd [p], reg
+	_mm_storeu_pd( p, reg );
+}
+
+#endif
+
+#elif defined( __GNUC__ )
+
+#endif
+
+#endif//SSE_OPTIMIZATIONS
+#endif//__SSE_PRIM__H__