896 lines
15 KiB
C++
896 lines
15 KiB
C++
|
|
//
// The purpose of this file is to define SSE2 data types that abstract away
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
|
|
|
|
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_H_
|
|
#define _SSE2_CMPL_ABSTRACTION_MSC_H_
|
|
|
|
#include <emmintrin.h>
|
|
#include <dvec.h>
|
|
|
|
//
|
|
// Namespace sse2
|
|
//
|
|
namespace sse2
|
|
{
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
// Primitive types
|
|
//
|
|
|
|
/// 2xdouble
|
|
//
|
|
typedef __m128d rxmm128d;
|
|
|
|
/// 4xfloat
|
|
//
|
|
typedef __m128 rxmm128s;
|
|
|
|
/// 2xint64
|
|
//
|
|
typedef __m128i rxmm128l;
|
|
|
|
/// 4xint32
|
|
//
|
|
typedef __m128 rxmm128i;
|
|
|
|
/// int64
|
|
//
|
|
typedef __int64 int64;
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
/// Packed double arithmetic
|
|
//
|
|
class arithmetic_pd
|
|
{
|
|
public:
|
|
/*!
|
|
r0 := a0 + b0
|
|
r1 := a1 + b1
|
|
*/
|
|
static inline rxmm128d add( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_add_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 - b0
|
|
r1 := a1 - b1
|
|
*/
|
|
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_sub_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 * b0
|
|
r1 := a1 * b1
|
|
*/
|
|
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_mul_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 / b0
|
|
r1 := a1 / b1
|
|
*/
|
|
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_div_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := max( a0, b0 )
|
|
r1 := max( a1, b1 )
|
|
*/
|
|
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_max_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := min( a0, b0 )
|
|
r1 := min( a1, b1 )
|
|
*/
|
|
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_min_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := sqrt( a0 )
|
|
r1 := sqrt( a1 )
|
|
*/
|
|
static inline rxmm128d sqrt( rxmm128d a )
|
|
{
|
|
return _mm_sqrt_pd( a, b );
|
|
}
|
|
};
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
/// Packed double logic
|
|
//
|
|
class logic_pd
|
|
{
|
|
public:
|
|
/*!
|
|
r0 := (~a0) & b0
|
|
r1 := (~a1) & b1
|
|
*/
|
|
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_andnot_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 & b0
|
|
r1 := a1 & b1
|
|
*/
|
|
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_and_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 | b0
|
|
r1 := a1 | b1
|
|
*/
|
|
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_or_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 ^ b0
|
|
r1 := a1 ^ b1
|
|
*/
|
|
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_xor_pd( a, b );
|
|
}
|
|
};
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
/// Packed double comparison
|
|
//
|
|
class comparision_pd
|
|
{
|
|
public:
|
|
/*!
|
|
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpeq_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpneq_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmplt_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmple_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpgt_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpge_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpord_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
|
*/
|
|
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_cmpunord_pd( a, b );
|
|
}
|
|
|
|
};
|
|
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
/// Packed double logic
|
|
//
|
|
// NOTE: an empty duplicate definition of `class logic_pd` used to sit here.
// logic_pd is already defined earlier in this namespace, and a class may be
// defined only once per translation unit, so the empty placeholder was dropped.
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
//
|
|
/// Packed double logic
|
|
//
|
|
// NOTE: an empty duplicate definition of `class logic_pd` used to sit here.
// logic_pd is already defined earlier in this namespace, and a class may be
// defined only once per translation unit, so the empty placeholder was dropped.
|
|
|
|
|
|
|
|
|
|
|
|
/// Abstract
|
|
//
|
|
class func_d64x2
|
|
{
|
|
public:
|
|
typedef XMM_TYPE rxmm128d;
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Arithmetic PD
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Arithmetic SD
|
|
|
|
/*!
|
|
r0 := a0 + b0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE addsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_add_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 - b0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE subsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_sub_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 * b0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE mulsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_mul_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0 / b0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE divsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_div_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := max( a0, b0 )
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE maxsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_max_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := min( a0, b0 )
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE minsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_min_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := sqrt( b0 )
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE sqrtsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_sqrt_sd( a, b );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Logic PD
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Comparision PD
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Comparision SD
|
|
|
|
/*!
|
|
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpeqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpeq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpneqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpneq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpltsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmplt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmplesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmple_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpgtsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpgt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpgesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpge_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpordsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpord_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
|
r1 := a1
|
|
*/
|
|
static inline XMM_TYPE cmpunordsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_cmpunord_sd( a, b );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Comparision SD
|
|
|
|
/*!
|
|
r := (a0 == b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comieqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comieq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 != b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comineqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comineq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 < b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comiltsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comilt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 <= b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comilesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comile_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 > b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comigtsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comigt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 >= b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int comigesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_comige_sd( a, b );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Comparision SD
|
|
|
|
/*!
|
|
r := (a0 == b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomieqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomieq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 != b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomineqsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomineq_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 < b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomiltsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomilt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 <= b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomilesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomile_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 > b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomigtsd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomigt_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := (a0 >= b0) ? 0x1 : 0x0
|
|
*/
|
|
static inline int ucomigesd( XMM_TYPE a, XMM_TYPE b )
|
|
{
|
|
return _mm_ucomige_sd( a, b );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Conversion
|
|
|
|
/*!
|
|
r0 := (float) a0
|
|
r1 := (float) a1
|
|
r2 := 0.0
|
|
r3 := 0.0
|
|
*/
|
|
static inline rxmm128s cvtpd2ps( rxmm128d a )
|
|
{
|
|
return _mm_cvtpd_ps( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (double) a0
|
|
r1 := (double) a1
|
|
*/
|
|
static inline rxmm128d cvtps2pd( rxmm128s a )
|
|
{
|
|
return _mm_cvtps_pd( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (int) a0
|
|
r1 := (int) a1
|
|
r2 := 0.0
|
|
r3 := 0.0
|
|
*/
|
|
static inline rxmm128l cvtpd2dq( rxmm128d a )
|
|
{
|
|
return _mm_cvtpd_epi32( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (double) a0
|
|
r1 := (double) a1
|
|
*/
|
|
static inline rxmm128d cvtdq2pd( rxmm128l a )
|
|
{
|
|
return _mm_cvtepi32_pd( a );
|
|
}
|
|
|
|
/*!
|
|
r := (int) a0
|
|
*/
|
|
static inline int cvtsd2si( rxmm128d a )
|
|
{
|
|
return _mm_cvtsd_si32( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (float) b0
|
|
r1 := a1
|
|
r2 := a2
|
|
r3 := a3
|
|
*/
|
|
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
|
|
{
|
|
return _mm_cvtsd_ss( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (double) b
|
|
r1 := a1
|
|
*/
|
|
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
|
|
{
|
|
return _mm_cvtsi32_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (double) b0
|
|
r1 := a1
|
|
*/
|
|
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
|
|
{
|
|
return _mm_cvtss_sd( a, b );
|
|
}
|
|
|
|
/*!
|
|
using truncate
|
|
r0 := (int) a0
|
|
r1 := (int) a1
|
|
r2 := 0x0
|
|
r3 := 0x0
|
|
*/
|
|
static inline rxmm128l cvttpd2dq( rxmm128d a )
|
|
{
|
|
return _mm_cvttpd_epi32( a );
|
|
}
|
|
|
|
/*!
|
|
using truncate
|
|
r := (int) a0
|
|
*/
|
|
static inline int cvttsd2si( rxmm128d a )
|
|
{
|
|
return _mm_cvttsd_si32( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (float) a0
|
|
r1 := (float) a1
|
|
r2 := (float) a2
|
|
r3 := (float) a3
|
|
*/
|
|
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
|
{
|
|
return _mm_cvtepi32_ps( a );
|
|
}
|
|
|
|
/*!
|
|
r0 := (int) a0
|
|
r1 := (int) a1
|
|
r2 := (int) a2
|
|
r3 := (int) a3
|
|
*/
|
|
static inline rxmm128l cvtps2dq( rxmm128s a )
|
|
{
|
|
return _mm_cvtps_epi32( a );
|
|
}
|
|
|
|
/*!
|
|
uses trancate
|
|
r0 := (int) a0
|
|
r1 := (int) a1
|
|
r2 := (int) a2
|
|
r3 := (int) a3
|
|
*/
|
|
static inline rxmm128l cvttps2dq( rxmm128s a )
|
|
{
|
|
return _mm_cvttps_epi32( a );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Misc
|
|
|
|
/*!
|
|
r0 := a1
|
|
r1 := b1
|
|
*/
|
|
static inline rxmm128d unpckhpd( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_unpackhi_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0
|
|
r1 := b0
|
|
*/
|
|
static inline rxmm128d unpcklpd( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_unpacklo_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r := sign(a1) << 1 | sign(a0)
|
|
*/
|
|
static inline int movmskpd( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_movemask_pd( a, b );
|
|
}
|
|
|
|
/*!
|
|
r0 := (i0 == 1) ? b0 : a0
|
|
r1 := (i1 == 1) ? b1 : a1
|
|
*/
|
|
static inline int shuffle_pd( rxmm128d a, rxmm128d b, int i )
|
|
{
|
|
return _mm_shuffle_pd( a, b, i );
|
|
}
|
|
|
|
/*!
|
|
== shuffle_pd( a, b, 1 )
|
|
r0 := b0
|
|
r1 := a1
|
|
*/
|
|
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
|
|
{
|
|
return _mm_move_sd( a0 );
|
|
}
|
|
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Memory load
|
|
|
|
/*!
|
|
The address \arg p must be 16-byte aligned.
|
|
r0 := p[0]
|
|
r1 := p[1]
|
|
*/
|
|
static inline rxmm128d load_pd( double * p )
|
|
{
|
|
return _mm_load_pd( p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p must be 16-byte aligned.
|
|
r0 := p[1]
|
|
r1 := p[0]
|
|
*/
|
|
static inline rxmm128d load_pd_reverse( double * p )
|
|
{
|
|
return _mm_loadr_pd( p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := p[0]
|
|
r1 := p[1]
|
|
*/
|
|
static inline rxmm128d load_pd_unaligned( double * p )
|
|
{
|
|
return _mm_loadu_pd( p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := a0
|
|
r1 := *p
|
|
*/
|
|
static inline rxmm128d load_pd_hi( rxmm128d a, double * p )
|
|
{
|
|
return _mm_loadh_pd( a, p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := *p
|
|
r1 := a1
|
|
*/
|
|
static inline rxmm128d load_pd_lo( rxmm128d a, double * p )
|
|
{
|
|
return _mm_loadl_pd( a, p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := *p
|
|
r1 := *p
|
|
*/
|
|
static inline rxmm128d load_pd_both( double * p )
|
|
{
|
|
return _mm_load1_pd( p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := *p
|
|
r1 := 0.0
|
|
*/
|
|
static inline rxmm128d load_sd( double * p )
|
|
{
|
|
return _mm_load_sd( p );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Memory store
|
|
|
|
/*!
|
|
The address \arg p must be 16-byte aligned.
|
|
p[0] := a0
|
|
p[1] := a1
|
|
*/
|
|
static inline void store_pd( double * p, rxmm128d a )
|
|
{
|
|
_mm_load_pd( p, a );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p must be 16-byte aligned.
|
|
p[0] := a1
|
|
p[1] := a0
|
|
*/
|
|
static inline void store_pd_reverse( double * p, rxmm128d a )
|
|
{
|
|
_mm_storer_pd( p, a );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
p[0] := a0
|
|
p[1] := a1
|
|
*/
|
|
static inline void store_pd_unaligned(double * p, rxmm128d a )
|
|
{
|
|
_mm_storeu_pd( p, a );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
*p := a1
|
|
*/
|
|
static inline void store_pd_hi( double * p, rxmm128d a )
|
|
{
|
|
_mm_storeh_pd( p, a );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
*p := a0
|
|
*/
|
|
static inline void store_pd_lo( double * p, rxmm128d a )
|
|
{
|
|
_mm_storel_pd( p, a );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
p[0] := a0
|
|
p[1] := a0
|
|
*/
|
|
static inline void store_pd_both( double * p, rxmm128d a )
|
|
{
|
|
return _mm_store1_pd( p );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
*p := a0
|
|
*/
|
|
static inline void store_sd( double * p, rxmm128d a )
|
|
{
|
|
return _mm_store_sd( p );
|
|
}
|
|
|
|
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
|
// Memory set
|
|
|
|
/*!
|
|
r0 := a0
|
|
r1 := a1
|
|
*/
|
|
static inline rxmm128d set_pd( double a1, double a0 )
|
|
{
|
|
return _mm_set_pd( a1, a0 );
|
|
}
|
|
|
|
/*!
|
|
r0 := 0.0
|
|
r1 := 0.0
|
|
*/
|
|
static inline rxmm128d set_pd_zero()
|
|
{
|
|
return _mm_setzero_pd( a0 );
|
|
}
|
|
|
|
/*!
|
|
r0 := a0
|
|
r1 := a0
|
|
*/
|
|
static inline rxmm128d set_pd_both( double a0)
|
|
{
|
|
return _mm_set1_pd( a0 );
|
|
}
|
|
|
|
/*!
|
|
The address \arg p does not need to be 16-byte aligned.
|
|
r0 := a0
|
|
r1 := 0.0
|
|
*/
|
|
static inline rxmm128d set_sd( double a0 )
|
|
{
|
|
return _mm_set_sd( a0 );
|
|
}
|
|
};
|
|
|
|
//
|
|
// Namespace sse2
|
|
//
|
|
}
|
|
|
|
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_H_*/
|