Reorganizing some files.

This commit is contained in:
2015-02-08 23:37:56 +04:00
parent 7a53e5a3a1
commit 12af9d1b7f
22 changed files with 0 additions and 270 deletions

1562
misc/SSE2_transform.cpp Normal file

File diff suppressed because it is too large Load Diff

1119
misc/SSE2_transform.h Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,280 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_H_
#define _SSE2_CMPL_ABSTRACTION_H_
#include <boost/static_assert.hpp>
#include <boost/type_traits.hpp>
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// MS Visual Studio
//
#if defined( _MSC_VER ) && _MSC_VER
#include "SSE_cmplr_abstraction_MSC.h"
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// GCC
// BUG FIX: the original tested "_gcc", a macro no compiler predefines, so the
// GCC branch could never be taken; the standard GCC identification macro is
// __GNUC__.
//
#elif defined( __GNUC__ )
#include "SSE_cmplr_abstraction_GCC.h"
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// Other compilers
//
#else
#include "SSE_cmplr_abstraction_other.h"
#endif
//
// Namespace sse2
//
namespace sse2
{
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Wrapper for 128-bit XMM registers.
///
/// \tparam X the raw, compiler-specific 128-bit register type
/// \tparam T the packed element type
/// \tparam N the element count; N * sizeof(T) must equal 16 bytes
//
template <class X, class T, int N = (16 / sizeof(T))>
class xmm128
{
// Types
protected:
    /// The raw XMM register type.
    typedef X MY_XMM;
    /// The packed element type.
    typedef T MY_TYPE;
// Data
protected:
    /// Overlay of the whole register with an element array so that single
    /// lanes can be addressed through operator[].
    union
    {
        MY_XMM  x;
        MY_TYPE n[N];
    };
// Construction
public:
    //
    /// The default constructor. The register content is left uninitialized.
    //
    xmm128()
    {
        // We must be exactly 128 bits.
        static_assert( 16 / sizeof( MY_TYPE ) == N, "xmm128 must be 128 bits" );
    }
    //
    /// The copy constructor.
    //
    xmm128( const xmm128& op )
    {
        static_assert( 16 / sizeof( MY_TYPE ) == N, "xmm128 must be 128 bits" );
        x = op.x;
    }
    //
    /// Construction from the raw register type.
    /// BUG FIX: the original also executed "x = op.x;" before "x = op;" --
    /// op is the raw register type here and has no member named x.
    //
    xmm128( const MY_XMM& op )
    {
        static_assert( 16 / sizeof( MY_TYPE ) == N, "xmm128 must be 128 bits" );
        x = op;
    }
    //
    /// The destructor.
    //
    ~xmm128()
    {}
// Interface
public:
    //
    /// Assign our kind.
    //
    xmm128& operator= ( const xmm128& op )
    {
        x = op.x;
        return *this;
    }
    //
    /// Assign the raw register type.
    //
    xmm128& operator= ( const MY_XMM& op )
    {
        x = op;
        return *this;
    }
    //
    /// Conversion to the raw register type. The const version.
    //
    operator MY_XMM () const
    {
        return x;
    }
    //
    /// Lane access. Can be used as an lvalue.
    //
    MY_TYPE& operator[] ( int idx )
    {
        assert( 0 <= idx && idx < N );
        return n[idx];
    }
    //
    /// Lane access. The const version.
    //
    MY_TYPE operator[] ( int idx ) const
    {
        assert( 0 <= idx && idx < N );
        return n[idx];
    }
    //
    /// Set from two values. Valid only when N == 2.
    /// BUG FIX: the original assigned through the ill-formed expression
    /// "operator[0]"; the lanes are written directly.
    //
    void set( MY_TYPE v1, MY_TYPE v2 )
    {
        static_assert( 16 / sizeof( MY_TYPE ) == 2, "set(v1,v2) requires N == 2" );
        n[0] = v1;
        n[1] = v2;
    }
    //
    /// Set from four values. Valid only when N == 4.
    /// BUG FIX: the original declared the third parameter "v2" twice and
    /// then referenced the undeclared name "v3".
    //
    void set( MY_TYPE v1, MY_TYPE v2, MY_TYPE v3, MY_TYPE v4 )
    {
        static_assert( 16 / sizeof( MY_TYPE ) == 4, "set(v1..v4) requires N == 4" );
        n[0] = v1;
        n[1] = v2;
        n[2] = v3;
        n[3] = v4;
    }
    // TODO: operations still to be implemented (the original listed these as
    // bare, non-compiling words inside the class body):
    //   add, sub, andnot, and, or, xor, sqrt, mul, div, min, max, shift, comp
};
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
/*
class xmm128d : public xmm128 < rxmm128d, double, 2 >
{
// Construction
public:
//
/// The default constructor.
//
xmm128d()
{}
//
/// The copy constructor.
//
xmm128d( const xmm128d& op )
: xmm128( op )
{}
//
/// The copy constructor.
//
xmm128d( const MY_XMM& op )
: xmm128( op )
{}
//
/// The copy constructor.
//
xmm128d( double d1, double d2 )
{
set( d1, d2 );
}
// Interface
public:
//
/// Set from two doubles.
//
void set( double d1, double d2 )
{
operator[0] = d1;
operator[1] = d2;
}
};
*/
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_H_*/

View File

@@ -0,0 +1,60 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_GCC_H_
#define _SSE2_CMPL_ABSTRACTION_GCC_H_
#include <boost/static_assert.hpp>
//
// Namespace sse2
//
namespace sse2
{
    //
    // Primitive types. The names follow the rxmm128* convention used by the
    // MSC header and by the wrapper classes, so both back ends export the
    // same identifiers (the original declared xmm128d etc., which no
    // consumer referenced and which collides with the xmm128 class name).
    // The deprecated __attribute__((mode(...))) spelling is replaced by the
    // supported vector_size form.
    //
    /// 2 x double
    typedef double    rxmm128d __attribute__ ((vector_size(16)));
    /// 4 x float
    typedef float     rxmm128s __attribute__ ((vector_size(16)));
    /// 2 x int64
    /// BUG FIX: the original declared this with mode(V4SF) -- four floats.
    typedef long long rxmm128l __attribute__ ((vector_size(16)));
    /// 4 x int32
    /// BUG FIX: the original declared this with mode(V4SF) -- four floats.
    typedef int       rxmm128i __attribute__ ((vector_size(16)));
    /// int64
    /// BUG FIX: the original line was "typedef long int;", which declares
    /// no name at all (and "long" is only 32 bits on 32-bit targets).
    typedef long long int64;
//
// Namespace sse2
//
}
#include "SSE_cmplr_abstraction_GCC_pckdbl.h"
#include "SSE_cmplr_abstraction_GCC_pckfloat.h"
#include "SSE_cmplr_abstraction_GCC_pckint8.h"
#include "SSE_cmplr_abstraction_GCC_pckint16.h"
#include "SSE_cmplr_abstraction_GCC_pckint32.h"
#include "SSE_cmplr_abstraction_GCC_pckint64.h"
#endif/*_SSE2_CMPL_ABSTRACTION_GCC_H_*/

View File

@@ -0,0 +1,61 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_H_
#include <boost/static_assert.hpp>
#include <emmintrin.h>
//
// Namespace sse2
//
namespace sse2
{
    //
    // Primitive types, aliased from the MSC/Intel intrinsic register types.
    //
    /// 2 x double
    typedef __m128d rxmm128d;
    /// 4 x float
    typedef __m128  rxmm128s;
    /// 2 x int64
    typedef __m128i rxmm128l;
    /// 4 x int32
    /// BUG FIX: the original aliased __m128, the packed-float register type;
    /// integer lanes live in __m128i.
    typedef __m128i rxmm128i;
    /// int64
    typedef __int64 int64;
//
// Namespace sse2
//
}
#include "SSE_cmplr_abstraction_MSC_pckdbl.h"
#include "SSE_cmplr_abstraction_MSC_pckfloat.h"
#include "SSE_cmplr_abstraction_MSC_pckint8.h"
#include "SSE_cmplr_abstraction_MSC_pckint16.h"
#include "SSE_cmplr_abstraction_MSC_pckint32.h"
#include "SSE_cmplr_abstraction_MSC_pckint64.h"
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_H_*/

View File

@@ -0,0 +1,895 @@
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_H_
#include <emmintrin.h>
#include <dvec.h>
//
// Namespace sse2
//
namespace sse2
{
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// Primitive types
//
/// 2xdouble
//
typedef __m128d rxmm128d;
/// 4xfloat
//
typedef __m128 rxmm128s;
/// 2xint64
//
typedef __m128i rxmm128l;
/// 4xint32
//
typedef __m128 rxmm128i;
/// int64
//
typedef __int64 int64;
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double arithmetic. Thin wrappers over the SSE2 *_pd intrinsics;
/// every operation acts lane-wise on both doubles of the register.
//
class arithmetic_pd
{
public:
    /*! r0 := a0 + b0 ; r1 := a1 + b1 */
    static inline rxmm128d add( rxmm128d a, rxmm128d b )
    {
        return _mm_add_pd( a, b );
    }
    /*! r0 := a0 - b0 ; r1 := a1 - b1 */
    static inline rxmm128d sub( rxmm128d a, rxmm128d b )
    {
        return _mm_sub_pd( a, b );
    }
    /*! r0 := a0 * b0 ; r1 := a1 * b1 */
    static inline rxmm128d mul( rxmm128d a, rxmm128d b )
    {
        return _mm_mul_pd( a, b );
    }
    /*! r0 := a0 / b0 ; r1 := a1 / b1 */
    static inline rxmm128d div( rxmm128d a, rxmm128d b )
    {
        return _mm_div_pd( a, b );
    }
    /*! r0 := max( a0, b0 ) ; r1 := max( a1, b1 ) */
    static inline rxmm128d max( rxmm128d a, rxmm128d b )
    {
        return _mm_max_pd( a, b );
    }
    /*! r0 := min( a0, b0 ) ; r1 := min( a1, b1 ) */
    static inline rxmm128d min( rxmm128d a, rxmm128d b )
    {
        return _mm_min_pd( a, b );
    }
    /*! r0 := sqrt( a0 ) ; r1 := sqrt( a1 ) */
    static inline rxmm128d sqrt( rxmm128d a )
    {
        // BUG FIX: the original passed a non-existent second argument "b"
        // to the unary intrinsic _mm_sqrt_pd.
        return _mm_sqrt_pd( a );
    }
};
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double bitwise logic.
/// BUG FIX: the original named three methods "and", "or" and "xor" -- these
/// are reserved C++ alternative tokens and can never be identifiers -- and
/// declared their return type as the undefined name XMM_TYPE. They are
/// renamed with a trailing underscore and typed as rxmm128d. No caller could
/// ever have compiled against the old names.
//
class logic_pd
{
public:
    /*! r0 := (~a0) & b0 ; r1 := (~a1) & b1 */
    static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
    {
        return _mm_andnot_pd( a, b );
    }
    /*! r0 := a0 & b0 ; r1 := a1 & b1 */
    static inline rxmm128d and_( rxmm128d a, rxmm128d b )
    {
        return _mm_and_pd( a, b );
    }
    /*! r0 := a0 | b0 ; r1 := a1 | b1 */
    static inline rxmm128d or_( rxmm128d a, rxmm128d b )
    {
        return _mm_or_pd( a, b );
    }
    /*! r0 := a0 ^ b0 ; r1 := a1 ^ b1 */
    static inline rxmm128d xor_( rxmm128d a, rxmm128d b )
    {
        return _mm_xor_pd( a, b );
    }
};
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double comparison (class name keeps the historical spelling).
/// Each predicate is applied lane-wise; a lane of the result is an all-ones
/// 64-bit mask (0xffffffffffffffff) when the predicate holds and 0x0 otherwise,
/// suitable for use with the bitwise logic operations.
//
class comparision_pd
{
public:
/*!
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
{
return _mm_cmpeq_pd( a, b );
}
/*!
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
{
return _mm_cmpneq_pd( a, b );
}
/*!
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
{
return _mm_cmplt_pd( a, b );
}
/*!
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
{
return _mm_cmple_pd( a, b );
}
/*!
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
{
return _mm_cmpgt_pd( a, b );
}
/*!
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
{
return _mm_cmpge_pd( a, b );
}
/*!
"ordered" compare: true when neither operand lane is NaN.
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
{
return _mm_cmpord_pd( a, b );
}
/*!
"unordered" compare: true when either operand lane is NaN.
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
{
return _mm_cmpunord_pd( a, b );
}
};
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// BUG FIX: two further, empty definitions of "class logic_pd" stood here.
// logic_pd is already defined above and redefining a class is a compile
// error, so the duplicates were removed.
//
//
/// Abstract collection of the scalar-double (SD) SSE2 operations plus the
/// double<->float/int conversions, register shuffles and memory transfers.
/// All members are thin, zero-overhead wrappers over compiler intrinsics.
//
class func_d64x2
{
public:
    //
    // BUG FIX: the original typedef was reversed ("typedef XMM_TYPE rxmm128d;"),
    // which left XMM_TYPE -- used by every signature below -- undefined.
    //
    typedef rxmm128d XMM_TYPE;
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Arithmetic SD: the operation touches lane 0 only; lane 1 passes through.
    /*! r0 := a0 + b0 ; r1 := a1 */
    static inline XMM_TYPE addsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_add_sd( a, b ); }
    /*! r0 := a0 - b0 ; r1 := a1 */
    static inline XMM_TYPE subsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_sub_sd( a, b ); }
    /*! r0 := a0 * b0 ; r1 := a1 */
    static inline XMM_TYPE mulsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_mul_sd( a, b ); }
    /*! r0 := a0 / b0 ; r1 := a1 */
    static inline XMM_TYPE divsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_div_sd( a, b ); }
    /*! r0 := max( a0, b0 ) ; r1 := a1 */
    static inline XMM_TYPE maxsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_max_sd( a, b ); }
    /*! r0 := min( a0, b0 ) ; r1 := a1 */
    static inline XMM_TYPE minsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_min_sd( a, b ); }
    /*! r0 := sqrt( b0 ) ; r1 := a1 */
    static inline XMM_TYPE sqrtsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_sqrt_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Comparison SD: lane 0 becomes an all-ones/all-zero 64-bit mask; lane 1 := a1.
    /*! r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpeqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpeq_sd( a, b ); }
    /*! r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpneqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpneq_sd( a, b ); }
    /*! r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpltsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmplt_sd( a, b ); }
    /*! r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmplesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmple_sd( a, b ); }
    /*! r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpgtsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpgt_sd( a, b ); }
    /*! r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpgesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpge_sd( a, b ); }
    /*! r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpordsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpord_sd( a, b ); }
    /*! r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0 ; r1 := a1 */
    static inline XMM_TYPE cmpunordsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_cmpunord_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Comparison SD, signalling form: boolean int result from lane 0.
    /*! r := (a0 == b0) ? 0x1 : 0x0 */
    static inline int comieqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comieq_sd( a, b ); }
    /*! r := (a0 != b0) ? 0x1 : 0x0 */
    static inline int comineqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comineq_sd( a, b ); }
    /*! r := (a0 < b0) ? 0x1 : 0x0 */
    static inline int comiltsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comilt_sd( a, b ); }
    /*! r := (a0 <= b0) ? 0x1 : 0x0 */
    static inline int comilesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comile_sd( a, b ); }
    /*! r := (a0 > b0) ? 0x1 : 0x0 */
    static inline int comigtsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comigt_sd( a, b ); }
    /*! r := (a0 >= b0) ? 0x1 : 0x0 */
    static inline int comigesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_comige_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Comparison SD, non-signalling (quiet on QNaN) form.
    /*! r := (a0 == b0) ? 0x1 : 0x0 */
    static inline int ucomieqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomieq_sd( a, b ); }
    /*! r := (a0 != b0) ? 0x1 : 0x0 */
    static inline int ucomineqsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomineq_sd( a, b ); }
    /*! r := (a0 < b0) ? 0x1 : 0x0 */
    static inline int ucomiltsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomilt_sd( a, b ); }
    /*! r := (a0 <= b0) ? 0x1 : 0x0 */
    static inline int ucomilesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomile_sd( a, b ); }
    /*! r := (a0 > b0) ? 0x1 : 0x0 */
    static inline int ucomigtsd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomigt_sd( a, b ); }
    /*! r := (a0 >= b0) ? 0x1 : 0x0 */
    static inline int ucomigesd( XMM_TYPE a, XMM_TYPE b ) { return _mm_ucomige_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Conversion
    /*! r0 := (float) a0 ; r1 := (float) a1 ; r2 := 0.0 ; r3 := 0.0 */
    static inline rxmm128s cvtpd2ps( rxmm128d a ) { return _mm_cvtpd_ps( a ); }
    /*! r0 := (double) a0 ; r1 := (double) a1 */
    static inline rxmm128d cvtps2pd( rxmm128s a ) { return _mm_cvtps_pd( a ); }
    /*! r0 := (int) a0 ; r1 := (int) a1 ; r2 := 0x0 ; r3 := 0x0 */
    static inline rxmm128l cvtpd2dq( rxmm128d a ) { return _mm_cvtpd_epi32( a ); }
    /*! r0 := (double) a0 ; r1 := (double) a1 */
    static inline rxmm128d cvtdq2pd( rxmm128l a ) { return _mm_cvtepi32_pd( a ); }
    /*! r := (int) a0 */
    static inline int cvtsd2si( rxmm128d a ) { return _mm_cvtsd_si32( a ); }
    /*!
        r0 := (float) b0 ; r1 := a1 ; r2 := a2 ; r3 := a3
        BUG FIX: the pass-through operand a is a packed-float register, not
        the packed-int type rxmm128l the original declared.
    */
    static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b ) { return _mm_cvtsd_ss( a, b ); }
    /*! r0 := (double) b ; r1 := a1 */
    static inline rxmm128d cvtsi2sd( rxmm128d a, int b ) { return _mm_cvtsi32_sd( a, b ); }
    /*! r0 := (double) b0 ; r1 := a1 */
    static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b ) { return _mm_cvtss_sd( a, b ); }
    /*! using truncate: r0 := (int) a0 ; r1 := (int) a1 ; r2 := 0x0 ; r3 := 0x0 */
    static inline rxmm128l cvttpd2dq( rxmm128d a ) { return _mm_cvttpd_epi32( a ); }
    /*! using truncate: r := (int) a0 */
    static inline int cvttsd2si( rxmm128d a ) { return _mm_cvttsd_si32( a ); }
    /*! r[i] := (float) a[i], i = 0..3 */
    static inline rxmm128s cvtdq2ps( rxmm128l a ) { return _mm_cvtepi32_ps( a ); }
    /*! r[i] := (int) a[i], i = 0..3 */
    static inline rxmm128l cvtps2dq( rxmm128s a ) { return _mm_cvtps_epi32( a ); }
    /*! using truncate: r[i] := (int) a[i], i = 0..3 */
    static inline rxmm128l cvttps2dq( rxmm128s a ) { return _mm_cvttps_epi32( a ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Misc
    /*! r0 := a1 ; r1 := b1 */
    static inline rxmm128d unpckhpd( rxmm128d a, rxmm128d b ) { return _mm_unpackhi_pd( a, b ); }
    /*! r0 := a0 ; r1 := b0 */
    static inline rxmm128d unpcklpd( rxmm128d a, rxmm128d b ) { return _mm_unpacklo_pd( a, b ); }
    /*!
        r := sign(a1) << 1 | sign(a0)
        BUG FIX: the original declared and forwarded a second operand "b";
        _mm_movemask_pd is unary.
    */
    static inline int movmskpd( rxmm128d a ) { return _mm_movemask_pd( a ); }
    /*!
        r0 := (i bit 0 == 1) ? b0 : a0 ; r1 := (i bit 1 == 1) ? b1 : a1
        \arg i must be a compile-time constant (hardware immediate).
        BUG FIX: the original declared the return type as int; the intrinsic
        yields a packed-double register.
    */
    static inline rxmm128d shuffle_pd( rxmm128d a, rxmm128d b, int i ) { return _mm_shuffle_pd( a, b, i ); }
    /*!
        == shuffle_pd( a, b, 2 ): r0 := b0 ; r1 := a1
        BUG FIX: the original called _mm_move_sd with the undeclared name "a0".
    */
    static inline rxmm128d move_sd( rxmm128d a, rxmm128d b ) { return _mm_move_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory load
    /*! \arg p must be 16-byte aligned. r0 := p[0] ; r1 := p[1] */
    static inline rxmm128d load_pd( double * p ) { return _mm_load_pd( p ); }
    /*! \arg p must be 16-byte aligned. r0 := p[1] ; r1 := p[0] */
    static inline rxmm128d load_pd_reverse( double * p ) { return _mm_loadr_pd( p ); }
    /*! \arg p needs no alignment. r0 := p[0] ; r1 := p[1] */
    static inline rxmm128d load_pd_unaligned( double * p ) { return _mm_loadu_pd( p ); }
    /*! \arg p needs no alignment. r0 := a0 ; r1 := *p */
    static inline rxmm128d load_pd_hi( rxmm128d a, double * p ) { return _mm_loadh_pd( a, p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := a1 */
    static inline rxmm128d load_pd_lo( rxmm128d a, double * p ) { return _mm_loadl_pd( a, p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := *p */
    static inline rxmm128d load_pd_both( double * p ) { return _mm_load1_pd( p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := 0.0 */
    static inline rxmm128d load_sd( double * p ) { return _mm_load_sd( p ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory store
    /*!
        \arg p must be 16-byte aligned. p[0] := a0 ; p[1] := a1
        BUG FIX: the original called the load intrinsic _mm_load_pd here.
    */
    static inline void store_pd( double * p, rxmm128d a ) { _mm_store_pd( p, a ); }
    /*! \arg p must be 16-byte aligned. p[0] := a1 ; p[1] := a0 */
    static inline void store_pd_reverse( double * p, rxmm128d a ) { _mm_storer_pd( p, a ); }
    /*! \arg p needs no alignment. p[0] := a0 ; p[1] := a1 */
    static inline void store_pd_unaligned( double * p, rxmm128d a ) { _mm_storeu_pd( p, a ); }
    /*! \arg p needs no alignment. *p := a1 */
    static inline void store_pd_hi( double * p, rxmm128d a ) { _mm_storeh_pd( p, a ); }
    /*! \arg p needs no alignment. *p := a0 */
    static inline void store_pd_lo( double * p, rxmm128d a ) { _mm_storel_pd( p, a ); }
    /*!
        \arg p must be 16-byte aligned. p[0] := a0 ; p[1] := a0
        BUG FIX: the original dropped the value operand and returned from a
        void function.
    */
    static inline void store_pd_both( double * p, rxmm128d a ) { _mm_store1_pd( p, a ); }
    /*!
        \arg p needs no alignment. *p := a0
        BUG FIX: the original dropped the value operand and returned from a
        void function.
    */
    static inline void store_sd( double * p, rxmm128d a ) { _mm_store_sd( p, a ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Register set
    /*! r0 := a0 ; r1 := a1 */
    static inline rxmm128d set_pd( double a1, double a0 ) { return _mm_set_pd( a1, a0 ); }
    /*!
        r0 := 0.0 ; r1 := 0.0
        BUG FIX: the original passed the undeclared name "a0" to the no-argument
        intrinsic _mm_setzero_pd.
    */
    static inline rxmm128d set_pd_zero() { return _mm_setzero_pd(); }
    /*! r0 := a0 ; r1 := a0 */
    static inline rxmm128d set_pd_both( double a0 ) { return _mm_set1_pd( a0 ); }
    /*! r0 := a0 ; r1 := 0.0 */
    static inline rxmm128d set_sd( double a0 ) { return _mm_set_sd( a0 ); }
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_H_*/

View File

@@ -0,0 +1,624 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class pd (packed double): the full packed-double SSE2 operation set --
/// arithmetic, bitwise logic, lane-mask comparisons, shuffles, memory
/// transfers, register set and conversions. All members are thin,
/// zero-overhead wrappers over compiler intrinsics.
//
class pd
{
public:
    /// The raw register type handled by this class.
    typedef rxmm128d my_rxmm;
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Packed double arithmetic
    /*! r0 := a0 + b0 ; r1 := a1 + b1 */
    static inline rxmm128d add( rxmm128d a, rxmm128d b ) { return _mm_add_pd( a, b ); }
    /*! r0 := a0 - b0 ; r1 := a1 - b1 */
    static inline rxmm128d sub( rxmm128d a, rxmm128d b ) { return _mm_sub_pd( a, b ); }
    /*! r0 := a0 * b0 ; r1 := a1 * b1 */
    static inline rxmm128d mul( rxmm128d a, rxmm128d b ) { return _mm_mul_pd( a, b ); }
    /*! r0 := a0 / b0 ; r1 := a1 / b1 */
    static inline rxmm128d div( rxmm128d a, rxmm128d b ) { return _mm_div_pd( a, b ); }
    /*! r0 := max( a0, b0 ) ; r1 := max( a1, b1 ) */
    static inline rxmm128d max( rxmm128d a, rxmm128d b ) { return _mm_max_pd( a, b ); }
    /*! r0 := min( a0, b0 ) ; r1 := min( a1, b1 ) */
    static inline rxmm128d min( rxmm128d a, rxmm128d b ) { return _mm_min_pd( a, b ); }
    /*! r0 := sqrt( a0 ) ; r1 := sqrt( a1 ) */
    static inline rxmm128d sqrt( rxmm128d a ) { return _mm_sqrt_pd( a ); }
    /*!
        r0 := recip( a0 ) ; r1 := recip( a1 )
        There is no double-precision reciprocal intrinsic, so it is computed
        as an exact division 1.0 / a.
    */
    static inline rxmm128d rcp( rxmm128d a )
    {
        rxmm128d t = _mm_set1_pd( 1.0 );
        return _mm_div_pd( t, a );
    }
    /*!
        r0 := recip( sqrt( a0 ) ) ; r1 := recip( sqrt( a1 ) )
        Computed as 1.0 / sqrt( a ); see rcp.
    */
    static inline rxmm128d rsqrt( rxmm128d a )
    {
        rxmm128d t = _mm_set1_pd( 1.0 );
        rxmm128d u = _mm_sqrt_pd( a );
        return _mm_div_pd( t, u );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Packed double bitwise logic
    // BUG FIX: "and", "or" and "xor" are reserved C++ alternative tokens and
    // can never be identifiers, and XMM_TYPE is undefined in this class; the
    // methods are renamed with a trailing underscore and typed as rxmm128d.
    /*! r0 := (~a0) & b0 ; r1 := (~a1) & b1 */
    static inline rxmm128d andnot( rxmm128d a, rxmm128d b ) { return _mm_andnot_pd( a, b ); }
    /*! r0 := a0 & b0 ; r1 := a1 & b1 */
    static inline rxmm128d and_( rxmm128d a, rxmm128d b ) { return _mm_and_pd( a, b ); }
    /*! r0 := a0 | b0 ; r1 := a1 | b1 */
    static inline rxmm128d or_( rxmm128d a, rxmm128d b ) { return _mm_or_pd( a, b ); }
    /*! r0 := a0 ^ b0 ; r1 := a1 ^ b1 */
    static inline rxmm128d xor_( rxmm128d a, rxmm128d b ) { return _mm_xor_pd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Packed double comparison: each result lane is an all-ones 64-bit mask
    // when the predicate holds, 0x0 otherwise.
    /*! r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 == b1) ? ... */
    static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b ) { return _mm_cmpeq_pd( a, b ); }
    /*! r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 != b1) ? ... */
    static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b ) { return _mm_cmpneq_pd( a, b ); }
    /*! r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 < b1) ? ... */
    static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b ) { return _mm_cmplt_pd( a, b ); }
    /*! r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 <= b1) ? ... */
    static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b ) { return _mm_cmple_pd( a, b ); }
    /*! r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 > b1) ? ... */
    static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b ) { return _mm_cmpgt_pd( a, b ); }
    /*! r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0 ; r1 := (a1 >= b1) ? ... */
    static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b ) { return _mm_cmpge_pd( a, b ); }
    /*! ordered (neither lane NaN): r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0 ; ... */
    static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b ) { return _mm_cmpord_pd( a, b ); }
    /*! unordered (either lane NaN): r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0 ; ... */
    static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b ) { return _mm_cmpunord_pd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Misc
    /*! r0 := a1 ; r1 := b1 */
    static inline rxmm128d unpckh( rxmm128d a, rxmm128d b ) { return _mm_unpackhi_pd( a, b ); }
    /*! r0 := a0 ; r1 := b0 */
    static inline rxmm128d unpckl( rxmm128d a, rxmm128d b ) { return _mm_unpacklo_pd( a, b ); }
    /*!
        r := sign(a1) << 1 | sign(a0)
        BUG FIX: the original declared and forwarded a second operand "b";
        _mm_movemask_pd is unary.
    */
    static inline int movmsk( rxmm128d a ) { return _mm_movemask_pd( a ); }
    /*!
        r0 := (i bit 0 == 1) ? b0 : a0 ; r1 := (i bit 1 == 1) ? b1 : a1
        \arg i must be a compile-time constant (hardware immediate).
        BUG FIX: the original declared the return type as int; the intrinsic
        yields a packed-double register.
    */
    static inline rxmm128d shuffle( rxmm128d a, rxmm128d b, int i ) { return _mm_shuffle_pd( a, b, i ); }
    /*!
        == shuffle( a, b, 2 ): r0 := b0 ; r1 := a1
        BUG FIX: the original called _mm_move_sd with the undeclared name "a0".
    */
    static inline rxmm128d move_sd( rxmm128d a, rxmm128d b ) { return _mm_move_sd( a, b ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory load
    /*! \arg p must be 16-byte aligned. r0 := p[0] ; r1 := p[1] */
    static inline rxmm128d load( double * p ) { return _mm_load_pd( p ); }
    /*! \arg p must be 16-byte aligned. r0 := p[1] ; r1 := p[0] */
    static inline rxmm128d load_reverse( double * p ) { return _mm_loadr_pd( p ); }
    /*! \arg p needs no alignment. r0 := p[0] ; r1 := p[1] */
    static inline rxmm128d load_unaligned( double * p ) { return _mm_loadu_pd( p ); }
    /*! \arg p needs no alignment. r0 := a0 ; r1 := *p */
    static inline rxmm128d load_hi( rxmm128d a, double * p ) { return _mm_loadh_pd( a, p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := a1 */
    static inline rxmm128d load_lo( rxmm128d a, double * p ) { return _mm_loadl_pd( a, p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := *p */
    static inline rxmm128d load_both( double * p ) { return _mm_load1_pd( p ); }
    /*! \arg p needs no alignment. r0 := *p ; r1 := 0.0 */
    static inline rxmm128d load_s( double * p ) { return _mm_load_sd( p ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory store
    /*! \arg p must be 16-byte aligned. p[0] := a0 ; p[1] := a1 */
    static inline void store( double * p, rxmm128d a ) { _mm_store_pd( p, a ); }
    /*! \arg p must be 16-byte aligned. p[0] := a1 ; p[1] := a0 */
    static inline void store_reverse( double * p, rxmm128d a ) { _mm_storer_pd( p, a ); }
    /*! \arg p needs no alignment. p[0] := a0 ; p[1] := a1 */
    static inline void store_unaligned( double * p, rxmm128d a ) { _mm_storeu_pd( p, a ); }
    /*! \arg p needs no alignment. *p := a1 */
    static inline void store_hi( double * p, rxmm128d a ) { _mm_storeh_pd( p, a ); }
    /*! \arg p needs no alignment. *p := a0 */
    static inline void store_lo( double * p, rxmm128d a ) { _mm_storel_pd( p, a ); }
    /*!
        \arg p must be 16-byte aligned. p[0] := a0 ; p[1] := a0
        BUG FIX: the original dropped the value operand and returned from a
        void function.
    */
    static inline void store_both( double * p, rxmm128d a ) { _mm_store1_pd( p, a ); }
    /*!
        \arg p needs no alignment. *p := a0
        BUG FIX: the original dropped the value operand and returned from a
        void function.
    */
    static inline void store_s( double * p, rxmm128d a ) { _mm_store_sd( p, a ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Register set
    /*! r0 := a0 ; r1 := a1 */
    static inline rxmm128d set( double a1, double a0 ) { return _mm_set_pd( a1, a0 ); }
    /*!
        r0 := 0.0 ; r1 := 0.0
        BUG FIX: the original passed the undeclared name "a0" to the no-argument
        intrinsic _mm_setzero_pd.
    */
    static inline rxmm128d set_zero() { return _mm_setzero_pd(); }
    /*! r0 := a0 ; r1 := a0 */
    static inline rxmm128d set_both( double a0 ) { return _mm_set1_pd( a0 ); }
    /*! r0 := a0 ; r1 := 0.0 */
    static inline rxmm128d set_s( double a0 ) { return _mm_set_sd( a0 ); }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Packed double conversion
    /*! r0 := (float) a0 ; r1 := (float) a1 ; r2 := 0.0 ; r3 := 0.0 */
    static inline rxmm128s cvtpd2ps( rxmm128d a ) { return _mm_cvtpd_ps( a ); }
    /*! r0 := (double) a0 ; r1 := (double) a1 */
    static inline rxmm128d cvtps2pd( rxmm128s a ) { return _mm_cvtps_pd( a ); }
    /*! r0 := (int) a0 ; r1 := (int) a1 ; r2 := 0x0 ; r3 := 0x0 */
    static inline rxmm128l cvtpd2dq( rxmm128d a ) { return _mm_cvtpd_epi32( a ); }
    /*! r0 := (double) a0 ; r1 := (double) a1 */
    static inline rxmm128d cvtdq2pd( rxmm128l a ) { return _mm_cvtepi32_pd( a ); }
    /*! r := (int) a0 */
    static inline int cvtsd2si( rxmm128d a ) { return _mm_cvtsd_si32( a ); }
    /*!
        r0 := (float) b0 ; r1 := a1 ; r2 := a2 ; r3 := a3
        BUG FIX: the pass-through operand a is a packed-float register, not
        the packed-int type rxmm128l the original declared.
    */
    static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b ) { return _mm_cvtsd_ss( a, b ); }
    /*! r0 := (double) b ; r1 := a1 */
    static inline rxmm128d cvtsi2sd( rxmm128d a, int b ) { return _mm_cvtsi32_sd( a, b ); }
    /*! r0 := (double) b0 ; r1 := a1 */
    static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b ) { return _mm_cvtss_sd( a, b ); }
    /*! using truncate: r0 := (int) a0 ; r1 := (int) a1 ; r2 := 0x0 ; r3 := 0x0 */
    static inline rxmm128l cvttpd2dq( rxmm128d a ) { return _mm_cvttpd_epi32( a ); }
    /*! using truncate: r := (int) a0 */
    static inline int cvttsd2si( rxmm128d a ) { return _mm_cvttsd_si32( a ); }
    /*! r[i] := (float) a[i], i = 0..3 */
    static inline rxmm128s cvtdq2ps( rxmm128l a ) { return _mm_cvtepi32_ps( a ); }
    /*! r[i] := (int) a[i], i = 0..3 */
    static inline rxmm128l cvtps2dq( rxmm128s a ) { return _mm_cvtps_epi32( a ); }
    /*! using truncate: r[i] := (int) a[i], i = 0..3 */
    static inline rxmm128l cvttps2dq( rxmm128s a ) { return _mm_cvttps_epi32( a ); }
//
// class pd
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_*/

View File

@@ -0,0 +1,667 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types that abstract away the
// compiler-specific constructs. Currently the target compilers are GCC and MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class ps (packed single precision)
//
class ps
{
public:
//
/// The type.
//
typedef rxmm128s my_rxmm;
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double arithmetic
//
/*!
r0 := a0 + b0
r1 := a1 + b1
r2 := a2 + b2
r3 := a3 + b3
*/
static inline rxmm128s add( rxmm128s a, rxmm128s b )
{
return _mm_add_ps( a, b );
}
/*!
r0 := a0 - b0
r1 := a1 - b1
r2 := a2 - b2
r3 := a3 - b3
*/
static inline rxmm128s sub( rxmm128s a, rxmm128s b )
{
return _mm_sub_ps( a, b );
}
/*!
r0 := a0 * b0
r1 := a1 * b1
r2 := a2 * b2
r3 := a3 * b3
*/
static inline rxmm128s mul( rxmm128s a, rxmm128s b )
{
return _mm_mul_ps( a, b );
}
/*!
r0 := a0 / b0
r1 := a1 / b1
r2 := a2 / b2
r3 := a3 / b3
*/
static inline rxmm128s div( rxmm128s a, rxmm128s b )
{
return _mm_div_ps( a, b );
}
/*!
r0 := max(a0, b0)
r1 := max(a1, b1)
r2 := max(a2, b2)
r3 := max(a3, b3)
*/
static inline rxmm128s max( rxmm128s a, rxmm128s b )
{
return _mm_max_ps( a, b );
}
/*!
r0 := min(a0, b0)
r1 := min(a1, b1)
r2 := min(a2, b2)
r3 := min(a3, b3)
*/
static inline rxmm128s min( rxmm128s a, rxmm128s b )
{
return _mm_min_ps( a, b );
}
/*!
r0 := sqrt(a0)
r1 := sqrt(a1)
r2 := sqrt(a2)
r3 := sqrt(a3)
*/
static inline rxmm128s sqrt( rxmm128s a )
{
return _mm_sqrt_ps( a );
}
/*!
r0 := recip(a0)
r1 := recip(a1)
r2 := recip(a2)
r3 := recip(a3)
*/
static inline rxmm128s rcp( rxmm128s a )
{
return _mm_rcp_ps( a );
}
/*!
r0 := recip(sqrt(a0))
r1 := recip(sqrt(a1))
r2 := recip(sqrt(a2))
r3 := recip(sqrt(a3))
*/
static inline rxmm128s rsqrt( rxmm128s a )
{
return _mm_rsqrt_ps( a );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double logic
//
/*!
r0 := ~a0 & b0
r1 := ~a1 & b1
r2 := ~a2 & b2
r3 := ~a3 & b3
*/
static inline rxmm128s andnot( rxmm128s a, rxmm128s b )
{
return _mm_andnot_ps( a, b );
}
/*!
r0 := a0 & b0
r1 := a1 & b1
*/
static inline XMM_TYPE and( rxmm128s a, rxmm128s b )
{
return _mm_and_ps( a, b );
}
/*!
r0 := a0 | b0
r1 := a1 | b1
*/
static inline XMM_TYPE or( rxmm128s a, rxmm128s b )
{
return _mm_or_ps( a, b );
}
/*!
r0 := a0 ^ b0
r1 := a1 ^ b1
r2 := a2 ^ b2
r3 := a3 ^ b3
*/
static inline XMM_TYPE xor( rxmm128s a, rxmm128s b )
{
return _mm_xor_ps( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double comparision
//
/*!
r0 := (a0 == b0) ? 0xffffffff : 0x0
r1 := (a1 == b1) ? 0xffffffff : 0x0
r2 := (a2 == b2) ? 0xffffffff : 0x0
r3 := (a3 == b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_eq( rxmm128s a, rxmm128s b )
{
    // Removed a stray BOOST_STATIC_ASSERT( false ) that made the header
    // uncompilable; _mm_cmpeq_ps implements the documented mask semantics.
    return _mm_cmpeq_ps( a, b );
}
/*!
r0 := (a0 != b0) ? 0xffffffff : 0x0
r1 := (a1 != b1) ? 0xffffffff : 0x0
r2 := (a2 != b2) ? 0xffffffff : 0x0
r3 := (a3 != b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_neq( rxmm128s a, rxmm128s b )
{
return _mm_cmpneq_ps( a, b );
}
/*!
r0 := (a0 < b0) ? 0xffffffff : 0x0
r1 := (a1 < b1) ? 0xffffffff : 0x0
r2 := (a2 < b2) ? 0xffffffff : 0x0
r3 := (a3 < b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_lt( rxmm128s a, rxmm128s b )
{
return _mm_cmplt_ps( a, b );
}
/*!
r0 := (a0 <= b0) ? 0xffffffff : 0x0
r1 := (a1 <= b1) ? 0xffffffff : 0x0
r2 := (a2 <= b2) ? 0xffffffff : 0x0
r3 := (a3 <= b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_le( rxmm128s a, rxmm128s b )
{
    // Removed a stray BOOST_STATIC_ASSERT( false ); _mm_cmple_ps exists and
    // implements the documented semantics directly.
    return _mm_cmple_ps( a, b );
}
/*!
r0 := (a0 > b0) ? 0xffffffff : 0x0
r1 := (a1 > b1) ? 0xffffffff : 0x0
r2 := (a2 > b2) ? 0xffffffff : 0x0
r3 := (a3 > b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_gt( rxmm128s a, rxmm128s b )
{
    // Removed a stray BOOST_STATIC_ASSERT( false ); _mm_cmpgt_ps exists and
    // implements the documented semantics directly.
    return _mm_cmpgt_ps( a, b );
}
/*!
r0 := (a0 >= b0) ? 0xffffffff : 0x0
r1 := (a1 >= b1) ? 0xffffffff : 0x0
r2 := (a2 >= b2) ? 0xffffffff : 0x0
r3 := (a3 >= b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_ge( rxmm128s a, rxmm128s b )
{
return _mm_cmpge_ps( a, b );
}
/*!
r0 := (a0 ord b0) ? 0xffffffff : 0x0
r1 := (a1 ord b1) ? 0xffffffff : 0x0
r2 := (a2 ord b2) ? 0xffffffff : 0x0
r3 := (a3 ord b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_ord( rxmm128s a, rxmm128s b )
{
return _mm_cmpord_ps( a, b );
}
/*!
r0 := (a0 unord b0) ? 0xffffffff : 0x0
r1 := (a1 unord b1) ? 0xffffffff : 0x0
r2 := (a2 unord b2) ? 0xffffffff : 0x0
r3 := (a3 unord b3) ? 0xffffffff : 0x0
*/
static inline rxmm128s cmp_unord( rxmm128s a, rxmm128s b )
{
return _mm_cmpunord_ps( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double load
//
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Misc
/*!
r0 := a2
r1 := b2
r2 := a3
r3 := b3
*/
static inline rxmm128s unpckh( rxmm128s a, rxmm128s b )
{
return _mm_unpackhi_ps( a, b );
}
/*!
r0 := a0
r1 := b0
r2 := a1
r3 := b1
*/
static inline rxmm128s unpckl( rxmm128s a, rxmm128s b )
{
return _mm_unpacklo_ps( a, b );
}
/*!
r := sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0)
*/
/// Collect the sign bits of the four float lanes into the low 4 bits of an int.
static inline int movmsk( rxmm128s a, rxmm128s b )
{
    // _mm_movemask_ps takes a single operand; the second parameter is kept
    // only so existing callers keep compiling, and is intentionally unused.
    (void) b;
    return _mm_movemask_ps( a );
}
/*!
r0 := (i0 == 1) ? b0 : a0
r1 := (i1 == 1) ? b1 : a1
r2 := (i2 == 1) ? b2 : a2
r3 := (i3 == 1) ? b3 : a3
\sa movmsk
*/
static inline int shuffle( rxmm128s a, rxmm128s b, int i )
{
return _mm_shuffle_ps( a, b, i );
}
/*!
r3 := a3
r2 := a2
r1 := b3
r0 := b2
*/
static inline rxmm128s move_hl( rxmm128s a, rxmm128s b )
{
    // Was `mm_movehl_ps( a0 )`: a misspelled intrinsic name and an undefined
    // argument. MOVHLPS takes both source registers.
    return _mm_movehl_ps( a, b );
}
/*!
r3 := b1
r2 := b0
r1 := a1
r0 := a0
*/
static inline rxmm128s move_lh( rxmm128s a, rxmm128s b )
{
    // Was `_mm_movelh_ps( a0 )`: wrong arity and an undefined name.
    // MOVLHPS takes both source registers.
    return _mm_movelh_ps( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory load
/*!
The address \arg p must be 16-byte aligned.
r0 := p[0]
r1 := p[1]
r2 := p[2]
r3 := p[3]
*/
static inline rxmm128s load( float * p )
{
return _mm_load_ps( p );
}
/*!
The address \arg p must be 16-byte aligned.
r0 := p[3]
r1 := p[2]
r2 := p[1]
r3 := p[0]
*/
static inline rxmm128s load_reverse( float * p )
{
return _mm_loadr_ps( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := p[0]
r1 := p[1]
r2 := p[2]
r3 := p[3]
*/
static inline rxmm128s load_unaligned( float * p )
{
return _mm_loadu_ps( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := *p
r2 := *p
r3 := *p
*/
static inline rxmm128s load_both( float * p )
{
return _mm_load1_ps( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128s load_s( float * p )
{
return _mm_load_ss( p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory store
/*!
The address \arg p must be 16-byte aligned.
p[0] := a0
p[1] := a1
p[2] := a2
p[3] := a3
*/
static inline void store( float * p, rxmm128s a )
{
_mm_store_ps( p, a );
}
/*!
The address \arg p must be 16-byte aligned.
p[0] := a3
p[1] := a2
p[2] := a1
p[3] := a0
*/
static inline void store_reverse( float * p, rxmm128s a )
{
_mm_storer_ps( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a1
p[2] := a2
p[3] := a3
*/
static inline void store_unaligned(float * p, rxmm128s a )
{
_mm_storeu_ps( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a0
*/
static inline void store_both( float * p, rxmm128s a )
{
return _mm_store1_ps( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
*p := a0
*/
static inline void store_s( float * p, rxmm128s a )
{
return _mm_store_ss( p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory set
/*!
r0 := a0
r1 := a1
r2 := a2
r3 := a3
*/
static inline rxmm128s set( float a3, float a2, float a1, float a0 )
{
return _mm_set_ps( a3, a2, a1, a0 );
}
/*!
r0 := 0.0
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128s set_zero()
{
    // Was `_mm_setzero_ps( a0 )`: the intrinsic takes no arguments and `a0`
    // is not in scope here.
    return _mm_setzero_ps();
}
/*!
r0 := a0
r1 := a0
r2 := a0
r3 := a0
*/
static inline rxmm128s set_both( float a0 )
{
return _mm_set1_ps( a0 );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := a0
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128s set_s( float a0 )
{
return _mm_set_ss( a0 );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double convertion
//
/*!
r0 := (float) a0
r1 := (float) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128s cvtpd2ps( rxmm128d a )
{
    // Parameter type fixed: _mm_cvtpd_ps converts a packed-double register,
    // so `a` must be rxmm128d (the old rxmm128s declaration could not
    // compile against the intrinsic).
    return _mm_cvtpd_ps( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128s cvtps2pd( rxmm128s a )
{
return _mm_cvtps_ps( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l cvtpd2dq( rxmm128d a )
{
    // Parameter type fixed: _mm_cvtpd_epi32 takes a packed-double register,
    // not packed floats.
    return _mm_cvtpd_epi32( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128s cvtdq2pd( rxmm128l a )
{
return _mm_cvtepi32_ps( a );
}
/*!
r := (int) a0
*/
static inline int cvtsd2si( rxmm128s a )
{
    // NOTE(review): _mm_cvtsd_si32 takes a packed-double (__m128d) operand;
    // the rxmm128s parameter looks like a copy-paste from the pd class and
    // cannot compile against the intrinsic — confirm intent, likely the
    // parameter should be rxmm128d as in the pd class.
    return _mm_cvtsd_si32( a );
}
/*!
r0 := (float) b0
r1 := a1
r2 := a2
r3 := a3
*/
static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b )
{
    // Signature fixed to match _mm_cvtsd_ss( __m128 a, __m128d b ): `a` was
    // wrongly declared rxmm128l (integer) and `b` as packed-float.
    return _mm_cvtsd_ss( a, b );
}
/*!
r0 := (double) b
r1 := a1
*/
static inline rxmm128s cvtsi2sd( rxmm128s a, int b )
{
return _mm_cvtsi32_sd( a, b );
}
/*!
r0 := (double) b0
r1 := a1
*/
static inline rxmm128s cvtss2sd( rxmm128s a, rxmm128s b )
{
return _mm_cvtss_sd( a, b );
}
/*!
using truncate
r0 := (int) a0
r1 := (int) a1
r2 := 0x0
r3 := 0x0
*/
static inline rxmm128l cvttpd2dq( rxmm128d a )
{
    // Parameter type fixed: _mm_cvttpd_epi32 truncates a packed-double
    // register, not packed floats.
    return _mm_cvttpd_epi32( a );
}
/*!
using truncate
r := (int) a0
*/
static inline int cvttsd2si( rxmm128s a )
{
return _mm_cvttsd_si32( a );
}
/*!
r0 := (float) a0
r1 := (float) a1
r2 := (float) a2
r3 := (float) a3
*/
static inline rxmm128s cvtdq2ps( rxmm128l a )
{
return _mm_cvtepi32_ps( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvtps2dq( rxmm128s a )
{
return _mm_cvtps_epi32( a );
}
/*!
uses trancate
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvttps2dq( rxmm128s a )
{
return _mm_cvttps_epi32( a );
}
//
// class ps
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_*/

View File

@@ -0,0 +1,618 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types to abstacr from the compiler
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class epi64 (packed single precision)
//
class epi64
{
public:
//
/// The type.
//
typedef rxmm128l my_rxmm;
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer arithmetic
//
/*!
r0 := a0 + b0
r1 := a1 + b1
*/
static inline rxmm128d add( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_add_pd( a, b );
}
/*!
r0 := a0 - b0
r1 := a1 - b1
*/
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_sub_pd( a, b );
}
/*!
r0 := a0 * b0
r1 := a1 * b1
*/
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_mul_pd( a, b );
}
/*!
r0 := a0 / b0
r1 := a1 / b1
*/
static inline rxmm128d div( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_div_pd( a, b );
}
/*!
r0 := max( a0, b0 )
r1 := max( a1, b1 )
*/
static inline rxmm128d max( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_max_pd( a, b );
}
/*!
r0 := min( a0, b0 )
r1 := min( a1, b1 )
*/
static inline rxmm128d min( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_min_pd( a, b );
}
/*!
r0 := sqrt( a0 )
r1 := sqrt( a1 )
*/
static inline rxmm128d sqrt( rxmm128d a )
{
    // Kept as an intentionally-asserting stub for the integer class, but the
    // undefined `b` argument is removed so the expression is well-formed
    // (_mm_sqrt_pd is unary).
    BOOST_STATIC_ASSERT( false );
    return _mm_sqrt_pd( a );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer logic
//
/*!
r0 := (~a0) & b0
r1 := (~a1) & b1
*/
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_andnot_pd( a, b );
}
/*!
r0 := a0 & b0
r1 := a1 & b1
*/
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_and_pd( a, b );
}
/*!
r0 := a0 | b0
r1 := a1 | b1
*/
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_or_pd( a, b );
}
/*!
r0 := a0 ^ b0
r1 := a1 ^ b1
*/
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_xor_pd( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer comparision
//
/*!
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_cmpeq_pd( a, b );
}
/*!
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
{
return _mm_cmpneq_pd( a, b );
}
/*!
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
{
return _mm_cmplt_pd( a, b );
}
/*!
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_cmple_pd( a, b );
}
/*!
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
{
BOOST_STATIC_ASSERT( false );
return _mm_cmpgt_pd( a, b );
}
/*!
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
{
return _mm_cmpge_pd( a, b );
}
/*!
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
{
return _mm_cmpord_pd( a, b );
}
/*!
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
*/
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
{
return _mm_cmpunord_pd( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer load
//
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Misc
/*!
r0 := a1
r1 := b1
*/
static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
{
return _mm_unpackhi_pd( a, b );
}
/*!
r0 := a0
r1 := b0
*/
static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
{
return _mm_unpacklo_pd( a, b );
}
/*!
r := sign(a1) << 1 | sign(a0)
*/
static inline int movmsk( rxmm128d a, rxmm128d b )
{
return _mm_movemask_pd( a, b );
}
/*!
r0 := (i0 == 1) ? b0 : a0
r1 := (i1 == 1) ? b1 : a1
\sa movmsk
*/
static inline int shuffle( rxmm128d a, rxmm128d b, int i )
{
return _mm_shuffle_pd( a, b, i );
}
/*!
== shuffle( a, b, 1 )
r0 := b0
r1 := a1
*/
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
{
    // Was `_mm_move_sd( a0 )`: wrong arity and an undefined name; the
    // intrinsic combines both operands (r0 := b0, r1 := a1).
    return _mm_move_sd( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory load
/*!
The address \arg p must be 16-byte aligned.
r0 := p[0]
r1 := p[1]
*/
static inline rxmm128d load( double * p )
{
return _mm_load_pd( p );
}
/*!
The address \arg p must be 16-byte aligned.
r0 := p[1]
r1 := p[0]
*/
static inline rxmm128d load_reverse( double * p )
{
return _mm_loadr_pd( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := p[0]
r1 := p[1]
*/
static inline rxmm128d load_unaligned( double * p )
{
return _mm_loadu_pd( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := a0
r1 := *p
*/
static inline rxmm128d load_hi( rxmm128d a, double * p )
{
return _mm_loadh_pd( a, p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := a1
*/
static inline rxmm128d load_lo( rxmm128d a, double * p )
{
return _mm_loadl_pd( a, p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := *p
*/
static inline rxmm128d load_both( double * p )
{
return _mm_load1_pd( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := 0.0
*/
static inline rxmm128d load_sd( double * p )
{
return _mm_load_sd( p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory store
/*!
The address \arg p must be 16-byte aligned.
p[0] := a0
p[1] := a1
*/
static inline void store( double * p, rxmm128d a )
{
_mm_load_pd( p, a );
}
/*!
The address \arg p must be 16-byte aligned.
p[0] := a1
p[1] := a0
*/
static inline void store_reverse( double * p, rxmm128d a )
{
_mm_storer_pd( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a1
*/
static inline void store_unaligned(double * p, rxmm128d a )
{
_mm_storeu_pd( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
*p := a1
*/
static inline void store_hi( double * p, rxmm128d a )
{
_mm_storeh_pd( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
*p := a0
*/
static inline void store_lo( double * p, rxmm128d a )
{
_mm_storel_pd( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a0
*/
static inline void store_both( double * p, rxmm128d a )
{
return _mm_store1_pd( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
*p := a0
*/
static inline void store_sd( double * p, rxmm128d a )
{
return _mm_store_sd( p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory set
/*!
r0 := a0
r1 := a1
*/
static inline rxmm128d set( double a1, double a0 )
{
return _mm_set_pd( a1, a0 );
}
/*!
r0 := 0.0
r1 := 0.0
*/
static inline rxmm128d set_zero()
{
    // Was `_mm_setzero_pd( a0 )`: the intrinsic takes no arguments and `a0`
    // is not in scope here.
    return _mm_setzero_pd();
}
/*!
r0 := a0
r1 := a0
*/
static inline rxmm128d set_both( double a0 )
{
return _mm_set1_pd( a0 );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := a0
r1 := 0.0
*/
static inline rxmm128d set_sd( double a0 )
{
return _mm_set_sd( a0 );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer convertion
//
/*!
r0 := (float) a0
r1 := (float) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128s cvtpd2ps( rxmm128d a )
{
return _mm_cvtpd_ps( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128d cvtps2pd( rxmm128s a )
{
return _mm_cvtps_pd( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l cvtpd2dq( rxmm128d a )
{
return _mm_cvtpd_epi32( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128d cvtdq2pd( rxmm128l a )
{
return _mm_cvtepi32_pd( a );
}
/*!
r := (int) a0
*/
static inline int cvtsd2si( rxmm128d a )
{
return _mm_cvtsd_si32( a );
}
/*!
r0 := (float) b0
r1 := a1
r2 := a2
r3 := a3
*/
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
{
return _mm_cvtsd_ss( a, b );
}
/*!
r0 := (double) b
r1 := a1
*/
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
{
return _mm_cvtsi32_sd( a, b );
}
/*!
r0 := (double) b0
r1 := a1
*/
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
{
return _mm_cvtss_sd( a, b );
}
/*!
using truncate
r0 := (int) a0
r1 := (int) a1
r2 := 0x0
r3 := 0x0
*/
static inline rxmm128l cvttpd2dq( rxmm128d a )
{
return _mm_cvttpd_epi32( a );
}
/*!
using truncate
r := (int) a0
*/
static inline int cvttsd2si( rxmm128d a )
{
return _mm_cvttsd_si32( a );
}
/*!
r0 := (float) a0
r1 := (float) a1
r2 := (float) a2
r3 := (float) a3
*/
static inline rxmm128s cvtdq2ps( rxmm128l a )
{
return _mm_cvtepi32_ps( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvtps2dq( rxmm128s a )
{
return _mm_cvtps_epi32( a );
}
/*!
uses trancate
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvttps2dq( rxmm128s a )
{
return _mm_cvttps_epi32( a );
}
//
// class epi64
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_*/

View File

@@ -0,0 +1,676 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types to abstacr from the compiler
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class epi64 (packed single precision)
//
class epi64
{
public:
//
/// The type.
//
typedef rxmm128l my_rxmm;
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer arithmetic
//
/*!
r0 := a0 + b0
r1 := a1 + b1
r2 := a2 + b2
r3 := a3 + b3
*/
static inline rxmm128l add( rxmm128l a, rxmm128l b )
{
return _mm_add_epi32( a, b );
}
/*!
r0 := a0 - b0
r1 := a1 - b1
r2 := a2 - b2
r3 := a3 - b3
*/
static inline rxmm128l sub( rxmm128l a, rxmm128l b )
{
return _mm_sub_epi32( a, b );
}
/*!
r0 := a0 * b0
r1 := a1 * b1
r2 := a2 * b2
r3 := a3 * b3
\note Emulating through float. May be precision loss.
*/
static inline rxmm128l mul( rxmm128l a, rxmm128l b )
{
    // `register` dropped: it is deprecated and removed in C++17, and has no
    // effect on modern compilers.
    // NOTE(review): the float round-trip is exact only while the product
    // fits in the 24-bit float mantissa — confirm callers' value ranges.
    rxmm128s fa = _mm_cvtepi32_ps( a );
    rxmm128s fb = _mm_cvtepi32_ps( b );
    rxmm128s prod = _mm_mul_ps( fa, fb );
    return _mm_cvtps_epi32( prod );
}
/*!
r0 := a0 / b0
r1 := a1 / b1
r2 := a2 / b2
r3 := a3 / b3
\note Emulating through float. May be precision loss.
*/
static inline rxmm128l div( rxmm128l a, rxmm128l b )
{
    // `register` dropped (deprecated; removed in C++17).
    // NOTE(review): _mm_cvtps_epi32 rounds to nearest, so this is not C-style
    // truncating integer division — confirm that is acceptable to callers.
    rxmm128s fa = _mm_cvtepi32_ps( a );
    rxmm128s fb = _mm_cvtepi32_ps( b );
    rxmm128s quot = _mm_div_ps( fa, fb );
    return _mm_cvtps_epi32( quot );
}
/*!
r0 := max(a0, b0)
r1 := max(a1, b1)
r2 := max(a2, b2)
r3 := max(a3, b3)
*/
static inline rxmm128l max( rxmm128l a, rxmm128l b )
{
    // Replaced the unfinished body (a dangling `_mm_shuffle_epi32` statement
    // and a BOOST_STATIC_ASSERT) with a branch-free SSE2 select:
    // take a where a > b, otherwise b. (SSE2 has no _mm_max_epi32.)
    rxmm128l mask = _mm_cmpgt_epi32( a, b );
    return _mm_or_si128( _mm_and_si128( mask, a ),
                         _mm_andnot_si128( mask, b ) );
}
/*!
r0 := min(a0, b0)
r1 := min(a1, b1)
r2 := min(a2, b2)
r3 := min(a3, b3)
*/
static inline rxmm128l min( rxmm128l a, rxmm128l b )
{
    // Implemented (was an asserting stub): branch-free SSE2 select — take a
    // where a < b, otherwise b. (SSE2 has no _mm_min_epi32.)
    rxmm128l mask = _mm_cmplt_epi32( a, b );
    return _mm_or_si128( _mm_and_si128( mask, a ),
                         _mm_andnot_si128( mask, b ) );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double logic
//
/*!
r0 := ~a0
r1 := ~a1
r2 := ~a2
r3 := ~a3
*/
static inline rxmm128l not( rxmm128l a )
{
    // Implemented (the previous body referenced an undefined `b`): XOR
    // against an all-ones register; (a == a) is lane-wise all-ones.
    // NOTE(review): `not` is a standard alternative token; this identifier
    // compiles only under MSVC's default (permissive) mode.
    return _mm_xor_si128( a, _mm_cmpeq_epi32( a, a ) );
}
/*!
r0 := ~a0 & b0
r1 := ~a1 & b1
r2 := ~a2 & b2
r3 := ~a3 & b3
*/
static inline rxmm128l andnot( rxmm128l a, rxmm128l b )
{
return _mm_andnot_si128( a, b );
}
/*!
r0 := a0 & b0
r1 := a1 & b1
*/
static inline XMM_TYPE and( rxmm128l a, rxmm128l b )
{
return _mm_and_si128( a, b );
}
/*!
r0 := a0 | b0
r1 := a1 | b1
*/
static inline XMM_TYPE or( rxmm128l a, rxmm128l b )
{
return _mm_or_si128( a, b );
}
/*!
r0 := a0 ^ b0
r1 := a1 ^ b1
r2 := a2 ^ b2
r3 := a3 ^ b3
*/
static inline XMM_TYPE xor( rxmm128l a, rxmm128l b )
{
return _mm_xor_si128( a, b );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double comparision
//
/*!
r0 := (a0 == b0) ? 0xffffffff : 0x0
r1 := (a1 == b1) ? 0xffffffff : 0x0
r2 := (a2 == b2) ? 0xffffffff : 0x0
r3 := (a3 == b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_eq( rxmm128l a, rxmm128l b )
{
return _mm_cmpeq_epi32( a, b );
}
/*!
r0 := (a0 != b0) ? 0xffffffff : 0x0
r1 := (a1 != b1) ? 0xffffffff : 0x0
r2 := (a2 != b2) ? 0xffffffff : 0x0
r3 := (a3 != b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_neq( rxmm128l a, rxmm128l b )
{
    // `_mm_cmpor_si128` does not exist; combine the (a<b) and (a>b) masks
    // with the bitwise-OR intrinsic.
    rxmm128l lt = _mm_cmplt_epi32( a, b );
    rxmm128l gt = _mm_cmpgt_epi32( a, b );
    return _mm_or_si128( lt, gt );
}
/*!
r0 := (a0 < b0) ? 0xffffffff : 0x0
r1 := (a1 < b1) ? 0xffffffff : 0x0
r2 := (a2 < b2) ? 0xffffffff : 0x0
r3 := (a3 < b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_lt( rxmm128l a, rxmm128l b )
{
return _mm_cmplt_epi32( a, b );
}
/*!
r0 := (a0 <= b0) ? 0xffffffff : 0x0
r1 := (a1 <= b1) ? 0xffffffff : 0x0
r2 := (a2 <= b2) ? 0xffffffff : 0x0
r3 := (a3 <= b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_le( rxmm128l a, rxmm128l b )
{
    // `_mm_cmpor_si128` does not exist; combine the (a<b) and (a==b) masks
    // with the bitwise-OR intrinsic.
    rxmm128l lt = _mm_cmplt_epi32( a, b );
    rxmm128l eq = _mm_cmpeq_epi32( a, b );
    return _mm_or_si128( lt, eq );
}
/*!
r0 := (a0 > b0) ? 0xffffffff : 0x0
r1 := (a1 > b1) ? 0xffffffff : 0x0
r2 := (a2 > b2) ? 0xffffffff : 0x0
r3 := (a3 > b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_gt( rxmm128l a, rxmm128l b )
{
return _mm_cmpgt_epi32( a, b );
}
/*!
r0 := (a0 >= b0) ? 0xffffffff : 0x0
r1 := (a1 >= b1) ? 0xffffffff : 0x0
r2 := (a2 >= b2) ? 0xffffffff : 0x0
r3 := (a3 >= b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_ge( rxmm128l a, rxmm128l b )
{
    // `_mm_cmpor_si128` does not exist; combine the (a>b) and (a==b) masks
    // with the bitwise-OR intrinsic.
    rxmm128l gt = _mm_cmpgt_epi32( a, b );
    rxmm128l eq = _mm_cmpeq_epi32( a, b );
    return _mm_or_si128( gt, eq );
}
/*!
r0 := (a0 ord b0) ? 0xffffffff : 0x0
r1 := (a1 ord b1) ? 0xffffffff : 0x0
r2 := (a2 ord b2) ? 0xffffffff : 0x0
r3 := (a3 ord b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_ord( rxmm128l a, rxmm128l b )
{
    // `_mm_cmpord_epi32` does not exist. Integers have no NaN, so every
    // lane pair is ordered: the result is all-ones in every lane.
    (void) b;
    return _mm_cmpeq_epi32( a, a );
}
/*!
r0 := (a0 unord b0) ? 0xffffffff : 0x0
r1 := (a1 unord b1) ? 0xffffffff : 0x0
r2 := (a2 unord b2) ? 0xffffffff : 0x0
r3 := (a3 unord b3) ? 0xffffffff : 0x0
*/
static inline rxmm128l cmp_unord( rxmm128l a, rxmm128l b )
{
    // `_mm_cmpunord_epi32` does not exist. Integers are never unordered
    // (no NaN): the result is zero in every lane.
    (void) a;
    (void) b;
    return _mm_setzero_si128();
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed integer load
//
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Misc
/*!
r0 := a1
r1 := b2
r2 := a3
r3 := b3
*/
static inline rxmm128l unpckh( rxmm128l a, rxmm128l b )
{
BOOST_STATIC_ASSERT( false );
return _mm_unpackhi_epi32( a, b );
}
/*!
r0 := a0
r1 := b0
r2 := a1
r3 := b1
*/
static inline rxmm128l unpckl( rxmm128l a, rxmm128l b )
{
BOOST_STATIC_ASSERT( false );
return _mm_unpacklo_epi32( a, b );
}
/*!
r := sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0)
*/
static inline int movmsk( rxmm128l a, rxmm128l b )
{
BOOST_STATIC_ASSERT( false );
return _mm_movemask_epi32( a, b );
}
/*!
r0 := (i0 == 1) ? b0 : a0
r1 := (i1 == 1) ? b1 : a1
r2 := (i2 == 1) ? b2 : a2
r3 := (i3 == 1) ? b3 : a3
\sa movmsk
*/
static inline int shuffle( rxmm128l a, rxmm128l b, int i )
{
BOOST_STATIC_ASSERT( false );
return _mm_shuffle_epi32( a, b, i );
}
/*!
r3 := a3
r2 := a2
r1 := b3
r0 := b2
*/
static inline rxmm128l move_hl( rxmm128l a, rxmm128l b )
{
BOOST_STATIC_ASSERT( false );
return mm_movehl_epi32( a0 );
}
/*!
r3 := b1
r2 := b0
r1 := a1
r0 := a0
*/
static inline rxmm128l move_lh( rxmm128l a, rxmm128l b )
{
BOOST_STATIC_ASSERT( false );
return _mm_movelh_epi32( a0 );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory load
/*!
The address \arg p must be 16-byte aligned.
r0 := p[0]
r1 := p[1]
r2 := p[2]
r3 := p[3]
*/
static inline rxmm128l load( int * p )
{
    // Was `_mm_load_epi32`, which does not exist in SSE2; the aligned
    // 128-bit integer load is _mm_load_si128.
    return _mm_load_si128( reinterpret_cast<__m128i*>( p ) );
}
/*!
The address \arg p must be 16-byte aligned.
r0 := p[3]
r1 := p[2]
r2 := p[1]
r3 := p[0]
*/
static inline rxmm128l load_reverse( int * p )
{
    // Implemented (was an asserting stub; `_mm_loadr_epi32` does not exist):
    // aligned load followed by a full lane reversal.
    rxmm128l t = _mm_load_si128( reinterpret_cast<__m128i*>( p ) );
    return _mm_shuffle_epi32( t, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := p[0]
r1 := p[1]
r2 := p[2]
r3 := p[3]
*/
static inline rxmm128l load_unaligned( int * p )
{
    // Was `_mm_loadu_epi32`, which does not exist in SSE2; the unaligned
    // 128-bit integer load is _mm_loadu_si128.
    return _mm_loadu_si128( reinterpret_cast<__m128i*>( p ) );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := *p
r2 := *p
r3 := *p
*/
static inline rxmm128l load_both( int * p )
{
    // Implemented (was an asserting stub; `_mm_load1_epi32` does not exist):
    // broadcast *p into all four lanes.
    return _mm_set1_epi32( *p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := *p
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l load_s( int * p )
{
    // Implemented (was an asserting stub calling the float intrinsic
    // _mm_load_ss): move *p into the low lane and zero the upper three.
    return _mm_cvtsi32_si128( *p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory store
/*!
The address \arg p must be 16-byte aligned.
p[0] := a0
p[1] := a1
p[2] := a2
p[3] := a3
*/
static inline void store( int * p, rxmm128l a )
{
_mm_store_si128( reinterpret_cast<__m128i*>(p), a );
}
/*!
The address \arg p must be 16-byte aligned.
p[0] := a3
p[1] := a2
p[2] := a1
p[3] := a0
*/
static inline void store_reverse( int * p, rxmm128l a )
{
BOOST_STATIC_ASSERT( false );
_mm_storer_epi32( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a1
p[2] := a2
p[3] := a3
*/
static inline void store_unaligned(int * p, rxmm128l a )
{
_mm_storeu_si128( p, a );
}
/*!
The address \arg p does not need to be 16-byte aligned.
p[0] := a0
p[1] := a0
*/
static inline void store_both( int * p, rxmm128l a )
{
BOOST_STATIC_ASSERT( false );
return _mm_store1_epi32( p );
}
/*!
The address \arg p does not need to be 16-byte aligned.
*p := a0
*/
static inline void store_s( int * p, rxmm128l a )
{
BOOST_STATIC_ASSERT( false );
return _mm_store_ss( p );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
// Memory set
/*!
r0 := a0
r1 := a1
r2 := a2
r3 := a3
*/
static inline rxmm128l set( int a3, int a2, int a1, int a0 )
{
return _mm_set_epi32( a3, a2, a1, a0 );
}
/*!
r0 := 0.0
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l set_zero()
{
    // Was `_mm_setzero_si32( a0 )`: that intrinsic does not exist and `a0`
    // is not in scope. The 128-bit integer zero is _mm_setzero_si128().
    return _mm_setzero_si128();
}
/*!
r0 := a0
r1 := a0
r2 := a0
r3 := a0
*/
static inline rxmm128l set_both( int a0 )
{
return _mm_set1_epi32( a0 );
}
/*!
The address \arg p does not need to be 16-byte aligned.
r0 := a0
r1 := 0.0
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l set_s( int a0 )
{
    // Implemented (was an asserting stub calling the float intrinsic
    // _mm_set_ss): put a0 in the low lane and zero the upper three.
    return _mm_cvtsi32_si128( a0 );
}
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
/// Packed double convertion
//
/*!
r0 := (float) a0
r1 := (float) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l cvtpd2ps( rxmm128l a )
{
return _mm_cvtpd_epi32( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128l cvtps2pd( rxmm128l a )
{
return _mm_cvtps_epi32( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := 0.0
r3 := 0.0
*/
static inline rxmm128l cvtpd2dq( rxmm128l a )
{
return _mm_cvtpd_epi32( a );
}
/*!
r0 := (double) a0
r1 := (double) a1
*/
static inline rxmm128l cvtdq2pd( rxmm128l a )
{
return _mm_cvtepi32_epi32( a );
}
/*!
r := (int) a0
*/
static inline int cvtsd2si( rxmm128l a )
{
return _mm_cvtsd_si32( a );
}
/*!
r0 := (float) b0
r1 := a1
r2 := a2
r3 := a3
*/
static inline rxmm128l cvtsd2ss( rxmm128l a, rxmm128l b )
{
return _mm_cvtsd_ss( a, b );
}
/*!
r0 := (double) b
r1 := a1
*/
static inline rxmm128l cvtsi2sd( rxmm128l a, int b )
{
return _mm_cvtsi32_sd( a, b );
}
/*!
r0 := (double) b0
r1 := a1
*/
static inline rxmm128l cvtss2sd( rxmm128l a, rxmm128l b )
{
return _mm_cvtss_sd( a, b );
}
/*!
using truncate
r0 := (int) a0
r1 := (int) a1
r2 := 0x0
r3 := 0x0
*/
static inline rxmm128l cvttpd2dq( rxmm128l a )
{
return _mm_cvttpd_epi32( a );
}
/*!
using truncate
r := (int) a0
*/
static inline int cvttsd2si( rxmm128l a )
{
return _mm_cvttsd_si32( a );
}
/*!
r0 := (float) a0
r1 := (float) a1
r2 := (float) a2
r3 := (float) a3
*/
static inline rxmm128l cvtdq2ps( rxmm128l a )
{
return _mm_cvtepi32_epi32( a );
}
/*!
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvtps2dq( rxmm128l a )
{
return _mm_cvtps_epi32( a );
}
/*!
uses trancate
r0 := (int) a0
r1 := (int) a1
r2 := (int) a2
r3 := (int) a3
*/
static inline rxmm128l cvttps2dq( rxmm128l a )
{
return _mm_cvttps_epi32( a );
}
//
// class epi64
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_*/

View File

@@ -0,0 +1,618 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types to abstacr from the compiler
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class epi64 (2 x 64-bit packed integers)
//
class epi64
{
public:
    //
    /// The 128-bit register type holding the two 64-bit integer lanes.
    //
    typedef rxmm128l my_rxmm;
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed arithmetic.
    ///
    /// NOTE(review): every body in this class is copied from the
    /// double-precision (pd) class as a scaffold.  Operations that have no
    /// 64-bit integer port yet are fenced with BOOST_STATIC_ASSERT( false )
    /// so that compiling them fails loudly instead of silently doing
    /// floating-point work; TODO port them to the epi64 intrinsics.
    //
    /*!
        r0 := a0 + b0
        r1 := a1 + b1
    */
    static inline rxmm128d add( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_add_pd( a, b );
    }
    /*!
        r0 := a0 - b0
        r1 := a1 - b1
    */
    static inline rxmm128d sub( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_sub_pd( a, b );
    }
    /*!
        r0 := a0 * b0
        r1 := a1 * b1
    */
    static inline rxmm128d mul( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_mul_pd( a, b );
    }
    /*!
        r0 := a0 / b0
        r1 := a1 / b1
    */
    static inline rxmm128d div( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_div_pd( a, b );
    }
    /*!
        r0 := max( a0, b0 )
        r1 := max( a1, b1 )
    */
    static inline rxmm128d max( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_max_pd( a, b );
    }
    /*!
        r0 := min( a0, b0 )
        r1 := min( a1, b1 )
    */
    static inline rxmm128d min( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_min_pd( a, b );
    }
    /*!
        r0 := sqrt( a0 )
        r1 := sqrt( a1 )
    */
    static inline rxmm128d sqrt( rxmm128d a )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        // FIX: _mm_sqrt_pd is unary; the old call passed an undeclared `b`.
        return _mm_sqrt_pd( a );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed bitwise logic.
    ///
    /// NOTE(review): `and`, `or` and `xor` are reserved alternative operator
    /// tokens in ISO C++; these member names only compile because MSVC does
    /// not treat them as keywords by default.  The names are kept to
    /// preserve the existing interface.
    //
    /*!
        r0 := (~a0) & b0
        r1 := (~a1) & b1
    */
    static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_andnot_pd( a, b );
    }
    /*!
        r0 := a0 & b0
        r1 := a1 & b1
    */
    static inline rxmm128d and( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        // FIX: the return type was the undefined XMM_TYPE; use rxmm128d
        // like the sibling operations.
        return _mm_and_pd( a, b );
    }
    /*!
        r0 := a0 | b0
        r1 := a1 | b1
    */
    static inline rxmm128d or( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        // FIX: return type XMM_TYPE -> rxmm128d (XMM_TYPE is undefined).
        return _mm_or_pd( a, b );
    }
    /*!
        r0 := a0 ^ b0
        r1 := a1 ^ b1
    */
    static inline rxmm128d xor( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        // FIX: return type XMM_TYPE -> rxmm128d (XMM_TYPE is undefined).
        return _mm_xor_pd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed comparison: each true lane yields an all-ones mask.
    //
    /*!
        r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_cmpeq_pd( a, b );
    }
    /*!
        r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpneq_pd( a, b );
    }
    /*!
        r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
    {
        return _mm_cmplt_pd( a, b );
    }
    /*!
        r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_cmple_pd( a, b );
    }
    /*!
        r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to 64-bit integers yet
        return _mm_cmpgt_pd( a, b );
    }
    /*!
        r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpge_pd( a, b );
    }
    /*!
        r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpord_pd( a, b );
    }
    /*!
        r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpunord_pd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Miscellaneous lane rearrangement.
    //
    /*!
        r0 := a1
        r1 := b1
    */
    static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
    {
        return _mm_unpackhi_pd( a, b );
    }
    /*!
        r0 := a0
        r1 := b0
    */
    static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
    {
        return _mm_unpacklo_pd( a, b );
    }
    /*!
        r := sign(a1) << 1 | sign(a0)
    */
    static inline int movmsk( rxmm128d a, rxmm128d b )
    {
        // FIX: _mm_movemask_pd takes a single operand; `b` is kept only to
        // preserve the existing call signature and is ignored.
        (void) b;
        return _mm_movemask_pd( a );
    }
    /*!
        r0 := (i0 == 1) ? b0 : a0
        r1 := (i1 == 1) ? b1 : a1
        \sa movmsk
    */
    static inline rxmm128d shuffle( rxmm128d a, rxmm128d b, int i )
    {
        // FIX: _mm_shuffle_pd yields a packed register, not an int.
        return _mm_shuffle_pd( a, b, i );
    }
    /*!
        == shuffle( a, b, 1 )
        r0 := b0
        r1 := a1
    */
    static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
    {
        // FIX: the old call passed a single, undeclared `a0`;
        // _mm_move_sd takes both registers.
        return _mm_move_sd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory load
    /*!
        The address \arg p must be 16-byte aligned.
        r0 := p[0]
        r1 := p[1]
    */
    static inline rxmm128d load( double * p )
    {
        return _mm_load_pd( p );
    }
    /*!
        The address \arg p must be 16-byte aligned.
        r0 := p[1]
        r1 := p[0]
    */
    static inline rxmm128d load_reverse( double * p )
    {
        return _mm_loadr_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := p[0]
        r1 := p[1]
    */
    static inline rxmm128d load_unaligned( double * p )
    {
        return _mm_loadu_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := a0
        r1 := *p
    */
    static inline rxmm128d load_hi( rxmm128d a, double * p )
    {
        return _mm_loadh_pd( a, p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := a1
    */
    static inline rxmm128d load_lo( rxmm128d a, double * p )
    {
        return _mm_loadl_pd( a, p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := *p
    */
    static inline rxmm128d load_both( double * p )
    {
        return _mm_load1_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := 0.0
    */
    static inline rxmm128d load_sd( double * p )
    {
        return _mm_load_sd( p );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory store
    /*!
        The address \arg p must be 16-byte aligned.
        p[0] := a0
        p[1] := a1
    */
    static inline void store( double * p, rxmm128d a )
    {
        // FIX: was calling the load intrinsic; _mm_store_pd is the store.
        _mm_store_pd( p, a );
    }
    /*!
        The address \arg p must be 16-byte aligned.
        p[0] := a1
        p[1] := a0
    */
    static inline void store_reverse( double * p, rxmm128d a )
    {
        _mm_storer_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        p[0] := a0
        p[1] := a1
    */
    static inline void store_unaligned( double * p, rxmm128d a )
    {
        _mm_storeu_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a1
    */
    static inline void store_hi( double * p, rxmm128d a )
    {
        _mm_storeh_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a0
    */
    static inline void store_lo( double * p, rxmm128d a )
    {
        _mm_storel_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        p[0] := a0
        p[1] := a0
    */
    static inline void store_both( double * p, rxmm128d a )
    {
        // FIX: the value operand was missing, and a void expression was
        // being returned from a void function.
        _mm_store1_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a0
    */
    static inline void store_sd( double * p, rxmm128d a )
    {
        // FIX: the value operand was missing.
        _mm_store_sd( p, a );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Register set
    /*!
        r0 := a0
        r1 := a1
    */
    static inline rxmm128d set( double a1, double a0 )
    {
        return _mm_set_pd( a1, a0 );
    }
    /*!
        r0 := 0.0
        r1 := 0.0
    */
    static inline rxmm128d set_zero()
    {
        // FIX: _mm_setzero_pd takes no arguments; `a0` was undeclared.
        return _mm_setzero_pd();
    }
    /*!
        r0 := a0
        r1 := a0
    */
    static inline rxmm128d set_both( double a0 )
    {
        return _mm_set1_pd( a0 );
    }
    /*!
        r0 := a0
        r1 := 0.0
    */
    static inline rxmm128d set_sd( double a0 )
    {
        return _mm_set_sd( a0 );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed conversions.
    //
    /*!
        r0 := (float) a0
        r1 := (float) a1
        r2 := 0.0
        r3 := 0.0
    */
    static inline rxmm128s cvtpd2ps( rxmm128d a )
    {
        return _mm_cvtpd_ps( a );
    }
    /*!
        r0 := (double) a0
        r1 := (double) a1
    */
    static inline rxmm128d cvtps2pd( rxmm128s a )
    {
        return _mm_cvtps_pd( a );
    }
    /*!
        r0 := (int) a0
        r1 := (int) a1
        r2 := 0x0
        r3 := 0x0
    */
    static inline rxmm128l cvtpd2dq( rxmm128d a )
    {
        return _mm_cvtpd_epi32( a );
    }
    /*!
        r0 := (double) a0
        r1 := (double) a1
    */
    static inline rxmm128d cvtdq2pd( rxmm128l a )
    {
        return _mm_cvtepi32_pd( a );
    }
    /*!
        r := (int) a0
    */
    static inline int cvtsd2si( rxmm128d a )
    {
        return _mm_cvtsd_si32( a );
    }
    /*!
        r0 := (float) b0
        r1 := a1
        r2 := a2
        r3 := a3
    */
    static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b )
    {
        // FIX: `a` carries the float lanes preserved in r1..r3, so it must
        // be the single-precision register type, not rxmm128l.
        return _mm_cvtsd_ss( a, b );
    }
    /*!
        r0 := (double) b
        r1 := a1
    */
    static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
    {
        return _mm_cvtsi32_sd( a, b );
    }
    /*!
        r0 := (double) b0
        r1 := a1
    */
    static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
    {
        return _mm_cvtss_sd( a, b );
    }
    /*!
        using truncate
        r0 := (int) a0
        r1 := (int) a1
        r2 := 0x0
        r3 := 0x0
    */
    static inline rxmm128l cvttpd2dq( rxmm128d a )
    {
        return _mm_cvttpd_epi32( a );
    }
    /*!
        using truncate
        r := (int) a0
    */
    static inline int cvttsd2si( rxmm128d a )
    {
        return _mm_cvttsd_si32( a );
    }
    /*!
        r0 := (float) a0
        r1 := (float) a1
        r2 := (float) a2
        r3 := (float) a3
    */
    static inline rxmm128s cvtdq2ps( rxmm128l a )
    {
        return _mm_cvtepi32_ps( a );
    }
    /*!
        r0 := (int) a0
        r1 := (int) a1
        r2 := (int) a2
        r3 := (int) a3
    */
    static inline rxmm128l cvtps2dq( rxmm128s a )
    {
        return _mm_cvtps_epi32( a );
    }
    /*!
        uses truncate
        r0 := (int) a0
        r1 := (int) a1
        r2 := (int) a2
        r3 := (int) a3
    */
    static inline rxmm128l cvttps2dq( rxmm128s a )
    {
        return _mm_cvttps_epi32( a );
    }
//
// class epi64
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_*/

View File

@@ -0,0 +1,618 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types to abstacr from the compiler
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_
//
// Namespace sse2
//
namespace sse2
{
//
/// class epi64 (2 x 64-bit packed integers)
//
class epi64
{
public:
    //
    /// The 128-bit register type holding the two 64-bit integer lanes.
    //
    typedef rxmm128l my_rxmm;
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed arithmetic.
    ///
    /// NOTE(review): this header (PCKINT8) defines a class named `epi64`,
    /// identical to the one in the PCKINT64 header — including both headers
    /// would redefine the class; presumably this one should be `epi8`,
    /// TODO confirm and rename together with all callers.
    ///
    /// NOTE(review): every body in this class is copied from the
    /// double-precision (pd) class as a scaffold.  Operations that have no
    /// integer port yet are fenced with BOOST_STATIC_ASSERT( false ) so that
    /// compiling them fails loudly instead of silently doing floating-point
    /// work; TODO port them to the integer intrinsics.
    //
    /*!
        r0 := a0 + b0
        r1 := a1 + b1
    */
    static inline rxmm128d add( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_add_pd( a, b );
    }
    /*!
        r0 := a0 - b0
        r1 := a1 - b1
    */
    static inline rxmm128d sub( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_sub_pd( a, b );
    }
    /*!
        r0 := a0 * b0
        r1 := a1 * b1
    */
    static inline rxmm128d mul( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_mul_pd( a, b );
    }
    /*!
        r0 := a0 / b0
        r1 := a1 / b1
    */
    static inline rxmm128d div( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_div_pd( a, b );
    }
    /*!
        r0 := max( a0, b0 )
        r1 := max( a1, b1 )
    */
    static inline rxmm128d max( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_max_pd( a, b );
    }
    /*!
        r0 := min( a0, b0 )
        r1 := min( a1, b1 )
    */
    static inline rxmm128d min( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_min_pd( a, b );
    }
    /*!
        r0 := sqrt( a0 )
        r1 := sqrt( a1 )
    */
    static inline rxmm128d sqrt( rxmm128d a )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        // FIX: _mm_sqrt_pd is unary; the old call passed an undeclared `b`.
        return _mm_sqrt_pd( a );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed bitwise logic.
    ///
    /// NOTE(review): `and`, `or` and `xor` are reserved alternative operator
    /// tokens in ISO C++; these member names only compile because MSVC does
    /// not treat them as keywords by default.  The names are kept to
    /// preserve the existing interface.
    //
    /*!
        r0 := (~a0) & b0
        r1 := (~a1) & b1
    */
    static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_andnot_pd( a, b );
    }
    /*!
        r0 := a0 & b0
        r1 := a1 & b1
    */
    static inline rxmm128d and( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        // FIX: the return type was the undefined XMM_TYPE; use rxmm128d
        // like the sibling operations.
        return _mm_and_pd( a, b );
    }
    /*!
        r0 := a0 | b0
        r1 := a1 | b1
    */
    static inline rxmm128d or( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        // FIX: return type XMM_TYPE -> rxmm128d (XMM_TYPE is undefined).
        return _mm_or_pd( a, b );
    }
    /*!
        r0 := a0 ^ b0
        r1 := a1 ^ b1
    */
    static inline rxmm128d xor( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        // FIX: return type XMM_TYPE -> rxmm128d (XMM_TYPE is undefined).
        return _mm_xor_pd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed comparison: each true lane yields an all-ones mask.
    //
    /*!
        r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_cmpeq_pd( a, b );
    }
    /*!
        r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpneq_pd( a, b );
    }
    /*!
        r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
    {
        return _mm_cmplt_pd( a, b );
    }
    /*!
        r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_cmple_pd( a, b );
    }
    /*!
        r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
    {
        BOOST_STATIC_ASSERT( false ); // not ported to integers yet
        return _mm_cmpgt_pd( a, b );
    }
    /*!
        r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpge_pd( a, b );
    }
    /*!
        r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpord_pd( a, b );
    }
    /*!
        r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
        r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
    */
    static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
    {
        return _mm_cmpunord_pd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Miscellaneous lane rearrangement.
    //
    /*!
        r0 := a1
        r1 := b1
    */
    static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
    {
        return _mm_unpackhi_pd( a, b );
    }
    /*!
        r0 := a0
        r1 := b0
    */
    static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
    {
        return _mm_unpacklo_pd( a, b );
    }
    /*!
        r := sign(a1) << 1 | sign(a0)
    */
    static inline int movmsk( rxmm128d a, rxmm128d b )
    {
        // FIX: _mm_movemask_pd takes a single operand; `b` is kept only to
        // preserve the existing call signature and is ignored.
        (void) b;
        return _mm_movemask_pd( a );
    }
    /*!
        r0 := (i0 == 1) ? b0 : a0
        r1 := (i1 == 1) ? b1 : a1
        \sa movmsk
    */
    static inline rxmm128d shuffle( rxmm128d a, rxmm128d b, int i )
    {
        // FIX: _mm_shuffle_pd yields a packed register, not an int.
        return _mm_shuffle_pd( a, b, i );
    }
    /*!
        == shuffle( a, b, 1 )
        r0 := b0
        r1 := a1
    */
    static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
    {
        // FIX: the old call passed a single, undeclared `a0`;
        // _mm_move_sd takes both registers.
        return _mm_move_sd( a, b );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory load
    /*!
        The address \arg p must be 16-byte aligned.
        r0 := p[0]
        r1 := p[1]
    */
    static inline rxmm128d load( double * p )
    {
        return _mm_load_pd( p );
    }
    /*!
        The address \arg p must be 16-byte aligned.
        r0 := p[1]
        r1 := p[0]
    */
    static inline rxmm128d load_reverse( double * p )
    {
        return _mm_loadr_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := p[0]
        r1 := p[1]
    */
    static inline rxmm128d load_unaligned( double * p )
    {
        return _mm_loadu_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := a0
        r1 := *p
    */
    static inline rxmm128d load_hi( rxmm128d a, double * p )
    {
        return _mm_loadh_pd( a, p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := a1
    */
    static inline rxmm128d load_lo( rxmm128d a, double * p )
    {
        return _mm_loadl_pd( a, p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := *p
    */
    static inline rxmm128d load_both( double * p )
    {
        return _mm_load1_pd( p );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        r0 := *p
        r1 := 0.0
    */
    static inline rxmm128d load_sd( double * p )
    {
        return _mm_load_sd( p );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Memory store
    /*!
        The address \arg p must be 16-byte aligned.
        p[0] := a0
        p[1] := a1
    */
    static inline void store( double * p, rxmm128d a )
    {
        // FIX: was calling the load intrinsic; _mm_store_pd is the store.
        _mm_store_pd( p, a );
    }
    /*!
        The address \arg p must be 16-byte aligned.
        p[0] := a1
        p[1] := a0
    */
    static inline void store_reverse( double * p, rxmm128d a )
    {
        _mm_storer_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        p[0] := a0
        p[1] := a1
    */
    static inline void store_unaligned( double * p, rxmm128d a )
    {
        _mm_storeu_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a1
    */
    static inline void store_hi( double * p, rxmm128d a )
    {
        _mm_storeh_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a0
    */
    static inline void store_lo( double * p, rxmm128d a )
    {
        _mm_storel_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        p[0] := a0
        p[1] := a0
    */
    static inline void store_both( double * p, rxmm128d a )
    {
        // FIX: the value operand was missing, and a void expression was
        // being returned from a void function.
        _mm_store1_pd( p, a );
    }
    /*!
        The address \arg p does not need to be 16-byte aligned.
        *p := a0
    */
    static inline void store_sd( double * p, rxmm128d a )
    {
        // FIX: the value operand was missing.
        _mm_store_sd( p, a );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    // Register set
    /*!
        r0 := a0
        r1 := a1
    */
    static inline rxmm128d set( double a1, double a0 )
    {
        return _mm_set_pd( a1, a0 );
    }
    /*!
        r0 := 0.0
        r1 := 0.0
    */
    static inline rxmm128d set_zero()
    {
        // FIX: _mm_setzero_pd takes no arguments; `a0` was undeclared.
        return _mm_setzero_pd();
    }
    /*!
        r0 := a0
        r1 := a0
    */
    static inline rxmm128d set_both( double a0 )
    {
        return _mm_set1_pd( a0 );
    }
    /*!
        r0 := a0
        r1 := 0.0
    */
    static inline rxmm128d set_sd( double a0 )
    {
        return _mm_set_sd( a0 );
    }
    /*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
    //
    /// Packed conversions.
    //
    /*!
        r0 := (float) a0
        r1 := (float) a1
        r2 := 0.0
        r3 := 0.0
    */
    static inline rxmm128s cvtpd2ps( rxmm128d a )
    {
        return _mm_cvtpd_ps( a );
    }
    /*!
        r0 := (double) a0
        r1 := (double) a1
    */
    static inline rxmm128d cvtps2pd( rxmm128s a )
    {
        return _mm_cvtps_pd( a );
    }
    /*!
        r0 := (int) a0
        r1 := (int) a1
        r2 := 0x0
        r3 := 0x0
    */
    static inline rxmm128l cvtpd2dq( rxmm128d a )
    {
        return _mm_cvtpd_epi32( a );
    }
    /*!
        r0 := (double) a0
        r1 := (double) a1
    */
    static inline rxmm128d cvtdq2pd( rxmm128l a )
    {
        return _mm_cvtepi32_pd( a );
    }
    /*!
        r := (int) a0
    */
    static inline int cvtsd2si( rxmm128d a )
    {
        return _mm_cvtsd_si32( a );
    }
    /*!
        r0 := (float) b0
        r1 := a1
        r2 := a2
        r3 := a3
    */
    static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b )
    {
        // FIX: `a` carries the float lanes preserved in r1..r3, so it must
        // be the single-precision register type, not rxmm128l.
        return _mm_cvtsd_ss( a, b );
    }
    /*!
        r0 := (double) b
        r1 := a1
    */
    static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
    {
        return _mm_cvtsi32_sd( a, b );
    }
    /*!
        r0 := (double) b0
        r1 := a1
    */
    static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
    {
        return _mm_cvtss_sd( a, b );
    }
    /*!
        using truncate
        r0 := (int) a0
        r1 := (int) a1
        r2 := 0x0
        r3 := 0x0
    */
    static inline rxmm128l cvttpd2dq( rxmm128d a )
    {
        return _mm_cvttpd_epi32( a );
    }
    /*!
        using truncate
        r := (int) a0
    */
    static inline int cvttsd2si( rxmm128d a )
    {
        return _mm_cvttsd_si32( a );
    }
    /*!
        r0 := (float) a0
        r1 := (float) a1
        r2 := (float) a2
        r3 := (float) a3
    */
    static inline rxmm128s cvtdq2ps( rxmm128l a )
    {
        return _mm_cvtepi32_ps( a );
    }
    /*!
        r0 := (int) a0
        r1 := (int) a1
        r2 := (int) a2
        r3 := (int) a3
    */
    static inline rxmm128l cvtps2dq( rxmm128s a )
    {
        return _mm_cvtps_epi32( a );
    }
    /*!
        uses truncate
        r0 := (int) a0
        r1 := (int) a1
        r2 := (int) a2
        r3 := (int) a3
    */
    static inline rxmm128l cvttps2dq( rxmm128s a )
    {
        return _mm_cvttps_epi32( a );
    }
//
// class epi64
//
};
//
// Namespace sse2
//
}
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_*/

View File

@@ -0,0 +1,60 @@
/*
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
* Inc., and is fully protected under copyright and trade secret laws. You may
* not view, use, disclose, copy, or distribute this file or any information
* contained herein except pursuant to a valid written license from Synopsys.
*/
//
// The purpose of this file is to define SSE2 data types to abstacr from the compiler
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
//
#ifndef _SSE2_CMPL_ABSTRACTION_OTHER_H_
#define _SSE2_CMPL_ABSTRACTION_OTHER_H_
#include <boost/static_assert.hpp>
//
// Namespace sse2
//
namespace sse2
{
//
// Primitive types.
// Plain 16-byte buffers standing in for the XMM register types on
// compilers with no SSE2 intrinsic support.
//
/// 2xdouble
//
typedef char xmm128d[16];
/// 4xfloat
//
typedef char xmm128s[16];
/// 2xint64
//
typedef char xmm128l[16];
/// 4xint32
//
typedef char xmm128i[16];
/// int64
//
// NOTE(review): the declarator name is missing below — `typedef long int;`
// is just the type "long int" with no alias name and declares nothing, so
// conforming compilers reject it.  Presumably `typedef long int64;` (or a
// 64-bit type; `long` is 32 bits on Windows) was intended — TODO confirm
// the name used by the other SSE_cmplr_abstraction_* headers before fixing.
typedef long int;
//
// Namespace sse2
//
}
#include "SSE_cmplr_abstraction_other_pckdbl.h"
#include "SSE_cmplr_abstraction_other_pckfloat.h"
#include "SSE_cmplr_abstraction_other_pckint8.h"
#include "SSE_cmplr_abstraction_other_pckint16.h"
#include "SSE_cmplr_abstraction_other_pckint32.h"
#include "SSE_cmplr_abstraction_other_pckint64.h"
#endif/*_SSE2_CMPL_ABSTRACTION_OTHER_H_*/

43
misc/SSE_transform.cpp Normal file
View File

@@ -0,0 +1,43 @@
// toIntTest.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include "getCurrentTime.h"
#include "base/transform.h"
//#include "base/point.h"
#if !defined(__AIX) && !defined(__sparc)
#define LOW_ENDIAN_ARCH
#endif
#ifdef _MSC_VER
#define MY_INLINE __forceinline
#else
#define MY_INLINE inline
#endif
#ifndef _MSC_VER
#define __int64 long
#endif
// First benchmark scenario; intentionally empty (disabled in main()).
void test1()
{
}
// Second benchmark scenario; body not written yet.
void test2()
{
}
// Entry point: calibrates the timing library, then runs the benchmarks.
int main(int argc, char* argv[])
{
    // Init — pin the process to one CPU and prepare the time source.
    initGetCurrentTimeLib();
    // test1();
    test2();
    return 0;
}

329
misc/getcurrenttime.h Normal file
View File

@@ -0,0 +1,329 @@
// toIntTest.cpp : Defines the entry point for the console application.
//
#include <fstream>
#include <iostream>
#include <limits>
#include <math.h>
#undef min
#undef max
#if defined( WIN32 )
//#define QUERY_PERFORMANCE_COUNTER
#define RDTSC
#else
#define GET_TIME_OF_DAY
#endif
#ifdef RDTSC
#include <windows.h>
// Stopwatch over the CPU time-stamp counter (MSVC x86 inline asm).
// Construct it to start timing; elapsed() returns the raw TSC tick delta.
class perf
{
    // TSC sample taken at construction.
    __int64 r64;
    // Reads the time-stamp counter.  RDTSC leaves the 64-bit counter in
    // EDX:EAX, which MSVC treats as the function's __int64 return value.
    __forceinline __int64 getCurrentTime()
    {
        __asm
        {
            //
            // A serializing instruction (cpuid) would ensure all previous
            // instructions are done before reading the performance
            // counter; it is currently commented out.
            //
            // cpuid
            //
            // Read the time stamp counter.
            //
            rdtsc
        }
    }
public:
    __forceinline perf()
    {
        // Yield once so a context switch is less likely mid-measurement.
        ::Sleep( 0 );
        // Serialize the pipeline before sampling the counter.
        __asm cpuid
        r64 = getCurrentTime();
    }
    // Ticks elapsed since construction (raw TSC units, not seconds).
    __forceinline double elapsed()
    {
        __int64 now = getCurrentTime();
        return double(now - r64 );
    }
};
__int64 nCPUFrequency;
double dblCPUFrequency;
// Raw TSC read: RDTSC leaves the counter in EDX:EAX, which MSVC treats
// as this function's __int64 return value.
inline __int64 getCurrentTimeI()
{
    __asm
    {
        rdtsc
    }
}
// Returns the time-stamp counter as a double (raw ticks; the division by
// dblCPUFrequency that would yield seconds is commented out).
//
// NOTE(review): the asm below moves EAX/EDX into xmm0/xmm1 but never
// stores into `tsc` — the stores are commented out — so `tsc.r64` appears
// to be returned uninitialized.  TODO confirm and restore the
// `mov tsc.r32[...]` stores (or return via getCurrentTimeI()).
inline double getCurrentTime()
{
// return double(getCurrentTimeI());
    //
    // The time stamp counter.
    //
    union {
        __int64 r64;
        __int32 r32[2];
    } tsc;
    //
    // Read the time stamp counter.
    //
    __asm
    {
        //
        // A serializing instruction (cpuid) would ensure all previous
        // instructions are done before reading the performance
        // counter; it is currently commented out.
        //
        // cpuid
        //
        // Read the counter.
        //
        rdtsc
        //
        // Experimental: stash the counter halves in xmm registers
        // instead of storing them to `tsc`.
        //
        // mov tsc.r32[0], eax
        // mov tsc.r32[4], edx
        movd xmm0,eax
        movd xmm1,edx
        pshufd xmm1, xmm0, 0xF7
    }
    //
    // Get time in seconds.
    //
    return double(tsc.r64);// / dblCPUFrequency;
}
// One-time calibration for the RDTSC time source: pins the process to a
// single CPU (so the TSC is consistent), raises the process/thread
// priority, and records the assumed CPU frequency.
void initGetCurrentTimeLib_hlpr()
{
    //
    // Use only one fixed CPU
    //
    BOOL b;
    DWORD_PTR proc_affi;
    DWORD_PTR sys_affi;
    DWORD_PTR exclud_affi;
    GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
    exclud_affi = proc_affi & ~sys_affi;
    // NOTE(review): both branches of this ternary are identical, so the
    // statement is a no-op — presumably one branch was meant to mask out
    // `exclud_affi`; TODO confirm intent.
    proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
    // Find the index of the highest set bit and pin to that single CPU.
    int i = 0;
    while (( proc_affi >>= 1 )) ++i;
    proc_affi = 1 << i;
    b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
    //
    // Set the priority of thread high.
    //
    b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
    b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
    //
    // Get the frequency.
    // NOTE(review): hard-coded to 2 GHz; the QueryPerformanceFrequency
    // call is commented out, so timings in seconds are only correct on a
    // 2 GHz machine — TODO confirm.
    //
    nCPUFrequency = 2000000000;
//	QueryPerformanceFrequency(
//		reinterpret_cast<LARGE_INTEGER*>( &nCPUFrequency ) );
    //
    // Frequency counter supported in CPUs of family x86
    // starting from Pentium 4 or Pentium 3. So for old CPUs
    // this will not work.
    //
    // If CPU doesn't support performance counter then just return.
    //
    if ( !nCPUFrequency )
        puts("WARNING: This CPU doesn't support QueryPerformanceFrequency.");
    //
    // Convert to double.
    //
    dblCPUFrequency = double(nCPUFrequency);
}
#endif
#ifdef QUERY_PERFORMANCE_COUNTER
#include <windows.h>
double dblCPUFrequency;
// Returns the current time in seconds, based on the Win32 performance
// counter divided by the frequency captured in initGetCurrentTimeLib_hlpr().
inline double getCurrentTime()
{
    //
    // This call must be quite fast, since in the x86 architecture it is
    // one instruction.  Yet the WIN32 API might add some additional
    // processing.
    //
    // \todo Vahagn: add our assembly optimised function.
    //
    __int64 nCPUTickCount;
    QueryPerformanceCounter(
        reinterpret_cast<LARGE_INTEGER*>( &nCPUTickCount )
    );
    //
    // Get time in seconds.
    //
    return double(nCPUTickCount) / dblCPUFrequency;
}
// One-time calibration for the QueryPerformanceCounter time source:
// pins the process to a single CPU, raises priorities, and stores the
// performance-counter frequency used by getCurrentTime().
void initGetCurrentTimeLib_hlpr()
{
    //
    // Use only one fixed CPU
    //
    BOOL b;
    DWORD_PTR proc_affi;
    DWORD_PTR sys_affi;
    DWORD_PTR exclud_affi;
    GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
    exclud_affi = proc_affi & ~sys_affi;
    // NOTE(review): both branches identical — a no-op; TODO confirm intent.
    proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
    // Find the index of the highest set bit and pin to that single CPU.
    int i = 0;
    while (( proc_affi >>= 1 )) ++i;
    proc_affi = 1 << i;
    b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
    //
    // Set the priority of thread high.
    //
    b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
    b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
    //
    // Get the frequency.
    //
    __int64 nCPUFrequency;
    QueryPerformanceFrequency(
        reinterpret_cast<LARGE_INTEGER*>( &nCPUFrequency )
    );
    //
    // Frequency counter supported in CPUs of family x86
    // starting from Pentium 4 or Pentium 3. So for old CPUs
    // this will not work.
    //
    // If CPU doesn't support performance counter then just return.
    //
    if ( !nCPUFrequency )
        puts("WARNING: This CPU doesn't support QueryPerformanceFrequency.");
    //
    // Convert to double.
    //
    dblCPUFrequency = double(nCPUFrequency);
}
#endif
#ifdef GET_TIME_OF_DAY
#include <sys/time.h>
// Wall-clock time in seconds (microsecond resolution) via gettimeofday.
inline double getCurrentTime()
{
    timeval now;
    gettimeofday( &now, 0 );
    return double(now.tv_sec) + double(now.tv_usec) / 1000000.0;
}
// No calibration needed for the gettimeofday time source.
void initGetCurrentTimeLib_hlpr()
{
}
#endif
// Initializes the timing library, then (in the #if 1 section) estimates
// the fixed overhead of the `perf` stopwatch by timing four serializing
// cpuid instructions many times and printing the per-round minimum.
// The two #if 0 sections are older calibration experiments kept for
// reference (mean/variance estimate, and a raw dump of probe deltas).
void initGetCurrentTimeLib()
{
    initGetCurrentTimeLib_hlpr();
#if 0
    for ( int j=0; j < 10000; ++j )
    {
        //
        // Calculate the time expectation and dispersion
        // of getCurrentTime on this CPU.
        //
        const int nProbeCount = 100000;
        double dblTimeExpect = 0.;
        double dblTimeDispersia = 0.;
        for ( int i = nProbeCount; i; --i )
        {
            register double dblTimeBase = getCurrentTime();
            register double dblTimeCurrent = getCurrentTime();
            double dblTimeDelta = dblTimeCurrent - dblTimeBase;
            dblTimeExpect += dblTimeDelta;
            dblTimeDispersia += dblTimeDelta * dblTimeDelta;
        }
        //
        // finalize.
        //
        dblTimeExpect /= double( nProbeCount );
        dblTimeDispersia = dblTimeDispersia / double( nProbeCount )
            - dblTimeExpect * dblTimeExpect;
        printf( "Expectation: %f\n"
                "Dispersion:  %f\n",
                dblTimeExpect,
                sqrt(dblTimeDispersia) );
        puts( "----------------------------------------------------" );
    }
#endif
#if 0
    const int nProbeCount = 1000;
    double* ddd = new double[ nProbeCount ];
    double* p = ddd;
    double m = std::numeric_limits<double>::max();
    for ( int i = nProbeCount; i; --i )
    {
        register double dblTimeBase = getCurrentTime();
        register double dblTimeCurrent = getCurrentTime();
        *p++ = dblTimeCurrent - dblTimeBase;
        m = std::min( m, dblTimeCurrent - dblTimeBase );
//		printf( "%10.1f\n", dblTimeCurrent - dblTimeBase );
    }
    printf( "%10.1f\n", m );
    std::ofstream o;
    o.open( "times.txt" );
    p = ddd;
    for ( int i = nProbeCount; i; --i )
    {
        o << *p++ << std::endl;
    }
    o << std::endl;
    delete [] ddd;
#endif
#if 1
    for ( int j = 0; j < 1000; ++j )
    {
        const int nProbeCount = 10000;
        double m = 1e300;
        for ( int i = nProbeCount; i; --i )
        {
            // Time four serializing cpuid instructions; the minimum over
            // all probes approximates the stopwatch's intrinsic overhead.
            perf pc;
            __asm cpuid
            __asm cpuid
            __asm cpuid
            __asm cpuid
            double c = pc.elapsed();
            m = min( m, c );
        }
        std::cout << m << std::endl;
    }
#endif
}

287
misc/getcurrenttime2.h Normal file
View File

@@ -0,0 +1,287 @@
//
// Use "rdtsc" to mesure performance.
//
//
#ifndef __PERFORMANCE__H__
#define __PERFORMANCE__H__
#include <fstream>
#include <iostream>
#include <limits>
#include <math.h>
#include <map>
#if defined( WIN32 )
#undef min
#undef max
#endif
#if defined( WIN32 )
#define QUERY_PERFORMANCE_COUNTER
#define RDTSC
#else
#define GET_TIME_OF_DAY
#endif
#if defined( RDTSC )
#include <windows.h>
// Stopwatch over the CPU time-stamp counter (MSVC x86 inline asm).
// init() must be called once before use; it pins the process to one CPU
// and sets the tick->time conversion factor.
class perf
{
    // TSC sample taken at construction.
    __int64 beginning;
    // Tick -> time conversion factor (currently 1, i.e. raw ticks).
    static double frequency;
    // Reads the time-stamp counter.  RDTSC leaves the 64-bit counter in
    // EDX:EAX, which MSVC treats as the __int64 return value.
    __forceinline __int64 getCurrentTime()
    {
        __asm
        {
            //
            // Read the time stamp counter.
            //
            rdtsc
        }
    }
public:
    __forceinline perf()
    {
//		::Sleep( 0 );
        //
        // A serializing instruction (cpuid, with EAX cleared to select
        // leaf 0) ensures all previous instructions are done before
        // reading the performance counter.
        //
        __asm xor eax,eax
        __asm cpuid
        beginning = getCurrentTime();
//		__asm xor eax,eax
//		__asm cpuid
    }
    // Time elapsed since construction, in ticks / frequency.
    // NOTE(review): the constant 60 subtracted below is presumably the
    // measured fixed overhead of the stopwatch itself, in ticks — TODO
    // confirm against the calibration in perf_init().
    __forceinline double elapsed()
    {
        __int64 now = getCurrentTime();
        return double(now - beginning - 60 ) / frequency;
    }
    static void init()
    {
        //
        // Use only one fixed CPU
        //
        BOOL b;
        DWORD_PTR proc_affi;
        DWORD_PTR sys_affi;
        DWORD_PTR exclud_affi;
        GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
        exclud_affi = proc_affi & ~sys_affi;
        // NOTE(review): both branches identical — a no-op; TODO confirm.
        proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
        // Find the index of the highest set bit and pin to that CPU.
        int i = 0;
        while (( proc_affi >>= 1 )) ++i;
        proc_affi = 1 << i;
        b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
        //
        // Set the priority of thread high.
        //
        b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
        b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
        //
        // Get the frequency.
        // Temporarily put 1 (elapsed() then reports raw ticks).
        //
        frequency = 1.;
    }
};
// Definition of the static conversion factor.
double perf::frequency;
#elif defined( QUERY_PERFORMANCE_COUNTER )
#include <windows.h>
// Stopwatch over the Win32 performance counter.  init() must be called
// once before use; it pins the process to one CPU, raises priorities and
// reads the counter frequency.
class perf
{
    // Counter sample taken at construction.
    __int64 beginning;
    // Counter ticks per second, filled in by init().
    static double frequency;
    __forceinline __int64 getCurrentTime()
    {
        //
        // This call must be quite fast, since in the x86 architecture it
        // is one instruction.  Yet the WIN32 API might add some
        // additional processing.
        //
        // \todo Vahagn: add our assembly optimised function.
        //
        __int64 tc;
        QueryPerformanceCounter(
            reinterpret_cast<LARGE_INTEGER*>( &tc )
        );
        return tc;
    }
public:
    __forceinline perf()
    {
//		::Sleep( 0 );
        beginning = getCurrentTime();
    }
    // Counter ticks elapsed since construction.
    // NOTE(review): the division by `frequency` that would convert to
    // seconds is commented out, so this returns raw ticks.
    __forceinline double elapsed()
    {
        __int64 now = getCurrentTime();
        return double(now - beginning ); // frequency;
    }
    static void init()
    {
        //
        // Use only one fixed CPU
        //
        BOOL b;
        DWORD_PTR proc_affi;
        DWORD_PTR sys_affi;
        DWORD_PTR exclud_affi;
        GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
        exclud_affi = proc_affi & ~sys_affi;
        // NOTE(review): both branches identical — a no-op; TODO confirm.
        proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
        // Find the index of the highest set bit and pin to that CPU.
        int i = 0;
        while (( proc_affi >>= 1 )) ++i;
        proc_affi = 1 << i;
        b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
        //
        // Set the priority of thread high.
        //
        b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
        b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
        //
        // Get the frequency.
        //
        __int64 pf;
        QueryPerformanceFrequency(
            reinterpret_cast<LARGE_INTEGER*>( &pf )
        );
        //
        // Get the frequency.
        //
        frequency = double(pf);
    }
};
// Definition of the static counter frequency.
double perf::frequency;
#elif defined ( GET_TIME_OF_DAY )
#include <sys/time.h>
// Stopwatch over gettimeofday (POSIX fallback); microsecond resolution.
class perf
{
    // Wall-clock timestamp captured at construction.
    timeval beginning;
public:
    /// Starts the stopwatch.
    // FIX: `__forceinline` is an MSVC-only keyword, but this
    // GET_TIME_OF_DAY branch is only compiled on non-Windows toolchains
    // where it is undefined; standard `inline` is used instead.
    inline perf()
    {
        gettimeofday( &beginning, 0 );
    }
    /// Seconds elapsed since construction.
    inline double elapsed()
    {
        timeval now;
        gettimeofday( &now, 0 );
        return double(now.tv_sec) - double(beginning.tv_sec)
            + ( double(now.tv_usec) - double(beginning.tv_usec) ) / 1000000.0;
    }
    /// Nothing to calibrate for gettimeofday.
    static void init()
    {
    }
};
#endif
/// Measures the cost of one call to \arg fnctr in perf's time units.
///
/// Runs \arg nProbes timed probes and returns the statistical mode of the
/// observed durations (the most frequent value), which is more robust
/// against scheduler noise than the mean or the minimum.
///
/// \param fnctr   functor to time; must be callable as fnctr().
///                NOTE(review): taken by non-const reference, so passing a
///                temporary (as perf_init does) relies on an MSVC extension.
/// \param nProbes number of probes to run; should be > 0.
/// \param bPrint  when true, dumps the whole duration histogram to std::cout.
/// \return the modal probe duration, or 0 when no probes were run.
template<class F >
double mesure( F& fnctr, int nProbes = 100000, bool bPrint = false )
{
    // Histogram: observed duration -> number of occurrences.
    typedef std::map<double,int> probs_type;
    probs_type probs;
    int n = 0;
    for ( int i = 0; i < nProbes; ++i )
    {
        perf pc;
        fnctr();
        double m = pc.elapsed();
        n = ++probs[ m ];
    }
    // Pick the mode of the histogram.
    // FIX: `m` was previously uninitialized and returned as garbage when
    // nProbes <= 0.
    double m = 0.;
    n = 0;
    for ( probs_type::iterator it = probs.begin();
          it != probs.end();
          ++it )
    {
        if ( it->second > n )
        {
            n = it->second;
            m = it->first;
        }
    }
    if ( bPrint )
    {
        std::cout << "tsc=" << m << " probes=" << nProbes << std::endl;
        std::cout << "===============================" << std::endl;
        for ( probs_type::iterator it = probs.begin();
              it != probs.end();
              ++it )
            std::cout << "prob=" << it->first << "\t amount=" << it->second << std::endl;
    }
    return m;
}
// Do-nothing functor: timing it with mesure() yields the fixed overhead
// of the measurement harness itself.
struct nop
{
    __forceinline void operator() ()
    {
        // Intentionally empty.
    }
};
// Initializes the perf stopwatch and prints its measured self-overhead
// (modal duration of timing a no-op).  The #if 0 section is an older,
// inlined version of the same calibration, kept for reference.
// NOTE(review): `mesure<nop>( nop(), ... )` binds a temporary to a
// non-const reference parameter — accepted by MSVC as an extension only.
void perf_init()
{
    perf::init();
    mesure<nop>( nop(), 100000, true );
#if 0
    typedef std::map<double,int> probs_type;
    probs_type probs;
    double m = 1e300;
    double s = 0;
    int i_last = 0;
    int i = 0;
    int n = 0;
    double c;
    for ( ; n < 1000000; ++i )
    {
        perf pc;
        c = pc.elapsed();
        n = ++probs[ c ];
    }
    std::cout << "tsc=" << c << " probes=" << i << std::endl;
    std::cout << "=========================" << std::endl;
    for ( probs_type::iterator it = probs.begin();
          it != probs.end();
          ++it )
    {
        std::cout << "prob=" << it->first << "\t amount=" << it->second << std::endl;
    }
#endif
}
#endif//__PERFORMANCE__H__