Reorganizing some files.
This commit is contained in:
1562
misc/SSE2_transform.cpp
Normal file
1562
misc/SSE2_transform.cpp
Normal file
File diff suppressed because it is too large
Load Diff
1119
misc/SSE2_transform.h
Normal file
1119
misc/SSE2_transform.h
Normal file
File diff suppressed because it is too large
Load Diff
280
misc/SSE_cmplr_abstraction.h
Normal file
280
misc/SSE_cmplr_abstraction.h
Normal file
@@ -0,0 +1,280 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types that abstract away the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_H_
|
||||
|
||||
#include <cassert>

#include <boost/static_assert.hpp>
#include <boost/type_traits.hpp>
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
// Ms Visual Studio
|
||||
//
|
||||
#if defined( _MSC_VER ) && _MSC_VER

#include "SSE_cmplr_abstraction_MSC.h"

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// the GCC
//
// __GNUC__ is the macro GCC itself predefines; _gcc is kept so that builds
// which pass it explicitly on the command line keep selecting this branch.
//
#elif defined( _gcc ) || defined( __GNUC__ )

#include "SSE_cmplr_abstraction_GCC.h"

/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
//
// Other
//
#else

#include "SSE_cmplr_abstraction_other.h"

#endif
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// the wrapper for 128 bit xmm registers.
|
||||
//
|
||||
template <class X, class T, int N = (16/sizeof(T)) >
|
||||
class xmm128
|
||||
{
|
||||
// Type
|
||||
protected:
|
||||
|
||||
//
|
||||
/// The XMM register type.
|
||||
//
|
||||
typedef X MY_XMM;
|
||||
|
||||
//
|
||||
/// The XMM register type.
|
||||
//
|
||||
typedef T MY_TYPE;
|
||||
|
||||
// Data
|
||||
protected:
|
||||
|
||||
//
|
||||
/// The union of
|
||||
//
|
||||
union
|
||||
{
|
||||
MY_XMM x;
|
||||
MY_TYPE n[N];
|
||||
};
|
||||
|
||||
// Construction
|
||||
public:
|
||||
|
||||
//
|
||||
/// The default constructor.
|
||||
//
|
||||
xmm128()
|
||||
{
|
||||
//
|
||||
// We must be 128 bits only.
|
||||
//
|
||||
BOOST_STATIC_ASSERT( 16/sizeof( MY_TYPE ) == N );
|
||||
}
|
||||
|
||||
//
|
||||
/// The copy constructor.
|
||||
//
|
||||
xmm128( const xmm128& op )
|
||||
{
|
||||
//
|
||||
// We must be 128 bits only.
|
||||
//
|
||||
BOOST_STATIC_ASSERT( 16/sizeof( MY_TYPE ) == N );
|
||||
|
||||
//
|
||||
// Just assign.
|
||||
//
|
||||
x = op.x;
|
||||
}
|
||||
|
||||
//
|
||||
/// The copy constructor.
|
||||
//
|
||||
xmm128( const MY_XMM& op )
|
||||
{
|
||||
//
|
||||
// We must be 128 bits only.
|
||||
//
|
||||
BOOST_STATIC_ASSERT( 16/sizeof( MY_TYPE ) == N );
|
||||
|
||||
//
|
||||
// Just assign.
|
||||
//
|
||||
x = op.x;
|
||||
x = op;
|
||||
}
|
||||
|
||||
//
|
||||
/// The destructor.
|
||||
//
|
||||
~xmm128()
|
||||
{}
|
||||
|
||||
// Interface
|
||||
public:
|
||||
|
||||
//
|
||||
/// Assign our kind.
|
||||
//
|
||||
xmm128& operator= ( const xmm128& op )
|
||||
{
|
||||
x = op.x;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//
|
||||
/// Assign the xmm type.
|
||||
//
|
||||
xmm128& operator= ( const MY_XMM& op )
|
||||
{
|
||||
x = op;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//
|
||||
/// Operator to get packed type. The const version.
|
||||
//
|
||||
operator MY_XMM () const
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
//
|
||||
/// Operator to get packed type reference. Can be used as a lvalue.
|
||||
//
|
||||
MY_TYPE& operator[] ( int idx )
|
||||
{
|
||||
assert( 0<= idx && idx < N );
|
||||
return n[idx];
|
||||
}
|
||||
|
||||
//
|
||||
/// Operator to get packed type. The const version.
|
||||
//
|
||||
MY_TYPE operator[] ( int idx ) const
|
||||
{
|
||||
assert( 0<= idx && idx < N );
|
||||
return n[idx];
|
||||
}
|
||||
|
||||
//
|
||||
/// Set from two values.
|
||||
//
|
||||
void set( MY_TYPE v1, MY_TYPE v2 )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( 16 / sizeof( MY_TYPE ) == 2 );
|
||||
operator[0] = v1;
|
||||
operator[1] = v2;
|
||||
}
|
||||
|
||||
//
|
||||
/// Set from two values.
|
||||
//
|
||||
void set( MY_TYPE v1, MY_TYPE v2, MY_TYPE v2, MY_TYPE v4 )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( 16 / sizeof( MY_TYPE ) == 4 );
|
||||
operator[0] = v1;
|
||||
operator[1] = v2;
|
||||
operator[2] = v3;
|
||||
operator[3] = v4;
|
||||
}
|
||||
|
||||
add
|
||||
sub
|
||||
andnot
|
||||
and
|
||||
or
|
||||
xor
|
||||
|
||||
sqrt
|
||||
mul
|
||||
div
|
||||
min
|
||||
max
|
||||
|
||||
shift
|
||||
|
||||
comp
|
||||
};
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
|
||||
/*
|
||||
class xmm128d : public xmm128 < rxmm128d, double, 2 >
|
||||
{
|
||||
// Construction
|
||||
public:
|
||||
|
||||
//
|
||||
/// The default constructor.
|
||||
//
|
||||
xmm128d()
|
||||
{}
|
||||
|
||||
//
|
||||
/// The copy constructor.
|
||||
//
|
||||
xmm128d( const xmm128d& op )
|
||||
: xmm128( op )
|
||||
{}
|
||||
|
||||
//
|
||||
/// The copy constructor.
|
||||
//
|
||||
xmm128d( const MY_XMM& op )
|
||||
: xmm128( op )
|
||||
{}
|
||||
|
||||
//
|
||||
/// The copy constructor.
|
||||
//
|
||||
xmm128d( double d1, double d2 )
|
||||
{
|
||||
set( d1, d2 );
|
||||
}
|
||||
|
||||
|
||||
// Interface
|
||||
public:
|
||||
|
||||
//
|
||||
/// Set from two doubles.
|
||||
//
|
||||
void set( double d1, double d2 )
|
||||
{
|
||||
operator[0] = d1;
|
||||
operator[1] = d2;
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_H_*/
|
||||
60
misc/SSE_cmplr_abstraction_GCC.h
Normal file
60
misc/SSE_cmplr_abstraction_GCC.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types that abstract away the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_GCC_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_GCC_H_
|
||||
|
||||
#include <boost/static_assert.hpp>
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
{

    //
    // Primitive types
    //
    // Each register type is a 16 byte GCC vector whose element type matches
    // the lanes named in its comment. The vector_size attribute is used
    // because it ties the lane type to the declared base type, so an integer
    // register can no longer be declared with a float vector mode by mistake.
    //
    // NOTE(review): the MSC header spells these names rxmm128d/rxmm128s/...;
    // confirm which spelling the GCC pck* headers expect.
    //

    /// 2xdouble
    //
    typedef double xmm128d __attribute__ ((vector_size (16)));

    /// 4xfloat
    //
    typedef float xmm128s __attribute__ ((vector_size (16)));

    /// 2xint64
    //
    typedef long long xmm128l __attribute__ ((vector_size (16)));

    /// 4xint32
    //
    typedef int xmm128i __attribute__ ((vector_size (16)));

    /// int64
    //
    typedef long long int64;

//
// Namespace sse2
//
}
|
||||
|
||||
#include "SSE_cmplr_abstraction_GCC_pckdbl.h"
|
||||
#include "SSE_cmplr_abstraction_GCC_pckfloat.h"
|
||||
#include "SSE_cmplr_abstraction_GCC_pckint8.h"
|
||||
#include "SSE_cmplr_abstraction_GCC_pckint16.h"
|
||||
#include "SSE_cmplr_abstraction_GCC_pckint32.h"
|
||||
#include "SSE_cmplr_abstraction_GCC_pckint64.h"
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_GCC_H_*/
|
||||
61
misc/SSE_cmplr_abstraction_MSC.h
Normal file
61
misc/SSE_cmplr_abstraction_MSC.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types that abstract away the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_H_
|
||||
|
||||
#include <boost/static_assert.hpp>
|
||||
#include <emmintrin.h>
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
// Primitive types
|
||||
//
|
||||
|
||||
/// 2xdouble
|
||||
//
|
||||
typedef __m128d rxmm128d;
|
||||
|
||||
/// 4xfloat
|
||||
//
|
||||
typedef __m128 rxmm128s;
|
||||
|
||||
/// 2xint64
|
||||
//
|
||||
typedef __m128i rxmm128l;
|
||||
|
||||
/// 4xint32
|
||||
//
|
||||
typedef __m128 rxmm128i;
|
||||
|
||||
/// int64
|
||||
//
|
||||
typedef __int64 int64;
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#include "SSE_cmplr_abstraction_MSC_pckdbl.h"
|
||||
#include "SSE_cmplr_abstraction_MSC_pckfloat.h"
|
||||
#include "SSE_cmplr_abstraction_MSC_pckint8.h"
|
||||
#include "SSE_cmplr_abstraction_MSC_pckint16.h"
|
||||
#include "SSE_cmplr_abstraction_MSC_pckint32.h"
|
||||
#include "SSE_cmplr_abstraction_MSC_pckint64.h"
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_H_*/
|
||||
895
misc/SSE_cmplr_abstraction_MSC_backup.h
Normal file
895
misc/SSE_cmplr_abstraction_MSC_backup.h
Normal file
@@ -0,0 +1,895 @@
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types that abstract away the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_H_
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <dvec.h>
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
// Primitive types
|
||||
//
|
||||
|
||||
/// 2xdouble
|
||||
//
|
||||
typedef __m128d rxmm128d;
|
||||
|
||||
/// 4xfloat
|
||||
//
|
||||
typedef __m128 rxmm128s;
|
||||
|
||||
/// 2xint64
|
||||
//
|
||||
typedef __m128i rxmm128l;
|
||||
|
||||
/// 4xint32
|
||||
//
|
||||
typedef __m128 rxmm128i;
|
||||
|
||||
/// int64
|
||||
//
|
||||
typedef __int64 int64;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double arithmetic
|
||||
//
|
||||
class arithmetic_pd
|
||||
{
|
||||
public:
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
*/
|
||||
static inline rxmm128d add( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_add_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
*/
|
||||
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_sub_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
*/
|
||||
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_mul_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
*/
|
||||
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_div_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := max( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_max_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := min( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_min_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( a0 )
|
||||
r1 := sqrt( a1 )
|
||||
*/
|
||||
static inline rxmm128d sqrt( rxmm128d a )
|
||||
{
|
||||
return _mm_sqrt_pd( a, b );
|
||||
}
|
||||
};
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double logic
|
||||
//
|
||||
class logic_pd
|
||||
{
|
||||
public:
|
||||
/*!
|
||||
r0 := (~a0) & b0
|
||||
r1 := (~a1) & b1
|
||||
*/
|
||||
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_andnot_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_and_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_or_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_xor_pd( a, b );
|
||||
}
|
||||
};
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double comparision
|
||||
//
|
||||
class comparision_pd
|
||||
{
|
||||
public:
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpeq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpneq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmplt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmple_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpgt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpge_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpord_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpunord_pd( a, b );
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double logic
|
||||
//
|
||||
// NOTE(review): empty placeholder that reuses the name logic_pd, which is
// already defined with members earlier in this header. A second definition
// of the same class is a redefinition error - presumably this was meant to
// get its own name; rename or remove once the intent is confirmed.
class logic_pd
|
||||
{
|
||||
public:
|
||||
};
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double logic
|
||||
//
|
||||
// NOTE(review): empty placeholder that reuses the name logic_pd, which is
// already defined with members earlier in this header. A second definition
// of the same class is a redefinition error - presumably this was meant to
// get its own name; rename or remove once the intent is confirmed.
class logic_pd
|
||||
{
|
||||
public:
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Abstract
|
||||
//
|
||||
class func_d64x2
|
||||
{
|
||||
public:
|
||||
typedef XMM_TYPE rxmm128d;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Arithmetic PD
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Arithmetic SD
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE addsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_add_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE subsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_sub_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE mulsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_mul_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE divsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_div_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE maxsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_max_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE minsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_min_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( b0 )
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE sqrtsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_sqrt_sd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Logic PD
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Comparision PD
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Comparision SD
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpeqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpeq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpneqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpneq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpltsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmplt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmplesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmple_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpgtsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpgt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpgesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpge_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpordsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpord_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline XMM_TYPE cmpunordsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_cmpunord_sd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Comparision SD
|
||||
|
||||
/*!
|
||||
r := (a0 == b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comieqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comieq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 != b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comineqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comineq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 < b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comiltsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comilt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 <= b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comilesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comile_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 > b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comigtsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comigt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 >= b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int comigesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_comige_sd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Comparision SD
|
||||
|
||||
/*!
|
||||
r := (a0 == b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomieqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomieq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 != b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomineqsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomineq_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 < b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomiltsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomilt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 <= b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomilesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomile_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 > b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomigtsd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomigt_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (a0 >= b0) ? 0x1 : 0x0
|
||||
*/
|
||||
static inline int ucomigesd( XMM_TYPE a, XMM_TYPE b )
|
||||
{
|
||||
return _mm_ucomige_sd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Conversion
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtps2pd( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b1
|
||||
*/
|
||||
static inline rxmm128d unpckhpd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpackhi_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
*/
|
||||
static inline rxmm128d unpcklpd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpacklo_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a1) << 1 | sign(a0)
|
||||
*/
|
||||
static inline int movmskpd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_movemask_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? b0 : a0
|
||||
r1 := (i1 == 1) ? b1 : a1
|
||||
*/
|
||||
static inline int shuffle_pd( rxmm128d a, rxmm128d b, int i )
|
||||
{
|
||||
return _mm_shuffle_pd( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
== shuffle_pd( a, b, 1 )
|
||||
r0 := b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_move_sd( a0 );
|
||||
}
|
||||
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_pd( double * p )
|
||||
{
|
||||
return _mm_load_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[1]
|
||||
r1 := p[0]
|
||||
*/
|
||||
static inline rxmm128d load_pd_reverse( double * p )
|
||||
{
|
||||
return _mm_loadr_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_pd_unaligned( double * p )
|
||||
{
|
||||
return _mm_loadu_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_pd_hi( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadh_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d load_pd_lo( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadl_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_pd_both( double * p )
|
||||
{
|
||||
return _mm_load1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d load_sd( double * p )
|
||||
{
|
||||
return _mm_load_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_pd( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_load_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a1
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_pd_reverse( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storer_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_pd_unaligned(double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeu_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a1
|
||||
*/
|
||||
static inline void store_pd_hi( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeh_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_pd_lo( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storel_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_pd_both( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_sd( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d set_pd( double a1, double a0 )
|
||||
{
|
||||
return _mm_set_pd( a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_pd_zero()
|
||||
{
|
||||
return _mm_setzero_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
*/
|
||||
static inline rxmm128d set_pd_both( double a0)
|
||||
{
|
||||
return _mm_set1_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_sd( double a0 )
|
||||
{
|
||||
return _mm_set_sd( a0 );
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_H_*/
|
||||
624
misc/SSE_cmplr_abstraction_MSC_pckdbl.h
Normal file
624
misc/SSE_cmplr_abstraction_MSC_pckdbl.h
Normal file
@@ -0,0 +1,624 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types that abstract away the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class pd (packed double)
|
||||
//
|
||||
class pd
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128d my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
*/
|
||||
static inline rxmm128d add( rxmm128d a, rxmm128d b )
{
    // Packed double addition of both lanes.
    const rxmm128d sum = _mm_add_pd( a, b );
    return sum;
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
*/
|
||||
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
{
    // Packed double subtraction (a - b) of both lanes.
    const rxmm128d difference = _mm_sub_pd( a, b );
    return difference;
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
*/
|
||||
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_mul_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
*/
|
||||
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_div_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := max( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_max_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := min( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_min_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( a0 )
|
||||
r1 := sqrt( a1 )
|
||||
*/
|
||||
static inline rxmm128d sqrt( rxmm128d a )
|
||||
{
|
||||
return _mm_sqrt_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := recip(a0)
|
||||
r1 := recip(a1)
|
||||
*/
|
||||
static inline rxmm128d rcp( rxmm128d a )
|
||||
{
|
||||
rxmm128d t = _mm_set1_pd( 1.0 );
|
||||
return _mm_div_pd( t, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := recip(sqrt(a0))
|
||||
r1 := recip(sqrt(a1))
|
||||
*/
|
||||
static inline rxmm128d rsqrt( rxmm128d a )
|
||||
{
|
||||
rxmm128d t = _mm_set1_pd( 1.0 );
|
||||
rxmm128d u = _mm_sqrt_pd( a );
|
||||
return _mm_div_pd( t, u );
|
||||
}
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (~a0) & b0
|
||||
r1 := (~a1) & b1
|
||||
*/
|
||||
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_andnot_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_and_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_or_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_xor_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double comparision
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpeq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpneq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmplt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmple_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpgt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpge_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpord_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpunord_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b1
|
||||
*/
|
||||
static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpackhi_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
*/
|
||||
static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpacklo_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a1) << 1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_movemask_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? b0 : a0
|
||||
r1 := (i1 == 1) ? b1 : a1
|
||||
\sa movmsk
|
||||
*/
|
||||
static inline int shuffle( rxmm128d a, rxmm128d b, int i )
|
||||
{
|
||||
return _mm_shuffle_pd( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
== shuffle( a, b, 1 )
|
||||
r0 := b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_move_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load( double * p )
|
||||
{
|
||||
return _mm_load_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[1]
|
||||
r1 := p[0]
|
||||
*/
|
||||
static inline rxmm128d load_reverse( double * p )
|
||||
{
|
||||
return _mm_loadr_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_unaligned( double * p )
|
||||
{
|
||||
return _mm_loadu_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_hi( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadh_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d load_lo( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadl_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_both( double * p )
|
||||
{
|
||||
return _mm_load1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d load_s( double * p )
|
||||
{
|
||||
return _mm_load_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_store_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a1
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_reverse( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storer_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_unaligned(double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeu_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a1
|
||||
*/
|
||||
static inline void store_hi( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeh_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_lo( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storel_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_both( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_s( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d set( double a1, double a0 )
|
||||
{
|
||||
return _mm_set_pd( a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_zero()
|
||||
{
|
||||
return _mm_setzero_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
*/
|
||||
static inline rxmm128d set_both( double a0 )
|
||||
{
|
||||
return _mm_set1_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_s( double a0 )
|
||||
{
|
||||
return _mm_set_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double convertion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtps2pd( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
//
|
||||
// class pd
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKDBL_H_*/
|
||||
667
misc/SSE_cmplr_abstraction_MSC_pckfloat.h
Normal file
667
misc/SSE_cmplr_abstraction_MSC_pckfloat.h
Normal file
@@ -0,0 +1,667 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class ps (packed single precision)
|
||||
//
|
||||
class ps
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128s my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
r2 := a2 + b2
|
||||
r3 := a3 + b3
|
||||
*/
|
||||
static inline rxmm128s add( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_add_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
r2 := a2 - b2
|
||||
r3 := a3 - b3
|
||||
*/
|
||||
static inline rxmm128s sub( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_sub_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
r2 := a2 * b2
|
||||
r3 := a3 * b3
|
||||
*/
|
||||
static inline rxmm128s mul( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_mul_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
r2 := a2 / b2
|
||||
r3 := a3 / b3
|
||||
*/
|
||||
static inline rxmm128s div( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_div_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max(a0, b0)
|
||||
r1 := max(a1, b1)
|
||||
r2 := max(a2, b2)
|
||||
r3 := max(a3, b3)
|
||||
*/
|
||||
static inline rxmm128s max( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_max_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min(a0, b0)
|
||||
r1 := min(a1, b1)
|
||||
r2 := min(a2, b2)
|
||||
r3 := min(a3, b3)
|
||||
*/
|
||||
static inline rxmm128s min( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_min_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt(a0)
|
||||
r1 := sqrt(a1)
|
||||
r2 := sqrt(a2)
|
||||
r3 := sqrt(a3)
|
||||
*/
|
||||
static inline rxmm128s sqrt( rxmm128s a )
|
||||
{
|
||||
return _mm_sqrt_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := recip(a0)
|
||||
r1 := recip(a1)
|
||||
r2 := recip(a2)
|
||||
r3 := recip(a3)
|
||||
*/
|
||||
static inline rxmm128s rcp( rxmm128s a )
|
||||
{
|
||||
return _mm_rcp_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := recip(sqrt(a0))
|
||||
r1 := recip(sqrt(a1))
|
||||
r2 := recip(sqrt(a2))
|
||||
r3 := recip(sqrt(a3))
|
||||
*/
|
||||
static inline rxmm128s rsqrt( rxmm128s a )
|
||||
{
|
||||
return _mm_rsqrt_ps( a );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := ~a0 & b0
|
||||
r1 := ~a1 & b1
|
||||
r2 := ~a2 & b2
|
||||
r3 := ~a3 & b3
|
||||
*/
|
||||
static inline rxmm128s andnot( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_andnot_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_and_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_or_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
r2 := a2 ^ b2
|
||||
r3 := a3 ^ b3
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_xor_ps( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double comparision
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 == b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 == b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_eq( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpeq_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 != b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 != b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_neq( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cmpneq_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 < b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 < b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_lt( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cmplt_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 <= b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 <= b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_le( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmple_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 > b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 > b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_gt( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpgt_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 >= b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 >= b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_ge( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cmpge_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 ord b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 ord b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_ord( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cmpord_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 unord b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 unord b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128s cmp_unord( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cmpunord_ps( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a2
|
||||
r1 := b2
|
||||
r2 := a3
|
||||
r3 := b3
|
||||
*/
|
||||
static inline rxmm128s unpckh( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_unpackhi_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
r2 := a1
|
||||
r3 := b1
|
||||
*/
|
||||
static inline rxmm128s unpckl( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_unpacklo_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_movemask_ps( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? b0 : a0
|
||||
r1 := (i1 == 1) ? b1 : a1
|
||||
r2 := (i2 == 1) ? b2 : a2
|
||||
r3 := (i3 == 1) ? b3 : a3
|
||||
\sa movmsk
|
||||
*/
|
||||
static inline int shuffle( rxmm128s a, rxmm128s b, int i )
|
||||
{
|
||||
return _mm_shuffle_ps( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
r3 := a3
|
||||
r2 := a2
|
||||
r1 := b3
|
||||
r0 := b2
|
||||
*/
|
||||
static inline rxmm128s move_hl( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return mm_movehl_ps( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r3 := b1
|
||||
r2 := b0
|
||||
r1 := a1
|
||||
r0 := a0
|
||||
*/
|
||||
static inline rxmm128s move_lh( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_movelh_ps( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
r2 := p[2]
|
||||
r3 := p[3]
|
||||
*/
|
||||
static inline rxmm128s load( float * p )
|
||||
{
|
||||
return _mm_load_ps( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[3]
|
||||
r1 := p[2]
|
||||
r2 := p[1]
|
||||
r3 := p[0]
|
||||
*/
|
||||
static inline rxmm128s load_reverse( float * p )
|
||||
{
|
||||
return _mm_loadr_ps( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
r2 := p[2]
|
||||
r3 := p[3]
|
||||
*/
|
||||
static inline rxmm128s load_unaligned( float * p )
|
||||
{
|
||||
return _mm_loadu_ps( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
r2 := *p
|
||||
r3 := *p
|
||||
*/
|
||||
static inline rxmm128s load_both( float * p )
|
||||
{
|
||||
return _mm_load1_ps( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s load_s( float * p )
|
||||
{
|
||||
return _mm_load_ss( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
p[2] := a2
|
||||
p[3] := a3
|
||||
*/
|
||||
static inline void store( float * p, rxmm128s a )
|
||||
{
|
||||
_mm_store_ps( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a3
|
||||
p[1] := a2
|
||||
p[2] := a1
|
||||
p[3] := a0
|
||||
*/
|
||||
static inline void store_reverse( float * p, rxmm128s a )
|
||||
{
|
||||
_mm_storer_ps( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
p[2] := a2
|
||||
p[3] := a3
|
||||
*/
|
||||
static inline void store_unaligned(float * p, rxmm128s a )
|
||||
{
|
||||
_mm_storeu_ps( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_both( float * p, rxmm128s a )
|
||||
{
|
||||
return _mm_store1_ps( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_s( float * p, rxmm128s a )
|
||||
{
|
||||
return _mm_store_ss( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s set( float a3, float a2, float a1, float a0 )
|
||||
{
|
||||
return _mm_set_ps( a3, a2, a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s set_zero()
|
||||
{
|
||||
return _mm_setzero_ps( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
r2 := a0
|
||||
r3 := a0
|
||||
*/
|
||||
static inline rxmm128s set_both( float a0 )
|
||||
{
|
||||
return _mm_set1_ps( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s set_s( float a0 )
|
||||
{
|
||||
return _mm_set_ss( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double convertion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtpd_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128s cvtps2pd( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128s cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128s cvtsi2sd( rxmm128s a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128s cvtss2sd( rxmm128s a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
//
|
||||
// class ps
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKFLOAT_H_*/
|
||||
618
misc/SSE_cmplr_abstraction_MSC_pckint16.h
Normal file
618
misc/SSE_cmplr_abstraction_MSC_pckint16.h
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class epi64 (packed 64-bit integers)
|
||||
//
|
||||
class epi64
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128l my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
*/
|
||||
static inline rxmm128d add( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_add_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
*/
|
||||
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sub_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
*/
|
||||
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_mul_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
*/
|
||||
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_div_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := max( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_max_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := min( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_min_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( a0 )
|
||||
r1 := sqrt( a1 )
|
||||
*/
|
||||
static inline rxmm128d sqrt( rxmm128d a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sqrt_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (~a0) & b0
|
||||
r1 := (~a1) & b1
|
||||
*/
|
||||
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_andnot_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_and_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_or_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_xor_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer comparison
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpeq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpneq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmplt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmple_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpgt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpge_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpord_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpunord_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b1
|
||||
*/
|
||||
static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpackhi_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
*/
|
||||
static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpacklo_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a1) << 1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_movemask_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? b0 : a0
|
||||
r1 := (i1 == 1) ? b1 : a1
|
||||
\sa movmsk
|
||||
*/
|
||||
static inline int shuffle( rxmm128d a, rxmm128d b, int i )
|
||||
{
|
||||
return _mm_shuffle_pd( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
== shuffle( a, b, 1 )
|
||||
r0 := b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_move_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load( double * p )
|
||||
{
|
||||
return _mm_load_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[1]
|
||||
r1 := p[0]
|
||||
*/
|
||||
static inline rxmm128d load_reverse( double * p )
|
||||
{
|
||||
return _mm_loadr_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_unaligned( double * p )
|
||||
{
|
||||
return _mm_loadu_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_hi( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadh_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d load_lo( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadl_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_both( double * p )
|
||||
{
|
||||
return _mm_load1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d load_sd( double * p )
|
||||
{
|
||||
return _mm_load_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_load_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a1
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_reverse( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storer_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_unaligned(double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeu_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a1
|
||||
*/
|
||||
static inline void store_hi( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeh_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_lo( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storel_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_both( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_sd( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d set( double a1, double a0 )
|
||||
{
|
||||
return _mm_set_pd( a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_zero()
|
||||
{
|
||||
return _mm_setzero_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
*/
|
||||
static inline rxmm128d set_both( double a0 )
|
||||
{
|
||||
return _mm_set1_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_sd( double a0 )
|
||||
{
|
||||
return _mm_set_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer conversion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtps2pd( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
//
|
||||
// class epi64
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT16_H_*/
|
||||
676
misc/SSE_cmplr_abstraction_MSC_pckint32.h
Normal file
676
misc/SSE_cmplr_abstraction_MSC_pckint32.h
Normal file
@@ -0,0 +1,676 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class epi64 (packed 32-bit integers)
|
||||
//
|
||||
class epi64
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128l my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
r2 := a2 + b2
|
||||
r3 := a3 + b3
|
||||
*/
|
||||
static inline rxmm128l add( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_add_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
r2 := a2 - b2
|
||||
r3 := a3 - b3
|
||||
*/
|
||||
static inline rxmm128l sub( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_sub_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
r2 := a2 * b2
|
||||
r3 := a3 * b3
|
||||
\note Emulating through float. May be precision loss.
|
||||
*/
|
||||
static inline rxmm128l mul( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
register rxmm128s t = _mm_cvtepi32_ps( a );
|
||||
register rxmm128s u = _mm_cvtepi32_ps( b );
|
||||
register rxmm128s v = _mm_mul_ps( t, u );
|
||||
return _mm_cvtps_epi32( v );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
r2 := a2 / b2
|
||||
r3 := a3 / b3
|
||||
\note Emulating through float. May be precision loss.
|
||||
*/
|
||||
static inline rxmm128l div( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
register rxmm128s t = _mm_cvtepi32_ps( a );
|
||||
register rxmm128s u = _mm_cvtepi32_ps( b );
|
||||
register rxmm128s v = _mm_div_ps( t, u );
|
||||
return _mm_cvtps_epi32( v );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max(a0, b0)
|
||||
r1 := max(a1, b1)
|
||||
r2 := max(a2, b2)
|
||||
r3 := max(a3, b3)
|
||||
*/
|
||||
static inline rxmm128l max( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
register rxmm128l t = _mm_cmplt_epi32( a, b );
|
||||
int mask = _mm_movemask_epi8( t );
|
||||
_mm_shuffle_epi32
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min(a0, b0)
|
||||
r1 := min(a1, b1)
|
||||
r2 := min(a2, b2)
|
||||
r3 := min(a3, b3)
|
||||
*/
|
||||
static inline rxmm128l min( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := ~a0
|
||||
r1 := ~a1
|
||||
r2 := ~a2
|
||||
r3 := ~a3
|
||||
*/
|
||||
static inline rxmm128l not( rxmm128l a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_andnot_si128( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := ~a0 & b0
|
||||
r1 := ~a1 & b1
|
||||
r2 := ~a2 & b2
|
||||
r3 := ~a3 & b3
|
||||
*/
|
||||
static inline rxmm128l andnot( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_andnot_si128( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_and_si128( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_or_si128( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
r2 := a2 ^ b2
|
||||
r3 := a3 ^ b3
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_xor_si128( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer comparison
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 == b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 == b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_eq( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cmpeq_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 != b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 != b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_neq( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
rxmm128l t = _mm_cmplt_epi32( a, b );
|
||||
rxmm128l u = _mm_cmpgt_epi32( a, b );
|
||||
return _mm_cmpor_si128( t, u );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 < b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 < b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_lt( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cmplt_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 <= b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 <= b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_le( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
rxmm128l t = _mm_cmplt_epi32( a, b );
|
||||
rxmm128l u = _mm_cmpeq_epi32( a, b );
|
||||
return _mm_cmpor_si128( t, u );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 > b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 > b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_gt( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cmpgt_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 >= b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 >= b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_ge( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
rxmm128l t = _mm_cmpgt_epi32( a, b );
|
||||
rxmm128l u = _mm_cmpeq_epi32( a, b );
|
||||
return _mm_cmpor_si128( t, u );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 ord b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 ord b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_ord( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cmpord_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffff : 0x0
|
||||
r2 := (a2 unord b2) ? 0xffffffff : 0x0
|
||||
r3 := (a3 unord b3) ? 0xffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128l cmp_unord( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cmpunord_epi32( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b2
|
||||
r2 := a3
|
||||
r3 := b3
|
||||
*/
|
||||
static inline rxmm128l unpckh( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_unpackhi_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
r2 := a1
|
||||
r3 := b1
|
||||
*/
|
||||
static inline rxmm128l unpckl( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_unpacklo_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_movemask_epi32( a, b );
|
||||
}
|
||||
|
||||
/*!
Intended as a per-lane select between \arg a and \arg b driven by \arg i:
    r0 := (i0 == 1) ? b0 : a0
    r1 := (i1 == 1) ? b1 : a1
    r2 := (i2 == 1) ? b2 : a2
    r3 := (i3 == 1) ? b3 : a3
NOTE(review): the body does not match this description. _mm_shuffle_epi32
takes one vector and a compile-time selector, not two vectors and a run-time
int, and its vector result cannot be returned as int. The static assertion
always fails, so none of this is compiled yet; the intended semantics need to
be decided before the body is fixed.
\sa movmsk
*/
|
||||
static inline int shuffle( rxmm128l a, rxmm128l b, int i )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_shuffle_epi32( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
r3 := a3
|
||||
r2 := a2
|
||||
r1 := b3
|
||||
r0 := b2
|
||||
*/
|
||||
static inline rxmm128l move_hl( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return mm_movehl_epi32( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r3 := b1
|
||||
r2 := b0
|
||||
r1 := a1
|
||||
r0 := a0
|
||||
*/
|
||||
static inline rxmm128l move_lh( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_movelh_epi32( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
r2 := p[2]
|
||||
r3 := p[3]
|
||||
*/
|
||||
static inline rxmm128l load( int * p )
|
||||
{
|
||||
return _mm_load_epi32( reinterpret_cast<__m128i*>(p) );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[3]
|
||||
r1 := p[2]
|
||||
r2 := p[1]
|
||||
r3 := p[0]
|
||||
*/
|
||||
static inline rxmm128l load_reverse( int * p )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
rxmm128l t = _mm_loadr_epi32( reinterpret_cast<__m128i*>(p) )
|
||||
return _mm_loadr_epi32( reinterpret_cast<__m128i*>(p) );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
r2 := p[2]
|
||||
r3 := p[3]
|
||||
*/
|
||||
static inline rxmm128l load_unaligned( int * p )
|
||||
{
|
||||
return _mm_loadu_epi32( reinterpret_cast<__m128i*>(p) );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
r2 := *p
|
||||
r3 := *p
|
||||
*/
|
||||
static inline rxmm128l load_both( int * p )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_load1_epi32( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l load_s( int * p )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_load_ss( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
p[2] := a2
|
||||
p[3] := a3
|
||||
*/
|
||||
static inline void store( int * p, rxmm128l a )
|
||||
{
|
||||
_mm_store_si128( reinterpret_cast<__m128i*>(p), a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a3
|
||||
p[1] := a2
|
||||
p[2] := a1
|
||||
p[3] := a0
|
||||
*/
|
||||
static inline void store_reverse( int * p, rxmm128l a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
_mm_storer_epi32( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
p[2] := a2
|
||||
p[3] := a3
|
||||
*/
|
||||
static inline void store_unaligned(int * p, rxmm128l a )
|
||||
{
|
||||
_mm_storeu_si128( p, a );
|
||||
}
|
||||
|
||||
/*!
Intended to write the low lane of \arg a to consecutive integers at \arg p:
    p[0] := a0
    p[1] := a0
NOTE(review): the body cannot compile. _mm_store1_epi32 is not an SSE2
intrinsic, the call omits \arg a, and a void member returns a value. It is
also unclear whether two elements (as listed above) or all four lanes' worth
(as load_both and set_both fill) should be written; decide that before
implementing, since it determines how much memory is touched. The alignment
requirement depends on the store that is finally chosen.
*/
|
||||
static inline void store_both( int * p, rxmm128l a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_store1_epi32( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_s( int * p, rxmm128l a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_store_ss( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128l set( int a3, int a2, int a1, int a0 )
|
||||
{
|
||||
return _mm_set_epi32( a3, a2, a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l set_zero()
|
||||
{
|
||||
return _mm_setzero_si32( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
r2 := a0
|
||||
r3 := a0
|
||||
*/
|
||||
static inline rxmm128l set_both( int a0 )
|
||||
{
|
||||
return _mm_set1_epi32( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l set_s( int a0 )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_set_ss( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed double conversion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128l cvtps2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128l cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128l cvtsd2ss( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128l cvtsi2sd( rxmm128l a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128l cvtss2sd( rxmm128l a, rxmm128l b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128l a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128l a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128l cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128l a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
//
|
||||
// class epi64
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT32_H_*/
|
||||
618
misc/SSE_cmplr_abstraction_MSC_pckint64.h
Normal file
618
misc/SSE_cmplr_abstraction_MSC_pckint64.h
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class epi64 (packed 64-bit integers)
|
||||
//
|
||||
class epi64
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128l my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
*/
|
||||
static inline rxmm128d add( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_add_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
*/
|
||||
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sub_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
*/
|
||||
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_mul_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
*/
|
||||
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_div_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := max( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_max_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := min( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_min_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( a0 )
|
||||
r1 := sqrt( a1 )
|
||||
*/
|
||||
static inline rxmm128d sqrt( rxmm128d a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sqrt_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (~a0) & b0
|
||||
r1 := (~a1) & b1
|
||||
*/
|
||||
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_andnot_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_and_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_or_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_xor_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer comparison
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpeq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpneq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
{
    // Per-lane a < b mask from _mm_cmplt_pd.
    const rxmm128d mask = _mm_cmplt_pd( a, b );
    return mask;
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
{
    // NOTE(review): unconditional static assert, presumably a "not ported" marker.
    BOOST_STATIC_ASSERT( false );
    // Per-lane a <= b mask from _mm_cmple_pd.
    const rxmm128d mask = _mm_cmple_pd( a, b );
    return mask;
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
{
    // NOTE(review): unconditional static assert, presumably a "not ported" marker.
    BOOST_STATIC_ASSERT( false );
    // Per-lane a > b mask from _mm_cmpgt_pd.
    const rxmm128d mask = _mm_cmpgt_pd( a, b );
    return mask;
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
{
    // Per-lane a >= b mask from _mm_cmpge_pd.
    const rxmm128d mask = _mm_cmpge_pd( a, b );
    return mask;
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
{
    // Per-lane "ordered" mask from _mm_cmpord_pd.
    const rxmm128d mask = _mm_cmpord_pd( a, b );
    return mask;
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
{
    // Per-lane "unordered" mask from _mm_cmpunord_pd.
    const rxmm128d mask = _mm_cmpunord_pd( a, b );
    return mask;
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b1
|
||||
*/
|
||||
static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
{
    // Interleave the high lanes of a and b via _mm_unpackhi_pd.
    const rxmm128d highs = _mm_unpackhi_pd( a, b );
    return highs;
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
*/
|
||||
static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
{
    // Interleave the low lanes of a and b via _mm_unpacklo_pd.
    const rxmm128d lows = _mm_unpacklo_pd( a, b );
    return lows;
}
|
||||
|
||||
/*!
|
||||
r := sign(a1) << 1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128d a, rxmm128d b )
{
    // _mm_movemask_pd takes a single operand; the original passed two.
    // The second parameter is kept only so existing call sites still compile.
    (void) b;
    // Packs the sign bits of a's two lanes into bits 0 and 1 of the result.
    return _mm_movemask_pd( a );
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? a1 : a0
|
||||
r1 := (i1 == 1) ? b1 : b0
|
||||
\sa movmsk
|
||||
*/
|
||||
static inline int shuffle( rxmm128d a, rxmm128d b, int i )
|
||||
{
|
||||
return _mm_shuffle_pd( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
== shuffle( a, b, 1 )
|
||||
r0 := b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
{
    // _mm_move_sd takes both vectors (low lane from b, high lane from a);
    // the original passed a single undeclared name `a0`.
    return _mm_move_sd( a, b );
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load( const double * p )
{
    // _mm_load_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_load_pd( p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[1]
|
||||
r1 := p[0]
|
||||
*/
|
||||
static inline rxmm128d load_reverse( const double * p )
{
    // _mm_loadr_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_loadr_pd( p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_unaligned( const double * p )
{
    // _mm_loadu_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_loadu_pd( p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_hi( rxmm128d a, const double * p )
{
    // _mm_loadh_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_loadh_pd( a, p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d load_lo( rxmm128d a, const double * p )
{
    // _mm_loadl_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_loadl_pd( a, p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_both( const double * p )
{
    // _mm_load1_pd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_load1_pd( p );
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d load_sd( const double * p )
{
    // _mm_load_sd only reads through p, so pointers to const data are accepted;
    // callers passing a plain double* are unaffected.
    return _mm_load_sd( p );
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_load_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a1
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_reverse( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storer_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_unaligned(double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeu_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a1
|
||||
*/
|
||||
static inline void store_hi( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeh_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_lo( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storel_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_both( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_sd( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d set( double a1, double a0 )
{
    // Builds a vector from two scalars; arguments are forwarded to
    // _mm_set_pd in the same (a1, a0) order.
    const rxmm128d packed = _mm_set_pd( a1, a0 );
    return packed;
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_zero()
{
    // _mm_setzero_pd takes no arguments; the original passed an undeclared `a0`.
    return _mm_setzero_pd();
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
*/
|
||||
static inline rxmm128d set_both( double a0 )
{
    // Broadcasts the scalar to both lanes via _mm_set1_pd.
    const rxmm128d packed = _mm_set1_pd( a0 );
    return packed;
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_sd( double a0 )
{
    // Builds a vector from one scalar via _mm_set_sd.
    const rxmm128d packed = _mm_set_sd( a0 );
    return packed;
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer conversion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128d a )
{
    // Packed double -> packed float conversion via _mm_cvtpd_ps.
    const rxmm128s converted = _mm_cvtpd_ps( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtps2pd( rxmm128s a )
{
    // Packed float -> packed double conversion via _mm_cvtps_pd.
    const rxmm128d converted = _mm_cvtps_pd( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128d a )
{
    // Packed double -> packed 32-bit integer conversion via _mm_cvtpd_epi32.
    const rxmm128l converted = _mm_cvtpd_epi32( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtdq2pd( rxmm128l a )
{
    // Packed 32-bit integer -> packed double conversion via _mm_cvtepi32_pd.
    const rxmm128d converted = _mm_cvtepi32_pd( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128d a )
{
    // Converts the low double lane to int via _mm_cvtsd_si32.
    const int value = _mm_cvtsd_si32( a );
    return value;
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
// The first operand is the float vector whose upper three lanes pass through
// (r1..r3 := a1..a3), so it is typed rxmm128s here. The original declared it
// as rxmm128l, which does not match _mm_cvtsd_ss.
// NOTE(review): assumes rxmm128s/rxmm128l are the float/integer vector
// typedefs -- confirm against the MSC type header.
static inline rxmm128s cvtsd2ss( rxmm128s a, rxmm128d b )
{
    return _mm_cvtsd_ss( a, b );
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
{
    // Converts the integer b into a double lane of a via _mm_cvtsi32_sd.
    const rxmm128d converted = _mm_cvtsi32_sd( a, b );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
{
    // Converts the low float of b into a double lane of a via _mm_cvtss_sd.
    const rxmm128d converted = _mm_cvtss_sd( a, b );
    return converted;
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128d a )
{
    // Truncating packed double -> 32-bit integer conversion (_mm_cvttpd_epi32).
    const rxmm128l converted = _mm_cvttpd_epi32( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128d a )
{
    // Truncating conversion of the low double lane to int (_mm_cvttsd_si32).
    const int value = _mm_cvttsd_si32( a );
    return value;
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
{
    // Packed 32-bit integer -> packed float conversion via _mm_cvtepi32_ps.
    const rxmm128s converted = _mm_cvtepi32_ps( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
{
    // Packed float -> packed 32-bit integer conversion via _mm_cvtps_epi32.
    const rxmm128l converted = _mm_cvtps_epi32( a );
    return converted;
}
|
||||
|
||||
/*!
|
||||
uses truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
{
    // Truncating packed float -> 32-bit integer conversion (_mm_cvttps_epi32).
    const rxmm128l converted = _mm_cvttps_epi32( a );
    return converted;
}
|
||||
|
||||
//
|
||||
// class epi64
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT64_H_*/
|
||||
618
misc/SSE_cmplr_abstraction_MSC_pckint8.h
Normal file
618
misc/SSE_cmplr_abstraction_MSC_pckint8.h
Normal file
@@ -0,0 +1,618 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
|
||||
{
|
||||
|
||||
//
|
||||
/// class epi64 (packed single precision)
|
||||
//
|
||||
class epi64
|
||||
{
|
||||
public:
|
||||
|
||||
//
|
||||
/// The type.
|
||||
//
|
||||
typedef rxmm128l my_rxmm;
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer arithmetic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := a0 + b0
|
||||
r1 := a1 + b1
|
||||
*/
|
||||
static inline rxmm128d add( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_add_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 - b0
|
||||
r1 := a1 - b1
|
||||
*/
|
||||
static inline rxmm128d sub( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sub_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 * b0
|
||||
r1 := a1 * b1
|
||||
*/
|
||||
static inline rxmm128d mul( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_mul_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 / b0
|
||||
r1 := a1 / b1
|
||||
*/
|
||||
static inline rxmm128d div( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_div_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := max( a0, b0 )
|
||||
r1 := max( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d max( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_max_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := min( a0, b0 )
|
||||
r1 := min( a1, b1 )
|
||||
*/
|
||||
static inline rxmm128d min( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_min_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := sqrt( a0 )
|
||||
r1 := sqrt( a1 )
|
||||
*/
|
||||
static inline rxmm128d sqrt( rxmm128d a )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_sqrt_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer logic
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (~a0) & b0
|
||||
r1 := (~a1) & b1
|
||||
*/
|
||||
static inline rxmm128d andnot( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_andnot_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 & b0
|
||||
r1 := a1 & b1
|
||||
*/
|
||||
static inline XMM_TYPE and( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_and_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 | b0
|
||||
r1 := a1 | b1
|
||||
*/
|
||||
static inline XMM_TYPE or( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_or_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0 ^ b0
|
||||
r1 := a1 ^ b1
|
||||
*/
|
||||
static inline XMM_TYPE xor( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_xor_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer comparision
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (a0 == b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 == b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_eq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpeq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 != b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 != b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_neq( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpneq_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 < b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 < b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_lt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmplt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 <= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 <= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_le( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmple_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 > b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 > b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_gt( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
BOOST_STATIC_ASSERT( false );
|
||||
return _mm_cmpgt_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 >= b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 >= b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ge( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpge_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 ord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 ord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_ord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpord_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (a0 unord b0) ? 0xffffffffffffffff : 0x0
|
||||
r1 := (a1 unord b1) ? 0xffffffffffffffff : 0x0
|
||||
*/
|
||||
static inline rxmm128d cmp_unord( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_cmpunord_pd( a, b );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer load
|
||||
//
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Misc
|
||||
|
||||
/*!
|
||||
r0 := a1
|
||||
r1 := b1
|
||||
*/
|
||||
static inline rxmm128d unpckh( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpackhi_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := b0
|
||||
*/
|
||||
static inline rxmm128d unpckl( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_unpacklo_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := sign(a1) << 1 | sign(a0)
|
||||
*/
|
||||
static inline int movmsk( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_movemask_pd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (i0 == 1) ? b0 : a0
|
||||
r1 := (i1 == 1) ? b1 : a1
|
||||
\sa movmsk
|
||||
*/
|
||||
static inline int shuffle( rxmm128d a, rxmm128d b, int i )
|
||||
{
|
||||
return _mm_shuffle_pd( a, b, i );
|
||||
}
|
||||
|
||||
/*!
|
||||
== shuffle( a, b, 1 )
|
||||
r0 := b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d move_sd( rxmm128d a, rxmm128d b )
|
||||
{
|
||||
return _mm_move_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory load
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load( double * p )
|
||||
{
|
||||
return _mm_load_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
r0 := p[1]
|
||||
r1 := p[0]
|
||||
*/
|
||||
static inline rxmm128d load_reverse( double * p )
|
||||
{
|
||||
return _mm_loadr_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := p[0]
|
||||
r1 := p[1]
|
||||
*/
|
||||
static inline rxmm128d load_unaligned( double * p )
|
||||
{
|
||||
return _mm_loadu_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_hi( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadh_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d load_lo( rxmm128d a, double * p )
|
||||
{
|
||||
return _mm_loadl_pd( a, p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := *p
|
||||
*/
|
||||
static inline rxmm128d load_both( double * p )
|
||||
{
|
||||
return _mm_load1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := *p
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d load_sd( double * p )
|
||||
{
|
||||
return _mm_load_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory store
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_load_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p must be 16-byte aligned.
|
||||
p[0] := a1
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_reverse( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storer_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a1
|
||||
*/
|
||||
static inline void store_unaligned(double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeu_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a1
|
||||
*/
|
||||
static inline void store_hi( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storeh_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_lo( double * p, rxmm128d a )
|
||||
{
|
||||
_mm_storel_pd( p, a );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
p[0] := a0
|
||||
p[1] := a0
|
||||
*/
|
||||
static inline void store_both( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store1_pd( p );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
*p := a0
|
||||
*/
|
||||
static inline void store_sd( double * p, rxmm128d a )
|
||||
{
|
||||
return _mm_store_sd( p );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
// Memory set
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d set( double a1, double a0 )
|
||||
{
|
||||
return _mm_set_pd( a1, a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := 0.0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_zero()
|
||||
{
|
||||
return _mm_setzero_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := a0
|
||||
r1 := a0
|
||||
*/
|
||||
static inline rxmm128d set_both( double a0 )
|
||||
{
|
||||
return _mm_set1_pd( a0 );
|
||||
}
|
||||
|
||||
/*!
|
||||
The address \arg p does not need to be 16-byte aligned.
|
||||
r0 := a0
|
||||
r1 := 0.0
|
||||
*/
|
||||
static inline rxmm128d set_sd( double a0 )
|
||||
{
|
||||
return _mm_set_sd( a0 );
|
||||
}
|
||||
|
||||
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/
|
||||
//
|
||||
/// Packed integer convertion
|
||||
//
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128s cvtpd2ps( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtps2pd( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0.0
|
||||
r3 := 0.0
|
||||
*/
|
||||
static inline rxmm128l cvtpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) a0
|
||||
r1 := (double) a1
|
||||
*/
|
||||
static inline rxmm128d cvtdq2pd( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_pd( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvtsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvtsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) b0
|
||||
r1 := a1
|
||||
r2 := a2
|
||||
r3 := a3
|
||||
*/
|
||||
static inline rxmm128s cvtsd2ss( rxmm128l a, rxmm128d b )
|
||||
{
|
||||
return _mm_cvtsd_ss( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtsi2sd( rxmm128d a, int b )
|
||||
{
|
||||
return _mm_cvtsi32_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (double) b0
|
||||
r1 := a1
|
||||
*/
|
||||
static inline rxmm128d cvtss2sd( rxmm128d a, rxmm128s b )
|
||||
{
|
||||
return _mm_cvtss_sd( a, b );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := 0x0
|
||||
r3 := 0x0
|
||||
*/
|
||||
static inline rxmm128l cvttpd2dq( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttpd_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
using truncate
|
||||
r := (int) a0
|
||||
*/
|
||||
static inline int cvttsd2si( rxmm128d a )
|
||||
{
|
||||
return _mm_cvttsd_si32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (float) a0
|
||||
r1 := (float) a1
|
||||
r2 := (float) a2
|
||||
r3 := (float) a3
|
||||
*/
|
||||
static inline rxmm128s cvtdq2ps( rxmm128l a )
|
||||
{
|
||||
return _mm_cvtepi32_ps( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvtps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvtps_epi32( a );
|
||||
}
|
||||
|
||||
/*!
|
||||
uses trancate
|
||||
r0 := (int) a0
|
||||
r1 := (int) a1
|
||||
r2 := (int) a2
|
||||
r3 := (int) a3
|
||||
*/
|
||||
static inline rxmm128l cvttps2dq( rxmm128s a )
|
||||
{
|
||||
return _mm_cvttps_epi32( a );
|
||||
}
|
||||
|
||||
//
|
||||
// class epi64
|
||||
//
|
||||
};
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
}
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_MSC_PCKINT8_H_*/
|
||||
60
misc/SSE_cmplr_abstraction_other.h
Normal file
60
misc/SSE_cmplr_abstraction_other.h
Normal file
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* SYNOPSYS CONFIDENTIAL - This is an unpublished, proprietary work of Synopsys,
|
||||
* Inc., and is fully protected under copyright and trade secret laws. You may
|
||||
* not view, use, disclose, copy, or distribute this file or any information
|
||||
* contained herein except pursuant to a valid written license from Synopsys.
|
||||
*/
|
||||
|
||||
//
|
||||
// The purpose of this file is to define SSE2 data types to abstract from the compiler
|
||||
// specific constructs. Currently the target compilers are GCC and the MS VC 2005.
|
||||
//
|
||||
|
||||
#ifndef _SSE2_CMPL_ABSTRACTION_OTHER_H_
|
||||
#define _SSE2_CMPL_ABSTRACTION_OTHER_H_
|
||||
|
||||
#include <boost/static_assert.hpp>
|
||||
|
||||
//
|
||||
// Namespace sse2
|
||||
//
|
||||
namespace sse2
{

//
// Primitive types
//
// On compilers without SSE2 support each 128-bit value is declared as
// 16 raw bytes.
//

/// 2xdouble
//
typedef char xmm128d[16];

/// 4xfloat
//
typedef char xmm128s[16];

/// 2xint64
//
typedef char xmm128l[16];

/// 4xint32
//
typedef char xmm128i[16];

/// int64
//
// The original `typedef long int;` named no type at all.
// NOTE(review): the name `int64` is taken from the comment above and
// `long long` is chosen so the type is 64 bits wide on every data model --
// confirm both against the MSC/GCC variants of this header.
typedef long long int64;

//
// Namespace sse2
//
}
|
||||
|
||||
#include "SSE_cmplr_abstraction_other_pckdbl.h"
|
||||
#include "SSE_cmplr_abstraction_other_pckfloat.h"
|
||||
#include "SSE_cmplr_abstraction_other_pckint8.h"
|
||||
#include "SSE_cmplr_abstraction_other_pckint16.h"
|
||||
#include "SSE_cmplr_abstraction_other_pckint32.h"
|
||||
#include "SSE_cmplr_abstraction_other_pckint64.h"
|
||||
|
||||
#endif/*_SSE2_CMPL_ABSTRACTION_OTHER_H_*/
|
||||
43
misc/SSE_transform.cpp
Normal file
43
misc/SSE_transform.cpp
Normal file
@@ -0,0 +1,43 @@
|
||||
// toIntTest.cpp : Defines the entry point for the console application.
|
||||
//
|
||||
|
||||
#include <stdio.h>
|
||||
#include "getCurrentTime.h"
|
||||
|
||||
#include "base/transform.h"
|
||||
//#include "base/point.h"
|
||||
|
||||
// Every target except AIX and SPARC is flagged as a low-endian architecture.
#if !defined(__AIX) && !defined(__sparc)
#define LOW_ENDIAN_ARCH
#endif

// MY_INLINE: forced inlining on MSVC, plain `inline` elsewhere.
#ifdef _MSC_VER
#define MY_INLINE __forceinline
#else
#define MY_INLINE inline
#endif

// Non-MSVC compilers have no __int64 keyword. It is mapped to `long long`
// rather than `long`, because `long` is only 32 bits wide on 32-bit targets.
#ifndef _MSC_VER
#define __int64 long long
#endif
|
||||
|
||||
|
||||
void test1()
{
    // Intentionally empty placeholder.
}
|
||||
|
||||
void test2()
{
    // Intentionally empty placeholder.
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// Init
|
||||
initGetCurrentTimeLib();
|
||||
|
||||
// test1();
|
||||
test2();
|
||||
|
||||
return 0;
|
||||
}
|
||||
329
misc/getcurrenttime.h
Normal file
329
misc/getcurrenttime.h
Normal file
@@ -0,0 +1,329 @@
|
||||
// getcurrenttime.h : Timing helpers (getCurrentTime / initGetCurrentTimeLib).
|
||||
//
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <math.h>
|
||||
|
||||
#undef min
|
||||
#undef max
|
||||
|
||||
|
||||
#if defined( WIN32 )
|
||||
//#define QUERY_PERFORMANCE_COUNTER
|
||||
#define RDTSC
|
||||
#else
|
||||
#define GET_TIME_OF_DAY
|
||||
#endif
|
||||
|
||||
#ifdef RDTSC
|
||||
#include <windows.h>
|
||||
|
||||
class perf
|
||||
{
|
||||
__int64 r64;
|
||||
|
||||
__forceinline __int64 getCurrentTime()
|
||||
{
|
||||
__asm
|
||||
{
|
||||
//
|
||||
// Serialized instruction ensure all previouse
|
||||
// instructions a done befor reading the performance
|
||||
// counter.
|
||||
//
|
||||
// cpuid
|
||||
//
|
||||
// Read the time stamp counter.
|
||||
//
|
||||
rdtsc
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
__forceinline perf()
|
||||
{
|
||||
::Sleep( 0 );
|
||||
__asm cpuid
|
||||
r64 = getCurrentTime();
|
||||
}
|
||||
|
||||
__forceinline double elapsed()
|
||||
{
|
||||
__int64 now = getCurrentTime();
|
||||
return double(now - r64 );
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
__int64 nCPUFrequency;
|
||||
double dblCPUFrequency;
|
||||
|
||||
// Reads the time stamp counter as a 64-bit integer.
// There is no return statement: the value is presumably returned through the
// registers that rdtsc fills (MSVC convention) -- confirm.
inline __int64 getCurrentTimeI()
{
    __asm rdtsc
}
|
||||
|
||||
inline double getCurrentTime()
|
||||
{
|
||||
// return double(getCurrentTimeI());
|
||||
//
|
||||
// The time stamp counter.
|
||||
//
|
||||
union {
|
||||
__int64 r64;
|
||||
__int32 r32[2];
|
||||
} tsc;
|
||||
//
|
||||
// Read the time stamp counter.
|
||||
//
|
||||
__asm
|
||||
{
|
||||
//
|
||||
// Serialized instruction ensure all previouse
|
||||
// instructions a done befor reading the performance
|
||||
// counter.
|
||||
//
|
||||
// cpuid
|
||||
//
|
||||
// Read the counter.
|
||||
//
|
||||
rdtsc
|
||||
//
|
||||
//
|
||||
//
|
||||
// mov tsc.r32[0], eax
|
||||
// mov tsc.r32[4], edx
|
||||
movd xmm0,eax
|
||||
movd xmm1,edx
|
||||
pshufd xmm1, xmm0, 0xF7
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Get time in seconds.
|
||||
//
|
||||
return double(tsc.r64);// / dblCPUFrequency;
|
||||
}
|
||||
|
||||
void initGetCurrentTimeLib_hlpr()
|
||||
{
|
||||
//
|
||||
// Use only one fixed CPU
|
||||
//
|
||||
BOOL b;
|
||||
DWORD_PTR proc_affi;
|
||||
DWORD_PTR sys_affi;
|
||||
DWORD_PTR exclud_affi;
|
||||
GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
|
||||
exclud_affi = proc_affi & ~sys_affi;
|
||||
proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
|
||||
int i = 0;
|
||||
while (( proc_affi >>= 1 )) ++i;
|
||||
proc_affi = 1 << i;
|
||||
b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
|
||||
//
|
||||
// Set the priority of thread high.
|
||||
//
|
||||
b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
|
||||
b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
||||
//
|
||||
// Get the frequency.
|
||||
//
|
||||
nCPUFrequency = 2000000000;
|
||||
// QueryPerformanceFrequency(
|
||||
// reinterpret_cast<LARGE_INTEGER*>( &nCPUFrequency ) );
|
||||
//
|
||||
// Frequency counter supported in CPUs of family x86
|
||||
// starting from Pentium 4 or Pentium 3. So for old CPUs
|
||||
// this will not work.
|
||||
//
|
||||
// If CPU doesn't support performance counter then just return.
|
||||
//
|
||||
if ( !nCPUFrequency )
|
||||
puts("WARNING: This CPU doesn't support QueryPerformanceFrequency.");
|
||||
//
|
||||
// Convert to double.
|
||||
//
|
||||
dblCPUFrequency = double(nCPUFrequency);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef QUERY_PERFORMANCE_COUNTER
|
||||
#include <windows.h>
|
||||
|
||||
double dblCPUFrequency;
|
||||
inline double getCurrentTime()
|
||||
{
|
||||
//
|
||||
// This call must be quite fast. Since, in x86 architectur
|
||||
// it is one instruction. Yet WIN32 API might added some
|
||||
// additional processing.
|
||||
//
|
||||
// \todo Vahagn: add our assembly optimised function.
|
||||
//
|
||||
__int64 nCPUTickCount;
|
||||
QueryPerformanceCounter(
|
||||
reinterpret_cast<LARGE_INTEGER*>( &nCPUTickCount )
|
||||
);
|
||||
//
|
||||
// Get time in seconds.
|
||||
//
|
||||
return double(nCPUTickCount) / dblCPUFrequency;
|
||||
}
|
||||
|
||||
void initGetCurrentTimeLib_hlpr()
|
||||
{
|
||||
//
|
||||
// Use only one fixed CPU
|
||||
//
|
||||
BOOL b;
|
||||
DWORD_PTR proc_affi;
|
||||
DWORD_PTR sys_affi;
|
||||
DWORD_PTR exclud_affi;
|
||||
GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
|
||||
exclud_affi = proc_affi & ~sys_affi;
|
||||
proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
|
||||
int i = 0;
|
||||
while (( proc_affi >>= 1 )) ++i;
|
||||
proc_affi = 1 << i;
|
||||
b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
|
||||
//
|
||||
// Set the priority of thread high.
|
||||
//
|
||||
b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
|
||||
b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
||||
//
|
||||
// Get the frequency.
|
||||
//
|
||||
__int64 nCPUFrequency;
|
||||
QueryPerformanceFrequency(
|
||||
reinterpret_cast<LARGE_INTEGER*>( &nCPUFrequency )
|
||||
);
|
||||
//
|
||||
// Frequency counter supported in CPUs of family x86
|
||||
// starting from Pentium 4 or Pentium 3. So for old CPUs
|
||||
// this will not work.
|
||||
//
|
||||
// If CPU doesn't support performance counter then just return.
|
||||
//
|
||||
if ( !nCPUFrequency )
|
||||
puts("WARNING: This CPU doesn't support QueryPerformanceFrequency.");
|
||||
//
|
||||
// Convert to double.
|
||||
//
|
||||
dblCPUFrequency = double(nCPUFrequency);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_TIME_OF_DAY
|
||||
#include <sys/time.h>
|
||||
|
||||
//
// Returns the wall clock time reported by gettimeofday(), in seconds,
// with the microsecond part as the fraction.
//
inline double getCurrentTime()
{
    timeval now;
    gettimeofday( &now, 0 );

    const double dblSeconds      = static_cast<double>( now.tv_sec );
    const double dblMicroseconds = static_cast<double>( now.tv_usec );
    return dblSeconds + ( dblMicroseconds / 1000000.0 );
}
|
||||
|
||||
//
// The gettimeofday() based implementation needs no preparation, so this
// helper does nothing.
//
void initGetCurrentTimeLib_hlpr() {}
|
||||
|
||||
#endif
|
||||
|
||||
void initGetCurrentTimeLib()
|
||||
{
|
||||
initGetCurrentTimeLib_hlpr();
|
||||
|
||||
#if 0
|
||||
for ( int j=0; j < 10000; ++j )
|
||||
{
|
||||
|
||||
//
|
||||
// Calculate the time expectation and dispersion
|
||||
// of getCurrentTime on this CPU.
|
||||
//
|
||||
const int nProbeCount = 100000;
|
||||
double dblTimeExpect = 0.;
|
||||
double dblTimeDispersia = 0.;
|
||||
for ( int i = nProbeCount; i; --i )
|
||||
{
|
||||
register double dblTimeBase = getCurrentTime();
|
||||
register double dblTimeCurrent = getCurrentTime();
|
||||
double dblTimeDelta = dblTimeCurrent - dblTimeBase;
|
||||
dblTimeExpect += dblTimeDelta;
|
||||
dblTimeDispersia += dblTimeDelta * dblTimeDelta;
|
||||
}
|
||||
//
|
||||
// finalize.
|
||||
//
|
||||
dblTimeExpect /= double( nProbeCount );
|
||||
dblTimeDispersia = dblTimeDispersia / double( nProbeCount )
|
||||
- dblTimeExpect * dblTimeExpect;
|
||||
printf( "Expectation: %f\n"
|
||||
"Dispersion: %f\n",
|
||||
dblTimeExpect,
|
||||
sqrt(dblTimeDispersia) );
|
||||
puts( "----------------------------------------------------" );
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
const int nProbeCount = 1000;
|
||||
double* ddd = new double[ nProbeCount ];
|
||||
double* p = ddd;
|
||||
double m = std::numeric_limits<double>::max();
|
||||
for ( int i = nProbeCount; i; --i )
|
||||
{
|
||||
register double dblTimeBase = getCurrentTime();
|
||||
register double dblTimeCurrent = getCurrentTime();
|
||||
*p++ = dblTimeCurrent - dblTimeBase;
|
||||
m = std::min( m, dblTimeCurrent - dblTimeBase );
|
||||
// printf( "%10.1f\n", dblTimeCurrent - dblTimeBase );
|
||||
}
|
||||
|
||||
printf( "%10.1f\n", m );
|
||||
|
||||
std::ofstream o;
|
||||
o.open( "times.txt" );
|
||||
p = ddd;
|
||||
for ( int i = nProbeCount; i; --i )
|
||||
{
|
||||
o << *p++ << std::endl;
|
||||
}
|
||||
o << std::endl;
|
||||
delete [] ddd;
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
|
||||
for ( int j = 0; j < 1000; ++j )
|
||||
{
|
||||
const int nProbeCount = 10000;
|
||||
double m = 1e300;
|
||||
for ( int i = nProbeCount; i; --i )
|
||||
{
|
||||
perf pc;
|
||||
__asm cpuid
|
||||
__asm cpuid
|
||||
__asm cpuid
|
||||
__asm cpuid
|
||||
double c = pc.elapsed();
|
||||
m = min( m, c );
|
||||
}
|
||||
std::cout << m << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
287
misc/getcurrenttime2.h
Normal file
287
misc/getcurrenttime2.h
Normal file
@@ -0,0 +1,287 @@
|
||||
//
|
||||
// Use "rdtsc" to measure performance.
|
||||
//
|
||||
//
|
||||
#ifndef __PERFORMANCE__H__
|
||||
#define __PERFORMANCE__H__
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <math.h>
|
||||
#include <map>
|
||||
|
||||
#if defined( WIN32 )
|
||||
#undef min
|
||||
#undef max
|
||||
#endif
|
||||
|
||||
#if defined( WIN32 )
|
||||
#define QUERY_PERFORMANCE_COUNTER
|
||||
#define RDTSC
|
||||
#else
|
||||
#define GET_TIME_OF_DAY
|
||||
#endif
|
||||
|
||||
#if defined( RDTSC )
|
||||
#include <windows.h>
|
||||
|
||||
class perf
|
||||
{
|
||||
__int64 beginning;
|
||||
static double frequency;
|
||||
|
||||
__forceinline __int64 getCurrentTime()
|
||||
{
|
||||
__asm
|
||||
{
|
||||
//
|
||||
// Read the time stamp counter.
|
||||
//
|
||||
rdtsc
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
__forceinline perf()
|
||||
{
|
||||
// ::Sleep( 0 );
|
||||
//
|
||||
// Serialized instruction ensure all previouse
|
||||
// instructions a done befor reading the performance
|
||||
// counter.
|
||||
//
|
||||
__asm xor eax,eax
|
||||
__asm cpuid
|
||||
beginning = getCurrentTime();
|
||||
// __asm xor eax,eax
|
||||
// __asm cpuid
|
||||
}
|
||||
|
||||
__forceinline double elapsed()
|
||||
{
|
||||
__int64 now = getCurrentTime();
|
||||
return double(now - beginning - 60 ) / frequency;
|
||||
}
|
||||
|
||||
static void init()
|
||||
{
|
||||
//
|
||||
// Use only one fixed CPU
|
||||
//
|
||||
BOOL b;
|
||||
DWORD_PTR proc_affi;
|
||||
DWORD_PTR sys_affi;
|
||||
DWORD_PTR exclud_affi;
|
||||
GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
|
||||
exclud_affi = proc_affi & ~sys_affi;
|
||||
proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
|
||||
int i = 0;
|
||||
while (( proc_affi >>= 1 )) ++i;
|
||||
proc_affi = 1 << i;
|
||||
b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
|
||||
//
|
||||
// Set the priority of thread high.
|
||||
//
|
||||
b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
|
||||
b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
||||
|
||||
//
|
||||
// Get the frequency.
|
||||
// Temporaily put 1.
|
||||
//
|
||||
frequency = 1.;
|
||||
}
|
||||
};
|
||||
|
||||
double perf::frequency;
|
||||
|
||||
#elif defined( QUERY_PERFORMANCE_COUNTER )
|
||||
#include <windows.h>
|
||||
|
||||
class perf
|
||||
{
|
||||
__int64 beginning;
|
||||
static double frequency;
|
||||
|
||||
__forceinline __int64 getCurrentTime()
|
||||
{
|
||||
//
|
||||
// This call must be quite fast. Since, in x86 architectur
|
||||
// it is one instruction. Yet WIN32 API might added some
|
||||
// additional processing.
|
||||
//
|
||||
// \todo Vahagn: add our assembly optimised function.
|
||||
//
|
||||
__int64 tc;
|
||||
QueryPerformanceCounter(
|
||||
reinterpret_cast<LARGE_INTEGER*>( &tc )
|
||||
);
|
||||
return tc;
|
||||
}
|
||||
|
||||
public:
|
||||
__forceinline perf()
|
||||
{
|
||||
// ::Sleep( 0 );
|
||||
beginning = getCurrentTime();
|
||||
}
|
||||
|
||||
__forceinline double elapsed()
|
||||
{
|
||||
__int64 now = getCurrentTime();
|
||||
return double(now - beginning ); // frequency;
|
||||
}
|
||||
|
||||
static void init()
|
||||
{
|
||||
//
|
||||
// Use only one fixed CPU
|
||||
//
|
||||
BOOL b;
|
||||
DWORD_PTR proc_affi;
|
||||
DWORD_PTR sys_affi;
|
||||
DWORD_PTR exclud_affi;
|
||||
GetProcessAffinityMask( GetCurrentProcess(), &proc_affi, &sys_affi );
|
||||
exclud_affi = proc_affi & ~sys_affi;
|
||||
proc_affi = ( exclud_affi ) ? proc_affi : proc_affi;
|
||||
int i = 0;
|
||||
while (( proc_affi >>= 1 )) ++i;
|
||||
proc_affi = 1 << i;
|
||||
b = SetProcessAffinityMask( GetCurrentProcess(), proc_affi );
|
||||
//
|
||||
// Set the priority of thread high.
|
||||
//
|
||||
b = SetPriorityClass( GetCurrentProcess(), REALTIME_PRIORITY_CLASS );
|
||||
b = SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
||||
|
||||
//
|
||||
// Get the frequency.
|
||||
//
|
||||
__int64 pf;
|
||||
QueryPerformanceFrequency(
|
||||
reinterpret_cast<LARGE_INTEGER*>( &pf )
|
||||
);
|
||||
//
|
||||
// Get the frequency.
|
||||
//
|
||||
frequency = double(pf);
|
||||
}
|
||||
};
|
||||
|
||||
double perf::frequency;
|
||||
|
||||
#elif defined ( GET_TIME_OF_DAY )
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
//
// Stopwatch based on gettimeofday(). Constructing an object starts the
// measurement; elapsed() returns the seconds passed since construction.
//
// The member functions are defined in the class body and are therefore
// implicitly inline. The MSVC-only __forceinline keyword is not used
// here because this variant is selected only when WIN32 is not defined.
//
class perf
{
    timeval beginning;

public:
    perf()
    {
        gettimeofday( &beginning, 0 );
    }

    //
    // Returns the time since construction in seconds; the microsecond
    // difference forms the fractional part.
    //
    double elapsed() const
    {
        timeval now;
        gettimeofday( &now, 0 );
        return double(now.tv_sec) - double(beginning.tv_sec)
             + (double(now.tv_usec) - double(beginning.tv_usec)) / 1000000.0;
    }

    //
    // Nothing to prepare for this implementation.
    //
    static void init()
    {
    }
};
|
||||
|
||||
#endif
|
||||
|
||||
template<class F >
|
||||
double mesure( F& fnctr, int nProbes = 100000, bool bPrint = false )
|
||||
{
|
||||
typedef std::map<double,int> probs_type;
|
||||
|
||||
probs_type probs;
|
||||
int n = 0;
|
||||
for ( int i = 0; i < nProbes; ++i )
|
||||
{
|
||||
perf pc;
|
||||
fnctr();
|
||||
double m = pc.elapsed();
|
||||
n = ++probs[ m ];
|
||||
}
|
||||
|
||||
double m;
|
||||
n = 0;
|
||||
for ( probs_type::iterator it = probs.begin();
|
||||
it != probs.end();
|
||||
++it )
|
||||
{
|
||||
if ( it->second > n )
|
||||
{
|
||||
n = it->second;
|
||||
m = it->first;
|
||||
}
|
||||
}
|
||||
|
||||
if ( bPrint )
|
||||
{
|
||||
std::cout << "tsc=" << m << " probes=" << nProbes << std::endl;
|
||||
std::cout << "===============================" << std::endl;
|
||||
for ( probs_type::iterator it = probs.begin();
|
||||
it != probs.end();
|
||||
++it )
|
||||
std::cout << "prob=" << it->first << "\t amount=" << it->second << std::endl;
|
||||
}
|
||||
|
||||
return m;
|
||||
};
|
||||
|
||||
//
// Functor that does nothing. perf_init() passes it to mesure().
//
// operator() is defined in the class body and is therefore implicitly
// inline; the MSVC-only __forceinline keyword is avoided because this
// struct is compiled on every platform.
//
struct nop
{
    void operator() () const
    {
    }
};
|
||||
|
||||
void perf_init()
|
||||
{
|
||||
perf::init();
|
||||
|
||||
mesure<nop>( nop(), 100000, true );
|
||||
|
||||
#if 0
|
||||
|
||||
|
||||
typedef std::map<double,int> probs_type;
|
||||
|
||||
probs_type probs;
|
||||
double m = 1e300;
|
||||
double s = 0;
|
||||
int i_last = 0;
|
||||
int i = 0;
|
||||
int n = 0;
|
||||
double c;
|
||||
for ( ; n < 1000000; ++i )
|
||||
{
|
||||
perf pc;
|
||||
c = pc.elapsed();
|
||||
n = ++probs[ c ];
|
||||
}
|
||||
std::cout << "tsc=" << c << " probes=" << i << std::endl;
|
||||
std::cout << "=========================" << std::endl;
|
||||
for ( probs_type::iterator it = probs.begin();
|
||||
it != probs.end();
|
||||
++it )
|
||||
{
|
||||
std::cout << "prob=" << it->first << "\t amount=" << it->second << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#endif//__PERFORMANCE__H__
|
||||
Reference in New Issue
Block a user