21#ifndef _libint2_src_lib_libint_vectorx86_h_
22#define _libint2_src_lib_libint_vectorx86_h_
24#include <libint2/util/cxxstd.h>
25#include <libint2/util/type_traits.h>
33#elif defined(__SSE2__) || defined(__SSE__) || defined(__AVX__)
81 d = _mm_add_pd(d, a.d);
86 d = _mm_sub_pd(d, a.d);
92 const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
95 result.d = _mm_mul_pd(this->d, minus_one);
99#if LIBINT2_CPLUSPLUS_STD >= 2011
102 operator double()
const {
104 ::memcpy(&(d0[0]), &d,
sizeof(__m128d));
114 operator __m128d()
const {
return d; }
117 void load(T
const* a) { d = _mm_loadu_pd(a); }
122 void convert(T* a)
const { _mm_storeu_pd(&a[0], d); }
129inline VectorSSEDouble
operator*(
double a, VectorSSEDouble b) {
131 VectorSSEDouble _a(a);
132 c.d = _mm_mul_pd(_a.d, b.d);
136inline VectorSSEDouble operator*(VectorSSEDouble a,
double b) {
138 VectorSSEDouble _b(b);
139 c.d = _mm_mul_pd(a.d, _b.d);
143inline VectorSSEDouble
operator*(
int a, VectorSSEDouble b) {
148 VectorSSEDouble _a((
double)a);
149 c.d = _mm_mul_pd(_a.d, b.d);
154inline VectorSSEDouble
operator*(VectorSSEDouble a,
int b) {
159 VectorSSEDouble _b((
double)b);
160 c.d = _mm_mul_pd(a.d, _b.d);
165inline VectorSSEDouble
operator*(VectorSSEDouble a, VectorSSEDouble b) {
167 c.d = _mm_mul_pd(a.d, b.d);
171inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
173 c.d = _mm_add_pd(a.d, b.d);
177inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
179 c.d = _mm_sub_pd(a.d, b.d);
183inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
185 c.d = _mm_div_pd(a.d, b.d);
190inline VectorSSEDouble
fma_plus(VectorSSEDouble a, VectorSSEDouble b,
193 d.d = _mm_fmadd_pd(a.d, b.d, c.d);
196inline VectorSSEDouble
fma_minus(VectorSSEDouble a, VectorSSEDouble b,
199 d.d = _mm_fmsub_pd(a.d, b.d, c.d);
202#elif defined(__FMA4__)
203inline VectorSSEDouble
fma_plus(VectorSSEDouble a, VectorSSEDouble b,
206 d.d = _mm_macc_pd(a.d, b.d, c.d);
209inline VectorSSEDouble
fma_minus(VectorSSEDouble a, VectorSSEDouble b,
212 d.d = _mm_msub_pd(a.d, b.d, c.d);
223 __m128d t1 = _mm_hadd_pd(a, a);
224 return _mm_cvtsd_f64(t1);
226 __m128 t0 = _mm_castpd_ps(a);
227 __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0));
228 __m128d t2 = _mm_add_sd(a, t1);
229 return _mm_cvtsd_f64(t2);
240 return _mm_hadd_pd(a, b);
249inline VectorSSEDouble exp(VectorSSEDouble a) {
251 VectorSSEDouble result;
252 result.d = _mm_exp_pd(a.d);
256 for (
int i = 0; i < 2; ++i) a_d[i] = std::exp(a_d[i]);
257 VectorSSEDouble result(a_d);
261inline VectorSSEDouble sqrt(VectorSSEDouble a) {
263 VectorSSEDouble result;
264 result.d = _mm_sqrt_pd(a.d);
268 for (
int i = 0; i < 2; ++i) a_d[i] = std::sqrt(a_d[i]);
269 VectorSSEDouble result(a_d);
273inline VectorSSEDouble erf(VectorSSEDouble a) {
275 VectorSSEDouble result;
276 result.d = _mm_erf_pd(a.d);
280 for (
int i = 0; i < 2; ++i) a_d[i] = ::erf(a_d[i]);
281 VectorSSEDouble result(a_d);
285inline VectorSSEDouble erfc(VectorSSEDouble a) {
287 VectorSSEDouble result;
288 result.d = _mm_erfc_pd(a.d);
292 for (
int i = 0; i < 2; ++i) a_d[i] = ::erfc(a_d[i]);
293 VectorSSEDouble result(a_d);
303inline std::ostream& operator<<(std::ostream& os,
307 os <<
"{" << ad[0] <<
"," << ad[1] <<
"}";
318 static const bool value =
true;
323 typedef double scalar_type;
324 static const size_t extent = 2;
367 d = _mm_set_ps(a, a, a, a);
372 d = _mm_add_ps(d, a.d);
377 d = _mm_sub_ps(d, a.d);
383 const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
386 result.d = _mm_mul_ps(this->d, minus_one);
390#if LIBINT2_CPLUSPLUS_STD >= 2011
393 operator float()
const {
395 ::memcpy(&(d0[0]), &d,
sizeof(__m128));
404#if LIBINT2_CPLUSPLUS_STD >= 2011
407 operator double()
const {
408 const float result_flt = this->
operator float();
413 operator __m128()
const {
return d; }
416 void load(T
const* a) { d = _mm_loadu_ps(a); }
421 void convert(T* a)
const { _mm_storeu_ps(&a[0], d); }
428inline VectorSSEFloat
operator*(
float a, VectorSSEFloat b) {
430 VectorSSEFloat _a(a);
431 c.d = _mm_mul_ps(_a.d, b.d);
435inline VectorSSEFloat operator*(VectorSSEFloat a,
float b) {
437 VectorSSEFloat _b(b);
438 c.d = _mm_mul_ps(a.d, _b.d);
443inline VectorSSEFloat
operator*(
double a, VectorSSEFloat b) {
445 VectorSSEFloat _a((
float)a);
446 c.d = _mm_mul_ps(_a.d, b.d);
451inline VectorSSEFloat
operator*(VectorSSEFloat a,
double b) {
453 VectorSSEFloat _b((
float)b);
454 c.d = _mm_mul_ps(a.d, _b.d);
458inline VectorSSEFloat
operator*(
int a, VectorSSEFloat b) {
463 VectorSSEFloat _a((
float)a);
464 c.d = _mm_mul_ps(_a.d, b.d);
469inline VectorSSEFloat
operator*(VectorSSEFloat a,
int b) {
474 VectorSSEFloat _b((
float)b);
475 c.d = _mm_mul_ps(a.d, _b.d);
480inline VectorSSEFloat
operator*(VectorSSEFloat a, VectorSSEFloat b) {
482 c.d = _mm_mul_ps(a.d, b.d);
486inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
488 c.d = _mm_add_ps(a.d, b.d);
492inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
494 c.d = _mm_sub_ps(a.d, b.d);
498inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
500 c.d = _mm_div_ps(a.d, b.d);
505inline VectorSSEFloat
fma_plus(VectorSSEFloat a, VectorSSEFloat b,
508 d.d = _mm_fmadd_ps(a.d, b.d, c.d);
511inline VectorSSEFloat
fma_minus(VectorSSEFloat a, VectorSSEFloat b,
514 d.d = _mm_fmsub_ps(a.d, b.d, c.d);
517#elif defined(__FMA4__)
518inline VectorSSEFloat
fma_plus(VectorSSEFloat a, VectorSSEFloat b,
521 d.d = _mm_macc_ps(a.d, b.d, c.d);
524inline VectorSSEFloat
fma_minus(VectorSSEFloat a, VectorSSEFloat b,
527 d.d = _mm_msub_ps(a.d, b.d, c.d);
535inline VectorSSEFloat exp(VectorSSEFloat a) {
537 VectorSSEFloat result;
538 result.d = _mm_exp_ps(a.d);
542 for (
int i = 0; i < 4; ++i) a_d[i] = std::exp(a_d[i]);
543 VectorSSEFloat result(a_d);
547inline VectorSSEFloat sqrt(VectorSSEFloat a) {
549 VectorSSEFloat result;
550 result.d = _mm_sqrt_ps(a.d);
554 for (
int i = 0; i < 4; ++i) a_d[i] = std::sqrt(a_d[i]);
555 VectorSSEFloat result(a_d);
559inline VectorSSEFloat erf(VectorSSEFloat a) {
561 VectorSSEFloat result;
562 result.d = _mm_erf_ps(a.d);
566 for (
int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
567 VectorSSEFloat result(a_d);
571inline VectorSSEFloat erfc(VectorSSEFloat a) {
573 VectorSSEFloat result;
574 result.d = _mm_erfc_ps(a.d);
578 for (
int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
579 VectorSSEFloat result(a_d);
589inline std::ostream& operator<<(std::ostream& os,
593 os <<
"{" << ad[0] <<
"," << ad[1] <<
"," << ad[2] <<
"," << ad[3] <<
"}";
604 static const bool value =
true;
609 typedef float scalar_type;
610 static const size_t extent = 4;
660 d = _mm256_set_pd(a, a, a, a);
665 d = _mm256_add_pd(d, a.d);
670 d = _mm256_sub_pd(d, a.d);
676 const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
679 result.d = _mm256_mul_pd(this->d, minus_one);
683#if LIBINT2_CPLUSPLUS_STD >= 2011
686 operator double()
const {
688 ::memcpy(&(d0[0]), &d,
sizeof(__m256d));
699 operator __m256d()
const {
return d; }
702 void load(T
const* a) { d = _mm256_loadu_pd(a); }
707 void convert(T* a)
const { _mm256_storeu_pd(&a[0], d); }
714inline VectorAVXDouble
operator*(
double a, VectorAVXDouble b) {
716 VectorAVXDouble _a(a);
717 c.d = _mm256_mul_pd(_a.d, b.d);
721inline VectorAVXDouble operator*(VectorAVXDouble a,
double b) {
723 VectorAVXDouble _b(b);
724 c.d = _mm256_mul_pd(a.d, _b.d);
728inline VectorAVXDouble
operator*(
int a, VectorAVXDouble b) {
733 VectorAVXDouble _a((
double)a);
734 c.d = _mm256_mul_pd(_a.d, b.d);
739inline VectorAVXDouble
operator*(VectorAVXDouble a,
int b) {
744 VectorAVXDouble _b((
double)b);
745 c.d = _mm256_mul_pd(a.d, _b.d);
750inline VectorAVXDouble
operator*(VectorAVXDouble a, VectorAVXDouble b) {
752 c.d = _mm256_mul_pd(a.d, b.d);
756inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
758 c.d = _mm256_add_pd(a.d, b.d);
762inline VectorAVXDouble operator+(
int a, VectorAVXDouble b) {
767 VectorAVXDouble _a = (
static_cast<double>(a));
768 c.d = _mm256_add_pd(_a.d, b.d);
773inline VectorAVXDouble operator+(VectorAVXDouble a,
int b) {
778 VectorAVXDouble _b = (
static_cast<double>(b));
779 c.d = _mm256_add_pd(a.d, _b.d);
784inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
786 c.d = _mm256_sub_pd(a.d, b.d);
790inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
792 c.d = _mm256_div_pd(a.d, b.d);
797inline VectorAVXDouble
fma_plus(VectorAVXDouble a, VectorAVXDouble b,
800 d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
803inline VectorAVXDouble
fma_minus(VectorAVXDouble a, VectorAVXDouble b,
806 d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
809#elif defined(__FMA4__)
810inline VectorAVXDouble
fma_plus(VectorAVXDouble a, VectorAVXDouble b,
813 d.d = _mm256_macc_pd(a.d, b.d, c.d);
816inline VectorAVXDouble
fma_minus(VectorAVXDouble a, VectorAVXDouble b,
819 d.d = _mm256_msub_pd(a.d, b.d, c.d);
828 __m256d s = _mm256_hadd_pd(a, a);
829 return ((
double*)&s)[0] + ((
double*)&s)[2];
845 __m256d sum = _mm256_hadd_pd(a, b);
847 __m128d sum_high = _mm256_extractf128_pd(sum, 1);
849 return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
866 __m256d sumab = _mm256_hadd_pd(a, b);
868 __m256d sumcd = _mm256_hadd_pd(c, d);
871 __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
873 __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
875 return _mm256_add_pd(perm, blend);
881inline VectorAVXDouble exp(VectorAVXDouble a) {
883 VectorAVXDouble result;
884 result.d = _mm256_exp_pd(a.d);
888 for (
int i = 0; i < 4; ++i) a_d[i] = ::exp(a_d[i]);
889 VectorAVXDouble result(a_d);
893inline VectorAVXDouble sqrt(VectorAVXDouble a) {
895 VectorAVXDouble result;
896 result.d = _mm256_sqrt_pd(a.d);
900 for (
int i = 0; i < 4; ++i) a_d[i] = ::sqrt(a_d[i]);
901 VectorAVXDouble result(a_d);
905inline VectorAVXDouble erf(VectorAVXDouble a) {
907 VectorAVXDouble result;
908 result.d = _mm256_erf_pd(a.d);
912 for (
int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
913 VectorAVXDouble result(a_d);
917inline VectorAVXDouble erfc(VectorAVXDouble a) {
919 VectorAVXDouble result;
920 result.d = _mm256_erfc_pd(a.d);
924 for (
int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
925 VectorAVXDouble result(a_d);
959 d = _mm256_set_ps(a0, a1, a2, a3, a4, a5, a6, a7);
963 d = _mm256_set_ps(a, a, a, a, a, a, a, a);
968 d = _mm256_add_ps(d, a.d);
973 d = _mm256_sub_ps(d, a.d);
979 const static __m256 minus_one =
980 _mm256_set_ps(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
983 result.d = _mm256_mul_ps(this->d, minus_one);
987 explicit operator float()
const {
989 ::memcpy(&(d0[0]), &d,
sizeof(__m256));
999 explicit operator double()
const {
1000 const float result_flt = this->
operator float();
1004 void convert(T (&a)[8])
const { _mm256_storeu_ps(&a[0], d); }
1008inline VectorAVXFloat
operator*(
double a, VectorAVXFloat b) {
1010 VectorAVXFloat _a(a);
1011 c.d = _mm256_mul_ps(_a.d, b.d);
1015inline VectorAVXFloat
operator*(VectorAVXFloat a,
double b) {
1017 VectorAVXFloat _b(b);
1018 c.d = _mm256_mul_ps(a.d, _b.d);
1022inline VectorAVXFloat
operator*(
int a, VectorAVXFloat b) {
1027 VectorAVXFloat _a((
float)a);
1028 c.d = _mm256_mul_ps(_a.d, b.d);
1033inline VectorAVXFloat
operator*(VectorAVXFloat a,
int b) {
1038 VectorAVXFloat _b((
float)b);
1039 c.d = _mm256_mul_ps(a.d, _b.d);
1044inline VectorAVXFloat
operator*(VectorAVXFloat a, VectorAVXFloat b) {
1046 c.d = _mm256_mul_ps(a.d, b.d);
1050inline VectorAVXFloat operator+(VectorAVXFloat a, VectorAVXFloat b) {
1052 c.d = _mm256_add_ps(a.d, b.d);
1056inline VectorAVXFloat operator-(VectorAVXFloat a, VectorAVXFloat b) {
1058 c.d = _mm256_sub_ps(a.d, b.d);
1062inline VectorAVXFloat operator/(VectorAVXFloat a, VectorAVXFloat b) {
1064 c.d = _mm256_div_ps(a.d, b.d);
1069inline VectorAVXFloat
fma_plus(VectorAVXFloat a, VectorAVXFloat b,
1072 d.d = _mm256_fmadd_ps(a.d, b.d, c.d);
1075inline VectorAVXFloat
fma_minus(VectorAVXFloat a, VectorAVXFloat b,
1078 d.d = _mm256_fmsub_ps(a.d, b.d, c.d);
1081#elif defined(__FMA4__)
1082inline VectorAVXFloat
fma_plus(VectorAVXFloat a, VectorAVXFloat b,
1085 d.d = _mm256_facc_ps(a.d, b.d, c.d);
1088inline VectorAVXFloat
fma_minus(VectorAVXFloat a, VectorAVXFloat b,
1091 d.d = _mm256_fsub_ps(a.d, b.d, c.d);
1099inline VectorAVXFloat exp(VectorAVXFloat a) {
1101 VectorAVXFloat result;
1102 result.d = _mm256_exp_ps(a.d);
1106 for (
int i = 0; i < 8; ++i) a_d[i] = ::exp(a_d[i]);
1107 VectorAVXFloat result(a_d);
1111inline VectorAVXFloat sqrt(VectorAVXFloat a) {
1113 VectorAVXFloat result;
1114 result.d = _mm256_sqrt_ps(a.d);
1118 for (
int i = 0; i < 8; ++i) a_d[i] = ::sqrt(a_d[i]);
1119 VectorAVXFloat result(a_d);
1123inline VectorAVXFloat erf(VectorAVXFloat a) {
1125 VectorAVXFloat result;
1126 result.d = _mm256_erf_ps(a.d);
1130 for (
int i = 0; i < 8; ++i) a_d[i] = ::erf(a_d[i]);
1131 VectorAVXFloat result(a_d);
1135inline VectorAVXFloat erfc(VectorAVXFloat a) {
1137 VectorAVXFloat result;
1138 result.d = _mm256_erfc_ps(a.d);
1142 for (
int i = 0; i < 8; ++i) a_d[i] = ::erfc(a_d[i]);
1143 VectorAVXFloat result(a_d);
1153inline std::ostream& operator<<(std::ostream& os,
1157 os <<
"{" << ad[0] <<
"," << ad[1] <<
"," << ad[2] <<
"," << ad[3] <<
"}";
1162inline std::ostream& operator<<(std::ostream& os,
1166 os <<
"{" << ad[0] <<
"," << ad[1] <<
"," << ad[2] <<
"," << ad[3] <<
","
1167 << ad[4] <<
"," << ad[5] <<
"," << ad[6] <<
"," << ad[7] <<
"}";
1178 static const bool value =
true;
1183 typedef double scalar_type;
1184 static const size_t extent = 4;
1193#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
1194#include <vectorclass.h>
double horizontal_add(VectorSSEDouble const &a)
Horizontal add.
Definition vector_x86.h:220
Default definitions for various parameters assumed by Libint.
Definition algebra.cc:24
auto fma_plus(X x, Y y, Z z) -> decltype(x *y+z)
Definition intrinsic_operations.h:37
std::shared_ptr< CTimeEntity< typename ProductType< T, U >::result > > operator*(const std::shared_ptr< CTimeEntity< T > > &A, const std::shared_ptr< CTimeEntity< U > > &B)
Creates product A*B.
Definition entity.h:302
auto fma_minus(X x, Y y, Z z) -> decltype(x *y - z)
Definition intrinsic_operations.h:43
Definition type_traits.h:29
SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructions.
Definition vector_x86.h:630
void convert(T *a) const
writes this to a
Definition vector_x86.h:707
VectorAVXDouble(__m256d a)
converts a 256-bit AVX double vector type to VectorAVXDouble
Definition vector_x86.h:657
VectorAVXDouble(T a)
Initializes all elements to the same value.
Definition vector_x86.h:642
VectorAVXDouble()
creates a vector of default-initialized values.
Definition vector_x86.h:637
VectorAVXDouble(T(&a)[4])
creates a vector of values initialized by an ordinary static-sized array
Definition vector_x86.h:647
void load(T const *a)
loads a to this
Definition vector_x86.h:702
void convert_aligned(T *a) const
writes this to a
Definition vector_x86.h:710
VectorAVXDouble(T a0, T a1, T a2, T a3)
creates a vector of values initialized from four individual element values
Definition vector_x86.h:652
void load_aligned(T const *a)
loads a to this
Definition vector_x86.h:705
SIMD vector of 8 single-precision floating-point real numbers, operations on which use AVX instructions.
Definition vector_x86.h:936
VectorAVXFloat()
creates a vector of default-initialized values.
Definition vector_x86.h:943
VectorAVXFloat(T(&a)[8])
creates a vector of values initialized by an ordinary static-sized array
Definition vector_x86.h:953
VectorAVXFloat(T a)
Initializes all elements to the same value.
Definition vector_x86.h:948
VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7)
creates a vector of values initialized from eight individual element values
Definition vector_x86.h:958
Definition type_traits.h:34