LIBINT 2.9.0
vector_x86.h
/*
 * Copyright (C) 2004-2024 Edward F. Valeev
 *
 * This file is part of Libint library.
 *
 * Libint library is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libint library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Libint library. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef _libint2_src_lib_libint_vectorx86_h_
#define _libint2_src_lib_libint_vectorx86_h_

#include <libint2/util/cxxstd.h>
#include <libint2/util/type_traits.h>

#include <cmath>
#include <cstring>
#include <iostream>

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__SSE2__) || defined(__SSE__) || defined(__AVX__)
#include <x86intrin.h>
#endif

#ifdef __SSE2__

namespace libint2 {
namespace simd {

/// SIMD vector of 2 double-precision floating-point real numbers, operations
/// on which use SSE2 instructions
struct VectorSSEDouble {
  typedef double T;
  __m128d d;

  /// creates a vector of default-initialized values.
  VectorSSEDouble() {}

  /// initializes all elements to the same value
  VectorSSEDouble(T a) { d = _mm_set_pd(a, a); }

  /// creates a vector of values initialized by an ordinary static-sized array
  VectorSSEDouble(T (&a)[2]) { d = _mm_loadu_pd(&a[0]); }

  /// creates a vector initialized by the given elements, a0 in element 0
  VectorSSEDouble(T a0, T a1) { d = _mm_set_pd(a1, a0); }

  /// converts a 128-bit SSE double vector type to VectorSSEDouble
  VectorSSEDouble(__m128d a) { d = a; }

  VectorSSEDouble& operator=(T a) {
    d = _mm_set_pd(a, a);
    return *this;
  }

  VectorSSEDouble& operator+=(VectorSSEDouble a) {
    d = _mm_add_pd(d, a.d);
    return *this;
  }

  VectorSSEDouble& operator-=(VectorSSEDouble a) {
    d = _mm_sub_pd(d, a.d);
    return *this;
  }

  VectorSSEDouble operator-() const {
    // TODO improve using bitflips
    static const __m128d minus_one = _mm_set_pd(-1.0, -1.0);
    VectorSSEDouble result;
    result.d = _mm_mul_pd(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  explicit
#endif
  operator double() const {
    double d0[2];
    ::memcpy(&(d0[0]), &d, sizeof(__m128d));
    // an unaligned store segfaults on OS X:
    // _mm_storeu_pd(&(d0[0]), d);
    // the aligned alternative requires C++11 alignas, but efficiency should
    // not matter here:
    // alignas(__m128d) double d0[2];
    // _mm_store_pd(&(d0[0]), d);
    return d0[0];
  }

  /// implicit conversion to the underlying 128-bit SSE vector type
  operator __m128d() const { return d; }

  /// loads a to this
  void load(T const* a) { d = _mm_loadu_pd(a); }
  /// loads aligned a to this
  void load_aligned(T const* a) { d = _mm_load_pd(a); }
  /// writes this to a
  void convert(T* a) const { _mm_storeu_pd(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T* a) const { _mm_store_pd(&a[0], d); }
};
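
// Illustrative usage sketch (not part of the library): elementwise arithmetic
// with VectorSSEDouble via the operators defined below; lane i of the result
// combines lane i of each operand.
//
//   libint2::simd::VectorSSEDouble x(1.0, 2.0), y(3.0, 4.0);
//   libint2::simd::VectorSSEDouble z = x * y + x;  // z = {4.0, 10.0}
//   double out[2];
//   z.convert(out);  // out[0] == 4.0, out[1] == 10.0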

inline VectorSSEDouble operator*(double a, VectorSSEDouble b) {
  VectorSSEDouble c;
  VectorSSEDouble _a(a);
  c.d = _mm_mul_pd(_a.d, b.d);
  return c;
}

inline VectorSSEDouble operator*(VectorSSEDouble a, double b) {
  VectorSSEDouble c;
  VectorSSEDouble _b(b);
  c.d = _mm_mul_pd(a.d, _b.d);
  return c;
}

inline VectorSSEDouble operator*(int a, VectorSSEDouble b) {
  if (a == 1)
    return b;
  else {
    VectorSSEDouble c;
    VectorSSEDouble _a((double)a);
    c.d = _mm_mul_pd(_a.d, b.d);
    return c;
  }
}

inline VectorSSEDouble operator*(VectorSSEDouble a, int b) {
  if (b == 1)
    return a;
  else {
    VectorSSEDouble c;
    VectorSSEDouble _b((double)b);
    c.d = _mm_mul_pd(a.d, _b.d);
    return c;
  }
}

inline VectorSSEDouble operator*(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_mul_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_add_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_sub_pd(a.d, b.d);
  return c;
}

inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
  VectorSSEDouble c;
  c.d = _mm_div_pd(a.d, b.d);
  return c;
}

#if defined(__FMA__)
inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b,
                                VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_fmadd_pd(a.d, b.d, c.d);
  return d;
}
inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b,
                                 VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_fmsub_pd(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b,
                                VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_macc_pd(a.d, b.d, c.d);
  return d;
}
inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b,
                                 VectorSSEDouble c) {
  VectorSSEDouble d;
  d.d = _mm_msub_pd(a.d, b.d, c.d);
  return d;
}
#endif
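
// Illustrative semantics sketch (assumes FMA or FMA4 hardware): fma_plus(a, b,
// c) computes a[i]*b[i] + c[i] and fma_minus(a, b, c) computes a[i]*b[i] -
// c[i], each in a single fused instruction with one rounding step.
//
//   VectorSSEDouble a(2.0, 3.0), b(4.0, 5.0), c(1.0, 1.0);
//   VectorSSEDouble p = fma_plus(a, b, c);   // p = {9.0, 16.0}
//   VectorSSEDouble m = fma_minus(a, b, c);  // m = {7.0, 14.0}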

/// Horizontal add.
/// @return a[0] + a[1]
inline double horizontal_add(VectorSSEDouble const& a) {
// Agner Fog's version
#if defined(__SSE3__)
  __m128d t1 = _mm_hadd_pd(a, a);
  return _mm_cvtsd_f64(t1);
#else  // SSE2 only
  __m128 t0 = _mm_castpd_ps(a);
  __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0));
  __m128d t2 = _mm_add_sd(a, t1);
  return _mm_cvtsd_f64(t2);
#endif
}

/// Horizontal add of a pair of vectors.
/// @return {a[0] + a[1], b[0] + b[1]}
inline VectorSSEDouble horizontal_add(VectorSSEDouble const& a,
                                      VectorSSEDouble const& b) {
#if defined(__SSE3__)
  return _mm_hadd_pd(a, b);
#else  // will be very inefficient without SSE3
  return VectorSSEDouble(horizontal_add(a), horizontal_add(b));
#endif
}
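
// Illustrative sketch: the pairwise overload packs two reductions into one
// vector, e.g. to finish two dot products at once.
//
//   VectorSSEDouble u(1.0, 2.0), v(10.0, 20.0);
//   VectorSSEDouble s = horizontal_add(u, v);  // s = {3.0, 30.0}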

inline VectorSSEDouble exp(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_exp_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = std::exp(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble sqrt(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_sqrt_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble erf(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_erf_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = ::erf(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
inline VectorSSEDouble erfc(VectorSSEDouble a) {
#if HAVE_INTEL_SVML
  VectorSSEDouble result;
  result.d = _mm_erfc_pd(a.d);
#else
  double a_d[2];
  a.convert(a_d);
  for (int i = 0; i < 2; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorSSEDouble result(a_d);
#endif
  return result;
}
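
// Illustrative note: each transcendental wrapper above dispatches to Intel
// SVML's vectorized math (e.g. _mm_exp_pd) when HAVE_INTEL_SVML is set and
// otherwise falls back to scalar libm calls, lane by lane.
//
//   VectorSSEDouble v(0.0, 1.0);
//   VectorSSEDouble e = exp(v);  // e ~ {1.0, 2.71828...}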

}  // namespace simd
}  // namespace libint2

inline std::ostream& operator<<(std::ostream& os,
                                libint2::simd::VectorSSEDouble const& a) {
  double ad[2];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "}";
  return os;
}

namespace libint2 {

template <>
struct is_vector<simd::VectorSSEDouble> {
  static const bool value = true;
};

template <>
struct vector_traits<simd::VectorSSEDouble> {
  typedef double scalar_type;
  static const size_t extent = 2;
};

}  // namespace libint2
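
// Illustrative sketch: generic code can query these trait specializations at
// compile time, e.g. (C++11):
//
//   static_assert(libint2::is_vector<libint2::simd::VectorSSEDouble>::value,
//                 "not a SIMD vector");
//   static_assert(
//       libint2::vector_traits<libint2::simd::VectorSSEDouble>::extent == 2,
//       "unexpected vector width");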

#endif  // SSE2-only

#ifdef __SSE__

namespace libint2 {
namespace simd {

/// SIMD vector of 4 single-precision floating-point real numbers, operations
/// on which use SSE instructions
struct VectorSSEFloat {
  typedef float T;
  __m128 d;

  /// creates a vector of default-initialized values.
  VectorSSEFloat() {}

  /// initializes all elements to the same value
  VectorSSEFloat(T a) { d = _mm_set_ps(a, a, a, a); }

  /// creates a vector of values initialized by an ordinary static-sized array
  VectorSSEFloat(T (&a)[4]) { d = _mm_loadu_ps(&a[0]); }

  /// creates a vector initialized by the given elements, a0 in element 0
  VectorSSEFloat(T a0, T a1, T a2, T a3) { d = _mm_set_ps(a3, a2, a1, a0); }

  VectorSSEFloat& operator=(T a) {
    d = _mm_set_ps(a, a, a, a);
    return *this;
  }

  VectorSSEFloat& operator+=(VectorSSEFloat a) {
    d = _mm_add_ps(d, a.d);
    return *this;
  }

  VectorSSEFloat& operator-=(VectorSSEFloat a) {
    d = _mm_sub_ps(d, a.d);
    return *this;
  }

  VectorSSEFloat operator-() const {
    // TODO improve using bitflips
    static const __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
    VectorSSEFloat result;
    result.d = _mm_mul_ps(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  explicit
#endif
  operator float() const {
    float d0[4];
    ::memcpy(&(d0[0]), &d, sizeof(__m128));
    // an unaligned store segfaults on OS X:
    // _mm_storeu_ps(&(d0[0]), d);
    // the aligned alternative requires C++11 alignas, but efficiency should
    // not matter here:
    // alignas(__m128) float d0[4];
    // _mm_store_ps(&(d0[0]), d);
    return d0[0];
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  explicit
#endif
  operator double() const {
    const float result_flt = this->operator float();
    return result_flt;
  }

  /// implicit conversion to the underlying 128-bit SSE vector type
  operator __m128() const { return d; }

  /// loads a to this
  void load(T const* a) { d = _mm_loadu_ps(a); }
  /// loads aligned a to this
  void load_aligned(T const* a) { d = _mm_load_ps(a); }
  /// writes this to a
  void convert(T* a) const { _mm_storeu_ps(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T* a) const { _mm_store_ps(&a[0], d); }
};
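
// Illustrative usage sketch: same elementwise semantics as VectorSSEDouble,
// but with four float lanes per vector.
//
//   libint2::simd::VectorSSEFloat x(1.0f, 2.0f, 3.0f, 4.0f);
//   libint2::simd::VectorSSEFloat y = 2 * x;  // y = {2, 4, 6, 8}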

inline VectorSSEFloat operator*(float a, VectorSSEFloat b) {
  VectorSSEFloat c;
  VectorSSEFloat _a(a);
  c.d = _mm_mul_ps(_a.d, b.d);
  return c;
}

inline VectorSSEFloat operator*(VectorSSEFloat a, float b) {
  VectorSSEFloat c;
  VectorSSEFloat _b(b);
  c.d = _mm_mul_ps(a.d, _b.d);
  return c;
}

// narrows a!
inline VectorSSEFloat operator*(double a, VectorSSEFloat b) {
  VectorSSEFloat c;
  VectorSSEFloat _a((float)a);
  c.d = _mm_mul_ps(_a.d, b.d);
  return c;
}

// narrows b!
inline VectorSSEFloat operator*(VectorSSEFloat a, double b) {
  VectorSSEFloat c;
  VectorSSEFloat _b((float)b);
  c.d = _mm_mul_ps(a.d, _b.d);
  return c;
}

inline VectorSSEFloat operator*(int a, VectorSSEFloat b) {
  if (a == 1)
    return b;
  else {
    VectorSSEFloat c;
    VectorSSEFloat _a((float)a);
    c.d = _mm_mul_ps(_a.d, b.d);
    return c;
  }
}

inline VectorSSEFloat operator*(VectorSSEFloat a, int b) {
  if (b == 1)
    return a;
  else {
    VectorSSEFloat c;
    VectorSSEFloat _b((float)b);
    c.d = _mm_mul_ps(a.d, _b.d);
    return c;
  }
}

inline VectorSSEFloat operator*(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_mul_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_add_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_sub_ps(a.d, b.d);
  return c;
}

inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
  VectorSSEFloat c;
  c.d = _mm_div_ps(a.d, b.d);
  return c;
}

#if defined(__FMA__)
inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b,
                               VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_fmadd_ps(a.d, b.d, c.d);
  return d;
}
inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b,
                                VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_fmsub_ps(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b,
                               VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_macc_ps(a.d, b.d, c.d);
  return d;
}
inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b,
                                VectorSSEFloat c) {
  VectorSSEFloat d;
  d.d = _mm_msub_ps(a.d, b.d, c.d);
  return d;
}
#endif

inline VectorSSEFloat exp(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_exp_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::exp(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat sqrt(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_sqrt_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat erf(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_erf_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}
inline VectorSSEFloat erfc(VectorSSEFloat a) {
#if HAVE_INTEL_SVML
  VectorSSEFloat result;
  result.d = _mm_erfc_ps(a.d);
#else
  float a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorSSEFloat result(a_d);
#endif
  return result;
}

}  // namespace simd
}  // namespace libint2

inline std::ostream& operator<<(std::ostream& os,
                                libint2::simd::VectorSSEFloat const& a) {
  float ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

namespace libint2 {

template <>
struct is_vector<simd::VectorSSEFloat> {
  static const bool value = true;
};

template <>
struct vector_traits<simd::VectorSSEFloat> {
  typedef float scalar_type;
  static const size_t extent = 4;
};

}  // namespace libint2

#endif  // SSE-only

#ifdef __AVX__

namespace libint2 {
namespace simd {

/// SIMD vector of 4 double-precision floating-point real numbers, operations
/// on which use AVX instructions
struct VectorAVXDouble {
  typedef double T;
  __m256d d;

  /// creates a vector of default-initialized values.
  VectorAVXDouble() {}

  /// initializes all elements to the same value
  VectorAVXDouble(T a) { d = _mm256_set_pd(a, a, a, a); }

  /// creates a vector of values initialized by an ordinary static-sized array
  VectorAVXDouble(T (&a)[4]) { d = _mm256_loadu_pd(&a[0]); }

  /// creates a vector initialized by the given elements, a0 in element 0
  VectorAVXDouble(T a0, T a1, T a2, T a3) { d = _mm256_set_pd(a3, a2, a1, a0); }

  /// converts a 256-bit AVX double vector type to VectorAVXDouble
  VectorAVXDouble(__m256d a) { d = a; }

  VectorAVXDouble& operator=(T a) {
    d = _mm256_set_pd(a, a, a, a);
    return *this;
  }

  VectorAVXDouble& operator+=(VectorAVXDouble a) {
    d = _mm256_add_pd(d, a.d);
    return *this;
  }

  VectorAVXDouble& operator-=(VectorAVXDouble a) {
    d = _mm256_sub_pd(d, a.d);
    return *this;
  }

  VectorAVXDouble operator-() const {
    // TODO improve using bitflips
    static const __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
    VectorAVXDouble result;
    result.d = _mm256_mul_pd(this->d, minus_one);
    return result;
  }

#if LIBINT2_CPLUSPLUS_STD >= 2011
  explicit
#endif
  operator double() const {
    double d0[4];
    ::memcpy(&(d0[0]), &d, sizeof(__m256d));
    // an unaligned store segfaults on OS X:
    // _mm256_storeu_pd(&d0[0], d);
    // the aligned alternative requires C++11 alignas, but efficiency should
    // not matter here:
    // alignas(__m256d) double d0[4];
    // _mm256_store_pd(&(d0[0]), d);
    return d0[0];
  }

  /// implicit conversion to the underlying 256-bit AVX vector type
  operator __m256d() const { return d; }

  /// loads a to this
  void load(T const* a) { d = _mm256_loadu_pd(a); }
  /// loads aligned a to this
  void load_aligned(T const* a) { d = _mm256_load_pd(a); }
  /// writes this to a
  void convert(T* a) const { _mm256_storeu_pd(&a[0], d); }
  /// writes this to aligned a
  void convert_aligned(T* a) const { _mm256_store_pd(&a[0], d); }
};
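
// Illustrative usage sketch: four double lanes per vector; the int overloads
// below short-circuit multiplication by 1 and addition of 0.
//
//   libint2::simd::VectorAVXDouble q(1.0, 2.0, 3.0, 4.0);
//   libint2::simd::VectorAVXDouble r = 2 * q + 1;  // r = {3, 5, 7, 9}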

inline VectorAVXDouble operator*(double a, VectorAVXDouble b) {
  VectorAVXDouble c;
  VectorAVXDouble _a(a);
  c.d = _mm256_mul_pd(_a.d, b.d);
  return c;
}

inline VectorAVXDouble operator*(VectorAVXDouble a, double b) {
  VectorAVXDouble c;
  VectorAVXDouble _b(b);
  c.d = _mm256_mul_pd(a.d, _b.d);
  return c;
}

inline VectorAVXDouble operator*(int a, VectorAVXDouble b) {
  if (a == 1)
    return b;
  else {
    VectorAVXDouble c;
    VectorAVXDouble _a((double)a);
    c.d = _mm256_mul_pd(_a.d, b.d);
    return c;
  }
}

inline VectorAVXDouble operator*(VectorAVXDouble a, int b) {
  if (b == 1)
    return a;
  else {
    VectorAVXDouble c;
    VectorAVXDouble _b((double)b);
    c.d = _mm256_mul_pd(a.d, _b.d);
    return c;
  }
}

inline VectorAVXDouble operator*(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_mul_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_add_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator+(int a, VectorAVXDouble b) {
  if (a == 0) {
    return b;
  } else {
    VectorAVXDouble c;
    VectorAVXDouble _a(static_cast<double>(a));
    c.d = _mm256_add_pd(_a.d, b.d);
    return c;
  }
}

inline VectorAVXDouble operator+(VectorAVXDouble a, int b) {
  if (b == 0) {
    return a;
  } else {
    VectorAVXDouble c;
    VectorAVXDouble _b(static_cast<double>(b));
    c.d = _mm256_add_pd(a.d, _b.d);
    return c;
  }
}

inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_sub_pd(a.d, b.d);
  return c;
}

inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
  VectorAVXDouble c;
  c.d = _mm256_div_pd(a.d, b.d);
  return c;
}

#if defined(__FMA__)
inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b,
                                VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
  return d;
}
inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b,
                                 VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b,
                                VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_macc_pd(a.d, b.d, c.d);
  return d;
}
inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b,
                                 VectorAVXDouble c) {
  VectorAVXDouble d;
  d.d = _mm256_msub_pd(a.d, b.d, c.d);
  return d;
}
#endif

/// Horizontal add.
/// @return a[0] + a[1] + a[2] + a[3]
inline double horizontal_add(VectorAVXDouble const& a) {
  __m256d s = _mm256_hadd_pd(a, a);
  return ((double*)&s)[0] + ((double*)&s)[2];
  // Agner Fog's version:
  // __m256d t1 = _mm256_hadd_pd(a,a);
  // __m128d t2 = _mm256_extractf128_pd(t1,1);
  // __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
  // return _mm_cvtsd_f64(t3);
}

/// Horizontal add of a pair of vectors.
/// @return {a[0]+a[1]+a[2]+a[3], b[0]+b[1]+b[2]+b[3]}
inline VectorSSEDouble horizontal_add(VectorAVXDouble const& a,
                                      VectorAVXDouble const& b) {
  // solution from
  // http://stackoverflow.com/questions/9775538/fastest-way-to-do-horizontal-vector-sum-with-avx-instructions
  __m256d sum = _mm256_hadd_pd(a, b);
  // extract upper 128 bits of result
  __m128d sum_high = _mm256_extractf128_pd(sum, 1);
  // add upper 128 bits of sum to its lower 128 bits
  return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
}
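
// Illustrative sketch: the pairwise overload reduces two AVX vectors in one
// call and returns both sums in a single SSE vector.
//
//   VectorAVXDouble u(1.0, 2.0, 3.0, 4.0), v(1.0, 1.0, 1.0, 1.0);
//   VectorSSEDouble s = horizontal_add(u, v);  // s = {10.0, 4.0}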

/// Horizontal add of a quartet of vectors.
/// @return {a[0]+a[1]+a[2]+a[3], b[0]+b[1]+b[2]+b[3],
///          c[0]+c[1]+c[2]+c[3], d[0]+d[1]+d[2]+d[3]}
inline VectorAVXDouble horizontal_add(VectorAVXDouble const& a,
                                      VectorAVXDouble const& b,
                                      VectorAVXDouble const& c,
                                      VectorAVXDouble const& d) {
  // solution from
  // http://stackoverflow.com/questions/10833234/4-horizontal-double-precision-sums-in-one-go-with-avx?lq=1
  // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
  __m256d sumab = _mm256_hadd_pd(a, b);
  // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
  __m256d sumcd = _mm256_hadd_pd(c, d);

  // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
  __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
  // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
  __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

  return _mm256_add_pd(perm, blend);
}
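
// Illustrative sketch: the quartet overload fills all four lanes of the
// result with independent reductions, amortizing the shuffle cost.
//
//   VectorAVXDouble s = horizontal_add(a, b, c, d);
//   // s = {sum of a's lanes, sum of b's, sum of c's, sum of d's}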

inline VectorAVXDouble exp(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_exp_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::exp(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble sqrt(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_sqrt_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble erf(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_erf_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erf(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}
inline VectorAVXDouble erfc(VectorAVXDouble a) {
#if HAVE_INTEL_SVML
  VectorAVXDouble result;
  result.d = _mm256_erfc_pd(a.d);
#else
  double a_d[4];
  a.convert(a_d);
  for (int i = 0; i < 4; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorAVXDouble result(a_d);
#endif
  return result;
}

/// SIMD vector of 8 single-precision floating-point real numbers, operations
/// on which use AVX instructions
struct VectorAVXFloat {
  typedef float T;
  __m256 d;

  /// creates a vector of default-initialized values.
  VectorAVXFloat() {}

  /// initializes all elements to the same value
  VectorAVXFloat(T a) { d = _mm256_set_ps(a, a, a, a, a, a, a, a); }

  /// creates a vector of values initialized by an ordinary static-sized array
  VectorAVXFloat(T (&a)[8]) { d = _mm256_loadu_ps(&a[0]); }

  /// creates a vector initialized by the given elements, a0 in element 0
  VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7) {
    // N.B. _mm256_set_ps takes arguments from the highest element down, so
    // the order must be reversed for a0 to land in element 0, consistent
    // with the other constructors in this file
    d = _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
  }

  VectorAVXFloat& operator=(T a) {
    d = _mm256_set_ps(a, a, a, a, a, a, a, a);
    return *this;
  }

  VectorAVXFloat& operator+=(VectorAVXFloat a) {
    d = _mm256_add_ps(d, a.d);
    return *this;
  }

  VectorAVXFloat& operator-=(VectorAVXFloat a) {
    d = _mm256_sub_ps(d, a.d);
    return *this;
  }

  VectorAVXFloat operator-() const {
    // TODO improve using bitflips
    static const __m256 minus_one =
        _mm256_set_ps(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
    VectorAVXFloat result;
    result.d = _mm256_mul_ps(this->d, minus_one);
    return result;
  }

  explicit operator float() const {
    float d0[8];
    ::memcpy(&(d0[0]), &d, sizeof(__m256));
    // an unaligned store segfaults on OS X:
    // _mm256_storeu_ps(&d0[0], d);
    // the aligned alternative requires C++11 alignas, but efficiency should
    // not matter here:
    // alignas(__m256) float d0[8];
    // _mm256_store_ps(&(d0[0]), d);
    return d0[0];
  }

  explicit operator double() const {
    const float result_flt = this->operator float();
    return result_flt;
  }

  /// writes this to a
  void convert(T (&a)[8]) const { _mm256_storeu_ps(&a[0], d); }
};
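
// Illustrative usage sketch: eight float lanes per vector; note that the
// double overloads of operator* below narrow the scalar to float.
//
//   libint2::simd::VectorAVXFloat w(1.0f);        // all eight lanes = 1.0f
//   libint2::simd::VectorAVXFloat v = 0.5 * w;    // each lane = 0.5f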

// narrows a!
inline VectorAVXFloat operator*(double a, VectorAVXFloat b) {
  VectorAVXFloat c;
  VectorAVXFloat _a((float)a);
  c.d = _mm256_mul_ps(_a.d, b.d);
  return c;
}

// narrows b!
inline VectorAVXFloat operator*(VectorAVXFloat a, double b) {
  VectorAVXFloat c;
  VectorAVXFloat _b((float)b);
  c.d = _mm256_mul_ps(a.d, _b.d);
  return c;
}

inline VectorAVXFloat operator*(int a, VectorAVXFloat b) {
  if (a == 1)
    return b;
  else {
    VectorAVXFloat c;
    VectorAVXFloat _a((float)a);
    c.d = _mm256_mul_ps(_a.d, b.d);
    return c;
  }
}

inline VectorAVXFloat operator*(VectorAVXFloat a, int b) {
  if (b == 1)
    return a;
  else {
    VectorAVXFloat c;
    VectorAVXFloat _b((float)b);
    c.d = _mm256_mul_ps(a.d, _b.d);
    return c;
  }
}

inline VectorAVXFloat operator*(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_mul_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator+(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_add_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator-(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_sub_ps(a.d, b.d);
  return c;
}

inline VectorAVXFloat operator/(VectorAVXFloat a, VectorAVXFloat b) {
  VectorAVXFloat c;
  c.d = _mm256_div_ps(a.d, b.d);
  return c;
}

#if defined(__FMA__)
inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b,
                               VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_fmadd_ps(a.d, b.d, c.d);
  return d;
}
inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b,
                                VectorAVXFloat c) {
  VectorAVXFloat d;
  d.d = _mm256_fmsub_ps(a.d, b.d, c.d);
  return d;
}
#elif defined(__FMA4__)
inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b,
                               VectorAVXFloat c) {
  VectorAVXFloat d;
  // N.B. the FMA4 multiply-accumulate intrinsic is _mm256_macc_ps;
  // _mm256_facc_ps does not exist
  d.d = _mm256_macc_ps(a.d, b.d, c.d);
  return d;
}
inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b,
                                VectorAVXFloat c) {
  VectorAVXFloat d;
  // N.B. the FMA4 multiply-subtract intrinsic is _mm256_msub_ps;
  // _mm256_fsub_ps does not exist
  d.d = _mm256_msub_ps(a.d, b.d, c.d);
  return d;
}
#endif

inline VectorAVXFloat exp(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_exp_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = std::exp(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat sqrt(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_sqrt_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = std::sqrt(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat erf(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_erf_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::erf(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}
inline VectorAVXFloat erfc(VectorAVXFloat a) {
#if HAVE_INTEL_SVML
  VectorAVXFloat result;
  result.d = _mm256_erfc_ps(a.d);
#else
  float a_d[8];
  a.convert(a_d);
  for (int i = 0; i < 8; ++i) a_d[i] = ::erfc(a_d[i]);
  VectorAVXFloat result(a_d);
#endif
  return result;
}

}  // namespace simd
}  // namespace libint2

inline std::ostream& operator<<(std::ostream& os,
                                libint2::simd::VectorAVXDouble const& a) {
  double ad[4];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
  return os;
}

inline std::ostream& operator<<(std::ostream& os,
                                libint2::simd::VectorAVXFloat const& a) {
  float ad[8];
  a.convert(ad);
  os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << ","
     << ad[4] << "," << ad[5] << "," << ad[6] << "," << ad[7] << "}";
  return os;
}

namespace libint2 {

template <>
struct is_vector<simd::VectorAVXDouble> {
  static const bool value = true;
};

template <>
struct vector_traits<simd::VectorAVXDouble> {
  typedef double scalar_type;
  static const size_t extent = 4;
};

}  // namespace libint2

#endif  // AVX-only

#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
#include <vectorclass.h>
#endif

#endif  // header guard