LIBINT 2.7.2
vector_x86.h
1/*
2 * Copyright (C) 2004-2021 Edward F. Valeev
3 *
4 * This file is part of Libint.
5 *
6 * Libint is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Libint is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with Libint. If not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20
21#ifndef _libint2_src_lib_libint_vectorx86_h_
22#define _libint2_src_lib_libint_vectorx86_h_
23
24#include <cstring>
25#include <cmath>
26#include <iostream>
27
28#include <libint2/util/cxxstd.h>
29#include <libint2/util/type_traits.h>
30
31#if defined(__SSE2__) || defined(__SSE__) || defined(__AVX__)
32# include <x86intrin.h>
33#endif
34
35#ifdef __SSE2__
36
37namespace libint2 { namespace simd {
38
 43  struct VectorSSEDouble {
 44
45 typedef double T;
46 __m128d d;
47
 51      VectorSSEDouble() {}
 52
 56      VectorSSEDouble(T a) {
57 d = _mm_set_pd(a, a);
58 }
59
63 VectorSSEDouble(T (&a)[2]) {
64 d = _mm_loadu_pd(&a[0]);
65 }
66
70 VectorSSEDouble(T a0, T a1) {
71 d = _mm_set_pd(a1, a0);
72 }
73
77 VectorSSEDouble(__m128d a) {
78 d = a;
79 }
80
81 VectorSSEDouble& operator=(T a) {
82 d = _mm_set_pd(a, a);
83 return *this;
84 }
85
86 VectorSSEDouble& operator+=(VectorSSEDouble a) {
87 d = _mm_add_pd(d, a.d);
88 return *this;
89 }
90
91 VectorSSEDouble& operator-=(VectorSSEDouble a) {
92 d = _mm_sub_pd(d, a.d);
93 return *this;
94 }
95
96 VectorSSEDouble operator-() const {
97 // TODO improve using bitflips
 98        const static __m128d minus_one = _mm_set_pd(-1.0, -1.0);
99 VectorSSEDouble result;
100 result.d = _mm_mul_pd(this->d, minus_one);
101 return result;
102 }
103
104#if LIBINT2_CPLUSPLUS_STD >= 2011
105 explicit
106#endif
107 operator double() const {
108 double d0[2];
109 ::memcpy(&(d0[0]), &d, sizeof(__m128d));
110 // this segfaults on OS X
111 //_mm_storeu_pd(&(d0[0]), d);
112// // aligned alternative requires C++11's alignas, but efficiency should not matter here
113// alignas(__m128d) double d0[2];
114// _mm_store_pd(&(d0[0]), d);
115 return d0[0];
116 }
117
119 operator __m128d() const {
120 return d;
121 }
122
124 void load(T const* a) {
125 d = _mm_loadu_pd(a);
126 }
129 void load_aligned(T const* a) {
130 d = _mm_load_pd(a);
131 }
133 void convert(T* a) const {
134 _mm_storeu_pd(&a[0], d);
135 }
138 void convert_aligned(T* a) const {
139 _mm_store_pd(&a[0], d);
140 }
141
142 };
143
145 inline VectorSSEDouble operator*(double a, VectorSSEDouble b) {
146 VectorSSEDouble c;
147 VectorSSEDouble _a(a);
148 c.d = _mm_mul_pd(_a.d, b.d);
149 return c;
150 }
151
152 inline VectorSSEDouble operator*(VectorSSEDouble a, double b) {
153 VectorSSEDouble c;
154 VectorSSEDouble _b(b);
155 c.d = _mm_mul_pd(a.d, _b.d);
156 return c;
157 }
158
159 inline VectorSSEDouble operator*(int a, VectorSSEDouble b) {
160 if (a == 1)
161 return b;
162 else {
163 VectorSSEDouble c;
164 VectorSSEDouble _a((double)a);
165 c.d = _mm_mul_pd(_a.d, b.d);
166 return c;
167 }
168 }
169
170 inline VectorSSEDouble operator*(VectorSSEDouble a, int b) {
171 if (b == 1)
172 return a;
173 else {
174 VectorSSEDouble c;
175 VectorSSEDouble _b((double)b);
176 c.d = _mm_mul_pd(a.d, _b.d);
177 return c;
178 }
179 }
180
181 inline VectorSSEDouble operator*(VectorSSEDouble a, VectorSSEDouble b) {
182 VectorSSEDouble c;
183 c.d = _mm_mul_pd(a.d, b.d);
184 return c;
185 }
186
187 inline VectorSSEDouble operator+(VectorSSEDouble a, VectorSSEDouble b) {
188 VectorSSEDouble c;
189 c.d = _mm_add_pd(a.d, b.d);
190 return c;
191 }
192
193 inline VectorSSEDouble operator-(VectorSSEDouble a, VectorSSEDouble b) {
194 VectorSSEDouble c;
195 c.d = _mm_sub_pd(a.d, b.d);
196 return c;
197 }
198
199 inline VectorSSEDouble operator/(VectorSSEDouble a, VectorSSEDouble b) {
200 VectorSSEDouble c;
201 c.d = _mm_div_pd(a.d, b.d);
202 return c;
203 }
204
205#if defined(__FMA__)
206 inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
207 VectorSSEDouble d;
208 d.d = _mm_fmadd_pd(a.d, b.d, c.d);
209 return d;
210 }
211 inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
212 VectorSSEDouble d;
213 d.d = _mm_fmsub_pd(a.d, b.d, c.d);
214 return d;
215 }
216#elif defined(__FMA4__)
217 inline VectorSSEDouble fma_plus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
218 VectorSSEDouble d;
219 d.d = _mm_macc_pd(a.d, b.d, c.d);
220 return d;
221 }
222 inline VectorSSEDouble fma_minus(VectorSSEDouble a, VectorSSEDouble b, VectorSSEDouble c) {
223 VectorSSEDouble d;
224 d.d = _mm_msub_pd(a.d, b.d, c.d);
225 return d;
226 }
227#endif
228
232 inline double horizontal_add (VectorSSEDouble const & a) {
233// Agner Fog's version
234#if defined(__SSE3__)
235 __m128d t1 = _mm_hadd_pd(a,a);
236 return _mm_cvtsd_f64(t1);
237#else // SSE2 only
238 __m128 t0 = _mm_castpd_ps(a);
239 __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0));
240 __m128d t2 = _mm_add_sd(a,t1);
241 return _mm_cvtsd_f64(t2);
242#endif
243 }
 244
 249  inline VectorSSEDouble horizontal_add (VectorSSEDouble const & a, VectorSSEDouble const & b) {
250#if defined(__SSE3__)
251 return _mm_hadd_pd(a, b);
 252#else // will be very inefficient without SSE3
 253    return VectorSSEDouble(horizontal_add(a), horizontal_add(b));
 254#endif
255 }
256
258
260 inline VectorSSEDouble exp(VectorSSEDouble a) {
261#if HAVE_INTEL_SVML
262 VectorSSEDouble result;
263 result.d = _mm_exp_pd(a.d);
264#else
265 double a_d[2]; a.convert(a_d);
266 for(int i=0; i<2; ++i) a_d[i] = std::exp(a_d[i]);
267 VectorSSEDouble result(a_d);
268#endif
269 return result;
270 }
271 inline VectorSSEDouble sqrt(VectorSSEDouble a) {
272#if HAVE_INTEL_SVML
273 VectorSSEDouble result;
274 result.d = _mm_sqrt_pd(a.d);
275#else
276 double a_d[2]; a.convert(a_d);
277 for(int i=0; i<2; ++i) a_d[i] = std::sqrt(a_d[i]);
278 VectorSSEDouble result(a_d);
279#endif
280 return result;
281 }
282 inline VectorSSEDouble erf(VectorSSEDouble a) {
283#if HAVE_INTEL_SVML
284 VectorSSEDouble result;
285 result.d = _mm_erf_pd(a.d);
286#else
287 double a_d[2]; a.convert(a_d);
288 for(int i=0; i<2; ++i) a_d[i] = ::erf(a_d[i]);
289 VectorSSEDouble result(a_d);
290#endif
291 return result;
292 }
293 inline VectorSSEDouble erfc(VectorSSEDouble a) {
294#if HAVE_INTEL_SVML
295 VectorSSEDouble result;
296 result.d = _mm_erfc_pd(a.d);
297#else
298 double a_d[2]; a.convert(a_d);
299 for(int i=0; i<2; ++i) a_d[i] = ::erfc(a_d[i]);
300 VectorSSEDouble result(a_d);
301#endif
302 return result;
303 }
305
306};}; // namespace libint2::simd
307
309inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEDouble a) {
310 double ad[2];
311 a.convert(ad);
312 os << "{" << ad[0] << "," << ad[1] << "}";
313 return os;
314}
316
317namespace libint2 {
318
320
321 template <>
322 struct is_vector<simd::VectorSSEDouble> {
323 static const bool value = true;
324 };
325
326 template <>
327 struct vector_traits<simd::VectorSSEDouble> {
328 typedef double scalar_type;
329 static const size_t extent = 2;
330 };
331
333
334} // namespace libint2
335
336#endif // SSE2-only
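
A minimal usage sketch of VectorSSEDouble (not part of the header). It assumes this header is reachable as <libint2/util/vector_x86.h> and that the translation unit is compiled with SSE2 enabled; both are assumptions about the build.

#include <iostream>
#include <libint2/util/vector_x86.h>  // assumed install path of this header

int main() {
  using libint2::simd::VectorSSEDouble;

  double xs[2] = {1.0, 2.0};
  VectorSSEDouble x(xs);               // load two doubles from an unaligned array
  VectorSSEDouble y(3.0, 4.0);         // element 0 = 3.0, element 1 = 4.0

  VectorSSEDouble z = 2 * x + y / x;   // elementwise: {2*1 + 3/1, 2*2 + 4/2} = {5, 6}
  std::cout << z << std::endl;         // prints {5,6}

  double dot = libint2::simd::horizontal_add(x * y);  // 1*3 + 2*4 = 11
  double out[2];
  z.convert(out);                      // write the two elements back to memory
  return dot == 11.0 ? 0 : 1;
}
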
337
338#ifdef __SSE__
339
340namespace libint2 { namespace simd {
341
 346  struct VectorSSEFloat {
 347
348 typedef float T;
349 __m128 d;
350
 354      VectorSSEFloat() {}
 355
 359      VectorSSEFloat(T a) {
360 d = _mm_set_ps(a, a, a, a);
361 }
362
366 VectorSSEFloat(T (&a)[4]) {
367 d = _mm_loadu_ps(&a[0]);
368 }
369
373 VectorSSEFloat(T a0, T a1, T a2, T a3) {
374 d = _mm_set_ps(a3, a2, a1, a0);
375 }
376
377 VectorSSEFloat& operator=(T a) {
378 d = _mm_set_ps(a, a, a, a);
379 return *this;
380 }
381
382 VectorSSEFloat& operator+=(VectorSSEFloat a) {
383 d = _mm_add_ps(d, a.d);
384 return *this;
385 }
386
387 VectorSSEFloat& operator-=(VectorSSEFloat a) {
388 d = _mm_sub_ps(d, a.d);
389 return *this;
390 }
391
392 VectorSSEFloat operator-() const {
393 // TODO improve using bitflips
 394        const static __m128 minus_one = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
395 VectorSSEFloat result;
396 result.d = _mm_mul_ps(this->d, minus_one);
397 return result;
398 }
399
400#if LIBINT2_CPLUSPLUS_STD >= 2011
401 explicit
402#endif
403 operator float() const {
404 float d0[4];
405 ::memcpy(&(d0[0]), &d, sizeof(__m128));
406 // this segfaults on OS X
407 //_mm_storeu_ps(&(d0[0]), d);
408// // aligned alternative requires C++11's alignas, but efficiency should not matter here
409// alignas(__m128) float d0[4];
410// _mm_store_ps(&(d0[0]), d);
411 return d0[0];
412 }
413
414#if LIBINT2_CPLUSPLUS_STD >= 2011
415 explicit
416#endif
417 operator double() const {
418 const float result_flt = this->operator float();
419 return result_flt;
420 }
421
423 operator __m128() const {
424 return d;
425 }
426
428 void load(T const* a) {
429 d = _mm_loadu_ps(a);
430 }
433 void load_aligned(T const* a) {
434 d = _mm_load_ps(a);
435 }
437 void convert(T* a) const {
438 _mm_storeu_ps(&a[0], d);
439 }
442 void convert_aligned(T* a) const {
443 _mm_store_ps(&a[0], d);
444 }
445 };
446
448 inline VectorSSEFloat operator*(float a, VectorSSEFloat b) {
449 VectorSSEFloat c;
450 VectorSSEFloat _a(a);
451 c.d = _mm_mul_ps(_a.d, b.d);
452 return c;
453 }
454
455 inline VectorSSEFloat operator*(VectorSSEFloat a, float b) {
456 VectorSSEFloat c;
457 VectorSSEFloat _b(b);
458 c.d = _mm_mul_ps(a.d, _b.d);
459 return c;
460 }
461
462 // narrows a!
463 inline VectorSSEFloat operator*(double a, VectorSSEFloat b) {
464 VectorSSEFloat c;
465 VectorSSEFloat _a((float)a);
466 c.d = _mm_mul_ps(_a.d, b.d);
467 return c;
468 }
469
470 // narrows b!
471 inline VectorSSEFloat operator*(VectorSSEFloat a, double b) {
472 VectorSSEFloat c;
473 VectorSSEFloat _b((float)b);
474 c.d = _mm_mul_ps(a.d, _b.d);
475 return c;
476 }
477
478
479 inline VectorSSEFloat operator*(int a, VectorSSEFloat b) {
480 if (a == 1)
481 return b;
482 else {
483 VectorSSEFloat c;
484 VectorSSEFloat _a((float)a);
485 c.d = _mm_mul_ps(_a.d, b.d);
486 return c;
487 }
488 }
489
490 inline VectorSSEFloat operator*(VectorSSEFloat a, int b) {
491 if (b == 1)
492 return a;
493 else {
494 VectorSSEFloat c;
495 VectorSSEFloat _b((float)b);
496 c.d = _mm_mul_ps(a.d, _b.d);
497 return c;
498 }
499 }
500
501 inline VectorSSEFloat operator*(VectorSSEFloat a, VectorSSEFloat b) {
502 VectorSSEFloat c;
503 c.d = _mm_mul_ps(a.d, b.d);
504 return c;
505 }
506
507 inline VectorSSEFloat operator+(VectorSSEFloat a, VectorSSEFloat b) {
508 VectorSSEFloat c;
509 c.d = _mm_add_ps(a.d, b.d);
510 return c;
511 }
512
513 inline VectorSSEFloat operator-(VectorSSEFloat a, VectorSSEFloat b) {
514 VectorSSEFloat c;
515 c.d = _mm_sub_ps(a.d, b.d);
516 return c;
517 }
518
519 inline VectorSSEFloat operator/(VectorSSEFloat a, VectorSSEFloat b) {
520 VectorSSEFloat c;
521 c.d = _mm_div_ps(a.d, b.d);
522 return c;
523 }
524
525#if defined(__FMA__)
526 inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
527 VectorSSEFloat d;
528 d.d = _mm_fmadd_ps(a.d, b.d, c.d);
529 return d;
530 }
531 inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
532 VectorSSEFloat d;
533 d.d = _mm_fmsub_ps(a.d, b.d, c.d);
534 return d;
535 }
536#elif defined(__FMA4__)
537 inline VectorSSEFloat fma_plus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
538 VectorSSEFloat d;
539 d.d = _mm_macc_ps(a.d, b.d, c.d);
540 return d;
541 }
542 inline VectorSSEFloat fma_minus(VectorSSEFloat a, VectorSSEFloat b, VectorSSEFloat c) {
543 VectorSSEFloat d;
544 d.d = _mm_msub_ps(a.d, b.d, c.d);
545 return d;
546 }
547#endif
548
550
552 inline VectorSSEFloat exp(VectorSSEFloat a) {
553#if HAVE_INTEL_SVML
554 VectorSSEFloat result;
555 result.d = _mm_exp_ps(a.d);
556#else
557 float a_d[4]; a.convert(a_d);
558 for(int i=0; i<4; ++i) a_d[i] = std::exp(a_d[i]);
559 VectorSSEFloat result(a_d);
560#endif
561 return result;
562 }
563 inline VectorSSEFloat sqrt(VectorSSEFloat a) {
564#if HAVE_INTEL_SVML
565 VectorSSEFloat result;
566 result.d = _mm_sqrt_ps(a.d);
567#else
568 float a_d[4]; a.convert(a_d);
569 for(int i=0; i<4; ++i) a_d[i] = std::sqrt(a_d[i]);
570 VectorSSEFloat result(a_d);
571#endif
572 return result;
573 }
574 inline VectorSSEFloat erf(VectorSSEFloat a) {
575#if HAVE_INTEL_SVML
576 VectorSSEFloat result;
577 result.d = _mm_erf_ps(a.d);
578#else
579 float a_d[4]; a.convert(a_d);
580 for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
581 VectorSSEFloat result(a_d);
582#endif
583 return result;
584 }
585 inline VectorSSEFloat erfc(VectorSSEFloat a) {
586#if HAVE_INTEL_SVML
587 VectorSSEFloat result;
588 result.d = _mm_erfc_ps(a.d);
589#else
590 float a_d[4]; a.convert(a_d);
591 for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
592 VectorSSEFloat result(a_d);
593#endif
594 return result;
595 }
597
598};}; // namespace libint2::simd
599
601inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorSSEFloat a) {
602 float ad[4];
603 a.convert(ad);
604 os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
605 return os;
606}
608
609namespace libint2 {
610
612
613 template <>
614 struct is_vector<simd::VectorSSEFloat> {
615 static const bool value = true;
616 };
617
618 template <>
619 struct vector_traits<simd::VectorSSEFloat> {
620 typedef float scalar_type;
621 static const size_t extent = 4;
622 };
623
625
626} // namespace libint2
627
628#endif // SSE-only
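
A similar sketch for VectorSSEFloat; the fma_plus overloads above exist only when the compiler defines __FMA__ or __FMA4__ (e.g. -mfma), so the sketch falls back to a separate multiply and add otherwise. Header path and build flags are assumptions, as before.

#include <iostream>
#include <libint2/util/vector_x86.h>  // assumed install path of this header

int main() {
  using libint2::simd::VectorSSEFloat;

  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  VectorSSEFloat a;
  a.load(data);                  // unaligned load of four floats
  VectorSSEFloat b(0.5f);        // broadcast 0.5f to all four lanes

#if defined(__FMA__) || defined(__FMA4__)
  VectorSSEFloat r = libint2::simd::fma_plus(a, b, a);  // a*b + a as one fused operation
#else
  VectorSSEFloat r = a * b + a;                         // same value, two rounded operations
#endif

  std::cout << r << std::endl;   // prints {1.5,3,4.5,6}
  return 0;
}
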
629
630#ifdef __AVX__
631
632namespace libint2 { namespace simd {
633
 639  struct VectorAVXDouble {
 640
641 typedef double T;
642 __m256d d;
643
 647      VectorAVXDouble() {}
 648
 652      VectorAVXDouble(T a) {
653 d = _mm256_set_pd(a, a, a, a);
654 }
655
659 VectorAVXDouble(T (&a)[4]) {
660 d = _mm256_loadu_pd(&a[0]);
661 }
662
666 VectorAVXDouble(T a0, T a1, T a2, T a3) {
667 d = _mm256_set_pd(a3, a2, a1, a0);
668 }
669
673 VectorAVXDouble(__m256d a) {
674 d = a;
675 }
676
677 VectorAVXDouble& operator=(T a) {
678 d = _mm256_set_pd(a, a, a, a);
679 return *this;
680 }
681
682 VectorAVXDouble& operator+=(VectorAVXDouble a) {
683 d = _mm256_add_pd(d, a.d);
684 return *this;
685 }
686
687 VectorAVXDouble& operator-=(VectorAVXDouble a) {
688 d = _mm256_sub_pd(d, a.d);
689 return *this;
690 }
691
692 VectorAVXDouble operator-() const {
693 // TODO improve using bitflips
 694        const static __m256d minus_one = _mm256_set_pd(-1.0, -1.0, -1.0, -1.0);
695 VectorAVXDouble result;
696 result.d = _mm256_mul_pd(this->d, minus_one);
697 return result;
698 }
699
700#if LIBINT2_CPLUSPLUS_STD >= 2011
701 explicit
702#endif
703 operator double() const {
704 double d0[4];
705 ::memcpy(&(d0[0]), &d, sizeof(__m256d));
706 // this segfaults on OS X
707// _mm256_storeu_pd(&d0[0], d);
708// // aligned alternative requires C++11's alignas, but efficiency should not matter here
709// // alignas(__m256d) double d0[4];
710// // _mm256_store_pd(&(d0[0]), d);
711 return d0[0];
712 }
713
715 operator __m256d() const {
716 return d;
717 }
718
720 void load(T const* a) {
721 d = _mm256_loadu_pd(a);
722 }
725 void load_aligned(T const* a) {
726 d = _mm256_load_pd(a);
727 }
729 void convert(T* a) const {
730 _mm256_storeu_pd(&a[0], d);
731 }
734 void convert_aligned(T* a) const {
735 _mm256_store_pd(&a[0], d);
736 }
737 };
738
740 inline VectorAVXDouble operator*(double a, VectorAVXDouble b) {
741 VectorAVXDouble c;
742 VectorAVXDouble _a(a);
743 c.d = _mm256_mul_pd(_a.d, b.d);
744 return c;
745 }
746
747 inline VectorAVXDouble operator*(VectorAVXDouble a, double b) {
748 VectorAVXDouble c;
749 VectorAVXDouble _b(b);
750 c.d = _mm256_mul_pd(a.d, _b.d);
751 return c;
752 }
753
754 inline VectorAVXDouble operator*(int a, VectorAVXDouble b) {
755 if (a == 1)
756 return b;
757 else {
758 VectorAVXDouble c;
759 VectorAVXDouble _a((double)a);
760 c.d = _mm256_mul_pd(_a.d, b.d);
761 return c;
762 }
763 }
764
765 inline VectorAVXDouble operator*(VectorAVXDouble a, int b) {
766 if (b == 1)
767 return a;
768 else {
769 VectorAVXDouble c;
770 VectorAVXDouble _b((double)b);
771 c.d = _mm256_mul_pd(a.d, _b.d);
772 return c;
773 }
774 }
775
776 inline VectorAVXDouble operator*(VectorAVXDouble a, VectorAVXDouble b) {
777 VectorAVXDouble c;
778 c.d = _mm256_mul_pd(a.d, b.d);
779 return c;
780 }
781
782 inline VectorAVXDouble operator+(VectorAVXDouble a, VectorAVXDouble b) {
783 VectorAVXDouble c;
784 c.d = _mm256_add_pd(a.d, b.d);
785 return c;
786 }
787
788 inline VectorAVXDouble operator+(int a, VectorAVXDouble b) {
789 if(a == 0){
790 return b;
791 }
792 else {
793 VectorAVXDouble c;
794 VectorAVXDouble _a = (static_cast<double>(a));
795 c.d = _mm256_add_pd(_a.d,b.d);
796 return c;
797 }
798 }
799
800
801 inline VectorAVXDouble operator+(VectorAVXDouble a, int b) {
802 if(b == 0){
803 return a;
804 }
805 else {
806 VectorAVXDouble c;
807 VectorAVXDouble _b = (static_cast<double>(b));
808 c.d = _mm256_add_pd(a.d,_b.d);
809 return c;
810 }
811 }
812
813
814 inline VectorAVXDouble operator-(VectorAVXDouble a, VectorAVXDouble b) {
815 VectorAVXDouble c;
816 c.d = _mm256_sub_pd(a.d, b.d);
817 return c;
818 }
819
820 inline VectorAVXDouble operator/(VectorAVXDouble a, VectorAVXDouble b) {
821 VectorAVXDouble c;
822 c.d = _mm256_div_pd(a.d, b.d);
823 return c;
824 }
825
826#if defined(__FMA__)
827 inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
828 VectorAVXDouble d;
829 d.d = _mm256_fmadd_pd(a.d, b.d, c.d);
830 return d;
831 }
832 inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
833 VectorAVXDouble d;
834 d.d = _mm256_fmsub_pd(a.d, b.d, c.d);
835 return d;
836 }
837#elif defined(__FMA4__)
838 inline VectorAVXDouble fma_plus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
839 VectorAVXDouble d;
840 d.d = _mm256_macc_pd(a.d, b.d, c.d);
841 return d;
842 }
843 inline VectorAVXDouble fma_minus(VectorAVXDouble a, VectorAVXDouble b, VectorAVXDouble c) {
844 VectorAVXDouble d;
845 d.d = _mm256_msub_pd(a.d, b.d, c.d);
846 return d;
847 }
848#endif
849
853 inline double horizontal_add (VectorAVXDouble const & a) {
854 __m256d s = _mm256_hadd_pd(a,a);
855 return ((double*)&s)[0] + ((double*)&s)[2];
856// Agner Fog's version
857// __m256d t1 = _mm256_hadd_pd(a,a);
858// __m128d t2 = _mm256_extractf128_pd(t1,1);
859// __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
860// return _mm_cvtsd_f64(t3);
861 }
 862
 867  inline __m128d horizontal_add (VectorAVXDouble const & a, VectorAVXDouble const & b) {
868 // solution from http://stackoverflow.com/questions/9775538/fastest-way-to-do-horizontal-vector-sum-with-avx-instructions
869 __m256d sum = _mm256_hadd_pd(a, b);
870 // extract upper 128 bits of result
871 __m128d sum_high = _mm256_extractf128_pd(sum, 1);
872 // add upper 128 bits of sum to its lower 128 bits
873 return _mm_add_pd(sum_high, _mm256_castpd256_pd128(sum));
874 }
 875
 882  inline VectorAVXDouble horizontal_add (VectorAVXDouble const & a,
883 VectorAVXDouble const & b,
884 VectorAVXDouble const & c,
885 VectorAVXDouble const & d) {
886 // solution from http://stackoverflow.com/questions/10833234/4-horizontal-double-precision-sums-in-one-go-with-avx?lq=1
887 // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
888 __m256d sumab = _mm256_hadd_pd(a, b);
889 // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
890 __m256d sumcd = _mm256_hadd_pd(c, d);
891
892 // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
893 __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
894 // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
895 __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);
896
897 return _mm256_add_pd(perm, blend);
898 }
899
901
903 inline VectorAVXDouble exp(VectorAVXDouble a) {
904#if HAVE_INTEL_SVML
905 VectorAVXDouble result;
906 result.d = _mm256_exp_pd(a.d);
907#else
908 double a_d[4]; a.convert(a_d);
909 for(int i=0; i<4; ++i) a_d[i] = ::exp(a_d[i]);
910 VectorAVXDouble result(a_d);
911#endif
912 return result;
913 }
914 inline VectorAVXDouble sqrt(VectorAVXDouble a) {
915#if HAVE_INTEL_SVML
916 VectorAVXDouble result;
917 result.d = _mm256_sqrt_pd(a.d);
918#else
919 double a_d[4]; a.convert(a_d);
920 for(int i=0; i<4; ++i) a_d[i] = ::sqrt(a_d[i]);
921 VectorAVXDouble result(a_d);
922#endif
923 return result;
924 }
925 inline VectorAVXDouble erf(VectorAVXDouble a) {
926#if HAVE_INTEL_SVML
927 VectorAVXDouble result;
928 result.d = _mm256_erf_pd(a.d);
929#else
930 double a_d[4]; a.convert(a_d);
931 for(int i=0; i<4; ++i) a_d[i] = ::erf(a_d[i]);
932 VectorAVXDouble result(a_d);
933#endif
934 return result;
935 }
936 inline VectorAVXDouble erfc(VectorAVXDouble a) {
937#if HAVE_INTEL_SVML
938 VectorAVXDouble result;
939 result.d = _mm256_erfc_pd(a.d);
940#else
941 double a_d[4]; a.convert(a_d);
942 for(int i=0; i<4; ++i) a_d[i] = ::erfc(a_d[i]);
943 VectorAVXDouble result(a_d);
944#endif
945 return result;
946 }
947
948
949
950
 956  struct VectorAVXFloat {
 957
958 typedef float T;
959 __m256 d;
960
 964      VectorAVXFloat() {}
 965
 969      VectorAVXFloat(T a) {
970 d = _mm256_set_ps(a, a, a, a, a, a, a, a);
971 }
972
976 VectorAVXFloat(T (&a)[8]) {
977 d = _mm256_loadu_ps(&a[0]);
978 }
979
983 VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7) {
 984        d = _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0); // _mm256_set_ps takes elements from highest to lowest, as in the other constructors
985 }
986
987 VectorAVXFloat& operator=(T a) {
988 d = _mm256_set_ps(a, a, a, a, a, a, a, a);
989 return *this;
990 }
991
992 VectorAVXFloat& operator+=(VectorAVXFloat a) {
993 d = _mm256_add_ps(d, a.d);
994 return *this;
995 }
996
997 VectorAVXFloat& operator-=(VectorAVXFloat a) {
998 d = _mm256_sub_ps(d, a.d);
999 return *this;
1000 }
1001
1002 VectorAVXFloat operator-() const {
1003 // TODO improve using bitflips
 1004        const static __m256 minus_one = _mm256_set_ps(-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0);
1005 VectorAVXFloat result;
1006 result.d = _mm256_mul_ps(this->d, minus_one);
1007 return result;
1008 }
1009
1010 explicit operator float() const {
1011 float d0[8];
1012 ::memcpy(&(d0[0]), &d, sizeof(__m256));
1013 // this segfaults on OS X
1014// _mm256_storeu_ps(&d0[0], d);
1015// // aligned alternative requires C++11's alignas, but efficiency should not matter here
1016// // alignas(__m256) float d0[8];
1017// // _mm256_store_ps(&(d0[0]), d);
1018 return d0[0];
1019 }
1020
1021 explicit operator double() const {
1022 const float result_flt = this->operator float();
1023 return result_flt;
1024 }
1025
1026 void convert(T(&a)[8]) const {
1027 _mm256_storeu_ps(&a[0], d);
1028 }
1029 };
1030
1032 inline VectorAVXFloat operator*(double a, VectorAVXFloat b) {
1033 VectorAVXFloat c;
1034 VectorAVXFloat _a(a);
1035 c.d = _mm256_mul_ps(_a.d, b.d);
1036 return c;
1037 }
1038
1039 inline VectorAVXFloat operator*(VectorAVXFloat a, double b) {
1040 VectorAVXFloat c;
1041 VectorAVXFloat _b(b);
1042 c.d = _mm256_mul_ps(a.d, _b.d);
1043 return c;
1044 }
1045
1046 inline VectorAVXFloat operator*(int a, VectorAVXFloat b) {
1047 if (a == 1)
1048 return b;
1049 else {
1050 VectorAVXFloat c;
1051 VectorAVXFloat _a((float)a);
1052 c.d = _mm256_mul_ps(_a.d, b.d);
1053 return c;
1054 }
1055 }
1056
1057 inline VectorAVXFloat operator*(VectorAVXFloat a, int b) {
1058 if (b == 1)
1059 return a;
1060 else {
1061 VectorAVXFloat c;
1062 VectorAVXFloat _b((float)b);
1063 c.d = _mm256_mul_ps(a.d, _b.d);
1064 return c;
1065 }
1066 }
1067
1068 inline VectorAVXFloat operator*(VectorAVXFloat a, VectorAVXFloat b) {
1069 VectorAVXFloat c;
1070 c.d = _mm256_mul_ps(a.d, b.d);
1071 return c;
1072 }
1073
1074 inline VectorAVXFloat operator+(VectorAVXFloat a, VectorAVXFloat b) {
1075 VectorAVXFloat c;
1076 c.d = _mm256_add_ps(a.d, b.d);
1077 return c;
1078 }
1079
1080 inline VectorAVXFloat operator-(VectorAVXFloat a, VectorAVXFloat b) {
1081 VectorAVXFloat c;
1082 c.d = _mm256_sub_ps(a.d, b.d);
1083 return c;
1084 }
1085
1086 inline VectorAVXFloat operator/(VectorAVXFloat a, VectorAVXFloat b) {
1087 VectorAVXFloat c;
1088 c.d = _mm256_div_ps(a.d, b.d);
1089 return c;
1090 }
1091
1092#if defined(__FMA__)
1093 inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
1094 VectorAVXFloat d;
1095 d.d = _mm256_fmadd_ps(a.d, b.d, c.d);
1096 return d;
1097 }
1098 inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
1099 VectorAVXFloat d;
1100 d.d = _mm256_fmsub_ps(a.d, b.d, c.d);
1101 return d;
1102 }
1103#elif defined(__FMA4__)
1104 inline VectorAVXFloat fma_plus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
1105 VectorAVXFloat d;
 1106      d.d = _mm256_macc_ps(a.d, b.d, c.d);
1107 return d;
1108 }
1109 inline VectorAVXFloat fma_minus(VectorAVXFloat a, VectorAVXFloat b, VectorAVXFloat c) {
1110 VectorAVXFloat d;
 1111      d.d = _mm256_msub_ps(a.d, b.d, c.d);
1112 return d;
1113 }
1114#endif
1115
1117
1119 inline VectorAVXFloat exp(VectorAVXFloat a) {
1120#if HAVE_INTEL_SVML
1121 VectorAVXFloat result;
1122 result.d = _mm256_exp_ps(a.d);
1123#else
1124 float a_d[8]; a.convert(a_d);
1125 for(int i=0; i<8; ++i) a_d[i] = ::exp(a_d[i]);
1126 VectorAVXFloat result(a_d);
1127#endif
1128 return result;
1129 }
1130 inline VectorAVXFloat sqrt(VectorAVXFloat a) {
1131#if HAVE_INTEL_SVML
1132 VectorAVXFloat result;
1133 result.d = _mm256_sqrt_ps(a.d);
1134#else
1135 float a_d[8]; a.convert(a_d);
1136 for(int i=0; i<8; ++i) a_d[i] = ::sqrt(a_d[i]);
1137 VectorAVXFloat result(a_d);
1138#endif
1139 return result;
1140 }
1141 inline VectorAVXFloat erf(VectorAVXFloat a) {
1142#if HAVE_INTEL_SVML
1143 VectorAVXFloat result;
1144 result.d = _mm256_erf_ps(a.d);
1145#else
1146 float a_d[8]; a.convert(a_d);
1147 for(int i=0; i<8; ++i) a_d[i] = ::erf(a_d[i]);
1148 VectorAVXFloat result(a_d);
1149#endif
1150 return result;
1151 }
1152 inline VectorAVXFloat erfc(VectorAVXFloat a) {
1153#if HAVE_INTEL_SVML
1154 VectorAVXFloat result;
1155 result.d = _mm256_erfc_ps(a.d);
1156#else
1157 float a_d[8]; a.convert(a_d);
1158 for(int i=0; i<8; ++i) a_d[i] = ::erfc(a_d[i]);
1159 VectorAVXFloat result(a_d);
1160#endif
1161 return result;
1162 }
1164
1165};}; // namespace libint2::simd
1166
1168inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorAVXDouble a) {
1169 double ad[4];
1170 a.convert(ad);
1171 os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "}";
1172 return os;
1173}
1174
1176inline std::ostream& operator<<(std::ostream& os, libint2::simd::VectorAVXFloat a) {
1177 float ad[8];
1178 a.convert(ad);
1179 os << "{" << ad[0] << "," << ad[1] << "," << ad[2] << "," << ad[3] << "," << ad[4]<< "," << ad[5]<< "," << ad[6]<< "," << ad[7] << "}";
1180 return os;
1181}
1183
1184namespace libint2 {
1185
1187
1188 template <>
1189 struct is_vector<simd::VectorAVXDouble> {
1190 static const bool value = true;
1191 };
1192
1193 template <>
1194 struct vector_traits<simd::VectorAVXDouble> {
1195 typedef double scalar_type;
1196 static const size_t extent = 4;
1197 };
1198
1200
1201} // namespace libint2
1202
1203#endif // AVX-only
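
The main additions in the AVX block are the batched horizontal_add overloads; the sketch below reduces one product and then four vectors in a single call (header path and an AVX-enabled build are again assumptions).

#include <iostream>
#include <libint2/util/vector_x86.h>  // assumed install path of this header

int main() {
  using libint2::simd::VectorAVXDouble;

  VectorAVXDouble a(1.0, 2.0, 3.0, 4.0);
  VectorAVXDouble w(0.25);                             // the same weight in every lane

  // one reduction: sum of the elementwise product
  double dot = libint2::simd::horizontal_add(a * w);   // 0.25*(1+2+3+4) = 2.5

  // four reductions at once: lane i of the result is the sum of the i-th argument
  VectorAVXDouble sums = libint2::simd::horizontal_add(a, a + 1, a * 2.0, -a);

  std::cout << dot << " " << sums << std::endl;        // prints 2.5 {10,14,20,-10}
  return 0;
}
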
1204
1205
1206#ifdef LIBINT2_HAVE_AGNER_VECTORCLASS
1207#include <vectorclass.h>
1208#endif
1209
1210#endif // header guard
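
The is_vector and vector_traits specializations defined above let generic code query the SIMD width at compile time (the primary templates live in libint2/util/type_traits.h). A minimal sketch, assuming C++11 and the same include path as above:

#include <iostream>
#include <libint2/util/vector_x86.h>  // assumed install path of this header

template <typename Vec>
void describe(const char* name) {
  using traits = libint2::vector_traits<Vec>;
  std::cout << name << ": " << traits::extent << " x "
            << 8 * sizeof(typename traits::scalar_type) << "-bit elements\n";
}

int main() {
#ifdef __SSE2__
  static_assert(libint2::is_vector<libint2::simd::VectorSSEDouble>::value,
                "VectorSSEDouble is a SIMD vector type");
  describe<libint2::simd::VectorSSEDouble>("VectorSSEDouble");  // 2 x 64-bit elements
#endif
#ifdef __AVX__
  describe<libint2::simd::VectorAVXDouble>("VectorAVXDouble");  // 4 x 64-bit elements
#endif
  return 0;
}
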
double horizontal_add(VectorSSEDouble const &a)
Horizontal add.
Definition: vector_x86.h:232
Defaults definitions for various parameters assumed by Libint.
Definition: algebra.cc:24
auto fma_plus(X x, Y y, Z z) -> decltype(x *y+z)
Definition: intrinsic_operations.h:36
SafePtr< CTimeEntity< typename ProductType< T, U >::result > > operator*(const SafePtr< CTimeEntity< T > > &A, const SafePtr< CTimeEntity< U > > &B)
Creates product A*B.
Definition: entity.h:280
auto fma_minus(X x, Y y, Z z) -> decltype(x *y-z)
Definition: intrinsic_operations.h:42
Definition: type_traits.h:29
SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructions...
Definition: vector_x86.h:639
void convert(T *a) const
writes this to a
Definition: vector_x86.h:729
VectorAVXDouble(__m256d a)
converts a 256-bit AVX double vector type to VectorAVXDouble
Definition: vector_x86.h:673
VectorAVXDouble(T a)
Initializes all elements to the same value.
Definition: vector_x86.h:652
VectorAVXDouble()
creates a vector of default-initialized values.
Definition: vector_x86.h:647
VectorAVXDouble(T(&a)[4])
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:659
void load(T const *a)
loads a to this
Definition: vector_x86.h:720
void convert_aligned(T *a) const
writes this to a
Definition: vector_x86.h:734
VectorAVXDouble(T a0, T a1, T a2, T a3)
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:666
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:725
SIMD vector of 8 single-precision floating-point real numbers, operations on which use AVX instructions...
Definition: vector_x86.h:956
VectorAVXFloat()
creates a vector of default-initialized values.
Definition: vector_x86.h:964
VectorAVXFloat(T(&a)[8])
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:976
VectorAVXFloat(T a)
Initializes all elements to the same value.
Definition: vector_x86.h:969
VectorAVXFloat(T a0, T a1, T a2, T a3, T a4, T a5, T a6, T a7)
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:983
SIMD vector of 2 double-precision floating-point real numbers, operations on which use SSE2 instructions...
Definition: vector_x86.h:43
VectorSSEDouble(T a)
Initializes all elements to the same value.
Definition: vector_x86.h:56
void convert(T *a) const
writes this to a
Definition: vector_x86.h:133
VectorSSEDouble(__m128d a)
converts a 128-bit SSE double vector type to VectorSSEDouble
Definition: vector_x86.h:77
VectorSSEDouble()
creates a vector of default-initialized values.
Definition: vector_x86.h:51
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:129
VectorSSEDouble(T(&a)[2])
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:63
VectorSSEDouble(T a0, T a1)
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:70
void convert_aligned(T *a) const
writes this to a
Definition: vector_x86.h:138
void load(T const *a)
loads a to this
Definition: vector_x86.h:124
SIMD vector of 4 single-precision floating-point real numbers, operations on which use SSE instructions...
Definition: vector_x86.h:346
VectorSSEFloat(T a)
Initializes all elements to the same value.
Definition: vector_x86.h:359
VectorSSEFloat()
creates a vector of default-initialized values.
Definition: vector_x86.h:354
void convert(T *a) const
writes this to a
Definition: vector_x86.h:437
void convert_aligned(T *a) const
writes this to a
Definition: vector_x86.h:442
VectorSSEFloat(T(&a)[4])
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:366
VectorSSEFloat(T a0, T a1, T a2, T a3)
creates a vector of values initialized by an ordinary static-sized array
Definition: vector_x86.h:373
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:433
void load(T const *a)
loads a to this
Definition: vector_x86.h:428
Definition: type_traits.h:34