// LIBINT 2.9.0
// boys.h
1/*
2 * Copyright (C) 2004-2024 Edward F. Valeev
3 *
4 * This file is part of Libint library.
5 *
6 * Libint library is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Libint library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with Libint library. If not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20
21#ifndef _libint2_src_lib_libint_boys_h_
22#define _libint2_src_lib_libint_boys_h_
23
24#if defined(__cplusplus)
25
26#include <libint2/util/vector.h>
27
28#include <algorithm>
29#include <cassert>
30#include <cmath>
31#include <cstdlib>
32#include <iostream>
33#include <limits>
34#include <memory>
35#include <mutex>
36#include <stdexcept>
37#include <type_traits>
38#include <vector>
39
40#ifdef _MSC_VER
41#define posix_memalign(p, a, s) \
42 (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
43#define posix_memfree(p) ((_aligned_free((p))))
44#else
45#define posix_memfree(p) ((free((p))))
46#endif
47
48// from now on at least C++11 is required by default
49#include <libint2/util/cxxstd.h>
50#if LIBINT2_CPLUSPLUS_STD < 2011
51#error "Libint2 C++ API requires C++11 support"
52#endif
53
54#include <libint2/boys_fwd.h>
55#include <libint2/initialize.h>
56#include <libint2/numeric.h>
57
58#if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later
59extern "C" void dgesv_(const int* n, const int* nrhs, double* A, const int* lda,
60 int* ipiv, double* b, const int* ldb, int* info);
61#endif
62
63namespace libint2 {
64
/// Holds tables of expensive-to-compute quantities (factorials, double
/// factorials, binomial coefficients, and simple rational constants) shared
/// by the Boys/Gm function engines below.
template <typename Real>
struct ExpensiveNumbers {
 public:
  /// Tabulates the requested quantities; a negative limit skips that table.
  /// @param ifac max n for which n! is tabulated in fac
  /// @param idf max i for which (i-1)!! is tabulated in df
  /// @param ibc max n for which binomial coefficients C(n,k) are tabulated
  ///            in bc
  ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) {
    if (ifac >= 0) {
      fac.resize(ifac + 1);
      fac[0] = 1.0;
      for (int i = 1; i <= ifac; i++) {
        fac[i] = i * fac[i - 1];
      }
    }

    if (idf >= 0) {
      df.resize(idf + 1);
      /* df[i] gives (i-1)!!, so that (-1)!! is defined... */
      df[0] = 1.0;
      if (idf >= 1) df[1] = 1.0;
      if (idf >= 2) df[2] = 1.0;
      for (int i = 3; i <= idf; i++) {
        df[i] = (i - 1) * df[i - 2];
      }
    }

    if (ibc >= 0) {
      // bc_ is the contiguous storage; bc[i] points into row i so that
      // bc[i][j] == C(i,j)
      bc_.resize((ibc + 1) * (ibc + 1));
      std::fill(bc_.begin(), bc_.end(), Real(0));
      bc.resize(ibc + 1);
      bc[0] = &bc_[0];
      for (int i = 1; i <= ibc; ++i) bc[i] = bc[i - 1] + (ibc + 1);

      for (int i = 0; i <= ibc; i++) bc[i][0] = 1.0;
      for (int i = 0; i <= ibc; i++)
        for (int j = 1; j <= i; ++j)
          bc[i][j] = bc[i][j - 1] * Real(i - j + 1) / Real(j);
    }

    // simple rational constants used by the upward/downward recursions
    for (int i = 0; i < 128; i++) {
      twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0));
      ihalf[i] = Real(i) - Real(0.5);
    }
  }

  std::vector<Real> fac;  // fac[n] = n!
  std::vector<Real> df;   // df[i] = (i-1)!!
  std::vector<Real*> bc;  // bc[n][k] = binomial coefficient C(n,k)

  // these quantitites are needed with indices <= mmax
  // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis
  // functions but need higher values for Yukawa integrals ...
  Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */
  Real ihalf[128]; /* i - 0.5, needed for upward recursion */

 private:
  std::vector<Real> bc_;  // contiguous storage behind bc
};
123
/// Computes the Boys function, F_m(T) = \int_0^1 u^{2m} exp(-T u^2) du,
/// by summing its MacLaurin series. Slow but accurate reference
/// implementation used to validate/seed the interpolation-based engines.
template <typename Real>
struct FmEval_Reference {
  /// Singleton factory.
  /// @note both arguments are ignored: this evaluator precomputes nothing,
  ///       hence one instance serves every (mmax, precision)
  static std::shared_ptr<const FmEval_Reference> instance(
      int /* mmax */, Real /* precision */) {
    // thread-safe per C++11 standard [6.7.4]
    static auto instance_ = std::make_shared<const FmEval_Reference>();

    return instance_;
  }

  /// computes a single value F_m(T) via the MacLaurin series
  /// @throws std::overflow_error when exp(-T) would underflow Real
  static Real eval(Real T, size_t m) {
    assert(m < 100);
    static const Real T_crit =
        std::numeric_limits<Real>::is_bounded
            ? -log(std::numeric_limits<Real>::min() * 100.5 / 2.)
            : Real(0);
    if (std::numeric_limits<Real>::is_bounded && T > T_crit)
      throw std::overflow_error(
          "FmEval_Reference<Real>::eval: Real lacks precision for the given "
          "value of argument T");
    static const Real half = Real(1) / 2;
    Real denom = (m + half);
    using std::exp;
    Real term = exp(-T) / (2 * denom);
    Real old_term = 0;
    Real sum = term;
    const Real epsilon = get_epsilon(T);
    const Real epsilon_divided_10 = epsilon / 10;
    do {
      denom += 1;
      old_term = term;
      term = old_term * T / denom;
      sum += term;
      // rel_error = term / sum , hence iterate until rel_error = epsilon
      // however, must ensure that contributions are decreasing to ensure that
      // omitted contributions are smaller than epsilon
    } while (term > sum * epsilon_divided_10 || old_term < term);

    return sum;
  }

  /// fills Fm[0 .. mmax] with F_m(T)
  static void eval(Real* Fm, Real T, size_t mmax) {
    // evaluate for mmax using MacLaurin series
    // it converges fastest for the largest m -> use it to compute Fmmax(T)
    // see JPC 94, 5564 (1990).
    for (size_t m = 0; m <= mmax; ++m) Fm[m] = eval(T, m);
    return;
  }
};
200
213template <typename Real>
215 static std::shared_ptr<const FmEval_Reference2> instance(
216 int /* mmax */, Real /* precision */) {
217 // thread-safe per C++11 standard [6.7.4]
218 static auto instance_ = std::make_shared<const FmEval_Reference2>();
219
220 return instance_;
221 }
222
229 static void eval(Real* Fm, Real t, size_t mmax) {
230 if (t < Real(117)) {
231 FmEval_Reference<Real>::eval(Fm, t, mmax);
232 } else {
233 const Real two_over_sqrt_pi{
234 1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727};
235 const Real K = 1 / two_over_sqrt_pi;
236
237 auto t2 = 2 * t;
238 auto et = exp(-t);
239 auto sqrt_t = sqrt(t);
240 Fm[0] = K * erf(sqrt_t) / sqrt_t;
241 if (mmax > 0)
242 for (size_t m = 0; m <= mmax - 1; m++) {
243 Fm[m + 1] = ((2 * m + 1) * Fm[m] - et) / (t2);
244 }
245 }
246 }
247};
248
252template <typename Real = double>
254#include <libint2/boys_cheb7.h>
255
256 static_assert(std::is_same<Real, double>::value,
257 "FmEval_Chebyshev7 only supports double as the real type");
258
259 static constexpr int ORDER = interpolation_order;
260 static constexpr int ORDERp1 = ORDER + 1;
261
262 static constexpr Real T_crit =
263 cheb_table_tmax;
265 static constexpr Real delta = cheb_table_delta;
266 static constexpr Real one_over_delta = 1 / delta;
267
268 int mmax;
270 Real* c; /* the Chebyshev coefficients table, T_crit*one_over_delta by
271 mmax*ORDERp1 */
272
273 public:
281 double precision = std::numeric_limits<double>::epsilon())
282 : mmax(m_max), numbers_(14) {
283#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
284 defined(_M_IX86)
285#if !defined(__AVX__) && defined(NDEBUG)
286 if (libint2::verbose()) {
287 static bool printed_performance_warning = false;
288 if (!printed_performance_warning) {
290 << "libint2::FmEval_Chebyshev7 on x86(-64) platforms needs AVX "
291 "support for best performance"
292 << std::endl;
293 printed_performance_warning = true;
294 }
295 }
296#endif
297#endif
298 if (precision < std::numeric_limits<double>::epsilon())
299 throw std::invalid_argument(
300 std::string(
301 "FmEval_Chebyshev7 does not support precision smaller than ") +
302 std::to_string(std::numeric_limits<double>::epsilon()));
303 if (mmax > cheb_table_mmax)
304 throw std::invalid_argument(
305 "FmEval_Chebyshev7::init() : requested mmax exceeds the "
306 "hard-coded mmax");
307 if (m_max >= 0) init_table();
308 }
310 if (mmax >= 0) {
311 posix_memfree(c);
312 }
313 }
314
  /// Singleton factory: returns a shared evaluator able to handle m <= m_max.
  /// If the current instance is too small it is replaced by a larger one;
  /// holders of the previous shared_ptr remain valid.
  static std::shared_ptr<const FmEval_Chebyshev7> instance(int m_max,
                                                          double = 0.0) {
    assert(m_max >= 0);
    // thread-safe per C++11 standard [6.7.4]
    static auto instance_ = std::make_shared<const FmEval_Chebyshev7>(m_max);

    // NOTE(review): the loop condition reads instance_ without holding mtx
    // while another thread may assign it under the lock - likely benign in
    // practice but not strictly data-race-free; confirm intended.
    while (instance_->max_m() < m_max) {
      static std::mutex mtx;
      std::lock_guard<std::mutex> lck(mtx);
      if (instance_->max_m() < m_max) {
        auto new_instance = std::make_shared<const FmEval_Chebyshev7>(m_max);
        instance_ = new_instance;
      }
    }

    return instance_;
  }

  /// @return the maximum index m for which this object can compute F_m(T)
  int max_m() const { return mmax; }
338
345 inline void eval(Real* Fm, Real x, int m_max) const {
346 // large T => use upward recursion
347 // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
348 if (x > T_crit) {
349 const double one_over_x = 1 / x;
350 Fm[0] = 0.88622692545275801365 *
351 sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
352 if (m_max == 0) return;
353 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit
354 // is small enough to guarantee full double precision
355 for (int i = 1; i <= m_max; i++)
356 Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
357 return;
358 }
359
360 // ---------------------------------------------
361 // small and intermediate arguments => interpolate Fm and (optional)
362 // downward recursion
363 // ---------------------------------------------
364 // which interval does this x fall into?
365 const Real x_over_delta = x * one_over_delta;
366 const int iv = int(x_over_delta); // the interval index
367 const Real xd =
368 x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5
369 const int m_min = 0;
370
371#if defined(__AVX__)
372 const auto x2 = xd * xd;
373 const auto x3 = x2 * xd;
374 const auto x4 = x2 * x2;
375 const auto x5 = x2 * x3;
376 const auto x6 = x3 * x3;
377 const auto x7 = x3 * x4;
378 libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3);
379 libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7);
380#endif // AVX
381
382 const Real* d =
383 c + (ORDERp1) * (iv * (mmax + 1) +
384 m_min); // ptr to the interpolation data for m=mmin
385 int m = m_min;
386#if defined(__AVX__)
387 if (m_max - m >= 3) {
388 const int unroll_size = 4;
389 const int m_fence = (m_max + 2 - unroll_size);
390 for (; m < m_fence; m += unroll_size, d += ORDERp1 * unroll_size) {
391 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v, d20v, d21v, d30v,
392 d31v;
393 d00v.load_aligned(d);
394 d01v.load_aligned((d + 4));
395 d10v.load_aligned(d + ORDERp1);
396 d11v.load_aligned((d + 4) + ORDERp1);
397 d20v.load_aligned(d + 2 * ORDERp1);
398 d21v.load_aligned((d + 4) + 2 * ORDERp1);
399 d30v.load_aligned(d + 3 * ORDERp1);
400 d31v.load_aligned((d + 4) + 3 * ORDERp1);
401 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
402 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
403 libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec;
404 libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec;
406 horizontal_add(fm0, fm1, fm2, fm3);
407 sum0123.convert(&Fm[m]);
408 }
409 } // unroll_size=4
410 if (m_max - m >= 1) {
411 const int unroll_size = 2;
412 const int m_fence = (m_max + 2 - unroll_size);
413 for (; m < m_fence; m += unroll_size, d += ORDERp1 * unroll_size) {
414 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v;
415 d00v.load_aligned(d);
416 d01v.load_aligned((d + 4));
417 d10v.load_aligned(d + ORDERp1);
418 d11v.load_aligned((d + 4) + ORDERp1);
419 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
420 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
421 libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
422 sum01.convert(&Fm[m]);
423 }
424 } // unroll_size=2
425 { // no unrolling
426 for (; m <= m_max; ++m, d += ORDERp1) {
428 d0v.load_aligned(d);
429 d1v.load_aligned(d + 4);
430 Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec);
431 }
432 }
433#else // AVX not available
434 for (m = m_min; m <= m_max; ++m, d += ORDERp1) {
435 Fm[m] =
436 d[0] +
437 xd * (d[1] +
438 xd * (d[2] +
439 xd * (d[3] +
440 xd * (d[4] +
441 xd * (d[5] + xd * (d[6] + xd * (d[7])))))));
442
443 // // check against the reference value
444 // if (false) {
445 // double refvalue = FmEval_Reference2<double>::eval(x, m); //
446 // compute F(T) if (abs(refvalue - Fm[m]) > 1e-10) {
447 // std::cout << "T = " << x << " m = " << m << " cheb = "
448 // << Fm[m] << " ref = " << refvalue << std::endl;
449 // }
450 // }
451 }
452#endif
453
454 } // eval()
455
456 private:
457 void init_table() {
458 // get memory
459 void* result;
460 int status = posix_memalign(
461 &result, ORDERp1 * sizeof(Real),
462 (mmax + 1) * cheb_table_nintervals * ORDERp1 * sizeof(Real));
463 if (status != 0) {
464 if (status == EINVAL)
465 throw std::logic_error(
466 "FmEval_Chebyshev7::init() : posix_memalign failed, alignment must "
467 "be a power of 2 at least as large as sizeof(void *)");
468 if (status == ENOMEM) throw std::bad_alloc();
469 abort(); // should be unreachable
470 }
471 c = static_cast<Real*>(result);
472
473 // copy contents of static table into c
474 // need all intervals
475 for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
476 // but only values of m up to mmax
477 std::copy(cheb_table[iv], cheb_table[iv] + (mmax + 1) * ORDERp1,
478 c + (iv * (mmax + 1)) * ORDERp1);
479 }
480 }
481
482}; // FmEval_Chebyshev7
483
#if LIBINT2_CONSTEXPR_STATICS
// out-of-class definition of the constexpr static Chebyshev coefficient table
template <typename Real>
constexpr double FmEval_Chebyshev7<
    Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals]
                     [(FmEval_Chebyshev7<Real>::cheb_table_mmax + 1) *
                      (FmEval_Chebyshev7<Real>::interpolation_order + 1)];
#else
// clang needs an explicit specialization declaration to avoid warning
#ifdef __clang__
template <typename Real>
double FmEval_Chebyshev7<
    Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals]
                     [(FmEval_Chebyshev7<Real>::cheb_table_mmax + 1) *
                      (FmEval_Chebyshev7<Real>::interpolation_order + 1)];
#endif
#endif

#ifndef STATIC_OON
#define STATIC_OON
namespace {
// oon[i] = 1/i (oon[0] is an unused placeholder); used by the Taylor
// interpolation code below
const double oon[] = {0.0, 1.0, 1.0 / 2.0, 1.0 / 3.0,
                      1.0 / 4.0, 1.0 / 5.0, 1.0 / 6.0, 1.0 / 7.0,
                      1.0 / 8.0, 1.0 / 9.0, 1.0 / 10.0, 1.0 / 11.0};
}
#endif
509
519template <typename Real = double, int INTERPOLATION_ORDER = 7>
521 public:
522 static_assert(std::is_same<Real, double>::value,
523 "FmEval_Taylor only supports double as the real type");
524
525 static const int max_interp_order = 8;
526 static const bool INTERPOLATION_AND_RECURSION =
527 false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use
528 // interpolation only
529 const double soft_zero_;
530
533 FmEval_Taylor(unsigned int mmax,
534 Real precision = std::numeric_limits<Real>::epsilon())
535 : soft_zero_(1e-6),
536 cutoff_(precision),
537 numbers_(INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER)) {
538#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
539 defined(_M_IX86)
540#if !defined(__AVX__) && defined(NDEBUG)
541 if (libint2::verbose()) {
542 static bool printed_performance_warning = false;
543 if (!printed_performance_warning) {
545 << "libint2::FmEval_Taylor on x86(-64) platforms needs AVX support "
546 "for best performance"
547 << std::endl;
548 printed_performance_warning = true;
549 }
550 }
551#endif
552#endif
553
554 assert(mmax <= 63);
555
556 const double sqrt_pi = std::sqrt(M_PI);
557
558 /*---------------------------------------
559 We are doing Taylor interpolation with
560 n=TAYLOR_ORDER terms here:
561 error <= delT^n/(n+1)!
562 ---------------------------------------*/
563 delT_ = 2.0 * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1],
564 1.0 / INTERPOLATION_ORDER);
565 oodelT_ = 1.0 / delT_;
566 max_m_ = mmax + INTERPOLATION_ORDER;
567
568 T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/
569 max_T_ = 0;
570 /*--- Figure out T_crit for each m and put into the T_crit ---*/
571 for (int m = max_m_; m >= 0; --m) {
572 /*------------------------------------------
573 Damped Newton-Raphson method to solve
574 T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5)
575 The solution is the max T for which to do
576 the interpolation
577 ------------------------------------------*/
578 double T = -log(cutoff_);
579 const double egamma =
580 cutoff_ * sqrt_pi * numbers_.df[2 * m] / std::pow(2.0, m);
581 double T_new = T;
582 double func;
583 do {
584 const double damping_factor = 0.2;
585 T = T_new;
586 /* f(T) = the difference between LHS and RHS of the equation above */
587 func = std::pow(T, m - 0.5) * std::exp(-T) - egamma;
588 const double dfuncdT =
589 ((m - 0.5) * std::pow(T, m - 1.5) - std::pow(T, m - 0.5)) *
590 std::exp(-T);
591 /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to
592 * the left of the hump. Make a big step to the right. */
593 if (dfuncdT > 0.0) {
594 T_new *= 2.0;
595 } else {
596 /* damp the step */
597 double deltaT = -func / dfuncdT;
598 const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0;
599 const double max_deltaT = damping_factor * T;
600 if (std::fabs(deltaT) > max_deltaT) deltaT = sign_deltaT * max_deltaT;
601 T_new = T + deltaT;
602 }
603 if (T_new <= 0.0) {
604 T_new = T / 2.0;
605 }
606 } while (std::fabs(func / egamma) >= soft_zero_);
607 T_crit_[m] = T_new;
608 const int T_idx = (int)std::floor(T_new / delT_);
609 max_T_ = std::max(max_T_, T_idx);
610 }
611
612 // allocate the grid (see the comments below)
613 {
614 const int nrow = max_T_ + 1;
615 const int ncol = max_m_ + 1;
616 grid_ = new Real*[nrow];
617 grid_[0] = new Real[nrow * ncol];
618 // std::cout << "Allocated interpolation table of " << nrow * ncol << "
619 // reals" << std::endl;
620 for (int r = 1; r < nrow; ++r) grid_[r] = grid_[r - 1] + ncol;
621 }
622
623 /*-------------------------------------------------------
624 Tabulate the gamma function from t=delT to T_crit[m]:
625 1) include T=0 though the table is empty for T=0 since
626 Fm(0) is simple to compute
627 -------------------------------------------------------*/
628 /*--- do the mmax first ---*/
629 for (int T_idx = max_T_; T_idx >= 0; --T_idx) {
630 const double T = T_idx * delT_;
631 libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_);
632 }
633 }
634
636 delete[] T_crit_;
637 delete[] grid_[0];
638 delete[] grid_;
639 }
640
  /// Singleton factory: returns a shared evaluator able to handle the given
  /// mmax at the given precision, rebuilding it if a stronger one is needed.
  static std::shared_ptr<const FmEval_Taylor> instance(
      unsigned int mmax,
      Real precision = std::numeric_limits<Real>::epsilon()) {
    assert(mmax >= 0);  // NOTE(review): vacuous for unsigned mmax
    assert(precision >= 0);
    // thread-safe per C++11 standard [6.7.4]
    static auto instance_ =
        std::make_shared<const FmEval_Taylor>(mmax, precision);

    while (instance_->max_m() < mmax || instance_->precision() > precision) {
      static std::mutex mtx;
      std::lock_guard<std::mutex> lck(mtx);
      if (instance_->max_m() < mmax || instance_->precision() > precision) {
        auto new_instance =
            std::make_shared<const FmEval_Taylor>(mmax, precision);
        instance_ = new_instance;
      }
    }

    return instance_;
  }

  /// @return the maximum m supported by this evaluator
  /// NOTE(review): given max_m_ = mmax + INTERPOLATION_ORDER (set in the
  /// ctor) this evaluates to (ctor mmax) + 1 - confirm the +1 is intended
  int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; }
  /// @return the absolute precision the interpolation table was built for
  Real precision() const { return cutoff_; }
670
679 void eval(Real* Fm, Real T, int mmax) const {
680 const double sqrt_pio2 = 1.2533141373155002512;
681 const double two_T = 2.0 * T;
682
683 // stop recursion at mmin
684 const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0;
685 /*-------------------------------------
686 Compute Fm(T) from mmax down to mmin
687 -------------------------------------*/
688 const bool use_upward_recursion = true;
689 if (use_upward_recursion) {
690 // if (T > 30.0) {
691 if (T > T_crit_[0]) {
692 const double one_over_x = 1.0 / T;
693 Fm[0] =
694 0.88622692545275801365 *
695 sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
696 if (mmax == 0) return;
697 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit
698 // is <1e-15
699 for (int i = 1; i <= mmax; i++)
700 Fm[i] =
701 Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
702 return;
703 }
704 }
705
706 // since Tcrit grows with mmax, this condition only needs to be determined
707 // once
708 if (T > T_crit_[mmax]) {
709 double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5);
710 for (int m = mmax; m >= mmin; --m) {
711 /*--- Asymptotic formula ---*/
712 Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05;
713 pow_two_T_to_minusjp05 *= two_T;
714 }
715 } else {
716 const int T_ind = (int)(0.5 + T * oodelT_);
717 const Real h = T_ind * delT_ - T;
718 const Real* F_row = grid_[T_ind] + mmin;
719
720#if defined(__AVX__)
722 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
723 const double h2 = h * h * oon[2];
724 const double h3 = h2 * h * oon[3];
725 h0123 = libint2::simd::VectorAVXDouble(1.0, h, h2, h3);
726 if (INTERPOLATION_ORDER == 7) {
727 const double h4 = h3 * h * oon[4];
728 const double h5 = h4 * h * oon[5];
729 const double h6 = h5 * h * oon[6];
730 const double h7 = h6 * h * oon[7];
731 h4567 = libint2::simd::VectorAVXDouble(h4, h5, h6, h7);
732 }
733 }
734 // libint2::simd::VectorAVXDouble h0123(1.0);
735 // libint2::simd::VectorAVXDouble h4567(1.0);
736#endif
737
738 int m = mmin;
739 if (mmax - m >= 1) {
740 const int unroll_size = 2;
741 const int m_fence = (mmax + 2 - unroll_size);
742 for (; m < m_fence; m += unroll_size, F_row += unroll_size) {
743#if defined(__AVX__)
744 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
746 fr0_0123.load(F_row);
748 fr1_0123.load(F_row + 1);
750 horizontal_add(fr0_0123 * h0123, fr1_0123 * h0123);
751 if (INTERPOLATION_ORDER == 7) {
753 fr0_4567.load(F_row + 4);
755 fr1_4567.load(F_row + 5);
756 fm01 += horizontal_add(fr0_4567 * h4567, fr1_4567 * h4567);
757 }
758 fm01.convert(&Fm[m]);
759 } else {
760#endif
761 Real total0 = 0.0, total1 = 0.0;
762 for (int i = INTERPOLATION_ORDER; i >= 1; --i) {
763 total0 = oon[i] * h * (F_row[i] + total0);
764 total1 = oon[i] * h * (F_row[i + 1] + total1);
765 }
766 Fm[m] = F_row[0] + total0;
767 Fm[m + 1] = F_row[1] + total1;
768#if defined(__AVX__)
769 }
770#endif
771 }
772 } // unroll_size = 2
773 if (m <= mmax) { // unroll_size = 1
774#if defined(__AVX__)
775 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
777 fr0123.load(F_row);
778 if (INTERPOLATION_ORDER == 7) {
780 fr4567.load(F_row + 4);
781 // libint2::simd::VectorSSEDouble fm =
782 // horizontal_add(fr0123*h0123, fr4567*h4567); Fm[m]
783 // = horizontal_add(fm);
784 Fm[m] = horizontal_add(fr0123 * h0123 + fr4567 * h4567);
785 } else { // INTERPOLATION_ORDER == 3
786 Fm[m] = horizontal_add(fr0123 * h0123);
787 }
788 } else {
789#endif
790 Real total = 0.0;
791 for (int i = INTERPOLATION_ORDER; i >= 1; --i) {
792 total = oon[i] * h * (F_row[i] + total);
793 }
794 Fm[m] = F_row[0] + total;
795#if defined(__AVX__)
796 }
797#endif
798 } // unroll_size = 1
799
800 // check against the reference value
801 // if (false) {
802 // double refvalue = FmEval_Reference2<double>::eval(T, mmax);
803 // // compute F(T) with m=mmax if (abs(refvalue - Fm[mmax]) >
804 // 1e-14) {
805 // std::cout << "T = " << T << " m = " << mmax << " cheb = "
806 // << Fm[mmax] << " ref = " << refvalue << std::endl;
807 // }
808 // }
809
810 } // if T < T_crit
811
812 /*------------------------------------
813 And then do downward recursion in j
814 ------------------------------------*/
815 if (INTERPOLATION_AND_RECURSION && mmin > 0) {
816 const Real exp_mT = std::exp(-T);
817 for (int m = mmin - 1; m >= 0; --m) {
818 Fm[m] = (exp_mT + two_T * Fm[m + 1]) * numbers_.twoi1[m];
819 }
820 }
821 }
822
823 private:
824 Real** grid_; /* Table of "exact" Fm(T) values. Row index corresponds to
825 values of T (max_T+1 rows), column index to values
826 of m (max_m+1 columns) */
827 Real delT_; /* The step size for T, depends on cutoff */
828 Real oodelT_; /* 1.0 / delT_, see above */
829 Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */
830 int max_m_; /* Maximum value of m in the table, depends on cutoff
831 and the number of terms in Taylor interpolation */
832 int max_T_; /* Maximum index of T in the table, depends on cutoff
833 and m */
834 Real* T_crit_; /* Maximum T for each row, depends on cutoff;
835 for a given m and T_idx <= max_T_idx[m] use Taylor interpolation,
836 for a given m and T_idx > max_T_idx[m] use the asymptotic formula */
837
839
849 static double truncation_error(unsigned int m, double T) {
850 const double m2 = m * m;
851 const double m3 = m2 * m;
852 const double m4 = m2 * m2;
853 const double T2 = T * T;
854 const double T3 = T2 * T;
855 const double T4 = T2 * T2;
856 const double T5 = T2 * T3;
857
858 const double result =
859 exp(-T) *
860 (105 + 16 * m4 + 16 * m3 * (T - 8) - 30 * T + 12 * T2 - 8 * T3 +
861 16 * T4 + 8 * m2 * (43 - 9 * T + 2 * T2) +
862 4 * m * (-88 + 23 * T - 8 * T2 + 4 * T3)) /
863 (32 * T5);
864 return result;
865 }
875 static double truncation_error(double T) {
876 const double result = exp(-T) / (2 * T);
877 return result;
878 }
879};
880
namespace detail {
/// Compile-time 10^exp for integer exp (negative exponents accumulate 0.1
/// factors, so results are not exact powers of ten for exp < 0).
/// Single-return recursive form keeps this a valid C++11 constexpr function.
constexpr double pow10(long exp) {
  return exp == 0
             ? 1.
             : (exp < 0 ? pow10(exp + 1) * 0.1 : pow10(exp - 1) * 10.);
}
}  // namespace detail
887
891
900template <typename Real = double>
902 private:
903#include <libint2/tenno_cheb.h>
904
905 static_assert(std::is_same<Real, double>::value,
906 "TennoGmEval only supports double as the real type");
907
908 static const int mmin_ = -1;
909 static constexpr Real Tmax =
910 (1 << cheb_table_tmaxlog2);
912 static constexpr Real Umax = detail::pow10(
913 cheb_table_umaxlog10);
914 static constexpr Real Umin = detail::pow10(
915 cheb_table_uminlog10);
916 static constexpr Real one_over_Umin =
917 1. / Umin;
918 static constexpr std::size_t ORDERp1 = interpolation_order + 1;
919 static constexpr Real maxabsprecision =
920 1.4e-14;
921
922 public:
933 TennoGmEval(unsigned int mmax, Real precision = maxabsprecision)
934 : mmax_(mmax), precision_(precision), numbers_() {
935#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \
936 defined(_M_IX86)
937#if !defined(__AVX__) && defined(NDEBUG)
938 if (libint2::verbose()) {
939 static bool printed_performance_warning = false;
940 if (!printed_performance_warning) {
942 << "libint2::TennoGmEval on x86(-64) platforms needs AVX support "
943 "for best performance"
944 << std::endl;
945 printed_performance_warning = true;
946 }
947 }
948#endif
949#endif
950
951 if (precision_ < maxabsprecision)
952 throw std::invalid_argument(
953 std::string("TennoGmEval does not support precision smaller than ") +
954 std::to_string(maxabsprecision));
955
956 if (mmax > cheb_table_mmax)
957 throw std::invalid_argument(
958 "TennoGmEval::init() : requested mmax exceeds the "
959 "hard-coded mmax");
960 init_table();
961 }
962
  /// frees the interpolation table, if one was allocated
  ~TennoGmEval() {
    if (c_ != nullptr) posix_memfree(c_);
  }

  /// Singleton factory: returns a shared evaluator with max_m() >= m_max,
  /// rebuilding (and republishing) the instance if a larger one is needed
  static std::shared_ptr<const TennoGmEval> instance(int m_max, double = 0) {
    assert(m_max >= 0);
    // thread-safe per C++11 standard [6.7.4]
    static auto instance_ = std::make_shared<const TennoGmEval>(m_max);

    while (instance_->max_m() < m_max) {
      static std::mutex mtx;
      std::lock_guard<std::mutex> lck(mtx);
      if (instance_->max_m() < m_max) {
        auto new_instance = std::make_shared<const TennoGmEval>(m_max);
        instance_ = new_instance;
      }
    }

    return instance_;
  }

  /// @return the maximum value of m for which Gm can be computed
  unsigned int max_m() const { return mmax_; }
  /// @return the absolute precision with which Gm can be computed
  Real precision() const { return precision_; }
989
  /// computes the Yukawa-kernel core integrals Gm for m in [0, mmax]
  /// @param[out] Gm array of at least mmax+1 elements
  /// @param one_over_rho presumably 1/rho with rho the reduced exponent -
  ///        TODO confirm against callers
  /// @param T the argument, T >= 0
  /// @param zeta the Yukawa-kernel exponent; U = zeta^2/(4 rho)
  void eval_yukawa(Real* Gm, Real one_over_rho, Real T, size_t mmax,
                   Real zeta) const {
    assert(mmax <= mmax_);
    assert(T >= 0);
    const auto U = 0.25 * zeta * zeta * one_over_rho;
    assert(U >= 0);
    // outside the tabulated (T,U) domain fall back to upward recursion
    if (T > Tmax || U < Umin) {
      eval_urr<false>(Gm, T, U, /* zeta_over_two_rho = */ 0, mmax);
    } else {
      interpolate_Gm<false>(Gm, T, U, /* zeta_over_two_rho = */ 0, mmax);
    }
  }
  /// computes the Slater-kernel core integrals Gm for m in [0, mmax]
  /// @param[out] Gm array of at least mmax+1 elements
  /// @param zeta the Slater-kernel exponent; U = zeta^2/(4 rho)
  void eval_slater(Real* Gm, Real one_over_rho, Real T, size_t mmax,
                   Real zeta) const {
    assert(mmax <= mmax_);
    assert(T >= 0);
    const auto U = 0.25 * zeta * zeta * one_over_rho;
    assert(U > 0);  // integral factorizes into 2 overlaps for U = 0
    const auto zeta_over_two_rho = 0.5 * zeta * one_over_rho;
    if (T > Tmax) {
      eval_urr<true>(Gm, T, U, zeta_over_two_rho, mmax);
    } else {
      interpolate_Gm<true>(Gm, T, U, zeta_over_two_rho, mmax);
    }
  }
1038
1039 private:
  /// computes G_0(T,U) (and, if compute_Gm1 is true, also G_{-1}(T,U)) in
  /// closed form from scaled complementary error functions; the U=0, T=0,
  /// and very-large-U limits each get a dedicated branch
  /// @return {G_0, G_{-1}} (second element is 0 unless compute_Gm1)
  template <bool compute_Gm1>
  static inline std::tuple<Real, Real> eval_G0_and_maybe_Gm1(Real T, Real U) {
    assert(T >= 0);
    assert(U >= 0);
    const Real sqrtPi(1.7724538509055160272981674833411451827975494561224);

    Real G_m1 = 0;
    Real G_0 = 0;
    if (U ==
        0) {  // \sqrt{U} G_{-1} is finite, need to handle that case separately
      assert(!compute_Gm1);
      if (T < std::numeric_limits<Real>::epsilon()) {
        // T = U = 0 limit
        G_0 = 1;
      } else {
        const Real sqrt_T = sqrt(T);
        const Real sqrtPi_over_2 = sqrtPi * 0.5;
        G_0 = sqrtPi_over_2 * erf(sqrt_T) / sqrt_T;
      }
    } else if (T > std::numeric_limits<Real>::epsilon()) {  // U > 0
      const Real sqrt_U = sqrt(U);
      const Real sqrt_T = sqrt(T);
      const Real oo_sqrt_T = 1 / sqrt_T;
      const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
      const Real kappa = sqrt_U - sqrt_T;
      const Real lambda = sqrt_U + sqrt_T;
      const Real sqrtPi_over_4 = sqrtPi * 0.25;
      const Real pfac = sqrtPi_over_4;
      // exp(k^2 - T) * erfc(k) is the scaled erfc; numerically stable form
      const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa);
      // TODO when lambda * lambda - T exceeds 709 exp overflows, so need to
      // handle this more carefully
      if (lambda * lambda - T > 709)
        throw std::invalid_argument(
            "TennoGmEval::eval_G0_and_maybe_Gm1(): for large T and/or U "
            "current implementation overflows, need to implement asymptotic "
            "expansion");
      const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda);

      G_m1 = compute_Gm1 ? pfac * (erfc_k + erfc_l) * oo_sqrt_U : 0.0;
      G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
    } else {  // T = 0, U > 0
      if (U <= 709) {  // exp(<=709) is finite
        const Real exp_U = exp(U);
        const Real sqrt_U = sqrt(U);
        if (compute_Gm1) {
          const Real sqrtPi_over_2 = sqrtPi * 0.5;
          const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;

          G_m1 = exp_U * sqrtPi_over_2 * erfc(sqrt_U) * oo_sqrt_U;
        }
        G_0 = 1 - exp_U * sqrtPi * erfc(sqrt_U) * sqrt_U;
      } else {  // exp(>=710) is "infinity", handle larger values as a series
        const Real oo_U = Real{1} / U;
        const Real x = oo_U;
        const Real x2 = x * x;
        const Real x3 = x2 * x;
        const Real x4 = x2 * x2;
        const Real x5 = x3 * x2;
        const Real x6 = x3 * x3;
        if (compute_Gm1) {
          // 6th-order is sufficient
          // in Wolfram: Series[Exp[U]*(Sqrt[Pi]/2)*Erfc[Sqrt[U]]/Sqrt[U], {U,
          // Infinity, 6}]
          const Real cm1[] = {1. / 2, -1. / 4, 3. / 8,
                              -15. / 16, 105. / 32, -945. / 64};
          G_m1 = cm1[0] * x + cm1[1] * x2 + cm1[2] * x3 + cm1[3] * x4 +
                 cm1[4] * x5 + cm1[5] * x6;
        }
        // 6th-order is sufficient
        // in Wolfram: Series[1 - Exp[U]*Sqrt[Pi]*Erfc[Sqrt[U]]*Sqrt[U], {U,
        // Infinity, 6}]
        const Real c0[] = {1. / 2, -3. / 4, 15. / 8,
                           -105. / 16, 945. / 32, -10395. / 64};
        G_0 = c0[0] * x + c0[1] * x2 + c0[2] * x3 + c0[3] * x4 + c0[4] * x5 +
              c0[5] * x6;
      }
    }

    return std::make_tuple(G_0, G_m1);
  }
1124
  /// computes G_m for m in [0, mmax] by upward recursion starting from the
  /// closed-form G_{-1} and G_0 (valid for T > 0, U > 0); when Exp is true
  /// the output is the scaled difference (G_{m-1} - G_m) * zeta_over_two_rho
  /// (Slater kernel) instead of G_m itself (Yukawa kernel)
  template <bool Exp>
  static void eval_urr(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho,
                       size_t mmax) {
    assert(T > 0);
    assert(U > 0);

    const Real sqrt_U = sqrt(U);
    const Real sqrt_T = sqrt(T);
    const Real oo_sqrt_T = 1 / sqrt_T;
    const Real oo_sqrt_U = 1 / sqrt_U;
    const Real kappa = sqrt_U - sqrt_T;
    const Real lambda = sqrt_U + sqrt_T;
    const Real sqrtPi_over_4(
        0.44311346272637900682454187083528629569938736403060);
    const Real pfac = sqrtPi_over_4;
    // scaled complementary error functions (numerically stable)
    const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa);
    const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda);

    Real Gmm1;  // will contain G[m-1]
    Real Gm;    // will contain G[m]
    Real Gmp1;  // will contain G[m+1]
    Gmm1 = pfac * (erfc_k + erfc_l) * oo_sqrt_U;  // G_{-1}
    Gm = pfac * (erfc_k - erfc_l) * oo_sqrt_T;    // G_{0}
    if
#if __cplusplus >= 201703L
        constexpr
#endif
        (!Exp) {
      Gm_vec[0] = Gm;
    } else {
      Gm_vec[0] = (Gmm1 - Gm) * zeta_over_two_rho;
    }

    if (mmax > 0) {
      // first application of URR
      const Real oo_two_T = 0.5 / T;
      const Real two_U = 2.0 * U;
      const Real exp_mT = exp(-T);

      for (unsigned int m = 0, two_m_minus_1 = 1; m < mmax;
           ++m, two_m_minus_1 += 2) {
        // G_{m+1} = [ (2m+1) G_m + 2U G_{m-1} - exp(-T) ] / (2T)
        Gmp1 = oo_two_T * (two_m_minus_1 * Gm + two_U * Gmm1 - exp_mT);
        if
#if __cplusplus >= 201703L
            constexpr
#endif
            (!Exp) {
          Gm_vec[m + 1] = Gmp1;
        } else {
          Gm_vec[m + 1] = (Gm - Gmp1) * zeta_over_two_rho;
        }
        Gmm1 = Gm;
        Gm = Gmp1;
      }
    }

    return;
  }
1198
1211 template <bool Exp>
1212 void interpolate_Gm(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho,
1213 long mmax) const {
1214 assert(T >= 0);
1215 if (U > Umax) {
1216 throw std::invalid_argument(
1217 "TennoGmEval::eval_{yukawa,slater}() : arguments out of range, "
1218 "zeta*zeta*one_over_rho/4=" +
1219 std::to_string(U) + " cannot exceed " + std::to_string(Umax));
1220 }
1221 assert(U >= Umin && U <= Umax);
1222
1223 // maps x in [0,2] to [-1/2,1/2] in a linear fashion
1224 auto linear_map_02 = [](Real x) {
1225 assert(x >= 0 && x <= 2);
1226 return (x - 1) * 0.5;
1227 };
1228 // maps x in [a, 2*a] to [-1/2,1/2] in a logarithmic fashion
1229 auto log2_map = [](Real x, Real one_over_a) {
1230 return std::log2(x * one_over_a) - 0.5;
1231 };
1232 // maps x in [a, 10*a] to [-1/2,1/2] in a logarithmic fashion
1233 auto log10_map = [](Real x, Real one_over_a) {
1234 return std::log10(x * one_over_a) - 0.5;
1235 };
1236
1237 // which interval does this T fall into?
1238 const int Tint =
1239 T < 2 ? 0
1240 : int(std::floor(std::log2(T) *
1241 (1. - std::numeric_limits<double>::epsilon())));
1242 assert(Tint >= 0 && Tint < cheb_table_tmaxlog2 - cheb_table_tminlog2 + 1);
1243 // precomputed 1 / 2^K
1244 constexpr Real one_over_2K[] = {1, 0.5, 0.25, 0.125,
1245 0.0625, 0.03125, 0.015625, 0.0078125,
1246 0.00390625, .001953125};
1247 // which interval does this U fall into?
1248 const auto log10_U_over_Umin =
1249 std::log10(U * one_over_Umin) *
1250 (1. - std::numeric_limits<double>::epsilon());
1251 const int Uint = int(std::floor(log10_U_over_Umin));
1252 assert(Uint >= 0 && Uint < cheb_table_umaxlog10 - cheb_table_uminlog10);
1253 // precomputed 1 / 10^(cheb_table_uminlog10 + K)
1254 constexpr Real one_over_10K[] = {
1255 1. / detail::pow10(cheb_table_uminlog10),
1256 1. / detail::pow10(cheb_table_uminlog10 + 1),
1257 1. / detail::pow10(cheb_table_uminlog10 + 2),
1258 1. / detail::pow10(cheb_table_uminlog10 + 3),
1259 1. / detail::pow10(cheb_table_uminlog10 + 4),
1260 1. / detail::pow10(cheb_table_uminlog10 + 5),
1261 1. / detail::pow10(cheb_table_uminlog10 + 6),
1262 1. / detail::pow10(cheb_table_uminlog10 + 7),
1263 1. / detail::pow10(cheb_table_uminlog10 + 8),
1264 1. / detail::pow10(cheb_table_uminlog10 + 9)};
1265 const Real t =
1266 Tint == 0
1267 ? linear_map_02(T)
1268 : log2_map(T, one_over_2K[Tint]); // this ranges from -0.5 to 0.5
1269 const Real u =
1270 log10_map(U, one_over_10K[Uint]); // this ranges from -0.5 to 0.5
1271
1272 const int interval =
1273 Tint * (cheb_table_umaxlog10 - cheb_table_uminlog10) + Uint;
1274
1275#if defined(__AVX__)
1276 assert(ORDERp1 == 16);
1277 const auto t2 = t * t;
1278 const auto t3 = t2 * t;
1279 const auto t4 = t2 * t2;
1280 const auto t5 = t2 * t3;
1281 const auto t6 = t3 * t3;
1282 const auto t7 = t3 * t4;
1283 const auto t8 = t4 * t4;
1284 const auto t9 = t4 * t5;
1285 const auto t10 = t5 * t5;
1286 const auto t11 = t5 * t6;
1287 const auto t12 = t6 * t6;
1288 const auto t13 = t6 * t7;
1289 const auto t14 = t7 * t7;
1290 const auto t15 = t7 * t8;
1291 const auto u2 = u * u;
1292 const auto u3 = u2 * u;
1293 const auto u4 = u2 * u2;
1294 const auto u5 = u2 * u3;
1295 const auto u6 = u3 * u3;
1296 const auto u7 = u3 * u4;
1297 const auto u8 = u4 * u4;
1298 const auto u9 = u4 * u5;
1299 const auto u10 = u5 * u5;
1300 const auto u11 = u5 * u6;
1301 const auto u12 = u6 * u6;
1302 const auto u13 = u6 * u7;
1303 const auto u14 = u7 * u7;
1304 const auto u15 = u7 * u8;
1305 libint2::simd::VectorAVXDouble u0vec(1., u, u2, u3);
1306 libint2::simd::VectorAVXDouble u1vec(u4, u5, u6, u7);
1307 libint2::simd::VectorAVXDouble u2vec(u8, u9, u10, u11);
1308 libint2::simd::VectorAVXDouble u3vec(u12, u13, u14, u15);
1309#endif // AVX
1310
1311 Real Gmm1 = 0.0; // will track the previous value, only used for Exp==false
1312
1313 constexpr bool compute_Gmm10 = true;
1314 long mmin_interp;
1315 if (compute_Gmm10) {
1316 // precision of interpolation for m=-1,0 can be insufficient, just
1317 // evaluate explicitly
1318 Real G0;
1319 std::tie(Exp ? G0 : Gm_vec[0], Gmm1) = eval_G0_and_maybe_Gm1<Exp>(T, U);
1320 if
1321#if __cplusplus >= 201703L
1322 constexpr
1323#endif
1324 (Exp) {
1325 Gm_vec[0] = (Gmm1 - G0) * zeta_over_two_rho;
1326 Gmm1 = G0;
1327 }
1328 mmin_interp = 1;
1329 } else
1330 mmin_interp = Exp ? -1 : 0;
1331
1332 // now compute the rest
1333 for (long m = mmin_interp; m <= mmax; ++m) {
1334 const Real* c_tuint =
1335 c_ + (ORDERp1) * (ORDERp1) *
1336 (interval * (mmax_ - mmin_ + 1) +
1337 (m - mmin_)); // ptr to the interpolation data for m=mmin
1338#if defined(__AVX__)
1339 libint2::simd::VectorAVXDouble c00v, c01v, c02v, c03v, c10v, c11v, c12v,
1340 c13v, c20v, c21v, c22v, c23v, c30v, c31v, c32v, c33v, c40v, c41v,
1341 c42v, c43v, c50v, c51v, c52v, c53v, c60v, c61v, c62v, c63v, c70v,
1342 c71v, c72v, c73v;
1343 libint2::simd::VectorAVXDouble c80v, c81v, c82v, c83v, c90v, c91v, c92v,
1344 c93v, ca0v, ca1v, ca2v, ca3v, cb0v, cb1v, cb2v, cb3v, cc0v, cc1v,
1345 cc2v, cc3v, cd0v, cd1v, cd2v, cd3v, ce0v, ce1v, ce2v, ce3v, cf0v,
1346 cf1v, cf2v, cf3v;
1347 c00v.load_aligned(c_tuint);
1348 c01v.load_aligned((c_tuint + 4));
1349 c02v.load_aligned(c_tuint + 8);
1350 c03v.load_aligned((c_tuint + 12));
1351 libint2::simd::VectorAVXDouble t0vec(1, 1, 1, 1);
1353 t0vec * (c00v * u0vec + c01v * u1vec + c02v * u2vec + c03v * u3vec);
1354 c10v.load_aligned(c_tuint + ORDERp1);
1355 c11v.load_aligned((c_tuint + 4) + ORDERp1);
1356 c12v.load_aligned((c_tuint + 8) + ORDERp1);
1357 c13v.load_aligned((c_tuint + 12) + ORDERp1);
1358 libint2::simd::VectorAVXDouble t1vec(t, t, t, t);
1360 t1vec * (c10v * u0vec + c11v * u1vec + c12v * u2vec + c13v * u3vec);
1361 c20v.load_aligned(c_tuint + 2 * ORDERp1);
1362 c21v.load_aligned((c_tuint + 4) + 2 * ORDERp1);
1363 c22v.load_aligned((c_tuint + 8) + 2 * ORDERp1);
1364 c23v.load_aligned((c_tuint + 12) + 2 * ORDERp1);
1365 libint2::simd::VectorAVXDouble t2vec(t2, t2, t2, t2);
1367 t2vec * (c20v * u0vec + c21v * u1vec + c22v * u2vec + c23v * u3vec);
1368 c30v.load_aligned(c_tuint + 3 * ORDERp1);
1369 c31v.load_aligned((c_tuint + 4) + 3 * ORDERp1);
1370 c32v.load_aligned((c_tuint + 8) + 3 * ORDERp1);
1371 c33v.load_aligned((c_tuint + 12) + 3 * ORDERp1);
1372 libint2::simd::VectorAVXDouble t3vec(t3, t3, t3, t3);
1374 t3vec * (c30v * u0vec + c31v * u1vec + c32v * u2vec + c33v * u3vec);
1375 libint2::simd::VectorAVXDouble t0123 = horizontal_add(t0, t1, t2, t3);
1376
1377 c40v.load_aligned(c_tuint + 4 * ORDERp1);
1378 c41v.load_aligned((c_tuint + 4) + 4 * ORDERp1);
1379 c42v.load_aligned((c_tuint + 8) + 4 * ORDERp1);
1380 c43v.load_aligned((c_tuint + 12) + 4 * ORDERp1);
1381 libint2::simd::VectorAVXDouble t4vec(t4, t4, t4, t4);
1383 t4vec * (c40v * u0vec + c41v * u1vec + c42v * u2vec + c43v * u3vec);
1384 c50v.load_aligned(c_tuint + 5 * ORDERp1);
1385 c51v.load_aligned((c_tuint + 4) + 5 * ORDERp1);
1386 c52v.load_aligned((c_tuint + 8) + 5 * ORDERp1);
1387 c53v.load_aligned((c_tuint + 12) + 5 * ORDERp1);
1388 libint2::simd::VectorAVXDouble t5vec(t5, t5, t5, t5);
1390 t5vec * (c50v * u0vec + c51v * u1vec + c52v * u2vec + c53v * u3vec);
1391 c60v.load_aligned(c_tuint + 6 * ORDERp1);
1392 c61v.load_aligned((c_tuint + 4) + 6 * ORDERp1);
1393 c62v.load_aligned((c_tuint + 8) + 6 * ORDERp1);
1394 c63v.load_aligned((c_tuint + 12) + 6 * ORDERp1);
1395 libint2::simd::VectorAVXDouble t6vec(t6, t6, t6, t6);
1397 t6vec * (c60v * u0vec + c61v * u1vec + c62v * u2vec + c63v * u3vec);
1398 c70v.load_aligned(c_tuint + 7 * ORDERp1);
1399 c71v.load_aligned((c_tuint + 4) + 7 * ORDERp1);
1400 c72v.load_aligned((c_tuint + 8) + 7 * ORDERp1);
1401 c73v.load_aligned((c_tuint + 12) + 7 * ORDERp1);
1402 libint2::simd::VectorAVXDouble t7vec(t7, t7, t7, t7);
1404 t7vec * (c70v * u0vec + c71v * u1vec + c72v * u2vec + c73v * u3vec);
1405 libint2::simd::VectorAVXDouble t4567 = horizontal_add(t4, t5, t6, t7);
1406
1407 c80v.load_aligned(c_tuint + 8 * ORDERp1);
1408 c81v.load_aligned((c_tuint + 4) + 8 * ORDERp1);
1409 c82v.load_aligned((c_tuint + 8) + 8 * ORDERp1);
1410 c83v.load_aligned((c_tuint + 12) + 8 * ORDERp1);
1411 libint2::simd::VectorAVXDouble t8vec(t8, t8, t8, t8);
1413 t8vec * (c80v * u0vec + c81v * u1vec + c82v * u2vec + c83v * u3vec);
1414 c90v.load_aligned(c_tuint + 9 * ORDERp1);
1415 c91v.load_aligned((c_tuint + 4) + 9 * ORDERp1);
1416 c92v.load_aligned((c_tuint + 8) + 9 * ORDERp1);
1417 c93v.load_aligned((c_tuint + 12) + 9 * ORDERp1);
1418 libint2::simd::VectorAVXDouble t9vec(t9, t9, t9, t9);
1420 t9vec * (c90v * u0vec + c91v * u1vec + c92v * u2vec + c93v * u3vec);
1421 ca0v.load_aligned(c_tuint + 10 * ORDERp1);
1422 ca1v.load_aligned((c_tuint + 4) + 10 * ORDERp1);
1423 ca2v.load_aligned((c_tuint + 8) + 10 * ORDERp1);
1424 ca3v.load_aligned((c_tuint + 12) + 10 * ORDERp1);
1425 libint2::simd::VectorAVXDouble tavec(t10, t10, t10, t10);
1427 tavec * (ca0v * u0vec + ca1v * u1vec + ca2v * u2vec + ca3v * u3vec);
1428 cb0v.load_aligned(c_tuint + 11 * ORDERp1);
1429 cb1v.load_aligned((c_tuint + 4) + 11 * ORDERp1);
1430 cb2v.load_aligned((c_tuint + 8) + 11 * ORDERp1);
1431 cb3v.load_aligned((c_tuint + 12) + 11 * ORDERp1);
1432 libint2::simd::VectorAVXDouble tbvec(t11, t11, t11, t11);
1434 tbvec * (cb0v * u0vec + cb1v * u1vec + cb2v * u2vec + cb3v * u3vec);
1435 libint2::simd::VectorAVXDouble t89ab = horizontal_add(t8, t9, ta, tb);
1436
1437 cc0v.load_aligned(c_tuint + 12 * ORDERp1);
1438 cc1v.load_aligned((c_tuint + 4) + 12 * ORDERp1);
1439 cc2v.load_aligned((c_tuint + 8) + 12 * ORDERp1);
1440 cc3v.load_aligned((c_tuint + 12) + 12 * ORDERp1);
1441 libint2::simd::VectorAVXDouble tcvec(t12, t12, t12, t12);
1443 tcvec * (cc0v * u0vec + cc1v * u1vec + cc2v * u2vec + cc3v * u3vec);
1444 cd0v.load_aligned(c_tuint + 13 * ORDERp1);
1445 cd1v.load_aligned((c_tuint + 4) + 13 * ORDERp1);
1446 cd2v.load_aligned((c_tuint + 8) + 13 * ORDERp1);
1447 cd3v.load_aligned((c_tuint + 12) + 13 * ORDERp1);
1448 libint2::simd::VectorAVXDouble tdvec(t13, t13, t13, t13);
1450 tdvec * (cd0v * u0vec + cd1v * u1vec + cd2v * u2vec + cd3v * u3vec);
1451 ce0v.load_aligned(c_tuint + 14 * ORDERp1);
1452 ce1v.load_aligned((c_tuint + 4) + 14 * ORDERp1);
1453 ce2v.load_aligned((c_tuint + 8) + 14 * ORDERp1);
1454 ce3v.load_aligned((c_tuint + 12) + 14 * ORDERp1);
1455 libint2::simd::VectorAVXDouble tevec(t14, t14, t14, t14);
1457 tevec * (ce0v * u0vec + ce1v * u1vec + ce2v * u2vec + ce3v * u3vec);
1458 cf0v.load_aligned(c_tuint + 15 * ORDERp1);
1459 cf1v.load_aligned((c_tuint + 4) + 15 * ORDERp1);
1460 cf2v.load_aligned((c_tuint + 8) + 15 * ORDERp1);
1461 cf3v.load_aligned((c_tuint + 4) + 15 * ORDERp1);
1462 libint2::simd::VectorAVXDouble tfvec(t15, t15, t15, t15);
1464 tfvec * (cf0v * u0vec + cf1v * u1vec + cf2v * u2vec + cf3v * u3vec);
1465 libint2::simd::VectorAVXDouble tcdef = horizontal_add(tc, td, te, tf);
1466
1467 auto tall = horizontal_add(t0123, t4567, t89ab, tcdef);
1468 const auto Gm = horizontal_add(tall);
1469#else // AVX
1470 // no AVX, use explicit loops (probably slow)
1471 double uvec[16];
1472 double tvec[16];
1473 uvec[0] = 1;
1474 tvec[0] = 1;
1475 for (int i = 1; i != 16; ++i) {
1476 uvec[i] = uvec[i - 1] * u;
1477 tvec[i] = tvec[i - 1] * t;
1478 }
1479 double Gm = 0.0;
1480 for (int i = 0, ij = 0; i != 16; ++i) {
1481 for (int j = 0; j != 16; ++j, ++ij) {
1482 Gm += c_tuint[ij] * tvec[i] * uvec[j];
1483 }
1484 }
1485#endif // AVX
1486
1487 if
1488#if __cplusplus >= 201703L
1489 constexpr
1490#endif
1491 (!Exp) {
1492 Gm_vec[m] = Gm;
1493 } else {
1494 if (m != -1) {
1495 Gm_vec[m] = (Gmm1 - Gm) * zeta_over_two_rho;
1496 }
1497 Gmm1 = Gm;
1498 }
1499 }
1500
1501 return;
1502 }
1503
 private:
  unsigned int mmax_;               // max value of m for which Gm is computed
  Real precision_;                  // requested precision
  ExpensiveNumbers<Real> numbers_;  // cached expensive constants
  Real* c_ = nullptr;  // aligned copy of the Chebyshev coefficient table;
                       // allocated and filled by init_table()
1509
  /// Allocates the (32-byte-aligned, for AVX loads) coefficient buffer c_ and
  /// fills it from the static cheb_table, keeping, for each interval, only the
  /// m values in [mmin_, mmax_].
  void init_table() {
    // get memory
    void* result;
    int status = posix_memalign(&result, std::max(sizeof(Real), (size_t)32),
                                (mmax_ - mmin_ + 1) * cheb_table_nintervals *
                                    ORDERp1 * ORDERp1 * sizeof(Real));
    if (status != 0) {
      if (status == EINVAL)
        throw std::logic_error(
            "TennoGmEval::init() : posix_memalign failed, alignment must be a "
            "power of 2 at least as large as sizeof(void *)");
      if (status == ENOMEM) throw std::bad_alloc();
      abort();  // should be unreachable
    }
    c_ = static_cast<Real*>(result);

    // copy contents of static table into c
    // need all intervals
    for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
      // but only values of m up to mmax
      std::copy(cheb_table[iv],
                cheb_table[iv] + ((mmax_ - mmin_) + 1) * ORDERp1 * ORDERp1,
                c_ + (iv * ((mmax_ - mmin_) + 1)) * ORDERp1 * ORDERp1);
    }
  }
1535
1536}; // TennoGmEval
1537
// out-of-class definition of TennoGmEval's static interpolation table
#if LIBINT2_CONSTEXPR_STATICS
template <typename Real>
constexpr double
    TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals]
                                 [(TennoGmEval<Real>::cheb_table_mmax + 2) *
                                  (TennoGmEval<Real>::interpolation_order + 1) *
                                  (TennoGmEval<Real>::interpolation_order + 1)];
#else
// clang needs an explicit specialization declaration to avoid warning
#ifdef __clang__
template <typename Real>
double
    TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals]
                                 [(TennoGmEval<Real>::cheb_table_mmax + 2) *
                                  (TennoGmEval<Real>::interpolation_order + 1) *
                                  (TennoGmEval<Real>::interpolation_order + 1)];
#endif
#endif
1556
// forward declaration (full definition below); needed so that
// detail::CoreEvalScratch can be specialized for GaussianGmEval<Real, -1>
template <typename Real, int k>
struct GaussianGmEval;
1560namespace detail {
1561
1563template <typename CoreEval>
1565 CoreEvalScratch(const CoreEvalScratch&) = default;
1566 CoreEvalScratch(CoreEvalScratch&&) = default;
1567 explicit CoreEvalScratch(int) {}
1568};
1570template <typename Real>
1572 std::vector<Real> Fm_;
1573 std::vector<Real> g_i;
1574 std::vector<Real> r_i;
1575 std::vector<Real> oorhog_i;
1576 CoreEvalScratch(const CoreEvalScratch&) = default;
1577 CoreEvalScratch(CoreEvalScratch&&) = default;
1578 explicit CoreEvalScratch(int mmax) { init(mmax); }
1579
1580 private:
1581 void init(int mmax) {
1582 Fm_.resize(mmax + 1);
1583 g_i.resize(mmax + 1);
1584 r_i.resize(mmax + 1);
1585 oorhog_i.resize(mmax + 1);
1586 g_i[0] = 1.0;
1587 r_i[0] = 1.0;
1588 }
1589};
1590} // namespace detail
1591
1595
/// Computes core integrals Gm for a contracted Gaussian geminal operator;
/// the kernel is selected by k: -1, 0, or 2 (see the branches in eval();
/// k == -1 additionally requires the Boys function).
template <typename Real, int k>
struct GaussianGmEval
    : private detail::CoreEvalScratch<
          GaussianGmEval<Real, k>>  // N.B. empty-base optimization
{
#ifndef LIBINT_USER_DEFINED_REAL
  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
#else
  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
#endif

  /// @param mmax the maximum value of m
  /// @param precision the desired precision
  GaussianGmEval(int mmax, Real precision)
      : detail::CoreEvalScratch<GaussianGmEval<Real, k>>(mmax),
        mmax_(mmax),
        precision_(precision),
        fm_eval_(),
        numbers_(-1, -1, mmax) {
    assert(k == -1 || k == 0 || k == 2);
    // for k=-1 need to evaluate the Boys function
    if (k == -1) {
      fm_eval_ = FmEvalType::instance(mmax_, precision_);
    }
  }

  ~GaussianGmEval() {}

  /// Singleton interface; replaces the shared instance if a larger mmax or a
  /// tighter precision is requested (thread-safe)
  static std::shared_ptr<GaussianGmEval> instance(
      unsigned int mmax,
      Real precision = std::numeric_limits<Real>::epsilon()) {
    assert(mmax >= 0);
    assert(precision >= 0);
    // thread-safe per C++11 standard [6.7.4]
    static auto instance_ = std::make_shared<GaussianGmEval>(mmax, precision);

    while (instance_->max_m() < mmax || instance_->precision() > precision) {
      static std::mutex mtx;
      std::lock_guard<std::mutex> lck(mtx);
      if (instance_->max_m() < mmax || instance_->precision() > precision) {
        auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision);
        instance_ = new_instance;
      }
    }

    return instance_;
  }

  /// @return the maximum value of m for which Gm can be computed
  int max_m() const { return mmax_; }
  /// @return the requested precision
  Real precision() const { return precision_; }

  /// Accumulates (+=) into Gm[0..mmax] the contributions of each primitive
  /// of the contracted geminal.
  /// @param[in,out] Gm array of (at least) mmax+1 elements; zeroed on entry
  /// @param[in] geminal {exponent, coefficient} pairs of the primitives
  /// @param scr optional external scratch (must be a
  ///        CoreEvalScratch<GaussianGmEval<Real,-1>>*); if null, the
  ///        object's own (non-thread-safe) scratch is used
  template <typename AnyReal>
  void eval(Real* Gm, Real rho, Real T, size_t mmax,
            const std::vector<std::pair<AnyReal, AnyReal>>& geminal,
            void* scr = 0) {
    std::fill(Gm, Gm + mmax + 1, Real(0));

    const auto sqrt_rho = sqrt(rho);
    const auto oo_sqrt_rho = 1 / sqrt_rho;
    if (k == -1) {
      // precompute rho^i in scratch (geminal-independent)
      void* _scr = (scr == 0) ? this : scr;
      auto& scratch = *(
          reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(
              _scr));
      // NOTE(review): int loop index vs size_t mmax — relies on mmax fitting
      // in int
      for (int i = 1; i <= mmax; i++) {
        scratch.r_i[i] = scratch.r_i[i - 1] * rho;
      }
    }

    // accumulate over the primitives of the contracted geminal
    typedef
        typename std::vector<std::pair<AnyReal, AnyReal>>::const_iterator citer;
    const citer gend = geminal.end();
    for (citer i = geminal.begin(); i != gend; ++i) {
      const auto gamma = i->first;
      const auto gcoef = i->second;
      const auto rhog = rho + gamma;
      const auto oorhog = 1 / rhog;

      const auto gorg = gamma * oorhog;
      const auto rorg = rho * oorhog;
      const auto sqrt_rho_org = sqrt_rho * oorhog;
      const auto sqrt_rhog = sqrt(rhog);
      const auto sqrt_rorg = sqrt_rho_org * sqrt_rhog;

      // (ss|g12|ss) integral over this primitive
      constexpr Real const_SQRTPI_2(
          0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2
                                                                  */
      const auto SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg *
                               sqrt_rorg * exp(-gorg * T);

      if (k == -1) {
        void* _scr = (scr == 0) ? this : scr;
        auto& scratch =
            *(reinterpret_cast<
                detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));

        const auto rorgT = rorg * T;
        fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax);

#if 1
        constexpr Real const_2_SQRTPI(
            1.12837916709551257389615890312154517); /* 2/sqrt(pi) */
        const auto pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS;
        // oorhog_i[i] = pfac * oorhog^i; g_i[i] = gamma^i
        scratch.oorhog_i[0] = pfac;
        for (int i = 1; i <= mmax; i++) {
          scratch.g_i[i] = scratch.g_i[i - 1] * gamma;
          scratch.oorhog_i[i] = scratch.oorhog_i[i - 1] * oorhog;
        }
        // binomial expansion: Gm += oorhog^m sum_n C(m,n) rho^n gamma^(m-n) Fn
        for (int m = 0; m <= mmax; m++) {
          Real ssss = 0.0;
          Real* bcm = numbers_.bc[m];
          for (int n = 0; n <= m; n++) {
            ssss +=
                bcm[n] * scratch.r_i[n] * scratch.g_i[m - n] * scratch.Fm_[n];
          }
          Gm[m] += ssss * scratch.oorhog_i[m];
        }
#endif
      }

      if (k == 0) {
        // Gm = SS_K0G12_SS * gorg^m
        auto ss_oper_ss_m = SS_K0G12_SS;
        Gm[0] += ss_oper_ss_m;
        for (int m = 1; m <= mmax; ++m) {
          ss_oper_ss_m *= gorg;
          Gm[m] += ss_oper_ss_m;
        }
      }

      if (k == 2) {
        // Gm = gorg^m [ (1.5 + rorg T) - m rorg ] * SS_K0G12_SS / rhog
        const auto rorgT = rorg * T;
        const auto SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog);
        const auto SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog);

        auto SS_K2G12_SS_gorg_m = SS_K2G12_SS_0;
        auto SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1;
        Gm[0] += SS_K2G12_SS_gorg_m;
        for (int m = 1; m <= mmax; ++m) {
          SS_K2G12_SS_gorg_m *= gorg;
          Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1;
          SS_K2G12_SS_gorg_m1 *= gorg;
        }
      }
    }
  }

 private:
  int mmax_;
  Real precision_;  //< absolute precision
  std::shared_ptr<const FmEvalType> fm_eval_;

  ExpensiveNumbers<Real> numbers_;
};
1797
/// Adapts a Gm-engine callable (GmEvalFunction) to the common core-integral
/// evaluator interface (instance/max_m/precision/eval).
template <typename GmEvalFunction>
struct GenericGmEval : private GmEvalFunction {
  typedef typename GmEvalFunction::value_type Real;

  GenericGmEval(int mmax, Real precision)
      : GmEvalFunction(mmax, precision), mmax_(mmax), precision_(precision) {}

  /// @return a new evaluator for m in [0, mmax] with the given precision
  static std::shared_ptr<const GenericGmEval> instance(
      int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
    return std::make_shared<const GenericGmEval>(mmax, precision);
  }

  /// Evaluates Gm for m in [0, mmax]; extra arguments are forwarded to the
  /// wrapped engine.
  template <typename RealT, typename... ExtraArgs>
  void eval(RealT* Gm, RealT rho, RealT T, int mmax, ExtraArgs... args) const {
    assert(mmax <= mmax_);
    // invoke a copy of the engine (its call operator may be non-const)
    GmEvalFunction engine(*this);
    engine(Gm, rho, T, mmax, std::forward<ExtraArgs>(args)...);
  }

  /// @return the maximum value of m for which Gm can be computed
  int max_m() const { return mmax_; }
  /// @return the requested precision
  Real precision() const { return precision_; }

 private:
  int mmax_;
  Real precision_;
};
1826
// these Gm engines need extra scratch data; forward-declare them so that
// detail::CoreEvalScratch can be specialized below
namespace os_core_ints {
template <typename Real, int K>
struct r12_xx_K_gm_eval;
template <typename Real>
struct erfc_coulomb_gm_eval;
}  // namespace os_core_ints
1834
namespace detail {
/// scratch for r12_xx_K_gm_eval<Real, 1>: holds Boys values Fm(T)
template <typename Real>
struct CoreEvalScratch<os_core_ints::r12_xx_K_gm_eval<Real, 1>> {
  std::vector<Real> Fm_;
  CoreEvalScratch(const CoreEvalScratch&) = default;
  CoreEvalScratch(CoreEvalScratch&&) = default;
  // need to store Fm(T) for m = 0 .. mmax+1
  explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax + 2); }
};
/// scratch for erfc_coulomb_gm_eval: holds Boys values Fm(T)
template <typename Real>
struct CoreEvalScratch<os_core_ints::erfc_coulomb_gm_eval<Real>> {
  std::vector<Real> Fm_;
  CoreEvalScratch(const CoreEvalScratch&) = default;
  CoreEvalScratch(CoreEvalScratch&&) = default;
  // need to store Fm(T) for m = 0 .. mmax
  explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax + 1); }
};
}  // namespace detail
1855
1857namespace os_core_ints {
1858
/// core-integral evaluator for the Dirac delta kernel:
/// every Gm equals rho * exp(-T) / (2 pi)
template <typename Real>
struct delta_gm_eval {
  typedef Real value_type;

  delta_gm_eval(unsigned int, Real) {}

  /// fills Gm[0..mmax] with exp(-T) * rho / (2 pi)
  void operator()(Real* Gm, Real rho, Real T, int mmax) const {
    static constexpr auto one_over_two_pi = 1.0 / (2.0 * M_PI);
    const auto value = exp(-T) * rho * one_over_two_pi;
    for (int m = 0; m <= mmax; ++m) Gm[m] = value;
  }
};
1871
1876
// primary template is only declared; specializations below implement
// specific powers K
template <typename Real, int K>
struct r12_xx_K_gm_eval;
1879
/// core-integral evaluator for the K=1 case; computed from Boys values via
/// the 3-term relation implemented in operator()
template <typename Real>
struct r12_xx_K_gm_eval<Real, 1>
    : private detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> {
  typedef detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> base_type;
  typedef Real value_type;

#ifndef LIBINT_USER_DEFINED_REAL
  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
#else
  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
#endif

  r12_xx_K_gm_eval(unsigned int mmax, Real precision) : base_type(mmax) {
    // the relation below consumes Fm up to index mmax+1
    fm_eval_ = FmEvalType::instance(mmax + 1, precision);
  }
  /// computes Gm[m] = -m F_{m-1} + (T + m + 1) F_m - T F_{m+1}
  void operator()(Real* Gm, Real rho, Real T, int mmax) {
    fm_eval_->eval(&base_type::Fm_[0], T, mmax + 1);
    auto T_plus_m_plus_one = T + 1.0;
    Gm[0] = T_plus_m_plus_one * base_type::Fm_[0] - T * base_type::Fm_[1];
    auto minus_m = -1.0;
    T_plus_m_plus_one += 1.0;
    for (auto m = 1; m <= mmax; ++m, minus_m -= 1.0, T_plus_m_plus_one += 1.0) {
      Gm[m] = minus_m * base_type::Fm_[m - 1] +
              T_plus_m_plus_one * base_type::Fm_[m] - T * base_type::Fm_[m + 1];
    }
  }

 private:
  std::shared_ptr<const FmEvalType> fm_eval_;  // Boys function engine
};
1910
1912template <typename Real>
1913struct erf_coulomb_gm_eval {
1914 typedef Real value_type;
1915
1916#ifndef LIBINT_USER_DEFINED_REAL
1917 using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1918#else
1919 using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1920#endif
1921
1922 erf_coulomb_gm_eval(unsigned int mmax, Real precision) {
1923 fm_eval_ = FmEvalType::instance(mmax, precision);
1924 }
1925 void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) const {
1926 if (omega > 0) {
1927 auto omega2 = omega * omega;
1928 auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1929 fm_eval_->eval(Gm, T * omega2_over_omega2_plus_rho, mmax);
1930
1931 using std::sqrt;
1932 auto ooversqrto2prho_exp_2mplus1 = sqrt(omega2_over_omega2_plus_rho);
1933 for (auto m = 0; m <= mmax;
1934 ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1935 Gm[m] *= ooversqrto2prho_exp_2mplus1;
1936 }
1937 } else {
1938 std::fill(Gm, Gm + mmax + 1, Real{0});
1939 }
1940 }
1941
1942 private:
1943 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1944};
1945
/// core-integral evaluator for the erfc-attenuated (short-range) Coulomb
/// kernel, computed as full Coulomb minus the erf (long-range) part
template <typename Real>
struct erfc_coulomb_gm_eval
    : private detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> {
  typedef detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> base_type;
  typedef Real value_type;

#ifndef LIBINT_USER_DEFINED_REAL
  using FmEvalType = libint2::FmEval_Chebyshev7<double>;
#else
  using FmEvalType = libint2::FmEval_Reference<scalar_type>;
#endif

  erfc_coulomb_gm_eval(unsigned int mmax, Real precision) : base_type(mmax) {
    fm_eval_ = FmEvalType::instance(mmax, precision);
  }
  /// fills Gm[0..mmax]; for omega == 0 this reduces to the plain Boys values
  void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) {
    // full Coulomb contribution
    fm_eval_->eval(&base_type::Fm_[0], T, mmax);
    std::copy(base_type::Fm_.cbegin(), base_type::Fm_.cbegin() + mmax + 1, Gm);
    if (omega > 0) {
      // subtract the long-range (erf) contribution, i.e. the Boys function
      // at the attenuated argument scaled by frac^{m + 1/2}
      auto omega2 = omega * omega;
      auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
      fm_eval_->eval(&base_type::Fm_[0], T * omega2_over_omega2_plus_rho, mmax);

      using std::sqrt;
      auto ooversqrto2prho_exp_2mplus1 = sqrt(omega2_over_omega2_plus_rho);
      for (auto m = 0; m <= mmax;
           ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
        Gm[m] -= ooversqrto2prho_exp_2mplus1 * base_type::Fm_[m];
      }
    }
  }

 private:
  std::shared_ptr<const FmEvalType> fm_eval_;  // Boys function engine
};
1984
1985} // namespace os_core_ints
1986
1987/*
1988 * Slater geminal fitting is available only if have LAPACK
1989 */
1990#if HAVE_LAPACK
1991/*
1992f[x_] := - Exp[-\[Zeta] x] / \[Zeta];
1993
1994ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}];
1995*/
/// the Slater-type geminal being fit (scaled): f(x) = -exp(-zeta*x)/zeta
template <typename Real>
Real fstg(Real zeta, Real x) {
  const Real damped = std::exp(-zeta * x);
  return -damped / zeta;
}
2000
/// the Gaussian-geminal fit: ff(x) = sum_i cc[i] * exp(-aa[i] * x^2)
/// @param cc expansion coefficients
/// @param aa Gaussian exponents; must have the same length as cc
/// @param x evaluation point
template <typename Real>
Real fngtg(const std::vector<Real>& cc, const std::vector<Real>& aa, Real x) {
  assert(cc.size() == aa.size());
  Real value = 0.0;
  const Real x2 = x * x;
  // use size_t: the previous `unsigned int n = cc.size()` silently narrowed
  const std::size_t n = cc.size();
  for (std::size_t i = 0; i < n; ++i) value += cc[i] * std::exp(-aa[i] * x2);
  return value;
}
2009
2010// --- weighting functions ---
2011// L2 error is weighted by ww(x)
2012// hence error is weighted by sqrt(ww(x))
/// Tew-Klopper weight: w(x) = x^2 * exp(-2 x^2)
template <typename Real>
Real wwtewklopper(Real x) {
  const Real xsq = x * x;
  return xsq * std::exp(-2 * xsq);
}
/// cusp-emphasizing weight: w(x) = exp(-0.005 x^6)
template <typename Real>
Real wwcusp(Real x) {
  const Real xsq = x * x;
  const Real x_to_6 = xsq * xsq * xsq;
  return std::exp(-0.005 * x_to_6);
}
2024// default is Tew-Klopper
/// the weight actually used for the fit; currently the cusp-emphasizing
/// weight (the Tew-Klopper variant is kept for reference)
template <typename Real>
Real ww(Real x) {
  // return wwtewklopper(x);
  return wwcusp(x);
}
2030
/// @return the SQUARED 2-norm of vec, i.e. sum_i vec[i]^2 (no sqrt is taken;
/// callers such as stg_ng_fit depend on this convention)
template <typename Real>
Real norm(const std::vector<Real>& vec) {
  Real value = 0.0;
  // range-for avoids the previous unsigned-int narrowing of vec.size()
  for (const auto& v : vec) value += v * v;
  return value;
}
2038
/// Solves the damped (Levenberg-Marquardt) linear system: the diagonal of A
/// is scaled by (1 + lambda), then A x = b is solved with LAPACK's dgesv.
/// @param A n-by-n matrix, row-major; as used by stg_ng_fit it is J^T J and
///        hence symmetric, so the Fortran column-major convention of dgesv
///        is immaterial
/// @param b right-hand side, n elements
/// @param lambda damping factor
/// @param[out] x the solution
template <typename Real>
void LinearSolveDamped(const std::vector<Real>& A, const std::vector<Real>& b,
                       Real lambda, std::vector<Real>& x) {
  const size_t n = b.size();
  std::vector<Real> Acopy(A);
  for (size_t m = 0; m < n; ++m) Acopy[m * n + m] *= (1 + lambda);
  std::vector<Real> e(b);

  // int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0],
  // &e[0], n );
  {
    std::vector<int> ipiv(n);
    // F77 interface needs int dimensions; avoid shadowing the outer size_t n
    const int n_int = static_cast<int>(n);
    int one = 1;
    int info;
    dgesv_(&n_int, &one, &Acopy[0], &n_int, &ipiv[0], &e[0], &n_int, &info);
    assert(info == 0);
  }

  x = e;
}
2060
/// Fits an n-term Gaussian geminal to the Slater geminal exp(-zeta*r) by
/// weighted least squares (Levenberg-Marquardt) on a uniform grid.
/// @param[in] n number of Gaussian primitives
/// @param[in] zeta the Slater exponent
/// @param[out] geminal the fit as {exponent, coefficient} pairs; assumed to
///             have at least n elements on entry -- TODO confirm caller
///             resizes it
/// @param[in] xmin,xmax the fitting interval
/// @param[in] npts number of grid points
template <typename Real>
void stg_ng_fit(unsigned int n, Real zeta,
                std::vector<std::pair<Real, Real>>& geminal, Real xmin = 0.0,
                Real xmax = 10.0, unsigned int npts = 1001) {
  // initial guess
  std::vector<Real> cc(n, 1.0);  // coefficients
  std::vector<Real> aa(n);       // exponents
  // exponents start as even-tempered powers of 3 centered near 1
  for (unsigned int i = 0; i < n; ++i)
    aa[i] = std::pow(3.0, (i + 2 - (n + 1) / 2.0));

  // first rescale cc for ff[x] to match the norm of f[x]
  Real ffnormfac = 0.0;
  using std::sqrt;
  for (unsigned int i = 0; i < n; ++i)
    for (unsigned int j = 0; j < n; ++j)
      ffnormfac += cc[i] * cc[j] / sqrt(aa[i] + aa[j]);
  const Real Nf = sqrt(2.0 * zeta) * zeta;
  const Real Nff = sqrt(2.0) / (sqrt(ffnormfac) * sqrt(sqrt(M_PI)));
  for (unsigned int i = 0; i < n; ++i) cc[i] *= -Nff / Nf;

  Real lambda0 = 1000;  // damping factor is initially set to 1000, eventually
                        // should end up at 0
  const Real nu = 3.0;  // increase/decrease the damping factor scale it by this
  const Real epsilon = 1e-15;  // convergence
  const unsigned int maxniter = 200;

  // grid points on which we will fit
  std::vector<Real> xi(npts);
  for (unsigned int i = 0; i < npts; ++i)
    xi[i] = xmin + (xmax - xmin) * i / (npts - 1);

  std::vector<Real> err(npts);

  const size_t nparams =
      2 * n;  // params = expansion coefficients + gaussian exponents
  std::vector<Real> J(npts * nparams);
  std::vector<Real> delta(nparams);

  // std::cout << "iteration 0" << std::endl;
  // for(unsigned int i=0; i<n; ++i)
  //   std::cout << cc[i] << " " << aa[i] << std::endl;

  Real errnormI;
  Real errnormIm1 = 1e3;
  bool converged = false;
  unsigned int iter = 0;
  while (!converged && iter < maxniter) {
    // std::cout << "Iteration " << ++iter << ": lambda = " << lambda0/nu
    // << std::endl;

    // weighted residual on the grid
    for (unsigned int i = 0; i < npts; ++i) {
      const Real x = xi[i];
      err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * sqrt(ww(x));
    }
    errnormI = norm(err) / sqrt((Real)npts);

    // std::cout << "|err|=" << errnormI << std::endl;
    // converged when the relative change of the error norm is tiny
    converged = std::abs((errnormI - errnormIm1) / errnormIm1) <= epsilon;
    if (converged) break;
    errnormIm1 = errnormI;

    // Jacobian: d(residual_i)/d(cc_j) in columns [0,n),
    //           d(residual_i)/d(aa_j) in columns [n,2n)
    for (unsigned int i = 0; i < npts; ++i) {
      const Real x2 = xi[i] * xi[i];
      const Real sqrt_ww_x = sqrt(ww(xi[i]));
      const unsigned int ioffset = i * nparams;
      for (unsigned int j = 0; j < n; ++j)
        J[ioffset + j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x;
      const unsigned int ioffsetn = ioffset + n;
      for (unsigned int j = 0; j < n; ++j)
        J[ioffsetn + j] = -sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2);
    }

    // normal equations: A = J^T J
    std::vector<Real> A(nparams * nparams);
    for (size_t r = 0, rc = 0; r < nparams; ++r) {
      for (size_t c = 0; c < nparams; ++c, ++rc) {
        double Arc = 0.0;
        for (size_t i = 0, ir = r, ic = c; i < npts;
             ++i, ir += nparams, ic += nparams)
          Arc += J[ir] * J[ic];
        A[rc] = Arc;
      }
    }

    // b = J^T err
    std::vector<Real> b(nparams);
    for (size_t r = 0; r < nparams; ++r) {
      Real br = 0.0;
      for (size_t i = 0, ir = r; i < npts; ++i, ir += nparams)
        br += J[ir] * err[i];
      b[r] = br;
    }

    // try decreasing damping first
    // if not successful try increasing damping until it results in a decrease
    // in the error
    lambda0 /= nu;
    for (int l = -1; l < 1000; ++l) {
      LinearSolveDamped(A, b, lambda0, delta);

      // trial parameter update
      std::vector<double> cc_0(cc);
      for (unsigned int i = 0; i < n; ++i) cc_0[i] += delta[i];
      std::vector<double> aa_0(aa);
      for (unsigned int i = 0; i < n; ++i) aa_0[i] += delta[i + n];

      // if any of the exponents are negative the step is too large and need to
      // increase damping
      bool step_too_large = false;
      for (unsigned int i = 0; i < n; ++i)
        if (aa_0[i] < 0.0) {
          step_too_large = true;
          break;
        }
      if (!step_too_large) {
        // accept the step only if it decreases the error
        std::vector<double> err_0(npts);
        for (unsigned int i = 0; i < npts; ++i) {
          const double x = xi[i];
          err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * sqrt(ww(x));
        }
        const double errnorm_0 = norm(err_0) / sqrt((double)npts);
        if (errnorm_0 < errnormI) {
          cc = cc_0;
          aa = aa_0;
          break;
        } else  // step lead to increase of the error -- try dampening a bit
                // more
          lambda0 *= nu;
      } else  // too large of a step
        lambda0 *= nu;
    }  // done adjusting the damping factor

  }  // end of iterative minimization

  // if reached max # of iterations throw if the error is too terrible
  assert(not(iter == maxniter && errnormI > 1e-10));

  // export the fit as {exponent, coefficient} pairs
  for (unsigned int i = 0; i < n; ++i)
    geminal[i] = std::make_pair(aa[i], cc[i]);
}
2209#endif
2210
2211} // end of namespace libint2
2212
2213#endif // C++ only
2214#endif // header guard
holds tables of expensive quantities
Definition boys.h:67
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition boys.h:253
static std::shared_ptr< const FmEval_Chebyshev7 > instance(int m_max, double=0.0)
Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe...
Definition boys.h:317
void eval(Real *Fm, Real x, int m_max) const
fills in Fm with computed Boys function values for m in [0,mmax]
Definition boys.h:345
FmEval_Chebyshev7(int m_max, double precision=std::numeric_limits< double >::epsilon())
Definition boys.h:280
int max_m() const
Definition boys.h:337
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition boys.h:520
static std::shared_ptr< const FmEval_Taylor > instance(unsigned int mmax, Real precision=std::numeric_limits< Real >::epsilon())
Singleton interface allows to manage the lone instance; adjusts max m and precision values as needed ...
Definition boys.h:643
void eval(Real *Fm, Real T, int mmax) const
computes Boys function values with m index in range [0,mmax]
Definition boys.h:679
int max_m() const
Definition boys.h:667
Real precision() const
Definition boys.h:669
FmEval_Taylor(unsigned int mmax, Real precision=std::numeric_limits< Real >::epsilon())
Constructs the object to be able to compute Boys funcion for m in [0,mmax], with relative precision.
Definition boys.h:533
double horizontal_add(VectorSSEDouble const &a)
Horizontal add.
Definition vector_x86.h:220
Defaults definitions for various parameters assumed by Libint.
Definition algebra.cc:24
std::ostream & verbose_stream()
Accessor for the diagnostics stream.
Definition initialize.h:128
bool verbose()
Accessor for the verbose flag.
Definition initialize.h:135
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition boys.h:214
static void eval(Real *Fm, Real t, size_t mmax)
fills up an array of Fm(T) for m in [0,mmax]
Definition boys.h:229
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $, using a single algorithm (asymptotic expansion).
Definition boys.h:144
static void eval(Real *Fm, Real T, size_t mmax)
fills up an array of Fm(T) for m in [0,mmax]
Definition boys.h:192
static Real eval(Real T, size_t m)
computes a single value of $ F_m(T) $ using the Maclaurin series to full precision of Real
Definition boys.h:155
Definition boys.h:1558
core integral for Yukawa and exponential interactions
Definition boys.h:901
TennoGmEval(unsigned int mmax, Real precision=maxabsprecision)
Definition boys.h:933
void eval_slater(Real *Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const
Definition boys.h:1025
static std::shared_ptr< const TennoGmEval > instance(int m_max, double=0)
Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe...
Definition boys.h:969
Real precision() const
Definition boys.h:988
void eval_yukawa(Real *Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const
Definition boys.h:1001
some evaluators need thread-local scratch, but most don't
Definition boys.h:1564
SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructio...
Definition vector_x86.h:630
void convert(T *a) const
writes this to a
Definition vector_x86.h:707
void load(T const *a)
loads a to this
Definition vector_x86.h:702
void load_aligned(T const *a)
loads a to this
Definition vector_x86.h:705
SIMD vector of 2 double-precision floating-point real numbers, operations on which use SSE2 instructi...
Definition vector_x86.h:46
void convert(T *a) const
writes this to a
Definition vector_x86.h:122