// LIBINT 2.7.2
// boys.h
1/*
2 * Copyright (C) 2004-2021 Edward F. Valeev
3 *
4 * This file is part of Libint.
5 *
6 * Libint is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU Lesser General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Libint is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with Libint. If not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20
21#ifndef _libint2_src_lib_libint_boys_h_
22#define _libint2_src_lib_libint_boys_h_
23
24#if defined(__cplusplus)
25
26#include <iostream>
27#include <cstdlib>
28#include <cmath>
29#include <stdexcept>
30#include <libint2/util/vector.h>
31#include <cassert>
32#include <vector>
33#include <algorithm>
34#include <limits>
35#include <mutex>
36#include <type_traits>
37#include <memory>
38
39// from now on at least C++11 is required by default
40#include <libint2/util/cxxstd.h>
41#if LIBINT2_CPLUSPLUS_STD < 2011
42# error "Libint2 C++ API requires C++11 support"
43#endif
44
45#include <libint2/boys_fwd.h>
46#include <libint2/numeric.h>
47#include <libint2/initialize.h>
48
49#if HAVE_LAPACK // use F77-type interface for now, switch to LAPACKE later
50extern "C" void dgesv_(const int* n,
51 const int* nrhs, double* A, const int* lda,
52 int* ipiv, double* b, const int* ldb,
53 int* info);
54#endif
55
56namespace libint2 {
57
59 template<typename Real>
61 public:
      /// Precomputes tables of "expensive" numbers used by the Boys-function evaluators.
      /// @param ifac if >= 0, tabulate factorials: fac[i] = i! for i in [0, ifac]
      /// @param idf if >= 0, tabulate double factorials: df[i] = (i-1)!! for i in [0, idf]
      ///        (so that df[0] = (-1)!! = 1 is defined)
      /// @param ibc if >= 0, tabulate binomial coefficients: bc[i][j] = C(i,j) for i,j in [0, ibc]
      /// The twoi1 and ihalf tables (128 entries each) are always filled.
      ExpensiveNumbers(int ifac = -1, int idf = -1, int ibc = -1) {
        if (ifac >= 0) {
          fac.resize(ifac + 1);
          fac[0] = 1.0;
          // fac[i] = i! built by the usual upward recursion i! = i * (i-1)!
          for (int i = 1; i <= ifac; i++) {
            fac[i] = i * fac[i - 1];
          }
        }

        if (idf >= 0) {
          df.resize(idf + 1);
          /* df[i] gives (i-1)!!, so that (-1)!! is defined... */
          df[0] = 1.0;
          if (idf >= 1)
            df[1] = 1.0;
          if (idf >= 2)
            df[2] = 1.0;
          // (i-1)!! = (i-1) * (i-3)!!
          for (int i = 3; i <= idf; i++) {
            df[i] = (i - 1) * df[i - 2];
          }
        }

        if (ibc >= 0) {
          // bc_ is the flat (ibc+1) x (ibc+1) storage; bc holds row pointers into it
          bc_.resize((ibc+1)*(ibc+1));
          std::fill(bc_.begin(), bc_.end(), Real(0));
          bc.resize(ibc+1);
          bc[0] = &bc_[0];
          for(int i=1; i<=ibc; ++i)
            bc[i] = bc[i-1] + (ibc+1);

          // C(i,0) = 1; then C(i,j) = C(i,j-1) * (i-j+1)/j along each row
          for(int i=0; i<=ibc; i++)
            bc[i][0] = 1.0;
          for(int i=0; i<=ibc; i++)
            for(int j=1; j<=i; ++j)
              bc[i][j] = bc[i][j-1] * Real(i-j+1) / Real(j);
        }

        // always tabulate 1/(2i+1) and (i - 1/2) for i in [0, 128)
        for (int i = 0; i < 128; i++) {
          twoi1[i] = 1.0 / (Real(2.0) * i + Real(1.0));
          ihalf[i] = Real(i) - Real(0.5);
        }

      }
105
107 }
108
109 std::vector<Real> fac;
110 std::vector<Real> df;
111 std::vector<Real*> bc;
112
      // these quantities are needed with indices <= mmax
114 // 64 is sufficient to handle up to 4 center integrals with up to L=15 basis functions
115 // but need higher values for Yukawa integrals ...
116 Real twoi1[128]; /* 1/(2 i + 1); needed for downward recursion */
117 Real ihalf[128]; /* i - 0.5, needed for upward recursion */
118
119 private:
120 std::vector<Real> bc_;
121 };
122
141 template<typename Real>
143
      /// Returns a shared singleton evaluator; both arguments are ignored
      /// (this reference implementation keeps no precomputed state).
      static std::shared_ptr<const FmEval_Reference> instance(int /* mmax */, Real /* precision */) {

        // thread-safe per C++11 standard [6.7.4]
        static auto instance_ = std::make_shared<const FmEval_Reference>();

        return instance_;
      }
151
      /// Computes a single value of the Boys function \f$ F_m(T) \f$ by summing
      /// its power (MacLaurin) series until the relative size of a term drops
      /// below ~epsilon/10 (with the extra requirement that terms be decreasing,
      /// so that the omitted tail is bounded by the last kept term).
      /// @param T the argument; must not be so large that exp(-T) underflows Real
      /// @param m the order; must be < 100
      /// @return \f$ F_m(T) \f$
      /// @throw std::overflow_error if T exceeds the largest usable argument for Real
      static Real eval(Real T, size_t m) {
        assert(m < 100);
        // largest T for which exp(-T) stays representable (with a safety margin);
        // unbounded Real types (is_bounded == false) have no such limit
        static const Real T_crit = std::numeric_limits<Real>::is_bounded ? -log( std::numeric_limits<Real>::min() * 100.5 / 2. ) : Real(0) ;
        if (std::numeric_limits<Real>::is_bounded && T > T_crit)
          throw std::overflow_error("FmEval_Reference<Real>::eval: Real lacks precision for the given value of argument T");
        static const Real half = Real(1)/2;
        Real denom = (m + half);
        using std::exp;
        // leading term of the series: exp(-T) / (2m + 1)
        Real term = exp(-T) / (2 * denom);
        Real old_term = 0;
        Real sum = term;
        const Real epsilon = get_epsilon(T);
        const Real epsilon_divided_10 = epsilon / 10;
        do {
          denom += 1;
          old_term = term;
          // each successive term picks up a factor T / (m + k + 1/2)
          term = old_term * T / denom;
          sum += term;
          //rel_error = term / sum , hence iterate until rel_error = epsilon
          // however, must ensure that contributions are decreasing to ensure that omitted contributions are smaller than epsilon
        } while (term > sum * epsilon_divided_10 || old_term < term);

        return sum;
      }
177
182 static void eval(Real* Fm, Real T, size_t mmax) {
183
184 // evaluate for mmax using MacLaurin series
185 // it converges fastest for the largest m -> use it to compute Fmmax(T)
186 // see JPC 94, 5564 (1990).
187 for(size_t m=0; m<=mmax; ++m)
188 Fm[m] = eval(T, m);
189 return;
190 }
191
192 };
193
204 template<typename Real>
206
207 static std::shared_ptr<const FmEval_Reference2> instance(int /* mmax */, Real /* precision */) {
208
209 // thread-safe per C++11 standard [6.7.4]
210 static auto instance_ = std::make_shared<const FmEval_Reference2>();
211
212 return instance_;
213 }
214
219 static void eval(Real* Fm, Real t, size_t mmax) {
220
221 if (t < Real(117)) {
223 }
224 else {
225 const Real two_over_sqrt_pi{1.128379167095512573896158903121545171688101258657997713688171443421284936882986828973487320404214727};
226 const Real K = 1/two_over_sqrt_pi;
227
228 auto t2 = 2*t;
229 auto et = exp(-t);
230 auto sqrt_t = sqrt(t);
231 Fm[0] = K*erf(sqrt_t)/sqrt_t;
232 if (mmax > 0)
233 for(size_t m=0; m<=mmax-1; m++) {
234 Fm[m+1] = ((2*m + 1)*Fm[m] - et)/(t2);
235 }
236 }
237 }
238
239 };
240
244 template <typename Real = double>
246
247#include <libint2/boys_cheb7.h>
248
249 static_assert(std::is_same<Real,double>::value, "FmEval_Chebyshev7 only supports double as the real type");
250
251 static constexpr int ORDER = interpolation_order;
252 static constexpr int ORDERp1 = ORDER+1;
253
254 static constexpr Real T_crit = cheb_table_tmax;
255 static constexpr Real delta = cheb_table_delta;
256 static constexpr Real one_over_delta = 1/delta;
257
258 int mmax;
260 Real *c; /* the Chebyshev coefficients table, T_crit*one_over_delta by mmax*ORDERp1 */
261
262 public:
267 FmEval_Chebyshev7(int m_max, double precision = std::numeric_limits<double>::epsilon()) :
268 mmax(m_max), numbers_(14) {
269#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
270# if !defined(__AVX__) && defined(NDEBUG)
271 if (libint2::verbose()) {
272 static bool printed_performance_warning = false;
273 if (!printed_performance_warning) {
275 << "libint2::FmEval_Chebyshev7 on x86(-64) platforms needs AVX support for best performance"
276 << std::endl;
277 printed_performance_warning = true;
278 }
279 }
280# endif
281#endif
282 if (precision < std::numeric_limits<double>::epsilon())
283 throw std::invalid_argument(std::string("FmEval_Chebyshev7 does not support precision smaller than ") + std::to_string(std::numeric_limits<double>::epsilon()));
284 if (mmax > cheb_table_mmax)
285 throw std::invalid_argument(
286 "FmEval_Chebyshev7::init() : requested mmax exceeds the "
287 "hard-coded mmax");
288 if (m_max >= 0)
289 init_table();
290 }
292 if (mmax >= 0) {
293 free(c);
294 }
295 }
296
      /// Returns a shared evaluator able to compute Fm(T) for m up to (at least) m_max;
      /// an existing instance that is too small is replaced by a larger one.
      /// NOTE(review): instance_ is read outside the mutex while being reassigned
      /// under it, so concurrent growing calls race on the shared_ptr itself —
      /// confirm callers serialize first use, or switch to atomic shared_ptr access.
      static std::shared_ptr<const FmEval_Chebyshev7> instance(int m_max, double = 0.0) {

        assert(m_max >= 0);
        // thread-safe per C++11 standard [6.7.4]
        static auto instance_ = std::make_shared<const FmEval_Chebyshev7>(m_max);

        while (instance_->max_m() < m_max) {
          static std::mutex mtx;
          std::lock_guard<std::mutex> lck(mtx);
          if (instance_->max_m() < m_max) {
            auto new_instance = std::make_shared<const FmEval_Chebyshev7>(m_max);
            instance_ = new_instance;
          }
        }

        return instance_;
      }
315
      /// @return the highest Boys-function order m this evaluator supports
      int max_m() const { return mmax; }
318
323 inline void eval(Real* Fm, Real x, int m_max) const {
324
325 // large T => use upward recursion
326 // cost = 1 div + 1 sqrt + (1 + 2*(m-1)) muls
327 if (x > T_crit) {
328 const double one_over_x = 1/x;
329 Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
330 if (m_max == 0)
331 return;
332 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is small enough to guarantee full double precision
333 for (int i = 1; i <= m_max; i++)
334 Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
335 return;
336 }
337
338 // ---------------------------------------------
339 // small and intermediate arguments => interpolate Fm and (optional) downward recursion
340 // ---------------------------------------------
341 // which interval does this x fall into?
342 const Real x_over_delta = x * one_over_delta;
343 const int iv = int(x_over_delta); // the interval index
344 const Real xd = x_over_delta - (Real)iv - 0.5; // this ranges from -0.5 to 0.5
345 const int m_min = 0;
346
347#if defined(__AVX__)
348 const auto x2 = xd*xd;
349 const auto x3 = x2*xd;
350 const auto x4 = x2*x2;
351 const auto x5 = x2*x3;
352 const auto x6 = x3*x3;
353 const auto x7 = x3*x4;
354 libint2::simd::VectorAVXDouble x0vec(1., xd, x2, x3);
355 libint2::simd::VectorAVXDouble x1vec(x4, x5, x6, x7);
356#endif // AVX
357
358 const Real *d = c + (ORDERp1) * (iv * (mmax+1) + m_min); // ptr to the interpolation data for m=mmin
359 int m = m_min;
360#if defined(__AVX__)
361 if (m_max-m >=3) {
362 const int unroll_size = 4;
363 const int m_fence = (m_max + 2 - unroll_size);
364 for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
365 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v,
366 d20v, d21v, d30v, d31v;
367 d00v.load_aligned(d); d01v.load_aligned((d+4));
368 d10v.load_aligned(d+ORDERp1); d11v.load_aligned((d+4)+ORDERp1);
369 d20v.load_aligned(d+2*ORDERp1); d21v.load_aligned((d+4)+2*ORDERp1);
370 d30v.load_aligned(d+3*ORDERp1); d31v.load_aligned((d+4)+3*ORDERp1);
371 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
372 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
373 libint2::simd::VectorAVXDouble fm2 = d20v * x0vec + d21v * x1vec;
374 libint2::simd::VectorAVXDouble fm3 = d30v * x0vec + d31v * x1vec;
375 libint2::simd::VectorAVXDouble sum0123 = horizontal_add(fm0, fm1, fm2, fm3);
376 sum0123.convert(&Fm[m]);
377 }
378 } // unroll_size=4
379 if (m_max-m >=1) {
380 const int unroll_size = 2;
381 const int m_fence = (m_max + 2 - unroll_size);
382 for(; m<m_fence; m+=unroll_size, d+=ORDERp1*unroll_size) {
383 libint2::simd::VectorAVXDouble d00v, d01v, d10v, d11v;
384 d00v.load_aligned(d);
385 d01v.load_aligned((d+4));
386 d10v.load_aligned(d+ORDERp1);
387 d11v.load_aligned((d+4)+ORDERp1);
388 libint2::simd::VectorAVXDouble fm0 = d00v * x0vec + d01v * x1vec;
389 libint2::simd::VectorAVXDouble fm1 = d10v * x0vec + d11v * x1vec;
390 libint2::simd::VectorSSEDouble sum01 = horizontal_add(fm0, fm1);
391 sum01.convert(&Fm[m]);
392 }
393 } // unroll_size=2
394 { // no unrolling
395 for(; m<=m_max; ++m, d+=ORDERp1) {
397 d0v.load_aligned(d);
398 d1v.load_aligned(d+4);
399 Fm[m] = horizontal_add(d0v * x0vec + d1v * x1vec);
400 }
401 }
402#else // AVX not available
403 for(m=m_min; m<=m_max; ++m, d+=ORDERp1) {
404 Fm[m] = d[0]
405 + xd * (d[1]
406 + xd * (d[2]
407 + xd * (d[3]
408 + xd * (d[4]
409 + xd * (d[5]
410 + xd * (d[6]
411 + xd * (d[7])))))));
412
413 // // check against the reference value
414 // if (false) {
415 // double refvalue = FmEval_Reference2<double>::eval(x, m); // compute F(T)
416 // if (abs(refvalue - Fm[m]) > 1e-10) {
417 // std::cout << "T = " << x << " m = " << m << " cheb = "
418 // << Fm[m] << " ref = " << refvalue << std::endl;
419 // }
420 // }
421 }
422#endif
423
424
425 } // eval()
426
427 private:
428
429 void init_table() {
430
431 // get memory
432 void* result;
433 int status = posix_memalign(&result, ORDERp1*sizeof(Real), (mmax + 1) * cheb_table_nintervals * ORDERp1 * sizeof(Real));
434 if (status != 0) {
435 if (status == EINVAL)
436 throw std::logic_error(
437 "FmEval_Chebyshev7::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
438 if (status == ENOMEM)
439 throw std::bad_alloc();
440 abort(); // should be unreachable
441 }
442 c = static_cast<Real*>(result);
443
444 // copy contents of static table into c
445 // need all intervals
446 for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
447 // but only values of m up to mmax
448 std::copy(cheb_table[iv], cheb_table[iv]+(mmax+1)*ORDERp1, c+(iv*(mmax+1))*ORDERp1);
449 }
450 }
451
452 }; // FmEval_Chebyshev7
453
454#if LIBINT2_CONSTEXPR_STATICS
455 template <typename Real>
456 constexpr
457 double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
458#else
  // clang needs an explicit specialization declaration to avoid warning
460# ifdef __clang__
461 template <typename Real>
462 double FmEval_Chebyshev7<Real>::cheb_table[FmEval_Chebyshev7<Real>::cheb_table_nintervals][(FmEval_Chebyshev7<Real>::cheb_table_mmax+1)*(FmEval_Chebyshev7<Real>::interpolation_order+1)];
463# endif
464#endif
465
#ifndef STATIC_OON
#define STATIC_OON
  namespace {
    // oon[i] = 1/i ("one over n") for i = 1..11; oon[0] is a placeholder (0.0)
    // so the table can be indexed directly by the denominator
    const double oon[] = {0.0, 1.0, 1.0/2.0, 1.0/3.0, 1.0/4.0, 1.0/5.0, 1.0/6.0, 1.0/7.0, 1.0/8.0, 1.0/9.0, 1.0/10.0, 1.0/11.0};
  }
#endif
472
478 template<typename Real = double, int INTERPOLATION_ORDER = 7>
480 public:
481 static_assert(std::is_same<Real,double>::value, "FmEval_Taylor only supports double as the real type");
482
483 static const int max_interp_order = 8;
484 static const bool INTERPOLATION_AND_RECURSION = false; // compute F_lmax(T) and then iterate down to F_0(T)? Else use interpolation only
485 const double soft_zero_;
486
488 FmEval_Taylor(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) :
489 soft_zero_(1e-6), cutoff_(precision), numbers_(
490 INTERPOLATION_ORDER + 1, 2 * (mmax + INTERPOLATION_ORDER)) {
491
492#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
493# if !defined(__AVX__) && defined(NDEBUG)
494 if (libint2::verbose()) {
495 static bool printed_performance_warning = false;
496 if (!printed_performance_warning) {
498 << "libint2::FmEval_Taylor on x86(-64) platforms needs AVX support for best performance"
499 << std::endl;
500 printed_performance_warning = true;
501 }
502 }
503# endif
504#endif
505
506 assert(mmax <= 63);
507
508 const double sqrt_pi = std::sqrt(M_PI);
509
510 /*---------------------------------------
511 We are doing Taylor interpolation with
512 n=TAYLOR_ORDER terms here:
513 error <= delT^n/(n+1)!
514 ---------------------------------------*/
515 delT_ = 2.0
516 * std::pow(cutoff_ * numbers_.fac[INTERPOLATION_ORDER + 1],
517 1.0 / INTERPOLATION_ORDER);
518 oodelT_ = 1.0 / delT_;
519 max_m_ = mmax + INTERPOLATION_ORDER;
520
521 T_crit_ = new Real[max_m_ + 1]; /*--- m=0 is included! ---*/
522 max_T_ = 0;
523 /*--- Figure out T_crit for each m and put into the T_crit ---*/
524 for (int m = max_m_; m >= 0; --m) {
525 /*------------------------------------------
526 Damped Newton-Raphson method to solve
527 T^{m-0.5}*exp(-T) = epsilon*Gamma(m+0.5)
528 The solution is the max T for which to do
529 the interpolation
530 ------------------------------------------*/
531 double T = -log(cutoff_);
532 const double egamma = cutoff_ * sqrt_pi * numbers_.df[2 * m]
533 / std::pow(2.0, m);
534 double T_new = T;
535 double func;
536 do {
537 const double damping_factor = 0.2;
538 T = T_new;
539 /* f(T) = the difference between LHS and RHS of the equation above */
540 func = std::pow(T, m - 0.5) * std::exp(-T) - egamma;
541 const double dfuncdT = ((m - 0.5) * std::pow(T, m - 1.5)
542 - std::pow(T, m - 0.5)) * std::exp(-T);
543 /* f(T) has 2 roots and has a maximum in between. If f'(T) > 0 we are to the left of the hump. Make a big step to the right. */
544 if (dfuncdT > 0.0) {
545 T_new *= 2.0;
546 } else {
547 /* damp the step */
548 double deltaT = -func / dfuncdT;
549 const double sign_deltaT = (deltaT > 0.0) ? 1.0 : -1.0;
550 const double max_deltaT = damping_factor * T;
551 if (std::fabs(deltaT) > max_deltaT)
552 deltaT = sign_deltaT * max_deltaT;
553 T_new = T + deltaT;
554 }
555 if (T_new <= 0.0) {
556 T_new = T / 2.0;
557 }
558 } while (std::fabs(func / egamma) >= soft_zero_);
559 T_crit_[m] = T_new;
560 const int T_idx = (int) std::floor(T_new / delT_);
561 max_T_ = std::max(max_T_, T_idx);
562 }
563
564 // allocate the grid (see the comments below)
565 {
566 const int nrow = max_T_ + 1;
567 const int ncol = max_m_ + 1;
568 grid_ = new Real*[nrow];
569 grid_[0] = new Real[nrow * ncol];
570 //std::cout << "Allocated interpolation table of " << nrow * ncol << " reals" << std::endl;
571 for (int r = 1; r < nrow; ++r)
572 grid_[r] = grid_[r - 1] + ncol;
573 }
574
575 /*-------------------------------------------------------
576 Tabulate the gamma function from t=delT to T_crit[m]:
577 1) include T=0 though the table is empty for T=0 since
578 Fm(0) is simple to compute
579 -------------------------------------------------------*/
580 /*--- do the mmax first ---*/
581 for (int T_idx = max_T_; T_idx >= 0; --T_idx) {
582 const double T = T_idx * delT_;
583 libint2::FmEval_Reference2<double>::eval(grid_[T_idx], T, max_m_);
584 }
585 }
586
588 delete[] T_crit_;
589 delete[] grid_[0];
590 delete[] grid_;
591 }
592
      /// Returns a shared evaluator with max_m() >= mmax and precision() <= precision;
      /// an existing instance that is too small or too coarse is replaced.
      /// NOTE(review): instance_ is read outside the mutex while being reassigned
      /// under it — confirm concurrent growing calls are serialized by callers.
      static std::shared_ptr<const FmEval_Taylor> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
        assert(mmax >= 0);
        assert(precision >= 0);
        // thread-safe per C++11 standard [6.7.4]
        static auto instance_ = std::make_shared<const FmEval_Taylor>(mmax, precision);

        while (instance_->max_m() < mmax || instance_->precision() > precision) {
          static std::mutex mtx;
          std::lock_guard<std::mutex> lck(mtx);
          if (instance_->max_m() < mmax || instance_->precision() > precision) {
            auto new_instance = std::make_shared<const FmEval_Taylor>(mmax, precision);
            instance_ = new_instance;
          }
        }

        return instance_;
      }
612
      /// @return the highest supported Boys-function order
      /// NOTE(review): the ctor sets max_m_ = mmax + INTERPOLATION_ORDER, so this
      /// returns mmax + 1 relative to the ctor argument — confirm intended slack
      int max_m() const { return max_m_ - INTERPOLATION_ORDER + 1; }
      /// @return the precision (tolerance) this evaluator was constructed for
      Real precision() const { return cutoff_; }
617
623 void eval(Real* Fm, Real T, int mmax) const {
624 const double sqrt_pio2 = 1.2533141373155002512;
625 const double two_T = 2.0 * T;
626
627 // stop recursion at mmin
628 const int mmin = INTERPOLATION_AND_RECURSION ? mmax : 0;
629 /*-------------------------------------
630 Compute Fm(T) from mmax down to mmin
631 -------------------------------------*/
632 const bool use_upward_recursion = true;
633 if (use_upward_recursion) {
634// if (T > 30.0) {
635 if (T > T_crit_[0]) {
636 const double one_over_x = 1.0/T;
637 Fm[0] = 0.88622692545275801365 * sqrt(one_over_x); // see Eq. (9.8.9) in Helgaker-Jorgensen-Olsen
638 if (mmax == 0)
639 return;
640 // this upward recursion formula omits - e^(-x)/(2x), which for x>T_crit is <1e-15
641 for (int i = 1; i <= mmax; i++)
642 Fm[i] = Fm[i - 1] * numbers_.ihalf[i] * one_over_x; // see Eq. (9.8.13)
643 return;
644 }
645 }
646
647 // since Tcrit grows with mmax, this condition only needs to be determined once
648 if (T > T_crit_[mmax]) {
649 double pow_two_T_to_minusjp05 = std::pow(two_T, -mmax - 0.5);
650 for (int m = mmax; m >= mmin; --m) {
651 /*--- Asymptotic formula ---*/
652 Fm[m] = numbers_.df[2 * m] * sqrt_pio2 * pow_two_T_to_minusjp05;
653 pow_two_T_to_minusjp05 *= two_T;
654 }
655 }
656 else
657 {
658 const int T_ind = (int) (0.5 + T * oodelT_);
659 const Real h = T_ind * delT_ - T;
660 const Real* F_row = grid_[T_ind] + mmin;
661
662#if defined (__AVX__)
664 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
665 const double h2 = h*h*oon[2];
666 const double h3 = h2*h*oon[3];
667 h0123 = libint2::simd::VectorAVXDouble (1.0, h, h2, h3);
668 if (INTERPOLATION_ORDER == 7) {
669 const double h4 = h3*h*oon[4];
670 const double h5 = h4*h*oon[5];
671 const double h6 = h5*h*oon[6];
672 const double h7 = h6*h*oon[7];
673 h4567 = libint2::simd::VectorAVXDouble (h4, h5, h6, h7);
674 }
675 }
676 // libint2::simd::VectorAVXDouble h0123(1.0);
677 // libint2::simd::VectorAVXDouble h4567(1.0);
678#endif
679
680 int m = mmin;
681 if (mmax-m >=1) {
682 const int unroll_size = 2;
683 const int m_fence = (mmax + 2 - unroll_size);
684 for(; m<m_fence; m+=unroll_size, F_row+=unroll_size) {
685
686#if defined(__AVX__)
687 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
688 libint2::simd::VectorAVXDouble fr0_0123; fr0_0123.load(F_row);
689 libint2::simd::VectorAVXDouble fr1_0123; fr1_0123.load(F_row+1);
690 libint2::simd::VectorSSEDouble fm01 = horizontal_add(fr0_0123*h0123, fr1_0123*h0123);
691 if (INTERPOLATION_ORDER == 7) {
692 libint2::simd::VectorAVXDouble fr0_4567; fr0_4567.load(F_row+4);
693 libint2::simd::VectorAVXDouble fr1_4567; fr1_4567.load(F_row+5);
694 fm01 += horizontal_add(fr0_4567*h4567, fr1_4567*h4567);
695 }
696 fm01.convert(&Fm[m]);
697 }
698 else {
699#endif
700 Real total0 = 0.0, total1 = 0.0;
701 for(int i=INTERPOLATION_ORDER; i>=1; --i) {
702 total0 = oon[i]*h*(F_row[i] + total0);
703 total1 = oon[i]*h*(F_row[i+1] + total1);
704 }
705 Fm[m] = F_row[0] + total0;
706 Fm[m+1] = F_row[1] + total1;
707#if defined(__AVX__)
708 }
709#endif
710 }
711 } // unroll_size = 2
712 if (m<=mmax) { // unroll_size = 1
713#if defined(__AVX__)
714 if (INTERPOLATION_ORDER == 3 || INTERPOLATION_ORDER == 7) {
715 libint2::simd::VectorAVXDouble fr0123; fr0123.load(F_row);
716 if (INTERPOLATION_ORDER == 7) {
717 libint2::simd::VectorAVXDouble fr4567; fr4567.load(F_row+4);
718// libint2::simd::VectorSSEDouble fm = horizontal_add(fr0123*h0123, fr4567*h4567);
719// Fm[m] = horizontal_add(fm);
720 Fm[m] = horizontal_add(fr0123*h0123 + fr4567*h4567);
721 }
722 else { // INTERPOLATION_ORDER == 3
723 Fm[m] = horizontal_add(fr0123*h0123);
724 }
725 }
726 else {
727#endif
728 Real total = 0.0;
729 for(int i=INTERPOLATION_ORDER; i>=1; --i) {
730 total = oon[i]*h*(F_row[i] + total);
731 }
732 Fm[m] = F_row[0] + total;
733#if defined(__AVX__)
734 }
735#endif
736 } // unroll_size = 1
737
738 // check against the reference value
739// if (false) {
740// double refvalue = FmEval_Reference2<double>::eval(T, mmax); // compute F(T) with m=mmax
741// if (abs(refvalue - Fm[mmax]) > 1e-14) {
742// std::cout << "T = " << T << " m = " << mmax << " cheb = "
743// << Fm[mmax] << " ref = " << refvalue << std::endl;
744// }
745// }
746
747 } // if T < T_crit
748
749 /*------------------------------------
750 And then do downward recursion in j
751 ------------------------------------*/
752 if (INTERPOLATION_AND_RECURSION && mmin > 0) {
753 const Real exp_mT = std::exp(-T);
754 for (int m = mmin - 1; m >= 0; --m) {
755 Fm[m] = (exp_mT + two_T * Fm[m+1]) * numbers_.twoi1[m];
756 }
757 }
758 }
759
760 private:
761 Real **grid_; /* Table of "exact" Fm(T) values. Row index corresponds to
762 values of T (max_T+1 rows), column index to values
763 of m (max_m+1 columns) */
764 Real delT_; /* The step size for T, depends on cutoff */
765 Real oodelT_; /* 1.0 / delT_, see above */
766 Real cutoff_; /* Tolerance cutoff used in all computations of Fm(T) */
767 int max_m_; /* Maximum value of m in the table, depends on cutoff
768 and the number of terms in Taylor interpolation */
769 int max_T_; /* Maximum index of T in the table, depends on cutoff
770 and m */
771 Real *T_crit_; /* Maximum T for each row, depends on cutoff;
772 for a given m and T_idx <= max_T_idx[m] use Taylor interpolation,
773 for a given m and T_idx > max_T_idx[m] use the asymptotic formula */
774
776
785 static double truncation_error(unsigned int m, double T) {
786 const double m2= m * m;
787 const double m3= m2 * m;
788 const double m4= m2 * m2;
789 const double T2= T * T;
790 const double T3= T2 * T;
791 const double T4= T2 * T2;
792 const double T5= T2 * T3;
793
794 const double result = exp(-T) * (105 + 16*m4 + 16*m3*(T - 8) - 30*T + 12*T2
795 - 8*T3 + 16*T4 + 8*m2*(43 - 9*T + 2*T2) +
796 4*m*(-88 + 23*T - 8*T2 + 4*T3))/(32*T5);
797 return result;
798 }
807 static double truncation_error(double T) {
808 const double result = exp(-T) /(2*T);
809 return result;
810 }
811 };
812
813
814 namespace detail {
815 constexpr double pow10(long exp) {
816 return (exp == 0) ? 1. : ((exp > 0) ? 10. * pow10(exp-1) : 0.1 * pow10(exp+1));
817 }
818 }
819
823
831 template<typename Real = double>
832 struct TennoGmEval {
833
834 private:
835
836 #include <libint2/tenno_cheb.h>
837
838 static_assert(std::is_same<Real,double>::value, "TennoGmEval only supports double as the real type");
839
840 static const int mmin_ = -1;
841 static constexpr Real Tmax = (1 << cheb_table_tmaxlog2);
842 static constexpr Real Umax = detail::pow10(cheb_table_umaxlog10);
843 static constexpr Real Umin = detail::pow10(cheb_table_uminlog10);
844 static constexpr std::size_t ORDERp1 = interpolation_order + 1;
845 static constexpr Real maxabsprecision = 1.4e-14;
846
847 public:
852 TennoGmEval(unsigned int mmax, Real precision = -1) :
853 mmax_(mmax), precision_(precision),
854 numbers_() {
855#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86)
856# if !defined(__AVX__) && defined(NDEBUG)
857 if (libint2::verbose()) {
858 static bool printed_performance_warning = false;
859 if (!printed_performance_warning) {
861 << "libint2::TennoGmEval on x86(-64) platforms needs AVX support for best performance"
862 << std::endl;
863 printed_performance_warning = true;
864 }
865 }
866# endif
867#endif
868
869// if (precision_ < maxabsprecision)
870// throw std::invalid_argument(
871// std::string(
872// "TennoGmEval does not support precision smaller than ") +
873// std::to_string(maxabsprecision));
874
875 if (mmax > cheb_table_mmax)
876 throw std::invalid_argument(
877 "TennoGmEval::init() : requested mmax exceeds the "
878 "hard-coded mmax");
879 init_table();
880 }
881
882 ~TennoGmEval() {
883 if (c_ != nullptr)
884 free(c_);
885 }
886
      /// Returns a shared evaluator able to compute G_m for m up to (at least) m_max;
      /// an existing instance that is too small is replaced with a larger one.
      /// NOTE(review): instance_ is read outside the mutex while being reassigned
      /// under it — confirm concurrent growing calls are serialized by callers.
      static std::shared_ptr<const TennoGmEval> instance(int m_max, double = 0) {

        assert(m_max >= 0);
        // thread-safe per C++11 standard [6.7.4]
        static auto instance_ = std::make_shared<const TennoGmEval>(m_max);

        while (instance_->max_m() < m_max) {
          static std::mutex mtx;
          std::lock_guard<std::mutex> lck(mtx);
          if (instance_->max_m() < m_max) {
            auto new_instance = std::make_shared<const TennoGmEval>(m_max);
            instance_ = new_instance;
          }
        }

        return instance_;
      }
905
      /// @return the highest order m this evaluator supports
      unsigned int max_m() const { return mmax_; }
      /// @return the precision requested at construction (note: the class also
      /// defines maxabsprecision = 1.4e-14 as its absolute-precision floor)
      Real precision() const { return precision_; }
909
      /// Computes G_m for m in [0, mmax] for the Yukawa-type kernel (per the
      /// function name — the math is shared with eval_slater except Exp=false).
      /// @param Gm output array of mmax+1 values
      /// @param one_over_rho 1/rho
      /// @param T the Boys-like argument, must be nonnegative
      /// @param mmax the highest order requested; must not exceed max_m()
      /// @param zeta the kernel exponent; U = zeta^2/(4 rho) is derived from it
      void eval_yukawa(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
        assert(mmax <= mmax_);
        assert(T >= 0);
        const auto U = 0.25 * zeta * zeta * one_over_rho;
        assert(U >= 0);
        // outside the interpolation table's (T,U) domain fall back to the
        // upward recursion; otherwise interpolate from the Chebyshev table
        if (T > Tmax || U < Umin) {
          eval_urr<false>(Gm, T, U, /* zeta_over_two_rho = */ 0, mmax);
        } else {
          interpolate_Gm<false>(Gm, T, U, /* zeta_over_two_rho = */ 0, mmax);
        }
      }
      /// Computes G_m for m in [0, mmax] for the Slater-type kernel (per the
      /// function name; uses the Exp=true variants of the core routines).
      /// @param Gm output array of mmax+1 values
      /// @param one_over_rho 1/rho
      /// @param T the Boys-like argument, must be nonnegative
      /// @param mmax the highest order requested; must not exceed max_m()
      /// @param zeta the kernel exponent; must be nonzero (U > 0 is asserted)
      void eval_slater(Real* Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const {
        assert(mmax <= mmax_);
        assert(T >= 0);
        const auto U = 0.25 * zeta * zeta * one_over_rho;
        assert(U > 0); // integral factorizes into 2 overlaps for U = 0
        // extra scaling factor applied by the Exp=true code paths
        const auto zeta_over_two_rho = 0.5 * zeta * one_over_rho;
        if (T > Tmax) {
          eval_urr<true>(Gm, T, U, zeta_over_two_rho, mmax);
        } else {
          interpolate_Gm<true>(Gm, T, U, zeta_over_two_rho, mmax);
        }
      }
954
955 private:
956
      /// Computes G_0(T,U) and, if compute_Gm1 is true, also G_{-1}(T,U),
      /// via (scaled) error functions; the degenerate limits U == 0 and
      /// T ~ 0 are handled by separate closed-form branches.
      /// @tparam compute_Gm1 whether to also evaluate G_{-1}
      /// @param T must be nonnegative
      /// @param U must be nonnegative; U == 0 requires compute_Gm1 == false
      /// @return tuple {G_0, G_{-1}} (G_{-1} is 0 when not computed)
      template <bool compute_Gm1>
      static inline std::tuple<Real,Real> eval_G0_and_maybe_Gm1(Real T, Real U) {
        assert(T >= 0);
        assert(U >= 0);
        const Real sqrtPi(1.7724538509055160272981674833411451827975494561224);

        Real G_m1 = 0;
        Real G_0 = 0;
        if (U == 0) { // \sqrt{U} G_{-1} is finite, need to handle that case separately
          assert(!compute_Gm1);
          if (T < std::numeric_limits<Real>::epsilon()) {
            // T ~ 0, U == 0
            G_0 = 1;
          }
          else {
            // T > 0, U == 0: G_0 = (sqrt(pi)/2) erf(sqrt T)/sqrt(T)
            const Real sqrt_T = sqrt(T);
            const Real sqrtPi_over_2 = sqrtPi * 0.5;
            G_0 = sqrtPi_over_2 * erf(sqrt_T) / sqrt_T;
          }
        }
        else if (T > std::numeric_limits<Real>::epsilon()) { // U > 0
          const Real sqrt_U = sqrt(U);
          const Real sqrt_T = sqrt(T);
          const Real oo_sqrt_T = 1 / sqrt_T;
          const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;
          const Real kappa = sqrt_U - sqrt_T;
          const Real lambda = sqrt_U + sqrt_T;
          const Real sqrtPi_over_4 = sqrtPi * 0.25;
          const Real pfac = sqrtPi_over_4;
          // exp(x^2 - T) * erfc(x): the exponential prefactor keeps the
          // products finite where erfc alone would underflow
          const Real erfc_k = exp(kappa * kappa - T) * erfc(kappa);
          const Real erfc_l = exp(lambda * lambda - T) * erfc(lambda);

          G_m1 = compute_Gm1 ? pfac * (erfc_k + erfc_l) * oo_sqrt_U : 0.0;
          G_0 = pfac * (erfc_k - erfc_l) * oo_sqrt_T;
        }
        else { // T = 0, U > 0
          const Real exp_U = exp(U);
          const Real sqrt_U = sqrt(U);
          if (compute_Gm1) {
            const Real sqrtPi_over_2 = sqrtPi * 0.5;
            const Real oo_sqrt_U = compute_Gm1 ? (1 / sqrt_U) : 0;

            G_m1 = exp_U * sqrtPi_over_2 * erfc(sqrt_U) * oo_sqrt_U;
          }
          G_0 = 1 - exp_U * sqrtPi * erfc(sqrt_U) * sqrt_U;
        }

        return std::make_tuple(G_0, G_m1);
      }
1010
      /// Computes Gm_vec[0..mmax] by seeding G_{-1} and G_0 from scaled
      /// complementary error functions and then applying the upward recursion
      /// G_{m+1} = ((2m+1) G_m + 2U G_{m-1} - exp(-T)) / (2T).
      /// @tparam Exp if true, each stored value is the scaled difference
      ///         (G_{m-1} - G_m) * zeta_over_two_rho instead of G_m itself
      /// @param Gm_vec output array of mmax+1 values
      /// @param T must be positive
      /// @param U must be positive
      /// @param zeta_over_two_rho scaling used only when Exp is true
      /// @param mmax the highest order to produce
      template <bool Exp>
      static void eval_urr(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho, size_t mmax) {
        assert(T > 0);
        assert(U > 0);

        const Real sqrt_U = sqrt(U);
        const Real sqrt_T = sqrt(T);
        const Real oo_sqrt_T = 1 / sqrt_T;
        const Real oo_sqrt_U = 1 / sqrt_U;
        const Real kappa = sqrt_U - sqrt_T;
        const Real lambda = sqrt_U + sqrt_T;
        const Real sqrtPi_over_4(0.44311346272637900682454187083528629569938736403060);
        const Real pfac = sqrtPi_over_4;
        // exp(x^2 - T) * erfc(x): the prefactor keeps the result finite
        // where erfc alone would underflow
        const Real erfc_k = exp(kappa*kappa - T) * erfc(kappa);
        const Real erfc_l = exp(lambda*lambda - T) * erfc(lambda);

        Real Gmm1; // will contain G[m-1]
        Real Gm; // will contain G[m]
        Real Gmp1; // will contain G[m+1]
        Gmm1 = pfac * (erfc_k + erfc_l) * oo_sqrt_U; // G_{-1}
        Gm = pfac * (erfc_k - erfc_l) * oo_sqrt_T; // G_{0}
        // `if constexpr` when compiling as C++17 or later, plain `if` otherwise
        if
#if __cplusplus >= 201703L
          constexpr
#endif
        (!Exp) {
          Gm_vec[0] = Gm;
        }
        else {
          Gm_vec[0] = (Gmm1 - Gm) * zeta_over_two_rho;
        }

        if (mmax > 0) {

          // first application of URR
          const Real oo_two_T = 0.5 / T;
          const Real two_U = 2.0 * U;
          const Real exp_mT = exp(-T);

          // two_m_minus_1 tracks (2m+1) for the current m
          for(unsigned int m=0, two_m_minus_1=1; m<mmax; ++m, two_m_minus_1+=2) {
            Gmp1 = oo_two_T * (two_m_minus_1 * Gm + two_U * Gmm1 - exp_mT);
            if
#if __cplusplus >= 201703L
              constexpr
#endif
            (!Exp) {
              Gm_vec[m + 1] = Gmp1;
            } else {
              Gm_vec[m + 1] = (Gm - Gmp1) * zeta_over_two_rho;
            }
            // shift the recursion window up by one order
            Gmm1 = Gm;
            Gm = Gmp1;
          }
        }

        return;
      }
1078
1086 template <bool Exp>
1087 void interpolate_Gm(Real* Gm_vec, Real T, Real U, Real zeta_over_two_rho, long mmax) const {
1088 assert(T >= 0);
1089 assert(U >= Umin && U <= Umax);
1090
1091 // maps x in [0,2] to [-1/2,1/2] in a linear fashion
1092 auto linear_map_02 = [](Real x) {
1093 assert(x >= 0 && x <= 2);
1094 return (x - 1) * 0.5;
1095 };
1096 // maps x in [a, 2*a] to [-1/2,1/2] in a logarithmic fashion
1097 auto log2_map = [](Real x, Real one_over_a) {
1098 return std::log2(x * one_over_a) - 0.5;
1099 };
1100 // maps x in [a, 10*a] to [-1/2,1/2] in a logarithmic fashion
1101 auto log10_map = [](Real x, Real one_over_a) {
1102 return std::log10(x * one_over_a) - 0.5;
1103 };
1104
1105 // which interval does this T fall into?
1106 const int Tint = T < 2 ? 0 : int(std::floor(std::log2(T))); assert(Tint >= 0 && Tint < 10);
1107 // precomputed 1 / 2^K
1108 constexpr Real one_over_2K[] = {1, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, .001953125};
1109 // which interval does this U fall into?
1110 const int Uint = int(std::floor(std::log10(U / Umin))); assert(Uint >= 0 && Uint < 10);
1111 // precomputed 1 / 10^(cheb_table_uminlog10 + K)
1112 constexpr Real one_over_10K[] = {1. / detail::pow10(cheb_table_uminlog10),
1113 1. / detail::pow10(cheb_table_uminlog10+1),
1114 1. / detail::pow10(cheb_table_uminlog10+2),
1115 1. / detail::pow10(cheb_table_uminlog10+3),
1116 1. / detail::pow10(cheb_table_uminlog10+4),
1117 1. / detail::pow10(cheb_table_uminlog10+5),
1118 1. / detail::pow10(cheb_table_uminlog10+6),
1119 1. / detail::pow10(cheb_table_uminlog10+7),
1120 1. / detail::pow10(cheb_table_uminlog10+8),
1121 1. / detail::pow10(cheb_table_uminlog10+9)};
1122 const Real t = Tint == 0 ? linear_map_02(T) : log2_map(T, one_over_2K[Tint]); // this ranges from -0.5 to 0.5
1123 const Real u = log10_map(U, one_over_10K[Uint]); // this ranges from -0.5 to 0.5
1124
1125 const int interval = Tint * 10 + Uint;
1126
1127#if defined(__AVX__)
1128 assert(ORDERp1 == 16);
1129 const auto t2 = t*t;
1130 const auto t3 = t2*t;
1131 const auto t4 = t2*t2;
1132 const auto t5 = t2*t3;
1133 const auto t6 = t3*t3;
1134 const auto t7 = t3*t4;
1135 const auto t8 = t4*t4;
1136 const auto t9 = t4*t5;
1137 const auto t10 = t5*t5;
1138 const auto t11 = t5*t6;
1139 const auto t12 = t6*t6;
1140 const auto t13 = t6*t7;
1141 const auto t14 = t7*t7;
1142 const auto t15 = t7*t8;
1143 const auto u2 = u*u;
1144 const auto u3 = u2*u;
1145 const auto u4 = u2*u2;
1146 const auto u5 = u2*u3;
1147 const auto u6 = u3*u3;
1148 const auto u7 = u3*u4;
1149 const auto u8 = u4*u4;
1150 const auto u9 = u4*u5;
1151 const auto u10 = u5*u5;
1152 const auto u11 = u5*u6;
1153 const auto u12 = u6*u6;
1154 const auto u13 = u6*u7;
1155 const auto u14 = u7*u7;
1156 const auto u15 = u7*u8;
1157 libint2::simd::VectorAVXDouble u0vec(1., u, u2, u3);
1158 libint2::simd::VectorAVXDouble u1vec(u4, u5, u6, u7);
1159 libint2::simd::VectorAVXDouble u2vec(u8, u9, u10, u11);
1160 libint2::simd::VectorAVXDouble u3vec(u12, u13, u14, u15);
1161#endif // AVX
1162
1163 Real Gmm1 = 0.0; // will track the previous value, only used for Exp==false
1164
1165 constexpr bool compute_Gmm10 = true;
1166 long mmin_interp;
1167 if (compute_Gmm10) {
1168 // precision of interpolation for m=-1,0 can be insufficient, just evaluate explicitly
1169 Real G0;
1170 std::tie(Exp ? G0 : Gm_vec[0], Gmm1) = eval_G0_and_maybe_Gm1<Exp>(T, U);
1171 if
1172#if __cplusplus >= 201703L
1173 constexpr
1174#endif
1175 (Exp) {
1176 Gm_vec[0] = (Gmm1 - G0) * zeta_over_two_rho;
1177 Gmm1 = G0;
1178 }
1179 mmin_interp = 1;
1180 }
1181 else
1182 mmin_interp = Exp ? -1 : 0;
1183
1184 // now compute the rest
1185 for(long m=mmin_interp; m<=mmax; ++m){
1186 const Real *c_tuint = c_ + (ORDERp1) * (ORDERp1) * (interval * (mmax_ - mmin_ + 1) + (m - mmin_)); // ptr to the interpolation data for m=mmin
1187#if defined(__AVX__)
1188 libint2::simd::VectorAVXDouble c00v, c01v, c02v, c03v, c10v, c11v, c12v, c13v,
1189 c20v, c21v, c22v, c23v, c30v, c31v, c32v, c33v,
1190 c40v, c41v, c42v, c43v, c50v, c51v, c52v, c53v,
1191 c60v, c61v, c62v, c63v, c70v, c71v, c72v, c73v;
1192 libint2::simd::VectorAVXDouble c80v, c81v, c82v, c83v, c90v, c91v, c92v, c93v,
1193 ca0v, ca1v, ca2v, ca3v, cb0v, cb1v, cb2v, cb3v,
1194 cc0v, cc1v, cc2v, cc3v, cd0v, cd1v, cd2v, cd3v,
1195 ce0v, ce1v, ce2v, ce3v, cf0v, cf1v, cf2v, cf3v;
1196 c00v.load_aligned(c_tuint); c01v.load_aligned((c_tuint+4));
1197 c02v.load_aligned(c_tuint+8); c03v.load_aligned((c_tuint+12));
1198 libint2::simd::VectorAVXDouble t0vec(1, 1, 1, 1);
1199 libint2::simd::VectorAVXDouble t0 = t0vec * (c00v * u0vec + c01v * u1vec + c02v * u2vec + c03v * u3vec);
1200 c10v.load_aligned(c_tuint +ORDERp1); c11v.load_aligned((c_tuint+4) +ORDERp1);
1201 c12v.load_aligned((c_tuint+8) +ORDERp1); c13v.load_aligned((c_tuint+12) +ORDERp1);
1202 libint2::simd::VectorAVXDouble t1vec(t, t, t, t);
1203 libint2::simd::VectorAVXDouble t1 = t1vec * (c10v * u0vec + c11v * u1vec + c12v * u2vec + c13v * u3vec);
1204 c20v.load_aligned(c_tuint +2*ORDERp1); c21v.load_aligned((c_tuint+4) +2*ORDERp1);
1205 c22v.load_aligned((c_tuint+8)+2*ORDERp1); c23v.load_aligned((c_tuint+12)+2*ORDERp1);
1206 libint2::simd::VectorAVXDouble t2vec(t2, t2, t2, t2);
1207 libint2::simd::VectorAVXDouble t2 = t2vec * (c20v * u0vec + c21v * u1vec + c22v * u2vec + c23v * u3vec);
1208 c30v.load_aligned(c_tuint +3*ORDERp1); c31v.load_aligned((c_tuint+4) +3*ORDERp1);
1209 c32v.load_aligned((c_tuint+8)+3*ORDERp1); c33v.load_aligned((c_tuint+12)+3*ORDERp1);
1210 libint2::simd::VectorAVXDouble t3vec(t3, t3, t3, t3);
1211 libint2::simd::VectorAVXDouble t3 = t3vec * (c30v * u0vec + c31v * u1vec + c32v * u2vec + c33v * u3vec);
1212 libint2::simd::VectorAVXDouble t0123 = horizontal_add(t0, t1, t2, t3);
1213
1214 c40v.load_aligned(c_tuint +4*ORDERp1); c41v.load_aligned((c_tuint+4) +4*ORDERp1);
1215 c42v.load_aligned((c_tuint+8)+4*ORDERp1); c43v.load_aligned((c_tuint+12)+4*ORDERp1);
1216 libint2::simd::VectorAVXDouble t4vec(t4, t4, t4, t4);
1217 libint2::simd::VectorAVXDouble t4 = t4vec * (c40v * u0vec + c41v * u1vec + c42v * u2vec + c43v * u3vec);
1218 c50v.load_aligned(c_tuint +5*ORDERp1); c51v.load_aligned((c_tuint+4) +5*ORDERp1);
1219 c52v.load_aligned((c_tuint+8)+5*ORDERp1); c53v.load_aligned((c_tuint+12)+5*ORDERp1);
1220 libint2::simd::VectorAVXDouble t5vec(t5, t5, t5, t5);
1221 libint2::simd::VectorAVXDouble t5 = t5vec * (c50v * u0vec + c51v * u1vec + c52v * u2vec + c53v * u3vec);
1222 c60v.load_aligned(c_tuint +6*ORDERp1); c61v.load_aligned((c_tuint+4) +6*ORDERp1);
1223 c62v.load_aligned((c_tuint+8)+6*ORDERp1); c63v.load_aligned((c_tuint+12)+6*ORDERp1);
1224 libint2::simd::VectorAVXDouble t6vec(t6, t6, t6, t6);
1225 libint2::simd::VectorAVXDouble t6 = t6vec * (c60v * u0vec + c61v * u1vec + c62v * u2vec + c63v * u3vec);
1226 c70v.load_aligned(c_tuint +7*ORDERp1); c71v.load_aligned((c_tuint+4) +7*ORDERp1);
1227 c72v.load_aligned((c_tuint+8)+7*ORDERp1); c73v.load_aligned((c_tuint+12)+7*ORDERp1);
1228 libint2::simd::VectorAVXDouble t7vec(t7, t7, t7, t7);
1229 libint2::simd::VectorAVXDouble t7 = t7vec * (c70v * u0vec + c71v * u1vec + c72v * u2vec + c73v * u3vec);
1230 libint2::simd::VectorAVXDouble t4567 = horizontal_add(t4, t5, t6, t7);
1231
1232 c80v.load_aligned(c_tuint +8*ORDERp1); c81v.load_aligned((c_tuint+4) +8*ORDERp1);
1233 c82v.load_aligned((c_tuint+8)+8*ORDERp1); c83v.load_aligned((c_tuint+12)+8*ORDERp1);
1234 libint2::simd::VectorAVXDouble t8vec(t8, t8, t8, t8);
1235 libint2::simd::VectorAVXDouble t8 = t8vec * (c80v * u0vec + c81v * u1vec + c82v * u2vec + c83v * u3vec);
1236 c90v.load_aligned(c_tuint +9*ORDERp1); c91v.load_aligned((c_tuint+4) +9*ORDERp1);
1237 c92v.load_aligned((c_tuint+8)+9*ORDERp1); c93v.load_aligned((c_tuint+12)+9*ORDERp1);
1238 libint2::simd::VectorAVXDouble t9vec(t9, t9, t9, t9);
1239 libint2::simd::VectorAVXDouble t9 = t9vec * (c90v * u0vec + c91v * u1vec + c92v * u2vec + c93v * u3vec);
1240 ca0v.load_aligned(c_tuint +10*ORDERp1); ca1v.load_aligned((c_tuint+4) +10*ORDERp1);
1241 ca2v.load_aligned((c_tuint+8)+10*ORDERp1); ca3v.load_aligned((c_tuint+12)+10*ORDERp1);
1242 libint2::simd::VectorAVXDouble tavec(t10, t10, t10, t10);
1243 libint2::simd::VectorAVXDouble ta = tavec * (ca0v * u0vec + ca1v * u1vec + ca2v * u2vec + ca3v * u3vec);
1244 cb0v.load_aligned(c_tuint +11*ORDERp1); cb1v.load_aligned((c_tuint+4) +11*ORDERp1);
1245 cb2v.load_aligned((c_tuint+8)+11*ORDERp1); cb3v.load_aligned((c_tuint+12)+11*ORDERp1);
1246 libint2::simd::VectorAVXDouble tbvec(t11, t11, t11, t11);
1247 libint2::simd::VectorAVXDouble tb = tbvec * (cb0v * u0vec + cb1v * u1vec + cb2v * u2vec + cb3v * u3vec);
1248 libint2::simd::VectorAVXDouble t89ab = horizontal_add(t8, t9, ta, tb);
1249
1250 cc0v.load_aligned(c_tuint +12*ORDERp1); cc1v.load_aligned((c_tuint+4) +12*ORDERp1);
1251 cc2v.load_aligned((c_tuint+8)+12*ORDERp1); cc3v.load_aligned((c_tuint+12)+12*ORDERp1);
1252 libint2::simd::VectorAVXDouble tcvec(t12, t12, t12, t12);
1253 libint2::simd::VectorAVXDouble tc = tcvec * (cc0v * u0vec + cc1v * u1vec + cc2v * u2vec + cc3v * u3vec);
1254 cd0v.load_aligned(c_tuint +13*ORDERp1); cd1v.load_aligned((c_tuint+4) +13*ORDERp1);
1255 cd2v.load_aligned((c_tuint+8)+13*ORDERp1); cd3v.load_aligned((c_tuint+12)+13*ORDERp1);
1256 libint2::simd::VectorAVXDouble tdvec(t13, t13, t13, t13);
1257 libint2::simd::VectorAVXDouble td = tdvec * (cd0v * u0vec + cd1v * u1vec + cd2v * u2vec + cd3v * u3vec);
1258 ce0v.load_aligned(c_tuint +14*ORDERp1); ce1v.load_aligned((c_tuint+4) +14*ORDERp1);
1259 ce2v.load_aligned((c_tuint+8)+14*ORDERp1); ce3v.load_aligned((c_tuint+12)+14*ORDERp1);
1260 libint2::simd::VectorAVXDouble tevec(t14, t14, t14, t14);
1261 libint2::simd::VectorAVXDouble te = tevec * (ce0v * u0vec + ce1v * u1vec + ce2v * u2vec + ce3v * u3vec);
1262 cf0v.load_aligned(c_tuint +15*ORDERp1); cf1v.load_aligned((c_tuint+4)+15*ORDERp1);
1263 cf2v.load_aligned((c_tuint+8)+15*ORDERp1); cf3v.load_aligned((c_tuint+4)+15*ORDERp1);
1264 libint2::simd::VectorAVXDouble tfvec(t15, t15, t15, t15);
1265 libint2::simd::VectorAVXDouble tf = tfvec * (cf0v * u0vec + cf1v * u1vec + cf2v * u2vec + cf3v * u3vec);
1266 libint2::simd::VectorAVXDouble tcdef = horizontal_add(tc, td, te, tf);
1267
1268 auto tall = horizontal_add(t0123, t4567, t89ab, tcdef);
1269 const auto Gm = horizontal_add(tall);
1270#else // AVX
1271 // no AVX, use explicit loops (probably slow)
1272 double uvec[16];
1273 double tvec[16];
1274 uvec[0] = 1;
1275 tvec[0] = 1;
1276 for(int i=1; i!=16; ++i) {
1277 uvec[i] = uvec[i-1] * u;
1278 tvec[i] = tvec[i-1] * t;
1279 }
1280 double Gm = 0.0;
1281 for(int i=0, ij=0; i!=16; ++i) {
1282 for (int j = 0; j != 16; ++j, ++ij) {
1283 Gm += c_tuint[ij] * tvec[i] * uvec[j];
1284 }
1285 }
1286#endif // AVX
1287
1288 if
1289#if __cplusplus >= 201703L
1290 constexpr
1291#endif
1292 (!Exp) {
1293 Gm_vec[m] = Gm;
1294 }
1295 else {
1296 if (m != -1) {
1297 Gm_vec[m] = (Gmm1 - Gm) * zeta_over_two_rho;
1298 }
1299 Gmm1 = Gm;
1300 }
1301 }
1302
1303 return;
1304 }
1305
  private:
    unsigned int mmax_;              // largest m supported by the interpolation table
    Real precision_;                 // requested precision (stored at construction)
    ExpensiveNumbers<Real> numbers_; // precomputed factorials/binomials etc.
    Real* c_ = nullptr;              // aligned Chebyshev coefficient table, allocated/filled by init_table()
1311
    /// Allocates the interpolation table c_ (aligned to at least 32 bytes so
    /// that AVX aligned loads in interpolate_Gm() are valid) and fills it
    /// from the static cheb_table.
    /// \throw std::logic_error if posix_memalign rejects the alignment
    /// \throw std::bad_alloc if memory cannot be allocated
    void init_table() {

      // get memory
      void* result;
      int status = posix_memalign(&result, std::max(sizeof(Real), 32ul), (mmax_ - mmin_ + 1) * cheb_table_nintervals * ORDERp1 * ORDERp1 * sizeof(Real));
      if (status != 0) {
        if (status == EINVAL)
          throw std::logic_error(
              "TennoGmEval::init() : posix_memalign failed, alignment must be a power of 2 at least as large as sizeof(void *)");
        if (status == ENOMEM)
          throw std::bad_alloc();
        abort();  // should be unreachable
      }
      c_ = static_cast<Real*>(result);

      // copy contents of static table into c
      // need all intervals
      for (std::size_t iv = 0; iv < cheb_table_nintervals; ++iv) {
        // but only values of m up to mmax (the static table holds more)
        std::copy(cheb_table[iv], cheb_table[iv]+((mmax_-mmin_)+1)*ORDERp1*ORDERp1, c_+(iv*((mmax_-mmin_)+1))*ORDERp1*ORDERp1);
      }
    }
1334
1335
1336 }; // TennoGmEval
1337
// out-of-class definition of the static interpolation-coefficient table
#if LIBINT2_CONSTEXPR_STATICS
  template <typename Real>
  constexpr
  double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
#else
  // clang needs an explicit specialization declaration to avoid warning
# ifdef __clang__
  template <typename Real>
  double TennoGmEval<Real>::cheb_table[TennoGmEval<Real>::cheb_table_nintervals][(TennoGmEval<Real>::cheb_table_mmax+2)*(TennoGmEval<Real>::interpolation_order+1)*(TennoGmEval<Real>::interpolation_order+1)];
# endif
#endif
1349
1350 template<typename Real, int k>
1352
1353 namespace detail {
1354
    /// some evaluators need thread-local scratch, but most don't; this empty
    /// primary template serves the latter (specializations add real storage)
    template <typename CoreEval> struct CoreEvalScratch {
      CoreEvalScratch(const CoreEvalScratch&) = default;
      CoreEvalScratch(CoreEvalScratch&&) = default;
      explicit CoreEvalScratch(int) { }  // mmax is irrelevant when there is no storage
    };
1362 template <typename Real>
1364 std::vector<Real> Fm_;
1365 std::vector<Real> g_i;
1366 std::vector<Real> r_i;
1367 std::vector<Real> oorhog_i;
1368 CoreEvalScratch(const CoreEvalScratch&) = default;
1369 CoreEvalScratch(CoreEvalScratch&&) = default;
1370 explicit CoreEvalScratch(int mmax) {
1371 init(mmax);
1372 }
1373 private:
1374 void init(int mmax) {
1375 Fm_.resize(mmax+1);
1376 g_i.resize(mmax+1);
1377 r_i.resize(mmax+1);
1378 oorhog_i.resize(mmax+1);
1379 g_i[0] = 1.0;
1380 r_i[0] = 1.0;
1381 }
1382 };
1383 } // namespace libint2::detail
1384
1388
  /// Evaluates core integrals G_m for operators of the form
  /// f(r12) = r12^k * sum_i c_i exp(-a_i r12^2), with k = -1, 0, or 2.
  template<typename Real, int k>
  struct GaussianGmEval : private detail::CoreEvalScratch<GaussianGmEval<Real,k>> // N.B. empty-base optimization
  {
#ifndef LIBINT_USER_DEFINED_REAL
    using FmEvalType = libint2::FmEval_Chebyshev7<double>;
#else
    using FmEvalType = libint2::FmEval_Reference<scalar_type>;
#endif

    /// \param mmax the maximum value of m for which G_m will be computed
    /// \param precision the desired absolute precision
    GaussianGmEval(int mmax, Real precision) :
        detail::CoreEvalScratch<GaussianGmEval<Real, k>>(mmax), mmax_(mmax),
        precision_(precision), fm_eval_(),
        numbers_(-1,-1,mmax) {
      assert(k == -1 || k == 0 || k == 2);
      // for k=-1 need to evaluate the Boys function
      if (k == -1) {
        fm_eval_ = FmEvalType::instance(mmax_, precision_);
      }
    }

    ~GaussianGmEval() {
    }

    /// Singleton interface; replaces the lone instance (in a thread-safe way)
    /// if the requested mmax/precision exceed the current instance's.
    static std::shared_ptr<GaussianGmEval> instance(unsigned int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
      assert(mmax >= 0);
      assert(precision >= 0);
      // thread-safe per C++11 standard [6.7.4]
      static auto instance_ = std::make_shared<GaussianGmEval>(mmax, precision);

      while (instance_->max_m() < mmax || instance_->precision() > precision) {
        static std::mutex mtx;
        std::lock_guard<std::mutex> lck(mtx);
        if (instance_->max_m() < mmax || instance_->precision() > precision) {
          auto new_instance = std::make_shared<GaussianGmEval>(mmax, precision);
          instance_ = new_instance;
        }
      }

      return instance_;
    }

    /// @return the maximum value of m for which G_m can be computed
    int max_m() const { return mmax_; }
    /// @return the absolute precision requested at construction
    Real precision() const { return precision_; }

    /// Accumulates G_m(rho,T), m in [0,mmax], into Gm for the contracted
    /// Gaussian geminal described by \p geminal (list of {exponent, coefficient}).
    /// \param[out] Gm array of at least mmax+1 values; contributions are ADDED
    ///             on top of the zero-initialized array
    /// \param[in] scr optional thread-local scratch (a CoreEvalScratch
    ///            matching k=-1); if null, this object's own scratch is used
    ///            (then concurrent calls on the same object are NOT safe)
    template <typename AnyReal>
    void eval(Real* Gm, Real rho, Real T, size_t mmax,
              const std::vector<std::pair<AnyReal, AnyReal> >& geminal,
              void* scr = 0) {

      std::fill(Gm, Gm+mmax+1, Real(0));

      const auto sqrt_rho = sqrt(rho);
      const auto oo_sqrt_rho = 1/sqrt_rho;
      if (k == -1) {
        void* _scr = (scr == 0) ? this : scr;
        // the scratch is the k=-1 specialization of CoreEvalScratch
        auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));
        // precompute rho^i, i in [0, mmax]
        for(int i=1; i<=mmax; i++) {
          scratch.r_i[i] = scratch.r_i[i-1] * rho;
        }
      }

      // one pass per geminal primitive; each adds its contribution to Gm
      typedef typename std::vector<std::pair<AnyReal, AnyReal> >::const_iterator citer;
      const citer gend = geminal.end();
      for(citer i=geminal.begin(); i!= gend; ++i) {

        const auto gamma = i->first;
        const auto gcoef = i->second;
        const auto rhog = rho + gamma;
        const auto oorhog = 1/rhog;

        const auto gorg = gamma * oorhog;
        const auto rorg = rho * oorhog;
        const auto sqrt_rho_org = sqrt_rho * oorhog;
        const auto sqrt_rhog = sqrt(rhog);
        const auto sqrt_rorg = sqrt_rho_org * sqrt_rhog;

        // (ss|exp(-gamma r12^2)|ss) overlap-like prefactor
        constexpr Real const_SQRTPI_2(0.88622692545275801364908374167057259139877472806119); /* sqrt(pi)/2 */
        const auto SS_K0G12_SS = gcoef * oo_sqrt_rho * const_SQRTPI_2 * rorg * sqrt_rorg * exp(-gorg*T);

        if (k == -1) {
          void* _scr = (scr == 0) ? this : scr;
          auto& scratch = *(reinterpret_cast<detail::CoreEvalScratch<GaussianGmEval<Real, -1>>*>(_scr));

          const auto rorgT = rorg * T;
          fm_eval_->eval(&scratch.Fm_[0], rorgT, mmax);

#if 1
          // binomial expansion of (rho + gamma)^m in terms of Boys values
          constexpr Real const_2_SQRTPI(1.12837916709551257389615890312154517); /* 2/sqrt(pi) */
          const auto pfac = const_2_SQRTPI * sqrt_rhog * SS_K0G12_SS;
          scratch.oorhog_i[0] = pfac;
          for(int i=1; i<=mmax; i++) {
            scratch.g_i[i] = scratch.g_i[i-1] * gamma;
            scratch.oorhog_i[i] = scratch.oorhog_i[i-1] * oorhog;
          }
          for(int m=0; m<=mmax; m++) {
            Real ssss = 0.0;
            Real* bcm = numbers_.bc[m];  // binomial coefficients C(m,n)
            for(int n=0; n<=m; n++) {
              ssss += bcm[n] * scratch.r_i[n] * scratch.g_i[m-n] * scratch.Fm_[n];
            }
            Gm[m] += ssss * scratch.oorhog_i[m];
          }
#endif
        }

        if (k == 0) {
          // G_m = (gamma/(rho+gamma))^m * G_0
          auto ss_oper_ss_m = SS_K0G12_SS;
          Gm[0] += ss_oper_ss_m;
          for(int m=1; m<=mmax; ++m) {
            ss_oper_ss_m *= gorg;
            Gm[m] += ss_oper_ss_m;
          }
        }

        if (k == 2) {

          // (ss|r12^2 exp(-gamma r12^2)|ss), two-term recurrence in m
          const auto rorgT = rorg * T;
          const auto SS_K2G12_SS_0 = (1.5 + rorgT) * (SS_K0G12_SS * oorhog);
          const auto SS_K2G12_SS_m1 = rorg * (SS_K0G12_SS * oorhog);

          auto SS_K2G12_SS_gorg_m = SS_K2G12_SS_0 ;
          auto SS_K2G12_SS_gorg_m1 = SS_K2G12_SS_m1;
          Gm[0] += SS_K2G12_SS_gorg_m;
          for(int m=1; m<=mmax; ++m) {
            SS_K2G12_SS_gorg_m *= gorg;
            Gm[m] += SS_K2G12_SS_gorg_m - m * SS_K2G12_SS_gorg_m1;
            SS_K2G12_SS_gorg_m1 *= gorg;
          }
        }

      }

    }

  private:
    int mmax_;
    Real precision_; //< absolute precision
    std::shared_ptr<const FmEvalType> fm_eval_;  // Boys engine, used only for k=-1

    ExpensiveNumbers<Real> numbers_;
  };
1574
  /// Adapter that wraps a Gm-engine functor (e.g. the evaluators in
  /// os_core_ints below) into the standard Gm-evaluator interface
  /// (instance()/eval()/max_m()/precision()).
  template <typename GmEvalFunction>
  struct GenericGmEval : private GmEvalFunction {

    typedef typename GmEvalFunction::value_type Real;

    GenericGmEval(int mmax, Real precision) : GmEvalFunction(mmax, precision),
        mmax_(mmax), precision_(precision) {}

    static std::shared_ptr<const GenericGmEval> instance(int mmax, Real precision = std::numeric_limits<Real>::epsilon()) {
      return std::make_shared<const GenericGmEval>(mmax, precision);
    }

    /// computes Gm[0 .. mmax]; extra arguments are forwarded to the wrapped functor
    /// \note a copy of the (possibly stateful) functor is made per call so that eval() can be const
    template <typename Real, typename... ExtraArgs>
    void eval(Real* Gm, Real rho, Real T, int mmax, ExtraArgs... args) const {
      assert(mmax <= mmax_);
      (GmEvalFunction(*this))(Gm, rho, T, mmax, std::forward<ExtraArgs>(args)...);
    }

    /// @return the maximum value of m for which Gm can be computed
    int max_m() const { return mmax_; }
    /// @return the precision requested at construction
    Real precision() const { return precision_; }

  private:
    int mmax_;
    Real precision_;
  };
1602
  // these Gm engines need extra scratch data (see the CoreEvalScratch
  // specializations in namespace detail below)
  namespace os_core_ints {
  template <typename Real, int K> struct r12_xx_K_gm_eval;
  template <typename Real> struct erfc_coulomb_gm_eval;
  }
1608
  namespace detail {
  /// r12_xx_K_gm_eval<Real,1> consumes Boys values F_0 .. F_{mmax+1}
  template <typename Real>
  struct CoreEvalScratch<os_core_ints::r12_xx_K_gm_eval<Real, 1>> {
    std::vector<Real> Fm_;
    CoreEvalScratch(const CoreEvalScratch&) = default;
    CoreEvalScratch(CoreEvalScratch&&) = default;
    // need to store Fm(T) for m = 0 .. mmax+1
    explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+2); }
  };
  /// erfc_coulomb_gm_eval consumes Boys values F_0 .. F_mmax
  template <typename Real>
  struct CoreEvalScratch<os_core_ints::erfc_coulomb_gm_eval<Real>> {
    std::vector<Real> Fm_;
    CoreEvalScratch(const CoreEvalScratch&) = default;
    CoreEvalScratch(CoreEvalScratch&&) = default;
    // need to store Fm(T) for m = 0 .. mmax
    explicit CoreEvalScratch(int mmax) { Fm_.resize(mmax+1); }
  };
  }
1629
1631 namespace os_core_ints {
1632
1634 template <typename Real>
1635 struct delta_gm_eval {
1636 typedef Real value_type;
1637
1638 delta_gm_eval(unsigned int, Real) {}
1639 void operator()(Real* Gm, Real rho, Real T, int mmax) const {
1640 constexpr static auto one_over_two_pi = 1.0 / (2.0 * M_PI);
1641 const auto G0 = exp(-T) * rho * one_over_two_pi;
1642 std::fill(Gm, Gm + mmax + 1, G0);
1643 }
1644 };
1645
1650
1651 template <typename Real, int K>
1652 struct r12_xx_K_gm_eval;
1653
1654 template <typename Real>
1655 struct r12_xx_K_gm_eval<Real, 1>
1656 : private detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> {
1657 typedef detail::CoreEvalScratch<r12_xx_K_gm_eval<Real, 1>> base_type;
1658 typedef Real value_type;
1659
1660#ifndef LIBINT_USER_DEFINED_REAL
1661 using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1662#else
1663 using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1664#endif
1665
1666 r12_xx_K_gm_eval(unsigned int mmax, Real precision)
1667 : base_type(mmax) {
1668 fm_eval_ = FmEvalType::instance(mmax + 1, precision);
1669 }
1670 void operator()(Real* Gm, Real rho, Real T, int mmax) {
1671 fm_eval_->eval(&base_type::Fm_[0], T, mmax + 1);
1672 auto T_plus_m_plus_one = T + 1.0;
1673 Gm[0] = T_plus_m_plus_one * base_type::Fm_[0] - T * base_type::Fm_[1];
1674 auto minus_m = -1.0;
1675 T_plus_m_plus_one += 1.0;
1676 for (auto m = 1; m <= mmax;
1677 ++m, minus_m -= 1.0, T_plus_m_plus_one += 1.0) {
1678 Gm[m] =
1679 minus_m * base_type::Fm_[m - 1] + T_plus_m_plus_one * base_type::Fm_[m] - T * base_type::Fm_[m + 1];
1680 }
1681 }
1682
1683 private:
1684 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1685 };
1686
1688 template <typename Real>
1689 struct erf_coulomb_gm_eval {
1690 typedef Real value_type;
1691
1692#ifndef LIBINT_USER_DEFINED_REAL
1693 using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1694#else
1695 using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1696#endif
1697
1698 erf_coulomb_gm_eval(unsigned int mmax, Real precision) {
1699 fm_eval_ = FmEvalType::instance(mmax, precision);
1700 }
1701 void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) const {
1702 if (omega > 0) {
1703 auto omega2 = omega * omega;
1704 auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1705 fm_eval_->eval(Gm, T * omega2_over_omega2_plus_rho,
1706 mmax);
1707
1708 using std::sqrt;
1709 auto ooversqrto2prho_exp_2mplus1 =
1710 sqrt(omega2_over_omega2_plus_rho);
1711 for (auto m = 0; m <= mmax;
1712 ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1713 Gm[m] *= ooversqrto2prho_exp_2mplus1;
1714 }
1715 }
1716 else {
1717 std::fill(Gm, Gm+mmax+1, Real{0});
1718 }
1719 }
1720
1721 private:
1722 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1723 };
1724
1728 template <typename Real>
1729 struct erfc_coulomb_gm_eval : private
1730 detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> {
1731 typedef detail::CoreEvalScratch<erfc_coulomb_gm_eval<Real>> base_type;
1732 typedef Real value_type;
1733
1734 #ifndef LIBINT_USER_DEFINED_REAL
1735 using FmEvalType = libint2::FmEval_Chebyshev7<double>;
1736#else
1737 using FmEvalType = libint2::FmEval_Reference<scalar_type>;
1738#endif
1739
1740 erfc_coulomb_gm_eval(unsigned int mmax, Real precision)
1741 : base_type(mmax) {
1742 fm_eval_ = FmEvalType::instance(mmax, precision);
1743 }
1744 void operator()(Real* Gm, Real rho, Real T, int mmax, Real omega) {
1745 fm_eval_->eval(&base_type::Fm_[0], T, mmax);
1746 std::copy(base_type::Fm_.cbegin(), base_type::Fm_.cbegin() + mmax + 1, Gm);
1747 if (omega > 0) {
1748 auto omega2 = omega * omega;
1749 auto omega2_over_omega2_plus_rho = omega2 / (omega2 + rho);
1750 fm_eval_->eval(&base_type::Fm_[0], T * omega2_over_omega2_plus_rho,
1751 mmax);
1752
1753 using std::sqrt;
1754 auto ooversqrto2prho_exp_2mplus1 =
1755 sqrt(omega2_over_omega2_plus_rho);
1756 for (auto m = 0; m <= mmax;
1757 ++m, ooversqrto2prho_exp_2mplus1 *= omega2_over_omega2_plus_rho) {
1758 Gm[m] -= ooversqrto2prho_exp_2mplus1 * base_type::Fm_[m];
1759 }
1760 }
1761 }
1762
1763 private:
1764 std::shared_ptr<const FmEvalType> fm_eval_; // need for odd K
1765 };
1766
1767 } // namespace os_core_ints
1768
1769 /*
1770 * Slater geminal fitting is available only if have LAPACK
1771 */
1772#if HAVE_LAPACK
1773 /*
1774 f[x_] := - Exp[-\[Zeta] x] / \[Zeta];
1775
1776 ff[cc_, aa_, x_] := Sum[cc[[i]]*Exp[-aa[[i]] x^2], {i, 1, n}];
1777 */
1778 template <typename Real>
1779 Real
1780 fstg(Real zeta,
1781 Real x) {
1782 return -std::exp(-zeta*x)/zeta;
1783 }
1784
1785 template <typename Real>
1786 Real
1787 fngtg(const std::vector<Real>& cc,
1788 const std::vector<Real>& aa,
1789 Real x) {
1790 Real value = 0.0;
1791 const Real x2 = x * x;
1792 const unsigned int n = cc.size();
1793 for(unsigned int i=0; i<n; ++i)
1794 value += cc[i] * std::exp(- aa[i] * x2);
1795 return value;
1796 }
1797
1798 // --- weighting functions ---
1799 // L2 error is weighted by ww(x)
1800 // hence error is weighted by sqrt(ww(x))
1801 template <typename Real>
1802 Real
1803 wwtewklopper(Real x) {
1804 const Real x2 = x * x;
1805 return x2 * std::exp(-2 * x2);
1806 }
1807 template <typename Real>
1808 Real
1809 wwcusp(Real x) {
1810 const Real x2 = x * x;
1811 const Real x6 = x2 * x2 * x2;
1812 return std::exp(-0.005 * x6);
1813 }
  // the weight actually used by the fit; NOTE: despite the Tew-Klopper weight
  // being described as the default above, the cusp weight is what is returned
  template <typename Real>
  Real
  ww(Real x) {
    //return wwtewklopper(x);
    return wwcusp(x);
  }
1821
1822 template <typename Real>
1823 Real
1824 norm(const std::vector<Real>& vec) {
1825 Real value = 0.0;
1826 const unsigned int n = vec.size();
1827 for(unsigned int i=0; i<n; ++i)
1828 value += vec[i] * vec[i];
1829 return value;
1830 }
1831
  /// Solves the damped (Levenberg-Marquardt) linear system:
  /// scales the diagonal of the n x n matrix A by (1 + lambda), then solves
  /// A' x = b via LAPACK dgesv. A is stored densely in row-major order
  /// (A is symmetric here, so the F77 column-major convention is immaterial).
  template <typename Real>
  void LinearSolveDamped(const std::vector<Real>& A,
                         const std::vector<Real>& b,
                         Real lambda,
                         std::vector<Real>& x) {
    const size_t n = b.size();
    std::vector<Real> Acopy(A);  // dgesv overwrites its input with the LU factors
    for(size_t m=0; m<n; ++m) Acopy[m*n + m] *= (1 + lambda);
    std::vector<Real> e(b);      // dgesv overwrites the rhs with the solution

    //int info = LAPACKE_dgesv( LAPACK_ROW_MAJOR, n, 1, &Acopy[0], n, &ipiv[0], &e[0], n );
    {
      std::vector<int> ipiv(n);
      int n = b.size();  // deliberately shadows the outer size_t n: F77 interface needs int
      int one = 1;
      int info;
      dgesv_(&n, &one, &Acopy[0], &n, &ipiv[0], &e[0], &n, &info);
      assert (info == 0);  // info != 0 means singular matrix or bad argument
    }

    x = e;
  }
1854
1865 template <typename Real>
1866 void stg_ng_fit(unsigned int n,
1867 Real zeta,
1868 std::vector< std::pair<Real, Real> >& geminal,
1869 Real xmin = 0.0,
1870 Real xmax = 10.0,
1871 unsigned int npts = 1001) {
1872
1873 // initial guess
1874 std::vector<Real> cc(n, 1.0); // coefficients
1875 std::vector<Real> aa(n); // exponents
1876 for(unsigned int i=0; i<n; ++i)
1877 aa[i] = std::pow(3.0, (i + 2 - (n + 1)/2.0));
1878
1879 // first rescale cc for ff[x] to match the norm of f[x]
1880 Real ffnormfac = 0.0;
1881 using std::sqrt;
1882 for(unsigned int i=0; i<n; ++i)
1883 for(unsigned int j=0; j<n; ++j)
1884 ffnormfac += cc[i] * cc[j]/sqrt(aa[i] + aa[j]);
1885 const Real Nf = sqrt(2.0 * zeta) * zeta;
1886 const Real Nff = sqrt(2.0) / (sqrt(ffnormfac) *
1887 sqrt(sqrt(M_PI)));
1888 for(unsigned int i=0; i<n; ++i) cc[i] *= -Nff/Nf;
1889
1890 Real lambda0 = 1000; // damping factor is initially set to 1000, eventually should end up at 0
1891 const Real nu = 3.0; // increase/decrease the damping factor scale it by this
1892 const Real epsilon = 1e-15; // convergence
1893 const unsigned int maxniter = 200;
1894
1895 // grid points on which we will fit
1896 std::vector<Real> xi(npts);
1897 for(unsigned int i=0; i<npts; ++i) xi[i] = xmin + (xmax - xmin)*i/(npts - 1);
1898
1899 std::vector<Real> err(npts);
1900
1901 const size_t nparams = 2*n; // params = expansion coefficients + gaussian exponents
1902 std::vector<Real> J( npts * nparams );
1903 std::vector<Real> delta(nparams);
1904
1905// std::cout << "iteration 0" << std::endl;
1906// for(unsigned int i=0; i<n; ++i)
1907// std::cout << cc[i] << " " << aa[i] << std::endl;
1908
1909 Real errnormI;
1910 Real errnormIm1 = 1e3;
1911 bool converged = false;
1912 unsigned int iter = 0;
1913 while (!converged && iter < maxniter) {
1914// std::cout << "Iteration " << ++iter << ": lambda = " << lambda0/nu << std::endl;
1915
1916 for(unsigned int i=0; i<npts; ++i) {
1917 const Real x = xi[i];
1918 err[i] = (fstg(zeta, x) - fngtg(cc, aa, x)) * sqrt(ww(x));
1919 }
1920 errnormI = norm(err)/sqrt((Real)npts);
1921
1922// std::cout << "|err|=" << errnormI << std::endl;
1923 converged = std::abs((errnormI - errnormIm1)/errnormIm1) <= epsilon;
1924 if (converged) break;
1925 errnormIm1 = errnormI;
1926
1927 for(unsigned int i=0; i<npts; ++i) {
1928 const Real x2 = xi[i] * xi[i];
1929 const Real sqrt_ww_x = sqrt(ww(xi[i]));
1930 const unsigned int ioffset = i * nparams;
1931 for(unsigned int j=0; j<n; ++j)
1932 J[ioffset+j] = (std::exp(-aa[j] * x2)) * sqrt_ww_x;
1933 const unsigned int ioffsetn = ioffset+n;
1934 for(unsigned int j=0; j<n; ++j)
1935 J[ioffsetn+j] = - sqrt_ww_x * x2 * cc[j] * std::exp(-aa[j] * x2);
1936 }
1937
1938 std::vector<Real> A( nparams * nparams);
1939 for(size_t r=0, rc=0; r<nparams; ++r) {
1940 for(size_t c=0; c<nparams; ++c, ++rc) {
1941 double Arc = 0.0;
1942 for(size_t i=0, ir=r, ic=c; i<npts; ++i, ir+=nparams, ic+=nparams)
1943 Arc += J[ir] * J[ic];
1944 A[rc] = Arc;
1945 }
1946 }
1947
1948 std::vector<Real> b( nparams );
1949 for(size_t r=0; r<nparams; ++r) {
1950 Real br = 0.0;
1951 for(size_t i=0, ir=r; i<npts; ++i, ir+=nparams)
1952 br += J[ir] * err[i];
1953 b[r] = br;
1954 }
1955
1956 // try decreasing damping first
1957 // if not successful try increasing damping until it results in a decrease in the error
1958 lambda0 /= nu;
1959 for(int l=-1; l<1000; ++l) {
1960
1961 LinearSolveDamped(A, b, lambda0, delta );
1962
1963 std::vector<double> cc_0(cc); for(unsigned int i=0; i<n; ++i) cc_0[i] += delta[i];
1964 std::vector<double> aa_0(aa); for(unsigned int i=0; i<n; ++i) aa_0[i] += delta[i+n];
1965
1966 // if any of the exponents are negative the step is too large and need to increase damping
1967 bool step_too_large = false;
1968 for(unsigned int i=0; i<n; ++i)
1969 if (aa_0[i] < 0.0) {
1970 step_too_large = true;
1971 break;
1972 }
1973 if (!step_too_large) {
1974 std::vector<double> err_0(npts);
1975 for(unsigned int i=0; i<npts; ++i) {
1976 const double x = xi[i];
1977 err_0[i] = (fstg(zeta, x) - fngtg(cc_0, aa_0, x)) * sqrt(ww(x));
1978 }
1979 const double errnorm_0 = norm(err_0)/sqrt((double)npts);
1980 if (errnorm_0 < errnormI) {
1981 cc = cc_0;
1982 aa = aa_0;
1983 break;
1984 }
1985 else // step lead to increase of the error -- try dampening a bit more
1986 lambda0 *= nu;
1987 }
1988 else // too large of a step
1989 lambda0 *= nu;
1990 } // done adjusting the damping factor
1991
1992 } // end of iterative minimization
1993
1994 // if reached max # of iterations throw if the error is too terrible
1995 assert(not (iter == maxniter && errnormI > 1e-10));
1996
1997 for(unsigned int i=0; i<n; ++i)
1998 geminal[i] = std::make_pair(aa[i], cc[i]);
1999 }
2000#endif
2001
2002} // end of namespace libint2
2003
2004#endif // C++ only
2005#endif // header guard
holds tables of expensive quantities
Definition: boys.h:60
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition: boys.h:245
static std::shared_ptr< const FmEval_Chebyshev7 > instance(int m_max, double=0.0)
Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe...
Definition: boys.h:298
void eval(Real *Fm, Real x, int m_max) const
fills in Fm with computed Boys function values for m in [0,mmax]
Definition: boys.h:323
FmEval_Chebyshev7(int m_max, double precision=std::numeric_limits< double >::epsilon())
Definition: boys.h:267
int max_m() const
Definition: boys.h:317
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition: boys.h:479
static std::shared_ptr< const FmEval_Taylor > instance(unsigned int mmax, Real precision=std::numeric_limits< Real >::epsilon())
Singleton interface allows to manage the lone instance; adjusts max m and precision values as needed ...
Definition: boys.h:595
void eval(Real *Fm, Real T, int mmax) const
computes Boys function values with m index in range [0,mmax]
Definition: boys.h:623
int max_m() const
Definition: boys.h:614
Real precision() const
Definition: boys.h:616
FmEval_Taylor(unsigned int mmax, Real precision=std::numeric_limits< Real >::epsilon())
Constructs the object to be able to compute the Boys function for m in [0,mmax], with relative precision.
Definition: boys.h:488
double horizontal_add(VectorSSEDouble const &a)
Horizontal add.
Definition: vector_x86.h:232
Default definitions for various parameters assumed by Libint.
Definition: algebra.cc:24
std::ostream & verbose_stream()
Accessor for the diagnostics stream.
Definition: initialize.h:100
bool verbose()
Accessor for the verbose flag.
Definition: initialize.h:106
Computes the Boys function, $ F_m (T) = \int_0^1 u^{2m} \exp(-T u^2) \, {\rm d}u $,...
Definition: boys.h:205
static void eval(Real *Fm, Real t, size_t mmax)
fills up an array of Fm(T) for m in [0,mmax]
Definition: boys.h:219
Computes the Boys function, , using single algorithm (asymptotic expansion).
Definition: boys.h:142
static void eval(Real *Fm, Real T, size_t mmax)
fills up an array of Fm(T) for m in [0,mmax]
Definition: boys.h:182
static Real eval(Real T, size_t m)
computes a single value of using MacLaurin series to full precision of Real
Definition: boys.h:153
Definition: boys.h:1351
core integral for Yukawa and exponential interactions
Definition: boys.h:832
void eval_slater(Real *Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const
Definition: boys.h:942
static std::shared_ptr< const TennoGmEval > instance(int m_max, double=0)
Singleton interface allows to manage the lone instance; adjusts max m values as needed in thread-safe...
Definition: boys.h:888
Real precision() const
Definition: boys.h:908
void eval_yukawa(Real *Gm, Real one_over_rho, Real T, size_t mmax, Real zeta) const
Definition: boys.h:920
TennoGmEval(unsigned int mmax, Real precision=-1)
Definition: boys.h:852
some evaluators need thread-local scratch, but most don't
Definition: boys.h:1356
SIMD vector of 4 double-precision floating-point real numbers, operations on which use AVX instructio...
Definition: vector_x86.h:639
void convert(T *a) const
writes this to a
Definition: vector_x86.h:729
void load(T const *a)
loads a to this
Definition: vector_x86.h:720
void load_aligned(T const *a)
loads a to this
Definition: vector_x86.h:725
SIMD vector of 2 double-precision floating-point real numbers, operations on which use SSE2 instructi...
Definition: vector_x86.h:43
void convert(T *a) const
writes this to a
Definition: vector_x86.h:133