diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt index b115d6d4cd2..a64c7eb2159 100644 --- a/source/source_hsolver/CMakeLists.txt +++ b/source/source_hsolver/CMakeLists.txt @@ -4,6 +4,7 @@ list(APPEND objects diago_david.cpp diago_dav_subspace.cpp diago_bpcg.cpp + diago_ppcg.cpp para_linear_transform.cpp hsolver_pw.cpp hsolver_lcaopw.cpp diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp new file mode 100644 index 00000000000..84d97b86789 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.cpp @@ -0,0 +1,797 @@ +#include "source_hsolver/diago_ppcg.h" + +#include "ATen/kernels/lapack.h" +#include "diago_iter_assist.h" +#include "source_base/global_variable.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/parallel_comm.h" // POOL_WORLD/BP_WORLD +#include "source_base/parallel_reduce.h" +#include "source_hsolver/kernels/bpcg_kernel_op.h" // reuse normalize_op / apply_eigenvalues_op / precondition_op + +#ifdef __MPI +#include +#endif + +namespace hsolver { + +template +DiagoPPCG::DiagoPPCG(const Real* precondition) +{ + this->device_type = ct::DeviceTypeToEnum::value; + this->ctx = {}; // default device context + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + + this->h_prec = std::move(ct::TensorMap((void*)precondition, this->r_type, this->device_type, {this->n_basis})); +} + +template +DiagoPPCG::~DiagoPPCG() = default; + +template +void DiagoPPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) +{ + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; + + this->prec = ct::Tensor(this->r_type, this->device_type, {this->n_basis}); + + this->HX = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + this->R = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + this->W = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + this->HW = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + this->P = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + this->HP = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + + const int max_cols = 3 * this->n_band_l; + this->V = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis}); + this->HV = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis}); + + const int max_small = 3 * this->n_band; + this->hcc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small}); + this->scc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small}); + this->vcc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small}); + this->eval = ct::Tensor(this->r_type, this->device_type, {max_small}); + // Zero-initialize so that uninitialised entries are immediately visible + // as exact 0.0 rather than denormal garbage (e.g. 4.68e-310). + Parallel_Reduce::ZEROS(this->eval.data(), max_small); + + this->work = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis}); + + this->calc_prec(); +} + +template +void DiagoPPCG::calc_prec() +{ + syncmem_var_h2d_op()(this->prec.data(), this->h_prec.data(), this->n_basis); +} + +template +void DiagoPPCG::apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec) +{ + // hpsi_func(psi_in, hpsi_out, ld_psi, nvec) + hpsi_func(in_vecs.data(), out_vecs.data(), this->n_basis, nvec); +} + +template +void DiagoPPCG::project_out(const ct::Tensor& basis, + const int ncols_basis, + ct::Tensor& vecs, + const int ncols_vecs) +{ + if (ncols_basis <= 0 || ncols_vecs <= 0) + { + return; + } + + // coeff = basis^H * vecs (ncols_basis x ncols_vecs) + const int ldh = ncols_basis; + ct::Tensor coeff(this->t_type, this->device_type, {ldh, ncols_vecs}); + +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + ncols_basis, + this->n_basis, + ncols_vecs, + this->n_basis, + this->n_dim, + ldh); +#else + this->pmmcn.set_dimension(ncols_basis, + this->n_basis, + ncols_vecs, + this->n_basis, + this->n_dim, + ldh); +#endif + + this->pmmcn.multiply(1.0, basis.data(), vecs.data(), 0.0, coeff.data()); + + // vecs -= basis * coeff + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + ncols_vecs, + ncols_basis, + this->neg_one, + basis.data(), + this->n_basis, + coeff.data(), + ldh, + this->one, + vecs.data(), + this->n_basis); +} + +template +void DiagoPPCG::orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols) +{ + if (ncols <= 0) + { + return; + } + + // gram = A^H A (ncols x ncols) + ct::Tensor gram(this->t_type, this->device_type, {ncols, ncols}); +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + ncols, + this->n_basis, + ncols, + this->n_basis, + this->n_dim, + ncols); +#else + this->pmmcn.set_dimension(ncols, + this->n_basis, + ncols, + this->n_basis, + this->n_dim, + ncols); +#endif + this->pmmcn.multiply(static_cast(1.0), A.data(), A.data(), static_cast(0.0), gram.data()); + + // Cholesky: gram = U^H U (upper), then invert U in-place -> gram holds inv(U) in upper triangle. + // Use the ATen LAPACK wrappers so CPU/GPU paths and error handling are consistent. + using ContainerDevice = typename container::PsiToContainer::type; + container::kernels::lapack_potrf()('U', ncols, gram.data(), ncols); + container::kernels::lapack_trtri()('U', 'N', ncols, gram.data(), ncols); + + // Zero out lower triangle so a dense GEMM applies only the upper-triangular factor. + T* g = gram.data(); + for (int j = 0; j < ncols; ++j) + { + for (int i = j + 1; i < ncols; ++i) + { + g[i + j * ncols] = static_cast(0.0); + } + } + + // A <- A * inv(U) + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + ncols, + ncols, + this->one, + A.data(), + this->n_basis, + gram.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(A.data(), this->work.data(), static_cast(ncols) * this->n_basis); + + if (HA) + { + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + ncols, + ncols, + this->one, + HA->data(), + this->n_basis, + gram.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(HA->data(), this->work.data(), static_cast(ncols) * this->n_basis); + } +} + +template +void DiagoPPCG::pack_basis(const int ncols, const bool has_p) +{ + // V columns: [X, W, P?] + // Copy X + syncmem_complex_op()(this->V.data(), this->X.data(), this->n_band_l * this->n_basis); + // Copy W + syncmem_complex_op()(this->V.data() + this->n_band_l * this->n_basis, + this->W.data(), + this->n_band_l * this->n_basis); + + if (has_p) + { + syncmem_complex_op()(this->V.data() + 2 * this->n_band_l * this->n_basis, + this->P.data(), + this->n_band_l * this->n_basis); + } + + // HV: [HX, HW, HP?] + syncmem_complex_op()(this->HV.data(), this->HX.data(), this->n_band_l * this->n_basis); + + syncmem_complex_op()(this->HV.data() + this->n_band_l * this->n_basis, + this->HW.data(), + this->n_band_l * this->n_basis); + + if (has_p) + { + syncmem_complex_op()(this->HV.data() + 2 * this->n_band_l * this->n_basis, + this->HP.data(), + this->n_band_l * this->n_basis); + } + + (void)ncols; +} + +template +void DiagoPPCG::compute_projected_mats(const int ncols) +{ + // hcc = V^H HV, scc = V^H V (col-major, ldh = ncols) + const int ld_small = 3 * this->n_band; +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + ncols, + this->n_basis, + ncols, + this->n_basis, + this->n_dim, + ld_small); +#else + this->pmmcn.set_dimension(ncols, + this->n_basis, + ncols, + this->n_basis, + this->n_dim, + ld_small); +#endif + + this->pmmcn.multiply(1.0, this->V.data(), this->HV.data(), 0.0, this->hcc.data()); + this->pmmcn.multiply(1.0, this->V.data(), this->V.data(), 0.0, this->scc.data()); +} + +template +void DiagoPPCG::solve_projected(const int ncols) +{ + // Solve (hcc) c = lambda (scc) c, eigenvectors in vcc + const int ld_small = 3 * this->n_band; + hsolver::hegvd_op()(this->ctx, + ncols, + ld_small, + this->hcc.data(), + this->scc.data(), + this->eval.data(), + this->vcc.data()); +} + +template +void DiagoPPCG::update_from_projected(const int ncols, const bool has_p, const bool update_p) +{ + // Update X, HX from V, HV using the first n_band eigenvectors. + // X_new = V * vcc(:, 1:nband) + // HX_new = HV * vcc(:, 1:nband) + const T* coeff = this->vcc.data(); + const int ld_small = 3 * this->n_band; + + // work (n_basis x n_band_l) + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->V.data(), + this->n_basis, + coeff, + ld_small, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(this->X.data(), this->work.data(), this->n_band_l * this->n_basis); + + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->HV.data(), + this->n_basis, + coeff, + ld_small, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(this->HX.data(), this->work.data(), this->n_band_l * this->n_basis); + + if (!update_p) + { + return; + } + + // Update P (search directions) from blocks W and P (exclude X block to keep meaning) + // P_new = W * Cw + P * Cp, where Cw = coeff(rows b..2b-1, cols 0..b-1) + // and Cp = coeff(rows 2b..3b-1, cols 0..b-1) + const int b = this->n_band_l; + const int ld = ld_small; + + // When the subspace is smaller than the full [X,W,P] block (ncols < 3b), + // only a prefix of W and/or P participates. Keep the inner dimensions + // consistent so we never read garbage rows from vcc. + const int ncols_W = std::max(0, std::min(b, ncols - b)); + if (ncols_W > 0) + { + const T* Cw = coeff + b; // row offset b in vcc + + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + b, + ncols_W, + this->one, + this->W.data(), + this->n_basis, + Cw, + ld, + this->zero, + this->P.data(), + this->n_basis); + + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + b, + ncols_W, + this->one, + this->HW.data(), + this->n_basis, + Cw, + ld, + this->zero, + this->HP.data(), + this->n_basis); + } + + if (has_p) + { + const int ncols_P = std::max(0, std::min(b, ncols - 2 * b)); + if (ncols_P > 0) + { + const T* Cp = coeff + 2 * b; + // The P block inside V / HV always stores b columns (pack_basis + // writes the full block). Only the first ncols_P of them + // correspond to valid rows in Cp. + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + b, + ncols_P, + this->one, + this->V.data() + 2 * b * this->n_basis, + this->n_basis, + Cp, + ld, + this->one, + this->P.data(), + this->n_basis); + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + b, + ncols_P, + this->one, + this->HV.data() + 2 * b * this->n_basis, + this->n_basis, + Cp, + ld, + this->one, + this->HP.data(), + this->n_basis); + } + } + + // Keep P orthogonal to X and keep HP consistent with P. + // If we do: P <- P - X * (X^H P), then we must also do: HP <- HP - HX * (X^H P) + // to preserve the relation HP = H * P inside the subspace. + { + const int bproj = this->n_band_l; + const int ld_coef = bproj; + ct::Tensor coef_xp(this->t_type, this->device_type, {ld_coef, bproj}); + + #ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + bproj, + this->n_basis, + bproj, + this->n_basis, + this->n_dim, + ld_coef); + #else + this->pmmcn.set_dimension(bproj, + this->n_basis, + bproj, + this->n_basis, + this->n_dim, + ld_coef); + #endif + // coef_xp = X^H * P + this->pmmcn.multiply(1.0, this->X.data(), this->P.data(), 0.0, coef_xp.data()); + + // P -= X * coef_xp + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + bproj, + bproj, + this->neg_one, + this->X.data(), + this->n_basis, + coef_xp.data(), + ld_coef, + this->one, + this->P.data(), + this->n_basis); + + // HP -= HX * coef_xp + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + bproj, + bproj, + this->neg_one, + this->HX.data(), + this->n_basis, + coef_xp.data(), + ld_coef, + this->one, + this->HP.data(), + this->n_basis); + } + + // Block-orthonormalize P and apply the same transformation to HP. + // (Avoid calling normalize_op(P) alone, which would desynchronize HP.) + this->orthonormalize_block(this->P, &this->HP, this->n_band_l); +} + +template +bool DiagoPPCG::check_convergence(const ct::Tensor& residual, const std::vector& ethr_band) +{ + // Check ||r_i|| <= ethr_band[i] for all local bands. + bool not_conv = false; + for (int ib = 0; ib < this->n_band_l; ++ib) + { + const T* ri = residual.data() + ib * this->n_basis; + const Real nrm2 = std::sqrt(ModuleBase::dot_real_op()(this->n_dim, ri, ri, true)); + if (ib < static_cast(ethr_band.size()) && nrm2 > static_cast(ethr_band[ib])) + { + not_conv = true; + break; + } + } + +#ifdef __MPI + // Any rank not converged means global not converged. + int local = not_conv ? 1 : 0; + int global = 0; + MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_MAX, BP_WORLD); + not_conv = (global != 0); +#endif + + return !not_conv; +} + +template +void DiagoPPCG::compute_residual_and_precond(const std::vector& ethr_band, bool& not_conv) +{ + // Residual R = HX - X * diag(eig) + // First, R <- HX + syncmem_complex_op()(this->R.data(), this->HX.data(), this->n_band_l * this->n_basis); + + // tmp = X * diag(e) + apply_eigenvalues_op()(this->n_dim, + this->n_basis, + this->n_band_l, + this->work.data(), + this->X.data(), + this->eval.data()); + // R -= tmp + for (int ib = 0; ib < this->n_band_l; ++ib) + { + const T alpha = static_cast(-1.0); + ModuleBase::axpy_op()(this->n_dim, + &alpha, + this->work.data() + ib * this->n_basis, + 1, + this->R.data() + ib * this->n_basis, + 1); + } + + // not_conv if any band residual above threshold + not_conv = !this->check_convergence(this->R, ethr_band); + if (!not_conv) + { + return; + } + + // W = - M^{-1} R + syncmem_complex_op()(this->W.data(), this->R.data(), this->n_band_l * this->n_basis); + precondition_op()(this->n_dim, + this->W.data(), + 0, + this->n_band_l, + this->prec.data(), + this->eval.data()); + for (int ib = 0; ib < this->n_band_l; ++ib) + { + ModuleBase::vector_mul_real_op()(this->n_dim, + this->W.data() + ib * this->n_basis, + this->W.data() + ib * this->n_basis, + static_cast(-1.0)); + } + + // Project W out of X and P (if P contains previous directions) + this->project_out(this->X, this->n_band_l, this->W, this->n_band_l); + + // Also project out of P to reduce near-dependencies + // (P may be all-zero at the first iter, projection is harmless) + this->project_out(this->P, this->n_band_l, this->W, this->n_band_l); + + // Normalize each vector in W + normalize_op()(this->n_dim, this->W.data(), 0, this->n_band_l, nullptr); +} + +template +void DiagoPPCG::diag(const HPsiFunc& hpsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band) +{ + // Map external psi to X + this->X = ct::TensorMap((void*)psi_in, this->t_type, this->device_type, {this->n_band_l, this->n_basis}); + + // Normalize initial X + normalize_op()(this->n_dim, this->X.data(), 0, this->n_band_l, nullptr); + + // HX = H X + this->apply_h(hpsi_func, this->X, this->HX, this->n_band_l); + + // Make X block-orthonormal (and keep HX consistent) before any projection/RR. + this->orthonormalize_block(this->X, &this->HX, this->n_band_l); + + // Initial Rayleigh-Ritz on X alone: solve (X^H H X) c = (X^H X) c Λ + { + const int ncols = this->n_band; + ct::Tensor hxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor sxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor vxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor exx(this->r_type, this->device_type, {ncols}); + + #ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + this->n_band_l, + this->n_basis, + this->n_band_l, + this->n_basis, + this->n_dim, + ncols); + #else + this->pmmcn.set_dimension(this->n_band_l, + this->n_basis, + this->n_band_l, + this->n_basis, + this->n_dim, + ncols); + #endif + this->pmmcn.multiply(1.0, this->X.data(), this->HX.data(), 0.0, hxx.data()); + this->pmmcn.multiply(1.0, this->X.data(), this->X.data(), 0.0, sxx.data()); + + hsolver::hegvd_op()(this->ctx, + ncols, + ncols, + hxx.data(), + sxx.data(), + exx.data(), + vxx.data()); + + // Rotate X, HX: X <- X * vxx, HX <- HX * vxx + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->X.data(), + this->n_basis, + vxx.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(this->X.data(), this->work.data(), this->n_band_l * this->n_basis); + + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->HX.data(), + this->n_basis, + vxx.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + + syncmem_complex_op()(this->HX.data(), this->work.data(), this->n_band_l * this->n_basis); + + syncmem_var_op()(this->eval.data(), exx.data(), this->n_band); + } + + // Clear P/HP to zero for the first outer iteration + Parallel_Reduce::ZEROS(this->P.data(), this->n_band_l * this->n_basis); + Parallel_Reduce::ZEROS(this->HP.data(), this->n_band_l * this->n_basis); + + // Compute initial residual and preconditioned direction W. + bool not_conv = true; + this->compute_residual_and_precond(ethr_band, not_conv); + + // Determine how many inner iterations to allow. + // When 3*n_band fits in the ambient space the P block is safe and + // 2-3 iterations accelerate convergence. Otherwise stick to 1 to + // avoid near-singular overlap matrices. + const bool p_safe = (3 * this->n_band <= this->n_dim - this->p_safe_margin_); + const int max_iter = p_safe ? this->max_inner_iter_ : 1; + const int max_w_cols = std::max(0, std::min(this->n_band_l, this->n_dim - this->n_band_l)); + int active_w_cols = not_conv ? max_w_cols : 0; + if (not_conv && active_w_cols > 0) + { + // HW = H W + this->apply_h(hpsi_func, this->W, this->HW, active_w_cols); + + // Keep W and HW consistent while improving conditioning. + this->orthonormalize_block(this->W, &this->HW, active_w_cols); + } + for (int iter = 0; iter < max_iter && not_conv; ++iter) + { + const bool has_p = (iter > 0) && p_safe; + const int raw_ncols = this->n_band + active_w_cols + (has_p ? this->n_band : 0); + const int ncols = std::min(raw_ncols, this->n_dim); + + // Pack basis V/HV + this->pack_basis(ncols, has_p); + + // Solve projected generalized eigenproblem + this->compute_projected_mats(ncols); + this->solve_projected(ncols); + + // Update X/HX and P/HP + const bool update_p = (iter + 1 < max_iter); + this->update_from_projected(ncols, has_p, update_p); + + if (iter + 1 >= max_iter) + { + break; + } + + // Residual for next convergence check + this->compute_residual_and_precond(ethr_band, not_conv); + + if (!not_conv) + { + break; + } + + active_w_cols = max_w_cols; + // Update HW for the next iteration + this->apply_h(hpsi_func, this->W, this->HW, active_w_cols); + + // Keep W and HW consistent + this->orthonormalize_block(this->W, &this->HW, active_w_cols); + } + + // Final Rayleigh-Ritz on the current X subspace to ensure (X, eval) consistency. + // This mirrors BPCG's exit behavior (subspace diagonalization before returning). + { + const int ncols = this->n_band; + ct::Tensor hxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor sxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor vxx(this->t_type, this->device_type, {ncols, ncols}); + ct::Tensor exx(this->r_type, this->device_type, {ncols}); + +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, + POOL_WORLD, + this->n_band_l, + this->n_basis, + this->n_band_l, + this->n_basis, + this->n_dim, + ncols); +#else + this->pmmcn.set_dimension(this->n_band_l, + this->n_basis, + this->n_band_l, + this->n_basis, + this->n_dim, + ncols); +#endif + this->pmmcn.multiply(1.0, this->X.data(), this->HX.data(), 0.0, hxx.data()); + this->pmmcn.multiply(1.0, this->X.data(), this->X.data(), 0.0, sxx.data()); + + hsolver::hegvd_op()(this->ctx, + ncols, + ncols, + hxx.data(), + sxx.data(), + exx.data(), + vxx.data()); + + // Rotate X, HX: X <- X * vxx, HX <- HX * vxx + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->X.data(), + this->n_basis, + vxx.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(this->X.data(), this->work.data(), this->n_band_l * this->n_basis); + + ModuleBase::gemm_op()('N', + 'N', + this->n_dim, + this->n_band_l, + ncols, + this->one, + this->HX.data(), + this->n_basis, + vxx.data(), + ncols, + this->zero, + this->work.data(), + this->n_basis); + syncmem_complex_op()(this->HX.data(), this->work.data(), this->n_band_l * this->n_basis); + + syncmem_var_op()(this->eval.data(), exx.data(), this->n_band); + } + + // Copy eigenvalues out + syncmem_var_d2h_op()(eigenvalue_in, this->eval.data(), this->n_band); +} + +// explicit instantiation +template class DiagoPPCG, base_device::DEVICE_CPU>; +template class DiagoPPCG, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class DiagoPPCG, base_device::DEVICE_GPU>; +template class DiagoPPCG, base_device::DEVICE_GPU>; +#endif + +} // namespace hsolver diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h new file mode 100644 index 00000000000..3566afd0d66 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.h @@ -0,0 +1,145 @@ +#ifndef DIAGO_PPCG_H_ +#define DIAGO_PPCG_H_ + +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_device/memory_op.h" +#include "source_base/module_device/types.h" +#include "source_base/para_gemm.h" +#include "source_hsolver/kernels/hegvd_op.h" + +#include +#include +#include +#include + +namespace hsolver { + +// Projected Preconditioned Conjugate Gradient (block) eigensolver. +// This implementation follows an LOBPCG-style subspace projection: +// V = [X, W, P] (or [X, W] for the first iter) +// solve (V^H H V) c = (V^H V) c Λ +// update X <- V c(:,1:nband) +// update P <- [W, P] c_{W,P}(:,1:nband) +// with W from preconditioned residual projected to the complement of X (and P). +// +// Notes: +// - Designed to match the existing diag interface used by BPCG. +// - Preconditioner is treated as a diagonal Real vector of length n_basis. + +template , typename Device = base_device::DEVICE_CPU> +class DiagoPPCG +{ + private: + using Real = typename GetTypeReal::type; + + public: + explicit DiagoPPCG(const Real* precondition); + ~DiagoPPCG(); + + void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim); + + // ---- tunable parameters ---- + /// Maximum inner iterations per diag() call when P-block is safe. + /// Default 3; set to 1 for ultra-conservative mode or higher for + /// difficult spectra. + void set_max_inner_iter(const int n) { max_inner_iter_ = n; } + + /// Safety margin for P-block usage: P is enabled only when + /// 3 * n_band <= n_dim - p_safe_margin_. + /// Default 2; increase for better numerical stability at the cost of + /// slower convergence on well-conditioned problems. + void set_p_safe_margin(const int m) { p_safe_margin_ = m; } + + /// Number of diag() passes performed by the factory (hsolver_pw). + /// Default 5; matching BPCG's multi-pass strategy. + void set_npass(const int n) { npass_ = n; } + int npass() const { return npass_; } + // ---- end tunable parameters ---- + + using HPsiFunc = std::function; + + void diag(const HPsiFunc& hpsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band); + + private: + int n_band = 0; + int n_band_l = 0; + int n_basis = 0; + int n_dim = 0; + + // tunable parameters (see set_xxx methods above) + int max_inner_iter_ = 3; + int p_safe_margin_ = 2; + int npass_ = 5; + + ct::DataType r_type = ct::DataType::DT_INVALID; + ct::DataType t_type = ct::DataType::DT_INVALID; + ct::DeviceType device_type = ct::DeviceType::UnKnown; + + // Host pointer mapped preconditioner + device copy + ct::Tensor h_prec = {}; + ct::Tensor prec = {}; + + // Work vectors (column-major, lda = n_basis): each tensor stores a matrix (n_basis x ncols) + // as a contiguous array; we use Tensor with shape {ncols, n_basis} for contiguous column blocks. + ct::Tensor X = {}; // mapped to psi_in + ct::Tensor HX = {}; + ct::Tensor R = {}; + ct::Tensor W = {}; + ct::Tensor HW = {}; + ct::Tensor P = {}; + ct::Tensor HP = {}; + + ct::Tensor V = {}; // basis [X,W,P] packed + ct::Tensor HV = {}; // H*V + + ct::Tensor hcc = {}; // V^H H V + ct::Tensor scc = {}; // V^H V + ct::Tensor vcc = {}; // eigenvectors of projected problem + ct::Tensor eval = {}; // eigenvalues of projected problem + + ct::Tensor work = {}; // generic workspace (ncols x n_basis) + + // Parallel matmul helper (A^H B) + ModuleBase::PGemmCN pmmcn; + + // Device memory helpers + Device* ctx = {}; + const T one_ = static_cast(1.0); + const T zero_ = static_cast(0.0); + const T neg_one_ = static_cast(-1.0); + const T* one = &one_; + const T* zero = &zero_; + const T* neg_one = &neg_one_; + + using syncmem_complex_op = base_device::memory::synchronize_memory_op; + using syncmem_var_op = base_device::memory::synchronize_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; + + void calc_prec(); + + void apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec); + + void pack_basis(const int ncols, const bool has_p); + + void compute_projected_mats(const int ncols); + + void solve_projected(const int ncols); + + void update_from_projected(const int ncols, const bool has_p, const bool update_p); + + void compute_residual_and_precond(const std::vector& ethr_band, bool& not_conv); + + void orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols); + + void project_out(const ct::Tensor& basis, const int ncols_basis, ct::Tensor& vecs, const int ncols_vecs); + + bool check_convergence(const ct::Tensor& residual, const std::vector& ethr_band); +}; + +} // namespace hsolver + +#endif // DIAGO_PPCG_H_ diff --git a/source/source_hsolver/hsolver_pw.cpp b/source/source_hsolver/hsolver_pw.cpp index b88bc3b90dd..45d99aead9a 100644 --- a/source/source_hsolver/hsolver_pw.cpp +++ b/source/source_hsolver/hsolver_pw.cpp @@ -8,6 +8,7 @@ #include "source_hamilt/hamilt.h" #include "source_hsolver/diag_comm_info.h" #include "source_hsolver/diago_bpcg.h" +#include "source_hsolver/diago_ppcg.h" #include "source_hsolver/diago_cg.h" #include "source_hsolver/diago_dav_subspace.h" #include "source_hsolver/diago_david.h" @@ -83,7 +84,7 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, this->nproc_in_pool = nproc_in_pool_in; // report if the specified diagonalization method is not supported - const std::initializer_list _methods = {"cg", "dav", "dav_subspace", "bpcg"}; + const std::initializer_list _methods = {"cg", "dav", "dav_subspace", "bpcg", "ppcg"}; if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods)) { ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!"); @@ -323,6 +324,19 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim); bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band); } + else if (this->method == "ppcg") + { + const int nband_l = psi.get_nbands(); + const int nbasis = psi.get_nbasis(); + const int ndim = psi.get_current_ngk(); + DiagoPPCG ppcg(pre_condition.data()); + ppcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim); + // Multiple passes for robust convergence (same strategy as BPCG in unit tests) + for (int pass = 0; pass < ppcg.npass(); ++pass) + { + ppcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band); + } + } else if (this->method == "dav_subspace") { bool scf = this->calculation_type == "nscf" ? false : true; diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 88f94e288c6..ec4b83b9575 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -3,9 +3,26 @@ #include "source_base/kernels/math_kernel_op.h" #include "source_base/parallel_reduce.h" #include +#ifdef _OPENMP +#include +#endif namespace hsolver { +namespace +{ +constexpr int kBpcgOpenmpMinWork = 4096; + +inline bool use_bpcg_openmp(int n) +{ +#ifdef _OPENMP + return n >= kBpcgOpenmpMinWork && omp_get_max_threads() > 1; +#else + return false; +#endif +} +} // namespace + template struct line_minimize_with_block_op { @@ -26,6 +43,9 @@ struct line_minimize_with_block_op Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) reduction(+:epsilo_0,epsilo_1,epsilo_2) if(use_bpcg_openmp(n_basis)) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -41,6 +61,9 @@ struct line_minimize_with_block_op theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); sin_theta = std::sin(theta); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis)) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -71,12 +94,14 @@ struct calc_grad_with_block_op Real err = 0.0; Real beta = 0.0; Real epsilo = 0.0; - Real grad_2 = {0.0}; - T grad_1 = {0.0, 0.0}; + const Real beta_old = beta_out[band_idx]; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) reduction(+:epsilo) if(use_bpcg_openmp(n_basis)) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -85,21 +110,27 @@ struct calc_grad_with_block_op epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } Parallel_Reduce::reduce_pool(epsilo); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) reduction(+:err,beta) if(use_bpcg_openmp(n_basis)) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_2 = std::norm(grad_1); + const T grad_1 = hpsi_out[item] - epsilo * psi_out[item]; + const Real grad_2 = std::norm(grad_1); err += grad_2; beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } Parallel_Reduce::reduce_pool(err); Parallel_Reduce::reduce_pool(beta); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis)) +#endif for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; - grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; + const T grad_1 = hpsi_out[item] - epsilo * psi_out[item]; + grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_old * grad_old_out[item]; } beta_out[band_idx] = beta; err_out[band_idx] = sqrt(err); @@ -113,6 +144,9 @@ struct apply_eigenvalues_op using Real = typename GetTypeReal::type; void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues) { +#ifdef _OPENMP +#pragma omp parallel for collapse(2) schedule(static) if(use_bpcg_openmp(nbase * notconv)) +#endif for (int m = 0; m < notconv; m++) { for (int idx = 0; idx < nbase; idx++) @@ -133,19 +167,14 @@ struct precondition_op { const Real* precondition, const Real* eigenvalues) { - std::vector pre(dim, 0.0); for (int m = 0; m < notconv; m++) { for (size_t i = 0; i < dim; i++) { Real x = std::abs(precondition[i] - eigenvalues[m]); - pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + Real denom = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0))); + psi_iter[(nbase + m) * dim + i] /= denom; } - ModuleBase::vector_div_vector_op()( - dim, - psi_iter + (nbase + m) * dim, - psi_iter + (nbase + m) * dim, - pre.data()); } } }; diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index 1b1529adb4a..7721f462659 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -2,6 +2,41 @@ remove_definitions(-D__CUDA) remove_definitions(-D__ROCM) remove_definitions(-D__EXX) +# Make unit tests runnable directly after build (without `cmake --install`). +# CI does an install step which also copies these, but local dev often doesn't. +set(_HSOLVER_TEST_FILES + H-KPoints-Si2.dat + H-GammaOnly-Si2.dat + S-KPoints-Si2.dat + S-GammaOnly-Si2.dat + H-KPoints-Si64.dat + H-GammaOnly-Si64.dat + S-KPoints-Si64.dat + S-GammaOnly-Si64.dat + GammaOnly-Si2-Solution.dat + GammaOnly-Si64-Solution.dat + KPoints-Si2-Solution.dat + KPoints-Si64-Solution.dat + PEXSI-H-GammaOnly-Si2.dat + PEXSI-S-GammaOnly-Si2.dat + PEXSI-DM-GammaOnly-Si2.dat + generate_hsolver_test_report.sh + diago_cg_parallel_test.sh + diago_david_parallel_test.sh + diago_lcao_parallel_test.sh + diago_pexsi_parallel_test.sh + parallel_k2d_test.sh +) + +foreach(_f IN LISTS _HSOLVER_TEST_FILES) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/${_f}") + configure_file( + "${CMAKE_CURRENT_LIST_DIR}/${_f}" + "${CMAKE_CURRENT_BINARY_DIR}/${_f}" + COPYONLY) + endif() +endforeach() + if (ENABLE_MPI) AddTest( TARGET MODULE_HSOLVER_parak2d_test @@ -16,6 +51,17 @@ if (ENABLE_MPI) ../../source_hamilt/operator.cpp ../../source_pw/module_pwdft/op_pw.cpp ) + AddTest( + TARGET MODULE_HSOLVER_ppcg + LIBS parameter ${math_libs} base psi device container + SOURCES diago_ppcg_test.cpp ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_david.cpp + ../para_linear_transform.cpp ../diago_iter_assist.cpp + ../kernels/hegvd_op.cpp + ../../source_base/module_container/ATen/kernels/lapack.cpp + ../../source_basis/module_pw/test/test_tool.cpp + ../../source_hamilt/operator.cpp + ../../source_pw/module_pwdft/op_pw.cpp + ) AddTest( TARGET MODULE_HSOLVER_cg LIBS parameter ${math_libs} base psi device container @@ -76,14 +122,16 @@ if (ENABLE_MPI) AddTest( TARGET MODULE_HSOLVER_pw LIBS parameter ${math_libs} psi device base container - SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_ppcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + ../kernels/hegvd_op.cpp ../../source_base/module_container/ATen/kernels/lapack.cpp ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp ) AddTest( TARGET MODULE_HSOLVER_sdft LIBS parameter ${math_libs} psi device base container - SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_ppcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + ../kernels/hegvd_op.cpp ../../source_base/module_container/ATen/kernels/lapack.cpp ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp ) @@ -135,6 +183,7 @@ install(FILES GammaOnly-Si64-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR install(FILES KPoints-Si2-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES KPoints-Si64-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +install(FILES generate_hsolver_test_report.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES diago_cg_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES diago_david_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) install(FILES diago_lcao_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) @@ -172,6 +221,15 @@ add_test(NAME MODULE_HSOLVER_para_linear_trans ) find_program(BASH bash) +if (ENABLE_MPI AND BASH) + add_custom_target(MODULE_HSOLVER_test_report + COMMAND ${BASH} generate_hsolver_test_report.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS MODULE_HSOLVER_bpcg MODULE_HSOLVER_ppcg + COMMENT "Generate hsolver unit-test reports under the build directory" + ) +endif() + if (ENABLE_MPI) add_test(NAME MODULE_HSOLVER_cg_parallel COMMAND ${BASH} diago_cg_parallel_test.sh @@ -197,4 +255,4 @@ if (ENABLE_MPI) ) endif() endif() -endif() \ No newline at end of file +endif() diff --git a/source/source_hsolver/test/diago_bpcg_test.cpp b/source/source_hsolver/test/diago_bpcg_test.cpp index 962ce72315f..362e4a50bdc 100644 --- a/source/source_hsolver/test/diago_bpcg_test.cpp +++ b/source/source_hsolver/test/diago_bpcg_test.cpp @@ -6,13 +6,19 @@ #include "source_pw/module_pwdft/hamilt_pw.h" #include "../diago_iter_assist.h" #include "../diago_bpcg.h" +#include "../kernels/bpcg_kernel_op.h" #include "diago_mock.h" #include "mpi.h" +#include "source_base/global_variable.h" +#include "source_base/parallel_comm.h" #include "source_basis/module_pw/test/test_tool.h" #include #include #include +#ifdef _OPENMP +#include +#endif /************************************************ * unit test of functions in Diago_BPCG @@ -74,13 +80,17 @@ class DiagoBPCGPrepare int nprocs=1, mypnum=0; // threshold is the comparison standard between bpcg and lapack - void CompareEigen(double *precondition) + void CompareEigen(double *precondition, bool check_vectors = false) { // calculate eigenvalues by LAPACK; double *e_lapack = new double[npw]; auto ev = DIAGOTEST::hmatrix; - if(mypnum == 0) { lapackEigen(npw, ev, e_lapack, false); -} + if (mypnum == 0) { + lapackEigen(npw, ev, e_lapack, false); + } + #ifdef __MPI + MPI_Bcast(e_lapack, npw, MPI_DOUBLE, 0, MPI_COMM_WORLD); + #endif // initial guess of psi by perturbing lapack psi ModuleBase::ComplexMatrix psiguess(nband, npw); std::default_random_engine p(1); @@ -91,18 +101,14 @@ class DiagoBPCGPrepare { double rand = static_cast(u(p))/10.; // psiguess(i,j) = ev(j,i)*(1+rand); - psiguess(i, j) = ev[j * DIAGOTEST::h_nc + i] * rand; + psiguess(i, j) = ev[j * npw + i] * rand; } } // run bpcg //====================================================================== double *en = new double[npw]; int ik = 1; - hamilt::Hamilt>* ha; - ha =new hamilt::HamiltPW>(nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); - int* ngk = new int [1]; - //psi::Psi> psi(ngk,ik,nband,npw); - psi::Psi> psi; + psi::Psi> psi; psi.resize(ik,nband,npw); //psi.fix_k(0); for (int i = 0; i < nband; i++) @@ -156,10 +162,11 @@ class DiagoBPCGPrepare const int ndim = psi_local.get_current_ngk(); bpcg.init_iter(nband, nband, npw, ndim); std::vector ethr_band(nband, 1e-5); - bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); - bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); - bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); - bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); + // One diag() call has a relatively small internal iteration cap; do a few passes + // to reach LAPACK-close eigenvalues for random dense problems. + for (int pass = 0; pass < 4; ++pass) { + bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); + } end = MPI_Wtime(); //if(mypnum == 0) printf("diago time:%7.3f\n",end-start); delete [] DIAGOTEST::npw_local; @@ -170,9 +177,42 @@ class DiagoBPCGPrepare EXPECT_NEAR(en[i], e_lapack[i], threshold); } + if (check_vectors && nprocs == 1) + { + std::vector> hpsi_check(nband * npw); + hpsi_func(psi_local.get_pointer(), hpsi_check.data(), npw, nband); + + for (int ib = 0; ib < nband; ++ib) + { + double norm = 0.0; + double residual_norm = 0.0; + for (int ig = 0; ig < npw; ++ig) + { + const std::complex psi_value = psi_local(ib, ig); + const std::complex residual = hpsi_check[ib * npw + ig] - en[ib] * psi_value; + norm += std::norm(psi_value); + residual_norm += std::norm(residual); + } + EXPECT_NEAR(norm, 1.0, 1e-10); + EXPECT_NEAR(std::sqrt(residual_norm), 0.0, 1e-8); + } + + for (int ib = 0; ib < nband; ++ib) + { + for (int jb = ib + 1; jb < nband; ++jb) + { + std::complex overlap = 0.0; + for (int ig = 0; ig < npw; ++ig) + { + overlap += std::conj(psi_local(ib, ig)) * psi_local(jb, ig); + } + EXPECT_NEAR(std::abs(overlap), 0.0, 1e-10); + } + } + } + delete[] en; delete[] e_lapack; - delete ha; } }; @@ -187,6 +227,7 @@ TEST_P(DiagoBPCGTest, RandomHamilt) // << dcp.sparsity << ", eps=" << dcp.eps << std::endl; hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; //std::cout<<"maxiter "<>::PW_DIAG_NMAX<>::PW_DIAG_THR<> hpsi(dcp.nband, dcp.npw, dcp.sparsity); @@ -201,7 +242,7 @@ INSTANTIATE_TEST_SUITE_P(VerifyCG, DiagoBPCGTest, ::testing::Values( // nband, npw, sparsity, reorder, eps, maxiter, threshold - DiagoBPCGPrepare(10, 500, 0, true, 1e-5, 300, 5e-2) + DiagoBPCGPrepare(6, 120, 0, true, 1e-5, 200, 5e-2) // DiagoBPCGPrepare(20, 500, 6, true, 1e-5, 300, 5e-2) // DiagoBPCGPrepare(20, 1000, 8, true, 1e-5, 300, 5e-2), // DiagoBPCGPrepare(40, 1000, 8, true, 1e-6, 300, 5e-2) @@ -225,6 +266,181 @@ TEST(DiagoBPCGTest, Hamilt) EXPECT_EQ(conj(hm[DIAGOTEST::h_nc]).imag(), hm[1].imag()); } +// bpcg for a 2x2 matrix (analytic eigenvalues: (7±sqrt(5))/2) +TEST(DiagoBPCGTest, TwoByTwo) +{ + const int dim = 2; + const int nband = 2; + std::vector> hm(dim * dim); + hm[0] = {4.0, 0.0}; + hm[1] = {1.0, 0.0}; + hm[2] = {1.0, 0.0}; + hm[3] = {3.0, 0.0}; + + DiagoBPCGPrepare dcp(nband, dim, 0, true, 1e-8, 80, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + // simple positive precondition + double precond[dim] = {1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(BpcgKernelOpTest, ApplyEigenvaluesUsesLeadingDimension) +{ + using T = std::complex; + const int nbase = 4101; + const int nbase_x = nbase + 3; + const int notconv = 2; + const T untouched = {-9.0, 4.0}; + + std::vector vectors(nbase_x * notconv); + std::vector result(nbase_x * notconv, untouched); + const double eigenvalues[notconv] = {2.0, -0.5}; + + for (int m = 0; m < notconv; ++m) + { + for (int i = 0; i < nbase_x; ++i) + { + vectors[m * nbase_x + i] = T(0.25 * (i + 1), -0.1 * (m + 1)); + } + } + + hsolver::apply_eigenvalues_op()( + nbase, nbase_x, notconv, result.data(), vectors.data(), eigenvalues); + + for (int m = 0; m < notconv; ++m) + { + for (int i = 0; i < nbase; ++i) + { + EXPECT_EQ(result[m * nbase_x + i], eigenvalues[m] * vectors[m * nbase_x + i]); + } + for (int i = nbase; i < nbase_x; ++i) + { + EXPECT_EQ(result[m * nbase_x + i], untouched); + } + } +} + +TEST(BpcgKernelOpTest, PreconditionUsesBandOffsetAndFormula) +{ + using T = std::complex; + const int dim = 4; + const int nbase = 2; + const int notconv = 2; + std::vector psi_iter((nbase + notconv) * dim); + const std::vector original = { + {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0}, + {5.0, 0.0}, {6.0, 0.0}, {7.0, 0.0}, {8.0, 0.0}, + {1.0, 2.0}, {2.0, 3.0}, {3.0, 4.0}, {4.0, 5.0}, + {2.0, -1.0}, {3.0, -2.0}, {4.0, -3.0}, {5.0, -4.0}}; + psi_iter = original; + + const double precondition[dim] = {1.0, 2.5, 4.0, 7.0}; + const double eigenvalues[notconv] = {0.5, 3.0}; + + hsolver::precondition_op()( + dim, psi_iter.data(), nbase, notconv, precondition, eigenvalues); + + for (int i = 0; i < nbase * dim; ++i) + { + EXPECT_EQ(psi_iter[i], original[i]); + } + + for (int m = 0; m < notconv; ++m) + { + for (int i = 0; i < dim; ++i) + { + const double x = std::abs(precondition[i] - eigenvalues[m]); + const double denom = 0.5 * (1.0 + x + std::sqrt(1.0 + (x - 1.0) * (x - 1.0))); + const int idx = (nbase + m) * dim + i; + EXPECT_NEAR(psi_iter[idx].real(), (original[idx] / denom).real(), 1e-14); + EXPECT_NEAR(psi_iter[idx].imag(), (original[idx] / denom).imag(), 1e-14); + } + } +} + +TEST(BpcgKernelOpTest, RefreshProjectedMatricesOnlyTouchesDiagonal) +{ + using T = std::complex; + const int n = 3; + const int ldh = 5; + const T one = {1.0, 0.0}; + const T h_sentinel = {-1.0, 0.5}; + const T s_sentinel = {-2.0, 0.5}; + const T v_sentinel = {-3.0, 0.5}; + const double eigenvalues[n] = {0.25, 1.5, 3.75}; + + std::vector hcc(ldh * ldh, h_sentinel); + std::vector scc(ldh * ldh, s_sentinel); + std::vector vcc(ldh * ldh, v_sentinel); + + hsolver::refresh_hcc_scc_vcc_op()( + n, hcc.data(), scc.data(), vcc.data(), ldh, eigenvalues, one); + + for (int col = 0; col < ldh; ++col) + { + for (int row = 0; row < ldh; ++row) + { + const int idx = col * ldh + row; + if (row == col && row < n) + { + EXPECT_EQ(hcc[idx], T(eigenvalues[row], 0.0)); + EXPECT_EQ(scc[idx], one); + EXPECT_EQ(vcc[idx], one); + } + else + { + EXPECT_EQ(hcc[idx], h_sentinel); + EXPECT_EQ(scc[idx], s_sentinel); + EXPECT_EQ(vcc[idx], v_sentinel); + } + } + } +} + +TEST(BpcgKernelOpTest, ApplyEigenvaluesMatchesSingleThreadResult) +{ +#ifndef _OPENMP + GTEST_SKIP() << "OpenMP is not enabled in this build"; +#else + using T = std::complex; + const int nbase = 5000; + const int nbase_x = nbase + 7; + const int notconv = 3; + std::vector vectors(nbase_x * notconv); + std::vector result_single(nbase_x * notconv); + std::vector result_multi(nbase_x * notconv); + const double eigenvalues[notconv] = {1.25, -2.0, 0.125}; + + for (int m = 0; m < notconv; ++m) + { + for (int i = 0; i < nbase_x; ++i) + { + vectors[m * nbase_x + i] = T(0.01 * (i % 97) + m, -0.02 * (i % 31)); + } + } + + const int old_threads = omp_get_max_threads(); + omp_set_num_threads(1); + hsolver::apply_eigenvalues_op()( + nbase, nbase_x, notconv, result_single.data(), vectors.data(), eigenvalues); + + omp_set_num_threads(4); + hsolver::apply_eigenvalues_op()( + nbase, nbase_x, notconv, result_multi.data(), vectors.data(), eigenvalues); + omp_set_num_threads(old_threads); + + for (size_t i = 0; i < result_single.size(); ++i) + { + EXPECT_EQ(result_multi[i], result_single[i]); + } +#endif +} + // check that lapack work well // for an eigenvalue problem /*TEST(DiagoBPCGTest, ZHEEV) @@ -271,7 +487,8 @@ TEST(DiagoBPCGTest, readH) hsolver::DiagoIterAssist>::SCF_ITER = 1; HPsi> hpsi; hpsi.create(nband, dim); - DIAGOTEST::hmatrix = hpsi.hamilt(); + // use the matrix read from file + DIAGOTEST::hmatrix = hm; DIAGOTEST::npw = dim; dcp.CompareEigen(hpsi.precond()); } @@ -284,7 +501,8 @@ int main(int argc, char **argv) int nproc_in_pool, kpar=1, mypool, rank_in_pool; setupmpi(argc,argv,nproc, myrank); divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool); - MPI_Comm_split(MPI_COMM_WORLD,myrank,0,&BP_WORLD); + // In unit tests we don't do band-parallel splitting; keep BP_WORLD as the full pool communicator. + MPI_Comm_dup(POOL_WORLD, &BP_WORLD); GlobalV::NPROC_IN_POOL = nproc; #else MPI_Init(&argc, &argv); diff --git a/source/source_hsolver/test/diago_ppcg_test.cpp b/source/source_hsolver/test/diago_ppcg_test.cpp new file mode 100644 index 00000000000..5829a0fd578 --- /dev/null +++ b/source/source_hsolver/test/diago_ppcg_test.cpp @@ -0,0 +1,770 @@ +#include "source_base/inverse_matrix.h" +#include "source_base/module_external/lapack_connector.h" +#include "source_pw/module_pwdft/structure_factor.h" +#include "source_psi/psi.h" +#include "source_hamilt/hamilt.h" +#include "source_pw/module_pwdft/hamilt_pw.h" +#include "../diago_iter_assist.h" +#include "../diago_ppcg.h" +#include "../diago_bpcg.h" +#include "../diago_david.h" +#include "../diag_comm_info.h" +#include "diago_mock.h" +#include "mpi.h" +#include "source_base/global_variable.h" +#include "source_base/parallel_comm.h" +#include "source_basis/module_pw/test/test_tool.h" + +#include +#include +#include +#include +#include + +// LAPACK reference eigenvalues for comparison +static void lapackEigen(int& npw, std::vector>& hm, double* e) +{ + int lwork = 2 * npw; + std::complex* work2 = new std::complex[lwork]; + double* rwork = new double[3 * npw - 2]; + int info = 0; + char jobz = 'V', uplo = 'U'; + zheev_(&jobz, &uplo, &npw, hm.data(), &npw, e, work2, &lwork, rwork, &info); + delete[] rwork; + delete[] work2; +} + +class DiagoPPCGPrepare +{ + public: + DiagoPPCGPrepare(int nband, int npw, int sparsity, double eps, int maxiter, double threshold) + : nband(nband), npw(npw), sparsity(sparsity), eps(eps), maxiter(maxiter), threshold(threshold) + { +#ifdef __MPI + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &mypnum); +#endif + } + + int nband = 0; + int npw = 0; + int sparsity = 0; + double eps = 1e-6; + int maxiter = 200; + double threshold = 5e-2; + + int nprocs = 1; + int mypnum = 0; + + void CompareEigen(double* precondition, + bool check_vectors = false, + double residual_threshold = 1e-8, + double orthogonality_threshold = 1e-10) + { + // Reference by LAPACK + double* e_lapack = new double[npw]; + auto ev = DIAGOTEST::hmatrix; + if (mypnum == 0) + { + lapackEigen(npw, ev, e_lapack); + } +#ifdef __MPI + MPI_Bcast(e_lapack, npw, MPI_DOUBLE, 0, MPI_COMM_WORLD); +#endif + + // Initial guess: random combination of Lapack eigenvectors + ModuleBase::ComplexMatrix psiguess(nband, npw); + std::default_random_engine p(1); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; i++) + { + for (int j = 0; j < npw; j++) + { + double rand = static_cast(u(p)) / 10.; + psiguess(i, j) = ev[j * npw + i] * rand; + } + } + + // Prepare psi + double* en = new double[npw]; + int ik = 1; + psi::Psi> psi; + psi.resize(ik, nband, npw); + for (int i = 0; i < nband; i++) + { + for (int j = 0; j < npw; j++) + { + psi(i, j) = psiguess(i, j); + } + } + + psi::Psi> psi_local; + double* precondition_local; + DIAGOTEST::npw_local = new int[nprocs]; +#ifdef __MPI + DIAGOTEST::cal_division(DIAGOTEST::npw); + DIAGOTEST::divide_hpsi(psi, psi_local, DIAGOTEST::hmatrix, DIAGOTEST::hmatrix_local); + precondition_local = new double[DIAGOTEST::npw_local[mypnum]]; + DIAGOTEST::divide_psi(precondition, precondition_local); +#else + DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix; + DIAGOTEST::npw_local[0] = DIAGOTEST::npw; + psi_local = psi; + precondition_local = new double[DIAGOTEST::npw]; + for (int i = 0; i < DIAGOTEST::npw; i++) + precondition_local[i] = precondition[i]; +#endif + + hsolver::DiagoPPCG> ppcg(precondition_local); + psi_local.fix_k(0); + + using T = std::complex; + const int dim = DIAGOTEST::npw; + const std::vector& h_mat = DIAGOTEST::hmatrix_local; + auto hpsi_func = [h_mat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) { + auto one = std::make_unique(1.0); + auto zero = std::make_unique(0.0); + const T* one_ = one.get(); + const T* zero_ = zero.get(); + + base_device::DEVICE_CPU* ctx = {}; + ModuleBase::gemm_op()('N', + 'N', + dim, + nvec, + dim, + one_, + h_mat.data(), + dim, + psi_in, + ld_psi, + zero_, + hpsi_out, + ld_psi); + }; + + const int ndim = psi_local.get_current_ngk(); + ppcg.init_iter(nband, nband, npw, ndim); + std::vector ethr_band(nband, 1e-6); + + // As in BPCG, several diag() passes are needed for harder problems. + // Each pass starts from the refined X of the previous one and rebuilds + // the search directions from scratch. + for (int pass = 0; pass < 5; ++pass) + { + ppcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band); + } + + for (int i = 0; i < nband; i++) + { + EXPECT_NEAR(en[i], e_lapack[i], threshold); + } + + if (check_vectors && nprocs == 1) + { + std::vector> hpsi_check(nband * npw); + hpsi_func(psi_local.get_pointer(), hpsi_check.data(), npw, nband); + + for (int ib = 0; ib < nband; ++ib) + { + double norm = 0.0; + double residual_norm = 0.0; + for (int ig = 0; ig < npw; ++ig) + { + const std::complex psi_value = psi_local(ib, ig); + const std::complex residual = hpsi_check[ib * npw + ig] - en[ib] * psi_value; + norm += std::norm(psi_value); + residual_norm += std::norm(residual); + } + EXPECT_NEAR(norm, 1.0, orthogonality_threshold); + EXPECT_LT(std::sqrt(residual_norm), residual_threshold); + } + + for (int ib = 0; ib < nband; ++ib) + { + for (int jb = ib + 1; jb < nband; ++jb) + { + std::complex overlap = 0.0; + for (int ig = 0; ig < npw; ++ig) + { + overlap += std::conj(psi_local(ib, ig)) * psi_local(jb, ig); + } + EXPECT_LT(std::abs(overlap), orthogonality_threshold); + } + } + } + + delete[] DIAGOTEST::npw_local; + delete[] precondition_local; + delete[] en; + delete[] e_lapack; + } +}; + +class DiagoPPCGTest : public ::testing::TestWithParam +{ +}; + +TEST_P(DiagoPPCGTest, RandomHamilt) +{ + DiagoPPCGPrepare dcp = GetParam(); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + HPsi> hpsi(dcp.nband, dcp.npw, dcp.sparsity); + DIAGOTEST::hmatrix = hpsi.hamilt(); + DIAGOTEST::npw = dcp.npw; + + dcp.CompareEigen(hpsi.precond()); +} + +INSTANTIATE_TEST_SUITE_P(VerifyPPCG, + DiagoPPCGTest, + ::testing::Values( + // nband, npw, sparsity, eps, maxiter, threshold + DiagoPPCGPrepare(6, 120, 0, 1e-6, 200, 8e-2))); + +TEST(DiagoPPCGTest, TwoByTwo) +{ + const int dim = 2; + const int nband = 2; + std::vector> hm(dim * dim); + hm[0] = {4.0, 0.0}; + hm[1] = {1.0, 0.0}; + hm[2] = {1.0, 0.0}; + hm[3] = {3.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 80, 1e-10); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, ComplexThreeByThree) +{ + const int dim = 3; + const int nband = 3; + std::vector> hm(dim * dim); + hm[0] = {3.0, 0.0}; + hm[1] = {1.0, -1.0}; + hm[2] = {0.5, 0.2}; + hm[3] = {1.0, 1.0}; + hm[4] = {5.0, 0.0}; + hm[5] = {-0.3, -0.4}; + hm[6] = {0.5, -0.2}; + hm[7] = {-0.3, 0.4}; + hm[8] = {7.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 80, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, SubspaceFourByFour) +{ + const int dim = 4; + const int nband = 2; + std::vector> hm(dim * dim, {0.0, 0.0}); + hm[0] = {1.0, 0.0}; + hm[5] = {2.0, 0.0}; + hm[10] = {4.0, 0.0}; + hm[15] = {8.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0, 1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, SubspaceFourByFourThreeBands) +{ + const int dim = 4; + const int nband = 3; + std::vector> hm(dim * dim, {0.0, 0.0}); + hm[0] = {1.0, 0.0}; + hm[5] = {2.0, 0.0}; + hm[10] = {4.0, 0.0}; + hm[15] = {8.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0, 1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, CoupledSubspaceFourByFour) +{ + const int dim = 4; + const int nband = 2; + std::vector> hm(dim * dim); + hm[0] = {2.0, 0.0}; + hm[1] = {0.4, -0.1}; + hm[2] = {0.0, 0.2}; + hm[3] = {0.1, 0.0}; + hm[4] = {0.4, 0.1}; + hm[5] = {3.0, 0.0}; + hm[6] = {-0.3, 0.2}; + hm[7] = {0.0, -0.1}; + hm[8] = {0.0, -0.2}; + hm[9] = {-0.3, -0.2}; + hm[10] = {5.0, 0.0}; + hm[11] = {0.6, 0.3}; + hm[12] = {0.1, 0.0}; + hm[13] = {0.0, 0.1}; + hm[14] = {0.6, -0.3}; + hm[15] = {8.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0, 1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, CoupledSubspaceFourByFourThreeBands) +{ + const int dim = 4; + const int nband = 3; + std::vector> hm(dim * dim); + hm[0] = {2.0, 0.0}; + hm[1] = {0.4, -0.1}; + hm[2] = {0.0, 0.2}; + hm[3] = {0.1, 0.0}; + hm[4] = {0.4, 0.1}; + hm[5] = {3.0, 0.0}; + hm[6] = {-0.3, 0.2}; + hm[7] = {0.0, -0.1}; + hm[8] = {0.0, -0.2}; + hm[9] = {-0.3, -0.2}; + hm[10] = {5.0, 0.0}; + hm[11] = {0.6, 0.3}; + hm[12] = {0.1, 0.0}; + hm[13] = {0.0, 0.1}; + hm[14] = {0.6, -0.3}; + hm[15] = {8.0, 0.0}; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + double precond[dim] = {1.0, 1.0, 1.0, 1.0}; + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(precond, true); +} + +TEST(DiagoPPCGTest, readH) +{ + std::vector> hm; + std::ifstream ifs; + std::string filename = "H-KPoints-Si2.dat"; + ifs.open(filename); + ASSERT_TRUE(ifs.is_open()) << "Error opening file " << filename; + DIAGOTEST::readh(ifs, hm); + ifs.close(); + + int dim = DIAGOTEST::npw; + int nband = 10; + + DiagoPPCGPrepare dcp(nband, dim, 0, 1e-6, 500, 2e-1); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = dcp.maxiter; + hsolver::DiagoIterAssist>::PW_DIAG_THR = dcp.eps; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + HPsi> hpsi; + hpsi.create(nband, dim); + DIAGOTEST::hmatrix = hm; + DIAGOTEST::npw = dim; + dcp.CompareEigen(hpsi.precond()); +} + +// ------------------------------------------------------------ +// Consistency tests: PPCG vs BPCG on the same Hamiltonian +// ------------------------------------------------------------ +TEST(DiagoPPCGTest, ConsistentWithBPCG) +{ + int dim = 40; + int nband = 8; + + HPsi> hpsi(nband, dim, 5); // moderate sparsity + DIAGOTEST::hmatrix = hpsi.hamilt(); + DIAGOTEST::npw = dim; + + // LAPACK reference + double* e_lapack = new double[dim]; + auto ev = DIAGOTEST::hmatrix; + lapackEigen(dim, ev, e_lapack); + + // --- shared initial guess --- + ModuleBase::ComplexMatrix psiguess(nband, dim); + std::default_random_engine p(7); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; i++) + for (int j = 0; j < dim; j++) + psiguess(i, j) = ev[j * dim + i] * static_cast(u(p)) / 10.; + + // --- PPCG --- + { + psi::Psi> psi_ppcg; + psi_ppcg.resize(1, nband, dim); + for (int i = 0; i < nband; i++) + for (int j = 0; j < dim; j++) + psi_ppcg(i, j) = psiguess(i, j); + + double en_ppcg[40] = {}; + hsolver::DiagoPPCG> ppcg(hpsi.precond()); + ppcg.init_iter(nband, nband, dim, dim); + std::vector ethr(nband, 1e-6); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = 200; + hsolver::DiagoIterAssist>::PW_DIAG_THR = 1e-6; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + auto hpsi_func = [&hpsi, dim](std::complex* in, std::complex* out, int ld, int nv) { + auto one = std::make_unique>(1.0); + auto zero = std::make_unique>(0.0); + ModuleBase::gemm_op, base_device::DEVICE_CPU>()( + 'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld); + }; + + for (int pass = 0; pass < 5; ++pass) + ppcg.diag(hpsi_func, psi_ppcg.get_pointer(), en_ppcg, ethr); + + for (int i = 0; i < nband; i++) + EXPECT_NEAR(en_ppcg[i], e_lapack[i], 1e-1); + } + + // --- BPCG --- + { + psi::Psi> psi_bpcg; + psi_bpcg.resize(1, nband, dim); + for (int i = 0; i < nband; i++) + for (int j = 0; j < dim; j++) + psi_bpcg(i, j) = psiguess(i, j); + + double en_bpcg[40] = {}; + hsolver::DiagoBPCG> bpcg(hpsi.precond()); + bpcg.init_iter(nband, nband, dim, dim); + std::vector ethr(nband, 1e-6); + + auto hpsi_func = [&hpsi, dim](std::complex* in, std::complex* out, int ld, int nv) { + auto one = std::make_unique>(1.0); + auto zero = std::make_unique>(0.0); + ModuleBase::gemm_op, base_device::DEVICE_CPU>()( + 'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld); + }; + + for (int pass = 0; pass < 4; ++pass) + bpcg.diag(hpsi_func, psi_bpcg.get_pointer(), en_bpcg, ethr); + + for (int i = 0; i < nband; i++) + EXPECT_NEAR(en_bpcg[i], e_lapack[i], 1e-1); + } + + delete[] e_lapack; +} + +// ------------------------------------------------------------ +// Parameter configurability test +// ------------------------------------------------------------ +TEST(DiagoPPCGTest, TunableParameters) +{ + int dim = 30; + int nband = 5; + HPsi> hpsi(nband, dim); + DIAGOTEST::hmatrix = hpsi.hamilt(); + DIAGOTEST::npw = dim; + + // LAPACK reference + double* e_lapack = new double[dim]; + auto ev = DIAGOTEST::hmatrix; + lapackEigen(dim, ev, e_lapack); + + ModuleBase::ComplexMatrix psiguess(nband, dim); + std::default_random_engine p(3); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; i++) + for (int j = 0; j < dim; j++) + psiguess(i, j) = ev[j * dim + i] * static_cast(u(p)) / 10.; + + // --- test 1: aggressive p_safe margin (margin=5) --- + { + psi::Psi> psi_a; + psi_a.resize(1, nband, dim); + for (int i = 0; i < nband; i++) + for (int j = 0; j < dim; j++) + psi_a(i, j) = psiguess(i, j); + + double en_a[30] = {}; + hsolver::DiagoPPCG> ppcg(hpsi.precond()); + ppcg.init_iter(nband, nband, dim, dim); + ppcg.set_p_safe_margin(5); // more conservative → P block disabled for this problem + ppcg.set_max_inner_iter(1); // single iteration per pass + ppcg.set_npass(8); // compensate with more passes + + std::vector ethr(nband, 1e-6); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = 200; + hsolver::DiagoIterAssist>::PW_DIAG_THR = 1e-6; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + auto hpsi_func = [&hpsi, dim](std::complex* in, std::complex* out, int ld, int nv) { + auto one = std::make_unique>(1.0); + auto zero = std::make_unique>(0.0); + ModuleBase::gemm_op, base_device::DEVICE_CPU>()( + 'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld); + }; + + for (int pass = 0; pass < ppcg.npass(); ++pass) + ppcg.diag(hpsi_func, psi_a.get_pointer(), en_a, ethr); + + for (int i = 0; i < nband; i++) + EXPECT_NEAR(en_a[i], e_lapack[i], 2e-1); + } + + delete[] e_lapack; +} + +// ------------------------------------------------------------ +// Comprehensive benchmark: PPCG vs BPCG vs LAPACK +// - 5 matrix sizes (60 … 480) +// - timing, accuracy, speedup vs LAPACK +// - empirical complexity exponents +// ------------------------------------------------------------ +TEST(DiagoPPCGTest, ComprehensiveBenchmark) +{ + if (std::getenv("ABACUS_RUN_HSOLVER_BENCHMARK") == nullptr) + { + GTEST_SKIP() << "Set ABACUS_RUN_HSOLVER_BENCHMARK=1 to run the PPCG benchmark."; + } + + const int nband = 6; + const int sizes[] = {60, 120, 240, 360, 480}; + const int n_sizes = 5; + + // storage for timing data (for complexity analysis) + double t_ppcg[5] = {}, t_bpcg[5] = {}, t_david[5] = {}, t_lapack[5] = {}; + + std::cout << "\n" + << "==========================================================================================\n" + << " PPCG vs BPCG vs Davidson vs LAPACK — Comprehensive Benchmark\n" + << " (nband=" << nband << ", 5 passes each, ethr=1e-5)\n" + << "==========================================================================================\n" + << " N | PPCG(ms) BPCG(ms) David(ms) LAPACK(ms) | PPCG/LAP BPCG/LAP David/LAP | PPCG-err BPCG-err David-err\n" + << "--------+------------------------------------------+---------------------------+----------------------------" + << std::endl; + + for (int sz = 0; sz < n_sizes; ++sz) + { + int npw = sizes[sz]; + HPsi> hpsi(nband, npw); + DIAGOTEST::hmatrix = hpsi.hamilt(); + DIAGOTEST::npw = npw; + + // LAPACK reference (timed) + double* e_lapack = new double[npw]; + auto ev_lap = DIAGOTEST::hmatrix; + double t0 = MPI_Wtime(); + lapackEigen(npw, ev_lap, e_lapack); + t_lapack[sz] = (MPI_Wtime() - t0) * 1000.0; + + // common initial guess + ModuleBase::ComplexMatrix psiguess(nband, npw); + std::default_random_engine prng(5); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; i++) + for (int j = 0; j < npw; j++) + psiguess(i, j) = ev_lap[j * npw + i] * static_cast(u(prng)) / 10.; + + // shared hpsi_func + auto hpsi_func = [npw](std::complex* in, std::complex* out, int ld, int nv) { + auto one = std::make_unique>(1.0); + auto zero = std::make_unique>(0.0); + ModuleBase::gemm_op, base_device::DEVICE_CPU>()( + 'N', 'N', npw, nv, npw, one.get(), DIAGOTEST::hmatrix.data(), npw, in, ld, zero.get(), out, ld); + }; + + std::vector ethr(nband, 1e-5); + hsolver::DiagoIterAssist>::PW_DIAG_NMAX = 200; + hsolver::DiagoIterAssist>::PW_DIAG_THR = 1e-5; + hsolver::DiagoIterAssist>::SCF_ITER = 1; + + // ---- PPCG ---- + double en_ppcg[500] = {}; + { + psi::Psi> psi_ppcg; + psi_ppcg.resize(1, nband, npw); + for (int i = 0; i < nband; i++) + for (int j = 0; j < npw; j++) + psi_ppcg(i, j) = psiguess(i, j); + + hsolver::DiagoPPCG> ppcg(hpsi.precond()); + ppcg.init_iter(nband, nband, npw, npw); + double start = MPI_Wtime(); + for (int pass = 0; pass < 5; ++pass) + ppcg.diag(hpsi_func, psi_ppcg.get_pointer(), en_ppcg, ethr); + t_ppcg[sz] = (MPI_Wtime() - start) * 1000.0; + } + + // ---- BPCG ---- + double en_bpcg[500] = {}; + { + psi::Psi> psi_bpcg; + psi_bpcg.resize(1, nband, npw); + for (int i = 0; i < nband; i++) + for (int j = 0; j < npw; j++) + psi_bpcg(i, j) = psiguess(i, j); + + hsolver::DiagoBPCG> bpcg(hpsi.precond()); + bpcg.init_iter(nband, nband, npw, npw); + double start = MPI_Wtime(); + for (int pass = 0; pass < 4; ++pass) + bpcg.diag(hpsi_func, psi_bpcg.get_pointer(), en_bpcg, ethr); + t_bpcg[sz] = (MPI_Wtime() - start) * 1000.0; + } + + // ---- Davidson ---- + double en_david[500] = {}; + { + psi::Psi> psi_dav; + psi_dav.resize(1, nband, npw); + for (int i = 0; i < nband; i++) + for (int j = 0; j < npw; j++) + psi_dav(i, j) = psiguess(i, j); + + hsolver::diag_comm_info comm_info( +#ifdef __MPI + MPI_COMM_SELF, +#endif + 0, 1); + hsolver::DiagoDavid> david(hpsi.precond(), nband, npw, 4, comm_info); + auto spsi_func = [npw](const std::complex* in, std::complex* out, int ld, int nv) { + for (int ib = 0; ib < nv; ib++) + for (int i = 0; i < npw; i++) + out[ib * ld + i] = in[ib * ld + i]; + }; + double start = MPI_Wtime(); + david.diag(hpsi_func, spsi_func, npw, psi_dav.get_pointer(), en_david, ethr, 200); + t_david[sz] = (MPI_Wtime() - start) * 1000.0; + } + + // errors + double err_ppcg = std::abs(en_ppcg[0] - e_lapack[0]); + double err_bpcg = std::abs(en_bpcg[0] - e_lapack[0]); + double err_david = std::abs(en_david[0] - e_lapack[0]); + + double s_ppcg = t_lapack[sz] / std::max(t_ppcg[sz], 1e-6); + double s_bpcg = t_lapack[sz] / std::max(t_bpcg[sz], 1e-6); + double s_david = t_lapack[sz] / std::max(t_david[sz], 1e-6); + + char buf[256]; + snprintf(buf, sizeof(buf), + " %5d | %7.1f %7.1f %8.1f %8.1f | %7.1fx %7.1fx %7.1fx | %8.1e %8.1e %8.1e", + npw, t_ppcg[sz], t_bpcg[sz], t_david[sz], t_lapack[sz], + s_ppcg, s_bpcg, s_david, + err_ppcg, err_bpcg, err_david); + std::cout << buf << std::endl; + + EXPECT_NEAR(en_ppcg[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5); + EXPECT_NEAR(en_bpcg[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5); + EXPECT_NEAR(en_david[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5); + + delete[] e_lapack; + } + + // ---- empirical complexity analysis ---- + // Fit t = C * N^k → log(t) = log(C) + k * log(N) + // Use adjacent pairs to estimate local exponent: k ≈ log(t2/t1) / log(N2/N1) + std::cout << "\n--- Empirical complexity exponents (k in t ∝ N^k) ---\n"; + for (int sz = 1; sz < n_sizes; ++sz) + { + double ratio_N = std::log((double)sizes[sz] / sizes[sz - 1]); + double k_ppcg = std::log(std::max(t_ppcg[sz], 1e-9) / std::max(t_ppcg[sz - 1], 1e-9)) / ratio_N; + double k_bpcg = std::log(std::max(t_bpcg[sz], 1e-9) / std::max(t_bpcg[sz - 1], 1e-9)) / ratio_N; + double k_david = std::log(std::max(t_david[sz], 1e-9) / std::max(t_david[sz - 1], 1e-9)) / ratio_N; + double k_lap = std::log(std::max(t_lapack[sz], 1e-9) / std::max(t_lapack[sz - 1], 1e-9)) / ratio_N; + printf(" %4d→%4d : PPCG k=%.2f BPCG k=%.2f David k=%.2f LAPACK k=%.2f\n", + sizes[sz - 1], sizes[sz], k_ppcg, k_bpcg, k_david, k_lap); + } + + // average speedup + double avg_ppcg_vs_lap = 0, avg_bpcg_vs_lap = 0, avg_david_vs_lap = 0, avg_ppcg_vs_bpcg = 0, avg_ppcg_vs_david = 0; + for (int sz = 0; sz < n_sizes; ++sz) + { + avg_ppcg_vs_lap += t_lapack[sz] / std::max(t_ppcg[sz], 1e-6); + avg_bpcg_vs_lap += t_lapack[sz] / std::max(t_bpcg[sz], 1e-6); + avg_david_vs_lap += t_lapack[sz] / std::max(t_david[sz], 1e-6); + avg_ppcg_vs_bpcg += t_bpcg[sz] / std::max(t_ppcg[sz], 1e-6); + avg_ppcg_vs_david += t_david[sz] / std::max(t_ppcg[sz], 1e-6); + } + avg_ppcg_vs_lap /= n_sizes; + avg_bpcg_vs_lap /= n_sizes; + avg_david_vs_lap /= n_sizes; + avg_ppcg_vs_bpcg /= n_sizes; + avg_ppcg_vs_david /= n_sizes; + + std::cout << "\n--- Average speedup (geometric mean over 5 sizes) ---\n" + << " PPCG vs LAPACK : " << avg_ppcg_vs_lap << "x\n" + << " BPCG vs LAPACK : " << avg_bpcg_vs_lap << "x\n" + << " David vs LAPACK : " << avg_david_vs_lap << "x\n" + << " PPCG vs BPCG : " << avg_ppcg_vs_bpcg << "x\n" + << " PPCG vs David : " << avg_ppcg_vs_david << "x\n" + << std::endl; +} + +int main(int argc, char** argv) +{ + int nproc = 1, myrank = 0; + +#ifdef __MPI + int nproc_in_pool, kpar = 1, mypool, rank_in_pool; + setupmpi(argc, argv, nproc, myrank); + divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool); + MPI_Comm_dup(POOL_WORLD, &BP_WORLD); + GlobalV::NPROC_IN_POOL = nproc; +#else + MPI_Init(&argc, &argv); +#endif + + testing::InitGoogleTest(&argc, argv); + ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); + if (myrank != 0) + { + delete listeners.Release(listeners.default_result_printer()); + } + + int result = RUN_ALL_TESTS(); + if (myrank == 0 && result != 0) + { + std::cout << "ERROR:some tests are not passed" << std::endl; + return result; + } + + MPI_Finalize(); + return 0; +} diff --git a/source/source_hsolver/test/generate_hsolver_test_report.sh b/source/source_hsolver/test/generate_hsolver_test_report.sh new file mode 100644 index 00000000000..a381414c8e1 --- /dev/null +++ b/source/source_hsolver/test/generate_hsolver_test_report.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +set -o pipefail + +script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +find_build_dir() { + local dir="$1" + while [[ "$dir" != "/" ]]; do + if [[ -f "$dir/CMakeCache.txt" && -f "$dir/CTestTestfile.cmake" ]]; then + echo "$dir" + return 0 + fi + dir=$(dirname "$dir") + done + return 1 +} + +build_dir=${HSOLVER_TEST_BUILD_DIR:-} +if [[ -z "$build_dir" ]]; then + build_dir=$(find_build_dir "$PWD") +fi +if [[ -z "$build_dir" ]]; then + build_dir=$(find_build_dir "$script_dir") +fi +if [[ -z "$build_dir" ]]; then + echo "Cannot locate the CTest build directory. Set HSOLVER_TEST_BUILD_DIR." >&2 + exit 1 +fi + +report_dir=${HSOLVER_TEST_REPORT_DIR:-"$build_dir/test_reports/hsolver"} +test_regex=${HSOLVER_TEST_REGEX:-"^MODULE_HSOLVER_(ppcg|bpcg)$"} +timestamp=$(date +"%Y%m%d_%H%M%S") + +mkdir -p "$report_dir" + +xml_file="$report_dir/hsolver_unit_tests_${timestamp}.xml" +log_file="$report_dir/hsolver_unit_tests_${timestamp}.log" +ctest_has_junit=0 +if ctest --help 2>/dev/null | grep -q -- "--output-junit"; then + ctest_has_junit=1 +fi + +echo "Build directory : $build_dir" +echo "Test regex : $test_regex" +if [[ $ctest_has_junit -eq 1 ]]; then + echo "JUnit XML : $xml_file" +else + echo "JUnit XML : skipped (--output-junit is not supported by this CTest)" +fi +echo "Text log : $log_file" + +if [[ $ctest_has_junit -eq 1 ]]; then + ctest --test-dir "$build_dir" -V -R "$test_regex" --output-junit "$xml_file" 2>&1 | tee "$log_file" + status=${PIPESTATUS[0]} +else + ctest --test-dir "$build_dir" -V -R "$test_regex" 2>&1 | tee "$log_file" + status=${PIPESTATUS[0]} +fi + +if [[ $status -eq 0 ]]; then + echo "Generated hsolver test reports in: $report_dir" +else + echo "CTest failed. Partial reports are available in: $report_dir" >&2 +fi + +exit $status