Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7c13930
PPCG_5.29
Sereiner-stu May 29, 2026
77823ca
Add bpcg kernel op
Growl1234 May 29, 2026
93a52f5
Merge pull request #1 from Sereiner-stu/PPCG
L12345j May 29, 2026
8e88b90
Merge pull request #3 from Inteygros/feature/eigen-solver-optimization
L12345j May 29, 2026
3e942c5
test: extend hsolver bpcg and ppcg unit tests
May 29, 2026
6fd9f58
Merge pull request #4 from L12345j/hsolver-unit-tests-q1-q4
L12345j May 29, 2026
ec5f7b9
PPCG_6.2_xyr
Sereiner-stu Jun 2, 2026
d93edad
Merge pull request #5 from Sereiner-stu/PPCG
L12345j Jun 5, 2026
b27fc9e
Improve PPCG boundary tests and small-subspace handling
Jun 5, 2026
6e8faec
Merge pull request #7 from L12345j/hsolver-unit-tests-ppcg-deep-opt
L12345j Jun 5, 2026
4fc9906
PPCG: rebase onto latest L12345j/develop
Sereiner-stu Jun 17, 2026
148dbca
Merge pull request #10 from Sereiner-stu/PPCG_clean
L12345j Jun 19, 2026
68c3359
Improve hsolver PPCG tests and add test report target
Jun 19, 2026
a54926d
Merge pull request #11 from L12345j/hsolver-final-unit-tests-0619
L12345j Jun 19, 2026
6df6c62
Remove accidentally committed report files
L12345j Jun 19, 2026
2610631
Address hsolver PPCG test review comments
Jun 19, 2026
b56b969
Add missing functional include for DiagoPPCG
Jun 19, 2026
e9cea85
Merge pull request #12 from L12345j/hsolver-final-unit-tests-0619
L12345j Jun 19, 2026
1c433a1
Merge pull request #13 from L12345j/hsolver-final-unit-tests-0619
L12345j Jun 19, 2026
f786c24
Merge branch 'develop' into develop
L12345j Jun 19, 2026
f40da25
Link PPCG sources into hsolver pw tests
Jun 19, 2026
5b50e3d
Merge pull request #14 from L12345j/hsolver-final-unit-tests-0619
L12345j Jun 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions source/source_hsolver/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ list(APPEND objects
diago_david.cpp
diago_dav_subspace.cpp
diago_bpcg.cpp
diago_ppcg.cpp
para_linear_transform.cpp
hsolver_pw.cpp
hsolver_lcaopw.cpp
Expand Down
797 changes: 797 additions & 0 deletions source/source_hsolver/diago_ppcg.cpp

Large diffs are not rendered by default.

145 changes: 145 additions & 0 deletions source/source_hsolver/diago_ppcg.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#ifndef DIAGO_PPCG_H_
#define DIAGO_PPCG_H_

#include "source_base/kernels/math_kernel_op.h"
#include "source_base/module_device/memory_op.h"
#include "source_base/module_device/types.h"
#include "source_base/para_gemm.h"
#include "source_hsolver/kernels/hegvd_op.h"

#include <ATen/core/tensor.h>
#include <ATen/core/tensor_map.h>
#include <functional>
#include <source_base/macros.h>

namespace hsolver {

// Projected Preconditioned Conjugate Gradient (block) eigensolver.
// This implementation follows an LOBPCG-style subspace projection:
// V = [X, W, P] (or [X, W] for the first iter)
// solve (V^H H V) c = (V^H V) c Λ
// update X <- V c(:,1:nband)
// update P <- [W, P] c_{W,P}(:,1:nband)
// with W from preconditioned residual projected to the complement of X (and P).
//
// Notes:
// - Designed to match the existing diag interface used by BPCG.
// - Preconditioner is treated as a diagonal Real vector of length n_basis.

template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
class DiagoPPCG
{
private:
using Real = typename GetTypeReal<T>::type;

public:
explicit DiagoPPCG(const Real* precondition);
~DiagoPPCG();

void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);

// ---- tunable parameters ----
/// Maximum inner iterations per diag() call when P-block is safe.
/// Default 3; set to 1 for ultra-conservative mode or higher for
/// difficult spectra.
void set_max_inner_iter(const int n) { max_inner_iter_ = n; }

/// Safety margin for P-block usage: P is enabled only when
/// 3 * n_band <= n_dim - p_safe_margin_.
/// Default 2; increase for better numerical stability at the cost of
/// slower convergence on well-conditioned problems.
void set_p_safe_margin(const int m) { p_safe_margin_ = m; }

/// Number of diag() passes performed by the factory (hsolver_pw).
/// Default 5; matching BPCG's multi-pass strategy.
void set_npass(const int n) { npass_ = n; }
int npass() const { return npass_; }
// ---- end tunable parameters ----

using HPsiFunc = std::function<void(T*, T*, const int, const int)>;

void diag(const HPsiFunc& hpsi_func,
T* psi_in,
Real* eigenvalue_in,
const std::vector<double>& ethr_band);

private:
int n_band = 0;
int n_band_l = 0;
int n_basis = 0;
int n_dim = 0;

// tunable parameters (see set_xxx methods above)
int max_inner_iter_ = 3;
int p_safe_margin_ = 2;
int npass_ = 5;

ct::DataType r_type = ct::DataType::DT_INVALID;
ct::DataType t_type = ct::DataType::DT_INVALID;
ct::DeviceType device_type = ct::DeviceType::UnKnown;

// Host pointer mapped preconditioner + device copy
ct::Tensor h_prec = {};
ct::Tensor prec = {};

// Work vectors (column-major, lda = n_basis): each tensor stores a matrix (n_basis x ncols)
// as a contiguous array; we use Tensor with shape {ncols, n_basis} for contiguous column blocks.
ct::Tensor X = {}; // mapped to psi_in
ct::Tensor HX = {};
ct::Tensor R = {};
ct::Tensor W = {};
ct::Tensor HW = {};
ct::Tensor P = {};
ct::Tensor HP = {};

ct::Tensor V = {}; // basis [X,W,P] packed
ct::Tensor HV = {}; // H*V

ct::Tensor hcc = {}; // V^H H V
ct::Tensor scc = {}; // V^H V
ct::Tensor vcc = {}; // eigenvectors of projected problem
ct::Tensor eval = {}; // eigenvalues of projected problem

ct::Tensor work = {}; // generic workspace (ncols x n_basis)

// Parallel matmul helper (A^H B)
ModuleBase::PGemmCN<T, Device> pmmcn;

// Device memory helpers
Device* ctx = {};
const T one_ = static_cast<T>(1.0);
const T zero_ = static_cast<T>(0.0);
const T neg_one_ = static_cast<T>(-1.0);
const T* one = &one_;
const T* zero = &zero_;
const T* neg_one = &neg_one_;

using syncmem_complex_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
using syncmem_var_op = base_device::memory::synchronize_memory_op<Real, Device, Device>;
using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<Real, Device, base_device::DEVICE_CPU>;
using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;

void calc_prec();

void apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec);

void pack_basis(const int ncols, const bool has_p);

void compute_projected_mats(const int ncols);

void solve_projected(const int ncols);

void update_from_projected(const int ncols, const bool has_p, const bool update_p);

void compute_residual_and_precond(const std::vector<double>& ethr_band, bool& not_conv);

void orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols);

void project_out(const ct::Tensor& basis, const int ncols_basis, ct::Tensor& vecs, const int ncols_vecs);

bool check_convergence(const ct::Tensor& residual, const std::vector<double>& ethr_band);
};

} // namespace hsolver

#endif // DIAGO_PPCG_H_
16 changes: 15 additions & 1 deletion source/source_hsolver/hsolver_pw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "source_hamilt/hamilt.h"
#include "source_hsolver/diag_comm_info.h"
#include "source_hsolver/diago_bpcg.h"
#include "source_hsolver/diago_ppcg.h"
#include "source_hsolver/diago_cg.h"
#include "source_hsolver/diago_dav_subspace.h"
#include "source_hsolver/diago_david.h"
Expand Down Expand Up @@ -83,7 +84,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
this->nproc_in_pool = nproc_in_pool_in;

// report if the specified diagonalization method is not supported
const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg"};
const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg", "ppcg"};
if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods))
{
ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!");
Expand Down Expand Up @@ -323,6 +324,19 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
}
else if (this->method == "ppcg")
{
const int nband_l = psi.get_nbands();
const int nbasis = psi.get_nbasis();
const int ndim = psi.get_current_ngk();
DiagoPPCG<T, Device> ppcg(pre_condition.data());
ppcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
// Multiple passes for robust convergence (same strategy as BPCG in unit tests)
for (int pass = 0; pass < ppcg.npass(); ++pass)
{
ppcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
}
}
else if (this->method == "dav_subspace")
{
bool scf = this->calculation_type == "nscf" ? false : true;
Expand Down
55 changes: 42 additions & 13 deletions source/source_hsolver/kernels/bpcg_kernel_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,26 @@
#include "source_base/kernels/math_kernel_op.h"
#include "source_base/parallel_reduce.h"
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif
namespace hsolver
{

namespace
{
constexpr int kBpcgOpenmpMinWork = 4096;

inline bool use_bpcg_openmp(int n)
{
#ifdef _OPENMP
return n >= kBpcgOpenmpMinWork && omp_get_max_threads() > 1;
#else
return false;
#endif
}
} // namespace

template <typename T>
struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
{
Expand All @@ -26,6 +43,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) reduction(+:epsilo_0,epsilo_1,epsilo_2) if(use_bpcg_openmp(n_basis))
#endif
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
Expand All @@ -41,6 +61,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
cos_theta = std::cos(theta);
sin_theta = std::sin(theta);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
#endif
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
Expand Down Expand Up @@ -71,12 +94,14 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
Real err = 0.0;
Real beta = 0.0;
Real epsilo = 0.0;
Real grad_2 = {0.0};
T grad_1 = {0.0, 0.0};
const Real beta_old = beta_out[band_idx];
auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
Parallel_Reduce::reduce_pool(norm);
norm = 1.0 / sqrt(norm);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) reduction(+:epsilo) if(use_bpcg_openmp(n_basis))
#endif
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
Expand All @@ -85,21 +110,27 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
}
Parallel_Reduce::reduce_pool(epsilo);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) reduction(+:err,beta) if(use_bpcg_openmp(n_basis))
#endif
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
grad_1 = hpsi_out[item] - epsilo * psi_out[item];
grad_2 = std::norm(grad_1);
const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
const Real grad_2 = std::norm(grad_1);
err += grad_2;
beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
}
Parallel_Reduce::reduce_pool(err);
Parallel_Reduce::reduce_pool(beta);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
#endif
for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
{
auto item = band_idx * n_basis_max + basis_idx;
grad_1 = hpsi_out[item] - epsilo * psi_out[item];
grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_old * grad_old_out[item];
}
beta_out[band_idx] = beta;
err_out[band_idx] = sqrt(err);
Expand All @@ -113,6 +144,9 @@ struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
using Real = typename GetTypeReal<T>::type;
void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
{
#ifdef _OPENMP
#pragma omp parallel for collapse(2) schedule(static) if(use_bpcg_openmp(nbase * notconv))
#endif
for (int m = 0; m < notconv; m++)
{
for (int idx = 0; idx < nbase; idx++)
Expand All @@ -133,19 +167,14 @@ struct precondition_op<T, base_device::DEVICE_CPU> {
const Real* precondition,
const Real* eigenvalues)
{
std::vector<Real> pre(dim, 0.0);
for (int m = 0; m < notconv; m++)
{
for (size_t i = 0; i < dim; i++)
{
Real x = std::abs(precondition[i] - eigenvalues[m]);
pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
Real denom = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
psi_iter[(nbase + m) * dim + i] /= denom;
}
ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(
dim,
psi_iter + (nbase + m) * dim,
psi_iter + (nbase + m) * dim,
pre.data());
}
}
};
Expand Down
Loading
Loading