deepmodeling · L12345j · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt
@@ -4,6 +4,7 @@ list(APPEND objects
     diago_david.cpp
     diago_dav_subspace.cpp
     diago_bpcg.cpp
+  diago_ppcg.cpp
     para_linear_transform.cpp
     hsolver_pw.cpp
     hsolver_lcaopw.cpp

diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp
diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h
@@ -0,0 +1,145 @@
+#ifndef DIAGO_PPCG_H_
+#define DIAGO_PPCG_H_
+
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_device/memory_op.h"
+#include "source_base/module_device/types.h"
+#include "source_base/para_gemm.h"
+#include "source_hsolver/kernels/hegvd_op.h"
+
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <functional>
+#include <source_base/macros.h>
+
+namespace hsolver {
+
+// Projected Preconditioned Conjugate Gradient (block) eigensolver.
+// This implementation follows an LOBPCG-style subspace projection:
+//   V = [X, W, P] (or [X, W] for the first iter)
+//   solve (V^H H V) c = (V^H V) c Λ
+//   update X <- V c(:,1:nband)
+//   update P <- [W, P] c_{W,P}(:,1:nband)
+// with W from preconditioned residual projected to the complement of X (and P).
+//
+// Notes:
+// - Designed to match the existing diag interface used by BPCG.
+// - Preconditioner is treated as a diagonal Real vector of length n_basis.
+
+template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
+class DiagoPPCG
+{
+  private:
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    explicit DiagoPPCG(const Real* precondition);
+    ~DiagoPPCG();
+
+    void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);
+
+    // ---- tunable parameters ----
+    /// Maximum inner iterations per diag() call when P-block is safe.
+    /// Default 3; set to 1 for ultra-conservative mode or higher for
+    /// difficult spectra.
+    void set_max_inner_iter(const int n) { max_inner_iter_ = n; }
+
+    /// Safety margin for P-block usage: P is enabled only when
+    /// 3 * n_band <= n_dim - p_safe_margin_.
+    /// Default 2; increase for better numerical stability at the cost of
+    /// slower convergence on well-conditioned problems.
+    void set_p_safe_margin(const int m) { p_safe_margin_ = m; }
+
+    /// Number of diag() passes performed by the factory (hsolver_pw).
+    /// Default 5; matching BPCG's multi-pass strategy.
+    void set_npass(const int n) { npass_ = n; }
+    int npass() const { return npass_; }
+    // ---- end tunable parameters ----
+
+    using HPsiFunc = std::function<void(T*, T*, const int, const int)>;
+
+    void diag(const HPsiFunc& hpsi_func,
+              T* psi_in,
+              Real* eigenvalue_in,
+              const std::vector<double>& ethr_band);
+
+  private:
+    int n_band = 0;
+    int n_band_l = 0;
+    int n_basis = 0;
+    int n_dim = 0;
+
+    // tunable parameters (see set_xxx methods above)
+    int max_inner_iter_ = 3;
+    int p_safe_margin_ = 2;
+    int npass_ = 5;
+
+    ct::DataType r_type = ct::DataType::DT_INVALID;
+    ct::DataType t_type = ct::DataType::DT_INVALID;
+    ct::DeviceType device_type = ct::DeviceType::UnKnown;
+
+    // Host pointer mapped preconditioner + device copy
+    ct::Tensor h_prec = {};
+    ct::Tensor prec = {};
+
+    // Work vectors (column-major, lda = n_basis): each tensor stores a matrix (n_basis x ncols)
+    // as a contiguous array; we use Tensor with shape {ncols, n_basis} for contiguous column blocks.
+    ct::Tensor X = {};   // mapped to psi_in
+    ct::Tensor HX = {};
+    ct::Tensor R = {};
+    ct::Tensor W = {};
+    ct::Tensor HW = {};
+    ct::Tensor P = {};
+    ct::Tensor HP = {};
+
+    ct::Tensor V = {};   // basis [X,W,P] packed
+    ct::Tensor HV = {};  // H*V
+
+    ct::Tensor hcc = {}; // V^H H V
+    ct::Tensor scc = {}; // V^H V
+    ct::Tensor vcc = {}; // eigenvectors of projected problem
+    ct::Tensor eval = {}; // eigenvalues of projected problem
+
+    ct::Tensor work = {}; // generic workspace (ncols x n_basis)
+
+    // Parallel matmul helper (A^H B)
+    ModuleBase::PGemmCN<T, Device> pmmcn;
+
+    // Device memory helpers
+    Device* ctx = {};
+    const T one_ = static_cast<T>(1.0);
+    const T zero_ = static_cast<T>(0.0);
+    const T neg_one_ = static_cast<T>(-1.0);
+    const T* one = &one_;
+    const T* zero = &zero_;
+    const T* neg_one = &neg_one_;
+
+    using syncmem_complex_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
+    using syncmem_var_op = base_device::memory::synchronize_memory_op<Real, Device, Device>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<Real, Device, base_device::DEVICE_CPU>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
+
+    void calc_prec();
+
+    void apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec);
+
+    void pack_basis(const int ncols, const bool has_p);
+
+    void compute_projected_mats(const int ncols);
+
+    void solve_projected(const int ncols);
+
+    void update_from_projected(const int ncols, const bool has_p, const bool update_p);
+
+    void compute_residual_and_precond(const std::vector<double>& ethr_band, bool& not_conv);
+
+    void orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols);
+
+    void project_out(const ct::Tensor& basis, const int ncols_basis, ct::Tensor& vecs, const int ncols_vecs);
+
+    bool check_convergence(const ct::Tensor& residual, const std::vector<double>& ethr_band);
+};
+
+} // namespace hsolver
+
+#endif // DIAGO_PPCG_H_
diff --git a/source/source_hsolver/hsolver_pw.cpp b/source/source_hsolver/hsolver_pw.cpp
@@ -8,6 +8,7 @@
 #include "source_hamilt/hamilt.h"
 #include "source_hsolver/diag_comm_info.h"
 #include "source_hsolver/diago_bpcg.h"
+#include "source_hsolver/diago_ppcg.h"
 #include "source_hsolver/diago_cg.h"
 #include "source_hsolver/diago_dav_subspace.h"
 #include "source_hsolver/diago_david.h"
@@ -83,7 +84,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     this->nproc_in_pool = nproc_in_pool_in;
 
     // report if the specified diagonalization method is not supported
-    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg"};
+    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg", "ppcg"};
     if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods))
     {
         ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!");
@@ -323,6 +324,19 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
         bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
         bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
     }
+    else if (this->method == "ppcg")
+    {
+        const int nband_l = psi.get_nbands();
+        const int nbasis = psi.get_nbasis();
+        const int ndim = psi.get_current_ngk();
+        DiagoPPCG<T, Device> ppcg(pre_condition.data());
+        ppcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
+        // Multiple passes for robust convergence (same strategy as BPCG in unit tests)
+        for (int pass = 0; pass < ppcg.npass(); ++pass)
+        {
+            ppcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
+        }
+    }
     else if (this->method == "dav_subspace")
     {
         bool scf = this->calculation_type == "nscf" ? false : true;

diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -3,9 +3,26 @@
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_reduce.h"
 #include <vector>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 namespace hsolver
 {
 
+namespace
+{
+constexpr int kBpcgOpenmpMinWork = 4096;
+
+inline bool use_bpcg_openmp(int n)
+{
+#ifdef _OPENMP
+    return n >= kBpcgOpenmpMinWork && omp_get_max_threads() > 1;
+#else
+    return false;
+#endif
+}
+} // namespace
+
 template <typename T>
 struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
 {
@@ -26,6 +43,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:epsilo_0,epsilo_1,epsilo_2) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -41,6 +61,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
             cos_theta = std::cos(theta);
             sin_theta = std::sin(theta);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -71,12 +94,14 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
             Real err = 0.0;
             Real beta = 0.0;
             Real epsilo = 0.0;
-            Real grad_2 = {0.0};
-            T grad_1 = {0.0, 0.0};
+            const Real beta_old = beta_out[band_idx];
             auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:epsilo) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -85,21 +110,27 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                 epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
             Parallel_Reduce::reduce_pool(epsilo);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:err,beta) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
-                grad_2 = std::norm(grad_1);
+                const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                const Real grad_2 = std::norm(grad_1);
                 err += grad_2;
                 beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
             }
             Parallel_Reduce::reduce_pool(err);
             Parallel_Reduce::reduce_pool(beta);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
-                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
+                const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_old * grad_old_out[item];
             }
             beta_out[band_idx] = beta;
             err_out[band_idx] = sqrt(err);
@@ -113,6 +144,9 @@ struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
     using Real = typename GetTypeReal<T>::type;
     void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
     {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) schedule(static) if(use_bpcg_openmp(nbase * notconv))
+#endif
         for (int m = 0; m < notconv; m++)
         {
             for (int idx = 0; idx < nbase; idx++)
@@ -133,19 +167,14 @@ struct precondition_op<T, base_device::DEVICE_CPU> {
                    const Real* precondition,
                    const Real* eigenvalues)
     {
-        std::vector<Real> pre(dim, 0.0);
         for (int m = 0; m < notconv; m++)
         {
             for (size_t i = 0; i < dim; i++)
             {
                 Real x = std::abs(precondition[i] - eigenvalues[m]);
-                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+                Real denom = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+                psi_iter[(nbase + m) * dim + i] /= denom;
             }
-            ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(
-                                                             dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             pre.data());
         }
     }
 };