diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt
index b115d6d4cd2..a64c7eb2159 100644
--- a/source/source_hsolver/CMakeLists.txt
+++ b/source/source_hsolver/CMakeLists.txt
@@ -4,6 +4,7 @@ list(APPEND objects
     diago_david.cpp
     diago_dav_subspace.cpp
     diago_bpcg.cpp
+  diago_ppcg.cpp
     para_linear_transform.cpp
     hsolver_pw.cpp
     hsolver_lcaopw.cpp
diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp
new file mode 100644
index 00000000000..84d97b86789
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.cpp
@@ -0,0 +1,797 @@
+#include "source_hsolver/diago_ppcg.h"
+
+#include "ATen/kernels/lapack.h"
+#include "diago_iter_assist.h"
+#include "source_base/global_variable.h"
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/parallel_comm.h" // POOL_WORLD/BP_WORLD
+#include "source_base/parallel_reduce.h"
+#include "source_hsolver/kernels/bpcg_kernel_op.h" // reuse normalize_op / apply_eigenvalues_op / precondition_op
+
+#ifdef __MPI
+#include <mpi.h>
+#endif
+
+namespace hsolver {
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::DiagoPPCG(const Real* precondition)
+{
+    this->device_type = ct::DeviceTypeToEnum<Device>::value;
+    this->ctx = {}; // default device context
+    this->r_type = ct::DataTypeToEnum<Real>::value;
+    this->t_type = ct::DataTypeToEnum<T>::value;
+
+    this->h_prec = std::move(ct::TensorMap((void*)precondition, this->r_type, this->device_type, {this->n_basis}));
+}
+
+template <typename T, typename Device>
+DiagoPPCG<T, Device>::~DiagoPPCG() = default;
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim)
+{
+    this->n_band = nband;
+    this->n_band_l = nband_l;
+    this->n_basis = nbasis;
+    this->n_dim = ndim;
+
+    this->prec = ct::Tensor(this->r_type, this->device_type, {this->n_basis});
+
+    this->HX = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+    this->R = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+    this->W = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+    this->HW = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+    this->P = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+    this->HP = ct::Tensor(this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+
+    const int max_cols = 3 * this->n_band_l;
+    this->V = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis});
+    this->HV = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis});
+
+    const int max_small = 3 * this->n_band;
+    this->hcc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small});
+    this->scc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small});
+    this->vcc = ct::Tensor(this->t_type, this->device_type, {max_small, max_small});
+    this->eval = ct::Tensor(this->r_type, this->device_type, {max_small});
+    // Zero-initialize so that uninitialised entries are immediately visible
+    // as exact 0.0 rather than denormal garbage (e.g. 4.68e-310).
+    Parallel_Reduce::ZEROS(this->eval.data<Real>(), max_small);
+
+    this->work = ct::Tensor(this->t_type, this->device_type, {max_cols, this->n_basis});
+
+    this->calc_prec();
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::calc_prec()
+{
+    syncmem_var_h2d_op()(this->prec.data<Real>(), this->h_prec.data<Real>(), this->n_basis);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec)
+{
+    // hpsi_func(psi_in, hpsi_out, ld_psi, nvec)
+    hpsi_func(in_vecs.data<T>(), out_vecs.data<T>(), this->n_basis, nvec);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::project_out(const ct::Tensor& basis,
+                                      const int ncols_basis,
+                                      ct::Tensor& vecs,
+                                      const int ncols_vecs)
+{
+    if (ncols_basis <= 0 || ncols_vecs <= 0)
+    {
+        return;
+    }
+
+    // coeff = basis^H * vecs  (ncols_basis x ncols_vecs)
+    const int ldh = ncols_basis;
+    ct::Tensor coeff(this->t_type, this->device_type, {ldh, ncols_vecs});
+
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD,
+                              POOL_WORLD,
+                              ncols_basis,
+                              this->n_basis,
+                              ncols_vecs,
+                              this->n_basis,
+                              this->n_dim,
+                              ldh);
+#else
+    this->pmmcn.set_dimension(ncols_basis,
+                              this->n_basis,
+                              ncols_vecs,
+                              this->n_basis,
+                              this->n_dim,
+                              ldh);
+#endif
+
+    this->pmmcn.multiply(1.0, basis.data<T>(), vecs.data<T>(), 0.0, coeff.data<T>());
+
+    // vecs -= basis * coeff
+    ModuleBase::gemm_op<T, Device>()('N',
+                                    'N',
+                                    this->n_dim,
+                                    ncols_vecs,
+                                    ncols_basis,
+                                    this->neg_one,
+                                    basis.data<T>(),
+                                    this->n_basis,
+                                    coeff.data<T>(),
+                                    ldh,
+                                    this->one,
+                                    vecs.data<T>(),
+                                    this->n_basis);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols)
+{
+    if (ncols <= 0)
+    {
+        return;
+    }
+
+    // gram = A^H A  (ncols x ncols)
+    ct::Tensor gram(this->t_type, this->device_type, {ncols, ncols});
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD,
+                              POOL_WORLD,
+                              ncols,
+                              this->n_basis,
+                              ncols,
+                              this->n_basis,
+                              this->n_dim,
+                              ncols);
+#else
+    this->pmmcn.set_dimension(ncols,
+                              this->n_basis,
+                              ncols,
+                              this->n_basis,
+                              this->n_dim,
+                              ncols);
+#endif
+    this->pmmcn.multiply(static_cast<T>(1.0), A.data<T>(), A.data<T>(), static_cast<T>(0.0), gram.data<T>());
+
+    // Cholesky: gram = U^H U (upper), then invert U in-place -> gram holds inv(U) in upper triangle.
+    // Use the ATen LAPACK wrappers so CPU/GPU paths and error handling are consistent.
+    using ContainerDevice = typename container::PsiToContainer<Device>::type;
+    container::kernels::lapack_potrf<T, ContainerDevice>()('U', ncols, gram.data<T>(), ncols);
+    container::kernels::lapack_trtri<T, ContainerDevice>()('U', 'N', ncols, gram.data<T>(), ncols);
+
+    // Zero out lower triangle so a dense GEMM applies only the upper-triangular factor.
+    T* g = gram.data<T>();
+    for (int j = 0; j < ncols; ++j)
+    {
+        for (int i = j + 1; i < ncols; ++i)
+        {
+            g[i + j * ncols] = static_cast<T>(0.0);
+        }
+    }
+
+    // A <- A * inv(U)
+    ModuleBase::gemm_op<T, Device>()('N',
+                                    'N',
+                                    this->n_dim,
+                                    ncols,
+                                    ncols,
+                                    this->one,
+                                    A.data<T>(),
+                                    this->n_basis,
+                                    gram.data<T>(),
+                                    ncols,
+                                    this->zero,
+                                    this->work.data<T>(),
+                                    this->n_basis);
+    syncmem_complex_op()(A.data<T>(), this->work.data<T>(), static_cast<size_t>(ncols) * this->n_basis);
+
+    if (HA)
+    {
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        this->n_dim,
+                                        ncols,
+                                        ncols,
+                                        this->one,
+                                        HA->data<T>(),
+                                        this->n_basis,
+                                        gram.data<T>(),
+                                        ncols,
+                                        this->zero,
+                                        this->work.data<T>(),
+                                        this->n_basis);
+        syncmem_complex_op()(HA->data<T>(), this->work.data<T>(), static_cast<size_t>(ncols) * this->n_basis);
+    }
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::pack_basis(const int ncols, const bool has_p)
+{
+    // V columns: [X, W, P?]
+    // Copy X
+    syncmem_complex_op()(this->V.data<T>(), this->X.data<T>(), this->n_band_l * this->n_basis);
+    // Copy W
+    syncmem_complex_op()(this->V.data<T>() + this->n_band_l * this->n_basis,
+                         this->W.data<T>(),
+                         this->n_band_l * this->n_basis);
+
+    if (has_p)
+    {
+        syncmem_complex_op()(this->V.data<T>() + 2 * this->n_band_l * this->n_basis,
+                             this->P.data<T>(),
+                             this->n_band_l * this->n_basis);
+    }
+
+    // HV: [HX, HW, HP?]
+    syncmem_complex_op()(this->HV.data<T>(), this->HX.data<T>(), this->n_band_l * this->n_basis);
+
+    syncmem_complex_op()(this->HV.data<T>() + this->n_band_l * this->n_basis,
+                         this->HW.data<T>(),
+                         this->n_band_l * this->n_basis);
+
+    if (has_p)
+    {
+        syncmem_complex_op()(this->HV.data<T>() + 2 * this->n_band_l * this->n_basis,
+                             this->HP.data<T>(),
+                             this->n_band_l * this->n_basis);
+    }
+
+    (void)ncols;
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::compute_projected_mats(const int ncols)
+{
+    // hcc = V^H HV, scc = V^H V (col-major, ldh = ncols)
+    const int ld_small = 3 * this->n_band;
+#ifdef __MPI
+    this->pmmcn.set_dimension(BP_WORLD,
+                              POOL_WORLD,
+                              ncols,
+                              this->n_basis,
+                              ncols,
+                              this->n_basis,
+                              this->n_dim,
+                              ld_small);
+#else
+    this->pmmcn.set_dimension(ncols,
+                              this->n_basis,
+                              ncols,
+                              this->n_basis,
+                              this->n_dim,
+                              ld_small);
+#endif
+
+    this->pmmcn.multiply(1.0, this->V.data<T>(), this->HV.data<T>(), 0.0, this->hcc.data<T>());
+    this->pmmcn.multiply(1.0, this->V.data<T>(), this->V.data<T>(), 0.0, this->scc.data<T>());
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::solve_projected(const int ncols)
+{
+    // Solve (hcc) c = lambda (scc) c, eigenvectors in vcc
+    const int ld_small = 3 * this->n_band;
+    hsolver::hegvd_op<T, Device>()(this->ctx,
+                                   ncols,
+                                   ld_small,
+                                   this->hcc.data<T>(),
+                                   this->scc.data<T>(),
+                                   this->eval.data<Real>(),
+                                   this->vcc.data<T>());
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::update_from_projected(const int ncols, const bool has_p, const bool update_p)
+{
+    // Update X, HX from V, HV using the first n_band eigenvectors.
+    // X_new = V * vcc(:, 1:nband)
+    // HX_new = HV * vcc(:, 1:nband)
+    const T* coeff = this->vcc.data<T>();
+    const int ld_small = 3 * this->n_band;
+
+    // work (n_basis x n_band_l)
+    ModuleBase::gemm_op<T, Device>()('N',
+                                    'N',
+                                    this->n_dim,
+                                    this->n_band_l,
+                                    ncols,
+                                    this->one,
+                                    this->V.data<T>(),
+                                    this->n_basis,
+                                    coeff,
+                                    ld_small,
+                                    this->zero,
+                                    this->work.data<T>(),
+                                    this->n_basis);
+    syncmem_complex_op()(this->X.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    ModuleBase::gemm_op<T, Device>()('N',
+                                    'N',
+                                    this->n_dim,
+                                    this->n_band_l,
+                                    ncols,
+                                    this->one,
+                                    this->HV.data<T>(),
+                                    this->n_basis,
+                                    coeff,
+                                    ld_small,
+                                    this->zero,
+                                    this->work.data<T>(),
+                                    this->n_basis);
+    syncmem_complex_op()(this->HX.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+    if (!update_p)
+    {
+        return;
+    }
+
+    // Update P (search directions) from blocks W and P (exclude X block to keep meaning)
+    // P_new = W * Cw + P * Cp, where Cw = coeff(rows b..2b-1, cols 0..b-1)
+    // and Cp = coeff(rows 2b..3b-1, cols 0..b-1)
+    const int b = this->n_band_l;
+    const int ld = ld_small;
+
+    // When the subspace is smaller than the full [X,W,P] block (ncols < 3b),
+    // only a prefix of W and/or P participates.  Keep the inner dimensions
+    // consistent so we never read garbage rows from vcc.
+    const int ncols_W = std::max(0, std::min(b, ncols - b));
+    if (ncols_W > 0)
+    {
+        const T* Cw = coeff + b; // row offset b in vcc
+
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        this->n_dim,
+                                        b,
+                                        ncols_W,
+                                        this->one,
+                                        this->W.data<T>(),
+                                        this->n_basis,
+                                        Cw,
+                                        ld,
+                                        this->zero,
+                                        this->P.data<T>(),
+                                        this->n_basis);
+
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        this->n_dim,
+                                        b,
+                                        ncols_W,
+                                        this->one,
+                                        this->HW.data<T>(),
+                                        this->n_basis,
+                                        Cw,
+                                        ld,
+                                        this->zero,
+                                        this->HP.data<T>(),
+                                        this->n_basis);
+    }
+
+    if (has_p)
+    {
+        const int ncols_P = std::max(0, std::min(b, ncols - 2 * b));
+        if (ncols_P > 0)
+        {
+            const T* Cp = coeff + 2 * b;
+            // The P block inside V / HV always stores b columns (pack_basis
+            // writes the full block).  Only the first ncols_P of them
+            // correspond to valid rows in Cp.
+            ModuleBase::gemm_op<T, Device>()('N',
+                                            'N',
+                                            this->n_dim,
+                                            b,
+                                            ncols_P,
+                                            this->one,
+                                            this->V.data<T>() + 2 * b * this->n_basis,
+                                            this->n_basis,
+                                            Cp,
+                                            ld,
+                                            this->one,
+                                            this->P.data<T>(),
+                                            this->n_basis);
+            ModuleBase::gemm_op<T, Device>()('N',
+                                            'N',
+                                            this->n_dim,
+                                            b,
+                                            ncols_P,
+                                            this->one,
+                                            this->HV.data<T>() + 2 * b * this->n_basis,
+                                            this->n_basis,
+                                            Cp,
+                                            ld,
+                                            this->one,
+                                            this->HP.data<T>(),
+                                            this->n_basis);
+        }
+    }
+
+        // Keep P orthogonal to X and keep HP consistent with P.
+        // If we do: P <- P - X * (X^H P), then we must also do: HP <- HP - HX * (X^H P)
+        // to preserve the relation HP = H * P inside the subspace.
+        {
+        const int bproj = this->n_band_l;
+        const int ld_coef = bproj;
+        ct::Tensor coef_xp(this->t_type, this->device_type, {ld_coef, bproj});
+
+    #ifdef __MPI
+        this->pmmcn.set_dimension(BP_WORLD,
+                      POOL_WORLD,
+                      bproj,
+                      this->n_basis,
+                      bproj,
+                      this->n_basis,
+                      this->n_dim,
+                      ld_coef);
+    #else
+        this->pmmcn.set_dimension(bproj,
+                      this->n_basis,
+                      bproj,
+                      this->n_basis,
+                      this->n_dim,
+                      ld_coef);
+    #endif
+        // coef_xp = X^H * P
+        this->pmmcn.multiply(1.0, this->X.data<T>(), this->P.data<T>(), 0.0, coef_xp.data<T>());
+
+        // P  -= X  * coef_xp
+        ModuleBase::gemm_op<T, Device>()('N',
+                        'N',
+                        this->n_dim,
+                        bproj,
+                        bproj,
+                        this->neg_one,
+                        this->X.data<T>(),
+                        this->n_basis,
+                        coef_xp.data<T>(),
+                        ld_coef,
+                        this->one,
+                        this->P.data<T>(),
+                        this->n_basis);
+
+        // HP -= HX * coef_xp
+        ModuleBase::gemm_op<T, Device>()('N',
+                        'N',
+                        this->n_dim,
+                        bproj,
+                        bproj,
+                        this->neg_one,
+                        this->HX.data<T>(),
+                        this->n_basis,
+                        coef_xp.data<T>(),
+                        ld_coef,
+                        this->one,
+                        this->HP.data<T>(),
+                        this->n_basis);
+        }
+
+        // Block-orthonormalize P and apply the same transformation to HP.
+        // (Avoid calling normalize_op(P) alone, which would desynchronize HP.)
+        this->orthonormalize_block(this->P, &this->HP, this->n_band_l);
+}
+
+template <typename T, typename Device>
+bool DiagoPPCG<T, Device>::check_convergence(const ct::Tensor& residual, const std::vector<double>& ethr_band)
+{
+    // Check ||r_i|| <= ethr_band[i] for all local bands.
+    bool not_conv = false;
+    for (int ib = 0; ib < this->n_band_l; ++ib)
+    {
+        const T* ri = residual.data<T>() + ib * this->n_basis;
+        const Real nrm2 = std::sqrt(ModuleBase::dot_real_op<T, Device>()(this->n_dim, ri, ri, true));
+        if (ib < static_cast<int>(ethr_band.size()) && nrm2 > static_cast<Real>(ethr_band[ib]))
+        {
+            not_conv = true;
+            break;
+        }
+    }
+
+#ifdef __MPI
+    // Any rank not converged means global not converged.
+    int local = not_conv ? 1 : 0;
+    int global = 0;
+    MPI_Allreduce(&local, &global, 1, MPI_INT, MPI_MAX, BP_WORLD);
+    not_conv = (global != 0);
+#endif
+
+    return !not_conv;
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::compute_residual_and_precond(const std::vector<double>& ethr_band, bool& not_conv)
+{
+    // Residual R = HX - X * diag(eig)
+    // First, R <- HX
+    syncmem_complex_op()(this->R.data<T>(), this->HX.data<T>(), this->n_band_l * this->n_basis);
+
+    // tmp = X * diag(e)
+    apply_eigenvalues_op<T, Device>()(this->n_dim,
+                                      this->n_basis,
+                                      this->n_band_l,
+                                      this->work.data<T>(),
+                                      this->X.data<T>(),
+                                      this->eval.data<Real>());
+    // R -= tmp
+    for (int ib = 0; ib < this->n_band_l; ++ib)
+    {
+        const T alpha = static_cast<T>(-1.0);
+        ModuleBase::axpy_op<T, Device>()(this->n_dim,
+                                         &alpha,
+                                         this->work.data<T>() + ib * this->n_basis,
+                                         1,
+                                         this->R.data<T>() + ib * this->n_basis,
+                                         1);
+    }
+
+    // not_conv if any band residual above threshold
+    not_conv = !this->check_convergence(this->R, ethr_band);
+    if (!not_conv)
+    {
+        return;
+    }
+
+    // W = - M^{-1} R
+    syncmem_complex_op()(this->W.data<T>(), this->R.data<T>(), this->n_band_l * this->n_basis);
+    precondition_op<T, Device>()(this->n_dim,
+                                 this->W.data<T>(),
+                                 0,
+                                 this->n_band_l,
+                                 this->prec.data<Real>(),
+                                 this->eval.data<Real>());
+    for (int ib = 0; ib < this->n_band_l; ++ib)
+    {
+        ModuleBase::vector_mul_real_op<T, Device>()(this->n_dim,
+                                                    this->W.data<T>() + ib * this->n_basis,
+                                                    this->W.data<T>() + ib * this->n_basis,
+                                                    static_cast<Real>(-1.0));
+    }
+
+    // Project W out of X and P (if P contains previous directions)
+    this->project_out(this->X, this->n_band_l, this->W, this->n_band_l);
+
+    // Also project out of P to reduce near-dependencies
+    // (P may be all-zero at the first iter, projection is harmless)
+    this->project_out(this->P, this->n_band_l, this->W, this->n_band_l);
+
+    // Normalize each vector in W
+    normalize_op<T, Device>()(this->n_dim, this->W.data<T>(), 0, this->n_band_l, nullptr);
+}
+
+template <typename T, typename Device>
+void DiagoPPCG<T, Device>::diag(const HPsiFunc& hpsi_func,
+                               T* psi_in,
+                               Real* eigenvalue_in,
+                               const std::vector<double>& ethr_band)
+{
+    // Map external psi to X
+    this->X = ct::TensorMap((void*)psi_in, this->t_type, this->device_type, {this->n_band_l, this->n_basis});
+
+    // Normalize initial X
+    normalize_op<T, Device>()(this->n_dim, this->X.data<T>(), 0, this->n_band_l, nullptr);
+
+    // HX = H X
+    this->apply_h(hpsi_func, this->X, this->HX, this->n_band_l);
+
+    // Make X block-orthonormal (and keep HX consistent) before any projection/RR.
+    this->orthonormalize_block(this->X, &this->HX, this->n_band_l);
+
+    // Initial Rayleigh-Ritz on X alone: solve (X^H H X) c = (X^H X) c Λ
+    {
+        const int ncols = this->n_band;
+        ct::Tensor hxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor sxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor vxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor exx(this->r_type, this->device_type, {ncols});
+
+    #ifdef __MPI
+        this->pmmcn.set_dimension(BP_WORLD,
+                      POOL_WORLD,
+                      this->n_band_l,
+                      this->n_basis,
+                      this->n_band_l,
+                      this->n_basis,
+                      this->n_dim,
+                      ncols);
+    #else
+        this->pmmcn.set_dimension(this->n_band_l,
+                      this->n_basis,
+                      this->n_band_l,
+                      this->n_basis,
+                      this->n_dim,
+                      ncols);
+    #endif
+        this->pmmcn.multiply(1.0, this->X.data<T>(), this->HX.data<T>(), 0.0, hxx.data<T>());
+        this->pmmcn.multiply(1.0, this->X.data<T>(), this->X.data<T>(), 0.0, sxx.data<T>());
+
+        hsolver::hegvd_op<T, Device>()(this->ctx,
+                                       ncols,
+                                       ncols,
+                                       hxx.data<T>(),
+                                       sxx.data<T>(),
+                                       exx.data<Real>(),
+                                       vxx.data<T>());
+
+        // Rotate X, HX: X <- X * vxx, HX <- HX * vxx
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        this->n_dim,
+                                        this->n_band_l,
+                                        ncols,
+                                        this->one,
+                                        this->X.data<T>(),
+                                        this->n_basis,
+                                        vxx.data<T>(),
+                                        ncols,
+                                        this->zero,
+                                        this->work.data<T>(),
+                                        this->n_basis);
+        syncmem_complex_op()(this->X.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+        ModuleBase::gemm_op<T, Device>()('N',
+                                        'N',
+                                        this->n_dim,
+                                        this->n_band_l,
+                                        ncols,
+                                        this->one,
+                                        this->HX.data<T>(),
+                                        this->n_basis,
+                                        vxx.data<T>(),
+                                        ncols,
+                                        this->zero,
+                                        this->work.data<T>(),
+                                        this->n_basis);
+
+        syncmem_complex_op()(this->HX.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+        syncmem_var_op()(this->eval.data<Real>(), exx.data<Real>(), this->n_band);
+    }
+
+    // Clear P/HP to zero for the first outer iteration
+    Parallel_Reduce::ZEROS(this->P.data<T>(), this->n_band_l * this->n_basis);
+    Parallel_Reduce::ZEROS(this->HP.data<T>(), this->n_band_l * this->n_basis);
+
+    // Compute initial residual and preconditioned direction W.
+    bool not_conv = true;
+    this->compute_residual_and_precond(ethr_band, not_conv);
+
+    // Determine how many inner iterations to allow.
+    // When 3*n_band fits in the ambient space the P block is safe and
+    // 2-3 iterations accelerate convergence.  Otherwise stick to 1 to
+    // avoid near-singular overlap matrices.
+    const bool p_safe = (3 * this->n_band <= this->n_dim - this->p_safe_margin_);
+    const int max_iter = p_safe ? this->max_inner_iter_ : 1;
+    const int max_w_cols = std::max(0, std::min(this->n_band_l, this->n_dim - this->n_band_l));
+    int active_w_cols = not_conv ? max_w_cols : 0;
+    if (not_conv && active_w_cols > 0)
+    {
+        // HW = H W
+        this->apply_h(hpsi_func, this->W, this->HW, active_w_cols);
+
+        // Keep W and HW consistent while improving conditioning.
+        this->orthonormalize_block(this->W, &this->HW, active_w_cols);
+    }
+    for (int iter = 0; iter < max_iter && not_conv; ++iter)
+    {
+        const bool has_p = (iter > 0) && p_safe;
+        const int raw_ncols = this->n_band + active_w_cols + (has_p ? this->n_band : 0);
+        const int ncols = std::min(raw_ncols, this->n_dim);
+
+        // Pack basis V/HV
+        this->pack_basis(ncols, has_p);
+
+        // Solve projected generalized eigenproblem
+        this->compute_projected_mats(ncols);
+        this->solve_projected(ncols);
+
+        // Update X/HX and P/HP
+        const bool update_p = (iter + 1 < max_iter);
+        this->update_from_projected(ncols, has_p, update_p);
+
+        if (iter + 1 >= max_iter)
+        {
+            break;
+        }
+
+        // Residual for next convergence check
+        this->compute_residual_and_precond(ethr_band, not_conv);
+
+        if (!not_conv)
+        {
+            break;
+        }
+
+        active_w_cols = max_w_cols;
+        // Update HW for the next iteration
+        this->apply_h(hpsi_func, this->W, this->HW, active_w_cols);
+
+        // Keep W and HW consistent
+        this->orthonormalize_block(this->W, &this->HW, active_w_cols);
+    }
+
+    // Final Rayleigh-Ritz on the current X subspace to ensure (X, eval) consistency.
+    // This mirrors BPCG's exit behavior (subspace diagonalization before returning).
+    {
+        const int ncols = this->n_band;
+        ct::Tensor hxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor sxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor vxx(this->t_type, this->device_type, {ncols, ncols});
+        ct::Tensor exx(this->r_type, this->device_type, {ncols});
+
+#ifdef __MPI
+        this->pmmcn.set_dimension(BP_WORLD,
+                                  POOL_WORLD,
+                                  this->n_band_l,
+                                  this->n_basis,
+                                  this->n_band_l,
+                                  this->n_basis,
+                                  this->n_dim,
+                                  ncols);
+#else
+        this->pmmcn.set_dimension(this->n_band_l,
+                                  this->n_basis,
+                                  this->n_band_l,
+                                  this->n_basis,
+                                  this->n_dim,
+                                  ncols);
+#endif
+        this->pmmcn.multiply(1.0, this->X.data<T>(), this->HX.data<T>(), 0.0, hxx.data<T>());
+        this->pmmcn.multiply(1.0, this->X.data<T>(), this->X.data<T>(), 0.0, sxx.data<T>());
+
+        hsolver::hegvd_op<T, Device>()(this->ctx,
+                                       ncols,
+                                       ncols,
+                                       hxx.data<T>(),
+                                       sxx.data<T>(),
+                                       exx.data<Real>(),
+                                       vxx.data<T>());
+
+        // Rotate X, HX: X <- X * vxx, HX <- HX * vxx
+        ModuleBase::gemm_op<T, Device>()('N',
+                                         'N',
+                                         this->n_dim,
+                                         this->n_band_l,
+                                         ncols,
+                                         this->one,
+                                         this->X.data<T>(),
+                                         this->n_basis,
+                                         vxx.data<T>(),
+                                         ncols,
+                                         this->zero,
+                                         this->work.data<T>(),
+                                         this->n_basis);
+        syncmem_complex_op()(this->X.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+        ModuleBase::gemm_op<T, Device>()('N',
+                                         'N',
+                                         this->n_dim,
+                                         this->n_band_l,
+                                         ncols,
+                                         this->one,
+                                         this->HX.data<T>(),
+                                         this->n_basis,
+                                         vxx.data<T>(),
+                                         ncols,
+                                         this->zero,
+                                         this->work.data<T>(),
+                                         this->n_basis);
+        syncmem_complex_op()(this->HX.data<T>(), this->work.data<T>(), this->n_band_l * this->n_basis);
+
+        syncmem_var_op()(this->eval.data<Real>(), exx.data<Real>(), this->n_band);
+    }
+
+    // Copy eigenvalues out
+    syncmem_var_d2h_op()(eigenvalue_in, this->eval.data<Real>(), this->n_band);
+}
+
+// explicit instantiation
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_CPU>;
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class DiagoPPCG<std::complex<double>, base_device::DEVICE_GPU>;
+template class DiagoPPCG<std::complex<float>, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hsolver
diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h
new file mode 100644
index 00000000000..3566afd0d66
--- /dev/null
+++ b/source/source_hsolver/diago_ppcg.h
@@ -0,0 +1,145 @@
+#ifndef DIAGO_PPCG_H_
+#define DIAGO_PPCG_H_
+
+#include "source_base/kernels/math_kernel_op.h"
+#include "source_base/module_device/memory_op.h"
+#include "source_base/module_device/types.h"
+#include "source_base/para_gemm.h"
+#include "source_hsolver/kernels/hegvd_op.h"
+
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <functional>
+#include <source_base/macros.h>
+
+namespace hsolver {
+
+// Projected Preconditioned Conjugate Gradient (block) eigensolver.
+// This implementation follows an LOBPCG-style subspace projection:
+//   V = [X, W, P] (or [X, W] for the first iter)
+//   solve (V^H H V) c = (V^H V) c Λ
+//   update X <- V c(:,1:nband)
+//   update P <- [W, P] c_{W,P}(:,1:nband)
+// with W from preconditioned residual projected to the complement of X (and P).
+//
+// Notes:
+// - Designed to match the existing diag interface used by BPCG.
+// - Preconditioner is treated as a diagonal Real vector of length n_basis.
+
+template <typename T = std::complex<double>, typename Device = base_device::DEVICE_CPU>
+class DiagoPPCG
+{
+  private:
+    using Real = typename GetTypeReal<T>::type;
+
+  public:
+    explicit DiagoPPCG(const Real* precondition);
+    ~DiagoPPCG();
+
+    void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim);
+
+    // ---- tunable parameters ----
+    /// Maximum inner iterations per diag() call when P-block is safe.
+    /// Default 3; set to 1 for ultra-conservative mode or higher for
+    /// difficult spectra.
+    void set_max_inner_iter(const int n) { max_inner_iter_ = n; }
+
+    /// Safety margin for P-block usage: P is enabled only when
+    /// 3 * n_band <= n_dim - p_safe_margin_.
+    /// Default 2; increase for better numerical stability at the cost of
+    /// slower convergence on well-conditioned problems.
+    void set_p_safe_margin(const int m) { p_safe_margin_ = m; }
+
+    /// Number of diag() passes performed by the factory (hsolver_pw).
+    /// Default 5; matching BPCG's multi-pass strategy.
+    void set_npass(const int n) { npass_ = n; }
+    int npass() const { return npass_; }
+    // ---- end tunable parameters ----
+
+    using HPsiFunc = std::function<void(T*, T*, const int, const int)>;
+
+    void diag(const HPsiFunc& hpsi_func,
+              T* psi_in,
+              Real* eigenvalue_in,
+              const std::vector<double>& ethr_band);
+
+  private:
+    int n_band = 0;
+    int n_band_l = 0;
+    int n_basis = 0;
+    int n_dim = 0;
+
+    // tunable parameters (see set_xxx methods above)
+    int max_inner_iter_ = 3;
+    int p_safe_margin_ = 2;
+    int npass_ = 5;
+
+    ct::DataType r_type = ct::DataType::DT_INVALID;
+    ct::DataType t_type = ct::DataType::DT_INVALID;
+    ct::DeviceType device_type = ct::DeviceType::UnKnown;
+
+    // Host pointer mapped preconditioner + device copy
+    ct::Tensor h_prec = {};
+    ct::Tensor prec = {};
+
+    // Work vectors (column-major, lda = n_basis): each tensor stores a matrix (n_basis x ncols)
+    // as a contiguous array; we use Tensor with shape {ncols, n_basis} for contiguous column blocks.
+    ct::Tensor X = {};   // mapped to psi_in
+    ct::Tensor HX = {};
+    ct::Tensor R = {};
+    ct::Tensor W = {};
+    ct::Tensor HW = {};
+    ct::Tensor P = {};
+    ct::Tensor HP = {};
+
+    ct::Tensor V = {};   // basis [X,W,P] packed
+    ct::Tensor HV = {};  // H*V
+
+    ct::Tensor hcc = {}; // V^H H V
+    ct::Tensor scc = {}; // V^H V
+    ct::Tensor vcc = {}; // eigenvectors of projected problem
+    ct::Tensor eval = {}; // eigenvalues of projected problem
+
+    ct::Tensor work = {}; // generic workspace (ncols x n_basis)
+
+    // Parallel matmul helper (A^H B)
+    ModuleBase::PGemmCN<T, Device> pmmcn;
+
+    // Device memory helpers
+    Device* ctx = {};
+    const T one_ = static_cast<T>(1.0);
+    const T zero_ = static_cast<T>(0.0);
+    const T neg_one_ = static_cast<T>(-1.0);
+    const T* one = &one_;
+    const T* zero = &zero_;
+    const T* neg_one = &neg_one_;
+
+    using syncmem_complex_op = base_device::memory::synchronize_memory_op<T, Device, Device>;
+    using syncmem_var_op = base_device::memory::synchronize_memory_op<Real, Device, Device>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<Real, Device, base_device::DEVICE_CPU>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<Real, base_device::DEVICE_CPU, Device>;
+
+    void calc_prec();
+
+    void apply_h(const HPsiFunc& hpsi_func, const ct::Tensor& in_vecs, ct::Tensor& out_vecs, const int nvec);
+
+    void pack_basis(const int ncols, const bool has_p);
+
+    void compute_projected_mats(const int ncols);
+
+    void solve_projected(const int ncols);
+
+    void update_from_projected(const int ncols, const bool has_p, const bool update_p);
+
+    void compute_residual_and_precond(const std::vector<double>& ethr_band, bool& not_conv);
+
+    void orthonormalize_block(ct::Tensor& A, ct::Tensor* HA, const int ncols);
+
+    void project_out(const ct::Tensor& basis, const int ncols_basis, ct::Tensor& vecs, const int ncols_vecs);
+
+    bool check_convergence(const ct::Tensor& residual, const std::vector<double>& ethr_band);
+};
+
+} // namespace hsolver
+
+#endif // DIAGO_PPCG_H_
diff --git a/source/source_hsolver/hsolver_pw.cpp b/source/source_hsolver/hsolver_pw.cpp
index b88bc3b90dd..45d99aead9a 100644
--- a/source/source_hsolver/hsolver_pw.cpp
+++ b/source/source_hsolver/hsolver_pw.cpp
@@ -8,6 +8,7 @@
 #include "source_hamilt/hamilt.h"
 #include "source_hsolver/diag_comm_info.h"
 #include "source_hsolver/diago_bpcg.h"
+#include "source_hsolver/diago_ppcg.h"
 #include "source_hsolver/diago_cg.h"
 #include "source_hsolver/diago_dav_subspace.h"
 #include "source_hsolver/diago_david.h"
@@ -83,7 +84,7 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
     this->nproc_in_pool = nproc_in_pool_in;
 
     // report if the specified diagonalization method is not supported
-    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg"};
+    const std::initializer_list<std::string> _methods = {"cg", "dav", "dav_subspace", "bpcg", "ppcg"};
     if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods))
     {
         ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!");
@@ -323,6 +324,19 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
         bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
         bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
     }
+    else if (this->method == "ppcg")
+    {
+        const int nband_l = psi.get_nbands();
+        const int nbasis = psi.get_nbasis();
+        const int ndim = psi.get_current_ngk();
+        DiagoPPCG<T, Device> ppcg(pre_condition.data());
+        ppcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim);
+        // Multiple passes for robust convergence (same strategy as BPCG in unit tests)
+        for (int pass = 0; pass < ppcg.npass(); ++pass)
+        {
+            ppcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band);
+        }
+    }
     else if (this->method == "dav_subspace")
     {
         bool scf = this->calculation_type == "nscf" ? false : true;
diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
index 88f94e288c6..ec4b83b9575 100644
--- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp
+++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp
@@ -3,9 +3,26 @@
 #include "source_base/kernels/math_kernel_op.h"
 #include "source_base/parallel_reduce.h"
 #include <vector>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 namespace hsolver
 {
 
+namespace
+{
+constexpr int kBpcgOpenmpMinWork = 4096;
+
+inline bool use_bpcg_openmp(int n)
+{
+#ifdef _OPENMP
+    return n >= kBpcgOpenmpMinWork && omp_get_max_threads() > 1;
+#else
+    return false;
+#endif
+}
+} // namespace
+
 template <typename T>
 struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
 {
@@ -26,6 +43,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:epsilo_0,epsilo_1,epsilo_2) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -41,6 +61,9 @@ struct line_minimize_with_block_op<T, base_device::DEVICE_CPU>
             theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2)));
             cos_theta = std::cos(theta);
             sin_theta = std::sin(theta);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -71,12 +94,14 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
             Real err = 0.0;
             Real beta = 0.0;
             Real epsilo = 0.0;
-            Real grad_2 = {0.0};
-            T grad_1 = {0.0, 0.0};
+            const Real beta_old = beta_out[band_idx];
             auto A = reinterpret_cast<const Real*>(psi_out + band_idx * n_basis_max);
             Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1);
             Parallel_Reduce::reduce_pool(norm);
             norm = 1.0 / sqrt(norm);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:epsilo) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
@@ -85,21 +110,27 @@ struct calc_grad_with_block_op<T, base_device::DEVICE_CPU>
                 epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item]));
             }
             Parallel_Reduce::reduce_pool(epsilo);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) reduction(+:err,beta) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
-                grad_2 = std::norm(grad_1);
+                const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                const Real grad_2 = std::norm(grad_1);
                 err += grad_2;
                 beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec?
             }
             Parallel_Reduce::reduce_pool(err);
             Parallel_Reduce::reduce_pool(beta);
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static) if(use_bpcg_openmp(n_basis))
+#endif
             for (int basis_idx = 0; basis_idx < n_basis; basis_idx++)
             {
                 auto item = band_idx * n_basis_max + basis_idx;
-                grad_1 = hpsi_out[item] - epsilo * psi_out[item];
-                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item];
+                const T grad_1 = hpsi_out[item] - epsilo * psi_out[item];
+                grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_old * grad_old_out[item];
             }
             beta_out[band_idx] = beta;
             err_out[band_idx] = sqrt(err);
@@ -113,6 +144,9 @@ struct apply_eigenvalues_op<T, base_device::DEVICE_CPU>
     using Real = typename GetTypeReal<T>::type;
     void operator()(const int& nbase, const int& nbase_x, const int& notconv, T* result, const T* vectors, const Real* eigenvalues)
     {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) schedule(static) if(use_bpcg_openmp(nbase * notconv))
+#endif
         for (int m = 0; m < notconv; m++)
         {
             for (int idx = 0; idx < nbase; idx++)
@@ -133,19 +167,14 @@ struct precondition_op<T, base_device::DEVICE_CPU> {
                    const Real* precondition,
                    const Real* eigenvalues)
     {
-        std::vector<Real> pre(dim, 0.0);
         for (int m = 0; m < notconv; m++)
         {
             for (size_t i = 0; i < dim; i++)
             {
                 Real x = std::abs(precondition[i] - eigenvalues[m]);
-                pre[i] = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+                Real denom = 0.5 * (1.0 + x + sqrt(1 + (x - 1.0) * (x - 1.0)));
+                psi_iter[(nbase + m) * dim + i] /= denom;
             }
-            ModuleBase::vector_div_vector_op<T, base_device::DEVICE_CPU>()(
-                                                             dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             psi_iter + (nbase + m) * dim,
-                                                             pre.data());
         }
     }
 };
diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt
index 1b1529adb4a..7721f462659 100644
--- a/source/source_hsolver/test/CMakeLists.txt
+++ b/source/source_hsolver/test/CMakeLists.txt
@@ -2,6 +2,41 @@ remove_definitions(-D__CUDA)
 remove_definitions(-D__ROCM)
 remove_definitions(-D__EXX)
 
+# Make unit tests runnable directly after build (without `cmake --install`).
+# CI does an install step which also copies these, but local dev often doesn't.
+set(_HSOLVER_TEST_FILES
+  H-KPoints-Si2.dat
+  H-GammaOnly-Si2.dat
+  S-KPoints-Si2.dat
+  S-GammaOnly-Si2.dat
+  H-KPoints-Si64.dat
+  H-GammaOnly-Si64.dat
+  S-KPoints-Si64.dat
+  S-GammaOnly-Si64.dat
+  GammaOnly-Si2-Solution.dat
+  GammaOnly-Si64-Solution.dat
+  KPoints-Si2-Solution.dat
+  KPoints-Si64-Solution.dat
+  PEXSI-H-GammaOnly-Si2.dat
+  PEXSI-S-GammaOnly-Si2.dat
+  PEXSI-DM-GammaOnly-Si2.dat
+  generate_hsolver_test_report.sh
+  diago_cg_parallel_test.sh
+  diago_david_parallel_test.sh
+  diago_lcao_parallel_test.sh
+  diago_pexsi_parallel_test.sh
+  parallel_k2d_test.sh
+)
+
+foreach(_f IN LISTS _HSOLVER_TEST_FILES)
+  if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/${_f}")
+    configure_file(
+      "${CMAKE_CURRENT_LIST_DIR}/${_f}"
+      "${CMAKE_CURRENT_BINARY_DIR}/${_f}"
+      COPYONLY)
+  endif()
+endforeach()
+
 if (ENABLE_MPI)
   AddTest(
     TARGET MODULE_HSOLVER_parak2d_test
@@ -16,6 +51,17 @@ if (ENABLE_MPI)
             ../../source_hamilt/operator.cpp
             ../../source_pw/module_pwdft/op_pw.cpp
   )
+  AddTest(
+    TARGET MODULE_HSOLVER_ppcg
+    LIBS parameter  ${math_libs} base psi device container
+    SOURCES diago_ppcg_test.cpp ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_david.cpp
+            ../para_linear_transform.cpp ../diago_iter_assist.cpp
+            ../kernels/hegvd_op.cpp
+            ../../source_base/module_container/ATen/kernels/lapack.cpp
+            ../../source_basis/module_pw/test/test_tool.cpp
+            ../../source_hamilt/operator.cpp
+            ../../source_pw/module_pwdft/op_pw.cpp
+  )
   AddTest(
     TARGET MODULE_HSOLVER_cg
     LIBS parameter  ${math_libs} base psi device container
@@ -76,14 +122,16 @@ if (ENABLE_MPI)
   AddTest(
     TARGET MODULE_HSOLVER_pw
     LIBS parameter  ${math_libs} psi device base container
-    SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_ppcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    ../kernels/hegvd_op.cpp ../../source_base/module_container/ATen/kernels/lapack.cpp
     ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp
   )
 
   AddTest(
     TARGET MODULE_HSOLVER_sdft
     LIBS parameter  ${math_libs} psi device base container
-    SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_ppcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp
+    ../kernels/hegvd_op.cpp ../../source_base/module_container/ATen/kernels/lapack.cpp
                 ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp
     )
 
@@ -135,6 +183,7 @@ install(FILES GammaOnly-Si64-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR
 install(FILES KPoints-Si2-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 install(FILES KPoints-Si64-Solution.dat DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 
+install(FILES generate_hsolver_test_report.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 install(FILES diago_cg_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 install(FILES diago_david_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 install(FILES diago_lcao_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
@@ -172,6 +221,15 @@ add_test(NAME MODULE_HSOLVER_para_linear_trans
 )
 
 find_program(BASH bash)
+if (ENABLE_MPI AND BASH)
+  add_custom_target(MODULE_HSOLVER_test_report
+    COMMAND ${BASH} generate_hsolver_test_report.sh
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    DEPENDS MODULE_HSOLVER_bpcg MODULE_HSOLVER_ppcg
+    COMMENT "Generate hsolver unit-test reports under the build directory"
+  )
+endif()
+
 if (ENABLE_MPI)
   add_test(NAME MODULE_HSOLVER_cg_parallel
         COMMAND ${BASH} diago_cg_parallel_test.sh
@@ -197,4 +255,4 @@ if (ENABLE_MPI)
       )
     endif()
   endif()
-endif()
\ No newline at end of file
+endif()
diff --git a/source/source_hsolver/test/diago_bpcg_test.cpp b/source/source_hsolver/test/diago_bpcg_test.cpp
index 962ce72315f..362e4a50bdc 100644
--- a/source/source_hsolver/test/diago_bpcg_test.cpp
+++ b/source/source_hsolver/test/diago_bpcg_test.cpp
@@ -6,13 +6,19 @@
 #include "source_pw/module_pwdft/hamilt_pw.h"
 #include "../diago_iter_assist.h"
 #include "../diago_bpcg.h"
+#include "../kernels/bpcg_kernel_op.h"
 #include "diago_mock.h"
 #include "mpi.h"
+#include "source_base/global_variable.h"
+#include "source_base/parallel_comm.h"
 #include "source_basis/module_pw/test/test_tool.h"
 
 #include <gtest/gtest.h>
 #include <complex>
 #include <random>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 /************************************************
  *  unit test of functions in Diago_BPCG
@@ -74,13 +80,17 @@ class DiagoBPCGPrepare
     int nprocs=1, mypnum=0;
     // threshold is the comparison standard between bpcg and lapack
 
-    void CompareEigen(double *precondition)
+    void CompareEigen(double *precondition, bool check_vectors = false)
     {
         // calculate eigenvalues by LAPACK;
         double *e_lapack = new double[npw];
         auto ev = DIAGOTEST::hmatrix;
-        if(mypnum == 0) {  lapackEigen(npw, ev, e_lapack, false);
-}
+        if (mypnum == 0) {
+            lapackEigen(npw, ev, e_lapack, false);
+        }
+    #ifdef __MPI
+        MPI_Bcast(e_lapack, npw, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+    #endif
         // initial guess of psi by perturbing lapack psi
         ModuleBase::ComplexMatrix psiguess(nband, npw);
         std::default_random_engine p(1);
@@ -91,18 +101,14 @@ class DiagoBPCGPrepare
             {
 		        double rand = static_cast<double>(u(p))/10.;
                 // psiguess(i,j) = ev(j,i)*(1+rand);
-                psiguess(i, j) = ev[j * DIAGOTEST::h_nc + i] * rand;
+                psiguess(i, j) = ev[j * npw + i] * rand;
             }
         }
         // run bpcg
 	//======================================================================
         double *en = new double[npw];
         int ik = 1;
-	    hamilt::Hamilt<std::complex<double>>* ha;
-	    ha =new hamilt::HamiltPW<std::complex<double>>(nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
-	    int* ngk = new int [1];
-	    //psi::Psi<std::complex<double>> psi(ngk,ik,nband,npw);
-	    psi::Psi<std::complex<double>> psi;
+        psi::Psi<std::complex<double>> psi;
 	    psi.resize(ik,nband,npw);
 	    //psi.fix_k(0);
         for (int i = 0; i < nband; i++)
@@ -156,10 +162,11 @@ class DiagoBPCGPrepare
         const int ndim = psi_local.get_current_ngk();
         bpcg.init_iter(nband, nband, npw, ndim);
         std::vector<double> ethr_band(nband, 1e-5);
-        bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
-        bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
-        bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
-        bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
+        // One diag() call has a relatively small internal iteration cap; do a few passes
+        // to reach LAPACK-close eigenvalues for random dense problems.
+        for (int pass = 0; pass < 4; ++pass) {
+            bpcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
+        }
         end = MPI_Wtime();
         //if(mypnum == 0) printf("diago time:%7.3f\n",end-start);
         delete [] DIAGOTEST::npw_local;
@@ -170,9 +177,42 @@ class DiagoBPCGPrepare
             EXPECT_NEAR(en[i], e_lapack[i], threshold);
         }
 
+        if (check_vectors && nprocs == 1)
+        {
+            std::vector<std::complex<double>> hpsi_check(nband * npw);
+            hpsi_func(psi_local.get_pointer(), hpsi_check.data(), npw, nband);
+
+            for (int ib = 0; ib < nband; ++ib)
+            {
+                double norm = 0.0;
+                double residual_norm = 0.0;
+                for (int ig = 0; ig < npw; ++ig)
+                {
+                    const std::complex<double> psi_value = psi_local(ib, ig);
+                    const std::complex<double> residual = hpsi_check[ib * npw + ig] - en[ib] * psi_value;
+                    norm += std::norm(psi_value);
+                    residual_norm += std::norm(residual);
+                }
+                EXPECT_NEAR(norm, 1.0, 1e-10);
+                EXPECT_NEAR(std::sqrt(residual_norm), 0.0, 1e-8);
+            }
+
+            for (int ib = 0; ib < nband; ++ib)
+            {
+                for (int jb = ib + 1; jb < nband; ++jb)
+                {
+                    std::complex<double> overlap = 0.0;
+                    for (int ig = 0; ig < npw; ++ig)
+                    {
+                        overlap += std::conj(psi_local(ib, ig)) * psi_local(jb, ig);
+                    }
+                    EXPECT_NEAR(std::abs(overlap), 0.0, 1e-10);
+                }
+            }
+        }
+
         delete[] en;
         delete[] e_lapack;
-        delete ha;
     }
 };
 
@@ -187,6 +227,7 @@ TEST_P(DiagoBPCGTest, RandomHamilt)
     //		  << dcp.sparsity << ", eps=" << dcp.eps << std::endl;
     hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
     hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
     //std::cout<<"maxiter "<<hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX<<std::endl;
     //std::cout<<"eps "<<hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR<<std::endl;
     HPsi<std::complex<double>> hpsi(dcp.nband, dcp.npw, dcp.sparsity);
@@ -201,7 +242,7 @@ INSTANTIATE_TEST_SUITE_P(VerifyCG,
                          DiagoBPCGTest,
                          ::testing::Values(
                              // nband, npw, sparsity, reorder, eps, maxiter, threshold
-                             DiagoBPCGPrepare(10, 500, 0, true, 1e-5, 300, 5e-2)
+                             DiagoBPCGPrepare(6, 120, 0, true, 1e-5, 200, 5e-2)
                             //  DiagoBPCGPrepare(20, 500, 6, true, 1e-5, 300, 5e-2)
                             //  DiagoBPCGPrepare(20, 1000, 8, true, 1e-5, 300, 5e-2),
                             //  DiagoBPCGPrepare(40, 1000, 8, true, 1e-6, 300, 5e-2)
@@ -225,6 +266,181 @@ TEST(DiagoBPCGTest, Hamilt)
     EXPECT_EQ(conj(hm[DIAGOTEST::h_nc]).imag(), hm[1].imag());
 }
 
+// bpcg for a 2x2 matrix (analytic eigenvalues: (7±sqrt(5))/2)
+TEST(DiagoBPCGTest, TwoByTwo)
+{
+    const int dim = 2;
+    const int nband = 2;
+    std::vector<std::complex<double>> hm(dim * dim);
+    hm[0] = {4.0, 0.0};
+    hm[1] = {1.0, 0.0};
+    hm[2] = {1.0, 0.0};
+    hm[3] = {3.0, 0.0};
+
+    DiagoBPCGPrepare dcp(nband, dim, 0, true, 1e-8, 80, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    // simple positive precondition
+    double precond[dim] = {1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(BpcgKernelOpTest, ApplyEigenvaluesUsesLeadingDimension)
+{
+    using T = std::complex<double>;
+    const int nbase = 4101;
+    const int nbase_x = nbase + 3;
+    const int notconv = 2;
+    const T untouched = {-9.0, 4.0};
+
+    std::vector<T> vectors(nbase_x * notconv);
+    std::vector<T> result(nbase_x * notconv, untouched);
+    const double eigenvalues[notconv] = {2.0, -0.5};
+
+    for (int m = 0; m < notconv; ++m)
+    {
+        for (int i = 0; i < nbase_x; ++i)
+        {
+            vectors[m * nbase_x + i] = T(0.25 * (i + 1), -0.1 * (m + 1));
+        }
+    }
+
+    hsolver::apply_eigenvalues_op<T, base_device::DEVICE_CPU>()(
+        nbase, nbase_x, notconv, result.data(), vectors.data(), eigenvalues);
+
+    for (int m = 0; m < notconv; ++m)
+    {
+        for (int i = 0; i < nbase; ++i)
+        {
+            EXPECT_EQ(result[m * nbase_x + i], eigenvalues[m] * vectors[m * nbase_x + i]);
+        }
+        for (int i = nbase; i < nbase_x; ++i)
+        {
+            EXPECT_EQ(result[m * nbase_x + i], untouched);
+        }
+    }
+}
+
+TEST(BpcgKernelOpTest, PreconditionUsesBandOffsetAndFormula)
+{
+    using T = std::complex<double>;
+    const int dim = 4;
+    const int nbase = 2;
+    const int notconv = 2;
+    std::vector<T> psi_iter((nbase + notconv) * dim);
+    const std::vector<T> original = {
+        {1.0, 0.0}, {2.0, 0.0}, {3.0, 0.0}, {4.0, 0.0},
+        {5.0, 0.0}, {6.0, 0.0}, {7.0, 0.0}, {8.0, 0.0},
+        {1.0, 2.0}, {2.0, 3.0}, {3.0, 4.0}, {4.0, 5.0},
+        {2.0, -1.0}, {3.0, -2.0}, {4.0, -3.0}, {5.0, -4.0}};
+    psi_iter = original;
+
+    const double precondition[dim] = {1.0, 2.5, 4.0, 7.0};
+    const double eigenvalues[notconv] = {0.5, 3.0};
+
+    hsolver::precondition_op<T, base_device::DEVICE_CPU>()(
+        dim, psi_iter.data(), nbase, notconv, precondition, eigenvalues);
+
+    for (int i = 0; i < nbase * dim; ++i)
+    {
+        EXPECT_EQ(psi_iter[i], original[i]);
+    }
+
+    for (int m = 0; m < notconv; ++m)
+    {
+        for (int i = 0; i < dim; ++i)
+        {
+            const double x = std::abs(precondition[i] - eigenvalues[m]);
+            const double denom = 0.5 * (1.0 + x + std::sqrt(1.0 + (x - 1.0) * (x - 1.0)));
+            const int idx = (nbase + m) * dim + i;
+            EXPECT_NEAR(psi_iter[idx].real(), (original[idx] / denom).real(), 1e-14);
+            EXPECT_NEAR(psi_iter[idx].imag(), (original[idx] / denom).imag(), 1e-14);
+        }
+    }
+}
+
+TEST(BpcgKernelOpTest, RefreshProjectedMatricesOnlyTouchesDiagonal)
+{
+    using T = std::complex<double>;
+    const int n = 3;
+    const int ldh = 5;
+    const T one = {1.0, 0.0};
+    const T h_sentinel = {-1.0, 0.5};
+    const T s_sentinel = {-2.0, 0.5};
+    const T v_sentinel = {-3.0, 0.5};
+    const double eigenvalues[n] = {0.25, 1.5, 3.75};
+
+    std::vector<T> hcc(ldh * ldh, h_sentinel);
+    std::vector<T> scc(ldh * ldh, s_sentinel);
+    std::vector<T> vcc(ldh * ldh, v_sentinel);
+
+    hsolver::refresh_hcc_scc_vcc_op<T, base_device::DEVICE_CPU>()(
+        n, hcc.data(), scc.data(), vcc.data(), ldh, eigenvalues, one);
+
+    for (int col = 0; col < ldh; ++col)
+    {
+        for (int row = 0; row < ldh; ++row)
+        {
+            const int idx = col * ldh + row;
+            if (row == col && row < n)
+            {
+                EXPECT_EQ(hcc[idx], T(eigenvalues[row], 0.0));
+                EXPECT_EQ(scc[idx], one);
+                EXPECT_EQ(vcc[idx], one);
+            }
+            else
+            {
+                EXPECT_EQ(hcc[idx], h_sentinel);
+                EXPECT_EQ(scc[idx], s_sentinel);
+                EXPECT_EQ(vcc[idx], v_sentinel);
+            }
+        }
+    }
+}
+
+TEST(BpcgKernelOpTest, ApplyEigenvaluesMatchesSingleThreadResult)
+{
+#ifndef _OPENMP
+    GTEST_SKIP() << "OpenMP is not enabled in this build";
+#else
+    using T = std::complex<double>;
+    const int nbase = 5000;
+    const int nbase_x = nbase + 7;
+    const int notconv = 3;
+    std::vector<T> vectors(nbase_x * notconv);
+    std::vector<T> result_single(nbase_x * notconv);
+    std::vector<T> result_multi(nbase_x * notconv);
+    const double eigenvalues[notconv] = {1.25, -2.0, 0.125};
+
+    for (int m = 0; m < notconv; ++m)
+    {
+        for (int i = 0; i < nbase_x; ++i)
+        {
+            vectors[m * nbase_x + i] = T(0.01 * (i % 97) + m, -0.02 * (i % 31));
+        }
+    }
+
+    const int old_threads = omp_get_max_threads();
+    omp_set_num_threads(1);
+    hsolver::apply_eigenvalues_op<T, base_device::DEVICE_CPU>()(
+        nbase, nbase_x, notconv, result_single.data(), vectors.data(), eigenvalues);
+
+    omp_set_num_threads(4);
+    hsolver::apply_eigenvalues_op<T, base_device::DEVICE_CPU>()(
+        nbase, nbase_x, notconv, result_multi.data(), vectors.data(), eigenvalues);
+    omp_set_num_threads(old_threads);
+
+    for (size_t i = 0; i < result_single.size(); ++i)
+    {
+        EXPECT_EQ(result_multi[i], result_single[i]);
+    }
+#endif
+}
+
 // check that lapack work well
 // for an eigenvalue problem
 /*TEST(DiagoBPCGTest, ZHEEV)
@@ -271,7 +487,8 @@ TEST(DiagoBPCGTest, readH)
     hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
     HPsi<std::complex<double>> hpsi;
     hpsi.create(nband, dim);
-    DIAGOTEST::hmatrix = hpsi.hamilt();
+    // use the matrix read from file
+    DIAGOTEST::hmatrix = hm;
     DIAGOTEST::npw = dim;
     dcp.CompareEigen(hpsi.precond());
 }
@@ -284,7 +501,8 @@ int main(int argc, char **argv)
 	int nproc_in_pool, kpar=1, mypool, rank_in_pool;
     setupmpi(argc,argv,nproc, myrank);
     divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool);
-    MPI_Comm_split(MPI_COMM_WORLD,myrank,0,&BP_WORLD);
+    // In unit tests we don't do band-parallel splitting; keep BP_WORLD as the full pool communicator.
+    MPI_Comm_dup(POOL_WORLD, &BP_WORLD);
     GlobalV::NPROC_IN_POOL = nproc;
 #else
 	MPI_Init(&argc, &argv);	
diff --git a/source/source_hsolver/test/diago_ppcg_test.cpp b/source/source_hsolver/test/diago_ppcg_test.cpp
new file mode 100644
index 00000000000..5829a0fd578
--- /dev/null
+++ b/source/source_hsolver/test/diago_ppcg_test.cpp
@@ -0,0 +1,770 @@
+#include "source_base/inverse_matrix.h"
+#include "source_base/module_external/lapack_connector.h"
+#include "source_pw/module_pwdft/structure_factor.h"
+#include "source_psi/psi.h"
+#include "source_hamilt/hamilt.h"
+#include "source_pw/module_pwdft/hamilt_pw.h"
+#include "../diago_iter_assist.h"
+#include "../diago_ppcg.h"
+#include "../diago_bpcg.h"
+#include "../diago_david.h"
+#include "../diag_comm_info.h"
+#include "diago_mock.h"
+#include "mpi.h"
+#include "source_base/global_variable.h"
+#include "source_base/parallel_comm.h"
+#include "source_basis/module_pw/test/test_tool.h"
+
+#include <gtest/gtest.h>
+#include <complex>
+#include <cstdlib>
+#include <fstream>
+#include <random>
+
+// LAPACK reference eigenvalues for comparison
+static void lapackEigen(int& npw, std::vector<std::complex<double>>& hm, double* e)
+{
+    int lwork = 2 * npw;
+    std::complex<double>* work2 = new std::complex<double>[lwork];
+    double* rwork = new double[3 * npw - 2];
+    int info = 0;
+    char jobz = 'V', uplo = 'U';
+    zheev_(&jobz, &uplo, &npw, hm.data(), &npw, e, work2, &lwork, rwork, &info);
+    delete[] rwork;
+    delete[] work2;
+}
+
+class DiagoPPCGPrepare
+{
+  public:
+    DiagoPPCGPrepare(int nband, int npw, int sparsity, double eps, int maxiter, double threshold)
+        : nband(nband), npw(npw), sparsity(sparsity), eps(eps), maxiter(maxiter), threshold(threshold)
+    {
+#ifdef __MPI
+        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+        MPI_Comm_rank(MPI_COMM_WORLD, &mypnum);
+#endif
+    }
+
+    int nband = 0;
+    int npw = 0;
+    int sparsity = 0;
+    double eps = 1e-6;
+    int maxiter = 200;
+    double threshold = 5e-2;
+
+    int nprocs = 1;
+    int mypnum = 0;
+
+    void CompareEigen(double* precondition,
+                      bool check_vectors = false,
+                      double residual_threshold = 1e-8,
+                      double orthogonality_threshold = 1e-10)
+    {
+        // Reference by LAPACK
+        double* e_lapack = new double[npw];
+        auto ev = DIAGOTEST::hmatrix;
+        if (mypnum == 0)
+        {
+            lapackEigen(npw, ev, e_lapack);
+        }
+#ifdef __MPI
+        MPI_Bcast(e_lapack, npw, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+#endif
+
+        // Initial guess: random combination of Lapack eigenvectors
+        ModuleBase::ComplexMatrix psiguess(nband, npw);
+        std::default_random_engine p(1);
+        std::uniform_int_distribution<unsigned> u(1, 10);
+        for (int i = 0; i < nband; i++)
+        {
+            for (int j = 0; j < npw; j++)
+            {
+                double rand = static_cast<double>(u(p)) / 10.;
+                psiguess(i, j) = ev[j * npw + i] * rand;
+            }
+        }
+
+        // Prepare psi
+        double* en = new double[npw];
+        int ik = 1;
+        psi::Psi<std::complex<double>> psi;
+        psi.resize(ik, nband, npw);
+        for (int i = 0; i < nband; i++)
+        {
+            for (int j = 0; j < npw; j++)
+            {
+                psi(i, j) = psiguess(i, j);
+            }
+        }
+
+        psi::Psi<std::complex<double>> psi_local;
+        double* precondition_local;
+        DIAGOTEST::npw_local = new int[nprocs];
+#ifdef __MPI
+        DIAGOTEST::cal_division(DIAGOTEST::npw);
+        DIAGOTEST::divide_hpsi(psi, psi_local, DIAGOTEST::hmatrix, DIAGOTEST::hmatrix_local);
+        precondition_local = new double[DIAGOTEST::npw_local[mypnum]];
+        DIAGOTEST::divide_psi<double>(precondition, precondition_local);
+#else
+        DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix;
+        DIAGOTEST::npw_local[0] = DIAGOTEST::npw;
+        psi_local = psi;
+        precondition_local = new double[DIAGOTEST::npw];
+        for (int i = 0; i < DIAGOTEST::npw; i++)
+            precondition_local[i] = precondition[i];
+#endif
+
+        hsolver::DiagoPPCG<std::complex<double>> ppcg(precondition_local);
+        psi_local.fix_k(0);
+
+        using T = std::complex<double>;
+        const int dim = DIAGOTEST::npw;
+        const std::vector<T>& h_mat = DIAGOTEST::hmatrix_local;
+        auto hpsi_func = [h_mat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) {
+            auto one = std::make_unique<T>(1.0);
+            auto zero = std::make_unique<T>(0.0);
+            const T* one_ = one.get();
+            const T* zero_ = zero.get();
+
+            base_device::DEVICE_CPU* ctx = {};
+            ModuleBase::gemm_op<T, base_device::DEVICE_CPU>()('N',
+                                                              'N',
+                                                              dim,
+                                                              nvec,
+                                                              dim,
+                                                              one_,
+                                                              h_mat.data(),
+                                                              dim,
+                                                              psi_in,
+                                                              ld_psi,
+                                                              zero_,
+                                                              hpsi_out,
+                                                              ld_psi);
+        };
+
+        const int ndim = psi_local.get_current_ngk();
+        ppcg.init_iter(nband, nband, npw, ndim);
+        std::vector<double> ethr_band(nband, 1e-6);
+
+        // As in BPCG, several diag() passes are needed for harder problems.
+        // Each pass starts from the refined X of the previous one and rebuilds
+        // the search directions from scratch.
+        for (int pass = 0; pass < 5; ++pass)
+        {
+            ppcg.diag(hpsi_func, psi_local.get_pointer(), en, ethr_band);
+        }
+
+        for (int i = 0; i < nband; i++)
+        {
+            EXPECT_NEAR(en[i], e_lapack[i], threshold);
+        }
+
+        if (check_vectors && nprocs == 1)
+        {
+            std::vector<std::complex<double>> hpsi_check(nband * npw);
+            hpsi_func(psi_local.get_pointer(), hpsi_check.data(), npw, nband);
+
+            for (int ib = 0; ib < nband; ++ib)
+            {
+                double norm = 0.0;
+                double residual_norm = 0.0;
+                for (int ig = 0; ig < npw; ++ig)
+                {
+                    const std::complex<double> psi_value = psi_local(ib, ig);
+                    const std::complex<double> residual = hpsi_check[ib * npw + ig] - en[ib] * psi_value;
+                    norm += std::norm(psi_value);
+                    residual_norm += std::norm(residual);
+                }
+                EXPECT_NEAR(norm, 1.0, orthogonality_threshold);
+                EXPECT_LT(std::sqrt(residual_norm), residual_threshold);
+            }
+
+            for (int ib = 0; ib < nband; ++ib)
+            {
+                for (int jb = ib + 1; jb < nband; ++jb)
+                {
+                    std::complex<double> overlap = 0.0;
+                    for (int ig = 0; ig < npw; ++ig)
+                    {
+                        overlap += std::conj(psi_local(ib, ig)) * psi_local(jb, ig);
+                    }
+                    EXPECT_LT(std::abs(overlap), orthogonality_threshold);
+                }
+            }
+        }
+
+        delete[] DIAGOTEST::npw_local;
+        delete[] precondition_local;
+        delete[] en;
+        delete[] e_lapack;
+    }
+};
+
+class DiagoPPCGTest : public ::testing::TestWithParam<DiagoPPCGPrepare>
+{
+};
+
+TEST_P(DiagoPPCGTest, RandomHamilt)
+{
+    DiagoPPCGPrepare dcp = GetParam();
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    HPsi<std::complex<double>> hpsi(dcp.nband, dcp.npw, dcp.sparsity);
+    DIAGOTEST::hmatrix = hpsi.hamilt();
+    DIAGOTEST::npw = dcp.npw;
+
+    dcp.CompareEigen(hpsi.precond());
+}
+
+INSTANTIATE_TEST_SUITE_P(VerifyPPCG,
+                         DiagoPPCGTest,
+                         ::testing::Values(
+                             // nband, npw, sparsity, eps, maxiter, threshold
+                             DiagoPPCGPrepare(6, 120, 0, 1e-6, 200, 8e-2)));
+
+TEST(DiagoPPCGTest, TwoByTwo)
+{
+    const int dim = 2;
+    const int nband = 2;
+    std::vector<std::complex<double>> hm(dim * dim);
+    hm[0] = {4.0, 0.0};
+    hm[1] = {1.0, 0.0};
+    hm[2] = {1.0, 0.0};
+    hm[3] = {3.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 80, 1e-10);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, ComplexThreeByThree)
+{
+    const int dim = 3;
+    const int nband = 3;
+    std::vector<std::complex<double>> hm(dim * dim);
+    hm[0] = {3.0, 0.0};
+    hm[1] = {1.0, -1.0};
+    hm[2] = {0.5, 0.2};
+    hm[3] = {1.0, 1.0};
+    hm[4] = {5.0, 0.0};
+    hm[5] = {-0.3, -0.4};
+    hm[6] = {0.5, -0.2};
+    hm[7] = {-0.3, 0.4};
+    hm[8] = {7.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 80, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, SubspaceFourByFour)
+{
+    const int dim = 4;
+    const int nband = 2;
+    std::vector<std::complex<double>> hm(dim * dim, {0.0, 0.0});
+    hm[0] = {1.0, 0.0};
+    hm[5] = {2.0, 0.0};
+    hm[10] = {4.0, 0.0};
+    hm[15] = {8.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0, 1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, SubspaceFourByFourThreeBands)
+{
+    const int dim = 4;
+    const int nband = 3;
+    std::vector<std::complex<double>> hm(dim * dim, {0.0, 0.0});
+    hm[0] = {1.0, 0.0};
+    hm[5] = {2.0, 0.0};
+    hm[10] = {4.0, 0.0};
+    hm[15] = {8.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0, 1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, CoupledSubspaceFourByFour)
+{
+    const int dim = 4;
+    const int nband = 2;
+    std::vector<std::complex<double>> hm(dim * dim);
+    hm[0] = {2.0, 0.0};
+    hm[1] = {0.4, -0.1};
+    hm[2] = {0.0, 0.2};
+    hm[3] = {0.1, 0.0};
+    hm[4] = {0.4, 0.1};
+    hm[5] = {3.0, 0.0};
+    hm[6] = {-0.3, 0.2};
+    hm[7] = {0.0, -0.1};
+    hm[8] = {0.0, -0.2};
+    hm[9] = {-0.3, -0.2};
+    hm[10] = {5.0, 0.0};
+    hm[11] = {0.6, 0.3};
+    hm[12] = {0.1, 0.0};
+    hm[13] = {0.0, 0.1};
+    hm[14] = {0.6, -0.3};
+    hm[15] = {8.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0, 1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, CoupledSubspaceFourByFourThreeBands)
+{
+    const int dim = 4;
+    const int nband = 3;
+    std::vector<std::complex<double>> hm(dim * dim);
+    hm[0] = {2.0, 0.0};
+    hm[1] = {0.4, -0.1};
+    hm[2] = {0.0, 0.2};
+    hm[3] = {0.1, 0.0};
+    hm[4] = {0.4, 0.1};
+    hm[5] = {3.0, 0.0};
+    hm[6] = {-0.3, 0.2};
+    hm[7] = {0.0, -0.1};
+    hm[8] = {0.0, -0.2};
+    hm[9] = {-0.3, -0.2};
+    hm[10] = {5.0, 0.0};
+    hm[11] = {0.6, 0.3};
+    hm[12] = {0.1, 0.0};
+    hm[13] = {0.0, 0.1};
+    hm[14] = {0.6, -0.3};
+    hm[15] = {8.0, 0.0};
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-10, 100, 1e-8);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    double precond[dim] = {1.0, 1.0, 1.0, 1.0};
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(precond, true);
+}
+
+TEST(DiagoPPCGTest, readH)
+{
+    std::vector<std::complex<double>> hm;
+    std::ifstream ifs;
+    std::string filename = "H-KPoints-Si2.dat";
+    ifs.open(filename);
+    ASSERT_TRUE(ifs.is_open()) << "Error opening file " << filename;
+    DIAGOTEST::readh(ifs, hm);
+    ifs.close();
+
+    int dim = DIAGOTEST::npw;
+    int nband = 10;
+
+    DiagoPPCGPrepare dcp(nband, dim, 0, 1e-6, 500, 2e-1);
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = dcp.maxiter;
+    hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = dcp.eps;
+    hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+    HPsi<std::complex<double>> hpsi;
+    hpsi.create(nband, dim);
+    DIAGOTEST::hmatrix = hm;
+    DIAGOTEST::npw = dim;
+    dcp.CompareEigen(hpsi.precond());
+}
+
+// ------------------------------------------------------------
+// Consistency tests: PPCG  vs  BPCG  on the same Hamiltonian
+// ------------------------------------------------------------
+TEST(DiagoPPCGTest, ConsistentWithBPCG)
+{
+    int dim = 40;
+    int nband = 8;
+
+    HPsi<std::complex<double>> hpsi(nband, dim, 5); // moderate sparsity
+    DIAGOTEST::hmatrix = hpsi.hamilt();
+    DIAGOTEST::npw = dim;
+
+    // LAPACK reference
+    double* e_lapack = new double[dim];
+    auto ev = DIAGOTEST::hmatrix;
+    lapackEigen(dim, ev, e_lapack);
+
+    // --- shared initial guess ---
+    ModuleBase::ComplexMatrix psiguess(nband, dim);
+    std::default_random_engine p(7);
+    std::uniform_int_distribution<unsigned> u(1, 10);
+    for (int i = 0; i < nband; i++)
+        for (int j = 0; j < dim; j++)
+            psiguess(i, j) = ev[j * dim + i] * static_cast<double>(u(p)) / 10.;
+
+    // --- PPCG ---
+    {
+        psi::Psi<std::complex<double>> psi_ppcg;
+        psi_ppcg.resize(1, nband, dim);
+        for (int i = 0; i < nband; i++)
+            for (int j = 0; j < dim; j++)
+                psi_ppcg(i, j) = psiguess(i, j);
+
+        double en_ppcg[40] = {};
+        hsolver::DiagoPPCG<std::complex<double>> ppcg(hpsi.precond());
+        ppcg.init_iter(nband, nband, dim, dim);
+        std::vector<double> ethr(nband, 1e-6);
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = 200;
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = 1e-6;
+        hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+        auto hpsi_func = [&hpsi, dim](std::complex<double>* in, std::complex<double>* out, int ld, int nv) {
+            auto one = std::make_unique<std::complex<double>>(1.0);
+            auto zero = std::make_unique<std::complex<double>>(0.0);
+            ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
+                'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld);
+        };
+
+        for (int pass = 0; pass < 5; ++pass)
+            ppcg.diag(hpsi_func, psi_ppcg.get_pointer(), en_ppcg, ethr);
+
+        for (int i = 0; i < nband; i++)
+            EXPECT_NEAR(en_ppcg[i], e_lapack[i], 1e-1);
+    }
+
+    // --- BPCG ---
+    {
+        psi::Psi<std::complex<double>> psi_bpcg;
+        psi_bpcg.resize(1, nband, dim);
+        for (int i = 0; i < nband; i++)
+            for (int j = 0; j < dim; j++)
+                psi_bpcg(i, j) = psiguess(i, j);
+
+        double en_bpcg[40] = {};
+        hsolver::DiagoBPCG<std::complex<double>> bpcg(hpsi.precond());
+        bpcg.init_iter(nband, nband, dim, dim);
+        std::vector<double> ethr(nband, 1e-6);
+
+        auto hpsi_func = [&hpsi, dim](std::complex<double>* in, std::complex<double>* out, int ld, int nv) {
+            auto one = std::make_unique<std::complex<double>>(1.0);
+            auto zero = std::make_unique<std::complex<double>>(0.0);
+            ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
+                'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld);
+        };
+
+        for (int pass = 0; pass < 4; ++pass)
+            bpcg.diag(hpsi_func, psi_bpcg.get_pointer(), en_bpcg, ethr);
+
+        for (int i = 0; i < nband; i++)
+            EXPECT_NEAR(en_bpcg[i], e_lapack[i], 1e-1);
+    }
+
+    delete[] e_lapack;
+}
+
+// ------------------------------------------------------------
+// Parameter configurability test
+// ------------------------------------------------------------
+TEST(DiagoPPCGTest, TunableParameters)
+{
+    int dim = 30;
+    int nband = 5;
+    HPsi<std::complex<double>> hpsi(nband, dim);
+    DIAGOTEST::hmatrix = hpsi.hamilt();
+    DIAGOTEST::npw = dim;
+
+    // LAPACK reference
+    double* e_lapack = new double[dim];
+    auto ev = DIAGOTEST::hmatrix;
+    lapackEigen(dim, ev, e_lapack);
+
+    ModuleBase::ComplexMatrix psiguess(nband, dim);
+    std::default_random_engine p(3);
+    std::uniform_int_distribution<unsigned> u(1, 10);
+    for (int i = 0; i < nband; i++)
+        for (int j = 0; j < dim; j++)
+            psiguess(i, j) = ev[j * dim + i] * static_cast<double>(u(p)) / 10.;
+
+    // --- test 1: aggressive p_safe margin (margin=5) ---
+    {
+        psi::Psi<std::complex<double>> psi_a;
+        psi_a.resize(1, nband, dim);
+        for (int i = 0; i < nband; i++)
+            for (int j = 0; j < dim; j++)
+                psi_a(i, j) = psiguess(i, j);
+
+        double en_a[30] = {};
+        hsolver::DiagoPPCG<std::complex<double>> ppcg(hpsi.precond());
+        ppcg.init_iter(nband, nband, dim, dim);
+        ppcg.set_p_safe_margin(5);   // more conservative → P block disabled for this problem
+        ppcg.set_max_inner_iter(1);  // single iteration per pass
+        ppcg.set_npass(8);           // compensate with more passes
+
+        std::vector<double> ethr(nband, 1e-6);
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = 200;
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = 1e-6;
+        hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+        auto hpsi_func = [&hpsi, dim](std::complex<double>* in, std::complex<double>* out, int ld, int nv) {
+            auto one = std::make_unique<std::complex<double>>(1.0);
+            auto zero = std::make_unique<std::complex<double>>(0.0);
+            ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
+                'N', 'N', dim, nv, dim, one.get(), DIAGOTEST::hmatrix.data(), dim, in, ld, zero.get(), out, ld);
+        };
+
+        for (int pass = 0; pass < ppcg.npass(); ++pass)
+            ppcg.diag(hpsi_func, psi_a.get_pointer(), en_a, ethr);
+
+        for (int i = 0; i < nband; i++)
+            EXPECT_NEAR(en_a[i], e_lapack[i], 2e-1);
+    }
+
+    delete[] e_lapack;
+}
+
+// ------------------------------------------------------------
+// Comprehensive benchmark: PPCG vs BPCG vs LAPACK
+//   - 5 matrix sizes (60 … 480)
+//   - timing, accuracy, speedup vs LAPACK
+//   - empirical complexity exponents
+// ------------------------------------------------------------
+TEST(DiagoPPCGTest, ComprehensiveBenchmark)
+{
+    if (std::getenv("ABACUS_RUN_HSOLVER_BENCHMARK") == nullptr)
+    {
+        GTEST_SKIP() << "Set ABACUS_RUN_HSOLVER_BENCHMARK=1 to run the PPCG benchmark.";
+    }
+
+    const int nband = 6;
+    const int sizes[] = {60, 120, 240, 360, 480};
+    const int n_sizes = 5;
+
+    // storage for timing data (for complexity analysis)
+    double t_ppcg[5] = {}, t_bpcg[5] = {}, t_david[5] = {}, t_lapack[5] = {};
+
+    std::cout << "\n"
+              << "==========================================================================================\n"
+              << "  PPCG vs BPCG vs Davidson vs LAPACK  —  Comprehensive Benchmark\n"
+              << "  (nband=" << nband << ", 5 passes each, ethr=1e-5)\n"
+              << "==========================================================================================\n"
+              << "   N    | PPCG(ms) BPCG(ms) David(ms) LAPACK(ms) | PPCG/LAP BPCG/LAP David/LAP | PPCG-err  BPCG-err  David-err\n"
+              << "--------+------------------------------------------+---------------------------+----------------------------"
+              << std::endl;
+
+    for (int sz = 0; sz < n_sizes; ++sz)
+    {
+        int npw = sizes[sz];
+        HPsi<std::complex<double>> hpsi(nband, npw);
+        DIAGOTEST::hmatrix = hpsi.hamilt();
+        DIAGOTEST::npw = npw;
+
+        // LAPACK reference (timed)
+        double* e_lapack = new double[npw];
+        auto ev_lap = DIAGOTEST::hmatrix;
+        double t0 = MPI_Wtime();
+        lapackEigen(npw, ev_lap, e_lapack);
+        t_lapack[sz] = (MPI_Wtime() - t0) * 1000.0;
+
+        // common initial guess
+        ModuleBase::ComplexMatrix psiguess(nband, npw);
+        std::default_random_engine prng(5);
+        std::uniform_int_distribution<unsigned> u(1, 10);
+        for (int i = 0; i < nband; i++)
+            for (int j = 0; j < npw; j++)
+                psiguess(i, j) = ev_lap[j * npw + i] * static_cast<double>(u(prng)) / 10.;
+
+        // shared hpsi_func
+        auto hpsi_func = [npw](std::complex<double>* in, std::complex<double>* out, int ld, int nv) {
+            auto one = std::make_unique<std::complex<double>>(1.0);
+            auto zero = std::make_unique<std::complex<double>>(0.0);
+            ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
+                'N', 'N', npw, nv, npw, one.get(), DIAGOTEST::hmatrix.data(), npw, in, ld, zero.get(), out, ld);
+        };
+
+        std::vector<double> ethr(nband, 1e-5);
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX = 200;
+        hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR = 1e-5;
+        hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER = 1;
+
+        // ---- PPCG ----
+        double en_ppcg[500] = {};
+        {
+            psi::Psi<std::complex<double>> psi_ppcg;
+            psi_ppcg.resize(1, nband, npw);
+            for (int i = 0; i < nband; i++)
+                for (int j = 0; j < npw; j++)
+                    psi_ppcg(i, j) = psiguess(i, j);
+
+            hsolver::DiagoPPCG<std::complex<double>> ppcg(hpsi.precond());
+            ppcg.init_iter(nband, nband, npw, npw);
+            double start = MPI_Wtime();
+            for (int pass = 0; pass < 5; ++pass)
+                ppcg.diag(hpsi_func, psi_ppcg.get_pointer(), en_ppcg, ethr);
+            t_ppcg[sz] = (MPI_Wtime() - start) * 1000.0;
+        }
+
+        // ---- BPCG ----
+        double en_bpcg[500] = {};
+        {
+            psi::Psi<std::complex<double>> psi_bpcg;
+            psi_bpcg.resize(1, nband, npw);
+            for (int i = 0; i < nband; i++)
+                for (int j = 0; j < npw; j++)
+                    psi_bpcg(i, j) = psiguess(i, j);
+
+            hsolver::DiagoBPCG<std::complex<double>> bpcg(hpsi.precond());
+            bpcg.init_iter(nband, nband, npw, npw);
+            double start = MPI_Wtime();
+            for (int pass = 0; pass < 4; ++pass)
+                bpcg.diag(hpsi_func, psi_bpcg.get_pointer(), en_bpcg, ethr);
+            t_bpcg[sz] = (MPI_Wtime() - start) * 1000.0;
+        }
+
+        // ---- Davidson ----
+        double en_david[500] = {};
+        {
+            psi::Psi<std::complex<double>> psi_dav;
+            psi_dav.resize(1, nband, npw);
+            for (int i = 0; i < nband; i++)
+                for (int j = 0; j < npw; j++)
+                    psi_dav(i, j) = psiguess(i, j);
+
+            hsolver::diag_comm_info comm_info(
+#ifdef __MPI
+                MPI_COMM_SELF,
+#endif
+                0, 1);
+            hsolver::DiagoDavid<std::complex<double>> david(hpsi.precond(), nband, npw, 4, comm_info);
+            auto spsi_func = [npw](const std::complex<double>* in, std::complex<double>* out, int ld, int nv) {
+                for (int ib = 0; ib < nv; ib++)
+                    for (int i = 0; i < npw; i++)
+                        out[ib * ld + i] = in[ib * ld + i];
+            };
+            double start = MPI_Wtime();
+            david.diag(hpsi_func, spsi_func, npw, psi_dav.get_pointer(), en_david, ethr, 200);
+            t_david[sz] = (MPI_Wtime() - start) * 1000.0;
+        }
+
+        // errors
+        double err_ppcg = std::abs(en_ppcg[0] - e_lapack[0]);
+        double err_bpcg = std::abs(en_bpcg[0] - e_lapack[0]);
+        double err_david = std::abs(en_david[0] - e_lapack[0]);
+
+        double s_ppcg = t_lapack[sz] / std::max(t_ppcg[sz], 1e-6);
+        double s_bpcg = t_lapack[sz] / std::max(t_bpcg[sz], 1e-6);
+        double s_david = t_lapack[sz] / std::max(t_david[sz], 1e-6);
+
+        char buf[256];
+        snprintf(buf, sizeof(buf),
+                 " %5d  | %7.1f  %7.1f  %8.1f  %8.1f | %7.1fx  %7.1fx  %7.1fx  | %8.1e %8.1e %8.1e",
+                 npw, t_ppcg[sz], t_bpcg[sz], t_david[sz], t_lapack[sz],
+                 s_ppcg, s_bpcg, s_david,
+                 err_ppcg, err_bpcg, err_david);
+        std::cout << buf << std::endl;
+
+        EXPECT_NEAR(en_ppcg[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5);
+        EXPECT_NEAR(en_bpcg[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5);
+        EXPECT_NEAR(en_david[0], e_lapack[0], std::abs(e_lapack[0]) * 0.1 + 0.5);
+
+        delete[] e_lapack;
+    }
+
+    // ---- empirical complexity analysis ----
+    // Fit  t = C * N^k  →  log(t) = log(C) + k * log(N)
+    // Use adjacent pairs to estimate local exponent: k ≈ log(t2/t1) / log(N2/N1)
+    std::cout << "\n--- Empirical complexity exponents (k in  t ∝ N^k) ---\n";
+    for (int sz = 1; sz < n_sizes; ++sz)
+    {
+        double ratio_N = std::log((double)sizes[sz] / sizes[sz - 1]);
+        double k_ppcg  = std::log(std::max(t_ppcg[sz], 1e-9) / std::max(t_ppcg[sz - 1], 1e-9)) / ratio_N;
+        double k_bpcg  = std::log(std::max(t_bpcg[sz], 1e-9) / std::max(t_bpcg[sz - 1], 1e-9)) / ratio_N;
+        double k_david = std::log(std::max(t_david[sz], 1e-9) / std::max(t_david[sz - 1], 1e-9)) / ratio_N;
+        double k_lap   = std::log(std::max(t_lapack[sz], 1e-9) / std::max(t_lapack[sz - 1], 1e-9)) / ratio_N;
+        printf("  %4d→%4d :  PPCG k=%.2f   BPCG k=%.2f   David k=%.2f   LAPACK k=%.2f\n",
+               sizes[sz - 1], sizes[sz], k_ppcg, k_bpcg, k_david, k_lap);
+    }
+
+    // average speedup
+    double avg_ppcg_vs_lap = 0, avg_bpcg_vs_lap = 0, avg_david_vs_lap = 0, avg_ppcg_vs_bpcg = 0, avg_ppcg_vs_david = 0;
+    for (int sz = 0; sz < n_sizes; ++sz)
+    {
+        avg_ppcg_vs_lap   += t_lapack[sz] / std::max(t_ppcg[sz], 1e-6);
+        avg_bpcg_vs_lap   += t_lapack[sz] / std::max(t_bpcg[sz], 1e-6);
+        avg_david_vs_lap  += t_lapack[sz] / std::max(t_david[sz], 1e-6);
+        avg_ppcg_vs_bpcg  += t_bpcg[sz] / std::max(t_ppcg[sz], 1e-6);
+        avg_ppcg_vs_david += t_david[sz] / std::max(t_ppcg[sz], 1e-6);
+    }
+    avg_ppcg_vs_lap /= n_sizes;
+    avg_bpcg_vs_lap /= n_sizes;
+    avg_david_vs_lap /= n_sizes;
+    avg_ppcg_vs_bpcg /= n_sizes;
+    avg_ppcg_vs_david /= n_sizes;
+
+    std::cout << "\n--- Average speedup (geometric mean over 5 sizes) ---\n"
+              << "  PPCG  vs LAPACK  : " << avg_ppcg_vs_lap << "x\n"
+              << "  BPCG  vs LAPACK  : " << avg_bpcg_vs_lap << "x\n"
+              << "  David vs LAPACK  : " << avg_david_vs_lap << "x\n"
+              << "  PPCG  vs BPCG    : " << avg_ppcg_vs_bpcg << "x\n"
+              << "  PPCG  vs David   : " << avg_ppcg_vs_david << "x\n"
+              << std::endl;
+}
+
+int main(int argc, char** argv)
+{
+    int nproc = 1, myrank = 0;
+
+#ifdef __MPI
+    int nproc_in_pool, kpar = 1, mypool, rank_in_pool;
+    setupmpi(argc, argv, nproc, myrank);
+    divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool);
+    MPI_Comm_dup(POOL_WORLD, &BP_WORLD);
+    GlobalV::NPROC_IN_POOL = nproc;
+#else
+    MPI_Init(&argc, &argv);
+#endif
+
+    testing::InitGoogleTest(&argc, argv);
+    ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners();
+    if (myrank != 0)
+    {
+        delete listeners.Release(listeners.default_result_printer());
+    }
+
+    int result = RUN_ALL_TESTS();
+    if (myrank == 0 && result != 0)
+    {
+        std::cout << "ERROR:some tests are not passed" << std::endl;
+        return result;
+    }
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/source/source_hsolver/test/generate_hsolver_test_report.sh b/source/source_hsolver/test/generate_hsolver_test_report.sh
new file mode 100644
index 00000000000..a381414c8e1
--- /dev/null
+++ b/source/source_hsolver/test/generate_hsolver_test_report.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+set -o pipefail
+
+script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+find_build_dir() {
+    local dir="$1"
+    while [[ "$dir" != "/" ]]; do
+        if [[ -f "$dir/CMakeCache.txt" && -f "$dir/CTestTestfile.cmake" ]]; then
+            echo "$dir"
+            return 0
+        fi
+        dir=$(dirname "$dir")
+    done
+    return 1
+}
+
+build_dir=${HSOLVER_TEST_BUILD_DIR:-}
+if [[ -z "$build_dir" ]]; then
+    build_dir=$(find_build_dir "$PWD")
+fi
+if [[ -z "$build_dir" ]]; then
+    build_dir=$(find_build_dir "$script_dir")
+fi
+if [[ -z "$build_dir" ]]; then
+    echo "Cannot locate the CTest build directory. Set HSOLVER_TEST_BUILD_DIR." >&2
+    exit 1
+fi
+
+report_dir=${HSOLVER_TEST_REPORT_DIR:-"$build_dir/test_reports/hsolver"}
+test_regex=${HSOLVER_TEST_REGEX:-"^MODULE_HSOLVER_(ppcg|bpcg)$"}
+timestamp=$(date +"%Y%m%d_%H%M%S")
+
+mkdir -p "$report_dir"
+
+xml_file="$report_dir/hsolver_unit_tests_${timestamp}.xml"
+log_file="$report_dir/hsolver_unit_tests_${timestamp}.log"
+ctest_has_junit=0
+if ctest --help 2>/dev/null | grep -q -- "--output-junit"; then
+    ctest_has_junit=1
+fi
+
+echo "Build directory : $build_dir"
+echo "Test regex      : $test_regex"
+if [[ $ctest_has_junit -eq 1 ]]; then
+    echo "JUnit XML       : $xml_file"
+else
+    echo "JUnit XML       : skipped (--output-junit is not supported by this CTest)"
+fi
+echo "Text log        : $log_file"
+
+if [[ $ctest_has_junit -eq 1 ]]; then
+    ctest --test-dir "$build_dir" -V -R "$test_regex" --output-junit "$xml_file" 2>&1 | tee "$log_file"
+    status=${PIPESTATUS[0]}
+else
+    ctest --test-dir "$build_dir" -V -R "$test_regex" 2>&1 | tee "$log_file"
+    status=${PIPESTATUS[0]}
+fi
+
+if [[ $status -eq 0 ]]; then
+    echo "Generated hsolver test reports in: $report_dir"
+else
+    echo "CTest failed. Partial reports are available in: $report_dir" >&2
+fi
+
+exit $status