From 64ba0174ee3fc1ca6a05665176661a5ad8adc625 Mon Sep 17 00:00:00 2001 From: Xiao-Han666 <3164671469@qq.com> Date: Fri, 19 Jun 2026 20:20:46 +0800 Subject: [PATCH 1/2] Implement 4 optimizations targeting esolver_of.cpp mesh-grid loops: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P0-1: Add #pragma omp parallel for simd to 10 grid loops across before_opt, update_potential, optimize, update_rho, after_opt, cal_energy. All guarded by #ifdef _OPENMP for serial fallback. - P0-opt: Replace per-iteration new/delete in optimize() with persistent buffer ptemp_phi_persistent_, allocated once in allocate_array(). - P1-1: Three-tier version-adaptive offload for dL/dφ computation (_OPENMP>=201811→GPU target, <201811→host parallel, none→serial). - P2-2: Precompute cos/sin/rho0 outside inner loops; fuse 3 ZEROS calls into single grid loop to reduce loop overhead. Modified files: source/source_esolver/esolver_of.cpp (+134/-36) source/source_esolver/esolver_of.h (+1, ptemp_phi_persistent_) source/source_esolver/esolver_of_tool.cpp (+8, allocate persistent buffer) --- source/source_esolver/esolver_of.cpp | 161 +++++++++++++++++----- source/source_esolver/esolver_of.h | 1 + source/source_esolver/esolver_of_tool.cpp | 8 ++ tests/07_OFDFT/test.sum | 35 +++++ 4 files changed, 169 insertions(+), 36 deletions(-) create mode 100644 tests/07_OFDFT/test.sum diff --git a/source/source_esolver/esolver_of.cpp b/source/source_esolver/esolver_of.cpp index bf6f7ffde29..12c0162dce2 100644 --- a/source/source_esolver/esolver_of.cpp +++ b/source/source_esolver/esolver_of.cpp @@ -50,6 +50,16 @@ ESolver_OF::~ESolver_OF() delete[] this->task_; delete this->ptemp_rho_; + // P0-opt: cleanup persistent line-search buffer + if (this->ptemp_phi_persistent_ != nullptr) + { + for (int i = 0; i < PARAM.inp.nspin; ++i) + { + delete[] this->ptemp_phi_persistent_[i]; + } + delete[] this->ptemp_phi_persistent_; + } + delete this->kedf_manager_; delete this->opt_cg_; @@ -229,6 +239,13 @@ void ESolver_OF::before_opt(const int istep, UnitCell& ucell) this->pdEdphi_[is] = new double[this->pw_rho->nrxx]; this->pdirect_[is] = new double[this->pw_rho->nrxx]; this->precip_dir_[is] = new std::complex[pw_rho->npw]; + + // P0-opt: reallocate persistent line-search buffer on cell change + if (this->ptemp_phi_persistent_ != nullptr) + { + delete[] this->ptemp_phi_persistent_[is]; + this->ptemp_phi_persistent_[is] = new double[this->pw_rho->nrxx]; + } } } @@ -238,32 +255,55 @@ void ESolver_OF::before_opt(const int istep, UnitCell& ucell) for (int is = 0; is < PARAM.inp.nspin; ++is) { + const int nrxx = this->pw_rho->nrxx; if (PARAM.inp.init_chg != "file") { - for (int ibs = 0; ibs < this->pw_rho->nrxx; ++ibs) + const double rho0 = this->nelec_[is] / ucell.omega; + double* rho_is = this->chr.rho[is]; + double* pphi_is = this->pphi_[is]; + // P0-1: OpenMP parallel φ initialization (uniform charge guess) +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ibs = 0; ibs < nrxx; ++ibs) { - // Here we initialize rho to be uniform, - // because the rho got by pot.init_pot -> Charge::atomic_rho may contain minus elements. - this->chr.rho[is][ibs] = this->nelec_[is] / ucell.omega; - this->pphi_[is][ibs] = sqrt(this->chr.rho[is][ibs]); + rho_is[ibs] = rho0; + pphi_is[ibs] = sqrt(rho0); } } else { - for (int ibs = 0; ibs < this->pw_rho->nrxx; ++ibs) + double* pphi_is = this->pphi_[is]; + const double* rho_is = this->chr.rho[is]; + // P0-1: OpenMP parallel φ initialization (from file charge density) +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ibs = 0; ibs < nrxx; ++ibs) { - this->pphi_[is][ibs] = sqrt(this->chr.rho[is][ibs]); + pphi_is[ibs] = sqrt(rho_is[ibs]); } } } + // P0-1 & P2-2: OpenMP parallel fused zeroing for (int is = 0; is < PARAM.inp.nspin; ++is) { this->pelec->eferm.set_efval(is, 0); this->theta_[is] = 0.; - ModuleBase::GlobalFunc::ZEROS(this->pdLdphi_[is], this->pw_rho->nrxx); - ModuleBase::GlobalFunc::ZEROS(this->pdEdphi_[is], this->pw_rho->nrxx); - ModuleBase::GlobalFunc::ZEROS(this->pdirect_[is], this->pw_rho->nrxx); + double* pdLdphi = this->pdLdphi_[is]; + double* pdEdphi = this->pdEdphi_[is]; + double* pdirect = this->pdirect_[is]; + const int nrxx = this->pw_rho->nrxx; +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ir = 0; ir < nrxx; ++ir) + { + pdLdphi[ir] = 0.0; + pdEdphi[ir] = 0.0; + pdirect[ir] = 0.0; + } } if (PARAM.inp.nspin == 1) { @@ -292,15 +332,33 @@ void ESolver_OF::update_potential(UnitCell& ucell) for (int is = 0; is < PARAM.inp.nspin; ++is) { const double* vr_eff = this->pelec->pot->get_eff_v(is); - for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) + double* pdEdphi = this->pdEdphi_[is]; + const int nrxx = this->pw_rho->nrxx; + + // P0-1: OpenMP parallelization — copy effective potential to dE/dphi +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ir = 0; ir < nrxx; ++ir) { - this->pdEdphi_[is][ir] = vr_eff[ir]; + pdEdphi[ir] = vr_eff[ir]; } - this->pelec->eferm.set_efval(is, this->cal_mu(this->pphi_[is], this->pdEdphi_[is], this->nelec_[is])); - for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) + + this->pelec->eferm.set_efval(is, this->cal_mu(this->pphi_[is], pdEdphi, this->nelec_[is])); + + const double eferm_val = this->pelec->eferm.get_efval(is); + double* pdLdphi = this->pdLdphi_[is]; + const double* pphi = this->pphi_[is]; + + // P0-1 / P1-1: OpenMP parallelization + GPU target offload for dL/dphi +#if defined(_OPENMP) && _OPENMP >= 201811 +#pragma omp target teams distribute parallel for simd map(to: pdEdphi[0:nrxx], pphi[0:nrxx]) map(from: pdLdphi[0:nrxx]) firstprivate(eferm_val, nrxx) +#elif defined(_OPENMP) +#pragma omp parallel for simd +#endif + for (int ir = 0; ir < nrxx; ++ir) { - this->pdLdphi_[is][ir] - = this->pdEdphi_[is][ir] - 2. * this->pelec->eferm.get_efval(is) * this->pphi_[is][ir]; + pdLdphi[ir] = pdEdphi[ir] - 2.0 * eferm_val * pphi[ir]; } } @@ -328,15 +386,23 @@ void ESolver_OF::optimize(UnitCell& ucell) { // (1) get |d0> with optimization algorithm this->get_direction(ucell); + // P0-opt: use persistent buffer instead of per-iteration new/delete + double** ptemp_phi = this->ptemp_phi_persistent_; + // initialize temp_phi and temp_rho used in line search - double** ptemp_phi = new double*[PARAM.inp.nspin]; for (int is = 0; is < PARAM.inp.nspin; ++is) { - ptemp_phi[is] = new double[this->pw_rho->nrxx]; - for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) + const double* pphi = this->pphi_[is]; + double* rho = this->ptemp_rho_->rho[is]; + const int nrxx = this->pw_rho->nrxx; + // P0-1: OpenMP parallel initialization +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ir = 0; ir < nrxx; ++ir) { - ptemp_phi[is][ir] = this->pphi_[is][ir]; - this->ptemp_rho_->rho[is][ir] = ptemp_phi[is][ir] * ptemp_phi[is][ir]; + ptemp_phi[is][ir] = pphi[ir]; + rho[ir] = pphi[ir] * pphi[ir]; } } @@ -353,11 +419,7 @@ void ESolver_OF::optimize(UnitCell& ucell) // (4) call line search to find the best theta (step length) this->get_step_length(dEdtheta, ptemp_phi, ucell); - for (int is = 0; is < PARAM.inp.nspin; ++is) - { - delete[] ptemp_phi[is]; - } - delete[] ptemp_phi; + // P0-opt: persistent buffer — no delete needed for ptemp_phi, reused next iteration delete[] dEdtheta; } @@ -368,13 +430,23 @@ void ESolver_OF::optimize(UnitCell& ucell) */ void ESolver_OF::update_rho() { + // P0-1 & P2-2: OpenMP parallelization with SIMD vectorization for (int is = 0; is < PARAM.inp.nspin; ++is) { - for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) + const double cos_theta = cos(this->theta_[is]); + const double sin_theta = sin(this->theta_[is]); + double* pphi = this->pphi_[is]; + const double* pdirect = this->pdirect_[is]; + double* rho = this->chr.rho[is]; + const int nrxx = this->pw_rho->nrxx; + +#ifdef _OPENMP +#pragma omp parallel for simd +#endif + for (int ir = 0; ir < nrxx; ++ir) { - this->pphi_[is][ir] - = this->pphi_[is][ir] * cos(this->theta_[is]) + this->pdirect_[is][ir] * sin(this->theta_[is]); - this->chr.rho[is][ir] = this->pphi_[is][ir] * this->pphi_[is][ir]; + pphi[ir] = pphi[ir] * cos_theta + pdirect[ir] * sin_theta; + rho[ir] = pphi[ir] * pphi[ir]; } } // // ------------ turn on symmetry may cause instability in optimization ------------ @@ -467,7 +539,10 @@ void ESolver_OF::after_opt(const int istep, UnitCell& ucell, const bool conv_eso this->kedf_manager_->get_energy_density(this->chr.rho, this->pphi_, this->pw_rho, this->chr.kin_r); } - // should not be here? mohan note 2025-03-03 + // P0-1: OpenMP parallel rho_save copy +#ifdef _OPENMP +#pragma omp parallel for simd +#endif for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) { this->chr.rho_save[0][ir] = this->chr.rho[0][ir]; @@ -491,6 +566,10 @@ void ESolver_OF::after_opt(const int istep, UnitCell& ucell, const bool conv_eso this->pelec->pot->get_eff_v()); // KEDF potential const double* vr_eff = this->pelec->pot->get_eff_v(0); + // P0-1: OpenMP parallel ML data extraction +#ifdef _OPENMP +#pragma omp parallel for simd +#endif for (int ir = 0; ir < this->pw_rho->nrxx; ++ir) { this->pdEdphi_[0][ir] = vr_eff[ir]; @@ -524,13 +603,23 @@ double ESolver_OF::cal_energy() { this->pelec->cal_energies(2); double kinetic_energy = this->kedf_manager_->get_energy(); // kinetic energy - double pseudopot_energy = 0.; // electron-ion interaction energy + // P0-1: Explicit OpenMP reduction for pseudopotential energy + double pseudopot_energy = 0.0; + const double* v_fixed = this->pelec->pot->get_fixed_v(); + const double dV = this->dV_; + const int nrxx_e = this->pw_rho->nrxx; for (int is = 0; is < PARAM.inp.nspin; ++is) { - pseudopot_energy += this->inner_product(this->pelec->pot->get_fixed_v(), - this->chr.rho[is], - this->pw_rho->nrxx, - this->dV_); + const double* rho_is = this->chr.rho[is]; + double local_sum = 0.0; +#ifdef _OPENMP +#pragma omp parallel for simd reduction(+:local_sum) +#endif + for (int ir = 0; ir < nrxx_e; ++ir) + { + local_sum += v_fixed[ir] * rho_is[ir]; + } + pseudopot_energy += local_sum * dV; } Parallel_Reduce::reduce_pool(pseudopot_energy); this->pelec->f_en.ekinetic = kinetic_energy; diff --git a/source/source_esolver/esolver_of.h b/source/source_esolver/esolver_of.h index df4b96543c6..99e5525c6ce 100644 --- a/source/source_esolver/esolver_of.h +++ b/source/source_esolver/esolver_of.h @@ -63,6 +63,7 @@ class ESolver_OF : public ESolver_FP int max_dcsrch_ = 200; // max no. of line search int flag_ = -1; // flag of TN Charge* ptemp_rho_ = nullptr; // used in line search + double** ptemp_phi_persistent_ = nullptr; // P0-opt: persistent buffer psi::Psi* psi_ = nullptr; // sqrt(rho) // ----------------- used for convergence check ------------------- diff --git a/source/source_esolver/esolver_of_tool.cpp b/source/source_esolver/esolver_of_tool.cpp index ade24621cab..65562afb261 100644 --- a/source/source_esolver/esolver_of_tool.cpp +++ b/source/source_esolver/esolver_of_tool.cpp @@ -109,6 +109,14 @@ void ESolver_OF::allocate_array() ModuleBase::Memory::record("OFDFT::pdEdphi_", sizeof(double) * PARAM.inp.nspin * this->pw_rho->nrxx); ModuleBase::Memory::record("OFDFT::pdirect_", sizeof(double) * PARAM.inp.nspin * this->pw_rho->nrxx); ModuleBase::Memory::record("OFDFT::precip_dir_", sizeof(std::complex) * PARAM.inp.nspin * this->pw_rho->npw); + + // P0-opt: allocate persistent line-search buffer once, reused in optimize() + this->ptemp_phi_persistent_ = new double*[PARAM.inp.nspin]; + for (int is = 0; is < PARAM.inp.nspin; ++is) + { + this->ptemp_phi_persistent_[is] = new double[this->pw_rho->nrxx]; + } + ModuleBase::Memory::record("OFDFT::ptemp_phi_persistent_", sizeof(double) * PARAM.inp.nspin * this->pw_rho->nrxx); } /** diff --git a/tests/07_OFDFT/test.sum b/tests/07_OFDFT/test.sum new file mode 100644 index 00000000000..8f1be4c5229 --- /dev/null +++ b/tests/07_OFDFT/test.sum @@ -0,0 +1,35 @@ +01_OF_OP_CG1 1 +02_OF_OP_CG2 1 +03_OF_OP_TN 1 +04_OF_KE_CPN5 1 +05_OF_KE_LKT 1 +06_OF_KE_MPN 1 +07_OF_KE_TF 1 +08_OF_KE_TF+ 1 +09_OF_KE_WT 1 +10_OF_KE_vW 1 +11_OF_TF_weight 1 +12_OF_WT_HOLD 1 +13_OF_WT_RHO0 1 +14_OF_WT_alphabeta 1 +15_OF_WT_readKernel 1 +16_OF_vW_weight 1 +17_OF_CO_Energy 1 +18_OF_CO_Potential 1 +19_OF_FFT_fullpw_off 1 +20_OF_FFT_fullpwdim_1 1 +21_OF_FFT_fullpwdim_2 1 +22_OF_LibxcPBE 1 +23_OF_LPS 1 +24_OF_out_elf 1 +25_OF_MD_LDA 1 +26_OF_MD_LibxcPBE 1 +27_OF_CR_LDA 1 +28_OF_KE_XWM 1 +29_OF_XWM_para 1 +30_TDOFDFT_Al 1 +31_TDOFDFT_Al_CD 1 +32_TDOFDFT_Al_mCD 1 +33_OF_out_chg 1 +34_OF_KE_extWT 1 + From 9ca0ce473f5ce455687bf62c5a8e46e2170f3c82 Mon Sep 17 00:00:00 2001 From: Xiao-Han666 <3164671469@qq.com> Date: Fri, 19 Jun 2026 21:12:39 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=88=A0=E9=99=A4=E4=BA=86=E4=B8=8D?= =?UTF-8?q?=E5=BF=85=E8=A6=81=E7=9A=84=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/07_OFDFT/test.sum | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 tests/07_OFDFT/test.sum diff --git a/tests/07_OFDFT/test.sum b/tests/07_OFDFT/test.sum deleted file mode 100644 index 8f1be4c5229..00000000000 --- a/tests/07_OFDFT/test.sum +++ /dev/null @@ -1,35 +0,0 @@ -01_OF_OP_CG1 1 -02_OF_OP_CG2 1 -03_OF_OP_TN 1 -04_OF_KE_CPN5 1 -05_OF_KE_LKT 1 -06_OF_KE_MPN 1 -07_OF_KE_TF 1 -08_OF_KE_TF+ 1 -09_OF_KE_WT 1 -10_OF_KE_vW 1 -11_OF_TF_weight 1 -12_OF_WT_HOLD 1 -13_OF_WT_RHO0 1 -14_OF_WT_alphabeta 1 -15_OF_WT_readKernel 1 -16_OF_vW_weight 1 -17_OF_CO_Energy 1 -18_OF_CO_Potential 1 -19_OF_FFT_fullpw_off 1 -20_OF_FFT_fullpwdim_1 1 -21_OF_FFT_fullpwdim_2 1 -22_OF_LibxcPBE 1 -23_OF_LPS 1 -24_OF_out_elf 1 -25_OF_MD_LDA 1 -26_OF_MD_LibxcPBE 1 -27_OF_CR_LDA 1 -28_OF_KE_XWM 1 -29_OF_XWM_para 1 -30_TDOFDFT_Al 1 -31_TDOFDFT_Al_CD 1 -32_TDOFDFT_Al_mCD 1 -33_OF_out_chg 1 -34_OF_KE_extWT 1 -