diff --git a/source/source_lcao/module_gint/gint_atom.cpp b/source/source_lcao/module_gint/gint_atom.cpp index b367e32e169..c7714ec1153 100644 --- a/source/source_lcao/module_gint/gint_atom.cpp +++ b/source/source_lcao/module_gint/gint_atom.cpp @@ -115,94 +115,163 @@ void GintAtom::set_phi(const std::vector& coords, const int stride, T* ph } } } - template void GintAtom::set_phi_dphi( - const std::vector& coords, const int stride, - T* phi, T* dphi_x, T* dphi_y, T* dphi_z) const + const std::vector& coords, + const int stride, + T* phi, + T* dphi_x, + T* dphi_y, + T* dphi_z) const { const int num_mgrids = coords.size(); - - // orb_ does not have the member variable dr_uniform + const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform; - - const int nylm = std::pow(atom_->nwl + 1, 2); - std::vector rly(nylm); - std::vector grly(nylm * 3); - - for(int im = 0; im < num_mgrids; im++) + const double rcut = orb_->getRcut(); + + const int nylm = (atom_->nwl + 1) * (atom_->nwl + 1); + +#pragma omp parallel { - const Vec3d& coord = coords[im]; - // 1e-9 is to avoid division by zero - const double dist = coord.norm() < 1e-9 ? 1e-9 : coord.norm(); + std::vector rly(nylm); + std::vector grly(nylm * 3); - if(dist > orb_->getRcut()) +#pragma omp for schedule(static) + for(int im = 0; im < num_mgrids; ++im) { - // if the distance is larger than the cutoff radius, - // the wave function values are all zeros - if(phi != nullptr) + const Vec3d& coord = coords[im]; + + double dist = coord.norm(); + if(dist < 1e-9) { - ModuleBase::GlobalFunc::ZEROS(phi + im * stride, atom_->nw); + dist = 1e-9; } - ModuleBase::GlobalFunc::ZEROS(dphi_x + im * stride, atom_->nw); - ModuleBase::GlobalFunc::ZEROS(dphi_y + im * stride, atom_->nw); - ModuleBase::GlobalFunc::ZEROS(dphi_z + im * stride, atom_->nw); - } - else - { - // spherical harmonics - // TODO: vectorize the sph_harm function, - // the vectorized function can be called once for all meshgrids in a biggrid - ModuleBase::Ylm::grad_rl_sph_harm(atom_->nwl, coord.x, coord.y, coord.z, rly.data(), grly.data()); - // interpolation + if(dist > rcut) + { + if(phi != nullptr) + { + ModuleBase::GlobalFunc::ZEROS( + phi + im * stride, + atom_->nw); + } + + ModuleBase::GlobalFunc::ZEROS( + dphi_x + im * stride, + atom_->nw); + + ModuleBase::GlobalFunc::ZEROS( + dphi_y + im * stride, + atom_->nw); + + ModuleBase::GlobalFunc::ZEROS( + dphi_z + im * stride, + atom_->nw); + + continue; + } + + ModuleBase::Ylm::grad_rl_sph_harm( + atom_->nwl, + coord.x, + coord.y, + coord.z, + rly.data(), + grly.data()); + const double position = dist / dr_uniform; + const int ip = static_cast(position); + const double x0 = position - ip; const double x1 = 1.0 - x0; const double x2 = 2.0 - x0; const double x3 = 3.0 - x0; - const double x12 = x1 * x2 / 6; - const double x03 = x0 * x3 / 2; - double tmp, dtmp; + const double x12 = x1 * x2 / 6.0; + const double x03 = x0 * x3 / 2.0; + const double dist2 = dist * dist; + const double dist3 = dist2 * dist; + for(int iw = 0; iw < atom_->nw; ++iw) { - // this is a new 'l', we need 1D orbital wave - // function from interpolation method. + double tmp_iw = 0.0; + double dtmp_iw = 0.0; + if(atom_->iw2_new[iw]) { - auto psi_uniform = p_psi_uniform_[iw]; - auto dpsi_uniform = p_dpsi_uniform_[iw]; - // use Polynomia Interpolation method to get the - // wave functions + const auto psi_uniform = p_psi_uniform_[iw]; + const auto dpsi_uniform = p_dpsi_uniform_[iw]; - tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0) - + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1); + tmp_iw = + x12 * (psi_uniform[ip] * x3 + + psi_uniform[ip + 3] * x0) + + x03 * (psi_uniform[ip + 1] * x2 + - psi_uniform[ip + 2] * x1); - dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0) - + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1); - } // new l is used. + dtmp_iw = + x12 * (dpsi_uniform[ip] * x3 + + dpsi_uniform[ip + 3] * x0) + + x03 * (dpsi_uniform[ip + 1] * x2 + - dpsi_uniform[ip + 2] * x1); + } + else + { + continue; + } - // get the 'l' of this localized wave function const int ll = atom_->iw2l[iw]; const int idx_lm = atom_->iw2_ylm[iw]; - const double rl = pow_int(dist, ll); - const double tmprl = tmp / rl; + double rl; - // 3D wave functions - if(phi != nullptr) + switch(ll) { - phi[im * stride + iw] = tmprl * rly[idx_lm]; + case 0: + rl = 1.0; + break; + + case 1: + rl = dist; + break; + + case 2: + rl = dist2; + break; + + case 3: + rl = dist3; + break; + + default: + rl = pow_int(dist, ll); + break; } - - // derivative of wave functions with respect to atom positions. - const double tmpdphi_rly = (dtmp - tmp * ll / dist) / rl * rly[idx_lm] / dist; + const double tmprl = tmp_iw / rl; + +if(phi != nullptr) +{ + phi[im * stride + iw] + = tmprl * rly[idx_lm]; +} + +const double tmpdphi_rly = + (dtmp_iw - tmp_iw * ll / dist) + / rl + * rly[idx_lm] + / dist; + +dphi_x[im * stride + iw] + = tmpdphi_rly * coord.x + + tmprl * grly[idx_lm * 3]; + +dphi_y[im * stride + iw] + = tmpdphi_rly * coord.y + + tmprl * grly[idx_lm * 3 + 1]; - dphi_x[im * stride + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_lm*3]; - dphi_y[im * stride + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_lm*3 + 1]; - dphi_z[im * stride + iw] = tmpdphi_rly * coord.z + tmprl * grly[idx_lm*3 + 2]; +dphi_z[im * stride + iw] + = tmpdphi_rly * coord.z + + tmprl * grly[idx_lm * 3 + 2]; } } } diff --git a/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp b/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp new file mode 100644 index 00000000000..4892c7f42a2 Binary files /dev/null and b/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp differ diff --git a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu index 7021f44e3fe..199fd06975c 100644 --- a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu +++ b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu @@ -235,57 +235,85 @@ void PhiOperatorGpu::phi_mul_phi( double* hr_d) const { // ap_num means number of atom pairs - int ap_num = 0; - int max_m = 0; - int max_n = 0; - int max_k = mgrids_num_; - CHECK_CUDA(cudaEventSynchronize(event_)); - for (int i = 0; i < bgrid_batch_->get_batch_size(); i++) + +int ap_num = 0; +int max_m = 0; +int max_n = 0; +int max_k = mgrids_num_; +CHECK_CUDA(cudaEventSynchronize(event_)); +const int batch_size = bgrid_batch_->get_batch_size(); +const auto atom_info_host = atoms_num_info_.get_host_ptr(); +const auto phi_start_host = atom_phi_start_.get_host_ptr(); + +std::unordered_map offset_cache; + +for (int i = 0; i < batch_size; i++) +{ + auto bgrid = bgrid_batch_->get_bgrids()[i]; + const int phi_len_mgrid = bgrid->get_phi_len(); + const int mg_num = bgrid->get_mgrids_num(); + const int nat_bgrid = bgrid->get_atoms_num(); + auto* const atoms = bgrid->get_atoms(); + const int pre_atoms = atom_info_host[i].y; + + for (int ia_1 = 0; ia_1 < nat_bgrid; ia_1++) { - auto bgrid = bgrid_batch_->get_bgrids()[i]; - // the length of phi on a mesh grid - const int phi_len_mgrid = bgrid->get_phi_len(); - const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y; - for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++) + auto atom_1 = atoms[ia_1]; + const int iat_1 = atom_1->get_iat(); + const int nw1 = atom_1->get_nw(); + atom_1->get_R(); + + for (int ia_2 = 0; ia_2 < nat_bgrid; ia_2++) { - auto atom_1 = bgrid->get_atoms()[ia_1]; - const int iat_1 = atom_1->get_iat(); - const auto& r_1 = atom_1->get_R(); - const int nw1 = atom_1->get_nw(); - const int phi_1_offset = atom_phi_start_.get_host_ptr()[pre_atoms + ia_1]; + auto atom_2 = atoms[ia_2]; + const int iat_2 = atom_2->get_iat(); + if(iat_1 > iat_2) + { continue; } - for (int ia_2 = 0; ia_2 < bgrid->get_atoms_num(); ia_2++) - { - auto atom_2 = bgrid->get_atoms()[ia_2]; - const int iat_2 = atom_2->get_iat(); - const auto& r_2 = atom_2->get_R(); - const int nw2 = atom_2->get_nw(); + const int nw2 = atom_2->get_nw(); + const int phi_2_offset = phi_start_host[pre_atoms + ia_2]; + const auto& r_2 = atom_2->get_R(); - if(iat_1 > iat_2) - { continue; } - - int hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2); - if (hr_offset == -1) - { continue; } + + long long key = (long long)iat_1 * 10000 + iat_2; + int hr_offset = -1; + if (offset_cache.count(key)) + { + hr_offset = offset_cache[key]; + } + else + { + hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2); + offset_cache[key] = hr_offset; + } - const int phi_2_offset = atom_phi_start_.get_host_ptr()[pre_atoms + ia_2]; + if (hr_offset == -1) + { continue; } - gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset; - gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; - gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; - gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; - gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; - gemm_ldc_.get_host_ptr()[ap_num] = nw2; - gemm_m_.get_host_ptr()[ap_num] = nw1; - gemm_n_.get_host_ptr()[ap_num] = nw2; - gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num(); - ap_num++; + gemm_A_.gem_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset; + gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset; + gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid; + gemm_ldc_.get_host_ptr()[ap_num] = nw2; + gemm_m_.get_host_ptr()[ap_num] = nw1; + gemm_n_.get_host_ptr()[ap_num] = nw2; + gemm_k_.get_host_ptr()[ap_num] = mg_num; + ap_num++; - max_m = std::max(max_m, nw1); - max_n = std::max(max_n, nw2); - } + max_m = std::max(max_m, nw1); + max_n = std::max(max_n, nw2); } } +} +gemm_A_.reserve_host(ap_num); +gemm_B_.reserve_host(ap_num); +gemm_C_.reserve_host(ap_num); +gemm_lda_.reserve_host(ap_num); +gemm_ldb_.reserve_host(ap_num); +gemm_ldc_.reserve_host(ap_num); +gemm_m_.reserve_host(ap_num); +gemm_n_.reserve_host(ap_num); +gemm_k_.reserve_host(ap_num); gemm_A_.copy_host_to_device_async(ap_num); gemm_B_.copy_host_to_device_async(ap_num); @@ -491,4 +519,4 @@ void PhiOperatorGpu::phi_dot_dphi_r( template class PhiOperatorGpu; template class PhiOperatorGpu; -} \ No newline at end of file +} diff --git a/source/source_lcao/module_gint/phi_operator.cpp b/source/source_lcao/module_gint/phi_operator.cpp index 7f5fa700a42..c065a3a7783 100644 --- a/source/source_lcao/module_gint/phi_operator.cpp +++ b/source/source_lcao/module_gint/phi_operator.cpp @@ -19,14 +19,41 @@ void PhiOperator::set_bgrid(std::shared_ptr biggrid) const int atoms_num = biggrid_->get_atoms_num(); atom_rcoords_.resize(atoms_num); is_atom_on_mgrid_.resize(biggrid_->get_mgrids_num() * atoms_num); - for(int i = 0; i < atoms_num; ++i) + /* for(int i = 0; i < atoms_num; ++i) { biggrid_->set_atom_relative_coords(biggrid_->get_atom(i), atom_rcoords_[i]); for(int j = 0; j < rows_; ++j) { is_atom_on_mgrid_[i * rows_ + j] = atom_rcoords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut(); } + }*/ + #pragma omp parallel for schedule(static) +for(int i = 0; i < atoms_num; ++i) +{ + const auto atom = + biggrid_->get_atom(i); + + biggrid_->set_atom_relative_coords( + atom, + atom_rcoords_[i]); + + const double rcut2 = + atom->get_rcut() * atom->get_rcut(); + + for(int j = 0; j < rows_; ++j) + { + const auto& r = + atom_rcoords_[i][j]; + + const double dist2 = + r.x * r.x + + r.y * r.y + + r.z * r.z; + + is_atom_on_mgrid_[i * rows_ + j] + = (dist2 <= rcut2); } +} // init atom_pair_range_ init_atom_pair_idx_(); @@ -136,7 +163,7 @@ void PhiOperator::cal_env_gamma( const vector& trace_lo, double* rho) const { - for(int i = 0; i < biggrid_->get_atoms_num(); ++i) +/* for(int i = 0; i < biggrid_->get_atoms_num(); ++i) { const auto atom = biggrid_->get_atom(i); const int iw_start = atom->get_start_iw(); @@ -155,6 +182,39 @@ void PhiOperator::cal_env_gamma( } } } +}*/ + +for(int i = 0; i < biggrid_->get_atoms_num(); ++i) +{ + const auto atom = biggrid_->get_atom(i); + + const int iw_start = atom->get_start_iw(); + const int start_idx = atoms_startidx_[i]; + const int nw = atom->get_nw(); + + for(int j = 0; j < biggrid_->get_mgrids_num(); ++j) + { + if(is_atom_on_mgrid(i,j)) + { + double tmp = 0.0; + + const double* phi_row = + phi + j * cols_ + start_idx; + + const int iw_lo = + trace_lo[iw_start]; + +#pragma omp simd reduction(+:tmp) + for(int iw = 0; iw < nw; ++iw) + { + tmp += phi_row[iw] + * wfc[iw_lo + iw]; + } + + rho[mgrid_lidx_[j]] += tmp; + } + } +} } void PhiOperator::cal_env_k( @@ -250,4 +310,4 @@ void PhiOperator::init_atom_pair_idx_() } } -} // namespace ModuleGint \ No newline at end of file +} // namespace ModuleGint diff --git a/source/source_pw/module_stodft/sto_wf.cpp b/source/source_pw/module_stodft/sto_wf.cpp index 2fca88e4824..c8ac1b3bdd0 100644 --- a/source/source_pw/module_stodft/sto_wf.cpp +++ b/source/source_pw/module_stodft/sto_wf.cpp @@ -63,14 +63,14 @@ void Stochastic_WF::clean_chiallorder() template void Stochastic_WF::init_sto_orbitals(const int seed_in) { - const unsigned int rank_seed_offset = 10000; + unsigned int seed_scale=10000; // seed_scale is defined to avoid repetition if (seed_in == 0 || seed_in == -1) { - srand(static_cast(time(nullptr)) + GlobalV::MY_RANK * rank_seed_offset); // GlobalV global variables are reserved + srand(static_cast(time(nullptr)) + GlobalV::MY_RANK * seed_scale); // GlobalV global variables are reserved } else { - srand(static_cast(std::abs(seed_in)) + (GlobalV::MY_BNDGROUP * GlobalV::NPROC_IN_BNDGROUP + GlobalV::RANK_IN_BPGROUP) * rank_seed_offset); + srand(static_cast(std::abs(seed_in)) + (GlobalV::MY_BNDGROUP * GlobalV::NPROC_IN_BNDGROUP + GlobalV::RANK_IN_BPGROUP) * seed_scale); } this->allocate_chi0();