diff --git a/source/source_lcao/module_gint/gint_atom.cpp b/source/source_lcao/module_gint/gint_atom.cpp
index b367e32e169..c7714ec1153 100644
--- a/source/source_lcao/module_gint/gint_atom.cpp
+++ b/source/source_lcao/module_gint/gint_atom.cpp
@@ -115,94 +115,163 @@ void GintAtom::set_phi(const std::vector<Vec3d>& coords, const int stride, T* ph
         }
     }
 }
-
 template <typename T>
 void GintAtom::set_phi_dphi(
-    const std::vector<Vec3d>& coords, const int stride,
-    T* phi, T* dphi_x, T* dphi_y, T* dphi_z) const
+    const std::vector<Vec3d>& coords,
+    const int stride,
+    T* phi,
+    T* dphi_x,
+    T* dphi_y,
+    T* dphi_z) const
 {
     const int num_mgrids = coords.size();
-    
-    // orb_ does not have the member variable dr_uniform
+
     const double dr_uniform = orb_->PhiLN(0, 0).dr_uniform;
-    
-    const int nylm = std::pow(atom_->nwl + 1, 2);
-    std::vector<double> rly(nylm);
-    std::vector<double> grly(nylm * 3);
-    
-    for(int im = 0; im < num_mgrids; im++)
+    const double rcut = orb_->getRcut();
+
+    const int nylm = (atom_->nwl + 1) * (atom_->nwl + 1);
+
+#pragma omp parallel
     {
-        const Vec3d& coord = coords[im];
-        // 1e-9 is to avoid division by zero
-        const double dist = coord.norm() < 1e-9 ? 1e-9 : coord.norm();
+        std::vector<double> rly(nylm);
+        std::vector<double> grly(nylm * 3);
 
-        if(dist > orb_->getRcut())
+#pragma omp for schedule(static)
+        for(int im = 0; im < num_mgrids; ++im)
         {
-            // if the distance is larger than the cutoff radius,
-            // the wave function values are all zeros
-            if(phi != nullptr)
+            const Vec3d& coord = coords[im];
+
+            double dist = coord.norm();
+            if(dist < 1e-9)
             {
-                ModuleBase::GlobalFunc::ZEROS(phi + im * stride, atom_->nw);
+                dist = 1e-9;
             }
-            ModuleBase::GlobalFunc::ZEROS(dphi_x + im * stride, atom_->nw);
-            ModuleBase::GlobalFunc::ZEROS(dphi_y + im * stride, atom_->nw);
-            ModuleBase::GlobalFunc::ZEROS(dphi_z + im * stride, atom_->nw);
-        }
-        else
-        {
-            // spherical harmonics
-            // TODO: vectorize the sph_harm function, 
-            // the vectorized function can be called once for all meshgrids in a biggrid
-            ModuleBase::Ylm::grad_rl_sph_harm(atom_->nwl, coord.x, coord.y, coord.z, rly.data(), grly.data());
 
-            // interpolation
+            if(dist > rcut)
+            {
+                if(phi != nullptr)
+                {
+                    ModuleBase::GlobalFunc::ZEROS(
+                        phi + im * stride,
+                        atom_->nw);
+                }
+
+                ModuleBase::GlobalFunc::ZEROS(
+                    dphi_x + im * stride,
+                    atom_->nw);
+
+                ModuleBase::GlobalFunc::ZEROS(
+                    dphi_y + im * stride,
+                    atom_->nw);
+
+                ModuleBase::GlobalFunc::ZEROS(
+                    dphi_z + im * stride,
+                    atom_->nw);
+
+                continue;
+            }
+
+            ModuleBase::Ylm::grad_rl_sph_harm(
+                atom_->nwl,
+                coord.x,
+                coord.y,
+                coord.z,
+                rly.data(),
+                grly.data());
+
             const double position = dist / dr_uniform;
+
             const int ip = static_cast<int>(position);
+
             const double x0 = position - ip;
             const double x1 = 1.0 - x0;
             const double x2 = 2.0 - x0;
             const double x3 = 3.0 - x0;
-            const double x12 = x1 * x2 / 6;
-            const double x03 = x0 * x3 / 2;
 
-            double tmp, dtmp;
+            const double x12 = x1 * x2 / 6.0;
+            const double x03 = x0 * x3 / 2.0;
+	    const double dist2 = dist * dist;
+            const double dist3 = dist2 * dist;
+
             for(int iw = 0; iw < atom_->nw; ++iw)
             {
-                // this is a new 'l', we need 1D orbital wave
-                // function from interpolation method.
+                double tmp_iw = 0.0;
+                double dtmp_iw = 0.0;
+
                 if(atom_->iw2_new[iw])
                 {
-                    auto psi_uniform = p_psi_uniform_[iw];
-                    auto dpsi_uniform = p_dpsi_uniform_[iw];
-                    // use Polynomia Interpolation method to get the
-                    // wave functions
+                    const auto psi_uniform = p_psi_uniform_[iw];
+                    const auto dpsi_uniform = p_dpsi_uniform_[iw];
 
-                    tmp = x12 * (psi_uniform[ip] * x3 + psi_uniform[ip + 3] * x0)
-                        + x03 * (psi_uniform[ip + 1] * x2 - psi_uniform[ip + 2] * x1);
+                    tmp_iw =
+                        x12 * (psi_uniform[ip] * x3
+                             + psi_uniform[ip + 3] * x0)
+                      + x03 * (psi_uniform[ip + 1] * x2
+                             - psi_uniform[ip + 2] * x1);
 
-                    dtmp = x12 * (dpsi_uniform[ip] * x3 + dpsi_uniform[ip + 3] * x0)
-                        + x03 * (dpsi_uniform[ip + 1] * x2 - dpsi_uniform[ip + 2] * x1);
-                } // new l is used.
+                    dtmp_iw =
+                        x12 * (dpsi_uniform[ip] * x3
+                             + dpsi_uniform[ip + 3] * x0)
+                      + x03 * (dpsi_uniform[ip + 1] * x2
+                             - dpsi_uniform[ip + 2] * x1);
+                }
+                else
+                {
+                    continue;
+                }
 
-                // get the 'l' of this localized wave function
                 const int ll = atom_->iw2l[iw];
                 const int idx_lm = atom_->iw2_ylm[iw];
 
-                const double rl = pow_int(dist, ll);
-                const double tmprl = tmp / rl;
+                double rl;
 
-                // 3D wave functions
-                if(phi != nullptr)
+                switch(ll)
                 {
-                    phi[im * stride + iw] = tmprl * rly[idx_lm];
+                    case 0:
+                        rl = 1.0;
+                        break;
+
+                    case 1:
+                        rl = dist;
+                        break;
+
+                    case 2:
+                        rl = dist2;
+                        break;
+
+                    case 3:
+                        rl = dist3;
+                        break;
+
+                    default:
+                        rl = pow_int(dist, ll);
+                        break;
                 }
-                
-                // derivative of wave functions with respect to atom positions.
-                const double tmpdphi_rly = (dtmp - tmp * ll / dist) / rl * rly[idx_lm] / dist;
+			const double tmprl = tmp_iw / rl;
+
+if(phi != nullptr)
+{
+    phi[im * stride + iw]
+        = tmprl * rly[idx_lm];
+}
+
+const double tmpdphi_rly =
+    (dtmp_iw - tmp_iw * ll / dist)
+    / rl
+    * rly[idx_lm]
+    / dist;
+
+dphi_x[im * stride + iw]
+    = tmpdphi_rly * coord.x
+    + tmprl * grly[idx_lm * 3];
+
+dphi_y[im * stride + iw]
+    = tmpdphi_rly * coord.y
+    + tmprl * grly[idx_lm * 3 + 1];
 
-                dphi_x[im * stride + iw] =  tmpdphi_rly * coord.x + tmprl * grly[idx_lm*3];
-                dphi_y[im * stride + iw] =  tmpdphi_rly * coord.y + tmprl * grly[idx_lm*3 + 1];
-                dphi_z[im * stride + iw] =  tmpdphi_rly * coord.z + tmprl * grly[idx_lm*3 + 2];
+dphi_z[im * stride + iw]
+    = tmpdphi_rly * coord.z
+    + tmprl * grly[idx_lm * 3 + 2];
             }
         }
     }
diff --git a/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp b/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp
new file mode 100644
index 00000000000..4892c7f42a2
Binary files /dev/null and b/source/source_lcao/module_gint/kernel/.phi_operator_gpu.cu.swp differ
diff --git a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
index 7021f44e3fe..199fd06975c 100644
--- a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
+++ b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
@@ -235,57 +235,85 @@ void PhiOperatorGpu<Real>::phi_mul_phi(
     double* hr_d) const
 {
     // ap_num means number of atom pairs
-    int ap_num = 0;
-    int max_m = 0;
-    int max_n = 0;
-    int max_k = mgrids_num_;
-    CHECK_CUDA(cudaEventSynchronize(event_));
-    for (int i = 0; i < bgrid_batch_->get_batch_size(); i++)
+            
+int ap_num = 0;
+int max_m = 0;
+int max_n = 0;
+int max_k = mgrids_num_;
+CHECK_CUDA(cudaEventSynchronize(event_));
+const int batch_size = bgrid_batch_->get_batch_size();
+const auto atom_info_host = atoms_num_info_.get_host_ptr();
+const auto phi_start_host = atom_phi_start_.get_host_ptr();
+
+std::unordered_map<long long, int> offset_cache;
+
+for (int i = 0; i < batch_size; i++)
+{
+    auto bgrid = bgrid_batch_->get_bgrids()[i];
+    const int phi_len_mgrid = bgrid->get_phi_len();
+    const int mg_num = bgrid->get_mgrids_num();
+    const int nat_bgrid = bgrid->get_atoms_num();
+    auto* const atoms = bgrid->get_atoms();
+    const int pre_atoms = atom_info_host[i].y;
+
+    for (int ia_1 = 0; ia_1 < nat_bgrid; ia_1++)
     {
-        auto bgrid = bgrid_batch_->get_bgrids()[i];
-        // the length of phi on a mesh grid
-        const int phi_len_mgrid = bgrid->get_phi_len();
-        const int pre_atoms = atoms_num_info_.get_host_ptr()[i].y;
-        for (int ia_1 = 0; ia_1 < bgrid->get_atoms_num(); ia_1++)
+        auto atom_1 = atoms[ia_1];
+        const int iat_1 = atom_1->get_iat();
+        const int nw1 = atom_1->get_nw();
+       atom_1->get_R();
+
+        for (int ia_2 = 0; ia_2 < nat_bgrid; ia_2++)
         {
-            auto atom_1 = bgrid->get_atoms()[ia_1];
-            const int iat_1 = atom_1->get_iat();
-            const auto& r_1 = atom_1->get_R();
-            const int nw1 = atom_1->get_nw();
-            const int phi_1_offset = atom_phi_start_.get_host_ptr()[pre_atoms + ia_1];
+            auto atom_2 = atoms[ia_2];
+            const int iat_2 = atom_2->get_iat();
+            if(iat_1 > iat_2)
+            { continue; }
 
-            for (int ia_2 = 0; ia_2 < bgrid->get_atoms_num(); ia_2++)
-            {
-                auto atom_2 = bgrid->get_atoms()[ia_2];
-                const int iat_2 = atom_2->get_iat();
-                const auto& r_2 = atom_2->get_R();
-                const int nw2 = atom_2->get_nw();
+            const int nw2 = atom_2->get_nw();
+            const int phi_2_offset = phi_start_host[pre_atoms + ia_2];
+            const auto& r_2 = atom_2->get_R();
 
-                if(iat_1 > iat_2)
-                { continue; }
-                
-                int hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2);
-                if (hr_offset == -1)
-                { continue; }
+            
+            long long key = (long long)iat_1 * 10000 + iat_2;
+            int hr_offset = -1;
+            if (offset_cache.count(key))
+            {
+                hr_offset = offset_cache[key];
+            }
+            else
+            {
+                hr_offset = hRGint.find_matrix_offset(iat_1, iat_2, r_1 - r_2);
+                offset_cache[key] = hr_offset;
+            }
 
-                const int phi_2_offset = atom_phi_start_.get_host_ptr()[pre_atoms + ia_2];
+            if (hr_offset == -1)
+            { continue; }
 
-                gemm_A_.get_host_ptr()[ap_num] = phi_d + phi_1_offset;
-                gemm_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset;
-                gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset;
-                gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid;
-                gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid;
-                gemm_ldc_.get_host_ptr()[ap_num] = nw2;
-                gemm_m_.get_host_ptr()[ap_num] = nw1;
-                gemm_n_.get_host_ptr()[ap_num] = nw2;
-                gemm_k_.get_host_ptr()[ap_num] = bgrid->get_mgrids_num();
-                ap_num++;
+            gemm_A_.gem_B_.get_host_ptr()[ap_num] = phi_vldr3_d + phi_2_offset;
+            gemm_C_.get_host_ptr()[ap_num] = hr_d + hr_offset;
+            gemm_lda_.get_host_ptr()[ap_num] = phi_len_mgrid;
+            gemm_ldb_.get_host_ptr()[ap_num] = phi_len_mgrid;
+            gemm_ldc_.get_host_ptr()[ap_num] = nw2;
+            gemm_m_.get_host_ptr()[ap_num] = nw1;
+            gemm_n_.get_host_ptr()[ap_num] = nw2;
+            gemm_k_.get_host_ptr()[ap_num] = mg_num;
+            ap_num++;
 
-                max_m = std::max(max_m, nw1);
-                max_n = std::max(max_n, nw2);
-            }
+            max_m = std::max(max_m, nw1);
+            max_n = std::max(max_n, nw2);
         }
     }
+}
+gemm_A_.reserve_host(ap_num);
+gemm_B_.reserve_host(ap_num);
+gemm_C_.reserve_host(ap_num);
+gemm_lda_.reserve_host(ap_num);
+gemm_ldb_.reserve_host(ap_num);
+gemm_ldc_.reserve_host(ap_num);
+gemm_m_.reserve_host(ap_num);
+gemm_n_.reserve_host(ap_num);
+gemm_k_.reserve_host(ap_num);
 
     gemm_A_.copy_host_to_device_async(ap_num);
     gemm_B_.copy_host_to_device_async(ap_num);
@@ -491,4 +519,4 @@ void PhiOperatorGpu<Real>::phi_dot_dphi_r(
 template class PhiOperatorGpu<double>;
 template class PhiOperatorGpu<float>;
 
-}
\ No newline at end of file
+}
diff --git a/source/source_lcao/module_gint/phi_operator.cpp b/source/source_lcao/module_gint/phi_operator.cpp
index 7f5fa700a42..c065a3a7783 100644
--- a/source/source_lcao/module_gint/phi_operator.cpp
+++ b/source/source_lcao/module_gint/phi_operator.cpp
@@ -19,14 +19,41 @@ void PhiOperator::set_bgrid(std::shared_ptr<const BigGrid> biggrid)
     const int atoms_num = biggrid_->get_atoms_num();
     atom_rcoords_.resize(atoms_num);
     is_atom_on_mgrid_.resize(biggrid_->get_mgrids_num() * atoms_num);
-    for(int i = 0; i < atoms_num; ++i)
+   /* for(int i = 0; i < atoms_num; ++i)
     {
         biggrid_->set_atom_relative_coords(biggrid_->get_atom(i), atom_rcoords_[i]);
         for(int j = 0; j < rows_; ++j)
         {
             is_atom_on_mgrid_[i * rows_ + j] = atom_rcoords_[i][j].norm() <= biggrid_->get_atom(i)->get_rcut();
         }
+    }*/
+    #pragma omp parallel for schedule(static)
+for(int i = 0; i < atoms_num; ++i)
+{
+    const auto atom =
+        biggrid_->get_atom(i);
+
+    biggrid_->set_atom_relative_coords(
+        atom,
+        atom_rcoords_[i]);
+
+    const double rcut2 =
+        atom->get_rcut() * atom->get_rcut();
+
+    for(int j = 0; j < rows_; ++j)
+    {
+        const auto& r =
+            atom_rcoords_[i][j];
+
+        const double dist2 =
+            r.x * r.x +
+            r.y * r.y +
+            r.z * r.z;
+
+        is_atom_on_mgrid_[i * rows_ + j]
+            = (dist2 <= rcut2);
     }
+}
 
     // init atom_pair_range_
     init_atom_pair_idx_();
@@ -136,7 +163,7 @@ void PhiOperator::cal_env_gamma(
     const vector<int>& trace_lo,
     double* rho) const
 {
-    for(int i = 0; i < biggrid_->get_atoms_num(); ++i)
+/*    for(int i = 0; i < biggrid_->get_atoms_num(); ++i)
     {
         const auto atom = biggrid_->get_atom(i);
         const int iw_start = atom->get_start_iw();
@@ -155,6 +182,39 @@ void PhiOperator::cal_env_gamma(
             }
         }
     }
+}*/
+
+for(int i = 0; i < biggrid_->get_atoms_num(); ++i)
+{
+    const auto atom = biggrid_->get_atom(i);
+
+    const int iw_start = atom->get_start_iw();
+    const int start_idx = atoms_startidx_[i];
+    const int nw = atom->get_nw();
+
+    for(int j = 0; j < biggrid_->get_mgrids_num(); ++j)
+    {
+        if(is_atom_on_mgrid(i,j))
+        {
+            double tmp = 0.0;
+
+            const double* phi_row =
+                phi + j * cols_ + start_idx;
+
+            const int iw_lo =
+                trace_lo[iw_start];
+
+#pragma omp simd reduction(+:tmp)
+            for(int iw = 0; iw < nw; ++iw)
+            {
+                tmp += phi_row[iw]
+                     * wfc[iw_lo + iw];
+            }
+
+            rho[mgrid_lidx_[j]] += tmp;
+        }
+    }
+}
 }
 
 void PhiOperator::cal_env_k(
@@ -250,4 +310,4 @@ void PhiOperator::init_atom_pair_idx_()
     }
 }
 
-} // namespace ModuleGint
\ No newline at end of file
+} // namespace ModuleGint
diff --git a/source/source_pw/module_stodft/sto_wf.cpp b/source/source_pw/module_stodft/sto_wf.cpp
index 2fca88e4824..c8ac1b3bdd0 100644
--- a/source/source_pw/module_stodft/sto_wf.cpp
+++ b/source/source_pw/module_stodft/sto_wf.cpp
@@ -63,14 +63,14 @@ void Stochastic_WF<T, Device>::clean_chiallorder()
 template <typename T, typename Device>
 void Stochastic_WF<T, Device>::init_sto_orbitals(const int seed_in)
 {
-    const unsigned int rank_seed_offset = 10000;
+    unsigned int seed_scale=10000; // seed_scale is defined to avoid repetition
     if (seed_in == 0 || seed_in == -1)
     {
-        srand(static_cast<unsigned int>(time(nullptr)) + GlobalV::MY_RANK * rank_seed_offset); // GlobalV global variables are reserved
+        srand(static_cast<unsigned>(time(nullptr)) + GlobalV::MY_RANK * seed_scale); // GlobalV global variables are reserved
     }
     else
     {
-        srand(static_cast<unsigned int>(std::abs(seed_in)) + (GlobalV::MY_BNDGROUP * GlobalV::NPROC_IN_BNDGROUP + GlobalV::RANK_IN_BPGROUP) * rank_seed_offset);
+        srand(static_cast<unsigned>(std::abs(seed_in)) + (GlobalV::MY_BNDGROUP * GlobalV::NPROC_IN_BNDGROUP + GlobalV::RANK_IN_BPGROUP) * seed_scale);
     }
 
     this->allocate_chi0();