diff --git a/.gitignore b/.gitignore index 8cbc70e15..a0defafff 100644 --- a/.gitignore +++ b/.gitignore @@ -21,5 +21,9 @@ build*/ .vscode .cache +# IDEA IDE +.idea* +cmake-build-* + # mac .DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt index 54c104885..29cc5bdd8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,13 @@ # ------------------------------------------------------------------------------ cmake_minimum_required(VERSION 3.24) +# Note: when CUDA is enabled, host LTO across nvcc-generated fat binaries +# fails because of repeated `fatbinData` symbols. We therefore disable LTO +# at the IPPL library target level (see `target_compile_options(ippl ... -fno-lto)` +# applied in src/CMakeLists.txt when CUDA is in IPPL_PLATFORMS) instead of +# globally. Keeping it scoped lets pure-host code in alpine/tests benefit +# from LTO when explicitly requested. + # ------------------------------------------------------------------------------ # Policies - use the latest of everything # ------------------------------------------------------------------------------ @@ -52,9 +59,8 @@ option(IPPL_MARK_FAILING_TESTS OFF) option(IPPL_ENABLE_SCRIPTS "Generate job script templates for some benchmarks/tests" OFF) -# "Build IPPL as a shared library (ON) or static library (OFF)" OFF) if(IPPL_DYL) -# set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE) message(WARNING "IPPL_DYL is deprecated; use -# -DBUILD_SHARED_LIBS=ON instead.") endif() +option(IPPL_ENABLE_FINUFFT "Enable (CU)FINUFFT" OFF) +option(IPPL_ENABLE_CUFFTMP "Enable cuFFTMp distributed FFT backend" OFF) # ------------------------------------------------------------------------------ # Setup Output directories @@ -87,6 +93,8 @@ include(Messages) include(CompilerOptions) include(Platforms) include(Dependencies) +include(AutoTunePresets) +ippl_configure_autotune_presets() include(FailingTests) # ------------------------------------------------------------------------------ diff --git a/alpine/AlpineManager.h 
b/alpine/AlpineManager.h index ff5ff85c5..36b265fac 100644 --- a/alpine/AlpineManager.h +++ b/alpine/AlpineManager.h @@ -96,6 +96,10 @@ class AlpineManager : public ippl::PicManager, void setTime(double time_) { time_m = time_; } + int getIt() const { return it_m; } + + void setIt(int it) { it_m = it; } + std::vector getPreconditionerParams() const { return preconditioner_params_m; }; virtual void dump(){/* default does nothing */}; diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h index 7525f9af1..0b2199a12 100644 --- a/alpine/BumponTailInstabilityManager.h +++ b/alpine/BumponTailInstabilityManager.h @@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + auto R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + auto P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager { // sample first nlocBulk with muBulk as mean velocity mu[Dim - 1] = muBulk_m; Kokkos::parallel_for(Kokkos::RangePolicy(0, nlocBulk), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); // sample remaining with muBeam as mean velocity mu[Dim - 1] = muBeam_m; Kokkos::parallel_for(Kokkos::RangePolicy(nlocBulk, nlocal), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/CMakeLists.txt b/alpine/CMakeLists.txt index ff9856ba3..709778851 100644 --- a/alpine/CMakeLists.txt +++ b/alpine/CMakeLists.txt @@ -19,8 +19,9 @@ if(IPPL_ENABLE_TESTS) # Landau will write a CSV file to the data directory that we will validate later make_directory("${PROJECT_BINARY_DIR}/alpine/data") # Add the test - 
add_ippl_integration_test(LandauDamping + add_ippl_integration_test(LandauDamping ARGS "16" "16" "16" "10000000" "25" "FFT" "0.01" "LeapFrog" "--overallocate" "2.0" "--info" "10" + TIMEOUT 900 LABELS alpine integration WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/alpine/") # cmake-format: on @@ -31,4 +32,6 @@ endif() add_alpine_example(PenningTrap) add_alpine_example(BumponTailInstability) +add_subdirectory(ElectrostaticPIF) + add_subdirectory(validation) diff --git a/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp new file mode 100644 index 000000000..bf36395b2 --- /dev/null +++ b/alpine/ElectrostaticPIF/BumponTailInstabilityPIF.cpp @@ -0,0 +1,376 @@ +// Electrostatic Two-stream/Bump-on-tail instability test with Particle-in-Fourier schemes +// Usage: +// srun ./BumponTailInstabilityPIF
+// <nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5 +// nx = No. of Fourier modes in the x-direction; ny = No. of Fourier modes +// in the y-direction; nz = No. of Fourier modes in the z-direction; Np = Total no. of +// macro-particles in the simulation; Nt = Number of time steps; dt = Time step size; +// ShapeType = Shape function type (B-spline only for the moment); +// degree = B-spline degree (-1 for delta function); +// tol = tolerance of NUFFT +// Example: +// srun ./BumponTailInstabilityPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 +// +// Copyright (c) 2023, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see <https://www.gnu.org/licenses/>. 
+// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Utility/IpplTimings.h" + +#include "ChargedParticlesPIF.hpp" + +template +struct Newton1D { + double tol = 1e-12; + int max_iter = 20; + double pi = std::acos(-1.0); + + T k, delta, u; + + KOKKOS_INLINE_FUNCTION Newton1D() {} + + KOKKOS_INLINE_FUNCTION Newton1D(const T& k_, const T& delta_, const T& u_) + : k(k_) + , delta(delta_) + , u(u_) {} + + KOKKOS_INLINE_FUNCTION ~Newton1D() {} + + KOKKOS_INLINE_FUNCTION T f(T& x) { + T F; + F = x + (delta * (std::sin(k * x) / k)) - u; + return F; + } + + KOKKOS_INLINE_FUNCTION T fprime(T& x) { + T Fprime; + Fprime = 1 + (delta * std::cos(k * x)); + return Fprime; + } + + KOKKOS_FUNCTION + void solve(T& x) { + int iterations = 0; + while (iterations < max_iter && std::fabs(f(x)) > tol) { + x = x - (f(x) / fprime(x)); + iterations += 1; + } + } +}; + +template +struct generate_random { + using view_type = typename ippl::detail::ViewType::view_type; + using value_type = typename T::value_type; + // Output View for the random numbers + view_type x, v; + + // The GeneratorPool + GeneratorPool rand_pool; + + value_type delta, sigma, muBulk, muBeam; + size_type nlocBulk; + + T k, minU, maxU; + + // Initialize all members + generate_random(view_type x_, view_type v_, GeneratorPool rand_pool_, value_type& delta_, T& k_, + value_type& sigma_, value_type& muBulk_, value_type& muBeam_, + size_type& nlocBulk_, T& minU_, T& maxU_) + : x(x_) + , v(v_) + , rand_pool(rand_pool_) + , delta(delta_) + , sigma(sigma_) + , muBulk(muBulk_) + , muBeam(muBeam_) + , nlocBulk(nlocBulk_) + , k(k_) + , minU(minU_) + , maxU(maxU_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const { + // Get a random number state from the pool for the active thread + typename GeneratorPool::generator_type rand_gen = rand_pool.get_state(); + + bool isBeam = (i >= nlocBulk); + + value_type muZ = (value_type)(((!isBeam) * muBulk) + (isBeam * muBeam)); + + for 
(unsigned d = 0; d < Dim - 1; ++d) { + x(i)[d] = rand_gen.drand(minU[d], maxU[d]); + v(i)[d] = rand_gen.normal(0.0, sigma); + } + v(i)[Dim - 1] = rand_gen.normal(muZ, sigma); + + value_type u = rand_gen.drand(minU[Dim - 1], maxU[Dim - 1]); + x(i)[Dim - 1] = u / (1 + delta); + Newton1D solver(k[Dim - 1], delta, u); + solver.solve(x(i)[Dim - 1]); + + // Give the state back, which will allow another thread to acquire it + rand_pool.free_state(rand_gen); + } +}; + +double CDF(const double& x, const double& delta, const double& k, const unsigned& dim) { + bool isDimZ = (dim == (Dim - 1)); + double cdf = x + (double)(isDimZ * ((delta / k) * std::sin(k * x))); + return cdf; +} + +const char* TestName = "TwoStreamInstabilityPIF"; + +int main(int argc, char* argv[]) { + ippl::initialize(argc, argv); + { + Inform msg(TestName); + Inform msg2all(TestName, INFORM_ALL_NODES); + + ippl::Vector nr = {std::atoi(argv[1]), std::atoi(argv[2]), std::atoi(argv[3])}; + + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("mainTimer"); + static IpplTimings::TimerRef particleCreation = IpplTimings::getTimer("particlesCreation"); + static IpplTimings::TimerRef dumpDataTimer = IpplTimings::getTimer("dumpData"); + static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("kick"); + static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("drift"); + static IpplTimings::TimerRef BCTimer = IpplTimings::getTimer("particleBC"); + static IpplTimings::TimerRef initializeShapeFunctionPIF = + IpplTimings::getTimer("initializeShapeFunctionPIF"); + + IpplTimings::startTimer(mainTimer); + + const size_type totalP = std::atoll(argv[4]); + const unsigned int nt = std::atoi(argv[5]); + const double dt = std::atof(argv[6]); + + using bunch_type = ChargedParticlesPIF; + + std::unique_ptr P; + + ippl::NDIndex domain; + for (unsigned i = 0; i < Dim; i++) { + domain[i] = ippl::Index(nr[i]); + } + + std::array isParallel; // Specifies SERIAL, PARALLEL dims + isParallel.fill(false); + + // 
create mesh and layout objects for this problem domain + Vector_t kw; + double sigma, muBulk, muBeam, epsilon, delta; + + if (std::strcmp(TestName, "TwoStreamInstabilityPIF") == 0) { + // Parameters for two stream instability as in + // https://www.frontiersin.org/articles/10.3389/fphy.2018.00105/full + kw = {0.5, 0.5, 0.5}; + sigma = 0.1; + epsilon = 0.5; + muBulk = -pi / 2.0; + muBeam = pi / 2.0; + delta = 0.01; + } else if (std::strcmp(TestName, "BumponTailInstabilityPIF") == 0) { + kw = {0.21, 0.21, 0.21}; + sigma = 1.0 / std::sqrt(2.0); + epsilon = 0.1; + muBulk = 0.0; + muBeam = 4.0; + delta = 0.01; + } else { + // Default value is two stream instability + kw = {0.5, 0.5, 0.5}; + sigma = 0.1; + epsilon = 0.5; + muBulk = -pi / 2.0; + muBeam = pi / 2.0; + delta = 0.01; + } + + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw; + Vector_t length = rmax - rmin; + double dx = rmax[0] / nr[0]; + double dy = rmax[1] / nr[1]; + double dz = rmax[2] / nr[2]; + + Vector_t hr = {dx, dy, dz}; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + + Mesh_t mesh(domain, hr, origin); + FieldLayout_t FL(*ippl::Comm, domain, isParallel); + PLayout_t PL(FL, mesh); + + double factorConf = 1.0 / ippl::Comm->size(); + double factorVelBulk = 1.0 - epsilon; + double factorVelBeam = 1.0 - factorVelBulk; + size_type nlocBulk = (size_type)(factorConf * factorVelBulk * totalP); + size_type nlocBeam = (size_type)(factorConf * factorVelBeam * totalP); + size_type nloc = nlocBulk + nlocBeam; + size_type Total_particles = 0; + + MPI_Allreduce(&nloc, &Total_particles, 1, MPI_UNSIGNED_LONG, MPI_SUM, + ippl::Comm->getCommunicator()); + + msg << TestName << endl + << "nt " << nt << " Np= " << Total_particles << " Fourier modes = " << nr << endl; + + // Q = -\int\int f dx dv + double Q = -rmax[0] * rmax[1] * rmax[2]; + P = std::make_unique(PL, hr, rmin, rmax, isParallel, Q, Total_particles); + + P->nr_m = nr; + + P->rho_m.initialize(mesh, FL); + P->Sk_m.initialize(mesh, FL); + + 
//////////////////////////////////////////////////////////// + // Initialize an FFT object for getting rho in real space and + // doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for (unsigned d = 0; d < Dim; ++d) { + domainPIFhalf[d] = ippl::Index(domain[d].length()); + } + + FieldLayout_t FLPIFhalf(*ippl::Comm, domainPIFhalf, isParallel); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; + ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + ippl::Vector hFourier = {2 * pi / length[0], 2 * pi / length[1], + 2 * pi / length[2]}; + ippl::Vector originFourier = {-pi / hr[0], -pi / hr[1], -pi / hr[2]}; + Mesh_t meshFourier(domain, hFourier, originFourier); + + P->rhoPIFreal_m.initialize(mesh, FL); + P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + P->rhoPIFFourierMag_m.initialize(meshFourier, FL); + + // P->fft_mp = std::make_shared(FL, FLPIFhalf, fftParams); + // P->fft_mp = std::make_shared(FLPIFhalf, fftParams); + + //////////////////////////////////////////////////////////// + + P->time_m = 0.0; + + P->shapetype_m = argv[7]; + P->shapedegree_m = std::atoi(argv[8]); + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d < Dim; ++d) { + minU[d] = CDF(rmin[d], delta, kw[d], d); + maxU[d] = CDF(rmax[d], delta, kw[d], d); + } + + + P->create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank())); + Kokkos::parallel_for(nloc, generate_random, Dim>( + P->R.getView(), P->P.getView(), rand_pool64, delta, kw, + sigma, muBulk, muBeam, nlocBulk, minU, maxU)); + + Kokkos::fence(); + ippl::Comm->barrier(); + IpplTimings::stopTimer(particleCreation); + + 
P->q = P->Q_m / Total_particles; + msg << "particles created and initial conditions assigned " << endl; + + IpplTimings::startTimer(initializeShapeFunctionPIF); + P->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + + double tol = std::atof(argv[9]); + P->initNUFFT(FL, tol); + + P->scatter(); + + P->gather(); + + IpplTimings::startTimer(dumpDataTimer); + P->dumpBumponTail(); + P->dumpEnergy(); + IpplTimings::stopTimer(dumpDataTimer); + + // begin main timestep loop + msg << "Starting iterations ..." << endl; + for (unsigned int it = 0; it < nt; it++) { + // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration + // Here, we assume a constant charge-to-mass ratio of -1 for + // all the particles hence eliminating the need to store mass as + // an attribute + // kick + + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + // drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + + // Apply particle BC + IpplTimings::startTimer(BCTimer); + PL.applyBC(P->R, PL.getRegionLayout().getDomain()); + IpplTimings::stopTimer(BCTimer); + + // scatter the charge onto the underlying grid + P->scatter(); + + // Solve for and gather E field + P->gather(); + + // kick + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + P->dumpBumponTail(); + P->dumpEnergy(); + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl; + } + + msg << "BumponTailInstability: End." 
<< endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + } + ippl::finalize(); + + return 0; +} diff --git a/alpine/ElectrostaticPIF/CMakeLists.txt b/alpine/ElectrostaticPIF/CMakeLists.txt new file mode 100644 index 000000000..2d052dee7 --- /dev/null +++ b/alpine/ElectrostaticPIF/CMakeLists.txt @@ -0,0 +1,47 @@ +file (RELATIVE_PATH _relPath "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}") +message (STATUS "Adding index test found in ${_relPath}") + +include_directories ( + ${CMAKE_SOURCE_DIR}/src +) + +link_directories ( + ${CMAKE_CURRENT_SOURCE_DIR} + ${Kokkos_DIR}/.. +) + +set (IPPL_LIBS ippl ${MPI_CXX_LIBRARIES}) +set (COMPILE_FLAGS ${OPAL_CXX_FLAGS}) + +add_executable (LandauDampingPIF LandauDampingPIF.cpp) +target_link_libraries (LandauDampingPIF PRIVATE ${IPPL_LIBS}) + +if(IPPL_ENABLE_FINUFFT) + target_link_libraries(LandauDampingPIF PRIVATE cufinufft finufft) +endif() + +add_executable (BumponTailInstabilityPIF BumponTailInstabilityPIF.cpp) +target_link_libraries (BumponTailInstabilityPIF PRIVATE ${IPPL_LIBS}) + +if(IPPL_ENABLE_FINUFFT) + target_link_libraries(BumponTailInstabilityPIF PRIVATE cufinufft finufft) +endif() + +add_executable (PenningTrapPIF PenningTrapPIF.cpp) +target_link_libraries (PenningTrapPIF PRIVATE ${IPPL_LIBS}) + +if(IPPL_ENABLE_FINUFFT) + target_link_libraries(PenningTrapPIF PRIVATE cufinufft finufft) +endif() + +# LandauDampingPIF supports both the upsampled (default) and the pruned NUFFT +# pipelines via an optional 10th positional argument ("pruned"); no separate +# executable is required. 
+# vi: set et ts=4 sw=4 sts=4: + +# Local Variables: +# mode: cmake +# cmake-tab-width: 4 +# indent-tabs-mode: nil +# require-final-newline: nil +# End: diff --git a/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp new file mode 100644 index 000000000..4015b6bfb --- /dev/null +++ b/alpine/ElectrostaticPIF/ChargedParticlesPIF.hpp @@ -0,0 +1,631 @@ +// ChargedParticlesPIF header file +// Defines a particle attribute for charged particles to be used in +// test programs +// +// Copyright (c) 2021 Paul Scherrer Institut, Villigen PSI, Switzerland +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see <https://www.gnu.org/licenses/>. 
+// + +#include "Ippl.h" + +// dimension of our positions +constexpr unsigned Dim = 3; +using Mesh_t = ippl::UniformCartesian; +using Centering_t = Mesh_t::DefaultCentering; + +// some typedefs +typedef ippl::ParticleSpatialLayout PLayout_t; +typedef ippl::UniformCartesian Mesh_t; +typedef ippl::FieldLayout FieldLayout_t; + +using size_type = ippl::detail::size_type; + +template +using Vector = ippl::Vector; + +template +using Field = ippl::Field; + +template +using ParticleAttrib = ippl::ParticleAttrib; + +typedef Vector Vector_t; +typedef Field Field_t; +typedef Field, Dim, Mesh_t, Centering_t> CxField_t; +typedef Field::uniform_type VField_t; + +typedef ippl::FFT FFT_t; + +const double pi = std::acos(-1.0); + +// Test programs have to define this variable for VTK dump purposes +extern const char* TestName; + +void dumpVTK(Field_t& rho, int nx, int ny, int nz, int iteration, double dx, double dy, double dz) { + typename Field_t::view_type::host_mirror_type host_view = rho.getHostMirror(); + + std::stringstream fname; + fname << "data/scalar_"; + fname << std::setw(4) << std::setfill('0') << iteration; + fname << ".vtk"; + + Kokkos::deep_copy(host_view, rho.getView()); + + Inform vtkout(NULL, fname.str().c_str(), Inform::OVERWRITE); + vtkout.precision(10); + vtkout.setf(std::ios::scientific, std::ios::floatfield); + + // start with header + vtkout << "# vtk DataFile Version 2.0" << endl; + vtkout << TestName << endl; + vtkout << "ASCII" << endl; + vtkout << "DATASET STRUCTURED_POINTS" << endl; + vtkout << "DIMENSIONS " << nx + 3 << " " << ny + 3 << " " << nz + 3 << endl; + vtkout << "ORIGIN " << -dx << " " << -dy << " " << -dz << endl; + vtkout << "SPACING " << dx << " " << dy << " " << dz << endl; + vtkout << "CELL_DATA " << (nx + 2) * (ny + 2) * (nz + 2) << endl; + + vtkout << "SCALARS Rho float" << endl; + vtkout << "LOOKUP_TABLE default" << endl; + for (int z = 0; z < nz + 2; z++) { + for (int y = 0; y < ny + 2; y++) { + for (int x = 0; x < nx + 2; x++) { + 
vtkout << host_view(x, y, z) << endl; + } + } + } +} + +/*! + * @class ChargedParticlesPIF + * @brief Particle bunch used by the Particle-in-Fourier example apps. + * + * Holds Fourier-mode density fields (@c rho_m, @c rhoDFT_m, ...), the + * forward / inverse NUFFT plans, and per-particle attributes for charge, + * velocity, and the gathered electric field. Used by LandauDampingPIF / + * BumponTailInstabilityPIF / PenningTrapPIF. + * + * Supports two NUFFT pipelines, selected via @c useUpsampledInputs (default + * @c true): the upsampled pipeline transforms a 2x grid for accuracy; the + * "pruned" pipeline (false) transforms only the lowest n_modes per axis on + * the original grid for speed. Construct with @c useUpsampledInputs=false + * for the pruned variant. + * + * @tparam PLayout Particle spatial layout type (typically + * ippl::ParticleSpatialLayout). + */ +template +class ChargedParticlesPIF : public ippl::ParticleBase { +public: + CxField_t rho_m; + CxField_t rhoPIFhalf_m; + Field_t rhoPIFreal_m; + Field_t rhoPIFFourierMag_m; + CxField_t rhoDFT_m; + Field_t Sk_m; + + Vector nr_m; + + std::array decomp_m; + + Vector_t hr_m; + Vector_t rmin_m; + Vector_t rmax_m; + + double Q_m; + + size_type Np_m; + + double time_m; + + double rhoNorm_m; + + std::string shapetype_m; + + int shapedegree_m; + //std::shared_ptr fft_mp; + + // NUFFT pipeline selector: true -> 2x-upsampled grid (default); + // false -> "pruned" mode (only the lowest n_modes per axis transformed). 
+ bool useUpsampledInputs_m = true; + + std::shared_ptr> nufftType1_mp, nufftType2_mp; + +public: + ParticleAttrib q; // charge + typename ippl::ParticleBase::particle_position_type P; // particle velocity + typename ippl::ParticleBase::particle_position_type + E; // electric field at particle position + + /* + This constructor is mandatory for all derived classes from + ParticleBase as the bunch buffer uses this + */ + ChargedParticlesPIF(PLayout& pl) + : ippl::ParticleBase(pl) { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + } + + ChargedParticlesPIF(PLayout& pl, Vector_t hr, Vector_t rmin, Vector_t rmax, + std::array decomp, double Q, size_type Np, + bool useUpsampledInputs = true) + : ippl::ParticleBase(pl) + , hr_m(hr) + , rmin_m(rmin) + , rmax_m(rmax) + , Q_m(Q) + , Np_m(Np) + , useUpsampledInputs_m(useUpsampledInputs) { + // register the particle attributes + this->addAttribute(q); + this->addAttribute(P); + this->addAttribute(E); + setupBCs(); + for (unsigned int i = 0; i < Dim; i++) + decomp_m[i] = decomp[i]; + } + + ~ChargedParticlesPIF() {} + + void setupBCs() { setBCAllPeriodic(); } + + void initNUFFT(FieldLayout_t& FL, double& tol) { + ippl::ParameterList fftParams1, fftParams2; + + fftParams1.add("tolerance", tol); + fftParams2.add("tolerance", tol); +#ifdef FINUFFT_USE_CUDA + fftParams1.add("gpu_method", 2); + fftParams1.add("gpu_sort", 0); + fftParams1.add("gpu_kerevalmeth", 1); + fftParams1.add("gpu_binsizex", 8); + fftParams1.add("gpu_binsizey", 8); + fftParams1.add("gpu_binsizez", 2); + fftParams1.add("gpu_maxsubprobsize", 1024); + + fftParams2.add("gpu_method", 2); + fftParams2.add("gpu_sort", 0); + fftParams2.add("gpu_kerevalmeth", 1); + fftParams2.add("gpu_binsizex", 8); + fftParams2.add("gpu_binsizey", 8); + fftParams2.add("gpu_binsizez", 2); + fftParams2.add("gpu_maxsubprobsize", 1024); + +#else + fftParams1.add("spread_kerevalmeth", 1); + fftParams1.add("spread_sort", 2); + 
fftParams1.add("nthreads", 0); + + fftParams2.add("spread_kerevalmeth", 1); + fftParams2.add("spread_sort", 2); + fftParams2.add("nthreads", 0); +#endif + + fftParams1.add("use_finufft_defaults", false); + fftParams2.add("use_finufft_defaults", false); + fftParams1.add("use_kokkos_nufft", false); + fftParams2.add("use_kokkos_nufft", false); + fftParams1.add("use_upsampled_inputs", useUpsampledInputs_m); + fftParams2.add("use_upsampled_inputs", useUpsampledInputs_m); + // fftParams.add("use_cufinufft_defaults", true); + + nufftType1_mp = std::make_shared>( + FL, this->getLocalNum(), 1, fftParams1); + nufftType2_mp = std::make_shared>( + FL, this->getLocalNum(), 2, fftParams2); + } + + void gather() { + gatherPIFNUFFT(this->E, rho_m, Sk_m, this->R, nufftType2_mp.get(), q); + // gatherPIFNUDFT(this->E, rho_m, Sk_m, this->R); + + // Set the charge back to original as we used this view as a + // temporary buffer during gather + q = Q_m / Np_m; + } + + void scatter() { + Inform m("scatter "); + rho_m = {0.0, 0.0}; + scatterPIFNUFFT(q, rho_m, Sk_m, this->R, nufftType1_mp.get()); + // rho_m = {0.0, 0.0}; + // scatterPIFNUDFT(q, rho_m, Sk_m, this->R); + + // dumpFieldData(); + + rho_m = + rho_m / ((rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2])); + } + + void dumpLandau() { + double fieldEnergy = 0.0; + double ExAmp = 0.0; + + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + Vector Len; + Vector N; + + for (unsigned d = 0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce( + "Ex energy and Max", mdrange_type({0, 0, 0}, {N[0], N[1], N[2]}), + KOKKOS_LAMBDA(const int i, const int j, 
const int k, double& tlSum, double& tlMax) { + Vector iVec = {i, j, k}; + Vector kVec; + double Dr = 0.0; + for (size_t d = 0; d < Dim; ++d) { + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + bool shift = (iVec[d] > (N[d] / 2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + auto rho = rhoview(i + nghost, j + nghost, k + nghost); + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[0] * rho * factor); + double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + + tlSum += myVal; + + double myValMax = std::sqrt(myVal); + + if (myValMax > tlMax) + tlMax = myValMax; + }, + Kokkos::Sum(fieldEnergy), Kokkos::Max(ExAmp)); + + Kokkos::fence(); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + fieldEnergy *= volume; + + if (ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldLandau_"; + fname << ippl::Comm->size(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if (time_m == 0.0) { + csvout << "time, Ex_field_energy, Ex_max_norm" << endl; + } + + csvout << time_m << " " << fieldEnergy << " " << ExAmp << endl; + } + + ippl::Comm->barrier(); + } + + void dumpBumponTail() { + double fieldEnergy = 0.0; + double EzAmp = 0.0; + + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + const auto& lDom = layout.getLocalNDIndex(); + Vector Len; + Vector N; + + for (unsigned d = 0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + Kokkos::complex imag = {0.0, 1.0}; + 
double pi = std::acos(-1.0); + Kokkos::parallel_reduce( + "Ez energy and Max", mdrange_type({nghost, nghost, nghost}, {rhoview.extent(0)-nghost, rhoview.extent(1)-nghost, rhoview.extent(2)-nghost}), + KOKKOS_LAMBDA(const int i, const int j, const int k, double& tlSum, double& tlMax) { + Vector iVec = {i, j, k}; + for (unsigned d = 0; d < Dim; ++d) { + iVec[d] = iVec[d] - nghost + lDom[d].first(); + } + Vector kVec; + double Dr = 0.0; + for (size_t d = 0; d < Dim; ++d) { + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + bool shift = (iVec[d] > (N[d] / 2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + auto rho = rhoview(i, j, k); + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[2] * rho * factor); + double myVal = Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + + tlSum += myVal; + + double myValMax = std::sqrt(myVal); + + if (myValMax > tlMax) + tlMax = myValMax; + }, + Kokkos::Sum(fieldEnergy), Kokkos::Max(EzAmp)); + + Kokkos::fence(); + double globalfieldEnergy = 0.0; + double globalEzAmp = 0.0; + ippl::Comm->reduce(fieldEnergy, globalfieldEnergy, 1, std::plus()); + ippl::Comm->reduce(EzAmp, globalEzAmp, 1, std::greater()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + globalfieldEnergy *= volume; + + if (ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/FieldBumponTail_"; + fname << ippl::Comm->size(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(10); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if (time_m == 0.0) { + csvout << "time, Ez_field_energy, Ez_max_norm" << endl; + } + + csvout << time_m << " " << globalfieldEnergy << " " << globalEzAmp << endl; + } + + ippl::Comm->barrier(); + } + + void dumpEnergy() { + double potentialEnergy, kineticEnergy; + double 
temp = 0.0; + + auto rhoview = rho_m.getView(); + const int nghost = rho_m.getNghost(); + using mdrange_type = Kokkos::MDRangePolicy>; + + const FieldLayout_t& layout = rho_m.getLayout(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + const auto& lDom = layout.getLocalNDIndex(); + Vector Len; + Vector N; + + for (unsigned d = 0; d < Dim; ++d) { + N[d] = domain[d].length(); + Len[d] = dx[d] * N[d]; + } + + Kokkos::complex imag = {0.0, 1.0}; + double pi = std::acos(-1.0); + Kokkos::parallel_reduce( + "Potential energy", mdrange_type({nghost, nghost, nghost}, {rhoview.extent(0)-nghost, rhoview.extent(1)-nghost, rhoview.extent(2)-nghost}), + KOKKOS_LAMBDA(const int i, const int j, const int k, double& valL) { + Vector iVec = {i, j, k}; + for (unsigned d = 0; d < Dim; ++d) { + iVec[d] = iVec[d] - nghost + lDom[d].first(); + } + Vector kVec; + double Dr = 0.0; + for (size_t d = 0; d < Dim; ++d) { + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + bool shift = (iVec[d] > (N[d] / 2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + Dr += kVec[d] * kVec[d]; + } + + Kokkos::complex Ek = {0.0, 0.0}; + double myVal = 0.0; + auto rho = rhoview(i, j, k); + for (size_t d = 0; d < Dim; ++d) { + bool isNotZero = (Dr != 0.0); + double factor = isNotZero * (1.0 / (Dr + ((!isNotZero) * 1.0))); + Ek = -(imag * kVec[d] * rho * factor); + myVal += Ek.real() * Ek.real() + Ek.imag() * Ek.imag(); + } + + valL += myVal; + }, + Kokkos::Sum(temp)); + + double globaltemp = 0.0; + ippl::Comm->reduce(temp, globaltemp, 1, std::plus()); + double volume = (rmax_m[0] - rmin_m[0]) * (rmax_m[1] - rmin_m[1]) * (rmax_m[2] - rmin_m[2]); + potentialEnergy = 0.5 * globaltemp * volume; + + auto Pview = P.getView(); + auto qView = q.getView(); + + temp = 0.0; + + Kokkos::parallel_reduce( + "Kinetic Energy", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL) { + double myVal = dot(Pview(i), 
Pview(i)).apply(); + myVal *= -qView(i); + valL += myVal; + }, + Kokkos::Sum(temp)); + + temp *= 0.5; + globaltemp = 0.0; + MPI_Reduce(&temp, &globaltemp, 1, MPI_DOUBLE, MPI_SUM, 0, ippl::Comm->getCommunicator()); + + kineticEnergy = globaltemp; + + Vector_t totalMomentum = 0.0; + + for (size_t d = 0; d < Dim; ++d) { + double tempD = 0.0; + Kokkos::parallel_reduce( + "Total Momentum", this->getLocalNum(), + KOKKOS_LAMBDA(const int i, double& valL) { valL += (-qView(i)) * Pview(i)[d]; }, + Kokkos::Sum(tempD)); + totalMomentum[d] = tempD; + } + + Vector_t globalMom; + + double magMomentum = 0.0; + for (size_t d = 0; d < Dim; ++d) { + MPI_Allreduce(&totalMomentum[d], &globalMom[d], 1, MPI_DOUBLE, MPI_SUM, + ippl::Comm->getCommunicator()); + magMomentum += globalMom[d] * globalMom[d]; + } + + magMomentum = std::sqrt(magMomentum); + + if (ippl::Comm->rank() == 0) { + std::stringstream fname; + fname << "data/Energy_"; + fname << ippl::Comm->size(); + fname << ".csv"; + + Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + csvout.precision(17); + csvout.setf(std::ios::scientific, std::ios::floatfield); + + if (time_m == 0.0) { + //csvout << "time, Potential energy, Kinetic energy, Total energy Total charge Total " + // "Momentum" + csvout << "time, Potential energy, Kinetic energy, Total energy Total " + "Momentum" + << endl; + } + + //csvout << time_m << " " << potentialEnergy << " " << kineticEnergy << " " + // << potentialEnergy + kineticEnergy << " " << charge << " " << magMomentum + // << endl; + csvout << time_m << " " << potentialEnergy << " " << kineticEnergy << " " + << potentialEnergy + kineticEnergy << " " << magMomentum + << endl; + } + + ippl::Comm->barrier(); + } + + void initializeShapeFunctionPIF() { + using mdrange_type = Kokkos::MDRangePolicy>; + auto Skview = Sk_m.getView(); + auto N = nr_m; + const int nghost = Sk_m.getNghost(); + const Mesh_t& mesh = rho_m.get_mesh(); + const Vector_t& dx = mesh.getMeshSpacing(); + const Vector_t& Len = 
rmax_m - rmin_m; + const double pi = std::acos(-1.0); + int order = shapedegree_m + 1; + const FieldLayout_t& layout = Sk_m.getLayout(); + const auto& lDom = layout.getLocalNDIndex(); + if (shapetype_m == "Gaussian") { + throw IpplException("initializeShapeFunctionPIF", + "Gaussian shape function not implemented yet"); + + } else if (shapetype_m == "B-spline") { + Kokkos::parallel_for( + "B-spline shape functions", + mdrange_type({nghost, nghost, nghost}, + {Skview.extent(0) - nghost, Skview.extent(1) - nghost, + Skview.extent(2) - nghost}), + KOKKOS_LAMBDA(const int i, const int j, const int k) { + Vector iVec = {i, j, k}; + for (unsigned d = 0; d < Dim; ++d) { + iVec[d] = iVec[d] - nghost + lDom[d].first(); + } + Vector kVec; + double Sk = 1.0; + for (size_t d = 0; d < Dim; ++d) { + //kVec[d] = 2 * pi / Len[d] * (iVec[d] - (N[d] / 2)); + bool shift = (iVec[d] > (N[d] / 2)); + kVec[d] = 2 * pi / Len[d] * (iVec[d] - shift * N[d]); + //Actual mesh spacing is twice the upsampled one + double khbytwo = (kVec[d] * dx[d] / 2) * 2; + bool isNotZero = (khbytwo != 0.0); + double factor = (1.0 / (khbytwo + ((!isNotZero) * 1.0))); + double arg = + isNotZero * (Kokkos::sin(khbytwo) * factor) + (!isNotZero) * 1.0; + // Fourier transform of CIC + Sk *= std::pow(arg, order); + } + Skview(i, j, k) = Sk; + }); + + } else { + throw IpplException("initializeShapeFunctionPIF", "Unrecognized shape function type"); + } + } + + void dumpFieldData() { + typename CxField_t::view_type::host_mirror_type rhoNUFFT_host = rho_m.getHostMirror(); + typename Field_t::view_type::host_mirror_type rhoNUFFT_real = rhoPIFreal_m.getHostMirror(); + Kokkos::deep_copy(rhoNUFFT_host, rho_m.getView()); + Kokkos::deep_copy(rhoNUFFT_real, rhoPIFreal_m.getView()); + const int nghost = rho_m.getNghost(); + std::stringstream pname; + pname << "data/FieldFFT_"; + pname << ippl::Comm->rank(); + pname << ".csv"; + Inform pcsvout(NULL, pname.str().c_str(), Inform::OVERWRITE, ippl::Comm->rank()); + 
pcsvout.precision(10); + pcsvout.setf(std::ios::scientific, std::ios::floatfield); + pcsvout << "rho" << endl; + for (int i = 0; i < nr_m[0]; i++) { + for (int j = 0; j < nr_m[1]; j++) { + for (int k = 0; k < nr_m[2]; k++) { + pcsvout << rhoNUFFT_host(i + nghost, j + nghost, k + nghost) << endl; + } + } + } + std::stringstream pname2; + pname2 << "data/Fieldreal_"; + pname2 << ippl::Comm->rank(); + pname2 << ".csv"; + Inform pcsvout2(NULL, pname2.str().c_str(), Inform::OVERWRITE, ippl::Comm->rank()); + pcsvout2.precision(10); + pcsvout2.setf(std::ios::scientific, std::ios::floatfield); + pcsvout2 << "rho" << endl; + for (int i = 0; i < nr_m[0]; i++) { + for (int j = 0; j < nr_m[1]; j++) { + for (int k = 0; k < nr_m[2]; k++) { + pcsvout2 << rhoNUFFT_real(i + nghost, j + nghost, k + nghost) << endl; + } + } + } + ippl::Comm->barrier(); + } + +private: + void setBCAllPeriodic() { this->setParticleBC(ippl::BC::PERIODIC); } +}; diff --git a/alpine/ElectrostaticPIF/LandauDampingPIF.cpp b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp new file mode 100644 index 000000000..fed2ceef8 --- /dev/null +++ b/alpine/ElectrostaticPIF/LandauDampingPIF.cpp @@ -0,0 +1,357 @@ +// Electrostatic Landau damping test with Particle-in-Fourier schemes +// Usage: +// srun ./LandauDampingPIF
<nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5 +// nx = No. of Fourier modes in the x-direction +// ny = No. of Fourier modes in the y-direction +// nz = No. of Fourier modes in the z-direction +// Np = Total no. of macro-particles in the simulation +// Nt = Number of time steps +// dt = Time step size +// ShapeType = Shape function type (only B-spline for the moment) +// degree = B-spline degree (-1 for delta function) +// tol = tolerance of NUFFT +// Example: +// srun ./LandauDampingPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 +// +// Copyright (c) 2022, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see <https://www.gnu.org/licenses/>. 
/**
 * Scalar Newton solver for the inverse of the perturbed-uniform CDF used to sample
 * particle positions:
 *
 *     f(x) = x + alpha * sin(k * x) / k - u = 0
 *
 * @tparam T floating-point type of the unknown and of the parameters.
 *
 * Members:
 *   tol      - iteration stops once |f(x)| <= tol
 *   max_iter - upper bound on the number of Newton updates
 *   pi       - kept for interface compatibility; not used by this solver
 *   k, alpha - wave number and perturbation amplitude
 *   u        - target CDF value
 */
template <typename T>
struct Newton1D {
    double tol   = 1e-12;
    int max_iter = 20;
    double pi    = std::acos(-1.0);

    T k, alpha, u;

    KOKKOS_INLINE_FUNCTION Newton1D() {}

    KOKKOS_INLINE_FUNCTION Newton1D(const T& k_, const T& alpha_, const T& u_)
        : k(k_)
        , alpha(alpha_)
        , u(u_) {}

    KOKKOS_INLINE_FUNCTION ~Newton1D() {}

    /// Residual f(x). Takes x by const reference so temporaries and const objects work.
    KOKKOS_INLINE_FUNCTION T f(const T& x) const {
        return x + (alpha * (std::sin(k * x) / k)) - u;
    }

    /// Derivative f'(x) = 1 + alpha * cos(k * x).
    KOKKOS_INLINE_FUNCTION T fprime(const T& x) const {
        return 1 + (alpha * std::cos(k * x));
    }

    /**
     * Refine x in place, starting from the caller's initial guess.
     *
     * Performs at most max_iter Newton updates and stops early once |f(x)| <= tol.
     * The residual is evaluated once per iterate and reused for both the convergence
     * check and the update, which yields the same iterates as evaluating it twice.
     */
    KOKKOS_FUNCTION
    void solve(T& x) const {
        T residual = f(x);
        for (int iterations = 0; iterations < max_iter && std::fabs(residual) > tol;
             ++iterations) {
            x        = x - (residual / fprime(x));
            residual = f(x);
        }
    }
};
/**
 * Un-normalised cumulative distribution of the 1D density 1 + alpha * cos(k * x),
 * measured from x = 0:
 *
 *     CDF(x) = x + (alpha / k) * sin(k * x)
 *
 * @param x     position at which to evaluate
 * @param alpha perturbation amplitude
 * @param k     wave number (must be non-zero, it is used as a divisor)
 * @return the value of the integral from 0 to x
 */
double CDF(const double& x, const double& alpha, const double& k) {
    const double amplitude = alpha / k;
    return x + amplitude * std::sin(k * x);
}
ippl::Comm->getCommunicator()); + + msg << "Landau damping" << endl + << "nt " << nt << " Np= " << Total_particles << " Fourier modes = " << nr << endl; + + using bunch_type = ChargedParticlesPIF; + + std::unique_ptr P; + + // Upsampled mode runs all transforms on a 2x grid (nr doubled); + // pruned mode keeps the original grid. nrOrig is the user-supplied + // resolution and is what the particle layout / NUFFT plans get bound + // to in either mode, but in upsampled mode the field storage and + // FieldLayout use the doubled nr to give the FFT room to upsample. + ippl::NDIndex domain; + ippl::NDIndex domainOrig; + for (unsigned i = 0; i < Dim; i++) { + nrOrig[i] = nr[i]; + if (useUpsampledInputs) { + nr[i] = 2 * nr[i]; + } + domain[i] = ippl::Index(nr[i]); + domainOrig[i] = ippl::Index(nrOrig[i]); + } + + std::array isParallel; // Specifies SERIAL, PARALLEL dims + isParallel.fill(true); + + // create mesh and layout objects for this problem domain + Vector_t kw = {0.5, 0.5, 0.5}; + double alpha = 0.05; + Vector_t rmin(0.0); + Vector_t rmax = 2 * pi / kw; + Vector_t length = rmax - rmin; + double dx = length[0] / nr[0]; + double dy = length[1] / nr[1]; + double dz = length[2] / nr[2]; + + Vector_t hr = {dx, dy, dz}; + Vector_t hrOrig = useUpsampledInputs ? Vector_t{2.0 * hr} : hr; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + + const bool isAllPeriodic = true; + Mesh_t mesh(domain, hr, origin); + Mesh_t meshOrig(domainOrig, hrOrig, origin); + + FieldLayout_t FL(*ippl::Comm, domain, isParallel, isAllPeriodic); + FieldLayout_t FLOrig(*ippl::Comm, domainOrig, isParallel, isAllPeriodic); + + // Particle layout binds to the original (un-upsampled) field layout so + // particle positions and the NUFFT plan see the same resolution; the + // upsampled FL is only used for the rho/Sk field storage. 
+ PLayout_t PL(FLOrig, meshOrig); + + // Q = -\int\int f dx dv + double Q = -length[0] * length[1] * length[2]; + P = std::make_unique(PL, hr, rmin, rmax, isParallel, Q, + Total_particles, useUpsampledInputs); + + P->nr_m = nr; + + P->rho_m.initialize(mesh, FL); + P->rhoDFT_m.initialize(mesh, FL); + P->Sk_m.initialize(mesh, FL); + + //////////////////////////////////////////////////////////// + // Initialize an FFT object for getting rho in real space and + // doing charge conservation check + + P->time_m = 0.0; + + P->shapetype_m = argv[7]; + P->shapedegree_m = std::atoi(argv[8]); + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d < Dim; ++d) { + minU[d] = CDF(rmin[d], alpha, kw[d]); + maxU[d] = CDF(rmax[d], alpha, kw[d]); + } + + + P->create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank())); + Kokkos::parallel_for( + nloc, generate_random, Dim>( + P->R.getView(), P->P.getView(), rand_pool64, alpha, kw, minU, maxU)); + + Kokkos::fence(); + ippl::Comm->barrier(); + IpplTimings::stopTimer(particleCreation); + + P->q = P->Q_m / Total_particles; + msg << "particles created and initial conditions assigned " << endl; + + IpplTimings::startTimer(initializeShapeFunctionPIF); + P->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + msg << "After init shape function " << endl; + + double tol = std::atof(argv[9]); + P->initNUFFT(FLOrig, tol); + msg << "After init NUFFT " << endl; + + P->update(); + msg << "After update " << endl; + P->scatter(); + msg << "After scatter " << endl; + + P->gather(); + msg << "After gather " << endl; + + IpplTimings::startTimer(dumpDataTimer); + P->dumpBumponTail(); + P->dumpEnergy(); + IpplTimings::stopTimer(dumpDataTimer); + + // begin main timestep loop + msg << "Starting iterations ..." 
<< endl; + int warmup = 3; + for (int it = -warmup; it < (int)nt; it++) { + if (it == 0) { + IpplTimings::resetAllTimers(); + IpplTimings::startTimer(mainTimer); + } + // LeapFrog time stepping https://en.wikipedia.org/wiki/Leapfrog_integration + // Here, we assume a constant charge-to-mass ratio of -1 for + // all the particles hence eliminating the need to store mass as + // an attribute + // kick + + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + // drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + + // Apply particle BC + //IpplTimings::startTimer(BCTimer); + //PL.applyBC(P->R, PL.getRegionLayout().getDomain()); + //IpplTimings::stopTimer(BCTimer); + + P->update(); + // scatter the charge onto the underlying grid + P->scatter(); + + // Solve for and gather E field + P->gather(); + + // kick + IpplTimings::startTimer(PTimer); + P->P = P->P - 0.5 * dt * P->E; + IpplTimings::stopTimer(PTimer); + + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + P->dumpBumponTail(); + P->dumpEnergy(); + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl; + } + + msg << "LandauDamping: End." << endl; + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + + std::string res_file = useUpsampledInputs ? "LandauDampingPIF" : "LandauDampingPIFPruned"; + res_file += std::to_string(ippl::Comm->size()); + res_file += ".csv"; + IpplTimings::dumpToCSV(res_file); + } + ippl::finalize(); + + return 0; +} diff --git a/alpine/ElectrostaticPIF/PenningTrapPIF.cpp b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp new file mode 100644 index 000000000..b27272264 --- /dev/null +++ b/alpine/ElectrostaticPIF/PenningTrapPIF.cpp @@ -0,0 +1,418 @@ +// Electrostatic Penning trap test with Particle-in-Fourier schemes +// Usage: +// srun ./PenningTrapPIF
<nx> <ny> <nz> <Np> <Nt> <dt> <ShapeType> <degree> <tol> --info 5 +// nx = No. of Fourier modes in the x-direction +// ny = No. of Fourier modes in the y-direction +// nz = No. of Fourier modes in the z-direction +// Np = Total no. of macro-particles in the simulation +// Nt = Number of time steps +// dt = Time step size +// ShapeType = Shape function type (only B-spline for the moment) +// degree = B-spline degree (-1 for delta function) +// tol = tolerance of NUFFT +// Example: +// srun ./PenningTrapPIF 32 32 32 655360 20 0.05 B-spline 1 1e-4 --info 5 +// +// Copyright (c) 2023, Sriramkrishnan Muralikrishnan, +// Jülich Supercomputing Centre, Jülich, Germany. +// All rights reserved +// +// This file is part of IPPL. +// +// IPPL is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// You should have received a copy of the GNU General Public License +// along with IPPL. If not, see <https://www.gnu.org/licenses/>. 
/**
 * Scalar Newton solver for the inverse of a Gaussian CDF, used to sample particle
 * positions from a normal distribution restricted to the domain:
 *
 *     f(x) = erf((x - mu) / (sigma * sqrt(2))) - 2 * u + 1 = 0
 *
 * @tparam T floating-point type of the unknown and of the parameters.
 *
 * Members:
 *   tol       - iteration stops once |f(x)| <= tol
 *   max_iter  - upper bound on the number of Newton updates
 *   pi        - the constant pi, used in the derivative
 *   mu, sigma - mean and standard deviation of the Gaussian
 *   u         - target CDF value
 */
template <typename T>
struct Newton1D {
    double tol   = 1e-12;
    int max_iter = 20;
    double pi    = std::acos(-1.0);

    T mu, sigma, u;

    KOKKOS_INLINE_FUNCTION Newton1D() {}

    KOKKOS_INLINE_FUNCTION Newton1D(const T& mu_, const T& sigma_, const T& u_)
        : mu(mu_)
        , sigma(sigma_)
        , u(u_) {}

    KOKKOS_INLINE_FUNCTION ~Newton1D() {}

    /// Residual f(x). Takes x by const reference so temporaries and const objects work.
    KOKKOS_INLINE_FUNCTION T f(const T& x) const {
        return std::erf((x - mu) / (sigma * std::sqrt(2.0))) - 2 * u + 1;
    }

    /// Derivative f'(x) = sqrt(2 / pi) / sigma * exp(-((x - mu) / sigma)^2 / 2).
    KOKKOS_INLINE_FUNCTION T fprime(const T& x) const {
        const T z = (x - mu) / sigma;
        // z * z instead of std::pow(z, 2): same value without a general pow call.
        return (1 / sigma) * std::sqrt(2 / pi) * std::exp(-0.5 * (z * z));
    }

    /**
     * Refine x in place, starting from the caller's initial guess.
     *
     * Performs at most max_iter Newton updates and stops early once |f(x)| <= tol.
     * The residual is evaluated once per iterate and reused for both the convergence
     * check and the update.
     */
    KOKKOS_FUNCTION
    void solve(T& x) const {
        T residual = f(x);
        for (int iterations = 0; (iterations < max_iter) && (std::fabs(residual) > tol);
             ++iterations) {
            x        = x - (residual / fprime(x));
            residual = f(x);
        }
    }
};
/**
 * Cumulative distribution function of a normal distribution.
 *
 * @param x     position at which to evaluate
 * @param mu    mean of the distribution
 * @param sigma standard deviation (must be non-zero, it is used as a divisor)
 * @return 0.5 * (1 + erf((x - mu) / (sigma * sqrt(2))))
 */
double CDF(const double& x, const double& mu, const double& sigma) {
    const double scaled = (x - mu) / (sigma * std::sqrt(2));
    return 0.5 * (1.0 + std::erf(scaled));
}
" << Total_particles << " Fourier modes = " << nr << endl; + + using bunch_type = ChargedParticlesPIF; + + std::shared_ptr P; + + ippl::NDIndex domain; + for (unsigned i = 0; i < Dim; i++) { + domain[i] = ippl::Index(nr[i]); + } + + std::array isParallel; // Specifies SERIAL, PARALLEL dims + isParallel.fill(false); + + // create mesh and layout objects for this problem domain + Vector_t rmin(0.0); + Vector_t rmax(25.0); + double dx = rmax[0] / nr[0]; + double dy = rmax[1] / nr[1]; + double dz = rmax[2] / nr[2]; + + Vector_t length = rmax - rmin; + + Vector_t mu, sd; + + for (unsigned d = 0; d < Dim; d++) { + mu[d] = 0.5 * length[d]; + } + // sd[0] = 0.15*length[0]; + // sd[1] = 0.05*length[1]; + // sd[2] = 0.20*length[2]; + sd[0] = 0.10 * 20.0; // length[0]; + sd[1] = 0.05 * 20.0; // length[1]; + sd[2] = 0.15 * 20.0; // length[2]; + + Vector_t hr = {dx, dy, dz}; + Vector_t origin = {rmin[0], rmin[1], rmin[2]}; + + Mesh_t mesh(domain, hr, origin); + FieldLayout_t FL(*ippl::Comm, domain, isParallel); + PLayout_t PL(FL, mesh); + + double Q = -1562.5; + double Bext = 5.0; + // P = std::make_unique(PL,hr,rmin,rmax,decomp,Q,Total_particles); + P = std::make_shared(PL, hr, rmin, rmax, isParallel, Q, Total_particles); + + P->nr_m = nr; + + P->rho_m.initialize(mesh, FL); + P->Sk_m.initialize(mesh, FL); + + //////////////////////////////////////////////////////////// + // Initialize an FFT object for getting rho in real space and + // doing charge conservation check + + ippl::ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", ippl::p2p_pl); + fftParams.add("r2c_direction", 0); + + ippl::NDIndex domainPIFhalf; + + for (unsigned d = 0; d < Dim; ++d) { + domainPIFhalf[d] = ippl::Index(domain[d].length()); + } + + FieldLayout_t FLPIFhalf(*ippl::Comm, domainPIFhalf, isParallel); + + ippl::Vector hDummy = {1.0, 1.0, 1.0}; 
+ ippl::Vector originDummy = {0.0, 0.0, 0.0}; + Mesh_t meshPIFhalf(domainPIFhalf, hDummy, originDummy); + + ippl::Vector hFourier = {2 * pi / length[0], 2 * pi / length[1], + 2 * pi / length[2]}; + ippl::Vector originFourier = {-pi / hr[0], -pi / hr[1], -pi / hr[2]}; + Mesh_t meshFourier(domain, hFourier, originFourier); + + P->rhoPIFreal_m.initialize(mesh, FL); + P->rhoPIFhalf_m.initialize(meshPIFhalf, FLPIFhalf); + P->rhoPIFFourierMag_m.initialize(meshFourier, FL); + + P->time_m = 0.0; + + P->shapetype_m = argv[7]; + P->shapedegree_m = std::atoi(argv[8]); + + IpplTimings::startTimer(particleCreation); + + Vector_t minU, maxU; + for (unsigned d = 0; d < Dim; ++d) { + minU[d] = CDF(rmin[d], mu[d], sd[d]); + maxU[d] = CDF(rmax[d], mu[d], sd[d]); + } + + + P->create(nloc); + Kokkos::Random_XorShift64_Pool<> rand_pool64((size_type)(42 + 100 * ippl::Comm->rank())); + Kokkos::parallel_for(nloc, + generate_random, Dim>( + P->R.getView(), P->P.getView(), rand_pool64, mu, sd, minU, maxU)); + + Kokkos::fence(); + ippl::Comm->barrier(); + IpplTimings::stopTimer(particleCreation); + + P->q = P->Q_m / Total_particles; + msg << "particles created and initial conditions assigned " << endl; + + IpplTimings::startTimer(initializeShapeFunctionPIF); + P->initializeShapeFunctionPIF(); + IpplTimings::stopTimer(initializeShapeFunctionPIF); + + double tol = std::atof(argv[9]); + P->initNUFFT(FL, tol); + + P->scatter(); + + P->gather(); + + IpplTimings::startTimer(dumpDataTimer); + // P->dumpEnergy(); +#ifdef ENABLE_CATALYST + P->rhoPIFreal_m = (1 / (hr[0] * hr[1] * hr[2])) * P->rhoPIFreal_m; + std::vector fields = { + {"rhoK", CatalystAdaptor::FieldVariant(&P->rhoPIFFourierMag_m)}, + {"rhoR", CatalystAdaptor::FieldVariant(&P->rhoPIFreal_m)}}; + CatalystAdaptor::Execute(0, P->time_m, Ippl::Comm->rank(), P, fields); +#endif + IpplTimings::stopTimer(dumpDataTimer); + + double alpha = -0.5 * dt; + double DrInv = 1.0 / (1 + (std::pow((alpha * Bext), 2))); + // begin main timestep loop + msg 
<< "Starting iterations ..." << endl; + for (unsigned int it = 0; it < nt; it++) { + // Staggered Leap frog or Boris algorithm as per + // https://www.sciencedirect.com/science/article/pii/S2590055219300526 + // eqns 4(a)-4(c). Note we don't use the Boris trick here and do + // the analytical matrix inversion which is not complex in this case. + // Here, we assume a constant charge-to-mass ratio of -1 for + // all the particles hence eliminating the need to store mass as + // an attribute + // kick + IpplTimings::startTimer(PTimer); + auto Rview = P->R.getView(); + auto Pview = P->P.getView(); + auto Eview = P->E.getView(); + double V0 = 30 * rmax[2]; + Kokkos::parallel_for( + "Kick1", P->getLocalNum(), KOKKOS_LAMBDA(const size_t j) { + double Eext_x = + -(Rview(j)[0] - 0.5 * rmax[0]) * (V0 / (2 * std::pow(rmax[2], 2))); + double Eext_y = + -(Rview(j)[1] - 0.5 * rmax[1]) * (V0 / (2 * std::pow(rmax[2], 2))); + double Eext_z = (Rview(j)[2] - 0.5 * rmax[2]) * (V0 / (std::pow(rmax[2], 2))); + + Eext_x += Eview(j)[0]; + Eext_y += Eview(j)[1]; + Eext_z += Eview(j)[2]; + + Pview(j)[0] += alpha * (Eext_x + Pview(j)[1] * Bext); + Pview(j)[1] += alpha * (Eext_y - Pview(j)[0] * Bext); + Pview(j)[2] += alpha * Eext_z; + }); + IpplTimings::stopTimer(PTimer); + + // drift + IpplTimings::startTimer(RTimer); + P->R = P->R + dt * P->P; + IpplTimings::stopTimer(RTimer); + + // Apply particle BC + IpplTimings::startTimer(BCTimer); + PL.applyBC(P->R, PL.getRegionLayout().getDomain()); + IpplTimings::stopTimer(BCTimer); + + // scatter the charge onto the underlying grid + P->scatter(); + + // Solve for and gather E field + P->gather(); + + // kick + IpplTimings::startTimer(PTimer); + auto R2view = P->R.getView(); + auto P2view = P->P.getView(); + auto E2view = P->E.getView(); + Kokkos::parallel_for( + "Kick2", P->getLocalNum(), KOKKOS_LAMBDA(const size_t j) { + double Eext_x = + -(R2view(j)[0] - 0.5 * rmax[0]) * (V0 / (2 * std::pow(rmax[2], 2))); + double Eext_y = + -(R2view(j)[1] - 
0.5 * rmax[1]) * (V0 / (2 * std::pow(rmax[2], 2))); + double Eext_z = (R2view(j)[2] - 0.5 * rmax[2]) * (V0 / (std::pow(rmax[2], 2))); + + Eext_x += E2view(j)[0]; + Eext_y += E2view(j)[1]; + Eext_z += E2view(j)[2]; + + P2view(j)[0] = + DrInv + * (P2view(j)[0] + + alpha * (Eext_x + P2view(j)[1] * Bext + alpha * Bext * Eext_y)); + P2view(j)[1] = + DrInv + * (P2view(j)[1] + + alpha * (Eext_y - P2view(j)[0] * Bext - alpha * Bext * Eext_x)); + P2view(j)[2] += alpha * Eext_z; + }); + IpplTimings::stopTimer(PTimer); + + P->time_m += dt; + IpplTimings::startTimer(dumpDataTimer); + // P->dumpEnergy(); +#ifdef ENABLE_CATALYST + P->rhoPIFreal_m = (1 / (hr[0] * hr[1] * hr[2])) * P->rhoPIFreal_m; + CatalystAdaptor::Execute(it, P->time_m, Ippl::Comm->rank(), P, fields); +#endif + IpplTimings::stopTimer(dumpDataTimer); + msg << "Finished time step: " << it + 1 << " time: " << P->time_m << endl; + } + + msg << TestName << " End." << endl; + +#ifdef ENABLE_CATALYST + CatalystAdaptor::Finalize(); +#endif + + IpplTimings::stopTimer(mainTimer); + IpplTimings::print(); + IpplTimings::print(std::string("timing.dat")); + } + ippl::finalize(); + return 0; +} diff --git a/alpine/LandauDamping.cpp b/alpine/LandauDamping.cpp index 718672513..2de69bfcd 100644 --- a/alpine/LandauDamping.cpp +++ b/alpine/LandauDamping.cpp @@ -47,6 +47,23 @@ int main(int argc, char* argv[]) { Inform msg(TestName); Inform msg2all(TestName, INFORM_ALL_NODES); + // Optional warmup: --warmup . Defaults to 0 (no warmup, baseline + // behaviour). Stripping the flag from argv here lets the positional + // parser below see the same layout as before. 
+ int n_warmup = 0; + { + int write_i = 1; + for (int read_i = 1; read_i < argc; ) { + if (std::string(argv[read_i]) == "--warmup" && read_i + 1 < argc) { + n_warmup = std::atoi(argv[read_i + 1]); + read_i += 2; + } else { + argv[write_i++] = argv[read_i++]; + } + } + argc = write_i; + } + static IpplTimings::TimerRef mainTimer = IpplTimings::getTimer("total"); static IpplTimings::TimerRef initializeTimer = IpplTimings::getTimer("initialize"); IpplTimings::startTimer(mainTimer); @@ -82,9 +99,31 @@ int main(int argc, char* argv[]) { manager.pre_run(); IpplTimings::stopTimer(initializeTimer); - + manager.setTime(0.0); + // Optional warmup: run N timesteps before the timed run so JIT, + // first-touch allocations, GPU caches, MPI/IPC registration and + // any tile-size autotune transients don't show up in the measured + // timers. After the warmup we wipe ALL accumulated timer state so + // the printed report covers exactly the measured run, not the + // warmup pass. Pass --warmup <N> on the CLI to enable. + if (n_warmup > 0) { + msg << "Running " << n_warmup << " warmup step(s) ..." << endl; + manager.run(n_warmup); + + // Reset only the time and step counters so the measured run reports from + // t = 0, step 0; particle/field state is not re-initialised here. + manager.setTime(0.0); + manager.setIt(0); + + // Wipe all timer accumulators (including 'total' and 'initialize' + // we already started above) so the report reflects only the run. + IpplTimings::stopTimer(mainTimer); + IpplTimings::resetAllTimers(); + IpplTimings::startTimer(mainTimer); + } + msg << "Starting iterations ..." 
<< endl; manager.run(manager.getNt()); diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h index b8491c7a5..df9def250 100644 --- a/alpine/LandauDampingManager.h +++ b/alpine/LandauDampingManager.h @@ -1,8 +1,8 @@ #ifndef IPPL_LANDAU_DAMPING_MANAGER_H #define IPPL_LANDAU_DAMPING_MANAGER_H -#include #include +#include #include "AlpineManager.h" #include "FieldContainer.hpp" @@ -67,7 +67,7 @@ class LandauDampingManager : public AlpineManager { Inform m("Pre Run"); const double pi = Kokkos::numbers::pi_v; - + if (this->solver_m == "OPEN") { throw IpplException("LandauDamping", "Open boundaries solver incompatible with this simulation!"); @@ -84,7 +84,7 @@ class LandauDampingManager : public AlpineManager { this->rmax_m = 2 * pi / this->kw_m; bool isFEM = ((this->getSolver() == "FEM") || (this->getSolver() == "FEM_PRECON")); - + Vector nElements = this->nr_m - 1; if (isFEM) { this->hr_m = this->rmax_m / nElements; @@ -110,8 +110,7 @@ class LandauDampingManager : public AlpineManager { this->isAllPeriodic_m)); this->setParticleContainer(std::make_shared( - this->fcontainer_m->getMesh(), this->fcontainer_m->getFL(), - isFEM)); + this->fcontainer_m->getMesh(), this->fcontainer_m->getFL(), isFEM)); this->fcontainer_m->initializeFields(this->solver_m); @@ -191,7 +190,8 @@ class LandauDampingManager : public AlpineManager { this->fcontainer_m->getRho().getFieldRangePolicy(), KOKKOS_LAMBDA(const index_array_type& args) { // local to global index conversion - Vector_t xvec = (args + lDom.first() - nghost + 0.5*(!isFEM)) * hr + origin; + Vector_t xvec = + (args + lDom.first() - nghost + 0.5 * (!isFEM)) * hr + origin; // ippl::apply accesses the view at the given indices and obtains a // reference; see src/Expression/IpplOperations.h @@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + auto R = 
this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + auto P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager { mu[i] = 0.0; sd[i] = 1.0; } - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, mu, sd)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); @@ -246,7 +246,7 @@ class LandauDampingManager : public AlpineManager { this->pcontainer_m->q = this->Q_m / totalP; // For FEM need an update due to node-centering, as periodic BCs mean - // that a particle at R=0 is equivalent to R=1 so it could be on the + // that a particle at R=0 is equivalent to R=1 so it could be on the // wrong rank and needs to be sent over. if (isFEM) { this->pcontainer_m->update(); @@ -267,8 +267,11 @@ class LandauDampingManager : public AlpineManager { // Here, we assume a constant charge-to-mass ratio of -1 for // all the particles hence eliminating the need to store mass as // an attribute - static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("pushVelocity"); - static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("pushPosition"); + static IpplTimings::TimerRef PTimer = IpplTimings::getTimer("pushVelocity"); + static IpplTimings::TimerRef RTimer = IpplTimings::getTimer("pushPosition"); + static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("scatter"); + static IpplTimings::TimerRef gatherTimer = IpplTimings::getTimer("gather"); + static IpplTimings::TimerRef updateTimer = IpplTimings::getTimer("update"); static IpplTimings::TimerRef domainDecomposition = IpplTimings::getTimer("loadBalance"); static IpplTimings::TimerRef SolveTimer = IpplTimings::getTimer("solve"); @@ -302,16 +305,21 @@ class LandauDampingManager : public AlpineManager { IpplTimings::stopTimer(domainDecomposition); } + IpplTimings::startTimer(scatterTimer); 
// scatter the charge onto the underlying grid this->par2grid(); + IpplTimings::stopTimer(scatterTimer); + // Field solve IpplTimings::startTimer(SolveTimer); this->fsolver_m->runSolver(); IpplTimings::stopTimer(SolveTimer); + IpplTimings::startTimer(gatherTimer); // gather E field this->grid2par(); + IpplTimings::stopTimer(gatherTimer); // kick IpplTimings::startTimer(PTimer); @@ -325,7 +333,7 @@ class LandauDampingManager : public AlpineManager { if ((this->getSolver() == "FEM") || (this->getSolver() == "FEM_PRECON")) { // When using FEM, we only have E on particles - // so we use the dump function which computes the + // so we use the dump function which computes the // energy using the particles instead of the field. dumpLandau(); } else { @@ -374,10 +382,12 @@ class LandauDampingManager : public AlpineManager { fname << ippl::Comm->size(); fname << "_manager"; fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + const bool firstStep = std::fabs(this->time_m) < 1e-14; + Inform csvout(NULL, fname.str().c_str(), + firstStep ? Inform::OVERWRITE : Inform::APPEND); csvout.precision(16); csvout.setf(std::ios::scientific, std::ios::floatfield); - if (std::fabs(this->time_m) < 1e-14) { + if (firstStep) { csvout << "time, Ex_field_energy, Ex_max_norm" << endl; } csvout << this->time_m << " " << fieldEnergy << " " << ExAmp << endl; @@ -385,15 +395,15 @@ class LandauDampingManager : public AlpineManager { ippl::Comm->barrier(); } - // Overloaded dumpLandau which computes the E-field energy using the particles - // instead of using the E-field on the grid (as above). Since we have E for + // Overloaded dumpLandau which computes the E-field energy using the particles + // instead of using the E-field on the grid (as above). Since we have E for // each particle, we treat the particles as Monte-Carlo samples to compute // the energy integral. 
void dumpLandau() { - auto Eview = this->pcontainer_m->E.getView(); + auto Eview = this->pcontainer_m->E.getView(); size_type localParticles = this->pcontainer_m->getLocalNum(); - using exec_space = typename Kokkos::View::execution_space; + using exec_space = typename Kokkos::View::execution_space; using policy_type = Kokkos::RangePolicy; policy_type iteration_policy(0, localParticles); @@ -412,9 +422,8 @@ class LandauDampingManager : public AlpineManager { // MC integration: divide by no. of particles N and multiply by volume ippl::Vector domain_size = this->rmax_m - this->rmin_m; - double fieldEnergy = - std::reduce(domain_size.begin(), domain_size.end(), - globaltemp, std::multiplies()); + double fieldEnergy = std::reduce(domain_size.begin(), domain_size.end(), globaltemp, + std::multiplies()); fieldEnergy = fieldEnergy / this->totalP_m; @@ -425,10 +434,12 @@ class LandauDampingManager : public AlpineManager { fname << ippl::Comm->size(); fname << "_manager"; fname << ".csv"; - Inform csvout(NULL, fname.str().c_str(), Inform::APPEND); + const bool firstStep = std::fabs(this->time_m) < 1e-14; + Inform csvout(NULL, fname.str().c_str(), + firstStep ? 
Inform::OVERWRITE : Inform::APPEND); csvout.precision(16); csvout.setf(std::ios::scientific, std::ios::floatfield); - if (std::fabs(this->time_m) < 1e-14) { + if (firstStep) { csvout << "time, Ex_field_energy" << endl; } csvout << this->time_m << " " << fieldEnergy << endl; diff --git a/alpine/ParticleContainer.hpp b/alpine/ParticleContainer.hpp index 6a1b55708..40473311a 100644 --- a/alpine/ParticleContainer.hpp +++ b/alpine/ParticleContainer.hpp @@ -31,7 +31,7 @@ class ParticleContainer : public ippl::ParticleBase>& pl) { pl_m = pl; } void registerAttributes() { - //only needed for vis + // only needed for vis P.set_name("velocity"); q.set_name("charge"); E.set_name("electric_field"); diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h index b721f3ec8..6e22a0761 100644 --- a/alpine/PenningTrapManager.h +++ b/alpine/PenningTrapManager.h @@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + auto R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + auto P = this->pcontainer_m->P.getView(); double muP[Dim] = {0.0, 0.0, 0.0}; double sdP[Dim] = {1.0, 1.0, 1.0}; - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, muP, sdP)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, muP, sdP)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/cmake/AddIpplTest.cmake b/cmake/AddIpplTest.cmake index d4476392e..4367529a8 100644 --- a/cmake/AddIpplTest.cmake +++ b/cmake/AddIpplTest.cmake @@ -31,6 +31,8 @@ set(IPPL_DEFAULT_TEST_PROCS "2" CACHE STRING "Default MPI ranks per unit test") set(IPPL_DEFAULT_TEST_TIMEOUT "60" CACHE STRING "Default timeout (seconds) per unit test") +set(IPPL_DEFAULT_INTEGRATION_TIMEOUT "300" + CACHE STRING "Default timeout (seconds) per integration test") function(add_ippl_test 
TEST_NAME) set(options NO_MPI REQUIRE_MPI RUN_SERIAL USE_GTEST_MAIN INTEGRATION COMPILE_ONLY) @@ -73,10 +75,6 @@ function(add_ippl_test TEST_NAME) target_link_libraries(${TEST_NAME} PRIVATE IPPL::ippl ${TEST_LINK_LIBS}) endif() - if(TARGET ippl_build_flags) - target_link_libraries(${TEST_NAME} PRIVATE ippl_build_flags) - endif() - if(TARGET ippl::test_support) target_link_libraries(${TEST_NAME} PRIVATE ippl::test_support) endif() @@ -92,6 +90,8 @@ function(add_ippl_test TEST_NAME) if(TEST_TIMEOUT) set(_timeout "${TEST_TIMEOUT}") + elseif(TEST_INTEGRATION) + set(_timeout "${IPPL_DEFAULT_INTEGRATION_TIMEOUT}") else() set(_timeout "${IPPL_DEFAULT_TEST_TIMEOUT}") endif() @@ -169,7 +169,7 @@ function(add_ippl_test TEST_NAME) ENV_VARS "OMP_PROC_BIND=spread" "OMP_PLACES=threads" - "OMP_NUM_THREADS=${_threads} " + "OMP_NUM_THREADS=${_threads}" "KOKKOS_NUM_THREADS=${_threads}" "MKL_NUM_THREADS=${_threads}" "OPENBLAS_NUM_THREADS=${_threads}" @@ -203,4 +203,8 @@ function(add_ippl_test TEST_NAME) set_tests_properties(${_ctest_name} PROPERTIES ${TEST_PROPERTIES}) endif() endif() + + if(TEST_LINK_LIBS AND NOT TEST_INTEGRATION) + target_link_libraries(${TEST_NAME} PRIVATE ${TEST_LINK_LIBS}) + endif() endfunction() diff --git a/cmake/AutoTunePresets.cmake b/cmake/AutoTunePresets.cmake new file mode 100644 index 000000000..e85e0a359 --- /dev/null +++ b/cmake/AutoTunePresets.cmake @@ -0,0 +1,145 @@ +# ============================================================================ +# AutoTunePresets.cmake +# +# Detect the build's exec-space arch tag, copy any matching preset CSVs from +# `cmake/auto_tune/<tag>/` into the build's `share/ippl/auto_tune/`, and +# expose the resulting path to the library via the generated header +# `IpplAutoTunePresets.h`. +# +# At runtime, TileSizeCache::load() and GatherCache::load() consult that +# path after env / cwd lookups, so a fresh checkout on a known arch already +# uses tuned parameters without anyone running the sweep. 
+# ============================================================================ + +function(ippl_configure_autotune_presets) + # Resolve paths relative to this file so the function works when IPPL is + # consumed via FetchContent (where CMAKE_SOURCE_DIR points at the parent + # project, not at IPPL). + set(_ippl_cmake_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}") + + # ---- Pick a tag that matches the layout under cmake/auto_tune/. ------- + # + # Try Kokkos_ARCH_* cache variables first (they're reliable even when + # CMAKE_CUDA_ARCHITECTURES / CMAKE_HIP_ARCHITECTURES is "native"). Fall + # back to the numeric arch list if Kokkos didn't pin one. + set(_tag "") + + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(_arch_map + "KEPLER30:30" "KEPLER32:32" "KEPLER35:35" "KEPLER37:37" + "MAXWELL50:50" "MAXWELL52:52" "MAXWELL53:53" + "PASCAL60:60" "PASCAL61:61" + "VOLTA70:70" "VOLTA72:72" + "TURING75:75" + "AMPERE80:80" "AMPERE86:86" "AMPERE87:87" + "ADA89:89" + "HOPPER90:90" + "BLACKWELL100:100" "BLACKWELL120:120") + foreach(_entry ${_arch_map}) + string(REPLACE ":" ";" _pair "${_entry}") + list(GET _pair 0 _name) + list(GET _pair 1 _sm) + if(Kokkos_ARCH_${_name}) + set(_tag "sm_${_sm}") + break() + endif() + endforeach() + + if(NOT _tag AND CMAKE_CUDA_ARCHITECTURES) + list(GET CMAKE_CUDA_ARCHITECTURES 0 _first_arch) + string(REGEX REPLACE "[^0-9].*$" "" _first_arch "${_first_arch}") + if(_first_arch) + set(_tag "sm_${_first_arch}") + endif() + endif() + elseif("HIP" IN_LIST IPPL_PLATFORMS) + # Kokkos uses two naming conventions across versions: AMD_GFX* (newer) + # and VEGA*/NAVI* (older). Check both. 
+ set(_hip_arch_map + # AMD_GFX* (Kokkos >= ~4.x) + "AMD_GFX906:gfx906" "AMD_GFX908:gfx908" "AMD_GFX90A:gfx90a" + "AMD_GFX940:gfx940" "AMD_GFX942:gfx942" + "AMD_GFX1030:gfx1030" "AMD_GFX1100:gfx1100" "AMD_GFX1103:gfx1103" + # VEGA*/NAVI* (older Kokkos) + "VEGA906:gfx906" "VEGA908:gfx908" "VEGA90A:gfx90a" + "VEGA940:gfx940" "VEGA942:gfx942" + "NAVI1030:gfx1030" "NAVI1100:gfx1100") + foreach(_entry ${_hip_arch_map}) + string(REPLACE ":" ";" _pair "${_entry}") + list(GET _pair 0 _name) + list(GET _pair 1 _gfx) + if(Kokkos_ARCH_${_name}) + set(_tag "${_gfx}") + break() + endif() + endforeach() + + if(NOT _tag AND CMAKE_HIP_ARCHITECTURES) + list(GET CMAKE_HIP_ARCHITECTURES 0 _first_arch) + # CMAKE_HIP_ARCHITECTURES entries already look like "gfx90a"; strip + # any trailing flags / colons just in case. + string(REGEX REPLACE "[:].*$" "" _first_arch "${_first_arch}") + if(_first_arch) + set(_tag "${_first_arch}") + endif() + endif() + elseif("OPENMP" IN_LIST IPPL_PLATFORMS) + set(_tag "openmp") + else() + set(_tag "serial") + endif() + + set(_src_dir "${_ippl_cmake_dir}/auto_tune/${_tag}") + set(_dst_dir "${CMAKE_BINARY_DIR}/share/ippl/auto_tune") + + # Wipe any stale presets from a previous configure (e.g. arch changed or the + # source preset directory was emptied). Otherwise the runtime would happily + # keep loading a CSV produced for a different backend, leading to + # team_size-too-large aborts on host backends. 
+ file(REMOVE + "${_dst_dir}/tile_sweep_sa_optimal.csv" + "${_dst_dir}/gather_sweep_optimal.csv") + + file(MAKE_DIRECTORY "${_dst_dir}") + + set(_have_scatter FALSE) + set(_have_gather FALSE) + + if(_tag AND IS_DIRECTORY "${_src_dir}") + if(EXISTS "${_src_dir}/tile_sweep_sa_optimal.csv") + configure_file("${_src_dir}/tile_sweep_sa_optimal.csv" + "${_dst_dir}/tile_sweep_sa_optimal.csv" COPYONLY) + set(_have_scatter TRUE) + endif() + if(EXISTS "${_src_dir}/gather_sweep_optimal.csv") + configure_file("${_src_dir}/gather_sweep_optimal.csv" + "${_dst_dir}/gather_sweep_optimal.csv" COPYONLY) + set(_have_gather TRUE) + endif() + endif() + + if(_have_scatter OR _have_gather) + message(STATUS "📊 IPPL auto-tune presets: using ${_tag} (" + "scatter=${_have_scatter}, gather=${_have_gather})") + else() + if(_tag) + message(STATUS "📊 IPPL auto-tune presets: none for ${_tag} (drop CSVs in ${_src_dir})") + else() + message(STATUS "📊 IPPL auto-tune presets: no tag resolved") + endif() + endif() + + # Bake into a generated header consumed by TileSizeCache / GatherCache. + set(IPPL_AUTOTUNE_PRESET_DIR "${_dst_dir}") + set(IPPL_AUTOTUNE_ARCH_TAG "${_tag}") + configure_file( + "${_ippl_cmake_dir}/IpplAutoTunePresets.h.in" + "${CMAKE_BINARY_DIR}/include/IpplAutoTunePresets.h" + @ONLY) + + # Install the preset directory next to the library so installed binaries + # can find it (TileSizeCache also tries an install-relative fallback). 
+ install(DIRECTORY "${_dst_dir}/" + DESTINATION "share/ippl/auto_tune" + FILES_MATCHING PATTERN "*.csv") +endfunction() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a463d286e..7a1ce0a70 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -41,7 +41,8 @@ endif() # ------------------------------------------------------------------------------ if("OPENMP" IN_LIST IPPL_PLATFORMS) find_package(OpenMP REQUIRED) - colour_message(STATUS ${Green} "✅ OpenMP platform requested OpenMP found ${OPENMP_VERSION}") + colour_message(STATUS ${Green} + "✅ OpenMP platform requested, OpenMP found ${OpenMP_CXX_VERSION}") endif() # ------------------------------------------------------------------------------ @@ -113,6 +114,8 @@ function(set_kokkos_options) set(Kokkos_ENABLE_LIBDL ON CACHE BOOL "Enable LIBDL" FORCE) endif() endif() + + set(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE ON) endfunction() # ----------------------------------------------------------------------------- @@ -319,3 +322,91 @@ if(IPPL_ENABLE_TESTS) "${DOWNLOADED_HEADERS_DIR}/stb_image_write.h") message(STATUS "✅ stb_image_write loaded for testing FDTD solver.") endif() + +# ------------------------------------------------------------------------------ +# (CU)FINUFFT +# ------------------------------------------------------------------------------ +if(IPPL_ENABLE_FFT AND IPPL_ENABLE_FINUFFT) + message(STATUS "Fetching (CU)FINUFFT") + FetchContent_Declare( + finufft + GIT_REPOSITORY https://github.com/flatironinstitute/finufft.git + GIT_SHALLOW TRUE + ) + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(FINUFFT_USE_CUDA ON CACHE BOOL "") + add_compile_definitions(ENABLE_GPU_NUFFT) + add_compile_definitions(FINUFFT_USE_CUDA) + endif() + set(FINUFFT_USE_CPU ON CACHE BOOL "") + + # cufinufft's CUDA RDC fatbin registration segfaults at startup when its + # device code is wrapped in a .so. Force static for the FetchContent build + # regardless of the global BUILD_SHARED_LIBS setting. 
+ set(_ippl_saved_bsl ${BUILD_SHARED_LIBS}) + set(BUILD_SHARED_LIBS OFF) + + # Kokkos updates CMAKE_CUDA_ARCHITECTURES only inside its own subproject + # scope. A downstream FetchContent like cufinufft would otherwise inherit + # CMake's default arch (which doesn't match the GPU) and emit kernels that + # fail at launch with cudaErrorInvalidResourceHandle. Translate the + # Kokkos_ARCH_* selection into CMAKE_CUDA_ARCHITECTURES once, here, so + # every downstream CUDA TU sees it. + if("CUDA" IN_LIST IPPL_PLATFORMS) + set(_ippl_arch_map + "KEPLER30:30" "KEPLER32:32" "KEPLER35:35" "KEPLER37:37" + "MAXWELL50:50" "MAXWELL52:52" "MAXWELL53:53" + "PASCAL60:60" "PASCAL61:61" + "VOLTA70:70" "VOLTA72:72" + "TURING75:75" + "AMPERE80:80" "AMPERE86:86" "AMPERE87:87" + "ADA89:89" + "HOPPER90:90" + "BLACKWELL100:100" "BLACKWELL120:120") + foreach(_entry ${_ippl_arch_map}) + string(REPLACE ":" ";" _pair ${_entry}) + list(GET _pair 0 _name) + list(GET _pair 1 _sm) + if(Kokkos_ARCH_${_name}) + set(CMAKE_CUDA_ARCHITECTURES ${_sm} CACHE STRING "" FORCE) + break() + endif() + endforeach() + endif() + + FetchContent_MakeAvailable(finufft) + + set(BUILD_SHARED_LIBS ${_ippl_saved_bsl}) + + add_compile_definitions(ENABLE_FINUFFT) +endif() + +# ------------------------------------------------------------------------------ +# CuFFTMp +# ------------------------------------------------------------------------------ +# CuFFTMp is opt-in; users must point NVSHMEM_HOME (and optionally +# CUFFTMP_ROOT via CMAKE_PREFIX_PATH) at their installation. +if(IPPL_ENABLE_FFT AND IPPL_ENABLE_CUFFTMP) + set(NVSHMEM_HOME $ENV{NVSHMEM_HOME}) + + find_library(NVSHMEM_HOST_LIBRARY + NAMES nvshmem_host + HINTS ${NVSHMEM_HOME} + PATH_SUFFIXES lib + ) + + find_library(CUFFTMP_LIBRARY + NAMES cufftMp + ) + + if(NVSHMEM_HOST_LIBRARY) + message(STATUS "Found NVSHMEM host library: ${NVSHMEM_HOST_LIBRARY}") + set(NVSHMEM_FOUND TRUE) + else() + message(FATAL_ERROR "NVSHMEM not found. 
Set NVSHMEM_HOME to your installation directory.") + set(NVSHMEM_FOUND FALSE) + endif() + + # The actual link is done at the ippl target in src/CMakeLists.txt + add_compile_definitions(IPPL_ENABLE_CUFFTMP) +endif() diff --git a/cmake/InstallIppl.cmake b/cmake/InstallIppl.cmake index ed0a2656d..4733cb542 100644 --- a/cmake/InstallIppl.cmake +++ b/cmake/InstallIppl.cmake @@ -121,6 +121,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/IPPLConfig.cmake" # Fix/Hack: Ensure extern dependencies are exported correctly if they were built in-tree. This is # needed for Heffte because it doesn't fully use CMake's export target mechanism # ------------------------------------------------------- -if(TARGET Heffte) - install(TARGETS Heffte EXPORT ipplTargets DESTINATION lib) -endif() +foreach(_ippl_extern_dep IN ITEMS Heffte finufft finufft_common cufinufft) + if(TARGET ${_ippl_extern_dep}) + install(TARGETS ${_ippl_extern_dep} EXPORT ipplTargets DESTINATION lib) + endif() +endforeach() diff --git a/cmake/IpplAutoTunePresets.h.in b/cmake/IpplAutoTunePresets.h.in new file mode 100644 index 000000000..feb6cdf1a --- /dev/null +++ b/cmake/IpplAutoTunePresets.h.in @@ -0,0 +1,15 @@ +#ifndef IPPL_AUTO_TUNE_PRESETS_H +#define IPPL_AUTO_TUNE_PRESETS_H + +// Generated by cmake/AutoTunePresets.cmake. + +// Absolute path to the build-tree directory holding the per-arch preset +// CSVs that ship with this IPPL build. Always set; the directory holds no +// CSVs if no preset matched the configured arch. +#define IPPL_AUTOTUNE_PRESET_DIR "@IPPL_AUTOTUNE_PRESET_DIR@" + +// Short human-readable label for the resolved arch (e.g. "sm_90", "openmp", +// "serial"). Used for log messages. 
+#define IPPL_AUTOTUNE_ARCH_TAG "@IPPL_AUTOTUNE_ARCH_TAG@" + +#endif // IPPL_AUTO_TUNE_PRESETS_H diff --git a/cmake/auto_tune/README.md b/cmake/auto_tune/README.md new file mode 100644 index 000000000..81d4419ae --- /dev/null +++ b/cmake/auto_tune/README.md @@ -0,0 +1,37 @@ +# Auto-tune presets + +Pre-generated scatter / gather sweep CSVs that ship with IPPL. At configure +time, `ippl_configure_autotune_presets()` (in `cmake/AutoTunePresets.cmake`) +picks the subdirectory that matches the current build: + +| Build | Tag | Lookup directory | +|---------------------------------------------------------|-----------|---------------------------| +| `IPPL_PLATFORMS=CUDA`, `Kokkos_ARCH_HOPPER90` | `sm_90` | `cmake/auto_tune/sm_90/` | +| `IPPL_PLATFORMS=CUDA`, `Kokkos_ARCH_AMPERE86` | `sm_86` | `cmake/auto_tune/sm_86/` | +| `IPPL_PLATFORMS=HIP`, `Kokkos_ARCH_AMD_GFX942` | `gfx942` | `cmake/auto_tune/gfx942/` | +| `IPPL_PLATFORMS=HIP`, `Kokkos_ARCH_AMD_GFX90A` | `gfx90a` | `cmake/auto_tune/gfx90a/` | +| `IPPL_PLATFORMS=OPENMP` (no GPU) | `openmp` | `cmake/auto_tune/openmp/` | +| Serial only | `serial` | `cmake/auto_tune/serial/` | + +If the matching directory contains `tile_sweep_sa_optimal.csv` and/or +`gather_sweep_optimal.csv`, those files are copied into +`<build>/share/ippl/auto_tune/` and the build-tree path is baked into the +library via the generated header `IpplAutoTunePresets.h`. + +At runtime, `TileSizeCache::load()` and `GatherCache::load()` consult the +following sources in order: + +1. `IPPL_TILE_CSV` / `IPPL_GATHER_CSV` env var +2. `tile_sweep_sa_optimal.csv` / `gather_sweep_optimal.csv` in cwd +3. The shipped preset for this build's arch (this directory) +4. Built-in defaults seeded by `Ippl::initialize` + +To add a new preset: + +1. Build IPPL for the target arch. +2. Run any executable with `IPPL_AUTO_TUNE=full` to produce the two CSVs in + the run directory. +3. Copy them into `cmake/auto_tune/<tag>/` and commit. 
+ +Subsequent IPPL builds for that arch will pick up the CSVs automatically; +no env var or extra steps are needed at runtime. diff --git a/cmake/auto_tune/gfx90a/gather_sweep_optimal.csv b/cmake/auto_tune/gfx90a/gather_sweep_optimal.csv new file mode 100644 index 000000000..d5f54e31e --- /dev/null +++ b/cmake/auto_tune/gfx90a/gather_sweep_optimal.csv @@ -0,0 +1,2 @@ +method,kernel_width,tile_x,tile_y,tile_z,throughput_Mpts_s +Atomic,2,1,1,1,8713.87 diff --git a/cmake/auto_tune/gfx90a/tile_sweep_sa_optimal.csv b/cmake/auto_tune/gfx90a/tile_sweep_sa_optimal.csv new file mode 100644 index 000000000..d9d41d8c1 --- /dev/null +++ b/cmake/auto_tune/gfx90a/tile_sweep_sa_optimal.csv @@ -0,0 +1,49 @@ +method,value_type,kernel_width,rho,best_tile_x,best_tile_y,best_tile_z,best_team_size,best_oversubscription_factor,best_z_batches,throughput_Mpts_s,time_ms,kernel_evaluations,preflight_rejections +Atomic,real,1,0.5000,1,1,1,1,4,1,477.24,0.0000,0,0 +Tiled,real,1,0.5000,6,6,6,64,1,1,1253.81,0.0000,0,0 +OutputFocused,real,1,0.5000,8,1,6,128,1,1,370.15,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,2,1,1,476.48,0.0000,0,0 +Tiled,real,2,0.5000,8,8,8,256,1,1,1231.17,0.0000,0,0 +OutputFocused,real,2,0.5000,8,1,8,64,1,1,370.25,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,8,4,1,617.45,0.0000,0,0 +Tiled,real,1,2.0000,8,1,1,16,3,1,1864.93,0.0000,0,0 +OutputFocused,real,1,2.0000,1,1,8,128,1,1,391.31,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,8,4,1,618.21,0.0000,0,0 +Tiled,real,2,2.0000,1,1,1,16,1,1,1814.68,0.0000,0,0 +OutputFocused,real,2,2.0000,4,8,2,64,1,3,391.17,0.0000,0,0 +Atomic,real,1,4.0000,1,1,1,2,1,1,540.99,0.0000,0,0 +Tiled,real,1,4.0000,3,3,7,128,3,1,2195.38,0.0000,0,0 +OutputFocused,real,1,4.0000,8,8,1,64,1,7,394.03,0.0000,0,0 +Atomic,real,2,4.0000,1,1,1,4,4,1,541.48,0.0000,0,0 +Tiled,real,2,4.0000,5,5,5,64,1,1,2192.47,0.0000,0,0 +OutputFocused,real,2,4.0000,4,4,5,512,1,4,394.26,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,4,4,1,408.29,0.0000,0,0 +Tiled,real,1,8.0000,3,1,7,16,2,1,2446.44,0.0000,0,0 
+OutputFocused,real,1,8.0000,8,8,1,64,1,8,395.44,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,1,3,1,408.34,0.0000,0,0 +Tiled,real,2,8.0000,7,8,3,256,1,1,2442.22,0.0000,0,0 +OutputFocused,real,2,8.0000,8,8,1,512,1,2,395.41,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,1,4,1,477.24,0.0000,0,0 +Tiled,complex,1,0.5000,6,6,6,64,1,1,1253.81,0.0000,0,0 +OutputFocused,complex,1,0.5000,8,1,6,128,1,1,370.15,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,2,1,1,476.48,0.0000,0,0 +Tiled,complex,2,0.5000,8,8,8,256,1,1,1231.17,0.0000,0,0 +OutputFocused,complex,2,0.5000,8,1,8,64,1,1,370.25,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,8,4,1,617.45,0.0000,0,0 +Tiled,complex,1,2.0000,8,1,1,16,3,1,1864.93,0.0000,0,0 +OutputFocused,complex,1,2.0000,1,1,8,128,1,1,391.31,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,8,4,1,618.21,0.0000,0,0 +Tiled,complex,2,2.0000,1,1,1,16,1,1,1814.68,0.0000,0,0 +OutputFocused,complex,2,2.0000,4,8,2,64,1,3,391.17,0.0000,0,0 +Atomic,complex,1,4.0000,1,1,1,2,1,1,540.99,0.0000,0,0 +Tiled,complex,1,4.0000,3,3,7,128,3,1,2195.38,0.0000,0,0 +OutputFocused,complex,1,4.0000,8,8,1,64,1,7,394.03,0.0000,0,0 +Atomic,complex,2,4.0000,1,1,1,4,4,1,541.48,0.0000,0,0 +Tiled,complex,2,4.0000,5,5,5,64,1,1,2192.47,0.0000,0,0 +OutputFocused,complex,2,4.0000,4,4,5,512,1,4,394.26,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,4,4,1,408.29,0.0000,0,0 +Tiled,complex,1,8.0000,3,1,7,16,2,1,2446.44,0.0000,0,0 +OutputFocused,complex,1,8.0000,8,8,1,64,1,8,395.44,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,1,3,1,408.34,0.0000,0,0 +Tiled,complex,2,8.0000,7,8,3,256,1,1,2442.22,0.0000,0,0 +OutputFocused,complex,2,8.0000,8,8,1,512,1,2,395.41,0.0000,0,0 diff --git a/cmake/auto_tune/sm_80/gather_sweep_optimal.csv b/cmake/auto_tune/sm_80/gather_sweep_optimal.csv new file mode 100644 index 000000000..0398bac1c --- /dev/null +++ b/cmake/auto_tune/sm_80/gather_sweep_optimal.csv @@ -0,0 +1,2 @@ +method,kernel_width,tile_x,tile_y,tile_z,throughput_Mpts_s +Atomic,2,1,1,1,16549.40 diff --git 
a/cmake/auto_tune/sm_80/tile_sweep_sa_optimal.csv b/cmake/auto_tune/sm_80/tile_sweep_sa_optimal.csv new file mode 100644 index 000000000..915c5fbf3 --- /dev/null +++ b/cmake/auto_tune/sm_80/tile_sweep_sa_optimal.csv @@ -0,0 +1,145 @@ +method,value_type,kernel_width,rho,best_tile_x,best_tile_y,best_tile_z,best_team_size,best_oversubscription_factor,best_z_batches,throughput_Mpts_s,time_ms,kernel_evaluations,preflight_rejections +Atomic,real,1,0.5000,1,1,1,32,1,1,416.37,0.0000,0,0 +Tiled,real,1,0.5000,8,8,8,256,4,1,51.23,0.0000,0,0 +OutputFocused,real,1,0.5000,4,4,4,64,2,4,52.85,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,480.19,0.0000,0,0 +Tiled,real,2,0.5000,3,3,3,256,2,1,61.43,0.0000,0,0 +OutputFocused,real,2,0.5000,3,3,3,128,2,1,52.94,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,1146.92,0.0000,0,0 +Tiled,real,1,2.0000,8,8,8,256,2,1,230.55,0.0000,0,0 +OutputFocused,real,1,2.0000,4,4,4,64,2,1,144.48,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,1148.75,0.0000,0,0 +Tiled,real,2,2.0000,8,8,8,64,2,1,231.07,0.0000,0,0 +OutputFocused,real,2,2.0000,3,3,3,512,2,4,144.57,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,1773.39,0.0000,0,0 +Tiled,real,1,8.0000,5,5,5,32,1,1,751.49,0.0000,0,0 +OutputFocused,real,1,8.0000,3,3,3,256,2,2,275.07,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,32,1,1,1775.55,0.0000,0,0 +Tiled,real,2,8.0000,2,2,2,32,1,1,751.90,0.0000,0,0 +OutputFocused,real,2,8.0000,5,5,5,512,2,1,275.26,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,1970.77,0.0000,0,0 +Tiled,real,1,32.0000,3,3,3,256,4,1,1585.16,0.0000,0,0 +OutputFocused,real,1,32.0000,5,5,5,256,1,8,290.20,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,1972.10,0.0000,0,0 +Tiled,real,2,32.0000,2,2,2,32,2,1,1585.62,0.0000,0,0 +OutputFocused,real,2,32.0000,2,2,2,512,4,4,290.62,0.0000,0,0 +Atomic,real,1,0.5000,1,1,1,32,1,1,1481.88,0.0000,0,0 +Tiled,real,1,0.5000,4,4,4,32,4,1,230.66,0.0000,0,0 +OutputFocused,real,1,0.5000,4,4,4,256,2,8,190.32,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,1482.70,0.0000,0,0 
+Tiled,real,2,0.5000,8,8,8,64,4,1,230.69,0.0000,0,0 +OutputFocused,real,2,0.5000,6,6,6,512,2,4,190.30,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,1962.43,0.0000,0,0 +Tiled,real,1,2.0000,5,5,5,64,2,1,774.59,0.0000,0,0 +OutputFocused,real,1,2.0000,2,2,2,64,2,1,399.12,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,1947.85,0.0000,0,0 +Tiled,real,2,2.0000,6,6,6,32,4,1,767.33,0.0000,0,0 +OutputFocused,real,2,2.0000,5,5,5,512,4,2,397.68,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,1990.66,0.0000,0,0 +Tiled,real,1,8.0000,2,2,2,256,1,1,1630.76,0.0000,0,0 +OutputFocused,real,1,8.0000,6,6,6,128,4,2,464.03,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,32,1,1,1910.28,0.0000,0,0 +Tiled,real,2,8.0000,2,2,2,64,1,1,1592.81,0.0000,0,0 +OutputFocused,real,2,8.0000,4,4,4,128,4,2,464.54,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,1863.57,0.0000,0,0 +Tiled,real,1,32.0000,2,2,2,32,1,1,2067.55,0.0000,0,0 +OutputFocused,real,1,32.0000,3,3,3,512,1,8,521.55,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,2049.23,0.0000,0,0 +Tiled,real,2,32.0000,2,2,2,32,2,1,2188.71,0.0000,0,0 +OutputFocused,real,2,32.0000,4,4,4,512,2,8,521.61,0.0000,0,0 +Atomic,real,1,0.5000,1,1,1,32,1,1,1937.09,0.0000,0,0 +Tiled,real,1,0.5000,3,3,3,256,4,1,359.89,0.0000,0,0 +OutputFocused,real,1,0.5000,4,4,4,64,1,8,249.69,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,32,1,1,1936.45,0.0000,0,0 +Tiled,real,2,0.5000,8,8,8,64,4,1,359.66,0.0000,0,0 +OutputFocused,real,2,0.5000,2,2,2,128,2,2,249.70,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,32,1,1,2029.03,0.0000,0,0 +Tiled,real,1,2.0000,2,2,2,64,2,1,929.24,0.0000,0,0 +OutputFocused,real,1,2.0000,5,5,5,512,1,4,420.45,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,32,1,1,2029.07,0.0000,0,0 +Tiled,real,2,2.0000,2,2,2,64,1,1,930.14,0.0000,0,0 +OutputFocused,real,2,2.0000,4,4,4,64,4,2,420.46,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,32,1,1,2054.15,0.0000,0,0 +Tiled,real,1,8.0000,3,3,3,256,2,1,1597.85,0.0000,0,0 +OutputFocused,real,1,8.0000,6,6,6,128,4,2,522.66,0.0000,0,0 
+Atomic,real,2,8.0000,1,1,1,32,1,1,2053.35,0.0000,0,0 +Tiled,real,2,8.0000,4,4,4,32,4,1,1597.95,0.0000,0,0 +OutputFocused,real,2,8.0000,2,2,2,64,4,1,522.60,0.0000,0,0 +Atomic,real,1,32.0000,1,1,1,32,1,1,2060.22,0.0000,0,0 +Tiled,real,1,32.0000,4,4,4,32,4,1,2081.97,0.0000,0,0 +OutputFocused,real,1,32.0000,6,6,6,512,1,1,564.46,0.0000,0,0 +Atomic,real,2,32.0000,1,1,1,32,1,1,2059.72,0.0000,0,0 +Tiled,real,2,32.0000,5,5,5,128,1,1,2082.07,0.0000,0,0 +OutputFocused,real,2,32.0000,5,5,5,128,1,8,564.58,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,416.37,0.0000,0,0 +Tiled,complex,1,0.5000,8,8,8,256,4,1,51.23,0.0000,0,0 +OutputFocused,complex,1,0.5000,4,4,4,64,2,4,52.85,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,480.19,0.0000,0,0 +Tiled,complex,2,0.5000,3,3,3,256,2,1,61.43,0.0000,0,0 +OutputFocused,complex,2,0.5000,3,3,3,128,2,1,52.94,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,1146.92,0.0000,0,0 +Tiled,complex,1,2.0000,8,8,8,256,2,1,230.55,0.0000,0,0 +OutputFocused,complex,1,2.0000,4,4,4,64,2,1,144.48,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,1148.75,0.0000,0,0 +Tiled,complex,2,2.0000,8,8,8,64,2,1,231.07,0.0000,0,0 +OutputFocused,complex,2,2.0000,3,3,3,512,2,4,144.57,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,1773.39,0.0000,0,0 +Tiled,complex,1,8.0000,5,5,5,32,1,1,751.49,0.0000,0,0 +OutputFocused,complex,1,8.0000,3,3,3,256,2,2,275.07,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,1775.55,0.0000,0,0 +Tiled,complex,2,8.0000,2,2,2,32,1,1,751.90,0.0000,0,0 +OutputFocused,complex,2,8.0000,5,5,5,512,2,1,275.26,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,1970.77,0.0000,0,0 +Tiled,complex,1,32.0000,3,3,3,256,4,1,1585.16,0.0000,0,0 +OutputFocused,complex,1,32.0000,5,5,5,256,1,8,290.20,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,1972.10,0.0000,0,0 +Tiled,complex,2,32.0000,2,2,2,32,2,1,1585.62,0.0000,0,0 +OutputFocused,complex,2,32.0000,2,2,2,512,4,4,290.62,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,1481.88,0.0000,0,0 
+Tiled,complex,1,0.5000,4,4,4,32,4,1,230.66,0.0000,0,0 +OutputFocused,complex,1,0.5000,4,4,4,256,2,8,190.32,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,1482.70,0.0000,0,0 +Tiled,complex,2,0.5000,8,8,8,64,4,1,230.69,0.0000,0,0 +OutputFocused,complex,2,0.5000,6,6,6,512,2,4,190.30,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,1962.43,0.0000,0,0 +Tiled,complex,1,2.0000,5,5,5,64,2,1,774.59,0.0000,0,0 +OutputFocused,complex,1,2.0000,2,2,2,64,2,1,399.12,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,1947.85,0.0000,0,0 +Tiled,complex,2,2.0000,6,6,6,32,4,1,767.33,0.0000,0,0 +OutputFocused,complex,2,2.0000,5,5,5,512,4,2,397.68,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,1990.66,0.0000,0,0 +Tiled,complex,1,8.0000,2,2,2,256,1,1,1630.76,0.0000,0,0 +OutputFocused,complex,1,8.0000,6,6,6,128,4,2,464.03,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,1910.28,0.0000,0,0 +Tiled,complex,2,8.0000,2,2,2,64,1,1,1592.81,0.0000,0,0 +OutputFocused,complex,2,8.0000,4,4,4,128,4,2,464.54,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,1863.57,0.0000,0,0 +Tiled,complex,1,32.0000,2,2,2,32,1,1,2067.55,0.0000,0,0 +OutputFocused,complex,1,32.0000,3,3,3,512,1,8,521.55,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,2049.23,0.0000,0,0 +Tiled,complex,2,32.0000,2,2,2,32,2,1,2188.71,0.0000,0,0 +OutputFocused,complex,2,32.0000,4,4,4,512,2,8,521.61,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,1,1,1937.09,0.0000,0,0 +Tiled,complex,1,0.5000,3,3,3,256,4,1,359.89,0.0000,0,0 +OutputFocused,complex,1,0.5000,4,4,4,64,1,8,249.69,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,32,1,1,1936.45,0.0000,0,0 +Tiled,complex,2,0.5000,8,8,8,64,4,1,359.66,0.0000,0,0 +OutputFocused,complex,2,0.5000,2,2,2,128,2,2,249.70,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,32,1,1,2029.03,0.0000,0,0 +Tiled,complex,1,2.0000,2,2,2,64,2,1,929.24,0.0000,0,0 +OutputFocused,complex,1,2.0000,5,5,5,512,1,4,420.45,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,32,1,1,2029.07,0.0000,0,0 
+Tiled,complex,2,2.0000,2,2,2,64,1,1,930.14,0.0000,0,0 +OutputFocused,complex,2,2.0000,4,4,4,64,4,2,420.46,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,32,1,1,2054.15,0.0000,0,0 +Tiled,complex,1,8.0000,3,3,3,256,2,1,1597.85,0.0000,0,0 +OutputFocused,complex,1,8.0000,6,6,6,128,4,2,522.66,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,32,1,1,2053.35,0.0000,0,0 +Tiled,complex,2,8.0000,4,4,4,32,4,1,1597.95,0.0000,0,0 +OutputFocused,complex,2,8.0000,2,2,2,64,4,1,522.60,0.0000,0,0 +Atomic,complex,1,32.0000,1,1,1,32,1,1,2060.22,0.0000,0,0 +Tiled,complex,1,32.0000,4,4,4,32,4,1,2081.97,0.0000,0,0 +OutputFocused,complex,1,32.0000,6,6,6,512,1,1,564.46,0.0000,0,0 +Atomic,complex,2,32.0000,1,1,1,32,1,1,2059.72,0.0000,0,0 +Tiled,complex,2,32.0000,5,5,5,128,1,1,2082.07,0.0000,0,0 +OutputFocused,complex,2,32.0000,5,5,5,128,1,8,564.58,0.0000,0,0 diff --git a/cmake/auto_tune/sm_90/gather_sweep_optimal.csv b/cmake/auto_tune/sm_90/gather_sweep_optimal.csv new file mode 100644 index 000000000..8dd307e2e --- /dev/null +++ b/cmake/auto_tune/sm_90/gather_sweep_optimal.csv @@ -0,0 +1,2 @@ +method,kernel_width,tile_x,tile_y,tile_z,throughput_Mpts_s +Atomic,2,1,1,1,29141.97 diff --git a/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv b/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv new file mode 100644 index 000000000..4b682ee7c --- /dev/null +++ b/cmake/auto_tune/sm_90/tile_sweep_sa_optimal.csv @@ -0,0 +1,49 @@ +method,value_type,kernel_width,rho,best_tile_x,best_tile_y,best_tile_z,best_team_size,best_oversubscription_factor,best_z_batches,throughput_Mpts_s,time_ms,kernel_evaluations,preflight_rejections +Atomic,real,1,0.5000,1,1,1,32,4,1,3015.42,0.0000,0,0 +Tiled,real,1,0.5000,7,3,1,32,4,1,445.72,0.0000,0,0 +OutputFocused,real,1,0.5000,2,6,8,64,1,7,782.69,0.0000,0,0 +Atomic,real,2,0.5000,1,1,1,4,2,1,2986.67,0.0000,0,0 +Tiled,real,2,0.5000,4,1,1,64,2,1,445.89,0.0000,0,0 +OutputFocused,real,2,0.5000,4,6,7,512,1,5,782.85,0.0000,0,0 +Atomic,real,1,2.0000,1,1,1,8,1,1,3083.52,0.0000,0,0 
+Tiled,real,1,2.0000,6,1,6,64,4,1,739.06,0.0000,0,0 +OutputFocused,real,1,2.0000,7,5,3,128,1,7,575.54,0.0000,0,0 +Atomic,real,2,2.0000,1,1,1,2,2,1,3083.18,0.0000,0,0 +Tiled,real,2,2.0000,3,1,8,16,2,1,741.76,0.0000,0,0 +OutputFocused,real,2,2.0000,5,5,5,128,1,1,578.21,0.0000,0,0 +Atomic,real,1,4.0000,1,1,1,1,4,1,3136.95,0.0000,0,0 +Tiled,real,1,4.0000,3,5,1,64,3,1,1187.20,0.0000,0,0 +OutputFocused,real,1,4.0000,2,4,8,256,1,1,666.54,0.0000,0,0 +Atomic,real,2,4.0000,1,1,1,4,2,1,3115.95,0.0000,0,0 +Tiled,real,2,4.0000,4,5,4,64,1,1,1184.80,0.0000,0,0 +OutputFocused,real,2,4.0000,2,2,2,128,1,1,666.76,0.0000,0,0 +Atomic,real,1,8.0000,1,1,1,2,1,1,3133.18,0.0000,0,0 +Tiled,real,1,8.0000,8,8,2,128,4,1,3799.19,0.0000,0,0 +OutputFocused,real,1,8.0000,3,3,3,64,1,1,1358.63,0.0000,0,0 +Atomic,real,2,8.0000,1,1,1,8,4,1,3133.02,0.0000,0,0 +Tiled,real,2,8.0000,6,5,1,128,4,1,3797.57,0.0000,0,0 +OutputFocused,real,2,8.0000,2,8,8,512,1,2,1358.83,0.0000,0,0 +Atomic,complex,1,0.5000,1,1,1,32,4,1,3015.42,0.0000,0,0 +Tiled,complex,1,0.5000,7,3,1,32,4,1,445.72,0.0000,0,0 +OutputFocused,complex,1,0.5000,2,6,8,64,1,7,782.69,0.0000,0,0 +Atomic,complex,2,0.5000,1,1,1,4,2,1,2986.67,0.0000,0,0 +Tiled,complex,2,0.5000,4,1,1,64,2,1,445.89,0.0000,0,0 +OutputFocused,complex,2,0.5000,4,6,7,512,1,5,782.85,0.0000,0,0 +Atomic,complex,1,2.0000,1,1,1,8,1,1,3083.52,0.0000,0,0 +Tiled,complex,1,2.0000,6,1,6,64,4,1,739.06,0.0000,0,0 +OutputFocused,complex,1,2.0000,7,5,3,128,1,7,575.54,0.0000,0,0 +Atomic,complex,2,2.0000,1,1,1,2,2,1,3083.18,0.0000,0,0 +Tiled,complex,2,2.0000,3,1,8,16,2,1,741.76,0.0000,0,0 +OutputFocused,complex,2,2.0000,5,5,5,128,1,1,578.21,0.0000,0,0 +Atomic,complex,1,4.0000,1,1,1,1,4,1,3136.95,0.0000,0,0 +Tiled,complex,1,4.0000,3,5,1,64,3,1,1187.20,0.0000,0,0 +OutputFocused,complex,1,4.0000,2,4,8,256,1,1,666.54,0.0000,0,0 +Atomic,complex,2,4.0000,1,1,1,4,2,1,3115.95,0.0000,0,0 +Tiled,complex,2,4.0000,4,5,4,64,1,1,1184.80,0.0000,0,0 
+OutputFocused,complex,2,4.0000,2,2,2,128,1,1,666.76,0.0000,0,0 +Atomic,complex,1,8.0000,1,1,1,2,1,1,3133.18,0.0000,0,0 +Tiled,complex,1,8.0000,8,8,2,128,4,1,3799.19,0.0000,0,0 +OutputFocused,complex,1,8.0000,3,3,3,64,1,1,1358.63,0.0000,0,0 +Atomic,complex,2,8.0000,1,1,1,8,4,1,3133.02,0.0000,0,0 +Tiled,complex,2,8.0000,6,5,1,128,4,1,3797.57,0.0000,0,0 +OutputFocused,complex,2,8.0000,2,8,8,512,1,2,1358.83,0.0000,0,0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c4778b174..9451daeb8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,6 +47,14 @@ target_compile_options( $<$:-O3> >) +# Disable host LTO on the IPPL library when CUDA is enabled. Repeated +# `fatbinData` symbols in nvcc-generated fat binaries fail LTO merging, +# but this only matters for code that links against cuFFT/cuFFTMp/cufinufft +# — keep the rest of the build LTO-eligible. +if("CUDA" IN_LIST IPPL_PLATFORMS) + target_compile_options(ippl PRIVATE $<$:-fno-lto>) +endif() + get_target_property(_ippl_type ippl TYPE) if(_ippl_type) string(REPLACE "_LIBRARY" "" _ippl_type "${_ippl_type}") # -> STATIC / SHARED / OBJECT / INTERFACE @@ -67,7 +75,8 @@ target_sources(ippl PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/Ippl.cpp) target_include_directories( ippl PUBLIC $ $ - PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${CMAKE_BINARY_DIR}/include) # generated IpplAutoTunePresets.h add_subdirectory(Communicate) @@ -103,9 +112,18 @@ include(${PROJECT_SOURCE_DIR}/cmake/PlatformOptions.cmake) target_link_libraries(ippl PUBLIC Kokkos::kokkos MPI::MPI_CXX) if(IPPL_ENABLE_FFT) + if(IPPL_ENABLE_CUFFTMP) + target_link_libraries(ippl PUBLIC ${CUFFTMP_LIBRARY} ${NVSHMEM_HOST_LIBRARY}) + endif() + target_link_libraries(ippl PUBLIC Heffte::Heffte) + if(IPPL_ENABLE_FINUFFT) + target_link_libraries(ippl PUBLIC finufft) + if("CUDA" IN_LIST IPPL_PLATFORMS) + target_link_libraries(ippl PRIVATE cufinufft) + endif() + endif() endif() - # this alias should be created after all target 
properties have been set add_library(IPPL::ippl ALIAS ippl) diff --git a/src/Communicate/Archive.h b/src/Communicate/Archive.h index 6ec74f946..6b730db3a 100644 --- a/src/Communicate/Archive.h +++ b/src/Communicate/Archive.h @@ -9,6 +9,9 @@ // that they have type char and thus contain raw bytes, unlike other typed buffers // such as detail::FieldBufferData used by HaloCells. // +// On CUDA/HIP the internal buffer is allocated directly via cudaMalloc/hipMalloc +// so that the pointer is page-aligned (4K) and compatible with MPI IPC. +// #ifndef IPPL_ARCHIVE_H #define IPPL_ARCHIVE_H @@ -20,8 +23,7 @@ namespace ippl { namespace detail { /*! - * @file Archive.h - * Serialize and desesrialize particle attributes. + * Serialize and deserialize particle attributes. * @tparam Properties variadic template for Kokkos::View */ @@ -32,6 +34,7 @@ namespace ippl { using pointer_type = typename buffer_type::pointer_type; Archive(size_type size = 0); + ~Archive(); /*! * Serialize. @@ -40,6 +43,14 @@ namespace ippl { template void serialize(const Kokkos::View& view, size_type nsends); + /*! + * @brief Hash-indexed serialize: pack the @p nsends entries + * @p view(hash(i)) into the buffer. + */ + template + void serialize(const Kokkos::View& view, const HashView& hash, + size_type nsends); + /*! * Serialize vector attributes * @@ -52,6 +63,13 @@ namespace ippl { void serialize(const Kokkos::View*, ViewArgs...>& view, size_type nsends); + /*! + * @brief Hash-indexed serialize for Vector views (see scalar overload). + */ + template + void serialize(const Kokkos::View*, ViewArgs...>& view, + const HashView& hash, size_type nsends); + /*! * Deserialize. * @param view to put data to @@ -59,6 +77,21 @@ namespace ippl { template void deserialize(Kokkos::View& view, size_type nrecvs); + /*! + * @brief Offset-aware deserialize: write @p nrecvs entries into + * @p view starting at index @p offset. 
+ */ + template + void deserialize(Kokkos::View& view, size_type offset, + size_type nrecvs); + + /*! + * @brief Offset-aware deserialize for Vector views. + */ + template + void deserialize(Kokkos::View*, ViewArgs...>& view, size_type offset, + size_type nrecvs); + /*! * Deserialize vector attributes * @@ -71,33 +104,78 @@ namespace ippl { void deserialize(Kokkos::View*, ViewArgs...>& view, size_type nrecvs); /*! - * @returns a pointer to the data of the buffer + * @returns a pointer to the data of the buffer. + * On GPU this is a page-aligned device pointer from cudaMalloc/hipMalloc. */ - pointer_type getBuffer() { return buffer_m.data(); } + pointer_type getBuffer() { return bufferData(); } /*! - * @returns the size of the buffer + * @returns the number of bytes written so far */ size_type getSize() const { return writepos_m; } - size_type getBufferSize() const { return buffer_m.size(); } - - void resizeBuffer(size_type size) { Kokkos::resize(buffer_m, size); } + /*! + * @returns the total capacity of the buffer in bytes + */ + size_type getBufferSize() const { return bufferSize(); } - void reallocBuffer(size_type size) { Kokkos::realloc(buffer_m, size); } + //! Resize the buffer, preserving existing bytes when growing. + void resizeBuffer(size_type size); + //! Reallocate the buffer, discarding existing bytes. + void reallocBuffer(size_type size); + //! Reset the serialize write cursor to 0 (buffer contents preserved). void resetWritePos() { writepos_m = 0; } + //! Reset the deserialize read cursor to 0. void resetReadPos() { readpos_m = 0; } - ~Archive() = default; + using memory_space = typename buffer_type::memory_space; + + //! True iff this Archive's memory space is host-inaccessible + //! (CUDA device or HIP device). UVM is excluded -- it works with + //! the regular Kokkos::View path because the host can address + //! the memory directly. For a HostSpace archive the host-side + //! memcpy in serialize() requires a host-accessible buffer, so + //! 
the raw device allocation path must NOT be used there. + static constexpr bool uses_raw_device_alloc = +#if defined(KOKKOS_ENABLE_CUDA) + std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + std::is_same_v +#else + false +#endif + ; private: //! write position for serialization size_type writepos_m; //! read position for deserialization size_type readpos_m; - //! serialized data + + //! Raw device pointer (only valid when uses_raw_device_alloc). + pointer_type buffer_ptr_m = nullptr; + size_type buffer_size_m = 0; + //! Standard Kokkos view buffer (used for host-accessible spaces). buffer_type buffer_m; + + pointer_type bufferData() const { + if constexpr (uses_raw_device_alloc) { + return buffer_ptr_m; + } else { + return buffer_m.data(); + } + } + size_type bufferSize() const { + if constexpr (uses_raw_device_alloc) { + return buffer_size_m; + } else { + return buffer_m.size(); + } + } + + void gpuAlloc(size_type size); + void gpuFree(); }; } // namespace detail } // namespace ippl diff --git a/src/Communicate/Archive.hpp b/src/Communicate/Archive.hpp index a55c33c7d..f1b97d098 100644 --- a/src/Communicate/Archive.hpp +++ b/src/Communicate/Archive.hpp @@ -3,6 +3,13 @@ // Class to (de-)serialize in MPI communication. 
// #include "Archive.h" +#include "Utility/IpplException.h" + +#if defined(KOKKOS_ENABLE_CUDA) +#include +#elif defined(KOKKOS_ENABLE_HIP) +#include +#endif namespace ippl { namespace detail { @@ -12,81 +19,325 @@ namespace ippl { } } +#if defined(KOKKOS_ENABLE_CUDA) + inline void* archiveDeviceAlloc(size_t size) { + void* ptr = nullptr; + cudaError_t rc = cudaMalloc(&ptr, size); + if (rc != cudaSuccess) { + throw IpplException( + "Archive::gpuAlloc", + std::string("cudaMalloc(") + std::to_string(size) + + " bytes) failed: " + cudaGetErrorString(rc)); + } + return ptr; + } + inline void archiveDeviceFree(void* ptr) { + if (!ptr) return; + cudaError_t rc = cudaFree(ptr); + if (rc != cudaSuccess) { + throw IpplException("Archive::gpuFree", + std::string("cudaFree failed: ") + cudaGetErrorString(rc)); + } + } + inline void archiveDeviceCopy(void* dst, const void* src, size_t bytes) { + cudaError_t rc = cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice); + if (rc != cudaSuccess) { + throw IpplException( + "Archive::resizeBuffer", + std::string("cudaMemcpy(D2D) failed: ") + cudaGetErrorString(rc)); + } + } +#elif defined(KOKKOS_ENABLE_HIP) + inline void* archiveDeviceAlloc(size_t size) { + void* ptr = nullptr; + hipError_t rc = hipMalloc(&ptr, size); + if (rc != hipSuccess) { + throw IpplException( + "Archive::gpuAlloc", + std::string("hipMalloc(") + std::to_string(size) + + " bytes) failed: " + hipGetErrorString(rc)); + } + return ptr; + } + inline void archiveDeviceFree(void* ptr) { + if (!ptr) return; + hipError_t rc = hipFree(ptr); + if (rc != hipSuccess) { + throw IpplException("Archive::gpuFree", + std::string("hipFree failed: ") + hipGetErrorString(rc)); + } + } + inline void archiveDeviceCopy(void* dst, const void* src, size_t bytes) { + hipError_t rc = hipMemcpy(dst, src, bytes, hipMemcpyDeviceToDevice); + if (rc != hipSuccess) { + throw IpplException( + "Archive::resizeBuffer", + std::string("hipMemcpy(D2D) failed: ") + hipGetErrorString(rc)); + } + } 
+#endif + + template + struct SerializeHashFunctor { + const T* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const { + // detail::copyBytes (PR #532): byte-loop avoids invoking + // std::memcpy from a Kokkos device kernel. + const char* src = reinterpret_cast(view_data + hash(i)); + char* dst = reinterpret_cast(buf) + i * elem_size + wpos; + copyBytes(dst, src, elem_size); + } + }; + + template + struct SerializeHashVectorFunctor { + const Vector* view_data; + HashView hash; + BufferPtr buf; + size_t elem_size; + size_t wpos; + + KOKKOS_INLINE_FUNCTION void operator()(const size_t i, const size_t d) const { + const Vector* vec = view_data + hash(i); + const T* elem = reinterpret_cast(vec) + d; + char* dst = reinterpret_cast(buf) + (Dim * i + d) * elem_size + wpos; + copyBytes(dst, reinterpret_cast(elem), elem_size); + } + }; + + // ================================================================= + // Buffer management + // ================================================================= + // + // Two storage paths: + // * Host-accessible memory spaces (HostSpace, OpenMP, Serial, ...): + // a regular Kokkos::View in `buffer_m`. + // * Device memory spaces (CudaSpace, HIPSpace): raw cuda/hipMalloc + // in `buffer_ptr_m`. + + template + void Archive::gpuAlloc(size_type size) { + if (!uses_raw_device_alloc || size == 0) return; +#if defined(KOKKOS_ENABLE_HIP) + // HSA IPC likes allocation sizes to be multiples of the GPU page + // granularity (64 KB on MI250X / MI300X). 
+ static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + buffer_ptr_m = static_cast(archiveDeviceAlloc(size)); + buffer_size_m = size; +#endif + } + + template + void Archive::gpuFree() { + if (!uses_raw_device_alloc || !buffer_ptr_m) return; +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + archiveDeviceFree(buffer_ptr_m); +#endif + buffer_ptr_m = nullptr; + buffer_size_m = 0; + } + template Archive::Archive(size_type size) : writepos_m(0) - , readpos_m(0) - , buffer_m("buffer", size) {} + , readpos_m(0) { + if constexpr (uses_raw_device_alloc) { + gpuAlloc(size); + } else { + buffer_m = buffer_type("buffer", size); + } + } + + template + Archive::~Archive() { + if constexpr (uses_raw_device_alloc) { + gpuFree(); + } + } + + template + void Archive::resizeBuffer(size_type size) { + if constexpr (uses_raw_device_alloc) { + if (size <= buffer_size_m) return; +#if defined(KOKKOS_ENABLE_HIP) + static constexpr size_type kGranularity = 65536; + size = ((size + kGranularity - 1) / kGranularity) * kGranularity; +#endif +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + pointer_type new_ptr = + static_cast(archiveDeviceAlloc(size)); + + if (buffer_ptr_m && buffer_size_m > 0) { + archiveDeviceCopy(new_ptr, buffer_ptr_m, buffer_size_m); + archiveDeviceFree(buffer_ptr_m); + } + + buffer_ptr_m = new_ptr; + buffer_size_m = size; +#endif + } else { + Kokkos::resize(buffer_m, size); + } + } + + template + void Archive::reallocBuffer(size_type size) { + // Reallocation discards any data that may have been written into + // the buffer; reset read/write positions so the next caller sees + // a fresh archive. 
+ writepos_m = 0; + readpos_m = 0; + if constexpr (uses_raw_device_alloc) { + gpuFree(); + gpuAlloc(size); + } else { + Kokkos::realloc(buffer_m, size); + } + } + + // ================================================================= + // Serialize -- scalar + // ================================================================= - // ----------------------------------- - // Scalar serialize template template void Archive::serialize(const Kokkos::View& view, size_type nsends) { + // Take main/master's Kokkos::deep_copy-over-Unmanaged-View idiom + // (PR #532) so no std::memcpy is called from a KOKKOS_LAMBDA. The + // *destination* pointer still has to go through bufferData() / + // bufferSize() because this branch keeps the raw cuda/hipMalloc + // path for device-only Archives (the buffer_m view is empty when + // uses_raw_device_alloc is true). constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()) + writepos_m; - char* src_ptr = (char*)(view.data()); - assert(writepos_m + (nsends * size) <= buffer_m.size()); + char* dst_ptr = reinterpret_cast(bufferData()) + writepos_m; + char* src_ptr = reinterpret_cast(const_cast(view.data())); + assert(writepos_m + (nsends * size) <= bufferSize()); // construct temp views of the src/dst buffers of the correct size (bytes) using src_view_type = Kokkos::View::memory_space, Kokkos::MemoryTraits>; using dst_view_type = - Kokkos::View>; src_view_type src_view(src_ptr, size * nsends); dst_view_type dst_view(dst_ptr, size * nsends); Kokkos::deep_copy(dst_view, src_view); Kokkos::fence(); - writepos_m += (nsends * size); + writepos_m += size * nsends; } - // ----------------------------------- - // Vector serialize + // ================================================================= + // Serialize -- scalar with hash + // ================================================================= + + template + template + void Archive::serialize(const Kokkos::View& view, + const HashView& hash, size_type nsends) { + using 
exec_space = HashView::execution_space; + using policy_type = Kokkos::RangePolicy; + using BufferPtr = pointer_type; + + SerializeHashFunctor f{view.data(), hash, bufferData(), + sizeof(T), writepos_m}; + + Kokkos::parallel_for("Archive::serialize(hash)", policy_type(0, nsends), f); + Kokkos::fence(); + writepos_m += sizeof(T) * nsends; + } + + // ================================================================= + // Serialize -- vector + // ================================================================= + template template void Archive::serialize( const Kokkos::View*, ViewArgs...>& view, size_type nsends) { - constexpr size_t size = sizeof(T); - char* dst_ptr = (char*)(buffer_m.data()); - ippl::Vector* src_ptr = view.data(); - auto wp = writepos_m; - // The Kokkos range policies expect int64 + using exec_space = + typename Kokkos::View*, ViewArgs...>::execution_space; + + // Capture raw pointers + bufferData() so the kernel calls + // detail::copyBytes (PR #532) instead of std::memcpy while still + // honouring the raw-device-alloc path that pif-pr added. 
+ constexpr size_t size = sizeof(T); + char* dst_ptr = reinterpret_cast(bufferData()); + ippl::Vector* src_ptr = const_cast*>(view.data()); + const size_type wp = writepos_m; + // Default index type for range policies is int64, // so we have to explicitly specify size_type (uint64) - using exec_space = typename Kokkos::View::execution_space; using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; Kokkos::parallel_for( - "Archive::serialize()", mdrange_t({0, 0}, {(long int)nsends, Dim}), + "Archive::serialize()", + // The constructor for Kokkos range policies always + // expects int64 regardless of index type provided + // by template parameters, so the typecast is necessary + // to avoid compiler warnings + mdrange_t({0, 0}, {static_cast(nsends), Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { const char* src = reinterpret_cast(&src_ptr[i][d]); char* dst = dst_ptr + (Dim * i + d) * size + wp; copyBytes(dst, src, size); }); + Kokkos::fence(); + writepos_m += Dim * size * nsends; + } + + // ================================================================= + // Serialize -- vector with hash + // ================================================================= + + template + template + void Archive::serialize( + const Kokkos::View*, ViewArgs...>& view, const HashView& hash, + size_type nsends) { + using exec_space = typename HashView::execution_space; + size_t size = sizeof(T); + using BufferPtr = pointer_type; + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + + SerializeHashVectorFunctor f{ + view.data(), hash, bufferData(), size, writepos_m}; + Kokkos::parallel_for("Archive::serialize(hash, vector)", + mdrange_t({0, 0}, {static_cast(nsends), Dim}), f); Kokkos::fence(); writepos_m += Dim * size * nsends; } - // ----------------------------------- - // Scalar Deserialize + // ================================================================= + // Deserialize -- scalar + // 
================================================================= + template template void Archive::deserialize(Kokkos::View& view, size_type nrecvs) { - // if we have to enlarge the destination view + constexpr size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()) + readpos_m; - char* dst_ptr = (char*)(view.data()); - assert(readpos_m + (nrecvs * size) <= buffer_m.size()); + // Same Kokkos::deep_copy-over-Unmanaged-View pattern as serialize() + // (PR #532), going through bufferData() / bufferSize() so the + // raw-device-alloc Archive variant works. + char* src_ptr = reinterpret_cast(bufferData()) + readpos_m; + char* dst_ptr = reinterpret_cast(view.data()); + assert(readpos_m + (nrecvs * size) <= bufferSize()); // construct temp views of the src/dst buffers of the correct size (bytes) using src_view_type = - Kokkos::View>; using dst_view_type = Kokkos::View::memory_space, @@ -95,30 +346,95 @@ namespace ippl { dst_view_type dst_view(dst_ptr, size * nrecvs); Kokkos::deep_copy(dst_view, src_view); Kokkos::fence(); - readpos_m += (nrecvs * size); + readpos_m += size * nrecvs; } - // ----------------------------------- - // Vector Deserialize + // ================================================================= + // Deserialize -- vector + // ================================================================= + template template void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, - size_type nrecvs) - { - // if we have to enlarge the destination view + size_type nrecvs) { + using exec_space = + typename Kokkos::View*, ViewArgs...>::execution_space; + + constexpr size_t size = sizeof(T); if (nrecvs > view.extent(0)) { Kokkos::realloc(view, nrecvs); } - // - constexpr size_t size = sizeof(T); - char* src_ptr = (char*)(buffer_m.data()); - ippl::Vector* dst_ptr = view.data(); - auto rp = readpos_m; - using exec_space = typename 
Kokkos::View::execution_space; using mdrange_t = Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + char* src_ptr = reinterpret_cast(bufferData()); + ippl::Vector* dst_ptr = view.data(); + const size_type rp = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize()", mdrange_t({0, 0}, {static_cast(nrecvs), Dim}), + KOKKOS_LAMBDA(const size_type i, const size_t d) { + const char* src = src_ptr + (Dim * i + d) * size + rp; + char* dst = reinterpret_cast(&dst_ptr[i][d]); + copyBytes(dst, src, size); + }); + Kokkos::fence(); + readpos_m += Dim * size * nrecvs; + } + + // ================================================================= + // Deserialize -- scalar with offset + // ================================================================= + // + // Offset variants are kept on the pif-pr branch -- ParticleAttrib uses + // them to deserialize into a sub-range of dview_m for incremental + // particle migration buffer reads. main/master removed them in PR #532 + // because its ParticleBase deserialises particles in one shot, but the + // pif-pr Particle refactor (2878b90b) still needs the offset path. 
+ + template + template + void Archive::deserialize(Kokkos::View& view, + size_type offset, size_type nrecvs) { + using exec_space = typename Kokkos::View::execution_space; + using policy_type = Kokkos::RangePolicy; + constexpr size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + char* src_ptr = reinterpret_cast(bufferData()); + T* dst_ptr = view.data() + offset; + const size_type rp = readpos_m; + Kokkos::parallel_for( + "Archive::deserialize(offset)", policy_type(0, nrecvs), + KOKKOS_LAMBDA(const size_type i) { + const char* src = src_ptr + i * size + rp; + char* dst = reinterpret_cast(dst_ptr + i); + copyBytes(dst, src, size); + }); + Kokkos::fence(); + readpos_m += size * nrecvs; + } + + // ================================================================= + // Deserialize -- vector with offset + // ================================================================= + + template + template + void Archive::deserialize(Kokkos::View*, ViewArgs...>& view, + size_type offset, size_type nrecvs) { + using exec_space = + typename Kokkos::View*, ViewArgs...>::execution_space; + constexpr size_t size = sizeof(T); + if (offset + nrecvs > view.extent(0)) { + Kokkos::resize(view, offset + nrecvs); + } + using mdrange_t = + Kokkos::MDRangePolicy, Kokkos::IndexType, exec_space>; + char* src_ptr = reinterpret_cast(bufferData()); + ippl::Vector* dst_ptr = view.data() + offset; + const size_type rp = readpos_m; Kokkos::parallel_for( - "Archive::deserialize()", mdrange_t({0, 0}, {(long int)nrecvs, Dim}), + "Archive::deserialize(offset, vector)", mdrange_t({0, 0}, {static_cast(nrecvs), Dim}), KOKKOS_LAMBDA(const size_type i, const size_t d) { const char* src = src_ptr + (Dim * i + d) * size + rp; char* dst = reinterpret_cast(&dst_ptr[i][d]); diff --git a/src/Communicate/BufferHandler.hpp b/src/Communicate/BufferHandler.hpp index c6d57f0a3..c83c8d4d2 100644 --- a/src/Communicate/BufferHandler.hpp +++ 
b/src/Communicate/BufferHandler.hpp @@ -1,6 +1,8 @@ #ifndef IPPL_BUFFER_HANDLER_HPP #define IPPL_BUFFER_HANDLER_HPP +#include + namespace ippl { template @@ -11,6 +13,15 @@ namespace ippl { DefaultBufferHandler::getBuffer(size_type size, double overallocation) { size_type requiredSize = static_cast(size * overallocation); + // Round up to page granularity to avoid 0-byte or tiny allocations + // and to align with the GPU-aware MPI registration cache. + constexpr size_type PAGE_SIZE = 4096; + if (requiredSize < PAGE_SIZE) { + requiredSize = PAGE_SIZE; + } else { + requiredSize = ((requiredSize + PAGE_SIZE - 1) / PAGE_SIZE) * PAGE_SIZE; + } + auto freeBuffer = findFreeBuffer(requiredSize); if (freeBuffer != nullptr) { return getFreeBuffer(freeBuffer); @@ -120,17 +131,12 @@ namespace ippl { template typename DefaultBufferHandler::buffer_type DefaultBufferHandler::reallocateLargestFreeBuffer(size_type requiredSize) { - auto largest_it = std::prev(free_buffers.end()); - buffer_type buffer = *largest_it; - - freeSize_m -= buffer->getBufferSize(); - usedSize_m += requiredSize; - - free_buffers.erase(buffer); - buffer->reallocBuffer(requiredSize); - - used_buffers.insert(buffer); - return buffer; + // Always allocate a + // fresh buffer instead of reallocating the largest free one: a + // free + alloc cycle can release the device pointer that a GPU-aware + // MPI's registration cache still holds. Keeping the old buffers in the free + // pool and returning a new buffer at a new address sidesteps this. 
+ return allocateNewBuffer(requiredSize); } template diff --git a/src/Communicate/Buffers.cpp b/src/Communicate/Buffers.cpp index 026633fec..e99100bc2 100644 --- a/src/Communicate/Buffers.cpp +++ b/src/Communicate/Buffers.cpp @@ -32,13 +32,13 @@ namespace ippl { } void Communicator::deleteAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.deleteAllBuffers(); }); } void Communicator::freeAllBuffers() { - buffer_handlers_m->forAll([](BufferHandler&& bh) { + getBufferHandler().forAll([](BufferHandler&& bh) { bh.freeAllBuffers(); }); } diff --git a/src/Communicate/Buffers.hpp b/src/Communicate/Buffers.hpp index 98c0b513f..597461b4d 100644 --- a/src/Communicate/Buffers.hpp +++ b/src/Communicate/Buffers.hpp @@ -20,21 +20,23 @@ // exchanging particle data between ranks. // +#include "Utility/IpplTimings.h" namespace ippl { namespace mpi { template Communicator::buffer_type Communicator::getBuffer(size_type size, double overallocation) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); - return buffer_handler.getBuffer(size * sizeof(T), - std::max(overallocation, defaultOveralloc_m)); + auto buf = buffer_handler.getBuffer(size * sizeof(T), + std::max(overallocation, defaultOveralloc_m)); + return buf; } template void Communicator::freeBuffer(Communicator::buffer_type buffer) { - auto& buffer_handler = buffer_handlers_m->get(); + auto& buffer_handler = getBufferHandler().get(); buffer_handler.freeBuffer(buffer); } diff --git a/src/Communicate/Communicator.cpp b/src/Communicate/Communicator.cpp index 2c33f939f..e1a8b777b 100644 --- a/src/Communicate/Communicator.cpp +++ b/src/Communicate/Communicator.cpp @@ -3,25 +3,27 @@ namespace ippl::mpi { + namespace { + // Populate rank_m / size_m from the live communicator. 
+ void cacheRankAndSize(const MPI_Comm& comm, int& rank, int& size) { + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + } + } // namespace + Communicator::Communicator() - : buffer_handlers_m(get_buffer_handler_instance()) - , comm_m(new MPI_Comm(MPI_COMM_WORLD)) { - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + : comm_m(new MPI_Comm(MPI_COMM_WORLD)) { + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator::Communicator(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); } Communicator& Communicator::operator=(MPI_Comm comm) { - buffer_handlers_m = get_buffer_handler_instance(); - comm_m = std::make_shared(comm); - MPI_Comm_rank(*comm_m, &rank_m); - MPI_Comm_size(*comm_m, &size_m); + comm_m = std::make_shared(comm); + cacheRankAndSize(*comm_m, rank_m, size_m); return *this; } @@ -41,14 +43,4 @@ namespace ippl::mpi { return (flag != 0); } - // --------------------------------------- - // singleton access to buffer manager - // --------------------------------------- - std::shared_ptr Communicator::get_buffer_handler_instance() { - static std::shared_ptr comm_buff_handler_ptr{nullptr}; - if (comm_buff_handler_ptr == nullptr) { - comm_buff_handler_ptr = std::make_shared(); - } - return comm_buff_handler_ptr; - } } // namespace ippl::mpi diff --git a/src/Communicate/Communicator.h b/src/Communicate/Communicator.h index 9819ab3bd..7685d20d5 100644 --- a/src/Communicate/Communicator.h +++ b/src/Communicate/Communicator.h @@ -15,6 +15,7 @@ //////////////////////////////////////////////// // For message size check; see below + #include #include @@ -27,7 +28,6 @@ namespace ippl { namespace mpi { - class Communicator : public TagMaker { public: Communicator(); @@ -157,16 +157,21 @@ namespace ippl { const MPI_Comm& getCommunicator() 
const noexcept { return *comm_m; } + // MPI uses int for byte counts; messages exceeding INT_MAX must + // be split. This shared check aborts the run with a single + // diagnostic instead of silently truncating. + void assertMessageSize(size_type msize) const { + if (msize > static_cast(INT_MAX)) { + std::cerr << "Communicator: message size " << msize + << " bytes exceeds INT_MAX (" << INT_MAX << ")\n"; + MPI_Abort(*comm_m, -1); + } + } + template void recv(int src, int tag, Buffer& buffer, Archive& ar, size_type msize, size_type nrecvs) { - // Temporary fix. MPI communication seems to have problems when the - // count argument exceeds the range of int, so large messages should - // be split into smaller messages - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + assertMessageSize(msize); MPI_Status status; MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); @@ -176,20 +181,27 @@ namespace ippl { template void isend(int dest, int tag, Buffer& buffer, Archive& ar, MPI_Request& request, size_type nsends) { - if (ar.getSize() > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } buffer.serialize(ar, nsends); + // Validate after serialize(): the count handed to MPI_Isend below is + // ar.getSize() as it stands once this buffer has been packed. + assertMessageSize(ar.getSize()); MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); } + template + void isend(int dest, int tag, Archive& ar, MPI_Request& request) { + assertMessageSize(ar.getSize()); + MPI_Isend(ar.getBuffer(), ar.getSize(), MPI_BYTE, dest, tag, *comm_m, &request); + } + + template + void recv(int src, int tag, Archive& ar, size_type msize) { + assertMessageSize(msize); + MPI_Status status; + MPI_Recv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &status); + } + template void irecv(int src, int tag, Archive& ar, MPI_Request& request, size_type msize) { - if (msize > INT_MAX) { - std::cerr << "Message size exceeds range of int" << std::endl; - this->abort(); - } + 
assertMessageSize(msize); MPI_Irecv(ar.getBuffer(), msize, MPI_BYTE, src, tag, *comm_m, &request); } @@ -201,7 +213,11 @@ namespace ippl { std::vector gatherLogsFromAllRanks(const std::vector& localLogs); void writeLogsToFile(const std::vector& allLogs, const std::string& filename); - std::shared_ptr buffer_handlers_m; + static buffer_handler_type& getBufferHandler() { + static buffer_handler_type handler; + return handler; + } + double defaultOveralloc_m = 1.0; ///////////////////////////////////////////////////////////////////////////////////// @@ -210,12 +226,10 @@ namespace ippl { std::shared_ptr comm_m; int size_m; int rank_m; - - public: - std::shared_ptr get_buffer_handler_instance(); }; } // namespace mpi + } // namespace ippl #include "Communicate/Collectives.hpp" diff --git a/src/Communicate/CommunicatorLogging.cpp b/src/Communicate/CommunicatorLogging.cpp index 541ff4074..39c0197e8 100644 --- a/src/Communicate/CommunicatorLogging.cpp +++ b/src/Communicate/CommunicatorLogging.cpp @@ -7,8 +7,16 @@ #include "Communicate/Communicator.h" #include "Communicate/LogEntry.h" +#include "Communicate/LoggingBufferHandler.h" namespace ippl::mpi { + + template + struct is_a_logger : std::false_type {}; + + template + struct is_a_logger> : std::true_type {}; + void Communicator::printLogs(const std::string& filename) { std::vector localLogs = gatherLocalLogs(); @@ -24,20 +32,17 @@ namespace ippl::mpi { } } - template - struct is_a_logger : std::false_type {}; - - template - struct is_a_logger > : std::true_type {}; - std::vector Communicator::gatherLocalLogs() { std::vector localLogs; - if constexpr (is_a_logger::value) { - buffer_handlers_m->forAll([&](auto& loggingHandler) { - const auto& logs = loggingHandler.getLogs(); + + getBufferHandler().forAll([&](auto& handler) { + using handler_t = std::decay_t; + if constexpr (is_a_logger::value) { + const auto& logs = handler.getLogs(); localLogs.insert(localLogs.end(), logs.begin(), logs.end()); - }); - } + } + }); + 
return localLogs; } @@ -88,11 +93,9 @@ namespace ippl::mpi { size_t offset = 0; while (offset < buffer.size()) { - LogEntry logEntry = LogEntry::deserialize(buffer, offset); - - logs.push_back(logEntry); - - offset += logEntry.serialize().size(); + // deserializeAdvance walks offset past the consumed bytes - + // avoids the O(N^2) blowup of re-serializing each entry + logs.push_back(LogEntry::deserializeAdvance(buffer, offset)); } return logs; } diff --git a/src/Communicate/Environment.cpp b/src/Communicate/Environment.cpp index 8e19df365..706c821f1 100644 --- a/src/Communicate/Environment.cpp +++ b/src/Communicate/Environment.cpp @@ -9,9 +9,23 @@ namespace ippl { namespace mpi { Environment::Environment(int& argc, char**& argv, const MPI_Comm& comm) - : comm_m(comm) { + : comm_m(comm) + , threadMultiple_m(false) { if (!initialized()) { - MPI_Init(&argc, &argv); + int provided = MPI_THREAD_SINGLE; + int rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); + if (rc != MPI_SUCCESS) { + std::cerr << "MPI_Init_thread failed (rc=" << rc << ")" << std::endl; + std::exit(EXIT_FAILURE); + } + threadMultiple_m = (provided >= MPI_THREAD_MULTIPLE); + if (!threadMultiple_m) { + int rank = 0; + MPI_Comm_rank(comm_m, &rank); + if (rank == 0) { + std::cerr << "MPI doesn't support MPI_THREAD_MULTIPLE!" 
<< std::endl; + } + } } } diff --git a/src/Communicate/Environment.h b/src/Communicate/Environment.h index fd03a9ee4..bad195df4 100644 --- a/src/Communicate/Environment.h +++ b/src/Communicate/Environment.h @@ -23,10 +23,13 @@ namespace ippl { static bool finalized(); + bool threadMultiple() noexcept { return threadMultiple_m; } + void abort(int errorcode = -1) noexcept { MPI_Abort(comm_m, errorcode); } private: MPI_Comm comm_m; + bool threadMultiple_m; }; } // namespace mpi } // namespace ippl diff --git a/src/Communicate/LogEntry.cpp b/src/Communicate/LogEntry.cpp index 444221d6d..66539e879 100644 --- a/src/Communicate/LogEntry.cpp +++ b/src/Communicate/LogEntry.cpp @@ -40,28 +40,31 @@ namespace ippl { return buffer; } - LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + LogEntry LogEntry::deserializeAdvance(const std::vector& buffer, size_t& offset) { LogEntry entry; - size_t current_pos = offset; - entry.methodName = deserializeString(buffer, current_pos); - entry.usedSize = deserializeBasicType(buffer, current_pos); - entry.freeSize = deserializeBasicType(buffer, current_pos); - entry.memorySpace = deserializeString(buffer, current_pos); - entry.rank = deserializeBasicType(buffer, current_pos); + entry.methodName = deserializeString(buffer, offset); + entry.usedSize = deserializeBasicType(buffer, offset); + entry.freeSize = deserializeBasicType(buffer, offset); + entry.memorySpace = deserializeString(buffer, offset); + entry.rank = deserializeBasicType(buffer, offset); - auto duration = deserializeBasicType(buffer, current_pos); + auto duration = deserializeBasicType(buffer, offset); entry.timestamp = std::chrono::time_point( std::chrono::high_resolution_clock::duration(duration)); - size_t mapSize = deserializeBasicType(buffer, current_pos); + size_t mapSize = deserializeBasicType(buffer, offset); for (size_t i = 0; i < mapSize; ++i) { - std::string key = deserializeString(buffer, current_pos); - std::string value = 
deserializeString(buffer, current_pos); + std::string key = deserializeString(buffer, offset); + std::string value = deserializeString(buffer, offset); entry.parameters[key] = value; } return entry; } + LogEntry LogEntry::deserialize(const std::vector& buffer, size_t offset) { + return deserializeAdvance(buffer, offset); + } + } // namespace ippl diff --git a/src/Communicate/LogEntry.h b/src/Communicate/LogEntry.h index a853c3431..1929915c3 100644 --- a/src/Communicate/LogEntry.h +++ b/src/Communicate/LogEntry.h @@ -20,6 +20,11 @@ namespace ippl { std::vector serialize() const; static LogEntry deserialize(const std::vector& buffer, size_t offset = 0); + + /// Variant of deserialize() that advances the caller's offset past + /// the just-read entry. Lets callers walk a byte-stream of entries + /// without re-serializing each one to compute its size. + static LogEntry deserializeAdvance(const std::vector& buffer, size_t& offset); }; template diff --git a/src/Communicate/LoggingBufferHandler.h b/src/Communicate/LoggingBufferHandler.h index f6a0bd5d6..95fb418cd 100644 --- a/src/Communicate/LoggingBufferHandler.h +++ b/src/Communicate/LoggingBufferHandler.h @@ -1,7 +1,6 @@ #ifndef IPPL_LOGGING_BUFFER_HANDLER_H #define IPPL_LOGGING_BUFFER_HANDLER_H -#include #include #include #include diff --git a/src/FEM/FEMVector.hpp b/src/FEM/FEMVector.hpp index 7e94d2bee..b8ad4ce63 100644 --- a/src/FEM/FEMVector.hpp +++ b/src/FEM/FEMVector.hpp @@ -312,8 +312,8 @@ namespace ippl{ auto& bufferData = boundaryInfo_m->commBuffer_m.buffer; if (bufferData.size() < nIdxs) { - int overalloc = Comm->getDefaultOverallocation(); - Kokkos::realloc(bufferData, nIdxs * overalloc); + double overalloc = Comm->getDefaultOverallocation(); + Kokkos::realloc(bufferData, static_cast(nIdxs * overalloc)); } Kokkos::parallel_for("FEMVector::pack()", nIdxs, @@ -339,8 +339,8 @@ namespace ippl{ size_t nIdxs = idxStore.extent(0); auto& bufferData = boundaryInfo_m->commBuffer_m.buffer; if (bufferData.size() < 
nIdxs) { - int overalloc = Comm->getDefaultOverallocation(); - Kokkos::realloc(bufferData, nIdxs * overalloc); + double overalloc = Comm->getDefaultOverallocation(); + Kokkos::realloc(bufferData, static_cast(nIdxs * overalloc)); } Op op; diff --git a/src/FEM/LagrangeSpace.hpp b/src/FEM/LagrangeSpace.hpp index b1743cd4f..a4414f6cc 100644 --- a/src/FEM/LagrangeSpace.hpp +++ b/src/FEM/LagrangeSpace.hpp @@ -278,8 +278,7 @@ namespace ippl { LagrangeSpace:: evaluateRefElementShapeFunction( const size_t& localDOF, - const LagrangeSpace::point_t& localPoint) const { + const point_t& localPoint) const { static_assert(Order == 1, "Only order 1 is supported at the moment"); // Assert that the local vertex index is valid. assert(localDOF < numElementDOFs diff --git a/src/FFT/Backend/Backend.h b/src/FFT/Backend/Backend.h new file mode 100644 index 000000000..dcfb3e46a --- /dev/null +++ b/src/FFT/Backend/Backend.h @@ -0,0 +1,22 @@ +/*! + * @file Backend.h + * @brief Aggregate include for all FFT backends supported by IPPL. + * + * Pulls in the heFFTe wrappers unconditionally, plus optional CUDA-only + * (CuFFT) and CUDA+MPI (CuFFTMp) backends gated by the corresponding + * Kokkos / IPPL configuration macros. + */ +#ifndef IPPL_FFT_BACKEND_H +#define IPPL_FFT_BACKEND_H + +#include "FFT/Backend/Heffte.h" + +#ifdef IPPL_ENABLE_CUFFTMP +#include "FFT/Backend/CuFFTMp.h" +#endif + +#ifdef KOKKOS_ENABLE_CUDA +#include "FFT/Backend/CuFFT.h" +#endif + +#endif // IPPL_FFT_BACKEND_H diff --git a/src/FFT/Backend/CuFFT.h b/src/FFT/Backend/CuFFT.h new file mode 100644 index 000000000..b54a2fd25 --- /dev/null +++ b/src/FFT/Backend/CuFFT.h @@ -0,0 +1,388 @@ +#ifndef IPPL_FFT_BACKEND_CUFFT_H +#define IPPL_FFT_BACKEND_CUFFT_H + +#ifdef KOKKOS_ENABLE_CUDA + +#include +#include +#include +#include +#include + +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +namespace ippl { +namespace fft { + + namespace detail { + /*! 
+ * @brief Throw IpplException if @p result is not CUFFT_SUCCESS. + * @param result cuFFT API return code. + * @param context Human-readable label for the failing call. + */ + inline void checkCufftResult(cufftResult result, const char* context) { + if (result != CUFFT_SUCCESS) { + std::string msg = + std::string(context) + " (error code: " + std::to_string(result) + ")"; + throw IpplException("cuFFT", msg.c_str()); + } + } + + /*! + * @brief Throw IpplException if @p err is not cudaSuccess. + * @param err CUDA runtime error code. + * @param context Human-readable label for the failing call. + */ + inline void checkCudaError(cudaError_t err, const char* context) { + if (err != cudaSuccess) { + std::string msg = std::string(context) + ": " + cudaGetErrorString(err); + throw IpplException("cuFFT", msg.c_str()); + } + } + } // namespace detail + + namespace detail { + /*! + * @brief CUDA kernel that scales each complex element of @p data by @p scale. + * + * @tparam T cuFFT complex type (cufftComplex or cufftDoubleComplex). + * @param data Device pointer to the buffer to scale. + * @param n Number of complex elements in @p data. + * @param scale Scalar multiplier applied to both .x and .y components. + */ + template + __global__ void cufftScaleKernel(T* data, size_t n, double scale) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx].x *= scale; + data[idx].y *= scale; + } + } + } // namespace detail + + //========================================================================= + // CuFFTC2C - Single-node cuFFT with Batched Support + //========================================================================= + + /*! + * @class CuFFTC2C + * @brief Single-node cuFFT C2C wrapper with batched-transform support. + * + * Mirrors the public interface of HeffteC2C / CuFFTMpC2C so transforms + * can be swapped at compile time. Maintains an internal CUDA stream and + * an additional plan for partial batches (when @c maxBatchSize > 1). 
+ * + * Forward transforms are normalized by 1 / globalElements via a scaling + * kernel; backward transforms are unscaled. + * + * @tparam T Real precision (float / double). + * @tparam Dim Spatial dimension (only 3D is supported). + * @tparam MemSpace Must be Kokkos::CudaSpace. + */ + template + class CuFFTC2C { + public: + static_assert(std::is_same_v, + "CuFFTC2C requires Kokkos::CudaSpace"); + static_assert(Dim == 3, "CuFFTC2C only supports 3D"); + static_assert(std::is_same_v || std::is_same_v, + "CuFFTC2C only supports float and double precision"); + + using complex_t = Kokkos::complex; + using cuda_complex_t = std::conditional_t, + cufftComplex, + cufftDoubleComplex>; + + static constexpr cufftType fft_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + + /*! + * @brief Build the cuFFT plans for the given local box decomposition. + * + * Mirrors the heFFTe / cuFFTMp constructor signature. @p comm is used + * only to MPI_Allreduce the global FFT size; the actual transforms are + * single-node (one plan per rank). + * + * @param inbox Local input box (inclusive corner indices). + * @param outbox Local output box (inclusive corner indices). + * @param comm MPI communicator (used only for global-size reduction). + * @param maxBatchSize Maximum number of transforms in a single batched call. 
+ */ + CuFFTC2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, + MPI_Comm comm, + const ParameterList& /*params*/, + int maxBatchSize = 1) + : comm_(comm) + , maxBatchSize_(maxBatchSize) + { + using detail::checkCudaError; + using detail::checkCufftResult; + + // Extract local dimensions from inbox + for (int d = 0; d < 3; ++d) { + lowerIn_[d] = inbox.low[d]; + upperIn_[d] = inbox.high[d] + 1; + lowerOut_[d] = outbox.low[d]; + upperOut_[d] = outbox.high[d] + 1; + localSize_[d] = upperIn_[d] - lowerIn_[d]; + } + + // Compute global size via MPI reduction + std::array localMax; + for (int d = 0; d < 3; ++d) { + localMax[d] = std::max(upperIn_[d], upperOut_[d]); + } + MPI_Allreduce(localMax.data(), globalSize_.data(), 3, MPI_LONG_LONG, MPI_MAX, comm); + + localElements_ = localSize_[0] * localSize_[1] * localSize_[2]; + globalElements_ = globalSize_[0] * globalSize_[1] * globalSize_[2]; + + // Create CUDA stream + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + + // cuFFT expects row-major (C-order) dimensions. The Kokkos views are + // LayoutLeft, so dimension 0 is fastest-varying - pass extents reversed. 
+ int n[3] = { + static_cast(localSize_[2]), + static_cast(localSize_[1]), + static_cast(localSize_[0]) + }; + + int inembed[3] = {n[0], n[1], n[2]}; + int onembed[3] = {n[0], n[1], n[2]}; + int istride = 1; + int ostride = 1; + int idist = static_cast(localElements_); + int odist = static_cast(localElements_); + + // Create batched plan + checkCufftResult( + cufftPlanMany(&planBatched_, 3, n, + inembed, istride, idist, + onembed, ostride, odist, + fft_type, maxBatchSize), + "Failed to create batched cuFFT plan"); + + checkCufftResult(cufftSetStream(planBatched_, stream_), + "Failed to set stream on batched plan"); + + // Create single-transform plan if needed + if (maxBatchSize > 1) { + checkCufftResult( + cufftPlanMany(&planSingle_, 3, n, + inembed, istride, idist, + onembed, ostride, odist, + fft_type, 1), + "Failed to create single cuFFT plan"); + + checkCufftResult(cufftSetStream(planSingle_, stream_), + "Failed to set stream on single plan"); + } else { + planSingle_ = planBatched_; + } + } + + ~CuFFTC2C() { + if (planBatched_) cufftDestroy(planBatched_); + if (maxBatchSize_ > 1 && planSingle_) cufftDestroy(planSingle_); + // Only the stream created by the constructor is ours to destroy. + if (ownsStream_ && stream_) cudaStreamDestroy(stream_); + } + + // Non-copyable + CuFFTC2C(const CuFFTC2C&) = delete; + CuFFTC2C& operator=(const CuFFTC2C&) = delete; + + // Movable + CuFFTC2C(CuFFTC2C&& other) noexcept + : planBatched_(other.planBatched_) + , planSingle_(other.planSingle_) + , stream_(other.stream_) + , ownsStream_(other.ownsStream_) + , comm_(other.comm_) + , maxBatchSize_(other.maxBatchSize_) + , localElements_(other.localElements_) + , globalElements_(other.globalElements_) + , localSize_(other.localSize_) + , globalSize_(other.globalSize_) + , lowerIn_(other.lowerIn_) + , upperIn_(other.upperIn_) + , lowerOut_(other.lowerOut_) + , upperOut_(other.upperOut_) + { + other.planBatched_ = 0; + other.planSingle_ = 0; + other.stream_ = nullptr; + } + + CuFFTC2C& operator=(CuFFTC2C&& other) noexcept { + if (this != &other) { + if (planBatched_) cufftDestroy(planBatched_); + 
if (maxBatchSize_ > 1 && planSingle_) cufftDestroy(planSingle_); + if (ownsStream_ && stream_) cudaStreamDestroy(stream_); + + planBatched_ = other.planBatched_; + planSingle_ = other.planSingle_; + stream_ = other.stream_; + ownsStream_ = other.ownsStream_; + comm_ = other.comm_; + maxBatchSize_ = other.maxBatchSize_; + localElements_ = other.localElements_; + globalElements_ = other.globalElements_; + localSize_ = other.localSize_; + globalSize_ = other.globalSize_; + lowerIn_ = other.lowerIn_; + upperIn_ = other.upperIn_; + lowerOut_ = other.lowerOut_; + upperOut_ = other.upperOut_; + + other.planBatched_ = 0; + other.planSingle_ = 0; + other.stream_ = nullptr; + } + return *this; + } + + //! Forward C2C transform of a single buffer; output is normalized by 1 / globalElements. + void forward(complex_t* in, complex_t* out) { + execute(planSingle_, in, out, CUFFT_FORWARD); + applyScaling(out, localElements_, T(1) / static_cast(globalElements_)); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + //! Backward C2C transform of a single buffer (unscaled). + void backward(complex_t* in, complex_t* out) { + execute(planSingle_, in, out, CUFFT_INVERSE); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + /*! + * @brief Batched forward C2C transform. + * + * Uses the batched plan when @p batchSize equals the configured maximum, + * otherwise loops over individual transforms with the single-shot plan. + * + * @param batchSize Number of contiguous transforms to perform. + * @param in Device pointer to the input batch. + * @param out Device pointer to the output batch. 
+ */ + void forward(int batchSize, complex_t* in, complex_t* out) { + if (batchSize > maxBatchSize_) { + throw IpplException("CuFFTC2C", "Batch size exceeds plan capacity"); + } + + if (batchSize == maxBatchSize_) { + execute(planBatched_, in, out, CUFFT_FORWARD); + } else { + // Execute individual transforms for partial batch + for (int b = 0; b < batchSize; ++b) { + execute(planSingle_, + in + b * localElements_, + out + b * localElements_, + CUFFT_FORWARD); + } + } + applyScaling(out, localElements_ * batchSize, T(1) / static_cast(globalElements_)); + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + /*! + * @brief Batched backward C2C transform (unscaled). + * @param batchSize Number of contiguous transforms to perform. + * @param in Device pointer to the input batch. + * @param out Device pointer to the output batch. + */ + void backward(int batchSize, complex_t* in, complex_t* out) { + if (batchSize > maxBatchSize_) { + throw IpplException("CuFFTC2C", "Batch size exceeds plan capacity"); + } + + if (batchSize == maxBatchSize_) { + execute(planBatched_, in, out, CUFFT_INVERSE); + } else { + for (int b = 0; b < batchSize; ++b) { + execute(planSingle_, + in + b * localElements_, + out + b * localElements_, + CUFFT_INVERSE); + } + } + detail::checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + /*! + * @brief Replace the CUDA stream used by both plans. + * + * The stream created by the constructor is destroyed the first time a + * different stream is installed; @p stream itself is never destroyed + * by this object. + * + * @param stream User-managed stream that outlives this object. + */ + void setStream(cudaStream_t stream) { + if (stream == stream_) { + return; + } + detail::checkCufftResult(cufftSetStream(planBatched_, stream), + "Failed to set stream on batched plan"); + if (maxBatchSize_ > 1) { + detail::checkCufftResult(cufftSetStream(planSingle_, stream), + "Failed to set stream on single plan"); + } + // Release the constructor's stream; a caller-provided one is left alone. + if (ownsStream_ && stream_) { + cudaStreamDestroy(stream_); + } + stream_ = stream; + ownsStream_ = false; + } + + //! cuFFT manages its workspace internally, so this always returns 0. + size_t workspace_size() const { return 0; } + //! 
@return Number of complex elements in the local box. + size_t local_size() const { return localElements_; } + //! @return Total number of complex elements in the global FFT. + size_t global_size() const { return globalElements_; } + //! @return Local input box size in elements. + size_t size_inbox() const { return localElements_; } + //! @return Local output box size in elements. + size_t size_outbox() const { return localElements_; } + //! @return Maximum batch size the plans were created for. + int max_batch_size() const { return maxBatchSize_; } + //! @return CUDA stream currently used for plan execution. + cudaStream_t stream() const { return stream_; } + //! @return Local box extent along each axis (i, j, k). + const std::array& local_dims() const { return localSize_; } + //! @return Global FFT extent along each axis (i, j, k). + const std::array& global_dims() const { return globalSize_; } + + private: + void execute(cufftHandle plan, complex_t* in, complex_t* out, int direction) { + auto* inPtr = reinterpret_cast(in); + auto* outPtr = reinterpret_cast(out); + + if constexpr (std::is_same_v) { + detail::checkCufftResult(cufftExecC2C(plan, inPtr, outPtr, direction), + "cuFFT C2C execution failed"); + } else { + detail::checkCufftResult(cufftExecZ2Z(plan, inPtr, outPtr, direction), + "cuFFT Z2Z execution failed"); + } + } + + void applyScaling(complex_t* data, size_t count, T scale) { + auto* ptr = reinterpret_cast(data); + constexpr size_t blockSize = 256; + size_t numBlocks = (count + blockSize - 1) / blockSize; + detail::cufftScaleKernel<<>>( + ptr, count, static_cast(scale)); + } + + cufftHandle planBatched_ = 0; + cufftHandle planSingle_ = 0; + cudaStream_t stream_ = nullptr; + bool ownsStream_ = true; /*!< true while stream_ is the one created by the constructor */ + MPI_Comm comm_; + + int maxBatchSize_; + size_t localElements_; + size_t globalElements_; + + std::array localSize_; + std::array globalSize_; + std::array lowerIn_, upperIn_; + std::array lowerOut_, upperOut_; + }; + +} // namespace fft +} // namespace ippl + +#endif // 
KOKKOS_ENABLE_CUDA + +#endif // IPPL_FFT_BACKEND_CUFFT_H diff --git a/src/FFT/Backend/CuFFTMp.h b/src/FFT/Backend/CuFFTMp.h new file mode 100644 index 000000000..46d745e2b --- /dev/null +++ b/src/FFT/Backend/CuFFTMp.h @@ -0,0 +1,645 @@ +/*! + * @file CuFFTMp.h + * @brief Multi-node cuFFTMp wrappers for IPPL FFT transforms. + * + * Provides C2C, R2C and pruned variants on top of cuFFTMp. Includes small + * CUDA helpers for layout transposes (LayoutLeft <-> LayoutRight) since + * cuFFTMp expects row-major data while IPPL uses Kokkos LayoutLeft views. + */ +#ifndef IPPL_FFT_BACKEND_CUFFTMP_H +#define IPPL_FFT_BACKEND_CUFFTMP_H + +#include +#include +#include +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +#include "FFT/Traits.h" + +namespace ippl { + namespace fft { + + namespace detail { + /*! + * @brief Throw IpplException if @p result is not CUFFT_SUCCESS. + * @param result cuFFT API return code. + * @param context Human-readable label for the failing call. + */ + inline void checkCufftResult(cufftResult result, const char* context) { + if (result != CUFFT_SUCCESS) { + std::string msg = + std::string(context) + " (error code: " + std::to_string(result) + ")"; + throw IpplException("cuFFTMp", msg.c_str()); + } + } + + /*! + * @brief Throw IpplException if @p err is not cudaSuccess. + * @param err CUDA runtime error code. + * @param context Human-readable label for the failing call. + */ + inline void checkCudaError(cudaError_t err, const char* context) { + if (err != cudaSuccess) { + std::string msg = std::string(context) + ": " + cudaGetErrorString(err); + throw IpplException("cuFFTMp", msg.c_str()); + } + } + + /*! + * @brief CUDA kernel that transposes a 3D buffer from LayoutLeft + * to LayoutRight indexing. + * + * LayoutLeft: src[i + j*n0 + k*n0*n1] (i is fastest-varying). + * LayoutRight: dst[k + j*n2 + i*n1*n2] (k is fastest-varying). + * + * @tparam T Element type (cufftComplex / cufftDoubleComplex / real). 
+ */ + template + __global__ void transposeL2R(T* __restrict__ dst, const T* __restrict__ src, int n0, + int n1, int n2) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i < n0 && j < n1 && k < n2) { + // Linear indices are built in size_t: the int products would + // overflow once the local box holds 2^31 or more elements. + size_t src_idx = i + (j + static_cast<size_t>(k) * n1) * n0; // LayoutLeft + size_t dst_idx = k + (j + static_cast<size_t>(i) * n1) * n2; // LayoutRight + dst[dst_idx] = src[src_idx]; + } + } + + /*! + * @brief CUDA kernel inverse of transposeL2R (LayoutRight -> LayoutLeft). + */ + template + __global__ void transposeR2L(T* __restrict__ dst, const T* __restrict__ src, int n0, + int n1, int n2) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int k = blockIdx.z * blockDim.z + threadIdx.z; + + if (i < n0 && j < n1 && k < n2) { + // Same size_t index math as transposeL2R, with source and + // destination layouts swapped. + size_t src_idx = k + (j + static_cast<size_t>(i) * n1) * n2; // LayoutRight + size_t dst_idx = i + (j + static_cast<size_t>(k) * n1) * n0; // LayoutLeft + dst[dst_idx] = src[src_idx]; + } + } + } // namespace detail + + namespace detail { + /*! + * @brief CUDA kernel that scales each complex element by @p scale in-place. + * + * @tparam T cuFFT complex type. + */ + template + __global__ void cufftMpScaleKernel(T* data, size_t n, double scale) { + // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product. + size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx].x *= scale; + data[idx].y *= scale; + } + } + } // namespace detail + + //============================================================================= + // cuFFTMp C2C Backend + //============================================================================= + + /*! + * @class CuFFTMpC2C + * @brief Distributed-memory complex-to-complex FFT via cuFFTMp. + * + * Configures a cuFFTMp 3D plan over the supplied MPI communicator, + * allocates an internal CUDA stream, and exposes the IPPL-uniform + * forward()/backward() interface. Only 3D float / double precision + * is supported (compile-time enforced via static_assert). 
+ * + * @tparam T Real precision (float / double). + * @tparam Dim Spatial dimension (only 3D). + * @tparam MemSpace Kokkos memory space holding the buffers. + */ + template + class CuFFTMpC2C { + public: + using complex_t = Kokkos::complex; + using cuda_complex_t = + std::conditional_t, cufftComplex, cufftDoubleComplex>; + + static_assert(std::is_same_v || std::is_same_v, + "cuFFTMp only supports float and double precision"); + static_assert(Dim == 3, "cuFFTMp backend currently only supports 3D transforms"); + static_assert(is_available_v, "cuFFTMp not available"); + + /*! + * @brief Create the cuFFTMp plan and CUDA stream. + * + * @param inbox Local input box (inclusive corner indices). + * @param outbox Local output box (inclusive corner indices). + * @param comm MPI communicator that participates in the transform. + */ + CuFFTMpC2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, MPI_Comm comm, + const ParameterList& /*params*/) + : comm_(comm) { + using detail::checkCudaError; + using detail::checkCufftResult; + + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + checkCufftResult(cufftCreate(&handle_), "Failed to create cuFFT handle"); + checkCufftResult(cufftSetStream(handle_, stream_), "Failed to set stream"); + + cufftType type = std::is_same_v ? 
CUFFT_C2C : CUFFT_Z2Z; + + for (int d = 0; d < 3; ++d) { + lower_in_[d] = inbox.low[d]; + upper_in_[d] = inbox.high[d] + 1; + lower_out_[d] = outbox.low[d]; + upper_out_[d] = outbox.high[d] + 1; + } + + for (int d = 0; d < 3; ++d) { + local_size_[d] = upper_in_[d] - lower_in_[d]; + } + + // Row-major strides (required by cuFFTMp - must be decreasing) + std::array strides; + strides[0] = local_size_[1] * local_size_[2]; + strides[1] = local_size_[2]; + strides[2] = 1; + + std::array local_max; + for (int d = 0; d < 3; ++d) { + local_max[d] = std::max(upper_in_[d], upper_out_[d]); + } + MPI_Allreduce(local_max.data(), global_size_.data(), 3, MPI_LONG_LONG, MPI_MAX, + comm); + + int n[3] = {static_cast(global_size_[0]), static_cast(global_size_[1]), + static_cast(global_size_[2])}; + + total_elements_ = static_cast(n[0]) * n[1] * n[2]; + local_elements_ = local_size_[0] * local_size_[1] * local_size_[2]; + + checkCufftResult(cufftMpMakePlanDecomposition( + handle_, 3, n, lower_in_.data(), upper_in_.data(), + strides.data(), lower_out_.data(), upper_out_.data(), + strides.data(), type, &comm_, CUFFT_COMM_MPI, &worksize_), + "Failed to create cuFFTMp decomposition plan"); + + checkCufftResult(cufftXtMalloc(handle_, &desc_, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT), + "Failed to allocate descriptor"); + } + + ~CuFFTMpC2C() { + if (desc_) + cufftXtFree(desc_); + if (handle_) + cufftDestroy(handle_); + if (stream_) + cudaStreamDestroy(stream_); + } + + CuFFTMpC2C(const CuFFTMpC2C&) = delete; + CuFFTMpC2C& operator=(const CuFFTMpC2C&) = delete; + + CuFFTMpC2C(CuFFTMpC2C&& other) noexcept + : handle_(other.handle_) + , comm_(other.comm_) + , stream_(other.stream_) + , desc_(other.desc_) + , worksize_(other.worksize_) + , total_elements_(other.total_elements_) + , local_elements_(other.local_elements_) + , global_size_(other.global_size_) + , local_size_(other.local_size_) + , lower_in_(other.lower_in_) + , upper_in_(other.upper_in_) + , lower_out_(other.lower_out_) + , 
upper_out_(other.upper_out_) { + other.handle_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + + CuFFTMpC2C& operator=(CuFFTMpC2C&& other) noexcept { + if (this != &other) { + if (desc_) + cufftXtFree(desc_); + if (handle_) + cufftDestroy(handle_); + if (stream_) + cudaStreamDestroy(stream_); + + handle_ = other.handle_; + comm_ = other.comm_; + stream_ = other.stream_; + desc_ = other.desc_; + worksize_ = other.worksize_; + total_elements_ = other.total_elements_; + local_elements_ = other.local_elements_; + global_size_ = other.global_size_; + local_size_ = other.local_size_; + lower_in_ = other.lower_in_; + upper_in_ = other.upper_in_; + lower_out_ = other.lower_out_; + upper_out_ = other.upper_out_; + + other.handle_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + return *this; + } + + //! Forward C2C transform (LayoutLeft -> LayoutRight transpose, + //! cuFFTMp forward, transpose back, normalize by 1/N). + void forward(complex_t* in, complex_t* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + cuda_complex_t* desc_data = + static_cast(desc_->descriptor->data[0]); + + // Transpose input: LayoutLeft -> LayoutRight (into descriptor buffer) + launchTransposeL2R(desc_data, reinterpret_cast(in)); + + // Execute forward FFT + checkCufftResult(cufftXtExecDescriptor(handle_, desc_, desc_, CUFFT_FORWARD), + "Forward FFT execution failed"); + + // Transpose output: LayoutRight -> LayoutLeft + launchTransposeR2L(reinterpret_cast(out), desc_data); + + // Apply scaling (1/N) + T scale = T(1) / static_cast(total_elements_); + applyScaling(reinterpret_cast(out), local_elements_, scale); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + //! Backward C2C transform (unscaled), with the same layout transpose dance. 
+            void backward(complex_t* in, complex_t* out) {
+                using detail::checkCudaError;
+                using detail::checkCufftResult;
+
+                // Data pointer of the (single) buffer held by the descriptor.
+                cuda_complex_t* desc_data =
+                    static_cast(desc_->descriptor->data[0]);
+
+                // Transpose input: LayoutLeft -> LayoutRight (into descriptor buffer)
+                launchTransposeL2R(desc_data, reinterpret_cast(in));
+
+                // Execute backward FFT (in place: same descriptor as input and output)
+                checkCufftResult(cufftXtExecDescriptor(handle_, desc_, desc_, CUFFT_INVERSE),
+                                 "Backward FFT execution failed");
+
+                // Transpose output: LayoutRight -> LayoutLeft
+                launchTransposeR2L(reinterpret_cast(out), desc_data);
+
+                // Block the host until the work queued on stream_ has completed.
+                checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed");
+            }
+
+            //! @return Per-rank cuFFTMp workspace size in bytes.
+            std::size_t workspace_size() const { return worksize_; }
+
+        private:
+            /*!
+             * @brief Launch the LayoutLeft -> LayoutRight transpose of the local box.
+             *
+             * Uses 8x8x8 thread blocks and a grid that covers local_size_ in each
+             * dimension (rounded up). Does nothing when the local box is empty.
+             *
+             * @param dst Destination buffer.
+             * @param src Source buffer.
+             */
+            void launchTransposeL2R(cuda_complex_t* dst, const cuda_complex_t* src) {
+                // An empty local box would yield a grid with a zero dimension, which
+                // CUDA rejects as an invalid launch configuration.
+                if (local_elements_ == 0) {
+                    return;
+                }
+                dim3 block(8, 8, 8);
+                dim3 grid((local_size_[0] + block.x - 1) / block.x,
+                          (local_size_[1] + block.y - 1) / block.y,
+                          (local_size_[2] + block.z - 1) / block.z);
+                detail::transposeL2R<<>>(
+                    dst, src, static_cast(local_size_[0]), static_cast(local_size_[1]),
+                    static_cast(local_size_[2]));
+            }
+
+            /*!
+             * @brief Launch the LayoutRight -> LayoutLeft transpose of the local box.
+             *
+             * Same launch geometry as launchTransposeL2R. Does nothing when the
+             * local box is empty.
+             *
+             * @param dst Destination buffer.
+             * @param src Source buffer.
+             */
+            void launchTransposeR2L(cuda_complex_t* dst, const cuda_complex_t* src) {
+                // See launchTransposeL2R: never launch with a zero-sized grid.
+                if (local_elements_ == 0) {
+                    return;
+                }
+                dim3 block(8, 8, 8);
+                dim3 grid((local_size_[0] + block.x - 1) / block.x,
+                          (local_size_[1] + block.y - 1) / block.y,
+                          (local_size_[2] + block.z - 1) / block.z);
+                detail::transposeR2L<<>>(
+                    dst, src, static_cast(local_size_[0]), static_cast(local_size_[1]),
+                    static_cast(local_size_[2]));
+            }
+
+            /*!
+             * @brief Multiply @p count complex elements of @p data by @p scale.
+             *
+             * One thread block per 256 elements (rounded up). Does nothing when
+             * @p count is zero.
+             *
+             * @param data  Buffer to scale in place.
+             * @param count Number of complex elements in @p data.
+             * @param scale Real scale factor.
+             */
+            void applyScaling(cuda_complex_t* data, size_t count, T scale) {
+                // count == 0 would give numBlocks == 0, i.e. an invalid launch
+                // configuration; there is nothing to scale in that case.
+                if (count == 0) {
+                    return;
+                }
+                constexpr size_t blockSize = 256;
+                size_t numBlocks = (count + blockSize - 1) / blockSize;
+                detail::cufftMpScaleKernel<<>>(
+                    data, count, static_cast(scale));
+            }
+
+            cufftHandle handle_ = 0;         //!< cuFFTMp plan handle (0 = none).
+            MPI_Comm comm_;                  //!< Communicator the plan was built on.
+            cudaStream_t stream_ = nullptr;  //!< Stream synchronized at the end of each transform.
+            cudaLibXtDesc* desc_ = nullptr;  //!< Descriptor the plan executes on, in place.
+
+            size_t worksize_ = 0;        //!< Workspace size reported at plan creation.
+            size_t total_elements_ = 0;  //!< Number of points in the global grid.
+            size_t local_elements_ = 0;  //!< Number of points in the local box.
+            std::array global_size_;
+
std::array local_size_; + std::array lower_in_, upper_in_; + std::array lower_out_, upper_out_; + }; + + //============================================================================= + // cuFFTMp R2C Backend + //============================================================================= + + /*! + * @class CuFFTMpR2C + * @brief Distributed real-to-complex FFT via cuFFTMp. + * + * Holds two cuFFTMp plans (R2C and C2R) plus a CUDA stream. Mirrors + * the IPPL R2C backend interface. Forward = real -> half-complex + * (normalized by 1/N); backward = half-complex -> real (unscaled). + * + * @tparam T Real precision (float / double). + * @tparam Dim Spatial dimension (only 3D). + * @tparam MemSpace Kokkos memory space. + */ + template + class CuFFTMpR2C { + public: + using complex_t = Kokkos::complex; + using cuda_complex_t = + std::conditional_t, cufftComplex, cufftDoubleComplex>; + + static_assert(std::is_same_v || std::is_same_v, + "cuFFTMp only supports float and double precision"); + static_assert(Dim == 3, "cuFFTMp backend currently only supports 3D transforms"); + static_assert(is_available_v, "cuFFTMp not available"); + + /*! + * @brief Build the R2C and C2R cuFFTMp plans. + * @param inbox Local real-input box. + * @param outbox Local complex-output box (Hermitian-symmetric). + * @param comm MPI communicator participating in the transform. 
+ */ + CuFFTMpR2C(const heffte::box3d& inbox, + const heffte::box3d& outbox, int /*r2c_direction*/, + MPI_Comm comm, const ParameterList& /*params*/) + : comm_(comm) { + using detail::checkCudaError; + using detail::checkCufftResult; + + checkCudaError(cudaStreamCreate(&stream_), "Failed to create CUDA stream"); + checkCufftResult(cufftCreate(&handle_r2c_), "Failed to create R2C handle"); + checkCufftResult(cufftCreate(&handle_c2r_), "Failed to create C2R handle"); + checkCufftResult(cufftSetStream(handle_r2c_, stream_), "Failed to set stream"); + checkCufftResult(cufftSetStream(handle_c2r_, stream_), "Failed to set stream"); + + std::array lower_real, upper_real; + std::array lower_complex, upper_complex; + + for (int d = 0; d < 3; ++d) { + lower_real[d] = inbox.low[d]; + upper_real[d] = inbox.high[d] + 1; + lower_complex[d] = outbox.low[d]; + upper_complex[d] = outbox.high[d] + 1; + } + + for (int d = 0; d < 3; ++d) { + local_real_size_[d] = upper_real[d] - lower_real[d]; + local_complex_size_[d] = upper_complex[d] - lower_complex[d]; + } + + // Row-major strides for real data + std::array strides_real; + strides_real[0] = local_real_size_[1] * local_real_size_[2]; + strides_real[1] = local_real_size_[2]; + strides_real[2] = 1; + + // Row-major strides for complex data + std::array strides_complex; + strides_complex[0] = local_complex_size_[1] * local_complex_size_[2]; + strides_complex[1] = local_complex_size_[2]; + strides_complex[2] = 1; + + std::array local_max; + for (int d = 0; d < 3; ++d) { + local_max[d] = std::max(upper_real[d], upper_complex[d]); + } + MPI_Allreduce(local_max.data(), global_size_.data(), 3, MPI_LONG_LONG, MPI_MAX, + comm); + + int n[3] = {static_cast(global_size_[0]), static_cast(global_size_[1]), + static_cast(global_size_[2])}; + + total_elements_ = static_cast(n[0]) * n[1] * n[2]; + local_real_elements_ = + local_real_size_[0] * local_real_size_[1] * local_real_size_[2]; + local_complex_elements_ = + local_complex_size_[0] * 
local_complex_size_[1] * local_complex_size_[2]; + + size_t worksize_r2c = 0; + size_t worksize_c2r = 0; + checkCufftResult( + cufftMpMakePlanDecomposition( + handle_r2c_, 3, n, lower_real.data(), upper_real.data(), + strides_real.data(), lower_complex.data(), upper_complex.data(), + strides_complex.data(), CUFFT_R2C, &comm_, CUFFT_COMM_MPI, &worksize_r2c), + "Failed to create R2C plan"); + + checkCufftResult( + cufftMpMakePlanDecomposition( + handle_c2r_, 3, n, lower_real.data(), upper_real.data(), + strides_real.data(), lower_complex.data(), upper_complex.data(), + strides_complex.data(), CUFFT_C2R, &comm_, CUFFT_COMM_MPI, &worksize_c2r), + "Failed to create C2R plan"); + + worksize_ = std::max(worksize_r2c, worksize_c2r); + + checkCufftResult( + cufftXtMalloc(handle_r2c_, &desc_, CUFFT_XT_FORMAT_DISTRIBUTED_INPUT), + "Failed to allocate R2C descriptor"); + } + + ~CuFFTMpR2C() { + if (desc_) + cufftXtFree(desc_); + if (handle_r2c_) + cufftDestroy(handle_r2c_); + if (handle_c2r_) + cufftDestroy(handle_c2r_); + if (stream_) + cudaStreamDestroy(stream_); + } + + CuFFTMpR2C(const CuFFTMpR2C&) = delete; + CuFFTMpR2C& operator=(const CuFFTMpR2C&) = delete; + + CuFFTMpR2C(CuFFTMpR2C&& other) noexcept + : handle_r2c_(other.handle_r2c_) + , handle_c2r_(other.handle_c2r_) + , comm_(other.comm_) + , stream_(other.stream_) + , desc_(other.desc_) + , worksize_(other.worksize_) + , total_elements_(other.total_elements_) + , local_real_elements_(other.local_real_elements_) + , local_complex_elements_(other.local_complex_elements_) + , global_size_(other.global_size_) + , local_real_size_(other.local_real_size_) + , local_complex_size_(other.local_complex_size_) { + other.handle_r2c_ = 0; + other.handle_c2r_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + + CuFFTMpR2C& operator=(CuFFTMpR2C&& other) noexcept { + if (this != &other) { + if (desc_) + cufftXtFree(desc_); + if (handle_r2c_) + cufftDestroy(handle_r2c_); + if (handle_c2r_) + cufftDestroy(handle_c2r_); + 
if (stream_) + cudaStreamDestroy(stream_); + + handle_r2c_ = other.handle_r2c_; + handle_c2r_ = other.handle_c2r_; + comm_ = other.comm_; + stream_ = other.stream_; + desc_ = other.desc_; + worksize_ = other.worksize_; + total_elements_ = other.total_elements_; + local_real_elements_ = other.local_real_elements_; + local_complex_elements_ = other.local_complex_elements_; + global_size_ = other.global_size_; + local_real_size_ = other.local_real_size_; + local_complex_size_ = other.local_complex_size_; + + other.handle_r2c_ = 0; + other.handle_c2r_ = 0; + other.stream_ = nullptr; + other.desc_ = nullptr; + } + return *this; + } + + //! Forward R2C transform (real -> half-complex), normalized by 1/N. + void forward(T* in, complex_t* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + T* desc_data = static_cast(desc_->descriptor->data[0]); + + // Transpose real input: LayoutLeft -> LayoutRight + launchTransposeRealL2R(desc_data, in); + + checkCufftResult(cufftXtExecDescriptor(handle_r2c_, desc_, desc_, CUFFT_FORWARD), + "R2C execution failed"); + + // Transpose complex output: LayoutRight -> LayoutLeft + cuda_complex_t* complex_desc = + static_cast(desc_->descriptor->data[0]); + launchTransposeComplexR2L(reinterpret_cast(out), complex_desc); + + T scale = T(1) / static_cast(total_elements_); + applyScaling(reinterpret_cast(out), local_complex_elements_, + scale); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + //! Backward C2R transform (half-complex -> real), unscaled. 
+ void backward(complex_t* in, T* out) { + using detail::checkCudaError; + using detail::checkCufftResult; + + cuda_complex_t* complex_desc = + static_cast(desc_->descriptor->data[0]); + + // Transpose complex input: LayoutLeft -> LayoutRight + launchTransposeComplexL2R(complex_desc, reinterpret_cast(in)); + + checkCufftResult(cufftXtExecDescriptor(handle_c2r_, desc_, desc_, CUFFT_INVERSE), + "C2R execution failed"); + + // Transpose real output: LayoutRight -> LayoutLeft + T* real_desc = static_cast(desc_->descriptor->data[0]); + launchTransposeRealR2L(out, real_desc); + + checkCudaError(cudaStreamSynchronize(stream_), "Stream sync failed"); + } + + //! @return Per-rank cuFFTMp workspace size in bytes (max of R2C/C2R). + std::size_t workspace_size() const { return worksize_; } + + private: + void launchTransposeRealL2R(T* dst, const T* src) { + dim3 block(8, 8, 8); + dim3 grid((local_real_size_[0] + block.x - 1) / block.x, + (local_real_size_[1] + block.y - 1) / block.y, + (local_real_size_[2] + block.z - 1) / block.z); + detail::transposeL2R<<>>( + dst, src, static_cast(local_real_size_[0]), + static_cast(local_real_size_[1]), static_cast(local_real_size_[2])); + } + + void launchTransposeRealR2L(T* dst, const T* src) { + dim3 block(8, 8, 8); + dim3 grid((local_real_size_[0] + block.x - 1) / block.x, + (local_real_size_[1] + block.y - 1) / block.y, + (local_real_size_[2] + block.z - 1) / block.z); + detail::transposeR2L<<>>( + dst, src, static_cast(local_real_size_[0]), + static_cast(local_real_size_[1]), static_cast(local_real_size_[2])); + } + + void launchTransposeComplexL2R(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_complex_size_[0] + block.x - 1) / block.x, + (local_complex_size_[1] + block.y - 1) / block.y, + (local_complex_size_[2] + block.z - 1) / block.z); + detail::transposeL2R<<>>( + dst, src, static_cast(local_complex_size_[0]), + static_cast(local_complex_size_[1]), + 
static_cast(local_complex_size_[2])); + } + + void launchTransposeComplexR2L(cuda_complex_t* dst, const cuda_complex_t* src) { + dim3 block(8, 8, 8); + dim3 grid((local_complex_size_[0] + block.x - 1) / block.x, + (local_complex_size_[1] + block.y - 1) / block.y, + (local_complex_size_[2] + block.z - 1) / block.z); + detail::transposeR2L<<>>( + dst, src, static_cast(local_complex_size_[0]), + static_cast(local_complex_size_[1]), + static_cast(local_complex_size_[2])); + } + + void applyScaling(cuda_complex_t* data, size_t count, T scale) { + constexpr size_t blockSize = 256; + size_t numBlocks = (count + blockSize - 1) / blockSize; + detail::cufftMpScaleKernel<<>>( + data, count, static_cast(scale)); + } + + cufftHandle handle_r2c_ = 0; + cufftHandle handle_c2r_ = 0; + MPI_Comm comm_; + cudaStream_t stream_ = nullptr; + cudaLibXtDesc* desc_ = nullptr; + + size_t worksize_ = 0; + size_t total_elements_ = 0; + size_t local_real_elements_ = 0; + size_t local_complex_elements_ = 0; + std::array global_size_; + std::array local_real_size_; + std::array local_complex_size_; + }; + + } // namespace fft +} // namespace ippl + +#endif diff --git a/src/FFT/Backend/Heffte.h b/src/FFT/Backend/Heffte.h new file mode 100644 index 000000000..e00a17c63 --- /dev/null +++ b/src/FFT/Backend/Heffte.h @@ -0,0 +1,359 @@ +#ifndef IPPL_FFT_BACKEND_HEFFTE_H +#define IPPL_FFT_BACKEND_HEFFTE_H + +#include +#include +#include + +#include "Utility/ParameterList.h" + +#include "Field/BareField.h" + +#include "FFT/Traits.h" +#include "FieldLayout/FieldLayout.h" + +namespace ippl { + namespace fft { + + /*! + * @brief Multiply a complex array by a real scalar in-place on @p MemSpace. + * + * Used as a post-FFT normalization helper for the heFFTe backends. + * + * @tparam T Real (precision) type of the complex elements. + * @tparam MemSpace Kokkos memory space the buffer lives in. + * @param data Raw pointer to the device-/host-resident buffer. 
+         * @param scale Scalar multiplier applied to every element.
+         * @param size Number of complex elements in @p data.
+         */
+        template
+        inline void applyScale(Kokkos::complex* data, T scale, size_t size) {
+            // Wrap the raw pointer in a View so the kernel can index it; the View is
+            // constructed from an existing pointer and does not allocate.
+            Kokkos::View*, MemSpace> view(data, size);
+            Kokkos::parallel_for(
+                "Heffte_scale_complex",
+                Kokkos::RangePolicy(0, size),
+                KOKKOS_LAMBDA(const size_t i) { view(i) *= scale; });
+            // Wait for the kernel to finish before returning to the caller.
+            Kokkos::fence();
+        }
+
+        /*!
+         * @brief Multiply a real array by a scalar in-place on @p MemSpace.
+         *
+         * Real-buffer counterpart to applyScale used by trigonometric transforms.
+         *
+         * @tparam T Element value type.
+         * @tparam MemSpace Kokkos memory space the buffer lives in.
+         * @param data Raw pointer to the buffer.
+         * @param scale Scalar multiplier applied to every element.
+         * @param size Number of elements in @p data.
+         */
+        template
+        inline void applyScaleReal(T* data, T scale, size_t size) {
+            // Same pattern as applyScale: non-allocating View over the caller's buffer.
+            Kokkos::View view(data, size);
+            Kokkos::parallel_for(
+                "Heffte_scale_real",
+                Kokkos::RangePolicy(0, size),
+                KOKKOS_LAMBDA(const size_t i) { view(i) *= scale; });
+            // Wait for the kernel to finish before returning to the caller.
+            Kokkos::fence();
+        }
+
+        /*!
+         * @brief Compute the total global FFT grid size by reducing the upper
+         *        bounds of @p inbox and @p outbox across @p comm.
+         *
+         * Each rank only knows its own local box; the global size requires an
+         * MPI_MAX reduction over the upper-corner indices.
+         *
+         * @param inbox Local input box (low/high inclusive corner indices).
+         * @param outbox Local output box (low/high inclusive corner indices).
+         * @param comm MPI communicator over which the FFT is distributed.
+         * @return Total number of points in the global FFT grid.
+         */
+        inline size_t computeGlobalSize(const heffte::box3d& inbox,
+                                        const heffte::box3d& outbox, MPI_Comm comm) {
+            long long local_max[3], global_max[3];
+            // high[] is an inclusive index, so +1 turns the largest index seen on this
+            // rank (over both boxes) into a point count along that axis.
+            local_max[0] = std::max(inbox.high[0], outbox.high[0]) + 1;
+            local_max[1] = std::max(inbox.high[1], outbox.high[1]) + 1;
+            local_max[2] = std::max(inbox.high[2], outbox.high[2]) + 1;
+
+            // Element-wise maximum over all ranks of comm; every rank gets the result.
+            MPI_Allreduce(local_max, global_max, 3, MPI_LONG_LONG, MPI_MAX, comm);
+
+            // NOTE(review): max(high) + 1 equals the grid extent only if the global
+            // index range starts at 0 -- presumably guaranteed by the callers; confirm.
+            return static_cast(global_max[0]) * static_cast(global_max[1])
+                   * static_cast(global_max[2]);
+        }
+
+        /*!
+         * @brief Translate IPPL @p params into a heFFTe plan_options struct.
+         *
+         * The result always starts from heFFTe's default options. If
+         * `use_heffte_defaults` is set, two fields are then overridden:
+         * GPU-aware MPI is switched on and the reshape algorithm is set to
+         * p2p_plined. Otherwise the pencil/reorder flags, GPU-aware flag
+         * (only for GPU backends), and communication algorithm (FFTComm
+         * enum: a2a, a2av, p2p, p2p_pl) are pulled from @p params.
+         *
+         * @tparam HeffteBackendT Concrete heFFTe backend (e.g. heffte::backend::cufft).
+         * @param params IPPL parameter list with FFT tuning knobs.
+         * @return Configured heFFTe plan_options.
+         * @throws IpplException on an unknown communication enum value.
+ */ + template + heffte::plan_options makeHeffteOptions(const ParameterList& params) { + auto opts = heffte::default_options(); + + if (!params.get("use_heffte_defaults")) { + opts.use_pencils = params.get("use_pencils"); + opts.use_reorder = params.get("use_reorder"); + + if constexpr (is_available_v) { + opts.use_gpu_aware = params.get("use_gpu_aware"); + } + + switch (params.get("comm")) { + case a2a: + opts.algorithm = heffte::reshape_algorithm::alltoall; + break; + case a2av: + opts.algorithm = heffte::reshape_algorithm::alltoallv; + break; + case p2p: + opts.algorithm = heffte::reshape_algorithm::p2p; + break; + case p2p_pl: + opts.algorithm = heffte::reshape_algorithm::p2p_plined; + break; + default: + throw IpplException("FFT", "Unknown communication type"); + } + } else { + opts.use_gpu_aware = true; + opts.algorithm = heffte::reshape_algorithm::p2p_plined; + } + return opts; + } + + //============================================================================= + // heFFTe C2C + //============================================================================= + + /*! + * @class HeffteC2C + * @brief Thin wrapper around heffte::fft3d for complex-to-complex transforms. + * + * Owns the heFFTe plan and a workspace large enough to handle the + * configured maximum batch size. forward() applies full normalization + * (heffte::scale::full); backward() applies none, so a forward followed + * by a backward returns the input. + * + * @tparam T Real precision type (float / double). + * @tparam Dim Spatial dimension (only 2 and 3 are supported). + * @tparam MemSpace Kokkos memory space holding the input/output buffers. + */ + template + class HeffteC2C { + public: + using complex_t = Kokkos::complex; + using backend_t = typename HeffteBackend::c2c; + using heffte_t = heffte::fft3d; + using workspace_t = typename heffte_t::template buffer_container; + + /*! + * @brief Construct a heFFTe C2C plan over the given box decomposition. 
+ * + * @param inbox Local input box (inclusive corner indices). + * @param outbox Local output box (inclusive corner indices). + * @param comm MPI communicator participating in the transform. + * @param params FFT parameter list (see makeHeffteOptions). + * @param maxBatchSize Maximum batch size for batched transforms. + */ + HeffteC2C(const heffte::box3d& inbox, const heffte::box3d& outbox, + MPI_Comm comm, const ParameterList& params, int maxBatchSize = 1) + : maxBatchSize_(maxBatchSize) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + auto opts = makeHeffteOptions(params); + heffte_ = std::make_shared(inbox, outbox, comm, opts); + + // Allocate workspace for maximum batch size + workspace_ = workspace_t(heffte_->size_workspace() * maxBatchSize); + + localSize_ = heffte_->size_outbox(); + globalSize_ = computeGlobalSize(inbox, outbox, comm); + } + + //! Single forward C2C transform with full normalization. + void forward(complex_t* in, complex_t* out) { + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); + } + + //! Single backward C2C transform (no normalization). + void backward(complex_t* in, complex_t* out) { + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); + } + + //! Batched forward C2C transform; @p batchSize must be <= max_batch_size(). + void forward(int batchSize, complex_t* in, complex_t* out) { + assert(batchSize <= maxBatchSize_ && "Batch size exceeds allocated workspace"); + heffte_->forward(batchSize, in, out, workspace_.data(), heffte::scale::full); + } + + //! Batched backward C2C transform; @p batchSize must be <= max_batch_size(). + void backward(int batchSize, complex_t* in, complex_t* out) { + assert(batchSize <= maxBatchSize_ && "Batch size exceeds allocated workspace"); + heffte_->backward(batchSize, in, out, workspace_.data(), heffte::scale::none); + } + + //! @return Per-plan heFFTe workspace size (single batch slot). 
+ size_t workspace_size() const { return heffte_->size_workspace(); } + //! @return Number of local complex elements after the transform. + size_t local_size() const { return localSize_; } + //! @return Total number of points in the global FFT grid. + size_t global_size() const { return globalSize_; } + //! @return Local input-box size as reported by heFFTe. + size_t size_inbox() const { return heffte_->size_inbox(); } + //! @return Local output-box size as reported by heFFTe. + size_t size_outbox() const { return heffte_->size_outbox(); } + //! @return Maximum batch size the workspace was allocated for. + int max_batch_size() const { return maxBatchSize_; } + + private: + std::shared_ptr heffte_; + workspace_t workspace_; + size_t localSize_; + size_t globalSize_; + int maxBatchSize_; + }; + + //============================================================================= + // heFFTe R2C + //============================================================================= + + /*! + * @class HeffteR2C + * @brief Wrapper around heffte::fft3d_r2c for real-to-complex transforms. + * + * forward() consumes a real buffer and writes the half-complex spectrum; + * backward() does the inverse. Normalization matches HeffteC2C: forward + * is fully normalized, backward is unscaled. + * + * @tparam T Real precision type. + * @tparam Dim Spatial dimension. + * @tparam MemSpace Kokkos memory space holding the buffers. + */ + template + class HeffteR2C { + public: + using complex_t = Kokkos::complex; + using backend_t = typename HeffteBackend::c2c; + using heffte_t = heffte::fft3d_r2c; + using workspace_t = typename heffte_t::template buffer_container; + + /*! + * @brief Construct an R2C plan over the given decomposition. + * + * @param inbox Local real-input box. + * @param outbox Local complex-output box (Hermitian-symmetric). + * @param r2c_direction Axis along which the half-complex output lives. + * @param comm MPI communicator. 
+ * @param params FFT parameter list (see makeHeffteOptions). + */ + HeffteR2C(const heffte::box3d& inbox, const heffte::box3d& outbox, + int r2c_direction, MPI_Comm comm, const ParameterList& params) { + auto opts = makeHeffteOptions(params); + heffte_ = std::make_shared(inbox, outbox, r2c_direction, comm, opts); + workspace_ = workspace_t(heffte_->size_workspace()); + + local_complex_size_ = heffte_->size_outbox(); + + // For R2C, normalize by the global REAL size + // inbox is the real box + long long local_max[3], global_max[3]; + local_max[0] = inbox.high[0] + 1; + local_max[1] = inbox.high[1] + 1; + local_max[2] = inbox.high[2] + 1; + + MPI_Allreduce(local_max, global_max, 3, MPI_LONG_LONG, MPI_MAX, comm); + + global_real_size_ = static_cast(global_max[0]) + * static_cast(global_max[1]) + * static_cast(global_max[2]); + } + + //! Forward R2C transform with full normalization (real -> half-complex). + void forward(T* in, complex_t* out) { + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); + } + + //! Backward C2R transform without normalization (half-complex -> real). + void backward(complex_t* in, T* out) { + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); + } + + private: + std::shared_ptr heffte_; + workspace_t workspace_; + size_t local_complex_size_; + size_t global_real_size_; + }; + + //============================================================================= + // heFFTe Trigonometric (Sine, Cos, Cos1) + //============================================================================= + + /*! + * @class HeffteTrig + * @brief Wrapper around heffte::fft3d for sine / cosine transforms. + * + * Specializations are generated by IPPL_FFT_DEFINE_HEFFTE_TRIG for each + * transform tag (SineTransform, CosTransform, Cos1Transform). The tag + * selects the corresponding heFFTe backend. + * + * @tparam T Real precision type. + * @tparam Dim Spatial dimension. + * @tparam MemSpace Kokkos memory space holding the buffers. 
+ * @tparam Tag One of SineTransform, CosTransform, Cos1Transform. + */ + template + class HeffteTrig; + +#define IPPL_FFT_DEFINE_HEFFTE_TRIG(TagType, member) \ + template \ + class HeffteTrig { \ + public: \ + using backend_t = typename HeffteBackend::member; \ + using heffte_t = heffte::fft3d; \ + using workspace_t = typename heffte_t::template buffer_container; \ + \ + HeffteTrig(const heffte::box3d& inbox, const heffte::box3d& outbox, \ + MPI_Comm comm, const ParameterList& params) { \ + auto opts = makeHeffteOptions(params); \ + heffte_ = std::make_shared(inbox, outbox, comm, opts); \ + workspace_ = workspace_t(heffte_->size_workspace()); \ + local_size_ = heffte_->size_outbox(); \ + global_size_ = computeGlobalSize(inbox, outbox, comm); \ + } \ + \ + void forward(T* in, T* out) { \ + heffte_->forward(in, out, workspace_.data(), heffte::scale::full); \ + } \ + \ + void backward(T* in, T* out) { \ + heffte_->backward(in, out, workspace_.data(), heffte::scale::none); \ + } \ + \ + private: \ + std::shared_ptr heffte_; \ + workspace_t workspace_; \ + size_t local_size_; \ + size_t global_size_; \ + }; + + IPPL_FFT_DEFINE_HEFFTE_TRIG(SineTransform, sin) + IPPL_FFT_DEFINE_HEFFTE_TRIG(CosTransform, cos) + IPPL_FFT_DEFINE_HEFFTE_TRIG(Cos1Transform, cos1) + +#undef IPPL_FFT_DEFINE_HEFFTE_TRIG + + } // namespace fft +} // namespace ippl + +#endif diff --git a/src/FFT/FFT.h b/src/FFT/FFT.h index 9f4621699..7dddb200c 100644 --- a/src/FFT/FFT.h +++ b/src/FFT/FFT.h @@ -1,360 +1,9 @@ -// -// Class FFT -// The FFT class performs complex-to-complex, -// real-to-complex on IPPL Fields. -// FFT is templated on the type of transform to be performed, -// the dimensionality of the Field to transform, and the -// floating-point precision type of the Field (float or double). -// Currently, we use heffte for taking the transforms and the class FFT -// serves as an interface between IPPL and heffte. 
In making this interface, -// we have referred Cabana library -// https://github.com/ECP-copa/Cabana. -// -// - #ifndef IPPL_FFT_FFT_H #define IPPL_FFT_FFT_H -#include -#include -#include -#include -#include -#include - -#include "Utility/IpplException.h" -#include "Utility/ParameterList.h" - -#include "Field/Field.h" - -#include "FieldLayout/FieldLayout.h" -#include "Index/NDIndex.h" - -namespace heffte { - template <> - struct is_ccomplex> : std::true_type {}; - - template <> - struct is_zcomplex> : std::true_type {}; -} // namespace heffte - -namespace ippl { - - /** - Tag classes for Fourier transforms - */ - class CCTransform {}; - class RCTransform {}; - class SineTransform {}; - class CosTransform {}; - /** - Tag classes for Cosine of type 1 transforms - */ - class Cos1Transform {}; - - enum FFTComm { - a2av = 0, - a2a = 1, - p2p = 2, - p2p_pl = 3 - }; - - enum TransformDirection { - FORWARD, - BACKWARD - }; - - namespace detail { - /*! - * Wrapper type for heFFTe backends, templated - * on the Kokkos memory space - */ - template - struct HeffteBackendType; - -#if defined(Heffte_ENABLE_FFTW) - template <> - struct HeffteBackendType { - using backend = heffte::backend::fftw; - using backendSine = heffte::backend::fftw_sin; - using backendCos = heffte::backend::fftw_cos; - using backendCos1 = heffte::backend::fftw_cos1; - }; -#elif defined(Heffte_ENABLE_MKL) - template <> - struct HeffteBackendType { - using backend = heffte::backend::mkl; - using backendSine = heffte::backend::mkl_sin; - using backendCos = heffte::backend::mkl_cos; - }; -#endif - -#ifdef Heffte_ENABLE_CUDA -#ifdef KOKKOS_ENABLE_CUDA - template <> - struct HeffteBackendType { - using backend = heffte::backend::cufft; - using backendSine = heffte::backend::cufft_sin; - using backendCos = heffte::backend::cufft_cos; - using backendCos1 = heffte::backend::cufft_cos1; - }; -#else -#error cuFFT backend is enabled for heFFTe but CUDA is not enabled for Kokkos! 
-#endif -#endif -#ifdef KOKKOS_ENABLE_HIP -#ifdef Heffte_ENABLE_ROCM - template <> - struct HeffteBackendType { - using backend = heffte::backend::rocfft; - using backendSine = heffte::backend::rocfft_sin; - using backendCos = heffte::backend::rocfft_cos; - using backendCos1 = heffte::backend::rocfft_cos1; - }; -#else - template <> - struct HeffteBackendType { - using backend = heffte::backend::stock; - using backendSine = heffte::backend::stock_sin; - using backendCos = heffte::backend::stock_cos; - using backendCos1 = heffte::backend::stock_cos1; - }; -#endif -#endif +#include "Traits.h" +#include "Backend/Backend.h" +#include "Transform/Transform.h" -#ifdef KOKKOS_ENABLE_SYCL - // No SYCL-specific Heffte backend wired up yet. Heffte's oneMKL backend - // would go here when Heffte_ENABLE_ONEAPI plumbing is added in IPPL's - // Dependencies.cmake. For now, fall back to the stock CPU backend so - // SYCL builds at least compile (FFTs will run on the host). - template <> - struct HeffteBackendType { - using backend = heffte::backend::stock; - using backendSine = heffte::backend::stock_sin; - using backendCos = heffte::backend::stock_cos; - using backendCos1 = heffte::backend::stock_cos1; - }; #endif - -#if !defined(Heffte_ENABLE_MKL) && !defined(Heffte_ENABLE_FFTW) - /** - * Use heFFTe's inbuilt 1D fft computation on CPUs if no - * vendor specific or optimized backend is found - */ - template <> - struct HeffteBackendType { - using backend = heffte::backend::stock; - using backendSine = heffte::backend::stock_sin; - using backendCos = heffte::backend::stock_cos; - using backendCos1 = heffte::backend::stock_cos1; - }; -#endif - - } // namespace detail - - template class FFT, typename Backend, - typename BufferType = typename Field::value_type> - class FFTBase { - constexpr static unsigned Dim = Field::dim; - - public: - using heffteBackend = Backend; - using workspace_t = typename FFT::template buffer_container; - using Layout_t = FieldLayout; - - FFTBase(const 
Layout_t& layout, const ParameterList& params); - ~FFTBase() = default; - - protected: - FFTBase() = default; - - void domainToBounds(const NDIndex& domain, std::array& low, - std::array& high); - void setup(const heffte::box3d& inbox, const heffte::box3d& outbox, - const ParameterList& params); - - std::shared_ptr> heffte_m; - workspace_t workspace_m; - - template - using temp_view_type = - typename Kokkos::View::uniform_type; - temp_view_type tempField; - }; - -#define IN_PLACE_FFT_BASE_CLASS(Field, Backend) \ - FFTBase::Backend> -#define EXT_FFT_BASE_CLASS(Field, Backend, Type) \ - FFTBase::Backend, \ - typename Type> - - /** - Non-specialized FFT class. We specialize based on Transform tag class - */ - template - class FFT {}; - - /** - complex-to-complex FFT class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(ComplexField, backend) { - constexpr static unsigned Dim = ComplexField::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(ComplexField, backend); - - public: - using Complex_t = typename ComplexField::value_type; - - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(ComplexField& f); - - /*! 
- * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, ComplexField& f); - }; - - /** - real-to-complex FFT class - */ - template - class FFT - : public EXT_FFT_BASE_CLASS(RealField, backend, - Kokkos::complex) { - constexpr static unsigned Dim = RealField::dim; - using Real_t = typename RealField::value_type; - using Base = EXT_FFT_BASE_CLASS(RealField, backend, - Kokkos::complex); - - public: - using Complex_t = Kokkos::complex; - using ComplexField = typename Field::uniform_type; - - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /** Create a new FFT object with the layout for the input and output Fields - * and parameters for heffte. - */ - FFT(const Layout_t& layoutInput, const Layout_t& layoutOutput, const ParameterList& params); - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute - * @param g Field in which to store the transformation - */ - void warmup(RealField& f, ComplexField& g); - - /*! - * Perform FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute - * @param g Field in which to store the transformation - */ - void transform(TransformDirection direction, RealField& f, ComplexField& g); - - private: - typename Base::template temp_view_type tempFieldComplex; - }; - - /** - Sine transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendSine) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendSine); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! 
- * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; - /** - Cosine transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendCos) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendCos); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! - * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; - /** - Cosine type 1 transform class - */ - template - class FFT : public IN_PLACE_FFT_BASE_CLASS(Field, backendCos1) { - constexpr static unsigned Dim = Field::dim; - using Base = IN_PLACE_FFT_BASE_CLASS(Field, backendCos1); - - public: - using Base::Base; - using typename Base::heffteBackend, typename Base::workspace_t, typename Base::Layout_t; - - /*! - * Warmup the FFT object by forward & backward FFT on an empty field - * @param f Field whose transformation to compute (and overwrite) - */ - void warmup(Field& f); - - /*! 
- * Perform in-place FFT - * @param direction Forward or backward transformation - * @param f Field whose transformation to compute (and overwrite) - */ - void transform(TransformDirection direction, Field& f); - }; -} // namespace ippl - -#include "FFT/FFT.hpp" - -#endif // IPPL_FFT_FFT_H - -// vi: set et ts=4 sw=4 sts=4: -// Local Variables: -// mode:c -// c-basic-offset: 4 -// indent-tabs-mode: nil -// require-final-newline: nil -// End: diff --git a/src/FFT/FFT.hpp b/src/FFT/FFT.hpp deleted file mode 100644 index 086f6d732..000000000 --- a/src/FFT/FFT.hpp +++ /dev/null @@ -1,460 +0,0 @@ -// -// Class FFT -// The FFT class performs complex-to-complex, -// real-to-complex on IPPL Fields. -// FFT is templated on the type of transform to be performed, -// the dimensionality of the Field to transform, and the -// floating-point precision type of the Field (float or double). -// Currently, we use heffte for taking the transforms and the class FFT -// serves as an interface between IPPL and heffte. In making this interface, -// we have referred Cabana library. -// https://github.com/ECP-copa/Cabana. -// -// Copyright (c) 2021, Sriramkrishnan Muralikrishnan, -// Paul Scherrer Institut, Villigen PSI, Switzerland -// All rights reserved -// -// This file is part of IPPL. 
-// -/** - Implementations for FFT constructor/destructor and transforms -*/ - -#include "Utility/IpplTimings.h" - -#include "Field/BareField.h" - -#include "FieldLayout/FieldLayout.h" - -namespace ippl { - - template class FFT, typename Backend, typename T> - FFTBase::FFTBase(const Layout_t& layout, const ParameterList& params) { - std::array low; - std::array high; - - const NDIndex lDom = layout.getLocalNDIndex(); - domainToBounds(lDom, low, high); - - heffte::box3d inbox = {low, high}; - heffte::box3d outbox = {low, high}; - - setup(inbox, outbox, params); - } - - template class FFT, typename Backend, typename T> - void FFTBase::domainToBounds(const NDIndex& domain, - std::array& low, - std::array& high) { - low.fill(0); - high.fill(0); - - /** - * Static cast to detail::long long (uint64_t) is necessary, as heffte::box3d requires it - * like that. - */ - for (size_t d = 0; d < Dim; ++d) { - low[d] = static_cast(domain[d].first()); - high[d] = static_cast(domain[d].length() + domain[d].first() - 1); - } - } - - /** - setup performs the initialization necessary. 
- */ - template class FFT, typename Backend, typename T> - void FFTBase::setup(const heffte::box3d& inbox, - const heffte::box3d& outbox, - const ParameterList& params) { - heffte::plan_options heffteOptions = heffte::default_options(); - - if (!params.get("use_heffte_defaults")) { - heffteOptions.use_pencils = params.get("use_pencils"); - heffteOptions.use_reorder = params.get("use_reorder"); -#ifdef Heffte_ENABLE_GPU - heffteOptions.use_gpu_aware = params.get("use_gpu_aware"); -#endif - - switch (params.get("comm")) { - case a2a: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoall; - break; - case a2av: - heffteOptions.algorithm = heffte::reshape_algorithm::alltoallv; - break; - case p2p: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p; - break; - case p2p_pl: - heffteOptions.algorithm = heffte::reshape_algorithm::p2p_plined; - break; - default: - throw IpplException("FFT::setup", "Unrecognized heffte communication type"); - } - } - - if constexpr (std::is_same_v, heffte::fft3d>) { - heffte_m = std::make_shared>( - inbox, outbox, Comm->getCommunicator(), heffteOptions); - } else { - heffte_m = std::make_shared>( - inbox, outbox, params.get("r2c_direction"), Comm->getCommunicator(), - heffteOptions); - } - - // heffte::gpu::device_set(Comm->rank() % heffte::gpu::device_count()); - if (workspace_m.size() < heffte_m->size_workspace()) { - workspace_m = workspace_t(heffte_m->size_workspace()); - } - } - - template - void FFT::warmup(ComplexField& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, ComplexField& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) 
even though this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost).real(apply(fview, args).real()); - apply(tempField, args - nghost).imag(apply(fview, args).imag()); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args).real() = apply(tempField, args - nghost).real(); - apply(fview, args).imag() = apply(tempField, args - nghost).imag(); - }); - } - - //======================================================================== - // FFT RCTransform Constructors - //======================================================================== - - /** - *Create a new FFT object of type RCTransform, with given input and output - *layouts and heffte parameters. - */ - - template - FFT::FFT(const Layout_t& layoutInput, const Layout_t& layoutOutput, - const ParameterList& params) { - /** - * Heffte requires to pass a 3D array even for 2D and - * 1D FFTs we just have to make the length in other - * dimensions to be 1. 
- */ - std::array lowInput; - std::array highInput; - std::array lowOutput; - std::array highOutput; - - const NDIndex& lDomInput = layoutInput.getLocalNDIndex(); - const NDIndex& lDomOutput = layoutOutput.getLocalNDIndex(); - - this->domainToBounds(lDomInput, lowInput, highInput); - this->domainToBounds(lDomOutput, lowOutput, highOutput); - - heffte::box3d inbox = {lowInput, highInput}; - heffte::box3d outbox = {lowOutput, highOutput}; - - this->setup(inbox, outbox, params); - } - - template - void FFT::warmup(RealField& f, ComplexField& g) { - this->transform(FORWARD, f, g); - this->transform(BACKWARD, f, g); - } - - template - void FFT::transform(TransformDirection direction, RealField& f, - ComplexField& g) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - - auto fview = f.getView(); - auto gview = g.getView(); - const int nghostf = f.getNghost(); - const int nghostg = g.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempFieldf = this->tempField; - auto& tempFieldg = this->tempFieldComplex; - if (tempFieldf.size() != f.getOwned().size()) { - tempFieldf = detail::shrinkView("tempFieldf", fview, nghostf); - } - if (tempFieldg.size() != g.getOwned().size()) { - tempFieldg = detail::shrinkView("tempFieldg", gview, nghostg); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos f field in FFT", getRangePolicy(fview, nghostf), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempFieldf, args - nghostf) = apply(fview, args); - }); - ippl::parallel_for( - "copy from Kokkos g field in FFT", getRangePolicy(gview, nghostg), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempFieldg, args - nghostg).real(apply(gview, 
args).real()); - apply(tempFieldg, args - nghostg).imag(apply(gview, args).imag()); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempFieldf.data(), tempFieldg.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempFieldg.data(), tempFieldf.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos f field FFT", getRangePolicy(fview, nghostf), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempFieldf, args - nghostf); - }); - - ippl::parallel_for( - "copy to Kokkos g field FFT", getRangePolicy(gview, nghostg), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(gview, args).real() = apply(tempFieldg, args - nghostg).real(); - apply(gview, args).imag() = apply(tempFieldg, args - nghostg).imag(); - }); - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const 
index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - 
heffte::scale::full); - } else if (direction == BACKWARD) { - this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - - template - void FFT::warmup(Field& f) { - this->transform(FORWARD, f); - this->transform(BACKWARD, f); - } - - template - void FFT::transform(TransformDirection direction, Field& f) { - static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); - -/** - * This rescaling is needed to match the normalization constant - * between fftw and the other gpu interfaces. fftw rescales with an extra factor of 8. - */ -#ifdef Heffte_ENABLE_FFTW - if (direction == FORWARD) { - f = f / 8.0; - } -#endif - - auto fview = f.getView(); - const int nghost = f.getNghost(); - - /** - *This copy to a temporary Kokkos view is needed because of following - *reasons: - *1) heffte wants the input and output fields without ghost layers - *2) heffte accepts data in layout left (by default) eventhough this - *can be changed during heffte box creation - */ - auto& tempField = this->tempField; - if (tempField.size() != f.getOwned().size()) { - tempField = detail::shrinkView("tempField", fview, nghost); - } - - using index_array_type = typename RangePolicy::index_array_type; - ippl::parallel_for( - "copy from Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(tempField, args - nghost) = apply(fview, args); - }); - - if (direction == FORWARD) { - this->heffte_m->forward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::full); - } else if (direction == BACKWARD) { 
- this->heffte_m->backward(tempField.data(), tempField.data(), this->workspace_m.data(), - heffte::scale::none); - } else { - throw std::logic_error("Only 1:forward and -1:backward are allowed as directions"); - } - - ippl::parallel_for( - "copy to Kokkos FFT", getRangePolicy(fview, nghost), - KOKKOS_LAMBDA(const index_array_type& args) { - apply(fview, args) = apply(tempField, args - nghost); - }); - -/** - * This rescaling is needed to match the normalization constant - * between fftw and the other gpu interfaces. fftw rescales with an extra factor of 8. - */ -#ifdef Heffte_ENABLE_FFTW - if (direction == BACKWARD) { - f = f * 8.0; - } -#endif - } - -} // namespace ippl - -// vi: set et ts=4 sw=4 sts=4: -// Local Variables: -// mode:c -// c-basic-offset: 4 -// indent-tabs-mode: nil -// require-final-newline: nil -// End: diff --git a/src/FFT/NUFFT/Correction.h b/src/FFT/NUFFT/Correction.h new file mode 100644 index 000000000..d383ce761 --- /dev/null +++ b/src/FFT/NUFFT/Correction.h @@ -0,0 +1,387 @@ +/*! + * @file Correction.h + * @brief Deconvolution / pre-correction kernels for the native NUFFT engine. + * + * Provides: + * - @c compute_deconvolution_factors: precomputes the per-mode complex + * factor combining the kernel Fourier transform with the cell-centered + * phase shift. + * - @c applyDeconvolutionType1: post-FFT correction for Type 1 NUFFT + * (nonuniform -> uniform). + * - @c applyPreCorrectionType2: pre-IFFT correction for Type 2 NUFFT + * (uniform -> nonuniform). + * - Pruned-FFT variants of the above for the low-mode pipeline. + * + * See the `Cell-centered phase convention` section below for the derivation + * of the phase factor. 
+ */ +#ifndef IPPL_NUFFT_CORRECTION_H +#define IPPL_NUFFT_CORRECTION_H + +#include + +#include +#include + +#include "Types/ViewTypes.h" + +#include "Utility/ParallelDispatch.h" + +#include "FFT/NUFFT/ESKernel.h" +#include "FFT/NUFFT/NUFFTUtilities.h" + +namespace ippl { + namespace nufft { + + // ==================================================================== + // Cell-centered phase convention + // ==================================================================== + // + // Scatter/gather use cell-centered DOFs at x_j = (j + 1/2) * h. + // The DFT of the scattered field is: + // + // G_hat_k = phi_hat_k * exp(+i pi k / N) * conj(f_k) + // + // where the exp(+i pi k / N) arises because the kernel is evaluated + // at (x_p / h - j - 1/2) instead of (x_p / h - j). + // + // The shared `factor` stored in the `factors` views is + // factor = deconv * exp(-i pi k / N) [negative phase] + // + // Type 1 (post-FFT): f_k = conj(G_hat_k * factor) + // Type 2 (pre-IFFT): G_hat_k = f_k * factor + // The +i pi phase needed by the cell-centered gather emerges from the + // IFFT of the conjugate-symmetric mode field, so no explicit conj() is + // applied here. + // ==================================================================== + + /** + * @brief Computes deconvolution factors for NUFFT. 
+ */ + template + void compute_deconvolution_factors( + Kokkos::View*, typename ExecSpace::memory_space> factors, + int64_t n_modes, int64_t n_grid, const ESKernel& kernel) { + using complex_type = Kokkos::complex; + + // Set up quadrature + constexpr int q = 100; + auto nodes = Kokkos::View("nodes", q); + auto weights = Kokkos::View("weights", q); + + gauss_legendre(q, nodes, weights); + + const RealType alpha = Kokkos::numbers::pi_v * kernel.width() / n_grid; + const RealType beta = kernel.beta(); + + const int64_t l_n_modes = n_modes; + constexpr int l_q = q; + + Kokkos::parallel_for( + "compute_deconv_factors", Kokkos::RangePolicy(0, n_modes), + KOKKOS_LAMBDA(const int64_t k) { + int freq = (k < l_n_modes / 2) ? k : k - l_n_modes; + RealType ft = 0.0; + + for (int i = 0; i < l_q; ++i) { + const RealType x = nodes(i); + const RealType w = weights(i); + const RealType ker = Kokkos::exp(beta * (Kokkos::sqrt(1.0 - x * x) - 1.0)); + ft += w * ker * Kokkos::cos(freq * alpha * x); + } + + const RealType deconv = 2.0 / (kernel.width() * ft); + + // Phase correction for cell-centered DOFs: exp(-i pi freq / N_grid) + // Type-1 uses this factor directly; Type-2 applies conj(factor). + const RealType phase = Kokkos::numbers::pi_v + * static_cast(freq) + / static_cast(n_grid); + factors(k) = + complex_type(deconv * Kokkos::cos(phase), deconv * Kokkos::sin(phase)); + }); + } + + /** + * @brief Apply deconvolution correction for Type 1 NUFFT (post-FFT). + * + * Type 1: nonuniform points -> uniform Fourier modes. + * After spreading (cell-centered) and FFT, the result satisfies: + * + * G_hat_k = phi_hat_k * exp(+i pi k / N) * conj(f_k) + * + * This function recovers f_k via: + * + * output_k = conj(input_k * factor_k) + * + * where factor_k = deconv_k * exp(-i pi k / N) (stored in `factors`). + * + * Operates locally; input lives on the upsampled grid, output on the + * mode grid. Entries outside the mode band are set to zero. 
+ */ + template + void applyDeconvolutionType1( + FieldIn& input, + const std::array*, typename ExecSpace::memory_space>, + Dim>& factors, + FieldOut& output, const Vector& n_modes, + const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + using factor_view_t = Kokkos::View*, + typename ExecSpace::memory_space>; + using index_array_type = typename ippl::RangePolicy::index_array_type; + + auto input_view = input.getView(); + auto output_view = output.getView(); + + const auto& layout = input.getLayout(); + const auto& lDom = layout.getLocalNDIndex(); + + const int nghost_in = input.getNghost(); + const int nghost_out = output.getNghost(); + + // Capture-by-value device-friendly arrays. + Vector local_first; + Kokkos::Array begin, end; + Kokkos::Array f_arr; + Kokkos::Array n_modes_arr; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + begin[d] = lDom[d].first(); + end[d] = lDom[d].last() + 1; + f_arr[d] = factors[d]; + n_modes_arr[d] = static_cast(n_modes[d]); + } + + ippl::parallel_for( + "deconv_type1_local", + ippl::createRangePolicy(begin, end), + KOKKOS_LAMBDA(const index_array_type& g) { + bool inside = true; + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + const int n = n_modes_arr[d]; + if (!((gi >= 0 && gi < n / 2) || (gi >= n + n / 2 && gi < 2 * n))) { + inside = false; + break; + } + } + + index_array_type idx_in, idx_out; + for (unsigned d = 0; d < Dim; ++d) { + idx_in[d] = static_cast(g[d]) - local_first[d] + nghost_in; + idx_out[d] = static_cast(g[d]) - local_first[d] + nghost_out; + } + + if (inside) { + complex_type factor(T(1), T(0)); + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + const int n = n_modes_arr[d]; + const int rescaled = (gi < n) ? 
gi : gi - n; + factor *= f_arr[d](rescaled); + } + ippl::apply(output_view, idx_out) = + Kokkos::conj(ippl::apply(input_view, idx_in) * factor); + } else { + ippl::apply(output_view, idx_out) = complex_type(0, 0); + } + }); + + Kokkos::fence(); + } + + /** + * @brief Apply pre-correction for Type 2 NUFFT (pre-IFFT). + */ + template + void applyPreCorrectionType2( + FieldIn& input, + const std::array*, typename ExecSpace::memory_space>, + Dim>& factors, + FieldOut& output, const Vector& n_modes, + const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + using factor_view_t = Kokkos::View*, + typename ExecSpace::memory_space>; + using index_array_type = typename ippl::RangePolicy::index_array_type; + + auto input_view = input.getView(); + auto output_view = output.getView(); + + const auto& layout = input.getLayout(); + const auto& lDom = layout.getLocalNDIndex(); + + const int nghost_in = input.getNghost(); + const int nghost_out = output.getNghost(); + + Vector local_first; + Kokkos::Array begin, end; + Kokkos::Array f_arr; + Kokkos::Array n_modes_arr; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + begin[d] = lDom[d].first(); + end[d] = lDom[d].last() + 1; + f_arr[d] = factors[d]; + n_modes_arr[d] = static_cast(n_modes[d]); + } + + ippl::parallel_for( + "precorr_type2_local", + ippl::createRangePolicy(begin, end), + KOKKOS_LAMBDA(const index_array_type& g) { + bool inside = true; + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + const int n = n_modes_arr[d]; + if (!((gi >= 0 && gi < n / 2) || (gi >= n + n / 2 && gi < 2 * n))) { + inside = false; + break; + } + } + + index_array_type idx_in, idx_out; + for (unsigned d = 0; d < Dim; ++d) { + idx_in[d] = static_cast(g[d]) - local_first[d] + nghost_in; + idx_out[d] = static_cast(g[d]) - local_first[d] + nghost_out; + } + + if (inside) { + complex_type factor(T(1), T(0)); + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + 
const int n = n_modes_arr[d]; + const int rescaled = (gi < n) ? gi : gi - n; + factor *= f_arr[d](rescaled); + } + ippl::apply(output_view, idx_out) = + ippl::apply(input_view, idx_in) * factor; + } else { + ippl::apply(output_view, idx_out) = complex_type(0, 0); + } + }); + + Kokkos::fence(); + } + + /** + * @brief Apply deconvolution for Type 1 NUFFT on the pruned mode grid. + */ + template + void applyDeconvolutionPruned( + FieldType& field, + const std::array*, typename ExecSpace::memory_space>, + Dim>& factors, + const Vector& n_modes, const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + using factor_view_t = Kokkos::View*, + typename ExecSpace::memory_space>; + using index_array_type = typename ippl::RangePolicy::index_array_type; + + auto view = field.getView(); + auto& layout = field.getLayout(); + auto lDom = layout.getLocalNDIndex(); + + const int nghost = field.getNghost(); + + Vector local_first; + Kokkos::Array begin, end; + Kokkos::Array f_arr; + Kokkos::Array n_modes_arr; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + begin[d] = lDom[d].first(); + end[d] = lDom[d].last() + 1; + f_arr[d] = factors[d]; + n_modes_arr[d] = static_cast(n_modes[d]); + } + + ippl::parallel_for( + "deconv_type1_pruned_local", + ippl::createRangePolicy(begin, end), + KOKKOS_LAMBDA(const index_array_type& g) { + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + if (gi < 0 || gi >= n_modes_arr[d]) return; + } + + index_array_type idx; + for (unsigned d = 0; d < Dim; ++d) { + idx[d] = static_cast(g[d]) - local_first[d] + nghost; + } + + complex_type factor(T(1), T(0)); + for (unsigned d = 0; d < Dim; ++d) { + factor *= f_arr[d](static_cast(g[d])); + } + + ippl::apply(view, idx) = Kokkos::conj(ippl::apply(view, idx) * factor); + }); + + Kokkos::fence(); + } + + /** + * @brief Apply pre-correction for Type 2 NUFFT on the pruned mode grid. 
+ */ + template + void applyPrecorrectionPruned( + FieldType& field, + const std::array*, typename ExecSpace::memory_space>, + Dim>& factors, + const Vector& n_modes, const Vector& /*n_grid*/) { + using complex_type = Kokkos::complex; + using factor_view_t = Kokkos::View*, + typename ExecSpace::memory_space>; + using index_array_type = typename ippl::RangePolicy::index_array_type; + + auto view = field.getView(); + auto& layout = field.getLayout(); + auto lDom = layout.getLocalNDIndex(); + + const int nghost = field.getNghost(); + + Vector local_first; + Kokkos::Array begin, end; + Kokkos::Array f_arr; + Kokkos::Array n_modes_arr; + for (unsigned d = 0; d < Dim; ++d) { + local_first[d] = lDom[d].first(); + begin[d] = lDom[d].first(); + end[d] = lDom[d].last() + 1; + f_arr[d] = factors[d]; + n_modes_arr[d] = static_cast(n_modes[d]); + } + + ippl::parallel_for( + "precorr_type2_pruned_local", + ippl::createRangePolicy(begin, end), + KOKKOS_LAMBDA(const index_array_type& g) { + for (unsigned d = 0; d < Dim; ++d) { + const int gi = static_cast(g[d]); + if (gi < 0 || gi >= n_modes_arr[d]) return; + } + + index_array_type idx; + for (unsigned d = 0; d < Dim; ++d) { + idx[d] = static_cast(g[d]) - local_first[d] + nghost; + } + + complex_type factor(T(1), T(0)); + for (unsigned d = 0; d < Dim; ++d) { + factor *= f_arr[d](static_cast(g[d])); + } + + ippl::apply(view, idx) = ippl::apply(view, idx) * factor; + }); + + Kokkos::fence(); + } + + } // namespace nufft +} // namespace ippl + +#endif // IPPL_NUFFT_CORRECTION_H diff --git a/src/FFT/NUFFT/ESKernel.h b/src/FFT/NUFFT/ESKernel.h new file mode 100644 index 000000000..f0ca73cd7 --- /dev/null +++ b/src/FFT/NUFFT/ESKernel.h @@ -0,0 +1,480 @@ +/*! + * @file ESKernel.h + * @brief Exponential-of-Semicircle (ES) spreading kernel for NUFFT. + * + * The header exposes: + * - @c es_kernel_eval_wN device-friendly polynomial evaluators for + * each precomputed kernel half-width N (3..15). 
+ * - @c ESKernel a runtime selector that picks the smallest width + * achieving the requested tolerance and dispatches to the matching + * polynomial. + * + * Coefficients are minimax polynomials in t=x^2 with magic literals encoded + * as hex floating-point so they are exact across compilers. + */ +#ifndef IPPL_NUFFT_ES_KERNEL_H +#define IPPL_NUFFT_ES_KERNEL_H + +#include + +#include + +namespace ippl { + namespace nufft { + + // ============================================================ + // ES Kernel polynomial evaluation for each width + // Coefficients are inlined to work in device code + // ============================================================ + + /*! + * @brief Polynomial evaluator for the ES kernel at width @c w=4 + * (beta = 9.2, degree = 5, max error < 3.53e-4). + * + * @tparam T Floating-point precision (float / double). + * @param x Argument in [0, 1] (the kernel is even, so callers pass |x|). + * @return Polynomial approximation of @c exp(beta*(sqrt(1-x^2)-1)). + */ + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w4(T x) { + const T t = x * x; + T r = T(-0x1.7a61695c211e2p0); + r = r * t + T(0x1.7741c5626c40bp2); + r = r * t + T(-0x1.3ccc694d94a47p3); + r = r * t + T(0x1.22ccba2dfd6a5p3); + r = r * t + T(-0x1.24a8469892c07p2); + r = r * t + T(0x1.ffd1c0e2c6db8p-1); + return r; + } + + // w = 5, beta = 11.5, degree = 7, error < 1.74e-5 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w5(T x) { + const T t = x * x; + T r = T(-0x1.53ef7dfa4cddap0); + r = r * t + T(0x1.c1971511f9acbp2); + r = r * t + T(-0x1.0ed3008908634p4); + r = r * t + T(0x1.8a1fbea0250d2p4); + r = r * t + T(-0x1.7b28be8632757p4); + r = r * t + T(0x1.e1574df952c56p3); + r = r * t + T(-0x1.6fd9a0a46e05ap2); + r = r * t + T(0x1.fffdb8228ab29p-1); + return r; + } + + // w = 6, beta = 13.8, degree = 9, error < 8.50e-7 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w6(T x) { + const T t = x * x; + T r = T(-0x1.2290d68e62bcfp0); + r = r * t + T(0x1.d9edacd03ff7ep2); + 
r = r * t + T(-0x1.6afaba0b81c05p4); + r = r * t + T(0x1.5e4576519fd7ep5); + r = r * t + T(-0x1.dd17e73ccea97p5); + r = r * t + T(0x1.ddcc59a8e52adp5); + r = r * t + T(-0x1.5d0a435b3428ep5); + r = r * t + T(0x1.612ebf7fd3207p4); + r = r * t + T(-0x1.b996b31b5ba67p2); + r = r * t + T(0x1.ffffe37649033p-1); + return r; + } + + // w = 7, beta = 16.1, degree = 10, error < 4.52e-7 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w7(T x) { + const T t = x * x; + T r = T(0x1.0d43fb941e8cp1); + r = r * t + T(-0x1.d07f762f83e4cp3); + r = r * t + T(0x1.76aa8fe35ec4ep5); + r = r * t + T(-0x1.7b643526b5bfcp6); + r = r * t + T(0x1.0ff008e09251bp7); + r = r * t + T(-0x1.23a681af8ca75p7); + r = r * t + T(0x1.da7c21b010748p6); + r = r * t + T(-0x1.1eb0a0c2150d6p6); + r = r * t + T(0x1.e6250a3d3d121p4); + r = r * t + T(-0x1.0198abec8a4eep3); + r = r * t + T(0x1.fffff0d9a4617p-1); + return r; + } + + // w = 8, beta = 18.4, degree = 12, error < 2.33e-8 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w8(T x) { + const T t = x * x; + T r = T(0x1.d5cd3d519edbcp0); + r = r * t + T(-0x1.da1032722476ep3); + r = r * t + T(0x1.c5cddddf485b9p5); + r = r * t + T(-0x1.1533735f3e975p7); + r = r * t + T(0x1.e9cf3a5cff485p7); + r = r * t + T(-0x1.4e105fc4aa621p8); + r = r * t + T(0x1.6a81ebd6683f4p8); + r = r * t + T(-0x1.3a18274b0a1e3p8); + r = r * t + T(0x1.ab15412d571b6p7); + r = r * t + T(-0x1.b70af68ed9b3fp6); + r = r * t + T(0x1.4028006f3c69bp5); + r = r * t + T(-0x1.26665564ca36p3); + r = r * t + T(0x1.ffffff37dec0bp-1); + return r; + } + + // w = 9, beta = 20.7, degree = 14, error < 1.20e-9 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w9(T x) { + const T t = x * x; + T r = T(0x1.90982143e4ae3p0); + r = r * t + T(-0x1.cd7480dda6f42p3); + r = r * t + T(0x1.fcc7e2c6cba0dp5); + r = r * t + T(-0x1.694ede76783d8p7); + r = r * t + T(0x1.7752cfcde75a6p8); + r = r * t + T(-0x1.31e18cfaaedc8p9); + r = r * t + T(0x1.968a12dc2f151p9); + r = r * t + T(-0x1.bf8cb2e1ddc74p9); + r = r * t 
+ T(0x1.97659066136f6p9); + r = r * t + T(-0x1.2e19d0407765dp9); + r = r * t + T(0x1.6374318a4696bp8); + r = r * t + T(-0x1.3e98a86653c26p7); + r = r * t + T(0x1.97ca274772c9fp5); + r = r * t + T(-0x1.4b33320a95fccp3); + r = r * t + T(0x1.fffffff5b40bp-1); + return r; + } + + // w = 10, beta = 23.0, degree = 16, error < 6.20e-11 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w10(T x) { + const T t = x * x; + T r = T(0x1.5017876c93c63p0); + r = r * t + T(-0x1.b266030e0096ap3); + r = r * t + T(0x1.0e462bf649f05p6); + r = r * t + T(-0x1.b3a48c1d9288p7); + r = r * t + T(0x1.025a87a6770c9p9); + r = r * t + T(-0x1.e4e7636a1af58p9); + r = r * t + T(0x1.780fc81f87388p10); + r = r * t + T(-0x1.ed395a8d098afp10); + r = r * t + T(0x1.1373eb09a6122p11); + r = r * t + T(-0x1.04dade35c1013p11); + r = r * t + T(0x1.9d913dd2b0b9ep10); + r = r * t + T(-0x1.0d05b6008f472p10); + r = r * t + T(0x1.1733f862d30f6p9); + r = r * t + T(-0x1.bbb54324b711ap7); + r = r * t + T(0x1.f9fffe1fb327ap5); + r = r * t + T(-0x1.6fffffec5ff52p3); + r = r * t + T(0x1.ffffffff77951p-1); + return r; + } + + // w = 11, beta = 25.3, degree = 17, error < 3.20e-11 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w11(T x) { + const T t = x * x; + T r = T(-0x1.59009930566c2p1); + r = r * t + T(0x1.cd02dcaa304dcp4); + r = r * t + T(-0x1.27b6792d84287p7); + r = r * t + T(0x1.e93b8598d8732p8); + r = r * t + T(-0x1.2827b4bce0869p10); + r = r * t + T(0x1.1a3cd50b8ebb6p11); + r = r * t + T(-0x1.bb6bf72d60cf3p11); + r = r * t + T(0x1.27231ed89e683p12); + r = r * t + T(-0x1.5123d0d1023adp12); + r = r * t + T(0x1.4ad53cddfcaf1p12); + r = r * t + T(-0x1.14cc6d44fbd4ap12); + r = r * t + T(0x1.856985c4590bep11); + r = r * t + T(-0x1.c3591519272cep10); + r = r * t + T(0x1.a2f8d49451968p9); + r = r * t + T(-0x1.2af4cfa9b5757p8); + r = r * t + T(0x1.33651e1b6dc6p6); + r = r * t + T(-0x1.94ccccc16875dp3); + r = r * t + T(0x1.ffffffffb99d9p-1); + return r; + } + + // w = 12, beta = 27.6, degree = 19, error < 1.65e-12 
+ template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w12(T x) { + const T t = x * x; + T r = T(-0x1.279db73ec08b8p1); + r = r * t + T(0x1.b36c753c2aa7dp4); + r = r * t + T(-0x1.3537a8d5dacp7); + r = r * t + T(0x1.1c4ad5335b3c1p9); + r = r * t + T(-0x1.7febf4612070ap10); + r = r * t + T(0x1.99bccfa2b45ffp11); + r = r * t + T(-0x1.6a7e6841cdd94p12); + r = r * t + T(0x1.1229b3df0dde7p13); + r = r * t + T(-0x1.68b72ed61ef3ap13); + r = r * t + T(0x1.9f70e8e221323p13); + r = r * t + T(-0x1.a21e19c0501eap13); + r = r * t + T(0x1.6d0860e26b0f4p13); + r = r * t + T(-0x1.11406ebe54d37p13); + r = r * t + T(0x1.594e7a8cbab51p12); + r = r * t + T(-0x1.68bd26f203d21p11); + r = r * t + T(0x1.2ed3db062aa1p10); + r = r * t + T(-0x1.8820826cc8dc9p8); + r = r * t + T(0x1.6f147ad4fda94p6); + r = r * t + T(-0x1.b9999998e030ap3); + r = r * t + T(0x1.fffffffffc5dcp-1); + return r; + } + + // w = 13, beta = 29.9, degree = 21, error < 8.48e-14 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w13(T x) { + const T t = x * x; + T r = T(-0x1.f5b0db6099cd3p0); + r = r * t + T(0x1.93556b5289c77p4); + r = r * t + T(-0x1.39b591914c767p7); + r = r * t + T(0x1.3cc43c9c9dd5p9); + r = r * t + T(-0x1.d6e0b91c3f952p10); + r = r * t + T(0x1.1530d489678cfp12); + r = r * t + T(-0x1.0f58cc3b57c6fp13); + r = r * t + T(0x1.c8810a3e2b99cp13); + r = r * t + T(-0x1.50d1cdb6d5abcp14); + r = r * t + T(0x1.b8757eaf1ebap14); + r = r * t + T(-0x1.ffa40c1c3b2a6p14); + r = r * t + T(0x1.07216db3d2af2p15); + r = r * t + T(-0x1.dc0307c1e538p14); + r = r * t + T(0x1.76f66049cee1ap14); + r = r * t + T(-0x1.fc0d2356bef94p13); + r = r * t + T(0x1.2354fa3750e5cp13); + r = r * t + T(-0x1.14f973d66e8afp12); + r = r * t + T(0x1.a85e5aa22f6bap10); + r = r * t + T(-0x1.f6e308d0ea5eap8); + r = r * t + T(0x1.b00e1479f5c26p6); + r = r * t + T(-0x1.de6666665ae9p3); + r = r * t + T(0x1.ffffffffffd04p-1); + return r; + } + + // w = 14, beta = 32.2, degree = 22, error < 4.69e-14 + template + KOKKOS_INLINE_FUNCTION T 
es_kernel_eval_w14(T x) { + const T t = x * x; + T r = T(0x1.0354efce10e2dp2); + r = r * t + T(-0x1.ae25418562cf8p5); + r = r * t + T(0x1.58827d3735653p8); + r = r * t + T(-0x1.65245773d1613p10); + r = r * t + T(0x1.0f606d2c65324p12); + r = r * t + T(-0x1.44f69b9ed1fe3p13); + r = r * t + T(0x1.41dfc1b9c2d3ap14); + r = r * t + T(-0x1.10df39932e40cp15); + r = r * t + T(0x1.9536924063e52p15); + r = r * t + T(-0x1.0b15d6e06f48p16); + r = r * t + T(0x1.3a48e2051a128p16); + r = r * t + T(-0x1.49fabd3679aeep16); + r = r * t + T(0x1.33b8d92983ebfp16); + r = r * t + T(-0x1.f9f712f3c39a1p15); + r = r * t + T(0x1.6b15557642d58p15); + r = r * t + T(-0x1.c13113d9b8e7fp14); + r = r * t + T(0x1.d7772ec3bd135p13); + r = r * t + T(-0x1.9b2d52745b9d6p12); + r = r * t + T(0x1.21a51b7b71831p11); + r = r * t + T(-0x1.3c60dfe4bed5dp9); + r = r * t + T(0x1.f651eb8483554p6); + r = r * t + T(-0x1.019999999621ep4); + r = r * t + T(0x1.ffffffffffe5bp-1); + return r; + } + + // w = 15, beta = 34.5, degree = 24, error < 2.46e-15 + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval_w15(T x) { + const T t = x * x; + T r = T(0x1.bcc3fd37452d5p1); + r = r * t + T(-0x1.8eb424f00a28dp5); + r = r * t + T(0x1.5a19d25b45029p8); + r = r * t + T(-0x1.85c332c280f8dp10); + r = r * t + T(0x1.424ce09ce2f3ap12); + r = r * t + T(-0x1.a49341a23526dp13); + r = r * t + T(0x1.c69265bf4a9d7p14); + r = r * t + T(-0x1.a5637540ea295p15); + r = r * t + T(0x1.57650b877caaep16); + r = r * t + T(-0x1.f3de53de15289p16); + r = r * t + T(0x1.47a7b42596f6ep17); + r = r * t + T(-0x1.83c339a13039ep17); + r = r * t + T(0x1.9d812c92ca63fp17); + r = r * t + T(-0x1.8b81cbd899304p17); + r = r * t + T(0x1.51124b72fffedp17); + r = r * t + T(-0x1.fbb1cbaa2b528p16); + r = r * t + T(0x1.4e6e67a82c9d4p16); + r = r * t + T(-0x1.7c8f736d5aefp15); + r = r * t + T(0x1.70230df3fd36ap14); + r = r * t + T(-0x1.28835f02a5b9bp13); + r = r * t + T(0x1.829acbfacfe6bp11); + r = r * t + T(-0x1.87a0ffff89722p9); + r = r * t + T(0x1.20effffffa61ap7); + 
r = r * t + T(-0x1.13ffffffffc9fp4); + r = r * t + T(0x1.fffffffffffebp-1); + return r; + } + + // ============================================================ + // Runtime width dispatch + // ============================================================ + + /** + * @brief Runtime width dispatch for ES kernel evaluation + * @param x Evaluation point in [0, 1] + * @param w Kernel width (4-15) + * @return Approximation to exp(beta * (sqrt(1-x^2) - 1)) + */ + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval(T x, int w) { + switch (w) { + case 4: + return es_kernel_eval_w4(x); + case 5: + return es_kernel_eval_w5(x); + case 6: + return es_kernel_eval_w6(x); + case 7: + return es_kernel_eval_w7(x); + case 8: + return es_kernel_eval_w8(x); + case 9: + return es_kernel_eval_w9(x); + case 10: + return es_kernel_eval_w10(x); + case 11: + return es_kernel_eval_w11(x); + case 12: + return es_kernel_eval_w12(x); + case 13: + return es_kernel_eval_w13(x); + case 14: + return es_kernel_eval_w14(x); + case 15: + return es_kernel_eval_w15(x); + default: + // Fallback to exact evaluation for unsupported widths + return Kokkos::exp(T(2.30) * w * (Kokkos::sqrt(T(1) - x * x) - T(1))); + } + } + + // ============================================================ + // Compile-time width dispatch (template version) + // ============================================================ + + /*! + * @brief Compile-time width dispatch for ES kernel evaluation. + * + * The compiler can fold a single branch when @p W is known, removing + * the runtime switch in @c es_kernel_eval(T,int). + * + * @tparam T Floating-point precision. + * @tparam W Kernel width (4..15); other values fall back to exact eval. + * @param x Argument in [0, 1]. 
+ */ + template + KOKKOS_INLINE_FUNCTION T es_kernel_eval(T x) { + if constexpr (W == 4) { + return es_kernel_eval_w4(x); + } else if constexpr (W == 5) { + return es_kernel_eval_w5(x); + } else if constexpr (W == 6) { + return es_kernel_eval_w6(x); + } else if constexpr (W == 7) { + return es_kernel_eval_w7(x); + } else if constexpr (W == 8) { + return es_kernel_eval_w8(x); + } else if constexpr (W == 9) { + return es_kernel_eval_w9(x); + } else if constexpr (W == 10) { + return es_kernel_eval_w10(x); + } else if constexpr (W == 11) { + return es_kernel_eval_w11(x); + } else if constexpr (W == 12) { + return es_kernel_eval_w12(x); + } else if constexpr (W == 13) { + return es_kernel_eval_w13(x); + } else if constexpr (W == 14) { + return es_kernel_eval_w14(x); + } else if constexpr (W == 15) { + return es_kernel_eval_w15(x); + } else { + // Fallback to exact evaluation for unsupported widths + return Kokkos::exp(T(2.30) * W * (Kokkos::sqrt(T(1) - x * x) - T(1))); + } + } + + // ============================================================ + // ESKernel class + // ============================================================ + + /** + * @brief Exponential-of-Semicircle (ES) spreading kernel. + * + * The ES kernel is defined as: + * phi(x) = exp(beta * (sqrt(1 - x^2) - 1)) for |x| < 1 + * = 0 otherwise + * + * Width (w) and beta are derived from a user-chosen error tolerance. + * + * @tparam T Floating point type (float or double) + */ + template + class ESKernel { + public: + static constexpr bool has_width_template = true; + // Upper bound on the runtime width; matches the precomputed + // ES-kernel polynomial expansions (w = 4..15) used by NativeNUFFT. + static constexpr int max_width = 15; + using value_type = T; + + static constexpr T default_tol = T(1e-10); + static constexpr T beta_factor = T(2.30); + + /** + * @brief Construct kernel with given tolerance. 
+ * @param tol Error tolerance for NUFFT accuracy + * + * The width is ceil(log10(1 / tol)) + 1 and beta is beta_factor * width. + * max_width is not enforced here: widths outside 4..15 are served by the + * exact-evaluation fallback of es_kernel_eval. + */ + KOKKOS_INLINE_FUNCTION explicit ESKernel(T tol = default_tol) + : w_(static_cast(Kokkos::ceil(Kokkos::log10(T(1.0) / tol))) + 1) + , beta_(beta_factor * w_) + , tol_(tol) {} + + /** + * @brief Construct kernel with explicit width and beta. + * @param width Kernel width (number of grid points) + * @param beta Beta parameter controlling decay + * + * NOTE(review): operator() and eval() go through es_kernel_eval, which is + * driven by the width alone (its fallback hard-codes 2.30 * w), so a beta + * other than beta_factor * width is only reported by beta() and does not + * appear to change the kernel values -- confirm intent. + */ + KOKKOS_INLINE_FUNCTION ESKernel(int width, T beta) + : w_(width) + , beta_(beta) + , tol_(default_tol) {} + + /** + * @brief Evaluate the ES kernel at position x in [-1, 1]. + * @param x Normalized position + * @return Normalized kernel value + */ + KOKKOS_INLINE_FUNCTION T operator()(T x) const { + x = Kokkos::abs(x); + return x >= T(1.0) ? T(0.0) : es_kernel_eval(x, w_); + } + + /*! + * @brief Evaluate the ES kernel using compile-time width @c W. + * @tparam W Kernel width matching one of the precomputed polynomials. + * @param x Normalized position; its magnitude is used, as in operator(). + * @return Kernel value, or 0 when |x| >= 1. + */ + template + KOKKOS_INLINE_FUNCTION T eval(T x) const { + // Fold to |x| first so the support cut-off also rejects x <= -1, matching operator(). + x = Kokkos::abs(x); + return x >= T(1.0) ? T(0.0) : es_kernel_eval(x); + } + + //! @return Runtime kernel width @c w (in grid points). + KOKKOS_INLINE_FUNCTION int width() const { return w_; } + //! @return Decay parameter @c beta = beta_factor * w. + KOKKOS_INLINE_FUNCTION T beta() const { return beta_; } + //! @return Target relative error tolerance. + KOKKOS_INLINE_FUNCTION T tol() const { return tol_; } + + private: + int w_; + T beta_; + T tol_; + }; + + } // namespace NUFFT +} // namespace ippl + + +#endif // IPPL_NUFFT_ES_KERNEL_H diff --git a/src/FFT/NUFFT/NUFFTUtilities.h b/src/FFT/NUFFT/NUFFTUtilities.h new file mode 100644 index 000000000..37cc4b305 --- /dev/null +++ b/src/FFT/NUFFT/NUFFTUtilities.h @@ -0,0 +1,65 @@ +#ifndef IPPL_NUFFT_UTILITIES_H +#define IPPL_NUFFT_UTILITIES_H + +/** + * @file NUFFTUtilities.h + * @brief NUFFT utility helpers used by the native Kokkos NUFFT engine. 
+ */ + +#include +#include + +#include + +namespace ippl { +namespace nufft { + + /** + * @brief Computes Gauss-Legendre quadrature nodes and weights on [-1, 1]. + * + * Runs on host and copies results to the device-accessible views. + * Each node is a root of the degree-n Legendre polynomial, refined by + * Newton iteration. Only the first (n + 1) / 2 roots are solved; entry + * n - 1 - i is then set to the mirror image of entry i. + * + * @param n Number of quadrature points. + * @param nodes Output view; entries 0..n-1 are written. + * @param weights Output view; entries 0..n-1 are written. + */ + template + void gauss_legendre(int n, + Kokkos::View &nodes, + Kokkos::View &weights) { + constexpr RealType eps = std::numeric_limits::epsilon(); + + auto h_nodes = Kokkos::create_mirror_view(Kokkos::HostSpace(), nodes); + auto h_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(), weights); + + // Solve only the first half (plus the middle root for odd n); the + // mirrored entries are filled in by the symmetry loop below. + for (int i = 0; i < (n + 1) / 2; ++i) { + RealType x = std::cos(Kokkos::numbers::pi_v * (i + 0.75) / (n + 0.5)); + RealType pp, delta; + + do { + RealType p1 = 1.0, p2 = 0.0; + for (int j = 0; j < n; ++j) { + const RealType p3 = p2; + p2 = p1; + p1 = ((2.0 * j + 1.0) * x * p2 - j * p3) / (j + 1.0); + } + pp = n * (x * p1 - p2) / (x * x - 1.0); + delta = p1 / pp; + x -= delta; + } while (std::abs(delta) > eps); + + h_nodes(i) = x; + h_weights(i) = 2.0 / ((1.0 - x * x) * pp * pp); + } + + // Exploit symmetry: node n-1-i is the negative of node i, with equal weight + for (int i = 0; i < n / 2; ++i) { + const int j = n - 1 - i; + h_nodes(j) = -h_nodes(i); + h_weights(j) = h_weights(i); + } + + Kokkos::deep_copy(nodes, h_nodes); + Kokkos::deep_copy(weights, h_weights); + } + +} // namespace nufft +} // namespace ippl + +#endif // IPPL_NUFFT_UTILITIES_H diff --git a/src/FFT/NUFFT/NativeNUFFT.h b/src/FFT/NUFFT/NativeNUFFT.h new file mode 100644 index 000000000..b75cc176a --- /dev/null +++ b/src/FFT/NUFFT/NativeNUFFT.h @@ -0,0 +1,383 @@ +// +// Native NUFFT Implementation +// NUFFT using kernel-based scatter/gather and heFFTe FFT. +// Does not depend on external NUFFT libraries. 
+// +#ifndef IPPL_NATIVE_NUFFT_H +#define IPPL_NATIVE_NUFFT_H + +#include + +#include +#include +#include +#include + +#include "Types/Vector.h" + +#include "Utility/IpplTimings.h" + +#include "Field/Field.h" + +#include "../../Interpolation/Scatter/ScatterConfig.h" +#include "Correction.h" +#include "FFT/FFT.h" +#include "FFT/NUFFT/ESKernel.h" +#include "FFT/NUFFT/NUFFTUtilities.h" +#include "Particle/ParticleAttrib.h" + +namespace ippl { + namespace nufft { + + /** + * @brief NUFFT implementation. + * + * Type 1: Spread from nonuniform points to uniform Fourier modes + * (scatter -> FFT -> deconvolution) + * + * Type 2: Interpolate uniform Fourier modes at nonuniform points + * (correction -> FFT -> gather) + * + * @tparam Dim Number of dimensions + * @tparam T Floating point type + * @tparam ExecSpace Kokkos execution space + */ + template + class NativeNUFFT { + public: + using execution_space = ExecSpace; + using memory_space = typename ExecSpace::memory_space; + using complex_type = Kokkos::complex; + using size_type = size_t; + + // View types + using complex_view_1d = Kokkos::View; + using real_view_1d = Kokkos::View; + + // Field types + using Mesh_t = UniformCartesian; + using Centering_t = Cell; + using ComplexField = + Field::uniform_type; + using Layout_t = FieldLayout; + + /*! + * @struct Config + * @brief NUFFT engine configuration knobs. + */ + struct Config { + T tol = T(1e-6); //!< Target relative error of the kernel. + T sigma = T(2.0); //!< Oversampling factor for the upsampled grid. + Interpolation::ScatterConfig scatter_config; //!< Scatter (Type 1) tuning. + Interpolation::GatherConfig gather_config; //!< Gather (Type 2) tuning. + }; + + /*! + * @struct TimingInfo + * @brief Per-stage cumulative timings (seconds), populated by the engine. + */ + struct TimingInfo { + T spread = 0; //!< Time spent in the scatter step. + T fft = 0; //!< Time spent in the FFT step. + T correct = 0; //!< Time spent in the deconvolution / pre-correction step. 
+ T total = 0; //!< Sum of the above. + }; + + private: + Config cfg_; + ESKernel kernel_; + Vector n_modes_; + Vector n_grid_; + + // Deconvolution factors per dimension + std::array factors_; + + // Upsampled grid for FFT + std::unique_ptr grid_field_; + std::unique_ptr grid_layout_; + std::unique_ptr grid_mesh_; + + // heFFTe FFT object + std::unique_ptr> heffte_fft_; + std::unique_ptr> pruned_fft_; + + TimingInfo timing_; + bool initialized_ = false; + bool use_upsampled_ = false; + + public: + /** + * @brief Construct NativeNUFFT with given mode counts. + * + * Also sizes the upsampled grid: per axis, std::bit_ceil of the larger + * of sigma * n_modes and 2 * kernel width. + * + * @param n_modes Number of Fourier modes per dimension + * @param use_upsampled Selects the FFT plan built by initialize(): the + * full upsampled-grid FFT (true) or the pruned FFT (false) + * @param cfg Configuration parameters + */ + NativeNUFFT(const Vector& n_modes, bool use_upsampled, Config cfg = {}) + : cfg_(cfg) + , kernel_(cfg.tol) + , n_modes_(n_modes) + , use_upsampled_(use_upsampled) { + // Compute upsampled grid sizes + for (unsigned d = 0; d < Dim; ++d) { + n_grid_[d] = std::bit_ceil( + std::max(cfg_.sigma * n_modes_[d], 2 * kernel_.width())); + } + initialized_ = false; + } + + /** + * @brief Initialize the NUFFT with a layout. + * + * Must be called before transform operations. Returns immediately if + * the engine is already initialized. + * + * @param modes_layout Layout handed to the pruned FFT plan together with + * the upsampled-grid layout (presumably the layout of + * the Fourier-mode field -- confirm against callers) + * @param comm MPI communicator + */ + void initialize(const Layout_t& modes_layout, const MPI_Comm& comm = MPI_COMM_WORLD) { + if (initialized_) + return; + + static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("NativeNUFFT::init"); + IpplTimings::startTimer(initTimer); + + // Create index domain for upsampled grid + NDIndex domain; + for (unsigned d = 0; d < Dim; ++d) { + domain[d] = Index(n_grid_[d]); + } + + // Create decomposition + std::array isParallel; + isParallel.fill(true); + + // For odd kernel width W, a point in the upper half of the last + // segment needs both the cell value at the end and the kernel support + // that extends w/2 into the halo, hence W / 2 + 1 ghost layers + // (which equals (W + 1) / 2 for odd W). 
+ const int nghost = (kernel_.width()) / 2 + 1; + + grid_layout_ = std::make_unique(comm, domain, isParallel, true, nghost); + + // Create mesh for upsampled grid + Vector origin, hx; + for (unsigned d = 0; d < Dim; ++d) { + origin[d] = 0; + T extent = T(2.0) * Kokkos::numbers::pi_v; + hx[d] = extent / n_grid_[d]; + } + + grid_mesh_ = std::make_unique(domain, hx, origin); + + // Native NUFFT uses field ghosts directly instead of creating an + // extended grid; carry kernel-width ghost cells on the field itself. + grid_field_ = std::make_unique(*grid_mesh_, *grid_layout_, nghost); + + // Initialize heFFTe FFT + if (use_upsampled_) { + ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", 3); + heffte_fft_ = + std::make_unique>(*grid_layout_, fftParams); + } else { + ParameterList fftParams; + fftParams.add("use_heffte_defaults", false); + fftParams.add("use_pencils", true); + fftParams.add("use_reorder", false); + fftParams.add("use_gpu_aware", true); + fftParams.add("comm", 2); + fftParams.add("num_concurrent_ffts", 4); + PruningParams pruning_params; + // Set pruning params to output the desired n_modes_, not n_grid_/2 + for (int d = 0; d < static_cast(Dim); ++d) { + pruning_params.n_modes[d] = n_modes_[d]; + } + + pruned_fft_ = std::make_unique>( + *grid_layout_, modes_layout, pruning_params, fftParams); + } + + // Precompute deconvolution factors + ESKernel nufft_kernel(cfg_.tol); + for (unsigned d = 0; d < Dim; ++d) { + factors_[d] = complex_view_1d("deconv_factors", n_modes_[d]); + + ippl::nufft::compute_deconvolution_factors( + factors_[d], static_cast(n_modes_[d]), + static_cast(n_grid_[d]), nufft_kernel); + } + + Kokkos::fence(); + + initialized_ = true; + IpplTimings::stopTimer(initTimer); + } + + /** + * @brief Type 1 NUFFT: Spread from nonuniform points to uniform Fourier modes. 
+ * + * Computes f_k = sum_j c_j * exp(i * k * x_j) for uniform k. + * + * @tparam Properties ParticleAttrib properties + * @tparam OutField Output field type + * @param R Particle positions in [0, 2*pi)^Dim + * @param Q Particle values (input) + * @param f Output Fourier modes field + * @param upsampled_output Whether the output field is the upsampled grid. Required on + * distributed for now + */ + template + void type1(const ParticleAttrib, Properties...>& R, + const ParticleAttrib& Q, OutField& f, + bool upsampled_output = false) { + if (!initialized_) { + throw IpplException("NativeNUFFT::type1", + "NUFFT not initialized. Call initialize() first."); + } + static IpplTimings::TimerRef NativeNUFFT1Timer = IpplTimings::getTimer("NativeNUFFT1"); + IpplTimings::startTimer(NativeNUFFT1Timer); + + + static IpplTimings::TimerRef scatterTimer = IpplTimings::getTimer("scatterTimerNUFFT1"); + IpplTimings::startTimer(scatterTimer); + Kokkos::deep_copy(grid_field_->getView(), 0.0); + + Q.scatter_kernel(*grid_field_, R, kernel_, cfg_.scatter_config); + Kokkos::fence(); + IpplTimings::stopTimer(scatterTimer); + + static IpplTimings::TimerRef fftTimer = IpplTimings::getTimer("FFTNUFFT1"); + IpplTimings::startTimer(fftTimer); + // Step 2: FFT (backward full-grid transform, or forward pruned transform) + if (upsampled_output) { + performFFT(-1); + } else { + performPrunedFFT(1, f); + } + IpplTimings::stopTimer(fftTimer); + + static IpplTimings::TimerRef deconvolutionTimer = IpplTimings::getTimer("deconvolutionNUFFT1"); + IpplTimings::startTimer(deconvolutionTimer); + // Step 3: Deconvolution and truncation to output modes + if (upsampled_output) { + applyDeconvolutionType1( + *grid_field_, factors_, f, n_modes_, n_grid_); + } else { + applyDeconvolutionPruned( + f, factors_, n_modes_, n_grid_); + } + IpplTimings::stopTimer(deconvolutionTimer); + IpplTimings::stopTimer(NativeNUFFT1Timer); + + } + + + /*! + * @brief Type 2 NUFFT: Interpolate uniform Fourier modes at non-uniform points. 
+ * + * Computes c_j = sum_k f_k * exp(-i k x_j) for non-uniform x_j. + * Pipeline: pre-correction -> inverse FFT (or pruned FFT) -> gather. + * + * @tparam InField Field type holding the input Fourier modes. + * @param f Input Fourier-mode field. + * @param R Particle positions in [0, 2*pi)^Dim. + * @param Q Output particle values. + * @param upsampled_output Use the full upsampled grid (true) or the + * pruned-FFT path (false; required on + * distributed runs). + */ + template + void type2(InField& f, const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, bool upsampled_output = false) { + if (!initialized_) { + throw IpplException("NativeNUFFT::type2", + "NUFFT not initialized. Call initialize() first."); + } + static IpplTimings::TimerRef NativeNUFFT2Timer = IpplTimings::getTimer("NativeNUFFT2"); + IpplTimings::startTimer(NativeNUFFT2Timer); + + // ============================================================ + // Step 1: Apply pre-correction + // ============================================================ + static IpplTimings::TimerRef PrecorrectionTimer = IpplTimings::getTimer("PrecorrectionNUFFT2"); + IpplTimings::startTimer(PrecorrectionTimer); + + if (upsampled_output) { + applyPreCorrectionType2( + f, factors_, *grid_field_, n_modes_, n_grid_); + } else { + applyPrecorrectionPruned( + f, factors_, n_modes_, n_grid_); + } + IpplTimings::stopTimer(PrecorrectionTimer); + + Kokkos::fence(); + + // ============================================================ + // Step 2: Inverse FFT + // ============================================================ + static IpplTimings::TimerRef FFTTimer = IpplTimings::getTimer("FFTNUFFT2"); + IpplTimings::startTimer(FFTTimer); + if (upsampled_output) { + performFFT(-1); // operates on grid_field_ + } else { + performPrunedFFT(-1, f); // writes into grid_field_ + } + IpplTimings::stopTimer(FFTTimer); + Kokkos::fence(); + + // ============================================================ + // Step 3: Gather/interpolate at 
particle positions + // ============================================================ + static IpplTimings::TimerRef GatherTimer = IpplTimings::getTimer("GatherNUFFT2"); + IpplTimings::startTimer(GatherTimer); + Q.gather(*grid_field_, R, kernel_, false, cfg_.gather_config); + IpplTimings::stopTimer(GatherTimer); + Kokkos::fence(); + IpplTimings::stopTimer(NativeNUFFT2Timer); + } + + //! @return Last-recorded per-stage timings. + const TimingInfo& timing() const { return timing_; } + //! @return The ES spreading kernel used internally. + const ESKernel& kernel() const { return kernel_; } + //! @return Size of the upsampled grid actually transformed. + Vector gridSize() const { return n_grid_; } + //! @return Number of Fourier modes the engine retains per axis. + Vector numModes() const { return n_modes_; } + + //! Reset accumulated timings back to zero. + void resetTimings() { + timing_ = TimingInfo{}; + } + + /*! + * @brief Perform the upsampled-grid FFT in place on @c grid_field_. + * @param sign +1 for forward, -1 for backward. + * @throws IpplException if the upsampled-grid FFT plan does not exist; + * initialize() only builds it when the engine was constructed + * with use_upsampled = true. + */ + void performFFT(int sign) { + // Only one of heffte_fft_ / pruned_fft_ is created by initialize(); + // fail loudly instead of dereferencing a null plan. + if (!heffte_fft_) { + throw IpplException("NativeNUFFT::performFFT", "Upsampled-grid FFT plan missing: construct with use_upsampled = true and call initialize()."); + } + TransformDirection direction = (sign < 0) ? BACKWARD : FORWARD; + heffte_fft_->transform(direction, *grid_field_); + } + + /*! + * @brief Perform the pruned FFT between @c grid_field_ and @p output_field. + * @param sign +1 (forward, grid -> modes) or -1 (backward, modes -> grid). + * @param output_field Pruned-modes field on the output side. + * @throws IpplException if the pruned FFT plan does not exist; + * initialize() only builds it when the engine was constructed + * with use_upsampled = false. + */ + void performPrunedFFT(int sign, auto& output_field) { + // See performFFT: guard against the plan that was not built. + if (!pruned_fft_) { + throw IpplException("NativeNUFFT::performPrunedFFT", "Pruned FFT plan missing: construct with use_upsampled = false and call initialize()."); + } + TransformDirection direction = (sign < 0) ? BACKWARD : FORWARD; + if (sign < 0) { + pruned_fft_->transform(direction, output_field, *grid_field_, 1); + } else { + pruned_fft_->transform(direction, *grid_field_, output_field, -1); + } + } + }; + + } // namespace nufft +} // namespace ippl + +#endif // IPPL_NATIVE_NUFFT_H diff --git a/src/FFT/Traits.h b/src/FFT/Traits.h new file mode 100644 index 000000000..82172132d --- /dev/null +++ b/src/FFT/Traits.h @@ -0,0 +1,313 @@ +/*!
+ * @file Traits.h + * @brief Compile-time traits, tags and dispatch helpers for IPPL's FFT layer. + * + * Provides: + * - Transform tag types (CCTransform, RCTransform, ...) used to select an + * FFT specialization. + * - Backend feature tags (FFTW, MKL, CuFFT, RocFFT, HeffteGPU, CuFFTMp, + * Finufft, GPUFinufft) along with @c is_available_v compile-time + * queries. + * - HeffteBackend selects the heFFTe backend type for a given Kokkos + * memory space (host vs CUDA vs HIP vs SYCL). + * - Stream provides minimal RAII-style helpers for execution-space streams. + */ +#ifndef IPPL_FFT_TRAITS_H +#define IPPL_FFT_TRAITS_H + +#include + +#ifdef IPPL_ENABLE_CUFFTMP +#include +#endif + +#include +#include +#include + +namespace ippl { + //! @name FFT transform tag types + //! Empty tag types selecting an FFT specialization at compile time. + //! @{ + struct CCTransform {}; //!< Complex-to-complex. + struct RCTransform {}; //!< Real-to-complex (and inverse). + struct SineTransform {}; //!< Discrete sine transform. + struct CosTransform {}; //!< Discrete cosine transform (Type II). + struct Cos1Transform {}; //!< Discrete cosine transform variant (Type I). + struct NUFFTransform {}; //!< Non-uniform FFT (Type 1 / Type 2). + struct PrunedCCTransform {}; //!< Pruned C2C (low-mode Fourier truncation). + struct PrunedRCTransform {}; //!< Pruned R2C. + //! @} + + //! Direction of a forward / backward transform. + enum TransformDirection { + FORWARD, + BACKWARD + }; + + //! Communication algorithm for distributed transforms (used by makeHeffteOptions). + enum FFTComm { + a2a = 0, //!< MPI_Alltoall. + a2av = 1, //!< MPI_Alltoallv. + p2p = 2, //!< Point-to-point. + p2p_pl = 3 //!< Point-to-point pipelined. + }; + + /*! + * @struct PruningParams + * @brief Per-axis kept-mode counts for pruned FFTs. + * @tparam Dim Spatial dimension. 
+ */ + template + struct PruningParams { + Vector n_modes{}; + + PruningParams() = default; + + template + explicit PruningParams(const Vec& modes) { + for (unsigned d = 0; d < Dim; ++d) { + n_modes[d] = modes[d]; + } + } + }; + + //! Primary FFT template; specialized per transform tag in FFT/Transform/*. + template + class FFT; + + namespace fft { + + //============================================================================= + // Feature Tags + //============================================================================= + + //! @name FFT backend / library feature tags. + //! Used as template arguments to is_available to compile-time test for + //! optional FFT libraries. + //! @{ + struct FFTW {}; //!< FFTW host library. + struct MKL {}; //!< Intel MKL host FFT. + struct CuFFT {}; //!< NVIDIA cuFFT (single-GPU). + struct RocFFT {}; //!< AMD rocFFT. + struct HeffteGPU {}; //!< heFFTe with any GPU backend enabled. + struct CuFFTMp {}; //!< NVIDIA cuFFTMp (multi-GPU/-node). + struct Finufft {}; //!< Host finufft library. + struct GPUFinufft {}; //!< GPU finufft library. + //! @} + + //============================================================================= + // Unified Feature Detection: is_available + //============================================================================= + + //! Generic `false_type`; specializations below set `value = true` when + //! the corresponding feature is enabled at configure time. 
+ template + struct is_available : std::false_type {}; + +#ifdef Heffte_ENABLE_FFTW + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_MKL + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_CUDA + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_ROCM + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef Heffte_ENABLE_GPU + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef IPPL_ENABLE_CUFFTMP + template <> + struct is_available : std::true_type {}; +#endif + +#ifdef ENABLE_FINUFFT + template <> + struct is_available : std::true_type {}; +#endif + +#if defined(ENABLE_FINUFFT) && defined(ENABLE_GPU_NUFFT) + template <> + struct is_available : std::true_type {}; +#endif + + //! Convenience alias: `is_available_v` is true iff `F` is enabled. + template + inline constexpr bool is_available_v = is_available::value; + + //============================================================================= + // heFFTe Backend Selection by Memory Space + //============================================================================= + + /*! + * @struct HeffteBackend + * @brief Pick the appropriate heFFTe backend types for a memory space. + * + * Provides nested aliases @c c2c, @c sin, @c cos, @c cos1 for the + * complex / sine / cosine / cosine-Type-I transforms. Specializations + * select FFTW / MKL on the host, cuFFT on CUDA, rocFFT on HIP, and + * fall back to the heFFTe stock backend everywhere else. + * + * @tparam MemSpace Kokkos memory space. 
+ */ + template + struct HeffteBackend { + // Default: stock backend + using c2c = heffte::backend::stock; + using sin = heffte::backend::stock_sin; + using cos = heffte::backend::stock_cos; + using cos1 = heffte::backend::stock_cos1; + }; + + // Host: FFTW > MKL > Stock +#if defined(Heffte_ENABLE_FFTW) + template <> + struct HeffteBackend { + using c2c = heffte::backend::fftw; + using sin = heffte::backend::fftw_sin; + using cos = heffte::backend::fftw_cos; + using cos1 = heffte::backend::fftw_cos1; + }; +#elif defined(Heffte_ENABLE_MKL) + template <> + struct HeffteBackend { + using c2c = heffte::backend::mkl; + using sin = heffte::backend::mkl_sin; + using cos = heffte::backend::mkl_cos; + using cos1 = heffte::backend::mkl_cos1; + }; +#endif + +#ifdef KOKKOS_ENABLE_CUDA + template <> + struct HeffteBackend { + using c2c = heffte::backend::cufft; + using sin = heffte::backend::cufft_sin; + using cos = heffte::backend::cufft_cos; + using cos1 = heffte::backend::cufft_cos1; + }; +#endif + +#ifdef KOKKOS_ENABLE_HIP + template <> + struct HeffteBackend { +#ifdef Heffte_ENABLE_ROCM + using c2c = heffte::backend::rocfft; + using sin = heffte::backend::rocfft_sin; + using cos = heffte::backend::rocfft_cos; + using cos1 = heffte::backend::rocfft_cos1; +#else + using c2c = heffte::backend::stock; + using sin = heffte::backend::stock_sin; + using cos = heffte::backend::stock_cos; + using cos1 = heffte::backend::stock_cos1; +#endif + }; +#endif + +#ifdef KOKKOS_ENABLE_SYCL + // No SYCL-specific heFFTe backend wired up yet. Heffte's oneMKL + // backend (Heffte_ENABLE_ONEAPI) would go here once IPPL's + // Dependencies.cmake plumbs it through. For now fall back to the + // stock CPU backend so SYCL builds at least compile (FFTs run on + // the host). 
+ template <> + struct HeffteBackend { + using c2c = heffte::backend::stock; + using sin = heffte::backend::stock_sin; + using cos = heffte::backend::stock_cos; + using cos1 = heffte::backend::stock_cos1; + }; +#endif + + //============================================================================= + // GPU Stream Support + //============================================================================= + + /*! + * @struct Stream + * @brief Minimal stream wrapper used to launch FFT-related kernels. + * + * The primary template is a no-op for memory spaces without a native + * stream concept; specializations below provide CUDA and HIP streams. + * + * @tparam MemSpace Kokkos memory space. + */ + template + struct Stream { + using stream_type = int; //!< Dummy handle for backends without streams. + using exec_space = Kokkos::DefaultExecutionSpace; + + static void create(stream_type&) {} + static void destroy(stream_type&) {} + static void sync(stream_type&) {} + static exec_space instance(stream_type&) { return exec_space(); } + }; + +#ifdef KOKKOS_ENABLE_CUDA + template <> + struct Stream { + using stream_type = cudaStream_t; + using exec_space = Kokkos::Cuda; + + static void create(stream_type& s) { cudaStreamCreate(&s); } + static void destroy(stream_type& s) { cudaStreamDestroy(s); } + static void sync(stream_type& s) { cudaStreamSynchronize(s); } + static exec_space instance(stream_type& s) { return exec_space(s); } + }; +#endif + +#ifdef KOKKOS_ENABLE_HIP + template <> + struct Stream { + using stream_type = hipStream_t; + using exec_space = Kokkos::HIP; + + static void create(stream_type& s) { (void)hipStreamCreate(&s); } + static void destroy(stream_type& s) { (void)hipStreamDestroy(s); } + static void sync(stream_type& s) { (void)hipStreamSynchronize(s); } + static exec_space instance(stream_type& s) { return exec_space(s); } + }; +#endif + + //============================================================================= + // FFTW Trig Scaling + 
//=============================================================================
+
+        /*!
+         * @brief Extra normalization factor applied by FFTW's trig transforms.
+         *
+         * FFTW's sine / cosine transforms include an implicit factor of 8 in
+         * 3D (2 per axis); other backends do not. The trig wrappers multiply
+         * by this to keep the normalization consistent across backends.
+         *
+         * @return 8.0 when the availability trait evaluates to true, 1.0 otherwise.
+         *
+         * NOTE(review): the returned constant does not depend on the
+         * dimension; 8 is the 3D value described above. Confirm how 2D trig
+         * transforms are expected to be normalized.
+         */
+        inline constexpr double fftw_trig_scale() {
+            return is_available_v ? 8.0 : 1.0;
+        }
+    } // namespace fft
+
+} // namespace ippl
+
+// Register Kokkos complex with heFFTe
+// (specializes heFFTe's is_ccomplex / is_zcomplex traits to std::true_type)
+namespace heffte {
+    template <>
+    struct is_ccomplex> : std::true_type {};
+    template <>
+    struct is_zcomplex> : std::true_type {};
+} // namespace heffte
+
+#endif
diff --git a/src/FFT/Transform/CC.h b/src/FFT/Transform/CC.h
new file mode 100644
index 000000000..0330d1c54
--- /dev/null
+++ b/src/FFT/Transform/CC.h
@@ -0,0 +1,102 @@
+/*!
+ * @file CC.h
+ * @brief Complex-to-complex FFT specialization (CCTransform tag).
+ */
+#ifndef IPPL_FFT_TRANSFORM_CC_H
+#define IPPL_FFT_TRANSFORM_CC_H
+
+#include "Utility/ParameterList.h"
+
+#include "Communicate/Communicator.h"
+#include "FFT/Backend/Backend.h"
+#include "FFT/Traits.h"
+#include "FFT/Transform/Common.h"
+
+namespace ippl {
+
+    /*!
+     * @class FFT
+     * @brief In-place complex-to-complex FFT over a complex IPPL Field.
+     *
+     * Selects cuFFTMp when @c IPPL_ENABLE_CUFFTMP is defined, otherwise the
+     * heFFTe C2C backend. The transform is performed on a contiguous
+     * LayoutLeft scratch view; ghost cells are stripped on the way in and
+     * restored on the way out.
+     *
+     * @tparam ComplexField Field whose value_type is a Kokkos::complex.
+     */
+    template
+    class FFT {
+    public:
+        static constexpr unsigned Dim = ComplexField::dim;
+
+        using Complex_t = typename ComplexField::value_type;
+        using T = typename Complex_t::value_type;
+        using MemSpace = typename ComplexField::memory_space;
+        using ExecSpace = typename ComplexField::execution_space;
+        using Layout_t = FieldLayout;
+
+        // The backend type is fixed at compile time by IPPL_ENABLE_CUFFTMP.
+#ifdef IPPL_ENABLE_CUFFTMP
+        using Backend_t = fft::CuFFTMpC2C;
+#else
+        using Backend_t = fft::HeffteC2C;
+#endif
+        using TempView_t = typename Kokkos::View::uniform_type;
+
+        /*!
+         * @brief Build the FFT plan for the local subdomain described by @p layout.
+         * @param layout Field layout giving the local NDIndex and MPI partition.
+         * @param params FFT-tuning parameters forwarded to the backend.
+         */
+        FFT(const Layout_t& layout, const ParameterList& params) {
+            static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D");
+
+            // domainToBounds zero-fills the axes beyond Dim, so low/high are
+            // fully initialized before the box is built.
+            std::array low, high;
+            fft::domainToBounds(layout.getLocalNDIndex(), low, high);
+            heffte::box3d box{low, high};
+
+            // The same local box is handed to the backend for both of its
+            // box arguments.
+            backend_ = std::make_unique(box, box, Comm->getCommunicator(), params);
+        }
+
+        //! Run a forward + backward pair on @p f to JIT compile / cache backend kernels.
+        //! @note @p f goes through transform() twice and is overwritten each
+        //!       time. NOTE(review): whether the round trip restores the
+        //!       original values depends on the backend's normalization --
+        //!       confirm before calling this on a field holding live data.
+        void warmup(ComplexField& f) {
+            transform(FORWARD, f);
+            transform(BACKWARD, f);
+        }
+
+        /*!
+         * @brief In-place FFT of @p f.
+         * @param direction FORWARD or BACKWARD.
+         * @param f Field to transform; modified in place.
+         *
+         * The interior of @p f is copied into the ghost-free scratch buffer,
+         * transformed there (same pointer as input and output), and copied
+         * back. Any @p direction other than FORWARD runs the backward
+         * transform.
+         */
+        void transform(TransformDirection direction, ComplexField& f) {
+            auto view = f.getView();
+            const int ng = f.getNghost();
+
+            ensureTemp(f);
+            fft::copyToTemp(temp_, view, ng);
+
+            if (direction == FORWARD) {
+                backend_->forward(temp_.data(), temp_.data());
+            } else {
+                backend_->backward(temp_.data(), temp_.data());
+            }
+
+            fft::copyFromTemp(view, temp_, ng);
+        }
+
+    private:
+        std::unique_ptr backend_;  //!< Backend plan, created in the constructor.
+        TempView_t temp_;          //!< Ghost-free scratch buffer, allocated lazily by ensureTemp().
+
+        //! (Re)allocate temp_ when its element count differs from the owned
+        //! extent of @p f. Only the total size is compared, not the per-axis
+        //! extents.
+        void ensureTemp(const ComplexField& f) {
+            if (temp_.size() != f.getOwned().size()) {
+                temp_ = detail::shrinkView("fft_cc_temp", f.getView(), f.getNghost());
+            }
+        }
+    };
+
+} // namespace ippl
+
+#endif
diff --git a/src/FFT/Transform/Common.h b/src/FFT/Transform/Common.h
new file mode 100644
index 000000000..891b6b2e0
--- /dev/null
+++ b/src/FFT/Transform/Common.h
@@ -0,0 +1,80 @@
+/*!
+ * @file Common.h
+ * @brief Helpers shared by the FFT transform wrappers.
+ *
+ * Provides domain<->box translation and ghost-cell-aware buffer copies
+ * between BareFields and the LayoutLeft scratch buffers handed to the
+ * underlying FFT backends.
+ */
+#ifndef IPPL_COMMON_H
+#define IPPL_COMMON_H
+
+#include
+
+#include "Expression/IpplOperations.h"
+#include "Utility/ParallelDispatch.h"
+#include "Utility/ViewUtils.h"
+
+namespace ippl::fft {
+    /*!
+     * @brief Convert an IPPL NDIndex domain into low/high index triples.
+     *
+     * Pads unused dimensions (when @p Dim < 3) with 0 so the output is always
+     * length 3 - the size that heFFTe / cuFFT(Mp) box descriptors expect.
+     *
+     * @tparam Dim Active spatial dimension count.
+     * @tparam NDIndex IPPL NDIndex type.
+     * @param domain Input NDIndex describing the local subdomain.
+     * @param low Output: inclusive lower indices per axis.
+     * @param high Output: inclusive upper indices per axis.
+     */
+    template
+    inline void domainToBounds(const NDIndex& domain, std::array& low,
+                               std::array& high) {
+        // Zero-fill first so the axes at index >= Dim stay at 0.
+        low.fill(0);
+        high.fill(0);
+        for (unsigned d = 0; d < Dim; ++d) {
+            low[d] = static_cast(domain[d].first());
+            // Inclusive upper bound taken as first + length - 1.
+            // NOTE(review): this equals the last index only for unit stride -- confirm.
+            high[d] = static_cast(domain[d].first() + domain[d].length() - 1);
+        }
+    }
+
+    /*!
+     * @brief Copy a Field view (with @p n_ghost halo) into a contiguous FFT buffer.
+     *
+     * The input view has ghost cells; the output is expected to be the
+     * exact-size FFT buffer, so each index is shifted by @p n_ghost.
+     *
+     * @tparam ExecSpace Kokkos execution space the launch is dispatched on.
+     * @tparam OutputViewT Destination view type (no ghosts).
+     * @tparam InputViewT Source view type (with ghosts).
+     * @param output Destination buffer, written at (index - @p n_ghost).
+     * @param input Source view, read over the range policy built from it and @p n_ghost.
+     * @param n_ghost Halo width used for both the iteration range and the index shift.
+     */
+    template
+    inline void copyToTemp(OutputViewT& output, const InputViewT& input, int n_ghost) {
+        constexpr unsigned Dim = InputViewT::rank;
+        using index_array_type = typename ippl::RangePolicy::index_array_type;
+        ippl::parallel_for(
+            "FFT_toTemp", ippl::getRangePolicy(input, n_ghost),
+            KOKKOS_LAMBDA(const index_array_type& args) {
+                ippl::apply(output, args - n_ghost) = ippl::apply(input, args);
+            });
+    }
+
+    /*!
+     * @brief Inverse of copyToTemp: scatter a contiguous FFT buffer back into a
+     *        ghost-padded Field view.
+     *
+     * @tparam ExecSpace Kokkos execution space the launch is dispatched on.
+     * @tparam OutputViewT Destination view type (with ghosts).
+     * @tparam InputViewT Source view type (no ghosts).
+     * @param output Destination view; only the indices covered by the range
+     *               policy built from it and @p n_ghost are written.
+     * @param input Source buffer, read at (index - @p n_ghost).
+     * @param n_ghost Halo width used for both the iteration range and the index shift.
+     */
+    template
+    inline void copyFromTemp(OutputViewT& output, const InputViewT& input, int n_ghost) {
+        constexpr unsigned Dim = OutputViewT::rank;
+        using index_array_type = typename ippl::RangePolicy::index_array_type;
+        ippl::parallel_for(
+            "FFT_fromTemp", ippl::getRangePolicy(output, n_ghost),
+            KOKKOS_LAMBDA(const index_array_type& args) {
+                ippl::apply(output, args) = ippl::apply(input, args - n_ghost);
+            });
+    }
+
+} // namespace ippl::fft
+
+#endif // IPPL_COMMON_H
diff --git a/src/FFT/Transform/NUFFT.h b/src/FFT/Transform/NUFFT.h
new file mode 100644
index 000000000..53f64729c
--- /dev/null
+++ b/src/FFT/Transform/NUFFT.h
@@ -0,0 +1,297 @@
+/*!
+ * @file NUFFT.h
+ * @brief Non-uniform FFT specialization (NUFFTransform tag).
+ * + * Provides a single FFT class that can dispatch to the native IPPL NUFFT + * (Kokkos-based ES kernel + heFFTe) or to FINUFFT / cuFINUFFT when those + * libraries are enabled at configure time. + */ +#ifndef IPPL_FFT_TRANSFORM_NUFFT_H +#define IPPL_FFT_TRANSFORM_NUFFT_H + +#include +#include +#include + +#include "Utility/IpplException.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/NUFFT/NativeNUFFT.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +#ifdef ENABLE_FINUFFT +#include +#ifdef ENABLE_GPU_NUFFT +#include +#endif +#endif + +namespace ippl { + + // Forward declaration + template + class ParticleAttrib; + + namespace detail { + +#ifdef ENABLE_FINUFFT + /*! + * @struct FinufftTraits + * @brief Type traits for FINUFFT backend selection. + * + * Provides a unified surface (ComplexType, PlanType, OptsType, + * CountType, plus static makeplan/setpts/execute/destroy wrappers) + * over the CPU @c finufft library and the GPU @c cufinufft library + * so the rest of the code can be written backend-agnostic. + * + * @tparam T Real precision (float or double). 
+ */ + template + struct FinufftTraits; + +#ifdef ENABLE_GPU_NUFFT + template <> + struct FinufftTraits { + using ComplexType = cuFloatComplex; + using PlanType = cufinufftf_plan; + using OptsType = cufinufft_opts; + using CountType = int; // cufinufft uses int for point counts + + static void defaultOpts(OptsType* opts) { cufinufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + float tol, PlanType* plan, OptsType* opts) { + return cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, float* x, float* y, float* z, + CountType N, float* s, float* t, float* u) { + return cufinufftf_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return cufinufftf_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return cufinufftf_destroy(plan); } + }; + + template <> + struct FinufftTraits { + using ComplexType = cuDoubleComplex; + using PlanType = cufinufft_plan; + using OptsType = cufinufft_opts; + using CountType = int; + + static void defaultOpts(OptsType* opts) { cufinufft_default_opts(opts); } + + static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf, + double tol, PlanType* plan, OptsType* opts) { + return cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts); + } + + static int setpts(PlanType plan, CountType M, double* x, double* y, double* z, + CountType N, double* s, double* t, double* u) { + return cufinufft_setpts(plan, M, x, y, z, N, s, t, u); + } + + static int execute(PlanType plan, ComplexType* c, ComplexType* f) { + return cufinufft_execute(plan, c, f); + } + + static int destroy(PlanType plan) { return cufinufft_destroy(plan); } + }; + +#else // CPU FINUFFT + + template <> + struct FinufftTraits { + using ComplexType = std::complex; + using PlanType = finufftf_plan; + using OptsType = finufft_opts; + using 
CountType = int64_t;
+
+            static void defaultOpts(OptsType* opts) { finufft_default_opts(opts); }
+
+            static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf,
+                                float tol, PlanType* plan, OptsType* opts) {
+                return finufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts);
+            }
+
+            static int setpts(PlanType plan, CountType M, float* x, float* y, float* z,
+                              CountType N, float* s, float* t, float* u) {
+                return finufftf_setpts(plan, M, x, y, z, N, s, t, u);
+            }
+
+            static int execute(PlanType plan, ComplexType* c, ComplexType* f) {
+                return finufftf_execute(plan, c, f);
+            }
+
+            static int destroy(PlanType plan) { return finufftf_destroy(plan); }
+        };
+
+        // CPU FINUFFT bindings taking double-valued tolerances and
+        // coordinates; each wrapper forwards its arguments unchanged to the
+        // matching finufft_* call and returns that call's status code.
+        template <>
+        struct FinufftTraits {
+            using ComplexType = std::complex;
+            using PlanType = finufft_plan;
+            using OptsType = finufft_opts;
+            using CountType = int64_t;
+
+            static void defaultOpts(OptsType* opts) { finufft_default_opts(opts); }
+
+            static int makeplan(int type, int dim, int64_t* nmodes, int iflag, int ntransf,
+                                double tol, PlanType* plan, OptsType* opts) {
+                return finufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, plan, opts);
+            }
+
+            static int setpts(PlanType plan, CountType M, double* x, double* y, double* z,
+                              CountType N, double* s, double* t, double* u) {
+                return finufft_setpts(plan, M, x, y, z, N, s, t, u);
+            }
+
+            static int execute(PlanType plan, ComplexType* c, ComplexType* f) {
+                return finufft_execute(plan, c, f);
+            }
+
+            static int destroy(PlanType plan) { return finufft_destroy(plan); }
+        };
+
+#endif // ENABLE_GPU_NUFFT
+#endif // ENABLE_FINUFFT
+
+    } // namespace detail
+
+    /*!
+     * @class FFT
+     * @brief Non-uniform FFT for IPPL particles + uniform Fourier modes.
+     *
+     * Supports both the native IPPL implementation and the FINUFFT
+     * (CPU / GPU) backend. The backend is chosen at run time by the
+     * @c "use_finufft" parameter key (@c "lock_method" is forwarded to the
+     * native scatter configuration) and by the configure macros
+     * @c ENABLE_FINUFFT and @c ENABLE_GPU_NUFFT.
+ * + * Type 1: non-uniform points -> uniform grid (spreading / adjoint). + * Type 2: uniform grid -> non-uniform points (interpolation). + * + * @tparam RealField IPPL Field of real values used for grid sizing. + */ + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = + typename Field::uniform_type; + + using NativeNUFFT_t = nufft::NativeNUFFT; + +#ifdef ENABLE_FINUFFT + using Traits_t = detail::FinufftTraits; + using FinufftComplex_t = typename Traits_t::ComplexType; + using FinufftPlan_t = typename Traits_t::PlanType; + using FinufftOpts_t = typename Traits_t::OptsType; + using FinufftCount_t = typename Traits_t::CountType; +#endif + + private: + // Configuration + int type_m; + T tol_m; + bool useFinufft_m; + bool useUpsampledInputs_m; + bool useR2C_m; + int r2cDir_m; + bool lockMethod_m; + + std::array nModes_m{1, 1, 1}; + + // Native NUFFT backend + std::unique_ptr nativeNufft_m; + +#ifdef ENABLE_FINUFFT + // FINUFFT backend + FinufftPlan_t finufftPlan_m{}; + + // Temporary buffers for FINUFFT + using FieldViewType = Kokkos::View; + using ParticleRealView = Kokkos::View; + using ParticleCplxView = Kokkos::View; + + FieldViewType tempField_m; + std::array tempR_m; + ParticleCplxView tempQ_m; +#endif + + public: + /** + * @brief Construct NUFFT transform + * + * @param layout Field layout + * @param localNp Local number of particles + * @param type Transform type (1 or 2) + * @param params Configuration parameters + */ + FFT(const Layout_t& layout, detail::size_type localNp, int type, + const ParameterList& params); + + ~FFT(); + + // Non-copyable, non-movable (due to FINUFFT plan) + FFT(const FFT&) = delete; + FFT& operator=(const FFT&) = delete; + FFT(FFT&&) = delete; + FFT& 
operator=(FFT&&) = delete; + + /*! + * @brief Execute NUFFT transform. + * + * Type 1: Spreads particle data @p Q at positions @p R onto field @p f. + * Type 2: Interpolates field @p f to positions @p R, storing results in @p Q. + * + * @param R Particle positions. + * @param Q Particle scalar values (input on Type 1, output on Type 2). + * @param f Complex field on a uniform grid (output on Type 1, input on Type 2). + */ + template + void transform(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + + //! Native (Kokkos / heFFTe) NUFFT path. + //! @note Public because NVCC's extended lambda support disallows + //! lambdas inside private templated member functions. + template + void transformNative(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + + //! FINUFFT / cuFINUFFT NUFFT path. Same NVCC public-visibility caveat. + template + void transformFinufft(const ParticleAttrib, Properties...>& R, + ParticleAttrib& Q, ComplexField& f); + private: + //! Pick a backend (native vs. finufft) based on @p params and lock_method. + void initBackend(const Layout_t& layout, const ParameterList& params); + //! Build the native NUFFT engine. + void initNative(const Layout_t& layout, const ParameterList& params); + //! Tear down whichever backend is currently allocated. + void cleanupBackend(); + + //! Build the FINUFFT plan and configure tolerances/options. + void initFinufft(const ParameterList& params); + //! Allocate the LayoutLeft scratch views FINUFFT consumes. + void allocateFinufftBuffers(const Layout_t& layout, detail::size_type localNp); + }; + +} // namespace ippl + +#include "FFT/Transform/NUFFT.hpp" + +#endif // IPPL_FFT_TRANSFORM_NUFFT_H diff --git a/src/FFT/Transform/NUFFT.hpp b/src/FFT/Transform/NUFFT.hpp new file mode 100644 index 000000000..64289efd1 --- /dev/null +++ b/src/FFT/Transform/NUFFT.hpp @@ -0,0 +1,558 @@ +/*! + * @file NUFFT.hpp + * @brief Implementation of FFT declared in NUFFT.h. 
+ * + * Contains the Kokkos functors used to copy particles / fields between IPPL + * and FINUFFT-friendly buffers (with optional fftshift) plus the dispatch + * to the native and FINUFFT backends. + */ +#ifndef IPPL_FFT_TRANSFORM_NUFFT_HPP +#define IPPL_FFT_TRANSFORM_NUFFT_HPP + +namespace ippl { + + namespace detail { + + //===================================================================== + // Functors for NUFFT operations (must be outside class for NVCC) + //===================================================================== + + /** + * @brief Functor to scale particle positions by a factor + */ + template + struct ScalePositionsFunctor { + RView Rview_m; + Vector scale_m; + + ScalePositionsFunctor(RView Rview, Vector scale) + : Rview_m(Rview) + , scale_m(scale) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { + for (unsigned d = 0; d < Dim; ++d) { + Rview_m(i)[d] *= scale_m[d]; + } + } + }; + +#ifdef ENABLE_FINUFFT + + /** + * @brief Functor to copy field data to FINUFFT temp buffer with optional ifftshift + */ + template + struct CopyFieldToTempFunctor { + FieldView fview_m; + TempFieldView tempField_m; + int nghost_m; + int nx_m, ny_m, nz_m; + bool applyShift_m; + + CopyFieldToTempFunctor(FieldView fview, TempFieldView tempField, int nghost, int nx = 0, + int ny = 0, int nz = 0, bool applyShift = false) + : fview_m(fview) + , tempField_m(tempField) + , nghost_m(nghost) + , nx_m(nx) + , ny_m(ny) + , nz_m(nz) + , applyShift_m(applyShift) {} + + KOKKOS_INLINE_FUNCTION void operator()(int i, int j, int k) const { + int li = i - nghost_m; + int lj = j - nghost_m; + int lk = k - nghost_m; + + int di = li, dj = lj, dk = lk; + if (applyShift_m) { + di = (li + nx_m / 2) % nx_m; + dj = (lj + ny_m / 2) % ny_m; + dk = (lk + nz_m / 2) % nz_m; + } + + auto& dst = tempField_m(di, dj, dk); + auto src = fview_m(i, j, k); +#ifdef ENABLE_GPU_NUFFT + dst.x = src.real(); + dst.y = src.imag(); +#else + dst.real(src.real()); + dst.imag(src.imag()); +#endif 
+ } + }; + + /** + * @brief Functor to copy field data from FINUFFT temp buffer with fftshift + */ + template + struct CopyFieldFromTempFunctor { + FieldView fview_m; + TempFieldView tempField_m; + int nghost_m; + int nx_m, ny_m, nz_m; + + CopyFieldFromTempFunctor(FieldView fview, TempFieldView tempField, int nghost, int nx, + int ny, int nz) + : fview_m(fview) + , tempField_m(tempField) + , nghost_m(nghost) + , nx_m(nx) + , ny_m(ny) + , nz_m(nz) {} + + KOKKOS_INLINE_FUNCTION void operator()(int i, int j, int k) const { + int li = i - nghost_m; + int lj = j - nghost_m; + int lk = k - nghost_m; + + int si = (li + nx_m / 2) % nx_m; + int sj = (lj + ny_m / 2) % ny_m; + int sk = (lk + nz_m / 2) % nz_m; + + auto src = tempField_m(si, sj, sk); +#ifdef ENABLE_GPU_NUFFT + fview_m(i, j, k).real() = src.x; + fview_m(i, j, k).imag() = src.y; +#else + fview_m(i, j, k).real() = src.real(); + fview_m(i, j, k).imag() = src.imag(); +#endif + } + }; + + /** + * @brief Functor to copy particle data to FINUFFT temp buffers + */ + template + struct CopyParticlesToTempFunctor { + RView Rview_m; + QView Qview_m; + TempRView tempRx_m, tempRy_m, tempRz_m; + TempQView tempQ_m; + Vector scale_m; + + CopyParticlesToTempFunctor(RView Rview, QView Qview, TempRView tempRx, TempRView tempRy, + TempRView tempRz, TempQView tempQ, Vector scale) + : Rview_m(Rview) + , Qview_m(Qview) + , tempRx_m(tempRx) + , tempRy_m(tempRy) + , tempRz_m(tempRz) + , tempQ_m(tempQ) + , scale_m(scale) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { + tempRx_m(i) = Rview_m(i)[0] * scale_m[0]; + tempRy_m(i) = Rview_m(i)[1] * scale_m[1]; + tempRz_m(i) = Rview_m(i)[2] * scale_m[2]; + +#ifdef ENABLE_GPU_NUFFT + tempQ_m(i).x = Qview_m(i); + tempQ_m(i).y = T(0); +#else + tempQ_m(i).real(Qview_m(i)); + tempQ_m(i).imag(T(0)); +#endif + } + }; + + /** + * @brief Functor to copy particle data from FINUFFT temp buffer + */ + template + struct CopyParticlesFromTempFunctor { + QView Qview_m; + TempQView tempQ_m; 
+ + CopyParticlesFromTempFunctor(QView Qview, TempQView tempQ) + : Qview_m(Qview) + , tempQ_m(tempQ) {} + + KOKKOS_INLINE_FUNCTION void operator()(std::size_t i) const { +#ifdef ENABLE_GPU_NUFFT + Qview_m(i) = tempQ_m(i).x; +#else + Qview_m(i) = tempQ_m(i).real(); +#endif + } + }; + +#endif // ENABLE_FINUFFT + + } // namespace detail + + //========================================================================= + // Constructor / Destructor + //========================================================================= + + template + FFT::FFT(const Layout_t& layout, detail::size_type localNp, int type, + const ParameterList& params) + : type_m(type) + , tol_m(params.get("tolerance", T(1e-6))) + , useFinufft_m(params.get("use_finufft", false)) + , useUpsampledInputs_m(params.get("use_upsampled_inputs", false)) + , useR2C_m(params.get("use_r2c", false)) + , r2cDir_m(params.get("r2c_direction", 0)) + , lockMethod_m(params.get("lock_method", false)) { + const auto& domain = layout.getDomain(); + for (unsigned d = 0; d < Dim; ++d) { + nModes_m[d] = domain[d].length(); + } + +#ifdef ENABLE_FINUFFT + // allocateFinufftBuffers and the rest of the FINUFFT/cuFINUFFT + // pipeline are hardcoded for 3D (lDom[2], Rank<3> MDRangePolicy, ...). + // For Dim != 3 we never call into them -- the native path handles + // 2D NUFFT -- so we also skip the buffer allocation here, which + // would otherwise read past the local NDIndex for Dim < 3. 
+ if constexpr (Dim == 3) { + allocateFinufftBuffers(layout, localNp); + } else { + (void)layout; + (void)localNp; + } +#else + (void)localNp; +#endif + + initBackend(layout, params); + } + + template + FFT::~FFT() { + cleanupBackend(); + } + + //========================================================================= + // Backend Initialization + //========================================================================= + + template + void FFT::initBackend(const Layout_t& layout, + const ParameterList& params) { + // FINUFFT path is 3D-only (see FFT::transform()); fall back to the + // native backend for any other Dim so the FINUFFT-enabled 2D unit + // tests don't tear down on the plan setup. + if constexpr (Dim == 3 && fft::is_available_v) { + if (useFinufft_m) { + initFinufft(params); + return; + } + } + + initNative(layout, params); + } + + template + void FFT::initNative(const Layout_t& layout, + const ParameterList& params) { + Vector nModesVec; + for (unsigned d = 0; d < Dim; ++d) { + nModesVec[d] = nModes_m[d]; + } + + typename NativeNUFFT_t::Config cfg; + cfg.tol = tol_m; + cfg.sigma = params.get("sigma", T(2.0)); + + cfg.scatter_config = Interpolation::ScatterConfig::template get_default(); + cfg.gather_config = Interpolation::GatherConfig::template get_default(); + + cfg.scatter_config.lock_method = lockMethod_m; + + std::string spreadMethod = params.get("spread_method", "none"); + if (spreadMethod == "atomic") { + cfg.scatter_config.method = Interpolation::ScatterMethod::Atomic; + } else if (spreadMethod == "output_focused" + || spreadMethod == "output_focused_zbatched") { + // The "_zbatched" alias is kept so old test parameter sets still + // resolve; both map to OutputFocused (the z_batches knob lives on + // ScatterConfig, not in the method enum). 
+ cfg.scatter_config.method = Interpolation::ScatterMethod::OutputFocused; + } else if (spreadMethod == "tiled") { + cfg.scatter_config.method = Interpolation::ScatterMethod::Tiled; + } + + std::string gatherMethod = params.get("gather_method", "none"); + if (gatherMethod == "atomic") { + cfg.gather_config.method = Interpolation::GatherMethod::Atomic; + } else if (gatherMethod == "atomic_sort") { + cfg.gather_config.method = Interpolation::GatherMethod::AtomicSort; + } + + if (params.contains("tile_size_3d")) { + cfg.scatter_config.tile_size.fill(params.get("tile_size_3d")); + } + if (params.contains("team_size")) { + cfg.scatter_config.team_size = params.get("team_size"); + cfg.gather_config.team_size = params.get("team_size"); + } + + nativeNufft_m = std::make_unique(nModesVec, useUpsampledInputs_m, cfg); + nativeNufft_m->initialize(layout, Comm->getCommunicator()); + } + + template + void FFT::cleanupBackend() { +#ifdef ENABLE_FINUFFT + if (useFinufft_m && finufftPlan_m) { + Traits_t::destroy(finufftPlan_m); + finufftPlan_m = FinufftPlan_t{}; + } +#endif + } + + template + void FFT::allocateFinufftBuffers(const Layout_t& layout, + detail::size_type localNp) { +#ifdef ENABLE_FINUFFT + const auto& lDom = layout.getLocalNDIndex(); + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + + for (unsigned d = 0; d < Dim; ++d) { + Kokkos::realloc(tempR_m[d], localNp); + } + Kokkos::realloc(tempQ_m, localNp); +#else + (void)layout; + (void)localNp; + + throw std::runtime_error("FINUFFT is not activated. 
Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); +#endif + } + + template + void FFT::initFinufft(const ParameterList& params) { +#ifdef ENABLE_FINUFFT + FinufftOpts_t opts; + Traits_t::defaultOpts(&opts); + +#ifdef ENABLE_GPU_NUFFT + opts.gpu_method = params.get("gpu_method", opts.gpu_method); + opts.gpu_sort = params.get("gpu_sort", opts.gpu_sort); + opts.gpu_kerevalmeth = params.get("gpu_kerevalmeth", opts.gpu_kerevalmeth); + opts.gpu_binsizex = params.get("gpu_binsizex", opts.gpu_binsizex); + opts.gpu_binsizey = params.get("gpu_binsizey", opts.gpu_binsizey); + opts.gpu_binsizez = params.get("gpu_binsizez", opts.gpu_binsizez); + opts.gpu_maxsubprobsize = params.get("gpu_maxsubprobsize", opts.gpu_maxsubprobsize); + opts.gpu_maxbatchsize = 0; +#else + opts.spread_sort = params.get("spread_sort", opts.spread_sort); + opts.spread_kerevalmeth = params.get("spread_kerevalmeth", opts.spread_kerevalmeth); + opts.nthreads = params.get("nthreads", opts.nthreads); +#endif + + int iflag = (type_m == 1) ? -1 : 1; + int dim = static_cast(Dim); + + int err = Traits_t::makeplan(type_m, dim, nModes_m.data(), iflag, 1, tol_m, &finufftPlan_m, + &opts); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT makeplan failed"); + } +#else + (void)params; + throw std::runtime_error("FINUFFT is not activated. Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); +#endif // ENABLE_FINUFFT + } + + //========================================================================= + // Transform Dispatch + //========================================================================= + + template + template + void FFT::transform( + const ParticleAttrib, Properties...>& R, ParticleAttrib& Q, + ComplexField& f) { + // The FINUFFT/cuFINUFFT path is hardcoded for Dim == 3 (3D mode-grid + // scratch + Rank<3> MDRangePolicy in transformFinufft). 
For Dim != 3
+        // we skip the constexpr branch entirely so the 3D-only template body
+        // is never instantiated -- otherwise the unit tests that exercise the
+        // 2D type instantiations of FFT fail to compile
+        // under a FINUFFT-enabled build.
+        if constexpr (Dim == 3 && fft::is_available_v) {
+            if (useFinufft_m) {
+                transformFinufft(R, Q, f);
+                return;
+            }
+        }
+
+        transformNative(R, Q, f);
+    }
+
+    //=========================================================================
+    // Native NUFFT Transform
+    //=========================================================================
+
+    /*!
+     * @brief Run the native NUFFT with positions rescaled to a 2*pi box.
+     *
+     * Each position component is multiplied in place by 2*pi / L before the
+     * native engine runs and by L / (2*pi) afterwards. L is the mesh spacing
+     * times the number of grid points along that axis; along the r2c axis
+     * (when use_r2c is set) the point count is expanded to 2 * (n - 1).
+     *
+     * @param R Particle positions. They are written through their view for
+     *          the duration of the call and scaled back before returning.
+     * @param Q Particle values handed to the native type-1 / type-2 call.
+     * @param f Complex field handed to the native type-1 / type-2 call.
+     * @throws IpplException if the transform type is neither 1 nor 2. The
+     *         check runs before any position is modified.
+     */
+    template
+    template
+    void FFT::transformNative(
+        const ParticleAttrib, Properties...>& R, ParticleAttrib& Q,
+        ComplexField& f) {
+        // Validate the transform type up front: the positions are rescaled
+        // in place below, so throwing after that step would leave the
+        // caller's particles in scaled coordinates.
+        if (type_m != 1 && type_m != 2) {
+            throw IpplException("FFT", "Only type 1 and type 2 NUFFT supported");
+        }
+
+        const auto localNp = R.getParticleCount();
+        const auto& layout = f.getLayout();
+        const auto& mesh = f.get_mesh();
+        const auto& dx = mesh.getMeshSpacing();
+        const auto& domain = layout.getDomain();
+
+        // Physical box length per axis.
+        Vector Len;
+        for (unsigned d = 0; d < Dim; ++d) {
+            int fullLength = domain[d].length();
+            if (useR2C_m && static_cast(d) == r2cDir_m) {
+                // Expand the r2c axis from n to 2 * (n - 1) points.
+                fullLength = 2 * (fullLength - 1);
+            }
+            Len[d] = dx[d] * fullLength;
+        }
+
+        constexpr T twoPi = T(2.0 * M_PI);
+        auto Rview = R.getView();
+
+        Vector scaleToTwoPi, scaleBack;
+        for (unsigned d = 0; d < Dim; ++d) {
+            scaleToTwoPi[d] = twoPi / Len[d];
+            scaleBack[d] = Len[d] / twoPi;
+        }
+
+        // The functor multiplies through Rview, i.e. it modifies the
+        // particle positions even though R is passed by const reference.
+        using ScaleFunctor = detail::ScalePositionsFunctor;
+        Kokkos::parallel_for("NUFFT_scale_to_2pi", Kokkos::RangePolicy(0, localNp),
+                             ScaleFunctor(Rview, scaleToTwoPi));
+
+        // type_m is known to be 1 or 2 here.
+        if (type_m == 1) {
+            nativeNufft_m->type1(R, Q, f, useUpsampledInputs_m);
+        } else {
+            nativeNufft_m->type2(f, R, Q, useUpsampledInputs_m);
+        }
+
+        // Undo the rescaling so the caller sees physical coordinates again.
+        Kokkos::parallel_for("NUFFT_scale_back", Kokkos::RangePolicy(0, localNp),
+                             ScaleFunctor(Rview, scaleBack));
+    }
+
+    //=========================================================================
+    // FINUFFT Transform
+
//========================================================================= + template + template + void FFT::transformFinufft( + const ParticleAttrib, Properties...>& R, ParticleAttrib& Q, + ComplexField& f) { +#ifdef ENABLE_FINUFFT + const auto localNp = R.getParticleCount(); + const auto& layout = f.getLayout(); + const auto& mesh = f.get_mesh(); + const auto& dx = mesh.getMeshSpacing(); + const auto& domain = layout.getDomain(); + const int nghost = f.getNghost(); + + Vector Len; + for (unsigned d = 0; d < Dim; ++d) { + Len[d] = dx[d] * domain[d].length(); + } + + constexpr T twoPi = T(2.0 * M_PI); + + auto fview = f.getView(); + auto Rview = R.getView(); + auto Qview = Q.getView(); + + const auto& lDom = layout.getLocalNDIndex(); + if (tempField_m.extent(0) != static_cast(lDom[0].length()) + || tempField_m.extent(1) != static_cast(lDom[1].length()) + || tempField_m.extent(2) != static_cast(lDom[2].length())) { + Kokkos::realloc(tempField_m, lDom[0].length(), lDom[1].length(), lDom[2].length()); + } + + if (tempQ_m.extent(0) < localNp) { + Kokkos::realloc(tempQ_m, localNp); + } + + for (unsigned d = 0; d < Dim; ++d) { + if (tempR_m[d].extent(0) < localNp) { + Kokkos::realloc(tempR_m[d], localNp); + } + } + + auto tempField = tempField_m; + auto tempQ = tempQ_m; + auto tempRx = tempR_m[0]; + auto tempRy = tempR_m[1]; + auto tempRz = tempR_m[2]; + + Vector scale; + for (unsigned d = 0; d < Dim; ++d) { + scale[d] = twoPi / Len[d]; + } + + using mdrange_type = Kokkos::MDRangePolicy>; + using CopyToTemp = detail::CopyFieldToTempFunctor; + + int nx = lDom[0].length(); + int ny = lDom[1].length(); + int nz = lDom[2].length(); + bool needShift = (type_m == 2); + + Kokkos::parallel_for( + "FINUFFT_copy_field_to_temp", + mdrange_type({nghost, nghost, nghost}, {static_cast(fview.extent(0)) - nghost, + static_cast(fview.extent(1)) - nghost, + static_cast(fview.extent(2)) - nghost}), + CopyToTemp(fview, tempField, nghost, nx, ny, nz, needShift)); + + using CopyParticles = + 
detail::CopyParticlesToTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_particles_to_temp", localNp, + CopyParticles(Rview, Qview, tempRx, tempRy, tempRz, tempQ, scale)); + + Kokkos::fence(); + + int err = Traits_t::setpts(finufftPlan_m, static_cast(localNp), + tempRx.data(), tempRy.data(), tempRz.data(), FinufftCount_t{0}, + nullptr, nullptr, nullptr); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT setpts failed"); + } + + err = Traits_t::execute(finufftPlan_m, tempQ.data(), tempField.data()); + + if (err != 0) { + throw IpplException("FFT", "FINUFFT execute failed"); + } + + Kokkos::fence(); + + if (type_m == 1) { + using CopyFromTemp = + detail::CopyFieldFromTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_field_from_temp", + mdrange_type({nghost, nghost, nghost}, + {static_cast(fview.extent(0)) - nghost, + static_cast(fview.extent(1)) - nghost, + static_cast(fview.extent(2)) - nghost}), + CopyFromTemp(fview, tempField, nghost, nx, ny, nz)); + } else if (type_m == 2) { + using CopyBack = detail::CopyParticlesFromTempFunctor; + + Kokkos::parallel_for("FINUFFT_copy_particles_from_temp", localNp, + CopyBack(Qview, tempQ)); + } +#else + throw std::runtime_error("FINUFFT is not activated. Rebuild with -DIPPL_ENABLE_FINUFFT=ON"); + (void)R; + (void)Q; + (void)f; +#endif // ENABLE_FINUFFT + } +} // namespace ippl + +#endif // IPPL_FFT_TRANSFORM_NUFFT_HPP diff --git a/src/FFT/Transform/PrunedCC.h b/src/FFT/Transform/PrunedCC.h new file mode 100644 index 000000000..23dc12c3e --- /dev/null +++ b/src/FFT/Transform/PrunedCC.h @@ -0,0 +1,567 @@ +/*! + * @file PrunedCC.h + * @brief Pruned complex-to-complex FFT (PrunedCCTransform tag). + * + * Pruned FFTs keep only the lowest @c n_modes modes per axis. The + * implementation runs @c 2^Dim sub-FFTs corresponding to all subsets of the + * pruned axes, with optional concurrent execution on independent streams / + * MPI communicator duplicates. 
+ */ +#ifndef IPPL_FFT_TRANSFORM_PRUNEDCC_H +#define IPPL_FFT_TRANSFORM_PRUNEDCC_H + +#include +#include +#include + +#include "Utility/IpplTimings.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + + namespace detail { + /*! + * @brief Run @p func(local) for @c local in [0, count) with optional + * OpenMP outer parallelism. + * + * On CPU execution spaces the loop is serial (Kokkos already + * parallelizes the inner loops); on GPU execution spaces an OpenMP + * outer loop is used so independent stream launches can overlap. + * + * @tparam IsCPU True for CPU execution spaces (selects the serial outer + * loop); false selects the OpenMP outer loop used for GPUs. + */ + template + inline void runConcurrentBatch(int count, Func&& func) { + if constexpr (IsCPU) { + // Serial outer loop for CPU (Kokkos will parallelize the inner loops) + for (int local = 0; local < count; ++local) { + func(local); + } + } else { + // OpenMP outer loop for GPU (to overlap asynchronous stream launches) +#if defined(_OPENMP) +#pragma omp parallel for +#endif + for (int local = 0; local < count; ++local) { + func(local); + } + } + } + } // namespace detail + + //========================================================================= + // Pruned Complex-to-Complex Transform + //========================================================================= + + /*! + * @class FFT + * @brief Pruned C2C FFT keeping only the lowest n_modes per axis. + * + * Runs @c 2^Dim sub-FFTs (one per subset of axes) in batches, reusing one + * heFFTe plan and one MPI communicator duplicate per concurrent sub-FFT. + * The number of concurrent sub-FFTs is taken from the + * @c "num_concurrent_ffts" parameter (default 4), clamped to [1, 2^Dim]. + * + * @tparam ComplexField IPPL Field of Kokkos::complex elements. 
+ */ + template + class FFT { + public: + static constexpr unsigned Dim = ComplexField::dim; + static constexpr int NumSubFFTs = 1 << Dim; + + using Complex_t = typename ComplexField::value_type; + using T = typename Complex_t::value_type; + using MemSpace = typename ComplexField::memory_space; + using ExecSpace = typename ComplexField::execution_space; + using Layout_t = FieldLayout; + + using Backend_t = fft::HeffteC2C; + using GPUOps = fft::Stream; + using Stream_t = typename GPUOps::stream_type; + using DeviceExec = typename GPUOps::exec_space; + using TempView_t = typename Kokkos::View::uniform_type; + + /*! + * @brief Build the pruned plan over the smaller of the two layouts. + * @param layoutIn Pre-pruning input layout. + * @param layoutOut Post-pruning output layout. + * @param pruning Per-axis number of modes to retain. + * @param params Backend parameters; reads @c "num_concurrent_ffts". + */ + FFT(const Layout_t& layoutIn, const Layout_t& layoutOut, const PruningParams& pruning, + const ParameterList& params) + : pruning_(pruning) + , numConcurrent_(std::clamp(params.get("num_concurrent_ffts", 4), 1, NumSubFFTs)) { + static_assert(Dim == 2 || Dim == 3, "Pruned FFT supports 2D and 3D"); + + auto& prunedLayout = + (layoutOut.getLocalNDIndex().size() < layoutIn.getLocalNDIndex().size()) ? layoutOut + : layoutIn; + + std::array low, high; + fft::domainToBounds(prunedLayout.getLocalNDIndex(), low, high); + heffte::box3d box{low, high}; + + for (int s = 0; s < numConcurrent_; ++s) { + MPI_Comm_dup(Comm->getCommunicator(), &comms_[s]); + GPUOps::create(streams_[s]); + backends_[s] = std::make_unique(box, box, comms_[s], params); + } + } + + ~FFT() { + for (int s = 0; s < numConcurrent_; ++s) { + GPUOps::destroy(streams_[s]); + MPI_Comm_free(&comms_[s]); + } + } + + /*! + * @brief Pruned forward / backward C2C transform. + * @param direction FORWARD or BACKWARD. + * @param input Pre-pruning input field. + * @param output Post-pruning output field. 
+ * @param dir +1 / -1 swap-direction flag forwarded to the kernels. + */ + void transform(TransformDirection direction, ComplexField& input, ComplexField& output, + int dir = 1) { + if (direction == FORWARD) { + forwardPruned(dir, input, output); + } else { + backwardPruned(dir, input, output); + } + } + + //! Forward pruned C2C kernel implementation (defined out-of-class below). + void forwardPruned(int dir, ComplexField& input, ComplexField& output); + //! Backward pruned C2C kernel implementation (defined out-of-class below). + void backwardPruned(int dir, ComplexField& input, ComplexField& output); + + private: + PruningParams pruning_; + int numConcurrent_; + + std::array, NumSubFFTs> backends_; + std::array temps_; + std::array comms_{}; + std::array streams_{}; + }; + + //------------------------------------------------------------------------- + // Forward Pruned C2C Implementation + //------------------------------------------------------------------------- + + template + void FFT::forwardPruned(int dir, ComplexField& input, + ComplexField& output) { + static IpplTimings::TimerRef twiddleTimer = IpplTimings::getTimer("TwiddleAdd"); + static IpplTimings::TimerRef subFFTTimer = IpplTimings::getTimer("subFFTs"); + + auto inView = input.getView(); + auto outView = output.getView(); + const int ngIn = input.getNghost(); + const int ngOut = output.getNghost(); + + const auto& lDomPruned = output.getLayout().getLocalNDIndex(); + const auto& gDomFull = input.getLayout().getDomain(); + const auto& modes = pruning_.n_modes; + + // Ensure temps + for (int s = 0; s < numConcurrent_; ++s) { + if (temps_[s].size() != output.getOwned().size()) { + temps_[s] = detail::shrinkView("pruned_temp_" + std::to_string(s), outView, ngOut); + } + } + + Kokkos::deep_copy(outView, Complex_t(0, 0)); + + double scale = 1.0; + if (dir == 1) { + for (unsigned d = 0; d < Dim; ++d) { + scale *= double(modes[d]) / double(gDomFull[d].length()); + } + } + + std::array, NumSubFFTs> offsets; + 
for (int k = 0; k < NumSubFFTs; ++k) { + for (unsigned d = 0; d < Dim; ++d) { + offsets[k][d] = (k >> d) & 1; + } + } + + Vector localFirst; + for (unsigned d = 0; d < Dim; ++d) { + localFirst[d] = lDomPruned[d].first(); + } + + auto owned = output.getOwned(); + const int numBatches = (NumSubFFTs + numConcurrent_ - 1) / numConcurrent_; + + constexpr bool is_cpu = false +#ifdef KOKKOS_ENABLE_SERIAL + || std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_OPENMP + || std::is_same_v +#endif + ; + + const long ext0 = static_cast(owned[0].length()); + const long ext1 = static_cast(owned[1].length()); + [[maybe_unused]] const long ext2 = + (Dim == 3) ? static_cast(owned[Dim == 3 ? 2 : 0].length()) : 1L; + + for (int batch = 0; batch < numBatches; ++batch) { + const int start = batch * numConcurrent_; + const int end = std::min(start + numConcurrent_, NumSubFFTs); + const int count = end - start; + + IpplTimings::startTimer(subFFTTimer); + + // Using the wrapper to evaluate thread parallelism safely + detail::runConcurrentBatch(count, [&](int local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + if constexpr (Dim == 3) { + auto copy_lambda = KOKKOS_LAMBDA(int i0, int i1, int i2) { + int si = i0 * 2 + int(offs[0]) + ngIn; + int sj = i1 * 2 + int(offs[1]) + ngIn; + int sk = i2 * 2 + int(offs[2]) + ngIn; + temp(i0, i1, i2) = inView(si, sj, sk); + }; + if constexpr (is_cpu) { + Kokkos::parallel_for( + "strided_copy_forward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {ext0, ext1, ext2}), + copy_lambda); + } else { + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for( + "strided_copy_forward", + Kokkos::MDRangePolicy>( + exec, {0, 0, 0}, {ext0, ext1, ext2}), + copy_lambda); + GPUOps::sync(streams_[local]); + } + } else { + auto copy_lambda = KOKKOS_LAMBDA(int i0, int i1) { + int si = i0 * 2 + int(offs[0]) + ngIn; + int sj = i1 * 2 + int(offs[1]) + ngIn; + temp(i0, i1) = inView(si, sj); + }; + if constexpr (is_cpu) { + 
Kokkos::parallel_for( + "strided_copy_forward", + Kokkos::MDRangePolicy>({0, 0}, + {ext0, ext1}), + copy_lambda); + } else { + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for( + "strided_copy_forward", + Kokkos::MDRangePolicy>(exec, {0, 0}, + {ext0, ext1}), + copy_lambda); + GPUOps::sync(streams_[local]); + } + } + + if (dir == 1) { + backends_[local]->forward(temp.data(), temp.data()); + } else { + backends_[local]->backward(temp.data(), temp.data()); + } + }); + + Kokkos::fence(); + IpplTimings::stopTimer(subFFTTimer); + + IpplTimings::startTimer(twiddleTimer); + + for (int local = 0; local < count; ++local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + const long g0 = static_cast(gDomFull[0].length()); + const long g1 = static_cast(gDomFull[1].length()); + const long g2 = + (Dim == 3) ? static_cast(gDomFull[Dim == 3 ? 2 : 0].length()) : 1L; + const long m0 = static_cast(modes[0]); + const long m1 = static_cast(modes[1]); + const long m2 = (Dim == 3) ? static_cast(modes[Dim == 3 ? 2 : 0]) : 1L; + const int lf0 = localFirst[0]; + const int lf1 = localFirst[1]; + const int lf2 = (Dim == 3) ? localFirst[Dim == 3 ? 2 : 0] : 0; + + if constexpr (Dim == 3) { + Kokkos::parallel_for( + "twiddle_add_forward", + Kokkos::MDRangePolicy>( + {ngOut, ngOut, ngOut}, + {int(outView.extent(0)) - ngOut, int(outView.extent(1)) - ngOut, + int(outView.extent(2)) - ngOut}), + KOKKOS_LAMBDA(int i, int j, int kk) { + int gi = i - ngOut + lf0; + int gj = j - ngOut + lf1; + int gk = kk - ngOut + lf2; + + int64_t f0 = (gi < int64_t(m0) / 2) ? gi : int64_t(g0) - int64_t(m0) + gi; + int64_t f1 = (gj < int64_t(m1) / 2) ? gj : int64_t(g1) - int64_t(m1) + gj; + int64_t f2 = (gk < int64_t(m2) / 2) ? 
gk : int64_t(g2) - int64_t(m2) + gk; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = -dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) w *= twiddle(f0, g0); + if (offs[1]) w *= twiddle(f1, g1); + if (offs[2]) w *= twiddle(f2, g2); + + auto val = temp(i - ngOut, j - ngOut, kk - ngOut); + outView(i, j, kk) += w * val * scale; + }); + } else { + Kokkos::parallel_for( + "twiddle_add_forward", + Kokkos::MDRangePolicy>( + {ngOut, ngOut}, + {int(outView.extent(0)) - ngOut, int(outView.extent(1)) - ngOut}), + KOKKOS_LAMBDA(int i, int j) { + int gi = i - ngOut + lf0; + int gj = j - ngOut + lf1; + + int64_t f0 = (gi < int64_t(m0) / 2) ? gi : int64_t(g0) - int64_t(m0) + gi; + int64_t f1 = (gj < int64_t(m1) / 2) ? gj : int64_t(g1) - int64_t(m1) + gj; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = -dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) w *= twiddle(f0, g0); + if (offs[1]) w *= twiddle(f1, g1); + + auto val = temp(i - ngOut, j - ngOut); + outView(i, j) += w * val * scale; + }); + } + } + + IpplTimings::stopTimer(twiddleTimer); + } + } + + //------------------------------------------------------------------------- + // Backward Pruned C2C Implementation + //------------------------------------------------------------------------- + + template + void FFT::backwardPruned(int dir, ComplexField& input, + ComplexField& output) { + static IpplTimings::TimerRef subIFFTTimer = IpplTimings::getTimer("subIFFTs"); + static IpplTimings::TimerRef stridedWriteTimer = IpplTimings::getTimer("StridedWrite"); + + auto inView = input.getView(); // Pruned frequency domain + auto outView = output.getView(); // Full spatial domain + const int ngIn = input.getNghost(); + const int ngOut = output.getNghost(); + + const auto& lDomPruned = 
input.getLayout().getLocalNDIndex(); + const auto& gDomFull = output.getLayout().getDomain(); + const auto& modes = pruning_.n_modes; + + for (int s = 0; s < numConcurrent_; ++s) { + if (temps_[s].size() != input.getOwned().size()) { + temps_[s] = + detail::shrinkView("pruned_ifft_temp_" + std::to_string(s), inView, ngIn); + } + } + + Kokkos::deep_copy(outView, Complex_t(0, 0)); + + std::array, NumSubFFTs> offsets; + for (int k = 0; k < NumSubFFTs; ++k) { + for (unsigned d = 0; d < Dim; ++d) { + offsets[k][d] = (k >> d) & 1; + } + } + + Vector localFirst; + for (unsigned d = 0; d < Dim; ++d) { + localFirst[d] = lDomPruned[d].first(); + } + + auto owned = input.getOwned(); + const int numBatches = (NumSubFFTs + numConcurrent_ - 1) / numConcurrent_; + + constexpr bool is_cpu = false +#ifdef KOKKOS_ENABLE_SERIAL + || std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_OPENMP + || std::is_same_v +#endif + ; + + const long ext0 = static_cast(owned[0].length()); + const long ext1 = static_cast(owned[1].length()); + [[maybe_unused]] const long ext2 = + (Dim == 3) ? static_cast(owned[Dim == 3 ? 2 : 0].length()) : 1L; + const long g0 = static_cast(gDomFull[0].length()); + const long g1 = static_cast(gDomFull[1].length()); + const long g2 = (Dim == 3) ? static_cast(gDomFull[Dim == 3 ? 2 : 0].length()) : 1L; + const long m0 = static_cast(modes[0]); + const long m1 = static_cast(modes[1]); + const long m2 = (Dim == 3) ? static_cast(modes[Dim == 3 ? 2 : 0]) : 1L; + const int lf0 = localFirst[0]; + const int lf1 = localFirst[1]; + const int lf2 = (Dim == 3) ? localFirst[Dim == 3 ? 
2 : 0] : 0; + + for (int batch = 0; batch < numBatches; ++batch) { + const int start = batch * numConcurrent_; + const int end = std::min(start + numConcurrent_, NumSubFFTs); + const int count = end - start; + + IpplTimings::startTimer(subIFFTTimer); + + detail::runConcurrentBatch(count, [&](int local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + if constexpr (Dim == 3) { + auto multiply_lambda = KOKKOS_LAMBDA(int i0, int i1, int i2) { + int gi = i0 + lf0; + int gj = i1 + lf1; + int gk = i2 + lf2; + + int64_t f0 = (gi < int64_t(m0) / 2) ? gi : int64_t(g0) - int64_t(m0) + gi; + int64_t f1 = (gj < int64_t(m1) / 2) ? gj : int64_t(g1) - int64_t(m1) + gj; + int64_t f2 = (gk < int64_t(m2) / 2) ? gk : int64_t(g2) - int64_t(m2) + gk; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) w *= twiddle(f0, g0); + if (offs[1]) w *= twiddle(f1, g1); + if (offs[2]) w *= twiddle(f2, g2); + + auto input_val = inView(i0 + ngIn, i1 + ngIn, i2 + ngIn); + temp(i0, i1, i2) = w * input_val; + }; + if constexpr (is_cpu) { + Kokkos::parallel_for( + "twiddle_multiply_backward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {ext0, ext1, ext2}), + multiply_lambda); + } else { + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for( + "twiddle_multiply_backward", + Kokkos::MDRangePolicy>( + exec, {0, 0, 0}, {ext0, ext1, ext2}), + multiply_lambda); + GPUOps::sync(streams_[local]); + } + } else { + auto multiply_lambda = KOKKOS_LAMBDA(int i0, int i1) { + int gi = i0 + lf0; + int gj = i1 + lf1; + + int64_t f0 = (gi < int64_t(m0) / 2) ? gi : int64_t(g0) - int64_t(m0) + gi; + int64_t f1 = (gj < int64_t(m1) / 2) ? 
gj : int64_t(g1) - int64_t(m1) + gj; + + Complex_t w(1.0, 0.0); + auto twiddle = [&](int64_t freq, int64_t N) { + double ang = dir * 2.0 * M_PI * double(freq) / double(N); + return Complex_t(Kokkos::cos(ang), Kokkos::sin(ang)); + }; + + if (offs[0]) w *= twiddle(f0, g0); + if (offs[1]) w *= twiddle(f1, g1); + + auto input_val = inView(i0 + ngIn, i1 + ngIn); + temp(i0, i1) = w * input_val; + }; + if constexpr (is_cpu) { + Kokkos::parallel_for( + "twiddle_multiply_backward", + Kokkos::MDRangePolicy>({0, 0}, + {ext0, ext1}), + multiply_lambda); + } else { + auto exec = GPUOps::instance(streams_[local]); + Kokkos::parallel_for( + "twiddle_multiply_backward", + Kokkos::MDRangePolicy>(exec, {0, 0}, + {ext0, ext1}), + multiply_lambda); + GPUOps::sync(streams_[local]); + } + } + + if (dir == -1) { + backends_[local]->forward(temp.data(), temp.data()); + } else { + backends_[local]->backward(temp.data(), temp.data()); + } + }); + + Kokkos::fence(); + IpplTimings::stopTimer(subIFFTTimer); + + IpplTimings::startTimer(stridedWriteTimer); + + for (int local = 0; local < count; ++local) { + const int k = start + local; + auto offs = offsets[k]; + auto& temp = temps_[local]; + + if constexpr (Dim == 3) { + Kokkos::parallel_for( + "strided_write_backward", + Kokkos::MDRangePolicy>( + {0, 0, 0}, {ext0, ext1, ext2}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + int oi = i0 * 2 + int(offs[0]) + ngOut; + int oj = i1 * 2 + int(offs[1]) + ngOut; + int ok = i2 * 2 + int(offs[2]) + ngOut; + outView(oi, oj, ok) = temp(i0, i1, i2); + }); + } else { + Kokkos::parallel_for( + "strided_write_backward", + Kokkos::MDRangePolicy>({0, 0}, {ext0, ext1}), + KOKKOS_LAMBDA(int i0, int i1) { + int oi = i0 * 2 + int(offs[0]) + ngOut; + int oj = i1 * 2 + int(offs[1]) + ngOut; + outView(oi, oj) = temp(i0, i1); + }); + } + } + + IpplTimings::stopTimer(stridedWriteTimer); + } + } +} // namespace ippl + +#endif diff --git a/src/FFT/Transform/PrunedRC.h b/src/FFT/Transform/PrunedRC.h new file mode 100644 index 
000000000..e89800bfb --- /dev/null +++ b/src/FFT/Transform/PrunedRC.h @@ -0,0 +1,320 @@ +/*! + * @file PrunedRC.h + * @brief Pruned real-to-complex FFT (PrunedRCTransform tag). + * + * The outbox layout is chosen so the mapping from pruned-global index to + * local index is communication-free; see the comment above the class + * declaration for the index-arithmetic details. + */ +#ifndef IPPL_FFT_TRANSFORM_PRUNEDRC_H +#define IPPL_FFT_TRANSFORM_PRUNEDRC_H + +#include +#include + +#include "Utility/IpplTimings.h" +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + //========================================================================= + // Pruned Real-to-Complex Transform + // + // The key insight that makes the pruned-mode extraction and scatter + // communication-free is the choice of heFFTe outbox: + // + // R2C dim (d_r): + // Each rank's outbox lower bound in d_r = pruned field lower bound. + // The last rank (hi_pruned == K_r - 1) extends its outbox to cover + // [K_r, N_r/2], which are the "extra" full-complex modes discarded by + // the pruning. This ensures every rank's outbox contains exactly the + // full-complex indices its pruned modes map to (fi = gi, direct). + // + // Non-R2C dims: + // Every rank owns the FULL extent [0, N_d - 1]. The wrapping formula + // fj = gj < K/2 ? gj : N-K+gj always maps into [0, N-1], which is + // always within this rank's outbox. + // + // Result: the local index in tempComplexFull_ is computable from the + // pruned global index with no inter-rank communication. + //========================================================================= + + /*! + * @class FFT + * @brief Pruned R2C FFT keeping only the lowest n_modes per axis. + * + * Currently 3D-only. Forward maps a real field to a pruned complex field; + * backward goes the other way. 
Internally uses a full-size complex + * scratch view so the index arithmetic stays local. + * + * @tparam RealField IPPL Field of real values. + */ + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = + typename Field::uniform_type; + +#ifdef IPPL_ENABLE_CUFFTMP + using Backend_t = fft::CuFFTMpR2C; +#else + using Backend_t = fft::HeffteR2C; +#endif + using TempReal_t = Kokkos::View; + using TempComplex_t = Kokkos::View; + + /*! + * @brief Build the pruned R2C plan. + * @param layoutReal Real-input field layout. + * @param layoutComplexFull Kept for API compatibility; the outbox is + * actually recomputed internally. + * @param layoutComplexPruned Pruned complex-output layout. + * @param pruning Per-axis number of modes to retain. + * @param params Backend parameters; reads + * @c "r2c_direction" (default 0). + */ + FFT(const Layout_t& layoutReal, + const Layout_t& layoutComplexFull, // kept for API compatibility; outbox is recomputed + const Layout_t& layoutComplexPruned, const PruningParams& pruning, + const ParameterList& params); + + /*! + * @brief Forward (real->pruned complex) or backward transform. + * @param direction FORWARD or BACKWARD. + * @param f Real field (input on FORWARD, output on BACKWARD). + * @param g Pruned complex field (output on FORWARD, input on BACKWARD). + */ + void transform(TransformDirection direction, RealField& f, ComplexField& g); + + private: + PruningParams pruning_; + std::unique_ptr backend_; + int r2c_dir_ = 0; + + // Local outbox origin and dimensions (= tempComplexFull_ dimensions). + // For the R2C dim: origin = pruned field's lower bound on this rank. + // For non-R2C dims: origin = 0, size = full global extent N_d. 
+ std::array lowComplexFull_ = {}; + std::array fullComplexDims_ = {}; + + // Global real-grid sizes for the wrapping formula: + // fj = gj < K/2 ? gj : N - K + gj + std::array globalRealDims_ = {}; + + TempReal_t tempReal_; + TempComplex_t tempComplexFull_; + }; + + //========================================================================= + // Constructor + //========================================================================= + + template + FFT::FFT(const Layout_t& layoutReal, + const Layout_t& /*layoutComplexFull*/, + const Layout_t& layoutComplexPruned, + const PruningParams& pruning, + const ParameterList& params) + : pruning_(pruning) { + static_assert(Dim == 3, "PrunedRCTransform currently only supports 3D"); + + r2c_dir_ = params.get("r2c_direction", 0); + + // Global real-grid sizes + const auto& gDomReal = layoutReal.getDomain(); + for (int d = 0; d < 3; ++d) + globalRealDims_[d] = gDomReal[d].length(); + + // Inbox: this rank's local real slab + std::array lowReal, highReal; + fft::domainToBounds(layoutReal.getLocalNDIndex(), lowReal, highReal); + + // Outbox: custom -- aligned to pruned in r2c_dir, full extent everywhere else + // + // In r2c_dir d_r: + // outbox lo = pruned lo (so fi = gi_p is always within this rank's buffer) + // outbox hi = pruned hi (except the last rank, which absorbs [K_r, N_r/2]) + // + // In every other dim d: + // outbox = [0, N_d - 1] (whole extent; wrapping formula always stays local) + const auto& lDomPruned = layoutComplexPruned.getLocalNDIndex(); + + std::array lowOut, highOut; + for (int d = 0; d < 3; ++d) { + if (d == r2c_dir_) { + const long long lo_p = lDomPruned[d].first(); + const long long hi_p = lDomPruned[d].last(); + const long long K_d = static_cast(pruning_.n_modes[d]); + const long long N_d = globalRealDims_[d]; + const long long top = N_d / 2; // last valid index of R2C output (N/2+1 elements) + + lowOut[d] = lo_p; + // If this rank owns the last pruned mode, extend outbox to cover + // all remaining 
full-complex elements [K_d .. N_d/2] + highOut[d] = (hi_p == K_d - 1) ? top : hi_p; + } else { + // Every rank owns the full extent of this dimension + lowOut[d] = 0; + highOut[d] = globalRealDims_[d] - 1; + } + } + + heffte::box3d inbox{lowReal, highReal}; + heffte::box3d outbox{lowOut, highOut}; + + backend_ = + std::make_unique(inbox, outbox, r2c_dir_, Comm->getCommunicator(), params); + + for (int d = 0; d < 3; ++d) { + lowComplexFull_[d] = lowOut[d]; + fullComplexDims_[d] = static_cast(highOut[d] - lowOut[d] + 1); + } + } + + //========================================================================= + // transform + //========================================================================= + + template + void FFT::transform(TransformDirection direction, RealField& f, + ComplexField& g) { + auto fview = f.getView(); + auto gview = g.getView(); + const int ngf = f.getNghost(); + const int ngg = g.getNghost(); + + // Ensure temp buffers + if (tempReal_.size() != f.getOwned().size()) + tempReal_ = detail::shrinkView("pruned_r2c_real", fview, ngf); + + const std::size_t fullSize = + fullComplexDims_[0] * fullComplexDims_[1] * fullComplexDims_[2]; + if (tempComplexFull_.size() != fullSize) + tempComplexFull_ = TempComplex_t("pruned_r2c_complex_full", fullComplexDims_[0], + fullComplexDims_[1], fullComplexDims_[2]); + + // Pruned domain info for the kernel + const auto& lDomPruned = g.getLayout().getLocalNDIndex(); + const long long lp0 = lDomPruned[0].first(); + const long long lp1 = lDomPruned[1].first(); + const long long lp2 = lDomPruned[2].first(); + + // Mode counts and real-grid sizes for the wrapping formula + const long long K0 = pruning_.n_modes[0]; + const long long K1 = pruning_.n_modes[1]; + const long long K2 = pruning_.n_modes[2]; + const long long N0 = globalRealDims_[0]; + const long long N1 = globalRealDims_[1]; + const long long N2 = globalRealDims_[2]; + + // outbox origins -- only non-zero in r2c_dir + const long long lcf0 = lowComplexFull_[0]; 
+ const long long lcf1 = lowComplexFull_[1]; + const long long lcf2 = lowComplexFull_[2]; + const int r2c = r2c_dir_; + + auto owned = g.getOwned(); // ghost-free extent of pruned field + + if (direction == FORWARD) { + // 1. Strip ghosts -> tempReal_ + auto tempreal = tempReal_; + Kokkos::parallel_for( + "r2c_copy_real_fwd", + Kokkos::MDRangePolicy>( + {ngf, ngf, ngf}, {int(fview.extent(0)) - ngf, int(fview.extent(1)) - ngf, + int(fview.extent(2)) - ngf}), + KOKKOS_LAMBDA(int i, int j, int k) { + tempreal(i - ngf, j - ngf, k - ngf) = fview(i, j, k); + }); + Kokkos::fence(); + + // 2. Distributed R2C FFT -> tempComplexFull_ + backend_->forward(tempReal_.data(), tempComplexFull_.data()); + + // 3. Extract pruned modes from tempComplexFull_ -> pruned output field + // + // Every access is local by construction of the outbox: + // fi0_l = gi0 - lcf0 (R2C dim 0 assumed here; generalises below) + // fi1_l = wrap(gi1) (lcf1=0 for non-R2C dims) + // fi2_l = wrap(gi2) + auto& tcf = tempComplexFull_; + Kokkos::parallel_for( + "extract_pruned_r2c_fwd", + Kokkos::MDRangePolicy>( + {0, 0, 0}, + {int(owned[0].length()), int(owned[1].length()), int(owned[2].length())}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + const long long gi0 = i0 + lp0; + const long long gi1 = i1 + lp1; + const long long gi2 = i2 + lp2; + + const int fi0 = + (r2c == 0) ? int(gi0 - lcf0) : int((gi0 < K0 / 2) ? gi0 : (N0 - K0 + gi0)); + const int fi1 = + (r2c == 1) ? int(gi1 - lcf1) : int((gi1 < K1 / 2) ? gi1 : (N1 - K1 + gi1)); + const int fi2 = + (r2c == 2) ? int(gi2 - lcf2) : int((gi2 < K2 / 2) ? gi2 : (N2 - K2 + gi2)); + + gview(i0 + ngg, i1 + ngg, i2 + ngg) = tcf(fi0, fi1, fi2); + }); + + } else { // BACKWARD + + // 1. 
Zero-fill tempComplexFull_, then scatter pruned modes into it + Kokkos::deep_copy(tempComplexFull_, Complex_t(0, 0)); + + auto& tcf = tempComplexFull_; + Kokkos::parallel_for( + "scatter_pruned_r2c_bwd", + Kokkos::MDRangePolicy>( + {0, 0, 0}, + {int(owned[0].length()), int(owned[1].length()), int(owned[2].length())}), + KOKKOS_LAMBDA(int i0, int i1, int i2) { + const long long gi0 = i0 + lp0; + const long long gi1 = i1 + lp1; + const long long gi2 = i2 + lp2; + + const int fi0 = + (r2c == 0) ? int(gi0 - lcf0) : int((gi0 < K0 / 2) ? gi0 : (N0 - K0 + gi0)); + const int fi1 = + (r2c == 1) ? int(gi1 - lcf1) : int((gi1 < K1 / 2) ? gi1 : (N1 - K1 + gi1)); + const int fi2 = + (r2c == 2) ? int(gi2 - lcf2) : int((gi2 < K2 / 2) ? gi2 : (N2 - K2 + gi2)); + + tcf(fi0, fi1, fi2) = gview(i0 + ngg, i1 + ngg, i2 + ngg); + }); + Kokkos::fence(); + + // 2. Distributed C2R backward + backend_->backward(tempComplexFull_.data(), tempReal_.data()); + Kokkos::fence(); + + // 3. Copy tempReal_ back (restore ghost padding) + auto tempreal = tempReal_; + Kokkos::parallel_for( + "r2c_copy_real_bwd", + Kokkos::MDRangePolicy>( + {ngf, ngf, ngf}, {int(fview.extent(0)) - ngf, int(fview.extent(1)) - ngf, + int(fview.extent(2)) - ngf}), + KOKKOS_LAMBDA(int i, int j, int k) { + fview(i, j, k) = tempreal(i - ngf, j - ngf, k - ngf); + }); + } + } + +} // namespace ippl + +#endif diff --git a/src/FFT/Transform/RC.h b/src/FFT/Transform/RC.h new file mode 100644 index 000000000..a6edc4137 --- /dev/null +++ b/src/FFT/Transform/RC.h @@ -0,0 +1,124 @@ +/*! + * @file RC.h + * @brief Real-to-complex FFT specialization (RCTransform tag). + */ +#ifndef IPPL_FFT_TRANSFORM_RC_H +#define IPPL_FFT_TRANSFORM_RC_H + +#include "Utility/ParameterList.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + +namespace ippl { + + /*! + * @class FFT + * @brief Real-to-complex / complex-to-real FFT over IPPL Fields. 
+ * + * Forward transforms a real field into its half-complex spectrum, the + * backward transforms back. Selects cuFFTMp on CUDA-MP builds, otherwise + * the heFFTe R2C backend. + * + * @tparam RealField IPPL Field of real-valued elements. + */ + template + class FFT { + public: + static constexpr unsigned Dim = RealField::dim; + + using T = typename RealField::value_type; + using Complex_t = Kokkos::complex; + using MemSpace = typename RealField::memory_space; + using ExecSpace = typename RealField::execution_space; + using Layout_t = FieldLayout; + + using ComplexField = typename Field::uniform_type; + +#ifdef IPPL_ENABLE_CUFFTMP + using Backend_t = fft::CuFFTMpR2C; +#else + using Backend_t = fft::HeffteR2C; +#endif + using TempReal_t = typename Kokkos::View::uniform_type; + using TempComplex_t = typename Kokkos::View::uniform_type; + + /*! + * @brief Build the R2C plan for the given layouts. + * @param layoutIn Real-input field layout. + * @param layoutOut Complex-output field layout (Hermitian-symmetric). + * @param params Backend parameter list (R2C axis taken from key + * `r2c_direction`, default 0). + */ + FFT(const Layout_t& layoutIn, const Layout_t& layoutOut, const ParameterList& params) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + std::array lowIn, highIn, lowOut, highOut; + fft::domainToBounds(layoutIn.getLocalNDIndex(), lowIn, highIn); + fft::domainToBounds(layoutOut.getLocalNDIndex(), lowOut, highOut); + + int r2c_dir = params.get("r2c_direction", 0); + backend_ = std::make_unique(heffte::box3d{lowIn, highIn}, + heffte::box3d{lowOut, highOut}, + r2c_dir, Comm->getCommunicator(), params); + } + + //! Execute one forward + one backward to JIT-compile / warm caches. + void warmup(RealField& f, ComplexField& g) { + transform(FORWARD, f, g); + transform(BACKWARD, f, g); + } + + /*! + * @brief Forward (real -> complex) or backward (complex -> real) transform. + * @param direction FORWARD or BACKWARD. 
+ * @param f Real field (input on FORWARD, output on BACKWARD). + * @param g Complex field (output on FORWARD, input on BACKWARD). + */ + void transform(TransformDirection direction, RealField& f, ComplexField& g) { + auto fview = f.getView(); + auto gview = g.getView(); + const int ngf = f.getNghost(); + const int ngg = g.getNghost(); + + ensureTemps(f, g); + fft::copyToTemp(tempReal_, fview, ngf); + fft::copyToTemp(tempComplex_, gview, + ngg); + + if (direction == FORWARD) { + backend_->forward(tempReal_.data(), tempComplex_.data()); + } else { + backend_->backward(tempComplex_.data(), tempReal_.data()); + } + + fft::copyFromTemp(fview, tempReal_, + ngf); + fft::copyFromTemp( + gview, tempComplex_, ngg); + } + + private: + std::unique_ptr backend_; + TempReal_t tempReal_; + TempComplex_t tempComplex_; + + void ensureTemps(const RealField& f, const ComplexField& g) { + if (tempReal_.size() != f.getOwned().size()) { + tempReal_ = detail::shrinkView("fft_rc_real", f.getView(), f.getNghost()); + } + if (tempComplex_.size() != g.getOwned().size()) { + tempComplex_ = detail::shrinkView("fft_rc_complex", g.getView(), g.getNghost()); + } + } + }; + +} // namespace ippl + +#endif diff --git a/src/FFT/Transform/Transform.h b/src/FFT/Transform/Transform.h new file mode 100644 index 000000000..72d4ed73b --- /dev/null +++ b/src/FFT/Transform/Transform.h @@ -0,0 +1,18 @@ +/*! + * @file Transform.h + * @brief Aggregate include of all IPPL FFT transform specializations. + * + * Pulls in the CC (complex-to-complex), RC (real-to-complex), NUFFT, + * pruned CC/RC, and trigonometric (sin/cos) transforms in one go. 
+ */ +#ifndef IPPL_FFT_TRANSFORM_HPP +#define IPPL_FFT_TRANSFORM_HPP + +#include "FFT/Transform/CC.h" +#include "FFT/Transform/NUFFT.h" +#include "FFT/Transform/PrunedCC.h" +#include "FFT/Transform/PrunedRC.h" +#include "FFT/Transform/RC.h" +#include "FFT/Transform/Trig.h" + +#endif diff --git a/src/FFT/Transform/Trig.h b/src/FFT/Transform/Trig.h new file mode 100644 index 000000000..017aa1e66 --- /dev/null +++ b/src/FFT/Transform/Trig.h @@ -0,0 +1,131 @@ +/*! + * @file Trig.h + * @brief Trigonometric (sine / cosine / cosine-Type-I) FFT specializations. + */ +#ifndef IPPL_FFT_TRANSFORM_TRIG_H +#define IPPL_FFT_TRANSFORM_TRIG_H + +#include "Utility/ParameterList.h" +#include "Utility/ViewUtils.h" + +#include "Communicate/Communicator.h" +#include "FFT/Backend/Backend.h" +#include "FFT/Traits.h" +#include "FFT/Transform/Common.h" + + +namespace ippl { + + namespace fft { + + /*! + * @class TrigBase + * @brief Shared implementation for sine / cosine transforms. + * + * Holds a HeffteTrig backend, an internal LayoutLeft scratch buffer, + * and the FFTW-only renormalization (FFTW emits trig outputs scaled + * by 8 in 3D; other backends do not). + * + * @tparam Field IPPL Field of real elements. + * @tparam Tag One of SineTransform, CosTransform, Cos1Transform. + */ + template + class TrigBase { + public: + static constexpr unsigned Dim = Field::dim; + + using T = typename Field::value_type; + using MemSpace = typename Field::memory_space; + using ExecSpace = typename Field::execution_space; + using Layout_t = FieldLayout; + using Backend_t = HeffteTrig; + using TempView_t = typename Kokkos::View::uniform_type; + + //! Build the heFFTe trig plan for the local subdomain in @p layout. 
+ TrigBase(const Layout_t& layout, const ParameterList& params) { + static_assert(Dim == 2 || Dim == 3, "heFFTe only supports 2D and 3D"); + + std::array low, high; + domainToBounds(layout.getLocalNDIndex(), low, high); + heffte::box3d box{low, high}; + + backend_ = std::make_unique(box, box, Comm->getCommunicator(), params); + } + + //! Run a forward + backward pair to JIT-compile / warm caches. + void warmup(Field& f) { + transform(FORWARD, f); + transform(BACKWARD, f); + } + + /*! + * @brief In-place trig transform of @p f. + * + * When FFTW is the backend the input is rescaled by + * @c 1/fftw_trig_scale() on FORWARD and @c fftw_trig_scale() on + * BACKWARD so the result is consistent with the other backends. + */ + void transform(TransformDirection direction, Field& f) { + // FFTW scaling + if constexpr (is_available_v) { + if (direction == FORWARD) + f = f / fftw_trig_scale(); + } + + auto view = f.getView(); + const int ng = f.getNghost(); + + ensureTemp(f); + fft::copyToTemp(temp_, view, ng); + + if (direction == FORWARD) { + backend_->forward(temp_.data(), temp_.data()); + } else { + backend_->backward(temp_.data(), temp_.data()); + } + + fft::copyFromTemp(view, temp_, ng); + + if constexpr (is_available_v) { + if (direction == BACKWARD) + f = f * fftw_trig_scale(); + } + } + + private: + std::unique_ptr backend_; + TempView_t temp_; + + void ensureTemp(const Field& f) { + if (temp_.size() != f.getOwned().size()) { + temp_ = ippl::detail::shrinkView("fft_trig_temp", f.getView(), f.getNghost()); + } + } + }; + + } // namespace fft + + //! @name Trigonometric FFT specializations + //! Thin tag-aliases over fft::TrigBase. The transform tag selects the + //! corresponding heFFTe sine/cosine backend. + //! 
@{ + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + + template + class FFT : public fft::TrigBase { + using fft::TrigBase::TrigBase; + }; + //! @} + +} // namespace ippl + +#endif diff --git a/src/Field/BareField.h b/src/Field/BareField.h index 6ff4a5e01..4f6f37388 100644 --- a/src/Field/BareField.h +++ b/src/Field/BareField.h @@ -100,7 +100,6 @@ namespace ippl { */ void initialize(Layout_t& l, int nghost = 1); - // ML void updateLayout(Layout_t&, int nghost = 1); /*! diff --git a/src/Field/BareField.hpp b/src/Field/BareField.hpp index 695576af3..25da094f7 100644 --- a/src/Field/BareField.hpp +++ b/src/Field/BareField.hpp @@ -16,6 +16,7 @@ #include "Utility/Inform.h" #include "Utility/IpplInfo.h" +#include "Types/IpplTypes.h" #include "BareField.h" namespace Kokkos { @@ -28,6 +29,12 @@ namespace Kokkos { return ippl::Vector(1); } KOKKOS_FORCEINLINE_FUNCTION static ippl::Vector min() { + // Kokkos::reduction_identity::min/max already do the right + // thing for primitive T (largest finite value for min, lowest + // finite value for max) and stay device-callable; this matches the + // upstream change introduced via PR #532 while still allowing the + // ippl::detail::infinity helper from IpplTypes.h to be reused + // elsewhere in the branch. return ippl::Vector(Kokkos::reduction_identity::min()); } KOKKOS_FORCEINLINE_FUNCTION static ippl::Vector max() { @@ -36,6 +43,11 @@ namespace Kokkos { }; } // namespace Kokkos +// Reducer wrappers that pull ippl::max / ippl::min into the join-overload +// resolution set. The stock Kokkos::Max / Kokkos::Min join uses +// Kokkos::max only, which has no overload for ippl::Vector; the +// using-declarations below let ADL find the IPPL element-wise overloads while +// keeping the scalar Kokkos path intact. 
namespace KokkosCorrection { template struct Max : Kokkos::Max { @@ -104,7 +116,6 @@ namespace ippl { template BareField::BareField(Layout_t& l, int nghost) : nghost_m(nghost) - // , owned_m(0) , layout_m(&l) { setup(); } @@ -118,10 +129,8 @@ namespace ippl { } } - // ML template void BareField::updateLayout(Layout_t& l, int nghost) { - // std::cout << "Got in BareField::updateLayout()" << std::endl; layout_m = &l; nghost_m = nghost; setup(); @@ -146,7 +155,7 @@ namespace ippl { template void BareField::fillHalo() { if (layout_m->comm.size() > 1) { - halo_m.fillHalo(dview_m, layout_m); + halo_m.fillHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::assign; @@ -157,7 +166,7 @@ namespace ippl { template void BareField::accumulateHalo() { if (layout_m->comm.size() > 1) { - halo_m.accumulateHalo(dview_m, layout_m); + halo_m.accumulateHalo(dview_m, layout_m, nghost_m); } if (layout_m->isAllPeriodic_m) { using Op = typename detail::HaloCells::rhs_plus_assign; @@ -219,16 +228,17 @@ namespace ippl { template \ T BareField::name(int nghost) const { \ PAssert_LE(nghost, nghost_m); \ - T temp = Kokkos::reduction_identity::name(); \ + const T identity = Kokkos::reduction_identity::name(); \ + T temp = identity; \ using index_array_type = typename RangePolicy::index_array_type; \ ippl::parallel_reduce( \ - "fun", getRangePolicy(dview_m, nghost_m - nghost), \ + "BareField::" #name, getRangePolicy(dview_m, nghost_m - nghost), \ KOKKOS_CLASS_LAMBDA(const index_array_type& args, T& valL) { \ T myVal = apply(dview_m, args); \ op; \ }, \ KokkosCorrection::fun(temp)); \ - T globaltemp = 0.0; \ + T globaltemp = identity; \ layout_m->comm.allreduce(temp, globaltemp, 1, MPI_Op()); \ return globaltemp; \ } diff --git a/src/Field/BareFieldOperations.hpp b/src/Field/BareFieldOperations.hpp index 909c9a2ac..a00ec101f 100644 --- a/src/Field/BareFieldOperations.hpp +++ b/src/Field/BareFieldOperations.hpp @@ -3,6 +3,7 @@ // Norms and a 
scalar product for fields // +#include "Utility/TypeUtils.h" #include namespace ippl { @@ -10,44 +11,43 @@ namespace ippl { * Computes the inner product of two fields * @param f1 first field * @param f2 second field - * @return Result of f1^T f2 + * @return Sum over the field of f1 * conj(f2) (i.e. f2^H f1) for complex T, f1^T f2 otherwise */ template typename BareField::value_type innerProduct(const BareField& f1, const BareField& f2) { using T = typename BareField::value_type; constexpr unsigned Dim = BareField::dim; - static IpplTimings::TimerRef setup = IpplTimings::getTimer("inner_setup"); - static IpplTimings::TimerRef ippl_red = IpplTimings::getTimer("ippl_reduce"); - static IpplTimings::TimerRef mpi_red = IpplTimings::getTimer("mpi_reduce"); - - IpplTimings::startTimer(setup); - T sum = 0; - auto& layout = f1.getLayout(); - auto& view1 = f1.getView(); - auto& view2 = f2.getView(); + auto& layout = f1.getLayout(); + auto& view1 = f1.getView(); + auto& view2 = f2.getView(); using exec_space = typename BareField::execution_space; using index_array_type = typename RangePolicy::index_array_type; - IpplTimings::stopTimer(setup); - IpplTimings::startTimer(ippl_red); - ippl::parallel_reduce( "Field::innerProduct(Field&, Field&)", f1.getFieldRangePolicy(), KOKKOS_LAMBDA(const index_array_type& args, T& val) { - val += apply(view1, args) * apply(view2, args); + // Force-capture view1/view2 outside the if-constexpr branches: + // nvcc cannot first-capture variables inside a constexpr-if + // branch on extended __host__ __device__ lambdas. 
+ (void)view1; + (void)view2; + if constexpr (is_complex_v) { + val += apply(view1, args) * Kokkos::conj(apply(view2, args)); + } else { + val += apply(view1, args) * apply(view2, args); + } }, Kokkos::Sum(sum)); - - IpplTimings::stopTimer(ippl_red); - IpplTimings::startTimer(mpi_red); - T globalSum = 0; - layout.comm.allreduce(sum, globalSum, 1, std::plus()); - - IpplTimings::stopTimer(mpi_red); - + if constexpr (is_complex_v) { + using real_type = decltype(T{}.real()); + layout.comm.allreduce(sum.real(), globalSum.real(), 1, std::plus{}); + layout.comm.allreduce(sum.imag(), globalSum.imag(), 1, std::plus{}); + } else { + layout.comm.allreduce(sum, globalSum, 1, std::plus()); + } return globalSum; } diff --git a/src/Field/Field.hpp b/src/Field/Field.hpp index fbf443572..52e58a281 100644 --- a/src/Field/Field.hpp +++ b/src/Field/Field.hpp @@ -8,6 +8,17 @@ namespace ippl { namespace detail { template struct isExpression> : std::true_type {}; + + template + struct FieldTraits; + + template + struct FieldTraits> { + static constexpr unsigned dim = Dim; + using view_type = std::decay_t< + decltype(std::declval>() + .getView())>; + }; } // namespace detail ////////////////////////////////////////////////////////////////////////// diff --git a/src/Field/FieldBufferOps.hpp b/src/Field/FieldBufferOps.hpp index f955c948e..8ac55c89c 100644 --- a/src/Field/FieldBufferOps.hpp +++ b/src/Field/FieldBufferOps.hpp @@ -74,8 +74,8 @@ namespace ippl { size_t size = intersect.size(); nsends = size; if (buffer.size() < size) { - const int overalloc = ippl::Comm->getDefaultOverallocation(); - Kokkos::realloc(buffer, size * overalloc); + const double overalloc = ippl::Comm->getDefaultOverallocation(); + Kokkos::realloc(buffer, static_cast(size * overalloc)); } using index_type = typename ippl::RangePolicy::index_type; @@ -122,8 +122,8 @@ namespace ippl { size_t size = intersect.size(); nsends = size; if (buffer.size() < size) { - const int overalloc = 
ippl::Comm->getDefaultOverallocation(); - Kokkos::realloc(buffer, size * overalloc); + const double overalloc = ippl::Comm->getDefaultOverallocation(); + Kokkos::realloc(buffer, static_cast(size * overalloc)); } using index_type = typename ippl::RangePolicy::index_type; @@ -155,9 +155,12 @@ namespace ippl { // Unpack a linear buffer into a view region, with per-axis conditional // reflection of the buffer index. Setting a dimension to true via the - // coordBool list reverses the buffer ordering along that axis as it is + // coordBool list reverses the buffer ordering along that axis as it is // placed into the view — the primitive operation behind `mirrorField` - // and behind the Vico solver's reflected-quadrant assembly. + // and behind the Vico solver's reflected-quadrant assembly. Take + // main/master's generalised-over-Dim signature (PR #532) so the body + // below (which already references Dim / coordBool) compiles; the + // hardcoded-3D pif-pr variant has been superseded. template inline void unpack_impl(const ippl::NDIndex intersect, const View& view, diff --git a/src/Field/HaloCells.h b/src/Field/HaloCells.h index c4d87b9bd..6e7606083 100644 --- a/src/Field/HaloCells.h +++ b/src/Field/HaloCells.h @@ -57,7 +57,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void accumulateHalo(view_type& view, Layout_t* layout); + void accumulateHalo(view_type& view, Layout_t* layout, int nghost); /*! * Send halo data to internal cells for only the physical cells @@ -75,7 +75,7 @@ namespace ippl { * @param view the original field data * @param layout the field layout storing the domain decomposition */ - void fillHalo(view_type&, Layout_t* layout); + void fillHalo(view_type&, Layout_t* layout, int nghost); /*! * Pack the field data to be sent into a contiguous array. 
@@ -139,7 +139,8 @@ namespace ippl { * unpack function call */ template - void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, int nghost = 1); + void exchangeBoundaries(view_type& view, Layout_t* layout, SendOrder order, + int nghost); /*! * Extract the subview of the original data. This does not copy. diff --git a/src/Field/HaloCells.hpp b/src/Field/HaloCells.hpp index eb56642ce..92f3f1762 100644 --- a/src/Field/HaloCells.hpp +++ b/src/Field/HaloCells.hpp @@ -7,6 +7,7 @@ #include #include "Utility/IpplException.h" +#include "Utility/ParallelDispatch.h" #include "Communicate/Communicator.h" @@ -16,8 +17,8 @@ namespace ippl { HaloCells::HaloCells() {} template - void HaloCells::accumulateHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, HALO_TO_INTERNAL); + void HaloCells::accumulateHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, HALO_TO_INTERNAL, nghost); } template @@ -25,8 +26,8 @@ namespace ippl { exchangeBoundaries(view, layout, HALO_TO_INTERNAL_NOGHOST, nghost); } template - void HaloCells::fillHalo(view_type& view, Layout_t* layout) { - exchangeBoundaries(view, layout, INTERNAL_TO_HALO); + void HaloCells::fillHalo(view_type& view, Layout_t* layout, int nghost) { + exchangeBoundaries(view, layout, INTERNAL_TO_HALO, nghost); } template @@ -45,9 +46,9 @@ namespace ippl { auto ldom = layout->getLocalNDIndex(); for (const auto& axis : ldom) { if ((axis.length() == 1) && (Dim != 1)) { - throw std::runtime_error( - "HaloCells: Cannot do neighbour exchange when domain decomposition " - "contains planes!"); + throw IpplException( + "HaloCells::exchangeBoundaries", + "Cannot do neighbour exchange when domain decomposition contains planes."); } } @@ -62,7 +63,7 @@ namespace ippl { totalRequests += componentNeighbors.size(); } - int me=Comm->rank(); + int me = Comm->rank(); using memory_space = typename view_type::memory_space; using buffer_type = mpi::Communicator::buffer_type; 
@@ -164,8 +165,8 @@ namespace ippl { size_t size = subview.size(); nsends = size; if (buffer.size() < size) { - int overalloc = Comm->getDefaultOverallocation(); - Kokkos::realloc(buffer, size * overalloc); + double overalloc = Comm->getDefaultOverallocation(); + Kokkos::realloc(buffer, static_cast(size * overalloc)); } using index_array_type = diff --git a/src/FieldLayout/FieldLayout.h b/src/FieldLayout/FieldLayout.h index 780040e8b..c8f1c1e2b 100644 --- a/src/FieldLayout/FieldLayout.h +++ b/src/FieldLayout/FieldLayout.h @@ -201,7 +201,7 @@ namespace ippl { FieldLayout(const mpi::Communicator& = MPI_COMM_WORLD); FieldLayout(mpi::Communicator, const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Destructor: Everything deletes itself automatically ... the base // class destructors inform all the FieldLayoutUser's we're going away. @@ -213,38 +213,29 @@ namespace ippl { // FieldLayout constructors: void initialize(const NDIndex& domain, std::array decomp, - bool isAllPeriodic = false); + bool isAllPeriodic = false, int nghost = 1); // Return the domain. const NDIndex& getDomain() const { return gDomain_m; } - // Compare FieldLayouts to see if they represent the same domain; if - // dimensionalities are different, the NDIndex operator==() will return - // false: + // Compare FieldLayouts. Different dimensionalities or different global + // domains are not equal; same global domain but different per-rank + // local-domain decompositions are also not equal. 
template bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - return gDomain_m == x.getDomain(); - } - - bool operator==(const FieldLayout& x) const { - - // Throw exception if the domains are not the same - if (gDomain_m != x.getDomain()) { - throw std::runtime_error("FieldLayout: only FieldLayouts with the same global domain should be compared"); - } - - for (unsigned int i = 0; i < Dim; ++i) { - if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + if constexpr (Dim != Dim2) { + return false; + } else { + if (gDomain_m != x.getDomain()) { return false; } + for (unsigned int i = 0; i < Dim; ++i) { + if (hLocalDomains_m(comm.rank())[i] != x.getLocalNDIndex()[i]) { + return false; + } + } + return true; } - return true; } // for the requested dimension, report if the distribution is @@ -330,7 +321,7 @@ namespace ippl { * Finds all neighboring ranks based on the field layout - * @param nghost number of ghost cells (default 1) + * @param nghost number of ghost cells */ - void findNeighbors(int nghost = 1); + void findNeighbors(int nghost); /*! 
* Adds a neighbor to the neighbor list @@ -389,6 +380,9 @@ namespace ippl { // Minimum width of all the local domains for each dimension unsigned int minWidth_m[Dim]; + // Nghost needed for computing send/receive ranges + int nghost_m; + void calcWidths(); }; diff --git a/src/FieldLayout/FieldLayout.hpp b/src/FieldLayout/FieldLayout.hpp index 2bcb1bcd1..ad6ae5a53 100644 --- a/src/FieldLayout/FieldLayout.hpp +++ b/src/FieldLayout/FieldLayout.hpp @@ -37,7 +37,8 @@ namespace ippl { FieldLayout::FieldLayout(const mpi::Communicator& communicator) : comm(communicator) , dLocalDomains_m("local domains (device)", 0) - , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) { + , hLocalDomains_m(Kokkos::create_mirror_view(dLocalDomains_m)) + , nghost_m(1) { for (unsigned int d = 0; d < Dim; ++d) { minWidth_m[d] = 0; } @@ -45,9 +46,9 @@ namespace ippl { template FieldLayout::FieldLayout(mpi::Communicator communicator, const NDIndex& domain, - std::array isParallel, bool isAllPeriodic) + std::array isParallel, bool isAllPeriodic, int nghost) : FieldLayout(communicator) { - initialize(domain, isParallel, isAllPeriodic); + initialize(domain, isParallel, isAllPeriodic, nghost); } template @@ -60,7 +61,7 @@ namespace ippl { hLocalDomains_m(i) = domains[i]; } - findNeighbors(); + findNeighbors(nghost_m); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -69,7 +70,7 @@ namespace ippl { template void FieldLayout::initialize(const NDIndex& domain, std::array isParallel, - bool isAllPeriodic) { + bool isAllPeriodic, int nghost) { int nRanks = comm.size(); gDomain_m = domain; @@ -78,11 +79,17 @@ namespace ippl { isParallelDim_m = isParallel; + nghost_m = nghost; + if (nRanks < 2) { Kokkos::resize(dLocalDomains_m, nRanks); Kokkos::resize(hLocalDomains_m, nRanks); hLocalDomains_m(0) = domain; Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); + // Even on a single rank we must populate minWidth_m so that + // getDistribution(d) returns the right answer; without this the + 
// serial-build path silently reports every dim as parallel. + calcWidths(); return; } @@ -95,8 +102,11 @@ namespace ippl { } if (totparelems < nRanks) { - throw std::runtime_error("FieldLayout:initialize: domain can only be partitioned in to " - + std::to_string(totparelems) + " local domains, but there are " + std::to_string(nRanks) + " ranks, decrease the number of ranks or increase the domain."); + throw std::runtime_error( + "FieldLayout:initialize: domain can only be partitioned in to " + + std::to_string(totparelems) + " local domains, but there are " + + std::to_string(nRanks) + + " ranks, decrease the number of ranks or increase the domain."); } Kokkos::resize(dLocalDomains_m, nRanks); @@ -105,7 +115,7 @@ namespace ippl { detail::Partitioner partitioner; partitioner.split(domain, hLocalDomains_m, isParallel, nRanks); - findNeighbors(); + findNeighbors(nghost); Kokkos::deep_copy(dLocalDomains_m, hLocalDomains_m); @@ -288,7 +298,7 @@ namespace ippl { // 0 - touching the lower axis value // 1 - touching the upper axis value // 2 - parallel to the axis - if (intersect[d].length() == 1) { + if (intersect[d].length() == static_cast(nghost)) { if (gnd[d].first() != intersect[d].first()) { index += digit; } diff --git a/src/FieldLayout/SubFieldLayout.hpp b/src/FieldLayout/SubFieldLayout.hpp index 53ee24c11..9120b2031 100644 --- a/src/FieldLayout/SubFieldLayout.hpp +++ b/src/FieldLayout/SubFieldLayout.hpp @@ -102,7 +102,7 @@ namespace ippl { } } - this->findNeighbors(); + this->findNeighbors(this->nghost_m); Kokkos::deep_copy(this->dLocalDomains_m, this->hLocalDomains_m); @@ -125,4 +125,4 @@ namespace ippl { originDomain_m = domain; } -} // namespace ippl \ No newline at end of file +} // namespace ippl diff --git a/src/Interpolation/Binning.h b/src/Interpolation/Binning.h new file mode 100644 index 000000000..c67929c39 --- /dev/null +++ b/src/Interpolation/Binning.h @@ -0,0 +1,294 @@ +/*! 
+ * @file Binning.h + * @brief Particle-to-tile binning for tiled scatter / gather. + * + * Provides the per-particle bin computer, a counting-sort-based grouping + * (`bin_sort`) and the high-level `bin_particles` entry point used by the + * tiled scatter/gather paths. + */ +#ifndef IPPL_INTERPOLATION_BINNING_H +#define IPPL_INTERPOLATION_BINNING_H + +#include + +#include + +#include "CoordinateTransform.h" +#include "Particle/ParticleLayout.h" +#include "Particle/SortBuffer.h" + +namespace ippl { + namespace Interpolation { + namespace detail { + + /*! + * @struct BinningResult + * @brief Output of bin_particles: permutation + per-bin offsets + tile counts. + * + * @tparam Dim Spatial dimension. + * @tparam MemorySpace Kokkos memory space the views live in. + */ + template + struct BinningResult { + Kokkos::View permute; //!< particle ids grouped by bin. + Kokkos::View bin_offsets; //!< exclusive scan of bin counts. + Vector num_tiles; //!< tile-grid extent per axis. + }; + + /** + * @brief Functor to compute bin index for a particle position + * + * Maps physical particle positions to tile indices for tiled scatter/gather. + * Particles are assigned to tiles based on their stencil center location. 
+ * + * @tparam Dim Spatial dimension + * @tparam RealType Floating point type + */ + template + struct BinComputer { + Vector n_grid_global; + Vector local_offset; + Vector tile_size; + Vector num_tiles; + int kernel_width; + CoordinateTransform transform; + + /** + * @brief Compute 1D tile index for a coordinate value + */ + KOKKOS_INLINE_FUNCTION int compute_tile_1d(RealType val, int dim) const { + const RealType grid_pos = transform.toGridCoordinate(val, dim); + const int center = + transform.getStencilCenter(grid_pos - RealType(0.5), kernel_width); + const int local_c = center - local_offset[dim]; + return Kokkos::clamp(local_c / tile_size[dim], 0, num_tiles[dim] - 1); + } + + /** + * @brief Compute flat bin index for a particle position + * + * Uses row-major ordering (dimension Dim-1 varies fastest) to match + * the decoding in TiledScatter. + */ + template + KOKKOS_INLINE_FUNCTION int operator()(const PositionType& pos) const { + int bin_idx = 0; + int stride = 1; + + for (int d = Dim - 1; d >= 0; --d) { + const int tile_d = compute_tile_1d(pos[d], d); + bin_idx += tile_d * stride; + stride *= num_tiles[d]; + } + + return bin_idx; + } + }; + + /** + * @brief Group particles by bin via counting sort. + * + * The downstream consumer (TiledScatter / GridParallelScatter) only + * needs particles *grouped* per bin, not sorted. A radix sort over + * the keys is the wrong primitive for that -- it does ~8 N bytes of + * key/perm traffic and ~5 N bytes of scratch. The counting sort + * implemented here is the textbook bucket-sort and runs in three + * memory-bandwidth-bounded kernels: + * + * Pass A: per-particle bin index + atomic histogram into + * bin_offsets[0..n_bins). Stores the bin index into + * bin_keys for reuse in Pass C. + * Pass B: exclusive scan over bin_offsets[0..n_bins+1), turning + * the histogram into the start offset of every bin + * (Kokkos::parallel_scan dispatches to CUB DeviceScan + * on CUDA / rocPRIM on HIP). 
+ * Pass C: per-particle scatter into permute, using cursor as a + * per-bin atomic counter that starts at the bin's + * offset and is incremented once per particle landing + * in that bin. + * + * For 268 M particles on H100 this is ~50x faster than the CUB + * radix-sort path it replaces (~120 ms -> ~3-5 ms), because the + * total memory traffic shrinks from ~64 N bytes to ~24 N bytes + * and the out-of-place sort scratch + deep_copy round-trip is + * gone. + * + * Order within a bin is non-deterministic. TiledScatter does not + * require intra-bin order. + */ + template + void bin_sort(PositionViewType positions, Vector n_grid_global, + [[maybe_unused]] Vector n_grid_local, + Vector local_offset, Vector tile_size, + int kernel_width, Vector origin, + Vector invdx, PermuteViewType& permute, + OffsetViewType& bin_offsets, KeyViewType& bin_keys, + CursorViewType& cursor, size_t n_particles, + Vector num_tiles) { + using key_type = typename KeyViewType::non_const_value_type; + + static IpplTimings::TimerRef binSortTimer = IpplTimings::getTimer("binSort"); + IpplTimings::startTimer(binSortTimer); + + // Total number of bins + size_t n_bins = 1; + for (unsigned d = 0; d < Dim; ++d) { + n_bins *= num_tiles[d]; + } + + CoordinateTransform transform(origin, invdx, n_grid_global); + BinComputer bin_computer{n_grid_global, local_offset, tile_size, + num_tiles, kernel_width, transform}; + + // Pass A: bin index per particle + atomic histogram into bin_offsets. 
+ static IpplTimings::TimerRef keyTimer = IpplTimings::getTimer("binComputeKeys"); + IpplTimings::startTimer(keyTimer); + + { + auto offsets_zero = Kokkos::subview( + bin_offsets, std::make_pair(size_t(0), n_bins + 1)); + Kokkos::deep_copy(ExecSpace(), offsets_zero, + typename OffsetViewType::value_type(0)); + } + + if (n_particles > 0) { + Kokkos::parallel_for( + "BinSort::HistogramAndKeys", + Kokkos::RangePolicy(0, n_particles), + KOKKOS_LAMBDA(const size_t i) { + const key_type k = + static_cast(bin_computer(positions(i))); + bin_keys(i) = k; + Kokkos::atomic_inc(&bin_offsets(static_cast(k))); + }); + Kokkos::fence(); + } + IpplTimings::stopTimer(keyTimer); + + // Pass B: exclusive scan turns the histogram into per-bin start + // offsets. Kokkos dispatches this to cub::DeviceScan on CUDA + // and rocPRIM on HIP, so we get the vendor-tuned scan for free. + static IpplTimings::TimerRef scanTimer = IpplTimings::getTimer("binSortByKey"); + IpplTimings::startTimer(scanTimer); + + using offset_value_type = typename OffsetViewType::value_type; + Kokkos::parallel_scan( + "BinSort::ExclusiveScan", + Kokkos::RangePolicy(0, n_bins + 1), + KOKKOS_LAMBDA(const size_t i, offset_value_type& upd, const bool final) { + const offset_value_type cnt = bin_offsets(i); + if (final) { + bin_offsets(i) = upd; + } + upd += cnt; + }); + Kokkos::fence(); + IpplTimings::stopTimer(scanTimer); + + // Pass C: scatter particle ids into permute, using cursor as a + // per-bin atomic write head that starts at the bin's offset. 
+ static IpplTimings::TimerRef offsetTimer = + IpplTimings::getTimer("binComputeOffsets"); + IpplTimings::startTimer(offsetTimer); + + if (n_particles > 0) { + auto cursor_sub = Kokkos::subview(cursor, + std::make_pair(size_t(0), n_bins)); + auto offsets_sub = Kokkos::subview(bin_offsets, + std::make_pair(size_t(0), n_bins)); + Kokkos::deep_copy(ExecSpace(), cursor_sub, offsets_sub); + + Kokkos::parallel_for( + "BinSort::Scatter", + Kokkos::RangePolicy(0, n_particles), + KOKKOS_LAMBDA(const size_t i) { + const size_t k = static_cast(bin_keys(i)); + const size_t pos = + Kokkos::atomic_fetch_add(&cursor(k), size_t(1)); + permute(pos) = i; + }); + Kokkos::fence(); + } + IpplTimings::stopTimer(offsetTimer); + + IpplTimings::stopTimer(binSortTimer); + } + + /** + * @brief High-level interface for particle binning + * + * Bins particles into tiles for tiled scatter/gather operations. + * Uses sort-based binning for efficient GPU execution. + * + * @tparam ParticleT Particle coordinate type + * @tparam FieldT Field value type + * @tparam ParticleProperties Additional particle attribute properties + * @tparam Dim Spatial dimension + * + * @param particles Particle position attribute + * @param fieldLayout Field layout (borrowed; only its local and global index ranges are read) + * @param mesh Uniform Cartesian mesh (borrowed; only its origin and spacing are read) + * @param tile_size Tile size per dimension + * @param kernel_width Interpolation kernel width + * + * @return Tuple of (permutation, bin_offsets, num_tiles) + */ + template + auto bin_particles( + const ParticleAttrib, ParticleProperties...>& particles, + const FieldLayout& fieldLayout, const UniformCartesian& mesh, + Vector tile_size, int kernel_width) { + using AttribType = std::decay_t; + using ExecSpace = typename AttribType::execution_space; + using memory_space = typename AttribType::memory_space; + + // Extract grid information + const NDIndex& lDom = fieldLayout.getLocalNDIndex(); + const NDIndex& gDom = fieldLayout.getDomain(); + + Vector ngrid_global; + Vector ngrid_local; + Vector local_offset; + for (unsigned d = 0; d < 
Dim; ++d) { + ngrid_global[d] = gDom[d].length(); + ngrid_local[d] = lDom[d].length(); + local_offset[d] = lDom[d].first(); + } + + // Compute number of tiles (+1 for boundary particles). + Vector num_tiles; + size_t total_tiles = 1; + for (unsigned d = 0; d < Dim; ++d) { + num_tiles[d] = (ngrid_local[d] + tile_size[d] - 1) / tile_size[d] + 1; + total_tiles *= num_tiles[d]; + } + + auto particle_view = particles.getView(); + const auto invdx = 1.0 / mesh.getMeshSpacing(); + const size_t n_particles = particles.getParticleCount(); + + auto& bufs = ippl::detail::getDefaultBinSortBuffers(); + // n_bins + 1 slots needed for bin_offsets and cursor + bufs.ensureCapacity(n_particles, total_tiles + 1); + + auto& permute = bufs.permute(); + auto& bin_offsets = bufs.binOffsets(); + auto& bin_keys = bufs.binKeys(); + auto& cursor = bufs.cursor(); + + bin_sort, ExecSpace>( + particle_view, ngrid_global, ngrid_local, local_offset, tile_size, + kernel_width, mesh.getOrigin(), invdx, permute, bin_offsets, bin_keys, + cursor, n_particles, num_tiles); + + return std::make_tuple(permute, bin_offsets, num_tiles); + } + + } // namespace detail + } // namespace Interpolation +} // namespace ippl + +#endif // IPPL_INTERPOLATION_BINNING_H diff --git a/src/Interpolation/CIC.h b/src/Interpolation/CIC.h index 5adac510d..ec1ea6534 100644 --- a/src/Interpolation/CIC.h +++ b/src/Interpolation/CIC.h @@ -9,7 +9,7 @@ namespace ippl { namespace detail { - /*! + /** * Computes the weight for a given point for a given axial direction * @tparam Point index of the point * @tparam Index index of the axis @@ -22,7 +22,7 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr typename Weights::value_type interpolationWeight( const Weights& wlo, const Weights& whi); - /*! 
+ /** * Computes the index for a given point for a given axis * @tparam Point index of the point * @tparam Index index of the axis @@ -34,7 +34,7 @@ namespace ippl { KOKKOS_INLINE_FUNCTION constexpr typename Indices::value_type interpolationIndex( const Indices& args); - /*! + /** * Scatters to a field at a single point * @tparam ScatterPoint the index of the point to which we are scattering * @tparam Index the sequence 0...Dim - 1 @@ -54,7 +54,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args, const T& val); - /*! + /** * Scatters the particle attribute to the field. * * The coordinates to which an attribute must be scattered is given by 2^n, @@ -81,7 +81,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args, T val = 1); - /*! + /** * Gathers from a field at a single point * @tparam GatherPoint the index of the point from which data is gathered * @tparam Index the sequence 0...Dim - 1 @@ -101,7 +101,7 @@ namespace ippl { const Vector& wlo, const Vector& whi, const Vector& args); - /*! + /** * Gathers the particle attribute from a field (see scatter_field for more details) * @tparam GatherPoint... the indices of the points from which to gather (sequence 0 to * 2^Dim) diff --git a/src/Interpolation/CIC.hpp b/src/Interpolation/CIC.hpp index cbe8064f1..d54b19d7a 100644 --- a/src/Interpolation/CIC.hpp +++ b/src/Interpolation/CIC.hpp @@ -1,9 +1,4 @@ -// -// Class CIC -// First order/cloud-in-cell grid interpolation. Currently implemented as -// global functions, but in order to support higher or lower order interpolation, -// these should be moved into structs. 
-// +#include namespace ippl { namespace detail { @@ -15,9 +10,6 @@ namespace ippl { } else { return whi[Index]; } - // device code cannot throw exceptions, but we need a - // dummy return to silence the warning - return 0; } template @@ -28,9 +20,6 @@ namespace ippl { } else { return args[Index]; } - // device code cannot throw exceptions, but we need a - // dummy return to silence the warning - return 0; } template ) and are installed centrally by InstallIppl.cmake. diff --git a/src/Interpolation/CoordinateTransform.h b/src/Interpolation/CoordinateTransform.h new file mode 100644 index 000000000..4716227c7 --- /dev/null +++ b/src/Interpolation/CoordinateTransform.h @@ -0,0 +1,138 @@ +#ifndef IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H +#define IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H + +#include + +#include "Types/Vector.h" + +#include "Index/Index.h" + +namespace ippl::Interpolation { + + // Forward declare size types from InterpolationUtil.h + using local_index_type = decltype(Index{}.first()); + using size_type = ippl::detail::size_type; + + /** + * @brief Unified coordinate transformation for scatter/gather operations + * + * This class handles the transformation from physical coordinates to grid + * coordinates using mesh information (origin and spacing). 
+ * + * @tparam T Floating-point type for coordinates + * @tparam Dim Spatial dimension + */ + template + struct CoordinateTransform { + using Vector_t = ippl::Vector; + using VectorInt_t = ippl::Vector; + + const Vector_t origin_; // Physical origin from mesh + const Vector_t invdx_; // Inverse of mesh spacing (1/dx) + const VectorInt_t ngrid_global_; // Global grid dimensions + + /** + * @brief Construct from mesh parameters + * + * @param origin Physical origin of the domain + * @param invdx Inverse mesh spacing (1/dx) + * @param ngrid_global Global grid dimensions + */ + KOKKOS_INLINE_FUNCTION CoordinateTransform(const Vector_t& origin, const Vector_t& invdx, + const VectorInt_t& ngrid_global) + : origin_(origin) + , invdx_(invdx) + , ngrid_global_(ngrid_global) {} + + /** + * @brief Transform physical position to (continuous) grid coordinates + * + * This function scales from physical domain to grid domain: (x - origin) / dx + * + * @param physical_pos Physical position in the specified dimension + * @param dim Dimension index + * @return Grid coordinate; no clamping or wrapping is applied, so it lies in [0, ngrid) only when the position is within the mesh extent + */ + KOKKOS_INLINE_FUNCTION T toGridCoordinate(T physical_pos, unsigned dim) const { + return (physical_pos - origin_[dim]) * invdx_[dim]; + } + + /*! + * @brief Compile-time dimension variant of toGridCoordinate. + * @tparam D Compile-time dimension index. + */ + template + KOKKOS_FORCEINLINE_FUNCTION T toGridCoordinate(T physical_pos) const { + return (physical_pos - origin_[D]) * invdx_[D]; + } + + /** + * @brief Round a grid coordinate to the cell index that anchors the stencil. + * + * Uses width-dependent rounding: + * - Odd width: round to nearest (symmetric stencil around the particle) + * - Even width: floor (asymmetric stencil) + * + * The stencil leftmost cell is then `center - (width - 1) / 2`, computed + * by `getStencilBase`. 
+ * + * @param grid_pos Grid coordinate (output of toGridCoordinate) + * @param width Kernel width + * @return Center cell index + */ + KOKKOS_INLINE_FUNCTION int getStencilCenter(T grid_pos, int width) const { + const bool odd = (width & 1); + return odd ? static_cast(Kokkos::round(grid_pos)) + : static_cast(Kokkos::floor(grid_pos)); + } + + /*! + * @brief Compile-time width variant of getStencilCenter. + * @tparam Width Compile-time kernel width. + */ + template + KOKKOS_FORCEINLINE_FUNCTION int getStencilCenter(T grid_pos) const { + if constexpr (Width & 1) + return static_cast(Kokkos::round(grid_pos)); + else + return static_cast(Kokkos::floor(grid_pos)); + } + + /** + * @brief Get base grid index for kernel stencil + * + * Uses width-dependent rounding to determine the base index: + * - Odd width: round to nearest (symmetric stencil around particle) + * - Even width: floor (asymmetric stencil) + * + * The stencil extends from [base_idx, base_idx + width). + * + * For odd widths (e.g., w=3): + * - Grid point at 2.7 rounds to 3 + * - Stencil covers indices [3-(3-1)/2, 3+(3-1)/2] = [2, 3, 4] + * + * For even widths (e.g., w=4): + * - Grid point at 2.7 floors to 2 + * - Stencil covers indices [2-(4-1)/2, 2+(4-1)/2+1] = [1, 2, 3, 4] (integer division) + * + * @param grid_pos Grid coordinate (output of toGridCoordinate) + * @param width Kernel width + * @return Base index for the kernel stencil (leftmost index) + */ + KOKKOS_INLINE_FUNCTION int getStencilBase(T grid_pos, int width) const { + return getStencilCenter(grid_pos, width) - (width - 1) / 2; + } + + /*! + * @brief Compile-time width variant of getStencilBase. + * @tparam Width Compile-time kernel width. 
+ */ + template + KOKKOS_FORCEINLINE_FUNCTION int getStencilBase(T grid_pos) const { + return getStencilCenter(grid_pos) - (Width - 1) / 2; + } + }; + +} // namespace ippl::Interpolation + +#endif // IPPL_INTERPOLATION_COORDINATE_TRANSFORM_H diff --git a/src/Interpolation/Gather/AtomicGather.h b/src/Interpolation/Gather/AtomicGather.h new file mode 100644 index 000000000..87bdf6a42 --- /dev/null +++ b/src/Interpolation/Gather/AtomicGather.h @@ -0,0 +1,129 @@ +/*! + * @file AtomicGather.h + * @brief Per-particle gather kernel. + * + * Each particle reads its W^Dim stencil from the field. The "Atomic" name is + * kept for symmetry with the scatter side; gather is read-only and never + * actually issues atomic operations. With @c UseSorting=true a permutation + * is applied so particles in the same tile read contiguous memory. + */ +#ifndef IPPL_ATOMIC_GATHER_H +#define IPPL_ATOMIC_GATHER_H + +#include + +#include "Interpolation/CoordinateTransform.h" +#include "Interpolation/Gather/GatherArgumentsBase.h" +#include "Interpolation/WidthDispatcher.h" + +namespace ippl::Interpolation::detail { + /*! + * @struct AtomicGather + * @brief Compile-time-width gather functor. + * @tparam W Compile-time kernel width. + * @tparam Types GatherTypes bundle. + * @tparam UseSorting When true, particles are pre-binned for better locality. 
+ */ + template + struct AtomicGather { + static constexpr bool requires_binning = UseSorting; + static constexpr unsigned Dim = Types::Dim; + + using RealType = typename Types::RealType; + using ValueType = typename Types::ValueType; + using memory_space = typename Types::memory_space; + using execution_space = typename Types::execution_space; + + struct Arguments : GatherArgumentsBase { + using PermuteView = Kokkos::View; + PermuteView permute; // Only used when UseSorting = true + + template + static Arguments create(const Field& field, const Positions& pos, Values& vals, + const Kernel& k, const GatherConfig& cfg, + const GatherBinningResult& binning = {}) { + Arguments a; + a.initBase(field, pos, vals, k, cfg.add_to_attribute); + if constexpr (UseSorting) { + a.permute = binning.permute; + } + return a; + } + }; + Arguments args; + + struct Stencil { + Kokkos::Array base; // Stencil leftmost indices in all dims + Kokkos::Array, Dim> kw; // Precomputed kernel evals + }; + + KOKKOS_INLINE_FUNCTION void operator()(size_t j) const { + using result_type = decltype(args.grid)::non_const_value_type; + + // Get actual particle index (sorted or direct) + const size_t p = UseSorting ? args.permute(j) : j; + + // Build stencil + CoordinateTransform transform{args.origin, args.invdx, args.n_grid}; + Stencil stencil{}; + for_constexpr(std::make_integer_sequence{}, [&]() { + const RealType g_pos = transform.toGridCoordinate(args.x(p)[d], d); + const RealType g_pos_cc = g_pos - RealType(0.5); + const int idx0 = transform.getStencilBase(g_pos_cc, W); + + stencil.base[d] = idx0 - args.local_offset[d] + args.nghost; + + auto& kernel_vals = stencil.kw[d]; + for (int i = 0; i < W; ++i) { + kernel_vals[i] = + args.kernel((g_pos - (RealType(idx0 + i) + RealType(0.5))) * args.inv_hw); + } + }); + + // Gather W^d stencil around non-uniform pt + result_type out = result_type(0); + auto rec = [&](auto&& self, RealType wprod, auto... 
idx) -> void { + const int bD = get(stencil.base); + const auto& kD = get(stencil.kw); + + for (int i = 0; i < W; ++i) { + const RealType w = wprod * kD[i]; + if constexpr (D == 0) { + out += args.grid(bD + i, idx...) * w; + } else { + self.template operator()(self, w, bD + i, idx...); + } + } + }; + rec.template operator()(rec, RealType(1)); + + if (args.add_to_attribute) { + if constexpr (std::is_same_v, result_type> + && std::is_same_v) { + args.values(p) = args.values(p) + out.real(); + } else { + args.values(p) = args.values(p) + out; + } + } else { + if constexpr (std::is_same_v, result_type> + && std::is_same_v) { + args.values(p) = out.real(); + } else { + args.values(p) = out; + } + } + } + + void run(size_t n_particles) { + auto policy = Kokkos::RangePolicy(0, n_particles); + // `Kokkos::Experimental::prefer` + DesiredOccupancy is still in + // the Experimental namespace (Kokkos 5.x). The interface may + // move; if it does, drop the prefer() and pass `policy` directly. + auto const policy_tuned = Kokkos::Experimental::prefer( + policy, Kokkos::Experimental::DesiredOccupancy{Kokkos::AUTO}); + Kokkos::parallel_for("AtomicGather", policy_tuned, *this); + } + }; +} // namespace ippl::Interpolation::detail + +#endif // IPPL_ATOMIC_GATHER_H diff --git a/src/Interpolation/Gather/Gather.h b/src/Interpolation/Gather/Gather.h new file mode 100644 index 000000000..75c2b8961 --- /dev/null +++ b/src/Interpolation/Gather/Gather.h @@ -0,0 +1,149 @@ +/*! + * @file Gather.h + * @brief Public Gather facade dispatching to AtomicGather (sorted/unsorted). + */ +#ifndef IPPL_GATHER_H +#define IPPL_GATHER_H + +#include "Utility/IpplException.h" + +#include "Interpolation/Binning.h" +#include "Interpolation/Gather/AtomicGather.h" +#include "Interpolation/Gather/GatherArgumentsBase.h" +#include "Interpolation/Gather/GatherConfig.h" +#include "Interpolation/WidthDispatcher.h" +#include "Particle/ParticleAttrib.h" + +namespace ippl { + + namespace Interpolation::detail { + /*! 
+ * @struct DeduceGatherTypes + * @brief Resolve a GatherTypes bundle from the user-facing field / + * positions / values views and the kernel. + */ + template + struct DeduceGatherTypes { + using FieldTr = ippl::detail::FieldTraits>; + using PosTr = ippl::detail::AttribTraits>; + using ValTr = ippl::detail::AttribTraits>; + + // Use the kernel's own value_type for the geometric real-precision + // computations (origin, invdx, weights). When the position attribute + // is float but the mesh is double, taking RealType from the position + // would silently downcast the mesh spacing and lose precision; the + // kernel knows the correct working precision and the legacy CIC + // path uses the mesh type explicitly via Kernel = LinearKernel. + using RealType = typename std::decay_t::value_type; + + using type = GatherTypes, + typename FieldTr::view_type, typename PosTr::view_type, + typename ValTr::view_type>; + }; + + template + using DeducedGatherTypes = + typename DeduceGatherTypes::type; + + } // namespace Interpolation::detail + + /*! + * @class Gather + * @brief Public functor that interpolates a Field at particle positions. + * + * Constructed with a kernel and an optional GatherConfig. The call + * operator invokes the configured backend (Atomic / AtomicSort) via + * WidthDispatcher so the kernel width is known at compile time inside + * the inner loop. + * + * @tparam Kernel Interpolation kernel type. + * @tparam Dim Spatial dimension. + */ + template + class Gather { + public: + /*! + * @param kernel Kernel instance used for stencil evaluation. + * @param config Method / tile-size / sort overrides (optional). + */ + Gather(const Kernel& kernel, const Interpolation::GatherConfig& config = {}) + : kernel_m(kernel) + , config_m(config) {} + + /*! + * @brief Gather field values at particle positions into @p values. + * @param field Input field (read-only; halo is filled before reads). + * @param positions Particle positions. 
+ * @param values Output values (overwritten, unless the GatherConfig requests accumulation via add_to_attribute). + */ + template + void operator()(Field& field, + const ParticleAttrib, PosProps...>& positions, + ParticleAttrib& values) { + using Types = + Interpolation::detail::DeducedGatherTypes; + + switch (config_m.method) { + case Interpolation::GatherMethod::Atomic: + dispatch(field, positions, values); + break; + case Interpolation::GatherMethod::AtomicSort: + dispatch(field, positions, values); + break; + default: + throw IpplException("Gather", "Unknown GatherMethod"); + } + } + + private: + template