diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md new file mode 100644 index 000000000..ba0e88300 --- /dev/null +++ b/PR501_SPLIT_MAP.md @@ -0,0 +1,365 @@ +# PR 501 Split Map + +Source PR: https://github.com/IPPL-framework/ippl/pull/501 + +Local mapping branch: `pr-501-map` + +Compared against: `origin/master` at `da43fd66a18848b65dcd82af90001f5b64a798ab` + +Overall size: + +- 41 commits +- 149 files changed +- 23,231 insertions +- 2,205 deletions + +## Subsystem Size + +| Subsystem | Files | Additions | Deletions | +| --- | ---: | ---: | ---: | +| Interpolation / Autotune | 33 | 8,138 | 20 | +| FFT / NUFFT | 24 | 6,571 | 814 | +| Particle | 21 | 3,841 | 757 | +| Alpine / PIF examples | 12 | 1,923 | 37 | +| Utility | 11 | 1,424 | 45 | +| Communicate | 14 | 626 | 156 | +| Solvers | 6 | 394 | 253 | +| Field / Halo / Layout | 11 | 116 | 87 | +| Build / CMake | 4 | 117 | 12 | +| Types | 2 | 35 | 4 | +| FEM | 3 | 12 | 13 | +| Other | 8 | 34 | 7 | + +The review load is dominated by three areas: + +- Interpolation / Autotune: new scatter/gather dispatch, binning, tiling, tuning cache. +- FFT / NUFFT: backend/transform refactor plus native NUFFT and pruned transforms. +- Particle: new spatial update path, send/recv serialization hooks, sorting/buffer reuse. + +## Commit Shape + +The branch starts with well-labeled subsystem commits: + +| Commit | Area | Notes | +| --- | --- | --- | +| `7cff674a` | Build / dependencies | Adds `IPPL_ENABLE_FINUFFT`, `IPPL_ENABLE_CUFFTMP`, autotune preset plumbing. | +| `13950c4f` | FFT / NUFFT | Large FFT backend/transform split plus native NUFFT engine. | +| `dbbe59a4` | Interpolation | Scatter/gather redesign, binning, tiling, autotune runtime cache. | +| `2878b90b` | Particle / halo | Rewrites `ParticleSpatialLayout`, adds sort buffers, threads `nghost` through field layout/halo. | +| `9507a796` | Utility | Timing overhaul, `BufferView`, `Tuning`, `ParallelDispatch`, type utilities. | +| `f2820064` | Communicate | Serialization hooks, shared buffer handler. | +| `63d6ccf2` | Integration glue | Field/solver/type updates for new FFT and particle layout APIs. | +| `99d21ba7` | Alpine PIF | Adds ElectrostaticPIF examples. | +| `f0560f76` | Unit tests | Adds NUFFT, interpolation, binning, particle update tests. | +| `927cac7b` | Integration tests | Updates particle benchmark and adds scaling benchmark. | + +After that, the branch has many fixup/regression commits. Several are descriptive and likely fold into the earlier subsystem commits; some are separable: + +| Commit | Area | Split relevance | +| --- | --- | --- | +| `40438d17` | PCG / preconditioner | Good candidate for an independent PR: hoists per-iteration allocations out of solver loop. | +| `be5f794c` | PCG / PoissonCG | Follow-up to the PCG allocation PR: pass `Field` by reference through `OperatorF`. | +| `95762403` | PIF examples | Consolidates pruned variants into parameterized source. Belongs with PIF examples. | +| `0e7bc58d`, `6c1c48ad` | FFT / NUFFT | FINUFFT guards for CUDA + 2D unit-test build. Belong with NUFFT/FINUFFT PR. | +| `cec1ca61` | Cleanup | Doxygen, formatting, size-type consistency. Should be folded into relevant split PRs or kept as a final cleanup PR. | + +## Dependency Observations + +### NUFFT Depends On Interpolation + +`src/FFT/NUFFT/NativeNUFFT.h` includes and uses: + +- `Interpolation/Scatter/ScatterConfig.h` +- `Interpolation/Gather/GatherConfig.h` +- scatter/gather kernels through the new interpolation dispatch + +That means native NUFFT cannot be cleanly reviewed before the higher-order scatter/gather layer. + +### PIF Depends On NUFFT + +The Alpine PIF examples use: + +- `ippl::FFT` +- `scatterPIFNUFFT` +- `gatherPIFNUFFT` + +So PIF examples should come after the NUFFT transform API. + +### Particle Update And Communicate Are Coupled + +The particle update rewrite uses: + +- `ParticleAttrib::serialize` / `deserialize` +- shared `BufferHandler` +- direct archive serialization +- reusable send/recv buffers + +This suggests communication and particle update may be one PR unless a clean lower-level communication PR is extracted first. + +### `nghost` / Halo Changes Cross Particle And Field + +`ParticleSpatialLayout` changes also thread `nghost` through: + +- `FieldLayout` +- `HaloCells` +- field layout APIs + +This is small in line count but high in integration risk. It should be explicitly called out in whichever PR contains particle update. + +### PCG Allocation Fix Looks Separately Reviewable + +The late commits: + +- `40438d17` +- `be5f794c` + +only touch: + +- `src/LinearSolvers/PCG.h` +- `src/LinearSolvers/Preconditioner.h` +- `src/PoissonSolvers/PoissonCG.h` + +This can likely become an independent performance/regression PR, separate from PIF. + +## Candidate Split + +### PR 1: PCG Allocation / Solver Performance + +Scope: + +- `src/LinearSolvers/PCG.h` +- `src/LinearSolvers/Preconditioner.h` +- `src/PoissonSolvers/PoissonCG.h` + +Candidate commits: + +- `40438d17` +- `be5f794c` + +Reason: + +- Smallest coherent split. +- Directly addresses the reported PCG slowdown / repeated `cudaMalloc` concern. +- Does not conceptually depend on PIF, NUFFT, or interpolation. + +Risk: + +- Need to verify solver convergence and exact iteration behavior. + +### PR 2: Communication And Particle Update Infrastructure + +Scope: + +- `src/Communicate/*` +- `src/Particle/ParticleSpatialLayout*` +- `src/Particle/ParticleAttrib*` +- `src/Particle/ParticleBase*` +- `src/Particle/ParticleSort.h` +- `src/Particle/SortBuffer.h` +- `src/FieldLayout/FieldLayout*` +- `src/Field/HaloCells*` +- particle update tests + +Candidate commits: + +- `2878b90b` +- `f2820064` +- relevant parts of `63d6ccf2` +- relevant parts of `f0560f76` +- relevant fixup commits after `728af370` + +Reason: + +- Captures the PIC scaling improvement that is not PIF-specific. +- Reviewable independently from FFT/NUFFT algorithms. + +Risk: + +- This is still a substantial behavioral change. +- Needs multi-rank CPU/GPU particle update and `ParticleSendRecv` coverage. + +### PR 3: Higher-Order Scatter/Gather And Autotune + +Scope: + +- `src/Interpolation/*` +- `cmake/AutoTunePresets.cmake` +- `cmake/IpplAutoTunePresets.h.in` +- `cmake/auto_tune/*` +- interpolation tests + +Candidate commits: + +- `7cff674a` only for autotune preset plumbing, not FINUFFT/CUFFTMP if separable. +- `dbbe59a4` +- interpolation portions of `f0560f76` +- `dacdcd9a` if routing legacy CIC through the new framework is included. + +Reason: + +- This is the algorithmic layer native NUFFT requires. +- Can be validated without PIF examples. + +Risk: + +- `dacdcd9a` changes existing CIC scatter/gather behavior by routing legacy paths through the new framework. That may be better as a second PR after the new interpolation layer lands. + +### PR 4: FFT Backend / Transform Refactor + +Scope: + +- `src/FFT/Backend/*` +- `src/FFT/Transform/CC.h` +- `src/FFT/Transform/RC.h` +- `src/FFT/Transform/Trig.h` +- `src/FFT/Transform/PrunedCC.h` +- `src/FFT/Transform/PrunedRC.h` +- `src/FFT/Transform/Common.h` +- `src/FFT/Traits.h` +- existing FFT test updates + +Candidate commits: + +- FFT refactor parts of `13950c4f` +- FFT portions of `f0560f76` +- FFT portions of later build fixes + +Reason: + +- Gives reviewers the backend/transform API without also reviewing NUFFT and PIF. + +Risk: + +- Current commit `13950c4f` mixes backend refactor with native NUFFT files. This split likely requires path-based extraction or commit surgery. + +### PR 5: Native NUFFT And FINUFFT/cuFINUFFT Integration + +Scope: + +- `src/FFT/NUFFT/*` +- `src/FFT/Transform/NUFFT.*` +- FINUFFT/CUFFTMP build switches if not already merged +- NUFFT tests + +Candidate commits: + +- NUFFT portions of `13950c4f` +- FINUFFT/CUFFTMP portions of `7cff674a` +- NUFFT portions of `f0560f76` +- `0e7bc58d` +- `6c1c48ad` + +Reason: + +- NUFFT depends on PR 3 scatter/gather and PR 4 transform structure. + +Risk: + +- Native NUFFT depends on new scatter/gather. +- FINUFFT path has Dim-guard complexity and external dependency complexity. + +### PR 6: PIF / Alpine Examples + +Scope: + +- `alpine/ElectrostaticPIF/*` +- Alpine manager wiring +- PIF-specific particle attrib convenience APIs if not included in NUFFT PR + +Candidate commits: + +- `99d21ba7` +- `95762403` +- PIF/example fixups + +Reason: + +- Examples should be reviewed after core APIs are accepted. + +Risk: + +- PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT. + +## Minimal Split Alternative + +If six PRs is too much process overhead, a practical minimum is: + +1. PCG allocation fix. +2. Particle communication/update infrastructure. +3. Scatter/gather + FFT backend + NUFFT core. +4. Alpine PIF examples and benchmarks. + +This is less clean, but still much better than one 23k-line PR. + +## Next Mapping Step + +Prototype extraction order: + +1. Create a branch from `origin/master` for PCG. +2. Cherry-pick `40438d17` and `be5f794c`. +3. Build and run solver tests. +4. If clean, this becomes the first small PR. + +Then try the particle/communication split because it is probably the most valuable non-PIF improvement and has direct tests: + +- `unit_tests/Particle/ParticleUpdate.cpp` +- `unit_tests/Particle/ParticleUpdateNonuniform.cpp` +- existing `ParticleSendRecv` +- existing `GatherScatterTest` + +## Progress + +### 2026-05-31: PR 1 Prototype - PCG Allocation / Solver Performance + +Status: extracted cleanly. + +Worktree: + +```text +/private/tmp/ippl-pr501-pcg +``` + +Branch: + +```text +pr501-pcg-split +``` + +Base: + +```text +origin/master @ da43fd66a18848b65dcd82af90001f5b64a798ab +``` + +Cherry-picked commits: + +- `40438d17` -> `a76097f0` (`PCG/Preconditioner: hoist per-iteration allocations out of solve loop`) +- `be5f794c` -> `092a04e5` (`PCG: pass Field by reference through OperatorF`) + +Resulting diff: + +- `src/LinearSolvers/PCG.h` +- `src/LinearSolvers/Preconditioner.h` +- `src/PoissonSolvers/PoissonCG.h` +- 3 files changed, 388 insertions, 250 deletions + +Validation: + +```text +cmake -S . -B build-pcg-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1 +cmake --build build-pcg-debug +ctest --test-dir build-pcg-debug --output-on-failure -O build-pcg-debug/ctest-rank1.log +``` + +Result: + +```text +100% tests passed, 0 tests failed out of 34 +Total Test time (real) = 54.17 sec +``` + +Assessment: + +- This is a good standalone first PR. +- It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop. +- Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms. diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h index 7525f9af1..5e03a0712 100644 --- a/alpine/BumponTailInstabilityManager.h +++ b/alpine/BumponTailInstabilityManager.h @@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager { // sample first nlocBulk with muBulk as mean velocity mu[Dim - 1] = muBulk_m; Kokkos::parallel_for(Kokkos::RangePolicy(0, nlocBulk), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); // sample remaining with muBeam as mean velocity mu[Dim - 1] = muBeam_m; Kokkos::parallel_for(Kokkos::RangePolicy(nlocBulk, nlocal), - ippl::random::randn(*P, rand_pool64, mu, sd)); + ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h index b8491c7a5..23dcc632a 100644 --- a/alpine/LandauDampingManager.h +++ b/alpine/LandauDampingManager.h @@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double mu[Dim]; double sd[Dim]; @@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager { mu[i] = 0.0; sd[i] = 1.0; } - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, mu, sd)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, mu, sd)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h index b721f3ec8..c7e81de43 100644 --- a/alpine/PenningTrapManager.h +++ b/alpine/PenningTrapManager.h @@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager { this->pcontainer_m->create(nlocal); - view_type* R = &(this->pcontainer_m->R.getView()); - samplingR.generate(*R, rand_pool64); + view_type R = this->pcontainer_m->R.getView(); + samplingR.generate(R, rand_pool64); - view_type* P = &(this->pcontainer_m->P.getView()); + view_type P = this->pcontainer_m->P.getView(); double muP[Dim] = {0.0, 0.0, 0.0}; double sdP[Dim] = {1.0, 1.0, 1.0}; - Kokkos::parallel_for(nlocal, ippl::random::randn(*P, rand_pool64, muP, sdP)); + Kokkos::parallel_for(nlocal, ippl::random::randn(P, rand_pool64, muP, sdP)); Kokkos::fence(); ippl::Comm->barrier(); diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h index bdb65778d..bf2c84002 100644 --- a/src/LinearSolvers/PCG.h +++ b/src/LinearSolvers/PCG.h @@ -20,12 +20,12 @@ namespace ippl { public: using typename Base::lhs_type, typename Base::rhs_type; - using OperatorF = std::function; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; - using DiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; + using DiagF = std::function; using mesh_type = typename lhs_type::Mesh_t; using layout_type = typename lhs_type::Layout_t; @@ -188,7 +188,10 @@ namespace ippl { T residueNorm = 0; int iterations_m = 0; - private: + // Workspaces, allocated once via initializeFields() and reused across + // solves. Protected so derived solvers (e.g. PCG) can extend the + // workspace set without redeclaring r, d, q as locals on every + // operator() call. lhs_type r; lhs_type d; lhs_type q; @@ -203,11 +206,11 @@ namespace ippl { public: using typename Base::lhs_type, typename Base::rhs_type; - using OperatorF = std::function; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; virtual ~CG() = default; @@ -327,18 +330,37 @@ namespace ippl { public: using typename Base::lhs_type, typename Base::rhs_type; - using OperatorF = std::function; - using LowerF = std::function; - using UpperF = std::function; - using UpperLowerF = std::function; - using InverseDiagF = std::function; - using DiagF = std::function; + using OperatorF = std::function; + using LowerF = std::function; + using UpperF = std::function; + using UpperLowerF = std::function; + using InverseDiagF = std::function; + using DiagF = std::function; + + using mesh_type = typename lhs_type::Mesh_t; + using layout_type = typename lhs_type::Layout_t; PCG() : CG() , preconditioner_m(nullptr){}; + /* + * Extends the CG workspace with the preconditioner result buffers s and + * pcond_out so operator() does not allocate per solve. pcond_out keeps + * its default NoBcFace BCs: the preconditioner's internal operator + * chain therefore never triggers PeriodicFace::apply MPI calls -- if it + * did, the global MPI sequence would diverge from the master code path + * (where the preconditioner returned a fresh NoBcFace field) and + * intermittent multi-rank halo deadlocks would follow. + */ + void initializeFields(mesh_type& mesh, layout_type& layout) override { + CG::initializeFields(mesh, layout); + s.initialize(mesh, layout); + pcond_out.initialize(mesh, layout); + } + /*! * Sets the differential operator for the conjugate gradient algorithm * @param op A function that returns OpRet and takes a field of the LHS type @@ -422,19 +444,26 @@ namespace ippl { "Preconditioner has not been set for PCG solver"); } - typename lhs_type::Mesh_t& mesh = lhs.get_mesh(); - typename lhs_type::Layout_t& layout = lhs.getLayout(); - this->iterations_m = 0; const int maxIterations = params.get("max_iterations"); // Variable names mostly based on description in // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf - lhs_type r(mesh, layout); - lhs_type d(mesh, layout); - lhs_type s(mesh, layout); - lhs_type q(mesh, layout); - + // r, d, q come from the CG base class; s and pcond_out are PCG + // members. All are pre-allocated via initializeFields(); operator() + // only refreshes their layout so a load-balance repartition is + // tracked. The preconditioner scratch must follow the lhs layout + // too, otherwise its halo-exchange neighbor list goes out of sync + // with r/d/s/q after a repartition and halo MPI calls deadlock. + this->r.updateLayout(lhs.getLayout()); + this->d.updateLayout(lhs.getLayout()); + s.updateLayout(lhs.getLayout()); + pcond_out.updateLayout(lhs.getLayout()); + this->q.updateLayout(lhs.getLayout()); + + // Each preconditioner's init_fields() is responsible for being + // cheap on the steady-state path (refreshing layout in-place, not + // reallocating). preconditioner_m->init_fields(lhs); using bc_type = BConds; @@ -459,20 +488,26 @@ namespace ippl { } } - r = rhs - this->op_m(lhs); - d = preconditioner_m->operator()(r).deepCopy(); - d.setFieldBC(bc); + this->r = rhs - this->op_m(lhs); + // pcond_out keeps its default NoBcFace BCs so the preconditioner's + // internal operator chain does not trigger PeriodicFace::apply MPI; + // the expression assignment then folds it into d (which gets the + // periodic BCs via setFieldBC below). + (*preconditioner_m)(this->r, pcond_out); + this->d = T(1) * pcond_out; + this->d.setFieldBC(bc); - T delta1 = innerProduct(r, d); + T delta1 = innerProduct(this->r, this->d); T delta0 = delta1; this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1)); const T tolerance = params.get("tolerance") * this->residueNorm; while (this->iterations_m < maxIterations && this->residueNorm > tolerance) { - q = this->op_m(d); - q = q.deepCopy(); - T alpha = delta1 / innerProduct(d, q); - lhs = lhs + alpha * d; + // op_m(d) writes its expression into q's existing storage; no + // allocation, no per-iteration deep copy. + this->q = this->op_m(this->d); + T alpha = delta1 / innerProduct(this->d, this->q); + lhs = lhs + alpha * this->d; // The exact residue is given by // r = rhs - BaseCG::op_m(lhs); @@ -481,16 +516,17 @@ namespace ippl { // the correction does not have a significant effect on accuracy; // in some implementations, the correction may be applied every few // iterations to offset accumulated floating point errors - r = r - alpha * q; - s = preconditioner_m->operator()(r).deepCopy(); + this->r = this->r - alpha * this->q; + // s := M^{-1} r; preconditioner writes into s (NoBcFace). + (*preconditioner_m)(this->r, s); delta0 = delta1; - delta1 = innerProduct(r, s); + delta1 = innerProduct(this->r, s); T beta = delta1 / delta0; this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1)); - d = s + beta * d; + this->d = s + beta * this->d; ++this->iterations_m; } @@ -502,6 +538,13 @@ namespace ippl { protected: std::unique_ptr> preconditioner_m; + + // Preconditioner result buffers, allocated once via initializeFields() + // and reused across solves. Both deliberately keep their default + // NoBcFace BCs so the preconditioner's internal operator chain does + // NOT trigger PeriodicFace::apply MPI calls. + lhs_type s; + lhs_type pcond_out; }; }; // namespace ippl diff --git a/src/LinearSolvers/Preconditioner.h b/src/LinearSolvers/Preconditioner.h index 09eb876aa..2978fb111 100644 --- a/src/LinearSolvers/Preconditioner.h +++ b/src/LinearSolvers/Preconditioner.h @@ -7,13 +7,23 @@ #ifndef IPPL_PRECONDITIONER_H #define IPPL_PRECONDITIONER_H +#include + #include "Expression/IpplOperations.h" // get the function apply() // Expands to a lambda that acts as a wrapper for a differential operator // fun: the function for which to create the wrapper, such as ippl::laplace // type: the argument type, which should match the LHS type for the solver +// +// IMPORTANT: takes the field by *reference*, not by value. A by-value +// signature would copy the Field on every call -- BareField has shared- +// ownership Kokkos::View members, so the data isn't actually copied, but +// the embedded HaloCells::haloData_m buffer that the halo exchange grows +// via Kokkos::realloc only grows on the copy and never propagates back to +// the original. The result was a fresh cudaMalloc per op_m() call in +// multi-rank runs (see nsys profile in ~/prof/pif-pr-verify). #define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \ - [](type arg) { \ + [](type& arg) { \ return fun(arg); \ } @@ -32,17 +42,18 @@ namespace ippl { virtual ~preconditioner() = default; - // Placeholder for the function operator, actually implemented in the derived classes - virtual Field operator()(Field& u) { - Field res = u.deepCopy(); - return res; + // Apply the preconditioner: result = M^{-1} u. Concrete preconditioners + // override this to write into the caller-provided result buffer; this + // avoids per-call Field allocations and per-call deep copies in PCG. + // The default (identity) preconditioner copies u into result. + virtual void operator()(Field& u, Field& result) { + Kokkos::deep_copy(result.getView(), u.getView()); } - // Placeholder for setting additional fields, actually implemented in the derived classes - virtual void init_fields(Field& b) { - Field res = b.deepCopy(); - return; - } + // Allocate any scratch fields the preconditioner needs. Called once + // by the owning solver after the layout is known. Concrete + // preconditioners that need scratch override this. + virtual void init_fields(Field& /*b*/) {} std::string get_type() { return type_m; }; @@ -66,14 +77,23 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& u) override { - mesh_type& mesh = u.get_mesh(); - layout_type& layout = u.getLayout(); - Field res(mesh, layout); - - res = inverse_diagonal_m(u); - res = w_m * res; - return res; + void operator()(Field& u, Field& result) override { + // result = w * D^{-1} * u, written element-wise into the caller- + // provided result buffer. Two forms of inverse_diagonal_m are + // supported, matching the convention used by the other + // preconditioners in this file: + // - returns a scalar (e.g. uniform-mesh Poisson Laplacian): + // result = w * scalar * u + // - returns a Field / expression equal to D^{-1} * u + // (e.g. matrix-free FEM operators): + // result = w * inverse_diagonal_m(u) + if constexpr (std::is_same_v>) { + const double scale = w_m * inverse_diagonal_m(u); + result = scale * u; + } else { + result = inverse_diagonal_m(u); + result = w_m * result; + } } protected: @@ -125,12 +145,64 @@ namespace ippl { return *this = polynomial_newton_preconditioner(other); } - Field recursive_preconditioner(Field& u, unsigned int level) { - mesh_type& mesh = u.get_mesh(); - layout_type& layout = u.getLayout(); - // Define etas if not defined yet + // Recursive Newton expansion P_k(u) where: + // P_0(u) = eta_0 * u + // P_k(u) = eta_k * (2 P_{k-1}(u) - P_{k-1}(A P_{k-1}(u))) + // Writes the result of level `level` into `out`. Uses one scratch + // field per recursion depth (Pr, PA, PAPr) so that no Field is + // allocated per call. The two recursive calls at each level execute + // sequentially and may reuse scratch at lower depths because the + // first call's lower-depth values have already been folded into + // Pr_scratch[level] before the second call begins. + void recursive_preconditioner(Field& u, unsigned int level, Field& out) { + // Timers accumulated across all levels of the recursion: useful + // for "how much of pcond is laplace vs combine vs leaf-assign". + // Counts include every call at every level (so the call-count + // column equals 2^(level_m+1)-1, not the number of operator() + // invocations). + static IpplTimings::TimerRef t_total = IpplTimings::getTimer("newton/total"); + static IpplTimings::TimerRef t_leaf = IpplTimings::getTimer("newton/leaf_assign"); + static IpplTimings::TimerRef t_op = IpplTimings::getTimer("newton/op_m"); + static IpplTimings::TimerRef t_comb = IpplTimings::getTimer("newton/combine"); + + IpplTimings::startTimer(t_total); + if (level == 0) { + IpplTimings::startTimer(t_leaf); + out = eta_m[0] * u; + IpplTimings::stopTimer(t_leaf); + IpplTimings::stopTimer(t_total); + return; + } + Field& Pr = Pr_scratch_m[level]; + Field& PA = PA_scratch_m[level]; + Field& PAPr = PAPr_scratch_m[level]; + + // Inner recursive call owns its own timing; we don't double-count + // it here. + recursive_preconditioner(u, level - 1, Pr); + IpplTimings::startTimer(t_op); + PA = op_m(Pr); + IpplTimings::stopTimer(t_op); + recursive_preconditioner(PA, level - 1, PAPr); + IpplTimings::startTimer(t_comb); + out = eta_m[level] * (2.0 * Pr - PAPr); + IpplTimings::stopTimer(t_comb); + IpplTimings::stopTimer(t_total); + } + + void operator()(Field& u, Field& result) override { + // One outer timer per polynomial_newton apply, complementing the + // per-call recursion timers. Lets us see how many outer pcond + // calls a solve does vs. the time inside each. + static IpplTimings::TimerRef t_apply = IpplTimings::getTimer("newton/apply"); + IpplTimings::startTimer(t_apply); + recursive_preconditioner(u, level_m, result); + IpplTimings::stopTimer(t_apply); + } + + void init_fields(Field& b) override { + // One-shot precomputation of the eta coefficients. if (eta_m == nullptr) { - // Precompute the etas for later use eta_m = new double[level_m + 1]; eta_m[0] = 2.0 / ((alpha_m + beta_m) * (1.0 + zeta_m)); if (level_m > 0) { @@ -143,25 +215,31 @@ namespace ippl { } } - Field res(mesh, layout); - // Base case - if (level == 0) { - res = eta_m[0] * u; - return res; + // First call: allocate one scratch field per recursion depth. + // Subsequent calls: refresh layout in place so a load-balance + // repartition tracks correctly without freeing/reallocating + // when the local extents are unchanged. + mesh_type& mesh = b.get_mesh(); + layout_type& layout = b.getLayout(); + if (!fields_initialized_m) { + Pr_scratch_m.resize(level_m + 1); + PA_scratch_m.resize(level_m + 1); + PAPr_scratch_m.resize(level_m + 1); + for (unsigned int i = 1; i <= level_m; ++i) { + Pr_scratch_m[i] = Field(mesh, layout); + PA_scratch_m[i] = Field(mesh, layout); + PAPr_scratch_m[i] = Field(mesh, layout); + } + fields_initialized_m = true; + } else { + for (unsigned int i = 1; i <= level_m; ++i) { + Pr_scratch_m[i].updateLayout(layout); + PA_scratch_m[i].updateLayout(layout); + PAPr_scratch_m[i].updateLayout(layout); + } } - // Recursive case - Field PAPr(mesh, layout); - Field Pr(mesh, layout); - - Pr = recursive_preconditioner(u, level - 1); - PAPr = op_m(Pr); - PAPr = recursive_preconditioner(PAPr, level - 1); - res = eta_m[level] * (2.0 * Pr - PAPr); - return res; } - Field operator()(Field& u) override { return recursive_preconditioner(u, level_m); } - protected: OperatorF op_m; // Operator to be preconditioned double alpha_m; // Smallest Eigenvalue @@ -170,6 +248,10 @@ namespace ippl { double zeta_m; // smallest (alpha + beta) is multiplied by (1+zeta) to avoid clustering of // Eigenvalues double* eta_m = nullptr; // Size is determined at runtime + std::vector Pr_scratch_m; + std::vector PA_scratch_m; + std::vector PAPr_scratch_m; + bool fields_initialized_m = false; }; /*! @@ -219,19 +301,65 @@ namespace ippl { return *this = polynomial_chebyshev_preconditioner(other); } - Field operator()(Field& r) override { - mesh_type& mesh = r.get_mesh(); - layout_type& layout = r.getLayout(); + void operator()(Field& r, Field& result) override { + // x_m, x_old_m, A_m, z_m are pre-allocated scratch (init_fields). + // Coefficients in rho_m are also computed once. + static IpplTimings::TimerRef t_apply = IpplTimings::getTimer("chebyshev/apply"); + static IpplTimings::TimerRef t_init = IpplTimings::getTimer("chebyshev/init_step"); + static IpplTimings::TimerRef t_op = IpplTimings::getTimer("chebyshev/op_m"); + static IpplTimings::TimerRef t_combo = IpplTimings::getTimer("chebyshev/combine"); + static IpplTimings::TimerRef t_copy = IpplTimings::getTimer("chebyshev/deep_copy"); + + IpplTimings::startTimer(t_apply); + + IpplTimings::startTimer(t_init); + x_old_m = r / theta_m; + IpplTimings::stopTimer(t_init); + IpplTimings::startTimer(t_op); + A_m = op_m(r); + IpplTimings::stopTimer(t_op); + IpplTimings::startTimer(t_combo); + x_m = 2.0 * rho_m[1] / delta_m * (2.0 * r - A_m / theta_m); + IpplTimings::stopTimer(t_combo); + + if (degree_m == 0) { + // result = x_old + IpplTimings::startTimer(t_copy); + Kokkos::deep_copy(result.getView(), x_old_m.getView()); + IpplTimings::stopTimer(t_copy); + IpplTimings::stopTimer(t_apply); + return; + } - Field res(mesh, layout); - Field x(mesh, layout); - Field x_old(mesh, layout); - Field A(mesh, layout); - Field z(mesh, layout); + if (degree_m == 1) { + // result = x + IpplTimings::startTimer(t_copy); + Kokkos::deep_copy(result.getView(), x_m.getView()); + IpplTimings::stopTimer(t_copy); + IpplTimings::stopTimer(t_apply); + return; + } + for (unsigned int i = 2; i < degree_m + 1; ++i) { + IpplTimings::startTimer(t_op); + A_m = op_m(x_m); + IpplTimings::stopTimer(t_op); + IpplTimings::startTimer(t_combo); + z_m = 2.0 / delta_m * (r - A_m); + // Write the new x value into result (the caller's buffer); + // x_old gets a deep copy of the previous x. + result = rho_m[i] * (2 * sigma_m * x_m - rho_m[i - 1] * x_old_m + z_m); + IpplTimings::stopTimer(t_combo); + IpplTimings::startTimer(t_copy); + Kokkos::deep_copy(x_old_m.getView(), x_m.getView()); + Kokkos::deep_copy(x_m.getView(), result.getView()); + IpplTimings::stopTimer(t_copy); + } + IpplTimings::stopTimer(t_apply); + } - // Precompute the coefficients if not done yet + void init_fields(Field& b) override { + // One-shot precomputation of the rho coefficients. if (rho_m == nullptr) { - // Start precomputing the coefficients theta_m = (beta_m + alpha_m) / 2.0 * (1.0 + zeta_m); delta_m = (beta_m - alpha_m) / 2.0; sigma_m = theta_m / delta_m; @@ -241,29 +369,23 @@ namespace ippl { for (unsigned int i = 1; i < degree_m + 1; ++i) { rho_m[i] = 1.0 / (2.0 * sigma_m - rho_m[i - 1]); } - } // End of precomputing the coefficients - - res = r.deepCopy(); - - x_old = r / theta_m; - A = op_m(r); - x = 2.0 * rho_m[1] / delta_m * (2.0 * r - A / theta_m); - - if (degree_m == 0) { - return x_old; - } - - if (degree_m == 1) { - return x; } - for (unsigned int i = 2; i < degree_m + 1; ++i) { - A = op_m(x); - z = 2.0 / delta_m * (r - A); - res = rho_m[i] * (2 * sigma_m * x - rho_m[i - 1] * x_old + z); - x_old = x.deepCopy(); - x = res.deepCopy(); + mesh_type& mesh = b.get_mesh(); + layout_type& layout = b.getLayout(); + // First call allocates; subsequent calls refresh the layout so a + // repartition is tracked without throwing the storage away. + if (!fields_initialized_m) { + x_m = Field(mesh, layout); + x_old_m = Field(mesh, layout); + A_m = Field(mesh, layout); + z_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + x_m.updateLayout(layout); + x_old_m.updateLayout(layout); + A_m.updateLayout(layout); + z_m.updateLayout(layout); } - return res; } protected: @@ -276,6 +398,11 @@ namespace ippl { unsigned degree_m; double zeta_m; double* rho_m = nullptr; // Size is determined at runtime + Field x_m; + Field x_old_m; + Field A_m; + Field z_m; + bool fields_initialized_m = false; }; /*! @@ -295,22 +422,18 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& r) override { - - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. + void operator()(Field& r, Field& result) override { + // Richardson iteration in-place on the caller-provided result + // buffer. ULg_m stays as a member scratch; the inner deep copies + // remain because the (upper, diag, inverse, lower) operators may + // return Field-valued expressions that alias their input. - g_m = 0; + result = 0; for (unsigned int j = 0; j < innerloops_m; ++j) { - ULg_m = upper_and_lower_m(g_m); + ULg_m = upper_and_lower_m(result); ULg_m = ULg_m.deepCopy(); - g_m = r - ULg_m; - + result = r - ULg_m; + // The inverse diagonal is applied to the // vector itself to return the result usually. // However, the operator for FEM already @@ -318,21 +441,23 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - g_m = inverse_diagonal_m(g_m) * g_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; } else { - g_m = inverse_diagonal_m(g_m).deepCopy(); + result = inverse_diagonal_m(result).deepCopy(); } } - return g_m; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - ULg_m = Field(mesh, layout); - g_m = Field(mesh, layout); + if (!fields_initialized_m) { + ULg_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + ULg_m.updateLayout(layout); + } } protected: @@ -340,7 +465,7 @@ namespace ippl { InvDiagF inverse_diagonal_m; unsigned innerloops_m; Field ULg_m; - Field g_m; + bool fields_initialized_m = false; }; /*! @@ -364,22 +489,19 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& r) override { - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. + void operator()(Field& r, Field& result) override { + // result holds the running iterate; Ag_m and g_old_m are scratch + // members. The inner deep copies remain because the operators + // (op_m, inverse_diagonal_m) may return Field-valued expressions + // that alias their input. - g_m = 0; + result = 0; g_old_m = 0; for (unsigned int j = 0; j < innerloops_m; ++j) { - Ag_m = op_m(g_m); + Ag_m = op_m(result); Ag_m = Ag_m.deepCopy(); - g_m = r - Ag_m; + result = r - Ag_m; // The inverse diagonal is applied to the // vector itself to return the result usually. @@ -388,23 +510,26 @@ namespace ippl { // due to the matrix-free evaluation. // Therefore, we need this if to differentiate // the two cases. - if constexpr (std::is_same_v>) { - g_m = g_old_m + inverse_diagonal_m(g_m) * g_m; + if constexpr (std::is_same_v>) { + result = g_old_m + inverse_diagonal_m(result) * result; } else { - g_m = g_old_m + inverse_diagonal_m(g_m); + result = g_old_m + inverse_diagonal_m(result); } - g_old_m = g_m.deepCopy(); + Kokkos::deep_copy(g_old_m.getView(), result.getView()); } - return g_m; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - Ag_m = Field(mesh, layout); - g_m = Field(mesh, layout); - g_old_m = Field(mesh, layout); + if (!fields_initialized_m) { + Ag_m = Field(mesh, layout); + g_old_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + Ag_m.updateLayout(layout); + g_old_m.updateLayout(layout); + } } protected: @@ -412,8 +537,8 @@ namespace ippl { InvDiagF inverse_diagonal_m; unsigned innerloops_m; Field Ag_m; - Field g_m; Field g_old_m; + bool fields_initialized_m = false; }; /*! @@ -435,73 +560,55 @@ namespace ippl { inverse_diagonal_m = std::move(inverse_diagonal); } - Field operator()(Field& b) override { - layout_type& layout = b.getLayout(); - mesh_type& mesh = b.get_mesh(); + void operator()(Field& b, Field& result) override { + // The running iterate lives in result; UL_m and r_m are scratch + // members. The inner deep copies remain because the (upper, lower, + // inverse) operators may return Field-valued expressions that + // alias their input. - Field x(mesh, layout); - - x = 0; // Initial guess - - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. + result = 0; // Initial guess for (unsigned int k = 0; k < outerloops_m; ++k) { - UL_m = upper_m(x); + UL_m = upper_m(result); UL_m = UL_m.deepCopy(); r_m = b - UL_m; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x); + UL_m = lower_m(result); UL_m = UL_m.deepCopy(); - x = r_m - UL_m; - // The inverse diagonal is applied to the - // vector itself to return the result usually. - // However, the operator for FEM already - // returns the result of inv_diag * itself - // due to the matrix-free evaluation. - // Therefore, we need this if to differentiate - // the two cases. - if constexpr (std::is_same_v>) { - x = inverse_diagonal_m(x) * x; + result = r_m - UL_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; } else { - x = inverse_diagonal_m(x).deepCopy(); + result = inverse_diagonal_m(result).deepCopy(); } } - UL_m = lower_m(x); + UL_m = lower_m(result); UL_m = UL_m.deepCopy(); r_m = b - UL_m; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x); + UL_m = upper_m(result); UL_m = UL_m.deepCopy(); - x = r_m - UL_m; - // The inverse diagonal is applied to the - // vector itself to return the result usually. - // However, the operator for FEM already - // returns the result of inv_diag * itself - // due to the matrix-free evaluation. - // Therefore, we need this if to differentiate - // the two cases. - if constexpr (std::is_same_v>) { - x = inverse_diagonal_m(x) * x; + result = r_m - UL_m; + if constexpr (std::is_same_v>) { + result = inverse_diagonal_m(result) * result; } else { - x = inverse_diagonal_m(x).deepCopy(); + result = inverse_diagonal_m(result).deepCopy(); } } } - return x; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - UL_m = Field(mesh, layout); - r_m = Field(mesh, layout); + if (!fields_initialized_m) { + UL_m = Field(mesh, layout); + r_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + UL_m.updateLayout(layout); + r_m.updateLayout(layout); + } } protected: @@ -512,6 +619,7 @@ namespace ippl { unsigned outerloops_m; Field UL_m; Field r_m; + bool fields_initialized_m = false; }; /*! @@ -536,86 +644,73 @@ namespace ippl { diagonal_m = std::move(diagonal); } - Field operator()(Field& b) override { + void operator()(Field& b, Field& result) override { static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("SSOR Init"); IpplTimings::startTimer(initTimer); double D; - layout_type& layout = b.getLayout(); - mesh_type& mesh = b.get_mesh(); - - Field x(mesh, layout); - - x = 0; // Initial guess + // The running iterate lives in result; UL_m, r_m are scratch + // members. The inner deep copies remain because (upper, lower, + // inverse, diagonal) may return aliasing Field-valued expressions. + result = 0; // Initial guess IpplTimings::stopTimer(initTimer); static IpplTimings::TimerRef loopTimer = IpplTimings::getTimer("SSOR loop"); IpplTimings::startTimer(loopTimer); - // In the FEM solver, which uses the preconditioner, - // we re-use a resultField to avoid allocating new - // memory at every iteration. - // In order for the operator calls to not rewrite - // on this same field over and over when calling - // the operators (upper, diag, inverse, lower, etc) - // we need deep copies to the preconditioner fields. - - // The inverse diagonal is applied to the - // vector itself to return the result usually. - // However, the operator for FEM already - // returns the result of inv_diag * itself - // due to the matrix-free evaluation. - // Therefore, we need this if to differentiate - // the two cases. for (unsigned int k = 0; k < outerloops_m; ++k) { - if constexpr (std::is_same_v>) { - UL_m = upper_m(x); - D = diagonal_m(x); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x; + if constexpr (std::is_same_v>) { + UL_m = upper_m(result); + D = diagonal_m(result); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x) * x; + UL_m = lower_m(result); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result) * result; } - UL_m = lower_m(x); - D = diagonal_m(x); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x; + UL_m = lower_m(result); + D = diagonal_m(result); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result; for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x) * x; + UL_m = upper_m(result); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result) * result; } } else { - UL_m = upper_m(x).deepCopy(); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x); + UL_m = upper_m(result).deepCopy(); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result); for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = lower_m(x).deepCopy(); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x).deepCopy(); + UL_m = lower_m(result).deepCopy(); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result).deepCopy(); } - UL_m = lower_m(x).deepCopy(); - r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x); + UL_m = lower_m(result).deepCopy(); + r_m = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result); for (unsigned int j = 0; j < innerloops_m; ++j) { - UL_m = upper_m(x).deepCopy(); - x = r_m - omega_m * UL_m; - x = inverse_diagonal_m(x).deepCopy(); + UL_m = upper_m(result).deepCopy(); + result = r_m - omega_m * UL_m; + result = inverse_diagonal_m(result).deepCopy(); } } } IpplTimings::stopTimer(loopTimer); - return x; } void init_fields(Field& b) override { layout_type& layout = b.getLayout(); mesh_type& mesh = b.get_mesh(); - - UL_m = Field(mesh, layout); - r_m = Field(mesh, layout); + if (!fields_initialized_m) { + UL_m = Field(mesh, layout); + r_m = Field(mesh, layout); + fields_initialized_m = true; + } else { + UL_m.updateLayout(layout); + r_m.updateLayout(layout); + } } protected: @@ -628,6 +723,7 @@ namespace ippl { double omega_m; Field UL_m; Field r_m; + bool fields_initialized_m = false; }; /*! diff --git a/src/PoissonSolvers/PoissonCG.h b/src/PoissonSolvers/PoissonCG.h index 7c32f4257..db4c2d1dd 100644 --- a/src/PoissonSolvers/PoissonCG.h +++ b/src/PoissonSolvers/PoissonCG.h @@ -11,13 +11,12 @@ #include "Poisson.h" namespace ippl { -// Expands to a lambda that acts as a wrapper for a differential operator -// fun: the function for which to create the wrapper, such as ippl::laplace -// type: the argument type, which should match the LHS type for the solver -#define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \ - [](type arg) { \ - return fun(arg); \ - } +// IPPL_SOLVER_OPERATOR_WRAPPER is defined once in LinearSolvers/Preconditioner.h +// (re-exported through this header via PCG.h). Defining it again here used to +// silently shadow that definition with a by-value lambda, which copies the +// Field on every op_m() call and reintroduces the per-iteration cudaMalloc +// in the halo exchange (the realloc never propagates back to the original +// Field). Keep a single by-reference definition. template class PoissonCG : public Poisson {