From 60cf99e685957a3a814b21bb031ebd61484dc612 Mon Sep 17 00:00:00 2001
From: Fischill Paul <paul.fischill@inf.ethz.ch>
Date: Sun, 31 May 2026 21:34:38 +0200
Subject: [PATCH 1/3] Hoist PCG allocations out of solve loop

Reuse PCG and preconditioner work fields across iterations instead of allocating temporary fields during every solve step. This also passes the solver field by reference through OperatorF to avoid extra halo-related allocations.
---
 src/LinearSolvers/PCG.h            | 119 ++++---
 src/LinearSolvers/Preconditioner.h | 506 +++++++++++++++++------------
 src/PoissonSolvers/PoissonCG.h     |  13 +-
 3 files changed, 388 insertions(+), 250 deletions(-)

diff --git a/src/LinearSolvers/PCG.h b/src/LinearSolvers/PCG.h
index bdb65778d..bf2c84002 100644
--- a/src/LinearSolvers/PCG.h
+++ b/src/LinearSolvers/PCG.h
@@ -20,12 +20,12 @@ namespace ippl {
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
-        using DiagF        = std::function<DiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
+        using DiagF        = std::function<DiagRet(lhs_type&)>;
         using mesh_type    = typename lhs_type::Mesh_t;
         using layout_type  = typename lhs_type::Layout_t;
 
@@ -188,7 +188,10 @@ namespace ippl {
         T residueNorm    = 0;
         int iterations_m = 0;
 
-    private:
+        // Workspaces, allocated once via initializeFields() and reused across
+        // solves. Protected so derived solvers (e.g. PCG) can extend the
+        // workspace set without redeclaring r, d, q as locals on every
+        // operator() call.
         lhs_type r;
         lhs_type d;
         lhs_type q;
@@ -203,11 +206,11 @@ namespace ippl {
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
 
         virtual ~CG() = default;
 
@@ -327,18 +330,37 @@ namespace ippl {
 
     public:
         using typename Base::lhs_type, typename Base::rhs_type;
-        using OperatorF    = std::function<OperatorRet(lhs_type)>;
-        using LowerF       = std::function<LowerRet(lhs_type)>;
-        using UpperF       = std::function<UpperRet(lhs_type)>;
-        using UpperLowerF  = std::function<UpperLowerRet(lhs_type)>;
-        using InverseDiagF = std::function<InverseDiagRet(lhs_type)>;
-        using DiagF        = std::function<DiagRet(lhs_type)>;
+        using OperatorF    = std::function<OperatorRet(lhs_type&)>;
+        using LowerF       = std::function<LowerRet(lhs_type&)>;
+        using UpperF       = std::function<UpperRet(lhs_type&)>;
+        using UpperLowerF  = std::function<UpperLowerRet(lhs_type&)>;
+        using InverseDiagF = std::function<InverseDiagRet(lhs_type&)>;
+        using DiagF        = std::function<DiagRet(lhs_type&)>;
+
+        using mesh_type   = typename lhs_type::Mesh_t;
+        using layout_type = typename lhs_type::Layout_t;
 
         PCG()
             : CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
                  FieldRHS>()
             , preconditioner_m(nullptr){};
 
+        /*
+         * Extends the CG workspace with the preconditioner result buffers s and
+         * pcond_out so operator() does not allocate per solve. pcond_out keeps
+         * its default NoBcFace BCs: the preconditioner's internal operator
+         * chain therefore never triggers PeriodicFace::apply MPI calls -- if it
+         * did, the global MPI sequence would diverge from the master code path
+         * (where the preconditioner returned a fresh NoBcFace field) and
+         * intermittent multi-rank halo deadlocks would follow.
+         */
+        void initializeFields(mesh_type& mesh, layout_type& layout) override {
+            CG<OperatorRet, LowerRet, UpperRet, UpperLowerRet, InverseDiagRet, DiagRet, FieldLHS,
+               FieldRHS>::initializeFields(mesh, layout);
+            s.initialize(mesh, layout);
+            pcond_out.initialize(mesh, layout);
+        }
+
         /*!
          * Sets the differential operator for the conjugate gradient algorithm
          * @param op A function that returns OpRet and takes a field of the LHS type
@@ -422,19 +444,26 @@ namespace ippl {
                                     "Preconditioner has not been set for PCG solver");
             }
 
-            typename lhs_type::Mesh_t& mesh     = lhs.get_mesh();
-            typename lhs_type::Layout_t& layout = lhs.getLayout();
-
             this->iterations_m      = 0;
             const int maxIterations = params.get<int>("max_iterations");
 
             // Variable names mostly based on description in
             // https://www.cs.cmu.edu/~quake-papers/painless-conjugate-gradient.pdf
-            lhs_type r(mesh, layout);
-            lhs_type d(mesh, layout);
-            lhs_type s(mesh, layout);
-            lhs_type q(mesh, layout);
-
+            // r, d, q come from the CG base class; s and pcond_out are PCG
+            // members. All are pre-allocated via initializeFields(); operator()
+            // only refreshes their layout so a load-balance repartition is
+            // tracked. The preconditioner scratch must follow the lhs layout
+            // too, otherwise its halo-exchange neighbor list goes out of sync
+            // with r/d/s/q after a repartition and halo MPI calls deadlock.
+            this->r.updateLayout(lhs.getLayout());
+            this->d.updateLayout(lhs.getLayout());
+            s.updateLayout(lhs.getLayout());
+            pcond_out.updateLayout(lhs.getLayout());
+            this->q.updateLayout(lhs.getLayout());
+
+            // Each preconditioner's init_fields() is responsible for being
+            // cheap on the steady-state path (refreshing layout in-place, not
+            // reallocating).
             preconditioner_m->init_fields(lhs);
 
             using bc_type  = BConds<lhs_type, Dim>;
@@ -459,20 +488,26 @@ namespace ippl {
                 }
             }
 
-            r = rhs - this->op_m(lhs);
-            d = preconditioner_m->operator()(r).deepCopy();
-            d.setFieldBC(bc);
+            this->r = rhs - this->op_m(lhs);
+            // pcond_out keeps its default NoBcFace BCs so the preconditioner's
+            // internal operator chain does not trigger PeriodicFace::apply MPI;
+            // the expression assignment then folds it into d (which gets the
+            // periodic BCs via setFieldBC below).
+            (*preconditioner_m)(this->r, pcond_out);
+            this->d = T(1) * pcond_out;
+            this->d.setFieldBC(bc);
 
-            T delta1          = innerProduct(r, d);
+            T delta1          = innerProduct(this->r, this->d);
             T delta0          = delta1;
             this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1));
             const T tolerance = params.get<T>("tolerance") * this->residueNorm;
 
             while (this->iterations_m < maxIterations && this->residueNorm > tolerance) {
-                q       = this->op_m(d);
-                q       = q.deepCopy();
-                T alpha = delta1 / innerProduct(d, q);
-                lhs     = lhs + alpha * d;
+                // op_m(d) writes its expression into q's existing storage; no
+                // allocation, no per-iteration deep copy.
+                this->q = this->op_m(this->d);
+                T alpha = delta1 / innerProduct(this->d, this->q);
+                lhs     = lhs + alpha * this->d;
 
                 // The exact residue is given by
                 // r = rhs - BaseCG::op_m(lhs);
@@ -481,16 +516,17 @@ namespace ippl {
                 // the correction does not have a significant effect on accuracy;
                 // in some implementations, the correction may be applied every few
                 // iterations to offset accumulated floating point errors
-                r = r - alpha * q;
-                s = preconditioner_m->operator()(r).deepCopy();
+                this->r = this->r - alpha * this->q;
+                // s := M^{-1} r; preconditioner writes into s (NoBcFace).
+                (*preconditioner_m)(this->r, s);
 
                 delta0 = delta1;
-                delta1 = innerProduct(r, s);
+                delta1 = innerProduct(this->r, s);
 
                 T beta            = delta1 / delta0;
                 this->residueNorm = Kokkos::sqrt(Kokkos::abs(delta1));
 
-                d = s + beta * d;
+                this->d = s + beta * this->d;
                 ++this->iterations_m;
             }
 
@@ -502,6 +538,13 @@ namespace ippl {
 
     protected:
         std::unique_ptr<preconditioner<FieldLHS>> preconditioner_m;
+
+        // Preconditioner result buffers, allocated once via initializeFields()
+        // and reused across solves. Both deliberately keep their default
+        // NoBcFace BCs so the preconditioner's internal operator chain does
+        // NOT trigger PeriodicFace::apply MPI calls.
+        lhs_type s;
+        lhs_type pcond_out;
     };
 
 };  // namespace ippl
diff --git a/src/LinearSolvers/Preconditioner.h b/src/LinearSolvers/Preconditioner.h
index 09eb876aa..2978fb111 100644
--- a/src/LinearSolvers/Preconditioner.h
+++ b/src/LinearSolvers/Preconditioner.h
@@ -7,13 +7,23 @@
 #ifndef IPPL_PRECONDITIONER_H
 #define IPPL_PRECONDITIONER_H
 
+#include <vector>
+
 #include "Expression/IpplOperations.h"  // get the function apply()
 
 // Expands to a lambda that acts as a wrapper for a differential operator
 // fun: the function for which to create the wrapper, such as ippl::laplace
 // type: the argument type, which should match the LHS type for the solver
+//
+// IMPORTANT: takes the field by *reference*, not by value. A by-value
+// signature would copy the Field on every call -- BareField has shared-
+// ownership Kokkos::View members, so the data isn't actually copied, but
+// the embedded HaloCells::haloData_m buffer that the halo exchange grows
+// via Kokkos::realloc only grows on the copy and never propagates back to
+// the original. The result was a fresh cudaMalloc per op_m() call in
+// multi-rank runs (see nsys profile in ~/prof/pif-pr-verify).
 #define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \
-    [](type arg) {                              \
+    [](type& arg) {                             \
         return fun(arg);                        \
     }
 
@@ -32,17 +42,18 @@ namespace ippl {
 
         virtual ~preconditioner() = default;
 
-        // Placeholder for the function operator, actually implemented in the derived classes
-        virtual Field operator()(Field& u) {
-            Field res = u.deepCopy();
-            return res;
+        // Apply the preconditioner: result = M^{-1} u. Concrete preconditioners
+        // override this to write into the caller-provided result buffer; this
+        // avoids per-call Field allocations and per-call deep copies in PCG.
+        // The default (identity) preconditioner copies u into result.
+        virtual void operator()(Field& u, Field& result) {
+            Kokkos::deep_copy(result.getView(), u.getView());
         }
 
-        // Placeholder for setting additional fields, actually implemented in the derived classes
-        virtual void init_fields(Field& b) {
-            Field res = b.deepCopy();
-            return;
-        }
+        // Allocate any scratch fields the preconditioner needs. Called once
+        // by the owning solver after the layout is known. Concrete
+        // preconditioners that need scratch override this.
+        virtual void init_fields(Field& /*b*/) {}
 
         std::string get_type() { return type_m; };
 
@@ -66,14 +77,23 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& u) override {
-            mesh_type& mesh     = u.get_mesh();
-            layout_type& layout = u.getLayout();
-            Field res(mesh, layout);
-
-            res = inverse_diagonal_m(u);
-            res = w_m * res;
-            return res;
+        void operator()(Field& u, Field& result) override {
+            // result = w * D^{-1} * u, written element-wise into the caller-
+            // provided result buffer. Two forms of inverse_diagonal_m are
+            // supported, matching the convention used by the other
+            // preconditioners in this file:
+            //   - returns a scalar (e.g. uniform-mesh Poisson Laplacian):
+            //       result = w * scalar * u
+            //   - returns a Field / expression equal to D^{-1} * u
+            //     (e.g. matrix-free FEM operators):
+            //       result = w * inverse_diagonal_m(u)
+            if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                const double scale = w_m * inverse_diagonal_m(u);
+                result = scale * u;
+            } else {
+                result = inverse_diagonal_m(u);
+                result = w_m * result;
+            }
         }
 
     protected:
@@ -125,12 +145,64 @@ namespace ippl {
             return *this = polynomial_newton_preconditioner(other);
         }
 
-        Field recursive_preconditioner(Field& u, unsigned int level) {
-            mesh_type& mesh     = u.get_mesh();
-            layout_type& layout = u.getLayout();
-            // Define etas if not defined yet
+        // Recursive Newton expansion P_k(u) where:
+        //   P_0(u) = eta_0 * u
+        //   P_k(u) = eta_k * (2 P_{k-1}(u) - P_{k-1}(A P_{k-1}(u)))
+        // Writes the result of level `level` into `out`. Uses one scratch
+        // field per recursion depth (Pr, PA, PAPr) so that no Field is
+        // allocated per call. The two recursive calls at each level execute
+        // sequentially and may reuse scratch at lower depths because the
+        // first call's lower-depth values have already been folded into
+        // Pr_scratch[level] before the second call begins.
+        void recursive_preconditioner(Field& u, unsigned int level, Field& out) {
+            // Timers accumulated across all levels of the recursion: useful
+            // for "how much of pcond is laplace vs combine vs leaf-assign".
+            // Counts include every call at every level (so the call-count
+            // column equals 2^(level_m+1)-1, not the number of operator()
+            // invocations).
+            static IpplTimings::TimerRef t_total = IpplTimings::getTimer("newton/total");
+            static IpplTimings::TimerRef t_leaf  = IpplTimings::getTimer("newton/leaf_assign");
+            static IpplTimings::TimerRef t_op    = IpplTimings::getTimer("newton/op_m");
+            static IpplTimings::TimerRef t_comb  = IpplTimings::getTimer("newton/combine");
+
+            IpplTimings::startTimer(t_total);
+            if (level == 0) {
+                IpplTimings::startTimer(t_leaf);
+                out = eta_m[0] * u;
+                IpplTimings::stopTimer(t_leaf);
+                IpplTimings::stopTimer(t_total);
+                return;
+            }
+            Field& Pr   = Pr_scratch_m[level];
+            Field& PA   = PA_scratch_m[level];
+            Field& PAPr = PAPr_scratch_m[level];
+
+            // Inner recursive call owns its own timing; we don't double-count
+            // it here.
+            recursive_preconditioner(u, level - 1, Pr);
+            IpplTimings::startTimer(t_op);
+            PA = op_m(Pr);
+            IpplTimings::stopTimer(t_op);
+            recursive_preconditioner(PA, level - 1, PAPr);
+            IpplTimings::startTimer(t_comb);
+            out = eta_m[level] * (2.0 * Pr - PAPr);
+            IpplTimings::stopTimer(t_comb);
+            IpplTimings::stopTimer(t_total);
+        }
+
+        void operator()(Field& u, Field& result) override {
+            // One outer timer per polynomial_newton apply, complementing the
+            // per-call recursion timers. Lets us see how many outer pcond
+            // calls a solve does vs. the time inside each.
+            static IpplTimings::TimerRef t_apply = IpplTimings::getTimer("newton/apply");
+            IpplTimings::startTimer(t_apply);
+            recursive_preconditioner(u, level_m, result);
+            IpplTimings::stopTimer(t_apply);
+        }
+
+        void init_fields(Field& b) override {
+            // One-shot precomputation of the eta coefficients.
             if (eta_m == nullptr) {
-                // Precompute the etas for later use
                 eta_m    = new double[level_m + 1];
                 eta_m[0] = 2.0 / ((alpha_m + beta_m) * (1.0 + zeta_m));
                 if (level_m > 0) {
@@ -143,25 +215,31 @@ namespace ippl {
                 }
             }
 
-            Field res(mesh, layout);
-            // Base case
-            if (level == 0) {
-                res = eta_m[0] * u;
-                return res;
+            // First call: allocate one scratch field per recursion depth.
+            // Subsequent calls: refresh layout in place so a load-balance
+            // repartition tracks correctly without freeing/reallocating
+            // when the local extents are unchanged.
+            mesh_type& mesh     = b.get_mesh();
+            layout_type& layout = b.getLayout();
+            if (!fields_initialized_m) {
+                Pr_scratch_m.resize(level_m + 1);
+                PA_scratch_m.resize(level_m + 1);
+                PAPr_scratch_m.resize(level_m + 1);
+                for (unsigned int i = 1; i <= level_m; ++i) {
+                    Pr_scratch_m[i]   = Field(mesh, layout);
+                    PA_scratch_m[i]   = Field(mesh, layout);
+                    PAPr_scratch_m[i] = Field(mesh, layout);
+                }
+                fields_initialized_m = true;
+            } else {
+                for (unsigned int i = 1; i <= level_m; ++i) {
+                    Pr_scratch_m[i].updateLayout(layout);
+                    PA_scratch_m[i].updateLayout(layout);
+                    PAPr_scratch_m[i].updateLayout(layout);
+                }
             }
-            // Recursive case
-            Field PAPr(mesh, layout);
-            Field Pr(mesh, layout);
-
-            Pr   = recursive_preconditioner(u, level - 1);
-            PAPr = op_m(Pr);
-            PAPr = recursive_preconditioner(PAPr, level - 1);
-            res  = eta_m[level] * (2.0 * Pr - PAPr);
-            return res;
         }
 
-        Field operator()(Field& u) override { return recursive_preconditioner(u, level_m); }
-
     protected:
         OperatorF op_m;        // Operator to be preconditioned
         double alpha_m;        // Smallest Eigenvalue
@@ -170,6 +248,10 @@ namespace ippl {
         double zeta_m;  // smallest (alpha + beta) is multiplied by (1+zeta) to avoid clustering of
                         // Eigenvalues
         double* eta_m = nullptr;  // Size is determined at runtime
+        std::vector<Field> Pr_scratch_m;
+        std::vector<Field> PA_scratch_m;
+        std::vector<Field> PAPr_scratch_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -219,19 +301,65 @@ namespace ippl {
             return *this = polynomial_chebyshev_preconditioner(other);
         }
 
-        Field operator()(Field& r) override {
-            mesh_type& mesh     = r.get_mesh();
-            layout_type& layout = r.getLayout();
+        void operator()(Field& r, Field& result) override {
+            // x_m, x_old_m, A_m, z_m are pre-allocated scratch (init_fields).
+            // Coefficients in rho_m are also computed once.
+            static IpplTimings::TimerRef t_apply = IpplTimings::getTimer("chebyshev/apply");
+            static IpplTimings::TimerRef t_init  = IpplTimings::getTimer("chebyshev/init_step");
+            static IpplTimings::TimerRef t_op    = IpplTimings::getTimer("chebyshev/op_m");
+            static IpplTimings::TimerRef t_combo = IpplTimings::getTimer("chebyshev/combine");
+            static IpplTimings::TimerRef t_copy  = IpplTimings::getTimer("chebyshev/deep_copy");
+
+            IpplTimings::startTimer(t_apply);
+
+            IpplTimings::startTimer(t_init);
+            x_old_m = r / theta_m;
+            IpplTimings::stopTimer(t_init);
+            IpplTimings::startTimer(t_op);
+            A_m     = op_m(r);
+            IpplTimings::stopTimer(t_op);
+            IpplTimings::startTimer(t_combo);
+            x_m     = 2.0 * rho_m[1] / delta_m * (2.0 * r - A_m / theta_m);
+            IpplTimings::stopTimer(t_combo);
+
+            if (degree_m == 0) {
+                // result = x_old
+                IpplTimings::startTimer(t_copy);
+                Kokkos::deep_copy(result.getView(), x_old_m.getView());
+                IpplTimings::stopTimer(t_copy);
+                IpplTimings::stopTimer(t_apply);
+                return;
+            }
 
-            Field res(mesh, layout);
-            Field x(mesh, layout);
-            Field x_old(mesh, layout);
-            Field A(mesh, layout);
-            Field z(mesh, layout);
+            if (degree_m == 1) {
+                // result = x
+                IpplTimings::startTimer(t_copy);
+                Kokkos::deep_copy(result.getView(), x_m.getView());
+                IpplTimings::stopTimer(t_copy);
+                IpplTimings::stopTimer(t_apply);
+                return;
+            }
+            for (unsigned int i = 2; i < degree_m + 1; ++i) {
+                IpplTimings::startTimer(t_op);
+                A_m   = op_m(x_m);
+                IpplTimings::stopTimer(t_op);
+                IpplTimings::startTimer(t_combo);
+                z_m   = 2.0 / delta_m * (r - A_m);
+                // Write the new x value into result (the caller's buffer);
+                // x_old gets a deep copy of the previous x.
+                result = rho_m[i] * (2 * sigma_m * x_m - rho_m[i - 1] * x_old_m + z_m);
+                IpplTimings::stopTimer(t_combo);
+                IpplTimings::startTimer(t_copy);
+                Kokkos::deep_copy(x_old_m.getView(), x_m.getView());
+                Kokkos::deep_copy(x_m.getView(), result.getView());
+                IpplTimings::stopTimer(t_copy);
+            }
+            IpplTimings::stopTimer(t_apply);
+        }
 
-            // Precompute the coefficients if not done yet
+        void init_fields(Field& b) override {
+            // One-shot precomputation of the rho coefficients.
             if (rho_m == nullptr) {
-                // Start precomputing the coefficients
                 theta_m = (beta_m + alpha_m) / 2.0 * (1.0 + zeta_m);
                 delta_m = (beta_m - alpha_m) / 2.0;
                 sigma_m = theta_m / delta_m;
@@ -241,29 +369,23 @@ namespace ippl {
                 for (unsigned int i = 1; i < degree_m + 1; ++i) {
                     rho_m[i] = 1.0 / (2.0 * sigma_m - rho_m[i - 1]);
                 }
-            }  // End of precomputing the coefficients
-
-            res = r.deepCopy();
-
-            x_old = r / theta_m;
-            A     = op_m(r);
-            x     = 2.0 * rho_m[1] / delta_m * (2.0 * r - A / theta_m);
-
-            if (degree_m == 0) {
-                return x_old;
-            }
-
-            if (degree_m == 1) {
-                return x;
             }
-            for (unsigned int i = 2; i < degree_m + 1; ++i) {
-                A     = op_m(x);
-                z     = 2.0 / delta_m * (r - A);
-                res   = rho_m[i] * (2 * sigma_m * x - rho_m[i - 1] * x_old + z);
-                x_old = x.deepCopy();
-                x     = res.deepCopy();
+            mesh_type& mesh     = b.get_mesh();
+            layout_type& layout = b.getLayout();
+            // First call allocates; subsequent calls refresh the layout so a
+            // repartition is tracked without throwing the storage away.
+            if (!fields_initialized_m) {
+                x_m     = Field(mesh, layout);
+                x_old_m = Field(mesh, layout);
+                A_m     = Field(mesh, layout);
+                z_m     = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                x_m.updateLayout(layout);
+                x_old_m.updateLayout(layout);
+                A_m.updateLayout(layout);
+                z_m.updateLayout(layout);
             }
-            return res;
         }
 
     protected:
@@ -276,6 +398,11 @@ namespace ippl {
         unsigned degree_m;
         double zeta_m;
         double* rho_m = nullptr;  // Size is determined at runtime
+        Field x_m;
+        Field x_old_m;
+        Field A_m;
+        Field z_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -295,22 +422,18 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& r) override {
-
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
+        void operator()(Field& r, Field& result) override {
+            // Richardson iteration in-place on the caller-provided result
+            // buffer. ULg_m stays as a member scratch; the inner deep copies
+            // remain because the (upper, diag, inverse, lower) operators may
+            // return Field-valued expressions that alias their input.
 
-            g_m = 0;
+            result = 0;
             for (unsigned int j = 0; j < innerloops_m; ++j) {
-                ULg_m = upper_and_lower_m(g_m);
+                ULg_m = upper_and_lower_m(result);
                 ULg_m = ULg_m.deepCopy();
-                g_m   = r - ULg_m;
-         
+                result = r - ULg_m;
+
                 // The inverse diagonal is applied to the
                 // vector itself to return the result usually.
                 // However, the operator for FEM already
@@ -318,21 +441,23 @@ namespace ippl {
                 // due to the matrix-free evaluation.
                 // Therefore, we need this if to differentiate
                 // the two cases.
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    g_m = inverse_diagonal_m(g_m) * g_m;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    result = inverse_diagonal_m(result) * result;
                 } else {
-                    g_m = inverse_diagonal_m(g_m).deepCopy();
+                    result = inverse_diagonal_m(result).deepCopy();
                 }
             }
-            return g_m;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            ULg_m = Field(mesh, layout);
-            g_m = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                ULg_m                = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                ULg_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -340,7 +465,7 @@ namespace ippl {
         InvDiagF inverse_diagonal_m;
         unsigned innerloops_m;
         Field ULg_m;
-        Field g_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -364,22 +489,19 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& r) override {
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
+        void operator()(Field& r, Field& result) override {
+            // result holds the running iterate; Ag_m and g_old_m are scratch
+            // members. The inner deep copies remain because the operators
+            // (op_m, inverse_diagonal_m) may return Field-valued expressions
+            // that alias their input.
 
-            g_m = 0;
+            result  = 0;
             g_old_m = 0;
 
             for (unsigned int j = 0; j < innerloops_m; ++j) {
-                Ag_m = op_m(g_m);
+                Ag_m = op_m(result);
                 Ag_m = Ag_m.deepCopy();
-                g_m  = r - Ag_m;
+                result = r - Ag_m;
 
                 // The inverse diagonal is applied to the
                 // vector itself to return the result usually.
@@ -388,23 +510,26 @@ namespace ippl {
                 // due to the matrix-free evaluation.
                 // Therefore, we need this if to differentiate
                 // the two cases.
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    g_m = g_old_m + inverse_diagonal_m(g_m) * g_m;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    result = g_old_m + inverse_diagonal_m(result) * result;
                 } else {
-                    g_m = g_old_m + inverse_diagonal_m(g_m);
+                    result = g_old_m + inverse_diagonal_m(result);
                 }
-                g_old_m = g_m.deepCopy();
+                Kokkos::deep_copy(g_old_m.getView(), result.getView());
             }
-            return g_m;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            Ag_m = Field(mesh, layout);
-            g_m = Field(mesh, layout);
-            g_old_m = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                Ag_m                 = Field(mesh, layout);
+                g_old_m              = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                Ag_m.updateLayout(layout);
+                g_old_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -412,8 +537,8 @@ namespace ippl {
         InvDiagF inverse_diagonal_m;
         unsigned innerloops_m;
         Field Ag_m;
-        Field g_m;
         Field g_old_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -435,73 +560,55 @@ namespace ippl {
             inverse_diagonal_m = std::move(inverse_diagonal);
         }
 
-        Field operator()(Field& b) override {
-            layout_type& layout = b.getLayout();
-            mesh_type& mesh     = b.get_mesh();
+        void operator()(Field& b, Field& result) override {
+            // The running iterate lives in result; UL_m and r_m are scratch
+            // members. The inner deep copies remain because the (upper, lower,
+            // inverse) operators may return Field-valued expressions that
+            // alias their input.
 
-            Field x(mesh, layout);
-
-            x = 0;  // Initial guess
-
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
+            result = 0;  // Initial guess
 
             for (unsigned int k = 0; k < outerloops_m; ++k) {
-                UL_m = upper_m(x);
+                UL_m = upper_m(result);
                 UL_m = UL_m.deepCopy();
                 r_m  = b - UL_m;
                 for (unsigned int j = 0; j < innerloops_m; ++j) {
-                    UL_m = lower_m(x);
+                    UL_m = lower_m(result);
                     UL_m = UL_m.deepCopy();
-                    x    = r_m - UL_m;
-                    // The inverse diagonal is applied to the
-                    // vector itself to return the result usually.
-                    // However, the operator for FEM already
-                    // returns the result of inv_diag * itself
-                    // due to the matrix-free evaluation.
-                    // Therefore, we need this if to differentiate
-                    // the two cases.
-                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                        x = inverse_diagonal_m(x) * x;
+                    result = r_m - UL_m;
+                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                        result = inverse_diagonal_m(result) * result;
                     } else {
-                        x = inverse_diagonal_m(x).deepCopy();
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
                 }
-                UL_m = lower_m(x);
+                UL_m = lower_m(result);
                 UL_m = UL_m.deepCopy();
                 r_m  = b - UL_m;
                 for (unsigned int j = 0; j < innerloops_m; ++j) {
-                    UL_m = upper_m(x);
+                    UL_m = upper_m(result);
                     UL_m = UL_m.deepCopy();
-                    x    = r_m - UL_m;
-                    // The inverse diagonal is applied to the
-                    // vector itself to return the result usually.
-                    // However, the operator for FEM already
-                    // returns the result of inv_diag * itself
-                    // due to the matrix-free evaluation.
-                    // Therefore, we need this if to differentiate
-                    // the two cases.
-                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                        x = inverse_diagonal_m(x) * x;
+                    result = r_m - UL_m;
+                    if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                        result = inverse_diagonal_m(result) * result;
                     } else {
-                        x = inverse_diagonal_m(x).deepCopy();
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
                 }
             }
-            return x;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            UL_m = Field(mesh, layout);
-            r_m  = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                UL_m                 = Field(mesh, layout);
+                r_m                  = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                UL_m.updateLayout(layout);
+                r_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -512,6 +619,7 @@ namespace ippl {
         unsigned outerloops_m;
         Field UL_m;
         Field r_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
@@ -536,86 +644,73 @@ namespace ippl {
             diagonal_m         = std::move(diagonal);
         }
 
-        Field operator()(Field& b) override {
+        void operator()(Field& b, Field& result) override {
             static IpplTimings::TimerRef initTimer = IpplTimings::getTimer("SSOR Init");
             IpplTimings::startTimer(initTimer);
 
             double D;
 
-            layout_type& layout = b.getLayout();
-            mesh_type& mesh     = b.get_mesh();
-
-            Field x(mesh, layout);
-
-            x = 0;  // Initial guess
+            // The running iterate lives in result; UL_m, r_m are scratch
+            // members. The inner deep copies remain because (upper, lower,
+            // inverse, diagonal) may return aliasing Field-valued expressions.
+            result = 0;  // Initial guess
 
             IpplTimings::stopTimer(initTimer);
 
             static IpplTimings::TimerRef loopTimer = IpplTimings::getTimer("SSOR loop");
             IpplTimings::startTimer(loopTimer);
 
-            // In the FEM solver, which uses the preconditioner, 
-            // we re-use a resultField to avoid allocating new
-            // memory at every iteration.
-            // In order for the operator calls to not rewrite
-            // on this same field over and over when calling 
-            // the operators (upper, diag, inverse, lower, etc)
-            // we need deep copies to the preconditioner fields.
-
-            // The inverse diagonal is applied to the
-            // vector itself to return the result usually.
-            // However, the operator for FEM already
-            // returns the result of inv_diag * itself
-            // due to the matrix-free evaluation.
-            // Therefore, we need this if to differentiate
-            // the two cases.
             for (unsigned int k = 0; k < outerloops_m; ++k) {
-                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field)>>) {
-                    UL_m = upper_m(x);
-                    D    = diagonal_m(x);
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x;
+                if constexpr (std::is_same_v<InvDiagF, std::function<double(Field&)>>) {
+                    UL_m = upper_m(result);
+                    D    = diagonal_m(result);
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result;
 
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = lower_m(x);
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x) * x;
+                        UL_m   = lower_m(result);
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result) * result;
                     }
-                    UL_m = lower_m(x);
-                    D    = diagonal_m(x);
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * x;
+                    UL_m = lower_m(result);
+                    D    = diagonal_m(result);
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * D * result;
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = upper_m(x);
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x) * x;
+                        UL_m   = upper_m(result);
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result) * result;
                     }
                 } else {
-                    UL_m = upper_m(x).deepCopy();
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x);
+                    UL_m = upper_m(result).deepCopy();
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result);
 
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = lower_m(x).deepCopy();
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x).deepCopy();
+                        UL_m   = lower_m(result).deepCopy();
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
-                    UL_m = lower_m(x).deepCopy();
-                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(x);
+                    UL_m = lower_m(result).deepCopy();
+                    r_m  = omega_m * (b - UL_m) + (1.0 - omega_m) * diagonal_m(result);
                     for (unsigned int j = 0; j < innerloops_m; ++j) {
-                        UL_m = upper_m(x).deepCopy();
-                        x    = r_m - omega_m * UL_m;
-                        x    = inverse_diagonal_m(x).deepCopy();
+                        UL_m   = upper_m(result).deepCopy();
+                        result = r_m - omega_m * UL_m;
+                        result = inverse_diagonal_m(result).deepCopy();
                     }
                 }
             }
             IpplTimings::stopTimer(loopTimer);
-            return x;
         }
 
         void init_fields(Field& b) override {
             layout_type& layout = b.getLayout();
             mesh_type& mesh     = b.get_mesh();
-
-            UL_m = Field(mesh, layout);
-            r_m  = Field(mesh, layout);
+            if (!fields_initialized_m) {
+                UL_m                 = Field(mesh, layout);
+                r_m                  = Field(mesh, layout);
+                fields_initialized_m = true;
+            } else {
+                UL_m.updateLayout(layout);
+                r_m.updateLayout(layout);
+            }
         }
 
     protected:
@@ -628,6 +723,7 @@ namespace ippl {
         double omega_m;
         Field UL_m;
         Field r_m;
+        bool fields_initialized_m = false;
     };
 
     /*!
diff --git a/src/PoissonSolvers/PoissonCG.h b/src/PoissonSolvers/PoissonCG.h
index 7c32f4257..db4c2d1dd 100644
--- a/src/PoissonSolvers/PoissonCG.h
+++ b/src/PoissonSolvers/PoissonCG.h
@@ -11,13 +11,12 @@
 #include "Poisson.h"
 namespace ippl {
 
-// Expands to a lambda that acts as a wrapper for a differential operator
-// fun: the function for which to create the wrapper, such as ippl::laplace
-// type: the argument type, which should match the LHS type for the solver
-#define IPPL_SOLVER_OPERATOR_WRAPPER(fun, type) \
-    [](type arg) {                              \
-        return fun(arg);                        \
-    }
+// IPPL_SOLVER_OPERATOR_WRAPPER is defined once in LinearSolvers/Preconditioner.h
+// (re-exported through this header via PCG.h). Defining it again here used to
+// silently shadow that definition with a by-value lambda, which copies the
+// Field on every op_m() call and reintroduces the per-iteration cudaMalloc
+// in the halo exchange (the realloc never propagates back to the original
+// Field). Keep a single by-reference definition.
 
     template <typename FieldLHS, typename FieldRHS = FieldLHS>
     class PoissonCG : public Poisson<FieldLHS, FieldRHS> {

From 87a546e65265c93a43ac9fa8bf6243dcfbb9ca2c Mon Sep 17 00:00:00 2001
From: Andreas Adelmann <andreas.adelmann@psi.ch>
Date: Sun, 31 May 2026 21:56:06 +0200
Subject: [PATCH 2/3] this is the how to split

---
 PR501_SPLIT_MAP.md | 365 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 365 insertions(+)
 create mode 100644 PR501_SPLIT_MAP.md

diff --git a/PR501_SPLIT_MAP.md b/PR501_SPLIT_MAP.md
new file mode 100644
index 000000000..ba0e88300
--- /dev/null
+++ b/PR501_SPLIT_MAP.md
@@ -0,0 +1,365 @@
+# PR 501 Split Map
+
+Source PR: https://github.com/IPPL-framework/ippl/pull/501
+
+Local mapping branch: `pr-501-map`
+
+Compared against: `origin/master` at `da43fd66a18848b65dcd82af90001f5b64a798ab`
+
+Overall size:
+
+- 41 commits
+- 149 files changed
+- 23,231 insertions
+- 2,205 deletions
+
+## Subsystem Size
+
+| Subsystem | Files | Additions | Deletions |
+| --- | ---: | ---: | ---: |
+| Interpolation / Autotune | 33 | 8,138 | 20 |
+| FFT / NUFFT | 24 | 6,571 | 814 |
+| Particle | 21 | 3,841 | 757 |
+| Alpine / PIF examples | 12 | 1,923 | 37 |
+| Utility | 11 | 1,424 | 45 |
+| Communicate | 14 | 626 | 156 |
+| Solvers | 6 | 394 | 253 |
+| Field / Halo / Layout | 11 | 116 | 87 |
+| Build / CMake | 4 | 117 | 12 |
+| Types | 2 | 35 | 4 |
+| FEM | 3 | 12 | 13 |
+| Other | 8 | 34 | 7 |
+
+The review load is dominated by three areas:
+
+- Interpolation / Autotune: new scatter/gather dispatch, binning, tiling, tuning cache.
+- FFT / NUFFT: backend/transform refactor plus native NUFFT and pruned transforms.
+- Particle: new spatial update path, send/recv serialization hooks, sorting/buffer reuse.
+
+## Commit Shape
+
+The branch starts with well-labeled subsystem commits:
+
+| Commit | Area | Notes |
+| --- | --- | --- |
+| `7cff674a` | Build / dependencies | Adds `IPPL_ENABLE_FINUFFT`, `IPPL_ENABLE_CUFFTMP`, autotune preset plumbing. |
+| `13950c4f` | FFT / NUFFT | Large FFT backend/transform split plus native NUFFT engine. |
+| `dbbe59a4` | Interpolation | Scatter/gather redesign, binning, tiling, autotune runtime cache. |
+| `2878b90b` | Particle / halo | Rewrites `ParticleSpatialLayout`, adds sort buffers, threads `nghost` through field layout/halo. |
+| `9507a796` | Utility | Timing overhaul, `BufferView`, `Tuning`, `ParallelDispatch`, type utilities. |
+| `f2820064` | Communicate | Serialization hooks, shared buffer handler. |
+| `63d6ccf2` | Integration glue | Field/solver/type updates for new FFT and particle layout APIs. |
+| `99d21ba7` | Alpine PIF | Adds ElectrostaticPIF examples. |
+| `f0560f76` | Unit tests | Adds NUFFT, interpolation, binning, particle update tests. |
+| `927cac7b` | Integration tests | Updates particle benchmark and adds scaling benchmark. |
+
+After that, the branch has many fixup/regression commits. Several are descriptive and likely fold into the earlier subsystem commits; some are separable:
+
+| Commit | Area | Split relevance |
+| --- | --- | --- |
+| `40438d17` | PCG / preconditioner | Good candidate for an independent PR: hoists per-iteration allocations out of solver loop. |
+| `be5f794c` | PCG / PoissonCG | Follow-up to the PCG allocation PR: pass `Field` by reference through `OperatorF`. |
+| `95762403` | PIF examples | Consolidates pruned variants into parameterized source. Belongs with PIF examples. |
+| `0e7bc58d`, `6c1c48ad` | FFT / NUFFT | FINUFFT guards for CUDA + 2D unit-test build. Belong with NUFFT/FINUFFT PR. |
+| `cec1ca61` | Cleanup | Doxygen, formatting, size-type consistency. Should be folded into relevant split PRs or kept as a final cleanup PR. |
+
+## Dependency Observations
+
+### NUFFT Depends On Interpolation
+
+`src/FFT/NUFFT/NativeNUFFT.h` includes and uses:
+
+- `Interpolation/Scatter/ScatterConfig.h`
+- `Interpolation/Gather/GatherConfig.h`
+- scatter/gather kernels through the new interpolation dispatch
+
+That means native NUFFT cannot be cleanly reviewed before the higher-order scatter/gather layer.
+
+### PIF Depends On NUFFT
+
+The Alpine PIF examples use:
+
+- `ippl::FFT<ippl::NUFFTransform, Field_t>`
+- `scatterPIFNUFFT`
+- `gatherPIFNUFFT`
+
+So PIF examples should come after the NUFFT transform API.
+
+### Particle Update And Communicate Are Coupled
+
+The particle update rewrite uses:
+
+- `ParticleAttrib::serialize` / `deserialize`
+- shared `BufferHandler`
+- direct archive serialization
+- reusable send/recv buffers
+
+This suggests communication and particle update may be one PR unless a clean lower-level communication PR is extracted first.
+
+### `nghost` / Halo Changes Cross Particle And Field
+
+`ParticleSpatialLayout` changes also thread `nghost` through:
+
+- `FieldLayout`
+- `HaloCells`
+- field layout APIs
+
+This is small in line count but high in integration risk. It should be explicitly called out in whichever PR contains particle update.
+
+### PCG Allocation Fix Looks Separately Reviewable
+
+The late commits:
+
+- `40438d17`
+- `be5f794c`
+
+only touch:
+
+- `src/LinearSolvers/PCG.h`
+- `src/LinearSolvers/Preconditioner.h`
+- `src/PoissonSolvers/PoissonCG.h`
+
+This can likely become an independent performance/regression PR, separate from PIF.
+
+## Candidate Split
+
+### PR 1: PCG Allocation / Solver Performance
+
+Scope:
+
+- `src/LinearSolvers/PCG.h`
+- `src/LinearSolvers/Preconditioner.h`
+- `src/PoissonSolvers/PoissonCG.h`
+
+Candidate commits:
+
+- `40438d17`
+- `be5f794c`
+
+Reason:
+
+- Smallest coherent split.
+- Directly addresses the reported PCG slowdown / repeated `cudaMalloc` concern.
+- Does not conceptually depend on PIF, NUFFT, or interpolation.
+
+Risk:
+
+- Need to verify solver convergence and exact iteration behavior.
+
+### PR 2: Communication And Particle Update Infrastructure
+
+Scope:
+
+- `src/Communicate/*`
+- `src/Particle/ParticleSpatialLayout*`
+- `src/Particle/ParticleAttrib*`
+- `src/Particle/ParticleBase*`
+- `src/Particle/ParticleSort.h`
+- `src/Particle/SortBuffer.h`
+- `src/FieldLayout/FieldLayout*`
+- `src/Field/HaloCells*`
+- particle update tests
+
+Candidate commits:
+
+- `2878b90b`
+- `f2820064`
+- relevant parts of `63d6ccf2`
+- relevant parts of `f0560f76`
+- relevant fixup commits after `728af370`
+
+Reason:
+
+- Captures the PIC scaling improvement that is not PIF-specific.
+- Reviewable independently from FFT/NUFFT algorithms.
+
+Risk:
+
+- This is still a substantial behavioral change.
+- Needs multi-rank CPU/GPU particle update and `ParticleSendRecv` coverage.
+
+### PR 3: Higher-Order Scatter/Gather And Autotune
+
+Scope:
+
+- `src/Interpolation/*`
+- `cmake/AutoTunePresets.cmake`
+- `cmake/IpplAutoTunePresets.h.in`
+- `cmake/auto_tune/*`
+- interpolation tests
+
+Candidate commits:
+
+- `7cff674a` only for autotune preset plumbing, not FINUFFT/CUFFTMP if separable.
+- `dbbe59a4`
+- interpolation portions of `f0560f76`
+- `dacdcd9a` if routing legacy CIC through the new framework is included.
+
+Reason:
+
+- This is the algorithmic layer native NUFFT requires.
+- Can be validated without PIF examples.
+
+Risk:
+
+- `dacdcd9a` changes existing CIC scatter/gather behavior by routing legacy paths through the new framework. That may be better as a second PR after the new interpolation layer lands.
+
+### PR 4: FFT Backend / Transform Refactor
+
+Scope:
+
+- `src/FFT/Backend/*`
+- `src/FFT/Transform/CC.h`
+- `src/FFT/Transform/RC.h`
+- `src/FFT/Transform/Trig.h`
+- `src/FFT/Transform/PrunedCC.h`
+- `src/FFT/Transform/PrunedRC.h`
+- `src/FFT/Transform/Common.h`
+- `src/FFT/Traits.h`
+- existing FFT test updates
+
+Candidate commits:
+
+- FFT refactor parts of `13950c4f`
+- FFT portions of `f0560f76`
+- FFT portions of later build fixes
+
+Reason:
+
+- Gives reviewers the backend/transform API without also reviewing NUFFT and PIF.
+
+Risk:
+
+- Current commit `13950c4f` mixes backend refactor with native NUFFT files. This split likely requires path-based extraction or commit surgery.
+
+### PR 5: Native NUFFT And FINUFFT/cuFINUFFT Integration
+
+Scope:
+
+- `src/FFT/NUFFT/*`
+- `src/FFT/Transform/NUFFT.*`
+- FINUFFT/CUFFTMP build switches if not already merged
+- NUFFT tests
+
+Candidate commits:
+
+- NUFFT portions of `13950c4f`
+- FINUFFT/CUFFTMP portions of `7cff674a`
+- NUFFT portions of `f0560f76`
+- `0e7bc58d`
+- `6c1c48ad`
+
+Reason:
+
+- NUFFT depends on PR 3 scatter/gather and PR 4 transform structure.
+
+Risk:
+
+- Native NUFFT depends on new scatter/gather.
+- FINUFFT path has Dim-guard complexity and external dependency complexity.
+
+### PR 6: PIF / Alpine Examples
+
+Scope:
+
+- `alpine/ElectrostaticPIF/*`
+- Alpine manager wiring
+- PIF-specific particle attrib convenience APIs if not included in NUFFT PR
+
+Candidate commits:
+
+- `99d21ba7`
+- `95762403`
+- PIF/example fixups
+
+Reason:
+
+- Examples should be reviewed after core APIs are accepted.
+
+Risk:
+
+- PIF examples currently depend on the full stack: particle update, scatter/gather, FFT transform refactor, NUFFT.
+
+## Minimal Split Alternative
+
+If six PRs is too much process overhead, a practical minimum is:
+
+1. PCG allocation fix.
+2. Particle communication/update infrastructure.
+3. Scatter/gather + FFT backend + NUFFT core.
+4. Alpine PIF examples and benchmarks.
+
+This is less clean, but still much better than one 23k-line PR.
+
+## Next Mapping Step
+
+Prototype extraction order:
+
+1. Create a branch from `origin/master` for PCG.
+2. Cherry-pick `40438d17` and `be5f794c`.
+3. Build and run solver tests.
+4. If clean, this becomes the first small PR.
+
+Then try the particle/communication split because it is probably the most valuable non-PIF improvement and has direct tests:
+
+- `unit_tests/Particle/ParticleUpdate.cpp`
+- `unit_tests/Particle/ParticleUpdateNonuniform.cpp`
+- existing `ParticleSendRecv`
+- existing `GatherScatterTest`
+
+## Progress
+
+### 2026-05-31: PR 1 Prototype - PCG Allocation / Solver Performance
+
+Status: extracted cleanly.
+
+Worktree:
+
+```text
+/private/tmp/ippl-pr501-pcg
+```
+
+Branch:
+
+```text
+pr501-pcg-split
+```
+
+Base:
+
+```text
+origin/master @ da43fd66a18848b65dcd82af90001f5b64a798ab
+```
+
+Cherry-picked commits:
+
+- `40438d17` -> `a76097f0` (`PCG/Preconditioner: hoist per-iteration allocations out of solve loop`)
+- `be5f794c` -> `092a04e5` (`PCG: pass Field by reference through OperatorF`)
+
+Resulting diff:
+
+- `src/LinearSolvers/PCG.h`
+- `src/LinearSolvers/Preconditioner.h`
+- `src/PoissonSolvers/PoissonCG.h`
+- 3 files changed, 388 insertions, 250 deletions
+
+Validation:
+
+```text
+cmake -S . -B build-pcg-debug -DCMAKE_BUILD_TYPE=Debug -DIPPL_ENABLE_UNIT_TESTS=ON -DIPPL_ENABLE_FFT=ON -DIPPL_DEFAULT_TEST_PROCS=1
+cmake --build build-pcg-debug
+ctest --test-dir build-pcg-debug --output-on-failure -O build-pcg-debug/ctest-rank1.log
+```
+
+Result:
+
+```text
+100% tests passed, 0 tests failed out of 34
+Total Test time (real) = 54.17 sec
+```
+
+Assessment:
+
+- This is a good standalone first PR.
+- It has a narrow file set and clear motivation: remove repeated allocation work from the PCG/preconditioner loop.
+- Recommended next validation before opening: run the relevant solver integration tests or CSCS GPU tests that originally showed PCG allocation/slowdown symptoms.

From f1465da59baa1e71b80800b5ec6b076ee6817a1c Mon Sep 17 00:00:00 2001
From: Fischill Paul <paul.fischill@inf.ethz.ch>
Date: Mon, 1 Jun 2026 16:56:36 +0200
Subject: [PATCH 3/3] Fix ALPINE particle view handle lifetimes

---
 alpine/BumponTailInstabilityManager.h | 10 +++++-----
 alpine/LandauDampingManager.h         |  8 ++++----
 alpine/PenningTrapManager.h           |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/alpine/BumponTailInstabilityManager.h b/alpine/BumponTailInstabilityManager.h
index 7525f9af1..5e03a0712 100644
--- a/alpine/BumponTailInstabilityManager.h
+++ b/alpine/BumponTailInstabilityManager.h
@@ -274,10 +274,10 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -288,12 +288,12 @@ class BumponTailInstabilityManager : public AlpineManager<T, Dim> {
         // sample first nlocBulk with muBulk as mean velocity
         mu[Dim - 1] = muBulk_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(0, nlocBulk),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         // sample remaining with muBeam as mean velocity
         mu[Dim - 1] = muBeam_m;
         Kokkos::parallel_for(Kokkos::RangePolicy<int>(nlocBulk, nlocal),
-                             ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+                             ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
 
         Kokkos::fence();
         ippl::Comm->barrier();
diff --git a/alpine/LandauDampingManager.h b/alpine/LandauDampingManager.h
index b8491c7a5..23dcc632a 100644
--- a/alpine/LandauDampingManager.h
+++ b/alpine/LandauDampingManager.h
@@ -226,10 +226,10 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double mu[Dim];
         double sd[Dim];
@@ -237,7 +237,7 @@ class LandauDampingManager : public AlpineManager<T, Dim> {
             mu[i] = 0.0;
             sd[i] = 1.0;
         }
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, mu, sd));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, mu, sd));
         Kokkos::fence();
         ippl::Comm->barrier();
 
diff --git a/alpine/PenningTrapManager.h b/alpine/PenningTrapManager.h
index b721f3ec8..c7e81de43 100644
--- a/alpine/PenningTrapManager.h
+++ b/alpine/PenningTrapManager.h
@@ -202,14 +202,14 @@ class PenningTrapManager : public AlpineManager<T, Dim> {
 
         this->pcontainer_m->create(nlocal);
 
-        view_type* R = &(this->pcontainer_m->R.getView());
-        samplingR.generate(*R, rand_pool64);
+        view_type R = this->pcontainer_m->R.getView();
+        samplingR.generate(R, rand_pool64);
 
-        view_type* P = &(this->pcontainer_m->P.getView());
+        view_type P = this->pcontainer_m->P.getView();
 
         double muP[Dim] = {0.0, 0.0, 0.0};
         double sdP[Dim] = {1.0, 1.0, 1.0};
-        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(*P, rand_pool64, muP, sdP));
+        Kokkos::parallel_for(nlocal, ippl::random::randn<double, Dim>(P, rand_pool64, muP, sdP));
 
         Kokkos::fence();
         ippl::Comm->barrier();