beehive-lab · orionpapadakis · May 26, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/build-and-run.yml b/.github/workflows/build-and-run.yml
@@ -385,6 +385,107 @@ jobs:
             flags="" \
             prompt="Say hello"
 
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode
+        env:  # -D properties handed to the JVM through JAVA_TOOL_OPTIONS; the metrics file path points into runner.temp
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.json
+        run: |  # run llama-tornado, then write the .meta.json sidecar that shares the metrics file's name stem
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode \
+            "flags=--with-prefill-decode" \
+            prompt="Say hello"
+
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode
+        env:  # -D properties handed to the JVM through JAVA_TOOL_OPTIONS; the metrics file path points into runner.temp
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.json
+        run: |  # same as the Prefill-Decode step plus --batch-prefill-size 32; the sidecar records the exact flags used
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode \
+            "flags=--with-prefill-decode --batch-prefill-size 32" \
+            prompt="Say hello"
+
+      # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:  # backend is hard-coded to ptx in the file names below because this step only runs for that matrix entry
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.json
+        run: |  # prefill/decode run with --cuda-graphs added, followed by the matching .meta.json sidecar
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --cuda-graphs" \
+            prompt="Say hello"
+
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:  # backend is hard-coded to ptx in the file names below because this step only runs for that matrix entry
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.json
+        run: |  # batch prefill (size 32) + decode with --cuda-graphs, followed by the matching .meta.json sidecar
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32 \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
+            prompt="Say hello"
+
       - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
         env:
           JAVA_TOOL_OPTIONS: >-

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java
@@ -814,7 +814,7 @@ public static FloatArray forwardTornadoVM(Model model, State state, int token, i
             default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
         }
 
-        return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
+        return tornadoVMMasterPlan.tornadoVMForwardDecode(position);
     }
 
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCoreBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCoreBatchPrefillDecode.java
@@ -3,13 +3,16 @@
 import org.beehive.gpullama3.auxiliary.Parallel;
 import org.beehive.gpullama3.inference.state.State;
 import org.beehive.gpullama3.inference.weights.standard.StandardWeights;
+import org.beehive.gpullama3.inference.weights.tornado.TornadoWeights;
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tensor.standard.ArrayFloatTensor;
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;
 import uk.ac.manchester.tornado.api.types.arrays.FloatArray;
 
+import java.lang.foreign.MemorySegment;
+
 /**
  * Low-level forward passes for the batched prefill/decode inference path (Phase 3/4).
  *
@@ -20,11 +23,10 @@
  *   <li>{@link #batchForwardJavaPrefill} — CPU batch prefill: processes a chunk of
  *       prompt tokens in one pass using batch matmul, avoiding redundant weight
  *       traversals. Only the KV cache is populated; logits are intentionally omitted.</li>
- *   <li>{@link #batchForwardTornadoVMPrefill} — GPU batch prefill: delegates the chunk
- *       to {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill}.</li>
- *   <li>{@link #forwardTornadoVMDecode} — GPU decode: delegates a single decode step to
- *       {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode}, which
- *       handles the embedding copy and runs the full decode + logits graphs.</li>
+ *   <li>{@link #batchForwardTornadoVMPrefill} — GPU batch prefill: copies batch embeddings
+ *       into device-visible state buffers then runs the batch activation + layer graphs.</li>
+ *   <li>{@link #forwardTornadoVMDecode} — GPU decode: copies the decode token embedding
+ *       then runs the decode activation + layer + logits graphs.</li>
  * </ul>
  */
 public final class InferenceCoreBatchPrefillDecode {
@@ -161,39 +163,92 @@ public static void batchForwardJavaPrefill(Model model, State state, int[] token
         // logits are not needed for any token in a prefill batch.
     }
 
+    private static final int Q8_0_BLOCK_SIZE  = 32; // quantized values per Q8_0 block
+    private static final int Q8_0_BLOCK_BYTES = 34; // bytes per block: 2-byte FP16 scale followed by 32 one-byte quants
+
     /**
      * GPU batched prefill forward pass (Phase 4).
      *
-     * <p>Delegates the full chunk to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardBatchPrefill},
-     * which handles embedding lookup and GPU execution internally.</p>
+     * <p>Copies {@code chunkSize} token embeddings into device-visible state buffers,
+     * then delegates graph execution to the plan.</p>
      *
      * @param model     the LLaMA model
+     * @param state     mutable inference state
      * @param tokens    token ids for this chunk
      * @param startPos  sequence position of {@code tokens[0]}
      * @param chunkSize number of tokens in this chunk
      * @param plan      the batched prefill/decode GPU plan
      */
-    public static void batchForwardTornadoVMPrefill(Model model, int[] tokens, int startPos, int chunkSize,
-            TornadoVMMasterPlanWithBatchPrefillDecode plan) {
-        plan.tornadoVMForwardBatchPrefill(tokens, startPos, model, chunkSize);
+    public static void batchForwardTornadoVMPrefill(Model model, State state, int[] tokens, int startPos,
+            int chunkSize, TornadoVMMasterPlanBatchPrefillDecode plan) {
+        final Configuration config = model.configuration();
+        final TornadoWeights weights = (TornadoWeights) model.weights();
+
+        state.batchStartPosHolder.set(0, startPos); // sequence position of tokens[0] for this chunk
+
+        switch (weights.getWeightType()) {
+            case F16 -> { // FP16 rows are copied verbatim, one after another, into embeddingXBatch
+                MemorySegment embTable = weights.getTokenEmbeddingTable().asHalfFloatArray().getSegment();
+                long dimBytes = (long) config.dim() * Short.BYTES;
+                for (int b = 0; b < chunkSize; b++) {
+                    MemorySegment.copy(embTable, (long) tokens[b] * dimBytes,
+                            state.embeddingXBatch.getSegment(), (long) b * dimBytes, dimBytes);
+                }
+            }
+            case Q8_0 -> { // rows are dequantized on the host (quant * scale) and written to wrapXBatch
+                var embTable = weights.getTokenEmbeddingTable().asByteArray();
+                int dim = config.dim();
+                int blocksPerRow = (dim + Q8_0_BLOCK_SIZE - 1) / Q8_0_BLOCK_SIZE;
+                for (int b = 0; b < chunkSize; b++) {
+                    for (int j = 0; j < dim; j += Q8_0_BLOCK_SIZE) { // j = first element of the current block
+                        int blockByteOffset = (tokens[b] * blocksPerRow + j / Q8_0_BLOCK_SIZE) * Q8_0_BLOCK_BYTES;
+                        float scale = embTable.getHalfFloat(blockByteOffset).getFloat32(); // one scale per block: decode once
+                        for (int k = 0; k < Q8_0_BLOCK_SIZE && j + k < dim; k++) { // quants start after the 2-byte scale
+                            state.wrapXBatch.set(b * dim + j + k, embTable.get(blockByteOffset + 2 + k) * scale);
+                        }
+                    }
+                }
+            }
+            default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
+        }
+
+        plan.tornadoVMForwardBatchPrefill();
+    }
 
     /**
      * GPU decode forward pass (Phase 4).
      *
-     * <p>Delegates a single-token decode step to
-     * {@link TornadoVMMasterPlanWithBatchPrefillDecode#tornadoVMForwardDecode},
-     * which copies the token embedding and runs the decode + logits graphs.</p>
+     * <p>Copies the token embedding into device-visible state, then delegates
+     * graph execution to the plan.</p>
      *
      * @param model    the LLaMA model
+     * @param state    mutable inference state
      * @param token    current token id
      * @param position sequence position
      * @param plan     the batched prefill/decode GPU plan
      * @return logits array for token sampling
      */
-    public static FloatArray forwardTornadoVMDecode(Model model, int token, int position,
-            TornadoVMMasterPlanWithBatchPrefillDecode plan) {
-        return plan.tornadoVMForwardDecode(token, position, model);
+    public static FloatArray forwardTornadoVMDecode(Model model, State state, int token, int position,
+            TornadoVMMasterPlanBatchPrefillDecode plan) {
+        final int dim = model.configuration().dim();
+        final TornadoWeights tornadoWeights = (TornadoWeights) model.weights();
+
+        switch (tornadoWeights.getWeightType()) {
+            case F16 -> { // one row = dim half-floats, copied verbatim
+                final long rowBytes = (long) dim * Short.BYTES;
+                MemorySegment table = tornadoWeights.getTokenEmbeddingTable().asHalfFloatArray().getSegment();
+                MemorySegment.copy(table, token * rowBytes, state.embeddingX.getSegment(), 0L, rowBytes);
+            }
+            case Q8_0 -> { // one row = ceil(dim / 32) blocks, copied still quantized
+                final int blocks = (dim + Q8_0_BLOCK_SIZE - 1) / Q8_0_BLOCK_SIZE;
+                final long rowBytes = (long) blocks * Q8_0_BLOCK_BYTES;
+                MemorySegment table = tornadoWeights.getTokenEmbeddingTable().asByteArray().getSegment();
+                MemorySegment.copy(table, token * rowBytes, state.embeddingX.getSegment(), 0L, rowBytes);
+            }
+            default -> throw new IllegalArgumentException(
+                    "Unsupported weight type: " + tornadoWeights.getWeightType());
+        }
+
+        return plan.tornadoVMForwardDecode(position);
+    }
 }
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCoreWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceCoreWithPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Configuration;
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tensor.standard.FloatTensor;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;
 
 import java.lang.foreign.MemorySegment;
 
@@ -131,7 +131,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
      *
      * <p>Copies the token embedding into {@code state.embeddingX} (same as
      * {@link InferenceCore#forwardTornadoVM}) then delegates to
-     * {@link TornadoVMMasterPlanWithPrefillDecode#tornadoVMForwardPrefill},
+     * {@link TornadoVMMasterPlanPrefillDecode#tornadoVMForwardPrefill},
      * which executes preprocessing + layer graphs but skips the logits graph.</p>
      *
-     * @param model       the LLaMA model (must carry {@link TornadoWeights}, FP16 only)
+     * @param model       the LLaMA model (must carry {@link TornadoWeights}; FP16 or Q8_0)
@@ -142,7 +142,7 @@ public static void forwardJavaPrefill(Model model, State state, int token, int p
-     * @throws UnsupportedOperationException if the model uses Q8_0 weights
+     * @throws IllegalArgumentException if the weight type is not supported
      */
     public static void forwardTornadoVMPrefill(Model model, State state, int token, int position,
-            TornadoVMMasterPlanWithPrefillDecode prefillPlan) {
+            TornadoVMMasterPlanPrefillDecode prefillPlan) {
         final Configuration configuration = model.configuration();
         final TornadoWeights weights = (TornadoWeights) model.weights();
 
@@ -153,9 +153,13 @@ public static void forwardTornadoVMPrefill(Model model, State state, int token,
                 MemorySegment.copy(tokenEmbeddings, (long) token * configuration.dim() * bytes,
                         state.embeddingX.getSegment(), 0, (long) configuration.dim() * bytes);
             }
-            case Q8_0 -> throw new UnsupportedOperationException(
-                    // TODO Phase 4: implement Q8_0 GPU batched prefill kernels
-                    "GPU prefill/decode path not yet implemented for Q8_0 weights");
+            case Q8_0 -> {
+                MemorySegment tokenEmbeddings = weights.getTokenEmbeddingTable().asByteArray().getSegment();
+                int blocksPerToken = (configuration.dim() + 31) / 32;
+                long bytesPerToken = (long) blocksPerToken * 34;
+                MemorySegment.copy(tokenEmbeddings, (long) token * bytesPerToken,
+                        state.embeddingX.getSegment(), 0, bytesPerToken);
+            }
             default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
         }
 

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithBatchPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanBatchPrefillDecode;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -163,8 +163,8 @@ public static List<Integer> generateTokensGPULlama(
                 ? config.contextLength() : maxTokens;
         final int batchSize = TornadoVMMasterPlan.PREFILL_BATCH_SIZE;
 
-        TornadoVMMasterPlanWithBatchPrefillDecode plan =
-                (TornadoVMMasterPlanWithBatchPrefillDecode) tornadoVMPlan;
+        TornadoVMMasterPlanBatchPrefillDecode plan =
+                (TornadoVMMasterPlanBatchPrefillDecode) tornadoVMPlan;
 
         List<Integer> generatedTokens = new ArrayList<>();
 
@@ -185,7 +185,7 @@ public static List<Integer> generateTokensGPULlama(
             int chunkSize = chunkEnd - chunkStart;
             int[] chunk   = Arrays.copyOfRange(prefillSeq, chunkStart, chunkEnd);
 
-            InferenceCoreBatchPrefillDecode.batchForwardTornadoVMPrefill(model, chunk, pos + chunkStart, chunkSize, plan);
+            InferenceCoreBatchPrefillDecode.batchForwardTornadoVMPrefill(model, state, chunk, pos + chunkStart, chunkSize, plan);
 
             if (echo) {
                 for (int b = 0; b < chunkSize; b++) {
@@ -203,7 +203,7 @@ public static List<Integer> generateTokensGPULlama(
 
         // ── Decode ────────────────────────────────────────────────────────────
         while (pos < actualMaxTokens) {
-            var logits = InferenceCoreBatchPrefillDecode.forwardTornadoVMDecode(model, currentToken, pos, plan);
+            var logits = InferenceCoreBatchPrefillDecode.forwardTornadoVMDecode(model, state, currentToken, pos, plan);
             int nextToken = sampler.sampleToken(logits);
 
             if (echo) {

diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
@@ -7,7 +7,7 @@
 import org.beehive.gpullama3.model.Model;
 import org.beehive.gpullama3.tokenizer.Tokenizer;
 import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlan;
-import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanWithPrefillDecode;
+import org.beehive.gpullama3.tornadovm.TornadoVMMasterPlanPrefillDecode;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -135,8 +135,8 @@ public static List<Integer> generateTokensGPULlama(
         int actualMaxTokens = (maxTokens < 0 || config.contextLength() < maxTokens)
                 ? config.contextLength() : maxTokens;
 
-        TornadoVMMasterPlanWithPrefillDecode prefillPlan =
-                (TornadoVMMasterPlanWithPrefillDecode) tornadoVMPlan;
+        TornadoVMMasterPlanPrefillDecode prefillPlan =
+                (TornadoVMMasterPlanPrefillDecode) tornadoVMPlan;
 
         List<Integer> generatedTokens = new ArrayList<>();