diff --git a/jmh_full_get_java21.json b/jmh_full_get_java21.json
new file mode 100644
index 0000000..1445fff
--- /dev/null
+++ b/jmh_full_get_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "GetJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [10, 50, 512, 1024, 4096, 8192, 16384, 32768, 65536],
+        "cacheMB": [1],
+        "checksum": ["none", "copyout"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 20,
+        "warmuptime": "50ms",
+        "iterations": 50,
+        "time": "500ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmh_full_put_java21.json b/jmh_full_put_java21.json
new file mode 100644
index 0000000..592c75d
--- /dev/null
+++ b/jmh_full_put_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "PutJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [10, 50, 512, 1024, 4096, 8192, 16384, 32768, 65536],
+        "cacheMB": [1],
+        "checksum": ["none", "copyin"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 20,
+        "warmuptime": "50ms",
+        "iterations": 50,
+        "time": "500ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmh_plot.json b/jmh_plot.json
index da9d398..cb7aa90 100644
--- a/jmh_plot.json
+++ b/jmh_plot.json
@@ -5,7 +5,8 @@
                 "name": "valueSize",
                 "min": 1024
             },
-            "label": "allbig"
+            "label": "allbig",
+            "valueSizeTitle": ">= 1024"
         },
         {
             "xaxisparam": {
@@ -13,7 +14,8 @@
                 "min": 1,
                 "max": 4096
             },
-            "label": "allsmall"
+            "label": "allsmall",
+            "valueSizeTitle": "<= 4096"
             # defaults to include_patterns of all matching
         },
         {
@@ -23,7 +25,8 @@
                 "max": 4096
             },
             "exclude_patterns": ["Pooled"],
-            "label": "nopoolsmall"
+            "label": "nopoolsmall",
+            "valueSizeTitle": "<= 4096"
         },
         {
             "xaxisparam": {
@@ -31,7 +34,8 @@
                 "min": 1024
             },
             "exclude_patterns": ["Pooled"],
-            "label": "nopoolbig"
+            "label": "nopoolbig",
+            "valueSizeTitle": ">= 1024"
         },
         {
             "xaxisparam": {
@@ -84,4 +88,4 @@
         }
     ],
     "result.path": "./analysis/testplots"
-}
\ No newline at end of file
+}
diff --git a/jmh_small_get_java21.json b/jmh_small_get_java21.json
new file mode 100644
index 0000000..a13c410
--- /dev/null
+++ b/jmh_small_get_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "GetJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [10, 50, 512, 1024, 4096, 8192, 16384, 32768, 65536],
+        "cacheMB": [1],
+        "checksum": ["none", "copyout"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 10,
+        "warmuptime": "20ms",
+        "iterations": 20,
+        "time": "200ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmh_small_put_java21.json b/jmh_small_put_java21.json
new file mode 100644
index 0000000..e92e8a0
--- /dev/null
+++ b/jmh_small_put_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "PutJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [10, 50, 512, 1024, 4096, 8192, 16384, 32768, 65536],
+        "cacheMB": [1],
+        "checksum": ["none", "copyin"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 5,
+        "warmuptime": "20ms",
+        "iterations": 10,
+        "time": "100ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmh_tiny_get_java21.json b/jmh_tiny_get_java21.json
new file mode 100644
index 0000000..9d3b78e
--- /dev/null
+++ b/jmh_tiny_get_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "GetJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [50, 1024, 4096, 16384],
+        "cacheMB": [1],
+        "checksum": ["none", "copyout"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 5,
+        "warmuptime": "10ms",
+        "iterations": 5,
+        "time": "50ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmh_tiny_put_java21.json b/jmh_tiny_put_java21.json
new file mode 100644
index 0000000..e390dd5
--- /dev/null
+++ b/jmh_tiny_put_java21.json
@@ -0,0 +1,19 @@
+{
+    "benchmark": "PutJNIBenchmark",
+    "jvmargs": ["Xmx4G", "XX:ErrorFile=./results/hs_err_pid%p.log", "XX:+HeapDumpOnOutOfMemoryError", "-enable-preview"],
+    "params": {
+        "valueSize": [50, 1024, 4096, 16384],
+        "cacheMB": [1],
+        "checksum": ["none", "copyin"]
+    },
+    "options": {
+        "batchsize": 1,
+        "warmupiterations": 5,
+        "warmuptime": "10ms",
+        "iterations": 5,
+        "time": "50ms"
+    },
+    "result.path": "./results",
+    "java.library.path": "target/jni-benchmarks-1.0.1-SNAPSHOT-application/jni-benchmarks-1.0.1-SNAPSHOT/lib",
+    "jar": "target/jni-benchmarks-1.0.1-SNAPSHOT-benchmarks.nar"
+}
diff --git a/jmhplot.py b/jmhplot.py
index bf4089a..c626c88 100755
--- a/jmhplot.py
+++ b/jmhplot.py
@@ -35,6 +35,7 @@
 
 import numpy as np
 import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
 import pandas as pd
 from pandas.core.frame import DataFrame
 import re
@@ -116,8 +117,8 @@ def normalize_data_frame_from_path(path: pathlib.Path):
         except pd.errors.EmptyDataError:
             break
 
-        # every 9th line is the interesting one, discard the rest
-        df = df.iloc[::9, :]
+        # keep only primary rows (name has no ':'), instead of assuming every 9th row is the interesting one
+        df = df[~df['Benchmark'].str.contains(':')]
         df["Benchmark"] = df["Benchmark"].apply(lambda x: x.split('.')[-1])
         if normalized is None:
             normalized = df
@@ -193,11 +194,11 @@ def tuple_of_secondary_keys(params: BMParams) -> Tuple:
     return tuple(secondaryKeys)
 
 
-def plot_all_results(params: BMParams, resultSets: ResultSets, path, include_benchmarks: str, exclude_benchmarks: str, label: str) -> None:
+def plot_all_results(params: BMParams, xaxisparam: Dict, result_sets: ResultSets, path, include_benchmarks: str, exclude_benchmarks: str, label: str, value_size_title: str, system_info: str) -> None:
+    """Render one figure per entry of result_sets, handing the plot options through to plot_result_set."""
     indexKeys = tuple_of_secondary_keys(params)
-    for indexTuple, resultSet in resultSets.items():
-        plot_result_set(indexKeys, indexTuple, resultSet,
-                        path, include_benchmarks, exclude_benchmarks, label)
+    for index_tuple, result_set in result_sets.items():
+        plot_result_set(xaxisparam, indexKeys, index_tuple, result_set, path, include_benchmarks, exclude_benchmarks, label, value_size_title, system_info)
 
 
 def plot_result_axis_errorbars(ax, resultSet: ResultSet) -> None:
@@ -256,25 +257,42 @@ def plot_result_axis_bars(ax, resultSet: ResultSet) -> None:
         bmIndex = bmIndex + 1
 
 
-def plot_result_set(indexKeys: Tuple, indexTuple: Tuple, resultSet: ResultSet, path: pathlib.Path, include_benchmarks: str, exclude_benchmarks: str, label: str):
+def plot_result_set(xaxisparam: Dict, indexKeys: Tuple, indexTuple: Tuple, resultSet: ResultSet, path: pathlib.Path, include_benchmarks: str, exclude_benchmarks: str, label: str, value_size_title: str, system_info: str):
+    """Plot one result set as bars and save it as fig_<index values>_<label>.png in `path` (its parent if `path` is a file)."""
+    # One colour per entry of the result set, sampled at even intervals from gist_ncar.
+    num_benchmarks = len(resultSet)
+    cmap = plt.get_cmap('gist_ncar')
+    colors = [cmap(i / num_benchmarks) for i in range(num_benchmarks)]
+
+    # NOTE: plt.rc changes the process-wide default colour cycle; it has to
+    # run before the axes below are created for the cycle to take effect.
+    plt.rc('axes', prop_cycle=plt.cycler('color', colors))
+
     fig = plt.figure(num=None, figsize=(18, 12), dpi=80,
                      facecolor='w', edgecolor='k')
     ax = plt.subplot()
 
     plot_result_axis_bars(ax, resultSet)
 
-    plt.title(
-        f'{str(indexKeys)}={str(indexTuple)} include={include_benchmarks} exclude={exclude_benchmarks}')
-    plt.xlabel("X")
+    # More tick marks for the (log-scale) x axis; only the major ticks get labels.
+    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=15))
+    ax.xaxis.set_minor_locator(ticker.LogLocator(base=10.0, subs='auto', numticks=15))
+    ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
+    ax.xaxis.set_minor_formatter(ticker.NullFormatter())
+
+    plt.suptitle(system_info)
+    title = f'{str(indexKeys)}={str(indexTuple)} include={include_benchmarks} exclude={exclude_benchmarks} Value Size="{value_size_title}"'
+    plt.title(title)
+    plt.xlabel(extract_parameter_name(xaxisparam))
     plt.ylabel("t (ns)")
-    plt.legend(loc='lower right')
-    plt.grid(b='True', which='both')
+    plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+    plt.grid(visible=True, which='both')
 
     name = f'fig_{"_".join([str(t) for t in indexTuple])}_{label}.png'
 
     if path.is_file():
-        path = path.parent()
-    fig.savefig(path.joinpath(name))
+        path = path.parent
+    fig.savefig(path.joinpath(name), bbox_inches='tight')
 
 
 alpha_pattern = re.compile(f'[A-Za-z0-9_\-+]')
@@ -311,7 +329,7 @@ def filter_for_benchmarks(dataframe: DataFrame, include_benchmarks, exclude_benc
 
 def filter_for_range(dataframe: DataFrame, xaxisparam: Dict) -> DataFrame:
 
-    param_name = required('name', xaxisparam)
+    param_name = extract_parameter_name(xaxisparam)
     xmin = optional('min', xaxisparam, lambda x: int(x))
     xmax = optional('max', xaxisparam, lambda x: int(x))
     if xmax is None and xmin is None:
@@ -329,6 +347,14 @@ def filter_for_range(dataframe: DataFrame, xaxisparam: Dict) -> DataFrame:
         lambda x: int(x) >= xmin and int(x) <= xmax)]
 
 
+def extract_parameter_name(xaxisparam: Dict):
+    return required('name', xaxisparam)  # the 'name' entry of the x-axis spec
+
+
+def default_if_none(optional_string, default_value: str) -> str:
+    return optional_string if optional_string is not None else default_value
+
+
 def process_some_plots(path: pathlib.Path, plot: Dict) -> None:
 
     xaxisparam = required('xaxisparam', plot)
@@ -337,6 +363,26 @@ def process_some_plots(path: pathlib.Path, plot: Dict) -> None:
     include_benchmarks = optional('include_patterns', plot)
     exclude_benchmarks = optional('exclude_patterns', plot)
     label = required('label', plot)
+    value_size_title = default_if_none(optional('valueSizeTitle', plot), "All")
+
+    # Check for system_info.json in the path
+    system_info = None
+    system_info_file = None
+    if path.is_dir():
+        system_info_file = path.joinpath('system_info.json')
+    if path.is_file():
+        system_info_file = path.parent.joinpath('system_info.json')
+
+    if system_info_file and system_info_file.exists():
+        try:
+            with system_info_file.open(mode='r', encoding='UTF-8') as f:
+                info_json = json.load(f)
+                system_info = info_json.get('system_info')
+        except Exception:
+            pass
+
+    if system_info is None:
+        system_info = "System Info unavailable"
 
     dataframe = normalize_data_frame_from_path(path)
     if len(dataframe) == 0:
@@ -357,8 +403,8 @@ def process_some_plots(path: pathlib.Path, plot: Dict) -> None:
     params: BMParams = split_params(
         extract_params(dataframe), primary_param_name)
     resultSets = extract_results_per_param(dataframe, params)
-    plot_all_results(params, resultSets, path,
-                     include_benchmarks, exclude_benchmarks, label)
+    plot_all_results(params, xaxisparam, resultSets, path,
+                     include_benchmarks, exclude_benchmarks, label, value_size_title, system_info)
 
 
 def process_benchmarks(config: Dict) -> None:
diff --git a/jmhrun.py b/jmhrun.py
index ebc9bd7..771c026 100755
--- a/jmhrun.py
+++ b/jmhrun.py
@@ -32,6 +32,7 @@
 import pathlib
 import json
 import subprocess
+import platform
 from typing import Dict
 
 
@@ -112,6 +113,81 @@ def output_options(config: Dict) -> list:
     return ['-rff', str(path.joinpath(pathlib.Path(f'jmh_{const_datetime_str}.csv')))]
 
 
+def get_system_info() -> str:
+    """Describe the benchmark host as a single line of text.
+
+    The result has the form
+    "<arch> - <cpu> - <ram> - <os> - Kernel: <kernel> - <java version line>".
+    Every probe is best-effort: a field that cannot be determined is reported
+    by a fallback text, and no Exception escapes this function.
+    """
+    def first_value(file_name: str, key: str, separator: str) -> str:
+        # Value of the first line of file_name that starts with key, or "".
+        try:
+            with open(file_name, "r") as f:
+                for line in f:
+                    if line.startswith(key) and separator in line:
+                        # split once only: the value itself may contain the separator
+                        return line.split(separator, 1)[1].strip()
+        except (OSError, UnicodeError):
+            pass  # an unreadable file is handled like a missing key
+        return ""
+
+    def sysctl(name: str) -> str:
+        return subprocess.check_output(['sysctl', '-n', name]).decode().strip()
+
+    try:
+        arch = platform.machine()
+        system = platform.system()
+        kernel = platform.release()
+        cpu_model = ram_info = os_info = ""
+
+        try:
+            java_version_out = subprocess.check_output(
+                ['java', '-version'], stderr=subprocess.STDOUT, timeout=30).decode().strip()
+            # The first line usually contains the version information
+            java_info = java_version_out.splitlines()[0]
+        except Exception:
+            java_info = "Unknown Java"
+
+        if system == "Darwin":
+            try:
+                cpu_model = sysctl('machdep.cpu.brand_string')
+            except Exception:
+                pass  # falls back to platform.processor() below
+
+            try:
+                ram_info = f"{int(sysctl('hw.memsize')) // (1024**3)}GB RAM"
+            except Exception:
+                pass  # reported as "Unknown RAM" below
+
+            os_info = f"macOS {platform.mac_ver()[0]}"
+        elif system == "Linux":
+            cpu_model = first_value("/proc/cpuinfo", "model name", ":")
+
+            try:
+                mem_kb = int(first_value("/proc/meminfo", "MemTotal", ":").split()[0])
+                ram_info = f"{mem_kb // (1024**2)}GB RAM"
+            except (IndexError, ValueError):
+                pass  # reported as "Unknown RAM" below
+
+            try:
+                import lsb_release
+                os_info = lsb_release.get_distro_information()['DESCRIPTION']
+            except Exception:
+                os_info = first_value("/etc/os-release", "PRETTY_NAME", "=").strip('"')
+
+        # Shared fallbacks, so that no field is left blank in the output.
+        cpu_model = cpu_model or platform.processor() or "Unknown CPU"
+        ram_info = ram_info or "Unknown RAM"
+        os_info = os_info or f"{system} {platform.release()}"
+
+        return f"{arch} - {cpu_model} - {ram_info} - {os_info} - Kernel: {kernel} - {java_info}"
+
+    except Exception as e:
+        return f"Unknown System - {str(e)}"
+
+
 def build_jmh_command(config: Dict) -> list:
 
     cmd = ["java"]
@@ -192,6 +268,11 @@ def log_jmh_session(cmd: list, config: Dict, config_file: str):
         log.writelines(line + '\n' for line in
                        ['```', '#### Command', 'The java command executed to run the tests', '```', ' '.join(cmd), '```'])
 
+    # Save system info
+    system_info_file = output_dir_path(config).joinpath('system_info.json')
+    with system_info_file.open(mode='w', encoding='UTF-8') as f:
+        json.dump({"system_info": get_system_info()}, f, indent=4)
+
 
 def exec_jmh_cmd(cmd: list, help_requested):
     cmd_str = ' '.join(cmd)
diff --git a/pom.xml b/pom.xml
index f67ad03..b07a44b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,5 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
 
     <groupId>com.evolvedbinary.jni</groupId>
@@ -195,7 +196,8 @@
                         <configuration>
                             <finalName>${project.artifactId}-${project.version}-${uberjar.name}</finalName>
                             <transformers>
-                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                <transformer
+                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                     <mainClass>org.openjdk.jmh.Main</mainClass>
                                 </transformer>
                             </transformers>
@@ -277,6 +279,21 @@
                 <java.source.version>21</java.source.version>
                 <java.target.version>21</java.target.version>
             </properties>
+            <build>
+                <plugins>
+                    <plugin>
+                        <artifactId>maven-compiler-plugin</artifactId>
+                        <configuration>
+                            <compilerArgs>
+                                <arg>-proc:full</arg>
+                                <arg>-h</arg>
+                                <arg>${project.build.directory}/nar/javah-include</arg>
+                                <arg>--enable-preview</arg>
+                            </compilerArgs>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
         </profile>
         <profile>
             <id>java25</id>
diff --git a/src/main/c++/getputjni/GetPutJNI.cpp b/src/main/c++/getputjni/GetPutJNI.cpp
index 810e74e..e4b67c8 100644
--- a/src/main/c++/getputjni/GetPutJNI.cpp
+++ b/src/main/c++/getputjni/GetPutJNI.cpp
@@ -377,6 +377,48 @@ jint Java_com_evolvedbinary_jnibench_common_getputjni_GetPutJNI_getIntoIndirectB
   return get_size;
 }
 
+/*
+ * Plain C entry point bound from Java through the Foreign Function & Memory
+ * API: copies the value stored under `key` into the caller-supplied buffer.
+ *
+ * key      - passed to GetByteArrayInternal; no length accompanies it, so it
+ *            presumably has to be NUL-terminated (TODO confirm)
+ * dest     - destination buffer
+ * dest_len - capacity of dest in bytes
+ *
+ * Returns the number of bytes copied (the value is truncated to dest_len),
+ * or 0 if a pointer is null or dest_len is negative.
+ */
+extern "C" int getIntoMemorySegment(const char* key, char* dest, int dest_len) {
+    // A negative length would become a huge size_t inside memcpy.
+    if (key == nullptr || dest == nullptr || dest_len < 0) {
+        return 0;
+    }
+    std::string value = GetByteArrayInternal(key);
+    int size = std::min((int)value.size(), dest_len);
+    memcpy(dest, value.c_str(), size);
+    return size;
+}
+
+/*
+ * Write counterpart of getIntoMemorySegment: copies src_len bytes from `src`
+ * into the buffer that GetByteArrayInternalForWrite provides for `key`.
+ *
+ * Returns src_len, or 0 if nothing was copied because a pointer is null,
+ * src_len is negative, or no write buffer was returned.
+ */
+extern "C" int putFromMemorySegment(const char* key, const char* src, int src_len) {
+    if (key == nullptr || src == nullptr || src_len < 0) {
+        return 0;
+    }
+    char *db_buf = GetByteArrayInternalForWrite(key, src_len);
+    if (db_buf == nullptr) {
+        return 0;
+    }
+    memcpy(db_buf, src, src_len);
+    return src_len;
+}
+
 /*
  * Class:     com_evolvedbinary_jnibench_common_getputjni_GetPutJNI
  * Method:    putFromIndirectByteBufferGetRegion
diff --git a/src/main/java/com/evolvedbinary/jnibench/jmhbench/GetJNIBenchmark.java b/src/main/java/com/evolvedbinary/jnibench/jmhbench/GetJNIBenchmark.java
index 7887ecb..e665cd0 100644
--- a/src/main/java/com/evolvedbinary/jnibench/jmhbench/GetJNIBenchmark.java
+++ b/src/main/java/com/evolvedbinary/jnibench/jmhbench/GetJNIBenchmark.java
@@ -1,18 +1,18 @@
 /**
  * Copyright © 2021, Evolved Binary Ltd
  * All rights reserved.
- *
+ * <p>
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the <organization> nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the <organization> nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ * <p>
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@@ -28,22 +28,44 @@
 
 import com.evolvedbinary.jnibench.common.getputjni.GetPutJNI;
 import com.evolvedbinary.jnibench.consbench.NarSystem;
-import com.evolvedbinary.jnibench.jmhbench.cache.*;
-import com.evolvedbinary.jnibench.jmhbench.common.*;
-import io.netty.buffer.PooledByteBufAllocator;
-import org.openjdk.jmh.annotations.*;
-import org.openjdk.jmh.infra.Blackhole;
-import org.openjdk.jmh.runner.Runner;
-import org.openjdk.jmh.runner.RunnerException;
-import org.openjdk.jmh.runner.options.Options;
-import org.openjdk.jmh.runner.options.OptionsBuilder;
+import com.evolvedbinary.jnibench.jmhbench.cache.AllocationCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.ByteArrayCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.DirectByteBufferCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.IndirectByteBufferCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.MemorySegmentCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.NettyByteBufCache;
+import com.evolvedbinary.jnibench.jmhbench.cache.UnsafeBufferCache;
+import com.evolvedbinary.jnibench.jmhbench.common.JMHCaller;
 import io.netty.buffer.ByteBuf;
-
+import io.netty.buffer.PooledByteBufAllocator;
+import java.lang.foreign.Arena;
+import java.lang.foreign.FunctionDescriptor;
+import java.lang.foreign.Linker;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.SymbolLookup;
+import java.lang.foreign.ValueLayout;
+import java.lang.invoke.MethodHandle;
 import java.nio.ByteBuffer;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.concurrent.TimeUnit;
 import java.util.logging.Logger;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.RunnerException;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
 
 /**
  * Benchmark getting byte arrays from native methods.
@@ -56,288 +78,371 @@
 //@Measurement(iterations = 500, time = 2000, timeUnit = TimeUnit.NANOSECONDS)
 public class GetJNIBenchmark {
 
-    private static final Logger LOG = Logger.getLogger(GetJNIBenchmark.class.getName());
-
-    static {
-        NarSystem.loadLibrary();
-    }
-
-    @State(Scope.Benchmark)
-    public static class GetJNIBenchmarkState {
-
-        @Param({
-                "10",
-                "50",
-                "128",
-                "512",
-                "1024",
-                "4096",
-                "8192",
-                "16384",
-                "32768",
-                "65536",
-                "131072"})
-        int valueSize;
-
-        @Param({"4", "16"}) int cacheMB;
-        final static int MB = 1024 * 1024;
-        @Param({"1024"}) int cacheEntryOverhead;
-
-        @Param({"none", "copyout", "bytesum", "longsum"}) String checksum;
-        AllocationCache.Checksum readChecksum;
-
-        String keyBase;
-        byte[] keyBytes;
-
-        JMHCaller caller;
-
-        @Setup
-        public void setup() {
-            this.caller = JMHCaller.fromStack();
-
-            keyBase = "testKeyWithReturnValueSize" + String.format("%07d", valueSize) + "Bytes";
-
-            keyBytes = keyBase.getBytes();
-
-            readChecksum = AllocationCache.Checksum.valueOf(checksum);
-        }
-
-        @TearDown
-        public void tearDown() {
-
-        }
-    }
-
-    @State(Scope.Thread)
-    public static class GetJNIThreadState {
-
-        private DirectByteBufferCache directByteBufferCache = new DirectByteBufferCache();
-        private UnsafeBufferCache unsafeBufferCache = new UnsafeBufferCache();
-        private ByteArrayCache byteArrayCache = new ByteArrayCache();
-        private IndirectByteBufferCache indirectByteBufferCache = new IndirectByteBufferCache();
-        private PooledByteBufAllocator pooledByteBufAllocator;
-        private NettyByteBufCache nettyByteBufCache = new NettyByteBufCache();
-
-        int valueSize;
-        int cacheSize;
-
-        @Setup
-        public void setup(GetJNIBenchmarkState benchmarkState, Blackhole blackhole) {
-            valueSize = benchmarkState.valueSize;
-            cacheSize = benchmarkState.cacheMB * GetJNIBenchmarkState.MB;
-
-            switch (benchmarkState.caller.benchmarkMethod) {
-                case "getIntoPooledNettyByteBuf":
-                    pooledByteBufAllocator = PooledByteBufAllocator.DEFAULT;
-                    //create a 0-sized cache so that we can use it to do checksum
-                    nettyByteBufCache.setup(valueSize, 0/*cacheSize*/, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                case "getIntoNettyByteBuf":
-                    nettyByteBufCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                case "getIntoDirectByteBuffer":
-                    directByteBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                case "getIntoIndirectByteBufferSetRegion":
-                case "getIntoIndirectByteBufferGetElements":
-                case "getIntoIndirectByteBufferGetCritical":
-                    indirectByteBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                case "getIntoDirectByteBufferFromUnsafe":
-                case "buffersOnlyDirectByteBufferFromUnsafe":
-                case "getIntoUnsafe":
-                    unsafeBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                case "getIntoByteArraySetRegion":
-                case "getIntoByteArrayGetElements":
-                case "getIntoByteArrayCritical":
-                    byteArrayCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum, blackhole);
-                    break;
-                default:
-                    throw new RuntimeException("Don't know how to setup() for benchmark: " + benchmarkState.caller.benchmarkMethod);
-            }
-        }
-
-        @TearDown
-        public void tearDown(GetJNIBenchmarkState benchmarkState) {
-
-            switch (benchmarkState.caller.benchmarkMethod) {
-                case "getIntoPooledNettyByteBuf":
-                    pooledByteBufAllocator = null;
-                    break;
-                case "getIntoNettyByteBuf":
-                    nettyByteBufCache.tearDown();
-                    break;
-                case "getIntoDirectByteBuffer":
-                    directByteBufferCache.tearDown();
-                    break;
-                case "getIntoIndirectByteBufferSetRegion":
-                case "getIntoIndirectByteBufferGetElements":
-                case "getIntoIndirectByteBufferGetCritical":
-                    indirectByteBufferCache.tearDown();
-                    break;
-                case "getIntoDirectByteBufferFromUnsafe":
-                case "buffersOnlyDirectByteBufferFromUnsafe":
-                case "getIntoUnsafe":
-                    unsafeBufferCache.tearDown();
-                    break;
-                case "getIntoByteArraySetRegion":
-                case "getIntoByteArrayGetElements":
-                case "getIntoByteArrayCritical":
-                    byteArrayCache.tearDown();
-                    break;
-                default:
-                    throw new RuntimeException("Don't know how to tearDown() for benchmark: " + benchmarkState.caller.benchmarkMethod);
-            }
-        }
+  private static final Logger LOG = Logger.getLogger(GetJNIBenchmark.class.getName());
+
+  private static final MethodHandle GET_INTO_MEMORY_SEGMENT_HANDLE;
+
+  static {
+    NarSystem.loadLibrary();
+
+    // Linker for the platform ABI, plus a lookup over the libraries loaded by this class's loader.
+    Linker linker = Linker.nativeLinker();
+    SymbolLookup loaderLookup = SymbolLookup.loaderLookup();
+
+    // Bind the downcall handle once, for: int getIntoMemorySegment(const char*, char*, int)
+    GET_INTO_MEMORY_SEGMENT_HANDLE = loaderLookup.find("getIntoMemorySegment")
+        .map(symbol -> linker.downcallHandle(symbol,
+            FunctionDescriptor.of(
+                ValueLayout.JAVA_INT,   // return value: number of bytes copied
+                ValueLayout.ADDRESS,    // key
+                ValueLayout.ADDRESS,    // dest
+                ValueLayout.JAVA_INT))) // dest_len
+        .orElseThrow(() -> new UnsatisfiedLinkError("Native symbol not found: getIntoMemorySegment"));
+  }
+
+  @State(Scope.Benchmark)
+  public static class GetJNIBenchmarkState {
+
+    @Param({
+        "10",
+        "50",
+        "128",
+        "512",
+        "1024",
+        "4096",
+        "8192",
+        "16384",
+        "32768",
+        "65536",
+        "131072"})
+    int valueSize;
+
+    @Param({"4", "16"})
+    int cacheMB;
+    final static int MB = 1024 * 1024;
+    @Param({"1024"})
+    int cacheEntryOverhead;
+
+    @Param({"none", "copyout", "bytesum", "longsum"})
+    String checksum;
+    AllocationCache.Checksum readChecksum;
+
+    String keyBase;
+    byte[] keyBytes;
+    private Arena arena;
+    private MemorySegment keyMemorySegment;
+
+    JMHCaller caller;
+
+    @Setup
+    public void setup() {
+      this.caller = JMHCaller.fromStack();
+      arena = Arena.ofShared();
+
+      keyBase = "testKeyWithReturnValueSize" + String.format("%07d", valueSize) + "Bytes";
+
+      keyBytes = keyBase.getBytes();
+      keyMemorySegment = arena.allocateArray(ValueLayout.JAVA_BYTE, keyBytes);
+
+      readChecksum = AllocationCache.Checksum.valueOf(checksum);
     }
 
-    //@Benchmark
-    public void buffersOnlyDirectByteBufferFromUnsafe(GetJNIThreadState threadState) {
-        UnsafeBufferCache.UnsafeBuffer unsafeBuffer = threadState.unsafeBufferCache.acquire();
-        threadState.unsafeBufferCache.release(unsafeBuffer);
+    @TearDown
+    public void tearDown() {
+      if (arena != null) {
+        arena.close();
+      }
     }
-
-    @Benchmark
-    public void getIntoDirectByteBuffer(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuffer byteBuffer = threadState.directByteBufferCache.acquire();
-        byteBuffer.clear();
-        GetPutJNI.getIntoDirectByteBuffer(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer, benchmarkState.valueSize);
-        threadState.directByteBufferCache.checksumBuffer(byteBuffer);
-        threadState.directByteBufferCache.release(byteBuffer);
-    }
-
-    @Benchmark
-    public void getIntoUnsafe(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        UnsafeBufferCache.UnsafeBuffer unsafeBuffer = threadState.unsafeBufferCache.acquire();
-        int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, unsafeBuffer.handle, benchmarkState.valueSize);
-        threadState.unsafeBufferCache.checksumBuffer(unsafeBuffer);
-        threadState.unsafeBufferCache.release(unsafeBuffer);
+  }
+
+  @State(Scope.Thread)
+  public static class GetJNIThreadState {
+
+    private DirectByteBufferCache directByteBufferCache = new DirectByteBufferCache();
+    private UnsafeBufferCache unsafeBufferCache = new UnsafeBufferCache();
+    private ByteArrayCache byteArrayCache = new ByteArrayCache();
+    private IndirectByteBufferCache indirectByteBufferCache = new IndirectByteBufferCache();
+    private PooledByteBufAllocator pooledByteBufAllocator;
+    private NettyByteBufCache nettyByteBufCache = new NettyByteBufCache();
+    private MemorySegmentCache memorySegmentCache = new MemorySegmentCache();
+
+    int valueSize;
+    int cacheSize;
+
+    @Setup
+    public void setup(GetJNIBenchmarkState benchmarkState, Blackhole blackhole) {
+      valueSize = benchmarkState.valueSize;
+      cacheSize = benchmarkState.cacheMB * GetJNIBenchmarkState.MB;
+
+      switch (benchmarkState.caller.benchmarkMethod) {
+        case "getIntoPooledNettyByteBuf":
+          pooledByteBufAllocator = PooledByteBufAllocator.DEFAULT;
+          //create a 0-sized cache so that we can use it to do checksum
+          nettyByteBufCache.setup(valueSize, 0/*cacheSize*/, benchmarkState.cacheEntryOverhead,
+                                  benchmarkState.readChecksum, blackhole);
+          break;
+        case "getIntoNettyByteBuf":
+          nettyByteBufCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum,
+                                  blackhole);
+          break;
+        case "getIntoDirectByteBuffer":
+          directByteBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead,
+                                      benchmarkState.readChecksum, blackhole);
+          break;
+        case "getIntoIndirectByteBufferSetRegion":
+        case "getIntoIndirectByteBufferGetElements":
+        case "getIntoIndirectByteBufferGetCritical":
+          indirectByteBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead,
+                                        benchmarkState.readChecksum, blackhole);
+          break;
+        case "getIntoDirectByteBufferFromUnsafe":
+        case "buffersOnlyDirectByteBufferFromUnsafe":
+        case "getIntoUnsafe":
+          unsafeBufferCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum,
+                                  blackhole);
+          break;
+        case "getIntoByteArraySetRegion":
+        case "getIntoByteArrayGetElements":
+        case "getIntoByteArrayCritical":
+          byteArrayCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum,
+                               blackhole);
+          break;
+        case "getIntoMemorySegment":
+          memorySegmentCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.readChecksum,
+                                   blackhole);
+          break;
+        default:
+          throw new RuntimeException(
+              "Don't know how to setup() for benchmark: " + benchmarkState.caller.benchmarkMethod);
+      }
     }
 
-    @Benchmark
-    public void getIntoPooledNettyByteBuf(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuf byteBuf = threadState.pooledByteBufAllocator.directBuffer(benchmarkState.valueSize);
-        byteBuf.readerIndex(0);
-        int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuf.memoryAddress(), benchmarkState.valueSize);
-        byteBuf.writerIndex(size);
-        //Use 0-sized cache which we created specially to do checksumBuffer operation
-        threadState.nettyByteBufCache.checksumBuffer(byteBuf);
-        // Allocated buffer already has retain count of 1
-        byteBuf.release();
+    @TearDown
+    public void tearDown(GetJNIBenchmarkState benchmarkState) {
+
+      switch (benchmarkState.caller.benchmarkMethod) {
+        case "getIntoPooledNettyByteBuf":
+          pooledByteBufAllocator = null;
+          break;
+        case "getIntoNettyByteBuf":
+          nettyByteBufCache.tearDown();
+          break;
+        case "getIntoDirectByteBuffer":
+          directByteBufferCache.tearDown();
+          break;
+        case "getIntoIndirectByteBufferSetRegion":
+        case "getIntoIndirectByteBufferGetElements":
+        case "getIntoIndirectByteBufferGetCritical":
+          indirectByteBufferCache.tearDown();
+          break;
+        case "getIntoDirectByteBufferFromUnsafe":
+        case "buffersOnlyDirectByteBufferFromUnsafe":
+        case "getIntoUnsafe":
+          unsafeBufferCache.tearDown();
+          break;
+        case "getIntoByteArraySetRegion":
+        case "getIntoByteArrayGetElements":
+        case "getIntoByteArrayCritical":
+          byteArrayCache.tearDown();
+          break;
+        case "getIntoMemorySegment":
+          memorySegmentCache.tearDown();
+          break;
+        default:
+          throw new RuntimeException(
+              "Don't know how to tearDown() for benchmark: " + benchmarkState.caller.benchmarkMethod);
+      }
     }
-
-    @Benchmark
-    public void getIntoNettyByteBuf(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuf byteBuf = threadState.nettyByteBufCache.acquire();
-        byteBuf.readerIndex(0);
-        int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuf.memoryAddress(), benchmarkState.valueSize);
-        byteBuf.writerIndex(size);
-        threadState.nettyByteBufCache.checksumBuffer(byteBuf);
-        threadState.nettyByteBufCache.release(byteBuf);
-    }
-
-    @Benchmark
-    public void getIntoByteArraySetRegion(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        byte[] array = threadState.byteArrayCache.acquire();
-        int size = GetPutJNI.getIntoByteArraySetRegion(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array, benchmarkState.valueSize);
-        threadState.byteArrayCache.checksumBuffer(array);
-        threadState.byteArrayCache.release(array);
+  }
+
+  //@Benchmark
+  public void buffersOnlyDirectByteBufferFromUnsafe(GetJNIThreadState threadState) {
+    UnsafeBufferCache.UnsafeBuffer unsafeBuffer = threadState.unsafeBufferCache.acquire();
+    threadState.unsafeBufferCache.release(unsafeBuffer);
+  }
+
+  @Benchmark
+  public void getIntoDirectByteBuffer(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                      Blackhole blackhole) {
+    ByteBuffer byteBuffer = threadState.directByteBufferCache.acquire();
+    byteBuffer.clear();
+    GetPutJNI.getIntoDirectByteBuffer(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer,
+                                      benchmarkState.valueSize);
+    threadState.directByteBufferCache.checksumBuffer(byteBuffer);
+    threadState.directByteBufferCache.release(byteBuffer);
+  }
+
+  @Benchmark
+  public void getIntoUnsafe(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
+    UnsafeBufferCache.UnsafeBuffer unsafeBuffer = threadState.unsafeBufferCache.acquire();
+    int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, unsafeBuffer.handle,
+                                       benchmarkState.valueSize);
+    threadState.unsafeBufferCache.checksumBuffer(unsafeBuffer);
+    threadState.unsafeBufferCache.release(unsafeBuffer);
+  }
+
+  @Benchmark
+  public void getIntoMemorySegment(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                   Blackhole blackhole) {
+    final var segment = threadState.memorySegmentCache.acquire();
+
+    try {
+      final var size = (int) GET_INTO_MEMORY_SEGMENT_HANDLE.invokeExact(
+          benchmarkState.keyMemorySegment, // Pre-allocated segment for key
+          segment,
+          benchmarkState.valueSize
+      );
+      blackhole.consume(size);
+    } catch (Throwable e) {
+      throw new RuntimeException(e);
     }
 
-    @Benchmark
-    public void getIntoByteArrayGetElements(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        byte[] array = threadState.byteArrayCache.acquire();
-        int size = GetPutJNI.getIntoByteArrayGetElements(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array, benchmarkState.valueSize);
-        threadState.byteArrayCache.checksumBuffer(array);
-        threadState.byteArrayCache.release(array);
-    }
-
-    @Benchmark
-    public void getIntoByteArrayCritical(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        byte[] array = threadState.byteArrayCache.acquire();
-        int size = GetPutJNI.getIntoByteArrayCritical(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array, benchmarkState.valueSize);
-        threadState.byteArrayCache.checksumBuffer(array);
-        threadState.byteArrayCache.release(array);
-    }
-
-    //final supplied buffer(s)
-    //TODO this can be done in as many different ways as supplying a byte[]
-    //But why shouldn't we just expect the same performance as byte[] ?
-    //Start with one instance (one that seems good in the byte[] case), and check for surprises...
-    @Benchmark
-    public void getIntoIndirectByteBufferSetRegion(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
-        byteBuffer.clear();
-        GetPutJNI.getIntoIndirectByteBufferSetRegion(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer, benchmarkState.valueSize);
-        threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
-        threadState.indirectByteBufferCache.release(byteBuffer);
-    }
-
-    @Benchmark
-    public void getIntoIndirectByteBufferGetElements(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
-        byteBuffer.clear();
-        int size = GetPutJNI.getIntoIndirectByteBufferGetElements(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer, benchmarkState.valueSize);
-        threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
-        threadState.indirectByteBufferCache.release(byteBuffer);
-    }
-
-    @Benchmark
-    public void getIntoIndirectByteBufferGetCritical(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
-        ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
-        byteBuffer.clear();
-        int size = GetPutJNI.getIntoIndirectByteBufferGetCritical(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer, benchmarkState.valueSize);
-        threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
-        threadState.indirectByteBufferCache.release(byteBuffer);
-    }
-
-    //create/allocate the result buffers, analogous to the "into" methods (but no unsafe ones here)
-    //TODO getReturnDirectByteBuffer
-    //TODO getReturnIndirectByteBuffer
-    //TODO getReturnByteArrayCritical
-    //TODO getReturnByteArrayGetElements
-    //TODO getReturnByteArraySetRegion
-
-    //TODO env->NewDirectByteBuffer() - what aree the ownership rules ?
-    //TODO track whether the byte[] copying/sharing methods we are using are doing copies
-    //env->GetByteArrayElements(..., &is_copy)
-
-    //TODO graphing - dig into the Python stuff a bit more
-
-    /**
-     * Run from the IDE
-     *
-     * You will need this in the VM args of the run configuration,
-     * in order for NAR to find at runtime the native lib it has built:
-     *
-     * -Djava.library.path=PATH_TO_REPO/target/jni-benchmarks-1.0.0-SNAPSHOT-application/jni-benchmarks-1.0.0-SNAPSHOT/lib
-     *
-     * The parameters we set here configure for debugging,
-     * typically we want a much shorter runs than is needed for accurate benchmarking
-     * SO DON'T TRUST THE NUMBERS GENERATED BY THIS RUN
-     * fork(0) runs everything is in a single process so we don't need to configure JDWP
-     * Again this affects JMH
-     * {@link https://github.com/openjdk/jmh/blob/master/jmh-samples/src/main/java/org/openjdk/jmh/samples/JMHSample_12_Forking.java}
-     * It's a convenience for debugging the tests so that they actually run, that is all.
-     *
-     * @param args
-     * @throws RunnerException
-     */
-    public static void main(String[] args) throws RunnerException {
-        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy.MM.dd_HH:mm:ss.SSS");
-        Options opt = new OptionsBuilder()
-                .forks(0)
-                .param("checksum", "none", "copyout")
-                .param("valueSize", "50", "4096", "16384", "65536")
-                .param("cacheMB", "4")
-                .warmupIterations(10)
-                .measurementIterations(50)
-                .include(GetJNIBenchmark.class.getSimpleName())
-                .result("analysis/testplots/" +  simpleDateFormat.format(new Date()) + "_" + GetJNIBenchmark.class.getSimpleName() + ".csv")
-                .build();
-
-        new Runner(opt).run();
-    }
+    threadState.memorySegmentCache.checksumBuffer(segment);
+    threadState.memorySegmentCache.release(segment);
+  }
+
+  @Benchmark
+  public void getIntoPooledNettyByteBuf(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                        Blackhole blackhole) {
+    ByteBuf byteBuf = threadState.pooledByteBufAllocator.directBuffer(benchmarkState.valueSize);
+    byteBuf.readerIndex(0);
+    int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length,
+                                       byteBuf.memoryAddress(), benchmarkState.valueSize);
+    byteBuf.writerIndex(size);
+    //Use 0-sized cache which we created specially to do checksumBuffer operation
+    threadState.nettyByteBufCache.checksumBuffer(byteBuf);
+    // Allocated buffer already has retain count of 1
+    byteBuf.release();
+  }
+
+  @Benchmark
+  public void getIntoNettyByteBuf(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                  Blackhole blackhole) {
+    ByteBuf byteBuf = threadState.nettyByteBufCache.acquire();
+    byteBuf.readerIndex(0);
+    int size = GetPutJNI.getIntoUnsafe(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length,
+                                       byteBuf.memoryAddress(), benchmarkState.valueSize);
+    byteBuf.writerIndex(size);
+    threadState.nettyByteBufCache.checksumBuffer(byteBuf);
+    threadState.nettyByteBufCache.release(byteBuf);
+  }
+
+  @Benchmark
+  public void getIntoByteArraySetRegion(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                        Blackhole blackhole) {
+    byte[] array = threadState.byteArrayCache.acquire();
+    int size = GetPutJNI.getIntoByteArraySetRegion(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array,
+                                                   benchmarkState.valueSize);
+    threadState.byteArrayCache.checksumBuffer(array);
+    threadState.byteArrayCache.release(array);
+  }
+
+  @Benchmark
+  public void getIntoByteArrayGetElements(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                          Blackhole blackhole) {
+    byte[] array = threadState.byteArrayCache.acquire();
+    int size = GetPutJNI.getIntoByteArrayGetElements(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array,
+                                                     benchmarkState.valueSize);
+    threadState.byteArrayCache.checksumBuffer(array);
+    threadState.byteArrayCache.release(array);
+  }
+
+  @Benchmark
+  public void getIntoByteArrayCritical(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                       Blackhole blackhole) {
+    byte[] array = threadState.byteArrayCache.acquire();
+    int size = GetPutJNI.getIntoByteArrayCritical(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, array,
+                                                  benchmarkState.valueSize);
+    threadState.byteArrayCache.checksumBuffer(array);
+    threadState.byteArrayCache.release(array);
+  }
+
+  //final supplied buffer(s)
+  //TODO this can be done in as many different ways as supplying a byte[]
+  //But why shouldn't we just expect the same performance as byte[] ?
+  //Start with one instance (one that seems good in the byte[] case), and check for surprises...
+  @Benchmark
+  public void getIntoIndirectByteBufferSetRegion(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                                 Blackhole blackhole) {
+    ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
+    byteBuffer.clear();
+    GetPutJNI.getIntoIndirectByteBufferSetRegion(benchmarkState.keyBytes, 0, benchmarkState.keyBytes.length, byteBuffer,
+                                                 benchmarkState.valueSize);
+    threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
+    threadState.indirectByteBufferCache.release(byteBuffer);
+  }
+
+  @Benchmark
+  public void getIntoIndirectByteBufferGetElements(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                                   Blackhole blackhole) {
+    ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
+    byteBuffer.clear();
+    int size = GetPutJNI.getIntoIndirectByteBufferGetElements(benchmarkState.keyBytes, 0,
+                                                              benchmarkState.keyBytes.length, byteBuffer,
+                                                              benchmarkState.valueSize);
+    threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
+    threadState.indirectByteBufferCache.release(byteBuffer);
+  }
+
+  @Benchmark
+  public void getIntoIndirectByteBufferGetCritical(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                                   Blackhole blackhole) {
+    ByteBuffer byteBuffer = threadState.indirectByteBufferCache.acquire();
+    byteBuffer.clear();
+    int size = GetPutJNI.getIntoIndirectByteBufferGetCritical(benchmarkState.keyBytes, 0,
+                                                              benchmarkState.keyBytes.length, byteBuffer,
+                                                              benchmarkState.valueSize);
+    threadState.indirectByteBufferCache.checksumBuffer(byteBuffer);
+    threadState.indirectByteBufferCache.release(byteBuffer);
+  }
+
+  //create/allocate the result buffers, analogous to the "into" methods (but no unsafe ones here)
+  //TODO getReturnDirectByteBuffer
+  //TODO getReturnIndirectByteBuffer
+  //TODO getReturnByteArrayCritical
+  //TODO getReturnByteArrayGetElements
+  //TODO getReturnByteArraySetRegion
+
+  //TODO env->NewDirectByteBuffer() - what are the ownership rules ?
+  //TODO track whether the byte[] copying/sharing methods we are using are doing copies
+  //env->GetByteArrayElements(..., &is_copy)
+
+  //TODO graphing - dig into the Python stuff a bit more
+
+  /**
+   * Run from the IDE
+   * <p>
+   * You will need this in the VM args of the run configuration,
+   * in order for NAR to find at runtime the native lib it has built:
+   * <p>
+   * -Djava.library.path=PATH_TO_REPO/target/jni-benchmarks-1.0.0-SNAPSHOT-application/jni-benchmarks-1.0.0-SNAPSHOT/lib
+   * <p>
+   * The parameters we set here configure the run for debugging;
+   * typically we want much shorter runs than are needed for accurate benchmarking,
+   * SO DON'T TRUST THE NUMBERS GENERATED BY THIS RUN.
+   * forks(0) runs everything in a single process, so we don't need to configure JDWP.
+   * Again, this affects the JMH measurements; see
+   * <a href="https://github.com/openjdk/jmh/blob/master/jmh-samples/src/main/java/org/openjdk/jmh/samples/JMHSample_12_Forking.java">JMHSample_12_Forking</a>.
+   * It's a convenience for debugging the tests so that they actually run, that is all.
+   *
+   * @param args
+   * @throws RunnerException
+   */
+  public static void main(String[] args) throws RunnerException {
+    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy.MM.dd_HH:mm:ss.SSS");
+    Options opt = new OptionsBuilder()
+        .forks(0)
+        .param("checksum", "none", "copyout")
+        .param("valueSize", "50", "4096", "16384", "65536")
+        .param("cacheMB", "4")
+        .warmupIterations(10)
+        .measurementIterations(50)
+        .include(GetJNIBenchmark.class.getSimpleName())
+        .result("analysis/testplots/" + simpleDateFormat.format(
+            new Date()) + "_" + GetJNIBenchmark.class.getSimpleName() + ".csv")
+        .build();
+
+    new Runner(opt).run();
+  }
 
 }
diff --git a/src/main/java/com/evolvedbinary/jnibench/jmhbench/PutJNIBenchmark.java b/src/main/java/com/evolvedbinary/jnibench/jmhbench/PutJNIBenchmark.java
index e3b3355..d2f242a 100644
--- a/src/main/java/com/evolvedbinary/jnibench/jmhbench/PutJNIBenchmark.java
+++ b/src/main/java/com/evolvedbinary/jnibench/jmhbench/PutJNIBenchmark.java
@@ -30,20 +30,26 @@
 import com.evolvedbinary.jnibench.consbench.NarSystem;
 import com.evolvedbinary.jnibench.jmhbench.cache.*;
 import com.evolvedbinary.jnibench.jmhbench.common.*;
+import io.netty.buffer.ByteBuf;
 import io.netty.buffer.PooledByteBufAllocator;
+import java.lang.foreign.Arena;
+import java.lang.foreign.FunctionDescriptor;
+import java.lang.foreign.Linker;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.SymbolLookup;
+import java.lang.foreign.ValueLayout;
+import java.lang.invoke.MethodHandle;
+import java.nio.ByteBuffer;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.concurrent.TimeUnit;
+import java.util.logging.Logger;
 import org.openjdk.jmh.annotations.*;
 import org.openjdk.jmh.infra.Blackhole;
 import org.openjdk.jmh.runner.Runner;
 import org.openjdk.jmh.runner.RunnerException;
 import org.openjdk.jmh.runner.options.Options;
 import org.openjdk.jmh.runner.options.OptionsBuilder;
-import io.netty.buffer.ByteBuf;
-
-import java.nio.ByteBuffer;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.concurrent.TimeUnit;
-import java.util.logging.Logger;
 
 /**
  * Benchmark getting byte arrays from native methods.
@@ -58,8 +64,24 @@ public class PutJNIBenchmark {
 
     private static final Logger LOG = Logger.getLogger(GetJNIBenchmark.class.getName());
 
+    private static final MethodHandle PUT_FROM_MEMORY_SEGMENT_HANDLE;
+
     static {
         NarSystem.loadLibrary();
+
+        // 2. Initialize the Linker and Lookup
+        Linker linker = Linker.nativeLinker();
+        SymbolLookup loaderLookup = SymbolLookup.loaderLookup();
+
+        // 3. Find the symbol and create the Downcall Handle once
+        PUT_FROM_MEMORY_SEGMENT_HANDLE = loaderLookup.find("putFromMemorySegment")
+                .map(symbol -> linker.downcallHandle(symbol,
+                        FunctionDescriptor.of(
+                                ValueLayout.JAVA_INT,
+                                ValueLayout.ADDRESS,
+                                ValueLayout.ADDRESS,
+                                ValueLayout.JAVA_INT)))
+                .orElseThrow();
     }
 
     @State(Scope.Benchmark)
@@ -90,6 +112,8 @@ public static class GetJNIBenchmarkState {
 
         String keyBase;
         byte[] keyBytes;
+        MemorySegment keyMemorySegment;
+        private Arena benchmarkArena;
 
         JMHCaller caller;
 
@@ -99,14 +123,19 @@ public void setup() {
 
             keyBase = "testKeyWithReturnValueSize" + String.format("%07d", valueSize) + "Bytes";
 
+            benchmarkArena = Arena.ofShared();
+
             keyBytes = keyBase.getBytes();
+            keyMemorySegment = benchmarkArena.allocateArray(ValueLayout.JAVA_BYTE, keyBytes);
 
             writePreparation = AllocationCache.Prepare.valueOf(preparation);
         }
 
         @TearDown
         public void tearDown() {
-
+            if (benchmarkArena != null) {
+                benchmarkArena.close();
+            }
         }
     }
 
@@ -117,6 +146,7 @@ public static class GetJNIThreadState {
         private final UnsafeBufferCache unsafeBufferCache = new UnsafeBufferCache();
         private final ByteArrayCache byteArrayCache = new ByteArrayCache();
         private final IndirectByteBufferCache indirectByteBufferCache = new IndirectByteBufferCache();
+        private final MemorySegmentCache memorySegmentCache = new MemorySegmentCache();
         private final PooledByteBufAllocator pooledByteBufAllocator = PooledByteBufAllocator.DEFAULT;
         private final NettyByteBufCache nettyByteBufCache = new NettyByteBufCache();
 
@@ -152,6 +182,9 @@ public void setup(GetJNIBenchmarkState benchmarkState, Blackhole blackhole) {
                 case "putFromByteArrayCritical":
                     byteArrayCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.writePreparation, blackhole);
                     break;
+                case "putFromMemorySegment":
+                    memorySegmentCache.setup(valueSize, cacheSize, benchmarkState.cacheEntryOverhead, benchmarkState.writePreparation, blackhole);
+                    break;
                 default:
                     throw new RuntimeException("Don't know how to setup() for benchmark: " + benchmarkState.caller.benchmarkMethod);
             }
@@ -184,6 +217,9 @@ public void tearDown(GetJNIBenchmarkState benchmarkState) {
                 case "putFromByteArrayCritical":
                     byteArrayCache.tearDown();
                     break;
+                case "putFromMemorySegment":
+                    memorySegmentCache.tearDown();
+                    break;
                 default:
                     throw new RuntimeException("Don't know how to tearDown() for benchmark: " + benchmarkState.caller.benchmarkMethod);
             }
@@ -196,6 +232,26 @@ public void buffersOnlyDirectByteBufferFromUnsafe(GetJNIThreadState threadState)
         threadState.unsafeBufferCache.release(unsafeBuffer);
     }
 
+    @Benchmark
+    public void putFromMemorySegment(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState,
+                                     Blackhole blackhole) {
+        final var segment = threadState.memorySegmentCache.acquire();
+        threadState.memorySegmentCache.prepareBuffer(segment, benchmarkState.fillByte);
+
+        try {
+            final var size = (int) PUT_FROM_MEMORY_SEGMENT_HANDLE.invokeExact(
+                    benchmarkState.keyMemorySegment, // Pre-allocated segment for key
+                    segment,
+                    benchmarkState.valueSize
+            );
+            blackhole.consume(size);
+        } catch (Throwable e) {
+            throw new RuntimeException(e);
+        }
+
+        threadState.memorySegmentCache.release(segment);
+    }
+
     @Benchmark
     public void putFromDirectByteBuffer(GetJNIBenchmarkState benchmarkState, GetJNIThreadState threadState, Blackhole blackhole) {
         ByteBuffer byteBuffer = threadState.directByteBufferCache.acquire();
diff --git a/src/main/java/com/evolvedbinary/jnibench/jmhbench/cache/MemorySegmentCache.java b/src/main/java/com/evolvedbinary/jnibench/jmhbench/cache/MemorySegmentCache.java
new file mode 100644
index 0000000..9c6079f
--- /dev/null
+++ b/src/main/java/com/evolvedbinary/jnibench/jmhbench/cache/MemorySegmentCache.java
@@ -0,0 +1,62 @@
+package com.evolvedbinary.jnibench.jmhbench.cache;
+
+import static java.lang.foreign.ValueLayout.JAVA_BYTE;
+
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.util.stream.IntStream;
+
+public class MemorySegmentCache extends LinkedListAllocationCache<MemorySegment> {
+  private final Arena arena;
+
+  public MemorySegmentCache() {
+    arena = Arena.ofShared();
+  }
+
+
+  @Override
+  MemorySegment allocate(final int valueSize) {
+    return arena.allocate(valueSize);
+  }
+
+  @Override
+  void free(final MemorySegment buffer) {
+    // Nothing to do per buffer: tearDown() closes the arena, which frees all of its segments.
+  }
+
+  @Override
+  public void tearDown() {
+    super.tearDown();
+    arena.close();
+  }
+
+  @Override
+  protected int byteChecksum(final MemorySegment item) {
+    return IntStream.range(0, (int) item.byteSize()).map(offset -> item.get(JAVA_BYTE, offset)).sum();
+  }
+
+  @Override
+  protected int longChecksum(final MemorySegment item) {
+    return byteChecksum(item);
+  }
+
+  @Override
+  protected byte[] copyOut(final MemorySegment item) {
+    // Get a cached byte array of the correct size
+    byte[] array = byteArrayOfSize((int) item.byteSize());
+
+    // Perform bulk copy from native memory to Java heap array
+    MemorySegment.copy(item, ValueLayout.JAVA_BYTE, 0, array, 0, (int) item.byteSize());
+
+    return array;
+  }
+
+  @Override
+  protected long copyIn(final MemorySegment item, final byte fillByte) {
+    // Highly optimized bulk fill (native memset equivalent)
+    item.fill(fillByte);
+
+    return fillByte;
+  }
+}