From dbdb96c8acf19e4937c7c5b11b460f3ec1cf2116 Mon Sep 17 00:00:00 2001 From: Keith Lee Date: Fri, 12 Jun 2026 23:40:52 +0100 Subject: [PATCH] [FLINK-39924] Size jemalloc narenas from container CPU allowance Apache Flink containers load jemalloc via LD_PRELOAD but don't configure narenas. jemalloc's default is 4 * ncpus, where ncpus is read from /proc/cpuinfo, the host CPU count, not the container's CPU limit. In CPU limited pods on large hosts this over-provisions arenas and causes RSS fragmentation, since each arena holds dirty pages for dirty_decay_ms before releasing them to the OS. Determine the effective CPU count from the cgroup CPU quota directly (cpu.max for v2, cpu.cfs_quota_us / cpu.cfs_period_us for v1), since nproc honors cpuset but not CPU quotas. Fall back to nproc when no quota is set. Skip the override entirely when the user has supplied narenas in MALLOC_CONF, and append narenas to any other user-supplied MALLOC_CONF value. --- docker-entrypoint.sh | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh index cf63daa..8accfbc 100755 --- a/docker-entrypoint.sh +++ b/docker-entrypoint.sh @@ -120,6 +120,54 @@ process_flink_properties() { fi } +# FLINK-39924: jemalloc's default narenas = 4 * ncpus is taken from +# /proc/cpuinfo (host CPU count), ignoring the container's quota +# and idle arenas hold dirty pages until dirty_decay_ms, inflating +# RSS. Derive narenas from the cgroup CPU quota instead. 
+configure_jemalloc_narenas() {
+    # Export MALLOC_CONF with narenas = 4 * CPUs (1 if <= 1 CPU), where CPUs is
+    # min(ceil(cgroup quota / period), nproc) so that a quota above the usable
+    # CPU set cannot inflate it. User-set narenas wins; other options are kept.
+    case ",${MALLOC_CONF}," in
+        *,narenas:*)
+            echo "jemalloc: respecting user-supplied narenas in MALLOC_CONF=${MALLOC_CONF}"
+            return
+            ;;
+    esac
+
+    local cpus="" cpu_quota="" cpu_period="" avail_cpus
+    if [ -r /sys/fs/cgroup/cpu.max ]; then
+        # cgroup v2: one line, "<quota|max> <period>".
+        read -r cpu_quota cpu_period < /sys/fs/cgroup/cpu.max
+    elif [ -r /sys/fs/cgroup/cpu/cpu.cfs_quota_us ] && [ -r /sys/fs/cgroup/cpu/cpu.cfs_period_us ]; then
+        # cgroup v1: quota and period live in separate files; -1 means no quota.
+        cpu_quota=$(cat /sys/fs/cgroup/cpu/cpu.cfs_quota_us)
+        cpu_period=$(cat /sys/fs/cgroup/cpu/cpu.cfs_period_us)
+    fi
+    # Only positive decimal integers count as a quota ("max", -1, empty do not).
+    if [[ "$cpu_quota" =~ ^[1-9][0-9]*$ && "$cpu_period" =~ ^[1-9][0-9]*$ ]]; then
+        cpus=$(( (cpu_quota + cpu_period - 1) / cpu_period ))  # round up
+    fi
+    # nproc honors cpuset/affinity: it is both the fallback and the upper bound.
+    avail_cpus=$(nproc 2>/dev/null || echo 1)
+    if [ -z "$cpus" ] || [ "$cpus" -gt "$avail_cpus" ]; then
+        cpus=$avail_cpus
+    fi
+
+    local narenas=1
+    if [ "$cpus" -gt 1 ]; then
+        narenas=$(( cpus * 4 ))
+    fi
+
+    if [ -z "${MALLOC_CONF}" ]; then
+        export MALLOC_CONF="narenas:${narenas}"
+        echo "jemalloc: setting MALLOC_CONF=${MALLOC_CONF} (detected ${cpus} CPUs)"
+    else
+        export MALLOC_CONF="${MALLOC_CONF},narenas:${narenas}"
+        echo "jemalloc: appended narenas, MALLOC_CONF=${MALLOC_CONF} (detected ${cpus} CPUs)"
+    fi
+}
+
 maybe_enable_jemalloc() {
     if [ "${DISABLE_JEMALLOC:-false}" == "false" ]; then
         JEMALLOC_PATH="/usr/lib/$(uname -m)-linux-gnu/libjemalloc.so"
@@ -135,7 +183,10 @@ maybe_enable_jemalloc() {
                 MSG_PATH="$JEMALLOC_PATH and $JEMALLOC_FALLBACK"
             fi
             echo "WARNING: attempted to load jemalloc from $MSG_PATH but the library couldn't be found. glibc will be used instead."
+            return
         fi
+
+        configure_jemalloc_narenas
     fi
 }