From 7b606fcbf57c234313fc5bf5313d03ab2bf1ff29 Mon Sep 17 00:00:00 2001 From: Jan Michael Auer Date: Thu, 25 Jun 2026 15:14:46 +0200 Subject: [PATCH 1/5] feat(server): Add on-demand heap profiling endpoints --- .github/workflows/build.yml | 2 +- Cargo.lock | 105 ++++++++++ Cargo.toml | 1 + README.md | 38 ++++ objectstore-server/Cargo.toml | 4 + objectstore-server/src/endpoints/mod.rs | 15 +- objectstore-server/src/endpoints/profiling.rs | 185 ++++++++++++++++++ objectstore-server/src/main.rs | 8 + scripts/build-cross.sh | 2 +- 9 files changed, 355 insertions(+), 5 deletions(-) create mode 100644 objectstore-server/src/endpoints/profiling.rs diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ac4d8c69..b42e1274 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -46,7 +46,7 @@ jobs: - name: Build Binary run: | - cargo build --release --locked --target=${{ matrix.target }} --bin objectstore + cargo build --release --locked --target=${{ matrix.target }} --bin objectstore --features objectstore-server/profiling cp target/${{ matrix.target }}/release/objectstore ./objectstore - name: Set up Docker Buildx diff --git a/Cargo.lock b/Cargo.lock index cc944985..e4be1837 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -672,6 +672,15 @@ dependencies = [ "libc", ] +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "critical-section" version = "1.2.0" @@ -1078,6 +1087,16 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = 
[ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1907,6 +1926,23 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jemalloc_pprof" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bb2dfa20a68824f4c8c1aa271617d0ee0d9b707a866d56b7d7ee39c60c235a8" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jni" version = "0.21.1" @@ -2094,6 +2130,19 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "mappings" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bab1e61a4b76757edb59cd81fcaa7f3ba9018d43b527d9abfad877b4c6c60f2" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.2.0" @@ -2189,6 +2238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -2308,6 +2358,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + [[package]] name = "num-bigint" version = "0.4.6" @@ -2334,6 +2398,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.0" @@ -2360,6 +2433,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -2614,6 +2698,7 @@ dependencies = [ "http 1.4.2", "humantime", "humantime-serde", + "jemalloc_pprof", "jsonwebtoken", "nix 0.31.3", "num_cpus", @@ -2980,6 +3065,20 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof_util" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eea0cc524de808a6d98d192a3d99fe95617031ad4a52ec0a0f987ef4432e8fe1" +dependencies = [ + "anyhow", + "backtrace", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -4089,6 +4188,12 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + [[package]] name = "simd_cesu8" version = "1.1.1" diff --git a/Cargo.toml b/Cargo.toml index e311da67..38130f7d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,6 +77,7 @@ serde_yaml = "0.9.34-deprecated" sketches-ddsketch = "0.3.1" tempfile = "3.27.0" thiserror = "2.0.18" +jemalloc_pprof = { version = "0.9.0", features = ["symbolize"] } tikv-jemallocator = { version = "0.7.0", features = ["background_threads", "override_allocator_on_supported_platforms"] } 
tikv-jemalloc-ctl = { version = "0.7.0", features = ["stats"] } tokio = "1.52.3" diff --git a/README.md b/README.md index 371d1a9c..318aefae 100644 --- a/README.md +++ b/README.md @@ -243,6 +243,44 @@ cargo run -- run You can copy and save additional config files next to the examples in `objectstore-server/config`. All other files are ignored by git. +### Heap Profiling + +Production release builds include on-demand heap profiling via jemalloc. Profiling is compiled +in but dormant at startup — you enable and disable it through HTTP endpoints that are only +reachable from loopback (i.e. from inside the pod or over `kubectl port-forward`). + +To capture a heap profile from a running pod: + +```sh +# Forward the server port to your local machine +kubectl port-forward pod/ 8888:8888 + +# Enable sampling, let the workload run, then dump a profile +curl -s -XPOST localhost:8888/debug/pprof/enable +# ... wait for the leak or load to accumulate ... +curl -s localhost:8888/debug/pprof/heap > heap.pb.gz + +# Disable sampling when done +curl -s -XPOST localhost:8888/debug/pprof/disable +``` + +The dump is a symbolized gzipped pprof file — function names are resolved against the running +binary's debug info, so no local binary is needed to analyze it. Open it with `go tool pprof`: + +```sh +go tool pprof -top heap.pb.gz + +# To isolate growth between two snapshots, use the -base flag: +curl -s localhost:8888/debug/pprof/heap > heap1.pb.gz +# ... wait for more growth ... +curl -s localhost:8888/debug/pprof/heap > heap2.pb.gz +go tool pprof -base heap1.pb.gz heap2.pb.gz +``` + +Sampling overhead is negligible (jemalloc default: one sample per ~512 KiB allocated on +average), so profiling can be left enabled for an extended capture window without measurably +affecting request latency. 
+ ### Tests To run tests: diff --git a/objectstore-server/Cargo.toml b/objectstore-server/Cargo.toml index 5dc9088b..67b61394 100644 --- a/objectstore-server/Cargo.toml +++ b/objectstore-server/Cargo.toml @@ -45,9 +45,13 @@ tower = { workspace = true } tower-http = { workspace = true, features = ["catch-panic", "metrics", "set-header", "trace"] } [target.'cfg(target_os = "linux")'.dependencies] +jemalloc_pprof = { workspace = true, optional = true } tikv-jemallocator = { workspace = true } tikv-jemalloc-ctl = { workspace = true } +[features] +profiling = ["dep:jemalloc_pprof", "tikv-jemallocator/profiling"] + [dev-dependencies] nix = { workspace = true, features = ["signal"] } objectstore-test = { workspace = true } diff --git a/objectstore-server/src/endpoints/mod.rs b/objectstore-server/src/endpoints/mod.rs index a29f5060..ab460b69 100644 --- a/objectstore-server/src/endpoints/mod.rs +++ b/objectstore-server/src/endpoints/mod.rs @@ -12,10 +12,12 @@ pub mod health; mod keda; mod multipart; mod objects; +#[cfg(all(target_os = "linux", feature = "profiling"))] +mod profiling; /// Returns `true` for internal endpoints that are exempt from metrics and concurrency limits. pub fn is_internal_route(route: &str) -> bool { - matches!(route, "/health" | "/ready" | "/keda") + matches!(route, "/health" | "/ready" | "/keda") || route.starts_with("/debug/") } /// Returns a router with all objectstore HTTP endpoints mounted. @@ -28,8 +30,15 @@ pub fn routes() -> Router { .merge(batch::router()) .merge(multipart::router()); - Router::new() + let router = Router::new() .merge(health::router()) .merge(keda::router()) - .nest("/v1/", routes_v1) + .nest("/v1/", routes_v1); + + std::cfg_select! 
{ + all(target_os = "linux", feature = "profiling") => { + router.merge(profiling::router()) + } + _ => { router } + } } diff --git a/objectstore-server/src/endpoints/profiling.rs b/objectstore-server/src/endpoints/profiling.rs new file mode 100644 index 00000000..194799d7 --- /dev/null +++ b/objectstore-server/src/endpoints/profiling.rs @@ -0,0 +1,185 @@ +//! On-demand heap profiling endpoints (Linux + `profiling` feature only). +//! +//! Exposes three routes under `/debug/pprof/`: +//! +//! | Method | Path | Description | +//! |--------|------|-------------| +//! | `POST` | `/debug/pprof/enable` | Activate jemalloc heap sampling | +//! | `POST` | `/debug/pprof/disable` | Deactivate heap sampling | +//! | `GET` | `/debug/pprof/heap` | Dump a symbolized gzipped pprof profile | +//! +//! All routes are guarded by a loopback-only middleware: requests whose TCP peer is not +//! a loopback address (`127.0.0.1` or `::1`) are rejected with `403 Forbidden`. This +//! makes the endpoints reachable via `kubectl port-forward` or an ephemeral debug +//! container (both present as loopback inside the pod netns) while blocking normal +//! Service/ingress traffic. +//! +//! Profiling is compiled in but dormant at startup (`prof_active:false`). Enable it on +//! demand via `POST /debug/pprof/enable`, capture with `GET /debug/pprof/heap`, then +//! disable with `POST /debug/pprof/disable`. + +use std::net::SocketAddr; + +use axum::extract::{ConnectInfo, Request}; +use axum::http::{HeaderValue, StatusCode, header}; +use axum::middleware::Next; +use axum::response::{IntoResponse, Response}; +use axum::{Router, routing}; +use jemalloc_pprof::PROF_CTL; + +use crate::state::ServiceState; + +/// Returns a router for all `/debug/pprof/*` endpoints, protected by the loopback gate. 
+pub fn router() -> Router { + Router::new() + .route("/debug/pprof/enable", routing::post(enable)) + .route("/debug/pprof/disable", routing::post(disable)) + .route("/debug/pprof/heap", routing::get(heap)) + .route_layer(axum::middleware::from_fn(require_loopback)) +} + +/// Middleware that rejects any request whose TCP peer is not a loopback address. +/// +/// Fails closed: if `ConnectInfo` is absent from the request extensions, the request is denied. +async fn require_loopback(request: Request, next: Next) -> Response { + let is_loopback = request + .extensions() + .get::>() + .map_or(false, |ConnectInfo(addr)| addr.ip().is_loopback()); + + if is_loopback { + next.run(request).await + } else { + StatusCode::FORBIDDEN.into_response() + } +} + +/// Activates jemalloc heap sampling. +/// +/// Sampling runs until `POST /debug/pprof/disable` is called. The sampling interval is set +/// at startup via `malloc_conf` (`lg_prof_sample:19`, i.e. 512 KiB mean interval). +async fn enable() -> Response { + let Some(ctl) = &*PROF_CTL else { + return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + }; + let mut guard = ctl.lock().await; + match guard.activate() { + Ok(()) => StatusCode::OK.into_response(), + Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), + } +} + +/// Deactivates jemalloc heap sampling. +async fn disable() -> Response { + let Some(ctl) = &*PROF_CTL else { + return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + }; + let mut guard = ctl.lock().await; + match guard.deactivate() { + Ok(()) => StatusCode::OK.into_response(), + Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), + } +} + +/// Dumps a symbolized, gzipped pprof heap profile. +/// +/// Returns `403` if profiling is not currently active. Activate first with +/// `POST /debug/pprof/enable`. 
+/// +/// The response body is a gzipped pprof protobuf (`heap.pb.gz`), ready for +/// `go tool pprof` or Speedscope without shipping a separate binary. +async fn heap() -> Response { + let Some(ctl) = &*PROF_CTL else { + return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + }; + let mut guard = ctl.lock().await; + + if !guard.activated() { + return ( + StatusCode::FORBIDDEN, + "profiling not active; POST /debug/pprof/enable first", + ) + .into_response(); + } + + match guard.dump_pprof() { + Ok(bytes) => { + let headers = [ + ( + header::CONTENT_TYPE, + HeaderValue::from_static("application/octet-stream"), + ), + ( + header::CONTENT_DISPOSITION, + HeaderValue::from_static("attachment; filename=\"heap.pb.gz\""), + ), + ]; + (headers, bytes).into_response() + } + Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), + } +} + +#[cfg(test)] +mod tests { + use std::net::SocketAddr; + + use axum::Router; + use axum::body::Body; + use axum::extract::ConnectInfo; + use axum::http::{Request, StatusCode}; + use axum::middleware::from_fn; + use axum::routing::get; + use tower::ServiceExt; + + use super::require_loopback; + + fn make_loopback_app() -> Router { + Router::new() + .route("/test", get(|| async { StatusCode::OK })) + .route_layer(from_fn(require_loopback)) + } + + fn request_with_peer(peer: &str) -> Request { + let addr: SocketAddr = peer.parse().unwrap(); + Request::builder() + .uri("/test") + .extension(ConnectInfo(addr)) + .body(Body::empty()) + .unwrap() + } + + #[tokio::test] + async fn loopback_ipv4_allowed() { + let resp = make_loopback_app() + .oneshot(request_with_peer("127.0.0.1:1234")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + } + + #[tokio::test] + async fn loopback_ipv6_allowed() { + let resp = make_loopback_app() + .oneshot(request_with_peer("[::1]:1234")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + } + + #[tokio::test] + async fn 
non_loopback_rejected() { + let resp = make_loopback_app() + .oneshot(request_with_peer("203.0.113.1:1234")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); + } + + #[tokio::test] + async fn no_connect_info_rejected() { + let req = Request::builder().uri("/test").body(Body::empty()).unwrap(); + let resp = make_loopback_app().oneshot(req).await.unwrap(); + assert_eq!(resp.status(), StatusCode::FORBIDDEN); + } +} diff --git a/objectstore-server/src/main.rs b/objectstore-server/src/main.rs index b1108f12..3eeb72ff 100644 --- a/objectstore-server/src/main.rs +++ b/objectstore-server/src/main.rs @@ -12,6 +12,14 @@ use objectstore_server::cli; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +// Prof enabled at compile time; sampling starts dormant and is activated on demand via the +// /debug/pprof/enable endpoint. lg_prof_sample:19 means one sample per 2^19 = 512 KiB allocated +// on average (jemalloc's default). The env var MALLOC_CONF still overrides this if set. 
+#[cfg(all(target_os = "linux", feature = "profiling"))] +#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:false,lg_prof_sample:19\0"; + fn main() { match cli::execute() { Ok(()) => std::process::exit(0), diff --git a/scripts/build-cross.sh b/scripts/build-cross.sh index ee56d977..c26e6cc4 100755 --- a/scripts/build-cross.sh +++ b/scripts/build-cross.sh @@ -28,7 +28,7 @@ docker run --rm \ -v "$HOME/.cargo/registry":/usr/local/cargo/registry \ -v "$HOME/.cargo/git":/usr/local/cargo/git \ objectstore-build \ - -p "$PACKAGE" + -p "$PACKAGE" --features objectstore-server/profiling docker build \ --platform linux/amd64 \ From 7b822975df74ca3797164dc4812d62c3261e9d40 Mon Sep 17 00:00:00 2001 From: Jan Michael Auer Date: Thu, 25 Jun 2026 15:21:37 +0200 Subject: [PATCH 2/5] feat(server): Fix clippy lints in profiling module --- objectstore-server/src/endpoints/profiling.rs | 2 +- objectstore-server/src/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/objectstore-server/src/endpoints/profiling.rs b/objectstore-server/src/endpoints/profiling.rs index 194799d7..660afdf8 100644 --- a/objectstore-server/src/endpoints/profiling.rs +++ b/objectstore-server/src/endpoints/profiling.rs @@ -45,7 +45,7 @@ async fn require_loopback(request: Request, next: Next) -> Response { let is_loopback = request .extensions() .get::>() - .map_or(false, |ConnectInfo(addr)| addr.ip().is_loopback()); + .is_some_and(|ConnectInfo(addr)| addr.ip().is_loopback()); if is_loopback { next.run(request).await diff --git a/objectstore-server/src/main.rs b/objectstore-server/src/main.rs index 3eeb72ff..7f701788 100644 --- a/objectstore-server/src/main.rs +++ b/objectstore-server/src/main.rs @@ -18,7 +18,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[cfg(all(target_os = "linux", feature = "profiling"))] #[allow(non_upper_case_globals)] #[unsafe(export_name = 
"malloc_conf")] -pub static malloc_conf: &[u8] = b"prof:true,prof_active:false,lg_prof_sample:19\0"; +static malloc_conf: &[u8] = b"prof:true,prof_active:false,lg_prof_sample:19\0"; fn main() { match cli::execute() { From ef5475c7cd05953d11393a1aaa19c1598941f803 Mon Sep 17 00:00:00 2001 From: Jan Michael Auer Date: Thu, 25 Jun 2026 15:40:55 +0200 Subject: [PATCH 3/5] feat(server): Add info logs for profiling enable, disable, and dump Also updates README heap profiling section. --- README.md | 34 +++++++------------ objectstore-server/src/endpoints/profiling.rs | 11 ++++-- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 318aefae..5b7dbbd9 100644 --- a/README.md +++ b/README.md @@ -245,41 +245,33 @@ You can copy and save additional config files next to the examples in ### Heap Profiling -Production release builds include on-demand heap profiling via jemalloc. Profiling is compiled -in but dormant at startup — you enable and disable it through HTTP endpoints that are only -reachable from loopback (i.e. from inside the pod or over `kubectl port-forward`). +Production release builds include on-demand heap profiling via jemalloc. It can +be enabled and disabled through HTTP endpoints that are only reachable from +loopback. -To capture a heap profile from a running pod: +To capture a heap profile from a running server, for example on localhost with +default port: ```sh -# Forward the server port to your local machine -kubectl port-forward pod/ 8888:8888 - # Enable sampling, let the workload run, then dump a profile -curl -s -XPOST localhost:8888/debug/pprof/enable -# ... wait for the leak or load to accumulate ... 
-curl -s localhost:8888/debug/pprof/heap > heap.pb.gz +curl -X POST http://localhost:8888/debug/pprof/enable +curl http://localhost:8888/debug/pprof/heap > heap.pb.gz # Disable sampling when done -curl -s -XPOST localhost:8888/debug/pprof/disable +curl -X POST http://localhost:8888/debug/pprof/disable ``` -The dump is a symbolized gzipped pprof file — function names are resolved against the running -binary's debug info, so no local binary is needed to analyze it. Open it with `go tool pprof`: +Analyze the profile dump with `go tool pprof`: ```sh -go tool pprof -top heap.pb.gz +go tool pprof heap.pb.gz # To isolate growth between two snapshots, use the -base flag: -curl -s localhost:8888/debug/pprof/heap > heap1.pb.gz -# ... wait for more growth ... -curl -s localhost:8888/debug/pprof/heap > heap2.pb.gz -go tool pprof -base heap1.pb.gz heap2.pb.gz +go tool pprof -base before.pb.gz after.pb.gz ``` -Sampling overhead is negligible (jemalloc default: one sample per ~512 KiB allocated on -average), so profiling can be left enabled for an extended capture window without measurably -affecting request latency. +The sampling overhead is expected to be low, so profiling can safely be left +enabled for an extended capture window.
### Tests diff --git a/objectstore-server/src/endpoints/profiling.rs b/objectstore-server/src/endpoints/profiling.rs index 660afdf8..ac8e3c7f 100644 --- a/objectstore-server/src/endpoints/profiling.rs +++ b/objectstore-server/src/endpoints/profiling.rs @@ -64,7 +64,10 @@ async fn enable() -> Response { }; let mut guard = ctl.lock().await; match guard.activate() { - Ok(()) => StatusCode::OK.into_response(), + Ok(()) => { + objectstore_log::info!("Heap profiling enabled"); + StatusCode::OK.into_response() + } Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), } } @@ -76,7 +79,10 @@ async fn disable() -> Response { }; let mut guard = ctl.lock().await; match guard.deactivate() { - Ok(()) => StatusCode::OK.into_response(), + Ok(()) => { + objectstore_log::info!("Heap profiling disabled"); + StatusCode::OK.into_response() + } Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), } } @@ -104,6 +110,7 @@ async fn heap() -> Response { match guard.dump_pprof() { Ok(bytes) => { + objectstore_log::info!("Heap profile requested"); let headers = [ ( header::CONTENT_TYPE, From 02641a590cc67e12f49ca567a45583412134b1c4 Mon Sep 17 00:00:00 2001 From: Jan Michael Auer Date: Thu, 25 Jun 2026 17:59:00 +0200 Subject: [PATCH 4/5] build: Simplify profiling feature flag in build scripts The crate is already selected via --bin or -p, so the objectstore-server/ prefix on the feature is redundant. 
--- .github/workflows/build.yml | 2 +- scripts/build-cross.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b42e1274..96097d25 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -46,7 +46,7 @@ jobs: - name: Build Binary run: | - cargo build --release --locked --target=${{ matrix.target }} --bin objectstore --features objectstore-server/profiling + cargo build --release --locked --target=${{ matrix.target }} --bin objectstore --features profiling cp target/${{ matrix.target }}/release/objectstore ./objectstore - name: Set up Docker Buildx diff --git a/scripts/build-cross.sh b/scripts/build-cross.sh index c26e6cc4..34a2f0eb 100755 --- a/scripts/build-cross.sh +++ b/scripts/build-cross.sh @@ -28,7 +28,7 @@ docker run --rm \ -v "$HOME/.cargo/registry":/usr/local/cargo/registry \ -v "$HOME/.cargo/git":/usr/local/cargo/git \ objectstore-build \ - -p "$PACKAGE" --features objectstore-server/profiling + -p "$PACKAGE" --features profiling docker build \ --platform linux/amd64 \ From 0b723af4c466e57d46575acdaa5f2c1fb208d2b6 Mon Sep 17 00:00:00 2001 From: Jan Michael Auer Date: Thu, 25 Jun 2026 18:34:13 +0200 Subject: [PATCH 5/5] ref(server): Clean up profiling endpoint responses and docs Improve status codes: loopback guard returns 404 instead of 403 to avoid leaking endpoint existence, inactive profiling returns 409 Conflict instead of 403. Simplify response construction with an unavailable() helper and a module-level HEAP_DISPOSITION const. Rewrite module docs for clarity. --- objectstore-server/src/endpoints/profiling.rs | 71 ++++++++----------- 1 file changed, 30 insertions(+), 41 deletions(-) diff --git a/objectstore-server/src/endpoints/profiling.rs b/objectstore-server/src/endpoints/profiling.rs index ac8e3c7f..abdf583b 100644 --- a/objectstore-server/src/endpoints/profiling.rs +++ b/objectstore-server/src/endpoints/profiling.rs @@ -1,22 +1,20 @@ -//! 
On-demand heap profiling endpoints (Linux + `profiling` feature only). +//! On-demand heap profiling endpoints. //! //! Exposes three routes under `/debug/pprof/`: //! //! | Method | Path | Description | //! |--------|------|-------------| -//! | `POST` | `/debug/pprof/enable` | Activate jemalloc heap sampling | +//! | `POST` | `/debug/pprof/enable` | Activate heap sampling | //! | `POST` | `/debug/pprof/disable` | Deactivate heap sampling | //! | `GET` | `/debug/pprof/heap` | Dump a symbolized gzipped pprof profile | //! -//! All routes are guarded by a loopback-only middleware: requests whose TCP peer is not -//! a loopback address (`127.0.0.1` or `::1`) are rejected with `403 Forbidden`. This -//! makes the endpoints reachable via `kubectl port-forward` or an ephemeral debug -//! container (both present as loopback inside the pod netns) while blocking normal -//! Service/ingress traffic. +//! **Requires the `profiling` feature.** When included, profiling is available +//! but disabled at startup. Enable it on demand to start capturing samples. +//! With `GET /debug/pprof/heap`, you can then download a pprof snapshot with +//! all allocations since profiling was enabled. //! -//! Profiling is compiled in but dormant at startup (`prof_active:false`). Enable it on -//! demand via `POST /debug/pprof/enable`, capture with `GET /debug/pprof/heap`, then -//! disable with `POST /debug/pprof/disable`. +//! **Note**: Due to their sensitive nature, these routes are only reachable via +//! the loopback interface. use std::net::SocketAddr; @@ -29,6 +27,9 @@ use jemalloc_pprof::PROF_CTL; use crate::state::ServiceState; +const HEAP_DISPOSITION: HeaderValue = + HeaderValue::from_static("attachment; filename=\"heap.pb.gz\""); + /// Returns a router for all `/debug/pprof/*` endpoints, protected by the loopback gate.
pub fn router() -> Router { Router::new() @@ -50,7 +51,7 @@ async fn require_loopback(request: Request, next: Next) -> Response { if is_loopback { next.run(request).await } else { - StatusCode::FORBIDDEN.into_response() + StatusCode::NOT_FOUND.into_response() } } @@ -60,10 +61,10 @@ async fn require_loopback(request: Request, next: Next) -> Response { /// at startup via `malloc_conf` (`lg_prof_sample:19`, i.e. 512 KiB mean interval). async fn enable() -> Response { let Some(ctl) = &*PROF_CTL else { - return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + return unavailable(); }; - let mut guard = ctl.lock().await; - match guard.activate() { + + match ctl.lock().await.activate() { Ok(()) => { objectstore_log::info!("Heap profiling enabled"); StatusCode::OK.into_response() @@ -75,10 +76,10 @@ async fn enable() -> Response { /// Deactivates jemalloc heap sampling. async fn disable() -> Response { let Some(ctl) = &*PROF_CTL else { - return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + return unavailable(); }; - let mut guard = ctl.lock().await; - match guard.deactivate() { + + match ctl.lock().await.deactivate() { Ok(()) => { objectstore_log::info!("Heap profiling disabled"); StatusCode::OK.into_response() @@ -89,44 +90,32 @@ async fn disable() -> Response { /// Dumps a symbolized, gzipped pprof heap profile. /// -/// Returns `403` if profiling is not currently active. Activate first with +/// Returns `409 Conflict` if profiling is not currently active. Activate first with /// `POST /debug/pprof/enable`. /// /// The response body is a gzipped pprof protobuf (`heap.pb.gz`), ready for /// `go tool pprof` or Speedscope without shipping a separate binary. 
async fn heap() -> Response { let Some(ctl) = &*PROF_CTL else { - return (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response(); + return unavailable(); }; - let mut guard = ctl.lock().await; + let mut guard = ctl.lock().await; if !guard.activated() { - return ( - StatusCode::FORBIDDEN, - "profiling not active; POST /debug/pprof/enable first", - ) - .into_response(); + return (StatusCode::CONFLICT, "profiling not enabled").into_response(); } + objectstore_log::info!("Heap profile requested"); match guard.dump_pprof() { - Ok(bytes) => { - objectstore_log::info!("Heap profile requested"); - let headers = [ - ( - header::CONTENT_TYPE, - HeaderValue::from_static("application/octet-stream"), - ), - ( - header::CONTENT_DISPOSITION, - HeaderValue::from_static("attachment; filename=\"heap.pb.gz\""), - ), - ]; - (headers, bytes).into_response() - } + Ok(bytes) => ([(header::CONTENT_DISPOSITION, HEAP_DISPOSITION)], bytes).into_response(), Err(err) => (StatusCode::INTERNAL_SERVER_ERROR, err.to_string()).into_response(), } } +fn unavailable() -> Response { + (StatusCode::SERVICE_UNAVAILABLE, "profiling unavailable").into_response() +} + #[cfg(test)] mod tests { use std::net::SocketAddr; @@ -180,13 +169,13 @@ mod tests { .oneshot(request_with_peer("203.0.113.1:1234")) .await .unwrap(); - assert_eq!(resp.status(), StatusCode::FORBIDDEN); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); } #[tokio::test] async fn no_connect_info_rejected() { let req = Request::builder().uri("/test").body(Body::empty()).unwrap(); let resp = make_loopback_app().oneshot(req).await.unwrap(); - assert_eq!(resp.status(), StatusCode::FORBIDDEN); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); } }