scality · delthas · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/index.js b/index.js
@@ -7,4 +7,9 @@ require('werelogs').stderrUtils.catchAndTimestampStderr(
     require('cluster').isPrimary ? 1 : null,
 );
 
+// Start tracing before requiring anything that hooks into HTTP, MongoDB,
+// or ioredis — instrumentation patches modules on require, so anything
+// loaded earlier than init() would run unpatched.
+require('./lib/tracing').init();
+
 require('./lib/server.js')();
diff --git a/lib/api/api.js b/lib/api/api.js
diff --git a/lib/instrumentation/simple.js b/lib/instrumentation/simple.js
@@ -0,0 +1,92 @@
+'use strict';
+
+const tracing = require('../tracing');
+
+// Tracer shared by every instrumented handler; stays null until the
+// first call to getTracer().
+let tracer = null;
+
+/**
+ * Return the module-wide tracer, building it on first use.
+ *
+ * The OpenTelemetry API and package.json are required inside the
+ * function, so neither is loaded until a tracer is actually requested.
+ *
+ * @returns {object} the tracer registered as 'cloudserver-api' with the
+ *   version read from package.json
+ */
+function getTracer() {
+    if (!tracer) {
+        const otel = require('@opentelemetry/api');
+        const pkg = require('../../package.json');
+        tracer = otel.trace.getTracer('cloudserver-api', pkg.version);
+    }
+    return tracer;
+}
+
+/**
+ * Await a promise-like value and close the span once it settles.
+ *
+ * @param {Promise|object} promise - promise or thenable to wait on
+ * @param {function} endSpan - called with no argument on fulfilment, or
+ *   with the error on failure
+ * @returns {Promise} resolves with the settled value; rejects with the
+ *   same error that was handed to endSpan
+ */
+async function endSpanWhenSettled(promise, endSpan) {
+    let settledValue;
+    try {
+        settledValue = await promise;
+        // Kept inside the try: a throw from endSpan() itself must still
+        // reach the catch below.
+        endSpan();
+    } catch (failure) {
+        endSpan(failure);
+        throw failure;
+    }
+    return settledValue;
+}
+
+/**
+ * Wrap an API handler so that each invocation runs inside its own
+ * `api.<methodName>` span.
+ *
+ * When tracing.isEnabled() is falsy the handler is returned untouched,
+ * so no wrapper is installed at all. Otherwise the wrapper supports
+ * three handler shapes:
+ *  - callback style: the last function-typed argument is treated as the
+ *    callback and the span ends when it is invoked;
+ *  - promise style (no function argument, thenable result): the span
+ *    ends when the thenable settles;
+ *  - synchronous (no function argument, non-thenable result): the span
+ *    ends as soon as the handler returns.
+ * A synchronous throw from the handler ends the span with that error
+ * and is rethrown.
+ *
+ * @param {function} apiMethod - handler to instrument
+ * @param {string} methodName - suffix used to build the span name
+ * @returns {function} apiMethod itself when tracing is disabled,
+ *   otherwise the instrumented wrapper
+ */
+function instrumentApiMethod(apiMethod, methodName) {
+    if (!tracing.isEnabled()) {
+        return apiMethod;
+    }
+
+    // Required only after the isEnabled() check, so a disabled setup
+    // never loads the OpenTelemetry API from this function.
+    const api = require('@opentelemetry/api');
+    const spanName = `api.${methodName}`;
+
+    return function instrumented(...args) {
+        // The LAST function-typed argument is taken to be the completion
+        // callback; -1 means the handler is promise-style or synchronous.
+        const callbackIndex = args.findLastIndex(a => typeof a === 'function');
+        const span = getTracer().startSpan(spanName, { kind: api.SpanKind.INTERNAL });
+
+        // End-once guard. Multiple termination paths can race: the
+        // wrapped callback may fire and then the handler may also throw
+        // synchronously, or a callback-and-Promise hybrid handler may
+        // resolve after firing the callback.
+        let spanEnded = false;
+        const endSpan = err => {
+            if (spanEnded) {
+                return;
+            }
+            spanEnded = true;
+            if (err) {
+                span.recordException(err);
+                span.setStatus({ code: api.SpanStatusCode.ERROR });
+                // Only a truthy err.code is copied onto the span.
+                if (err.code) {
+                    span.setAttribute('cloudserver.error_code', err.code);
+                }
+            } else {
+                span.setStatus({ code: api.SpanStatusCode.OK });
+            }
+            span.end();
+        };
+
+        // Copy the arguments so the caller's array is never mutated when
+        // the callback slot is swapped for the span-ending wrapper.
+        const wrappedArgs = [...args];
+        if (callbackIndex !== -1) {
+            const originalCallback = args[callbackIndex];
+            wrappedArgs[callbackIndex] = function wrappedCallback(err, ...results) {
+                // End the span first, then forward the same `this` and
+                // arguments to the caller's callback.
+                endSpan(err);
+                return originalCallback.call(this, err, ...results);
+            };
+        }
+
+        // Run the handler with the new span set as the active span.
+        const ctx = api.trace.setSpan(api.context.active(), span);
+        try {
+            const result = api.context.with(ctx, () => apiMethod.apply(this, wrappedArgs));
+            if (callbackIndex === -1) {
+                if (result && typeof result.then === 'function') {
+                    return endSpanWhenSettled(result, endSpan);
+                }
+                // Synchronous handler without a callback: nothing left to
+                // wait for, so end the span now and fall through to the
+                // shared `return result` below.
+                endSpan();
+            }
+            // Callback-style handler: the wrapped callback drives the
+            // span lifecycle. If the handler also returns a thenable
+            // (hybrid migration shape), pass it through untouched —
+            // attaching a second .then() chain would surface as an
+            // unhandled rejection in callback-only callers.
+            return result;
+        } catch (error) {
+            // Synchronous throw from the handler; the end-once guard
+            // makes this a no-op if the callback already ended the span.
+            endSpan(error);
+            throw error;
+        }
+    };
+}
+
+module.exports = { instrumentApiMethod };
diff --git a/lib/server.js b/lib/server.js
@@ -6,6 +6,7 @@
 const { setServerHeader } = arsenal.s3routes.routesUtils;
 const { RedisClient, StatsClient } = arsenal.metrics;
 const monitoringClient = require('./utilities/monitoringHandler');
+const tracing = require('./tracing');
 
 const logger = require('./utilities/logger');
 const { internalHandlers } = require('./utilities/internalHandlers');
@@ -15,15 +16,11 @@
 const api = require('./api/api');
 const dataWrapper = require('./data/wrapper');
 const kms = require('./kms/wrapper');
-const locationStorageCheck =
-    require('./api/apiUtils/object/locationStorageCheck');
+const locationStorageCheck = require('./api/apiUtils/object/locationStorageCheck');
 const vault = require('./auth/vault');
 const metadata = require('./metadata/wrapper');
 const { initManagement } = require('./management');
-const {
-    initManagementClient,
-    isManagementAgentUsed,
-} = require('./management/agentClient');
+const { initManagementClient, isManagementAgentUsed } = require('./management/agentClient');
 const { startCleanupJob } = require('./api/apiUtils/rateLimit/cleanup');
 const { startRefillJob, stopRefillJob } = require('./api/apiUtils/rateLimit/refillJob');
 
@@ -46,8 +43,7 @@
 _config.on('location-constraints-update', () => {
     if (implName === 'multipleBackends') {
         const clients = parseLC(_config, vault);
-        client = new MultipleBackendGateway(
-            clients, metadata, locationStorageCheck);
+        client = new MultipleBackendGateway(clients, metadata, locationStorageCheck);
     }
 });
 
@@ -59,8 +55,7 @@
 // stats client
 const STATS_INTERVAL = 5; // 5 seconds
 const STATS_EXPIRY = 30; // 30 seconds
-const statsClient = new StatsClient(localCacheClient, STATS_INTERVAL,
-    STATS_EXPIRY);
+const statsClient = new StatsClient(localCacheClient, STATS_INTERVAL, STATS_EXPIRY);
 const enableRemoteManagement = true;
 
 class S3Server {
@@ -84,7 +79,7 @@
         process.on('SIGHUP', this.cleanUp.bind(this));
         process.on('SIGQUIT', this.cleanUp.bind(this));
         process.on('SIGTERM', this.cleanUp.bind(this));
-        process.on('SIGPIPE', () => { });
+        process.on('SIGPIPE', () => {});
         // This will pick up exceptions up the stack
         process.on('uncaughtException', err => {
             // If just send the error object results in empty
@@ -130,9 +125,10 @@
         const requestStartTime = process.hrtime.bigint();
 
         // Skip server access logs for heartbeat.
-        const isLoggingEnabled = _config.serverAccessLogs
-            && (_config.serverAccessLogs.mode === serverAccessLogsModes.LOG_ONLY
-                || _config.serverAccessLogs.mode === serverAccessLogsModes.ENABLED);
+        const isLoggingEnabled =
+            _config.serverAccessLogs &&
+            (_config.serverAccessLogs.mode === serverAccessLogsModes.LOG_ONLY ||
+                _config.serverAccessLogs.mode === serverAccessLogsModes.ENABLED);
         const isInternalRoute = req.url.startsWith('/_');
         const isBackbeatRoute = req.url.startsWith('/_/backbeat/');
         if (isLoggingEnabled && (!isInternalRoute || isBackbeatRoute)) {
@@ -176,9 +172,7 @@
                 labels.action = req.apiMethod;
             }
             monitoringClient.httpRequestsTotal.labels(labels).inc();
-            monitoringClient.httpRequestDurationSeconds
-                .labels(labels)
-                .observe(responseTimeInNs / 1e9);
+            monitoringClient.httpRequestDurationSeconds.labels(labels).observe(responseTimeInNs / 1e9);
             monitoringClient.httpActiveRequests.dec();
         };
         res.on('close', monitorEndOfRequest);
@@ -206,6 +200,7 @@
                 vault,
             },
         };
+
         arsenal.s3routes.routes(req, res, params, logger, this.config);
     }
 
@@ -231,14 +226,13 @@
         };
 
         let reqUids = req.headers['x-scal-request-uids'];
-        if (reqUids !== undefined && !/*isValidReqUids*/(reqUids.length < 128)) {
+        if (reqUids !== undefined && !(/*isValidReqUids*/ (reqUids.length < 128))) {
             // simply ignore invalid id (any user can provide an
             // invalid request ID through a crafted header)
             reqUids = undefined;
         }
-        const log = (reqUids !== undefined ?
-            logger.newRequestLoggerFromSerializedUids(reqUids) :
-            logger.newRequestLogger());
+        const log =
+            reqUids !== undefined ? logger.newRequestLoggerFromSerializedUids(reqUids) : logger.newRequestLogger();
         log.end().addDefaultFields(clientInfo);
 
         log.debug('received admin request', clientInfo);
@@ -292,8 +286,7 @@
         server.requestTimeout = 0; // disabling request timeout
 
         server.on('connection', socket => {
-            socket.on('error', err => logger.info('request rejected',
-                { error: err }));
+            socket.on('error', err => logger.info('request rejected', { error: err }));
         });
 
         // https://nodejs.org/dist/latest-v6.x/
@@ -309,8 +302,11 @@
             };
             const { address } = addr;
             logger.info('server started', {
-                address, port,
-                pid: process.pid, serverIP: address, serverPort: port
+                address,
+                port,
+                pid: process.pid,
+                serverIP: address,
+                serverPort: port,
             });
         });
 
@@ -323,32 +319,41 @@
         this.servers.push(server);
     }
 
-    /*
-     * This exits the running process properly.
-     */
-    cleanUp() {
+    /**
+     * Shut the process down in order: stop the refill job, close every
+     * HTTP server, close tracing, then exit with code 0.
+     *
+     * Registered as the SIGHUP / SIGQUIT / SIGTERM handler. A failure
+     * while closing is logged instead of silently dropped, and the
+     * process still exits with code 0.
+     *
+     * @returns {Promise<void>} never settles in practice, since the
+     *   process exits in the finally block
+     */
+    async cleanUp() {
         logger.info('server shutting down');
-        // Stop token refill job if running
+        // Stop the token refill job when rate limiting is enabled.
         if (this.config.rateLimiting?.enabled) {
             stopRefillJob(logger);
         }
-        Promise.all(this.servers.map(server =>
-            new Promise(resolve => server.close(resolve))
-        )).then(() => process.exit(0));
+        try {
+            await Promise.all(this.servers.map(server => new Promise(resolve => server.close(resolve))));
+            // Closed after the servers so in-flight requests finish
+            // first; presumably this flushes buffered spans (confirm
+            // against lib/tracing).
+            await tracing.close();
+        } catch (err) {
+            // Without this, process.exit() in the finally block would
+            // discard the rejection before anything could report it.
+            logger.error('error during server shutdown', { error: err });
+        } finally {
+            process.exit(0);
+        }
     }
 
-    caughtExceptionShutdown() {
+    /**
+     * Close tracing and terminate after a fatal error.
+     * NOTE(review): presumably called from the 'uncaughtException'
+     * handler registered in the constructor — confirm against the caller.
+     *
+     * Without cluster mode the process exits with code 1 once
+     * tracing.close() has settled. In cluster mode the failure is logged
+     * and, when a worker handle is present, the worker is killed after
+     * tracing.close() has settled. Both exits sit in `finally`, so a
+     * failing close cannot prevent termination.
+     *
+     * @returns {Promise<void>}
+     */
+    async caughtExceptionShutdown() {
         if (!this.cluster) {
-            process.exit(1);
+            try {
+                await tracing.close();
+            } finally {
+                process.exit(1);
+            }
+            // Only reachable if process.exit() returns (for instance when
+            // it is stubbed); keeps the cluster path below from running.
+            return;
         }
         logger.error('shutdown of worker due to exception', {
             workerId: this.worker ? this.worker.id : undefined,
             workerPid: this.worker ? this.worker.process.pid : undefined,
         });
-        // Will close all servers, cause disconnect event on primary and kill
-        // worker process with 'SIGTERM'.
+        // worker.kill() is graceful (closes servers, disconnects IPC) but
+        // does not fire our SIGTERM handler, so the BatchSpanProcessor
+        // would lose buffered spans without an explicit flush here.
         if (this.worker) {
-            this.worker.kill();
+            try {
+                await tracing.close();
+            } finally {
+                this.worker.kill();
+            }
         }
     }
 
@@ -363,10 +368,7 @@
     }
 
     initiateStartup(log) {
-        series([
-            next => metadata.setup(next),
-            next => clientCheck(true, log, next),
-        ], (err, results) => {
+        series([next => metadata.setup(next), next => clientCheck(true, log, next)], (err, results) => {
             if (err) {
                 log.warn('initial health check failed, delaying startup', {
                     error: err,
@@ -417,8 +419,10 @@
 
             try {
                 logger.info('ServerAccessLogger config', { config: _config.serverAccessLogs });
-                if (_config.serverAccessLogs.mode === serverAccessLogsModes.LOG_ONLY
-                    || _config.serverAccessLogs.mode === serverAccessLogsModes.ENABLED) {
+                if (
+                    _config.serverAccessLogs.mode === serverAccessLogsModes.LOG_ONLY ||
+                    _config.serverAccessLogs.mode === serverAccessLogsModes.ENABLED
+                ) {
                     var serverAccessLogger = new ServerAccessLogger(
                         _config.serverAccessLogs.outputFile,
                         _config.serverAccessLogs.highWaterMarkBytes,
@@ -434,7 +438,6 @@
                 logger.error('ServerAccessLogger creation error', error);
             }
 
-
             this.started = true;
         });
     }
@@ -490,8 +493,7 @@
         });
 
         const metricServer = new S3Server(_config);
-        metricServer.startServer(_config.metricsListenOn,
-            _config.metricsPort, metricServer.routeAdminRequest);
+        metricServer.startServer(_config.metricsListenOn, _config.metricsPort, metricServer.routeAdminRequest);
     }
     if (_config.isCluster && cluster.isWorker) {
         const server = new S3Server(_config, cluster.worker);

diff --git a/lib/tracing/healthPaths.js b/lib/tracing/healthPaths.js
@@ -0,0 +1,18 @@
+'use strict';
+
+// Liveness/readiness probes and the metrics scrape endpoint: requests
+// to these paths must never produce a span. They are dropped at ingest
+// rather than at the trace backend, because probe rate × pod count ×
+// always-on sampling would flood the exporter and storage with traffic
+// nobody queries.
+const HEALTH_PATHS = new Set([
+    '/live',
+    '/ready',
+    '/_/healthcheck',
+    '/_/healthcheck/deep',
+    '/metrics',
+]);
+
+/**
+ * Tell whether a request URL targets one of the health/scrape paths.
+ *
+ * Only the part before the first '?' is compared, and the match is
+ * exact: no prefix matching and no trailing-slash normalisation.
+ *
+ * @param {*} url - request URL; anything other than a non-empty string
+ *   yields false
+ * @returns {boolean} true when the path is in HEALTH_PATHS
+ */
+function isHealthPath(url) {
+    if (typeof url === 'string' && url.length > 0) {
+        const [pathOnly] = url.split('?', 1);
+        return HEALTH_PATHS.has(pathOnly);
+    }
+    return false;
+}
+
+module.exports = { isHealthPath };