From d3c92cd33800932bbb71948b58d958b89825dc25 Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 21:44:18 +0300 Subject: [PATCH 1/3] test(radius): isolate RadiusConnSecurityTest from leaked static state RadiusConn keeps server-availability state in two static fields that outlive any single connection: the SERVER_STATUS map and the scheduled RADIUSMonitor health-check timer. These leaked across the 13 test methods, making the suite order-dependent and flaky under CI timing: - failoverToSecondary permanently recorded its dead primary as OFFLINE and scheduled a background monitor that kept probing. - attackerForgedAccessAccept_withZeroAuthenticator_isRejected then saw getOnlineServer() return null ("No RADIUS server is online.") when an ephemeral port number was reused, since RADIUSServer keys on host+port. - healthCheckSurvivesClientSocketClose had its manually-driven monitor.run() churn through stale OFFLINE entries (multiple 2s socket timeouts) while racing the leaked background monitor, so the live server was never promoted back to ONLINE. Add resetRadiusConnStatics(), invoked in @BeforeMethod and @AfterMethod, which cancels and clears the static serverMonitor (unscheduling the background timer) and clears SERVER_STATUS via reflection. Each method now starts from a clean slate. All 13 tests pass, green and stable across repeated runs (~4.8s vs the 20s timeout-laden CI run). --- .../radius/client/RadiusConnSecurityTest.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index b4a04220bd..367a808b50 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -65,6 +65,15 @@ public class RadiusConnSecurityTest { @BeforeMethod public void startServerSocket() throws IOException { + // RadiusConn keeps the server-availability map and the health-check timer in static + // fields that outlive any single connection. Left untouched they leak across test + // methods: e.g. failoverToSecondary() permanently records its dead primary as OFFLINE + // and schedules a background RADIUSMonitor that keeps probing. A later test can then + // see getOnlineServer() return null ("No RADIUS server is online.") when an ephemeral + // port number is reused, or have its manually-driven monitor.run() race the background + // one. Reset the shared statics so every method starts from a clean slate. + resetRadiusConnStatics(); + serverSocket = new DatagramSocket(0, InetAddress.getByName("127.0.0.1")); serverSocket.setSoTimeout(5000); serverRunning = true; @@ -84,6 +93,37 @@ public void stopServer() { Thread.currentThread().interrupt(); } } + // Tear down any state/timer this test scheduled so it cannot bleed into the next one. + resetRadiusConnStatics(); + } + + /** + * Cancel any scheduled health-check monitor and clear the static server-status map on + * {@link RadiusConn}, isolating each test method from the shared singleton state. + */ + private static void resetRadiusConnStatics() { + try { + final java.lang.reflect.Field monitorField = RadiusConn.class.getDeclaredField("serverMonitor"); + monitorField.setAccessible(true); + final Object monitor = monitorField.get(null); + if (monitor != null) { + // RADIUSMonitor extends GeneralTaskRunnable, whose cancel() unschedules it from + // the shared SystemTimer so no background thread keeps probing. + monitor.getClass().getMethod("cancel").invoke(monitor); + monitorField.set(null, null); + } + + final java.lang.reflect.Field statusField = RadiusConn.class.getDeclaredField("SERVER_STATUS"); + statusField.setAccessible(true); + @SuppressWarnings("unchecked") + final java.util.Map serverStatus = + (java.util.Map) statusField.get(null); + synchronized (serverStatus) { + serverStatus.clear(); + } + } catch (ReflectiveOperationException roe) { + throw new IllegalStateException("Unable to reset RadiusConn static state for test isolation", roe); + } } private RadiusConn newClient() throws IOException { From e3679a5da2c25778e6b5eb6f507b61f1bd740c3c Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 22:05:26 +0300 Subject: [PATCH 2/3] test(radius): de-flake RadiusConnSecurityTest on CI Two independent CI-only flakes in RadiusConnSecurityTest, both rooted in test-harness timing rather than in RadiusConn's security logic. 1. Leaked static state across test methods. RadiusConn keeps server-availability in two static fields that outlive any single connection: the SERVER_STATUS map and the scheduled RADIUSMonitor health-check timer. failoverToSecondary marks its dead primary OFFLINE and schedules a background monitor; left running it bled into later methods, so healthCheckSurvivesClientSocketClose had its manually-driven monitor.run() churn through stale OFFLINE entries (multiple socket timeouts) while racing the leaked monitor, and the live server was never promoted back to ONLINE. Fix: resetRadiusConnStatics(), invoked in @BeforeMethod and @AfterMethod, cancels and nulls the static serverMonitor (unscheduling the background timer) and clears SERVER_STATUS via reflection, so each method starts from a clean slate. 2. Too-tight client read timeout on the cold first method. attackerForgedAccessAccept_withZeroAuthenticator_isRejected is the first @Test in source order and pays JVM warm-up / class-loading cost. On a CPU-constrained CI runner the responder thread is briefly starved and the 2s socket timeout fired before the forged reply arrived; the client then marked its only server OFFLINE and surfaced "No RADIUS server is online." instead of the asserted authenticator rejection. Fix: widen the test client read timeout from 2s to 15s. Every test using this client receives a response and returns as soon as it arrives, so the larger timeout never slows the happy path; it only absorbs CI scheduling jitter. Suite is green and stable across repeated local runs (~4.8s). --- .../modules/radius/client/RadiusConnSecurityTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index 367a808b50..8fae70fc9e 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -133,8 +133,14 @@ private RadiusConn newClient() throws IOException { private RadiusConn newClient(boolean strict) throws IOException { final Set servers = new HashSet<>(); servers.add(new RADIUSServer("127.0.0.1", serverSocket.getLocalPort())); - // 2-second timeout; tests respond synchronously well within that. - return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 2, null, 60, strict); + // Generous 15-second read timeout. Every test that uses this client receives a response + // (forged, malformed, or legitimate) and returns as soon as it arrives, so a large timeout + // never slows the happy path. It only guards against a false "No RADIUS server is online." + // failure when the very first test method pays JVM warm-up / class-loading cost and the + // responder thread is briefly starved on a CPU-constrained CI runner: a too-tight timeout + // there makes the client give up, mark its only server OFFLINE and report "no server" in + // place of the authenticator rejection the test actually asserts. + return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 15, null, 60, strict); } /** Start a background responder that crafts a reply per the supplied lambda. */ From e6b75c70c6d350584375a3b1eb602301d3e18867 Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 22:29:00 +0300 Subject: [PATCH 3/3] test(radius): keep test responder alive across its read timeout Root-cause fix for the remaining CI flake in attackerForgedAccessAccept_withZeroAuthenticator_isRejected, which kept failing with "No RADIUS server is online." despite the earlier static- state isolation and the widened client timeout. The background UDP responder runs on a server socket with a 5s SO_TIMEOUT (@BeforeMethod). Its receive loop rethrew every IOException - including SocketTimeoutException - which killed the responder thread whenever no request arrived within 5s. The very first test method can be slow to *send* its first packet (cold-JVM class loading, and authenticate()'s InetAddress.getLocalHost() blocking on reverse DNS on a CI host). When that pre-send delay exceeds 5s the responder dies first; the client's request then goes unanswered, the client times out, marks its only server OFFLINE and reports "No RADIUS server is online." instead of the asserted authenticator rejection. Locally the client always sends within 5s, so the responder replies before its timeout - hence it only failed on CI. Treat SocketTimeoutException in the responder loop as "no request yet" and keep waiting, so the responder survives an arbitrarily slow client. Reproduced deterministically by injecting a 7s pre-send delay: fails ("No RADIUS server is online.", ~17s) without this change, passes with it. Also tighten the test client read timeout from 15s to 10s: with the responder no longer dying, the large value is only defence-in-depth against scheduling jitter and no longer needs to mask the real bug. --- .../radius/client/RadiusConnSecurityTest.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index 8fae70fc9e..c697e5301a 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -133,14 +133,11 @@ private RadiusConn newClient() throws IOException { private RadiusConn newClient(boolean strict) throws IOException { final Set servers = new HashSet<>(); servers.add(new RADIUSServer("127.0.0.1", serverSocket.getLocalPort())); - // Generous 15-second read timeout. Every test that uses this client receives a response - // (forged, malformed, or legitimate) and returns as soon as it arrives, so a large timeout - // never slows the happy path. It only guards against a false "No RADIUS server is online." - // failure when the very first test method pays JVM warm-up / class-loading cost and the - // responder thread is briefly starved on a CPU-constrained CI runner: a too-tight timeout - // there makes the client give up, mark its only server OFFLINE and report "no server" in - // place of the authenticator rejection the test actually asserts. - return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 15, null, 60, strict); + // 10-second read timeout (defence-in-depth; the real CI de-flake is the responder no + // longer dying on its own read timeout - see startResponder). Every test using this client + // receives a response and returns as soon as it arrives, so a generous timeout never slows + // the happy path; it only adds margin against scheduling jitter on a loaded CI runner. + return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 10, null, 60, strict); } /** Start a background responder that crafts a reply per the supplied lambda. */ @@ -152,6 +149,15 @@ private void startResponder(Responder responder) { final DatagramPacket dp = new DatagramPacket(buf, buf.length); try { serverSocket.receive(dp); + } catch (java.net.SocketTimeoutException ste) { + // The server socket carries a 5s SO_TIMEOUT (see @BeforeMethod). A read + // timeout only means no request has arrived *yet* - it must NOT kill the + // responder. The client can legitimately be slow to send its first packet + // (cold-JVM class loading, or InetAddress.getLocalHost() blocking on reverse + // DNS on a CI host), and if the responder died here it would never answer the + // request that eventually arrives, leaving the client to time out and report + // "No RADIUS server is online." Keep waiting instead. + continue; } catch (IOException e) { if (!serverRunning) { return;