diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index b4a04220bd..c697e5301a 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -65,6 +65,15 @@ public class RadiusConnSecurityTest { @BeforeMethod public void startServerSocket() throws IOException { + // RadiusConn keeps the server-availability map and the health-check timer in static + // fields that outlive any single connection. Left untouched they leak across test + // methods: e.g. failoverToSecondary() permanently records its dead primary as OFFLINE + // and schedules a background RADIUSMonitor that keeps probing. A later test can then + // see getOnlineServer() return null ("No RADIUS server is online.") when an ephemeral + // port number is reused, or have its manually-driven monitor.run() race the background + // one. Reset the shared statics so every method starts from a clean slate. + resetRadiusConnStatics(); + serverSocket = new DatagramSocket(0, InetAddress.getByName("127.0.0.1")); serverSocket.setSoTimeout(5000); serverRunning = true; @@ -84,6 +93,37 @@ public void stopServer() { Thread.currentThread().interrupt(); } } + // Tear down any state/timer this test scheduled so it cannot bleed into the next one. + resetRadiusConnStatics(); + } + + /** + * Cancel any scheduled health-check monitor and clear the static server-status map on + * {@link RadiusConn}, isolating each test method from the shared singleton state. + */ + private static void resetRadiusConnStatics() { + try { + final java.lang.reflect.Field monitorField = RadiusConn.class.getDeclaredField("serverMonitor"); + monitorField.setAccessible(true); + final Object monitor = monitorField.get(null); + if (monitor != null) { + // RADIUSMonitor extends GeneralTaskRunnable, whose cancel() unschedules it from + // the shared SystemTimer so no background thread keeps probing. + monitor.getClass().getMethod("cancel").invoke(monitor); + monitorField.set(null, null); + } + + final java.lang.reflect.Field statusField = RadiusConn.class.getDeclaredField("SERVER_STATUS"); + statusField.setAccessible(true); + @SuppressWarnings("unchecked") + final java.util.Map serverStatus = + (java.util.Map) statusField.get(null); + synchronized (serverStatus) { + serverStatus.clear(); + } + } catch (ReflectiveOperationException roe) { + throw new IllegalStateException("Unable to reset RadiusConn static state for test isolation", roe); + } } private RadiusConn newClient() throws IOException { @@ -93,8 +133,11 @@ private RadiusConn newClient() throws IOException { private RadiusConn newClient(boolean strict) throws IOException { final Set servers = new HashSet<>(); servers.add(new RADIUSServer("127.0.0.1", serverSocket.getLocalPort())); - // 2-second timeout; tests respond synchronously well within that. - return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 2, null, 60, strict); + // 10-second read timeout (defence-in-depth; the real CI de-flake is the responder no + // longer dying on its own read timeout - see startResponder). Every test using this client + // receives a response and returns as soon as it arrives, so a generous timeout never slows + // the happy path; it only adds margin against scheduling jitter on a loaded CI runner. + return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 10, null, 60, strict); } /** Start a background responder that crafts a reply per the supplied lambda. */ @@ -106,6 +149,15 @@ private void startResponder(Responder responder) { final DatagramPacket dp = new DatagramPacket(buf, buf.length); try { serverSocket.receive(dp); + } catch (java.net.SocketTimeoutException ste) { + // The server socket carries a 5s SO_TIMEOUT (see @BeforeMethod). A read + // timeout only means no request has arrived *yet* - it must NOT kill the + // responder. The client can legitimately be slow to send its first packet + // (cold-JVM class loading, or InetAddress.getLocalHost() blocking on reverse + // DNS on a CI host), and if the responder died here it would never answer the + // request that eventually arrives, leaving the client to time out and report + // "No RADIUS server is online." Keep waiting instead. + continue; } catch (IOException e) { if (!serverRunning) { return;