From d3c92cd33800932bbb71948b58d958b89825dc25 Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 21:44:18 +0300 Subject: [PATCH 1/4] test(radius): isolate RadiusConnSecurityTest from leaked static state RadiusConn keeps server-availability state in two static fields that outlive any single connection: the SERVER_STATUS map and the scheduled RADIUSMonitor health-check timer. These leaked across the 13 test methods, making the suite order-dependent and flaky under CI timing: - failoverToSecondary permanently recorded its dead primary as OFFLINE and scheduled a background monitor that kept probing. - attackerForgedAccessAccept_withZeroAuthenticator_isRejected then saw getOnlineServer() return null ("No RADIUS server is online.") when an ephemeral port number was reused, since RADIUSServer keys on host+port. - healthCheckSurvivesClientSocketClose had its manually-driven monitor.run() churn through stale OFFLINE entries (multiple 2s socket timeouts) while racing the leaked background monitor, so the live server was never promoted back to ONLINE. Add resetRadiusConnStatics(), invoked in @BeforeMethod and @AfterMethod, which cancels and clears the static serverMonitor (unscheduling the background timer) and clears SERVER_STATUS via reflection. Each method now starts from a clean slate. All 13 tests pass, green and stable across repeated runs (~4.8s vs the 20s timeout-laden CI run). --- .../radius/client/RadiusConnSecurityTest.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index b4a04220bd..367a808b50 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -65,6 +65,15 @@ public class RadiusConnSecurityTest { @BeforeMethod public void startServerSocket() throws IOException { + // RadiusConn keeps the server-availability map and the health-check timer in static + // fields that outlive any single connection. Left untouched they leak across test + // methods: e.g. failoverToSecondary() permanently records its dead primary as OFFLINE + // and schedules a background RADIUSMonitor that keeps probing. A later test can then + // see getOnlineServer() return null ("No RADIUS server is online.") when an ephemeral + // port number is reused, or have its manually-driven monitor.run() race the background + // one. Reset the shared statics so every method starts from a clean slate. + resetRadiusConnStatics(); + serverSocket = new DatagramSocket(0, InetAddress.getByName("127.0.0.1")); serverSocket.setSoTimeout(5000); serverRunning = true; @@ -84,6 +93,37 @@ public void stopServer() { Thread.currentThread().interrupt(); } } + // Tear down any state/timer this test scheduled so it cannot bleed into the next one. + resetRadiusConnStatics(); + } + + /** + * Cancel any scheduled health-check monitor and clear the static server-status map on + * {@link RadiusConn}, isolating each test method from the shared singleton state. + */ + private static void resetRadiusConnStatics() { + try { + final java.lang.reflect.Field monitorField = RadiusConn.class.getDeclaredField("serverMonitor"); + monitorField.setAccessible(true); + final Object monitor = monitorField.get(null); + if (monitor != null) { + // RADIUSMonitor extends GeneralTaskRunnable, whose cancel() unschedules it from + // the shared SystemTimer so no background thread keeps probing. + monitor.getClass().getMethod("cancel").invoke(monitor); + monitorField.set(null, null); + } + + final java.lang.reflect.Field statusField = RadiusConn.class.getDeclaredField("SERVER_STATUS"); + statusField.setAccessible(true); + @SuppressWarnings("unchecked") + final java.util.Map serverStatus = + (java.util.Map) statusField.get(null); + synchronized (serverStatus) { + serverStatus.clear(); + } + } catch (ReflectiveOperationException roe) { + throw new IllegalStateException("Unable to reset RadiusConn static state for test isolation", roe); + } } private RadiusConn newClient() throws IOException { From e3679a5da2c25778e6b5eb6f507b61f1bd740c3c Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 22:05:26 +0300 Subject: [PATCH 2/4] test(radius): de-flake RadiusConnSecurityTest on CI Two independent CI-only flakes in RadiusConnSecurityTest, both rooted in test-harness timing rather than in RadiusConn's security logic. 1. Leaked static state across test methods. RadiusConn keeps server-availability in two static fields that outlive any single connection: the SERVER_STATUS map and the scheduled RADIUSMonitor health-check timer. failoverToSecondary marks its dead primary OFFLINE and schedules a background monitor; left running it bled into later methods, so healthCheckSurvivesClientSocketClose had its manually-driven monitor.run() churn through stale OFFLINE entries (multiple socket timeouts) while racing the leaked monitor, and the live server was never promoted back to ONLINE. Fix: resetRadiusConnStatics(), invoked in @BeforeMethod and @AfterMethod, cancels and nulls the static serverMonitor (unscheduling the background timer) and clears SERVER_STATUS via reflection, so each method starts from a clean slate. 2. Too-tight client read timeout on the cold first method. attackerForgedAccessAccept_withZeroAuthenticator_isRejected is the first @Test in source order and pays JVM warm-up / class-loading cost. On a CPU-constrained CI runner the responder thread is briefly starved and the 2s socket timeout fired before the forged reply arrived; the client then marked its only server OFFLINE and surfaced "No RADIUS server is online." instead of the asserted authenticator rejection. Fix: widen the test client read timeout from 2s to 15s. Every test using this client receives a response and returns as soon as it arrives, so the larger timeout never slows the happy path; it only absorbs CI scheduling jitter. Suite is green and stable across repeated local runs (~4.8s). --- .../modules/radius/client/RadiusConnSecurityTest.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index 367a808b50..8fae70fc9e 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -133,8 +133,14 @@ private RadiusConn newClient() throws IOException { private RadiusConn newClient(boolean strict) throws IOException { final Set servers = new HashSet<>(); servers.add(new RADIUSServer("127.0.0.1", serverSocket.getLocalPort())); - // 2-second timeout; tests respond synchronously well within that. - return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 2, null, 60, strict); + // Generous 15-second read timeout. Every test that uses this client receives a response + // (forged, malformed, or legitimate) and returns as soon as it arrives, so a large timeout + // never slows the happy path. It only guards against a false "No RADIUS server is online." + // failure when the very first test method pays JVM warm-up / class-loading cost and the + // responder thread is briefly starved on a CPU-constrained CI runner: a too-tight timeout + // there makes the client give up, mark its only server OFFLINE and report "no server" in + // place of the authenticator rejection the test actually asserts. + return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 15, null, 60, strict); } /** Start a background responder that crafts a reply per the supplied lambda. */ From e6b75c70c6d350584375a3b1eb602301d3e18867 Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 22:29:00 +0300 Subject: [PATCH 3/4] test(radius): keep test responder alive across its read timeout Root-cause fix for the remaining CI flake in attackerForgedAccessAccept_withZeroAuthenticator_isRejected, which kept failing with "No RADIUS server is online." despite the earlier static- state isolation and the widened client timeout. The background UDP responder runs on a server socket with a 5s SO_TIMEOUT (@BeforeMethod). Its receive loop rethrew every IOException - including SocketTimeoutException - which killed the responder thread whenever no request arrived within 5s. The very first test method can be slow to *send* its first packet (cold-JVM class loading, and authenticate()'s InetAddress.getLocalHost() blocking on reverse DNS on a CI host). When that pre-send delay exceeds 5s the responder dies first; the client's request then goes unanswered, the client times out, marks its only server OFFLINE and reports "No RADIUS server is online." instead of the asserted authenticator rejection. Locally the client always sends within 5s, so the responder replies before its timeout - hence it only failed on CI. Treat SocketTimeoutException in the responder loop as "no request yet" and keep waiting, so the responder survives an arbitrarily slow client. Reproduced deterministically by injecting a 7s pre-send delay: fails ("No RADIUS server is online.", ~17s) without this change, passes with it. Also tighten the test client read timeout from 15s to 10s: with the responder no longer dying, the large value is only defence-in-depth against scheduling jitter and no longer needs to mask the real bug. --- .../radius/client/RadiusConnSecurityTest.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java index 8fae70fc9e..c697e5301a 100644 --- a/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java +++ b/openam-authentication/openam-auth-radius/src/test/java/com/sun/identity/authentication/modules/radius/client/RadiusConnSecurityTest.java @@ -133,14 +133,11 @@ private RadiusConn newClient() throws IOException { private RadiusConn newClient(boolean strict) throws IOException { final Set servers = new HashSet<>(); servers.add(new RADIUSServer("127.0.0.1", serverSocket.getLocalPort())); - // Generous 15-second read timeout. Every test that uses this client receives a response - // (forged, malformed, or legitimate) and returns as soon as it arrives, so a large timeout - // never slows the happy path. It only guards against a false "No RADIUS server is online." - // failure when the very first test method pays JVM warm-up / class-loading cost and the - // responder thread is briefly starved on a CPU-constrained CI runner: a too-tight timeout - // there makes the client give up, mark its only server OFFLINE and report "no server" in - // place of the authenticator rejection the test actually asserts. - return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 15, null, 60, strict); + // 10-second read timeout (defence-in-depth; the real CI de-flake is the responder no + // longer dying on its own read timeout - see startResponder). Every test using this client + // receives a response and returns as soon as it arrives, so a generous timeout never slows + // the happy path; it only adds margin against scheduling jitter on a loaded CI runner. + return new RadiusConn(servers, Collections.emptySet(), SHARED_SECRET, 10, null, 60, strict); } /** Start a background responder that crafts a reply per the supplied lambda. */ @@ -152,6 +149,15 @@ private void startResponder(Responder responder) { final DatagramPacket dp = new DatagramPacket(buf, buf.length); try { serverSocket.receive(dp); + } catch (java.net.SocketTimeoutException ste) { + // The server socket carries a 5s SO_TIMEOUT (see @BeforeMethod). A read + // timeout only means no request has arrived *yet* - it must NOT kill the + // responder. The client can legitimately be slow to send its first packet + // (cold-JVM class loading, or InetAddress.getLocalHost() blocking on reverse + // DNS on a CI host), and if the responder died here it would never answer the + // request that eventually arrives, leaving the client to time out and report + // "No RADIUS server is online." Keep waiting instead. + continue; } catch (IOException e) { if (!serverRunning) { return; From 4d703dfb28576eeb4aea5be7f558b43d7ab41bdc Mon Sep 17 00:00:00 2001 From: Valera V Harseko Date: Mon, 15 Jun 2026 23:27:59 +0300 Subject: [PATCH 4/4] test(server): harden setup integration tests against flaky CI IT_SetupWithOpenDJ.testSetupWithOpendj intermittently fails on CI with a TimeoutException while waiting for the OpenAM installation to complete. The Selenium + Docker + Cargo setup tests drive a full OpenAM install through a real browser against a Tomcat and an OpenDJ container, so they are sensitive to resource contention on a shared CI runner (a container slow to become healthy, a Tomcat slow to deploy the heavy OpenAM web app, or a ChromeDriver command stalling under load). Treat these as infra flakes rather than a product bug: - Add RetryAnalyzer + RetryListener: every integration-test method is retried once (configurable via -Dit.retry.count) before failing the build. Each retry starts from a clean config and a freshly started OpenAM instance (CargoBaseTest), so a transient infra hiccup gets a second chance without masking a reproducible failure. Wired globally via @Listeners on BaseTest so both IT classes inherit it. - Raise the testSetupWithOpendj install-completion wait from 600s to 900s: this variant installs against an external OpenDJ and makes OpenAM create the base entry itself, which under contention can exceed the old 10-minute ceiling. --- .../openam/test/integration/BaseTest.java | 2 + .../test/integration/IT_SetupWithOpenDJ.java | 7 ++- .../test/integration/RetryAnalyzer.java | 56 +++++++++++++++++++ .../test/integration/RetryListener.java | 41 ++++++++++++++ 4 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryAnalyzer.java create mode 100644 openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryListener.java diff --git a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/BaseTest.java b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/BaseTest.java index 324add2b71..d0e17b53df 100644 --- a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/BaseTest.java +++ b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/BaseTest.java @@ -32,6 +32,7 @@ import org.testng.annotations.AfterClass; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeClass; +import org.testng.annotations.Listeners; import java.io.BufferedReader; import java.io.File; @@ -42,6 +43,7 @@ import java.time.Duration; import java.util.List; +@Listeners(RetryListener.class) public abstract class BaseTest { WebDriver driver; WebDriverWait wait; diff --git a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/IT_SetupWithOpenDJ.java b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/IT_SetupWithOpenDJ.java index 81ef857826..279e2f228a 100644 --- a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/IT_SetupWithOpenDJ.java +++ b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/IT_SetupWithOpenDJ.java @@ -146,7 +146,12 @@ private void testOpenAmInstallation(String openamUrl, Integer opendjPort) throws wait.until(ExpectedConditions.elementToBeClickable(By.id("writeConfigButton"))).click(); - WebDriverWait waitComplete = new WebDriverWait(driver, Duration.ofSeconds(600)); + // Generous completion budget: this variant installs against an external OpenDJ container and + // (for testSetupWithOpendj) makes OpenAM create the base entry itself, which under CI + // resource contention can push the install past the previous 10-minute ceiling. Combined + // with the suite-wide retry (see RetryListener) this keeps transient infra slowness from + // failing the build. + WebDriverWait waitComplete = new WebDriverWait(driver, Duration.ofSeconds(900)); try { WebElement proceedToConsole = waitComplete.until(visibilityOfAnyElement(By.cssSelector("#confComplete a"))); proceedToConsole.click(); diff --git a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryAnalyzer.java b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryAnalyzer.java new file mode 100644 index 0000000000..d6c12aff7b --- /dev/null +++ b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryAnalyzer.java @@ -0,0 +1,56 @@ +/* + * The contents of this file are subject to the terms of the Common Development and + * Distribution License (the License). You may not use this file except in compliance with the + * License. + * + * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the + * specific language governing permission and limitations under the License. + * + * When distributing Covered Software, include this CDDL Header Notice in each file and include + * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL + * Header, with the fields enclosed by brackets [] replaced by your own identifying + * information: "Portions copyright [year] [name of copyright owner]". + * + * Copyright 2026 3A Systems LLC. + */ + +package org.openidentityplatform.openam.test.integration; + +import org.testng.IRetryAnalyzer; +import org.testng.ITestResult; + +/** + * Retries a failed integration-test method a small number of times before giving up. + * + *

The Selenium/Docker/Cargo integration tests drive a full OpenAM installation through a real + * browser against a Tomcat and an OpenDJ container. They are inherently sensitive to resource + * contention on a shared CI runner: a container that is slow to become healthy, a Tomcat that is + * slow to deploy the heavy OpenAM web app, or a ChromeDriver command that stalls under load can all + * make an otherwise-correct run time out. Rather than fail the whole build on such transient + * infrastructure hiccups, give each test method one more attempt (each retry starts from a clean + * config and a freshly started OpenAM instance - see {@link CargoBaseTest}). + * + *

The retry budget can be overridden with the {@code it.retry.count} system property; it defaults + * to a single retry (two attempts total) so a genuine, reproducible failure still fails the build + * without exploding CI time on a test that takes minutes per attempt. + */ +public class RetryAnalyzer implements IRetryAnalyzer { + + private static final int MAX_RETRIES = + Integer.getInteger("it.retry.count", 1); + + private int attempts = 0; + + @Override + public boolean retry(ITestResult result) { + if (attempts < MAX_RETRIES) { + attempts++; + System.err.println("Retrying " + result.getTestClass().getName() + "." + + result.getMethod().getMethodName() + " (attempt " + (attempts + 1) + + " of " + (MAX_RETRIES + 1) + ") after failure: " + + (result.getThrowable() == null ? "n/a" : result.getThrowable().toString())); + return true; + } + return false; + } +} diff --git a/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryListener.java b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryListener.java new file mode 100644 index 0000000000..f7a4e7f66e --- /dev/null +++ b/openam-server/src/test/java/org/openidentityplatform/openam/test/integration/RetryListener.java @@ -0,0 +1,41 @@ +/* + * The contents of this file are subject to the terms of the Common Development and + * Distribution License (the License). You may not use this file except in compliance with the + * License. + * + * You can obtain a copy of the License at legal/CDDLv1.0.txt. See the License for the + * specific language governing permission and limitations under the License. + * + * When distributing Covered Software, include this CDDL Header Notice in each file and include + * the License file at legal/CDDLv1.0.txt. If applicable, add the following below the CDDL + * Header, with the fields enclosed by brackets [] replaced by your own identifying + * information: "Portions copyright [year] [name of copyright owner]". + * + * Copyright 2026 3A Systems LLC. + */ + +package org.openidentityplatform.openam.test.integration; + +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.testng.IAnnotationTransformer; +import org.testng.annotations.ITestAnnotation; + +/** + * Installs {@link RetryAnalyzer} on every integration-test method automatically, so the flaky-CI + * retry behaviour does not have to be repeated on each {@code @Test}. Registered once via + * {@code @Listeners} on {@link BaseTest}; any test method that already declares its own retry + * analyzer is left untouched. + */ +public class RetryListener implements IAnnotationTransformer { + + @Override + public void transform(ITestAnnotation annotation, Class testClass, Constructor testConstructor, + Method testMethod) { + // TestNG 6.x API: a method with no explicit retry analyzer returns null here. + if (annotation.getRetryAnalyzer() == null) { + annotation.setRetryAnalyzer(RetryAnalyzer.class); + } + } +}