From c1edd1411418aa7d5d5655a34a36071c0b82559f Mon Sep 17 00:00:00 2001 From: Gabriel J Mendoza Date: Fri, 20 Mar 2026 10:45:18 -0400 Subject: [PATCH 01/10] Update Splunk Enterprise version from 10.0.0 to 10.2.0 Co-Authored-By: Claude Opus 4.6 --- .env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env b/.env index 28360f2d8..599249271 100644 --- a/.env +++ b/.env @@ -8,4 +8,4 @@ EKSCTL_VERSION=v0.215.0 EKS_CLUSTER_K8_VERSION=1.34 EKS_INSTANCE_TYPE=m5.2xlarge EKS_INSTANCE_TYPE_ARM64=c6g.4xlarge -SPLUNK_ENTERPRISE_RELEASE_IMAGE=splunk/splunk:10.0.0 \ No newline at end of file +SPLUNK_ENTERPRISE_RELEASE_IMAGE=splunk/splunk:10.2.0 \ No newline at end of file From c6728e4e790263014bec32804990570ba26f49f2 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Wed, 25 Mar 2026 11:12:12 -0500 Subject: [PATCH 02/10] apply FIPS error handling to shc bundle push and premium apps --- pkg/splunk/enterprise/afwscheduler.go | 40 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go index 17cfb0ce4..c8ccbe9db 100644 --- a/pkg/splunk/enterprise/afwscheduler.go +++ b/pkg/splunk/enterprise/afwscheduler.go @@ -1688,13 +1688,38 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C return false, err } - // Check if we did not get the desired output in the status file. There can be 2 scenarios - + // Check if we did not get the desired output in the status file. There can be 3 scenarios - // 1. stdOut is empty, which means bundle push is still in progress - // 2. stdOut has some other string other than the bundle push success message + // 2. stdOut contains only informational lines (e.g. the FIPS provider banner written to + // stderr by the Splunk CLI on FIPS-enabled clusters, captured via the &> shell redirect + // in applySHCBundleCmdStr before the actual push output is written) + // 3. stdOut has some other string other than the bundle push success message if stdOut == "" { scopedLog.Info("SHC Bundle Push is still in progress") return false, nil } else if !strings.Contains(stdOut, shcBundlePushCompleteStr) { + // Check whether the file contains only known informational lines. On FIPS-enabled + // clusters the Splunk binary immediately writes the FIPS provider banner (and SSL + // warnings) to stderr at startup; because the bundle push command uses &> to + // redirect all output to the status file, these lines appear in the file before the + // actual push result. Treat such content as "still in progress" so we do not + // prematurely abort a running push and trigger a retry storm. + hasMeaningfulContent := false + for _, line := range strings.Split(stdOut, "\n") { + trimmed := strings.TrimSpace(line) + if trimmed == "" || + strings.HasPrefix(trimmed, "FIPS provider enabled.") || + strings.HasPrefix(trimmed, "WARNING: Server Certificate") { + continue + } + hasMeaningfulContent = true + break + } + if !hasMeaningfulContent { + scopedLog.Info("SHC Bundle Push is still in progress (status file contains only informational messages)") + return false, nil + } + // this means there was an error in bundle push command err = fmt.Errorf("there was an error in applying SHC Bundle, err=\"%v\"", stdOut) scopedLog.Error(err, "SHC Bundle push status file reported an error while applying bundle") @@ -2078,7 +2103,16 @@ func handleEsappPostinstall(rctx context.Context, preCtx *premiumAppScopePlayboo streamOptions := splutil.NewStreamOptionsObject(command) stdOut, stdErr, err := preCtx.localCtx.podExecClient.RunPodExecCommand(rctx, streamOptions, []string{"/bin/sh"}) - if stdErr != "" || err != nil { + + // Log stderr content for debugging but don't use it for error detection. + // On FIPS-enabled clusters the Splunk CLI always writes the FIPS provider + // banner and related informational messages to stderr on every invocation, + // so a non-empty stderr does not indicate failure. + if stdErr != "" { + scopedLog.Info("Post install command stderr output (informational only)", "stdout", stdOut, "stderr", stdErr, "post install command", command) + } + + if err != nil { phaseInfo.FailCount++ scopedLog.Error(err, "premium scoped app package install failed", "stdout", stdOut, "stderr", stdErr, "post install command", command, "failCount", phaseInfo.FailCount) return fmt.Errorf("premium scoped app package install failed. stdOut: %s, stdErr: %s, post install command: %s, failCount: %d", stdOut, stdErr, command, phaseInfo.FailCount) From 626fb2d2c18fdfc7d9bca6d8fe5bd9d8a25a8683 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Wed, 1 Apr 2026 15:36:19 -0500 Subject: [PATCH 03/10] add unit tests --- pkg/splunk/enterprise/afwscheduler_test.go | 341 +++++++++++++++++++++ 1 file changed, 341 insertions(+) diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go index 53d8eeef1..fd3bebe6c 100644 --- a/pkg/splunk/enterprise/afwscheduler_test.go +++ b/pkg/splunk/enterprise/afwscheduler_test.go @@ -4741,3 +4741,344 @@ func TestIsAppAlreadyInstalled(t *testing.T) { }) } } + +func TestSHCIsBundlePushComplete(t *testing.T) { + ctx := context.TODO() + cr := &enterpriseApi.SearchHeadCluster{ + TypeMeta: metav1.TypeMeta{ + Kind: "SearchHeadCluster", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "stack1", + Namespace: "test", + }, + } + + c := spltest.NewMockClient() + + catCmd := fmt.Sprintf("cat %s", shcBundlePushStatusCheckFile) + rmCmd := fmt.Sprintf("rm %s", shcBundlePushStatusCheckFile) + + tests := []struct { + name string + catStdOut string + catStdErr string + catErr error + expectsRemoval bool + removalStdErr string + expectedResult bool + expectedError bool + description string + }{ + { + name: "empty stdOut - bundle push still in progress", + catStdOut: "", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "Empty status file means push still in progress", + }, + { + name: "FIPS provider banner only - treated as still in progress", + catStdOut: "FIPS provider enabled.", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "Status file with only FIPS banner should not be treated as an error", + }, + { + name: "FIPS banner and WARNING lines only - treated as still in progress", + catStdOut: "FIPS provider enabled.\nWARNING: Server Certificate Validation Disabled\n", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "Status file with FIPS banner and SSL warnings should not be treated as an error", + }, + { + name: "FIPS banner and blank lines only - treated as still in progress", + catStdOut: "\nFIPS provider enabled.\n\n", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "Blank lines alongside FIPS banner should still be treated as informational", + }, + { + name: "FIPS banner followed by real error content - treated as error", + catStdOut: "FIPS provider enabled.\nError applying bundle: permission denied", + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "Meaningful error content after FIPS banner should cause an error", + }, + { + name: "meaningful error in stdOut - treated as error", + catStdOut: "Error while deploying apps", + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "Non-success, non-FIPS content is a real bundle push error", + }, + { + name: "stdErr from cat command - error", + catStdOut: "", + catStdErr: "cat: no such file or directory", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "Stderr from status file read indicates a failure", + }, + { + name: "exec error from cat command - error", + catStdOut: "", + catStdErr: "", + catErr: fmt.Errorf("pod exec failed"), + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "Exec error when reading status file should propagate", + }, + { + name: "bundle push complete success string - complete", + catStdOut: shcBundlePushCompleteStr, + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: true, + expectedError: false, + description: "Status file with success string means push complete", + }, + { + name: "FIPS banner preceding success string - complete", + catStdOut: "FIPS provider enabled.\n" + shcBundlePushCompleteStr, + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: true, + expectedError: false, + description: "FIPS banner before success string should still be recognized as complete", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + appDeployContext := &enterpriseApi.AppDeploymentContext{ + BundlePushStatus: enterpriseApi.BundlePushTracker{ + BundlePushStage: enterpriseApi.BundlePushInProgress, + }, + } + afwPipeline := &AppInstallPipeline{ + appDeployContext: appDeployContext, + } + + mockPodExecClient := &spltest.MockPodExecClient{Cr: cr} + + podExecCmds := []string{catCmd} + mockReturnCtxts := []*spltest.MockPodExecReturnContext{ + {StdOut: tt.catStdOut, StdErr: tt.catStdErr, Err: tt.catErr}, + } + + if tt.expectsRemoval { + podExecCmds = append(podExecCmds, rmCmd) + mockReturnCtxts = append(mockReturnCtxts, &spltest.MockPodExecReturnContext{ + StdOut: "", + StdErr: tt.removalStdErr, + }) + } + + mockPodExecClient.AddMockPodExecReturnContexts(ctx, podExecCmds, mockReturnCtxts...) + + shcCtx := &SHCPlaybookContext{ + client: c, + cr: cr, + afwPipeline: afwPipeline, + targetPodName: "splunk-stack1-searchheadcluster-0", + podExecClient: mockPodExecClient, + } + + result, err := shcCtx.isBundlePushComplete(ctx) + + if tt.expectedError { + if err == nil { + t.Errorf("Expected error for %q but got none", tt.description) + } + } else { + if err != nil { + t.Errorf("Unexpected error for %q: %v", tt.description, err) + } + } + + if result != tt.expectedResult { + t.Errorf("Expected result %v but got %v for %q", tt.expectedResult, result, tt.description) + } + }) + } +} + +func TestHandleEsappPostinstallFipsAware(t *testing.T) { + ctx := context.TODO() + + cr := enterpriseApi.Standalone{ + TypeMeta: metav1.TypeMeta{ + Kind: "Standalone", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "stack1", + Namespace: "test", + }, + Spec: enterpriseApi.StandaloneSpec{ + AppFrameworkConfig: enterpriseApi.AppFrameworkSpec{ + AppSources: []enterpriseApi.AppSourceSpec{ + { + Name: "appSrc1", + AppSourceDefaultSpec: enterpriseApi.AppSourceDefaultSpec{ + Scope: enterpriseApi.ScopePremiumApps, + PremiumAppsProps: enterpriseApi.PremiumAppsProps{ + Type: enterpriseApi.PremiumAppsTypeEs, + }, + }, + }, + }, + }, + }, + } + + appSrcSpec := &cr.Spec.AppFrameworkConfig.AppSources[0] + // The command registered in the mock only needs a prefix since matching uses strings.Contains. + esPostInstallCmdPrefix := "/opt/splunk/bin/splunk search" + + tests := []struct { + name string + stdOut string + stdErr string + execErr error + expectedError bool + description string + }{ + { + name: "success with no stderr - no error", + stdOut: "Successfully installed", + stdErr: "", + execErr: nil, + expectedError: false, + description: "Clean success should return nil", + }, + { + name: "success with FIPS stderr - no error", + stdOut: "Successfully installed", + stdErr: "FIPS provider enabled.", + execErr: nil, + expectedError: false, + description: "Stderr content alone should not cause failure on FIPS-enabled clusters", + }, + { + name: "success with WARNING stderr - no error", + stdOut: "Successfully installed", + stdErr: "WARNING: Server Certificate Validation Disabled", + execErr: nil, + expectedError: false, + description: "SSL warning in stderr alone should not cause failure", + }, + { + name: "exec error with no stderr - error", + stdOut: "", + stdErr: "", + execErr: fmt.Errorf("command terminated with exit code 1"), + expectedError: true, + description: "A real exec error must be surfaced", + }, + { + name: "exec error with FIPS stderr - error", + stdOut: "", + stdErr: "FIPS provider enabled.", + execErr: fmt.Errorf("essinstall failed"), + expectedError: true, + description: "Exec error takes precedence even when stderr carries only FIPS banner", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockPodExecClient := &spltest.MockPodExecClient{} + + mockPodExecClient.AddMockPodExecReturnContext(ctx, esPostInstallCmdPrefix, &spltest.MockPodExecReturnContext{ + StdOut: tt.stdOut, + StdErr: tt.stdErr, + Err: tt.execErr, + }) + + var replicas int32 = 1 + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "splunk-stack1", + Namespace: "test", + }, + Spec: appsv1.StatefulSetSpec{ + Replicas: &replicas, + }, + } + + c := spltest.NewMockClient() + var client splcommon.ControllerClient = getConvertedClient(c) + var waiter sync.WaitGroup + + localInstallCtxt := &localScopePlaybookContext{ + worker: &PipelineWorker{ + appSrcName: appSrcSpec.Name, + targetPodName: "splunk-stack1-standalone-0", + sts: sts, + cr: &cr, + appDeployInfo: &enterpriseApi.AppDeploymentInfo{ + AppName: "app1.tgz", + ObjectHash: "abcdef12345", + AuxPhaseInfo: make([]enterpriseApi.PhaseInfo, 1), + }, + afwConfig: &cr.Spec.AppFrameworkConfig, + client: client, + waiter: &waiter, + }, + sem: make(chan struct{}, 1), + podExecClient: mockPodExecClient, + } + + pCtx := premiumAppScopePlaybookContext{ + localCtx: localInstallCtxt, + client: client, + appSrcSpec: appSrcSpec, + cr: &cr, + afwPipeline: &AppInstallPipeline{}, + } + + phaseInfo := &enterpriseApi.PhaseInfo{} + err := handleEsappPostinstall(ctx, &pCtx, phaseInfo) + + if tt.expectedError { + if err == nil { + t.Errorf("Expected error for %q but got none", tt.description) + } + } else { + if err != nil { + t.Errorf("Unexpected error for %q: %v", tt.description, err) + } + } + }) + } +} From 906fbdeef6c4030f4cdf3cdf776a82d7d7f7e0c8 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Thu, 2 Apr 2026 14:17:07 -0500 Subject: [PATCH 04/10] add in check for ConfDeploymentException --- pkg/splunk/enterprise/afwscheduler.go | 11 +++++++++++ pkg/splunk/enterprise/afwscheduler_test.go | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go index c8ccbe9db..0d7d1307d 100644 --- a/pkg/splunk/enterprise/afwscheduler.go +++ b/pkg/splunk/enterprise/afwscheduler.go @@ -1720,6 +1720,17 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C return false, nil } + // If Splunk reports that a deployment job is already running, treat it as still + // in progress rather than resetting state to Pending and re-triggering. Resetting + // would schedule a second push while the first is still holding Splunk's deployment + // lock, producing another ConfDeploymentException and creating a retry storm. + // This can happen in FIPS mode (premature state reset caused by the FIPS banner) or + // after an operator restart mid-push. + if strings.Contains(stdOut, "ConfDeploymentException: Can't start deployment job as one is already running!") { + scopedLog.Info("SHC Bundle Push is already running; will recheck status on next reconcile", "statusFileOutput", stdOut) + return false, nil + } + // this means there was an error in bundle push command err = fmt.Errorf("there was an error in applying SHC Bundle, err=\"%v\"", stdOut) scopedLog.Error(err, "SHC Bundle push status file reported an error while applying bundle") diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go index fd3bebe6c..0c5e7d034 100644 --- a/pkg/splunk/enterprise/afwscheduler_test.go +++ b/pkg/splunk/enterprise/afwscheduler_test.go @@ -4876,6 +4876,26 @@ func TestSHCIsBundlePushComplete(t *testing.T) { expectedError: false, description: "FIPS banner before success string should still be recognized as complete", }, + { + name: "ConfDeploymentException alone - treated as still in progress", + catStdOut: "ConfDeploymentException: Can't start deployment job as one is already running!", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "ConfDeploymentException must not reset state to Pending; treat as still in progress to avoid retry storm", + }, + { + name: "FIPS banner followed by ConfDeploymentException - treated as still in progress", + catStdOut: "FIPS provider enabled.\nWARNING: Server Certificate Hostname Validation is disabled.\nConfDeploymentException: Can't start deployment job as one is already running!", + catStdErr: "", + catErr: nil, + expectsRemoval: false, + expectedResult: false, + expectedError: false, + description: "FIPS banner plus ConfDeploymentException is the primary FIPS retry-storm scenario; must not reset to Pending", + }, } for _, tt := range tests { From d0c1a5dddf8aefa0e05670d180dee1af34d67e5e Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Thu, 2 Apr 2026 15:16:58 -0500 Subject: [PATCH 05/10] add more time for appframework tests --- test/appframework_aws/c3/appframework_aws_suite_test.go | 2 +- test/appframework_aws/m4/appframework_aws_suite_test.go | 2 +- test/monitoring_console/monitoring_console_suite_test.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/appframework_aws/c3/appframework_aws_suite_test.go b/test/appframework_aws/c3/appframework_aws_suite_test.go index aa1dde42d..a8b4fff22 100644 --- a/test/appframework_aws/c3/appframework_aws_suite_test.go +++ b/test/appframework_aws/c3/appframework_aws_suite_test.go @@ -61,7 +61,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 240 * time.Minute + sc.Timeout = 420 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/appframework_aws/m4/appframework_aws_suite_test.go b/test/appframework_aws/m4/appframework_aws_suite_test.go index aa21c7084..d7d28fd85 100644 --- a/test/appframework_aws/m4/appframework_aws_suite_test.go +++ b/test/appframework_aws/m4/appframework_aws_suite_test.go @@ -56,7 +56,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 240 * time.Minute + sc.Timeout = 420 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/monitoring_console/monitoring_console_suite_test.go b/test/monitoring_console/monitoring_console_suite_test.go index 83bf2060d..b28aa0ead 100644 --- a/test/monitoring_console/monitoring_console_suite_test.go +++ b/test/monitoring_console/monitoring_console_suite_test.go @@ -45,7 +45,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 240 * time.Minute + sc.Timeout = 420 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } From 241db0dc2287e98ea0e9ec145044d690df3f013d Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Fri, 3 Apr 2026 08:48:49 -0500 Subject: [PATCH 06/10] treat ssl as failure unless used with FIPS --- .github/workflows/int-test-workflow.yml | 2 +- pkg/splunk/enterprise/afwscheduler.go | 11 ++++-- pkg/splunk/enterprise/afwscheduler_test.go | 40 +++++++++++++++++----- pkg/splunk/enterprise/names.go | 13 +++++++ 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/.github/workflows/int-test-workflow.yml b/.github/workflows/int-test-workflow.yml index 001a34cee..c93f164e7 100644 --- a/.github/workflows/int-test-workflow.yml +++ b/.github/workflows/int-test-workflow.yml @@ -194,7 +194,7 @@ jobs: mkdir -p ./bin cp /snap/bin/kustomize ./bin/kustomize - name: Run Integration test - timeout-minutes: 240 + timeout-minutes: 420 env: TEST_TIMEOUT: 225m TEST_S3_ACCESS_KEY_ID: ${{ vars.TEST_S3_ACCESS_KEY_ID }} diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go index 0d7d1307d..fd32c74da 100644 --- a/pkg/splunk/enterprise/afwscheduler.go +++ b/pkg/splunk/enterprise/afwscheduler.go @@ -1704,12 +1704,19 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C // redirect all output to the status file, these lines appear in the file before the // actual push result. Treat such content as "still in progress" so we do not // prematurely abort a running push and trigger a retry storm. + // + // IMPORTANT: SSL certificate warnings ("WARNING: Server Certificate ...") are only + // treated as informational when the FIPS provider banner is also present. On non-FIPS + // clusters the Splunk CLI can also emit SSL warnings (e.g. when hostname validation is + // disabled), but if those warnings are the only content in the status file it means the + // push failed silently — we must fall through to error/retry rather than waiting forever. + hasFIPSContent := strings.Contains(stdOut, splunkFIPSProviderBannerStr) hasMeaningfulContent := false for _, line := range strings.Split(stdOut, "\n") { trimmed := strings.TrimSpace(line) if trimmed == "" || - strings.HasPrefix(trimmed, "FIPS provider enabled.") || - strings.HasPrefix(trimmed, "WARNING: Server Certificate") { + strings.HasPrefix(trimmed, splunkFIPSProviderBannerStr) || + (hasFIPSContent && strings.HasPrefix(trimmed, splunkSSLCertWarnStr)) { continue } hasMeaningfulContent = true diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go index 0c5e7d034..a09c05cbb 100644 --- a/pkg/splunk/enterprise/afwscheduler_test.go +++ b/pkg/splunk/enterprise/afwscheduler_test.go @@ -4782,7 +4782,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS provider banner only - treated as still in progress", - catStdOut: "FIPS provider enabled.", + catStdOut: splunkFIPSProviderBannerStr, catStdErr: "", catErr: nil, expectsRemoval: false, @@ -4792,7 +4792,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS banner and WARNING lines only - treated as still in progress", - catStdOut: "FIPS provider enabled.\nWARNING: Server Certificate Validation Disabled\n", + catStdOut: splunkFIPSProviderBannerStr + "\n" + splunkSSLCertWarnStr + " Validation Disabled\n", catStdErr: "", catErr: nil, expectsRemoval: false, @@ -4802,7 +4802,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS banner and blank lines only - treated as still in progress", - catStdOut: "\nFIPS provider enabled.\n\n", + catStdOut: "\n" + splunkFIPSProviderBannerStr + "\n\n", catStdErr: "", catErr: nil, expectsRemoval: false, @@ -4812,7 +4812,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS banner followed by real error content - treated as error", - catStdOut: "FIPS provider enabled.\nError applying bundle: permission denied", + catStdOut: splunkFIPSProviderBannerStr + "\nError applying bundle: permission denied", catStdErr: "", catErr: nil, expectsRemoval: true, @@ -4821,6 +4821,28 @@ func TestSHCIsBundlePushComplete(t *testing.T) { expectedError: true, description: "Meaningful error content after FIPS banner should cause an error", }, + { + name: "SSL WARNING only without FIPS banner - treated as error", + catStdOut: splunkSSLCertWarnStr + " Hostname Validation is disabled.", + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "SSL warning without FIPS banner means a silent failure on non-FIPS clusters; must not hang waiting for a push that already exited", + }, + { + name: "SSL WARNING only without FIPS banner (multiple lines) - treated as error", + catStdOut: splunkSSLCertWarnStr + " Hostname Validation is disabled.\n" + splunkSSLCertWarnStr + " Validation Disabled\n", + catStdErr: "", + catErr: nil, + expectsRemoval: true, + removalStdErr: "", + expectedResult: false, + expectedError: true, + description: "Multiple SSL warnings without FIPS banner must not suppress error detection on non-FIPS clusters", + }, { name: "meaningful error in stdOut - treated as error", catStdOut: "Error while deploying apps", @@ -4867,7 +4889,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS banner preceding success string - complete", - catStdOut: "FIPS provider enabled.\n" + shcBundlePushCompleteStr, + catStdOut: splunkFIPSProviderBannerStr + "\n" + shcBundlePushCompleteStr, catStdErr: "", catErr: nil, expectsRemoval: true, @@ -4888,7 +4910,7 @@ func TestSHCIsBundlePushComplete(t *testing.T) { }, { name: "FIPS banner followed by ConfDeploymentException - treated as still in progress", - catStdOut: "FIPS provider enabled.\nWARNING: Server Certificate Hostname Validation is disabled.\nConfDeploymentException: Can't start deployment job as one is already running!", + catStdOut: splunkFIPSProviderBannerStr + "\n" + splunkSSLCertWarnStr + " Hostname Validation is disabled.\nConfDeploymentException: Can't start deployment job as one is already running!", catStdErr: "", catErr: nil, expectsRemoval: false, @@ -5004,7 +5026,7 @@ func TestHandleEsappPostinstallFipsAware(t *testing.T) { { name: "success with FIPS stderr - no error", stdOut: "Successfully installed", - stdErr: "FIPS provider enabled.", + stdErr: splunkFIPSProviderBannerStr, execErr: nil, expectedError: false, description: "Stderr content alone should not cause failure on FIPS-enabled clusters", @@ -5012,7 +5034,7 @@ func TestHandleEsappPostinstallFipsAware(t *testing.T) { { name: "success with WARNING stderr - no error", stdOut: "Successfully installed", - stdErr: "WARNING: Server Certificate Validation Disabled", + stdErr: splunkSSLCertWarnStr + " Validation Disabled", execErr: nil, expectedError: false, description: "SSL warning in stderr alone should not cause failure", @@ -5028,7 +5050,7 @@ func TestHandleEsappPostinstallFipsAware(t *testing.T) { { name: "exec error with FIPS stderr - error", stdOut: "", - stdErr: "FIPS provider enabled.", + stdErr: splunkFIPSProviderBannerStr, execErr: fmt.Errorf("essinstall failed"), expectedError: true, description: "Exec error takes precedence even when stderr carries only FIPS banner", diff --git a/pkg/splunk/enterprise/names.go b/pkg/splunk/enterprise/names.go index e49782f59..dd232b821 100644 --- a/pkg/splunk/enterprise/names.go +++ b/pkg/splunk/enterprise/names.go @@ -112,6 +112,19 @@ const ( shcBundlePushStatusCheckFile = "/operator-staging/appframework/.shcluster_bundle_status.txt" + // splunkFIPSProviderBannerStr is the line written to stderr by the Splunk CLI at + // startup on FIPS-enabled clusters. Because the bundle push command redirects all + // output (&>) to the status file, this banner can appear in the file before the + // actual push result. + splunkFIPSProviderBannerStr = "FIPS provider enabled." + + // splunkSSLCertWarnStr is the prefix of SSL certificate-related warnings emitted + // by the Splunk CLI to stderr. On FIPS-enabled clusters these appear alongside the + // FIPS banner and must be treated as informational. On non-FIPS clusters an SSL + // warning without a FIPS banner indicates a silent failure and should not suppress + // error detection. + splunkSSLCertWarnStr = "WARNING: Server Certificate" + applyIdxcBundleCmdStr = "/opt/splunk/bin/splunk apply cluster-bundle -auth admin:`cat /mnt/splunk-secrets/password` --skip-validation --answer-yes" idxcShowClusterBundleStatusStr = "/opt/splunk/bin/splunk show cluster-bundle-status -auth admin:`cat /mnt/splunk-secrets/password`" From 81c86771b8e3317398538751c4f8d1937d026246 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Fri, 3 Apr 2026 11:45:49 -0500 Subject: [PATCH 07/10] remove check for ConfDeploymentException --- pkg/splunk/enterprise/afwscheduler.go | 11 ----------- pkg/splunk/enterprise/afwscheduler_test.go | 20 -------------------- 2 files changed, 31 deletions(-) diff --git a/pkg/splunk/enterprise/afwscheduler.go b/pkg/splunk/enterprise/afwscheduler.go index fd32c74da..699a59f3e 100644 --- a/pkg/splunk/enterprise/afwscheduler.go +++ b/pkg/splunk/enterprise/afwscheduler.go @@ -1727,17 +1727,6 @@ func (shcPlaybookContext *SHCPlaybookContext) isBundlePushComplete(ctx context.C return false, nil } - // If Splunk reports that a deployment job is already running, treat it as still - // in progress rather than resetting state to Pending and re-triggering. Resetting - // would schedule a second push while the first is still holding Splunk's deployment - // lock, producing another ConfDeploymentException and creating a retry storm. - // This can happen in FIPS mode (premature state reset caused by the FIPS banner) or - // after an operator restart mid-push. - if strings.Contains(stdOut, "ConfDeploymentException: Can't start deployment job as one is already running!") { - scopedLog.Info("SHC Bundle Push is already running; will recheck status on next reconcile", "statusFileOutput", stdOut) - return false, nil - } - // this means there was an error in bundle push command err = fmt.Errorf("there was an error in applying SHC Bundle, err=\"%v\"", stdOut) scopedLog.Error(err, "SHC Bundle push status file reported an error while applying bundle") diff --git a/pkg/splunk/enterprise/afwscheduler_test.go b/pkg/splunk/enterprise/afwscheduler_test.go index a09c05cbb..b055e5f59 100644 --- a/pkg/splunk/enterprise/afwscheduler_test.go +++ b/pkg/splunk/enterprise/afwscheduler_test.go @@ -4898,26 +4898,6 @@ func TestSHCIsBundlePushComplete(t *testing.T) { expectedError: false, description: "FIPS banner before success string should still be recognized as complete", }, - { - name: "ConfDeploymentException alone - treated as still in progress", - catStdOut: "ConfDeploymentException: Can't start deployment job as one is already running!", - catStdErr: "", - catErr: nil, - expectsRemoval: false, - expectedResult: false, - expectedError: false, - description: "ConfDeploymentException must not reset state to Pending; treat as still in progress to avoid retry storm", - }, - { - name: "FIPS banner followed by ConfDeploymentException - treated as still in progress", - catStdOut: splunkFIPSProviderBannerStr + "\n" + splunkSSLCertWarnStr + " Hostname Validation is disabled.\nConfDeploymentException: Can't start deployment job as one is already running!", - catStdErr: "", - catErr: nil, - expectsRemoval: false, - expectedResult: false, - expectedError: false, - description: "FIPS banner plus ConfDeploymentException is the primary FIPS retry-storm scenario; must not reset to Pending", - }, } for _, tt := range tests { From af2a6e4ea573f10993f5ddf74c13df92b5f8c6fa Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Mon, 6 Apr 2026 08:42:28 -0500 Subject: [PATCH 08/10] increase timeout for test cases --- .github/workflows/int-test-workflow.yml | 2 +- test/appframework_aws/c3/appframework_aws_suite_test.go | 2 +- test/appframework_aws/m4/appframework_aws_suite_test.go | 2 +- test/monitoring_console/monitoring_console_suite_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/int-test-workflow.yml b/.github/workflows/int-test-workflow.yml index c93f164e7..f987f0f37 100644 --- a/.github/workflows/int-test-workflow.yml +++ b/.github/workflows/int-test-workflow.yml @@ -194,7 +194,7 @@ jobs: mkdir -p ./bin cp /snap/bin/kustomize ./bin/kustomize - name: Run Integration test - timeout-minutes: 420 + timeout-minutes: 540 env: TEST_TIMEOUT: 225m TEST_S3_ACCESS_KEY_ID: ${{ vars.TEST_S3_ACCESS_KEY_ID }} diff --git a/test/appframework_aws/c3/appframework_aws_suite_test.go b/test/appframework_aws/c3/appframework_aws_suite_test.go index a8b4fff22..a2da9e3d6 100644 --- a/test/appframework_aws/c3/appframework_aws_suite_test.go +++ b/test/appframework_aws/c3/appframework_aws_suite_test.go @@ -61,7 +61,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 420 * time.Minute + sc.Timeout = 540 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/appframework_aws/m4/appframework_aws_suite_test.go b/test/appframework_aws/m4/appframework_aws_suite_test.go index d7d28fd85..383316078 100644 --- a/test/appframework_aws/m4/appframework_aws_suite_test.go +++ b/test/appframework_aws/m4/appframework_aws_suite_test.go @@ -56,7 +56,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 420 * time.Minute + sc.Timeout = 540 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/monitoring_console/monitoring_console_suite_test.go b/test/monitoring_console/monitoring_console_suite_test.go index b28aa0ead..cdeed5704 100644 --- a/test/monitoring_console/monitoring_console_suite_test.go +++ b/test/monitoring_console/monitoring_console_suite_test.go @@ -45,7 +45,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 420 * time.Minute + sc.Timeout = 540 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } From 954e27cc375409e08d49b1bde73e0801e0a3e402 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Tue, 7 Apr 2026 08:44:17 -0500 Subject: [PATCH 09/10] update timeout for smoke and integration tests --- .github/workflows/int-test-workflow.yml | 2 +- test/appframework_aws/c3/appframework_aws_suite_test.go | 2 +- test/appframework_aws/m4/appframework_aws_suite_test.go | 2 +- test/monitoring_console/monitoring_console_suite_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/int-test-workflow.yml b/.github/workflows/int-test-workflow.yml index f987f0f37..207cb1177 100644 --- a/.github/workflows/int-test-workflow.yml +++ b/.github/workflows/int-test-workflow.yml @@ -194,7 +194,7 @@ jobs: mkdir -p ./bin cp /snap/bin/kustomize ./bin/kustomize - name: Run Integration test - timeout-minutes: 540 + timeout-minutes: 270 env: TEST_TIMEOUT: 225m TEST_S3_ACCESS_KEY_ID: ${{ vars.TEST_S3_ACCESS_KEY_ID }} diff --git a/test/appframework_aws/c3/appframework_aws_suite_test.go b/test/appframework_aws/c3/appframework_aws_suite_test.go index a2da9e3d6..254ce3069 100644 --- a/test/appframework_aws/c3/appframework_aws_suite_test.go +++ b/test/appframework_aws/c3/appframework_aws_suite_test.go @@ -61,7 +61,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 540 * time.Minute + sc.Timeout = 270 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/appframework_aws/m4/appframework_aws_suite_test.go b/test/appframework_aws/m4/appframework_aws_suite_test.go index 383316078..5047478d2 100644 --- a/test/appframework_aws/m4/appframework_aws_suite_test.go +++ b/test/appframework_aws/m4/appframework_aws_suite_test.go @@ -56,7 +56,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 540 * time.Minute + sc.Timeout = 270 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } diff --git a/test/monitoring_console/monitoring_console_suite_test.go b/test/monitoring_console/monitoring_console_suite_test.go index cdeed5704..6746766ab 100644 --- a/test/monitoring_console/monitoring_console_suite_test.go +++ b/test/monitoring_console/monitoring_console_suite_test.go @@ -45,7 +45,7 @@ func TestBasic(t *testing.T) { RegisterFailHandler(Fail) sc, _ := GinkgoConfiguration() - sc.Timeout = 540 * time.Minute + sc.Timeout = 270 * time.Minute RunSpecs(t, "Running "+testSuiteName, sc) } From ff8c98f38501c19f6487054ac06d2707e38cb4b0 Mon Sep 17 00:00:00 2001 From: rlieberman-splunk Date: Tue, 7 Apr 2026 14:09:17 -0500 Subject: [PATCH 10/10] update TEST_TIMEOUT variable --- .github/workflows/int-test-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/int-test-workflow.yml b/.github/workflows/int-test-workflow.yml index 207cb1177..8ffc0ec0c 100644 --- a/.github/workflows/int-test-workflow.yml +++ b/.github/workflows/int-test-workflow.yml @@ -196,7 +196,7 @@ jobs: - name: Run Integration test timeout-minutes: 270 env: - TEST_TIMEOUT: 225m + TEST_TIMEOUT: 255m TEST_S3_ACCESS_KEY_ID: ${{ vars.TEST_S3_ACCESS_KEY_ID }} TEST_S3_SECRET_ACCESS_KEY: ${{ secrets.TEST_S3_SECRET_ACCESS_KEY }} run: |