Conversation
CLA Assistant Lite bot: All contributors have signed the COC ✍️ ✅

I have read the CLA Document and I hereby sign the CLA

I have read the Code of Conduct and I hereby accept the Terms

Pull Request Test Coverage Report for Build 21975003576 (Details)
💛 - Coveralls
I think it might be worth adding/updating the docs.
| "sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
| ) | ||
|
|
||
| var _ = Describe("Telemetry Controller", func() { |
We should have some controller test cases.
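For example, a minimal Ginkgo case could look like the sketch below. This is only a sketch: the suite-level ctx and telemetryReconciler variables are hypothetical names assumed from a typical envtest suite setup, and the "ignore a missing ConfigMap" behavior asserted here is an assumption about the controller.
var _ = Describe("Telemetry Controller", func() {
	Context("when reconciling the telemetry ConfigMap", func() {
		It("ignores a request for a ConfigMap that does not exist", func() {
			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: "default",
				Name:      "no-such-configmap",
			}}
			// Reconcile should treat a missing object as "nothing to do",
			// not as an error.
			_, err := telemetryReconciler.Reconcile(ctx, req)
			Expect(err).NotTo(HaveOccurred())
		})
	})
})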
pkg/splunk/enterprise/telemetry.go (Outdated)
| scopedLog.Info("Updated last transmission time in configmap", "newStatus", cm.Data[telStatusKey]) | ||
| } | ||
|
|
||
| func collectResourceTelData(resources corev1.ResourceRequirements, data map[string]string) { |
Should we refactor this code to make it much easier to read, or use generics? An example:
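For completeness, here are the supporting types this example assumes; crListHandler and crItem are not shown in the snippet, so their fields are inferred from how they are used below.
// Supporting types assumed by the refactor (fields inferred from usage).
type crListHandler struct {
	kind        string
	checkTelApp bool
	listFunc    func(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error)
}

type crItem struct {
	name      string
	namespace string
	kind      string
	resources interface{} // holds corev1.ResourceRequirements
	hasTelApp bool
	cr        splcommon.MetaObject
}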
func collectDeploymentTelDataRefactored(ctx context.Context, client splcommon.ControllerClient, deploymentData map[string]interface{}) map[string][]splcommon.MetaObject {
reqLogger := log.FromContext(ctx)
scopedLog := reqLogger.WithName("collectDeploymentTelData")
crWithTelAppList := make(map[string][]splcommon.MetaObject)
scopedLog.Info("Start collecting deployment telemetry data")
// Define all CR handlers in a slice
handlers := []crListHandler{
{kind: "Standalone", listFunc: listStandalones, checkTelApp: true},
{kind: "LicenseManager", listFunc: listLicenseManagers, checkTelApp: true},
{kind: "LicenseMaster", listFunc: listLicenseMasters, checkTelApp: true},
{kind: "SearchHeadCluster", listFunc: listSearchHeadClusters, checkTelApp: true},
{kind: "IndexerCluster", listFunc: listIndexerClusters, checkTelApp: false},
{kind: "ClusterManager", listFunc: listClusterManagers, checkTelApp: true},
{kind: "ClusterMaster", listFunc: listClusterMasters, checkTelApp: true},
{kind: "MonitoringConsole", listFunc: listMonitoringConsoles, checkTelApp: false},
}
// Process each CR type using the same logic
for _, handler := range handlers {
processCRType(ctx, client, handler, deploymentData, crWithTelAppList, scopedLog)
}
return crWithTelAppList
}
// processCRType is the common processing logic for all CR types
func processCRType(
ctx context.Context,
client splcommon.ControllerClient,
handler crListHandler,
deploymentData map[string]interface{},
crWithTelAppList map[string][]splcommon.MetaObject,
scopedLog logr.Logger, // logr.Logger from "github.com/go-logr/logr"
) {
items, err := handler.listFunc(ctx, client)
if err != nil {
scopedLog.Error(err, "Failed to list objects", "kind", handler.kind)
return
}
if len(items) == 0 {
return
}
// Create per-kind data map
perKindData := make(map[string]interface{})
deploymentData[handler.kind] = perKindData
// Process each item
for _, item := range items {
// scopedLog.Info("Collecting data", "kind", item.kind, "name", item.name, "namespace", item.namespace)
crResourceData := make(map[string]string)
perKindData[item.name] = crResourceData
// Collect resource telemetry data
if resources, ok := item.resources.(corev1.ResourceRequirements); ok {
collectResourceTelData(resources, crResourceData)
}
// Add to telemetry app list if applicable
if handler.checkTelApp && item.hasTelApp {
crWithTelAppList[handler.kind] = append(crWithTelAppList[handler.kind], item.cr)
} else if handler.checkTelApp && !item.hasTelApp {
scopedLog.Info("Telemetry app is not installed for this CR", "kind", item.kind, "name", item.name)
}
}
}
// List functions for each CR type - these extract the common pattern
func listStandalones(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.StandaloneList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listLicenseManagers(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.LicenseManagerList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listLicenseMasters(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApiV3.LicenseMasterList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listSearchHeadClusters(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.SearchHeadClusterList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listIndexerClusters(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.IndexerClusterList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: false, // IndexerClusters don't track TelAppInstalled
cr: cr,
})
}
return items, nil
}
func listClusterManagers(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.ClusterManagerList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listClusterMasters(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApiV3.ClusterMasterList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: cr.Status.TelAppInstalled,
cr: cr,
})
}
return items, nil
}
func listMonitoringConsoles(ctx context.Context, client splcommon.ControllerClient) ([]crItem, error) {
var list enterpriseApi.MonitoringConsoleList
err := client.List(ctx, &list)
if err != nil {
return nil, err
}
items := make([]crItem, 0, len(list.Items))
for i := range list.Items {
cr := &list.Items[i]
items = append(items, crItem{
name: cr.GetName(),
namespace: cr.GetNamespace(),
kind: cr.Kind,
resources: cr.Spec.CommonSplunkSpec.Resources,
hasTelApp: false, // MonitoringConsoles don't track TelAppInstalled
cr: cr,
})
}
return items, nil
}
Thanks for the code suggestion. I have made the change.
The code has 47% test coverage; let's try to get it to 90%.
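For example, a small unit test along these lines would exercise processCRType without a real cluster. This is a sketch only: it assumes the crListHandler/crItem shapes from the refactor above, uses logr.Discard() for the logger, and passes a nil client because the stubbed listFunc never touches it.
func TestProcessCRType(t *testing.T) {
	ctx := context.TODO()
	handler := crListHandler{
		kind:        "Standalone",
		checkTelApp: true,
		// Stub list function so no Kubernetes client is required.
		listFunc: func(ctx context.Context, c splcommon.ControllerClient) ([]crItem, error) {
			return []crItem{{
				name:      "s1",
				namespace: "default",
				kind:      "Standalone",
				resources: corev1.ResourceRequirements{},
				hasTelApp: true,
			}}, nil
		},
	}

	deploymentData := map[string]interface{}{}
	crWithTelAppList := map[string][]splcommon.MetaObject{}
	processCRType(ctx, nil, handler, deploymentData, crWithTelAppList, logr.Discard())

	if _, ok := deploymentData["Standalone"]; !ok {
		t.Error("expected a Standalone entry in deploymentData")
	}
	if got := len(crWithTelAppList["Standalone"]); got != 1 {
		t.Errorf("expected 1 Standalone CR with the telemetry app, got %d", got)
	}
}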
telAppReloadString = "curl -k -u admin:`cat /mnt/splunk-secrets/password` https://localhost:8089/services/apps/local/_reload"

// Name of the telemetry configmap: <namePrefix>-manager-telemetry
telConfigMapTemplateStr = "%smanager-telemetry"
Yes. This config map is not accessed by multiple CRs.
// SetupWithManager sets up the controller with the Manager.
func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
Should you be watching for CR creation and processing CRs only when a new CR is created?
Can you implement an event-driven approach where the telemetry controller watches the actual Splunk custom resources and only triggers reconciliation when:
- A new CR is created (Standalone, ClusterMaster, IndexerCluster, SearchHeadCluster, etc.)
- An existing CR is modified (configuration changes, scaling events)
- A CR is deleted (to track removal events)
Benefits of This Approach
1. Reduced Resource Consumption
- No periodic reconciliation when nothing has changed
- CPU and memory usage only when actual events occur
- More efficient for clusters with stable configurations
2. Immediate Response
- Telemetry collected immediately when CRs are created/modified
- No waiting for the next 10-minute requeue cycle
- More accurate timestamps for resource creation events
3. Better Alignment with Kubernetes Best Practices
- Controllers should react to resource changes, not poll
- Leverages Kubernetes watch mechanism efficiently
- Reduces unnecessary API server load
4. Clearer Intent
- The controller's purpose becomes explicit: "Send telemetry when Splunk resources change"
- Easier to understand and maintain
- Better for debugging (logs show which CR triggered telemetry)
Proposed Implementation Changes
Current Setup (from SetupWithManager):
func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.ConfigMap{}). // Watching ConfigMaps
WithEventFilter(predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
return r.isTelemetryConfigMap(e.Object)
},
// ... more predicates
}).
WithOptions(controller.Options{
MaxConcurrentReconciles: 1,
}).
Complete(r)
}

Suggested Alternative:
func (r *TelemetryReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
// Watch Splunk CRs directly
For(&enterprisev4.Standalone{}).
Owns(&enterprisev4.ClusterMaster{}).
Owns(&enterprisev4.IndexerCluster{}).
Owns(&enterprisev4.SearchHeadCluster{}).
// ... other Splunk CRs
WithEventFilter(predicate.Funcs{
CreateFunc: func(e event.CreateEvent) bool {
// Trigger on CR creation
return true
},
UpdateFunc: func(e event.UpdateEvent) bool {
// Optionally trigger on significant updates
return shouldCollectTelemetry(e.ObjectOld, e.ObjectNew)
},
DeleteFunc: func(e event.DeleteEvent) bool {
// Optionally track deletions
return false
},
}).
WithOptions(controller.Options{
MaxConcurrentReconciles: 1,
}).
Complete(r)
}

Modified Reconcile Method:
func (r *TelemetryReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := r.Log.WithValues("telemetry", req.NamespacedName)
// Fetch the actual Splunk CR that triggered this reconciliation
// Determine CR type and get relevant telemetry data
// Collect telemetry for THIS specific resource
telemetryData := r.collectResourceTelemetry(ctx, req)
// Send telemetry immediately (no requeue needed!)
if err := r.applyTelemetryFn(ctx, telemetryData); err != nil {
log.Error(err, "Failed to send telemetry")
// Only requeue on actual errors, not as a periodic timer
// (returning the error is enough; controller-runtime retries with backoff)
return ctrl.Result{}, err
}
// Done! No automatic requeue
return ctrl.Result{}, nil
}

Additional Considerations
1. Rate Limiting
If watching CRs directly, consider:
- Implementing rate limiting to avoid telemetry spam
- Batching multiple CR events within a time window
- Using a "debounce" mechanism for rapid successive changes (see the sketch below)
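A minimal debounce sketch, as a hypothetical helper that is not part of the PR; the 5-minute window and the wiring into the predicate are assumptions (imports: "sync", "time").
// debounce drops events that arrive within `window` of the last accepted one,
// so a burst of CR updates triggers a single telemetry reconciliation.
type debounce struct {
	mu     sync.Mutex
	last   time.Time
	window time.Duration
}

func (d *debounce) allow() bool {
	d.mu.Lock()
	defer d.mu.Unlock()
	if time.Since(d.last) < d.window {
		return false
	}
	d.last = time.Now()
	return true
}

// Wiring it into the event filter from the suggestion above:
//   deb := &debounce{window: 5 * time.Minute}
//   UpdateFunc: func(e event.UpdateEvent) bool { return deb.allow() },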
2. Daily Telemetry Requirement
The PR mentions "collecting and sending telemetry data once per day". If this is the actual requirement:
Option A: Use a CronJob instead of a controller
apiVersion: batch/v1
kind: CronJob
metadata:
  name: splunk-operator-telemetry
spec:
  schedule: "0 2 * * *" # Daily at 2 AM
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: telemetry-collector
            # Collect and send telemetry

Option B: If a controller is needed, add timestamp-based logic:
// Check last telemetry send time
lastSent := getLastTelemetrySendTime()
if time.Since(lastSent) < 24*time.Hour {
// Skip telemetry, already sent today
return ctrl.Result{}, nil
}
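As a sketch of how that timestamp check could be backed by the existing ConfigMap: the assumptions here are that the value stored under telStatusKey is an RFC3339 timestamp (consistent with the "Updated last transmission time in configmap" log above) and that this variant of getLastTelemetrySendTime takes the client and the ConfigMap's NamespacedName as parameters.
// Hypothetical helper, not in the PR: reads the last telemetry send time from
// the telemetry ConfigMap. A missing ConfigMap or unparsable value is treated
// as "never sent", so telemetry is collected on the next reconcile.
func getLastTelemetrySendTime(ctx context.Context, c client.Client, key types.NamespacedName) time.Time {
	var cm corev1.ConfigMap
	if err := c.Get(ctx, key, &cm); err != nil {
		return time.Time{}
	}
	ts, err := time.Parse(time.RFC3339, cm.Data[telStatusKey])
	if err != nil {
		return time.Time{}
	}
	return ts
}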
Description.
This PR implements the SOK Telemetry enhancement. ERD:
https://cisco-my.sharepoint.com/:w:/p/mqiu/IQBoVUuEEY1SR4rDjbja0iPuAeN5dxFG-K-ZPpvO6RoWJp0?e=n5R1Ow
What does this PR have in it?.
Periodically collect (once per day) and send SOK telemetry, which includes:
a. SOK version.
b. CPU/Memory settings (limit and request) of containers, including standalone, searchheadcluster, indexercluster, clustermaster, clustermanager, licensemaster and licensemanager.
c. LicenseInfo (Splunk license ID and license type).
Key Changes.
Highlight the updates in specific files
Testing and Verification.
Tested on s1, c3 and m4.
How did you test these changes? What automated tests are added?.
Added telemetry verification to existing s1, c3 and m4 tests.
Related Issues
Jira tickets, GitHub issues, Support tickets...
https://splunk.atlassian.net/browse/CSPL-4371.
PR Checklist