Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 106 additions & 27 deletions pkg/cvo/availableupdates.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,21 @@ import (
"time"

"github.com/blang/semver/v4"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/google/uuid"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
"github.com/openshift/cluster-version-operator/pkg/cincinnati"
"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
"github.com/openshift/cluster-version-operator/pkg/internal"
)

const noArchitecture string = "NoArchitecture"
Expand Down Expand Up @@ -50,6 +55,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
channel := config.Spec.Channel
desiredArch := optr.getDesiredArchitecture(config.Spec.DesiredUpdate)
currentArch := optr.getCurrentArchitecture()
acceptRisks := sets.New[string]()
if config.Spec.DesiredUpdate != nil {
for _, risk := range config.Spec.DesiredUpdate.AcceptRisks {
acceptRisks.Insert(risk.Name)
}
}

// updates are only checked at most once per minimumUpdateCheckInterval or if the generation changes
optrAvailableUpdates := optr.getAvailableUpdates()
Expand Down Expand Up @@ -129,6 +140,9 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
optrAvailableUpdates.UpdateService = updateService
optrAvailableUpdates.Channel = channel
optrAvailableUpdates.Architecture = desiredArch
optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
optrAvailableUpdates.AcceptRisks = acceptRisks
optrAvailableUpdates.RiskConditions = map[string][]metav1.Condition{}
optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry
optrAvailableUpdates.Condition = condition

Expand Down Expand Up @@ -167,9 +181,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
}

type availableUpdates struct {
UpdateService string
Channel string
Architecture string
UpdateService string
Channel string
Architecture string
ShouldReconcileAcceptRisks func() bool
AcceptRisks sets.Set[string]

// LastAttempt records the time of the most recent attempt at update
// retrieval, regardless of whether it was successful.
Expand All @@ -192,6 +208,9 @@ type availableUpdates struct {
ConditionRegistry clusterconditions.ConditionRegistry

Condition configv1.ClusterOperatorStatusCondition

// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
RiskConditions map[string][]metav1.Condition
}

func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
Expand Down Expand Up @@ -291,14 +310,17 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
}

u := &availableUpdates{
UpdateService: optr.availableUpdates.UpdateService,
Channel: optr.availableUpdates.Channel,
Architecture: optr.availableUpdates.Architecture,
LastAttempt: optr.availableUpdates.LastAttempt,
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
Current: *optr.availableUpdates.Current.DeepCopy(),
ConditionRegistry: optr.availableUpdates.ConditionRegistry, // intentionally not a copy, to preserve cache state
Condition: optr.availableUpdates.Condition,
UpdateService: optr.availableUpdates.UpdateService,
Channel: optr.availableUpdates.Channel,
Architecture: optr.availableUpdates.Architecture,
ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
AcceptRisks: optr.availableUpdates.AcceptRisks,
RiskConditions: optr.availableUpdates.RiskConditions,
LastAttempt: optr.availableUpdates.LastAttempt,
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
Current: *optr.availableUpdates.Current.DeepCopy(),
ConditionRegistry: optr.availableUpdates.ConditionRegistry, // intentionally not a copy, to preserve cache state
Condition: optr.availableUpdates.Condition,
}

if optr.availableUpdates.Updates != nil {
Expand Down Expand Up @@ -426,8 +448,11 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
vj := semver.MustParse(u.ConditionalUpdates[j].Release.Version)
return vi.GTE(vj)
})
if err := sanityCheck(u.ConditionalUpdates); err != nil {
klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
}
for i, conditionalUpdate := range u.ConditionalUpdates {
condition := evaluateConditionalUpdate(ctx, conditionalUpdate.Risks, u.ConditionRegistry)
condition := evaluateConditionalUpdate(ctx, conditionalUpdate.Risks, u.ConditionRegistry, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions)

if condition.Status == metav1.ConditionTrue {
u.addUpdate(conditionalUpdate.Release)
Expand All @@ -441,6 +466,25 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
}
}

// sanityCheck verifies that risk names across the given conditional updates
// are usable as identifiers: every risk must have a non-blank name, and two
// risks that share a name must be identical apart from their Conditions
// (which legitimately differ between occurrences). All violations are
// collected and returned as a single aggregate error; a nil return means the
// updates passed the check.
func sanityCheck(updates []configv1.ConditionalUpdate) error {
	risks := map[string]configv1.ConditionalUpdateRisk{}
	var errs []error
	for _, update := range updates {
		for _, risk := range update.Risks {
			// A blank name can never be inserted into the map below, so
			// checking it first does not change which errors are reported.
			if strings.TrimSpace(risk.Name) == "" {
				errs = append(errs, fmt.Errorf("found invalid name on risk %v", risk))
				continue
			}
			if seen, ok := risks[risk.Name]; ok {
				// Include the computed diff so the collision is actionable
				// instead of forcing readers to eyeball two large structs.
				if diff := cmp.Diff(seen, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" {
					errs = append(errs, fmt.Errorf("found collision on risk %s: %v and %v (diff: %s)", risk.Name, seen, risk, diff))
				}
			} else {
				risks[risk.Name] = risk
			}
		}
	}
	return utilerrors.NewAggregate(errs)
}

func (u *availableUpdates) addUpdate(release configv1.Release) {
for _, update := range u.Updates {
if update.Image == release.Image {
Expand Down Expand Up @@ -478,33 +522,47 @@ func newRecommendedStatus(now, want metav1.ConditionStatus) metav1.ConditionStat
}

const (
recommendedReasonRisksNotExposed = "NotExposedToRisks"
recommendedReasonEvaluationFailed = "EvaluationFailed"
recommendedReasonMultiple = "MultipleReasons"
recommendedReasonRisksNotExposed = "NotExposedToRisks"
recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
recommendedReasonEvaluationFailed = "EvaluationFailed"
recommendedReasonMultiple = "MultipleReasons"

// recommendedReasonExposed is used instead of the original name if it does
// not match the pattern for a valid k8s condition reason.
recommendedReasonExposed = "ExposedToRisks"

riskConditionReasonEvaluationFailed = "EvaluationFailed"
riskConditionReasonMatch = "Match"
riskConditionReasonNotMatch = "NotMatch"
)

// Reasons follow same pattern as k8s Condition Reasons
// https://github.com/openshift/api/blob/59fa376de7cb668ddb95a7ee4e9879d7f6ca2767/vendor/k8s.io/apimachinery/pkg/apis/meta/v1/types.go#L1535-L1536
var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$`)

func newRecommendedReason(now, want string) string {
switch now {
case recommendedReasonRisksNotExposed:
switch {
case now == recommendedReasonRisksNotExposed ||
now == recommendedReasonAllExposedRisksAccepted ||
now == want:
return want
case want:
case want == recommendedReasonRisksNotExposed:
return now
default:
return recommendedReasonMultiple
}
}

func evaluateConditionalUpdate(ctx context.Context, risks []configv1.ConditionalUpdateRisk, conditionRegistry clusterconditions.ConditionRegistry) metav1.Condition {
func evaluateConditionalUpdate(
ctx context.Context,
risks []configv1.ConditionalUpdateRisk,
conditionRegistry clusterconditions.ConditionRegistry,
acceptRisks sets.Set[string],
shouldReconcileAcceptRisks func() bool,
riskConditions map[string][]metav1.Condition,
) metav1.Condition {
recommended := metav1.Condition{
Type: ConditionalUpdateConditionTypeRecommended,
Type: internal.ConditionalUpdateConditionTypeRecommended,
Status: metav1.ConditionTrue,
// FIXME: ObservedGeneration? That would capture upstream/channel, but not necessarily the currently-reconciling version.
Reason: recommendedReasonRisksNotExposed,
Expand All @@ -513,18 +571,39 @@ func evaluateConditionalUpdate(ctx context.Context, risks []configv1.Conditional

var errorMessages []string
for _, risk := range risks {
riskCondition := metav1.Condition{
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
Status: metav1.ConditionFalse,
Reason: riskConditionReasonNotMatch,
}
if match, err := conditionRegistry.Match(ctx, risk.MatchingRules); err != nil {
msg := unknownExposureMessage(risk, err)
recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionUnknown)
recommended.Reason = newRecommendedReason(recommended.Reason, recommendedReasonEvaluationFailed)
errorMessages = append(errorMessages, unknownExposureMessage(risk, err))
errorMessages = append(errorMessages, msg)
riskCondition.Status = metav1.ConditionUnknown
riskCondition.Reason = riskConditionReasonEvaluationFailed
riskCondition.Message = msg
} else if match {
recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionFalse)
wantReason := recommendedReasonExposed
if reasonPattern.MatchString(risk.Name) {
wantReason = risk.Name
riskCondition.Status = metav1.ConditionTrue
riskCondition.Reason = riskConditionReasonMatch
if shouldReconcileAcceptRisks() && acceptRisks.Has(risk.Name) {
recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionTrue)
recommended.Reason = newRecommendedReason(recommended.Reason, recommendedReasonAllExposedRisksAccepted)
recommended.Message = "The update is recommended, because either risk does not apply to this cluster or it is accepted by cluster admins."
klog.V(2).Infof("Risk with name %q is accepted by the cluster admin and thus not in the evaluation of conditional update", risk.Name)
} else {
recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionFalse)
wantReason := recommendedReasonExposed
if reasonPattern.MatchString(risk.Name) {
wantReason = risk.Name
}
recommended.Reason = newRecommendedReason(recommended.Reason, wantReason)
errorMessages = append(errorMessages, fmt.Sprintf("%s %s", risk.Message, risk.URL))
}
recommended.Reason = newRecommendedReason(recommended.Reason, wantReason)
errorMessages = append(errorMessages, fmt.Sprintf("%s %s", risk.Message, risk.URL))
}
if _, ok := riskConditions[risk.Name]; !ok {
riskConditions[risk.Name] = []metav1.Condition{riskCondition}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm worried that if this stays conditional on !ok, we might fail to update the cached riskConditions if the risk transitions from matching to not-matching, etc. I'd expect the new risk standardization allows us to simplify, or maybe completely drop, the evaluateConditionalUpdate function, with something like:

  1. Parse the update advice to collect all the next-hop target versions and the relevant risk map.
  2. Rank the risks in the riskConditions risk map to evaluate the ones that apply to the longest-hop targets first.
  3. Evaluate the risks in riskConditions in ranked order, to see if they apply to the cluster. Later risks in the risk map may initially fail to evaluate as we self-throttle to avoid overloading Prometheus.
  4. For each update target, look up any associated risks in riskConditions, and construct an appropriate Recommended message.

That avoids needing to drop down to the per-target level when managing the riskConditions cache. The only way per-target information affects riskConditions management would be in ranking the evaluation order. Does that seem helpful?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about this too while writing the code.
I wanted to avoid meta changes in the first pull of this epic.
Let me handle this one in a follow-up pull.

}
}
if len(errorMessages) > 0 {
Expand Down
Loading