diff --git a/cmd/mapt/cmd/aws/services/snc.go b/cmd/mapt/cmd/aws/services/snc.go index 0e1d07665..8019e1a7b 100644 --- a/cmd/mapt/cmd/aws/services/snc.go +++ b/cmd/mapt/cmd/aws/services/snc.go @@ -27,6 +27,11 @@ const ( sncProfile = "profile" sncProfileDesc = "comma separated list of profiles to apply on the SNC cluster. Profiles available: virtualization, serverless-serving, serverless-eventing, serverless, servicemesh, ai, nvidia. The ai profile automatically includes servicemesh and serverless-serving as prerequisites and raises the minimum instance size to 16 vCPUs. The nvidia profile installs NFD and the NVIDIA GPU Operator" + + operatorChannel = "operator-channel" + operatorChannelDesc = "override the OLM subscription channel for an operator (--operator-channel serverless-operator=preview,nfd=4.17)" + catalogSource = "catalog-source" + catalogSourceDesc = "override the OLM catalog source with a custom index image (--catalog-source serverless-operator=quay.io/my-org/my-index:latest)" ) func GetOpenshiftSNCCmd() *cobra.Command { @@ -92,7 +97,9 @@ func createSNC() *cobra.Command { PullSecretFile: viper.GetString(pullSecretFile), Timeout: viper.GetString(params.Timeout), ServiceEndpoints: params.NetworkServiceEndpoints(), - Profiles: profiles}); err != nil { + Profiles: profiles, + OperatorChannels: viper.GetStringMapString(operatorChannel), + CatalogSources: viper.GetStringMapString(catalogSource)}); err != nil { return err } return nil @@ -107,6 +114,8 @@ func createSNC() *cobra.Command { flagSet.StringP(params.Timeout, "", "", params.TimeoutDesc) flagSet.StringToStringP(params.Tags, "", nil, params.TagsDesc) flagSet.StringSliceP(sncProfile, "", []string{}, sncProfileDesc) + flagSet.StringToStringP(operatorChannel, "", nil, operatorChannelDesc) + flagSet.StringToStringP(catalogSource, "", nil, catalogSourceDesc) params.AddComputeRequestFlags(flagSet) params.AddSpotFlags(flagSet) params.AddNetworkFlags(flagSet, awsParams.ServiceEndpointsDesc) diff --git 
a/docs/aws/openshift-snc.md b/docs/aws/openshift-snc.md index 3528bd9d5..9c68967c0 100644 --- a/docs/aws/openshift-snc.md +++ b/docs/aws/openshift-snc.md @@ -77,6 +77,56 @@ Multiple profiles can be specified as a comma-separated list (e.g., `--profile v | `nvidia` | Installs the [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/install-gpu-ocp.html) on the cluster. Automatically installs [Node Feature Discovery](https://docs.redhat.com/en/documentation/openshift_container_platform/latest/html/specialized_hardware_and_driver_enablement/psap-node-feature-discovery-operator) (NFD) as a prerequisite and creates a ClusterPolicy with the recommended OpenShift defaults (CRI-O runtime, OCP driver toolkit). The cluster must run on a GPU-capable instance type (e.g. `g4dn`, `g5`, `p4d`).| +### Operator overrides + +Profiles install operators using the default OLM channel (`stable`) and catalog (`redhat-operators`). Two flags allow overriding these per operator, which is useful for testing pre-release operator builds: + +#### `--operator-channel` + +Override the OLM subscription channel for a specific operator: + +```bash +mapt aws openshift-snc create \ + --profile serverless-serving \ + --operator-channel serverless-operator=candidate +``` + +Multiple operators can be overridden at once: + +```bash +--operator-channel serverless-operator=preview,nfd=4.17 +``` + +#### `--catalog-source` + +Use a custom index image instead of the default catalog. 
This creates a `CatalogSource` CR in `openshift-marketplace` and points the operator's subscription to it: + +```bash +mapt aws openshift-snc create \ + --profile nvidia \ + --catalog-source gpu-operator-certified=quay.io/my-team/gpu-operator-index:test-v1.0 +``` + +Both flags can be combined: + +```bash +mapt aws openshift-snc create \ + --profile ai \ + --operator-channel serverless-operator=candidate \ + --catalog-source rhods-operator=quay.io/my-team/rhoai-index:nightly +``` + +When neither flag is provided, operators use the defaults: channel `stable` and catalog `redhat-operators` (unless overridden in the profile definition, e.g. `gpu-operator-certified` and `nfd` use `certified-operators`). + +The keys are operator package names as they appear in OLM. The operators installed by each profile are: + +| Profile | Operator package names | +|---------|----------------------| +| `serverless-serving` / `serverless-eventing` / `serverless` | `serverless-operator` | +| `servicemesh` | `servicemeshoperator3` | +| `ai` | `rhods-operator`, `servicemeshoperator`, `authorino-operator`, `serverless-operator` | +| `nvidia` | `gpu-operator-certified`, `nfd` | + ### Adding new profiles To add a new profile: diff --git a/pkg/provider/aws/action/snc/snc.go b/pkg/provider/aws/action/snc/snc.go index df643c51e..e702344f7 100644 --- a/pkg/provider/aws/action/snc/snc.go +++ b/pkg/provider/aws/action/snc/snc.go @@ -45,8 +45,10 @@ type openshiftSNCRequest struct { pullSecretFile *string serviceEndpoints []string allocationData *allocation.AllocationResult - profiles []string - diskSize *int + profiles []string + operatorChannels map[string]string + catalogSources map[string]string + diskSize *int } func (r *openshiftSNCRequest) validate() error { @@ -67,10 +69,13 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiSNC.SNCArgs) (_ *apiSNC.SNCResult if err != nil { return nil, err } - // Validate profiles + // Validate profiles and operator overrides if err := 
profile.Validate(args.Profiles); err != nil { return nil, err } + if err := profile.ValidateOperatorOverrides(args.OperatorChannels, args.CatalogSources); err != nil { + return nil, err + } // Compose request prefix := util.If(len(args.Prefix) > 0, args.Prefix, "main") r := openshiftSNCRequest{ @@ -82,8 +87,10 @@ func Create(mCtxArgs *mc.ContextArgs, args *apiSNC.SNCArgs) (_ *apiSNC.SNCResult pullSecretFile: &args.PullSecretFile, timeout: &args.Timeout, serviceEndpoints: args.ServiceEndpoints, - profiles: args.Profiles, - diskSize: args.ComputeRequest.DiskSize} + profiles: args.Profiles, + operatorChannels: args.OperatorChannels, + catalogSources: args.CatalogSources, + diskSize: args.ComputeRequest.DiskSize} if args.Spot != nil { r.spot = args.Spot.Spot } @@ -290,10 +297,12 @@ func (r *openshiftSNCRequest) deploy(ctx *pulumi.Context) error { deletedWith = c.AutoscalingGroup } if err := profile.Deploy(ctx, r.profiles, &profile.DeployArgs{ - K8sProvider: k8sProvider, - Kubeconfig: kubeconfig, - Prefix: *r.prefix, - DeletedWith: deletedWith, + K8sProvider: k8sProvider, + Kubeconfig: kubeconfig, + Prefix: *r.prefix, + DeletedWith: deletedWith, + OperatorChannels: r.operatorChannels, + CatalogSources: r.catalogSources, }); err != nil { return err } diff --git a/pkg/target/service/snc/api.go b/pkg/target/service/snc/api.go index cc1a71325..c3de13b36 100644 --- a/pkg/target/service/snc/api.go +++ b/pkg/target/service/snc/api.go @@ -50,7 +50,9 @@ type SNCArgs struct { Spot *spotTypes.SpotArgs Timeout string ServiceEndpoints []string - Profiles []string + Profiles []string + OperatorChannels map[string]string + CatalogSources map[string]string } type SNCResults struct { diff --git a/pkg/target/service/snc/profile/operator.go b/pkg/target/service/snc/profile/operator.go index f13b85956..58e5979d7 100644 --- a/pkg/target/service/snc/profile/operator.go +++ b/pkg/target/service/snc/profile/operator.go @@ -59,8 +59,18 @@ func installOperator(ctx *pulumi.Context, args 
*DeployArgs, oi operatorInstall) catalogSource = catalogSourceRedHat } + if override, ok := args.OperatorChannels[oi.packageName]; ok { + channel = override + } + if cs, ok := args.catalogSourceCRs[oi.packageName]; ok { + catalogSource = cs.Name + } + deps := append([]pulumi.Resource{}, args.Deps...) deps = append(deps, oi.extraDeps...) + if cs, ok := args.catalogSourceCRs[oi.packageName]; ok { + deps = append(deps, cs.Resource) + } // If ogName is provided, create a dedicated namespace and OperatorGroup. if oi.ogName != "" { diff --git a/pkg/target/service/snc/profile/profile.go b/pkg/target/service/snc/profile/profile.go index a7f7f59f4..9fab4db06 100644 --- a/pkg/target/service/snc/profile/profile.go +++ b/pkg/target/service/snc/profile/profile.go @@ -1,11 +1,13 @@ package profile import ( + "crypto/sha256" "fmt" "maps" "slices" "github.com/pulumi/pulumi-kubernetes/sdk/v4/go/kubernetes" + "github.com/pulumi/pulumi-kubernetes/sdk/v4/go/kubernetes/apiextensions" corev1 "github.com/pulumi/pulumi-kubernetes/sdk/v4/go/kubernetes/core/v1" metav1 "github.com/pulumi/pulumi-kubernetes/sdk/v4/go/kubernetes/meta/v1" "github.com/pulumi/pulumi/sdk/v3/go/pulumi" @@ -63,6 +65,18 @@ type DeployArgs struct { // so that Pulumi skips deleting them individually during destroy — the // resources disappear when the VM is terminated. DeletedWith pulumi.Resource + // OperatorChannels maps operator packageName to an OLM channel override. + OperatorChannels map[string]string + // CatalogSources maps operator packageName to a custom index image URL. + CatalogSources map[string]string + + // catalogSourceCRs maps packageName to the CatalogSource CR info. 
+	catalogSourceCRs map[string]catalogSourceInfo
+}
+
+type catalogSourceInfo struct {
+	Name     string
+	Resource pulumi.Resource
 }
 
 // Validate checks that all requested profiles are supported and
@@ -88,6 +102,10 @@ func Validate(profiles []string) error {
 // The AI profile implicitly brings in Service Mesh v2 (Maistra) and
 // serverless-serving as prerequisites for Kserve.
 func Deploy(ctx *pulumi.Context, profiles []string, args *DeployArgs) error {
+	if err := args.ensureCatalogSources(ctx); err != nil {
+		return err
+	}
+
 	needServing := false
 	needEventing := false
 	needAI := false
@@ -194,6 +212,59 @@ func (a *DeployArgs) newNamespace(ctx *pulumi.Context, name string, nsName pulum
 		a.k8sOpts(extra...)...)
 }
 
+// ValidateOperatorOverrides rejects --operator-channel and --catalog-source
+// entries whose operator package name or value (channel / index image) is empty.
+// It does not check that a package name belongs to any requested profile.
+func ValidateOperatorOverrides(channels, catalogs map[string]string) error {
+	for pkg, ch := range channels {
+		if pkg == "" || ch == "" {
+			return fmt.Errorf("invalid --operator-channel: both package name and channel must be non-empty (got %q=%q)", pkg, ch)
+		}
+	}
+	for pkg, img := range catalogs {
+		if pkg == "" || img == "" {
+			return fmt.Errorf("invalid --catalog-source: both package name and index image must be non-empty (got %q=%q)", pkg, img)
+		}
+	}
+	return nil
+}
+
+// ensureCatalogSources creates CatalogSource CRs for any custom index images
+// specified via --catalog-source, so that operator subscriptions can reference them.
+func (a *DeployArgs) ensureCatalogSources(ctx *pulumi.Context) error {
+	if len(a.CatalogSources) == 0 {
+		return nil
+	}
+	a.catalogSourceCRs = make(map[string]catalogSourceInfo, len(a.CatalogSources))
+	for pkg, indexImage := range a.CatalogSources {
+		// Short SHA-256 prefix of the image: a different index image gets a different CR name.
+		hash := fmt.Sprintf("%x", sha256.Sum256([]byte(indexImage)))[:8]
+		csName := fmt.Sprintf("mapt-cs-%s-%s", pkg, hash)
+		cs, err := apiextensions.NewCustomResource(ctx, csName, &apiextensions.CustomResourceArgs{
+			ApiVersion: pulumi.String("operators.coreos.com/v1alpha1"),
+			Kind:       pulumi.String("CatalogSource"),
+			Metadata: &metav1.ObjectMetaArgs{
+				Name:      pulumi.String(csName),
+				Namespace: pulumi.String("openshift-marketplace"),
+			},
+			OtherFields: map[string]any{
+				"spec": map[string]any{
+					"sourceType":  "grpc",
+					"image":       indexImage,
+					"displayName": fmt.Sprintf("MAPT custom catalog for %s", pkg),
+					"publisher":   "MAPT",
+				},
+			},
+		}, a.k8sOpts(pulumi.DependsOn(a.Deps))...)
+		if err != nil {
+			return fmt.Errorf("creating CatalogSource %q for operator %q: %w", csName, pkg, err)
+		}
+		// installOperator reads this to set the Subscription source and add a dependency.
+		a.catalogSourceCRs[pkg] = catalogSourceInfo{Name: csName, Resource: cs}
+	}
+	return nil
+}
+
 // k8sOpts returns the common Pulumi resource options for K8s resources:
 // the K8s provider and (when set) the DeletedWith option. Extra options
 // (e.g. DependsOn) can be appended.