diff --git a/chart/infra-server/static/flavors.yaml b/chart/infra-server/static/flavors.yaml index 84db1829f..98500e448 100644 --- a/chart/infra-server/static/flavors.yaml +++ b/chart/infra-server/static/flavors.yaml @@ -329,6 +329,35 @@ Defines a list of capabilities to explicitly enable. These capabilities are enabled in addition to the capabilities specified in the baseline capability set. Example: `["DeploymentConfig", "ImageRegistry"]` + - name: install-virt + description: Install OpenShift Virtualization operator with VSOCK and create a VM + value: false + kind: optional + help: | + When true, an additional n2-standard-8 worker node is added and configured + with OpenShift Virtualization (KubeVirt) including VSOCK support. A single + VM is created using the OS specified by the vm-os parameter. + + - name: vm-os + description: OS for the virtual machine (rhel9, rhel10) + value: rhel9 + kind: optional + help: | + The RHEL version for the virtual machine container disk. + Valid values: rhel9, rhel10. + The image used will be quay.io/rhacs-eng/vm-images:<vm-os>-dnf-primed-latest. + + - name: virt-node-dedicated + description: Taint the virt node so only VMs are scheduled on it + value: false + kind: optional + help: | + When true, the dedicated virt worker node is tainted with + node-role.kubernetes.io/virt:NoSchedule so that only VM workloads + (which have a matching toleration) are scheduled on it. + When false, the virt node also accepts regular ACS/OCP workloads. + Only relevant when install-virt is true. + artifacts: - name: kubeconfig description: Kube config for connecting to this cluster @@ -359,6 +388,9 @@ - name: cluster-console-password description: The password to login at the openshift console + - name: vm-access + description: Credentials and access commands for the created VM + ###################### # openshift-4-demo # ###################### @@ -651,6 +683,35 @@ Defines a list of capabilities to explicitly enable. 
These capabilities are enabled in addition to the capabilities specified in the baseline capability set. Example: `["DeploymentConfig", "ImageRegistry"]` + - name: install-virt + description: Install OpenShift Virtualization operator with VSOCK and create a VM + value: false + kind: optional + help: | + When true, an additional n2-standard-8 worker node is added and configured + with OpenShift Virtualization (KubeVirt) including VSOCK support. A single + VM is created using the OS specified by the vm-os parameter. + + - name: vm-os + description: OS for the virtual machine (rhel9, rhel10) + value: rhel9 + kind: optional + help: | + The RHEL version for the virtual machine container disk. + Valid values: rhel9, rhel10. + The image used will be quay.io/rhacs-eng/vm-images:<vm-os>-dnf-primed-latest. + + - name: virt-node-dedicated + description: Taint the virt node so only VMs are scheduled on it + value: false + kind: optional + help: | + When true, the dedicated virt worker node is tainted with + node-role.kubernetes.io/virt:NoSchedule so that only VM workloads + (which have a matching toleration) are scheduled on it. + When false, the virt node also accepts regular ACS/OCP workloads. + Only relevant when install-virt is true. 
+ artifacts: - name: kubeconfig description: Kube config for connecting to this cluster @@ -681,6 +742,9 @@ - name: cluster-console-password description: The password to login at the openshift console + - name: vm-access + description: Credentials and access commands for the created VM + ##################### # AWS EKS # ##################### diff --git a/chart/infra-server/static/workflow-openshift-4.yaml b/chart/infra-server/static/workflow-openshift-4.yaml index 5290c3d85..ed73153b2 100644 --- a/chart/infra-server/static/workflow-openshift-4.yaml +++ b/chart/infra-server/static/workflow-openshift-4.yaml @@ -32,6 +32,12 @@ spec: value: "vCurrent" - name: additional-enabled-capabilities value: "" + - name: install-virt + value: "false" + - name: vm-os + value: "rhel9" + - name: virt-node-dedicated + value: "false" volumeClaimTemplates: - metadata: name: data @@ -44,6 +50,9 @@ spec: - name: credentials secret: secretName: openshift-4-gcp-service-account + - name: registry-pull-secret + secret: + secretName: infra-image-registry-pull-secret templates: - name: start @@ -51,6 +60,18 @@ spec: - - name: create template: create + - - name: add-virt-node + template: add-virt-node + when: '{{ "{{" }}workflow.parameters.install-virt{{ "}}" }} == true' + + - - name: install-virt-operator + template: install-virt-operator + when: '{{ "{{" }}workflow.parameters.install-virt{{ "}}" }} == true' + + - - name: create-vm + template: create-vm + when: '{{ "{{" }}workflow.parameters.install-virt{{ "}}" }} == true' + + - - name: gather template: gather @@ -183,6 +204,488 @@ spec: archive: none: {} + - name: add-virt-node + activeDeadlineSeconds: 1200  # must exceed the script's own 300s MachineSet wait and 600s node wait so its timeout diagnostics can print + script: + image: quay.io/stackrox-io/ci:automation-flavors-openshift-4-{{ .Chart.Annotations.automationFlavorsVersion }} + command: [bash] + source: | + set -euo pipefail + export KUBECONFIG=/data/auth/kubeconfig + + DEDICATED='{{ "{{" }}workflow.parameters.virt-node-dedicated{{ "}}" }}' + INFRA_ID=$(oc get infrastructure cluster -o 
jsonpath='{.status.infrastructureName}') + + echo "=== Creating dedicated virt worker node ===" + echo "Dedicated (tainted) mode: $DEDICATED" + echo "Infrastructure ID: $INFRA_ID" + + READY_VIRT=$(oc get nodes -l node-role.kubernetes.io/virt -o json 2>/dev/null | \ + jq '[.items[] | select(.status.conditions[]? | select(.type=="Ready" and .status=="True"))] | length' || echo "0") + if [ "${READY_VIRT:-0}" -gt 0 ]; then + echo "Virt node already exists and is Ready" + oc get nodes -l node-role.kubernetes.io/virt + exit 0 + fi + + # On OCP 4.19+, the installer uses CAPI via a local envtest to provision + # machines, but the running cluster uses legacy machine.openshift.io + # MachineSets. The Machine API operator may need time after cluster creation + # to reconcile MachineSets, so we retry instead of failing immediately. + echo "Waiting for worker MachineSets in openshift-machine-api..." + ms_timeout=300 + ms_elapsed=0 + WORKER_MS="" + while [ -z "$WORKER_MS" ]; do + WORKER_MS=$(oc get machinesets -n openshift-machine-api -o json 2>/dev/null | jq -r ' + .items + | map(select( + .metadata.labels["machine.openshift.io/cluster-api-machine-role"] == "worker" or + .metadata.labels["machine.openshift.io/cluster-api-machine-type"] == "worker" + )) + | sort_by(.metadata.name) + | .[0].metadata.name // empty + ' 2>/dev/null || echo "") + if [ -n "$WORKER_MS" ]; then + break + fi + if [ $ms_elapsed -ge $ms_timeout ]; then + echo "ERROR: No worker MachineSet found in openshift-machine-api after ${ms_timeout}s" + oc get machinesets -n openshift-machine-api --no-headers 2>/dev/null || true + oc get machines -n openshift-machine-api --no-headers 2>/dev/null || true + exit 1 + fi + if [ $((ms_elapsed % 30)) -eq 0 ]; then + echo "No worker MachineSets yet, retrying... 
(${ms_elapsed}s)" + fi + sleep 15 + ms_elapsed=$((ms_elapsed + 15)) + done + echo "Using $WORKER_MS as template" + + ZONE=$(oc get machineset "$WORKER_MS" -n openshift-machine-api \ + -o jsonpath='{.spec.template.spec.providerSpec.value.zone}') + VIRT_MS_NAME="${INFRA_ID}-virt-worker-${ZONE}" + + if oc get machineset "$VIRT_MS_NAME" -n openshift-machine-api &>/dev/null; then + echo "MachineSet $VIRT_MS_NAME already exists" + else + JQ_FILTER=' + del(.metadata.uid, .metadata.resourceVersion, .metadata.creationTimestamp, .metadata.generation, .status) | + .metadata.name = $name | + .spec.replicas = 1 | + .spec.selector.matchLabels["machine.openshift.io/cluster-api-machineset"] = $name | + .spec.template.metadata.labels["machine.openshift.io/cluster-api-machineset"] = $name | + .spec.template.metadata.labels["node-role.kubernetes.io/virt"] = "" | + .spec.template.spec.providerSpec.value.machineType = $machineType | + .spec.template.spec.metadata.labels["node-role.kubernetes.io/virt"] = "" + ' + if [ "$DEDICATED" = "true" ]; then + JQ_FILTER="${JQ_FILTER} | .spec.template.spec.taints = [{\"key\": \"node-role.kubernetes.io/virt\", \"effect\": \"NoSchedule\"}]" + fi + + oc get machineset "$WORKER_MS" -n openshift-machine-api -o json | \ + jq --arg name "$VIRT_MS_NAME" --arg machineType "n2-standard-8" "$JQ_FILTER" | \ + oc apply -f - + echo "Created MachineSet $VIRT_MS_NAME" + fi + + echo "Waiting for virt node to become Ready..." + timeout=600 + elapsed=0 + while true; do + READY_NODES=$(oc get nodes -l node-role.kubernetes.io/virt -o json 2>/dev/null | \ + jq -r '[.items[] | select(.status.conditions[]? 
| + select(.type=="Ready" and .status=="True")) | + .metadata.name] | join(" ")' 2>/dev/null || echo "") + if [ -n "$READY_NODES" ]; then + echo "Virt node is Ready: $READY_NODES" + break + fi + if [ $elapsed -ge $timeout ]; then + echo "ERROR: Timeout waiting for virt node after ${timeout}s" + oc get machines -n openshift-machine-api \ + -l "machine.openshift.io/cluster-api-machineset=$VIRT_MS_NAME" \ + --no-headers 2>/dev/null || true + exit 1 + fi + if [ $((elapsed % 60)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo "Still waiting... (${elapsed}s elapsed)" + oc get machines -n openshift-machine-api \ + -l "machine.openshift.io/cluster-api-machineset=$VIRT_MS_NAME" \ + --no-headers 2>/dev/null || true + fi + sleep 15 + elapsed=$((elapsed + 15)) + done + + echo "=== Virt worker node ready ===" + volumeMounts: + - name: data + mountPath: /data + + - name: install-virt-operator + activeDeadlineSeconds: 2400 + script: + image: quay.io/stackrox-io/ci:automation-flavors-openshift-4-{{ .Chart.Annotations.automationFlavorsVersion }} + command: [bash] + source: | + set -euo pipefail + export KUBECONFIG=/data/auth/kubeconfig + + OLM_NAMESPACE="openshift-cnv" + SUBSCRIPTION_NAME="kubevirt-hyperconverged" + HCO_NAMESPACE="$OLM_NAMESPACE" + HCO_NAME="kubevirt-hyperconverged" + + echo "=== Installing OpenShift Virtualization ===" + + # Check if already installed and healthy + if oc get hyperconverged "$HCO_NAME" -n "$HCO_NAMESPACE" &>/dev/null; then + avail=$(oc -n "$HCO_NAMESPACE" get hyperconverged "$HCO_NAME" -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || echo "Unknown") + prog=$(oc -n "$HCO_NAMESPACE" get hyperconverged "$HCO_NAME" -o jsonpath='{.status.conditions[?(@.type=="Progressing")].status}' 2>/dev/null || echo "Unknown") + degr=$(oc -n "$HCO_NAMESPACE" get hyperconverged "$HCO_NAME" -o jsonpath='{.status.conditions[?(@.type=="Degraded")].status}' 2>/dev/null || echo "Unknown") + if [ "$avail" = "True" ] && [ "$prog" = "False" ] && [ 
"$degr" = "False" ]; then + echo "OpenShift Virtualization already installed and healthy" + echo "Ensuring VSOCK and KVM_EMULATION are configured..." + else + echo "HyperConverged exists but not healthy (Available=$avail, Progressing=$prog, Degraded=$degr)" + fi + fi + + # Create namespace, OperatorGroup, and Subscription + cat <<'EOFK8S' | oc apply -f - + apiVersion: v1 + kind: Namespace + metadata: + name: openshift-cnv + --- + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: openshift-cnv + namespace: openshift-cnv + spec: + targetNamespaces: + - openshift-cnv + --- + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription + metadata: + name: kubevirt-hyperconverged + namespace: openshift-cnv + spec: + channel: stable + name: kubevirt-hyperconverged + source: redhat-operators + sourceNamespace: openshift-marketplace + installPlanApproval: Automatic + EOFK8S + echo "Applied namespace, OperatorGroup, and Subscription" + + # Wait for installedCSV + echo "Waiting for Subscription to report installedCSV..." + timeout=300 + elapsed=0 + until oc -n "$OLM_NAMESPACE" get sub "$SUBSCRIPTION_NAME" -o jsonpath='{.status.installedCSV}' 2>/dev/null | grep -q .; do + sleep 5 + elapsed=$((elapsed + 5)) + if [ $elapsed -ge $timeout ]; then + echo "ERROR: Timeout waiting for installedCSV after ${timeout}s" + exit 1 + fi + if [ $((elapsed % 30)) -eq 0 ]; then + echo "Still waiting for installedCSV... (${elapsed}s)" + fi + done + + CSV=$(oc -n "$OLM_NAMESPACE" get sub "$SUBSCRIPTION_NAME" -o jsonpath='{.status.installedCSV}') + echo "InstalledCSV: $CSV" + + # Wait for CSV Succeeded + echo "Waiting for CSV to reach Succeeded phase..." 
+ timeout=900 + elapsed=0 + while true; do + PHASE=$(oc -n "$OLM_NAMESPACE" get csv "$CSV" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + if [ "$PHASE" = "Succeeded" ]; then + echo "CSV is Succeeded" + break + fi + if [ $elapsed -ge $timeout ]; then + echo "ERROR: CSV did not reach Succeeded (current: $PHASE) after ${timeout}s" + exit 1 + fi + if [ $((elapsed % 60)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo "Still waiting for CSV (Phase: ${PHASE:-Unknown}, ${elapsed}s)..." + fi + sleep 10 + elapsed=$((elapsed + 10)) + done + + # Create HyperConverged CR with VSOCK + echo "Creating HyperConverged CR with VSOCK feature gate..." + cat </dev/null || echo "Unknown") + prog=$(oc -n "$HCO_NAMESPACE" get hyperconverged "$HCO_NAME" -o jsonpath='{.status.conditions[?(@.type=="Progressing")].status}' 2>/dev/null || echo "Unknown") + degr=$(oc -n "$HCO_NAMESPACE" get hyperconverged "$HCO_NAME" -o jsonpath='{.status.conditions[?(@.type=="Degraded")].status}' 2>/dev/null || echo "Unknown") + if [ "$avail" = "True" ] && [ "$prog" = "False" ] && [ "$degr" = "False" ]; then + echo "HyperConverged is healthy" + break + fi + if [ $elapsed -ge $timeout ]; then + echo "ERROR: HyperConverged not healthy after ${timeout}s (Available=$avail, Progressing=$prog, Degraded=$degr)" + exit 1 + fi + if [ $((elapsed % 60)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo "Still waiting (Available=$avail, Progressing=$prog, Degraded=$degr) - ${elapsed}s" + fi + sleep 15 + elapsed=$((elapsed + 15)) + done + + # Patch KVM_EMULATION + current_kvm=$(oc get subscription kubevirt-hyperconverged -n openshift-cnv -o jsonpath='{.spec.config.env[?(@.name=="KVM_EMULATION")].value}' 2>/dev/null || echo "") + if [ "$current_kvm" = "true" ]; then + echo "KVM_EMULATION already set" + else + echo "Patching subscription with KVM_EMULATION..." 
+ oc patch subscription kubevirt-hyperconverged \ + -n openshift-cnv \ + --type=merge \ + -p '{"spec":{"config":{"selector":{"matchLabels":{"name":"hyperconverged-cluster-operator"}},"env":[{"name":"KVM_EMULATION","value":"true"}]}}}' + echo "KVM_EMULATION patched" + fi + + echo "=== OpenShift Virtualization installed with VSOCK + KVM_EMULATION ===" + volumeMounts: + - name: data + mountPath: /data + + - name: create-vm + activeDeadlineSeconds: 600 + outputs: + artifacts: + - name: vm-access + path: /data/vm-access.md + archive: + none: {} + script: + image: quay.io/stackrox-io/ci:automation-flavors-openshift-4-{{ .Chart.Annotations.automationFlavorsVersion }} + command: [bash] + source: | + set -euo pipefail + export KUBECONFIG=/data/auth/kubeconfig + + VM_OS='{{ "{{" }}workflow.parameters.vm-os{{ "}}" }}' + DEDICATED='{{ "{{" }}workflow.parameters.virt-node-dedicated{{ "}}" }}' + NAMESPACE="openshift-cnv" + VM_NAME="${VM_OS}-1" + CONTAINER_IMAGE="quay.io/rhacs-eng/vm-images:${VM_OS}-dnf-primed-latest" + SSH_USER="cloud-user" + PULL_SECRET_NAME="quay-rhacs-eng-ro" + + case "$VM_OS" in + rhel9|rhel10) ;; + *) + echo "ERROR: unsupported vm-os '$VM_OS'. Valid values: rhel9, rhel10." + exit 1 + ;; + esac + + echo "=== Creating VM: $VM_NAME (OS: $VM_OS, dedicated=$DEDICATED) ===" + + # Check if VM already exists — skip creation to avoid publishing stale credentials + if oc get vm "$VM_NAME" -n "$NAMESPACE" &>/dev/null; then + STATUS=$(oc get vm "$VM_NAME" -n "$NAMESPACE" -o jsonpath='{.status.printableStatus}' 2>/dev/null || echo "Unknown") + echo "VM $VM_NAME already exists (status: $STATUS), skipping creation" + echo "# VM Access Information (pre-existing VM)" > /data/vm-access.md + echo "" >> /data/vm-access.md + echo "VM was already provisioned in a prior run. Password was set at first boot." >> /data/vm-access.md + echo "Check cluster secrets or prior workflow artifacts for credentials." 
>> /data/vm-access.md + exit 0 + fi + + VM_PASSWORD=$(openssl rand -hex 10) + + # Copy the already-provisioned Quay pull secret from the infra namespace into the target cluster + echo "Creating pull secret for quay.io/rhacs-eng in namespace $NAMESPACE from mounted infra secret..." + if [ ! -s /infra-secrets/quay/.dockerconfigjson ]; then + echo "ERROR: mounted registry secret /infra-secrets/quay/.dockerconfigjson is missing or empty" + exit 1 + fi + cat </dev/null || echo "") + if [ "$PHASE" = "Running" ]; then + echo "VMI is Running" + break + fi + if [ $elapsed -ge $timeout ]; then + echo "ERROR: VMI did not reach Running phase after ${timeout}s (current: $PHASE)" + echo "--- VMI details ---" + oc get vmi "$VM_NAME" -n "$NAMESPACE" -o yaml 2>&1 || true + echo "--- Recent events ---" + oc get events -n "$NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true + exit 1 + fi + if [ $((elapsed % 30)) -eq 0 ] && [ $elapsed -gt 0 ]; then + echo "Waiting for VMI (phase: ${PHASE:-Pending}, ${elapsed}s)..." + fi + sleep 10 + elapsed=$((elapsed + 10)) + done + + echo "" + echo "=== VM Created ===" + echo " Name: $VM_NAME" + echo " Namespace: $NAMESPACE" + echo " OS: $VM_OS" + echo " User: $SSH_USER" + echo " Password: stored in vm-access artifact" + echo " VSOCK: enabled" + echo "" + echo "Access via: virtctl ssh -n $NAMESPACE ${SSH_USER}@vmi/${VM_NAME}" + + # Save VM access info to the downloadable artifact + umask 077 + cat > /data/vm-access.md <