Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions cmd/containerd-shim-runhcs-v1/task_hcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -931,31 +931,85 @@ func (ht *hcsTask) updateTaskContainerResources(ctx context.Context, data interf
// updateWCOWContainerCPU applies a post-start CPU resource update to an
// HCS-backed WCOW container.
//
// Count, Maximum and Shares are sent to HCS as one Processor modify request.
// Affinity is not part of the HCS Processor schema, so it is applied afterwards
// through updateWCOWContainerCPUAffinity. Because the rate controls are applied
// first, a failing affinity update leaves any rate-control change in place.
//
// Returns errdefs.ErrNotImplemented on hosts older than 20H2.
func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.WindowsCPUResources) error {
	// Only 20H2+ hosts accept a processor modify request made directly to HCS.
	if osversion.Get().Build < osversion.V20H2 {
		return errdefs.ErrNotImplemented
	}

	// Count/Maximum/Shares live on the HCS Processor schema. Only send a modify
	// request when at least one of them is set, so an affinity-only update does
	// not push an empty (no-op) request to HCS.
	req := &hcsschema.Processor{}
	hasRateControl := false
	if cpu.Count != nil {
		procCount := int32(*cpu.Count)
		// Normalize against the UVM's processor count when the container is
		// hosted in one, otherwise against the physical host's.
		hostProcs := processorinfo.ProcessorCount()
		if ht.host != nil {
			hostProcs = ht.host.ProcessorCount()
		}
		req.Count = hcsoci.NormalizeProcessorCount(ctx, ht.id, procCount, hostProcs)
		hasRateControl = true
	}
	if cpu.Maximum != nil {
		req.Maximum = int32(*cpu.Maximum)
		hasRateControl = true
	}
	if cpu.Shares != nil {
		req.Weight = int32(*cpu.Shares)
		hasRateControl = true
	}
	if hasRateControl {
		if err := ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req); err != nil {
			return err
		}
	}

	// CPU affinity is not part of the HCS Processor schema, so it has to be
	// applied out of band (the silo's job object for Argon). A no-op when unset.
	if len(cpu.Affinity) > 0 {
		return ht.updateWCOWContainerCPUAffinity(ctx, cpu.Affinity)
	}
	return nil
}

// updateWCOWContainerCPUAffinity applies a post-start change of
// spec.Windows.Resources.CPU.Affinity to an HCS-backed WCOW container.
//
// The entries are validated first; an empty (or nil) set is a no-op and returns
// nil without touching the container.
//
// Process-isolated (Argon) containers are re-pinned through the silo's job
// object, the same mechanism used at create time: the Windows kernel re-applies
// the new mask to every process already in the silo and to every future joiner.
//
// Hypervisor-isolated (Xenon) containers would need the UVM's CPU group swapped
// instead. That is not implemented, so the request fails with an error wrapping
// errdefs.ErrNotImplemented rather than being silently dropped.
func (ht *hcsTask) updateWCOWContainerCPUAffinity(ctx context.Context, affinity []specs.WindowsCPUGroupAffinity) error {
	entries, err := hcsoci.ValidateCPUAffinityEntries(affinity)
	switch {
	case err != nil:
		return err
	case len(entries) == 0:
		// Nothing requested; do not require an HCS-backed container.
		return nil
	case ht.host != nil:
		// A non-nil host means the container runs inside a UVM (Xenon).
		return fmt.Errorf("cpu affinity update for hypervisor-isolated containers is not supported: %w", errdefs.ErrNotImplemented)
	}

	if silo, isHCS := ht.c.(*hcs.System); isHCS {
		groups := hcsoci.ToJobObjectAffinities(entries)
		return silo.SetSiloCPUGroupAffinities(ctx, groups)
	}
	return fmt.Errorf("cpu affinity update requires an HCS-backed container, got %T", ht.c)
}

func isValidWindowsCPUResources(c *specs.WindowsCPUResources) bool {
return (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) ||
// Exactly one of the mutually-exclusive rate controls (Count/Shares/Maximum).
exactlyOneRateControl := (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) ||
(c.Shares != nil && (c.Count == nil && c.Maximum == nil)) ||
(c.Maximum != nil && (c.Count == nil && c.Shares == nil))
// An affinity-only update carries no rate control; accept it on its own so that
// CPU affinity can be changed after the container has started.
affinityOnly := len(c.Affinity) > 0 && c.Count == nil && c.Shares == nil && c.Maximum == nil
return exactlyOneRateControl || affinityOnly
}

func (ht *hcsTask) updateWCOWResources(ctx context.Context, resources *specs.WindowsResources, annotations map[string]string) error {
Expand Down
44 changes: 44 additions & 0 deletions cmd/containerd-shim-runhcs-v1/task_hcs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ package main

import (
"context"
"errors"
"math/rand"
"reflect"
"strconv"
"testing"
"time"

"github.com/Microsoft/hcsshim/internal/uvm"
"github.com/Microsoft/hcsshim/pkg/annotations"
"github.com/containerd/errdefs"
"github.com/opencontainers/runtime-spec/specs-go"
Expand Down Expand Up @@ -506,3 +508,45 @@ func Test_handleProcessArgsForIsolatedJobContainer(t *testing.T) {
})
}
}

// u64 returns a pointer to a freshly allocated copy of v. Test helper for
// filling pointer-typed uint64 fields.
func u64(v uint64) *uint64 {
	p := new(uint64)
	*p = v
	return p
}
// u16 returns a pointer to a freshly allocated copy of v. Test helper for
// filling pointer-typed uint16 fields.
func u16(v uint16) *uint16 {
	p := new(uint16)
	*p = v
	return p
}

// Test_isValidWindowsCPUResources checks which combinations of Count, Shares,
// Maximum and Affinity are accepted as a CPU update request.
func Test_isValidWindowsCPUResources(t *testing.T) {
	pinned := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}}

	type testCase struct {
		name string
		c    *specs.WindowsCPUResources
		want bool
	}
	cases := []testCase{
		{name: "count only", c: &specs.WindowsCPUResources{Count: u64(2)}, want: true},
		{name: "shares only", c: &specs.WindowsCPUResources{Shares: u16(100)}, want: true},
		{name: "maximum only", c: &specs.WindowsCPUResources{Maximum: u16(5000)}, want: true},
		{name: "count and shares", c: &specs.WindowsCPUResources{Count: u64(2), Shares: u16(100)}, want: false},
		{name: "affinity only", c: &specs.WindowsCPUResources{Affinity: pinned}, want: true},
		{name: "affinity with count", c: &specs.WindowsCPUResources{Count: u64(2), Affinity: pinned}, want: true},
		{name: "empty", c: &specs.WindowsCPUResources{}, want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := isValidWindowsCPUResources(tc.c)
			if got != tc.want {
				t.Fatalf("isValidWindowsCPUResources(%+v) = %v, want %v", tc.c, got, tc.want)
			}
		})
	}
}

// Test_hcsTask_updateWCOWContainerCPUAffinity_NoAffinity verifies that a nil
// affinity slice is treated as a no-op.
func Test_hcsTask_updateWCOWContainerCPUAffinity_NoAffinity(t *testing.T) {
	// The task deliberately has no container attached: an empty affinity update
	// must succeed without needing an HCS-backed container.
	task := &hcsTask{id: t.Name()}

	err := task.updateWCOWContainerCPUAffinity(context.Background(), nil)
	if err != nil {
		t.Fatalf("expected nil error for empty affinity, got %v", err)
	}
}

func Test_hcsTask_updateWCOWContainerCPUAffinity_XenonNotImplemented(t *testing.T) {
ht := &hcsTask{id: t.Name(), host: &uvm.UtilityVM{}}
err := ht.updateWCOWContainerCPUAffinity(context.Background(), []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x1}})
if !errors.Is(err, errdefs.ErrNotImplemented) {
t.Fatalf("expected ErrNotImplemented for hypervisor-isolated container, got %v", err)
}
}
52 changes: 44 additions & 8 deletions internal/hcs/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,20 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
return properties, nil
}

// openSilo returns a handle to the container's server silo job object, looked up
// by its well-known name (`\Container_<id>`) using the NT variant of the open call.
//
// The silo itself is owned by HCS. Opening it by name is the only route available
// to the shim, and it only works while running as SYSTEM. The returned handle
// belongs to the caller, who must Close it.
//
// In the future we can make use of some new functionality in HCS that allows you to
// pass a job object for HCS to use for the container.
func (computeSystem *System) openSilo(ctx context.Context) (*jobobject.JobObject, error) {
	opts := jobobject.Options{
		Name:         siloNameFmt(computeSystem.id),
		UseNTVariant: true,
	}
	return jobobject.Open(ctx, &opts)
}

// queryInProc handles querying for container properties without reaching out to HCS. `props`
// will be updated to contain any data returned from the queries present in `types`. If any properties
// failed to be queried they will be tallied up and returned in as the first return value. Failures on
Expand All @@ -434,14 +448,7 @@ func (computeSystem *System) queryInProc(
props *hcsschema.Properties,
types []hcsschema.PropertyType,
) ([]hcsschema.PropertyType, error) {
// In the future we can make use of some new functionality in the HCS that allows you
// to pass a job object for HCS to use for the container. Currently, the only way we'll
// be able to open the job/silo is if we're running as SYSTEM.
jobOptions := &jobobject.Options{
UseNTVariant: true,
Name: siloNameFmt(computeSystem.id),
}
job, err := jobobject.Open(ctx, jobOptions)
job, err := computeSystem.openSilo(ctx)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -535,6 +542,35 @@ func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcssch
}, nil
}

// SetSiloCPUGroupAffinities pins the container's server silo to the given processor
// group affinities.
//
// HCS does not expose a CPU-affinity field on the container Processor schema, so for
// process-isolated (Argon) containers the affinity is set directly on the silo's job
// object via SetInformationJobObject(JobObjectGroupInformationEx).
//
// HCS owns the silo; this method only opens a transient handle (by the silo's
// well-known job name, as queryInProc does) to record the affinity, and closes it
// before returning. The kernel then enforces the affinity on every process that joins
// the silo via AssignProcessToJobObject, including the init process at Start and any
// descendants it spawns.
//
// At create time, call this after the compute system is created but before it is
// started, so the affinity is already recorded on the job when HCS assigns the init
// process. It may also be applied to an already-running silo: the kernel re-applies
// the mask to current members and migrates threads at the next scheduling dispatch.
//
// Any failure is returned wrapped by makeSystemError.
func (computeSystem *System) SetSiloCPUGroupAffinities(ctx context.Context, affinities []jobobject.GroupAffinity) (err error) {
	const operation = "hcs::System::SetSiloCPUGroupAffinities"

	silo, err := computeSystem.openSilo(ctx)
	if err != nil {
		return makeSystemError(computeSystem, operation, err, nil)
	}
	defer silo.Close()

	if setErr := silo.SetCPUGroupAffinities(affinities); setErr != nil {
		return makeSystemError(computeSystem, operation, setErr, nil)
	}
	return nil
}

// hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types.
func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) {
operation := "hcs::System::PropertiesV2"
Expand Down
99 changes: 99 additions & 0 deletions internal/hcsoci/cpuaffinity.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
//go:build windows
// +build windows

package hcsoci

import (
"errors"
"fmt"

specs "github.com/opencontainers/runtime-spec/specs-go"

"github.com/Microsoft/hcsshim/internal/jobobject"
"github.com/Microsoft/hcsshim/osversion"
)

// Shared, container-kind-agnostic CPU affinity helpers. These are used by every
// Windows container shape that honors spec.Windows.Resources.CPU.Affinity:
// HostProcess (internal/jobcontainers) and Argon (this package). Keeping them
// here, rather than in a kind-specific file, avoids duplicating the validation
// and conversion logic across packages.

// Sentinel errors produced by ValidateCPUAffinity and ValidateCPUAffinityEntries.
// Match them with errors.Is; the validators wrap them with per-entry detail.
var (
	// ErrCPUAffinityMaskZero reports an affinity entry whose bitmask is zero. Such a
	// mask selects no processors, so it is rejected on every OS version.
	ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero")
	// ErrCPUAffinityNonZeroGroupNotSupported reports a request for a non-zero
	// processor group on a host older than Windows Server 2022 (build 20348).
	// Windows Server 2022 and later accept non-zero processor groups.
	ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later")
	// ErrCPUAffinityMultipleGroupsNotSupported reports a request with more than one
	// processor-group affinity entry on a host older than Windows Server 2022
	// (build 20348), where multi-group affinity for job object silos is unavailable.
	// Windows Server 2022 and later accept multiple processor groups.
	ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later")
)

// ValidateCPUAffinity validates the CPU affinity requested by an OCI spec under
// spec.Windows.Resources.CPU.Affinity.
//
// When the spec has no Windows, Resources or CPU section it returns (nil, nil).
// Otherwise the entries are checked by ValidateCPUAffinityEntries and its result is
// returned unchanged. Multiple processor groups and non-zero group numbers require
// Windows Server 2022 (build 20348) or later; on older hosts only a single entry
// for group 0 is accepted.
func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) {
	if w := spec.Windows; w != nil && w.Resources != nil && w.Resources.CPU != nil {
		return ValidateCPUAffinityEntries(w.Resources.CPU.Affinity)
	}
	return nil, nil
}

// ValidateCPUAffinityEntries validates a set of OCI CPU affinity entries directly,
// applying the same rules as ValidateCPUAffinity. It is used on the container update
// path, where the affinity is supplied as a bare slice rather than a full spec.
//
// Rules, checked in this order:
//   - every entry must have a non-zero mask (ErrCPUAffinityMaskZero);
//   - on hosts older than Windows Server 2022, only a single entry for group 0 is
//     accepted (ErrCPUAffinityMultipleGroupsNotSupported or
//     ErrCPUAffinityNonZeroGroupNotSupported);
//   - every group number must fit in 16 bits, because ToJobObjectAffinities narrows
//     it to uint16 and an out-of-range value would otherwise be silently truncated
//     onto a different processor group.
//
// Returns the input slice unchanged when it is valid, nil when it is empty, and a
// nil slice plus an error when any rule is violated.
func ValidateCPUAffinityEntries(affinity []specs.WindowsCPUGroupAffinity) ([]specs.WindowsCPUGroupAffinity, error) {
	if len(affinity) == 0 {
		return nil, nil
	}

	// Zero masks are never valid regardless of OS version.
	for i, a := range affinity {
		if a.Mask == 0 {
			return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i)
		}
	}

	// Determine whether multi-group features are needed: either multiple entries,
	// or a single entry targeting a non-zero processor group.
	multiGroup := len(affinity) > 1 || affinity[0].Group != 0

	// Multiple processor groups are only supported on Windows Server 2022+.
	if multiGroup && osversion.Build() < osversion.LTSC2022 {
		if len(affinity) > 1 {
			return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity))
		}
		return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group)
	}

	// Reject group numbers that would not survive the uint16 narrowing done when
	// converting to job-object affinities. Checked last so the version-related
	// sentinel errors above keep taking precedence.
	const maxProcessorGroup = 1<<16 - 1
	for i, a := range affinity {
		if uint64(a.Group) > maxProcessorGroup {
			return nil, fmt.Errorf("cpu affinity entry %d: processor group %d exceeds the maximum group number %d", i, a.Group, maxProcessorGroup)
		}
	}

	return affinity, nil
}

// ToJobObjectAffinities maps OCI CPU affinity entries onto the
// jobobject.GroupAffinity form consumed by the Win32 job-object APIs, preserving
// order. The mask is copied as-is and the group number is narrowed to uint16.
//
// An empty or nil input yields nil. Callers are expected to have passed the input
// through ValidateCPUAffinity beforehand; no validation happens here.
func ToJobObjectAffinities(affinities []specs.WindowsCPUGroupAffinity) []jobobject.GroupAffinity {
	n := len(affinities)
	if n == 0 {
		return nil
	}

	converted := make([]jobobject.GroupAffinity, 0, n)
	for idx := range affinities {
		entry := affinities[idx]
		converted = append(converted, jobobject.GroupAffinity{
			Group: uint16(entry.Group),
			Mask:  entry.Mask,
		})
	}
	return converted
}
39 changes: 39 additions & 0 deletions internal/hcsoci/cpuaffinity_argon.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//go:build windows
// +build windows

package hcsoci

import (
"context"
"fmt"

"github.com/Microsoft/hcsshim/internal/hcs"
"github.com/Microsoft/hcsshim/internal/log"
)

// applyArgonCPUAffinity pins the server silo of a process-isolated (Argon)
// container to the processors requested in spec.Windows.Resources.CPU.Affinity.
//
// HCS ignores CPU affinity on the container Processor schema (Count/Maximum/Weight),
// so the affinity is set on the silo's job object directly. Run this after the
// compute system is created but before it is started, so the affinity is already
// recorded on the job when HCS assigns the init process to the silo. See
// (*hcs.System).SetSiloCPUGroupAffinities for the race-free timeline.
//
// Validation errors are returned as-is. A spec that requests no affinity is a no-op.
func applyArgonCPUAffinity(ctx context.Context, system *hcs.System, coi *createOptionsInternal) error {
	requested, err := ValidateCPUAffinity(coi.Spec)
	switch {
	case err != nil:
		return err
	case len(requested) == 0:
		return nil
	}

	err = system.SetSiloCPUGroupAffinities(ctx, ToJobObjectAffinities(requested))
	if err != nil {
		return fmt.Errorf("apply CPU affinity to container silo: %w", err)
	}

	log.G(ctx).WithField("affinities", requested).Debug("applied CPU affinity to Argon container silo")
	return nil
}
Loading
Loading