From e48eb2d66f594ccce5e940ed1d80fb5b8910b85d Mon Sep 17 00:00:00 2001 From: zylxjtu Date: Tue, 2 Jun 2026 21:18:36 +0000 Subject: [PATCH] hcsoci,hcs,shim: honor CPU affinity for Argon containers Process-isolated (Argon) WCOW containers run inside a server silo, which is a job object owned by HCS. HCS does not expose a CPU-affinity field on the container Processor schema (Count/Maximum/Weight), so the OCI spec.Windows.Resources.CPU.Affinity field was silently ignored for Argon containers at both create and update time. Honor it the same way HostProcess containers do: open the silo's job object (by its well-known \Container_<id> name, the same way queryInProc already opens it) and apply the affinity with SetInformationJobObject(JobObjectGroupInformationEx) via the existing SetCPUGroupAffinities helper. Create path: - Add (*hcs.System).SetSiloCPUGroupAffinities, which opens the silo job and sets the group affinities on it. - After CreateComputeSystem but before the container is started, apply affinity for Argon (HostingSystem == nil) via applyArgonCPUAffinity. This is race-free: the kernel records the property on the job, then applies it to the init process (and every descendant that later joins the silo) at the moment HCS calls AssignProcessToJobObject during Start. No container instruction ever runs on a forbidden processor, and the property lives on the kernel object so no per-PID walking or watchdog is needed. Update path (runtime update / UpdateContainerResources): - Loosen isValidWindowsCPUResources to accept an affinity-only update (no Count/Shares/Maximum), so affinity can be changed after the container starts. - updateWCOWContainerCPU now only sends the HCS Processor modify request when a rate control is set (avoiding an empty no-op request) and additionally applies affinity out of band when present. 
- New updateWCOWContainerCPUAffinity re-pins the silo for Argon via the same SetSiloCPUGroupAffinities mechanism (the kernel re-applies the mask to current and future silo members). Hypervisor-isolated (Xenon) containers return ErrNotImplemented rather than silently dropping the request, since they require a UVM-level CPU-group swap. Shared helpers: - Collect the container-kind-agnostic CPU affinity validators (ValidateCPUAffinity / ValidateCPUAffinityEntries), their sentinel errors, and the OCI -> jobobject.GroupAffinity converter (ToJobObjectAffinities) into a single kind-neutral cpuaffinity.go, shared by the HostProcess (internal/jobcontainers) and Argon paths. jobcontainers now reuses ToJobObjectAffinities instead of a duplicated conversion loop. Xenon (UVM-backed) containers are out of scope here and are skipped; they require UVM-level CPU groups and are handled separately. Tests: unit coverage for the converter, the affinity validators, the loosened isValidWindowsCPUResources (including affinity-only), and the affinity-update dispatch (no-op and Xenon not-implemented branches). Plus a functional create-path test (test/functional/container_affinity_test.go) that pins an Argon container at create and asserts, in-process via internal/jobobject, that the affinity landed on the silo job object (the real regression gate, both pre- and post-start) and that the init process inherited it. It needs no external tooling since the functional suite runs as SYSTEM and can open the silo job directly. Runtime-update (post-start) read-back over cri-containerd is deferred: it relies on jobobject-util, whose get-path is still single-group only. 
Signed-off-by: zylxjtu --- cmd/containerd-shim-runhcs-v1/task_hcs.go | 58 +++- .../task_hcs_test.go | 44 +++ internal/hcs/system.go | 52 +++- internal/hcsoci/cpuaffinity.go | 99 +++++++ internal/hcsoci/cpuaffinity_argon.go | 39 +++ internal/hcsoci/cpuaffinity_test.go | 83 ++++++ internal/hcsoci/create.go | 11 + internal/hcsoci/hcsdoc_wcow.go | 52 +--- internal/jobcontainers/oci.go | 11 +- test/functional/container_affinity_test.go | 255 ++++++++++++++++++ 10 files changed, 634 insertions(+), 70 deletions(-) create mode 100644 internal/hcsoci/cpuaffinity.go create mode 100644 internal/hcsoci/cpuaffinity_argon.go create mode 100644 internal/hcsoci/cpuaffinity_test.go create mode 100644 test/functional/container_affinity_test.go diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index a59618137b..b984981395 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -931,7 +931,11 @@ func (ht *hcsTask) updateTaskContainerResources(ctx context.Context, data interf func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.WindowsCPUResources) error { // if host is 20h2+ then we can make a request directly to hcs if osversion.Get().Build >= osversion.V20H2 { + // Count/Maximum/Shares live on the HCS Processor schema. Only send a modify + // request when at least one of them is set, so an affinity-only update does + // not push an empty (no-op) request to HCS. 
req := &hcsschema.Processor{} + hasRateControl := false if cpu.Count != nil { procCount := int32(*cpu.Count) hostProcs := processorinfo.ProcessorCount() @@ -939,23 +943,73 @@ func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.Window hostProcs = ht.host.ProcessorCount() } req.Count = hcsoci.NormalizeProcessorCount(ctx, ht.id, procCount, hostProcs) + hasRateControl = true } if cpu.Maximum != nil { req.Maximum = int32(*cpu.Maximum) + hasRateControl = true } if cpu.Shares != nil { req.Weight = int32(*cpu.Shares) + hasRateControl = true } - return ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req) + if hasRateControl { + if err := ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req); err != nil { + return err + } + } + + // CPU affinity is not part of the HCS Processor schema, so it has to be + // applied out of band (the silo's job object for Argon). A no-op when unset. + if len(cpu.Affinity) > 0 { + return ht.updateWCOWContainerCPUAffinity(ctx, cpu.Affinity) + } + return nil } return errdefs.ErrNotImplemented } +// updateWCOWContainerCPUAffinity honors a post-start change to +// spec.Windows.Resources.CPU.Affinity for an HCS-backed WCOW container. +// +// For process-isolated (Argon) containers this re-pins the silo's job object, using +// the same race-free mechanism as create-time: the Windows kernel re-applies the new +// mask to every process already in the silo and to every future joiner. +// +// Hypervisor-isolated (Xenon) containers require swapping the UVM's CPU group instead; +// that is not yet implemented, so this returns ErrNotImplemented rather than silently +// dropping the request. 
+func (ht *hcsTask) updateWCOWContainerCPUAffinity(ctx context.Context, affinity []specs.WindowsCPUGroupAffinity) error { + validated, err := hcsoci.ValidateCPUAffinityEntries(affinity) + if err != nil { + return err + } + if len(validated) == 0 { + return nil + } + + if ht.host != nil { + // Xenon: UVM-level CPU-group swap is out of scope here (Track A). + return fmt.Errorf("cpu affinity update for hypervisor-isolated containers is not supported: %w", errdefs.ErrNotImplemented) + } + + system, ok := ht.c.(*hcs.System) + if !ok { + return fmt.Errorf("cpu affinity update requires an HCS-backed container, got %T", ht.c) + } + return system.SetSiloCPUGroupAffinities(ctx, hcsoci.ToJobObjectAffinities(validated)) +} + func isValidWindowsCPUResources(c *specs.WindowsCPUResources) bool { - return (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || + // Exactly one of the mutually-exclusive rate controls (Count/Shares/Maximum). + exactlyOneRateControl := (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || (c.Shares != nil && (c.Count == nil && c.Maximum == nil)) || (c.Maximum != nil && (c.Count == nil && c.Shares == nil)) + // An affinity-only update carries no rate control; accept it on its own so that + // CPU affinity can be changed after the container has started. 
+ affinityOnly := len(c.Affinity) > 0 && c.Count == nil && c.Shares == nil && c.Maximum == nil + return exactlyOneRateControl || affinityOnly } func (ht *hcsTask) updateWCOWResources(ctx context.Context, resources *specs.WindowsResources, annotations map[string]string) error { diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go index beb58ffc50..d922d8e2f3 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go @@ -4,12 +4,14 @@ package main import ( "context" + "errors" "math/rand" "reflect" "strconv" "testing" "time" + "github.com/Microsoft/hcsshim/internal/uvm" "github.com/Microsoft/hcsshim/pkg/annotations" "github.com/containerd/errdefs" "github.com/opencontainers/runtime-spec/specs-go" @@ -506,3 +508,45 @@ func Test_handleProcessArgsForIsolatedJobContainer(t *testing.T) { }) } } + +func u64(v uint64) *uint64 { return &v } +func u16(v uint16) *uint16 { return &v } + +func Test_isValidWindowsCPUResources(t *testing.T) { + affinity := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + for _, tt := range []struct { + name string + c *specs.WindowsCPUResources + want bool + }{ + {"count only", &specs.WindowsCPUResources{Count: u64(2)}, true}, + {"shares only", &specs.WindowsCPUResources{Shares: u16(100)}, true}, + {"maximum only", &specs.WindowsCPUResources{Maximum: u16(5000)}, true}, + {"count and shares", &specs.WindowsCPUResources{Count: u64(2), Shares: u16(100)}, false}, + {"affinity only", &specs.WindowsCPUResources{Affinity: affinity}, true}, + {"affinity with count", &specs.WindowsCPUResources{Count: u64(2), Affinity: affinity}, true}, + {"empty", &specs.WindowsCPUResources{}, false}, + } { + t.Run(tt.name, func(t *testing.T) { + if got := isValidWindowsCPUResources(tt.c); got != tt.want { + t.Fatalf("isValidWindowsCPUResources(%+v) = %v, want %v", tt.c, got, tt.want) + } + }) + } +} + +func 
Test_hcsTask_updateWCOWContainerCPUAffinity_NoAffinity(t *testing.T) { + ht := &hcsTask{id: t.Name()} + // An empty affinity slice is a no-op and must not require an HCS-backed container. + if err := ht.updateWCOWContainerCPUAffinity(context.Background(), nil); err != nil { + t.Fatalf("expected nil error for empty affinity, got %v", err) + } +} + +func Test_hcsTask_updateWCOWContainerCPUAffinity_XenonNotImplemented(t *testing.T) { + ht := &hcsTask{id: t.Name(), host: &uvm.UtilityVM{}} + err := ht.updateWCOWContainerCPUAffinity(context.Background(), []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x1}}) + if !errors.Is(err, errdefs.ErrNotImplemented) { + t.Fatalf("expected ErrNotImplemented for hypervisor-isolated container, got %v", err) + } +} diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 869a5f3e7a..c50771cc4e 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -424,6 +424,20 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr return properties, nil } +// openSilo opens the container's server silo job object by its well-known name +// (`\Container_`). HCS owns the silo; the only way to open it from the shim is +// by name, and only while running as SYSTEM. The caller owns the returned handle and +// must Close it. +// +// In the future we can make use of some new functionality in HCS that allows you to +// pass a job object for HCS to use for the container. +func (computeSystem *System) openSilo(ctx context.Context) (*jobobject.JobObject, error) { + return jobobject.Open(ctx, &jobobject.Options{ + UseNTVariant: true, + Name: siloNameFmt(computeSystem.id), + }) +} + // queryInProc handles querying for container properties without reaching out to HCS. `props` // will be updated to contain any data returned from the queries present in `types`. If any properties // failed to be queried they will be tallied up and returned in as the first return value. 
Failures on @@ -434,14 +448,7 @@ func (computeSystem *System) queryInProc( props *hcsschema.Properties, types []hcsschema.PropertyType, ) ([]hcsschema.PropertyType, error) { - // In the future we can make use of some new functionality in the HCS that allows you - // to pass a job object for HCS to use for the container. Currently, the only way we'll - // be able to open the job/silo is if we're running as SYSTEM. - jobOptions := &jobobject.Options{ - UseNTVariant: true, - Name: siloNameFmt(computeSystem.id), - } - job, err := jobobject.Open(ctx, jobOptions) + job, err := computeSystem.openSilo(ctx) if err != nil { return nil, err } @@ -535,6 +542,35 @@ func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcssch }, nil } +// SetSiloCPUGroupAffinities pins the container's server silo to the given processor +// group affinities. HCS does not expose a CPU-affinity field on the container Processor +// schema, so for process-isolated (Argon) containers we set the affinity directly on the +// silo's job object via SetInformationJobObject(JobObjectGroupInformationEx). +// +// HCS owns the silo; we only open a transient handle (by the silo's well-known job name, +// the same handle queryInProc opens) to record the affinity property. The kernel enforces +// it on every process that joins the silo via AssignProcessToJobObject — including the init +// process at Start and any descendants it spawns. +// +// This must be called after the compute system is created but before it is started, so the +// affinity is already recorded on the job when HCS assigns the init process. Applying it to +// an already-running silo is also safe: the kernel re-applies the mask to current members and +// migrates threads at the next scheduling dispatch. 
+func (computeSystem *System) SetSiloCPUGroupAffinities(ctx context.Context, affinities []jobobject.GroupAffinity) (err error) { + operation := "hcs::System::SetSiloCPUGroupAffinities" + + job, err := computeSystem.openSilo(ctx) + if err != nil { + return makeSystemError(computeSystem, operation, err, nil) + } + defer job.Close() + + if err := job.SetCPUGroupAffinities(affinities); err != nil { + return makeSystemError(computeSystem, operation, err, nil) + } + return nil +} + // hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types. func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) { operation := "hcs::System::PropertiesV2" diff --git a/internal/hcsoci/cpuaffinity.go b/internal/hcsoci/cpuaffinity.go new file mode 100644 index 0000000000..0db6955758 --- /dev/null +++ b/internal/hcsoci/cpuaffinity.go @@ -0,0 +1,99 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "errors" + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/osversion" +) + +// Shared, container-kind-agnostic CPU affinity helpers. These are used by every +// Windows container shape that honors spec.Windows.Resources.CPU.Affinity: +// HostProcess (internal/jobcontainers) and Argon (this package). Keeping them +// here, rather than in a kind-specific file, avoids duplicating the validation +// and conversion logic across packages. + +// Sentinel errors returned by ValidateCPUAffinity / ValidateCPUAffinityEntries. +var ( + // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group + // affinity entries are requested on a host older than Windows Server 2022 (build 20348), + // which does not support multi-group affinity for job object silos. 
+ // On Windows Server 2022+, multiple processor groups are fully supported. + ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") + // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is + // requested on a host older than Windows Server 2022 (build 20348). + // On Windows Server 2022+, non-zero processor groups are fully supported. + ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") + // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, + // which would select no processors and is always invalid. + ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") +) + +// ValidateCPUAffinity handles the logic of validating the container's CPU affinity +// specified in the OCI spec. +// +// Returns the validated affinity entries (nil if not specified) and any validation error. +// Multiple processor groups and non-zero group numbers require Windows Server 2022 +// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. +func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { + if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil { + return nil, nil + } + return ValidateCPUAffinityEntries(spec.Windows.Resources.CPU.Affinity) +} + +// ValidateCPUAffinityEntries validates a set of OCI CPU affinity entries directly, +// applying the same rules as ValidateCPUAffinity. It is used on the container update +// path, where the affinity is supplied as a bare slice rather than a full spec. +// +// Returns the validated entries (nil if empty) and any validation error. 
+func ValidateCPUAffinityEntries(affinity []specs.WindowsCPUGroupAffinity) ([]specs.WindowsCPUGroupAffinity, error) { + if len(affinity) == 0 { + return nil, nil + } + + // Zero masks are never valid regardless of OS version. + for i, a := range affinity { + if a.Mask == 0 { + return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) + } + } + + // Determine whether multi-group features are needed: either multiple entries, + // or a single entry targeting a non-zero processor group. + multiGroup := len(affinity) > 1 || affinity[0].Group != 0 + + // Multiple processor groups are only supported on Windows Server 2022+. + if multiGroup && osversion.Build() < osversion.LTSC2022 { + if len(affinity) > 1 { + return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) + } + return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) + } + + return affinity, nil +} + +// ToJobObjectAffinities converts validated OCI CPU affinity entries into the +// jobobject.GroupAffinity representation used by the Win32 job-object APIs. +// +// The input is expected to already have been run through ValidateCPUAffinity. 
+func ToJobObjectAffinities(affinities []specs.WindowsCPUGroupAffinity) []jobobject.GroupAffinity { + if len(affinities) == 0 { + return nil + } + out := make([]jobobject.GroupAffinity, len(affinities)) + for i, a := range affinities { + out[i] = jobobject.GroupAffinity{ + Mask: a.Mask, + Group: uint16(a.Group), + } + } + return out +} diff --git a/internal/hcsoci/cpuaffinity_argon.go b/internal/hcsoci/cpuaffinity_argon.go new file mode 100644 index 0000000000..2deea3ef70 --- /dev/null +++ b/internal/hcsoci/cpuaffinity_argon.go @@ -0,0 +1,39 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/hcs" + "github.com/Microsoft/hcsshim/internal/log" +) + +// applyArgonCPUAffinity honors spec.Windows.Resources.CPU.Affinity for a +// process-isolated (Argon) container by pinning the container's server silo. +// +// HCS ignores CPU affinity on the container Processor schema (Count/Maximum/Weight), +// so instead we set the affinity on the silo's job object directly. This must run +// after the compute system is created but before it is started, so the affinity is +// already recorded on the job when HCS assigns the init process to the silo. See +// (*hcs.System).SetSiloCPUGroupAffinities for the race-free timeline. +// +// If the spec requests no affinity this is a no-op. 
+func applyArgonCPUAffinity(ctx context.Context, system *hcs.System, coi *createOptionsInternal) error { + affinities, err := ValidateCPUAffinity(coi.Spec) + if err != nil { + return err + } + if len(affinities) == 0 { + return nil + } + + if err := system.SetSiloCPUGroupAffinities(ctx, ToJobObjectAffinities(affinities)); err != nil { + return fmt.Errorf("apply CPU affinity to container silo: %w", err) + } + + log.G(ctx).WithField("affinities", affinities).Debug("applied CPU affinity to Argon container silo") + return nil +} diff --git a/internal/hcsoci/cpuaffinity_test.go b/internal/hcsoci/cpuaffinity_test.go new file mode 100644 index 0000000000..c74c63d3e4 --- /dev/null +++ b/internal/hcsoci/cpuaffinity_test.go @@ -0,0 +1,83 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "errors" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/jobobject" +) + +func TestValidateCPUAffinityEntries(t *testing.T) { + // A zero mask is invalid on every OS version, so this case is host-independent. + if _, err := ValidateCPUAffinityEntries([]specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0}}); !errors.Is(err, ErrCPUAffinityMaskZero) { + t.Fatalf("zero mask: got %v, want %v", err, ErrCPUAffinityMaskZero) + } + + // Empty input validates to no entries (no affinity requested). + got, err := ValidateCPUAffinityEntries(nil) + if err != nil || got != nil { + t.Fatalf("nil input: got (%v, %v), want (nil, nil)", got, err) + } + + // A single group-0 entry with a non-zero mask is valid regardless of OS version. 
+ in := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + got, err = ValidateCPUAffinityEntries(in) + if err != nil { + t.Fatalf("group-0 single entry: unexpected error %v", err) + } + if len(got) != 1 || got[0] != in[0] { + t.Fatalf("group-0 single entry: got %+v, want %+v", got, in) + } +} + +func TestToJobObjectAffinities(t *testing.T) { + for _, tc := range []struct { + name string + in []specs.WindowsCPUGroupAffinity + want []jobobject.GroupAffinity + }{ + { + name: "nil", + in: nil, + want: nil, + }, + { + name: "empty", + in: []specs.WindowsCPUGroupAffinity{}, + want: nil, + }, + { + name: "single group", + in: []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0b1011}}, + want: []jobobject.GroupAffinity{{Group: 0, Mask: 0b1011}}, + }, + { + name: "multiple groups", + in: []specs.WindowsCPUGroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + want: []jobobject.GroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + got := ToJobObjectAffinities(tc.in) + if len(got) != len(tc.want) { + t.Fatalf("got %d entries, want %d", len(got), len(tc.want)) + } + for i := range got { + if got[i] != tc.want[i] { + t.Errorf("entry %d: got %+v, want %+v", i, got[i], tc.want[i]) + } + } + }) + } +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index 5288932fa1..5a3082012e 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -357,6 +357,17 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if err != nil { return nil, r, err } + + // Process-isolated (Argon) containers run in a server silo on the host. HCS does not + // honor CPU affinity on the container Processor schema, so pin the silo's job object + // directly, after create but before the caller starts the container. Xenon (UVM-backed) + // containers are handled separately at the UVM layer and are skipped here. 
+ if coi.HostingSystem == nil { + if err := applyArgonCPUAffinity(ctx, system, coi); err != nil { + return nil, r, err + } + } + return system, r, nil } diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 52f01e2ab6..b14e6190aa 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -32,21 +32,8 @@ import ( const createContainerSubdirectoryForProcessDumpSuffix = "{container_id}" -// Sentinel errors returned by ValidateCPUAffinity. -var ( - // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group - // affinity entries are requested on a host older than Windows Server 2022 (build 20348), - // which does not support multi-group affinity for job object silos. - // On Windows Server 2022+, multiple processor groups are fully supported. - ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") - // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is - // requested on a host older than Windows Server 2022 (build 20348). - // On Windows Server 2022+, non-zero processor groups are fully supported. - ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") - // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, - // which would select no processors and is always invalid. - ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") -) +// CPU affinity validation (ValidateCPUAffinity / ValidateCPUAffinityEntries) and its +// sentinel errors live in cpuaffinity.go, shared with the HostProcess container path. // A simple wrapper struct around the container mount configs that should be added to the // container. 
@@ -111,41 +98,6 @@ func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mount return &config, nil } -// ValidateCPUAffinity handles the logic of validating the container's CPU affinity -// specified in the OCI spec. -// -// Returns the validated affinity entries (nil if not specified) and any validation error. -// Multiple processor groups and non-zero group numbers require Windows Server 2022 -// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. -func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { - if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil || len(spec.Windows.Resources.CPU.Affinity) == 0 { - return nil, nil - } - - affinity := spec.Windows.Resources.CPU.Affinity - - // Zero masks are never valid regardless of OS version. - for i, a := range affinity { - if a.Mask == 0 { - return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) - } - } - - // Determine whether multi-group features are needed: either multiple entries, - // or a single entry targeting a non-zero processor group. - multiGroup := len(affinity) > 1 || affinity[0].Group != 0 - - // Multiple processor groups are only supported on Windows Server 2022+. - if multiGroup && osversion.Build() < osversion.LTSC2022 { - if len(affinity) > 1 { - return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) - } - return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) - } - - return affinity, nil -} - // ConvertCPULimits handles the logic of converting and validating the containers CPU limits // specified in the OCI spec to what HCS expects. 
// diff --git a/internal/jobcontainers/oci.go b/internal/jobcontainers/oci.go index b0b07927dc..fa0b1bd276 100644 --- a/internal/jobcontainers/oci.go +++ b/internal/jobcontainers/oci.go @@ -46,16 +46,7 @@ func specToLimits(ctx context.Context, cid string, s *specs.Spec) (*jobobject.Jo if err != nil { return nil, err } - var groupAffinities []jobobject.GroupAffinity - if len(affinities) > 0 { - groupAffinities = make([]jobobject.GroupAffinity, len(affinities)) - for i, a := range affinities { - groupAffinities[i] = jobobject.GroupAffinity{ - Mask: a.Mask, - Group: uint16(a.Group), - } - } - } + groupAffinities := hcsoci.ToJobObjectAffinities(affinities) realCPULimit, realCPUWeight := uint32(cpuLimit), uint32(cpuWeight) if cpuCount != 0 { diff --git a/test/functional/container_affinity_test.go b/test/functional/container_affinity_test.go new file mode 100644 index 0000000000..efe1edffc6 --- /dev/null +++ b/test/functional/container_affinity_test.go @@ -0,0 +1,255 @@ +//go:build windows && functional +// +build windows,functional + +package functional + +import ( + "context" + "errors" + "testing" + "unsafe" + + "github.com/containerd/containerd/v2/core/containers" + ctrdoci "github.com/containerd/containerd/v2/pkg/oci" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/windows" + + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/osversion" + + testcontainer "github.com/Microsoft/hcsshim/test/internal/container" + testlayers "github.com/Microsoft/hcsshim/test/internal/layers" + testoci "github.com/Microsoft/hcsshim/test/internal/oci" + "github.com/Microsoft/hcsshim/test/internal/util" + "github.com/Microsoft/hcsshim/test/pkg/require" +) + +// Test_Container_CPUAffinity_Argon is the CI-gating functional test for honoring +// spec.Windows.Resources.CPU.Affinity on process-isolated (Argon) WCOW containers +// (commit "hcsoci,hcs,shim: honor CPU affinity for Argon containers"). 
+//
+// It asserts the three layers from the validation strategy, all reachable from this
+// one in-process test (the functional suite runs in-process with internal/jobobject
+// and as SYSTEM, so it can open the silo job by name):
+//
+//   Layer 1 — the PR wrote the affinity to the silo's job object in the
+//             create→start window. This is the real regression gate: it fails if
+//             applyArgonCPUAffinity / SetSiloCPUGroupAffinities regresses.
+//   Layer 2 — the host's view matches. The NT-variant silo job IS the host object,
+//             so the same GetCPUGroupAffinities read-back doubles as the host view;
+//             no second tool is needed.
+//   Layer 3 — the init process is actually constrained. This is a kernel guarantee
+//             (the kernel propagates the silo job's affinity onto silo members), not
+//             hcsshim code. If the affinity cannot be read (OpenProcess /
+//             GetProcessGroupAffinity fail) the check is skipped, but a genuine
+//             mismatch is a hard failure: with Layer 1 passing, it points at the
+//             kernel/silo plumbing rather than this PR.
+func Test_Container_CPUAffinity_Argon(t *testing.T) {
+	requireFeatures(t, featureWCOW)
+	// The silo job object affinity path is gated on 20H2 or later, the same floor as
+	// the rest of the WCOW resource-update path.
+	require.Build(t, osversion.V20H2)
+
+	ctx := util.Context(namespacedContext(context.Background()), t)
+
+	// Default CI case: one group-0 entry. Mask 0x3 selects CPUs 0 and 1.
+	// NOTE(review): this assumes group 0 has at least two logical processors — confirm.
+	t.Run("Group0SingleMask", func(t *testing.T) {
+		runArgonAffinityTest(ctx, t, []jobobject.GroupAffinity{{Group: 0, Mask: 0x3}})
+	})
+
+	// Multi-group pinning needs Windows Server 2022+ and a host with more than one
+	// active processor group; otherwise the subtest skips rather than asserting
+	// against a topology the runner does not have.
+	t.Run("MultiGroup", func(t *testing.T) {
+		require.Build(t, osversion.LTSC2022)
+		groupCount := activeProcessorGroupCount(t)
+		if groupCount < 2 {
+			t.Skipf("multi-group affinity requires a host with >1 processor group, got %d", groupCount)
+		}
+		runArgonAffinityTest(ctx, t, []jobobject.GroupAffinity{
+			{Group: 0, Mask: 0x1},
+			{Group: 1, Mask: 0x1},
+		})
+	})
+}
+
+// runArgonAffinityTest creates an Argon container whose spec pins it to want, then
+// checks the silo job object before and after start and finally the init process.
+func runArgonAffinityTest(ctx context.Context, t *testing.T, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	cID := testName(t, "container")
+	scratch := testlayers.WCOWScratchDir(ctx, t, "")
+	layers := append(windowsImageLayers(ctx, t), scratch)
+	opts := testoci.DefaultWindowsSpecOpts(cID,
+		ctrdoci.WithProcessCommandLine(testoci.PingSelfCmd),
+		testoci.WithWindowsLayerFolders(layers),
+		withCPUAffinity(want),
+	)
+	spec := testoci.CreateWindowsSpec(ctx, t, cID, opts...)
+
+	// A nil host selects process isolation (Argon). Create is where the PR's
+	// applyArgonCPUAffinity runs, between HCS-create and HCS-start.
+	c, _, cleanup := testcontainer.Create(ctx, t, nil, spec, cID, hcsOwner)
+	t.Cleanup(cleanup)
+
+	// Pre-start gate (Layers 1 & 2): the silo job must already carry the affinity
+	// before the init process runs, proving "set after create, before start".
+	assertSiloJobAffinity(ctx, t, cID, want)
+
+	initProc := testcontainer.StartWithSpec(ctx, t, c, spec.Process, nil)
+	t.Cleanup(func() {
+		testcontainer.Kill(ctx, t, c)
+		testcontainer.Wait(ctx, t, c)
+	})
+
+	// Layers 1 & 2 again, this time with a running member in the silo.
+	assertSiloJobAffinity(ctx, t, cID, want)
+
+	// Layer 3: the init process must report every pinned group (read failures only log).
+	assertProcessGroupAffinity(t, uint32(initProc.Process.Pid()), want)
+}
+
+// withCPUAffinity returns a SpecOpt that sets spec.Windows.Resources.CPU.Affinity.
+func withCPUAffinity(affinities []jobobject.GroupAffinity) ctrdoci.SpecOpts {
+	return func(_ context.Context, _ ctrdoci.Client, _ *containers.Container, s *specs.Spec) error {
+		entries := make([]specs.WindowsCPUGroupAffinity, 0, len(affinities))
+		for _, a := range affinities {
+			entries = append(entries, specs.WindowsCPUGroupAffinity{Mask: a.Mask, Group: uint32(a.Group)})
+		}
+		if s.Windows == nil {
+			s.Windows = &specs.Windows{}
+		}
+		win := s.Windows
+		if win.Resources == nil {
+			win.Resources = &specs.WindowsResources{}
+		}
+		if win.Resources.CPU == nil {
+			win.Resources.CPU = &specs.WindowsCPUResources{}
+		}
+		win.Resources.CPU.Affinity = entries
+		return nil
+	}
+}
+
+// assertSiloJobAffinity opens the container's server silo job object by name
+// (`\Container_` followed by the container ID, NT path variant) and fails the
+// test unless the CPU group affinities read back from it match want. This is the
+// host-side view of the object the PR wrote to (Layers 1 & 2).
+func assertSiloJobAffinity(ctx context.Context, t *testing.T, cID string, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	opts := &jobobject.Options{UseNTVariant: true, Name: `\Container_` + cID}
+	job, err := jobobject.Open(ctx, opts)
+	if err != nil {
+		t.Fatalf("open silo job for %q: %v", cID, err)
+	}
+	defer job.Close()
+
+	got, err := job.GetCPUGroupAffinities()
+	if err != nil {
+		t.Fatalf("get silo job cpu group affinities: %v", err)
+	}
+	assertAffinitiesEqual(t, "silo job object", got, want)
+}
+
+// assertProcessGroupAffinity reads the group affinity the kernel placed on the init
+// process and compares it to want. The PR only writes the job object; propagation
+// onto silo members is a kernel guarantee. If the affinity cannot be read the check
+// is skipped (logged, not failed), but a successful read that omits a pinned group
+// is a hard failure.
+func assertProcessGroupAffinity(t *testing.T, pid uint32, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
+	if err != nil {
+		t.Logf("Layer 3 (kernel) skipped: OpenProcess(%d): %v", pid, err)
+		return
+	}
+	defer windows.CloseHandle(h) //nolint:errcheck
+
+	got, err := getProcessGroupAffinity(h)
+	if err != nil {
+		t.Logf("Layer 3 (kernel) skipped: GetProcessGroupAffinity(%d): %v", pid, err)
+		return
+	}
+
+	// Only group membership is checked: getProcessGroupAffinity yields group numbers,
+	// not per-group masks, so there is no mask to compare against the requested bits.
+	gotGroups := make(map[uint16]struct{}, len(got))
+	for _, g := range got {
+		gotGroups[g] = struct{}{}
+	}
+	// Walk want in order so missing groups are reported deterministically.
+	for _, a := range want {
+		if _, ok := gotGroups[a.Group]; !ok {
+			t.Errorf("Layer 3 (kernel): init process not constrained to group %d; process groups = %v", a.Group, got)
+		}
+	}
+}
+
+// assertAffinitiesEqual fails the test, labelling the message with what, unless got and
+// want hold the same group→mask entries; order is ignored (the OS does not promise one).
+func assertAffinitiesEqual(t *testing.T, what string, got, want []jobobject.GroupAffinity) {
+	t.Helper()
+
+	if len(got) != len(want) {
+		t.Fatalf("%s affinity: got %+v, want %+v (length mismatch)", what, got, want)
+	}
+	byGroup := make(map[uint16]uint64, len(got))
+	for _, g := range got {
+		byGroup[g.Group] = g.Mask
+	}
+	for _, w := range want {
+		mask, ok := byGroup[w.Group]
+		if !ok {
+			t.Fatalf("%s affinity: missing group %d; got %+v, want %+v", what, w.Group, got, want)
+		}
+		if mask != w.Mask {
+			t.Fatalf("%s affinity: group %d mask = %#x, want %#x", what, w.Group, mask, w.Mask)
+		}
+	}
+}
+
+var (
+	kernel32                       = windows.NewLazySystemDLL("kernel32.dll")
+	procGetProcessGroupAffinity    = kernel32.NewProc("GetProcessGroupAffinity")
+	procGetActiveProcessorGroupCnt = kernel32.NewProc("GetActiveProcessorGroupCount")
+)
+
+// getProcessGroupAffinity wraps kernel32!GetProcessGroupAffinity, which is not bound
+// in golang.org/x/sys/windows. It returns the processor groups the process may run on.
+func getProcessGroupAffinity(h windows.Handle) ([]uint16, error) {
+	// Start with room for four groups. When that is too small the call fails with
+	// ERROR_INSUFFICIENT_BUFFER and stores the required element count in count.
+	groups := make([]uint16, 4)
+	count := uint16(len(groups))
+	for {
+		r1, _, e := procGetProcessGroupAffinity.Call(
+			uintptr(h),
+			uintptr(unsafe.Pointer(&count)),
+			uintptr(unsafe.Pointer(&groups[0])),
+		)
+		if r1 != 0 {
+			return groups[:count], nil
+		}
+		if errors.Is(e, windows.ERROR_INSUFFICIENT_BUFFER) && int(count) > len(groups) {
+			groups = make([]uint16, count)
+			continue
+		}
+		return nil, e
+	}
+}
+
+// activeProcessorGroupCount returns the number of active processor groups on the host
+// (0, with the error logged, if the query fails); it gates the multi-group subtest.
+func activeProcessorGroupCount(t *testing.T) int {
+	t.Helper()
+	r1, _, err := procGetActiveProcessorGroupCnt.Call()
+	n := uint16(r1) // the API returns a WORD; ignore anything above the low 16 bits
+	if n == 0 {
+		t.Logf("GetActiveProcessorGroupCount failed: %v", err)
+	}
+	return int(n)
+}