Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions coreapi/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,33 @@ func startWithPanicRecovery(ctx context.Context, s Service, deps Deps) (err erro
return s.Start(ctx, deps)
}

// StopAll stops every started service in reverse order. Errors from
// individual Stop calls are collected; the first one is returned but
// every service still gets its Stop call invoked.
// stopWithPanicRecovery calls s.Stop(ctx) inside a defer recover()
// so a buggy plugin panicking during shutdown (channel-send on closed
// channel, nil dereference, os.Remove of a path whose parent dir
// vanished) surfaces as a normal Stop error rather than crashing the
// entire daemon process mid-teardown.
//
// Without this wrapper, a plugin that panics in Stop propagates the
// panic up through StopAll, terminating the daemon before remaining
// plugins get their Stop calls — leaving orphaned goroutines, half-
// written on-disk state, and no tear-down log.
func stopWithPanicRecovery(ctx context.Context, s Service) (err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("plugin %q Stop panicked: %v", s.Name(), r)
}
}()
return s.Stop(ctx)
}

// StopAll stops every started service in reverse order. Each Stop is
// wrapped in stopWithPanicRecovery so a buggy plugin panicking during
// shutdown cannot crash the daemon; the panic is converted to an error
// and all remaining services still get their Stop call.
//
// Errors from individual Stop calls (including recovered panics) are
// collected; the first one is returned but every service still gets
// its Stop call invoked.
func (sr *ServiceRegistry) StopAll(ctx context.Context) error {
sr.mu.Lock()
queue := append([]Service(nil), sr.started...)
Expand All @@ -132,7 +156,7 @@ func (sr *ServiceRegistry) StopAll(ctx context.Context) error {

var firstErr error
for i := len(queue) - 1; i >= 0; i-- {
if err := queue[i].Stop(ctx); err != nil && firstErr == nil {
if err := stopWithPanicRecovery(ctx, queue[i]); err != nil && firstErr == nil {
firstErr = err
}
}
Expand Down
82 changes: 82 additions & 0 deletions coreapi/zz_panic_recovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,88 @@ func (p *panickingService) Order() int { retu
func (p *panickingService) Start(_ context.Context, _ coreapi.Deps) error { panic(p.msg) }
func (p *panickingService) Stop(_ context.Context) error { return nil }

// panickingStop panics during Stop with the given message.
type panickingStop struct{ msg string }

func (p *panickingStop) Name() string { return "panicker-stop" }
func (p *panickingStop) Order() int { return 100 }
func (p *panickingStop) Start(_ context.Context, _ coreapi.Deps) error { return nil }
func (p *panickingStop) Stop(_ context.Context) error { panic(p.msg) }

func TestServiceRegistry_StopAllRecoversFromPluginPanic(t *testing.T) {
t.Parallel()

sr := &coreapi.ServiceRegistry{}
if err := sr.Register(&panickingStop{msg: "boom during shutdown"}); err != nil {
t.Fatalf("Register: %v", err)
}

// Start first so the service is in the started queue.
if err := sr.StartAll(context.Background(), coreapi.Deps{}); err != nil {
t.Fatalf("StartAll: %v", err)
}

// Without the recover wrapper in StopAll, this CRASHES the test process.
err := sr.StopAll(context.Background())

if err == nil {
t.Fatal("StopAll returned nil for panicking plugin — recover wrapper missing")
}
if !strings.Contains(err.Error(), "panic") {
t.Errorf("expected error to mention 'panic'; got %q", err.Error())
}
if !strings.Contains(err.Error(), "boom during shutdown") {
t.Errorf("expected error to include the panic message; got %q", err.Error())
}
}

// stopThenPanic panics during Stop AFTER calling an observer so the
// test can verify downstream services still get their Stop calls.
type stopThenPanic struct {
name string
order int
called *bool
msg string
}

func (s *stopThenPanic) Name() string { return s.name }
func (s *stopThenPanic) Order() int { return s.order }
func (s *stopThenPanic) Start(_ context.Context, _ coreapi.Deps) error { return nil }
func (s *stopThenPanic) Stop(_ context.Context) error {
*s.called = true
panic(s.msg)
}

func TestServiceRegistry_StopAllContinuesAfterPanic(t *testing.T) {
t.Parallel()

sr := &coreapi.ServiceRegistry{}

laterCalled := false
panicker := &stopThenPanic{name: "panicker", order: 50, called: new(bool), msg: "crash"}
later := &stopThenPanic{name: "later", order: 100, called: &laterCalled, msg: ""}

// later.Order > panicker.Order → later stops first (reverse order).
// panicker panics in Stop; later should already have been stopped.
for _, s := range []coreapi.Service{panicker, later} {
if err := sr.Register(s); err != nil {
t.Fatalf("Register: %v", err)
}
}

if err := sr.StartAll(context.Background(), coreapi.Deps{}); err != nil {
t.Fatalf("StartAll: %v", err)
}

_ = sr.StopAll(context.Background())

// later must have been stopped despite panicker panicking earlier
// (later starts first, so it stops first — reverse order).
if !laterCalled {
t.Error("downstream service was not stopped after panicker; StopAll aborted early")
}
}

func TestServiceRegistry_StartAllRecoversFromPluginPanic(t *testing.T) {
t.Parallel()

Expand Down
Loading