Skip to content

Commit c974afb

Browse files
committed
gh-146313: Fix ResourceTracker deadlock after os.fork()
Problem ResourceTracker.__del__ (added in gh-88887) calls os.waitpid(pid, 0) which blocks indefinitely if a process created via os.fork() still holds the tracker pipe's write end. The tracker never sees EOF, never exits, and the parent hangs at interpreter shutdown. Root cause Three requirements conflict: - gh-88887 wants the parent to reap the tracker to prevent zombies - gh-80849 wants mp.Process(fork) children to reuse the parent's tracker via the inherited pipe fd - gh-146313 shows the parent can't block in waitpid() if a child holds the fd -- the tracker won't see EOF until all copies close Fix Two layers: Timeout safety-net. _stop_locked() gains a wait_timeout parameter. When called from __del__, it polls with WNOHANG using exponential backoff for up to 1 second instead of blocking indefinitely. At-fork handler. An os.register_at_fork(after_in_child=...) handler closes the inherited pipe fd in the child unless a preserve flag is set. popen_fork.Popen._launch() sets the flag before its fork so mp.Process(fork) children keep the fd and reuse the parent's tracker (preserving gh-80849). Raw os.fork() children close the fd, letting the parent reap promptly. Result Scenario Before After Raw os.fork(), parent exits while child alive deadlock ~30ms reap mp.Process(fork), parent joins then exits ~30ms reap ~30ms reap mp.Process(fork), parent exits abnormally deadlock 1s bounded wait No fork (gh-88887 scenario) ~30ms reap ~30ms reap The at-fork handler makes the timeout unreachable in all well-behaved paths. The timeout remains as a safety net for abnormal shutdowns.
1 parent 4561f64 commit c974afb

File tree

4 files changed

+331
-9
lines changed

4 files changed

+331
-9
lines changed

Lib/multiprocessing/popen_fork.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,17 @@ def _launch(self, process_obj):
6767
code = 1
6868
parent_r, child_w = os.pipe()
6969
child_r, parent_w = os.pipe()
70-
self.pid = os.fork()
70+
# gh-146313: Tell the resource tracker's at-fork handler to keep
71+
# the inherited pipe fd so this child reuses the parent's tracker
72+
# (gh-80849) rather than closing it and launching its own.
73+
from .resource_tracker import _fork_intent
74+
_fork_intent.preserve_fd = True
75+
try:
76+
self.pid = os.fork()
77+
finally:
78+
# Reset in both parent and child so the flag does not leak
79+
# into a subsequent raw os.fork() or nested Process launch.
80+
_fork_intent.preserve_fd = False
7181
if self.pid == 0:
7282
try:
7383
atexit._clear()

Lib/multiprocessing/resource_tracker.py

Lines changed: 83 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import signal
2121
import sys
2222
import threading
23+
import time
2324
import warnings
2425
from collections import deque
2526

@@ -75,6 +76,10 @@ def __init__(self):
7576
# The reader should understand all formats.
7677
self._use_simple_format = False
7778

79+
# Set to True by _stop_locked() if the waitpid polling loop ran to
80+
# its timeout without reaping the tracker. Exposed for tests.
81+
self._waitpid_timed_out = False
82+
7883
def _reentrant_call_error(self):
7984
# gh-109629: this happens if an explicit call to the ResourceTracker
8085
# gets interrupted by a garbage collection, invoking a finalizer (*)
@@ -87,16 +92,50 @@ def __del__(self):
8792
# making sure child processess are cleaned before ResourceTracker
8893
# gets destructed.
8994
# see https://github.com/python/cpython/issues/88887
90-
self._stop(use_blocking_lock=False)
95+
# gh-146313: use a timeout to avoid deadlocking if a forked child
96+
# still holds the pipe's write end open.
97+
self._stop(use_blocking_lock=False, wait_timeout=1.0)
98+
99+
def _after_fork_in_child(self):
100+
# gh-146313: Called in the child right after os.fork().
101+
#
102+
# The tracker process is a child of the *parent*, not of us, so we
103+
# could never waitpid() it anyway. Clearing _pid means our __del__
104+
# becomes a no-op (the early return for _pid is None).
105+
#
106+
# Whether we keep the inherited _fd depends on who forked us:
107+
#
108+
# - multiprocessing.Process with the 'fork' start method sets
109+
# _fork_intent.preserve_fd before forking. The child keeps the
110+
# fd and reuses the parent's tracker (gh-80849). This is safe
111+
# because multiprocessing's atexit handler joins all children
112+
# before the parent's __del__ runs, so by then the fd copies
113+
# are gone and the parent can reap the tracker promptly.
114+
#
115+
# - A raw os.fork() leaves the flag unset. We close the fd so
116+
# the parent's __del__ can reap the tracker without waiting
117+
# for us to exit. If we later need a tracker, ensure_running()
118+
# will launch a fresh one.
119+
self._lock._at_fork_reinit()
120+
self._reentrant_messages.clear()
121+
self._pid = None
122+
self._exitcode = None
123+
if (self._fd is not None and
124+
not getattr(_fork_intent, 'preserve_fd', False)):
125+
try:
126+
os.close(self._fd)
127+
except OSError:
128+
pass
129+
self._fd = None
91130

92-
def _stop(self, use_blocking_lock=True):
131+
def _stop(self, use_blocking_lock=True, wait_timeout=None):
93132
if use_blocking_lock:
94133
with self._lock:
95-
self._stop_locked()
134+
self._stop_locked(wait_timeout=wait_timeout)
96135
else:
97136
acquired = self._lock.acquire(blocking=False)
98137
try:
99-
self._stop_locked()
138+
self._stop_locked(wait_timeout=wait_timeout)
100139
finally:
101140
if acquired:
102141
self._lock.release()
@@ -106,6 +145,10 @@ def _stop_locked(
106145
close=os.close,
107146
waitpid=os.waitpid,
108147
waitstatus_to_exitcode=os.waitstatus_to_exitcode,
148+
monotonic=time.monotonic,
149+
sleep=time.sleep,
150+
WNOHANG=os.WNOHANG,
151+
wait_timeout=None,
109152
):
110153
# This shouldn't happen (it might when called by a finalizer)
111154
# so we check for it anyway.
@@ -122,7 +165,30 @@ def _stop_locked(
122165
self._fd = None
123166

124167
try:
125-
_, status = waitpid(self._pid, 0)
168+
if wait_timeout is None:
169+
_, status = waitpid(self._pid, 0)
170+
else:
171+
# gh-146313: A forked child may still hold the pipe's write
172+
# end open, preventing the tracker from seeing EOF and
173+
# exiting. Poll with WNOHANG to avoid blocking forever.
174+
deadline = monotonic() + wait_timeout
175+
delay = 0.001
176+
while True:
177+
result_pid, status = waitpid(self._pid, WNOHANG)
178+
if result_pid != 0:
179+
break
180+
remaining = deadline - monotonic()
181+
if remaining <= 0:
182+
# The tracker is still running; it will be
183+
# reparented to PID 1 (or the nearest subreaper)
184+
# when we exit, and reaped there once all pipe
185+
# holders release their fd.
186+
self._pid = None
187+
self._exitcode = None
188+
self._waitpid_timed_out = True
189+
return
190+
delay = min(delay * 2, remaining, 0.1)
191+
sleep(delay)
126192
except ChildProcessError:
127193
self._pid = None
128194
self._exitcode = None
@@ -308,12 +374,24 @@ def _send(self, cmd, name, rtype):
308374

309375
self._ensure_running_and_write(msg)
310376

377+
# gh-146313: Per-thread flag set by .popen_fork.Popen._launch() just before
378+
# os.fork(), telling _after_fork_in_child() to keep the inherited pipe fd so
379+
# the child can reuse this tracker (gh-80849). Unset for raw os.fork() calls,
380+
# where the child instead closes the fd so the parent's __del__ can reap the
381+
# tracker. Using threading.local() keeps multiple threads calling
382+
# popen_fork.Popen._launch() at once from clobbering eachothers intent.
383+
_fork_intent = threading.local()
384+
311385
_resource_tracker = ResourceTracker()
312386
ensure_running = _resource_tracker.ensure_running
313387
register = _resource_tracker.register
314388
unregister = _resource_tracker.unregister
315389
getfd = _resource_tracker.getfd
316390

391+
# gh-146313: See _after_fork_in_child docstring.
392+
if hasattr(os, 'register_at_fork'):
393+
os.register_at_fork(after_in_child=_resource_tracker._after_fork_in_child)
394+
317395

318396
def _decode_message(line):
319397
if line.startswith(b'{'):

0 commit comments

Comments
 (0)