From 1dd9f2599ee289324ac01b714b7ba82d2422523a Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 13 Mar 2026 16:36:32 -0700 Subject: [PATCH 1/5] fix: use docker fallback to clean up root-owned files in local mode --- .../modules/local_core/local_container.py | 25 ++++--- .../sagemaker/train/local/local_container.py | 25 ++++--- .../unit/train/local/test_local_container.py | 66 +++++++++++++++++++ 3 files changed, 100 insertions(+), 16 deletions(-) create mode 100644 sagemaker-train/tests/unit/train/local/test_local_container.py diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index 43a5ca502d..b23d01e400 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -17,7 +17,6 @@ import os import re import shutil -import stat import subprocess from tempfile import TemporaryDirectory from typing import Any, Dict, List, Optional @@ -60,13 +59,23 @@ def _rmtree(path): """Remove a directory tree, handling root-owned files from Docker containers.""" - def _onerror(func, path, exc_info): - if isinstance(exc_info[1], PermissionError): - os.chmod(path, stat.S_IRWXU) - func(path) - else: - raise exc_info[1] - shutil.rmtree(path, onerror=_onerror) + try: + shutil.rmtree(path) + except PermissionError: + # Files created by Docker containers are owned by root. + # Use a Docker container to remove them since os.chmod will also fail. + try: + subprocess.run( + ["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"], + check=True, + capture_output=True, + ) + # The mount point directory itself may remain — clean it up + if os.path.exists(path): + shutil.rmtree(path, ignore_errors=True) + except Exception: + logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path) + raise class _LocalContainer(BaseModel): diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index bcf84395ce..7933ead9f1 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -18,7 +18,6 @@ import os import re import shutil -import stat import subprocess from tempfile import TemporaryDirectory from typing import Any, Dict, List, Optional @@ -68,13 +67,23 @@ def _rmtree(path): """Remove a directory tree, handling root-owned files from Docker containers.""" - def _onerror(func, path, exc_info): - if isinstance(exc_info[1], PermissionError): - os.chmod(path, stat.S_IRWXU) - func(path) - else: - raise exc_info[1] - shutil.rmtree(path, onerror=_onerror) + try: + shutil.rmtree(path) + except PermissionError: + # Files created by Docker containers are owned by root. + # Use a Docker container to remove them since os.chmod will also fail. + try: + subprocess.run( + ["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"], + check=True, + capture_output=True, + ) + # The mount point directory itself may remain — clean it up + if os.path.exists(path): + shutil.rmtree(path, ignore_errors=True) + except Exception: + logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path) + raise class _LocalContainer(BaseModel): diff --git a/sagemaker-train/tests/unit/train/local/test_local_container.py b/sagemaker-train/tests/unit/train/local/test_local_container.py new file mode 100644 index 0000000000..4a9cba27fd --- /dev/null +++ b/sagemaker-train/tests/unit/train/local/test_local_container.py @@ -0,0 +1,66 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from unittest.mock import patch, call +import pytest + +from sagemaker.train.local.local_container import _rmtree + + +class TestRmtree: + """Test cases for _rmtree function.""" + + @patch("sagemaker.train.local.local_container.shutil.rmtree") + def test_rmtree_success(self, mock_rmtree): + """Normal case — shutil.rmtree succeeds.""" + _rmtree("/tmp/test") + mock_rmtree.assert_called_once_with("/tmp/test") + + @patch("sagemaker.train.local.local_container.shutil.rmtree") + @patch("sagemaker.train.local.local_container.subprocess.run") + @patch("sagemaker.train.local.local_container.os.path.exists", return_value=False) + def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mock_rmtree): + """PermissionError triggers docker fallback to remove root-owned files.""" + mock_rmtree.side_effect = PermissionError("Permission denied") + + _rmtree("/tmp/test") + + mock_run.assert_called_once_with( + ["docker", "run", "--rm", "-v", "/tmp/test:/delete", "alpine", "rm", "-rf", "/delete"], + check=True, + capture_output=True, + ) + + @patch("sagemaker.train.local.local_container.shutil.rmtree") + @patch("sagemaker.train.local.local_container.subprocess.run") + @patch("sagemaker.train.local.local_container.os.path.exists", return_value=True) + def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree): + """After docker cleanup, remaining mount point directory is removed.""" + mock_rmtree.side_effect = [PermissionError("Permission denied"), None] + + _rmtree("/tmp/test") + + assert mock_rmtree.call_count == 2 + mock_rmtree.assert_has_calls([ + call("/tmp/test"), + call("/tmp/test", ignore_errors=True), + ]) + + @patch("sagemaker.train.local.local_container.shutil.rmtree") + @patch("sagemaker.train.local.local_container.subprocess.run") + def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree): + """If docker fallback also fails, the exception propagates.""" + mock_rmtree.side_effect = PermissionError("Permission denied") + mock_run.side_effect = Exception("docker not available") + + with pytest.raises(Exception, match="docker not available"): + _rmtree("/tmp/test") From c0134f58b9c9f6c14af49eebc77caf14bb6b80e0 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 13 Mar 2026 16:48:06 -0700 Subject: [PATCH 2/5] Remove alpine --- .../modules/local_core/local_container.py | 26 ++++++++++++++----- .../sagemaker/train/local/local_container.py | 26 ++++++++++++++----- .../unit/train/local/test_local_container.py | 24 ++++++++++++----- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index b23d01e400..5d87adc520 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -57,16 +57,24 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" -def _rmtree(path): +def _rmtree(path, image=None): """Remove a directory tree, handling root-owned files from Docker containers.""" try: shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. # Use a Docker container to remove them since os.chmod will also fail. + # Use the training image (already pulled) to avoid needing alpine. + if image is None: + logger.warning( + "Failed to clean up root-owned files in %s. " + "You may need to remove them manually with: sudo rm -rf %s", + path, path, + ) + raise try: subprocess.run( - ["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"], + ["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"], check=True, capture_output=True, ) @@ -74,7 +82,11 @@ def _rmtree(path): if os.path.exists(path): shutil.rmtree(path, ignore_errors=True) except Exception: - logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path) + logger.warning( + "Failed to clean up root-owned files in %s. " + "You may need to remove them manually with: sudo rm -rf %s", + path, path, + ) raise @@ -230,12 +242,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - _rmtree(os.path.join(self.container_root, "input")) - _rmtree(os.path.join(self.container_root, "shared")) + _rmtree(os.path.join(self.container_root, "input"), self.image) + _rmtree(os.path.join(self.container_root, "shared"), self.image) for host in self.hosts: - _rmtree(os.path.join(self.container_root, host)) + _rmtree(os.path.join(self.container_root, host), self.image) for folder in self._temporary_folders: - _rmtree(os.path.join(self.container_root, folder)) + _rmtree(os.path.join(self.container_root, folder), self.image) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index 7933ead9f1..88825012b5 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -65,16 +65,24 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" -def _rmtree(path): +def _rmtree(path, image=None): """Remove a directory tree, handling root-owned files from Docker containers.""" try: shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. # Use a Docker container to remove them since os.chmod will also fail. + # Use the training image (already pulled) to avoid needing alpine. + if image is None: + logger.warning( + "Failed to clean up root-owned files in %s. " + "You may need to remove them manually with: sudo rm -rf %s", + path, path, + ) + raise try: subprocess.run( - ["docker", "run", "--rm", "-v", f"{path}:/delete", "alpine", "rm", "-rf", "/delete"], + ["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"], check=True, capture_output=True, ) @@ -82,7 +90,11 @@ def _rmtree(path): if os.path.exists(path): shutil.rmtree(path, ignore_errors=True) except Exception: - logger.warning("Failed to clean up root-owned files in %s. You may need to remove them manually with: sudo rm -rf %s", path, path) + logger.warning( + "Failed to clean up root-owned files in %s. " + "You may need to remove them manually with: sudo rm -rf %s", + path, path, + ) raise @@ -238,12 +250,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - _rmtree(os.path.join(self.container_root, "input")) - _rmtree(os.path.join(self.container_root, "shared")) + _rmtree(os.path.join(self.container_root, "input"), self.image) + _rmtree(os.path.join(self.container_root, "shared"), self.image) for host in self.hosts: - _rmtree(os.path.join(self.container_root, host)) + _rmtree(os.path.join(self.container_root, host), self.image) for folder in self._temporary_folders: - _rmtree(os.path.join(self.container_root, folder)) + _rmtree(os.path.join(self.container_root, folder), self.image) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/tests/unit/train/local/test_local_container.py b/sagemaker-train/tests/unit/train/local/test_local_container.py index 4a9cba27fd..b30d86f2ea 100644 --- a/sagemaker-train/tests/unit/train/local/test_local_container.py +++ b/sagemaker-train/tests/unit/train/local/test_local_container.py @@ -15,6 +15,8 @@ from sagemaker.train.local.local_container import _rmtree +IMAGE = "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.1-cpu-py310" + class TestRmtree: """Test cases for _rmtree function.""" @@ -22,20 +24,20 @@ class TestRmtree: @patch("sagemaker.train.local.local_container.shutil.rmtree") def test_rmtree_success(self, mock_rmtree): """Normal case — shutil.rmtree succeeds.""" - _rmtree("/tmp/test") + _rmtree("/tmp/test", IMAGE) mock_rmtree.assert_called_once_with("/tmp/test") @patch("sagemaker.train.local.local_container.shutil.rmtree") @patch("sagemaker.train.local.local_container.subprocess.run") @patch("sagemaker.train.local.local_container.os.path.exists", return_value=False) def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mock_rmtree): - """PermissionError triggers docker fallback to remove root-owned files.""" + """PermissionError triggers docker fallback using the training image.""" mock_rmtree.side_effect = PermissionError("Permission denied") - _rmtree("/tmp/test") + _rmtree("/tmp/test", IMAGE) mock_run.assert_called_once_with( - ["docker", "run", "--rm", "-v", "/tmp/test:/delete", "alpine", "rm", "-rf", "/delete"], + ["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "rm", "-rf", "/delete"], check=True, capture_output=True, ) @@ -47,7 +49,7 @@ def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree): """After docker cleanup, remaining mount point directory is removed.""" mock_rmtree.side_effect = [PermissionError("Permission denied"), None] - _rmtree("/tmp/test") + _rmtree("/tmp/test", IMAGE) assert mock_rmtree.call_count == 2 mock_rmtree.assert_has_calls([ @@ -60,7 +62,15 @@ def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree): def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree): """If docker fallback also fails, the exception propagates.""" mock_rmtree.side_effect = PermissionError("Permission denied") - mock_run.side_effect = Exception("docker not available") + mock_run.side_effect = Exception("docker failed") + + with pytest.raises(Exception, match="docker failed"): + _rmtree("/tmp/test", IMAGE) + + @patch("sagemaker.train.local.local_container.shutil.rmtree") + def test_rmtree_no_image_raises(self, mock_rmtree): + """PermissionError without image raises immediately.""" + mock_rmtree.side_effect = PermissionError("Permission denied") - with pytest.raises(Exception, match="docker not available"): + with pytest.raises(PermissionError): _rmtree("/tmp/test") From 4e5fda9a592676fb94ba0b2ef871c21fb370685d Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 13 Mar 2026 16:53:58 -0700 Subject: [PATCH 3/5] Use network flag --- .../modules/local_core/local_container.py | 21 +++++++++---------- .../sagemaker/train/local/local_container.py | 21 +++++++++---------- .../unit/train/local/test_local_container.py | 19 +++++++++++++++++ 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index 5d87adc520..af50bad700 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -57,14 +57,13 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" -def _rmtree(path, image=None): +def _rmtree(path, image=None, is_studio=False): """Remove a directory tree, handling root-owned files from Docker containers.""" try: shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. # Use a Docker container to remove them since os.chmod will also fail. - # Use the training image (already pulled) to avoid needing alpine. if image is None: logger.warning( "Failed to clean up root-owned files in %s. " @@ -73,11 +72,11 @@ def _rmtree(path, image=None): ) raise try: - subprocess.run( - ["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"], - check=True, - capture_output=True, - ) + cmd = ["docker", "run", "--rm"] + if is_studio: + cmd += ["--network", "sagemaker"] + cmd += ["-v", f"{path}:/delete", image, "rm", "-rf", "/delete"] + subprocess.run(cmd, check=True, capture_output=True) # The mount point directory itself may remain — clean it up if os.path.exists(path): shutil.rmtree(path, ignore_errors=True) @@ -242,12 +241,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - _rmtree(os.path.join(self.container_root, "input"), self.image) - _rmtree(os.path.join(self.container_root, "shared"), self.image) + _rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio) + _rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio) for host in self.hosts: - _rmtree(os.path.join(self.container_root, host), self.image) + _rmtree(os.path.join(self.container_root, host), self.image, self.is_studio) for folder in self._temporary_folders: - _rmtree(os.path.join(self.container_root, folder), self.image) + _rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index 88825012b5..cf4413a171 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -65,14 +65,13 @@ SM_STUDIO_LOCAL_MODE = "SM_STUDIO_LOCAL_MODE" -def _rmtree(path, image=None): +def _rmtree(path, image=None, is_studio=False): """Remove a directory tree, handling root-owned files from Docker containers.""" try: shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. # Use a Docker container to remove them since os.chmod will also fail. - # Use the training image (already pulled) to avoid needing alpine. if image is None: logger.warning( "Failed to clean up root-owned files in %s. " @@ -81,11 +80,11 @@ def _rmtree(path, image=None): ) raise try: - subprocess.run( - ["docker", "run", "--rm", "-v", f"{path}:/delete", image, "rm", "-rf", "/delete"], - check=True, - capture_output=True, - ) + cmd = ["docker", "run", "--rm"] + if is_studio: + cmd += ["--network", "sagemaker"] + cmd += ["-v", f"{path}:/delete", image, "rm", "-rf", "/delete"] + subprocess.run(cmd, check=True, capture_output=True) # The mount point directory itself may remain — clean it up if os.path.exists(path): shutil.rmtree(path, ignore_errors=True) @@ -250,12 +249,12 @@ def train( # Print our Job Complete line logger.info("Local training job completed, output artifacts saved to %s", artifacts) - _rmtree(os.path.join(self.container_root, "input"), self.image) - _rmtree(os.path.join(self.container_root, "shared"), self.image) + _rmtree(os.path.join(self.container_root, "input"), self.image, self.is_studio) + _rmtree(os.path.join(self.container_root, "shared"), self.image, self.is_studio) for host in self.hosts: - _rmtree(os.path.join(self.container_root, host), self.image) + _rmtree(os.path.join(self.container_root, host), self.image, self.is_studio) for folder in self._temporary_folders: - _rmtree(os.path.join(self.container_root, folder), self.image) + _rmtree(os.path.join(self.container_root, folder), self.image, self.is_studio) return artifacts def retrieve_artifacts( diff --git a/sagemaker-train/tests/unit/train/local/test_local_container.py b/sagemaker-train/tests/unit/train/local/test_local_container.py index b30d86f2ea..5dd556d995 100644 --- a/sagemaker-train/tests/unit/train/local/test_local_container.py +++ b/sagemaker-train/tests/unit/train/local/test_local_container.py @@ -42,6 +42,25 @@ def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mo capture_output=True, ) + @patch("sagemaker.train.local.local_container.shutil.rmtree") + @patch("sagemaker.train.local.local_container.subprocess.run") + @patch("sagemaker.train.local.local_container.os.path.exists", return_value=False) + def test_rmtree_studio_adds_network(self, mock_exists, mock_run, mock_rmtree): + """In Studio, docker run includes --network sagemaker.""" + mock_rmtree.side_effect = PermissionError("Permission denied") + + _rmtree("/tmp/test", IMAGE, is_studio=True) + + mock_run.assert_called_once_with( + [ + "docker", "run", "--rm", + "--network", "sagemaker", + "-v", "/tmp/test:/delete", IMAGE, "rm", "-rf", "/delete", + ], + check=True, + capture_output=True, + ) + @patch("sagemaker.train.local.local_container.shutil.rmtree") @patch("sagemaker.train.local.local_container.subprocess.run") @patch("sagemaker.train.local.local_container.os.path.exists", return_value=True) From de812477711cceac7e83d8fdff9f2c9ffb3c9b16 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 13 Mar 2026 17:00:55 -0700 Subject: [PATCH 4/5] Add -mindepth --- .../sagemaker/core/modules/local_core/local_container.py | 2 +- .../src/sagemaker/train/local/local_container.py | 2 +- .../tests/unit/train/local/test_local_container.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index af50bad700..c2498d449c 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -75,7 +75,7 @@ def _rmtree(path, image=None, is_studio=False): cmd = ["docker", "run", "--rm"] if is_studio: cmd += ["--network", "sagemaker"] - cmd += ["-v", f"{path}:/delete", image, "rm", "-rf", "/delete"] + cmd += ["-v", f"{path}:/delete", image, "sh", "-c", "find /delete -mindepth 1 -delete"] subprocess.run(cmd, check=True, capture_output=True) # The mount point directory itself may remain — clean it up if os.path.exists(path): diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index cf4413a171..dabdc6cb1b 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -83,7 +83,7 @@ def _rmtree(path, image=None, is_studio=False): cmd = ["docker", "run", "--rm"] if is_studio: cmd += ["--network", "sagemaker"] - cmd += ["-v", f"{path}:/delete", image, "rm", "-rf", "/delete"] + cmd += ["-v", f"{path}:/delete", image, "sh", "-c", "find /delete -mindepth 1 -delete"] subprocess.run(cmd, check=True, capture_output=True) # The mount point directory itself may remain — clean it up if os.path.exists(path): diff --git a/sagemaker-train/tests/unit/train/local/test_local_container.py b/sagemaker-train/tests/unit/train/local/test_local_container.py index 5dd556d995..43cbadacbd 100644 --- a/sagemaker-train/tests/unit/train/local/test_local_container.py +++ b/sagemaker-train/tests/unit/train/local/test_local_container.py @@ -37,7 +37,11 @@ def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mo _rmtree("/tmp/test", IMAGE) mock_run.assert_called_once_with( - ["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "rm", "-rf", "/delete"], + [ + "docker", "run", "--rm", + "-v", "/tmp/test:/delete", IMAGE, + "sh", "-c", "find /delete -mindepth 1 -delete", + ], check=True, capture_output=True, ) @@ -55,7 +59,8 @@ def test_rmtree_studio_adds_network(self, mock_exists, mock_run, mock_rmtree): [ "docker", "run", "--rm", "--network", "sagemaker", - "-v", "/tmp/test:/delete", IMAGE, "rm", "-rf", "/delete", + "-v", "/tmp/test:/delete", IMAGE, + "sh", "-c", "find /delete -mindepth 1 -delete", ], check=True, capture_output=True, From 27960034dcd0ea9178b76c5191135fcd0e85d6a9 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 13 Mar 2026 17:07:01 -0700 Subject: [PATCH 5/5] Use chmod -R 777 via Docker --- .../modules/local_core/local_container.py | 8 ++--- .../sagemaker/train/local/local_container.py | 8 ++--- .../unit/train/local/test_local_container.py | 36 +++++-------------- 3 files changed, 14 insertions(+), 38 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py index c2498d449c..3ca9b82fa6 100644 --- a/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py +++ b/sagemaker-core/src/sagemaker/core/modules/local_core/local_container.py @@ -63,7 +63,7 @@ def _rmtree(path, image=None, is_studio=False): shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. - # Use a Docker container to remove them since os.chmod will also fail. + # Use docker to chmod as root, then retry shutil.rmtree. if image is None: logger.warning( "Failed to clean up root-owned files in %s. " @@ -75,11 +75,9 @@ def _rmtree(path, image=None, is_studio=False): cmd = ["docker", "run", "--rm"] if is_studio: cmd += ["--network", "sagemaker"] - cmd += ["-v", f"{path}:/delete", image, "sh", "-c", "find /delete -mindepth 1 -delete"] + cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"] subprocess.run(cmd, check=True, capture_output=True) - # The mount point directory itself may remain — clean it up - if os.path.exists(path): - shutil.rmtree(path, ignore_errors=True) + shutil.rmtree(path) except Exception: logger.warning( "Failed to clean up root-owned files in %s. " diff --git a/sagemaker-train/src/sagemaker/train/local/local_container.py b/sagemaker-train/src/sagemaker/train/local/local_container.py index dabdc6cb1b..329e21237b 100644 --- a/sagemaker-train/src/sagemaker/train/local/local_container.py +++ b/sagemaker-train/src/sagemaker/train/local/local_container.py @@ -71,7 +71,7 @@ def _rmtree(path, image=None, is_studio=False): shutil.rmtree(path) except PermissionError: # Files created by Docker containers are owned by root. - # Use a Docker container to remove them since os.chmod will also fail. + # Use docker to chmod as root, then retry shutil.rmtree. if image is None: logger.warning( "Failed to clean up root-owned files in %s. " @@ -83,11 +83,9 @@ def _rmtree(path, image=None, is_studio=False): cmd = ["docker", "run", "--rm"] if is_studio: cmd += ["--network", "sagemaker"] - cmd += ["-v", f"{path}:/delete", image, "sh", "-c", "find /delete -mindepth 1 -delete"] + cmd += ["-v", f"{path}:/delete", image, "chmod", "-R", "777", "/delete"] subprocess.run(cmd, check=True, capture_output=True) - # The mount point directory itself may remain — clean it up - if os.path.exists(path): - shutil.rmtree(path, ignore_errors=True) + shutil.rmtree(path) except Exception: logger.warning( "Failed to clean up root-owned files in %s. " diff --git a/sagemaker-train/tests/unit/train/local/test_local_container.py b/sagemaker-train/tests/unit/train/local/test_local_container.py index 43cbadacbd..ff3abd53eb 100644 --- a/sagemaker-train/tests/unit/train/local/test_local_container.py +++ b/sagemaker-train/tests/unit/train/local/test_local_container.py @@ -29,29 +29,24 @@ def test_rmtree_success(self, mock_rmtree): @patch("sagemaker.train.local.local_container.shutil.rmtree") @patch("sagemaker.train.local.local_container.subprocess.run") - @patch("sagemaker.train.local.local_container.os.path.exists", return_value=False) - def test_rmtree_permission_error_docker_fallback(self, mock_exists, mock_run, mock_rmtree): - """PermissionError triggers docker fallback using the training image.""" - mock_rmtree.side_effect = PermissionError("Permission denied") + def test_rmtree_permission_error_docker_chmod_fallback(self, mock_run, mock_rmtree): + """PermissionError triggers docker chmod then retry.""" + mock_rmtree.side_effect = [PermissionError("Permission denied"), None] _rmtree("/tmp/test", IMAGE) mock_run.assert_called_once_with( - [ - "docker", "run", "--rm", - "-v", "/tmp/test:/delete", IMAGE, - "sh", "-c", "find /delete -mindepth 1 -delete", - ], + ["docker", "run", "--rm", "-v", "/tmp/test:/delete", IMAGE, "chmod", "-R", "777", "/delete"], check=True, capture_output=True, ) + assert mock_rmtree.call_count == 2 @patch("sagemaker.train.local.local_container.shutil.rmtree") @patch("sagemaker.train.local.local_container.subprocess.run") - @patch("sagemaker.train.local.local_container.os.path.exists", return_value=False) - def test_rmtree_studio_adds_network(self, mock_exists, mock_run, mock_rmtree): + def test_rmtree_studio_adds_network(self, mock_run, mock_rmtree): """In Studio, docker run includes --network sagemaker.""" - mock_rmtree.side_effect = PermissionError("Permission denied") + mock_rmtree.side_effect = [PermissionError("Permission denied"), None] _rmtree("/tmp/test", IMAGE, is_studio=True) @@ -60,27 +55,12 @@ def test_rmtree_studio_adds_network(self, mock_exists, mock_run, mock_rmtree): "docker", "run", "--rm", "--network", "sagemaker", "-v", "/tmp/test:/delete", IMAGE, - "sh", "-c", "find /delete -mindepth 1 -delete", + "chmod", "-R", "777", "/delete", ], check=True, capture_output=True, ) - @patch("sagemaker.train.local.local_container.shutil.rmtree") - @patch("sagemaker.train.local.local_container.subprocess.run") - @patch("sagemaker.train.local.local_container.os.path.exists", return_value=True) - def test_rmtree_cleans_up_mount_point(self, mock_exists, mock_run, mock_rmtree): - """After docker cleanup, remaining mount point directory is removed.""" - mock_rmtree.side_effect = [PermissionError("Permission denied"), None] - - _rmtree("/tmp/test", IMAGE) - - assert mock_rmtree.call_count == 2 - mock_rmtree.assert_has_calls([ - call("/tmp/test"), - call("/tmp/test", ignore_errors=True), - ]) - @patch("sagemaker.train.local.local_container.shutil.rmtree") @patch("sagemaker.train.local.local_container.subprocess.run") def test_rmtree_docker_fallback_fails_raises(self, mock_run, mock_rmtree):