diff --git a/.changes/next-release/bugfix-s3-1138.json b/.changes/next-release/bugfix-s3-1138.json new file mode 100644 index 000000000000..e6d7b63d7780 --- /dev/null +++ b/.changes/next-release/bugfix-s3-1138.json @@ -0,0 +1,5 @@ +{ + "type": "bugfix", + "category": "``s3``", + "description": "Skip traversing directories that are fully excluded by ``--exclude``/``--include`` filters during ``s3 sync``/``cp``/``mv``/``rm``. Fixes a long-standing performance issue (#1138) and a non-zero rc bug for excluded unreadable files (#1117)." +} diff --git a/awscli/customizations/s3/filegenerator.py b/awscli/customizations/s3/filegenerator.py index 088c0b7381eb..f7c37ec7b733 100644 --- a/awscli/customizations/s3/filegenerator.py +++ b/awscli/customizations/s3/filegenerator.py @@ -19,6 +19,7 @@ from dateutil.tz import tzlocal from awscli.compat import queue +from awscli.customizations.s3.fileinfo import FileInfo from awscli.customizations.s3.utils import ( EPOCH_TIME, BucketLister, @@ -143,6 +144,8 @@ def __init__( page_size=None, result_queue=None, request_parameters=None, + file_filter=None, + is_dst_walker=False, ): self._client = client self.operation_name = operation_name @@ -154,6 +157,12 @@ def __init__( self.request_parameters = {} if request_parameters is not None: self.request_parameters = request_parameters + self.file_filter = file_filter + # When True, this generator is the reverse/destination walker for + # ``sync``. Filter pruning consults ``dst_patterns`` instead of + # ``patterns`` because the paths it sees are rooted at the + # destination, not the source. + self.is_dst_walker = is_dst_walker def call(self, files): """ @@ -230,6 +239,18 @@ def list_files(self, path, dir_op): for name in names: file_path = join(path, name) if isdir(file_path): + # If the user's filter chain makes it impossible + # for any descendant to be included, prune the + # whole subtree instead of recursing into it. + if ( + self.file_filter is not None + and self.file_filter.can_skip_directory( + file_path, + 'local', + use_dst_patterns=self.is_dst_walker, + ) + ): + continue # Anything in a directory will have a prefix of # this current directory and will come before the # remaining contents in this directory. This @@ -305,6 +326,44 @@ def should_ignore_file(self, path): path = path[:-1] if os.path.islink(path): return True + # Only run the prefilter / existence-before-filter logic when the + # user actually supplied --include/--exclude patterns. With an + # empty filter, ``Filter.call`` cannot suppress anything anyway, + # so the extra os.path.exists() call here would be pure overhead + # on large unfiltered sync/cp/mv walks. + if self.file_filter is not None: + relevant_patterns = ( + self.file_filter.dst_patterns + if self.is_dst_walker + else self.file_filter.patterns + ) + else: + relevant_patterns = None + if relevant_patterns: + # Validate existence *before* the filter has a chance to + # suppress the warning. A broken symlink or a path that + # disappeared between listdir and stat should still produce + # the standard "File does not exist." warning even when + # --exclude/--include would otherwise mask it. + if not os.path.exists(path): + return self.triggers_warning(path) + # Pre-filter using the user's --include/--exclude rules so the + # walker does not stat / listdir entries that the filter chain + # is going to drop anyway. The directory check must run before + # triggers_warning() because is_readable() calls os.listdir() + # on directories — without this guard, excluded subtrees would + # be listed just to test readability and the prune would be + # defeated. The file branch silences readability warnings for + # files that the filter would drop anyway. + if os.path.isdir(path): + if self.file_filter.can_skip_directory( + path, 'local', use_dst_patterns=self.is_dst_walker + ): + return True + else: + probe = FileInfo(src=path, src_type='local') + if not list(self.file_filter.call([probe])): + return True warning_triggered = self.triggers_warning(path) if warning_triggered: return True diff --git a/awscli/customizations/s3/filters.py b/awscli/customizations/s3/filters.py index 2c03f2a1eb5c..32022ad03f87 100644 --- a/awscli/customizations/s3/filters.py +++ b/awscli/customizations/s3/filters.py @@ -74,6 +74,42 @@ def _get_local_root(source_location, dir_op): return rootdir +def _literal_prefix(pattern): + for i, ch in enumerate(pattern): + if ch in '*?[': + return pattern[:i] + return pattern + + +def _pattern_can_match_under(pattern, target_with_sep): + """Return True if some descendant path under ``target_with_sep`` + could match ``pattern``. False means no descendant can possibly + match, so the caller may safely skip the directory. False is + never returned when a match is actually possible. + """ + lit = _literal_prefix(pattern) + common = min(len(lit), len(target_with_sep)) + if lit[:common] != target_with_sep[:common]: + return False + if len(target_with_sep) <= len(lit): + return True + if lit == pattern: + return False + return True + + +def _pattern_matches_all_under(pattern, target_with_sep): + """Return True only when every descendant path under + ``target_with_sep`` is guaranteed to match ``pattern``. False + means we cannot prove this — the caller must not assume a match. + """ + lit = _literal_prefix(pattern) + if not target_with_sep.startswith(lit): + return False + rest = pattern[len(lit):] + return bool(rest) and all(c == '*' for c in rest) + + class Filter: """ This is a universal exclude/include filter. @@ -161,3 +197,48 @@ def _match_pattern(self, pattern, file_info): path_pattern, ) return file_status + + def can_skip_directory( + self, dir_path, src_type='local', use_dst_patterns=False + ): + """Return True only when the filter chain cannot include any + descendant of ``dir_path``, so the walker may skip listing it. + False means the directory must be traversed normally — either + a descendant could match, or we cannot prove otherwise. + + ``use_dst_patterns``: when True, evaluate against ``dst_patterns`` + (rooted at the destination) instead of ``patterns`` (rooted at the + source). The reverse-direction file generator used during + ``s3 sync s3://bucket/ ./local`` walks the local destination, so + it must consult the destination-rooted patterns to prune correctly. + """ + patterns = self.dst_patterns if use_dst_patterns else self.patterns + if not patterns: + return False + sep = os.sep if src_type == 'local' else '/' + target = dir_path.rstrip(sep) + sep + if src_type == 'local': + # ``fnmatch.fnmatch`` (used by ``_match_pattern``) normalizes + # case via ``os.path.normcase`` on Windows, so this prefix + # comparison must do the same — otherwise a case-different + # subtree on Windows would be incorrectly pruned. On POSIX + # ``normcase`` is identity. + target = os.path.normcase(target) + normalized = [] + for pattern_type, pat in patterns: + if src_type == 'local': + pat = os.path.normcase(pat.replace('/', os.sep)) + else: + pat = pat.replace(os.sep, '/') + normalized.append((pattern_type, pat)) + for pattern_type, pat in normalized: + if pattern_type == 'include' and _pattern_can_match_under( + pat, target + ): + return False + for pattern_type, pat in normalized: + if pattern_type == 'exclude' and _pattern_matches_all_under( + pat, target + ): + return True + return False diff --git a/awscli/customizations/s3/subcommands.py b/awscli/customizations/s3/subcommands.py index 03630f872cee..7c483bb594ab 100644 --- a/awscli/customizations/s3/subcommands.py +++ b/awscli/customizations/s3/subcommands.py @@ -1470,12 +1470,14 @@ def run(self): result_queue = queue.Queue() operation_name = cmd_translation[paths_type] + file_filter = create_filter(self.parameters) fgen_kwargs = { 'client': self._source_client, 'operation_name': operation_name, 'follow_symlinks': self.parameters['follow_symlinks'], 'page_size': self.parameters['page_size'], 'result_queue': result_queue, + 'file_filter': file_filter, } rgen_kwargs = { 'client': self._client, @@ -1483,6 +1485,8 @@ def run(self): 'follow_symlinks': self.parameters['follow_symlinks'], 'page_size': self.parameters['page_size'], 'result_queue': result_queue, + 'file_filter': file_filter, + 'is_dst_walker': True, } fgen_request_parameters = ( diff --git a/tests/functional/s3/test_filter_traverse.py b/tests/functional/s3/test_filter_traverse.py new file mode 100644 index 000000000000..f5e848f884cd --- /dev/null +++ b/tests/functional/s3/test_filter_traverse.py @@ -0,0 +1,337 @@ +# Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +"""Functional tests for ``--exclude``/``--include`` filter behavior. + +Covers issues #1117 (excluded unreadable files raising rc=2) and #1138 +(traversal cost on excluded subtrees), and exercises filter semantics +across the four S3 transfer commands (sync, cp, mv, rm). +""" +import os + +from awscli.testutils import mock, skip_if_windows +from tests.functional.s3 import BaseS3TransferCommandTest + + +class _BaseFilterTraverse(BaseS3TransferCommandTest): + """Shared helpers for the four command-specific test classes.""" + + def _put_object_response(self): + return {'ETag': '"c8afdb36c52cf4727836669019e69222"'} + + def _list_dest(self): + """Mock responses for any list-the-destination calls a command makes + before transferring. Subclasses override based on command semantics. + + - ``s3 sync`` lists the destination bucket once. + - ``s3 cp``/``s3 mv`` upload directly without listing the destination. + """ + return [] + + def _operation_keys(self, op_name): + keys = [] + for op in self.operations_called: + if op[0].name == op_name: + keys.append(op[1]['Key'].replace('\\', '/')) + return sorted(keys) + + def _uploaded_keys(self): + return self._operation_keys('PutObject') + + def _deleted_keys(self): + return self._operation_keys('DeleteObject') + + +class _LocalToS3Mixin: + """For sync/cp/mv: source is the local rootdir, dest is s3://bucket/.""" + + def _run(self, args, expected_rc=0): + cmdline = '%s %s s3://bucket/ %s' % ( + self.prefix, + self.files.rootdir, + args, + ) + return self.run_cmd(cmdline, expected_rc=expected_rc) + + +class _UploadFilterCases: + """Test methods reused by sync/cp/mv (all local→s3 with traversal).""" + + @skip_if_windows('POSIX-only file mode test.') + def test_unreadable_file_in_excluded_dir_does_not_fail(self): + """#1117: 0o000 file in an excluded directory must not affect rc.""" + bad_path = self.files.create_file( + os.path.join('excluded', 'bad'), 'data' + ) + os.chmod(bad_path, 0o000) + try: + self.parsed_responses = self._list_dest() + _, stderr, _ = self._run( + "--only-show-errors --exclude excluded/*", expected_rc=0 + ) + self.assertNotIn('excluded/bad', stderr) + self.assertNotIn('not readable', stderr) + finally: + os.chmod(bad_path, 0o600) + + @skip_if_windows('POSIX-only directory mode test.') + def test_unreadable_directory_inside_excluded_path_not_descended(self): + """#1138: listdir must not be called on excluded subtrees.""" + self.files.create_file( + os.path.join('excluded', 'sub', 'inner.txt'), 'data' + ) + sub_path = os.path.join(self.files.rootdir, 'excluded', 'sub') + os.chmod(sub_path, 0o000) + try: + self.parsed_responses = self._list_dest() + _, stderr, _ = self._run( + "--only-show-errors --exclude excluded/*", expected_rc=0 + ) + self.assertNotIn('excluded/sub', stderr) + self.assertNotIn('not readable', stderr) + finally: + os.chmod(sub_path, 0o755) + + def test_filter_excludes_everything_no_uploads(self): + """``--exclude *`` alone yields rc=0 with no uploads.""" + self.files.create_file('a.txt', 'data') + self.files.create_file(os.path.join('sub', 'b.txt'), 'data') + self.parsed_responses = self._list_dest() + self._run("--exclude *", expected_rc=0) + self.assertEqual(self._uploaded_keys(), []) + + def test_empty_source_directory_no_uploads(self): + """Sync of an empty directory yields rc=0 with no uploads.""" + self.parsed_responses = self._list_dest() + self._run("", expected_rc=0) + self.assertEqual(self._uploaded_keys(), []) + + def test_multiple_includes_combine_with_exclude_star(self): + """``--exclude * --include *.txt --include *.md`` keeps both kinds.""" + self.files.create_file('a.txt', 'data') + self.files.create_file('b.md', 'data') + self.files.create_file('c.log', 'data') + self.files.create_file('d.bin', 'data') + self.parsed_responses = self._list_dest() + [ + self._put_object_response(), + self._put_object_response(), + ] + self._run( + "--exclude * --include *.txt --include *.md", expected_rc=0 + ) + self.assertEqual(self._uploaded_keys(), ['a.txt', 'b.md']) + + def test_deep_include_with_exclude_star_uploads_only_target(self): + """``--exclude * --include a/b/c.txt`` uploads exactly c.txt.""" + self.files.create_file(os.path.join('a', 'b', 'c.txt'), 'data') + self.files.create_file(os.path.join('a', 'b', 'x.txt'), 'data') + self.files.create_file(os.path.join('a', 'y.txt'), 'data') + self.files.create_file('z.txt', 'data') + self.parsed_responses = self._list_dest() + [ + self._put_object_response() + ] + self._run("--exclude * --include a/b/c.txt", expected_rc=0) + self.assertEqual(self._uploaded_keys(), ['a/b/c.txt']) + + def test_include_overrides_exclude_in_same_subdirectory(self): + """``--exclude cache/* --include cache/keep.txt`` keeps the target.""" + self.files.create_file(os.path.join('cache', 'keep.txt'), 'data') + self.files.create_file(os.path.join('cache', 'drop.txt'), 'data') + self.parsed_responses = self._list_dest() + [ + self._put_object_response() + ] + self._run( + "--exclude cache/* --include cache/keep.txt", expected_rc=0 + ) + self.assertEqual(self._uploaded_keys(), ['cache/keep.txt']) + + def test_glob_include_under_exclude_star_finds_all_matches(self): + """Regression case from PR aws/aws-cli#5425 + ([discussion_r883991435]). + + ``--exclude '*' --include '*.py'`` must traverse subdirectories + to locate ``.py`` files at every depth. + """ + self.files.create_file('top.py', 'data') + self.files.create_file(os.path.join('directory', 'mid.py'), 'data') + self.files.create_file( + os.path.join('directory', 'another-dir', 'deep.py'), 'data' + ) + self.files.create_file( + os.path.join('directory', 'note.txt'), 'data' + ) + self.parsed_responses = self._list_dest() + [ + self._put_object_response() for _ in range(3) + ] + self._run("--exclude * --include *.py", expected_rc=0) + self.assertEqual( + self._uploaded_keys(), + [ + 'directory/another-dir/deep.py', + 'directory/mid.py', + 'top.py', + ], + ) + + @skip_if_windows('Uses POSIX path realpath comparison.') + def test_excluded_subtree_is_not_listdired(self): + """Direct prune evidence for #1138. + + Spies on ``os.listdir`` inside the file generator and asserts + the excluded subtree is never listed. Side-effect-only tests + cannot distinguish 'filter dropped late' from 'walker pruned + early'; this one can. + """ + self.files.create_file( + os.path.join('excluded', 'a.txt'), 'data' + ) + self.files.create_file(os.path.join('kept', 'b.txt'), 'data') + excluded_path = os.path.realpath( + os.path.join(self.files.rootdir, 'excluded') + ) + real_listdir = os.listdir + listed_paths = [] + + def spy(path): + listed_paths.append(os.path.realpath(path)) + return real_listdir(path) + + self.parsed_responses = self._list_dest() + [ + self._put_object_response() + ] + with mock.patch( + 'awscli.customizations.s3.filegenerator.os.listdir', + side_effect=spy, + ): + self._run("--exclude excluded/*", expected_rc=0) + + self.assertNotIn(excluded_path, listed_paths) + self.assertEqual(self._uploaded_keys(), ['kept/b.txt']) + + @skip_if_windows('Symlink semantics differ on Windows.') + def test_broken_symlink_warning_survives_specific_exclude(self): + """A broken symlink encountered during walking must still produce + a 'does not exist' warning. The filter is for selecting transfers, + not for swallowing fs-level validation errors. + """ + self.files.create_file('keep.txt', 'data') + target = os.path.join(self.files.rootdir, 'no_such_target') + link = os.path.join(self.files.rootdir, 'broken_link') + os.symlink(target, link) + self.parsed_responses = self._list_dest() + [ + self._put_object_response() + ] + _, stderr, _ = self._run("--exclude broken_link", expected_rc=2) + self.assertIn('does not exist', stderr) + + +class TestSyncFilterTraverse( + _UploadFilterCases, _LocalToS3Mixin, _BaseFilterTraverse +): + prefix = 's3 sync ' + + def _list_dest(self): + # sync compares destination by listing it once. + return [self.list_objects_response([])] + + +class TestCpFilterTraverse( + _UploadFilterCases, _LocalToS3Mixin, _BaseFilterTraverse +): + prefix = 's3 cp --recursive ' + + +class TestMvFilterTraverse( + _UploadFilterCases, _LocalToS3Mixin, _BaseFilterTraverse +): + prefix = 's3 mv --recursive ' + + +class TestSyncDownloadFilterTraverse(_BaseFilterTraverse): + """``s3 sync s3://b/ local/`` with filters applied to S3 keys.""" + + prefix = 's3 sync ' + + def _run(self, args, expected_rc=0): + cmdline = '%s s3://bucket/ %s %s' % ( + self.prefix, + self.files.rootdir, + args, + ) + return self.run_cmd(cmdline, expected_rc=expected_rc) + + def _downloaded_keys(self): + return self._operation_keys('GetObject') + + def test_excluded_keys_not_downloaded(self): + self.parsed_responses = [ + self.list_objects_response( + ['keep/a.txt', 'logs/x.log', 'logs/y.log', 'keep/b.txt'] + ), + self.get_object_response(), + self.get_object_response(), + ] + self._run("--exclude logs/*", expected_rc=0) + self.assertEqual( + self._downloaded_keys(), ['keep/a.txt', 'keep/b.txt'] + ) + + def test_filter_excludes_everything_no_downloads(self): + self.parsed_responses = [ + self.list_objects_response(['a.txt', 'sub/b.txt']), + ] + self._run("--exclude *", expected_rc=0) + self.assertEqual(self._downloaded_keys(), []) + + @skip_if_windows('POSIX-only directory mode test.') + def test_excluded_dest_subtree_with_unreadable_dir_does_not_warn(self): + """sync s3://b/ ./dst --exclude excluded/*: a 0o000 directory + sitting inside the excluded destination subtree must not be + listdired (and so must not produce a 'not readable' warning). + """ + self.files.create_file( + os.path.join('excluded', 'sub', 'inner.txt'), 'data' + ) + sub_path = os.path.join(self.files.rootdir, 'excluded', 'sub') + os.chmod(sub_path, 0o000) + try: + self.parsed_responses = [self.list_objects_response([])] + _, stderr, _ = self._run( + "--only-show-errors --exclude excluded/*", expected_rc=0 + ) + self.assertNotIn('excluded/sub', stderr) + self.assertNotIn('not readable', stderr) + finally: + os.chmod(sub_path, 0o755) + + +class TestRmFilterTraverse(_BaseFilterTraverse): + """``s3 rm`` works on S3 objects only — no local traversal.""" + + prefix = 's3 rm --recursive ' + + def _run(self, args, expected_rc=0): + cmdline = '%s s3://bucket/ %s' % (self.prefix, args) + return self.run_cmd(cmdline, expected_rc=expected_rc) + + def test_excluded_keys_not_deleted(self): + self.parsed_responses = [ + self.list_objects_response( + ['keep/a.txt', 'logs/x.log', 'logs/y.log', 'keep/b.txt'] + ), + self.empty_response(), + self.empty_response(), + ] + self._run("--exclude logs/*", expected_rc=0) + self.assertEqual( + self._deleted_keys(), ['keep/a.txt', 'keep/b.txt'] + ) diff --git a/tests/unit/customizations/s3/test_filegenerator.py b/tests/unit/customizations/s3/test_filegenerator.py index 1b962ba8b5da..dc96543df26a 100644 --- a/tests/unit/customizations/s3/test_filegenerator.py +++ b/tests/unit/customizations/s3/test_filegenerator.py @@ -239,6 +239,28 @@ def test_no_skip_symlink_dir(self): self.assertFalse(filegenerator.should_ignore_file(sym_path)) self.assertFalse(filegenerator.should_ignore_file(path)) + def test_empty_filter_does_not_add_extra_exists_check(self): + """No --exclude/--include → should_ignore_file must not call + os.path.exists() before triggers_warning(). The pre-filter path + is pure overhead when the filter has no patterns and would + regress unfiltered sync/cp/mv walks on large trees. + """ + from awscli.customizations.s3.filters import Filter + + path = self.files.create_file('foo.txt', contents='data') + empty_filter = Filter({}, None, None) + gen = FileGenerator( + self.client, '', True, file_filter=empty_filter + ) + with mock.patch( + 'awscli.customizations.s3.filegenerator.os.path.exists', + wraps=os.path.exists, + ) as mock_exists: + gen.should_ignore_file(path) + # triggers_warning() makes one os.path.exists() call. Our prefilter + # must not add a second one for the empty-filter case. + self.assertEqual(mock_exists.call_count, 1) + class TestThrowsWarning(unittest.TestCase): def setUp(self): diff --git a/tests/unit/customizations/s3/test_filters.py b/tests/unit/customizations/s3/test_filters.py index 77024589e1b5..064d4d4e7dc5 100644 --- a/tests/unit/customizations/s3/test_filters.py +++ b/tests/unit/customizations/s3/test_filters.py @@ -14,8 +14,14 @@ import platform from awscli.customizations.s3.filegenerator import FileStat -from awscli.customizations.s3.filters import Filter, create_filter -from awscli.testutils import unittest +from awscli.customizations.s3.filters import ( + Filter, + _literal_prefix, + _pattern_can_match_under, + _pattern_matches_all_under, + create_filter, +) +from awscli.testutils import mock, unittest def platform_path(filepath): @@ -245,5 +251,240 @@ def test_create_filter_s3_to_s3(self): self.assertFalse('.txt' in filtered_file.src) +class LiteralPrefixTest(unittest.TestCase): + def test_no_metacharacters_returns_full_pattern(self): + self.assertEqual(_literal_prefix('foo/bar.txt'), 'foo/bar.txt') + + def test_stops_at_star(self): + self.assertEqual(_literal_prefix('foo/*'), 'foo/') + + def test_stops_at_question_mark(self): + self.assertEqual(_literal_prefix('foo/?bar'), 'foo/') + + def test_stops_at_bracket(self): + self.assertEqual(_literal_prefix('foo/[abc]'), 'foo/') + + def test_starts_with_metacharacter(self): + self.assertEqual(_literal_prefix('*.py'), '') + + +class PatternCanMatchUnderTest(unittest.TestCase): + def test_glob_metachar_pattern_can_match_anywhere(self): + self.assertTrue(_pattern_can_match_under('*.py', 'root/foo/')) + + def test_pattern_with_target_as_prefix_can_match(self): + self.assertTrue( + _pattern_can_match_under('root/excluded/*', 'root/excluded/') + ) + + def test_pattern_more_specific_than_target_still_matches(self): + self.assertTrue( + _pattern_can_match_under( + 'root/excluded/included/*', 'root/excluded/' + ) + ) + + def test_diverging_literals_cannot_match(self): + self.assertFalse( + _pattern_can_match_under('root/excluded/*', 'root/other/') + ) + + def test_literal_only_pattern_shorter_than_target_cannot_match(self): + self.assertFalse( + _pattern_can_match_under('root/excluded', 'root/excluded/') + ) + + def test_literal_only_pattern_under_target_can_match(self): + self.assertTrue(_pattern_can_match_under('root/foo', 'root/')) + + +class PatternMatchesAllUnderTest(unittest.TestCase): + def test_star_only_matches_everything(self): + self.assertTrue(_pattern_matches_all_under('*', 'anything/')) + + def test_target_star_matches_all_descendants(self): + self.assertTrue( + _pattern_matches_all_under('root/excluded/*', 'root/excluded/') + ) + + def test_double_star_treated_as_star(self): + self.assertTrue( + _pattern_matches_all_under('root/excluded/**', 'root/excluded/') + ) + + def test_higher_pattern_with_wildcard_covers_descendants(self): + self.assertTrue(_pattern_matches_all_under('root/*', 'root/foo/')) + + def test_partial_pattern_does_not_cover(self): + self.assertFalse(_pattern_matches_all_under('root/*.tmp', 'root/foo/')) + + def test_literal_only_pattern_does_not_cover(self): + self.assertFalse(_pattern_matches_all_under('root/foo', 'root/foo/')) + + def test_diverging_literal_does_not_cover(self): + self.assertFalse( + _pattern_matches_all_under('root/excluded/*', 'root/other/') + ) + + +class CanSkipDirectoryTest(unittest.TestCase): + """Verifies the §2.5 case table from the design proposal.""" + + def _make_filter(self, raw_filters, rootdir=None): + if rootdir is None: + rootdir = platform_path('/root') + normalized = [(action.lstrip('-'), pat) for action, pat in raw_filters] + return Filter(normalized, rootdir, rootdir) + + def _path_under(self, *parts, rootdir=None): + if rootdir is None: + rootdir = platform_path('/root') + return os.path.join(rootdir, *parts) + + def test_no_filters_never_skips(self): + f = Filter({}, None, None) + self.assertFalse(f.can_skip_directory(self._path_under('anything'))) + + def test_simple_exclude_skips_matching_directory(self): + f = self._make_filter([('--exclude', 'src/*')]) + self.assertTrue(f.can_skip_directory(self._path_under('src'))) + + def test_exclude_star_skips_all_descendants(self): + f = self._make_filter([('--exclude', '*')]) + self.assertTrue(f.can_skip_directory(self._path_under('foo', 'bar'))) + + def test_kyleknap_regression_does_not_skip(self): + """The case that killed PR aws/aws-cli#5425 must not regress.""" + f = self._make_filter( + [('--exclude', '*'), ('--include', '*.py')] + ) + self.assertFalse(f.can_skip_directory(self._path_under('foo'))) + self.assertFalse( + f.can_skip_directory(self._path_under('foo', 'bar')) + ) + + def test_include_under_excluded_subtree_blocks_skip(self): + f = self._make_filter( + [ + ('--exclude', 'sub/*'), + ('--include', 'sub/included/*'), + ] + ) + self.assertFalse(f.can_skip_directory(self._path_under('sub'))) + + def test_sibling_subtree_under_exclude_is_skipped(self): + f = self._make_filter( + [ + ('--exclude', 'src/*'), + ('--include', 'src/included/*'), + ] + ) + self.assertTrue( + f.can_skip_directory(self._path_under('src', 'other')) + ) + + def test_included_subtree_traversed(self): + f = self._make_filter( + [ + ('--exclude', 'src/*'), + ('--include', 'src/included/*'), + ] + ) + self.assertFalse( + f.can_skip_directory(self._path_under('src', 'included')) + ) + + def test_partial_pattern_does_not_skip(self): + f = self._make_filter([('--exclude', '*.tmp')]) + self.assertFalse(f.can_skip_directory(self._path_under('foo'))) + + def test_literal_pattern_does_not_skip_descendants(self): + f = self._make_filter([('--exclude', 'foo')]) + self.assertFalse(f.can_skip_directory(self._path_under('foo'))) + + def test_s3_separator_exclude(self): + rootdir = 'bucket' + f = Filter([('exclude', 'logs/*')], rootdir, rootdir) + self.assertTrue( + f.can_skip_directory('bucket/logs', src_type='s3') + ) + self.assertFalse( + f.can_skip_directory('bucket/keep', src_type='s3') + ) + + def test_three_filter_alternating_stack(self): + f = self._make_filter( + [ + ('--exclude', 'excluded/*'), + ('--include', 'excluded/included/*'), + ('--exclude', 'included/excluded/*'), + ] + ) + self.assertFalse( + f.can_skip_directory(self._path_under('excluded')) + ) + self.assertTrue( + f.can_skip_directory( + self._path_under('included', 'excluded') + ) + ) + self.assertFalse( + f.can_skip_directory(self._path_under('included')) + ) + + def test_can_skip_directory_uses_dst_patterns_when_requested(self): + """The reverse walker (sync s3://b/ ./dst) must be able to prune + destination subtrees by consulting ``dst_patterns`` instead of + the source-rooted ``patterns``. + """ + src_root = platform_path('/src') + dst_root = platform_path('/dst') + f = Filter([('exclude', 'excluded/*')], src_root, dst_root) + self.assertTrue( + f.can_skip_directory( + os.path.join(dst_root, 'excluded'), + 'local', + use_dst_patterns=True, + ) + ) + self.assertFalse( + f.can_skip_directory( + os.path.join(dst_root, 'excluded'), + 'local', + use_dst_patterns=False, + ) + ) + + def test_can_skip_directory_dst_patterns_empty_returns_false(self): + f = Filter({}, None, None) + self.assertFalse( + f.can_skip_directory( + self._path_under('anything'), + 'local', + use_dst_patterns=True, + ) + ) + + @mock.patch('awscli.customizations.s3.filters.os.path.normcase') + def test_can_skip_directory_is_case_insensitive_on_windows( + self, mock_normcase + ): + """fnmatch.fnmatch is case-insensitive on Windows via normcase. + can_skip_directory must apply the same normalization or it will + wrong-prune a case-different subtree that an include pattern + actually covers. + """ + mock_normcase.side_effect = ( + lambda p: p.lower().replace('/', os.sep) + ) + f = self._make_filter( + [ + ('--exclude', '*'), + ('--include', 'SRC/important.txt'), + ] + ) + self.assertFalse(f.can_skip_directory(self._path_under('src'))) + + if __name__ == "__main__": unittest.main()