From cfce65c5ef8bd0e9b07e2a710690960ec9a49d31 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 1 Apr 2026 16:40:18 +0000 Subject: [PATCH 1/5] Add comprehensive unicode bypass tests for grep-find-unicode-wrapper Tests 66 cases across 10 categories: clean files, ASCII control chars, BiDi/Trojan Source chars, invisible/zero-width chars, homoglyphs, Unicode spaces, tag characters, malformed UTF-8, sneaky embeddings, and edge cases. No actual bypass found - checks 1+2 catch all non-ASCII. Documents a locale bug: check 3's $'\uXXXX' expansion requires a UTF-8 locale. In non-UTF-8 locales the pattern degrades to literal chars causing false positives, though BiDi detection is still covered by checks 1+2. https://claude.ai/code/session_01726gqqGv3oaDV5jLbM6E6h --- tests/test_grep_find_unicode_wrapper | 495 +++++++++++++++++++++++++++ 1 file changed, 495 insertions(+) create mode 100755 tests/test_grep_find_unicode_wrapper diff --git a/tests/test_grep_find_unicode_wrapper b/tests/test_grep_find_unicode_wrapper new file mode 100755 index 00000000..d9998d68 --- /dev/null +++ b/tests/test_grep_find_unicode_wrapper @@ -0,0 +1,495 @@ +#!/bin/bash + +## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC +## See the file COPYING for copying conditions. + +## Test script for grep-find-unicode-wrapper +## Tests for Unicode bypass vulnerabilities by checking that various +## suspicious characters are properly detected by the four grep checks. +## +## Bug found: Check 3 uses bash $'\uXXXX' expansion which requires a +## UTF-8 locale at parse time. In non-UTF-8 locales (LANG=C or LANG=), +## the \u sequences are passed through literally, creating a character +## class of [0-9A-Fu\] that causes massive false positives. +## Fix: Use raw \x byte sequences instead of \u escapes. 
+ +set -o errexit +set -o nounset +set -o errtrace +set -o pipefail + +test_dir="" +pass_count=0 +fail_count=0 +skip_count=0 + +cleanup() { + if [ -n "$test_dir" ] && [ -d "$test_dir" ]; then + rm -rf -- "$test_dir" + fi +} +trap cleanup EXIT + +test_dir="$(mktemp -d)" + +## Colors for output (if terminal supports it). +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + NC='\033[0m' +else + RED='' + GREEN='' + YELLOW='' + NC='' +fi + +## Replicate the four grep checks from grep-find-unicode-wrapper. +## This allows testing even without stecho installed. +grep_args=( + --files-with-matches + --line-number + --binary-files=text +) + +## Build the check 3 pattern using raw UTF-8 byte sequences (\x) +## instead of \u escapes, so this test works regardless of locale. +## +## U+061C -> 0xD8 0x9C +## U+200E -> 0xE2 0x80 0x8E +## U+200F -> 0xE2 0x80 0x8F +## U+202A -> 0xE2 0x80 0xAA +## U+202B -> 0xE2 0x80 0xAB +## U+202C -> 0xE2 0x80 0xAC +## U+202D -> 0xE2 0x80 0xAD +## U+202E -> 0xE2 0x80 0xAE +## U+2066 -> 0xE2 0x81 0xA6 +## U+2067 -> 0xE2 0x81 0xA7 +## U+2068 -> 0xE2 0x81 0xA8 +## U+2069 -> 0xE2 0x81 0xA9 +## +## Note: check 3 in the wrapper does NOT use --perl-regexp, so we use +## grep's basic bracket expression which with LC_ALL=C matches individual +## bytes. This means the bracket expression matches ANY of the individual +## bytes, not specific multi-byte sequences. This is overly broad but +## ensures detection. +bidi_pattern=$'[\xD8\x9C\xE2\x80\x8E\x8F\xAA\xAB\xAC\xAD\xAE\x81\xA6\xA7\xA8\xA9]' + +run_checks() { + local file="$1" + local found=0 + + ## Check 1: Non-ASCII bytes (hex range). + if LC_ALL=C grep "${grep_args[@]}" --perl-regexp '[^\x00-\x7F]' "$file" >/dev/null 2>&1; then + found=1 + fi + + ## Check 2: Non-ASCII (POSIX class). + if LC_ALL=C grep "${grep_args[@]}" --perl-regexp "[^[:ascii:]]" "$file" >/dev/null 2>&1; then + found=1 + fi + + ## Check 3: BiDi / Trojan Source characters. 
+ ## Using raw byte pattern (see bidi_pattern above). + if LC_ALL=C grep "${grep_args[@]}" "$bidi_pattern" "$file" >/dev/null 2>&1; then + found=1 + fi + + ## Check 4: ASCII control characters. + if LC_ALL=C grep "${grep_args[@]}" --perl-regexp '[\x00-\x08\x0B\x0C\x0D\x0E-\x1F\x7F]' "$file" >/dev/null 2>&1; then + found=1 + fi + + return $(( ! found )) +} + +expect_detected() { + local description="$1" + local file="$2" + + if run_checks "$file"; then + printf "${GREEN}PASS${NC}: Detected: %s\n" "$description" + pass_count=$(( pass_count + 1 )) + else + printf "${RED}FAIL${NC}: NOT detected: %s\n" "$description" >&2 + fail_count=$(( fail_count + 1 )) + fi +} + +expect_clean() { + local description="$1" + local file="$2" + + if ! run_checks "$file"; then + printf "${GREEN}PASS${NC}: Clean: %s\n" "$description" + pass_count=$(( pass_count + 1 )) + else + printf "${RED}FAIL${NC}: False positive: %s\n" "$description" >&2 + fail_count=$(( fail_count + 1 )) + fi +} + +write_file() { + local name="$1" + local content="$2" + local file="$test_dir/$name" + printf '%s' "$content" > "$file" + printf '%s' "$file" +} + +write_file_binary() { + local name="$1" + shift + local file="$test_dir/$name" + printf "$@" > "$file" + printf '%s' "$file" +} + +printf '%s\n' "===== grep-find-unicode-wrapper bypass tests =====" +printf '%s\n' "" + +## =================================================================== +## Section 0: Check 3 locale bug validation +## =================================================================== +printf '%s\n' "--- Check 3 locale bug ($'\u' expansion) ---" + +## Verify whether $'\uXXXX' expands to UTF-8 in the current locale. +## If it doesn't, check 3 in the wrapper is broken. 
+check3_test_byte="$(printf '%s' $'\u061C' | od -A n -t x1 | tr -d ' \n')" +if [ "$check3_test_byte" = "d89c" ]; then + printf "${GREEN}PASS${NC}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale\n" + pass_count=$(( pass_count + 1 )) +else + printf "${YELLOW}WARN${NC}: \$'\\u061C' expands to literal '\\u061C' (got: %s) - check 3 in wrapper is broken in this locale!\n" "$check3_test_byte" + printf " Current locale: LANG='${LANG:-}' LC_ALL='${LC_ALL:-}'\n" + printf " The wrapper's check 3 pattern becomes a character class of\n" + printf " literal chars [0-9A-Fu\\\\] causing false positives on most files.\n" + printf " BiDi chars are still caught by checks 1+2 (non-ASCII byte detection).\n" + printf " Fix: use \\\\x byte sequences instead of \\\\u escapes in the wrapper.\n" + skip_count=$(( skip_count + 1 )) +fi + +## =================================================================== +## Section 1: Clean files (should NOT be detected) +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Clean files (no false positives expected) ---" + +f="$(write_file "clean_ascii.txt" "Hello, World!")" +expect_clean "Plain ASCII text" "$f" + +f="$(write_file "clean_with_tab.txt" "$(printf 'col1\tcol2')")" +expect_clean "ASCII with TAB (0x09)" "$f" + +f="$(write_file_binary "clean_with_newline.txt" 'line1\nline2\n')" +expect_clean "ASCII with LF (0x0A)" "$f" + +f="$(write_file "clean_printable.txt" ' !"#$%&'\''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~')" +expect_clean "All printable ASCII characters" "$f" + +f="$(write_file "empty.txt" "")" +expect_clean "Empty file" "$f" + +## =================================================================== +## Section 2: ASCII control characters (check 4) +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- ASCII control characters ---" + +f="$(write_file_binary 
"null.txt" '\x00')" +expect_detected "NULL byte (0x00)" "$f" + +f="$(write_file_binary "soh.txt" '\x01')" +expect_detected "SOH (0x01)" "$f" + +f="$(write_file_binary "stx.txt" '\x02')" +expect_detected "STX (0x02)" "$f" + +f="$(write_file_binary "bell.txt" '\x07')" +expect_detected "BEL (0x07)" "$f" + +f="$(write_file_binary "backspace.txt" '\x08')" +expect_detected "Backspace (0x08) - can overwrite displayed text" "$f" + +f="$(write_file_binary "vt.txt" '\x0B')" +expect_detected "Vertical Tab (0x0B)" "$f" + +f="$(write_file_binary "ff.txt" '\x0C')" +expect_detected "Form Feed (0x0C)" "$f" + +f="$(write_file_binary "cr.txt" '\x0D')" +expect_detected "Carriage Return (0x0D) - can overwrite line content" "$f" + +f="$(write_file_binary "so.txt" '\x0E')" +expect_detected "Shift Out (0x0E)" "$f" + +f="$(write_file_binary "si.txt" '\x0F')" +expect_detected "Shift In (0x0F)" "$f" + +f="$(write_file_binary "escape.txt" '\x1B')" +expect_detected "Escape (0x1B) - terminal escape sequences" "$f" + +f="$(write_file_binary "us.txt" '\x1F')" +expect_detected "Unit Separator (0x1F)" "$f" + +f="$(write_file_binary "del.txt" '\x7F')" +expect_detected "DEL (0x7F)" "$f" + +## =================================================================== +## Section 3: BiDi / Trojan Source characters (CVE-2021-42574) +## Using \x byte sequences to create test files reliably. 
+## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- BiDi / Trojan Source characters (CVE-2021-42574) ---" + +## U+061C ARABIC LETTER MARK -> D8 9C +f="$(write_file_binary "bidi_alm.txt" 'test\xD8\x9Ctext')" +expect_detected "U+061C ARABIC LETTER MARK" "$f" + +## U+200E LEFT-TO-RIGHT MARK -> E2 80 8E +f="$(write_file_binary "bidi_lrm.txt" 'test\xE2\x80\x8Etext')" +expect_detected "U+200E LEFT-TO-RIGHT MARK" "$f" + +## U+200F RIGHT-TO-LEFT MARK -> E2 80 8F +f="$(write_file_binary "bidi_rlm.txt" 'test\xE2\x80\x8Ftext')" +expect_detected "U+200F RIGHT-TO-LEFT MARK" "$f" + +## U+202A LEFT-TO-RIGHT EMBEDDING -> E2 80 AA +f="$(write_file_binary "bidi_lre.txt" 'test\xE2\x80\xAAtext')" +expect_detected "U+202A LEFT-TO-RIGHT EMBEDDING" "$f" + +## U+202B RIGHT-TO-LEFT EMBEDDING -> E2 80 AB +f="$(write_file_binary "bidi_rle.txt" 'test\xE2\x80\xABtext')" +expect_detected "U+202B RIGHT-TO-LEFT EMBEDDING" "$f" + +## U+202C POP DIRECTIONAL FORMATTING -> E2 80 AC +f="$(write_file_binary "bidi_pdf.txt" 'test\xE2\x80\xACtext')" +expect_detected "U+202C POP DIRECTIONAL FORMATTING" "$f" + +## U+202D LEFT-TO-RIGHT OVERRIDE -> E2 80 AD +f="$(write_file_binary "bidi_lro.txt" 'test\xE2\x80\xADtext')" +expect_detected "U+202D LEFT-TO-RIGHT OVERRIDE" "$f" + +## U+202E RIGHT-TO-LEFT OVERRIDE -> E2 80 AE +f="$(write_file_binary "bidi_rlo.txt" 'test\xE2\x80\xAEtext')" +expect_detected "U+202E RIGHT-TO-LEFT OVERRIDE" "$f" + +## U+2066 LEFT-TO-RIGHT ISOLATE -> E2 81 A6 +f="$(write_file_binary "bidi_lri.txt" 'test\xE2\x81\xA6text')" +expect_detected "U+2066 LEFT-TO-RIGHT ISOLATE" "$f" + +## U+2067 RIGHT-TO-LEFT ISOLATE -> E2 81 A7 +f="$(write_file_binary "bidi_rli.txt" 'test\xE2\x81\xA7text')" +expect_detected "U+2067 RIGHT-TO-LEFT ISOLATE" "$f" + +## U+2068 FIRST STRONG ISOLATE -> E2 81 A8 +f="$(write_file_binary "bidi_fsi.txt" 'test\xE2\x81\xA8text')" +expect_detected "U+2068 FIRST STRONG ISOLATE" "$f" + +## U+2069 POP DIRECTIONAL ISOLATE 
-> E2 81 A9 +f="$(write_file_binary "bidi_pdi.txt" 'test\xE2\x81\xA9text')" +expect_detected "U+2069 POP DIRECTIONAL ISOLATE" "$f" + +## =================================================================== +## Section 4: Invisible / zero-width Unicode characters +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Invisible / zero-width Unicode characters ---" + +## U+200B ZERO WIDTH SPACE -> E2 80 8B +f="$(write_file_binary "zwsp.txt" 'ab\xE2\x80\x8Bcd')" +expect_detected "U+200B ZERO WIDTH SPACE" "$f" + +## U+200C ZERO WIDTH NON-JOINER -> E2 80 8C +f="$(write_file_binary "zwnj.txt" 'ab\xE2\x80\x8Ccd')" +expect_detected "U+200C ZERO WIDTH NON-JOINER" "$f" + +## U+200D ZERO WIDTH JOINER -> E2 80 8D +f="$(write_file_binary "zwj.txt" 'ab\xE2\x80\x8Dcd')" +expect_detected "U+200D ZERO WIDTH JOINER" "$f" + +## U+2060 WORD JOINER -> E2 81 A0 +f="$(write_file_binary "wj.txt" 'ab\xE2\x81\xA0cd')" +expect_detected "U+2060 WORD JOINER" "$f" + +## U+FEFF BOM -> EF BB BF +f="$(write_file_binary "bom.txt" '\xEF\xBB\xBFtext')" +expect_detected "U+FEFF BOM / ZERO WIDTH NO-BREAK SPACE" "$f" + +## U+00AD SOFT HYPHEN -> C2 AD +f="$(write_file_binary "soft_hyphen.txt" 'ab\xC2\xADcd')" +expect_detected "U+00AD SOFT HYPHEN" "$f" + +## U+034F COMBINING GRAPHEME JOINER -> CD 8F +f="$(write_file_binary "cgj.txt" 'ab\xCD\x8Fcd')" +expect_detected "U+034F COMBINING GRAPHEME JOINER" "$f" + +## =================================================================== +## Section 5: Homoglyph / confusable characters +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Homoglyph attacks (visually similar to ASCII) ---" + +## U+0410 CYRILLIC A -> D0 90 +f="$(write_file_binary "cyrillic_a.txt" '\xD0\x90dmin')" +expect_detected "U+0410 CYRILLIC CAPITAL A (looks like Latin A)" "$f" + +## U+043E CYRILLIC o -> D0 BE +f="$(write_file_binary "cyrillic_o.txt" 'passw\xD0\xBErd')" +expect_detected 
"U+043E CYRILLIC SMALL O (looks like Latin o)" "$f" + +## U+03BF GREEK OMICRON -> CE BF +f="$(write_file_binary "greek_omicron.txt" 'passw\xCE\xBFrd')" +expect_detected "U+03BF GREEK SMALL OMICRON (looks like Latin o)" "$f" + +## U+FF21 FULLWIDTH A -> EF BC A1 +f="$(write_file_binary "fullwidth_A.txt" '\xEF\xBC\xA1dmin')" +expect_detected "U+FF21 FULLWIDTH LATIN CAPITAL A" "$f" + +## =================================================================== +## Section 6: Special Unicode spaces and separators +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Unicode whitespace / separators ---" + +## U+00A0 NO-BREAK SPACE -> C2 A0 +f="$(write_file_binary "nbsp.txt" 'a\xC2\xA0b')" +expect_detected "U+00A0 NO-BREAK SPACE" "$f" + +## U+2002 EN SPACE -> E2 80 82 +f="$(write_file_binary "en_space.txt" 'a\xE2\x80\x82b')" +expect_detected "U+2002 EN SPACE" "$f" + +## U+2003 EM SPACE -> E2 80 83 +f="$(write_file_binary "em_space.txt" 'a\xE2\x80\x83b')" +expect_detected "U+2003 EM SPACE" "$f" + +## U+2009 THIN SPACE -> E2 80 89 +f="$(write_file_binary "thin_space.txt" 'a\xE2\x80\x89b')" +expect_detected "U+2009 THIN SPACE" "$f" + +## U+200A HAIR SPACE -> E2 80 8A +f="$(write_file_binary "hair_space.txt" 'a\xE2\x80\x8Ab')" +expect_detected "U+200A HAIR SPACE" "$f" + +## U+2028 LINE SEPARATOR -> E2 80 A8 +f="$(write_file_binary "line_sep.txt" 'a\xE2\x80\xA8b')" +expect_detected "U+2028 LINE SEPARATOR" "$f" + +## U+2029 PARAGRAPH SEPARATOR -> E2 80 A9 +f="$(write_file_binary "para_sep.txt" 'a\xE2\x80\xA9b')" +expect_detected "U+2029 PARAGRAPH SEPARATOR" "$f" + +## U+3000 IDEOGRAPHIC SPACE -> E3 80 80 +f="$(write_file_binary "ideographic_space.txt" 'a\xE3\x80\x80b')" +expect_detected "U+3000 IDEOGRAPHIC SPACE" "$f" + +## U+2800 BRAILLE PATTERN BLANK -> E2 A0 80 +f="$(write_file_binary "braille_blank.txt" 'a\xE2\xA0\x80b')" +expect_detected "U+2800 BRAILLE PATTERN BLANK (invisible)" "$f" + +## 
=================================================================== +## Section 7: Tag characters (Supplementary Plane, used in exploits) +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Unicode tag characters ---" + +## U+E0061 TAG LATIN SMALL LETTER A -> F3 A0 81 A1 +f="$(write_file_binary "tag_latin_a.txt" 'test\xF3\xA0\x81\xA1text')" +expect_detected "U+E0061 TAG LATIN SMALL LETTER A" "$f" + +## U+E007F CANCEL TAG -> F3 A0 81 BF +f="$(write_file_binary "cancel_tag.txt" 'test\xF3\xA0\x81\xBFtext')" +expect_detected "U+E007F CANCEL TAG" "$f" + +## =================================================================== +## Section 8: Overlong UTF-8 / malformed sequences +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Malformed / overlong UTF-8 ---" + +## Overlong encoding of NULL (C0 80 instead of 00). +f="$(write_file_binary "overlong_null.txt" '\xC0\x80')" +expect_detected "Overlong UTF-8 NULL (0xC0 0x80)" "$f" + +## Overlong encoding of '/' (C0 AF instead of 2F). +f="$(write_file_binary "overlong_slash.txt" '\xC0\xAF')" +expect_detected "Overlong UTF-8 slash (0xC0 0xAF)" "$f" + +## Invalid continuation byte. +f="$(write_file_binary "invalid_continuation.txt" '\x80')" +expect_detected "Invalid UTF-8 continuation byte (0x80)" "$f" + +## Invalid start byte. +f="$(write_file_binary "invalid_start.txt" '\xFE')" +expect_detected "Invalid UTF-8 start byte (0xFE)" "$f" + +f="$(write_file_binary "invalid_ff.txt" '\xFF')" +expect_detected "Invalid byte (0xFF)" "$f" + +## =================================================================== +## Section 9: Sneaky embedding in otherwise clean files +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Sneaky embeddings in normal-looking files ---" + +## Zero-width space (E2 80 8B) hidden in a comment. 
+f="$(write_file_binary "hidden_in_comment.txt" '# This is a normal comment\xE2\x80\x8B\ndef hello():\n print('"'"'hello'"'"')\n')" +expect_detected "Zero-width space hidden in comment" "$f" + +## LTR mark (E2 80 8E) hidden in a string literal. +f="$(write_file_binary "hidden_in_string.txt" 'var x = "hello\xE2\x80\x8Eworld";')" +expect_detected "LTR mark hidden in string literal" "$f" + +## Trojan Source BiDi attack pattern. +f="$(write_file_binary "trojan_source_example.txt" 'access_level = "user\xE2\x80\xAA\xE2\x81\xA6\xE2\x81\xA9\xE2\x81\xA6admin\xE2\x81\xA9\xE2\x80\xAC"')" +expect_detected "Trojan Source BiDi attack pattern" "$f" + +## Backspace (\x08) overwrite attack. +f="$(write_file_binary "backspace_overwrite.txt" 'user\x08\x08\x08\x08root')" +expect_detected "Backspace overwrite (displays 'root' over 'user')" "$f" + +## Carriage return (\x0D) overwrite attack. +f="$(write_file_binary "cr_overwrite.txt" 'safe command\x0Drm -rf /')" +expect_detected "CR overwrite (hides malicious command)" "$f" + +## =================================================================== +## Section 10: Mixed content edge cases +## =================================================================== +printf '%s\n' "" +printf '%s\n' "--- Edge cases ---" + +## File with only a BOM (EF BB BF) and nothing else. +f="$(write_file_binary "bom_only.txt" '\xEF\xBB\xBF')" +expect_detected "File containing only BOM" "$f" + +## Suspicious char at very end of file. +f="$(write_file_binary "trailing_bidi.txt" 'normal text\xE2\x80\x8E')" +expect_detected "BiDi char at end of file" "$f" + +## Suspicious char at very start of file. +f="$(write_file_binary "leading_null.txt" '\x00normal text')" +expect_detected "NULL byte at start of file" "$f" + +## Very long line with suspicious char in the middle (ZWSP = E2 80 8B). 
+python3 -c " +import sys +sys.stdout.buffer.write(b'a' * 10000 + b'\xe2\x80\x8b' + b'b' * 10000) +" > "$test_dir/long_line.txt" +expect_detected "Suspicious char buried in 20000-char line" "$test_dir/long_line.txt" + +## =================================================================== +## Summary +## =================================================================== +printf '%s\n' "" +printf '%s\n' "===== Results =====" +printf "Passed: %d | Failed: %d | Skipped: %d\n" "$pass_count" "$fail_count" "$skip_count" + +if [ "$fail_count" -gt 0 ]; then + printf "${RED}%s${NC}\n" "SOME TESTS FAILED - potential Unicode bypass found!" + exit 1 +fi + +printf "${GREEN}%s${NC}\n" "All tests passed - no bypass detected." +exit 0 From 50cc246066f4b96490fea98acac47a8c4b7b84e5 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 09:41:45 +0000 Subject: [PATCH 2/5] Add check 3 false positive tests for $'\u' locale bug The wrapper's check 3 uses $'\uXXXX' which is expanded at bash parse time using the caller's locale - NOT the LC_ALL=C on the grep command. In non-UTF-8 locales, \u sequences pass through literally, creating a bracket expression equivalent to [0-26-9A-Fu\] that produces false positives on files with digits, hex values, UUIDs, backslashes, or typical code. New tests verify this: 5/6 clean ASCII files trigger false positives in non-UTF-8 locales, 0/6 in UTF-8 locales. Fix: use \x byte sequences. https://claude.ai/code/session_01726gqqGv3oaDV5jLbM6E6h --- tests/test_grep_find_unicode_wrapper | 48 ++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_grep_find_unicode_wrapper b/tests/test_grep_find_unicode_wrapper index d9998d68..72d4e2c0 100755 --- a/tests/test_grep_find_unicode_wrapper +++ b/tests/test_grep_find_unicode_wrapper @@ -156,8 +156,10 @@ printf '%s\n' "--- Check 3 locale bug ($'\u' expansion) ---" ## Verify whether $'\uXXXX' expands to UTF-8 in the current locale. ## If it doesn't, check 3 in the wrapper is broken. 
+check3_locale_ok="false" check3_test_byte="$(printf '%s' $'\u061C' | od -A n -t x1 | tr -d ' \n')" if [ "$check3_test_byte" = "d89c" ]; then + check3_locale_ok="true" printf "${GREEN}PASS${NC}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale\n" pass_count=$(( pass_count + 1 )) else @@ -170,6 +172,52 @@ else skip_count=$(( skip_count + 1 )) fi +## Test check 3's actual $'\u...' pattern (as used in the wrapper) for false positives. +## The $'\u...' expansion happens at bash parse time using the caller's locale, +## NOT the LC_ALL=C set on the grep command. In non-UTF-8 locales, \u sequences +## are passed through literally, creating a bracket expression containing ASCII +## chars like digits, hex letters, 'u', and '\'. This causes false positives +## on nearly any file with digits or hex characters. +wrapper_check3_pattern=$'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' + +check3_false_positive_test() { + local description="$1" + local file="$2" + if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$wrapper_check3_pattern" "$file" >/dev/null 2>&1; then + if [ "$check3_locale_ok" = "true" ]; then + printf "${RED}FAIL${NC}: Check 3 false positive: %s\n" "$description" >&2 + fail_count=$(( fail_count + 1 )) + else + printf "${RED}FAIL${NC}: Check 3 false positive (locale bug): %s\n" "$description" >&2 + fail_count=$(( fail_count + 1 )) + fi + else + printf "${GREEN}PASS${NC}: Check 3 clean: %s\n" "$description" + pass_count=$(( pass_count + 1 )) + fi +} + +printf '%s\n' "" +printf '%s\n' "--- Check 3 false positive tests (wrapper's actual \$'\\u' pattern) ---" + +f="$(write_file "check3_digits.txt" "x = 12345")" +check3_false_positive_test "File with digits" "$f" + +f="$(write_file "check3_hex.txt" "color = 0xDEADBEEF")" +check3_false_positive_test "File with hex values" "$f" + +f="$(write_file "check3_code.txt" "for (int i = 0; i < 100; i++) { printf(\"%d\", i); }")" 
+check3_false_positive_test "File with typical C code" "$f" + +f="$(write_file "check3_uuid.txt" "uuid = \"550e8400-e29b-41d4-a716-446655440000\"")" +check3_false_positive_test "File with UUID" "$f" + +f="$(write_file "check3_backslash.txt" "path = C:\\\\Users\\\\test")" +check3_false_positive_test "File with backslashes" "$f" + +f="$(write_file "check3_plain.txt" "Hello World")" +check3_false_positive_test "File with plain text (no digits)" "$f" + ## =================================================================== ## Section 1: Clean files (should NOT be detected) ## =================================================================== From 07fc663960cd8551e496efb9db44bd5babcc2baf Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 10:04:48 +0000 Subject: [PATCH 3/5] Fix check 3 locale bug in grep-find-unicode-wrapper, integrate test - grep-find-unicode-wrapper: check 3 used $'\uXXXX' which requires a UTF-8 locale at bash parse time. In non-UTF-8 locales the pattern degrades to literal ASCII chars causing false positives. Fixed by using \x byte sequences. Old line kept commented out with explanation. - test script: use get_colors.sh instead of custom color vars, handle old-pattern false positives as expected warns not failures. - run-tests: call tests/test_grep_find_unicode_wrapper. 
https://claude.ai/code/session_01726gqqGv3oaDV5jLbM6E6h --- run-tests | 2 + tests/test_grep_find_unicode_wrapper | 55 +++++++++++++++------------- usr/bin/grep-find-unicode-wrapper | 16 +++++++- 3 files changed, 46 insertions(+), 27 deletions(-) diff --git a/run-tests b/run-tests index ce0581cf..44aa69f4 100755 --- a/run-tests +++ b/run-tests @@ -56,3 +56,5 @@ fi cd "${git_toplevel}" ./unicode-testscript + +./tests/test_grep_find_unicode_wrapper diff --git a/tests/test_grep_find_unicode_wrapper b/tests/test_grep_find_unicode_wrapper index 72d4e2c0..ecd62428 100755 --- a/tests/test_grep_find_unicode_wrapper +++ b/tests/test_grep_find_unicode_wrapper @@ -32,17 +32,16 @@ trap cleanup EXIT test_dir="$(mktemp -d)" -## Colors for output (if terminal supports it). -if [ -t 1 ]; then - RED='\033[0;31m' - GREEN='\033[0;32m' - YELLOW='\033[0;33m' - NC='\033[0m' +script_dir="$(cd -- "$(dirname -- "$0")" && pwd)" +git_toplevel="$(cd -- "$script_dir" && git rev-parse --show-toplevel)" + +if test -f /usr/libexec/helper-scripts/get_colors.sh ; then + source /usr/libexec/helper-scripts/get_colors.sh +elif test -f "${git_toplevel}/usr/libexec/helper-scripts/get_colors.sh" ; then + source "${git_toplevel}/usr/libexec/helper-scripts/get_colors.sh" else - RED='' - GREEN='' - YELLOW='' - NC='' + printf '%s\n' "$0: ERROR: get_colors.sh not found!" >&2 + exit 1 fi ## Replicate the four grep checks from grep-find-unicode-wrapper. @@ -109,10 +108,10 @@ expect_detected() { local file="$2" if run_checks "$file"; then - printf "${GREEN}PASS${NC}: Detected: %s\n" "$description" + printf "${green}PASS${nocolor}: Detected: %s\n" "$description" pass_count=$(( pass_count + 1 )) else - printf "${RED}FAIL${NC}: NOT detected: %s\n" "$description" >&2 + printf "${red}FAIL${nocolor}: NOT detected: %s\n" "$description" >&2 fail_count=$(( fail_count + 1 )) fi } @@ -122,10 +121,10 @@ expect_clean() { local file="$2" if ! 
run_checks "$file"; then - printf "${GREEN}PASS${NC}: Clean: %s\n" "$description" + printf "${green}PASS${nocolor}: Clean: %s\n" "$description" pass_count=$(( pass_count + 1 )) else - printf "${RED}FAIL${NC}: False positive: %s\n" "$description" >&2 + printf "${red}FAIL${nocolor}: False positive: %s\n" "$description" >&2 fail_count=$(( fail_count + 1 )) fi } @@ -160,10 +159,10 @@ check3_locale_ok="false" check3_test_byte="$(printf '%s' $'\u061C' | od -A n -t x1 | tr -d ' \n')" if [ "$check3_test_byte" = "d89c" ]; then check3_locale_ok="true" - printf "${GREEN}PASS${NC}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale\n" + printf "${green}PASS${nocolor}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale\n" pass_count=$(( pass_count + 1 )) else - printf "${YELLOW}WARN${NC}: \$'\\u061C' expands to literal '\\u061C' (got: %s) - check 3 in wrapper is broken in this locale!\n" "$check3_test_byte" + printf "${yellow}WARN${nocolor}: \$'\\u061C' expands to literal '\\u061C' (got: %s) - check 3 in wrapper is broken in this locale!\n" "$check3_test_byte" printf " Current locale: LANG='${LANG:-}' LC_ALL='${LC_ALL:-}'\n" printf " The wrapper's check 3 pattern becomes a character class of\n" printf " literal chars [0-9A-Fu\\\\] causing false positives on most files.\n" @@ -172,33 +171,37 @@ else skip_count=$(( skip_count + 1 )) fi -## Test check 3's actual $'\u...' pattern (as used in the wrapper) for false positives. +## Test the old $'\u...' pattern (before the fix) for false positives. ## The $'\u...' expansion happens at bash parse time using the caller's locale, ## NOT the LC_ALL=C set on the grep command. In non-UTF-8 locales, \u sequences ## are passed through literally, creating a bracket expression containing ASCII ## chars like digits, hex letters, 'u', and '\'. This causes false positives ## on nearly any file with digits or hex characters. 
-wrapper_check3_pattern=$'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' +## The wrapper has been fixed to use \x byte sequences. These tests document +## the old behavior - false positives in non-UTF-8 locales are expected. +old_check3_pattern=$'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' check3_false_positive_test() { local description="$1" local file="$2" - if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$wrapper_check3_pattern" "$file" >/dev/null 2>&1; then + if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$old_check3_pattern" "$file" >/dev/null 2>&1; then if [ "$check3_locale_ok" = "true" ]; then - printf "${RED}FAIL${NC}: Check 3 false positive: %s\n" "$description" >&2 + ## UTF-8 locale: old pattern should work, so this is a real failure. + printf "${red}FAIL${nocolor}: Old check 3 false positive: %s\n" "$description" >&2 fail_count=$(( fail_count + 1 )) else - printf "${RED}FAIL${NC}: Check 3 false positive (locale bug): %s\n" "$description" >&2 - fail_count=$(( fail_count + 1 )) + ## Non-UTF-8 locale: false positive from old pattern is expected. 
+ printf "${yellow}WARN${nocolor}: Old check 3 false positive (expected, locale bug): %s\n" "$description" + skip_count=$(( skip_count + 1 )) fi else - printf "${GREEN}PASS${NC}: Check 3 clean: %s\n" "$description" + printf "${green}PASS${nocolor}: Old check 3 clean: %s\n" "$description" pass_count=$(( pass_count + 1 )) fi } printf '%s\n' "" -printf '%s\n' "--- Check 3 false positive tests (wrapper's actual \$'\\u' pattern) ---" +printf '%s\n' "--- Old check 3 false positive tests (\$'\\u' pattern before fix) ---" f="$(write_file "check3_digits.txt" "x = 12345")" check3_false_positive_test "File with digits" "$f" @@ -535,9 +538,9 @@ printf '%s\n' "===== Results =====" printf "Passed: %d | Failed: %d | Skipped: %d\n" "$pass_count" "$fail_count" "$skip_count" if [ "$fail_count" -gt 0 ]; then - printf "${RED}%s${NC}\n" "SOME TESTS FAILED - potential Unicode bypass found!" + printf "${red}%s${nocolor}\n" "SOME TESTS FAILED - potential Unicode bypass found!" exit 1 fi -printf "${GREEN}%s${NC}\n" "All tests passed - no bypass detected." +printf "${green}%s${nocolor}\n" "All tests passed - no bypass detected." exit 0 diff --git a/usr/bin/grep-find-unicode-wrapper b/usr/bin/grep-find-unicode-wrapper index 1bf188e4..fac84fb3 100755 --- a/usr/bin/grep-find-unicode-wrapper +++ b/usr/bin/grep-find-unicode-wrapper @@ -52,7 +52,21 @@ check_grep_status "$?" ## Not using 'grep's '--perl-regexp' option for three. ## Because not mentioned in above links and can lead to the following error message: # grep: PCRE2 does not support \F, \L, \l, \N{name}, \U, or \u -three="$(LC_ALL=C grep "${grep_args[@]}" $'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' "$@")" +## Locale bug: bash $'\uXXXX' is expanded at parse time using the caller's +## locale, NOT the LC_ALL=C set on the grep command. 
In non-UTF-8 locales +## (LANG=C, LANG= empty), the \u sequences are passed through literally, +## creating a bracket expression of ASCII chars [0-9A-Fu\] that causes false +## positives on almost any file containing digits or hex characters. +## BiDi characters would still be caught by checks one and two (non-ASCII +## byte detection), so this is a false positive issue, not a bypass. +#three="$(LC_ALL=C grep "${grep_args[@]}" $'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069]' "$@")" +## Fixed: use \x byte sequences which work regardless of locale. +## U+061C -> \xD8\x9C U+202A -> \xE2\x80\xAA U+202E -> \xE2\x80\xAE +## U+200E -> \xE2\x80\x8E U+202B -> \xE2\x80\xAB U+2066 -> \xE2\x81\xA6 +## U+200F -> \xE2\x80\x8F U+202C -> \xE2\x80\xAC U+2067 -> \xE2\x81\xA7 +## U+202D -> \xE2\x80\xAD U+2068 -> \xE2\x81\xA8 +## U+2069 -> \xE2\x81\xA9 +three="$(LC_ALL=C grep "${grep_args[@]}" $'[\xD8\x9C\xE2\x80\x8E\x8F\xAA\xAB\xAC\xAD\xAE\x81\xA6\xA7\xA8\xA9]' "$@")" check_grep_status "$?" ## ASCII control characters. From d34fa86161c80111fadf39de222ac4a528a088df Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 10:14:40 +0000 Subject: [PATCH 4/5] Refactor test to use actual grep-find-unicode-wrapper and project conventions - Use real grep-find-unicode-wrapper binary instead of reimplementing the four grep checks locally. Tests exercise actual code paths. - Source get_colors.sh for colors instead of custom color variables. - Create safe-rm-maybe.bsh providing rm-safe-maybe function that uses safe-rm if installed, otherwise falls back to rm. - Use long options (--recursive, --force, --directory, --delete, --address-radix, --format) instead of short flags. - Use `| tee -- "$file_name"` instead of `> "$file_name"` for better xtrace output and error handling. - Rename variable 'file' to 'file_name' to avoid collision with the standard unix 'file' utility. 
- Remove assumption that tools might not be installed; tests require all tools available (installed from source or on disk). https://claude.ai/code/session_01726gqqGv3oaDV5jLbM6E6h --- tests/test_grep_find_unicode_wrapper | 151 ++++++------------- usr/libexec/helper-scripts/safe-rm-maybe.bsh | 15 ++ 2 files changed, 59 insertions(+), 107 deletions(-) create mode 100644 usr/libexec/helper-scripts/safe-rm-maybe.bsh diff --git a/tests/test_grep_find_unicode_wrapper b/tests/test_grep_find_unicode_wrapper index ecd62428..6eb50eaa 100755 --- a/tests/test_grep_find_unicode_wrapper +++ b/tests/test_grep_find_unicode_wrapper @@ -5,9 +5,9 @@ ## Test script for grep-find-unicode-wrapper ## Tests for Unicode bypass vulnerabilities by checking that various -## suspicious characters are properly detected by the four grep checks. +## suspicious characters are properly detected. ## -## Bug found: Check 3 uses bash $'\uXXXX' expansion which requires a +## Bug found: Check 3 used bash $'\uXXXX' expansion which requires a ## UTF-8 locale at parse time. In non-UTF-8 locales (LANG=C or LANG=), ## the \u sequences are passed through literally, creating a character ## class of [0-9A-Fu\] that causes massive false positives. 
@@ -23,108 +23,45 @@ pass_count=0 fail_count=0 skip_count=0 +source /usr/libexec/helper-scripts/get_colors.sh +source /usr/libexec/helper-scripts/safe-rm-maybe.bsh + cleanup() { if [ -n "$test_dir" ] && [ -d "$test_dir" ]; then - rm -rf -- "$test_dir" + rm-safe-maybe --recursive --force -- "$test_dir" fi } trap cleanup EXIT -test_dir="$(mktemp -d)" - -script_dir="$(cd -- "$(dirname -- "$0")" && pwd)" -git_toplevel="$(cd -- "$script_dir" && git rev-parse --show-toplevel)" - -if test -f /usr/libexec/helper-scripts/get_colors.sh ; then - source /usr/libexec/helper-scripts/get_colors.sh -elif test -f "${git_toplevel}/usr/libexec/helper-scripts/get_colors.sh" ; then - source "${git_toplevel}/usr/libexec/helper-scripts/get_colors.sh" -else - printf '%s\n' "$0: ERROR: get_colors.sh not found!" >&2 - exit 1 -fi - -## Replicate the four grep checks from grep-find-unicode-wrapper. -## This allows testing even without stecho installed. -grep_args=( - --files-with-matches - --line-number - --binary-files=text -) - -## Build the check 3 pattern using raw UTF-8 byte sequences (\x) -## instead of \u escapes, so this test works regardless of locale. -## -## U+061C -> 0xD8 0x9C -## U+200E -> 0xE2 0x80 0x8E -## U+200F -> 0xE2 0x80 0x8F -## U+202A -> 0xE2 0x80 0xAA -## U+202B -> 0xE2 0x80 0xAB -## U+202C -> 0xE2 0x80 0xAC -## U+202D -> 0xE2 0x80 0xAD -## U+202E -> 0xE2 0x80 0xAE -## U+2066 -> 0xE2 0x81 0xA6 -## U+2067 -> 0xE2 0x81 0xA7 -## U+2068 -> 0xE2 0x81 0xA8 -## U+2069 -> 0xE2 0x81 0xA9 -## -## Note: check 3 in the wrapper does NOT use --perl-regexp, so we use -## grep's basic bracket expression which with LC_ALL=C matches individual -## bytes. This means the bracket expression matches ANY of the individual -## bytes, not specific multi-byte sequences. This is overly broad but -## ensures detection. -bidi_pattern=$'[\xD8\x9C\xE2\x80\x8E\x8F\xAA\xAB\xAC\xAD\xAE\x81\xA6\xA7\xA8\xA9]' - -run_checks() { - local file="$1" - local found=0 - - ## Check 1: Non-ASCII bytes (hex range). 
- if LC_ALL=C grep "${grep_args[@]}" --perl-regexp '[^\x00-\x7F]' "$file" >/dev/null 2>&1; then - found=1 - fi - - ## Check 2: Non-ASCII (POSIX class). - if LC_ALL=C grep "${grep_args[@]}" --perl-regexp "[^[:ascii:]]" "$file" >/dev/null 2>&1; then - found=1 - fi +test_dir="$(mktemp --directory)" - ## Check 3: BiDi / Trojan Source characters. - ## Using raw byte pattern (see bidi_pattern above). - if LC_ALL=C grep "${grep_args[@]}" "$bidi_pattern" "$file" >/dev/null 2>&1; then - found=1 - fi - - ## Check 4: ASCII control characters. - if LC_ALL=C grep "${grep_args[@]}" --perl-regexp '[\x00-\x08\x0B\x0C\x0D\x0E-\x1F\x7F]' "$file" >/dev/null 2>&1; then - found=1 - fi - - return $(( ! found )) -} +## grep-find-unicode-wrapper is similar to grep. +## - Exit code 0: if found. +## - Non-zero exit code: if not found. +command -v grep-find-unicode-wrapper >/dev/null expect_detected() { local description="$1" - local file="$2" + local file_name="$2" - if run_checks "$file"; then - printf "${green}PASS${nocolor}: Detected: %s\n" "$description" + if grep-find-unicode-wrapper "$file_name" >/dev/null 2>&1 ; then + printf "%s\n" "${green}PASS${nocolor}: Detected: $description" pass_count=$(( pass_count + 1 )) else - printf "${red}FAIL${nocolor}: NOT detected: %s\n" "$description" >&2 + printf "%s\n" "${red}FAIL${nocolor}: NOT detected: $description" >&2 fail_count=$(( fail_count + 1 )) fi } expect_clean() { local description="$1" - local file="$2" + local file_name="$2" - if ! run_checks "$file"; then - printf "${green}PASS${nocolor}: Clean: %s\n" "$description" + if ! 
grep-find-unicode-wrapper "$file_name" >/dev/null 2>&1 ; then + printf "%s\n" "${green}PASS${nocolor}: Clean: $description" pass_count=$(( pass_count + 1 )) else - printf "${red}FAIL${nocolor}: False positive: %s\n" "$description" >&2 + printf "%s\n" "${red}FAIL${nocolor}: False positive: $description" >&2 fail_count=$(( fail_count + 1 )) fi } @@ -132,17 +69,17 @@ expect_clean() { write_file() { local name="$1" local content="$2" - local file="$test_dir/$name" - printf '%s' "$content" > "$file" - printf '%s' "$file" + local file_name="$test_dir/$name" + printf '%s' "$content" | tee -- "$file_name" >/dev/null + printf '%s' "$file_name" } write_file_binary() { local name="$1" shift - local file="$test_dir/$name" - printf "$@" > "$file" - printf '%s' "$file" + local file_name="$test_dir/$name" + printf "$@" | tee -- "$file_name" >/dev/null + printf '%s' "$file_name" } printf '%s\n' "===== grep-find-unicode-wrapper bypass tests =====" @@ -151,23 +88,23 @@ printf '%s\n' "" ## =================================================================== ## Section 0: Check 3 locale bug validation ## =================================================================== -printf '%s\n' "--- Check 3 locale bug ($'\u' expansion) ---" +printf '%s\n' "--- Check 3 locale bug (\$'\\u' expansion) ---" ## Verify whether $'\uXXXX' expands to UTF-8 in the current locale. -## If it doesn't, check 3 in the wrapper is broken. +## If it doesn't, check 3 in the wrapper would be broken with the old pattern. 
check3_locale_ok="false" -check3_test_byte="$(printf '%s' $'\u061C' | od -A n -t x1 | tr -d ' \n')" +check3_test_byte="$(printf '%s' $'\u061C' | od --address-radix=n --format=x1 | tr --delete ' \n')" if [ "$check3_test_byte" = "d89c" ]; then check3_locale_ok="true" - printf "${green}PASS${nocolor}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale\n" + printf '%s\n' "${green}PASS${nocolor}: \$'\\u061C' expands to UTF-8 bytes (d8 9c) - check 3 works in this locale" pass_count=$(( pass_count + 1 )) else - printf "${yellow}WARN${nocolor}: \$'\\u061C' expands to literal '\\u061C' (got: %s) - check 3 in wrapper is broken in this locale!\n" "$check3_test_byte" - printf " Current locale: LANG='${LANG:-}' LC_ALL='${LC_ALL:-}'\n" - printf " The wrapper's check 3 pattern becomes a character class of\n" - printf " literal chars [0-9A-Fu\\\\] causing false positives on most files.\n" - printf " BiDi chars are still caught by checks 1+2 (non-ASCII byte detection).\n" - printf " Fix: use \\\\x byte sequences instead of \\\\u escapes in the wrapper.\n" + printf '%s\n' "${yellow}WARN${nocolor}: \$'\\u061C' expands to literal '\\u061C' (got: $check3_test_byte) - check 3 would be broken with old pattern!" + printf '%s\n' " Current locale: LANG='${LANG:-}' LC_ALL='${LC_ALL:-}'" + printf '%s\n' " The old \$'\\u...' pattern becomes a character class of" + printf '%s\n' " literal chars [0-9A-Fu\\] causing false positives on most files." + printf '%s\n' " BiDi chars are still caught by checks 1+2 (non-ASCII byte detection)." + printf '%s\n' " Fix: use \\x byte sequences instead of \\u escapes in the wrapper." 
skip_count=$(( skip_count + 1 )) fi @@ -183,19 +120,19 @@ old_check3_pattern=$'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u20 check3_false_positive_test() { local description="$1" - local file="$2" - if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$old_check3_pattern" "$file" >/dev/null 2>&1; then + local file_name="$2" + if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$old_check3_pattern" "$file_name" >/dev/null 2>&1; then if [ "$check3_locale_ok" = "true" ]; then ## UTF-8 locale: old pattern should work, so this is a real failure. - printf "${red}FAIL${nocolor}: Old check 3 false positive: %s\n" "$description" >&2 + printf '%s\n' "${red}FAIL${nocolor}: Old check 3 false positive: $description" >&2 fail_count=$(( fail_count + 1 )) else ## Non-UTF-8 locale: false positive from old pattern is expected. - printf "${yellow}WARN${nocolor}: Old check 3 false positive (expected, locale bug): %s\n" "$description" + printf '%s\n' "${yellow}WARN${nocolor}: Old check 3 false positive (expected, locale bug): $description" skip_count=$(( skip_count + 1 )) fi else - printf "${green}PASS${nocolor}: Old check 3 clean: %s\n" "$description" + printf '%s\n' "${green}PASS${nocolor}: Old check 3 clean: $description" pass_count=$(( pass_count + 1 )) fi } @@ -527,7 +464,7 @@ expect_detected "NULL byte at start of file" "$f" python3 -c " import sys sys.stdout.buffer.write(b'a' * 10000 + b'\xe2\x80\x8b' + b'b' * 10000) -" > "$test_dir/long_line.txt" +" | tee -- "$test_dir/long_line.txt" >/dev/null expect_detected "Suspicious char buried in 20000-char line" "$test_dir/long_line.txt" ## =================================================================== @@ -535,12 +472,12 @@ expect_detected "Suspicious char buried in 20000-char line" "$test_dir/long_line ## =================================================================== printf '%s\n' "" printf '%s\n' "===== Results =====" -printf "Passed: %d | Failed: %d | Skipped: %d\n" 
"$pass_count" "$fail_count" "$skip_count" +printf '%s\n' "Passed: $pass_count | Failed: $fail_count | Skipped: $skip_count" if [ "$fail_count" -gt 0 ]; then - printf "${red}%s${nocolor}\n" "SOME TESTS FAILED - potential Unicode bypass found!" + printf '%s\n' "${red}SOME TESTS FAILED - potential Unicode bypass found!${nocolor}" exit 1 fi -printf "${green}%s${nocolor}\n" "All tests passed - no bypass detected." +printf '%s\n' "${green}All tests passed - no bypass detected.${nocolor}" exit 0 diff --git a/usr/libexec/helper-scripts/safe-rm-maybe.bsh b/usr/libexec/helper-scripts/safe-rm-maybe.bsh new file mode 100644 index 00000000..1b56cc72 --- /dev/null +++ b/usr/libexec/helper-scripts/safe-rm-maybe.bsh @@ -0,0 +1,15 @@ +#!/bin/bash + +## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC +## See the file COPYING for copying conditions. + +## Provides rm-safe-maybe function. +## Uses safe-rm if installed, otherwise falls back to rm. + +rm-safe-maybe() { + if command -v safe-rm >/dev/null 2>&1 ; then + safe-rm "$@" + else + rm "$@" + fi +} From e1638f20439bceb58e0106ea2797695e451e8121 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Apr 2026 11:03:28 +0000 Subject: [PATCH 5/5] Use has.sh, &>/dev/null, safe examples, extract Python to separate file - Use 'has' from has.sh instead of 'command -v'. - Use &>/dev/null instead of >/dev/null 2>&1. - Replace offensive/risky test examples (rm -rf, root escalation) with safe alternatives (GOOD/BADX overwrite, harmless error messages). - Move inline Python for long-line generation to separate script usr/libexec/helper-scripts/write-long-line-with-unicode. 
https://claude.ai/code/session_01726gqqGv3oaDV5jLbM6E6h --- tests/test_grep_find_unicode_wrapper | 38 ++++++++++--------- .../write-long-line-with-unicode | 28 ++++++++++++++ 2 files changed, 48 insertions(+), 18 deletions(-) create mode 100644 usr/libexec/helper-scripts/write-long-line-with-unicode diff --git a/tests/test_grep_find_unicode_wrapper b/tests/test_grep_find_unicode_wrapper index 6eb50eaa..afb825e3 100755 --- a/tests/test_grep_find_unicode_wrapper +++ b/tests/test_grep_find_unicode_wrapper @@ -25,6 +25,7 @@ skip_count=0 source /usr/libexec/helper-scripts/get_colors.sh source /usr/libexec/helper-scripts/safe-rm-maybe.bsh +source /usr/libexec/helper-scripts/has.sh cleanup() { if [ -n "$test_dir" ] && [ -d "$test_dir" ]; then @@ -38,13 +39,14 @@ test_dir="$(mktemp --directory)" ## grep-find-unicode-wrapper is similar to grep. ## - Exit code 0: if found. ## - Non-zero exit code: if not found. -command -v grep-find-unicode-wrapper >/dev/null +has grep-find-unicode-wrapper +has write-long-line-with-unicode expect_detected() { local description="$1" local file_name="$2" - if grep-find-unicode-wrapper "$file_name" >/dev/null 2>&1 ; then + if grep-find-unicode-wrapper "$file_name" &>/dev/null ; then printf "%s\n" "${green}PASS${nocolor}: Detected: $description" pass_count=$(( pass_count + 1 )) else @@ -57,7 +59,7 @@ expect_clean() { local description="$1" local file_name="$2" - if ! grep-find-unicode-wrapper "$file_name" >/dev/null 2>&1 ; then + if ! 
grep-find-unicode-wrapper "$file_name" &>/dev/null ; then printf "%s\n" "${green}PASS${nocolor}: Clean: $description" pass_count=$(( pass_count + 1 )) else @@ -70,7 +72,7 @@ write_file() { local name="$1" local content="$2" local file_name="$test_dir/$name" - printf '%s' "$content" | tee -- "$file_name" >/dev/null + printf '%s' "$content" | tee -- "$file_name" &>/dev/null printf '%s' "$file_name" } @@ -78,7 +80,7 @@ write_file_binary() { local name="$1" shift local file_name="$test_dir/$name" - printf "$@" | tee -- "$file_name" >/dev/null + printf "$@" | tee -- "$file_name" &>/dev/null printf '%s' "$file_name" } @@ -121,7 +123,7 @@ old_check3_pattern=$'[\u061C\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u2066\u20 check3_false_positive_test() { local description="$1" local file_name="$2" - if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$old_check3_pattern" "$file_name" >/dev/null 2>&1; then + if LC_ALL=C grep --files-with-matches --line-number --binary-files=text "$old_check3_pattern" "$file_name" &>/dev/null; then if [ "$check3_locale_ok" = "true" ]; then ## UTF-8 locale: old pattern should work, so this is a real failure. printf '%s\n' "${red}FAIL${nocolor}: Old check 3 false positive: $description" >&2 @@ -427,20 +429,23 @@ f="$(write_file_binary "hidden_in_comment.txt" '# This is a normal comment\xE2\x expect_detected "Zero-width space hidden in comment" "$f" ## LTR mark (E2 80 8E) hidden in a string literal. -f="$(write_file_binary "hidden_in_string.txt" 'var x = "hello\xE2\x80\x8Eworld";')" +f="$(write_file_binary "hidden_in_string.txt" 'x = "hello\xE2\x80\x8Eworld"')" expect_detected "LTR mark hidden in string literal" "$f" ## Trojan Source BiDi attack pattern. -f="$(write_file_binary "trojan_source_example.txt" 'access_level = "user\xE2\x80\xAA\xE2\x81\xA6\xE2\x81\xA9\xE2\x81\xA6admin\xE2\x81\xA9\xE2\x80\xAC"')" +## Embeds BiDi overrides around the word "test" to demonstrate detection. 
+f="$(write_file_binary "trojan_source_example.txt" 'access_level = "user\xE2\x80\xAA\xE2\x81\xA6\xE2\x81\xA9\xE2\x81\xA6test\xE2\x81\xA9\xE2\x80\xAC"')" expect_detected "Trojan Source BiDi attack pattern" "$f" -## Backspace (\x08) overwrite attack. -f="$(write_file_binary "backspace_overwrite.txt" 'user\x08\x08\x08\x08root')" -expect_detected "Backspace overwrite (displays 'root' over 'user')" "$f" +## Backspace (\x08) overwrite: text followed by backspaces then replacement. +## Would display "BADX" overwriting "GOOD" on a terminal. +f="$(write_file_binary "backspace_overwrite.txt" 'GOOD\x08\x08\x08\x08BADX')" +expect_detected "Backspace overwrite (GOOD overwritten by BADX)" "$f" -## Carriage return (\x0D) overwrite attack. -f="$(write_file_binary "cr_overwrite.txt" 'safe command\x0Drm -rf /')" -expect_detected "CR overwrite (hides malicious command)" "$f" +## Carriage return (\x0D) overwrite: second text replaces first on display. +## Would display "ERROR: You should not see this" on a terminal. +f="$(write_file_binary "cr_overwrite.txt" 'This line looks safe\x0DERROR: You should not see this')" +expect_detected "CR overwrite (hides text behind carriage return)" "$f" ## =================================================================== ## Section 10: Mixed content edge cases @@ -461,10 +466,7 @@ f="$(write_file_binary "leading_null.txt" '\x00normal text')" expect_detected "NULL byte at start of file" "$f" ## Very long line with suspicious char in the middle (ZWSP = E2 80 8B). 
-python3 -c " -import sys -sys.stdout.buffer.write(b'a' * 10000 + b'\xe2\x80\x8b' + b'b' * 10000) -" | tee -- "$test_dir/long_line.txt" >/dev/null +write-long-line-with-unicode "$test_dir/long_line.txt" expect_detected "Suspicious char buried in 20000-char line" "$test_dir/long_line.txt" ## =================================================================== diff --git a/usr/libexec/helper-scripts/write-long-line-with-unicode b/usr/libexec/helper-scripts/write-long-line-with-unicode new file mode 100644 index 00000000..5929dd4f --- /dev/null +++ b/usr/libexec/helper-scripts/write-long-line-with-unicode @@ -0,0 +1,28 @@ +#!/usr/bin/python3 -su + +## Copyright (C) 2025 - 2025 ENCRYPTED SUPPORT LLC +## See the file COPYING for copying conditions. + +""" +Writes a long line with a suspicious Unicode character (ZWSP, U+200B) +buried in the middle. Used by test_grep_find_unicode_wrapper to test +detection in large files. + +Usage: write-long-line-with-unicode <output-file> +""" + +import sys + +def main(): + if len(sys.argv) != 2: + sys.stderr.write("Usage: write-long-line-with-unicode <output-file>\n") + return 1 + output_path = sys.argv[1] + ## 10000 'a' bytes + ZWSP (E2 80 8B) + 10000 'b' bytes + data = b'a' * 10000 + b'\xe2\x80\x8b' + b'b' * 10000 + with open(output_path, 'wb') as output_file: + output_file.write(data) + return 0 + +if __name__ == "__main__": + sys.exit(main())