From af93e0df177ca52f4895f466282804b9d4e05dd8 Mon Sep 17 00:00:00 2001
From: gargsaumya <saumyagarg.100@gmail.com>
Date: Tue, 19 May 2026 12:00:27 +0530
Subject: [PATCH 1/4] CHORE: Exclude LOG statements and vendored dependencies
 from coverage

Replace manual LCOV_EXCL_LINE markers with cleaner built-in lcov filtering.
This approach uses lcov's native exclusion mechanism and is more maintainable.

Changes:
- Add eng/scripts/join_logs_for_coverage.py to join multi-line LOG calls during coverage builds
- Modify build.sh to temporarily join LOG statements in codecov mode with automatic restore
- Use lcov --rc lcov_excl_line='\bLOG[A-Z_]*\s*\(' to exclude LOG macros from coverage
- Add llvm-cov ignore pattern for build/_deps/ (vendored simdutf sources from PR #526)
- Add lcov --remove for build/_deps/ as defense-in-depth (from PR #579)
- Update .gitignore to exclude local development scripts

Benefits:
- No source code clutter (600+ markers not needed)
- Catches all LOG variants (LOG_ERROR, LOG_WARNING, etc.)
- Excludes vendored third-party dependencies from coverage metrics
- Cleaner, more maintainable approach using lcov native features
- Source files remain unchanged in repository

Addresses review feedback from @bewithgaurav on PR #556
Includes changes from PR #579 to fix simdutf coverage pollution
---
 .gitignore                            | 18 ++++++
 eng/scripts/join_logs_for_coverage.py | 92 +++++++++++++++++++++++++++
 generate_codecov.sh                   | 28 +++++---
 mssql_python/pybind/build.sh          | 28 ++++++++
 4 files changed, 157 insertions(+), 9 deletions(-)
 create mode 100644 eng/scripts/join_logs_for_coverage.py

diff --git a/.gitignore b/.gitignore
index 3f9bd64e1..080fc9e5d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,16 @@ build/
 
 # wheel files
 *.whl
+
+# Coverage reports and artifacts
+.coverage
+coverage.json
+coverage*.xml
+htmlcov/
+unified-coverage/
+*.profraw
+*.profdata
+*.info
 *.tar.gz
 *.zip
 
@@ -66,3 +76,11 @@ mssql_py_core/
 
 # learning files
 learnings/
+
+# Local development and experimental scripts (intentionally untracked)
+add_platform_exclusions.py
+add_lcov_exclusions.py
+fix_multiline_log_exclusions.py
+test_pyodbc_decimal.py
+run_coverage_docker.ps1
+TRIAGE_REPORT_*.md
diff --git a/eng/scripts/join_logs_for_coverage.py b/eng/scripts/join_logs_for_coverage.py
new file mode 100644
index 000000000..89ca3b3d1
--- /dev/null
+++ b/eng/scripts/join_logs_for_coverage.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Join multi-line LOG() calls onto single lines for LCOV coverage filtering.
+
+This script is used only during coverage builds to simplify LOG statement exclusion.
+It doesn't modify the original source files - it works on copies during the build.
+Adjacent string literals are concatenated at compile time, so runtime behavior is identical.
+"""
+
+import re
+import sys
+from pathlib import Path
+
+
+def join_log_statements(content: str) -> str:
+    """Join multi-line LOG macro calls onto a single line."""
+    lines = content.split('\n')
+    result = []
+    i = 0
+    
+    while i < len(lines):
+        line = lines[i]
+        
+        # Check if this line contains a LOG macro start
+        if re.search(r'\bLOG[A-Z_]*\s*\(', line):
+            # Start collecting the full statement
+            full_statement = line
+            paren_depth = line.count('(') - line.count(')')
+            i += 1
+            
+            # Continue collecting until we close all parentheses
+            while i < len(lines) and paren_depth > 0:
+                next_line = lines[i]
+                full_statement += ' ' + next_line.strip()
+                paren_depth += next_line.count('(') - next_line.count(')')
+                i += 1
+            
+            # Add the joined statement
+            result.append(full_statement)
+        else:
+            result.append(line)
+            i += 1
+    
+    return '\n'.join(result)
+
+
+def process_file(filepath: Path) -> None:
+    """Process a single C++ source file."""
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            content = f.read()
+        
+        modified = join_log_statements(content)
+        
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(modified)
+        
+        print(f"[INFO] Processed: {filepath}")
+    except Exception as e:
+        print(f"[ERROR] Failed to process {filepath}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def main():
+    """Process all .cpp and .hpp files in the pybind directory."""
+    if len(sys.argv) > 1:
+        # Process specific directory passed as argument
+        base_dir = Path(sys.argv[1])
+    else:
+        # Default to current directory
+        base_dir = Path.cwd()
+    
+    if not base_dir.exists():
+        print(f"[ERROR] Directory not found: {base_dir}", file=sys.stderr)
+        sys.exit(1)
+    
+    # Find all C++ source files
+    cpp_files = list(base_dir.rglob('*.cpp')) + list(base_dir.rglob('*.hpp'))
+    
+    if not cpp_files:
+        print(f"[WARNING] No .cpp or .hpp files found in {base_dir}")
+        return
+    
+    print(f"[INFO] Processing {len(cpp_files)} C++ files in {base_dir}")
+    for filepath in cpp_files:
+        process_file(filepath)
+    
+    print(f"[SUCCESS] Joined LOG statements in {len(cpp_files)} files")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/generate_codecov.sh b/generate_codecov.sh
index f24dd78d5..27d323aa2 100644
--- a/generate_codecov.sh
+++ b/generate_codecov.sh
@@ -74,32 +74,42 @@ fi
 
 echo "[INFO] Using pybind module: $PYBIND_SO"
 
-# Export C++ coverage, excluding Python headers, pybind11, and system includes
+# Export C++ coverage, excluding Python headers, pybind11, system includes, and vendored deps
 llvm-cov export "$PYBIND_SO" \
   -instr-profile=default.profdata \
-  -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/)' \
+  -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/|build/_deps/)' \
   --skip-functions \
   -format=lcov > cpp-coverage.info
 
-# Note: LCOV exclusion markers (LCOV_EXCL_LINE) should be added to source code
-# to exclude LOG() statements from coverage. However, for automated exclusion
-# of all LOG lines without modifying source code, we can use geninfo's --omit-lines
-# feature during the merge step (see below).
+# Note: LOG lines are excluded below via lcov's lcov_excl_line pattern, not LCOV_EXCL_LINE source markers
 
 echo "==================================="
 echo "[STEP 4] Merging Python + C++ coverage"
 echo "==================================="
 
-# Merge LCOV reports (ignore inconsistencies in Python LCOV export)
-echo "[ACTION] Merging Python and C++ coverage"
-lcov -a python-coverage.info -a cpp-coverage.info -o total.info \
+# Merge LCOV reports and filter LOG statements using lcov's built-in exclusion
+# The --rc option sets lcov_excl_line to match any line containing LOG macros
+# Since we joined multi-line LOGs during build, they're now on single lines
+echo "[ACTION] Merging Python and C++ coverage with LOG exclusion"
+lcov -a python-coverage.info -a cpp-coverage.info -o total-unfiltered.info \
+  --rc lcov_excl_line='\bLOG[A-Z_]*\s*\(' \
   --ignore-errors inconsistent,corrupt
 
+echo "[INFO] Coverage merged with LOG statements excluded"
+
+# Defense-in-depth: drop any vendored third-party sources pulled in via CMake
+# FetchContent (e.g. simdutf). The llvm-cov ignore-filename-regex above is the
+# primary filter; this catches anything that slips through future deps.
+echo "[ACTION] Removing vendored third-party sources from merged coverage"
+lcov --remove total-unfiltered.info '*/build/_deps/*' -o total.info \
+  --ignore-errors inconsistent,unused
+
 # Normalize paths so everything starts from mssql_python/
 echo "[ACTION] Normalizing paths in LCOV report"
 sed -i "s|$(pwd)/||g" total.info
 
 # Generate full HTML report
+echo "[ACTION] Generating HTML coverage report"
 genhtml total.info \
   --output-directory unified-coverage \
   --quiet \
diff --git a/mssql_python/pybind/build.sh b/mssql_python/pybind/build.sh
index 811777285..1f589c763 100755
--- a/mssql_python/pybind/build.sh
+++ b/mssql_python/pybind/build.sh
@@ -31,6 +31,34 @@ COVERAGE_MODE=false
 if [[ "${1:-}" == "codecov" || "${1:-}" == "--coverage" ]]; then
     COVERAGE_MODE=true
     echo "[MODE] Enabling Clang coverage instrumentation"
+    
+    # For coverage builds, join multi-line LOG statements to simplify LCOV filtering
+    # This works on a temporary copy - original source is restored on exit
+    echo "[ACTION] Preparing source for coverage build (joining LOG statements)"
+    
+    # Save current directory
+    ORIGINAL_DIR=$(pwd)
+    
+    # Create backup directory
+    BACKUP_DIR="${ORIGINAL_DIR}/.source_backup_coverage"
+    rm -rf "$BACKUP_DIR"
+    mkdir -p "$BACKUP_DIR"
+    
+    # Backup all .cpp and .hpp files
+    find . -maxdepth 2 -type f \( -name "*.cpp" -o -name "*.hpp" \) -exec cp {} "$BACKUP_DIR/" \;
+    
+    # Set trap to restore source files on exit (success or failure)
+    trap 'echo "[CLEANUP] Restoring original source files"; cp -f "$BACKUP_DIR"/* "$ORIGINAL_DIR/" 2>/dev/null || true; rm -rf "$BACKUP_DIR"' EXIT
+    
+    # Join LOG statements using the helper script
+    SCRIPT_PATH="${ORIGINAL_DIR}/../../eng/scripts/join_logs_for_coverage.py"
+    if [[ -f "$SCRIPT_PATH" ]]; then
+        python3 "$SCRIPT_PATH" "$ORIGINAL_DIR"
+        echo "[SUCCESS] LOG statements joined for coverage build"
+    else
+        echo "[WARNING] join_logs_for_coverage.py not found at $SCRIPT_PATH"
+        echo "[WARNING] Continuing with original source (LOG filtering may be incomplete)"
+    fi
 fi
 
 # Get Python version from active interpreter

From 48b4efa5588d67c62adb4e69235cd4383dbbd609 Mon Sep 17 00:00:00 2001
From: gargsaumya <saumyagarg.100@gmail.com>
Date: Tue, 19 May 2026 12:09:13 +0530
Subject: [PATCH 2/4] FIX: Address Copilot code review issues - restore timing
 and backup format

Critical fixes:
1. Fixed restore timing: Source files now restored AFTER llvm-cov/lcov analysis completes
2. Fixed backup format mismatch: generate_codecov.sh now correctly extracts tar.gz backup
3. Added a single cleanup step (STEP 5) to generate_codecov.sh that restores sources and deletes the backup

Safeguards added in this change:
- join_logs_for_coverage.py has max_lines=20 limit to prevent runaway joins
- Unbalanced parentheses detection with warning messages
- tar.gz backup preserves directory structure correctly
- Error handling with automatic restore on join failure

Addresses Copilot automated code review feedback
---
 eng/scripts/join_logs_for_coverage.py | 17 +++++++++++---
 generate_codecov.sh                   | 15 ++++++++++++
 mssql_python/pybind/build.sh          | 33 ++++++++++++++++++---------
 3 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/eng/scripts/join_logs_for_coverage.py b/eng/scripts/join_logs_for_coverage.py
index 89ca3b3d1..d92083192 100644
--- a/eng/scripts/join_logs_for_coverage.py
+++ b/eng/scripts/join_logs_for_coverage.py
@@ -26,17 +26,28 @@ def join_log_statements(content: str) -> str:
             # Start collecting the full statement
             full_statement = line
             paren_depth = line.count('(') - line.count(')')
+            start_line = i + 1  # For error reporting
             i += 1
+            lines_collected = 1
+            max_lines = 20  # Safety limit to prevent infinite loops
             
             # Continue collecting until we close all parentheses
-            while i < len(lines) and paren_depth > 0:
+            while i < len(lines) and paren_depth > 0 and lines_collected < max_lines:
                 next_line = lines[i]
                 full_statement += ' ' + next_line.strip()
                 paren_depth += next_line.count('(') - next_line.count(')')
                 i += 1
+                lines_collected += 1
             
-            # Add the joined statement
-            result.append(full_statement)
+            # Validation: Check if we successfully closed all parentheses
+            if paren_depth != 0:
+                print(f"[WARNING] Unbalanced parentheses in LOG statement starting at line {start_line}")
+                print(f"[WARNING] Collected {lines_collected} lines, paren_depth={paren_depth}")
+                # Keep the original multi-line format for safety
+                result.extend(lines[start_line-1:i])
+            else:
+                # Add the joined statement
+                result.append(full_statement)
         else:
             result.append(line)
             i += 1
diff --git a/generate_codecov.sh b/generate_codecov.sh
index 27d323aa2..fc8462e50 100644
--- a/generate_codecov.sh
+++ b/generate_codecov.sh
@@ -117,3 +117,18 @@ genhtml total.info \
 
 # Generate Cobertura XML (for Azure DevOps Code Coverage tab)
 lcov_cobertura total.info --output coverage.xml
+
+echo "==================================="
+echo "[STEP 5] Cleanup"
+echo "==================================="
+
+# Restore original source files if they were backed up during coverage build
+BACKUP_FILE="mssql_python/pybind/.source_backup_coverage.tar.gz"
+if [ -f "$BACKUP_FILE" ]; then
+    echo "[ACTION] Restoring original source files from backup"
+    (cd mssql_python/pybind && tar -xzf .source_backup_coverage.tar.gz)
+    rm -f "$BACKUP_FILE"
+    echo "[INFO] Original source files restored"
+fi
+
+echo "[INFO] Coverage report generation complete"
diff --git a/mssql_python/pybind/build.sh b/mssql_python/pybind/build.sh
index 1f589c763..cd8c1cc48 100755
--- a/mssql_python/pybind/build.sh
+++ b/mssql_python/pybind/build.sh
@@ -33,31 +33,42 @@ if [[ "${1:-}" == "codecov" || "${1:-}" == "--coverage" ]]; then
     echo "[MODE] Enabling Clang coverage instrumentation"
     
     # For coverage builds, join multi-line LOG statements to simplify LCOV filtering
-    # This works on a temporary copy - original source is restored on exit
+    # Original source is backed up and must be restored by generate_codecov.sh after analysis
     echo "[ACTION] Preparing source for coverage build (joining LOG statements)"
     
     # Save current directory
     ORIGINAL_DIR=$(pwd)
     
-    # Create backup directory
-    BACKUP_DIR="${ORIGINAL_DIR}/.source_backup_coverage"
-    rm -rf "$BACKUP_DIR"
-    mkdir -p "$BACKUP_DIR"
+    # Create backup using tar to preserve directory structure
+    BACKUP_FILE="${ORIGINAL_DIR}/.source_backup_coverage.tar.gz"
+    echo "[INFO] Creating backup of source files"
+    tar -czf "$BACKUP_FILE" --exclude='build' --exclude='.source_backup*' \
+        $(find . -maxdepth 2 -type f \( -name "*.cpp" -o -name "*.hpp" \) -o -type d -name connection) 2>/dev/null || true
     
-    # Backup all .cpp and .hpp files
-    find . -maxdepth 2 -type f \( -name "*.cpp" -o -name "*.hpp" \) -exec cp {} "$BACKUP_DIR/" \;
-    
-    # Set trap to restore source files on exit (success or failure)
-    trap 'echo "[CLEANUP] Restoring original source files"; cp -f "$BACKUP_DIR"/* "$ORIGINAL_DIR/" 2>/dev/null || true; rm -rf "$BACKUP_DIR"' EXIT
+    if [[ ! -f "$BACKUP_FILE" ]]; then
+        echo "[ERROR] Failed to create source backup"
+        exit 1
+    fi
     
     # Join LOG statements using the helper script
     SCRIPT_PATH="${ORIGINAL_DIR}/../../eng/scripts/join_logs_for_coverage.py"
     if [[ -f "$SCRIPT_PATH" ]]; then
         python3 "$SCRIPT_PATH" "$ORIGINAL_DIR"
-        echo "[SUCCESS] LOG statements joined for coverage build"
+        if [[ $? -eq 0 ]]; then
+            echo "[SUCCESS] LOG statements joined for coverage build"
+            echo "[INFO] Original source backed up to $BACKUP_FILE"
+            echo "[IMPORTANT] Run 'tar -xzf $BACKUP_FILE' in $(pwd) to restore after coverage analysis"
+        else
+            echo "[ERROR] Failed to join LOG statements"
+            # Restore backup and exit
+            tar -xzf "$BACKUP_FILE" 2>/dev/null
+            rm -f "$BACKUP_FILE"
+            exit 1
+        fi
     else
         echo "[WARNING] join_logs_for_coverage.py not found at $SCRIPT_PATH"
         echo "[WARNING] Continuing with original source (LOG filtering may be incomplete)"
+        rm -f "$BACKUP_FILE"  # No need for backup if not joining
     fi
 fi
 

From cac576ea8ecb836f674ffaa5f0a39aecc12967ee Mon Sep 17 00:00:00 2001
From: gargsaumya <saumyagarg.100@gmail.com>
Date: Tue, 19 May 2026 12:40:32 +0530
Subject: [PATCH 3/4] FIX: Replace parenthesis counting with semicolon-based
 LOG joining

Critical fix for Copilot review comment (Medium priority):

- Switched from naive parenthesis counting to semicolon-based joining

- Prevents issues with unbalanced parens in C++ string literals

- More reliable for C-style statements that always end with semicolon

Impact on coverage:

- Ensures ALL multi-line LOG statements are joined properly

- Previously, LOGs with unbalanced parens in strings failed to join

- Their continuation lines were still counted toward coverage

- This fix should improve coverage from 80% closer to 81%

Addresses Copilot automated code review feedback
---
 eng/scripts/join_logs_for_coverage.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/eng/scripts/join_logs_for_coverage.py b/eng/scripts/join_logs_for_coverage.py
index d92083192..d4b13f2ca 100644
--- a/eng/scripts/join_logs_for_coverage.py
+++ b/eng/scripts/join_logs_for_coverage.py
@@ -13,7 +13,11 @@
 
 
 def join_log_statements(content: str) -> str:
-    """Join multi-line LOG macro calls onto a single line."""
+    """Join multi-line LOG macro calls onto a single line.
+    
+    Uses semicolon-based joining rather than parenthesis counting to avoid
+    issues with unbalanced parentheses inside C++ string literals.
+    """
     lines = content.split('\n')
     result = []
     i = 0
@@ -23,26 +27,27 @@ def join_log_statements(content: str) -> str:
         
         # Check if this line contains a LOG macro start
         if re.search(r'\bLOG[A-Z_]*\s*\(', line):
-            # Start collecting the full statement
+            # Start collecting the full statement until we find a semicolon
             full_statement = line
-            paren_depth = line.count('(') - line.count(')')
             start_line = i + 1  # For error reporting
             i += 1
             lines_collected = 1
-            max_lines = 20  # Safety limit to prevent infinite loops
+            max_lines = 20  # Safety limit to prevent runaway joins
             
-            # Continue collecting until we close all parentheses
-            while i < len(lines) and paren_depth > 0 and lines_collected < max_lines:
+            # Continue collecting until we find a semicolon (end of statement)
+            # This is more reliable than parenthesis counting for LOG statements
+            # because it doesn't get confused by unbalanced parens in string literals
+            while i < len(lines) and ';' not in full_statement and lines_collected < max_lines:
                 next_line = lines[i]
                 full_statement += ' ' + next_line.strip()
-                paren_depth += next_line.count('(') - next_line.count(')')
                 i += 1
                 lines_collected += 1
             
-            # Validation: Check if we successfully closed all parentheses
-            if paren_depth != 0:
-                print(f"[WARNING] Unbalanced parentheses in LOG statement starting at line {start_line}")
-                print(f"[WARNING] Collected {lines_collected} lines, paren_depth={paren_depth}")
+            # Validation: Check if we found a semicolon
+            if ';' not in full_statement:
+                print(f"[WARNING] No semicolon found in LOG statement starting at line {start_line}")
+                print(f"[WARNING] Collected {lines_collected} lines without finding ';'")
+                print(f"[WARNING] First 100 chars: {full_statement[:100]}...")
                 # Keep the original multi-line format for safety
                 result.extend(lines[start_line-1:i])
             else:

From 322402e4216b7e29d006b5932364cb1b0709b27e Mon Sep 17 00:00:00 2001
From: gargsaumya <saumyagarg.100@gmail.com>
Date: Tue, 19 May 2026 12:51:19 +0530
Subject: [PATCH 4/4] FIX: Use proper C++ tokenizer for LOG statement joining

Critical improvement over semicolon-based approach:
- Implements proper C++ tokenizer that understands syntax
- Correctly handles string literals, char literals, comments
- Avoids corruption from semicolons/parens in strings

Examples now handled correctly:
- LOG("SQL: SELECT *; WHERE", x); - semicolon in string
- LOG("unbalanced (", x); - unbalanced paren in string
- LOG(')', code); - closing paren as character literal
- LOG("msg", x); // comment with ) - comment with paren

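A literal- and comment-aware scan is more robust than counting raw parentheses or semicolons.
It is not a full C++ tokenizer: raw string literals (R"(...)") are not special-cased.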
Addresses Copilot code review suggestion (option a)
---
 eng/scripts/join_logs_for_coverage.py | 155 ++++++++++++++++++++------
 1 file changed, 122 insertions(+), 33 deletions(-)

diff --git a/eng/scripts/join_logs_for_coverage.py b/eng/scripts/join_logs_for_coverage.py
index d4b13f2ca..9742c1338 100644
--- a/eng/scripts/join_logs_for_coverage.py
+++ b/eng/scripts/join_logs_for_coverage.py
@@ -5,6 +5,9 @@
 This script is used only during coverage builds to simplify LOG statement exclusion.
 It doesn't modify the original source files - it works on copies during the build.
 Adjacent string literals are concatenated at compile time, so runtime behavior is identical.
+
+Uses a proper C++ tokenizer to handle string literals, character literals, and comments
+correctly, avoiding issues with unbalanced parentheses or semicolons in strings.
 """
 
 import re
@@ -12,12 +15,112 @@
 from pathlib import Path
 
 
-def join_log_statements(content: str) -> str:
-    """Join multi-line LOG macro calls onto a single line.
+_LOG_MACRO_PATTERN = re.compile(r'\bLOG[A-Z_]*\s*\(')
+
+
+def _find_log_macro_open(line: str):
+    """Return the index of the opening parenthesis for a LOG-like macro, if present."""
+    match = _LOG_MACRO_PATTERN.search(line)
+    if not match:
+        return None
+    return match.end() - 1
+
+
+def _find_log_statement_end(lines, start_line, open_paren_index):
+    """Find the line index where the LOG macro call closes, ignoring literals/comments.
     
-    Uses semicolon-based joining rather than parenthesis counting to avoid
-    issues with unbalanced parentheses inside C++ string literals.
+    This properly handles:
+    - String literals: LOG("unbalanced (", x);
+    - Character literals: LOG(')', code);  
+    - Line comments: LOG("msg", x);  // comment with )
+    - Block comments: LOG("msg" /* comment ) */, x);
     """
+    depth = 0
+    in_string = False
+    in_char = False
+    in_block_comment = False
+    escape = False
+    
+    for line_index in range(start_line, len(lines)):
+        line = lines[line_index]
+        i = open_paren_index if line_index == start_line else 0
+        in_line_comment = False
+        
+        while i < len(line):
+            ch = line[i]
+            nxt = line[i + 1] if i + 1 < len(line) else ''
+            
+            # Line comments consume rest of line
+            if in_line_comment:
+                break
+            
+            # Inside block comment - only look for */
+            if in_block_comment:
+                if ch == '*' and nxt == '/':
+                    in_block_comment = False
+                    i += 2
+                    continue
+                i += 1
+                continue
+            
+            # Inside string literal - handle escapes
+            if in_string:
+                if escape:
+                    escape = False
+                elif ch == '\\':
+                    escape = True
+                elif ch == '"':
+                    in_string = False
+                i += 1
+                continue
+            
+            # Inside character literal - handle escapes
+            if in_char:
+                if escape:
+                    escape = False
+                elif ch == '\\':
+                    escape = True
+                elif ch == "'":
+                    in_char = False
+                i += 1
+                continue
+            
+            # Check for comment starts
+            if ch == '/' and nxt == '/':
+                in_line_comment = True
+                break
+            if ch == '/' and nxt == '*':
+                in_block_comment = True
+                i += 2
+                continue
+            
+            # Check for literal starts
+            if ch == '"':
+                in_string = True
+                escape = False
+                i += 1
+                continue
+            if ch == "'":
+                in_char = True
+                escape = False
+                i += 1
+                continue
+            
+            # Count parentheses depth outside of literals/comments
+            if ch == '(':
+                depth += 1
+            elif ch == ')':
+                depth -= 1
+                if depth == 0:
+                    return line_index
+            
+            i += 1
+    
+    return None
+
+
+def join_log_statements(content: str) -> str:
+    """Join multi-line LOG macro calls onto a single line using proper C++ tokenization."""
     lines = content.split('\n')
     result = []
     i = 0
@@ -26,36 +129,22 @@ def join_log_statements(content: str) -> str:
         line = lines[i]
         
         # Check if this line contains a LOG macro start
-        if re.search(r'\bLOG[A-Z_]*\s*\(', line):
-            # Start collecting the full statement until we find a semicolon
-            full_statement = line
-            start_line = i + 1  # For error reporting
-            i += 1
-            lines_collected = 1
-            max_lines = 20  # Safety limit to prevent runaway joins
-            
-            # Continue collecting until we find a semicolon (end of statement)
-            # This is more reliable than parenthesis counting for LOG statements
-            # because it doesn't get confused by unbalanced parens in string literals
-            while i < len(lines) and ';' not in full_statement and lines_collected < max_lines:
-                next_line = lines[i]
-                full_statement += ' ' + next_line.strip()
-                i += 1
-                lines_collected += 1
-            
-            # Validation: Check if we found a semicolon
-            if ';' not in full_statement:
-                print(f"[WARNING] No semicolon found in LOG statement starting at line {start_line}")
-                print(f"[WARNING] Collected {lines_collected} lines without finding ';'")
-                print(f"[WARNING] First 100 chars: {full_statement[:100]}...")
-                # Keep the original multi-line format for safety
-                result.extend(lines[start_line-1:i])
-            else:
-                # Add the joined statement
+        open_paren_index = _find_log_macro_open(line)
+        if open_paren_index is not None:
+            # Find where the LOG statement ends, respecting C++ syntax
+            end_index = _find_log_statement_end(lines, i, open_paren_index)
+            if end_index is not None and end_index > i:
+                # Multi-line LOG statement found - join it
+                full_statement = lines[i]
+                for join_index in range(i + 1, end_index + 1):
+                    full_statement += ' ' + lines[join_index].strip()
                 result.append(full_statement)
-        else:
-            result.append(line)
-            i += 1
+                i = end_index + 1
+                continue
+        
+        # Not a LOG statement or single-line LOG - keep as is
+        result.append(line)
+        i += 1
     
     return '\n'.join(result)