#!/usr/bin/env python3
"""
Join multi-line LOG() calls onto single lines for LCOV coverage filtering.

This script is used only during coverage builds to simplify LOG statement exclusion.
Files are rewritten IN PLACE, so the caller is responsible for backing up the
sources beforehand and restoring them afterwards.
Adjacent string literals are concatenated at compile time, so runtime behavior is identical.

A small C++-aware scanner is used to skip string literals, character literals and
comments, so unbalanced parentheses or semicolons inside them do not confuse the
search for the end of a LOG call. Raw string literals and digit separators are
not recognised.
"""

import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple


_LOG_MACRO_PATTERN = re.compile(r'\bLOG[A-Z_]*\s*\(')


class _LexState:
    """Lexer state that has to survive from one source line to the next."""

    __slots__ = ('in_string', 'in_char', 'in_block_comment', 'escape')

    def __init__(self) -> None:
        self.in_string = False
        self.in_char = False
        self.in_block_comment = False
        self.escape = False


def _classify_line(line: str, state: _LexState, start: int = 0) -> Tuple[List[int], Optional[int]]:
    """Scan ``line[start:]`` and report which characters are real code.

    Args:
        line: One source line without its trailing newline.
        state: Lexer state carried over from previous lines; updated in place.
        start: Column at which scanning begins.

    Returns:
        ``(code_positions, comment_col)`` where ``code_positions`` lists the
        columns of characters outside literals and comments (literal and
        comment delimiters excluded) and ``comment_col`` is the column where a
        ``//`` comment starts, or ``None`` if the line has none.
    """
    code_positions = []
    comment_col = None
    length = len(line)
    i = start

    while i < length:
        ch = line[i]
        nxt = line[i + 1] if i + 1 < length else ''

        # Inside a block comment only the terminator matters.
        if state.in_block_comment:
            if ch == '*' and nxt == '/':
                state.in_block_comment = False
                i += 2
            else:
                i += 1
            continue

        # Inside a string or character literal: honour backslash escapes.
        if state.in_string or state.in_char:
            if state.escape:
                state.escape = False
            elif ch == '\\':
                state.escape = True
            elif state.in_string and ch == '"':
                state.in_string = False
            elif state.in_char and ch == "'":
                state.in_char = False
            i += 1
            continue

        # A line comment hides everything up to the end of the line.
        if ch == '/' and nxt == '/':
            comment_col = i
            break
        if ch == '/' and nxt == '*':
            state.in_block_comment = True
            i += 2
            continue

        if ch == '"':
            state.in_string = True
            state.escape = False
        elif ch == "'":
            state.in_char = True
            state.escape = False
        else:
            code_positions.append(i)
        i += 1

    return code_positions, comment_col


def _find_log_macro_open(line: str) -> Optional[int]:
    """Return the index of the opening parenthesis for a LOG-like macro, if present.

    Matches that start inside a ``//`` comment, a ``/* */`` comment opened on
    the same line, or a string/character literal are ignored, so commented-out
    LOG calls and text such as ``"LOG("`` are not treated as macro calls.
    Returns ``None`` when the line has no usable match.
    """
    if _LOG_MACRO_PATTERN.search(line) is None:
        return None  # fast path: the vast majority of lines have no LOG macro

    code_positions, _ = _classify_line(line, _LexState())
    code = set(code_positions)
    for match in _LOG_MACRO_PATTERN.finditer(line):
        if match.start() in code:
            return match.end() - 1
    return None


def _scan_log_statement(lines, start_line, open_paren_index) -> Tuple[Optional[int], Dict[int, int]]:
    """Locate the end of a LOG call and the ``//`` comments found on the way.

    Returns:
        ``(end_line, comment_cols)``. ``end_line`` is the index of the line
        holding the parenthesis that closes the call (``None`` if it is never
        closed). ``comment_cols`` maps a scanned line index to the column where
        a ``//`` comment starts on that line.
    """
    state = _LexState()
    depth = 0
    comment_cols = {}

    for line_index in range(start_line, len(lines)):
        line = lines[line_index]
        start = open_paren_index if line_index == start_line else 0
        code_positions, comment_col = _classify_line(line, state, start)
        if comment_col is not None:
            comment_cols[line_index] = comment_col

        # Count parentheses depth outside of literals/comments
        for col in code_positions:
            ch = line[col]
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
                if depth == 0:
                    return line_index, comment_cols

    return None, {}


def _find_log_statement_end(lines, start_line, open_paren_index) -> Optional[int]:
    """Find the line index where the LOG macro call closes, ignoring literals/comments.

    This properly handles:
    - String literals: LOG("unbalanced (", x);
    - Character literals: LOG(')', code);
    - Line comments: LOG("msg", x); // comment with )
    - Block comments: LOG("msg" /* comment ) */, x);

    Args:
        lines: All lines of the file.
        start_line: Index of the line containing the macro name.
        open_paren_index: Column of the macro's opening parenthesis on that line.

    Returns:
        The index of the line with the matching closing parenthesis, or
        ``None`` if the call is not closed before the end of ``lines``.
    """
    end_line, _ = _scan_log_statement(lines, start_line, open_paren_index)
    return end_line


def _join_lines(lines, first, last, comment_cols) -> str:
    """Join ``lines[first:last + 1]`` into one line separated by single spaces.

    ``//`` comments on every line except the last are dropped: once the lines
    are merged, such a comment would otherwise hide the remainder of the
    statement. A trailing comment on the last line is harmless and is kept.
    """
    head = lines[first]
    if first in comment_cols:
        head = head[:comment_cols[first]].rstrip()

    parts = [head]
    for index in range(first + 1, last + 1):
        text = lines[index]
        if index != last and index in comment_cols:
            text = text[:comment_cols[index]]
        text = text.strip()
        if text:  # skip blank / comment-only continuation lines
            parts.append(text)
    return ' '.join(parts)


def join_log_statements(content: str) -> str:
    """Join multi-line LOG macro calls onto a single line.

    Args:
        content: Full text of a C++ source file.

    Returns:
        The text with every LOG-like call that spans several lines collapsed
        onto the line where it starts. Single-line calls, unterminated calls
        and all other lines are returned unchanged.
    """
    lines = content.split('\n')
    result = []
    i = 0

    while i < len(lines):
        line = lines[i]

        # Check if this line contains a LOG macro start
        open_paren_index = _find_log_macro_open(line)
        if open_paren_index is not None:
            # Find where the LOG statement ends, respecting C++ syntax
            end_index, comment_cols = _scan_log_statement(lines, i, open_paren_index)
            if end_index is not None and end_index > i:
                # Multi-line LOG statement found - join it
                result.append(_join_lines(lines, i, end_index, comment_cols))
                i = end_index + 1
                continue

        # Not a LOG statement or single-line LOG - keep as is
        result.append(line)
        i += 1

    return '\n'.join(result)


def process_file(filepath: Path) -> None:
    """Join LOG statements in a single C++ source file, rewriting it in place.

    The file is only written back when its content actually changes. On a
    read/write or decoding failure an error is printed to stderr and the
    process exits with status 1.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        modified = join_log_statements(content)

        # Avoid touching files that contain no multi-line LOG calls.
        if modified != content:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(modified)

        print(f"[INFO] Processed: {filepath}")
    except (OSError, UnicodeError) as e:
        print(f"[ERROR] Failed to process {filepath}: {e}", file=sys.stderr)
        sys.exit(1)


def main():
    """Process all .cpp and .hpp files below a directory.

    The directory is taken from the first command-line argument and defaults
    to the current working directory. Exits with status 1 if it is not an
    existing directory.
    """
    if len(sys.argv) > 1:
        # Process specific directory passed as argument
        base_dir = Path(sys.argv[1])
    else:
        # Default to current directory
        base_dir = Path.cwd()

    if not base_dir.is_dir():
        print(f"[ERROR] Directory not found: {base_dir}", file=sys.stderr)
        sys.exit(1)

    # Find all C++ source files (sorted so the processing order is deterministic).
    # NOTE(review): the search is fully recursive, so it also reaches files the
    # caller may not have backed up (e.g. under a build directory) - confirm
    # that the caller's backup covers everything this can rewrite.
    cpp_files = sorted(base_dir.rglob('*.cpp')) + sorted(base_dir.rglob('*.hpp'))

    if not cpp_files:
        print(f"[WARNING] No .cpp or .hpp files found in {base_dir}")
        return

    print(f"[INFO] Processing {len(cpp_files)} C++ files in {base_dir}")
    for filepath in cpp_files:
        process_file(filepath)

    print(f"[SUCCESS] Joined LOG statements in {len(cpp_files)} files")


if __name__ == '__main__':
    main()
-74,32 +74,42 @@ fi echo "[INFO] Using pybind module: $PYBIND_SO" -# Export C++ coverage, excluding Python headers, pybind11, and system includes +# Export C++ coverage, excluding Python headers, pybind11, system includes, and vendored deps llvm-cov export "$PYBIND_SO" \ -instr-profile=default.profdata \ - -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/)' \ + -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/|build/_deps/)' \ --skip-functions \ -format=lcov > cpp-coverage.info -# Note: LCOV exclusion markers (LCOV_EXCL_LINE) should be added to source code -# to exclude LOG() statements from coverage. However, for automated exclusion -# of all LOG lines without modifying source code, we can use geninfo's --omit-lines -# feature during the merge step (see below). +# Note: LCOV exclusion markers (LCOV_EXCL_LINE) are processed below echo "===================================" echo "[STEP 4] Merging Python + C++ coverage" echo "===================================" -# Merge LCOV reports (ignore inconsistencies in Python LCOV export) -echo "[ACTION] Merging Python and C++ coverage" -lcov -a python-coverage.info -a cpp-coverage.info -o total.info \ +# Merge LCOV reports and filter LOG statements using lcov's built-in exclusion +# The --rc option sets lcov_excl_line to match any line containing LOG macros +# Since we joined multi-line LOGs during build, they're now on single lines +echo "[ACTION] Merging Python and C++ coverage with LOG exclusion" +lcov -a python-coverage.info -a cpp-coverage.info -o total-unfiltered.info \ + --rc lcov_excl_line='\bLOG[A-Z_]*\s*\(' \ --ignore-errors inconsistent,corrupt +echo "[INFO] Coverage merged with LOG statements excluded" + +# Defense-in-depth: drop any vendored third-party sources pulled in via CMake +# FetchContent (e.g. simdutf). The llvm-cov ignore-filename-regex above is the +# primary filter; this catches anything that slips through future deps. 
+echo "[ACTION] Removing vendored third-party sources from merged coverage" +lcov --remove total-unfiltered.info '*/build/_deps/*' -o total.info \ + --ignore-errors inconsistent,unused + # Normalize paths so everything starts from mssql_python/ echo "[ACTION] Normalizing paths in LCOV report" sed -i "s|$(pwd)/||g" total.info # Generate full HTML report +echo "[ACTION] Generating HTML coverage report" genhtml total.info \ --output-directory unified-coverage \ --quiet \ @@ -107,3 +117,18 @@ genhtml total.info \ # Generate Cobertura XML (for Azure DevOps Code Coverage tab) lcov_cobertura total.info --output coverage.xml + +echo "===================================" +echo "[STEP 5] Cleanup" +echo "===================================" + +# Restore original source files if they were backed up during coverage build +BACKUP_FILE="mssql_python/pybind/.source_backup_coverage.tar.gz" +if [ -f "$BACKUP_FILE" ]; then + echo "[ACTION] Restoring original source files from backup" + (cd mssql_python/pybind && tar -xzf .source_backup_coverage.tar.gz) + rm -f "$BACKUP_FILE" + echo "[INFO] Original source files restored" +fi + +echo "[INFO] Coverage report generation complete" diff --git a/mssql_python/pybind/build.sh b/mssql_python/pybind/build.sh index 81177728..cd8c1cc4 100755 --- a/mssql_python/pybind/build.sh +++ b/mssql_python/pybind/build.sh @@ -31,6 +31,45 @@ COVERAGE_MODE=false if [[ "${1:-}" == "codecov" || "${1:-}" == "--coverage" ]]; then COVERAGE_MODE=true echo "[MODE] Enabling Clang coverage instrumentation" + + # For coverage builds, join multi-line LOG statements to simplify LCOV filtering + # Original source is backed up and must be restored by generate_codecov.sh after analysis + echo "[ACTION] Preparing source for coverage build (joining LOG statements)" + + # Save current directory + ORIGINAL_DIR=$(pwd) + + # Create backup using tar to preserve directory structure + BACKUP_FILE="${ORIGINAL_DIR}/.source_backup_coverage.tar.gz" + echo "[INFO] Creating backup of 
source files" + tar -czf "$BACKUP_FILE" --exclude='build' --exclude='.source_backup*' \ + $(find . -maxdepth 2 -type f \( -name "*.cpp" -o -name "*.hpp" \) -o -type d -name connection) 2>/dev/null || true + + if [[ ! -f "$BACKUP_FILE" ]]; then + echo "[ERROR] Failed to create source backup" + exit 1 + fi + + # Join LOG statements using the helper script + SCRIPT_PATH="${ORIGINAL_DIR}/../../eng/scripts/join_logs_for_coverage.py" + if [[ -f "$SCRIPT_PATH" ]]; then + python3 "$SCRIPT_PATH" "$ORIGINAL_DIR" + if [[ $? -eq 0 ]]; then + echo "[SUCCESS] LOG statements joined for coverage build" + echo "[INFO] Original source backed up to $BACKUP_FILE" + echo "[IMPORTANT] Run 'tar -xzf $BACKUP_FILE' in $(pwd) to restore after coverage analysis" + else + echo "[ERROR] Failed to join LOG statements" + # Restore backup and exit + tar -xzf "$BACKUP_FILE" 2>/dev/null + rm -f "$BACKUP_FILE" + exit 1 + fi + else + echo "[WARNING] join_logs_for_coverage.py not found at $SCRIPT_PATH" + echo "[WARNING] Continuing with original source (LOG filtering may be incomplete)" + rm -f "$BACKUP_FILE" # No need for backup if not joining + fi fi # Get Python version from active interpreter