From 9e6418530ac7cd263145033de89fb14cc3f5992d Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Tue, 31 Mar 2026 18:18:38 -0400 Subject: [PATCH 01/10] WIP: pluggable drivers work in progress Co-Authored-By: Claude Sonnet 4.6 --- qualytics/cli/generate_driver.py | 854 +++++++++++++++++++++++++++++++ qualytics/qualytics.py | 2 + 2 files changed, 856 insertions(+) create mode 100644 qualytics/cli/generate_driver.py diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py new file mode 100644 index 0000000..fe0ac51 --- /dev/null +++ b/qualytics/cli/generate_driver.py @@ -0,0 +1,854 @@ +"""CLI command: generate-driver — probe a JDBC driver JAR and emit a YAML driver definition.""" + +from __future__ import annotations + +import json +import os +import re +import shutil +import subprocess +import tempfile +import textwrap +from typing import Annotated, Optional + +import typer +import yaml +from rich import print +from rich.console import Console +from rich.table import Table + +from . import BRAND, print_banner +from .progress import status + +# --------------------------------------------------------------------------- +# Java probe source — compiled at runtime inside a temp dir. +# Outputs a single JSON object to stdout; all diagnostic chatter goes to stderr. +# --------------------------------------------------------------------------- + +_PROBE_JAVA_SOURCE = r""" +import java.io.*; +import java.net.URL; +import java.net.URLClassLoader; +import java.sql.*; +import java.util.*; +import java.util.concurrent.*; + +public class JdbcProbe { + + // ── helpers ────────────────────────────────────────────────────────── + + private static String jq(String s) { + if (s == null) return "null"; + return "\"" + s.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n") + "\""; + } + + private static boolean tryQuery(Connection c, String sql, int timeoutSecs) { + try { + Statement st = c.createStatement(); + st.setQueryTimeout(timeoutSecs); + st.execute(sql); + st.close(); + return true; + } catch (Exception e) { + return false; + } + } + + // ── main ───────────────────────────────────────────────────────────── + + public static void main(String[] args) throws Exception { + // args: [user] [password] [key=val ...] + if (args.length < 2) { + System.err.println("Usage: JdbcProbe [user] [password] [key=val...]"); + System.exit(2); + } + String jarPath = args[0]; + String jdbcUrl = args[1]; + String user = args.length > 2 ? args[2] : null; + String pass = args.length > 3 ? args[3] : null; + + Properties extraProps = new Properties(); + for (int i = 4; i < args.length; i++) { + int eq = args[i].indexOf('='); + if (eq > 0) { + extraProps.setProperty(args[i].substring(0, eq), args[i].substring(eq + 1)); + } + } + + // -- Load JAR into an isolated class loader + URL jarUrl = new File(jarPath).toURI().toURL(); + URLClassLoader loader = new URLClassLoader(new URL[]{jarUrl}, + ClassLoader.getSystemClassLoader().getParent()); + + // Discover Driver via ServiceLoader then DriverManager fallback + Driver driver = null; + try { + ServiceLoader sl = ServiceLoader.load(Driver.class, loader); + for (Driver d : sl) { + if (d.acceptsURL(jdbcUrl)) { driver = d; break; } + } + } catch (Exception ignored) {} + + if (driver == null) { + // Try enumerating classes from the JAR manifest Main-Class / known names + // Fallback: ask DriverManager after registering via Class.forName scan + try { + java.util.jar.JarFile jar = new java.util.jar.JarFile(jarPath); + java.util.Enumeration entries = jar.entries(); + while (entries.hasMoreElements()) { + java.util.jar.JarEntry e = entries.nextElement(); + String name = e.getName(); + if (name.endsWith(".class") && !name.contains("$")) { + String cls = name.replace('/', '.').replace(".class", ""); + try { + Class c = loader.loadClass(cls); + if (Driver.class.isAssignableFrom(c) && !c.isInterface()) { + Driver d = (Driver) c.getDeclaredConstructor().newInstance(); + if (d.acceptsURL(jdbcUrl)) { driver = d; break; } + } + } catch (Throwable ignored2) {} + } + } + jar.close(); + } catch (Exception e) { + System.err.println("JAR scan error: " + e.getMessage()); + } + } + + if (driver == null) { + System.err.println("ERROR: No Driver found in JAR that accepts URL: " + jdbcUrl); + System.exit(3); + } + + String className = driver.getClass().getName(); + System.err.println("Driver class: " + className); + + // -- Connect + Properties connProps = new Properties(); + connProps.putAll(extraProps); + if (user != null && !user.equals("null")) connProps.setProperty("user", user); + if (pass != null && !pass.equals("null")) connProps.setProperty("password", pass); + + Connection conn; + try { + conn = driver.connect(jdbcUrl, connProps); + if (conn == null) throw new SQLException("driver.connect() returned null"); + } catch (SQLException e) { + System.err.println("CONNECTION_ERROR: " + e.getMessage()); + System.exit(4); + return; + } + System.err.println("Connected successfully."); + + DatabaseMetaData meta = conn.getMetaData(); + + // ── Phase 1: metadata (no SQL) ──────────────────────────────────── + + // identifierQuoteChar + String quoteChar = "null"; + try { + String q = meta.getIdentifierQuoteString(); + quoteChar = (q != null && !q.isBlank()) ? jq(q.trim()) : jq("\""); + } catch (Exception e) { System.err.println("identifierQuoteChar err: " + e.getMessage()); } + + // transactionIsolation + String txIsolation = "null"; + try { + int ti = meta.getDefaultTransactionIsolation(); + switch (ti) { + case Connection.TRANSACTION_NONE: txIsolation = "\"NONE\""; break; + case Connection.TRANSACTION_READ_UNCOMMITTED: txIsolation = "\"READ_UNCOMMITTED\""; break; + case Connection.TRANSACTION_READ_COMMITTED: txIsolation = "\"READ_COMMITTED\""; break; + case Connection.TRANSACTION_REPEATABLE_READ: txIsolation = "\"REPEATABLE_READ\""; break; + case Connection.TRANSACTION_SERIALIZABLE: txIsolation = "\"SERIALIZABLE\""; break; + default: txIsolation = "\"READ_COMMITTED\""; + } + } catch (Exception e) { System.err.println("transactionIsolation err: " + e.getMessage()); } + + // tableNameCasing + String casing = "\"asis\""; + try { + if (meta.storesUpperCaseIdentifiers()) casing = "\"upper\""; + else if (meta.storesLowerCaseIdentifiers()) casing = "\"lower\""; + } catch (Exception e) { System.err.println("tableNameCasing err: " + e.getMessage()); } + + // ── Phase 2: SQL probes (5s timeout each) ──────────────────────── + + // validationQuery + String validationQuery = "null"; + String[] valCandidates = {"SELECT 1", "SELECT 1 FROM DUAL", "VALUES 1"}; + for (String q : valCandidates) { + if (tryQuery(conn, q, 5)) { validationQuery = jq(q); break; } + } + + // getTablesUsesNullCatalog + String nullCatalog = "false"; + try { + String catalog = conn.getCatalog(); + ResultSet rs1 = meta.getTables(catalog, null, "%", new String[]{"TABLE"}); + int c1 = 0; while (rs1.next()) c1++; + rs1.close(); + ResultSet rs2 = meta.getTables(null, null, "%", new String[]{"TABLE"}); + int c2 = 0; while (rs2.next()) c2++; + rs2.close(); + nullCatalog = (c2 > c1) ? "true" : "false"; + } catch (Exception e) { System.err.println("getTablesUsesNullCatalog err: " + e.getMessage()); } + + // subqueryRequiresAlias + String subAlias = "false"; + try { + tryQuery(conn, "SELECT * FROM (SELECT 1 AS x) WHERE 1=0", 5); + subAlias = "false"; + } catch (Exception e) { subAlias = "false"; } + if (!tryQuery(conn, "SELECT * FROM (SELECT 1 AS x) WHERE 1=0", 5)) { + subAlias = "true"; + } + + // approxCountDistinctFunction + String approxFn = "null"; + if (tryQuery(conn, "SELECT APPROX_COUNT_DISTINCT(1)", 5)) approxFn = "\"APPROX_COUNT_DISTINCT\""; + else if (tryQuery(conn, "SELECT APPROX_DISTINCT(1)", 5)) approxFn = "\"APPROX_DISTINCT\""; + + // schemaExistenceQueryStyle + String schemaStyle = "\"NONE\""; + if (tryQuery(conn, "SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE 1=0", 5)) + schemaStyle = "\"INFORMATION_SCHEMA\""; + else if (tryQuery(conn, "SHOW SCHEMAS", 5)) + schemaStyle = "\"SHOW_SCHEMAS\""; + else if (tryQuery(conn, "SELECT 1 FROM SYSCAT.SCHEMATA WHERE 1=0", 5)) + schemaStyle = "\"SYSCAT\""; + + // dateArithmeticStyle + interval templates + String dateArith = "\"STANDARD\""; + String intervalTs = "null"; + String intervalDt = "null"; + String upperTs = "null"; + String upperDt = "null"; + + if (tryQuery(conn, "SELECT TIMESTAMPADD(SECOND, 1, '2000-01-01')", 5)) { + dateArith = "\"MYSQL\""; + intervalTs = jq("TIMESTAMPADD(SECOND, TIMESTAMPDIFF(SECOND, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); + intervalDt = jq("TIMESTAMPADD(DAY, TIMESTAMPDIFF(DAY, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); + upperTs = jq("TIMESTAMPADD(SECOND, TIMESTAMPDIFF(SECOND, MIN_{col}, {interval}), {interval})"); + upperDt = jq("TIMESTAMPADD(DAY, TIMESTAMPDIFF(DAY, MIN_{col}, {interval}), {interval})"); + } else if (tryQuery(conn, "SELECT DATEADD(second, 1, '2000-01-01')", 5)) { + dateArith = "\"DATEADD_DATEDIFF\""; + intervalTs = jq("DATEADD(second, DATEDIFF(second, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); + intervalDt = jq("DATEADD(day, DATEDIFF(day, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); + upperTs = jq("DATEADD(second, DATEDIFF(second, MIN_{col}, {interval}), {interval})"); + upperDt = jq("DATEADD(day, DATEDIFF(day, MIN_{col}, {interval}), {interval})"); + } else if (tryQuery(conn, "SELECT NUMTODSINTERVAL(1, 'SECOND') FROM DUAL", 5)) { + dateArith = "\"NUMTODSINTERVAL\""; + intervalTs = jq("MIN_{col} + NUMTODSINTERVAL(EXTRACT(SECOND FROM (MAX_{col} - MIN_{col}))/3, 'SECOND')"); + intervalDt = jq("MIN_{col} + NUMTODSINTERVAL((MAX_{col} - MIN_{col})/3, 'DAY')"); + upperTs = jq("MIN_{col} + NUMTODSINTERVAL(EXTRACT(SECOND FROM ({interval} - MIN_{col})), 'SECOND')"); + upperDt = jq("MIN_{col} + NUMTODSINTERVAL({interval} - MIN_{col}, 'DAY')"); + } else if (tryQuery(conn, "SELECT TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 SECOND)", 5)) { + dateArith = "\"TIMESTAMP_ADD\""; + intervalTs = jq("TIMESTAMP_ADD(MIN_{col}, INTERVAL TIMESTAMP_DIFF(MAX_{col}, MIN_{col}, SECOND)/3 SECOND)"); + intervalDt = jq("DATE_ADD(MIN_{col}, INTERVAL DATE_DIFF(MAX_{col}, MIN_{col}, DAY)/3 DAY)"); + upperTs = jq("TIMESTAMP_ADD({interval}, INTERVAL TIMESTAMP_DIFF({interval}, MIN_{col}, SECOND) SECOND)"); + upperDt = jq("DATE_ADD({interval}, INTERVAL DATE_DIFF({interval}, MIN_{col}, DAY) DAY)"); + } + + // rowLimitSyntax — find a real accessible table first + String rowLimit = "null"; + String sampleTable = null; + try { + ResultSet tables = meta.getTables(null, null, "%", new String[]{"TABLE"}); + if (tables.next()) { + String tcat = tables.getString(1); + String tsch = tables.getString(2); + String tnam = tables.getString(3); + // build qualified name + StringBuilder sb = new StringBuilder(); + if (tcat != null && !tcat.isBlank()) sb.append(tcat).append("."); + if (tsch != null && !tsch.isBlank()) sb.append(tsch).append("."); + sb.append(tnam); + sampleTable = sb.toString(); + } + tables.close(); + } catch (Exception e) { System.err.println("table scan err: " + e.getMessage()); } + + if (sampleTable != null) { + if (tryQuery(conn, "SELECT TOP 1 1 FROM " + sampleTable, 5)) + rowLimit = "\"TOP\""; + else if (tryQuery(conn, "SELECT 1 FROM " + sampleTable + " FETCH FIRST 1 ROWS ONLY", 5)) + rowLimit = "\"FETCH_FIRST\""; + else if (tryQuery(conn, "SELECT 1 FROM " + sampleTable + " WHERE ROWNUM <= 1", 5)) + rowLimit = "\"ROWNUM\""; + else if (tryQuery(conn, "SELECT 1 FROM " + sampleTable + " LIMIT 1", 5)) + rowLimit = "\"LIMIT\""; + } else { + // no tables — try without a table + if (tryQuery(conn, "SELECT TOP 1 1", 5)) rowLimit = "\"TOP\""; + else if (tryQuery(conn, "VALUES 1 FETCH FIRST 1 ROWS ONLY", 5)) rowLimit = "\"FETCH_FIRST\""; + else if (tryQuery(conn, "SELECT 1 LIMIT 1", 5)) rowLimit = "\"LIMIT\""; + } + + // tableSampleTemplate + String sampleTemplate = "null"; + if (sampleTable != null) { + String[][] candidates = { + {"TABLESAMPLE SYSTEM (1)", "\"TABLESAMPLE SYSTEM ({pct})\""}, + {"TABLESAMPLE BERNOULLI (1)", "\"TABLESAMPLE BERNOULLI ({pct})\""}, + {"TABLESAMPLE SYSTEM (1 PERCENT)", "\"TABLESAMPLE SYSTEM ({pct} PERCENT)\""}, + {"SAMPLE (1)", "\"SAMPLE ({pct})\""}, + {"TABLESAMPLE (1)", "\"TABLESAMPLE ({pct})\""}, + {"SAMPLE (1 PERCENT)", "\"SAMPLE ({pct} PERCENT)\""}, + }; + for (String[] pair : candidates) { + if (tryQuery(conn, "SELECT 1 FROM " + sampleTable + " " + pair[0], 5)) { + sampleTemplate = pair[1]; + break; + } + } + } + + conn.close(); + + // ── Emit JSON ───────────────────────────────────────────────────── + StringBuilder out = new StringBuilder(); + out.append("{\n"); + out.append(" \"className\": ").append(jq(className)).append(",\n"); + out.append(" \"identifierQuoteChar\": ").append(quoteChar).append(",\n"); + out.append(" \"transactionIsolation\": ").append(txIsolation).append(",\n"); + out.append(" \"tableNameCasing\": ").append(casing).append(",\n"); + out.append(" \"validationQuery\": ").append(validationQuery).append(",\n"); + out.append(" \"getTablesUsesNullCatalog\": ").append(nullCatalog).append(",\n"); + out.append(" \"subqueryRequiresAlias\": ").append(subAlias).append(",\n"); + out.append(" \"approxCountDistinctFunction\": ").append(approxFn).append(",\n"); + out.append(" \"schemaExistenceQueryStyle\": ").append(schemaStyle).append(",\n"); + out.append(" \"dateArithmeticStyle\": ").append(dateArith).append(",\n"); + out.append(" \"rowLimitSyntax\": ").append(rowLimit).append(",\n"); + out.append(" \"tableSampleTemplate\": ").append(sampleTemplate).append(",\n"); + out.append(" \"intervalCalcDatetimeTimestampTemplate\": ").append(intervalTs).append(",\n"); + out.append(" \"intervalCalcDatetimeDateTemplate\": ").append(intervalDt).append(",\n"); + out.append(" \"upperBoundDatetimeTimestampTemplate\": ").append(upperTs).append(",\n"); + out.append(" \"upperBoundDatetimeDateTemplate\": ").append(upperDt).append("\n"); + out.append("}\n"); + System.out.println(out.toString()); + } +} +""" + +# --------------------------------------------------------------------------- +# Java toolchain helpers +# --------------------------------------------------------------------------- + + +def _require_java_tool(name: str) -> str: + """Return full path to *name* (javac/java), or raise typer.Exit(1).""" + path = shutil.which(name) + if path is None: + print( + f"[red]'{name}' not found on PATH.[/red]\n" + "[yellow]A Java Development Kit (JDK) is required to run driver probes.[/yellow]\n" + "Install a JDK (e.g. OpenJDK 11+) and make sure 'java' and 'javac' are on your PATH." + ) + raise typer.Exit(code=1) + return path + + +def _compile_probe(tmpdir: str) -> str: + """Write and compile JdbcProbe.java; return path to the class directory.""" + src_path = os.path.join(tmpdir, "JdbcProbe.java") + with open(src_path, "w") as fh: + fh.write(_PROBE_JAVA_SOURCE) + + javac = _require_java_tool("javac") + result = subprocess.run( + [javac, "-source", "11", "-target", "11", src_path], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print( + f"[red]Failed to compile Java probe:[/red]\n{result.stderr}" + ) + raise typer.Exit(code=1) + return tmpdir + + +# --------------------------------------------------------------------------- +# Core probe runner +# --------------------------------------------------------------------------- + + +def _run_probe( + *, + jar_path: str, + jdbc_url: str, + user: str | None, + password: str | None, + properties: list[str], +) -> dict: + """Compile + run JdbcProbe; return parsed JSON dict.""" + tmpdir = tempfile.mkdtemp(prefix="qualytics_jdbc_probe_") + try: + class_dir = _compile_probe(tmpdir) + java = _require_java_tool("java") + + cmd = [ + java, + "-cp", + class_dir, + "JdbcProbe", + os.path.abspath(jar_path), + jdbc_url, + user or "null", + password or "null", + ] + list(properties) + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=120, + ) + + # Forward stderr from the probe to our stderr for debugging + if result.stderr.strip(): + for line in result.stderr.strip().splitlines(): + if line.startswith("CONNECTION_ERROR:"): + print(f"[red]JDBC connection failed: {line[len('CONNECTION_ERROR:'):].strip()}[/red]") + raise typer.Exit(code=1) + if line.startswith("ERROR:"): + print(f"[red]{line[len('ERROR:'):].strip()}[/red]") + raise typer.Exit(code=1) + + if result.returncode == 4: + print("[red]Could not connect to the database. Check --url, --user, and --password.[/red]") + raise typer.Exit(code=1) + if result.returncode == 3: + print("[red]No compatible JDBC driver found in the provided JAR for the given URL.[/red]") + raise typer.Exit(code=1) + if result.returncode != 0: + print(f"[red]Probe exited with code {result.returncode}.[/red]") + if result.stderr: + print(f"[dim]{result.stderr.strip()}[/dim]") + raise typer.Exit(code=1) + + try: + return json.loads(result.stdout) + except json.JSONDecodeError as exc: + print(f"[red]Could not parse probe output as JSON: {exc}[/red]") + print(f"[dim]Raw output: {result.stdout[:500]}[/dim]") + raise typer.Exit(code=1) + + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +# --------------------------------------------------------------------------- +# YAML generation helpers +# --------------------------------------------------------------------------- + + +def _extract_prefix(jdbc_url: str) -> str | None: + """Parse 'jdbc::' from a JDBC URL.""" + m = re.match(r"jdbc:([^:]+):", jdbc_url, re.IGNORECASE) + return m.group(1).lower() if m else None + + +def _build_yaml( + prefix: str, + probes: dict, + jdbc_url: str, +) -> tuple[str, list[str], list[str]]: + """ + Build the complete YAML content string from the probes dict. + + Fields that were auto-detected get a short '# auto-detected' comment; + fields that need manual review get '# TODO: review' comments. + """ + + detected_fields: list[str] = [] + todo_fields: list[str] = [] + + def field(name: str, value, comment: str = "") -> str: + if value is None: + rendered = "null" + elif isinstance(value, bool): + rendered = str(value).lower() + elif isinstance(value, int): + rendered = str(value) + else: + # string — quote only if it contains special YAML chars or spaces + sv = str(value) + if sv == "null": + rendered = "null" + elif any(c in sv for c in [':', '#', '[', ']', '{', '}', ',', '&', '*', '?', '|', '-', '<', '>', '=', '!', '%', '@', '`', '"', "'", '\n']): + rendered = yaml.dump(sv, default_flow_style=True).strip() + else: + rendered = sv + comment_str = f" # {comment}" if comment else "" + return f"{name}: {rendered}{comment_str}" + + lines: list[str] = [] + + # ── Header comment ────────────────────────────────────────────────────── + lines.append( + textwrap.dedent(f"""\ + # Generated by: qualytics generate-driver + # Source URL: {jdbc_url} + # + # Fields marked "# auto-detected" were probed from JDBC metadata and live SQL. + # Fields marked "# TODO: review" could not be detected automatically; verify + # them against your database documentation before deploying. + # + # Deploy this file to: META-INF/jdbc-drivers/{prefix}.yaml + """) + ) + + # ── Auto-detected section ─────────────────────────────────────────────── + lines.append("# ── Auto-detected (verify before deploying) ──────────────────────") + + class_name = probes.get("className") + lines.append(field("prefix", prefix, "review — extracted from JDBC URL")) + lines.append(field("className", class_name, "auto-detected")) + todo_fields.append("prefix") + detected_fields.append("className") + + quote_char = probes.get("identifierQuoteChar") + lines.append(field("identifierQuoteChar", quote_char, "auto-detected")) + detected_fields.append("identifierQuoteChar") + + tx = probes.get("transactionIsolation") + lines.append(field("transactionIsolation", tx, "auto-detected" if tx else "TODO: review")) + (detected_fields if tx else todo_fields).append("transactionIsolation") + + casing = probes.get("tableNameCasing") + lines.append(field("tableNameCasing", casing, "auto-detected")) + detected_fields.append("tableNameCasing") + + row_limit = probes.get("rowLimitSyntax") + lines.append(field("rowLimitSyntax", row_limit, "auto-detected" if row_limit else "TODO: review — try TOP, LIMIT, FETCH_FIRST, ROWNUM")) + (detected_fields if row_limit else todo_fields).append("rowLimitSyntax") + + sub_alias = probes.get("subqueryRequiresAlias", False) + lines.append(field("subqueryRequiresAlias", sub_alias, "auto-detected")) + detected_fields.append("subqueryRequiresAlias") + + lines.append("") + + val_q = probes.get("validationQuery") + lines.append(field("validationQuery", val_q, "auto-detected" if val_q else "TODO: review — try SELECT 1 or SELECT 1 FROM DUAL")) + (detected_fields if val_q else todo_fields).append("validationQuery") + + lines.append("") + + sample_tmpl = probes.get("tableSampleTemplate") + lines.append(field("tableSampleTemplate", sample_tmpl, "auto-detected" if sample_tmpl else "null — no TABLESAMPLE support detected")) + detected_fields.append("tableSampleTemplate") + + approx = probes.get("approxCountDistinctFunction") + lines.append(field( + "approxCountDistinctFunction", + approx, + "auto-detected" if approx else "null — falls back to COUNT(DISTINCT col)", + )) + detected_fields.append("approxCountDistinctFunction") + + lines.append("") + + get_tables_null = probes.get("getTablesUsesNullCatalog", False) + lines.append(field("getTablesUsesNullCatalog", get_tables_null, "auto-detected")) + detected_fields.append("getTablesUsesNullCatalog") + + schema_style = probes.get("schemaExistenceQueryStyle", "NONE") + lines.append(field("schemaExistenceQueryStyle", schema_style, "auto-detected")) + detected_fields.append("schemaExistenceQueryStyle") + + date_arith = probes.get("dateArithmeticStyle", "STANDARD") + lines.append(field("dateArithmeticStyle", date_arith, "auto-detected")) + detected_fields.append("dateArithmeticStyle") + + lines.append("") + + # ── Performance tuning ────────────────────────────────────────────────── + lines.append("# ── Performance tuning (adjust for your workload) ────────────────") + lines.append(field("insertBatchSize", 10000, "TODO: tune for driver performance")) + lines.append(field("maxPartitionParallelism", 10, "TODO: set to 1 for write-heavy databases")) + lines.append(field("dataSizeLimit", "LONG_MAX", "TODO: use INT_MAX for SQL Server, Redshift, Db2")) + todo_fields += ["insertBatchSize", "maxPartitionParallelism", "dataSizeLimit"] + + lines.append("") + + # ── Needs manual research ──────────────────────────────────────────────── + lines.append("# ── Needs manual research (database-specific) ────────────────────") + lines.append(field("timestampLiteralStyle", "PLAIN", "TODO: check if CAST_DATETIME2, TO_TIMESTAMP, etc. applies")) + lines.append(field("dateLiteralStyle", "PLAIN", "TODO: check if DATE_PREFIX or TO_DATE applies")) + lines.append(field("schemaOnlyQueryStyle", "CTE", "TODO: check if SQLSERVER_TOP0, WHERE_FALSE_QUERYA, etc. applies")) + lines.append(field("viewSampleFallback", "RAND", "TODO: verify RAND() is supported")) + lines.append(field("rowCountQueryStyle", "COUNT_STAR", "TODO: check if INFORMATION_SCHEMA_TABLES_WITH_SIZE etc. applies")) + todo_fields += ["timestampLiteralStyle", "dateLiteralStyle", "schemaOnlyQueryStyle", + "viewSampleFallback", "rowCountQueryStyle"] + + lines.append("") + lines.append("systemSchemaExclusions: [] # TODO: add internal system schemas") + lines.append("systemSchemaExclusionPrefixes: [] # TODO: add prefixes of temporary schemas") + lines.append("systemCatalogExclusions: [] # TODO: add internal system catalogs") + todo_fields += ["systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions"] + + lines.append("") + + # ── Advanced ──────────────────────────────────────────────────────────── + lines.append("# ── Advanced (leave null unless you know you need them) ──────────") + lines.append("connectionProperties: {} # TODO: add driver-specific properties if needed") + lines.append("sessionInitStatements: [] # TODO: add session SQL if needed (NLS, SET DATEFORMAT, etc.)") + lines.append(field("readOnly", False)) + lines.append(field("dialectClass", None, "TODO: set to JdbcDialect class if bundling Spark dialect")) + todo_fields += ["connectionProperties", "sessionInitStatements", "dialectClass"] + + lines.append("") + + # ── Date arithmetic templates ──────────────────────────────────────────── + lines.append("# ── Date arithmetic templates ────────────────────────────────────") + lines.append("# Placeholders: {col} = column name, {interval} = interval expression") + lines.append("") + lines.append(field("intervalCalcNumericTemplate", None, "use generic CASE/DECIMAL(38,0) fallback")) + + int_ts = probes.get("intervalCalcDatetimeTimestampTemplate") + int_dt = probes.get("intervalCalcDatetimeDateTemplate") + up_ts = probes.get("upperBoundDatetimeTimestampTemplate") + up_dt = probes.get("upperBoundDatetimeDateTemplate") + + if int_ts: + lines.append(field("intervalCalcDatetimeTimestampTemplate", int_ts, "auto-detected")) + detected_fields.append("intervalCalcDatetimeTimestampTemplate") + else: + lines.append(field("intervalCalcDatetimeTimestampTemplate", None, "TODO: set timestamp midpoint expression")) + todo_fields.append("intervalCalcDatetimeTimestampTemplate") + + if int_dt: + lines.append(field("intervalCalcDatetimeDateTemplate", int_dt, "auto-detected")) + detected_fields.append("intervalCalcDatetimeDateTemplate") + else: + lines.append(field("intervalCalcDatetimeDateTemplate", None, "TODO: set date midpoint expression")) + todo_fields.append("intervalCalcDatetimeDateTemplate") + + lines.append("") + lines.append(field("upperBoundNumericTemplate", None, "use code default")) + + if up_ts: + lines.append(field("upperBoundDatetimeTimestampTemplate", up_ts, "auto-detected")) + detected_fields.append("upperBoundDatetimeTimestampTemplate") + else: + lines.append(field("upperBoundDatetimeTimestampTemplate", None, "TODO: set timestamp upper-bound expression")) + todo_fields.append("upperBoundDatetimeTimestampTemplate") + + if up_dt: + lines.append(field("upperBoundDatetimeDateTemplate", up_dt, "auto-detected")) + detected_fields.append("upperBoundDatetimeDateTemplate") + else: + lines.append(field("upperBoundDatetimeDateTemplate", None, "TODO: set date upper-bound expression")) + todo_fields.append("upperBoundDatetimeDateTemplate") + + return "\n".join(lines) + "\n", detected_fields, todo_fields + + +# --------------------------------------------------------------------------- +# CLI command +# --------------------------------------------------------------------------- + +generate_driver_app = typer.Typer( + name="generate-driver", + help="Generate a YAML driver definition by probing a JDBC driver JAR.", + invoke_without_command=True, +) + + +@generate_driver_app.callback(invoke_without_command=True) +def generate_driver( + ctx: typer.Context, + jar: Annotated[ + str, + typer.Option( + "--jar", + help="Path to the JDBC driver JAR file.", + show_default=False, + ), + ], + url: Annotated[ + str, + typer.Option( + "--url", + help="JDBC connection URL (e.g. jdbc:postgresql://host:5432/db).", + show_default=False, + ), + ], + user: Annotated[ + Optional[str], + typer.Option( + "--user", + help="Database username.", + show_default=False, + ), + ] = None, + password: Annotated[ + Optional[str], + typer.Option( + "--password", + help="Database password.", + show_default=False, + ), + ] = None, + properties: Annotated[ + Optional[list[str]], + typer.Option( + "--properties", + help="Extra JDBC connection properties as key=value pairs (repeatable).", + show_default=False, + ), + ] = None, + output: Annotated[ + Optional[str], + typer.Option( + "--output", + "-o", + help="Output file path. Defaults to .yaml in the current directory.", + show_default=False, + ), + ] = None, +) -> None: + """Generate a YAML driver definition by probing a JDBC driver JAR. + + Connects to the database using the provided JAR and URL, runs a series of + introspection probes, and writes a best-effort YAML file you can review + and edit before deploying to META-INF/jdbc-drivers/. + + Requires a JDK (java + javac) on PATH. + + Examples: + + \\b + qualytics generate-driver \\ + --jar ./postgresql-42.7.3.jar \\ + --url jdbc:postgresql://localhost:5432/mydb \\ + --user alice --password secret + + qualytics generate-driver \\ + --jar ./custom-driver.jar \\ + --url jdbc:customdb://host:1234/catalog \\ + --properties loginTimeout=30 \\ + --output custom.yaml + """ + # Skip if invoked as part of a help display + if ctx.invoked_subcommand is not None: + return + + print_banner(subtitle="[bold]Generate Driver[/bold]") + + # ── Validate JAR path ──────────────────────────────────────────────── + jar_path = os.path.abspath(jar) + if not os.path.isfile(jar_path): + print(f"[red]JAR file not found: {jar_path}[/red]") + raise typer.Exit(code=1) + + # ── Extract prefix from URL ────────────────────────────────────────── + prefix = _extract_prefix(url) + if prefix is None: + print( + f"[red]Could not parse a JDBC prefix from URL: {url}[/red]\n" + "[yellow]Expected format: jdbc::...[/yellow]" + ) + raise typer.Exit(code=1) + + # ── Determine output path ──────────────────────────────────────────── + if output: + out_path = os.path.abspath(output) + else: + out_path = os.path.join(os.getcwd(), f"{prefix}.yaml") + + print(f" JAR: [bold]{jar_path}[/bold]") + print(f" URL: [bold]{url}[/bold]") + print(f" Output: [bold]{out_path}[/bold]") + print() + + # ── Run probes ─────────────────────────────────────────────────────── + probes: dict = {} + with status("[bold cyan]Probing JDBC driver capabilities...[/bold cyan]"): + probes = _run_probe( + jar_path=jar_path, + jdbc_url=url, + user=user, + password=password, + properties=list(properties or []), + ) + + # ── Build YAML ─────────────────────────────────────────────────────── + yaml_content, detected_fields, todo_fields = _build_yaml(prefix, probes, url) + + # ── Write output ───────────────────────────────────────────────────── + try: + with open(out_path, "w") as fh: + fh.write(yaml_content) + except OSError as e: + print(f"[red]Failed to write output file: {e}[/red]") + raise typer.Exit(code=1) + + # ── Print summary ──────────────────────────────────────────────────── + console = Console() + console.print() + + table = Table(title="Probe Results", show_header=True, header_style=f"bold {BRAND}") + table.add_column("Field", style="bold", min_width=38) + table.add_column("Result", min_width=20) + table.add_column("Status", min_width=12) + + probe_display = [ + ("className", probes.get("className")), + ("prefix (from URL)", prefix), + ("identifierQuoteChar", probes.get("identifierQuoteChar")), + ("transactionIsolation", probes.get("transactionIsolation")), + ("tableNameCasing", probes.get("tableNameCasing")), + ("validationQuery", probes.get("validationQuery")), + ("subqueryRequiresAlias", str(probes.get("subqueryRequiresAlias", False)).lower()), + ("getTablesUsesNullCatalog", str(probes.get("getTablesUsesNullCatalog", False)).lower()), + ("approxCountDistinctFunction", probes.get("approxCountDistinctFunction")), + ("schemaExistenceQueryStyle", probes.get("schemaExistenceQueryStyle")), + ("dateArithmeticStyle", probes.get("dateArithmeticStyle")), + ("rowLimitSyntax", probes.get("rowLimitSyntax")), + ("tableSampleTemplate", probes.get("tableSampleTemplate")), + ("intervalCalcDatetimeTimestampTemplate", probes.get("intervalCalcDatetimeTimestampTemplate")), + ("intervalCalcDatetimeDateTemplate", probes.get("intervalCalcDatetimeDateTemplate")), + ("upperBoundDatetimeTimestampTemplate", probes.get("upperBoundDatetimeTimestampTemplate")), + ("upperBoundDatetimeDateTemplate", probes.get("upperBoundDatetimeDateTemplate")), + ] + + for name, value in probe_display: + if value is not None and value != "null": + display_val = str(value) + if len(display_val) > 50: + display_val = display_val[:47] + "..." + table.add_row(name, display_val, f"[{BRAND}]detected[/{BRAND}]") + else: + table.add_row(name, "—", "[yellow]needs review[/yellow]") + + console.print(table) + console.print() + + todo_count = sum( + 1 for _, v in probe_display if v is None or v == "null" + ) + # Always-todo fields (performance tuning, manual research) + always_todo = [ + "insertBatchSize", "maxPartitionParallelism", "dataSizeLimit", + "timestampLiteralStyle", "dateLiteralStyle", "schemaOnlyQueryStyle", + "viewSampleFallback", "rowCountQueryStyle", + "systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions", + "connectionProperties", "sessionInitStatements", "dialectClass", + ] + total_todo = todo_count + len(always_todo) + auto_detected = len(probe_display) - todo_count + + print( + f" [{BRAND}]{auto_detected} field(s) auto-detected[/{BRAND}] " + f"[yellow]{total_todo} field(s) need review[/yellow]\n" + ) + print(f" [bold]Written:[/bold] {out_path}") + print( + "\n [dim]Review the file, fill in the TODO comments, then deploy to:[/dim]" + "\n [dim]META-INF/jdbc-drivers/" + prefix + ".yaml[/dim]\n" + ) diff --git a/qualytics/qualytics.py b/qualytics/qualytics.py index 774e33d..2217b1c 100644 --- a/qualytics/qualytics.py +++ b/qualytics/qualytics.py @@ -22,6 +22,7 @@ from .cli.users import users_app from .cli.teams import teams_app from .cli.tags import tags_app +from .cli.generate_driver import generate_driver_app # Import config for environment setup from .config import DOTENV_PATH @@ -46,6 +47,7 @@ app.add_typer(teams_app, name="teams") app.add_typer(tags_app, name="tags") app.command("doctor", help="Check CLI health and connectivity")(doctor) +app.add_typer(generate_driver_app, name="generate-driver") if __name__ == "__main__": From eafc1c4327e55c732bc5b9766089a05172965fe4 Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Thu, 2 Apr 2026 23:42:21 -0400 Subject: [PATCH 02/10] =?UTF-8?q?feat:=20generate-driver=20=E2=80=94=20ful?= =?UTF-8?q?l=20DriverDefinition=20alignment=20+=20package-drivers=20comman?= =?UTF-8?q?d?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rewrote Java JDBC probe to auto-detect all 14 DriverDefinition fields: NDV(1) added to approxCountDistinctFunction probe; new rowCountQueryStyle probe (INFORMATION_SCHEMA_ROW_COUNT, INFORMATION_SCHEMA_TABLES_WITH_SIZE, ALL_TABLES); all probes emit to JSON for Python consumption. - Rewrote _build_yaml() with canonical key ordering (Identity → SQL dialect → Performance → Schema/catalog → Style selectors → Date arithmetic templates → Connectivity → Spark JdbcDialect → URL construction → Connection spec). Applies "non-default keys only" rule: fields equal to DriverDefinition defaults are omitted. Only insertBatchSize is excluded (write-only); maxPartitionParallelism restored as a TODO field. - Added _derive_url_metadata() returning (port, template, url_components). connectionSpec only marks fields required when actually present in the probe URL — portless drivers (SQLite, MongoDB) no longer get a spurious required port field. - Default output path changed to dist/META-INF/jdbc-drivers/.yaml. Directory is auto-created. Index file created/updated after every write (idempotent — no duplicates on re-run). - Added package-drivers top-level command: bundles dist/ into custom-drivers.jar using Python zipfile (no jar tool required). - Removed Source URL from generated YAML header so probe URL does not confuse the LLM when suggesting jdbcUrlTemplate values. Co-Authored-By: Claude Sonnet 4.6 --- qualytics/cli/generate_driver.py | 985 ++++++++++++++++++++++++++----- qualytics/qualytics.py | 3 +- uv.lock | 6 +- 3 files changed, 844 insertions(+), 150 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index fe0ac51..e361632 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -9,6 +9,7 @@ import subprocess import tempfile import textwrap +import zipfile from typing import Annotated, Optional import typer @@ -142,6 +143,14 @@ DatabaseMetaData meta = conn.getMetaData(); + // databaseProductName / databaseProductVersion + String dbProductName = "null"; + String dbProductVersion = "null"; + try { + dbProductName = jq(meta.getDatabaseProductName()); + dbProductVersion = jq(meta.getDatabaseProductVersion()); + } catch (Exception e) { System.err.println("dbProduct err: " + e.getMessage()); } + // ── Phase 1: metadata (no SQL) ──────────────────────────────────── // identifierQuoteChar @@ -208,15 +217,18 @@ String approxFn = "null"; if (tryQuery(conn, "SELECT APPROX_COUNT_DISTINCT(1)", 5)) approxFn = "\"APPROX_COUNT_DISTINCT\""; else if (tryQuery(conn, "SELECT APPROX_DISTINCT(1)", 5)) approxFn = "\"APPROX_DISTINCT\""; + else if (tryQuery(conn, "SELECT NDV(1)", 5)) approxFn = "\"NDV\""; // schemaExistenceQueryStyle String schemaStyle = "\"NONE\""; if (tryQuery(conn, "SELECT 1 FROM INFORMATION_SCHEMA.SCHEMATA WHERE 1=0", 5)) schemaStyle = "\"INFORMATION_SCHEMA\""; else if (tryQuery(conn, "SHOW SCHEMAS", 5)) - schemaStyle = "\"SHOW_SCHEMAS\""; + schemaStyle = "\"SHOW_SCHEMAS_ITERATE\""; else if (tryQuery(conn, "SELECT 1 FROM SYSCAT.SCHEMATA WHERE 1=0", 5)) schemaStyle = "\"SYSCAT\""; + else if (tryQuery(conn, "SELECT 1 FROM sys.schemas WHERE 1=0", 5)) + schemaStyle = "\"SYS_SCHEMAS\""; // dateArithmeticStyle + interval templates String dateArith = "\"STANDARD\""; @@ -226,7 +238,7 @@ String upperDt = "null"; if (tryQuery(conn, "SELECT TIMESTAMPADD(SECOND, 1, '2000-01-01')", 5)) { - dateArith = "\"MYSQL\""; + dateArith = "\"STANDARD\""; intervalTs = jq("TIMESTAMPADD(SECOND, TIMESTAMPDIFF(SECOND, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); intervalDt = jq("TIMESTAMPADD(DAY, TIMESTAMPDIFF(DAY, MIN_{col}, MAX_{col}) / 3, MIN_{col})"); upperTs = jq("TIMESTAMPADD(SECOND, TIMESTAMPDIFF(SECOND, MIN_{col}, {interval}), {interval})"); @@ -305,12 +317,57 @@ } } + // viewSampleFallback — probe which random function is supported for view sampling + String viewSampleFallback = "\"RAND\""; + if (!tryQuery(conn, "SELECT RAND()", 5)) { + if (tryQuery(conn, "SELECT RANDOM()", 5)) + viewSampleFallback = "\"RANDOM\""; + else if (tryQuery(conn, "SELECT NEWID()", 5)) + viewSampleFallback = "\"NEWID\""; + } + + // timestampLiteralStyle — detect DB-specific timestamp cast syntax + String timestampLiteralStyle = "\"PLAIN\""; + if (tryQuery(conn, "SELECT CAST('2000-01-01 00:00:00' AS DATETIME2)", 5)) + timestampLiteralStyle = "\"CAST_DATETIME2\""; + else if (tryQuery(conn, "SELECT TO_TIMESTAMP('2000-01-01 00:00:00', 'YYYY-MM-DD HH24:MI:SS') FROM DUAL", 5)) + timestampLiteralStyle = "\"TO_TIMESTAMP\""; + else if (tryQuery(conn, "SELECT TIMESTAMP '2000-01-01 00:00:00'", 5)) + timestampLiteralStyle = "\"TIMESTAMP_PREFIX\""; + + // dateLiteralStyle — detect Oracle-style TO_DATE (DUAL distinguishes Oracle from others) + String dateLiteralStyle = "\"PLAIN\""; + if (tryQuery(conn, "SELECT TO_DATE('2000-01-01', 'YYYY-MM-DD') FROM DUAL", 5)) + dateLiteralStyle = "\"TO_DATE\""; + + // schemaOnlyQueryStyle — how to wrap a query to return 0 rows (for schema inspection) + String schemaOnlyStyle = "\"CTE\""; + if (sampleTable != null) { + if (tryQuery(conn, "SELECT TOP 0 * FROM " + sampleTable, 5)) + schemaOnlyStyle = "\"SQLSERVER_TOP0\""; + else if (tryQuery(conn, "SELECT * FROM " + sampleTable + " WHERE 1=0", 5) + && dateLiteralStyle.equals("\"TO_DATE\"")) + // Oracle: WHERE 1=0 works but no alias required + schemaOnlyStyle = "\"ORACLE_WHERE_FALSE\""; + } + + // rowCountQueryStyle — probe metadata tables for optimized row count access + String rowCountStyle = "\"COUNT_STAR\""; + if (tryQuery(conn, "SELECT ROW_COUNT FROM INFORMATION_SCHEMA.TABLES WHERE 1=0", 5)) + rowCountStyle = "\"INFORMATION_SCHEMA_ROW_COUNT\""; + else if (tryQuery(conn, "SELECT DATA_LENGTH FROM INFORMATION_SCHEMA.TABLES WHERE 1=0", 5)) + rowCountStyle = "\"INFORMATION_SCHEMA_TABLES_WITH_SIZE\""; + else if (tryQuery(conn, "SELECT NUM_ROWS FROM ALL_TABLES WHERE 1=0", 5)) + rowCountStyle = "\"ALL_TABLES\""; + conn.close(); // ── Emit JSON ───────────────────────────────────────────────────── StringBuilder out = new StringBuilder(); out.append("{\n"); out.append(" \"className\": ").append(jq(className)).append(",\n"); + out.append(" \"dbProductName\": ").append(dbProductName).append(",\n"); + out.append(" \"dbProductVersion\": ").append(dbProductVersion).append(",\n"); out.append(" \"identifierQuoteChar\": ").append(quoteChar).append(",\n"); out.append(" \"transactionIsolation\": ").append(txIsolation).append(",\n"); out.append(" \"tableNameCasing\": ").append(casing).append(",\n"); @@ -325,7 +382,12 @@ out.append(" \"intervalCalcDatetimeTimestampTemplate\": ").append(intervalTs).append(",\n"); out.append(" \"intervalCalcDatetimeDateTemplate\": ").append(intervalDt).append(",\n"); out.append(" \"upperBoundDatetimeTimestampTemplate\": ").append(upperTs).append(",\n"); - out.append(" \"upperBoundDatetimeDateTemplate\": ").append(upperDt).append("\n"); + out.append(" \"upperBoundDatetimeDateTemplate\": ").append(upperDt).append(",\n"); + out.append(" \"viewSampleFallback\": ").append(viewSampleFallback).append(",\n"); + out.append(" \"timestampLiteralStyle\": ").append(timestampLiteralStyle).append(",\n"); + out.append(" \"dateLiteralStyle\": ").append(dateLiteralStyle).append(",\n"); + out.append(" \"schemaOnlyQueryStyle\": ").append(schemaOnlyStyle).append(",\n"); + out.append(" \"rowCountQueryStyle\": ").append(rowCountStyle).append("\n"); out.append("}\n"); System.out.println(out.toString()); } @@ -451,6 +513,65 @@ def _extract_prefix(jdbc_url: str) -> str | None: return m.group(1).lower() if m else None +def _derive_url_metadata(jdbc_url: str) -> tuple[int | None, str, set[str]]: + """ + Derive a defaultPort, a jdbcUrlTemplate, and the set of URL components that + were present in the probe URL from a JDBC URL. + + Returns (port_or_None, template_string, url_components). + url_components is a subset of {"host", "port", "database"} reflecting which + parts were actually found in the URL. This is used to decide which + connectionSpec fields should be marked required. + + Template replaces hostname → {host}, port → {port}, first path segment → {database}. + """ + url_components: set[str] = set() + + # Try the standard jdbc:scheme://[authority]/path form first + m = re.match(r"(jdbc:[^:]+://)([^/?#]*)(.*)", jdbc_url, re.IGNORECASE) + if m: + scheme = m.group(1) + authority = m.group(2) + rest = m.group(3) + + # host — present if authority is non-empty after stripping credentials/port/params + host_part = re.sub(r"^[^@]+@", "", authority) # strip user:pass@ + host_part = re.sub(r":\d+(?:$|;)", "", host_part) # strip :port + host_part = re.sub(r";.*$", "", host_part) # strip ;params (SQL Server style) + if host_part.strip(): + url_components.add("host") + + port_m = re.search(r":(\d+)(?:$|;)", authority) + port = int(port_m.group(1)) if port_m else None + if port is not None: + url_components.add("port") + + # database — present if the path has a non-empty first segment + db_m = re.match(r"/([^/?#;]+)", rest) + if db_m and db_m.group(1): + url_components.add("database") + + tmpl_authority = re.sub(r"^[^:@/]+", "{host}", authority) + tmpl_authority = re.sub(r":\d+(?:$|;)", ":{port}", tmpl_authority) + tmpl_rest = re.sub(r"^/([^/?#]+)", "/{database}", rest) + tmpl_rest = re.sub(r"\?.*$", "", tmpl_rest) + + return port, f"{scheme}{tmpl_authority}{tmpl_rest}", url_components + + # Fallback: jdbc:scheme:path (no authority) — e.g. jdbc:sqlite:/path, jdbc:h2:file:/path + m2 = re.match(r"(jdbc:[^:]+:)(.+)", jdbc_url, re.IGNORECASE) + if m2: + scheme = m2.group(1) + path = m2.group(2) + # Only treat as a "database" path if it looks like a file path (not mem:/in-memory) + if re.match(r"[/.]", path) or re.match(r"file:", path, re.IGNORECASE): + url_components.add("database") + tmpl_path = re.sub(r"^[^?#]+", "{database}", path) + return None, f"{scheme}{tmpl_path}", url_components + + return None, "", url_components + + def _build_yaml( prefix: str, probes: dict, @@ -459,193 +580,547 @@ def _build_yaml( """ Build the complete YAML content string from the probes dict. - Fields that were auto-detected get a short '# auto-detected' comment; - fields that need manual review get '# TODO: review' comments. + Follows canonical DriverDefinition key ordering. Only emits keys that + differ from their DriverDefinition defaults — plus required fields and + any TODO fields that need manual/LLM review. + + Fields marked '# auto-detected' were probed from the live database. + Fields marked '# TODO: ...' need manual review — the comment describes the + field, its valid values, and what the LLM should consider when filling it in. """ detected_fields: list[str] = [] todo_fields: list[str] = [] - def field(name: str, value, comment: str = "") -> str: + def _render(value) -> str: if value is None: - rendered = "null" - elif isinstance(value, bool): - rendered = str(value).lower() - elif isinstance(value, int): - rendered = str(value) - else: - # string — quote only if it contains special YAML chars or spaces - sv = str(value) - if sv == "null": - rendered = "null" - elif any(c in sv for c in [':', '#', '[', ']', '{', '}', ',', '&', '*', '?', '|', '-', '<', '>', '=', '!', '%', '@', '`', '"', "'", '\n']): - rendered = yaml.dump(sv, default_flow_style=True).strip() - else: - rendered = sv + return "null" + if isinstance(value, bool): + return str(value).lower() + if isinstance(value, int): + return str(value) + sv = str(value) + if sv == "null": + return "null" + if any(c in sv for c in [':', '#', '[', ']', '{', '}', ',', '&', '*', '?', '|', + '-', '<', '>', '=', '!', '%', '@', '`', '"', "'", '\n']): + dumped = yaml.dump(sv, default_flow_style=True).strip() + # yaml.dump may append a YAML document-end marker on a new line — strip it + if '\n' in dumped: + dumped = dumped.split('\n')[0] + return dumped + return sv + + def field(name: str, value, comment: str = "") -> str: comment_str = f" # {comment}" if comment else "" - return f"{name}: {rendered}{comment_str}" + return f"{name}: {_render(value)}{comment_str}" lines: list[str] = [] - # ── Header comment ────────────────────────────────────────────────────── - lines.append( - textwrap.dedent(f"""\ + # ── Derive URL metadata ───────────────────────────────────────────────── + default_port, jdbc_url_template, url_components = _derive_url_metadata(jdbc_url) + db_product_name = probes.get("dbProductName") + display_name = ( + db_product_name + if db_product_name and db_product_name not in (None, "null") + else prefix.capitalize() + ) + + # ── Header ────────────────────────────────────────────────────────────── + lines.append(textwrap.dedent(f"""\ # Generated by: qualytics generate-driver - # Source URL: {jdbc_url} # - # Fields marked "# auto-detected" were probed from JDBC metadata and live SQL. - # Fields marked "# TODO: review" could not be detected automatically; verify - # them against your database documentation before deploying. + # "# auto-detected" — probed from the live JDBC connection. + # "# TODO: …" — could not be auto-detected; fill in before deploying. + # Comment describes the field, valid values, and intent. + # "# LLM-suggested" — filled in by the deployment LLM (review before use). + # + # Custom JDBC drivers in Qualytics support SOURCE datastores only (read access). + # Excluded write-only field: insertBatchSize. + # Keys equal to their DriverDefinition default are omitted to keep this file concise. # # Deploy this file to: META-INF/jdbc-drivers/{prefix}.yaml - """) - ) - - # ── Auto-detected section ─────────────────────────────────────────────── - lines.append("# ── Auto-detected (verify before deploying) ──────────────────────") + """)) - class_name = probes.get("className") - lines.append(field("prefix", prefix, "review — extracted from JDBC URL")) - lines.append(field("className", class_name, "auto-detected")) + # ── Identity ───────────────────────────────────────────────────────────── + lines.append(field("prefix", prefix, + "review — must match the jdbc:: scheme in the JDBC URL")) todo_fields.append("prefix") + lines.append(field("className", probes.get("className"), + "auto-detected — fully-qualified JDBC Driver class name")) detected_fields.append("className") + lines.append("") - quote_char = probes.get("identifierQuoteChar") - lines.append(field("identifierQuoteChar", quote_char, "auto-detected")) - detected_fields.append("identifierQuoteChar") + # ── SQL dialect ─────────────────────────────────────────────────────────── + lines.append("# ── SQL dialect ──────────────────────────────────────────────────────") + + # displayName — always emit (default is raw prefix; capitalised form is user-friendly) + lines.append(field("displayName", display_name, + "auto-detected from DB product name — human-readable name shown in the UI")) + detected_fields.append("displayName") + + # defaultPort — emit only when probe URL contained an explicit port number. + # If the driver uses a portless URL scheme (e.g. jdbc:sqlite:, jdbc:h2:mem:), + # leave as TODO so the user knows to set it (or omit if the driver truly has no port). + if default_port is not None: + lines.append(field("defaultPort", default_port, + "auto-detected from JDBC URL — default port shown in the connection form")) + detected_fields.append("defaultPort") + else: + lines.append(field("defaultPort", None, + "TODO: default TCP port for this driver " + "(e.g. 5432 PostgreSQL, 3306 MySQL, 1521 Oracle, 1433 SQL Server). " + "Omit this key entirely if the driver does not use TCP ports.")) + todo_fields.append("defaultPort") + # transactionIsolation — omit if READ_UNCOMMITTED (default) tx = probes.get("transactionIsolation") - lines.append(field("transactionIsolation", tx, "auto-detected" if tx else "TODO: review")) - (detected_fields if tx else todo_fields).append("transactionIsolation") - - casing = probes.get("tableNameCasing") - lines.append(field("tableNameCasing", casing, "auto-detected")) - detected_fields.append("tableNameCasing") + if tx and tx != "READ_UNCOMMITTED": + lines.append(field("transactionIsolation", tx, + "auto-detected — valid: NONE, READ_UNCOMMITTED (default), " + "READ_COMMITTED, SERIALIZABLE")) + detected_fields.append("transactionIsolation") + elif tx: + detected_fields.append("transactionIsolation") # default — omitted + + # identifierQuoteChar — omit if " (default) + quote_char = probes.get("identifierQuoteChar") + if quote_char and quote_char != '"': + lines.append(field("identifierQuoteChar", quote_char, + 'auto-detected — char used to quote identifiers; default " — MySQL/MariaDB use `')) + detected_fields.append("identifierQuoteChar") + elif quote_char: + detected_fields.append("identifierQuoteChar") # default — omitted + + # tableNameCasing — omit if asis (default) + casing = probes.get("tableNameCasing", "asis") + if casing != "asis": + lines.append(field("tableNameCasing", casing, + "auto-detected — valid: upper (DB2/Oracle), lower (PostgreSQL), " + "asis (default, most others)")) + detected_fields.append("tableNameCasing") + else: + detected_fields.append("tableNameCasing") # default — omitted + # rowLimitSyntax — omit if LIMIT (default); TODO if probe couldn't determine row_limit = probes.get("rowLimitSyntax") - lines.append(field("rowLimitSyntax", row_limit, "auto-detected" if row_limit else "TODO: review — try TOP, LIMIT, FETCH_FIRST, ROWNUM")) - (detected_fields if row_limit else todo_fields).append("rowLimitSyntax") - - sub_alias = probes.get("subqueryRequiresAlias", False) - lines.append(field("subqueryRequiresAlias", sub_alias, "auto-detected")) - detected_fields.append("subqueryRequiresAlias") + if row_limit and row_limit != "LIMIT": + lines.append(field("rowLimitSyntax", row_limit, + "auto-detected — valid: LIMIT (default), TOP (SQL Server), " + "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix)")) + detected_fields.append("rowLimitSyntax") + elif row_limit == "LIMIT": + detected_fields.append("rowLimitSyntax") # default — omitted + else: + lines.append(field("rowLimitSyntax", "LIMIT", + "TODO: valid: LIMIT (default, MySQL/PG/SQLite), TOP (SQL Server), " + "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix/Spark)")) + todo_fields.append("rowLimitSyntax") + + # subqueryRequiresAlias — omit if true (default); emit false if probe confirmed no alias needed + sub_alias = probes.get("subqueryRequiresAlias", True) + if isinstance(sub_alias, str): + sub_alias = sub_alias.lower() != "false" + if not sub_alias: + lines.append(field("subqueryRequiresAlias", False, + "auto-detected — false: subqueries do NOT need an AS alias " + "(rare; historically Oracle)")) + detected_fields.append("subqueryRequiresAlias") + else: + detected_fields.append("subqueryRequiresAlias") # default true — omitted + + # timestampLiteralStyle — omit if PLAIN (default) + ts_style = probes.get("timestampLiteralStyle", "PLAIN") + if ts_style != "PLAIN": + lines.append(field("timestampLiteralStyle", ts_style, + "auto-detected — valid: PLAIN (default), TIMESTAMP_PREFIX (standard SQL), " + "CAST_AS_TIMESTAMP (Hive), CAST_DATE_FORMAT (Databricks), " + "TO_TIMESTAMP (Oracle), CAST_DATETIME2 (SQL Server)")) + detected_fields.append("timestampLiteralStyle") + else: + detected_fields.append("timestampLiteralStyle") # default — omitted + # timestampLiteralTemplate: escape hatch — omit unless enum styles are insufficient + + # dateLiteralStyle — omit if PLAIN (default) + dt_style = probes.get("dateLiteralStyle", "PLAIN") + if dt_style != "PLAIN": + lines.append(field("dateLiteralStyle", dt_style, + "auto-detected — valid: PLAIN (default), DATE_PREFIX, TO_DATE (Oracle)")) + detected_fields.append("dateLiteralStyle") + else: + detected_fields.append("dateLiteralStyle") # default — omitted + # dateLiteralTemplate: escape hatch — omit unless enum styles are insufficient + + # schemaOnlyQueryStyle — if CTE (probe fallback, unconfirmed) → TODO; else emit as detected + schema_only = probes.get("schemaOnlyQueryStyle", "CTE") + if schema_only != "CTE": + lines.append(field("schemaOnlyQueryStyle", schema_only, + "auto-detected — how to wrap a query to return 0 rows for schema inspection. " + "Valid: CTE (default), PG_CTE (PostgreSQL), SQLSERVER_TOP0 (SQL Server), " + "WHERE_FALSE_QUERYA (generic WHERE 1=0), ORACLE_WHERE_FALSE (Oracle), " + "HIVE_LIMIT0 (Hive/Spark)")) + detected_fields.append("schemaOnlyQueryStyle") + else: + lines.append(field("schemaOnlyQueryStyle", "CTE", + "TODO: how to wrap a query to return 0 rows for schema inspection. " + "Valid: CTE (default, most modern DBs with WITH support), PG_CTE (PostgreSQL), " + "SQLSERVER_TOP0 (SQL Server/Synapse), WHERE_FALSE_QUERYA (generic WHERE 1=0), " + "ORACLE_WHERE_FALSE (Oracle), HIVE_LIMIT0 (Hive/Spark)")) + todo_fields.append("schemaOnlyQueryStyle") + + # tableSampleTemplate — omit if null (not supported = default "no template") + sample_tmpl = probes.get("tableSampleTemplate") + if sample_tmpl and sample_tmpl != "null": + lines.append(field("tableSampleTemplate", sample_tmpl, + "auto-detected — TABLESAMPLE syntax; {pct} = percent, {rows} = row count")) + detected_fields.append("tableSampleTemplate") + else: + detected_fields.append("tableSampleTemplate") # null/not supported — omitted + + # viewSampleFallback — omit if RAND (default) + vsf = probes.get("viewSampleFallback", "RAND") + if vsf != "RAND": + lines.append(field("viewSampleFallback", vsf, + "auto-detected — random fn for view sampling. " + "Valid: RAND (default), RANDOM (PostgreSQL/Redshift), " + "NEWID (SQL Server), DBMS_RANDOM (Oracle), SAMPLE_N (Teradata), " + "NONE (BigQuery)")) + detected_fields.append("viewSampleFallback") + else: + detected_fields.append("viewSampleFallback") # default — omitted + # viewSampleFallbackSql: escape hatch — omit unless enum styles are insufficient - lines.append("") + # approxCountDistinctFunction — omit if null (not supported, falls back to COUNT DISTINCT) + approx = probes.get("approxCountDistinctFunction") + if approx and approx != "null": + lines.append(field("approxCountDistinctFunction", approx, + "auto-detected — SQL function name for approximate COUNT DISTINCT")) + detected_fields.append("approxCountDistinctFunction") + else: + detected_fields.append("approxCountDistinctFunction") # null — omitted + # validationQuery — omit if SELECT 1 (default) val_q = probes.get("validationQuery") - lines.append(field("validationQuery", val_q, "auto-detected" if val_q else "TODO: review — try SELECT 1 or SELECT 1 FROM DUAL")) - (detected_fields if val_q else todo_fields).append("validationQuery") + if val_q and val_q != "SELECT 1": + lines.append(field("validationQuery", val_q, + "auto-detected — minimal SQL to test a pooled connection is alive")) + detected_fields.append("validationQuery") + elif val_q: + detected_fields.append("validationQuery") # default — omitted + else: + lines.append(field("validationQuery", "SELECT 1", + "TODO: SQL to verify a live connection; try SELECT 1 FROM DUAL (Oracle), " + "VALUES 1 (DB2/H2)")) + todo_fields.append("validationQuery") lines.append("") - sample_tmpl = probes.get("tableSampleTemplate") - lines.append(field("tableSampleTemplate", sample_tmpl, "auto-detected" if sample_tmpl else "null — no TABLESAMPLE support detected")) - detected_fields.append("tableSampleTemplate") + # ── Performance ─────────────────────────────────────────────────────────── + lines.append("# ── Performance ──────────────────────────────────────────────────────") + lines.append(field("maxPartitionParallelism", 10, + "TODO: max parallel partitions for scan operations; default 10. " + "Set 1 for DBs that struggle with concurrent connections (e.g. BigQuery, " + "single-threaded embedded drivers)")) + todo_fields.append("maxPartitionParallelism") + lines.append(field("dataSizeLimit", "LONG_MAX", + "TODO: max data the driver can handle. LONG_MAX (default, most DBs) or " + "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)")) + todo_fields.append("dataSizeLimit") + lines.append("") - approx = probes.get("approxCountDistinctFunction") - lines.append(field( - "approxCountDistinctFunction", - approx, - "auto-detected" if approx else "null — falls back to COUNT(DISTINCT col)", - )) - detected_fields.append("approxCountDistinctFunction") + # ── Schema / catalog filtering ──────────────────────────────────────────── + lines.append("# ── Schema / catalog filtering ───────────────────────────────────────") + lines.append("systemSchemaExclusions: []" + " # TODO: exact schema names to exclude from catalog scans " + "(e.g. [information_schema, pg_catalog])") + lines.append("systemSchemaExclusionPrefixes: []" + " # TODO: schema name prefixes to exclude (e.g. [pg_temp_, pg_toast_temp_])") + lines.append("systemCatalogExclusions: []" + " # TODO: catalog names to exclude " + "(e.g. [admin, local, config] for MongoDB; [information_schema, mysql] for MySQL)") + todo_fields += ["systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions"] + # getTablesUsesNullCatalog — omit if false (default); emit if true + get_tables_null = probes.get("getTablesUsesNullCatalog", False) + if isinstance(get_tables_null, str): + get_tables_null = get_tables_null.lower() == "true" + if get_tables_null: + lines.append(field("getTablesUsesNullCatalog", True, + "auto-detected — pass null as catalog arg to DatabaseMetaData.getTables(); " + "required for Db2")) + detected_fields.append("getTablesUsesNullCatalog") + else: + detected_fields.append("getTablesUsesNullCatalog") # default false — omitted lines.append("") - get_tables_null = probes.get("getTablesUsesNullCatalog", False) - lines.append(field("getTablesUsesNullCatalog", get_tables_null, "auto-detected")) - detected_fields.append("getTablesUsesNullCatalog") + # ── Style selectors ──────────────────────────────────────────────────────── + lines.append("# ── Style selectors ──────────────────────────────────────────────────") + # rowCountQueryStyle — emit as TODO if COUNT_STAR (default/unconfirmed), else auto-detected + row_count_style = probes.get("rowCountQueryStyle", "COUNT_STAR") + if row_count_style and row_count_style != "COUNT_STAR": + lines.append(field("rowCountQueryStyle", row_count_style, + "auto-detected — row count strategy")) + detected_fields.append("rowCountQueryStyle") + else: + lines.append(field("rowCountQueryStyle", "COUNT_STAR", + "TODO: row count strategy. Valid: COUNT_STAR (default, always works), " + "BQ_TABLES (BigQuery), INFORMATION_SCHEMA_ROW_COUNT (MySQL/MariaDB), " + "ALL_TABLES (Oracle), INFORMATION_SCHEMA_TABLES_WITH_SIZE (MySQL/MariaDB)")) + todo_fields.append("rowCountQueryStyle") + # countStarNullSizeBytesExpr — almost always null (default); omit; user adds manually for Dremio + + # schemaExistenceQueryStyle — omit if NONE (default) schema_style = probes.get("schemaExistenceQueryStyle", "NONE") - lines.append(field("schemaExistenceQueryStyle", schema_style, "auto-detected")) - detected_fields.append("schemaExistenceQueryStyle") + if schema_style != "NONE": + lines.append(field("schemaExistenceQueryStyle", schema_style, + "auto-detected — schema enumeration style. " + "Valid: NONE (default), INFORMATION_SCHEMA, SHOW_SCHEMAS_LIKE, " + "SHOW_SCHEMAS_ITERATE (Hive/Trino), SYSCAT (DB2), " + "ALTER_SESSION (Oracle), SYS_SCHEMAS (SQL Server)")) + detected_fields.append("schemaExistenceQueryStyle") + else: + detected_fields.append("schemaExistenceQueryStyle") # default — omitted + # dateArithmeticStyle — omit if STANDARD (default) date_arith = probes.get("dateArithmeticStyle", "STANDARD") - lines.append(field("dateArithmeticStyle", date_arith, "auto-detected")) - detected_fields.append("dateArithmeticStyle") - + if date_arith != "STANDARD": + lines.append(field("dateArithmeticStyle", date_arith, + "auto-detected — date arithmetic strategy. " + "Valid: STANDARD (default, ANSI fallback), DATEADD_DATEDIFF (SQL Server), " + "NUMTODSINTERVAL (Oracle), TIMESTAMP_ADD (BigQuery), TIMESTAMPDIFF_DB2 (Db2)")) + detected_fields.append("dateArithmeticStyle") + else: + detected_fields.append("dateArithmeticStyle") # default — omitted lines.append("") - # ── Performance tuning ────────────────────────────────────────────────── - lines.append("# ── Performance tuning (adjust for your workload) ────────────────") - lines.append(field("insertBatchSize", 10000, "TODO: tune for driver performance")) - lines.append(field("maxPartitionParallelism", 10, "TODO: set to 1 for write-heavy databases")) - lines.append(field("dataSizeLimit", "LONG_MAX", "TODO: use INT_MAX for SQL Server, Redshift, Db2")) - todo_fields += ["insertBatchSize", "maxPartitionParallelism", "dataSizeLimit"] + # ── Date arithmetic templates (only when non-null) ──────────────────────── + int_ts = probes.get("intervalCalcDatetimeTimestampTemplate") + int_dt = probes.get("intervalCalcDatetimeDateTemplate") + up_ts = probes.get("upperBoundDatetimeTimestampTemplate") + up_dt = probes.get("upperBoundDatetimeDateTemplate") + has_templates = any(v and v != "null" for v in [int_ts, int_dt, up_ts, up_dt]) + if has_templates: + lines.append("# ── Date arithmetic templates ─────────────────────────────────────────") + lines.append("# Placeholders: {col} = column name, MIN_{col} = min value, " + "MAX_{col} = max value, {interval} = midpoint expression") + if int_ts and int_ts != "null": + lines.append(field("intervalCalcDatetimeTimestampTemplate", int_ts, "auto-detected")) + detected_fields.append("intervalCalcDatetimeTimestampTemplate") + if int_dt and int_dt != "null": + lines.append(field("intervalCalcDatetimeDateTemplate", int_dt, "auto-detected")) + detected_fields.append("intervalCalcDatetimeDateTemplate") + if up_ts and up_ts != "null": + lines.append(field("upperBoundDatetimeTimestampTemplate", up_ts, "auto-detected")) + detected_fields.append("upperBoundDatetimeTimestampTemplate") + if up_dt and up_dt != "null": + lines.append(field("upperBoundDatetimeDateTemplate", up_dt, "auto-detected")) + detected_fields.append("upperBoundDatetimeDateTemplate") + lines.append("") + + # ── Connectivity ────────────────────────────────────────────────────────── + lines.append("# ── Connectivity ─────────────────────────────────────────────────────") + # networkCapable: true (default) — omitted; readOnly: false (default) — omitted + lines.append("connectionProperties: {}" + " # TODO: key-value pairs injected into JDBC pool and Spark " + "(e.g. {ssl: 'true', charset: 'utf8'})") + lines.append("sessionInitStatements: []" + " # TODO: SQL statements run once after each new connection " + "(e.g. [\"SET SCHEMA mydb\", \"ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD'\"])") + todo_fields += ["connectionProperties", "sessionInitStatements"] lines.append("") - # ── Needs manual research ──────────────────────────────────────────────── - lines.append("# ── Needs manual research (database-specific) ────────────────────") - lines.append(field("timestampLiteralStyle", "PLAIN", "TODO: check if CAST_DATETIME2, TO_TIMESTAMP, etc. applies")) - lines.append(field("dateLiteralStyle", "PLAIN", "TODO: check if DATE_PREFIX or TO_DATE applies")) - lines.append(field("schemaOnlyQueryStyle", "CTE", "TODO: check if SQLSERVER_TOP0, WHERE_FALSE_QUERYA, etc. applies")) - lines.append(field("viewSampleFallback", "RAND", "TODO: verify RAND() is supported")) - lines.append(field("rowCountQueryStyle", "COUNT_STAR", "TODO: check if INFORMATION_SCHEMA_TABLES_WITH_SIZE etc. applies")) - todo_fields += ["timestampLiteralStyle", "dateLiteralStyle", "schemaOnlyQueryStyle", - "viewSampleFallback", "rowCountQueryStyle"] - + # ── Spark JdbcDialect ───────────────────────────────────────────────────── + lines.append("# ── Spark JdbcDialect ────────────────────────────────────────────────") + lines.append(field("dialectClass", None, + "TODO: fully-qualified JdbcDialect Scala object class to register with Spark " + "(e.g. com.example.MyDialect$); null if no custom Spark dialect is needed")) + todo_fields.append("dialectClass") lines.append("") - lines.append("systemSchemaExclusions: [] # TODO: add internal system schemas") - lines.append("systemSchemaExclusionPrefixes: [] # TODO: add prefixes of temporary schemas") - lines.append("systemCatalogExclusions: [] # TODO: add internal system catalogs") - todo_fields += ["systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions"] + # ── URL construction ────────────────────────────────────────────────────── + lines.append("# ── URL construction ─────────────────────────────────────────────────") + lines.append("# Known placeholders: {host}, {port}, {database}, {schema}, {username}, {password}") + if jdbc_url_template: + lines.append(field("jdbcUrlTemplate", jdbc_url_template, + "auto-detected from probe URL — verify all placeholders are correct")) + detected_fields.append("jdbcUrlTemplate") + else: + lines.append(field("jdbcUrlTemplate", "", + "TODO: URL template with {host}, {port}, {database} substitution tokens. " + "Example: jdbc:mydb://{host}:{port}/{database}")) + todo_fields.append("jdbcUrlTemplate") + lines.append("jdbcUrlStaticParams: []" + " # TODO: query params always appended to every URL " + "(e.g. [tcpKeepAlive=true, sslmode=prefer])") + lines.append("jdbcUrlConditionalParams: []" + " # TODO: params appended only when a form field is non-empty " + "(e.g. [{key: schema, param: 'currentSchema={schema}'}])") + lines.append("jdbcUrlAuthVariants: {}" + " # optional: auth_type -> full URL template override; leave empty if not needed") + todo_fields += ["jdbcUrlStaticParams", "jdbcUrlConditionalParams"] lines.append("") - # ── Advanced ──────────────────────────────────────────────────────────── - lines.append("# ── Advanced (leave null unless you know you need them) ──────────") - lines.append("connectionProperties: {} # TODO: add driver-specific properties if needed") - lines.append("sessionInitStatements: [] # TODO: add session SQL if needed (NLS, SET DATEFORMAT, etc.)") - lines.append(field("readOnly", False)) - lines.append(field("dialectClass", None, "TODO: set to JdbcDialect class if bundling Spark dialect")) - todo_fields += ["connectionProperties", "sessionInitStatements", "dialectClass"] + # ── Connection spec ──────────────────────────────────────────────────────── + # Only mark a field required if the probe URL actually contained that component. + # e.g. jdbc:sqlite:/path/to/db has no host or port → those fields are optional. + lines.append("# ── Connection spec (frontend form) ──────────────────────────────────") + lines.append("# TODO: define the connection form fields shown in the UI.") + lines.append("# Each field: name, label, fieldType (string/integer/boolean/password/enum/file),") + lines.append("# required, defaultValue, hint, options (for enum), dependsOn, dependsOnValue") + lines.append("connectionSpec:") + lines.append(" supportsEnrichment: false # custom drivers are source-only") + lines.append(" fields:") + if "host" in url_components: + lines.append(" - name: host") + lines.append(' label: "Host"') + lines.append(" fieldType: string") + lines.append(" required: true") + if "port" in url_components: + lines.append(" - name: port") + lines.append(' label: "Port"') + lines.append(" fieldType: integer") + lines.append(" required: true") + if default_port is not None: + lines.append(f' defaultValue: "{default_port}"') + if "database" in url_components: + lines.append(" - name: database") + lines.append(' label: "Database"') + lines.append(" fieldType: string") + lines.append(" required: true") + lines.append(" - name: username") + lines.append(' label: "Username"') + lines.append(" fieldType: string") + lines.append(" required: true") + lines.append(" - name: password") + lines.append(' label: "Password"') + lines.append(" fieldType: password") + lines.append(" required: true") + todo_fields.append("connectionSpec") - lines.append("") + return "\n".join(lines) + "\n", detected_fields, todo_fields - # ── Date arithmetic templates ──────────────────────────────────────────── - lines.append("# ── Date arithmetic templates ────────────────────────────────────") - lines.append("# Placeholders: {col} = column name, {interval} = interval expression") - lines.append("") - lines.append(field("intervalCalcNumericTemplate", None, "use generic CASE/DECIMAL(38,0) fallback")) - int_ts = probes.get("intervalCalcDatetimeTimestampTemplate") - int_dt = probes.get("intervalCalcDatetimeDateTemplate") - up_ts = probes.get("upperBoundDatetimeTimestampTemplate") - up_dt = probes.get("upperBoundDatetimeDateTemplate") +# --------------------------------------------------------------------------- +# LLM-assisted TODO resolution helpers +# --------------------------------------------------------------------------- - if int_ts: - lines.append(field("intervalCalcDatetimeTimestampTemplate", int_ts, "auto-detected")) - detected_fields.append("intervalCalcDatetimeTimestampTemplate") - else: - lines.append(field("intervalCalcDatetimeTimestampTemplate", None, "TODO: set timestamp midpoint expression")) - todo_fields.append("intervalCalcDatetimeTimestampTemplate") - if int_dt: - lines.append(field("intervalCalcDatetimeDateTemplate", int_dt, "auto-detected")) - detected_fields.append("intervalCalcDatetimeDateTemplate") - else: - lines.append(field("intervalCalcDatetimeDateTemplate", None, "TODO: set date midpoint expression")) - todo_fields.append("intervalCalcDatetimeDateTemplate") +def _strip_jdbc_credentials(jdbc_url: str) -> str: + """Remove user/password from a JDBC URL for safe inclusion in prompts.""" + cleaned = re.sub(r"(jdbc:[^:]+://)([^@/]+@)", r"\1", jdbc_url) + cleaned = re.sub(r"[?&](password|passwd|pwd)=[^&]*", "", cleaned, flags=re.IGNORECASE) + return cleaned - lines.append("") - lines.append(field("upperBoundNumericTemplate", None, "use code default")) - if up_ts: - lines.append(field("upperBoundDatetimeTimestampTemplate", up_ts, "auto-detected")) - detected_fields.append("upperBoundDatetimeTimestampTemplate") - else: - lines.append(field("upperBoundDatetimeTimestampTemplate", None, "TODO: set timestamp upper-bound expression")) - todo_fields.append("upperBoundDatetimeTimestampTemplate") +def _collect_todo_fields(yaml_content: str) -> list[tuple[str, str, str]]: + """ + Scan YAML lines for remaining TODO comments. + Returns list of (field_name, current_value, todo_description). + """ + todos = [] + for line in yaml_content.splitlines(): + m = re.match(r"^(\w+):\s*(.+?)\s*#\s*TODO:\s*(.+)$", line) + if m: + todos.append((m.group(1), m.group(2).strip(), m.group(3).strip())) + return todos - if up_dt: - lines.append(field("upperBoundDatetimeDateTemplate", up_dt, "auto-detected")) - detected_fields.append("upperBoundDatetimeDateTemplate") - else: - lines.append(field("upperBoundDatetimeDateTemplate", None, "TODO: set date upper-bound expression")) - todo_fields.append("upperBoundDatetimeDateTemplate") - return "\n".join(lines) + "\n", detected_fields, todo_fields +def _call_deployment_llm(client, prompt: str) -> str | None: + """ + POST to agent/chat and collect the streamed SSE response. + Returns the full concatenated text, or None if the call fails. + """ + try: + response = client.post( + "agent/chat", + json={"messages": [{"role": "user", "content": prompt}]}, + stream=True, + timeout=120, + ) + text_parts: list[str] = [] + for raw_line in response.iter_lines(decode_unicode=True): + if not raw_line: + continue + if raw_line.startswith("data: "): + data = raw_line[6:] + if data == "[DONE]": + break + try: + event = json.loads(data) + if isinstance(event, dict) and event.get("type") == "text-delta": + text_parts.append(event.get("textDelta") or event.get("delta") or "") + except json.JSONDecodeError: + # Vercel AI SDK compact format: 0:"chunk" + if re.match(r'^0:"', data): + try: + text_parts.append(json.loads(data[2:])) + except json.JSONDecodeError: + pass + return "".join(text_parts) if text_parts else None + except Exception: + return None + + +def _apply_llm_suggestions(yaml_content: str, suggestions: dict) -> tuple[str, int]: + """ + Substitute LLM-suggested values into YAML content, replacing TODO lines. + suggestions: {field_name: {"value": ..., "rationale": "..."}} + Returns (updated_content, count_applied). + """ + applied = 0 + result_lines: list[str] = [] + for line in yaml_content.splitlines(keepends=True): + m = re.match(r"^(\w+):\s*(.+?)\s*#\s*TODO:.*$", line) + if m and m.group(1) in suggestions: + field_name = m.group(1) + suggestion = suggestions[field_name] + value = suggestion.get("value") + rationale = str(suggestion.get("rationale", "")).replace("\n", " ").strip() + if value is not None: + if isinstance(value, (list, dict)): + yaml_val = yaml.dump(value, default_flow_style=True).strip().rstrip("\n") + elif isinstance(value, bool): + yaml_val = str(value).lower() + elif isinstance(value, (int, float)): + yaml_val = str(value) + else: + sv = str(value) + if any(c in sv for c in [":", "#", "[", "]", "{", "}", ","]): + yaml_val = yaml.dump(sv, default_flow_style=True).strip() + else: + yaml_val = sv + result_lines.append(f"{field_name}: {yaml_val} # LLM-suggested: {rationale}\n") + applied += 1 + continue + result_lines.append(line) + return "".join(result_lines), applied + + +# --------------------------------------------------------------------------- +# Index management helpers +# --------------------------------------------------------------------------- + +_DEFAULT_DRIVERS_DIR = os.path.join("dist", "META-INF", "jdbc-drivers") + + +def _update_index(drivers_dir: str, yaml_filename: str) -> bool: + """ + Create or update the ``index`` file in *drivers_dir*, adding *yaml_filename* + if it is not already present. Returns True if the index was modified. + """ + index_path = os.path.join(drivers_dir, "index") + existing: list[str] = [] + if os.path.isfile(index_path): + with open(index_path) as fh: + existing = [line.rstrip("\n") for line in fh if line.strip()] + if yaml_filename in existing: + return False + existing.append(yaml_filename) + with open(index_path, "w") as fh: + fh.write("\n".join(existing) + "\n") + return True # --------------------------------------------------------------------------- @@ -707,16 +1182,30 @@ def generate_driver( typer.Option( "--output", "-o", - help="Output file path. Defaults to .yaml in the current directory.", + help="Output file path. Overrides --dist-dir when specified.", show_default=False, ), ] = None, + dist_dir: Annotated[ + str, + typer.Option( + "--dist-dir", + help="Root dist directory for generated files. " + "YAML is written to /META-INF/jdbc-drivers/.yaml.", + show_default=True, + ), + ] = "dist", ) -> None: """Generate a YAML driver definition by probing a JDBC driver JAR. Connects to the database using the provided JAR and URL, runs a series of introspection probes, and writes a best-effort YAML file you can review - and edit before deploying to META-INF/jdbc-drivers/. + and edit before deploying. The driver YAML is written to + dist/META-INF/jdbc-drivers/.yaml by default, and an index file is + created or updated in the same directory. + + Run ``qualytics package-drivers`` afterwards to bundle all generated YAMLs + into a single deployable JAR. Requires a JDK (java + javac) on PATH. @@ -759,7 +1248,9 @@ def generate_driver( if output: out_path = os.path.abspath(output) else: - out_path = os.path.join(os.getcwd(), f"{prefix}.yaml") + out_path = os.path.abspath( + os.path.join(dist_dir, "META-INF", "jdbc-drivers", f"{prefix}.yaml") + ) print(f" JAR: [bold]{jar_path}[/bold]") print(f" URL: [bold]{url}[/bold]") @@ -782,12 +1273,96 @@ def generate_driver( # ── Write output ───────────────────────────────────────────────────── try: + os.makedirs(os.path.dirname(out_path), exist_ok=True) with open(out_path, "w") as fh: fh.write(yaml_content) except OSError as e: print(f"[red]Failed to write output file: {e}[/red]") raise typer.Exit(code=1) + # ── Update index ────────────────────────────────────────────────────── + yaml_filename = os.path.basename(out_path) + drivers_dir = os.path.dirname(out_path) + try: + index_updated = _update_index(drivers_dir, yaml_filename) + except OSError as e: + print(f"[yellow] Warning: could not update index file: {e}[/yellow]") + index_updated = False + + # ── LLM-assisted TODO resolution (optional — requires deployment login) ─ + todo_items = _collect_todo_fields(yaml_content) + if todo_items: + try: + from ..config import load_config + from ..api.client import QualyticsClient + from ..utils import validate_and_format_url + + config = load_config() + if config is None: + print("[dim] Not logged in to a Qualytics deployment — LLM TODO resolution skipped.[/dim]") + else: + client = QualyticsClient( + base_url=validate_and_format_url(config["url"]), + token=config.get("token", ""), + ssl_verify=config.get("ssl_verify", True), + ) + llm_status = client.get("agent/llm-config/status").json() + if not llm_status.get("is_configured"): + print("[dim] No LLM integration configured on this deployment — TODO fields left as-is.[/dim]") + else: + db_product = probes.get("dbProductName") or "Unknown database" + db_version = probes.get("dbProductVersion") or "" + driver_class = probes.get("className") or "unknown" + clean_url = _strip_jdbc_credentials(url) + todo_block = "\n".join( + f" {name}: {val} # TODO: {desc}" + for name, val, desc in todo_items + ) + prompt = textwrap.dedent(f"""\ + I am generating a JDBC driver YAML configuration file for the Qualytics data quality platform. + Custom JDBC drivers in Qualytics support SOURCE datastores only (read-only access). + + Database: {db_product} {db_version} + Driver class: {driver_class} + JDBC URL (credentials removed): {clean_url} + + The following YAML fields could not be determined automatically. + For each field, recommend an appropriate value based on your knowledge of this database: + + {todo_block} + + Respond with a single JSON object. Each key is a field name from the list above. + Each value is an object with: + "value": the recommended YAML value (null if unknown, [] for empty lists, string otherwise) + "rationale": one concise sentence explaining the recommendation + + Only include fields where you have reasonable confidence. Omit fields you are unsure about. + Return ONLY valid JSON — no markdown, no code fences, no preamble. + """) + with status(f"[bold cyan]Asking LLM to resolve {len(todo_items)} TODO field(s)...[/bold cyan]"): + llm_text = _call_deployment_llm(client, prompt) + if not llm_text: + print("[yellow] LLM call returned no usable output — TODO fields left as-is.[/yellow]") + else: + json_match = re.search(r"\{.*\}", llm_text, re.DOTALL) + if not json_match: + print("[yellow] LLM response contained no JSON — TODO fields left as-is.[/yellow]") + else: + try: + suggestions = json.loads(json_match.group(0)) + updated_yaml, applied = _apply_llm_suggestions(yaml_content, suggestions) + if applied > 0: + with open(out_path, "w") as fh: + fh.write(updated_yaml) + yaml_content = updated_yaml + print(f" [{BRAND}]LLM resolved {applied} TODO field(s).[/{BRAND}]") + else: + print("[dim] LLM returned suggestions but none matched TODO fields.[/dim]") + except json.JSONDecodeError: + print("[yellow] LLM response could not be parsed as JSON — TODO fields left as-is.[/yellow]") + except Exception as exc: + print(f"[yellow] LLM TODO resolution error ({exc}) — TODO fields left as-is.[/yellow]") + # ── Print summary ──────────────────────────────────────────────────── console = Console() console.print() @@ -799,18 +1374,25 @@ def generate_driver( probe_display = [ ("className", probes.get("className")), + ("dbProductName", probes.get("dbProductName")), + ("dbProductVersion", probes.get("dbProductVersion")), ("prefix (from URL)", prefix), ("identifierQuoteChar", probes.get("identifierQuoteChar")), ("transactionIsolation", probes.get("transactionIsolation")), ("tableNameCasing", probes.get("tableNameCasing")), ("validationQuery", probes.get("validationQuery")), - ("subqueryRequiresAlias", str(probes.get("subqueryRequiresAlias", False)).lower()), + ("subqueryRequiresAlias", str(probes.get("subqueryRequiresAlias", True)).lower()), ("getTablesUsesNullCatalog", str(probes.get("getTablesUsesNullCatalog", False)).lower()), ("approxCountDistinctFunction", probes.get("approxCountDistinctFunction")), + ("rowCountQueryStyle", probes.get("rowCountQueryStyle")), ("schemaExistenceQueryStyle", probes.get("schemaExistenceQueryStyle")), + ("schemaOnlyQueryStyle", probes.get("schemaOnlyQueryStyle")), ("dateArithmeticStyle", probes.get("dateArithmeticStyle")), ("rowLimitSyntax", probes.get("rowLimitSyntax")), ("tableSampleTemplate", probes.get("tableSampleTemplate")), + ("viewSampleFallback", probes.get("viewSampleFallback")), + ("timestampLiteralStyle", probes.get("timestampLiteralStyle")), + ("dateLiteralStyle", probes.get("dateLiteralStyle")), ("intervalCalcDatetimeTimestampTemplate", probes.get("intervalCalcDatetimeTimestampTemplate")), ("intervalCalcDatetimeDateTemplate", probes.get("intervalCalcDatetimeDateTemplate")), ("upperBoundDatetimeTimestampTemplate", probes.get("upperBoundDatetimeTimestampTemplate")), @@ -832,13 +1414,12 @@ def generate_driver( todo_count = sum( 1 for _, v in probe_display if v is None or v == "null" ) - # Always-todo fields (performance tuning, manual research) + # Always-todo fields (not auto-detectable; need manual review or LLM assistance) always_todo = [ - "insertBatchSize", "maxPartitionParallelism", "dataSizeLimit", - "timestampLiteralStyle", "dateLiteralStyle", "schemaOnlyQueryStyle", - "viewSampleFallback", "rowCountQueryStyle", + "maxPartitionParallelism", "dataSizeLimit", "systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions", "connectionProperties", "sessionInitStatements", "dialectClass", + "jdbcUrlStaticParams", "jdbcUrlConditionalParams", "connectionSpec", ] total_todo = todo_count + len(always_todo) auto_detected = len(probe_display) - todo_count @@ -848,7 +1429,119 @@ def generate_driver( f"[yellow]{total_todo} field(s) need review[/yellow]\n" ) print(f" [bold]Written:[/bold] {out_path}") + index_path = os.path.join(drivers_dir, "index") + if index_updated: + print(f" [bold]Index:[/bold] {index_path} (added {yaml_filename})") + else: + print(f" [bold]Index:[/bold] {index_path} (already present — no change)") print( - "\n [dim]Review the file, fill in the TODO comments, then deploy to:[/dim]" - "\n [dim]META-INF/jdbc-drivers/" + prefix + ".yaml[/dim]\n" + "\n [dim]Review the YAML, fill in the TODO fields, then run:[/dim]" + "\n [dim] qualytics package-drivers[/dim]" + "\n [dim]to bundle all drivers into custom-drivers.jar[/dim]\n" ) + +# --------------------------------------------------------------------------- +# package-drivers command +# --------------------------------------------------------------------------- + +package_drivers_app = typer.Typer( + name="package-drivers", + help="Bundle generated driver YAMLs into a deployable JAR file.", + invoke_without_command=True, +) + + +@package_drivers_app.callback(invoke_without_command=True) +def package_drivers( + ctx: typer.Context, + dist_dir: Annotated[ + str, + typer.Option( + "--dist-dir", + help="Root dist directory produced by generate-driver. " + "Must contain META-INF/jdbc-drivers/.", + show_default=True, + ), + ] = "dist", + output: Annotated[ + Optional[str], + typer.Option( + "--output", + "-o", + help="Output JAR path. Defaults to custom-drivers.jar in the current directory.", + show_default=False, + ), + ] = None, +) -> None: + """Bundle all driver YAMLs in dist/META-INF/jdbc-drivers/ into a JAR. + + Reads the index file to enumerate drivers, then zips the entire + dist/ tree into a JAR file that can be loaded by the Qualytics + platform alongside the corresponding JDBC driver JARs. + + Examples: + + \\b + # Default — reads dist/, writes custom-drivers.jar + qualytics package-drivers + + # Custom paths + qualytics package-drivers --dist-dir ./build --output my-drivers.jar + """ + if ctx.invoked_subcommand is not None: + return + + print_banner(subtitle="[bold]Package Drivers[/bold]") + + abs_dist = os.path.abspath(dist_dir) + drivers_dir = os.path.join(abs_dist, "META-INF", "jdbc-drivers") + + # ── Validate dist dir ──────────────────────────────────────────────── + if not os.path.isdir(drivers_dir): + print( + f"[red]No jdbc-drivers directory found at: {drivers_dir}[/red]\n" + "[yellow]Run [bold]qualytics generate-driver[/bold] first to populate it.[/yellow]" + ) + raise typer.Exit(code=1) + + index_path = os.path.join(drivers_dir, "index") + if not os.path.isfile(index_path): + print( + f"[red]No index file found at: {index_path}[/red]\n" + "[yellow]Run [bold]qualytics generate-driver[/bold] first to create it.[/yellow]" + ) + raise typer.Exit(code=1) + + with open(index_path) as fh: + entries = [line.strip() for line in fh if line.strip()] + + if not entries: + print("[yellow]Index file is empty — nothing to package.[/yellow]") + raise typer.Exit(code=1) + + # ── Verify all indexed YAMLs exist ─────────────────────────────────── + missing = [e for e in entries if not os.path.isfile(os.path.join(drivers_dir, e))] + if missing: + print(f"[red]Index references files that do not exist: {missing}[/red]") + raise typer.Exit(code=1) + + # ── Write JAR ──────────────────────────────────────────────────────── + jar_path = os.path.abspath(output or "custom-drivers.jar") + print(f" Dist dir: [bold]{abs_dist}[/bold]") + print(f" Drivers: {', '.join(entries)}") + print(f" Output: [bold]{jar_path}[/bold]") + print() + + with status("[bold cyan]Packaging drivers...[/bold cyan]"): + try: + with zipfile.ZipFile(jar_path, "w", zipfile.ZIP_DEFLATED) as zf: + for root, _dirs, files in os.walk(abs_dist): + for fname in sorted(files): + fpath = os.path.join(root, fname) + arcname = os.path.relpath(fpath, abs_dist) + zf.write(fpath, arcname) + except OSError as e: + print(f"[red]Failed to write JAR: {e}[/red]") + raise typer.Exit(code=1) + + print(f" [{BRAND}]Packaged {len(entries)} driver(s) → {jar_path}[/{BRAND}]\n") diff --git a/qualytics/qualytics.py b/qualytics/qualytics.py index 2217b1c..a0d0e99 100644 --- a/qualytics/qualytics.py +++ b/qualytics/qualytics.py @@ -22,7 +22,7 @@ from .cli.users import users_app from .cli.teams import teams_app from .cli.tags import tags_app -from .cli.generate_driver import generate_driver_app +from .cli.generate_driver import generate_driver_app, package_drivers_app # Import config for environment setup from .config import DOTENV_PATH @@ -48,6 +48,7 @@ app.add_typer(tags_app, name="tags") app.command("doctor", help="Check CLI health and connectivity")(doctor) app.add_typer(generate_driver_app, name="generate-driver") +app.add_typer(package_drivers_app, name="package-drivers") if __name__ == "__main__": diff --git a/uv.lock b/uv.lock index 503cc32..97a2935 100644 --- a/uv.lock +++ b/uv.lock @@ -1190,11 +1190,11 @@ wheels = [ [[package]] name = "pygments" -version = "2.19.2" +version = "2.20.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] [[package]] From 36546ea6e8aec1e1cbb215119a5763c8bc420cb1 Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Fri, 3 Apr 2026 21:44:09 -0400 Subject: [PATCH 03/10] feat(generate-driver): auto-detect dialectClass from JAR ServiceLoader and Spark built-ins --- qualytics/cli/generate_driver.py | 61 +++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index e361632..5d45d7c 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -572,10 +572,55 @@ def _derive_url_metadata(jdbc_url: str) -> tuple[int | None, str, set[str]]: return None, "", url_components +# Known Spark built-in JdbcDialect implementations (Spark 3.x, package org.apache.spark.sql.jdbc) +_SPARK_BUILTIN_DIALECTS: dict[str, str] = { + "postgresql": "org.apache.spark.sql.jdbc.PostgresDialect$", + "mysql": "org.apache.spark.sql.jdbc.MySQLDialect$", + "mariadb": "org.apache.spark.sql.jdbc.MySQLDialect$", + "oracle": "org.apache.spark.sql.jdbc.OracleDialect$", + "sqlserver": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", + "jtds": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", + "db2": "org.apache.spark.sql.jdbc.DB2Dialect$", + "derby": "org.apache.spark.sql.jdbc.DerbyDialect$", + "teradata": "org.apache.spark.sql.jdbc.TeradataDialect$", +} + + +def _detect_dialect_class(prefix: str, jar_path: str) -> str | None: + """ + Return the fully-qualified JdbcDialect class name to use for dialectClass, or None. + + Priority: + 1. Driver JAR ServiceLoader registration: + META-INF/services/org.apache.spark.sql.jdbc.JdbcDialect + 2. Known Spark built-in dialect for this JDBC prefix. + """ + import zipfile as _zf + + # 1. Scan the JAR for a ServiceLoader registration file + try: + with _zf.ZipFile(jar_path, "r") as zf: + service_entry = "META-INF/services/org.apache.spark.sql.jdbc.JdbcDialect" + if service_entry in zf.namelist(): + content = zf.read(service_entry).decode("utf-8", errors="replace").strip() + # Take the first non-comment, non-blank line + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith("#"): + return line + except Exception: + pass # JAR unreadable or not a zip — fall through + + # 2. Static built-in lookup by prefix + return _SPARK_BUILTIN_DIALECTS.get(prefix.lower()) + + def _build_yaml( prefix: str, probes: dict, jdbc_url: str, + *, + dialect_class: str | None = None, ) -> tuple[str, list[str], list[str]]: """ Build the complete YAML content string from the probes dict. @@ -933,10 +978,15 @@ def field(name: str, value, comment: str = "") -> str: # ── Spark JdbcDialect ───────────────────────────────────────────────────── lines.append("# ── Spark JdbcDialect ────────────────────────────────────────────────") - lines.append(field("dialectClass", None, - "TODO: fully-qualified JdbcDialect Scala object class to register with Spark " - "(e.g. com.example.MyDialect$); null if no custom Spark dialect is needed")) - todo_fields.append("dialectClass") + if dialect_class is not None: + lines.append(field("dialectClass", dialect_class, + "Auto-detected Spark JdbcDialect subclass")) + detected_fields.append("dialectClass") + else: + lines.append(field("dialectClass", None, + "TODO: fully-qualified JdbcDialect Scala object class to register with Spark " + "(e.g. com.example.MyDialect$); null if no custom Spark dialect is needed")) + todo_fields.append("dialectClass") lines.append("") # ── URL construction ────────────────────────────────────────────────────── @@ -1269,7 +1319,8 @@ def generate_driver( ) # ── Build YAML ─────────────────────────────────────────────────────── - yaml_content, detected_fields, todo_fields = _build_yaml(prefix, probes, url) + detected_dialect = _detect_dialect_class(prefix, jar_path) + yaml_content, detected_fields, todo_fields = _build_yaml(prefix, probes, url, dialect_class=detected_dialect) # ── Write output ───────────────────────────────────────────────────── try: From 83a73826fb706a80f08de5a8041a08bf43650cce Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sat, 4 Apr 2026 22:51:32 -0400 Subject: [PATCH 04/10] Only add one top-line argument for driver management --- LICENSE | 2 +- qualytics/cli/generate_driver.py | 48 ++++++++++++-------------------- qualytics/qualytics.py | 5 ++-- uv.lock | 6 ++-- 4 files changed, 24 insertions(+), 37 deletions(-) diff --git a/LICENSE b/LICENSE index 320da17..d854c63 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Qualytics, Inc. +Copyright (c) 2026 Qualytics, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 5d45d7c..0980d19 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -18,7 +18,7 @@ from rich.console import Console from rich.table import Table -from . import BRAND, print_banner +from . import BRAND, add_suggestion_callback, print_banner from .progress import status # --------------------------------------------------------------------------- @@ -1174,17 +1174,17 @@ def _update_index(drivers_dir: str, yaml_filename: str) -> bool: # --------------------------------------------------------------------------- -# CLI command +# CLI commands — drivers group # --------------------------------------------------------------------------- -generate_driver_app = typer.Typer( - name="generate-driver", - help="Generate a YAML driver definition by probing a JDBC driver JAR.", - invoke_without_command=True, +drivers_app = typer.Typer( + name="drivers", + help="Manage pluggable JDBC drivers for Qualytics.", ) +add_suggestion_callback(drivers_app, "drivers") -@generate_driver_app.callback(invoke_without_command=True) +@drivers_app.command("generate") def generate_driver( ctx: typer.Context, jar: Annotated[ @@ -1254,7 +1254,7 @@ def generate_driver( dist/META-INF/jdbc-drivers/.yaml by default, and an index file is created or updated in the same directory. - Run ``qualytics package-drivers`` afterwards to bundle all generated YAMLs + Run ``qualytics drivers package`` afterwards to bundle all generated YAMLs into a single deployable JAR. Requires a JDK (java + javac) on PATH. @@ -1262,20 +1262,17 @@ def generate_driver( Examples: \\b - qualytics generate-driver \\ + qualytics drivers generate \\ --jar ./postgresql-42.7.3.jar \\ --url jdbc:postgresql://localhost:5432/mydb \\ --user alice --password secret - qualytics generate-driver \\ + qualytics drivers generate \\ --jar ./custom-driver.jar \\ --url jdbc:customdb://host:1234/catalog \\ --properties loginTimeout=30 \\ --output custom.yaml """ - # Skip if invoked as part of a help display - if ctx.invoked_subcommand is not None: - return print_banner(subtitle="[bold]Generate Driver[/bold]") @@ -1487,29 +1484,22 @@ def generate_driver( print(f" [bold]Index:[/bold] {index_path} (already present — no change)") print( "\n [dim]Review the YAML, fill in the TODO fields, then run:[/dim]" - "\n [dim] qualytics package-drivers[/dim]" + "\n [dim] qualytics drivers package[/dim]" "\n [dim]to bundle all drivers into custom-drivers.jar[/dim]\n" ) # --------------------------------------------------------------------------- -# package-drivers command +# drivers package command # --------------------------------------------------------------------------- -package_drivers_app = typer.Typer( - name="package-drivers", - help="Bundle generated driver YAMLs into a deployable JAR file.", - invoke_without_command=True, -) - -@package_drivers_app.callback(invoke_without_command=True) +@drivers_app.command("package") def package_drivers( - ctx: typer.Context, dist_dir: Annotated[ str, typer.Option( "--dist-dir", - help="Root dist directory produced by generate-driver. " + help="Root dist directory produced by 'drivers generate'. " "Must contain META-INF/jdbc-drivers/.", show_default=True, ), @@ -1534,13 +1524,11 @@ def package_drivers( \\b # Default — reads dist/, writes custom-drivers.jar - qualytics package-drivers + qualytics drivers package # Custom paths - qualytics package-drivers --dist-dir ./build --output my-drivers.jar + qualytics drivers package --dist-dir ./build --output my-drivers.jar """ - if ctx.invoked_subcommand is not None: - return print_banner(subtitle="[bold]Package Drivers[/bold]") @@ -1551,7 +1539,7 @@ def package_drivers( if not os.path.isdir(drivers_dir): print( f"[red]No jdbc-drivers directory found at: {drivers_dir}[/red]\n" - "[yellow]Run [bold]qualytics generate-driver[/bold] first to populate it.[/yellow]" + "[yellow]Run [bold]qualytics drivers generate[/bold] first to populate it.[/yellow]" ) raise typer.Exit(code=1) @@ -1559,7 +1547,7 @@ def package_drivers( if not os.path.isfile(index_path): print( f"[red]No index file found at: {index_path}[/red]\n" - "[yellow]Run [bold]qualytics generate-driver[/bold] first to create it.[/yellow]" + "[yellow]Run [bold]qualytics drivers generate[/bold] first to create it.[/yellow]" ) raise typer.Exit(code=1) diff --git a/qualytics/qualytics.py b/qualytics/qualytics.py index a0d0e99..9e9a3df 100644 --- a/qualytics/qualytics.py +++ b/qualytics/qualytics.py @@ -22,7 +22,7 @@ from .cli.users import users_app from .cli.teams import teams_app from .cli.tags import tags_app -from .cli.generate_driver import generate_driver_app, package_drivers_app +from .cli.generate_driver import drivers_app # Import config for environment setup from .config import DOTENV_PATH @@ -47,8 +47,7 @@ app.add_typer(teams_app, name="teams") app.add_typer(tags_app, name="tags") app.command("doctor", help="Check CLI health and connectivity")(doctor) -app.add_typer(generate_driver_app, name="generate-driver") -app.add_typer(package_drivers_app, name="package-drivers") +app.add_typer(drivers_app, name="drivers") if __name__ == "__main__": diff --git a/uv.lock b/uv.lock index 97a2935..503cc32 100644 --- a/uv.lock +++ b/uv.lock @@ -1190,11 +1190,11 @@ wheels = [ [[package]] name = "pygments" -version = "2.20.0" +version = "2.19.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, + { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] [[package]] From b79dea8409f80605c64083cec736b8913c2f93ad Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 08:54:20 -0400 Subject: [PATCH 05/10] fix(generate-driver): use INT_MAX for Redshift dataSizeLimit, align value with comment Redshift uses a 32-bit JDBC driver and must use INT_MAX, not LONG_MAX. The generator now auto-selects INT_MAX for redshift/sqlserver/db2 prefixes with a matching comment so value and comment are always in sync. Co-Authored-By: Claude Sonnet 4.6 --- qualytics/cli/generate_driver.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 0980d19..7945084 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -867,9 +867,15 @@ def field(name: str, value, comment: str = "") -> str: "Set 1 for DBs that struggle with concurrent connections (e.g. BigQuery, " "single-threaded embedded drivers)")) todo_fields.append("maxPartitionParallelism") - lines.append(field("dataSizeLimit", "LONG_MAX", - "TODO: max data the driver can handle. LONG_MAX (default, most DBs) or " - "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)")) + _int_max_prefixes = ("redshift", "sqlserver", "db2") + _data_size_default = "INT_MAX" if any(p in prefix.lower() for p in _int_max_prefixes) else "LONG_MAX" + _data_size_comment = ( + "INT_MAX: older 32-bit driver (SQL Server, Redshift, Db2)" + if _data_size_default == "INT_MAX" + else "TODO: max data the driver can handle. LONG_MAX (default, most DBs) or " + "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)" + ) + lines.append(field("dataSizeLimit", _data_size_default, _data_size_comment)) todo_fields.append("dataSizeLimit") lines.append("") From ecc019efdb31e58e01073f0e9526ef3a31bd9442 Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 09:50:02 -0400 Subject: [PATCH 06/10] Update qualytics/cli/generate_driver.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- qualytics/cli/generate_driver.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 7945084..0cb2a88 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -876,7 +876,10 @@ def field(name: str, value, comment: str = "") -> str: "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)" ) lines.append(field("dataSizeLimit", _data_size_default, _data_size_comment)) - todo_fields.append("dataSizeLimit") + if _data_size_default == "INT_MAX": + detected_fields.append("dataSizeLimit") + else: + todo_fields.append("dataSizeLimit") lines.append("") # ── Schema / catalog filtering ──────────────────────────────────────────── From a807a64e4f90723468eae30c515b016b47cbff3f Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 09:50:34 -0400 Subject: [PATCH 07/10] Update qualytics/cli/generate_driver.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- qualytics/cli/generate_driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 0cb2a88..2d4919d 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -868,7 +868,7 @@ def field(name: str, value, comment: str = "") -> str: "single-threaded embedded drivers)")) todo_fields.append("maxPartitionParallelism") _int_max_prefixes = ("redshift", "sqlserver", "db2") - _data_size_default = "INT_MAX" if any(p in prefix.lower() for p in _int_max_prefixes) else "LONG_MAX" + _data_size_default = "INT_MAX" if prefix.lower() in _int_max_prefixes else "LONG_MAX" _data_size_comment = ( "INT_MAX: older 32-bit driver (SQL Server, Redshift, Db2)" if _data_size_default == "INT_MAX" From 6390db051a53261dc82dbf3dee2da9048f42ff4a Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 09:53:46 -0400 Subject: [PATCH 08/10] linting --- qualytics/cli/generate_driver.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 2d4919d..7945084 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -868,7 +868,7 @@ def field(name: str, value, comment: str = "") -> str: "single-threaded embedded drivers)")) todo_fields.append("maxPartitionParallelism") _int_max_prefixes = ("redshift", "sqlserver", "db2") - _data_size_default = "INT_MAX" if prefix.lower() in _int_max_prefixes else "LONG_MAX" + _data_size_default = "INT_MAX" if any(p in prefix.lower() for p in _int_max_prefixes) else "LONG_MAX" _data_size_comment = ( "INT_MAX: older 32-bit driver (SQL Server, Redshift, Db2)" if _data_size_default == "INT_MAX" @@ -876,10 +876,7 @@ def field(name: str, value, comment: str = "") -> str: "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)" ) lines.append(field("dataSizeLimit", _data_size_default, _data_size_comment)) - if _data_size_default == "INT_MAX": - detected_fields.append("dataSizeLimit") - else: - todo_fields.append("dataSizeLimit") + todo_fields.append("dataSizeLimit") lines.append("") # ── Schema / catalog filtering ──────────────────────────────────────────── From e10cb36b046424f8121b315628124ef29226eee6 Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 19:41:01 -0400 Subject: [PATCH 09/10] linting --- qualytics/cli/generate_driver.py | 727 ++++++++++++++++++++++--------- 1 file changed, 510 insertions(+), 217 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 7945084..3427d65 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -425,9 +425,7 @@ def _compile_probe(tmpdir: str) -> str: text=True, ) if result.returncode != 0: - print( - f"[red]Failed to compile Java probe:[/red]\n{result.stderr}" - ) + print(f"[red]Failed to compile Java probe:[/red]\n{result.stderr}") raise typer.Exit(code=1) return tmpdir @@ -473,17 +471,23 @@ def _run_probe( if result.stderr.strip(): for line in result.stderr.strip().splitlines(): if line.startswith("CONNECTION_ERROR:"): - print(f"[red]JDBC connection failed: {line[len('CONNECTION_ERROR:'):].strip()}[/red]") + print( + f"[red]JDBC connection failed: {line[len('CONNECTION_ERROR:') :].strip()}[/red]" + ) raise typer.Exit(code=1) if line.startswith("ERROR:"): - print(f"[red]{line[len('ERROR:'):].strip()}[/red]") + print(f"[red]{line[len('ERROR:') :].strip()}[/red]") raise typer.Exit(code=1) if result.returncode == 4: - print("[red]Could not connect to the database. Check --url, --user, and --password.[/red]") + print( + "[red]Could not connect to the database. Check --url, --user, and --password.[/red]" + ) raise typer.Exit(code=1) if result.returncode == 3: - print("[red]No compatible JDBC driver found in the provided JAR for the given URL.[/red]") + print( + "[red]No compatible JDBC driver found in the provided JAR for the given URL.[/red]" + ) raise typer.Exit(code=1) if result.returncode != 0: print(f"[red]Probe exited with code {result.returncode}.[/red]") @@ -535,9 +539,9 @@ def _derive_url_metadata(jdbc_url: str) -> tuple[int | None, str, set[str]]: rest = m.group(3) # host — present if authority is non-empty after stripping credentials/port/params - host_part = re.sub(r"^[^@]+@", "", authority) # strip user:pass@ + host_part = re.sub(r"^[^@]+@", "", authority) # strip user:pass@ host_part = re.sub(r":\d+(?:$|;)", "", host_part) # strip :port - host_part = re.sub(r";.*$", "", host_part) # strip ;params (SQL Server style) + host_part = re.sub(r";.*$", "", host_part) # strip ;params (SQL Server style) if host_part.strip(): url_components.add("host") @@ -575,14 +579,14 @@ def _derive_url_metadata(jdbc_url: str) -> tuple[int | None, str, set[str]]: # Known Spark built-in JdbcDialect implementations (Spark 3.x, package org.apache.spark.sql.jdbc) _SPARK_BUILTIN_DIALECTS: dict[str, str] = { "postgresql": "org.apache.spark.sql.jdbc.PostgresDialect$", - "mysql": "org.apache.spark.sql.jdbc.MySQLDialect$", - "mariadb": "org.apache.spark.sql.jdbc.MySQLDialect$", - "oracle": "org.apache.spark.sql.jdbc.OracleDialect$", - "sqlserver": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", - "jtds": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", - "db2": "org.apache.spark.sql.jdbc.DB2Dialect$", - "derby": "org.apache.spark.sql.jdbc.DerbyDialect$", - "teradata": "org.apache.spark.sql.jdbc.TeradataDialect$", + "mysql": "org.apache.spark.sql.jdbc.MySQLDialect$", + "mariadb": "org.apache.spark.sql.jdbc.MySQLDialect$", + "oracle": "org.apache.spark.sql.jdbc.OracleDialect$", + "sqlserver": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", + "jtds": "org.apache.spark.sql.jdbc.MsSqlServerDialect$", + "db2": "org.apache.spark.sql.jdbc.DB2Dialect$", + "derby": "org.apache.spark.sql.jdbc.DerbyDialect$", + "teradata": "org.apache.spark.sql.jdbc.TeradataDialect$", } @@ -602,7 +606,9 @@ def _detect_dialect_class(prefix: str, jar_path: str) -> str | None: with _zf.ZipFile(jar_path, "r") as zf: service_entry = "META-INF/services/org.apache.spark.sql.jdbc.JdbcDialect" if service_entry in zf.namelist(): - content = zf.read(service_entry).decode("utf-8", errors="replace").strip() + content = ( + zf.read(service_entry).decode("utf-8", errors="replace").strip() + ) # Take the first non-comment, non-blank line for line in content.splitlines(): line = line.strip() @@ -647,12 +653,37 @@ def _render(value) -> str: sv = str(value) if sv == "null": return "null" - if any(c in sv for c in [':', '#', '[', ']', '{', '}', ',', '&', '*', '?', '|', - '-', '<', '>', '=', '!', '%', '@', '`', '"', "'", '\n']): + if any( + c in sv + for c in [ + ":", + "#", + "[", + "]", + "{", + "}", + ",", + "&", + "*", + "?", + "|", + "-", + "<", + ">", + "=", + "!", + "%", + "@", + "`", + '"', + "'", + "\n", + ] + ): dumped = yaml.dump(sv, default_flow_style=True).strip() # yaml.dump may append a YAML document-end marker on a new line — strip it - if '\n' in dumped: - dumped = dumped.split('\n')[0] + if "\n" in dumped: + dumped = dumped.split("\n")[0] return dumped return sv @@ -672,7 +703,8 @@ def field(name: str, value, comment: str = "") -> str: ) # ── Header ────────────────────────────────────────────────────────────── - lines.append(textwrap.dedent(f"""\ + lines.append( + textwrap.dedent(f"""\ # Generated by: qualytics generate-driver # # "# auto-detected" — probed from the live JDBC connection. @@ -685,81 +717,134 @@ def field(name: str, value, comment: str = "") -> str: # Keys equal to their DriverDefinition default are omitted to keep this file concise. # # Deploy this file to: META-INF/jdbc-drivers/{prefix}.yaml - """)) + """) + ) # ── Identity ───────────────────────────────────────────────────────────── - lines.append(field("prefix", prefix, - "review — must match the jdbc:: scheme in the JDBC URL")) + lines.append( + field( + "prefix", + prefix, + "review — must match the jdbc:: scheme in the JDBC URL", + ) + ) todo_fields.append("prefix") - lines.append(field("className", probes.get("className"), - "auto-detected — fully-qualified JDBC Driver class name")) + lines.append( + field( + "className", + probes.get("className"), + "auto-detected — fully-qualified JDBC Driver class name", + ) + ) detected_fields.append("className") lines.append("") # ── SQL dialect ─────────────────────────────────────────────────────────── - lines.append("# ── SQL dialect ──────────────────────────────────────────────────────") + lines.append( + "# ── SQL dialect ──────────────────────────────────────────────────────" + ) # displayName — always emit (default is raw prefix; capitalised form is user-friendly) - lines.append(field("displayName", display_name, - "auto-detected from DB product name — human-readable name shown in the UI")) + lines.append( + field( + "displayName", + display_name, + "auto-detected from DB product name — human-readable name shown in the UI", + ) + ) detected_fields.append("displayName") # defaultPort — emit only when probe URL contained an explicit port number. # If the driver uses a portless URL scheme (e.g. jdbc:sqlite:, jdbc:h2:mem:), # leave as TODO so the user knows to set it (or omit if the driver truly has no port). if default_port is not None: - lines.append(field("defaultPort", default_port, - "auto-detected from JDBC URL — default port shown in the connection form")) + lines.append( + field( + "defaultPort", + default_port, + "auto-detected from JDBC URL — default port shown in the connection form", + ) + ) detected_fields.append("defaultPort") else: - lines.append(field("defaultPort", None, - "TODO: default TCP port for this driver " - "(e.g. 5432 PostgreSQL, 3306 MySQL, 1521 Oracle, 1433 SQL Server). " - "Omit this key entirely if the driver does not use TCP ports.")) + lines.append( + field( + "defaultPort", + None, + "TODO: default TCP port for this driver " + "(e.g. 5432 PostgreSQL, 3306 MySQL, 1521 Oracle, 1433 SQL Server). " + "Omit this key entirely if the driver does not use TCP ports.", + ) + ) todo_fields.append("defaultPort") # transactionIsolation — omit if READ_UNCOMMITTED (default) tx = probes.get("transactionIsolation") if tx and tx != "READ_UNCOMMITTED": - lines.append(field("transactionIsolation", tx, - "auto-detected — valid: NONE, READ_UNCOMMITTED (default), " - "READ_COMMITTED, SERIALIZABLE")) + lines.append( + field( + "transactionIsolation", + tx, + "auto-detected — valid: NONE, READ_UNCOMMITTED (default), " + "READ_COMMITTED, SERIALIZABLE", + ) + ) detected_fields.append("transactionIsolation") elif tx: - detected_fields.append("transactionIsolation") # default — omitted + detected_fields.append("transactionIsolation") # default — omitted # identifierQuoteChar — omit if " (default) quote_char = probes.get("identifierQuoteChar") if quote_char and quote_char != '"': - lines.append(field("identifierQuoteChar", quote_char, - 'auto-detected — char used to quote identifiers; default " — MySQL/MariaDB use `')) + lines.append( + field( + "identifierQuoteChar", + quote_char, + 'auto-detected — char used to quote identifiers; default " — MySQL/MariaDB use `', + ) + ) detected_fields.append("identifierQuoteChar") elif quote_char: - detected_fields.append("identifierQuoteChar") # default — omitted + detected_fields.append("identifierQuoteChar") # default — omitted # tableNameCasing — omit if asis (default) casing = probes.get("tableNameCasing", "asis") if casing != "asis": - lines.append(field("tableNameCasing", casing, - "auto-detected — valid: upper (DB2/Oracle), lower (PostgreSQL), " - "asis (default, most others)")) + lines.append( + field( + "tableNameCasing", + casing, + "auto-detected — valid: upper (DB2/Oracle), lower (PostgreSQL), " + "asis (default, most others)", + ) + ) detected_fields.append("tableNameCasing") else: - detected_fields.append("tableNameCasing") # default — omitted + detected_fields.append("tableNameCasing") # default — omitted # rowLimitSyntax — omit if LIMIT (default); TODO if probe couldn't determine row_limit = probes.get("rowLimitSyntax") if row_limit and row_limit != "LIMIT": - lines.append(field("rowLimitSyntax", row_limit, - "auto-detected — valid: LIMIT (default), TOP (SQL Server), " - "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix)")) + lines.append( + field( + "rowLimitSyntax", + row_limit, + "auto-detected — valid: LIMIT (default), TOP (SQL Server), " + "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix)", + ) + ) detected_fields.append("rowLimitSyntax") elif row_limit == "LIMIT": - detected_fields.append("rowLimitSyntax") # default — omitted + detected_fields.append("rowLimitSyntax") # default — omitted else: - lines.append(field("rowLimitSyntax", "LIMIT", - "TODO: valid: LIMIT (default, MySQL/PG/SQLite), TOP (SQL Server), " - "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix/Spark)")) + lines.append( + field( + "rowLimitSyntax", + "LIMIT", + "TODO: valid: LIMIT (default, MySQL/PG/SQLite), TOP (SQL Server), " + "ROWNUM (Oracle), FETCH_FIRST (DB2/Informix/Spark)", + ) + ) todo_fields.append("rowLimitSyntax") # subqueryRequiresAlias — omit if true (default); emit false if probe confirmed no alias needed @@ -767,9 +852,14 @@ def field(name: str, value, comment: str = "") -> str: if isinstance(sub_alias, str): sub_alias = sub_alias.lower() != "false" if not sub_alias: - lines.append(field("subqueryRequiresAlias", False, - "auto-detected — false: subqueries do NOT need an AS alias " - "(rare; historically Oracle)")) + lines.append( + field( + "subqueryRequiresAlias", + False, + "auto-detected — false: subqueries do NOT need an AS alias " + "(rare; historically Oracle)", + ) + ) detected_fields.append("subqueryRequiresAlias") else: detected_fields.append("subqueryRequiresAlias") # default true — omitted @@ -777,10 +867,15 @@ def field(name: str, value, comment: str = "") -> str: # timestampLiteralStyle — omit if PLAIN (default) ts_style = probes.get("timestampLiteralStyle", "PLAIN") if ts_style != "PLAIN": - lines.append(field("timestampLiteralStyle", ts_style, - "auto-detected — valid: PLAIN (default), TIMESTAMP_PREFIX (standard SQL), " - "CAST_AS_TIMESTAMP (Hive), CAST_DATE_FORMAT (Databricks), " - "TO_TIMESTAMP (Oracle), CAST_DATETIME2 (SQL Server)")) + lines.append( + field( + "timestampLiteralStyle", + ts_style, + "auto-detected — valid: PLAIN (default), TIMESTAMP_PREFIX (standard SQL), " + "CAST_AS_TIMESTAMP (Hive), CAST_DATE_FORMAT (Databricks), " + "TO_TIMESTAMP (Oracle), CAST_DATETIME2 (SQL Server)", + ) + ) detected_fields.append("timestampLiteralStyle") else: detected_fields.append("timestampLiteralStyle") # default — omitted @@ -789,57 +884,87 @@ def field(name: str, value, comment: str = "") -> str: # dateLiteralStyle — omit if PLAIN (default) dt_style = probes.get("dateLiteralStyle", "PLAIN") if dt_style != "PLAIN": - lines.append(field("dateLiteralStyle", dt_style, - "auto-detected — valid: PLAIN (default), DATE_PREFIX, TO_DATE (Oracle)")) + lines.append( + field( + "dateLiteralStyle", + dt_style, + "auto-detected — valid: PLAIN (default), DATE_PREFIX, TO_DATE (Oracle)", + ) + ) detected_fields.append("dateLiteralStyle") else: - detected_fields.append("dateLiteralStyle") # default — omitted + detected_fields.append("dateLiteralStyle") # default — omitted # dateLiteralTemplate: escape hatch — omit unless enum styles are insufficient # schemaOnlyQueryStyle — if CTE (probe fallback, unconfirmed) → TODO; else emit as detected schema_only = probes.get("schemaOnlyQueryStyle", "CTE") if schema_only != "CTE": - lines.append(field("schemaOnlyQueryStyle", schema_only, - "auto-detected — how to wrap a query to return 0 rows for schema inspection. " - "Valid: CTE (default), PG_CTE (PostgreSQL), SQLSERVER_TOP0 (SQL Server), " - "WHERE_FALSE_QUERYA (generic WHERE 1=0), ORACLE_WHERE_FALSE (Oracle), " - "HIVE_LIMIT0 (Hive/Spark)")) + lines.append( + field( + "schemaOnlyQueryStyle", + schema_only, + "auto-detected — how to wrap a query to return 0 rows for schema inspection. " + "Valid: CTE (default), PG_CTE (PostgreSQL), SQLSERVER_TOP0 (SQL Server), " + "WHERE_FALSE_QUERYA (generic WHERE 1=0), ORACLE_WHERE_FALSE (Oracle), " + "HIVE_LIMIT0 (Hive/Spark)", + ) + ) detected_fields.append("schemaOnlyQueryStyle") else: - lines.append(field("schemaOnlyQueryStyle", "CTE", - "TODO: how to wrap a query to return 0 rows for schema inspection. " - "Valid: CTE (default, most modern DBs with WITH support), PG_CTE (PostgreSQL), " - "SQLSERVER_TOP0 (SQL Server/Synapse), WHERE_FALSE_QUERYA (generic WHERE 1=0), " - "ORACLE_WHERE_FALSE (Oracle), HIVE_LIMIT0 (Hive/Spark)")) + lines.append( + field( + "schemaOnlyQueryStyle", + "CTE", + "TODO: how to wrap a query to return 0 rows for schema inspection. " + "Valid: CTE (default, most modern DBs with WITH support), PG_CTE (PostgreSQL), " + "SQLSERVER_TOP0 (SQL Server/Synapse), WHERE_FALSE_QUERYA (generic WHERE 1=0), " + "ORACLE_WHERE_FALSE (Oracle), HIVE_LIMIT0 (Hive/Spark)", + ) + ) todo_fields.append("schemaOnlyQueryStyle") # tableSampleTemplate — omit if null (not supported = default "no template") sample_tmpl = probes.get("tableSampleTemplate") if sample_tmpl and sample_tmpl != "null": - lines.append(field("tableSampleTemplate", sample_tmpl, - "auto-detected — TABLESAMPLE syntax; {pct} = percent, {rows} = row count")) + lines.append( + field( + "tableSampleTemplate", + sample_tmpl, + "auto-detected — TABLESAMPLE syntax; {pct} = percent, {rows} = row count", + ) + ) detected_fields.append("tableSampleTemplate") else: - detected_fields.append("tableSampleTemplate") # null/not supported — omitted + detected_fields.append("tableSampleTemplate") # null/not supported — omitted # viewSampleFallback — omit if RAND (default) vsf = probes.get("viewSampleFallback", "RAND") if vsf != "RAND": - lines.append(field("viewSampleFallback", vsf, - "auto-detected — random fn for view sampling. " - "Valid: RAND (default), RANDOM (PostgreSQL/Redshift), " - "NEWID (SQL Server), DBMS_RANDOM (Oracle), SAMPLE_N (Teradata), " - "NONE (BigQuery)")) + lines.append( + field( + "viewSampleFallback", + vsf, + "auto-detected — random fn for view sampling. " + "Valid: RAND (default), RANDOM (PostgreSQL/Redshift), " + "NEWID (SQL Server), DBMS_RANDOM (Oracle), SAMPLE_N (Teradata), " + "NONE (BigQuery)", + ) + ) detected_fields.append("viewSampleFallback") else: - detected_fields.append("viewSampleFallback") # default — omitted + detected_fields.append("viewSampleFallback") # default — omitted # viewSampleFallbackSql: escape hatch — omit unless enum styles are insufficient # approxCountDistinctFunction — omit if null (not supported, falls back to COUNT DISTINCT) approx = probes.get("approxCountDistinctFunction") if approx and approx != "null": - lines.append(field("approxCountDistinctFunction", approx, - "auto-detected — SQL function name for approximate COUNT DISTINCT")) + lines.append( + field( + "approxCountDistinctFunction", + approx, + "auto-detected — SQL function name for approximate COUNT DISTINCT", + ) + ) detected_fields.append("approxCountDistinctFunction") else: detected_fields.append("approxCountDistinctFunction") # null — omitted @@ -847,88 +972,141 @@ def field(name: str, value, comment: str = "") -> str: # validationQuery — omit if SELECT 1 (default) val_q = probes.get("validationQuery") if val_q and val_q != "SELECT 1": - lines.append(field("validationQuery", val_q, - "auto-detected — minimal SQL to test a pooled connection is alive")) + lines.append( + field( + "validationQuery", + val_q, + "auto-detected — minimal SQL to test a pooled connection is alive", + ) + ) detected_fields.append("validationQuery") elif val_q: - detected_fields.append("validationQuery") # default — omitted + detected_fields.append("validationQuery") # default — omitted else: - lines.append(field("validationQuery", "SELECT 1", - "TODO: SQL to verify a live connection; try SELECT 1 FROM DUAL (Oracle), " - "VALUES 1 (DB2/H2)")) + lines.append( + field( + "validationQuery", + "SELECT 1", + "TODO: SQL to verify a live connection; try SELECT 1 FROM DUAL (Oracle), " + "VALUES 1 (DB2/H2)", + ) + ) todo_fields.append("validationQuery") lines.append("") # ── Performance ─────────────────────────────────────────────────────────── - lines.append("# ── Performance ──────────────────────────────────────────────────────") - lines.append(field("maxPartitionParallelism", 10, - "TODO: max parallel partitions for scan operations; default 10. " - "Set 1 for DBs that struggle with concurrent connections (e.g. BigQuery, " - "single-threaded embedded drivers)")) + lines.append( + "# ── Performance ──────────────────────────────────────────────────────" + ) + lines.append( + field( + "maxPartitionParallelism", + 10, + "TODO: max parallel partitions for scan operations; default 10. " + "Set 1 for DBs that struggle with concurrent connections (e.g. BigQuery, " + "single-threaded embedded drivers)", + ) + ) todo_fields.append("maxPartitionParallelism") _int_max_prefixes = ("redshift", "sqlserver", "db2") - _data_size_default = "INT_MAX" if any(p in prefix.lower() for p in _int_max_prefixes) else "LONG_MAX" + _data_size_default = ( + "INT_MAX" if any(p in prefix.lower() for p in _int_max_prefixes) else "LONG_MAX" + ) _data_size_comment = ( "INT_MAX: older 32-bit driver (SQL Server, Redshift, Db2)" if _data_size_default == "INT_MAX" else "TODO: max data the driver can handle. LONG_MAX (default, most DBs) or " - "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)" + "INT_MAX for older 32-bit drivers (SQL Server, Redshift, Db2)" ) lines.append(field("dataSizeLimit", _data_size_default, _data_size_comment)) todo_fields.append("dataSizeLimit") lines.append("") # ── Schema / catalog filtering ──────────────────────────────────────────── - lines.append("# ── Schema / catalog filtering ───────────────────────────────────────") - lines.append("systemSchemaExclusions: []" - " # TODO: exact schema names to exclude from catalog scans " - "(e.g. [information_schema, pg_catalog])") - lines.append("systemSchemaExclusionPrefixes: []" - " # TODO: schema name prefixes to exclude (e.g. [pg_temp_, pg_toast_temp_])") - lines.append("systemCatalogExclusions: []" - " # TODO: catalog names to exclude " - "(e.g. [admin, local, config] for MongoDB; [information_schema, mysql] for MySQL)") - todo_fields += ["systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions"] + lines.append( + "# ── Schema / catalog filtering ───────────────────────────────────────" + ) + lines.append( + "systemSchemaExclusions: []" + " # TODO: exact schema names to exclude from catalog scans " + "(e.g. [information_schema, pg_catalog])" + ) + lines.append( + "systemSchemaExclusionPrefixes: []" + " # TODO: schema name prefixes to exclude (e.g. [pg_temp_, pg_toast_temp_])" + ) + lines.append( + "systemCatalogExclusions: []" + " # TODO: catalog names to exclude " + "(e.g. [admin, local, config] for MongoDB; [information_schema, mysql] for MySQL)" + ) + todo_fields += [ + "systemSchemaExclusions", + "systemSchemaExclusionPrefixes", + "systemCatalogExclusions", + ] # getTablesUsesNullCatalog — omit if false (default); emit if true get_tables_null = probes.get("getTablesUsesNullCatalog", False) if isinstance(get_tables_null, str): get_tables_null = get_tables_null.lower() == "true" if get_tables_null: - lines.append(field("getTablesUsesNullCatalog", True, - "auto-detected — pass null as catalog arg to DatabaseMetaData.getTables(); " - "required for Db2")) + lines.append( + field( + "getTablesUsesNullCatalog", + True, + "auto-detected — pass null as catalog arg to DatabaseMetaData.getTables(); " + "required for Db2", + ) + ) detected_fields.append("getTablesUsesNullCatalog") else: detected_fields.append("getTablesUsesNullCatalog") # default false — omitted lines.append("") # ── Style selectors ──────────────────────────────────────────────────────── - lines.append("# ── Style selectors ──────────────────────────────────────────────────") + lines.append( + "# ── Style selectors ──────────────────────────────────────────────────" + ) # rowCountQueryStyle — emit as TODO if COUNT_STAR (default/unconfirmed), else auto-detected row_count_style = probes.get("rowCountQueryStyle", "COUNT_STAR") if row_count_style and row_count_style != "COUNT_STAR": - lines.append(field("rowCountQueryStyle", row_count_style, - "auto-detected — row count strategy")) + lines.append( + field( + "rowCountQueryStyle", + row_count_style, + "auto-detected — row count strategy", + ) + ) detected_fields.append("rowCountQueryStyle") else: - lines.append(field("rowCountQueryStyle", "COUNT_STAR", - "TODO: row count strategy. Valid: COUNT_STAR (default, always works), " - "BQ_TABLES (BigQuery), INFORMATION_SCHEMA_ROW_COUNT (MySQL/MariaDB), " - "ALL_TABLES (Oracle), INFORMATION_SCHEMA_TABLES_WITH_SIZE (MySQL/MariaDB)")) + lines.append( + field( + "rowCountQueryStyle", + "COUNT_STAR", + "TODO: row count strategy. Valid: COUNT_STAR (default, always works), " + "BQ_TABLES (BigQuery), INFORMATION_SCHEMA_ROW_COUNT (MySQL/MariaDB), " + "ALL_TABLES (Oracle), INFORMATION_SCHEMA_TABLES_WITH_SIZE (MySQL/MariaDB)", + ) + ) todo_fields.append("rowCountQueryStyle") # countStarNullSizeBytesExpr — almost always null (default); omit; user adds manually for Dremio # schemaExistenceQueryStyle — omit if NONE (default) schema_style = probes.get("schemaExistenceQueryStyle", "NONE") if schema_style != "NONE": - lines.append(field("schemaExistenceQueryStyle", schema_style, - "auto-detected — schema enumeration style. " - "Valid: NONE (default), INFORMATION_SCHEMA, SHOW_SCHEMAS_LIKE, " - "SHOW_SCHEMAS_ITERATE (Hive/Trino), SYSCAT (DB2), " - "ALTER_SESSION (Oracle), SYS_SCHEMAS (SQL Server)")) + lines.append( + field( + "schemaExistenceQueryStyle", + schema_style, + "auto-detected — schema enumeration style. " + "Valid: NONE (default), INFORMATION_SCHEMA, SHOW_SCHEMAS_LIKE, " + "SHOW_SCHEMAS_ITERATE (Hive/Trino), SYSCAT (DB2), " + "ALTER_SESSION (Oracle), SYS_SCHEMAS (SQL Server)", + ) + ) detected_fields.append("schemaExistenceQueryStyle") else: detected_fields.append("schemaExistenceQueryStyle") # default — omitted @@ -936,95 +1114,156 @@ def field(name: str, value, comment: str = "") -> str: # dateArithmeticStyle — omit if STANDARD (default) date_arith = probes.get("dateArithmeticStyle", "STANDARD") if date_arith != "STANDARD": - lines.append(field("dateArithmeticStyle", date_arith, - "auto-detected — date arithmetic strategy. " - "Valid: STANDARD (default, ANSI fallback), DATEADD_DATEDIFF (SQL Server), " - "NUMTODSINTERVAL (Oracle), TIMESTAMP_ADD (BigQuery), TIMESTAMPDIFF_DB2 (Db2)")) + lines.append( + field( + "dateArithmeticStyle", + date_arith, + "auto-detected — date arithmetic strategy. " + "Valid: STANDARD (default, ANSI fallback), DATEADD_DATEDIFF (SQL Server), " + "NUMTODSINTERVAL (Oracle), TIMESTAMP_ADD (BigQuery), TIMESTAMPDIFF_DB2 (Db2)", + ) + ) detected_fields.append("dateArithmeticStyle") else: - detected_fields.append("dateArithmeticStyle") # default — omitted + detected_fields.append("dateArithmeticStyle") # default — omitted lines.append("") # ── Date arithmetic templates (only when non-null) ──────────────────────── int_ts = probes.get("intervalCalcDatetimeTimestampTemplate") int_dt = probes.get("intervalCalcDatetimeDateTemplate") - up_ts = probes.get("upperBoundDatetimeTimestampTemplate") - up_dt = probes.get("upperBoundDatetimeDateTemplate") + up_ts = probes.get("upperBoundDatetimeTimestampTemplate") + up_dt = probes.get("upperBoundDatetimeDateTemplate") has_templates = any(v and v != "null" for v in [int_ts, int_dt, up_ts, up_dt]) if has_templates: - lines.append("# ── Date arithmetic templates ─────────────────────────────────────────") - lines.append("# Placeholders: {col} = column name, MIN_{col} = min value, " - "MAX_{col} = max value, {interval} = midpoint expression") + lines.append( + "# ── Date arithmetic templates ─────────────────────────────────────────" + ) + lines.append( + "# Placeholders: {col} = column name, MIN_{col} = min value, " + "MAX_{col} = max value, {interval} = midpoint expression" + ) if int_ts and int_ts != "null": - lines.append(field("intervalCalcDatetimeTimestampTemplate", int_ts, "auto-detected")) + lines.append( + field("intervalCalcDatetimeTimestampTemplate", int_ts, "auto-detected") + ) detected_fields.append("intervalCalcDatetimeTimestampTemplate") if int_dt and int_dt != "null": - lines.append(field("intervalCalcDatetimeDateTemplate", int_dt, "auto-detected")) + lines.append( + field("intervalCalcDatetimeDateTemplate", int_dt, "auto-detected") + ) detected_fields.append("intervalCalcDatetimeDateTemplate") if up_ts and up_ts != "null": - lines.append(field("upperBoundDatetimeTimestampTemplate", up_ts, "auto-detected")) + lines.append( + field("upperBoundDatetimeTimestampTemplate", up_ts, "auto-detected") + ) detected_fields.append("upperBoundDatetimeTimestampTemplate") if up_dt and up_dt != "null": - lines.append(field("upperBoundDatetimeDateTemplate", up_dt, "auto-detected")) + lines.append( + field("upperBoundDatetimeDateTemplate", up_dt, "auto-detected") + ) detected_fields.append("upperBoundDatetimeDateTemplate") lines.append("") # ── Connectivity ────────────────────────────────────────────────────────── - lines.append("# ── Connectivity ─────────────────────────────────────────────────────") + lines.append( + "# ── Connectivity ─────────────────────────────────────────────────────" + ) # networkCapable: true (default) — omitted; readOnly: false (default) — omitted - lines.append("connectionProperties: {}" - " # TODO: key-value pairs injected into JDBC pool and Spark " - "(e.g. {ssl: 'true', charset: 'utf8'})") - lines.append("sessionInitStatements: []" - " # TODO: SQL statements run once after each new connection " - "(e.g. [\"SET SCHEMA mydb\", \"ALTER SESSION SET NLS_DATE_FORMAT='YYYY-MM-DD'\"])") + lines.append( + "connectionProperties: {}" + " # TODO: key-value pairs injected into JDBC pool and Spark " + "(e.g. {ssl: 'true', charset: 'utf8'})" + ) + lines.append( + "sessionInitStatements: []" + " # TODO: SQL statements run once after each new connection " + '(e.g. ["SET SCHEMA mydb", "ALTER SESSION SET NLS_DATE_FORMAT=\'YYYY-MM-DD\'"])' + ) todo_fields += ["connectionProperties", "sessionInitStatements"] lines.append("") # ── Spark JdbcDialect ───────────────────────────────────────────────────── - lines.append("# ── Spark JdbcDialect ────────────────────────────────────────────────") + lines.append( + "# ── Spark JdbcDialect ────────────────────────────────────────────────" + ) if dialect_class is not None: - lines.append(field("dialectClass", dialect_class, - "Auto-detected Spark JdbcDialect subclass")) + lines.append( + field( + "dialectClass", + dialect_class, + "Auto-detected Spark JdbcDialect subclass", + ) + ) detected_fields.append("dialectClass") else: - lines.append(field("dialectClass", None, - "TODO: fully-qualified JdbcDialect Scala object class to register with Spark " - "(e.g. com.example.MyDialect$); null if no custom Spark dialect is needed")) + lines.append( + field( + "dialectClass", + None, + "TODO: fully-qualified JdbcDialect Scala object class to register with Spark " + "(e.g. com.example.MyDialect$); null if no custom Spark dialect is needed", + ) + ) todo_fields.append("dialectClass") lines.append("") # ── URL construction ────────────────────────────────────────────────────── - lines.append("# ── URL construction ─────────────────────────────────────────────────") - lines.append("# Known placeholders: {host}, {port}, {database}, {schema}, {username}, {password}") + lines.append( + "# ── URL construction ─────────────────────────────────────────────────" + ) + lines.append( + "# Known placeholders: {host}, {port}, {database}, {schema}, {username}, {password}" + ) if jdbc_url_template: - lines.append(field("jdbcUrlTemplate", jdbc_url_template, - "auto-detected from probe URL — verify all placeholders are correct")) + lines.append( + field( + "jdbcUrlTemplate", + jdbc_url_template, + "auto-detected from probe URL — verify all placeholders are correct", + ) + ) detected_fields.append("jdbcUrlTemplate") else: - lines.append(field("jdbcUrlTemplate", "", - "TODO: URL template with {host}, {port}, {database} substitution tokens. " - "Example: jdbc:mydb://{host}:{port}/{database}")) + lines.append( + field( + "jdbcUrlTemplate", + "", + "TODO: URL template with {host}, {port}, {database} substitution tokens. " + "Example: jdbc:mydb://{host}:{port}/{database}", + ) + ) todo_fields.append("jdbcUrlTemplate") - lines.append("jdbcUrlStaticParams: []" - " # TODO: query params always appended to every URL " - "(e.g. [tcpKeepAlive=true, sslmode=prefer])") - lines.append("jdbcUrlConditionalParams: []" - " # TODO: params appended only when a form field is non-empty " - "(e.g. [{key: schema, param: 'currentSchema={schema}'}])") - lines.append("jdbcUrlAuthVariants: {}" - " # optional: auth_type -> full URL template override; leave empty if not needed") + lines.append( + "jdbcUrlStaticParams: []" + " # TODO: query params always appended to every URL " + "(e.g. [tcpKeepAlive=true, sslmode=prefer])" + ) + lines.append( + "jdbcUrlConditionalParams: []" + " # TODO: params appended only when a form field is non-empty " + "(e.g. [{key: schema, param: 'currentSchema={schema}'}])" + ) + lines.append( + "jdbcUrlAuthVariants: {}" + " # optional: auth_type -> full URL template override; leave empty if not needed" + ) todo_fields += ["jdbcUrlStaticParams", "jdbcUrlConditionalParams"] lines.append("") # ── Connection spec ──────────────────────────────────────────────────────── # Only mark a field required if the probe URL actually contained that component. # e.g. jdbc:sqlite:/path/to/db has no host or port → those fields are optional. - lines.append("# ── Connection spec (frontend form) ──────────────────────────────────") + lines.append( + "# ── Connection spec (frontend form) ──────────────────────────────────" + ) lines.append("# TODO: define the connection form fields shown in the UI.") - lines.append("# Each field: name, label, fieldType (string/integer/boolean/password/enum/file),") - lines.append("# required, defaultValue, hint, options (for enum), dependsOn, dependsOnValue") + lines.append( + "# Each field: name, label, fieldType (string/integer/boolean/password/enum/file)," + ) + lines.append( + "# required, defaultValue, hint, options (for enum), dependsOn, dependsOnValue" + ) lines.append("connectionSpec:") lines.append(" supportsEnrichment: false # custom drivers are source-only") lines.append(" fields:") @@ -1066,7 +1305,9 @@ def field(name: str, value, comment: str = "") -> str: def _strip_jdbc_credentials(jdbc_url: str) -> str: """Remove user/password from a JDBC URL for safe inclusion in prompts.""" cleaned = re.sub(r"(jdbc:[^:]+://)([^@/]+@)", r"\1", jdbc_url) - cleaned = re.sub(r"[?&](password|passwd|pwd)=[^&]*", "", cleaned, flags=re.IGNORECASE) + cleaned = re.sub( + r"[?&](password|passwd|pwd)=[^&]*", "", cleaned, flags=re.IGNORECASE + ) return cleaned @@ -1106,7 +1347,9 @@ def _call_deployment_llm(client, prompt: str) -> str | None: try: event = json.loads(data) if isinstance(event, dict) and event.get("type") == "text-delta": - text_parts.append(event.get("textDelta") or event.get("delta") or "") + text_parts.append( + event.get("textDelta") or event.get("delta") or "" + ) except json.JSONDecodeError: # Vercel AI SDK compact format: 0:"chunk" if re.match(r'^0:"', data): @@ -1136,7 +1379,9 @@ def _apply_llm_suggestions(yaml_content: str, suggestions: dict) -> tuple[str, i rationale = str(suggestion.get("rationale", "")).replace("\n", " ").strip() if value is not None: if isinstance(value, (list, dict)): - yaml_val = yaml.dump(value, default_flow_style=True).strip().rstrip("\n") + yaml_val = ( + yaml.dump(value, default_flow_style=True).strip().rstrip("\n") + ) elif isinstance(value, bool): yaml_val = str(value).lower() elif isinstance(value, (int, float)): @@ -1147,7 +1392,9 @@ def _apply_llm_suggestions(yaml_content: str, suggestions: dict) -> tuple[str, i yaml_val = yaml.dump(sv, default_flow_style=True).strip() else: yaml_val = sv - result_lines.append(f"{field_name}: {yaml_val} # LLM-suggested: {rationale}\n") + result_lines.append( + f"{field_name}: {yaml_val} # LLM-suggested: {rationale}\n" + ) applied += 1 continue result_lines.append(line) @@ -1247,7 +1494,7 @@ def generate_driver( typer.Option( "--dist-dir", help="Root dist directory for generated files. " - "YAML is written to /META-INF/jdbc-drivers/.yaml.", + "YAML is written to /META-INF/jdbc-drivers/.yaml.", show_default=True, ), ] = "dist", @@ -1323,7 +1570,9 @@ def generate_driver( # ── Build YAML ─────────────────────────────────────────────────────── detected_dialect = _detect_dialect_class(prefix, jar_path) - yaml_content, detected_fields, todo_fields = _build_yaml(prefix, probes, url, dialect_class=detected_dialect) + yaml_content, detected_fields, todo_fields = _build_yaml( + prefix, probes, url, dialect_class=detected_dialect + ) # ── Write output ───────────────────────────────────────────────────── try: @@ -1353,7 +1602,9 @@ def generate_driver( config = load_config() if config is None: - print("[dim] Not logged in to a Qualytics deployment — LLM TODO resolution skipped.[/dim]") + print( + "[dim] Not logged in to a Qualytics deployment — LLM TODO resolution skipped.[/dim]" + ) else: client = QualyticsClient( base_url=validate_and_format_url(config["url"]), @@ -1362,7 +1613,9 @@ def generate_driver( ) llm_status = client.get("agent/llm-config/status").json() if not llm_status.get("is_configured"): - print("[dim] No LLM integration configured on this deployment — TODO fields left as-is.[/dim]") + print( + "[dim] No LLM integration configured on this deployment — TODO fields left as-is.[/dim]" + ) else: db_product = probes.get("dbProductName") or "Unknown database" db_version = probes.get("dbProductVersion") or "" @@ -1393,29 +1646,45 @@ def generate_driver( Only include fields where you have reasonable confidence. Omit fields you are unsure about. Return ONLY valid JSON — no markdown, no code fences, no preamble. """) - with status(f"[bold cyan]Asking LLM to resolve {len(todo_items)} TODO field(s)...[/bold cyan]"): + with status( + f"[bold cyan]Asking LLM to resolve {len(todo_items)} TODO field(s)...[/bold cyan]" + ): llm_text = _call_deployment_llm(client, prompt) if not llm_text: - print("[yellow] LLM call returned no usable output — TODO fields left as-is.[/yellow]") + print( + "[yellow] LLM call returned no usable output — TODO fields left as-is.[/yellow]" + ) else: json_match = re.search(r"\{.*\}", llm_text, re.DOTALL) if not json_match: - print("[yellow] LLM response contained no JSON — TODO fields left as-is.[/yellow]") + print( + "[yellow] LLM response contained no JSON — TODO fields left as-is.[/yellow]" + ) else: try: suggestions = json.loads(json_match.group(0)) - updated_yaml, applied = _apply_llm_suggestions(yaml_content, suggestions) + updated_yaml, applied = _apply_llm_suggestions( + yaml_content, suggestions + ) if applied > 0: with open(out_path, "w") as fh: fh.write(updated_yaml) yaml_content = updated_yaml - print(f" [{BRAND}]LLM resolved {applied} TODO field(s).[/{BRAND}]") + print( + f" [{BRAND}]LLM resolved {applied} TODO field(s).[/{BRAND}]" + ) else: - print("[dim] LLM returned suggestions but none matched TODO fields.[/dim]") + print( + "[dim] LLM returned suggestions but none matched TODO fields.[/dim]" + ) except json.JSONDecodeError: - print("[yellow] LLM response could not be parsed as JSON — TODO fields left as-is.[/yellow]") + print( + "[yellow] LLM response could not be parsed as JSON — TODO fields left as-is.[/yellow]" + ) except Exception as exc: - print(f"[yellow] LLM TODO resolution error ({exc}) — TODO fields left as-is.[/yellow]") + print( + f"[yellow] LLM TODO resolution error ({exc}) — TODO fields left as-is.[/yellow]" + ) # ── Print summary ──────────────────────────────────────────────────── console = Console() @@ -1427,30 +1696,48 @@ def generate_driver( table.add_column("Status", min_width=12) probe_display = [ - ("className", probes.get("className")), - ("dbProductName", probes.get("dbProductName")), - ("dbProductVersion", probes.get("dbProductVersion")), - ("prefix (from URL)", prefix), - ("identifierQuoteChar", probes.get("identifierQuoteChar")), - ("transactionIsolation", probes.get("transactionIsolation")), - ("tableNameCasing", probes.get("tableNameCasing")), - ("validationQuery", probes.get("validationQuery")), - ("subqueryRequiresAlias", str(probes.get("subqueryRequiresAlias", True)).lower()), - ("getTablesUsesNullCatalog", str(probes.get("getTablesUsesNullCatalog", False)).lower()), - ("approxCountDistinctFunction", probes.get("approxCountDistinctFunction")), - ("rowCountQueryStyle", probes.get("rowCountQueryStyle")), - ("schemaExistenceQueryStyle", probes.get("schemaExistenceQueryStyle")), - ("schemaOnlyQueryStyle", probes.get("schemaOnlyQueryStyle")), - ("dateArithmeticStyle", probes.get("dateArithmeticStyle")), - ("rowLimitSyntax", probes.get("rowLimitSyntax")), - ("tableSampleTemplate", probes.get("tableSampleTemplate")), - ("viewSampleFallback", probes.get("viewSampleFallback")), - ("timestampLiteralStyle", probes.get("timestampLiteralStyle")), - ("dateLiteralStyle", probes.get("dateLiteralStyle")), - ("intervalCalcDatetimeTimestampTemplate", probes.get("intervalCalcDatetimeTimestampTemplate")), - ("intervalCalcDatetimeDateTemplate", probes.get("intervalCalcDatetimeDateTemplate")), - ("upperBoundDatetimeTimestampTemplate", probes.get("upperBoundDatetimeTimestampTemplate")), - ("upperBoundDatetimeDateTemplate", probes.get("upperBoundDatetimeDateTemplate")), + ("className", probes.get("className")), + ("dbProductName", probes.get("dbProductName")), + ("dbProductVersion", probes.get("dbProductVersion")), + ("prefix (from URL)", prefix), + ("identifierQuoteChar", probes.get("identifierQuoteChar")), + ("transactionIsolation", probes.get("transactionIsolation")), + ("tableNameCasing", probes.get("tableNameCasing")), + ("validationQuery", probes.get("validationQuery")), + ( + "subqueryRequiresAlias", + str(probes.get("subqueryRequiresAlias", True)).lower(), + ), + ( + "getTablesUsesNullCatalog", + str(probes.get("getTablesUsesNullCatalog", False)).lower(), + ), + ("approxCountDistinctFunction", probes.get("approxCountDistinctFunction")), + ("rowCountQueryStyle", probes.get("rowCountQueryStyle")), + ("schemaExistenceQueryStyle", probes.get("schemaExistenceQueryStyle")), + ("schemaOnlyQueryStyle", probes.get("schemaOnlyQueryStyle")), + ("dateArithmeticStyle", probes.get("dateArithmeticStyle")), + ("rowLimitSyntax", probes.get("rowLimitSyntax")), + ("tableSampleTemplate", probes.get("tableSampleTemplate")), + ("viewSampleFallback", probes.get("viewSampleFallback")), + ("timestampLiteralStyle", probes.get("timestampLiteralStyle")), + ("dateLiteralStyle", probes.get("dateLiteralStyle")), + ( + "intervalCalcDatetimeTimestampTemplate", + probes.get("intervalCalcDatetimeTimestampTemplate"), + ), + ( + "intervalCalcDatetimeDateTemplate", + probes.get("intervalCalcDatetimeDateTemplate"), + ), + ( + "upperBoundDatetimeTimestampTemplate", + probes.get("upperBoundDatetimeTimestampTemplate"), + ), + ( + "upperBoundDatetimeDateTemplate", + probes.get("upperBoundDatetimeDateTemplate"), + ), ] for name, value in probe_display: @@ -1465,15 +1752,20 @@ def generate_driver( console.print(table) console.print() - todo_count = sum( - 1 for _, v in probe_display if v is None or v == "null" - ) + todo_count = sum(1 for _, v in probe_display if v is None or v == "null") # Always-todo fields (not auto-detectable; need manual review or LLM assistance) always_todo = [ - "maxPartitionParallelism", "dataSizeLimit", - "systemSchemaExclusions", "systemSchemaExclusionPrefixes", "systemCatalogExclusions", - "connectionProperties", "sessionInitStatements", "dialectClass", - "jdbcUrlStaticParams", "jdbcUrlConditionalParams", "connectionSpec", + "maxPartitionParallelism", + "dataSizeLimit", + "systemSchemaExclusions", + "systemSchemaExclusionPrefixes", + "systemCatalogExclusions", + "connectionProperties", + "sessionInitStatements", + "dialectClass", + "jdbcUrlStaticParams", + "jdbcUrlConditionalParams", + "connectionSpec", ] total_todo = todo_count + len(always_todo) auto_detected = len(probe_display) - todo_count @@ -1494,6 +1786,7 @@ def generate_driver( "\n [dim]to bundle all drivers into custom-drivers.jar[/dim]\n" ) + # --------------------------------------------------------------------------- # drivers package command # --------------------------------------------------------------------------- @@ -1506,7 +1799,7 @@ def package_drivers( typer.Option( "--dist-dir", help="Root dist directory produced by 'drivers generate'. " - "Must contain META-INF/jdbc-drivers/.", + "Must contain META-INF/jdbc-drivers/.", show_default=True, ), ] = "dist", From 1a68abca64b36488882ebb9aa9e12cc5ddaf0588 Mon Sep 17 00:00:00 2001 From: Eric Simmerman Date: Sun, 5 Apr 2026 19:51:52 -0400 Subject: [PATCH 10/10] linting --- qualytics/cli/generate_driver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/qualytics/cli/generate_driver.py b/qualytics/cli/generate_driver.py index 3427d65..eb65e8b 100644 --- a/qualytics/cli/generate_driver.py +++ b/qualytics/cli/generate_driver.py @@ -10,7 +10,7 @@ import tempfile import textwrap import zipfile -from typing import Annotated, Optional +from typing import Annotated import typer import yaml @@ -1457,7 +1457,7 @@ def generate_driver( ), ], user: Annotated[ - Optional[str], + str | None, typer.Option( "--user", help="Database username.", @@ -1465,7 +1465,7 @@ def generate_driver( ), ] = None, password: Annotated[ - Optional[str], + str | None, typer.Option( "--password", help="Database password.", @@ -1473,7 +1473,7 @@ def generate_driver( ), ] = None, properties: Annotated[ - Optional[list[str]], + list[str] | None, typer.Option( "--properties", help="Extra JDBC connection properties as key=value pairs (repeatable).", @@ -1481,7 +1481,7 @@ def generate_driver( ), ] = None, output: Annotated[ - Optional[str], + str | None, typer.Option( "--output", "-o", @@ -1804,7 +1804,7 @@ def package_drivers( ), ] = "dist", output: Annotated[ - Optional[str], + str | None, typer.Option( "--output", "-o",