From 2b76dd5cbcdfc73e970967b32cca412fdfce6b13 Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 23 Jun 2026 06:33:39 +0800 Subject: [PATCH 1/6] Extract shared export engine; wire CLI to summary cache and drop mypy override --- pyproject.toml | 6 - scripts/export.py | 460 +++++++------------------------ services/export_engine.py | 491 ++++++++++++++++++++++++++++++++++ services/workspace_listing.py | 34 +-- 4 files changed, 602 insertions(+), 389 deletions(-) create mode 100644 services/export_engine.py diff --git a/pyproject.toml b/pyproject.toml index 678f218..a49ac40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,12 +98,6 @@ pretty = true # Anchored regexes — unanchored `venv/` would match any path segment containing "venv/". exclude = ["^venv/", "^\\.venv/", "^build/", "^dist/"] -# Standalone CLI export script (~985 LOC) duplicates utils/ helpers; typed -# incrementally — issue #100 allows per-module override until consolidated. -[[tool.mypy.overrides]] -module = "scripts.export" -ignore_errors = true - # Test modules use unittest/pytest patterns that are not worth strict-checking # alongside production code; route handlers and utils are fully strict. [[tool.mypy.overrides]] diff --git a/scripts/export.py b/scripts/export.py index cd36454..7f8abd1 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -10,14 +10,16 @@ The guard below is only necessary for direct invocation (``python scripts/export.py``). """ +from __future__ import annotations + import json import logging import os -import sqlite3 import sys import zipfile from datetime import datetime from pathlib import Path +from typing import Literal, TypedDict # sys.path guard: only needed when the script is invoked directly # (``python scripts/export.py``). When installed via the pyproject.toml @@ -28,48 +30,27 @@ if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) +from models import ExportEntry, SchemaError # noqa: E402 +from services.export_engine import collect_export_entries # noqa: E402 from utils.exclusion_rules import ( # noqa: E402 - resolve_exclusion_rules_path, load_rules, - build_searchable_text, - is_excluded_by_rules, + resolve_exclusion_rules_path, ) from utils.path_helpers import to_epoch_ms # noqa: E402 -from utils.text_extract import ( # noqa: E402 - extract_text_from_bubble, - slug, -) -from utils.workspace_path import ( # noqa: E402 - get_cli_chats_path, - resolve_workspace_path, -) -from utils.cli_chat_reader import ( # noqa: E402 - list_cli_projects, - traverse_blobs, - messages_to_bubbles, -) -from utils.cursor_md_exporter import ( # noqa: E402 - cursor_cli_session_to_markdown, - cursor_ide_chat_to_markdown, -) -from models import Bubble, ExportEntry, SchemaError # noqa: E402 -from services.workspace_context import ( # noqa: E402 - enrich_workspace_context_from_global_db, - resolve_workspace_context, -) -from services.workspace_db import ( # noqa: E402 - load_code_block_diff_map, - open_global_db, -) -from services.workspace_resolver import ( # noqa: E402 - determine_project_for_conversation, - infer_invalid_workspace_aliases, - lookup_workspace_display_name, -) +from utils.workspace_path import resolve_workspace_path # noqa: E402 _logger = logging.getLogger(__name__) +class ExportCliOptions(TypedDict): + since: Literal["all", "last"] + out_dir: str + include_composer: bool + zip: bool + exclusion_rules_path: str | None + base_dir: str | None + + def configure_cli_logging() -> None: """Route log records to stderr so stdout stays for export progress lines.""" root = logging.getLogger() @@ -82,17 +63,9 @@ def configure_cli_logging() -> None: ) -def json_dump_safe(value) -> str: - """Best-effort JSON serialization for exclusion matching.""" - try: - return json.dumps(value, ensure_ascii=False, sort_keys=True) - except Exception: - return str(value) if value is not None else "" - - -def load_manifest_entries(manifest_path: str) -> dict: +def load_manifest_entries(manifest_path: str) -> dict[str, dict[str, object]]: """Load manifest entries keyed by log_id from a JSONL file.""" - existing: dict = {} + existing: dict[str, dict[str, object]] = {} if not os.path.isfile(manifest_path): return existing try: @@ -113,7 +86,10 @@ def load_manifest_entries(manifest_path: str) -> dict: return existing -def write_manifest_entries(manifest_path: str, entries_by_id: dict): +def write_manifest_entries( + manifest_path: str, + entries_by_id: dict[str, dict[str, object]], +) -> None: """Write manifest entries to JSONL.""" os.makedirs(os.path.dirname(manifest_path), exist_ok=True) with open(manifest_path, "w", encoding="utf-8") as f: @@ -132,8 +108,9 @@ def get_global_state_dir() -> str: return os.path.join(str(Path.home()), ".cursor-chat-browser") -def parse_args(): +def parse_args() -> ExportCliOptions: import argparse + parser = argparse.ArgumentParser( description="Export Cursor chat history to Markdown files.", epilog=( @@ -143,23 +120,42 @@ def parse_args(): ), formatter_class=argparse.RawDescriptionHelpFormatter, ) - parser.add_argument("--since", choices=["all", "last"], default="all", - help="Export all chats or only those updated since last export. Default: all") - parser.add_argument("--out", default=".", - help="Output directory. Default: current working directory (.)") - parser.add_argument("--no-zip", action="store_true", default=False, - help="Write individual Markdown files instead of a zip archive.") - parser.add_argument("--no-composer", action="store_true", default=False, - help="Exclude composer logs (export only chat logs).") - parser.add_argument("--base-dir", default=None, - help="Override Cursor workspaceStorage path (also settable via WORKSPACE_PATH env var).") parser.add_argument( - "--exclude-rules", "-e", + "--since", + choices=["all", "last"], + default="all", + help="Export all chats or only those updated since last export. Default: all", + ) + parser.add_argument( + "--out", + default=".", + help="Output directory. Default: current working directory (.)", + ) + parser.add_argument( + "--no-zip", + action="store_true", + default=False, + help="Write individual Markdown files instead of a zip archive.", + ) + parser.add_argument( + "--no-composer", + action="store_true", + default=False, + help="Exclude composer logs (export only chat logs).", + ) + parser.add_argument( + "--base-dir", + default=None, + help="Override Cursor workspaceStorage path (also settable via WORKSPACE_PATH env var).", + ) + parser.add_argument( + "--exclude-rules", + "-e", default=None, metavar="PATH", dest="exclude_rules", help="Path to exclusion rules file (sensitive projects/chats are omitted). " - "If omitted, uses ~/.cursor-chat-browser/exclusion-rules.txt if present.", + "If omitted, uses ~/.cursor-chat-browser/exclusion-rules.txt if present.", ) args = parser.parse_args() return { @@ -172,311 +168,52 @@ def parse_args(): } -def main(): +def _read_last_export_ms(state_path: str, since: Literal["all", "last"]) -> int: + if since != "last" or not os.path.isfile(state_path): + return 0 + try: + with open(state_path, "r", encoding="utf-8") as f: + st = json.load(f) + ts = st.get("lastExportTime") + if ts: + return int( + datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp() * 1000, + ) + except (json.JSONDecodeError, ValueError, OSError) as e: + _logger.warning( + "Could not read last export timestamp; defaulting to full export: %s", + e, + ) + return 0 + + +def main() -> None: configure_cli_logging() opts = parse_args() since = opts["since"] out_dir = os.path.abspath(opts["out_dir"]) use_zip = opts["zip"] - exclusion_rules = load_rules(resolve_exclusion_rules_path(opts.get("exclusion_rules_path"))) - if opts.get("base_dir"): - os.environ["WORKSPACE_PATH"] = opts["base_dir"] + exclusion_rules = load_rules( + resolve_exclusion_rules_path(opts.get("exclusion_rules_path")), + ) + base_dir = opts.get("base_dir") + if base_dir: + os.environ["WORKSPACE_PATH"] = base_dir workspace_path = resolve_workspace_path() state_dir = get_global_state_dir() state_path = os.path.join(state_dir, "export_state.json") - last_export = 0 - if since == "last" and os.path.isfile(state_path): - try: - with open(state_path, "r", encoding="utf-8") as f: - st = json.load(f) - ts = st.get("lastExportTime") - if ts: - last_export = int(datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp() * 1000) - except (json.JSONDecodeError, ValueError, OSError) as e: - _logger.warning( - "Could not read last export timestamp; defaulting to full export: %s", - e, - ) - - # ── Workspace scanning via service layer ────────────────────────────────── - ctx = resolve_workspace_context(workspace_path) - workspace_entries = ctx.workspace_entries - invalid_workspace_ids = ctx.invalid_workspace_ids - project_name_map = ctx.project_name_to_workspace_id - workspace_path_map = ctx.workspace_path_to_id - composer_id_to_ws = ctx.composer_id_to_workspace_id - - # Build display-name and slug maps from workspace entries. - # Entries whose workspace.json cannot be resolved are omitted so the - # usage-site fallback (slug(ws_id[:12])) applies — matching original - # behaviour where unresolvable workspaces were skipped. - workspace_id_to_display_name: dict[str, str] = {} - workspace_id_to_slug: dict[str, str] = {} - for entry in workspace_entries: - display = lookup_workspace_display_name(workspace_path, entry["name"]) - if display != entry["name"]: # successfully resolved a human-readable name - workspace_id_to_display_name[entry["name"]] = display - workspace_id_to_slug[entry["name"]] = slug(display) - - # ── Database reading via service layer ──────────────────────────────────── - project_layouts_map: dict = {} - bubble_map: dict[str, Bubble] = {} - code_block_diff_map: dict = {} - ide_composer_rows: list = [] - invalid_workspace_aliases: dict = {} - - with open_global_db(workspace_path) as (global_db, global_db_path): - if global_db is None: - _logger.info( - "Cursor IDE global storage not found at %s — skipping IDE chats.", - global_db_path, - ) - else: - ctx = enrich_workspace_context_from_global_db( - ctx, - global_db, - populate_project_layouts=True, - populate_bubble_map=True, - ) - project_layouts_map = ctx.project_layouts_map - bubble_map = ctx.bubble_map - code_block_diff_map = load_code_block_diff_map(global_db) - - try: - ide_composer_rows = global_db.execute( - "SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'" - " AND value LIKE '%fullConversationHeadersOnly%'" - ).fetchall() - except sqlite3.Error: - pass - - invalid_workspace_aliases = infer_invalid_workspace_aliases( - composer_rows=ide_composer_rows, - project_layouts_map=project_layouts_map, - project_name_map=project_name_map, - workspace_path_map=workspace_path_map, - workspace_entries=workspace_entries, - bubble_map=bubble_map, - composer_id_to_ws=composer_id_to_ws, - invalid_workspace_ids=invalid_workspace_ids, - ) - - today = datetime.now().strftime("%Y-%m-%d") - exported = [] - count = 0 - - # ── Process IDE composers ──────────────────────────────────────────────── - include_composer = opts.get("include_composer", True) - for row in ide_composer_rows if include_composer else []: - composer_id = row["key"].split(":")[1] - try: - cd = json.loads(row["value"]) - except (json.JSONDecodeError, ValueError) as parse_err: - _logger.debug( - "Skipping corrupt composerData row %s: %s", - composer_id, - parse_err, - ) - continue - - headers = cd.get("fullConversationHeadersOnly") or [] - if not headers: - continue - - updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) - if updated_at is None: - updated_at = to_epoch_ms(cd.get("createdAt")) - if updated_at is None: - updated_at = 0 - if since == "last" and updated_at <= last_export: - continue - - # Workspace assignment via service layer - pid = determine_project_for_conversation( - cd, composer_id, project_layouts_map, - project_name_map, workspace_path_map, - workspace_entries, bubble_map, composer_id_to_ws, invalid_workspace_ids, - ) - mapped_ws = composer_id_to_ws.get(composer_id) - if not pid and mapped_ws in invalid_workspace_ids: - pid = invalid_workspace_aliases.get(mapped_ws) - ws_id = pid if pid else "global" - - ws_slug = "other-chats" if ws_id == "global" else (workspace_id_to_slug.get(ws_id) or slug(ws_id[:12])) - ws_display_name = "Other chats" if ws_id == "global" else (workspace_id_to_display_name.get(ws_id) or ws_slug) - title = cd.get("name") or f"Chat {composer_id[:8]}" - model_config = cd.get("modelConfig") or {} - model_name = model_config.get("modelName") - model_names = [model_name] if model_name and model_name != "default" else None - - # Build broad text for exclusion checks so any visible output term can match. - # CLI export intentionally includes metadata/tool payload text in addition to - # bubble text because these fields are emitted into exported markdown. - bubble_texts = [] - bubble_meta_parts = [] - for h in headers: - b = bubble_map.get(h.get("bubbleId")) - if not b: - continue - text = extract_text_from_bubble(b) - if text: - bubble_texts.append(text) - bubble_meta_parts.append(json_dump_safe(b)) - - code_diff_parts = [json_dump_safe(d) for d in code_block_diff_map.get(composer_id, [])] - searchable = build_searchable_text( - project_name=ws_display_name, - chat_title=title, - model_names=model_names, - chat_content_snippet="\n\n".join( - p - for p in ( - bubble_texts - + bubble_meta_parts - + code_diff_parts - + [json_dump_safe(model_config), json_dump_safe(cd)] - ) - if p - ), - ) - if is_excluded_by_rules(exclusion_rules, searchable): - continue - - title_slug = slug(title) - ts = updated_at or int(datetime.now().timestamp() * 1000) - ts_str = datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%dT%H-%M-%S") - filename = f"{ts_str}__{title_slug}__{composer_id[:8]}.md" - out_path = os.path.join(out_dir, today, ws_slug, "chat", filename) - - # Markdown generation via shared exporter - md = cursor_ide_chat_to_markdown( - composer_data=cd, - composer_id=composer_id, - bubble_map=bubble_map, - code_block_diff_map=code_block_diff_map, - workspace_info={"ws_slug": ws_slug, "ws_display_name": ws_display_name}, - ) - - rel_path = os.path.join(today, ws_slug, "chat", filename) - exported.append({ - "id": composer_id, - "rel_path": rel_path, - "content": md, - "out_path": out_path, - "updatedAt": updated_at, - "title": title, - "workspace": ws_display_name, - }) - count += 1 - - # ── Cursor CLI sessions ────────────────────────────────────────────────── - try: - cli_projects = list_cli_projects(get_cli_chats_path()) - except Exception as e: - _logger.warning( - "Could not enumerate CLI chats: %s (%s) — skipping", - e, - type(e).__name__, - exc_info=True, - ) - cli_projects = [] - - for cp in cli_projects: - ws_name = cp["workspace_name"] or cp["project_id"][:12] - ws_slug_cli = slug(ws_name) - - if is_excluded_by_rules(exclusion_rules, build_searchable_text(project_name=ws_name)): - continue - - for session in cp["sessions"]: - meta = session.get("meta", {}) - session_id = session["session_id"] - created_ms: int = meta.get("createdAt") or int(datetime.now().timestamp() * 1000) - session_name = meta.get("name") or f"Session {session_id[:8]}" - - # Use the store.db mtime as a proxy for "last updated" — createdAt - # is immutable and would cause sessions with new turns to be skipped. - try: - db_mtime_ms = int(os.path.getmtime(session["db_path"]) * 1000) - except OSError: - db_mtime_ms = created_ms - updated_ms = max(created_ms, db_mtime_ms) - - if since == "last" and updated_ms <= last_export: - continue - - try: - messages = traverse_blobs(session["db_path"]) - bubbles = messages_to_bubbles(messages, created_ms) - except Exception as e: - _logger.warning( - "Could not read CLI session %s: %s (%s)", - session_id, - e, - type(e).__name__, - exc_info=True, - ) - continue - - if not bubbles: - continue - - # Derive title for the filename (shared exporter does it too, but - # we need it here first to build the output path). - title = session_name - if not title or title.startswith("New Agent"): - for b in bubbles: - if b["type"] == "user" and b.get("text"): - first_lines = [ln for ln in b["text"].split("\n") if ln.strip()] - if first_lines: - title = first_lines[0][:100] - if len(title) == 100: - title += "..." - break - - bubble_texts = [b["text"] for b in bubbles if b.get("text")] - tool_call_texts = [ - tc.get("input", "") or tc.get("summary", "") - for b in bubbles - for tc in (b.get("metadata") or {}).get("toolCalls") or [] - ] - searchable = build_searchable_text( - project_name=ws_name, - chat_title=title, - chat_content_snippet="\n\n".join(bubble_texts + tool_call_texts), - ) - if is_excluded_by_rules(exclusion_rules, searchable): - continue - - title_slug = slug(title) - ts_str = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%dT%H-%M-%S") - filename = f"{ts_str}__{title_slug}__{session_id[:8]}.md" - out_path = os.path.join(out_dir, today, ws_slug_cli, "cli", filename) - - md = cursor_cli_session_to_markdown( - session["db_path"], - session_meta=meta, - workspace_info={ - "workspace": ws_slug_cli, - "workspace_name": ws_name, - "workspace_path": cp.get("workspace_path"), - "project_id": cp["project_id"], - }, - bubbles=bubbles, - title_override=title, - ) - rel_path = os.path.join(today, ws_slug_cli, "cli", filename) - exported.append({ - "id": session_id, - "rel_path": rel_path, - "content": md, - "out_path": out_path, - "updatedAt": updated_ms, - "title": title, - "workspace": ws_name, - }) - count += 1 + last_export = _read_last_export_ms(state_path, since) + + exported = collect_export_entries( + workspace_path=workspace_path, + exclusion_rules=exclusion_rules, + since=since, + last_export_ms=last_export, + out_dir=out_dir, + include_composer=opts.get("include_composer", True), + ) + count = len(exported) if count == 0: label = " since last export" if since == "last" else "" @@ -484,6 +221,7 @@ def main(): sys.exit(0) os.makedirs(out_dir, exist_ok=True) + today = datetime.now().strftime("%Y-%m-%d") if use_zip: zip_name = f"cursor-export-{today}.zip" @@ -506,7 +244,11 @@ def main(): "title": entry["title"], "workspace": entry["workspace"], "path": os.path.relpath(entry["out_path"], out_dir), - "updated_at": datetime.fromtimestamp(entry["updatedAt"] / 1000).isoformat() if entry["updatedAt"] else datetime.now().isoformat(), + "updated_at": ( + datetime.fromtimestamp(entry["updatedAt"] / 1000).isoformat() + if entry["updatedAt"] + else datetime.now().isoformat() + ), } if existing: write_manifest_entries(manifest_path, existing) @@ -519,7 +261,11 @@ def main(): "title": entry["title"], "workspace": entry["workspace"], "path": entry["out_path"], - "updated_at": datetime.fromtimestamp(entry["updatedAt"] / 1000).isoformat() if entry["updatedAt"] else datetime.now().isoformat(), + "updated_at": ( + datetime.fromtimestamp(entry["updatedAt"] / 1000).isoformat() + if entry["updatedAt"] + else datetime.now().isoformat() + ), } if global_existing: write_manifest_entries(global_manifest_path, global_existing) diff --git a/services/export_engine.py b/services/export_engine.py new file mode 100644 index 0000000..ca7c086 --- /dev/null +++ b/services/export_engine.py @@ -0,0 +1,491 @@ +"""Shared export orchestration for CLI and web paths.""" + +from __future__ import annotations + +import json +import logging +import os +import sqlite3 +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Literal, TypedDict + +from models import Bubble +from services.summary_cache import fingerprint_workspace_storage +from services.workspace_context import ( + WorkspaceContext, + enrich_workspace_context_from_global_db, + resolve_workspace_context_cached, +) +from services.workspace_db import ( + COMPOSER_ROWS_WITH_HEADERS_SQL, + collect_workspace_entries, + global_storage_db_path, + load_code_block_diff_map, + open_global_db, + safe_fetchall, +) +from services.workspace_resolver import ( + determine_project_for_conversation, + infer_invalid_workspace_aliases, + lookup_workspace_display_name, +) +from utils.cli_chat_reader import ( + list_cli_projects, + messages_to_bubbles, + traverse_blobs, +) +from utils.cursor_md_exporter import ( + cursor_cli_session_to_markdown, + cursor_ide_chat_to_markdown, +) +from utils.exclusion_rules import build_searchable_text, is_excluded_by_rules +from utils.path_helpers import to_epoch_ms +from utils.text_extract import extract_text_from_bubble, slug +from utils.workspace_path import get_cli_chats_path + +_logger = logging.getLogger(__name__) + +SinceMode = Literal["all", "last"] + + +class ExportEntry(TypedDict): + """One exportable conversation with rendered markdown.""" + + id: str + rel_path: str + content: str + out_path: str + updatedAt: int + title: str + workspace: str + + +@dataclass(frozen=True) +class WorkspaceOrchestration: + """Precomputed workspace maps shared by listing and export.""" + + workspace_path: str + workspace_entries: list[dict[str, Any]] + fingerprint: dict[str, Any] + ctx: WorkspaceContext + workspace_id_to_display_name: dict[str, str] + workspace_id_to_slug: dict[str, str] + + +@dataclass(frozen=True) +class GlobalDbExportData: + """Global KV data loaded for export orchestration.""" + + project_layouts_map: dict[str, list[str]] + bubble_map: dict[str, Bubble] + code_block_diff_map: dict[str, list[Any]] + ide_composer_rows: list[sqlite3.Row] + invalid_workspace_aliases: dict[str, str] + + +def json_dump_safe(value: object) -> str: + """Best-effort JSON serialization for exclusion matching.""" + try: + return json.dumps(value, ensure_ascii=False, sort_keys=True) + except Exception: + return str(value) if value is not None else "" + + +def build_workspace_display_maps( + workspace_path: str, + workspace_entries: list[dict[str, Any]], +) -> tuple[dict[str, str], dict[str, str]]: + """Build display-name and slug maps from workspace entries. + + Entries whose ``workspace.json`` cannot be resolved are omitted so the + usage-site fallback (``slug(ws_id[:12])``) applies. + """ + workspace_id_to_display_name: dict[str, str] = {} + workspace_id_to_slug: dict[str, str] = {} + for entry in workspace_entries: + display = lookup_workspace_display_name(workspace_path, entry["name"]) + if display != entry["name"]: + workspace_id_to_display_name[entry["name"]] = display + workspace_id_to_slug[entry["name"]] = slug(display) + return workspace_id_to_display_name, workspace_id_to_slug + + +def prepare_workspace_orchestration( + workspace_path: str, + rules: list[Any], + *, + nocache: bool = False, + workspace_entries: list[dict[str, Any]] | None = None, +) -> WorkspaceOrchestration: + """Scan workspace storage and resolve maps (with summary-cache fingerprint).""" + entries = ( + workspace_entries + if workspace_entries is not None + else collect_workspace_entries(workspace_path) + ) + gdb = global_storage_db_path(workspace_path) + cli_path = get_cli_chats_path() + fingerprint = fingerprint_workspace_storage( + workspace_path, + entries, + global_db_path=gdb if os.path.isfile(gdb) else None, + rules=rules, + cli_chats_path=cli_path if os.path.isdir(cli_path) else None, + ) + ctx = resolve_workspace_context_cached( + workspace_path, + rules, + workspace_entries=entries, + nocache=nocache, + ) + display_name, slug_map = build_workspace_display_maps(workspace_path, entries) + return WorkspaceOrchestration( + workspace_path=workspace_path, + workspace_entries=entries, + fingerprint=fingerprint, + ctx=ctx, + workspace_id_to_display_name=display_name, + workspace_id_to_slug=slug_map, + ) + + +def load_global_db_export_data( + orch: WorkspaceOrchestration, +) -> GlobalDbExportData | None: + """Load global DB maps needed for IDE composer export.""" + ctx = orch.ctx + project_layouts_map: dict[str, list[str]] = {} + bubble_map: dict[str, Bubble] = {} + code_block_diff_map: dict[str, list[Any]] = {} + ide_composer_rows: list[sqlite3.Row] = [] + invalid_workspace_aliases: dict[str, str] = {} + + with open_global_db(orch.workspace_path) as (global_db, global_db_path): + if global_db is None: + _logger.info( + "Cursor IDE global storage not found at %s — skipping IDE chats.", + global_db_path, + ) + return None + + enriched = enrich_workspace_context_from_global_db( + ctx, + global_db, + populate_project_layouts=True, + populate_bubble_map=True, + ) + project_layouts_map = enriched.project_layouts_map + bubble_map = enriched.bubble_map + code_block_diff_map = load_code_block_diff_map(global_db) + ide_composer_rows = safe_fetchall(global_db, COMPOSER_ROWS_WITH_HEADERS_SQL) + + invalid_workspace_aliases = infer_invalid_workspace_aliases( + composer_rows=ide_composer_rows, + project_layouts_map=project_layouts_map, + project_name_map=ctx.project_name_to_workspace_id, + workspace_path_map=ctx.workspace_path_to_id, + workspace_entries=orch.workspace_entries, + bubble_map=bubble_map, + composer_id_to_ws=ctx.composer_id_to_workspace_id, + invalid_workspace_ids=ctx.invalid_workspace_ids, + ) + + return GlobalDbExportData( + project_layouts_map=project_layouts_map, + bubble_map=bubble_map, + code_block_diff_map=code_block_diff_map, + ide_composer_rows=ide_composer_rows, + invalid_workspace_aliases=invalid_workspace_aliases, + ) + + +def _collect_ide_export_entries( + *, + orch: WorkspaceOrchestration, + db_data: GlobalDbExportData, + exclusion_rules: list[Any], + since: SinceMode, + last_export_ms: int, + today: str, + out_dir: str, +) -> list[ExportEntry]: + ctx = orch.ctx + exported: list[ExportEntry] = [] + for row in db_data.ide_composer_rows: + composer_id = row["key"].split(":")[1] + try: + cd = json.loads(row["value"]) + except (json.JSONDecodeError, ValueError) as parse_err: + _logger.debug( + "Skipping corrupt composerData row %s: %s", + composer_id, + parse_err, + ) + continue + + headers = cd.get("fullConversationHeadersOnly") or [] + if not headers: + continue + + updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) + if updated_at is None: + updated_at = to_epoch_ms(cd.get("createdAt")) + if updated_at is None: + updated_at = 0 + if since == "last" and updated_at <= last_export_ms: + continue + + pid = determine_project_for_conversation( + cd, + composer_id, + db_data.project_layouts_map, + ctx.project_name_to_workspace_id, + ctx.workspace_path_to_id, + orch.workspace_entries, + db_data.bubble_map, + ctx.composer_id_to_workspace_id, + ctx.invalid_workspace_ids, + ) + mapped_ws = ctx.composer_id_to_workspace_id.get(composer_id) + if not pid and mapped_ws in ctx.invalid_workspace_ids: + pid = db_data.invalid_workspace_aliases.get(mapped_ws) + ws_id = pid if pid else "global" + + ws_slug = ( + "other-chats" + if ws_id == "global" + else (orch.workspace_id_to_slug.get(ws_id) or slug(ws_id[:12])) + ) + ws_display_name = ( + "Other chats" + if ws_id == "global" + else (orch.workspace_id_to_display_name.get(ws_id) or ws_slug) + ) + title = cd.get("name") or f"Chat {composer_id[:8]}" + model_config = cd.get("modelConfig") or {} + model_name = model_config.get("modelName") + model_names = [model_name] if model_name and model_name != "default" else None + + bubble_texts: list[str] = [] + bubble_meta_parts: list[str] = [] + for h in headers: + b = db_data.bubble_map.get(h.get("bubbleId")) + if not b: + continue + text = extract_text_from_bubble(b) + if text: + bubble_texts.append(text) + bubble_meta_parts.append(json_dump_safe(b)) + + code_diff_parts = [ + json_dump_safe(d) for d in db_data.code_block_diff_map.get(composer_id, []) + ] + searchable = build_searchable_text( + project_name=ws_display_name, + chat_title=title, + model_names=model_names, + chat_content_snippet="\n\n".join( + p + for p in ( + bubble_texts + + bubble_meta_parts + + code_diff_parts + + [json_dump_safe(model_config), json_dump_safe(cd)] + ) + if p + ), + ) + if is_excluded_by_rules(exclusion_rules, searchable): + continue + + title_slug = slug(title) + ts = updated_at or int(datetime.now().timestamp() * 1000) + ts_str = datetime.fromtimestamp(ts / 1000).strftime("%Y-%m-%dT%H-%M-%S") + filename = f"{ts_str}__{title_slug}__{composer_id[:8]}.md" + out_path = os.path.join(out_dir, today, ws_slug, "chat", filename) + + md = cursor_ide_chat_to_markdown( + composer_data=cd, + composer_id=composer_id, + bubble_map=db_data.bubble_map, + code_block_diff_map=db_data.code_block_diff_map, + workspace_info={"ws_slug": ws_slug, "ws_display_name": ws_display_name}, + ) + + rel_path = os.path.join(today, ws_slug, "chat", filename) + exported.append({ + "id": composer_id, + "rel_path": rel_path, + "content": md, + "out_path": out_path, + "updatedAt": updated_at, + "title": title, + "workspace": ws_display_name, + }) + return exported + + +def _collect_cli_export_entries( + *, + exclusion_rules: list[Any], + since: SinceMode, + last_export_ms: int, + today: str, + out_dir: str, +) -> list[ExportEntry]: + exported: list[ExportEntry] = [] + try: + cli_projects = list_cli_projects(get_cli_chats_path()) + except Exception as e: + _logger.warning( + "Could not enumerate CLI chats: %s (%s) — skipping", + e, + type(e).__name__, + exc_info=True, + ) + cli_projects = [] + + for cp in cli_projects: + ws_name = cp["workspace_name"] or cp["project_id"][:12] + ws_slug_cli = slug(ws_name) + + if is_excluded_by_rules( + exclusion_rules, build_searchable_text(project_name=ws_name), + ): + continue + + for session in cp["sessions"]: + meta = session.get("meta", {}) + session_id = session["session_id"] + created_ms: int = meta.get("createdAt") or int( + datetime.now().timestamp() * 1000, + ) + session_name = meta.get("name") or f"Session {session_id[:8]}" + + try: + db_mtime_ms = int(os.path.getmtime(session["db_path"]) * 1000) + except OSError: + db_mtime_ms = created_ms + updated_ms = max(created_ms, db_mtime_ms) + + if since == "last" and updated_ms <= last_export_ms: + continue + + try: + messages = traverse_blobs(session["db_path"]) + bubbles = messages_to_bubbles(messages, created_ms) + except Exception as e: + _logger.warning( + "Could not read CLI session %s: %s (%s)", + session_id, + e, + type(e).__name__, + exc_info=True, + ) + continue + + if not bubbles: + continue + + title = session_name + if not title or title.startswith("New Agent"): + for b in bubbles: + if b["type"] == "user" and b.get("text"): + first_lines = [ + ln for ln in b["text"].split("\n") if ln.strip() + ] + if first_lines: + title = first_lines[0][:100] + if len(title) == 100: + title += "..." + break + + bubble_texts = [b["text"] for b in bubbles if b.get("text")] + tool_call_texts = [ + tc.get("input", "") or tc.get("summary", "") + for b in bubbles + for tc in (b.get("metadata") or {}).get("toolCalls") or [] + ] + searchable = build_searchable_text( + project_name=ws_name, + chat_title=title, + chat_content_snippet="\n\n".join(bubble_texts + tool_call_texts), + ) + if is_excluded_by_rules(exclusion_rules, searchable): + continue + + title_slug = slug(title) + ts_str = datetime.fromtimestamp(created_ms / 1000).strftime( + "%Y-%m-%dT%H-%M-%S", + ) + filename = f"{ts_str}__{title_slug}__{session_id[:8]}.md" + out_path = os.path.join(out_dir, today, ws_slug_cli, "cli", filename) + + md = cursor_cli_session_to_markdown( + session["db_path"], + session_meta=meta, + workspace_info={ + "workspace": ws_slug_cli, + "workspace_name": ws_name, + "workspace_path": cp.get("workspace_path"), + "project_id": cp["project_id"], + }, + bubbles=bubbles, + title_override=title, + ) + rel_path = os.path.join(today, ws_slug_cli, "cli", filename) + exported.append({ + "id": session_id, + "rel_path": rel_path, + "content": md, + "out_path": out_path, + "updatedAt": updated_ms, + "title": title, + "workspace": ws_name, + }) + return exported + + +def collect_export_entries( + *, + workspace_path: str, + exclusion_rules: list[Any], + since: SinceMode, + last_export_ms: int, + out_dir: str, + include_composer: bool = True, + nocache: bool = False, +) -> list[ExportEntry]: + """Collect exportable conversations (IDE + CLI) via shared orchestration.""" + orch = prepare_workspace_orchestration( + workspace_path, exclusion_rules, nocache=nocache, + ) + today = datetime.now().strftime("%Y-%m-%d") + exported: list[ExportEntry] = [] + + if include_composer: + db_data = load_global_db_export_data(orch) + if db_data is not None: + exported.extend( + _collect_ide_export_entries( + orch=orch, + db_data=db_data, + exclusion_rules=exclusion_rules, + since=since, + last_export_ms=last_export_ms, + today=today, + out_dir=out_dir, + ), + ) + + exported.extend( + _collect_cli_export_entries( + exclusion_rules=exclusion_rules, + since=since, + last_export_ms=last_export_ms, + today=today, + out_dir=out_dir, + ), + ) + return exported diff --git a/services/workspace_listing.py b/services/workspace_listing.py index 891cabe..b0b1c98 100644 --- a/services/workspace_listing.py +++ b/services/workspace_listing.py @@ -19,17 +19,14 @@ ) from utils.workspace_descriptor import read_json_file from models import Bubble, ParseWarningCollector +from services.export_engine import WorkspaceOrchestration, prepare_workspace_orchestration from services.summary_cache import ( - fingerprint_workspace_storage, get_cached_projects, nocache_enabled, set_cached_projects, ) -from services.workspace_context import resolve_workspace_context_cached from services.workspace_db import ( COMPOSER_ROWS_WITH_HEADERS_SQL, - collect_workspace_entries, - global_storage_db_path, load_project_layouts_for_composer, load_project_layouts_map, open_global_db, @@ -93,43 +90,28 @@ def list_workspace_projects( parse-error dicts (``type``, ``count``, ``detail``) from :meth:`models.ParseWarningCollector.to_api_list`; empty when no skips. """ - workspace_entries = collect_workspace_entries(workspace_path) - gdb = global_storage_db_path(workspace_path) - cli_path = get_cli_chats_path() - fingerprint = fingerprint_workspace_storage( - workspace_path, - workspace_entries, - global_db_path=gdb if os.path.isfile(gdb) else None, - rules=rules, - cli_chats_path=cli_path if os.path.isdir(cli_path) else None, - ) + orch = prepare_workspace_orchestration(workspace_path, rules, nocache=nocache) if not nocache_enabled(request_nocache=nocache): - cached = get_cached_projects(fingerprint) + cached = get_cached_projects(orch.fingerprint) if cached is not None: return cached projects, warnings = _build_workspace_projects_uncached( - workspace_path, rules, workspace_entries, nocache=nocache, + workspace_path, rules, orch, ) if not nocache_enabled(request_nocache=nocache): - set_cached_projects(fingerprint, projects, warnings) + set_cached_projects(orch.fingerprint, projects, warnings) return projects, warnings def _build_workspace_projects_uncached( workspace_path: str, rules: list[Any], - workspace_entries: list[dict[str, Any]], - *, - nocache: bool, + orch: WorkspaceOrchestration, ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: parse_warnings = ParseWarningCollector() - ctx = resolve_workspace_context_cached( - workspace_path, - rules, - workspace_entries=workspace_entries, - nocache=nocache, - ) + ctx = orch.ctx + workspace_entries = orch.workspace_entries invalid_workspace_ids = ctx.invalid_workspace_ids project_name_map = ctx.project_name_to_workspace_id workspace_path_map = ctx.workspace_path_to_id From d75620d60cbae99145edadf607c75edeb06c1ffd Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 23 Jun 2026 09:18:43 +0800 Subject: [PATCH 2/6] Harden export orchestration from review feedback Use to_epoch_ms for lastExportTime parsing, validate composerData shape, serialize CLI tool-call fields safely, and pass effective nocache flag through workspace listing orchestration. --- scripts/export.py | 4 +--- services/export_engine.py | 25 +++++++++++++++++++------ services/workspace_listing.py | 9 ++++++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/export.py b/scripts/export.py index 7f8abd1..f818bba 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -176,9 +176,7 @@ def _read_last_export_ms(state_path: str, since: Literal["all", "last"]) -> int: st = json.load(f) ts = st.get("lastExportTime") if ts: - return int( - datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp() * 1000, - ) + return to_epoch_ms(ts) except (json.JSONDecodeError, ValueError, OSError) as e: _logger.warning( "Could not read last export timestamp; defaulting to full export: %s", diff --git a/services/export_engine.py b/services/export_engine.py index ca7c086..30eb76e 100644 --- a/services/export_engine.py +++ b/services/export_engine.py @@ -88,7 +88,7 @@ def json_dump_safe(value: object) -> str: """Best-effort JSON serialization for exclusion matching.""" try: return json.dumps(value, ensure_ascii=False, sort_keys=True) - except Exception: + except Exception: # noqa: BLE001 — best-effort fallback when value is not JSON-serializable return str(value) if value is not None else "" @@ -224,8 +224,16 @@ def _collect_ide_export_entries( ) continue + if not isinstance(cd, dict): + _logger.debug( + "Skipping corrupt composerData row %s: expected object, got %s", + composer_id, + type(cd).__name__, + ) + continue + headers = cd.get("fullConversationHeadersOnly") or [] - if not headers: + if not isinstance(headers, list) or not headers: continue updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) @@ -270,7 +278,12 @@ def _collect_ide_export_entries( bubble_texts: list[str] = [] bubble_meta_parts: list[str] = [] for h in headers: - b = db_data.bubble_map.get(h.get("bubbleId")) + if not isinstance(h, dict): + continue + bubble_id = h.get("bubbleId") + if not isinstance(bubble_id, str): + continue + b = db_data.bubble_map.get(bubble_id) if not b: continue text = extract_text_from_bubble(b) @@ -337,7 +350,7 @@ def _collect_cli_export_entries( exported: list[ExportEntry] = [] try: cli_projects = list_cli_projects(get_cli_chats_path()) - except Exception as e: + except Exception as e: # noqa: BLE001 — log and skip CLI enumeration on any failure _logger.warning( "Could not enumerate CLI chats: %s (%s) — skipping", e, @@ -375,7 +388,7 @@ def _collect_cli_export_entries( try: messages = traverse_blobs(session["db_path"]) bubbles = messages_to_bubbles(messages, created_ms) - except Exception as e: + except Exception as e: # noqa: BLE001 — log and skip session on read/parse failure _logger.warning( "Could not read CLI session %s: %s (%s)", session_id, @@ -403,7 +416,7 @@ def _collect_cli_export_entries( bubble_texts = [b["text"] for b in bubbles if b.get("text")] tool_call_texts = [ - tc.get("input", "") or tc.get("summary", "") + json_dump_safe(tc.get("input", "") or tc.get("summary", "")) for b in bubbles for tc in (b.get("metadata") or {}).get("toolCalls") or [] ] diff --git a/services/workspace_listing.py b/services/workspace_listing.py index b0b1c98..9cab01a 100644 --- a/services/workspace_listing.py +++ b/services/workspace_listing.py @@ -90,8 +90,11 @@ def list_workspace_projects( parse-error dicts (``type``, ``count``, ``detail``) from :meth:`models.ParseWarningCollector.to_api_list`; empty when no skips. """ - orch = prepare_workspace_orchestration(workspace_path, rules, nocache=nocache) - if not nocache_enabled(request_nocache=nocache): + effective_nocache = nocache_enabled(request_nocache=nocache) + orch = prepare_workspace_orchestration( + workspace_path, rules, nocache=effective_nocache, + ) + if not effective_nocache: cached = get_cached_projects(orch.fingerprint) if cached is not None: return cached @@ -99,7 +102,7 @@ def list_workspace_projects( projects, warnings = _build_workspace_projects_uncached( workspace_path, rules, orch, ) - if not nocache_enabled(request_nocache=nocache): + if not effective_nocache: set_cached_projects(orch.fingerprint, projects, warnings) return projects, warnings From 45c36ac5f2bcfd56bc56ac32bda137b3accf4cb6 Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 23 Jun 2026 09:34:29 +0800 Subject: [PATCH 3/6] Address export consolidation review findings --- api/export_api.py | 168 +++++---------------- scripts/export.py | 5 +- services/export_engine.py | 36 ++--- tests/test_api_export.py | 2 +- tests/test_export_engine.py | 94 ++++++++++++ tests/test_workspace_path_thread_safety.py | 5 + utils/workspace_path.py | 17 ++- 7 files changed, 170 insertions(+), 157 deletions(-) create mode 100644 tests/test_export_engine.py diff --git a/api/export_api.py b/api/export_api.py index ba4ab39..66e53a1 100644 --- a/api/export_api.py +++ b/api/export_api.py @@ -4,32 +4,24 @@ GET /api/export/state — returns last export time """ +from __future__ import annotations + import io import json import logging import os -import sqlite3 import zipfile from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, Literal from flask import Blueprint, Response, request from api.flask_config import exclusion_rules, json_response - -from utils.workspace_path import resolve_workspace_path +from services.export_engine import collect_export_entries +from services.workspace_db import global_storage_db_path from utils.path_helpers import to_epoch_ms -from utils.text_extract import extract_text_from_bubble, slug -from utils.exclusion_rules import build_searchable_text, is_excluded_by_rules -from utils.cursor_md_exporter import cursor_ide_chat_to_markdown -from services.workspace_context import resolve_workspace_context_minimal -from services.workspace_db import ( - load_bubble_map, - load_code_block_diff_map, - open_global_db, -) -from services.workspace_resolver import lookup_workspace_display_name +from utils.workspace_path import resolve_workspace_path bp = Blueprint("export_api", __name__) _logger = logging.getLogger(__name__) @@ -75,6 +67,15 @@ def _save_export_state(count: int) -> None: json.dump(state, f, indent=2) +def _read_last_export_ms(since: Literal["all", "last"]) -> int: + if since != "last": + return 0 + ts = _get_export_state().get("lastExportTime") + if ts: + return to_epoch_ms(ts) + return 0 + + @bp.route("/api/export/state") def get_export_state() -> Response: """Return the last export timestamp.""" @@ -93,126 +94,37 @@ def export_chats() -> tuple[Response, int] | Response: """ try: body = request.get_json(silent=True) or {} - since = "last" if body.get("since") == "last" else "all" + since: Literal["all", "last"] = ( + "last" if body.get("since") == "last" else "all" + ) workspace_path = resolve_workspace_path() - - # Determine last export timestamp for filtering - last_export_ms = 0 - if since == "last": - state = _get_export_state() - ts_str = state.get("lastExportTime") - if ts_str: - last_export_ms = to_epoch_ms(ts_str) - - # ── Workspace scanning via service layer ────────────────────────────── - ctx = resolve_workspace_context_minimal(workspace_path) - workspace_entries = ctx.workspace_entries - composer_id_to_ws = ctx.composer_id_to_workspace_id - - # Build display-name and slug maps - ws_id_to_slug: dict[str, str] = {} - ws_id_to_display_name: dict[str, str] = {} - for e in workspace_entries: - display = lookup_workspace_display_name(workspace_path, e["name"]) - if display != e["name"]: - ws_id_to_display_name[e["name"]] = display - ws_id_to_slug[e["name"]] = slug(display) - - today = datetime.now().strftime("%Y-%m-%d") - exported: list[dict[str, Any]] = [] - rules = exclusion_rules() - - # ── Database reading via service layer ──────────────────────────────── - with open_global_db(workspace_path) as (global_db, _): - if global_db is None: - return json_response({"error": "Cursor global storage not found"}, 404) - bubble_map = load_bubble_map(global_db) - code_block_diff_map = load_code_block_diff_map(global_db) - - try: - composer_rows = global_db.execute( - "SELECT key, value FROM cursorDiskKV WHERE key LIKE 'composerData:%'" - " AND value LIKE '%fullConversationHeadersOnly%'" - " AND value NOT LIKE '%fullConversationHeadersOnly\":[]%'" - ).fetchall() - except sqlite3.Error: - composer_rows = [] - - for row in composer_rows: - composer_id = row["key"].split(":")[1] - try: - cd = json.loads(row["value"]) - headers = cd.get("fullConversationHeadersOnly") or [] - if not headers: - continue - - updated_at_ms = to_epoch_ms(cd.get("lastUpdatedAt")) - if updated_at_ms is None: - updated_at_ms = to_epoch_ms(cd.get("createdAt")) - if updated_at_ms is None: - updated_at_ms = 0 - if since == "last" and updated_at_ms and updated_at_ms <= last_export_ms: - continue - - ws_id = composer_id_to_ws.get(composer_id, "global") - ws_slug = "other-chats" if ws_id == "global" else (ws_id_to_slug.get(ws_id) or slug(ws_id[:12])) - ws_display_name = "Other chats" if ws_id == "global" else (ws_id_to_display_name.get(ws_id) or ws_slug) - title = cd.get("name") or f"Chat {composer_id[:8]}" - model_config = cd.get("modelConfig") or {} - model_name = model_config.get("modelName") - model_names = [model_name] if model_name and model_name != "default" else None - - bubble_texts = [] - for h in headers: - b = bubble_map.get(h.get("bubbleId")) - if b: - bt = extract_text_from_bubble(b) - if bt: - bubble_texts.append(bt) - - searchable = build_searchable_text( - project_name=ws_display_name, - chat_title=title, - model_names=model_names, - chat_content_snippet="\n\n".join(bubble_texts) if bubble_texts else None, - ) - if is_excluded_by_rules(rules, searchable): - continue - - title_slug = slug(title) - ts_ms = updated_at_ms or int(datetime.now().timestamp() * 1000) - ts_str = datetime.fromtimestamp(ts_ms / 1000).strftime("%Y-%m-%dT%H-%M-%S") - filename = f"{ts_str}__{title_slug}__{composer_id[:8]}.md" - rel_path = os.path.join(today, ws_slug, "chat", filename) - - md = cursor_ide_chat_to_markdown( - composer_data=cd, - composer_id=composer_id, - bubble_map=bubble_map, - code_block_diff_map=code_block_diff_map, - workspace_info={"ws_slug": ws_slug, "ws_display_name": ws_display_name}, - ) - exported.append({"path": rel_path, "content": md, "updatedAt": updated_at_ms}) - - except Exception as e: - _logger.error( - "Error processing composer %s for export: %s (%s)", - composer_id, - e, - type(e).__name__, - exc_info=True, - ) - + gdb = global_storage_db_path(workspace_path) + if not os.path.isfile(gdb): + return json_response({"error": "Cursor global storage not found"}, 404) + + exported = collect_export_entries( + workspace_path=workspace_path, + exclusion_rules=exclusion_rules(), + since=since, + last_export_ms=_read_last_export_ms(since), + out_dir="", + include_composer=True, + include_cli=False, + ) count = len(exported) if count == 0: - return json_response({"error": "No conversations to export" + ( - " since last export" if since == "last" else "" - )}, 404) + return json_response( + {"error": "No conversations to export" + ( + " since last export" if since == "last" else "" + )}, + 404, + ) + buf = io.BytesIO() with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: for entry in exported: - zf.writestr(entry["path"], entry["content"]) + zf.writestr(entry["rel_path"], entry["content"]) buf.seek(0) _save_export_state(count) @@ -234,4 +146,4 @@ def export_chats() -> tuple[Response, int] | Response: type(e).__name__, exc_info=True, ) - return json_response({"error": "Export failed"}, 500) \ No newline at end of file + return json_response({"error": "Export failed"}, 500) diff --git a/scripts/export.py b/scripts/export.py index f818bba..9dfcffa 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -194,10 +194,7 @@ def main() -> None: exclusion_rules = load_rules( resolve_exclusion_rules_path(opts.get("exclusion_rules_path")), ) - base_dir = opts.get("base_dir") - if base_dir: - os.environ["WORKSPACE_PATH"] = base_dir - workspace_path = resolve_workspace_path() + workspace_path = resolve_workspace_path(override=opts.get("base_dir")) state_dir = get_global_state_dir() state_path = os.path.join(state_dir, "export_state.json") diff --git a/services/export_engine.py b/services/export_engine.py index 30eb76e..89a7c9d 100644 --- a/services/export_engine.py +++ b/services/export_engine.py @@ -11,7 +11,7 @@ from typing import Any, Literal, TypedDict from models import Bubble -from services.summary_cache import fingerprint_workspace_storage +from services.summary_cache import fingerprint_workspace_storage, nocache_enabled from services.workspace_context import ( WorkspaceContext, enrich_workspace_context_from_global_db, @@ -236,11 +236,9 @@ def _collect_ide_export_entries( if not isinstance(headers, list) or not headers: continue - updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) - if updated_at is None: - updated_at = to_epoch_ms(cd.get("createdAt")) - if updated_at is None: - updated_at = 0 + updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms( + cd.get("createdAt"), + ) if since == "last" and updated_at <= last_export_ms: continue @@ -271,7 +269,8 @@ def _collect_ide_export_entries( else (orch.workspace_id_to_display_name.get(ws_id) or ws_slug) ) title = cd.get("name") or f"Chat {composer_id[:8]}" - model_config = cd.get("modelConfig") or {} + raw_model_config = cd.get("modelConfig") + model_config = raw_model_config if isinstance(raw_model_config, dict) else {} model_name = model_config.get("modelName") model_names = [model_name] if model_name and model_name != "default" else None @@ -468,11 +467,13 @@ def collect_export_entries( last_export_ms: int, out_dir: str, include_composer: bool = True, + include_cli: bool = True, nocache: bool = False, ) -> list[ExportEntry]: """Collect exportable conversations (IDE + CLI) via shared orchestration.""" + effective_nocache = nocache_enabled(request_nocache=nocache) orch = prepare_workspace_orchestration( - workspace_path, exclusion_rules, nocache=nocache, + workspace_path, exclusion_rules, nocache=effective_nocache, ) today = datetime.now().strftime("%Y-%m-%d") exported: list[ExportEntry] = [] @@ -492,13 +493,14 @@ def collect_export_entries( ), ) - exported.extend( - _collect_cli_export_entries( - exclusion_rules=exclusion_rules, - since=since, - last_export_ms=last_export_ms, - today=today, - out_dir=out_dir, - ), - ) + if include_cli: + exported.extend( + _collect_cli_export_entries( + exclusion_rules=exclusion_rules, + since=since, + last_export_ms=last_export_ms, + today=today, + out_dir=out_dir, + ), + ) return exported diff --git a/tests/test_api_export.py b/tests/test_api_export.py index 1e46c31..68d89ae 100644 --- a/tests/test_api_export.py +++ b/tests/test_api_export.py @@ -115,7 +115,7 @@ def test_no_conversations_returns_404(self, workspace_storage, export_state_dir) def test_internal_failure_returns_500(self, client, export_state_dir): with patch( - "api.export_api.resolve_workspace_context_minimal", + "api.export_api.collect_export_entries", side_effect=RuntimeError("simulated export failure"), ): response = _post_export(client) diff --git a/tests/test_export_engine.py b/tests/test_export_engine.py new file mode 100644 index 0000000..d88540e --- /dev/null +++ b/tests/test_export_engine.py @@ -0,0 +1,94 @@ +"""Unit tests for services.export_engine orchestration.""" + +from __future__ import annotations + +import os +import sys +import unittest +from unittest.mock import MagicMock, patch + +REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if REPO_ROOT not in sys.path: + sys.path.insert(0, REPO_ROOT) + +from services.export_engine import ( # noqa: E402 + GlobalDbExportData, + WorkspaceOrchestration, + collect_export_entries, +) + + +class TestCollectExportEntriesNocache(unittest.TestCase): + def test_nocache_env_passed_to_prepare_workspace_orchestration(self): + with patch.dict(os.environ, {"CURSOR_CHAT_BROWSER_NOCACHE": "1"}): + with patch( + "services.export_engine.prepare_workspace_orchestration", + ) as mock_prepare: + mock_prepare.return_value = MagicMock(spec=WorkspaceOrchestration) + with patch( + "services.export_engine.load_global_db_export_data", + return_value=None, + ): + collect_export_entries( + workspace_path="/tmp/ws", + exclusion_rules=[], + since="all", + last_export_ms=0, + out_dir="/tmp/out", + include_composer=False, + include_cli=False, + ) + mock_prepare.assert_called_once() + self.assertTrue(mock_prepare.call_args.kwargs["nocache"]) + + +class TestCollectExportEntriesCorruptComposer(unittest.TestCase): + def test_non_dict_composer_row_is_skipped(self): + ctx = MagicMock() + ctx.project_name_to_workspace_id = {} + ctx.workspace_path_to_id = {} + ctx.composer_id_to_workspace_id = {} + ctx.invalid_workspace_ids = set() + orch = WorkspaceOrchestration( + workspace_path="/tmp/ws", + workspace_entries=[], + fingerprint={}, + ctx=ctx, + workspace_id_to_display_name={}, + workspace_id_to_slug={}, + ) + + class FakeRow: + def __getitem__(self, key: str) -> str: + if key == "key": + return "composerData:bad-row" + return "[]" + + db_data = GlobalDbExportData( + project_layouts_map={}, + bubble_map={}, + code_block_diff_map={}, + ide_composer_rows=[FakeRow()], + invalid_workspace_aliases={}, + ) + with patch( + "services.export_engine.prepare_workspace_orchestration", + return_value=orch, + ): + with patch( + "services.export_engine.load_global_db_export_data", + return_value=db_data, + ): + exported = collect_export_entries( + workspace_path="/tmp/ws", + exclusion_rules=[], + since="all", + last_export_ms=0, + out_dir="/tmp/out", + include_cli=False, + ) + self.assertEqual(exported, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_workspace_path_thread_safety.py b/tests/test_workspace_path_thread_safety.py index 92d2bc9..88ff91c 100644 --- a/tests/test_workspace_path_thread_safety.py +++ b/tests/test_workspace_path_thread_safety.py @@ -112,6 +112,11 @@ def reader() -> None: self.assertEqual(errors, [], "\n".join(errors[:20])) + def test_explicit_override_takes_precedence_over_module_override(self): + set_workspace_path_override(self.path_a) + self.assertEqual(resolve_workspace_path(override=self.path_b), self.path_b) + self.assertEqual(resolve_workspace_path(), self.path_a) + if __name__ == "__main__": unittest.main() diff --git a/utils/workspace_path.py b/utils/workspace_path.py index cc4e048..447b9e1 100644 --- a/utils/workspace_path.py +++ b/utils/workspace_path.py @@ -64,17 +64,20 @@ def get_default_workspace_path() -> str: return os.path.join(home, "workspaceStorage") -def resolve_workspace_path() -> str: - """Return the effective workspace path (override > env var > default). +def resolve_workspace_path(*, override: str | None = None) -> str: + """Return the effective workspace path (call override > module > env > default). - Override comes from POST /api/set-workspace (validated). ``WORKSPACE_PATH`` - is only tilde-expanded — trusted-operator escape hatch, not the same checks - as the API (issue #15). + *override* is for one-shot callers (e.g. CLI ``--base-dir``) and does not + mutate ``WORKSPACE_PATH``. Module override comes from POST /api/set-workspace + (validated). ``WORKSPACE_PATH`` is only tilde-expanded — trusted-operator + escape hatch, not the same checks as the API (issue #15). """ - with _workspace_path_lock: - override = _workspace_path_override if override: return expand_tilde_path(override) + with _workspace_path_lock: + module_override = _workspace_path_override + if module_override: + return expand_tilde_path(module_override) env_path = os.environ.get("WORKSPACE_PATH", "").strip() if env_path: return expand_tilde_path(env_path) From d3b101f723347d8e0379cda49436d747cf704ab8 Mon Sep 17 00:00:00 2001 From: chen Date: Tue, 23 Jun 2026 09:43:30 +0800 Subject: [PATCH 4/6] Validate export API JSON body and use tempfile paths in engine tests --- api/export_api.py | 4 +++- tests/test_api_export.py | 9 +++++++++ tests/test_export_engine.py | 28 +++++++++++++++++++++------- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/api/export_api.py b/api/export_api.py index 66e53a1..a992cba 100644 --- a/api/export_api.py +++ b/api/export_api.py @@ -93,7 +93,9 @@ def export_chats() -> tuple[Response, int] | Response: exclusion rules file. """ try: - body = request.get_json(silent=True) or {} + body = request.get_json(silent=True) + if not isinstance(body, dict): + return json_response({"error": "request body must be a JSON object"}, 400) since: Literal["all", "last"] = ( "last" if body.get("since") == "last" else "all" ) diff --git a/tests/test_api_export.py b/tests/test_api_export.py index 68d89ae..152f496 100644 --- a/tests/test_api_export.py +++ b/tests/test_api_export.py @@ -91,6 +91,15 @@ def test_post_returns_zip_with_markdown_entry(self, client, export_state_dir): class TestExportErrorResponses: + def test_non_dict_json_body_returns_400(self, client, export_state_dir): + response = client.post( + "/api/export", + json=["not", "an", "object"], + content_type="application/json", + ) + assert response.status_code == 400 + assert response.get_json().get("error") == "request body must be a JSON object" + def test_missing_global_storage_returns_404(self, empty_workspace_client): response = _post_export(empty_workspace_client) assert response.status_code == 404 diff --git a/tests/test_export_engine.py b/tests/test_export_engine.py index d88540e..84a4c2a 100644 --- a/tests/test_export_engine.py +++ b/tests/test_export_engine.py @@ -4,6 +4,7 @@ import os import sys +import tempfile import unittest from unittest.mock import MagicMock, patch @@ -18,7 +19,17 @@ ) -class TestCollectExportEntriesNocache(unittest.TestCase): +class _TempExportPathsMixin: + def setUp(self): + self._tmp = tempfile.TemporaryDirectory() + self.addCleanup(self._tmp.cleanup) + self.tmp_ws = os.path.join(self._tmp.name, "ws") + self.tmp_out = os.path.join(self._tmp.name, "out") + os.makedirs(self.tmp_ws, exist_ok=True) + os.makedirs(self.tmp_out, exist_ok=True) + + +class TestCollectExportEntriesNocache(_TempExportPathsMixin, unittest.TestCase): def test_nocache_env_passed_to_prepare_workspace_orchestration(self): with patch.dict(os.environ, {"CURSOR_CHAT_BROWSER_NOCACHE": "1"}): with patch( @@ -30,11 +41,11 @@ def test_nocache_env_passed_to_prepare_workspace_orchestration(self): return_value=None, ): collect_export_entries( - workspace_path="/tmp/ws", + workspace_path=self.tmp_ws, exclusion_rules=[], since="all", last_export_ms=0, - out_dir="/tmp/out", + out_dir=self.tmp_out, include_composer=False, include_cli=False, ) @@ -42,7 +53,10 @@ def test_nocache_env_passed_to_prepare_workspace_orchestration(self): self.assertTrue(mock_prepare.call_args.kwargs["nocache"]) -class TestCollectExportEntriesCorruptComposer(unittest.TestCase): +class TestCollectExportEntriesCorruptComposer( + _TempExportPathsMixin, + unittest.TestCase, +): def test_non_dict_composer_row_is_skipped(self): ctx = MagicMock() ctx.project_name_to_workspace_id = {} @@ -50,7 +64,7 @@ def test_non_dict_composer_row_is_skipped(self): ctx.composer_id_to_workspace_id = {} ctx.invalid_workspace_ids = set() orch = WorkspaceOrchestration( - workspace_path="/tmp/ws", + workspace_path=self.tmp_ws, workspace_entries=[], fingerprint={}, ctx=ctx, @@ -80,11 +94,11 @@ def __getitem__(self, key: str) -> str: return_value=db_data, ): exported = collect_export_entries( - workspace_path="/tmp/ws", + workspace_path=self.tmp_ws, exclusion_rules=[], since="all", last_export_ms=0, - out_dir="/tmp/out", + out_dir=self.tmp_out, include_cli=False, ) self.assertEqual(exported, []) From 38e548d8b89ec63d1d61f543af754d5d7759b5d6 Mon Sep 17 00:00:00 2001 From: chen Date: Wed, 24 Jun 2026 01:31:31 +0800 Subject: [PATCH 5/6] Address PR #112 review: CollectedExportEntry, shared read_last_export_ms, engine tests --- api/export_api.py | 14 +-- models/__init__.py | 3 +- models/export.py | 14 ++- scripts/export.py | 22 +---- services/export_engine.py | 57 ++++++++---- tests/test_export_engine.py | 173 +++++++++++++++++++++++++++++++----- 6 files changed, 209 insertions(+), 74 deletions(-) diff --git a/api/export_api.py b/api/export_api.py index a992cba..a2bed91 100644 --- a/api/export_api.py +++ b/api/export_api.py @@ -18,9 +18,8 @@ from flask import Blueprint, Response, request from api.flask_config import exclusion_rules, json_response -from services.export_engine import collect_export_entries +from services.export_engine import collect_export_entries, read_last_export_ms from services.workspace_db import global_storage_db_path -from utils.path_helpers import to_epoch_ms from utils.workspace_path import resolve_workspace_path bp = Blueprint("export_api", __name__) @@ -67,15 +66,6 @@ def _save_export_state(count: int) -> None: json.dump(state, f, indent=2) -def _read_last_export_ms(since: Literal["all", "last"]) -> int: - if since != "last": - return 0 - ts = _get_export_state().get("lastExportTime") - if ts: - return to_epoch_ms(ts) - return 0 - - @bp.route("/api/export/state") def get_export_state() -> Response: """Return the last export timestamp.""" @@ -109,7 +99,7 @@ def export_chats() -> tuple[Response, int] | Response: workspace_path=workspace_path, exclusion_rules=exclusion_rules(), since=since, - last_export_ms=_read_last_export_ms(since), + last_export_ms=read_last_export_ms(since, state=_get_export_state()), out_dir="", include_composer=True, include_cli=False, diff --git a/models/__init__.py b/models/__init__.py index 4657ff6..a222bc3 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -3,7 +3,7 @@ from models.conversation import Bubble, Composer, Conversation, WorkspaceLocalComposer from models.errors import SchemaError from models.parse_warnings import ParseWarningCollector -from models.export import ExportEntry +from models.export import CollectedExportEntry, ExportEntry from models.search import ConversationSummary, SearchResult from models.workspace import Workspace @@ -16,6 +16,7 @@ "Composer", "Conversation", "ConversationSummary", + "CollectedExportEntry", "ExportEntry", "ParseWarningCollector", "SchemaError", diff --git a/models/export.py b/models/export.py index ff08bc9..50f7421 100644 --- a/models/export.py +++ b/models/export.py @@ -1,11 +1,23 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Any +from typing import Any, TypedDict from models.from_dict_validation import require_dict, require_non_empty_str_fields +class CollectedExportEntry(TypedDict): + """One exportable conversation with rendered markdown (engine/CLI collection).""" + + id: str + rel_path: str + content: str + out_path: str + updatedAt: int + title: str + workspace: str + + @dataclass(frozen=True) class ExportEntry: """One line of manifest.jsonl; log_id / title / workspace required, timestamps optional.""" diff --git a/scripts/export.py b/scripts/export.py index 9dfcffa..8274dee 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -31,12 +31,11 @@ sys.path.insert(0, str(_project_root)) from models import ExportEntry, SchemaError # noqa: E402 -from services.export_engine import collect_export_entries # noqa: E402 +from services.export_engine import collect_export_entries, read_last_export_ms # noqa: E402 from utils.exclusion_rules import ( # noqa: E402 load_rules, resolve_exclusion_rules_path, ) -from utils.path_helpers import to_epoch_ms # noqa: E402 from utils.workspace_path import resolve_workspace_path # noqa: E402 _logger = logging.getLogger(__name__) @@ -168,23 +167,6 @@ def parse_args() -> ExportCliOptions: } -def _read_last_export_ms(state_path: str, since: Literal["all", "last"]) -> int: - if since != "last" or not os.path.isfile(state_path): - return 0 - try: - with open(state_path, "r", encoding="utf-8") as f: - st = json.load(f) - ts = st.get("lastExportTime") - if ts: - return to_epoch_ms(ts) - except (json.JSONDecodeError, ValueError, OSError) as e: - _logger.warning( - "Could not read last export timestamp; defaulting to full export: %s", - e, - ) - return 0 - - def main() -> None: configure_cli_logging() opts = parse_args() @@ -198,7 +180,7 @@ def main() -> None: state_dir = get_global_state_dir() state_path = os.path.join(state_dir, "export_state.json") - last_export = _read_last_export_ms(state_path, since) + last_export = read_last_export_ms(since, state_path=state_path) exported = collect_export_entries( workspace_path=workspace_path, diff --git a/services/export_engine.py b/services/export_engine.py index 89a7c9d..51a62d7 100644 --- a/services/export_engine.py +++ b/services/export_engine.py @@ -8,9 +8,10 @@ import sqlite3 from dataclasses import dataclass from datetime import datetime -from typing import Any, Literal, TypedDict +from typing import Any, Literal from models import Bubble +from models.export import CollectedExportEntry from services.summary_cache import fingerprint_workspace_storage, nocache_enabled from services.workspace_context import ( WorkspaceContext, @@ -49,16 +50,32 @@ SinceMode = Literal["all", "last"] -class ExportEntry(TypedDict): - """One exportable conversation with rendered markdown.""" - - id: str - rel_path: str - content: str - out_path: str - updatedAt: int - title: str - workspace: str +def read_last_export_ms( + since: SinceMode, + *, + state_path: str | None = None, + state: dict[str, Any] | None = None, +) -> int: + """Return last-export epoch ms for ``since=last``; 0 for a full export.""" + if since != "last": + return 0 + ts: Any = None + if state is not None: + ts = state.get("lastExportTime") + elif state_path is not None and os.path.isfile(state_path): + try: + with open(state_path, "r", encoding="utf-8") as f: + st = json.load(f) + if isinstance(st, dict): + ts = st.get("lastExportTime") + except (json.JSONDecodeError, ValueError, OSError) as e: + _logger.warning( + "Could not read last export timestamp; defaulting to full export: %s", + e, + ) + if ts: + return to_epoch_ms(ts) + return 0 @dataclass(frozen=True) @@ -209,9 +226,9 @@ def _collect_ide_export_entries( last_export_ms: int, today: str, out_dir: str, -) -> list[ExportEntry]: +) -> list[CollectedExportEntry]: ctx = orch.ctx - exported: list[ExportEntry] = [] + exported: list[CollectedExportEntry] = [] for row in db_data.ide_composer_rows: composer_id = row["key"].split(":")[1] try: @@ -239,6 +256,8 @@ def _collect_ide_export_entries( updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms( cd.get("createdAt"), ) + # Intentional behavior change vs legacy CLI: fall back to createdAt when + # lastUpdatedAt is absent (affects timestamps, filenames, and --since last). if since == "last" and updated_at <= last_export_ms: continue @@ -325,7 +344,7 @@ def _collect_ide_export_entries( workspace_info={"ws_slug": ws_slug, "ws_display_name": ws_display_name}, ) - rel_path = os.path.join(today, ws_slug, "chat", filename) + rel_path = os.path.relpath(out_path, out_dir) exported.append({ "id": composer_id, "rel_path": rel_path, @@ -345,8 +364,8 @@ def _collect_cli_export_entries( last_export_ms: int, today: str, out_dir: str, -) -> list[ExportEntry]: - exported: list[ExportEntry] = [] +) -> list[CollectedExportEntry]: + exported: list[CollectedExportEntry] = [] try: cli_projects = list_cli_projects(get_cli_chats_path()) except Exception as e: # noqa: BLE001 — log and skip CLI enumeration on any failure @@ -446,7 +465,7 @@ def _collect_cli_export_entries( bubbles=bubbles, title_override=title, ) - rel_path = os.path.join(today, ws_slug_cli, "cli", filename) + rel_path = os.path.relpath(out_path, out_dir) exported.append({ "id": session_id, "rel_path": rel_path, @@ -469,14 +488,14 @@ def collect_export_entries( include_composer: bool = True, include_cli: bool = True, nocache: bool = False, -) -> list[ExportEntry]: +) -> list[CollectedExportEntry]: """Collect exportable conversations (IDE + CLI) via shared orchestration.""" effective_nocache = nocache_enabled(request_nocache=nocache) orch = prepare_workspace_orchestration( workspace_path, exclusion_rules, nocache=effective_nocache, ) today = datetime.now().strftime("%Y-%m-%d") - exported: list[ExportEntry] = [] + exported: list[CollectedExportEntry] = [] if include_composer: db_data = load_global_db_export_data(orch) diff --git a/tests/test_export_engine.py b/tests/test_export_engine.py index 84a4c2a..2139826 100644 --- a/tests/test_export_engine.py +++ b/tests/test_export_engine.py @@ -2,21 +2,28 @@ from __future__ import annotations +import json import os import sys import tempfile import unittest +from datetime import datetime from unittest.mock import MagicMock, patch REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if REPO_ROOT not in sys.path: sys.path.insert(0, REPO_ROOT) +from models import Bubble # noqa: E402 from services.export_engine import ( # noqa: E402 GlobalDbExportData, WorkspaceOrchestration, + _collect_ide_export_entries, collect_export_entries, + read_last_export_ms, ) +from utils.exclusion_rules import load_rules # noqa: E402 +from utils.text_extract import slug # noqa: E402 class _TempExportPathsMixin: @@ -29,6 +36,53 @@ def setUp(self): os.makedirs(self.tmp_out, exist_ok=True) +def _fake_composer_row(composer_id: str, cd: dict[str, object]) -> object: + class FakeRow: + def __getitem__(self, key: str) -> str: + if key == "key": + return f"composerData:{composer_id}" + return json.dumps(cd) + + return FakeRow() + + +def _minimal_ctx() -> MagicMock: + ctx = MagicMock() + ctx.project_name_to_workspace_id = {} + ctx.workspace_path_to_id = {} + ctx.composer_id_to_workspace_id = {} + ctx.invalid_workspace_ids = set() + return ctx + + +def _minimal_orch( + tmp_ws: str, + *, + display_name: dict[str, str] | None = None, + slug_map: dict[str, str] | None = None, +) -> WorkspaceOrchestration: + return WorkspaceOrchestration( + workspace_path=tmp_ws, + workspace_entries=[], + fingerprint={}, + ctx=_minimal_ctx(), + workspace_id_to_display_name=display_name or {}, + workspace_id_to_slug=slug_map or {}, + ) + + +class TestReadLastExportMs(unittest.TestCase): + def test_since_all_returns_zero(self): + self.assertEqual(read_last_export_ms("all", state={"lastExportTime": "2026-01-01"}), 0) + + def test_since_last_reads_state_dict(self): + ms = read_last_export_ms( + "last", + state={"lastExportTime": "2026-01-01T12:00:00"}, + ) + self.assertGreater(ms, 0) + + class TestCollectExportEntriesNocache(_TempExportPathsMixin, unittest.TestCase): def test_nocache_env_passed_to_prepare_workspace_orchestration(self): with patch.dict(os.environ, {"CURSOR_CHAT_BROWSER_NOCACHE": "1"}): @@ -58,31 +112,12 @@ class TestCollectExportEntriesCorruptComposer( unittest.TestCase, ): def test_non_dict_composer_row_is_skipped(self): - ctx = MagicMock() - ctx.project_name_to_workspace_id = {} - ctx.workspace_path_to_id = {} - ctx.composer_id_to_workspace_id = {} - ctx.invalid_workspace_ids = set() - orch = WorkspaceOrchestration( - workspace_path=self.tmp_ws, - workspace_entries=[], - fingerprint={}, - ctx=ctx, - workspace_id_to_display_name={}, - workspace_id_to_slug={}, - ) - - class FakeRow: - def __getitem__(self, key: str) -> str: - if key == "key": - return "composerData:bad-row" - return "[]" - + orch = _minimal_orch(self.tmp_ws) db_data = GlobalDbExportData( project_layouts_map={}, bubble_map={}, code_block_diff_map={}, - ide_composer_rows=[FakeRow()], + ide_composer_rows=[_fake_composer_row("bad-row", [])], # type: ignore[arg-type] invalid_workspace_aliases={}, ) with patch( @@ -104,5 +139,101 @@ def __getitem__(self, key: str) -> str: self.assertEqual(exported, []) +class TestCollectIdeExportEntries(_TempExportPathsMixin, unittest.TestCase): + def _collect( + self, + cd: dict[str, object], + *, + composer_id: str = "cmp-1", + exclusion_rules: list | None = None, + orch: WorkspaceOrchestration | None = None, + project_id: str = "ws-unknown-abcdefghijklmnop", + ) -> list: + bubble_id = "bubble-1" + bubble_map = { + bubble_id: Bubble.from_dict( + {"type": "user", "text": "Hello from the test bubble."}, + bubble_id=bubble_id, + ), + } + db_data = GlobalDbExportData( + project_layouts_map={}, + bubble_map=bubble_map, + code_block_diff_map={}, + ide_composer_rows=[_fake_composer_row(composer_id, cd)], + invalid_workspace_aliases={}, + ) + orch = orch or _minimal_orch(self.tmp_ws) + with patch( + "services.export_engine.determine_project_for_conversation", + return_value=project_id, + ): + with patch( + "services.export_engine.cursor_ide_chat_to_markdown", + return_value="# exported markdown", + ): + return _collect_ide_export_entries( + orch=orch, + db_data=db_data, + exclusion_rules=exclusion_rules or [], + since="all", + last_export_ms=0, + today="2026-06-22", + out_dir=self.tmp_out, + ) + + def test_created_at_fallback_when_last_updated_missing(self): + created_ms = 1739200000000 + exported = self._collect({ + "name": "Created-only chat", + "modelConfig": {}, + "fullConversationHeadersOnly": [{"bubbleId": "bubble-1", "type": 1}], + "createdAt": created_ms, + }) + self.assertEqual(len(exported), 1) + entry = exported[0] + self.assertEqual(entry["updatedAt"], created_ms) + ts_str = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%dT%H-%M-%S") + self.assertIn(ts_str, entry["rel_path"]) + self.assertEqual( + entry["rel_path"], + os.path.relpath(entry["out_path"], self.tmp_out), + ) + + def test_display_name_falls_back_to_slug_of_workspace_id_prefix(self): + ws_id = "abcdefghijklmnop" + exported = self._collect( + { + "name": "Workspace fallback chat", + "modelConfig": {}, + "fullConversationHeadersOnly": [{"bubbleId": "bubble-1", "type": 1}], + "lastUpdatedAt": 1739300000000, + }, + project_id=ws_id, + orch=_minimal_orch(self.tmp_ws), + ) + self.assertEqual(len(exported), 1) + expected_display = slug(ws_id[:12]) + self.assertEqual(exported[0]["workspace"], expected_display) + self.assertIn(expected_display, exported[0]["rel_path"]) + + def test_exclusion_rules_filter_ide_entry(self): + rules_path = os.path.join(self._tmp.name, "rules.txt") + with open(rules_path, "w", encoding="utf-8") as f: + f.write("roadmap\n") + rules = load_rules(rules_path) + + exported = self._collect( + { + "name": "Roadmap planning", + "modelConfig": {}, + "fullConversationHeadersOnly": [{"bubbleId": "bubble-1", "type": 1}], + "lastUpdatedAt": 1739300000000, + }, + exclusion_rules=rules, + ) + self.assertEqual(exported, []) + + if __name__ == "__main__": unittest.main() From 476bd5da681758b224c5713436419205146f7ad0 Mon Sep 17 00:00:00 2001 From: chen Date: Wed, 24 Jun 2026 02:23:11 +0800 Subject: [PATCH 6/6] Restore legacy lastUpdatedAt-only behavior; mkdir state_dir early --- scripts/export.py | 2 +- services/export_engine.py | 6 +----- tests/test_export_engine.py | 26 +++++++++++++++----------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/export.py b/scripts/export.py index 8274dee..2c820ba 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -179,6 +179,7 @@ def main() -> None: workspace_path = resolve_workspace_path(override=opts.get("base_dir")) state_dir = get_global_state_dir() + os.makedirs(state_dir, exist_ok=True) state_path = os.path.join(state_dir, "export_state.json") last_export = read_last_export_ms(since, state_path=state_path) @@ -253,7 +254,6 @@ def main() -> None: "exportedCount": count, "exportDir": out_dir, } - os.makedirs(state_dir, exist_ok=True) with open(os.path.join(state_dir, "export_state.json"), "w", encoding="utf-8") as f: json.dump(state, f, indent=2) diff --git a/services/export_engine.py b/services/export_engine.py index 51a62d7..ea05fa6 100644 --- a/services/export_engine.py +++ b/services/export_engine.py @@ -253,11 +253,7 @@ def _collect_ide_export_entries( if not isinstance(headers, list) or not headers: continue - updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) or to_epoch_ms( - cd.get("createdAt"), - ) - # Intentional behavior change vs legacy CLI: fall back to createdAt when - # lastUpdatedAt is absent (affects timestamps, filenames, and --since last). + updated_at = to_epoch_ms(cd.get("lastUpdatedAt")) if since == "last" and updated_at <= last_export_ms: continue diff --git a/tests/test_export_engine.py b/tests/test_export_engine.py index 2139826..0257e17 100644 --- a/tests/test_export_engine.py +++ b/tests/test_export_engine.py @@ -182,22 +182,26 @@ def _collect( out_dir=self.tmp_out, ) - def test_created_at_fallback_when_last_updated_missing(self): + def test_last_updated_at_only_no_created_at_fallback(self): created_ms = 1739200000000 - exported = self._collect({ - "name": "Created-only chat", - "modelConfig": {}, - "fullConversationHeadersOnly": [{"bubbleId": "bubble-1", "type": 1}], - "createdAt": created_ms, - }) + fixed_now = datetime(2026, 6, 22, 12, 0, 0) + with patch("services.export_engine.datetime") as mock_dt: + mock_dt.now.return_value = fixed_now + mock_dt.fromtimestamp = datetime.fromtimestamp + exported = self._collect({ + "name": "Created-only chat", + "modelConfig": {}, + "fullConversationHeadersOnly": [{"bubbleId": "bubble-1", "type": 1}], + "createdAt": created_ms, + }) self.assertEqual(len(exported), 1) entry = exported[0] - self.assertEqual(entry["updatedAt"], created_ms) - ts_str = datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%dT%H-%M-%S") + self.assertEqual(entry["updatedAt"], 0) + ts_str = fixed_now.strftime("%Y-%m-%dT%H-%M-%S") self.assertIn(ts_str, entry["rel_path"]) - self.assertEqual( + self.assertNotIn( + datetime.fromtimestamp(created_ms / 1000).strftime("%Y-%m-%dT%H-%M-%S"), entry["rel_path"], - os.path.relpath(entry["out_path"], self.tmp_out), ) def test_display_name_falls_back_to_slug_of_workspace_id_prefix(self):