From aa65f06d4cdaeb0f7ff62fd08a6d80b530180196 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 12:38:53 -0700 Subject: [PATCH 1/8] Add observability tracing and metrics - Add local trace file output and OTLP export hooks - Instrument RPC, git, orchestration, provider, and DB paths - Document observability configuration --- README.md | 2 + apps/server/src/cli-config.test.ts | 17 + apps/server/src/cli.ts | 37 +- apps/server/src/config.ts | 20 + apps/server/src/git/Layers/GitCore.ts | 55 +- apps/server/src/observability/Attributes.ts | 112 +++++ .../src/observability/Layers/Observability.ts | 64 +++ .../src/observability/LocalFileTracer.test.ts | 117 +++++ .../src/observability/LocalFileTracer.ts | 104 ++++ apps/server/src/observability/Metrics.test.ts | 73 +++ apps/server/src/observability/Metrics.ts | 159 ++++++ .../observability/RpcInstrumentation.test.ts | 97 ++++ .../src/observability/RpcInstrumentation.ts | 37 ++ apps/server/src/observability/TraceRecord.ts | 100 ++++ .../src/observability/TraceSink.test.ts | 108 ++++ apps/server/src/observability/TraceSink.ts | 68 +++ .../Layers/OrchestrationEngine.ts | 24 +- .../Layers/ProviderCommandReactor.ts | 9 + apps/server/src/persistence/Layers/Sqlite.ts | 12 +- .../src/persistence/NodeSqliteClient.ts | 19 +- .../src/provider/Layers/ProviderService.ts | 464 +++++++++++------ apps/server/src/server.test.ts | 13 +- apps/server/src/server.ts | 4 +- apps/server/src/serverLogger.ts | 5 +- apps/server/src/serverRuntimeStartup.ts | 93 ++-- apps/server/src/terminal/Layers/Manager.ts | 94 ++-- apps/server/src/ws.ts | 350 +++++++++---- docs/observability.md | 468 ++++++++++++++++++ turbo.json | 12 +- 29 files changed, 2377 insertions(+), 360 deletions(-) create mode 100644 apps/server/src/observability/Attributes.ts create mode 100644 apps/server/src/observability/Layers/Observability.ts create mode 100644 apps/server/src/observability/LocalFileTracer.test.ts create mode 100644 
apps/server/src/observability/LocalFileTracer.ts create mode 100644 apps/server/src/observability/Metrics.test.ts create mode 100644 apps/server/src/observability/Metrics.ts create mode 100644 apps/server/src/observability/RpcInstrumentation.test.ts create mode 100644 apps/server/src/observability/RpcInstrumentation.ts create mode 100644 apps/server/src/observability/TraceRecord.ts create mode 100644 apps/server/src/observability/TraceSink.test.ts create mode 100644 apps/server/src/observability/TraceSink.ts create mode 100644 docs/observability.md diff --git a/README.md b/README.md index d7b2fccb8f..d3a54a1b90 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,8 @@ We are very very early in this project. Expect bugs. We are not accepting contributions yet. +Observability guide: [docs/observability.md](./docs/observability.md) + ## If you REALLY want to contribute still.... read this first Read [CONTRIBUTING.md](./CONTRIBUTING.md) before opening an issue or PR. diff --git a/apps/server/src/cli-config.test.ts b/apps/server/src/cli-config.test.ts index 27bc60b1be..d3753988d9 100644 --- a/apps/server/src/cli-config.test.ts +++ b/apps/server/src/cli-config.test.ts @@ -9,6 +9,18 @@ import { deriveServerPaths } from "./config"; import { resolveServerConfig } from "./cli"; it.layer(NodeServices.layer)("cli config resolution", (it) => { + const defaultObservabilityConfig = { + traceMinLevel: "Info", + traceTimingEnabled: true, + traceBatchWindowMs: 200, + traceMaxBytes: 10 * 1024 * 1024, + traceMaxFiles: 10, + otlpTracesUrl: undefined, + otlpMetricsUrl: undefined, + otlpExportIntervalMs: 10_000, + otlpServiceName: "t3-server", + } as const; + const openBootstrapFd = Effect.fn(function* (payload: Record) { const fs = yield* FileSystem.FileSystem; const filePath = yield* fs.makeTempFileScoped({ prefix: "t3-bootstrap-", suffix: ".ndjson" }); @@ -62,6 +74,7 @@ it.layer(NodeServices.layer)("cli config resolution", (it) => { expect(resolved).toEqual({ logLevel: "Warn", + 
...defaultObservabilityConfig, mode: "desktop", port: 4001, cwd: process.cwd(), @@ -123,6 +136,7 @@ it.layer(NodeServices.layer)("cli config resolution", (it) => { expect(resolved).toEqual({ logLevel: "Debug", + ...defaultObservabilityConfig, mode: "web", port: 8788, cwd: process.cwd(), @@ -187,6 +201,7 @@ it.layer(NodeServices.layer)("cli config resolution", (it) => { expect(resolved).toEqual({ logLevel: "Info", + ...defaultObservabilityConfig, mode: "desktop", port: 4888, cwd: process.cwd(), @@ -241,6 +256,7 @@ it.layer(NodeServices.layer)("cli config resolution", (it) => { resolved.attachmentsDir, resolved.worktreesDir, path.dirname(resolved.serverLogPath), + path.dirname(resolved.serverTracePath), ]) { expect(yield* fs.exists(directory)).toBe(true); } @@ -300,6 +316,7 @@ it.layer(NodeServices.layer)("cli config resolution", (it) => { expect(resolved).toEqual({ logLevel: "Debug", + ...defaultObservabilityConfig, mode: "web", port: 8788, cwd: process.cwd(), diff --git a/apps/server/src/cli.ts b/apps/server/src/cli.ts index 2f76d25b7b..af3202954a 100644 --- a/apps/server/src/cli.ts +++ b/apps/server/src/cli.ts @@ -1,5 +1,5 @@ import { NetService } from "@t3tools/shared/Net"; -import { Config, Effect, LogLevel, Option, Schema } from "effect"; +import { Config, Effect, FileSystem, LogLevel, Option, Path, Schema } from "effect"; import { Command, Flag, GlobalFlag } from "effect/unstable/cli"; import { @@ -81,6 +81,27 @@ const logWebSocketEventsFlag = Flag.boolean("log-websocket-events").pipe( const EnvServerConfig = Config.all({ logLevel: Config.logLevel("T3CODE_LOG_LEVEL").pipe(Config.withDefault("Info")), + traceMinLevel: Config.logLevel("T3CODE_TRACE_MIN_LEVEL").pipe(Config.withDefault("Info")), + traceTimingEnabled: Config.boolean("T3CODE_TRACE_TIMING_ENABLED").pipe(Config.withDefault(true)), + traceFile: Config.string("T3CODE_TRACE_FILE").pipe( + Config.option, + Config.map(Option.getOrUndefined), + ), + traceMaxBytes: 
Config.int("T3CODE_TRACE_MAX_BYTES").pipe(Config.withDefault(10 * 1024 * 1024)), + traceMaxFiles: Config.int("T3CODE_TRACE_MAX_FILES").pipe(Config.withDefault(10)), + traceBatchWindowMs: Config.int("T3CODE_TRACE_BATCH_WINDOW_MS").pipe(Config.withDefault(200)), + otlpTracesUrl: Config.string("T3CODE_OTLP_TRACES_URL").pipe( + Config.option, + Config.map(Option.getOrUndefined), + ), + otlpMetricsUrl: Config.string("T3CODE_OTLP_METRICS_URL").pipe( + Config.option, + Config.map(Option.getOrUndefined), + ), + otlpExportIntervalMs: Config.int("T3CODE_OTLP_EXPORT_INTERVAL_MS").pipe( + Config.withDefault(10_000), + ), + otlpServiceName: Config.string("T3CODE_OTLP_SERVICE_NAME").pipe(Config.withDefault("t3-server")), mode: Config.schema(RuntimeMode, "T3CODE_MODE").pipe( Config.option, Config.map(Option.getOrUndefined), @@ -137,6 +158,8 @@ export const resolveServerConfig = ( ) => Effect.gen(function* () { const { findAvailablePort } = yield* NetService; + const path = yield* Path.Path; + const fs = yield* FileSystem.FileSystem; const env = yield* EnvServerConfig; const bootstrapFd = Option.getOrUndefined(flags.bootstrapFd) ?? env.bootstrapFd; const bootstrapEnvelope = @@ -190,6 +213,8 @@ export const resolveServerConfig = ( ); const derivedPaths = yield* deriveServerPaths(baseDir, devUrl); yield* ensureServerDirectories(derivedPaths); + const serverTracePath = env.traceFile ?? 
derivedPaths.serverTracePath; + yield* fs.makeDirectory(path.dirname(serverTracePath), { recursive: true }); const noBrowser = resolveBooleanFlag( flags.noBrowser, Option.getOrElse( @@ -248,11 +273,21 @@ export const resolveServerConfig = ( const config: ServerConfigShape = { logLevel, + traceMinLevel: env.traceMinLevel, + traceTimingEnabled: env.traceTimingEnabled, + traceBatchWindowMs: env.traceBatchWindowMs, + traceMaxBytes: env.traceMaxBytes, + traceMaxFiles: env.traceMaxFiles, + otlpTracesUrl: env.otlpTracesUrl, + otlpMetricsUrl: env.otlpMetricsUrl, + otlpExportIntervalMs: env.otlpExportIntervalMs, + otlpServiceName: env.otlpServiceName, mode, port, cwd: process.cwd(), baseDir, ...derivedPaths, + serverTracePath, host, staticDir, devUrl, diff --git a/apps/server/src/config.ts b/apps/server/src/config.ts index 21cd6f3150..9ceea4c13c 100644 --- a/apps/server/src/config.ts +++ b/apps/server/src/config.ts @@ -25,6 +25,7 @@ export interface ServerDerivedPaths { readonly attachmentsDir: string; readonly logsDir: string; readonly serverLogPath: string; + readonly serverTracePath: string; readonly providerLogsDir: string; readonly providerEventLogPath: string; readonly terminalLogsDir: string; @@ -36,6 +37,15 @@ export interface ServerDerivedPaths { */ export interface ServerConfigShape extends ServerDerivedPaths { readonly logLevel: LogLevel.LogLevel; + readonly traceMinLevel: LogLevel.LogLevel; + readonly traceTimingEnabled: boolean; + readonly traceBatchWindowMs: number; + readonly traceMaxBytes: number; + readonly traceMaxFiles: number; + readonly otlpTracesUrl: string | undefined; + readonly otlpMetricsUrl: string | undefined; + readonly otlpExportIntervalMs: number; + readonly otlpServiceName: string; readonly mode: RuntimeMode; readonly port: number; readonly host: string | undefined; @@ -68,6 +78,7 @@ export const deriveServerPaths = Effect.fn(function* ( attachmentsDir, logsDir, serverLogPath: join(logsDir, "server.log"), + serverTracePath: join(logsDir, 
"server.trace.ndjson"), providerLogsDir, providerEventLogPath: join(providerLogsDir, "events.log"), terminalLogsDir: join(logsDir, "terminals"), @@ -117,6 +128,15 @@ export class ServerConfig extends ServiceMap.Service; } +const nowUnixNano = (): bigint => BigInt(Date.now()) * 1_000_000n; + +const addCurrentSpanEvent = (name: string, attributes: Record) => + Effect.currentSpan.pipe( + Effect.tap((span) => + Effect.sync(() => { + span.event(name, nowUnixNano(), compactTraceAttributes(attributes)); + }), + ), + Effect.catch(() => Effect.void), + ); + function trace2ChildKey(record: Record): string | null { const childId = record.child_id; if (typeof childId === "number" || typeof childId === "string") { @@ -399,6 +413,9 @@ const createTrace2Monitor = Effect.fn("createTrace2Monitor")(function* ( if (event === "child_start") { hookStartByChildKey.set(childKey, { hookName, startedAtMs: Date.now() }); + yield* addCurrentSpanEvent("git.hook.started", { + hookName, + }); if (progress.onHookStarted) { yield* progress.onHookStarted(hookName); } @@ -407,12 +424,19 @@ const createTrace2Monitor = Effect.fn("createTrace2Monitor")(function* ( if (event === "child_exit") { hookStartByChildKey.delete(childKey); + const code = traceRecord.success.code; + const exitCode = typeof code === "number" && Number.isInteger(code) ? code : null; + const durationMs = started ? Math.max(0, Date.now() - started.startedAtMs) : null; + yield* addCurrentSpanEvent("git.hook.finished", { + hookName: started?.hookName ?? hookName, + exitCode, + durationMs, + }); if (progress.onHookFinished) { - const code = traceRecord.success.code; yield* progress.onHookFinished({ hookName: started?.hookName ?? hookName, - exitCode: typeof code === "number" && Number.isInteger(code) ? code : null, - durationMs: started ? 
Math.max(0, Date.now() - started.startedAtMs) : null, + exitCode, + durationMs, }); } } @@ -564,13 +588,13 @@ export const makeGitCore = Effect.fn("makeGitCore")(function* (options?: { const path = yield* Path.Path; const { worktreesDir } = yield* ServerConfig; - let execute: GitCoreShape["execute"]; + let executeRaw: GitCoreShape["execute"]; if (options?.executeOverride) { - execute = options.executeOverride; + executeRaw = options.executeOverride; } else { const commandSpawner = yield* ChildProcessSpawner.ChildProcessSpawner; - execute = Effect.fnUntraced(function* (input) { + executeRaw = Effect.fnUntraced(function* (input) { const commandInput = { ...input, args: [...input.args], @@ -671,6 +695,25 @@ export const makeGitCore = Effect.fn("makeGitCore")(function* (options?: { }); } + const execute: GitCoreShape["execute"] = (input) => + executeRaw(input).pipe( + withMetrics({ + counter: gitCommandsTotal, + timer: gitCommandDuration, + attributes: { + operation: input.operation, + }, + }), + Effect.withSpan(input.operation, { + kind: "client", + attributes: { + "git.operation": input.operation, + "git.cwd": input.cwd, + "git.args_count": input.args.length, + }, + }), + ); + const executeGit = ( operation: string, cwd: string, diff --git a/apps/server/src/observability/Attributes.ts b/apps/server/src/observability/Attributes.ts new file mode 100644 index 0000000000..831030f916 --- /dev/null +++ b/apps/server/src/observability/Attributes.ts @@ -0,0 +1,112 @@ +import { Cause, Exit } from "effect"; + +export type MetricAttributeValue = string; +export type MetricAttributes = Readonly>; +export type TraceAttributes = Readonly>; +export type ObservabilityOutcome = "success" | "failure" | "interrupt"; + +function isPlainObject(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function normalizeJsonValue(value: unknown, seen: WeakSet = new WeakSet()): unknown { + if ( + value === null || + value === 
undefined || + typeof value === "string" || + typeof value === "number" || + typeof value === "boolean" + ) { + return value ?? null; + } + if (typeof value === "bigint") { + return value.toString(); + } + if (value instanceof Date) { + return value.toISOString(); + } + if (value instanceof Error) { + return { + name: value.name, + message: value.message, + ...(value.stack ? { stack: value.stack } : {}), + }; + } + if (Array.isArray(value)) { + return value.map((entry) => normalizeJsonValue(entry, seen)); + } + if (value instanceof Map) { + return Object.fromEntries( + Array.from(value.entries(), ([key, entryValue]) => [ + String(key), + normalizeJsonValue(entryValue, seen), + ]), + ); + } + if (value instanceof Set) { + return Array.from(value.values(), (entry) => normalizeJsonValue(entry, seen)); + } + if (!isPlainObject(value)) { + return String(value); + } + if (seen.has(value)) { + return "[Circular]"; + } + seen.add(value); + return Object.fromEntries( + Object.entries(value).map(([key, entryValue]) => [key, normalizeJsonValue(entryValue, seen)]), + ); +} + +export function compactTraceAttributes( + attributes: Readonly>, +): TraceAttributes { + return Object.fromEntries( + Object.entries(attributes) + .filter(([, value]) => value !== undefined) + .map(([key, value]) => [key, normalizeJsonValue(value)]), + ); +} + +export function compactMetricAttributes( + attributes: Readonly>, +): MetricAttributes { + return Object.fromEntries( + Object.entries(attributes).flatMap(([key, value]) => { + if (value === undefined || value === null) { + return []; + } + if (typeof value === "string") { + return [[key, value]]; + } + if (typeof value === "number" || typeof value === "boolean" || typeof value === "bigint") { + return [[key, String(value)]]; + } + return []; + }), + ); +} + +export function outcomeFromExit(exit: Exit.Exit): ObservabilityOutcome { + if (Exit.isSuccess(exit)) { + return "success"; + } + return Cause.hasInterruptsOnly(exit.cause) ? 
"interrupt" : "failure"; +} + +export function normalizeModelMetricLabel(model: string | null | undefined): string | undefined { + const normalized = model?.trim().toLowerCase(); + if (!normalized) { + return undefined; + } + if (normalized.includes("gpt-5")) { + return "gpt-5"; + } + if (normalized.includes("claude")) { + return "claude"; + } + if (normalized.includes("gemini")) { + return "gemini"; + } + return "other"; +} diff --git a/apps/server/src/observability/Layers/Observability.ts b/apps/server/src/observability/Layers/Observability.ts new file mode 100644 index 0000000000..29b9d7eac5 --- /dev/null +++ b/apps/server/src/observability/Layers/Observability.ts @@ -0,0 +1,64 @@ +import { Effect, Layer, References, Tracer } from "effect"; +import { OtlpMetrics, OtlpSerialization, OtlpTracer } from "effect/unstable/observability"; + +import { ServerConfig } from "../../config.ts"; +import { ServerLoggerLive } from "../../serverLogger.ts"; +import { makeLocalFileTracer } from "../LocalFileTracer.ts"; + +const otlpSerializationLayer = OtlpSerialization.layerJson; + +export const ObservabilityLive = Layer.unwrap( + Effect.gen(function* () { + const config = yield* ServerConfig; + + const traceReferencesLayer = Layer.mergeAll( + Layer.succeed(Tracer.MinimumTraceLevel, config.traceMinLevel), + Layer.succeed(References.TracerTimingEnabled, config.traceTimingEnabled), + ); + + const tracerLayer = Layer.effect( + Tracer.Tracer, + Effect.gen(function* () { + const delegate = + config.otlpTracesUrl === undefined + ? 
undefined + : yield* OtlpTracer.make({ + url: config.otlpTracesUrl, + exportInterval: `${config.otlpExportIntervalMs} millis`, + resource: { + serviceName: config.otlpServiceName, + attributes: { + "service.runtime": "t3-server", + "service.mode": config.mode, + }, + }, + }); + + return yield* makeLocalFileTracer({ + filePath: config.serverTracePath, + maxBytes: config.traceMaxBytes, + maxFiles: config.traceMaxFiles, + batchWindowMs: config.traceBatchWindowMs, + ...(delegate ? { delegate } : {}), + }); + }), + ).pipe(Layer.provideMerge(otlpSerializationLayer)); + + const metricsLayer = + config.otlpMetricsUrl === undefined + ? Layer.empty + : OtlpMetrics.layer({ + url: config.otlpMetricsUrl, + exportInterval: `${config.otlpExportIntervalMs} millis`, + resource: { + serviceName: config.otlpServiceName, + attributes: { + "service.runtime": "t3-server", + "service.mode": config.mode, + }, + }, + }).pipe(Layer.provideMerge(otlpSerializationLayer)); + + return Layer.mergeAll(ServerLoggerLive, traceReferencesLayer, tracerLayer, metricsLayer); + }), +); diff --git a/apps/server/src/observability/LocalFileTracer.test.ts b/apps/server/src/observability/LocalFileTracer.test.ts new file mode 100644 index 0000000000..1bc5a34305 --- /dev/null +++ b/apps/server/src/observability/LocalFileTracer.test.ts @@ -0,0 +1,117 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { assert, describe, it } from "@effect/vitest"; +import { Effect, Layer, Logger, References, Tracer } from "effect"; + +import type { TraceRecord } from "./TraceRecord.ts"; +import { makeLocalFileTracer } from "./LocalFileTracer.ts"; + +const makeTestLayer = (tracePath: string) => + Layer.mergeAll( + Layer.effect( + Tracer.Tracer, + makeLocalFileTracer({ + filePath: tracePath, + maxBytes: 1024 * 1024, + maxFiles: 2, + batchWindowMs: 10_000, + }), + ), + Logger.layer([Logger.tracerLogger], { mergeWithExisting: false }), + Layer.succeed(References.MinimumLogLevel, 
"Info"), + ); + +const readTraceRecords = (tracePath: string): Array => + fs + .readFileSync(tracePath, "utf8") + .trim() + .split("\n") + .filter((line) => line.length > 0) + .map((line) => JSON.parse(line) as TraceRecord); + +describe("LocalFileTracer", () => { + it.effect("writes nested spans to disk and captures log messages as span events", () => + Effect.gen(function* () { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "t3-local-tracer-")); + const tracePath = path.join(tempDir, "server.trace.ndjson"); + + try { + yield* Effect.scoped( + Effect.gen(function* () { + const program = Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "demo.parent": true, + }); + yield* Effect.logInfo("parent event"); + yield* Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "demo.child": true, + }); + yield* Effect.logInfo("child event"); + }).pipe(Effect.withSpan("child-span")); + }).pipe(Effect.withSpan("parent-span")); + + yield* program.pipe(Effect.provide(makeTestLayer(tracePath))); + }), + ); + + const records = readTraceRecords(tracePath); + assert.equal(records.length, 2); + + const parent = records.find((record) => record.name === "parent-span"); + const child = records.find((record) => record.name === "child-span"); + + assert.notEqual(parent, undefined); + assert.notEqual(child, undefined); + if (!parent || !child) { + return; + } + + assert.equal(child.parentSpanId, parent.spanId); + assert.equal(parent.attributes["demo.parent"], true); + assert.equal(child.attributes["demo.child"], true); + assert.equal( + parent.events.some((event) => event.name === "parent event"), + true, + ); + assert.equal( + child.events.some((event) => event.name === "child event"), + true, + ); + assert.equal( + child.events.some((event) => event.attributes["effect.logLevel"] === "INFO"), + true, + ); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }), + ); + + it.effect("serializes interrupted spans with an interrupted 
exit status", () => + Effect.gen(function* () { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "t3-local-tracer-")); + const tracePath = path.join(tempDir, "server.trace.ndjson"); + + try { + yield* Effect.scoped( + Effect.exit( + Effect.interrupt.pipe( + Effect.withSpan("interrupt-span"), + Effect.provide(makeTestLayer(tracePath)), + ), + ), + ); + + const records = readTraceRecords(tracePath); + assert.equal(records.length, 1); + assert.equal(records[0]?.name, "interrupt-span"); + assert.equal(records[0]?.exit._tag, "Interrupted"); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }), + ); +}); diff --git a/apps/server/src/observability/LocalFileTracer.ts b/apps/server/src/observability/LocalFileTracer.ts new file mode 100644 index 0000000000..7f75eb0bf9 --- /dev/null +++ b/apps/server/src/observability/LocalFileTracer.ts @@ -0,0 +1,104 @@ +import type * as Exit from "effect/Exit"; +import { Effect, Option, Tracer } from "effect"; + +import { spanToTraceRecord } from "./TraceRecord.ts"; +import { makeTraceSink } from "./TraceSink.ts"; + +export interface LocalFileTracerOptions { + readonly filePath: string; + readonly maxBytes: number; + readonly maxFiles: number; + readonly batchWindowMs: number; + readonly delegate?: Tracer.Tracer; +} + +class LocalFileSpan implements Tracer.Span { + readonly _tag = "Span"; + readonly name: string; + readonly spanId: string; + readonly traceId: string; + readonly parent: Option.Option; + readonly annotations: Tracer.Span["annotations"]; + readonly links: Array; + readonly sampled: boolean; + readonly kind: Tracer.SpanKind; + + status: Tracer.SpanStatus; + attributes: Map; + events: Array<[name: string, startTime: bigint, attributes: Record]>; + + constructor( + options: Parameters[0], + private readonly delegate: Tracer.Span, + private readonly push: (record: ReturnType) => void, + ) { + this.name = delegate.name; + this.spanId = delegate.spanId; + this.traceId = delegate.traceId; + 
this.parent = options.parent; + this.annotations = options.annotations; + this.links = [...options.links]; + this.sampled = delegate.sampled; + this.kind = delegate.kind; + this.status = { + _tag: "Started", + startTime: options.startTime, + }; + this.attributes = new Map(); + this.events = []; + } + + end(endTime: bigint, exit: Exit.Exit): void { + this.status = { + _tag: "Ended", + startTime: this.status.startTime, + endTime, + exit, + }; + this.delegate.end(endTime, exit); + + if (this.sampled) { + this.push(spanToTraceRecord(this)); + } + } + + attribute(key: string, value: unknown): void { + this.attributes.set(key, value); + this.delegate.attribute(key, value); + } + + event(name: string, startTime: bigint, attributes?: Record): void { + const nextAttributes = attributes ?? {}; + this.events.push([name, startTime, nextAttributes]); + this.delegate.event(name, startTime, nextAttributes); + } + + addLinks(links: ReadonlyArray): void { + this.links.push(...links); + this.delegate.addLinks(links); + } +} + +export const makeLocalFileTracer = Effect.fn("makeLocalFileTracer")(function* ( + options: LocalFileTracerOptions, +) { + const sink = yield* makeTraceSink({ + filePath: options.filePath, + maxBytes: options.maxBytes, + maxFiles: options.maxFiles, + batchWindowMs: options.batchWindowMs, + }); + + const delegate = + options.delegate ?? + Tracer.make({ + span: (spanOptions) => new Tracer.NativeSpan(spanOptions), + }); + + return Tracer.make({ + span(spanOptions) { + return new LocalFileSpan(spanOptions, delegate.span(spanOptions), sink.push); + }, + ...(delegate.context ? 
{ context: delegate.context } : {}), + }); +}); diff --git a/apps/server/src/observability/Metrics.test.ts b/apps/server/src/observability/Metrics.test.ts new file mode 100644 index 0000000000..148431b7b4 --- /dev/null +++ b/apps/server/src/observability/Metrics.test.ts @@ -0,0 +1,73 @@ +import { assert, describe, it } from "@effect/vitest"; +import { Effect, Metric } from "effect"; + +import { withMetrics } from "./Metrics.ts"; + +const hasMetricSnapshot = ( + snapshots: ReadonlyArray, + id: string, + attributes: Readonly>, +) => + snapshots.some( + (snapshot) => + snapshot.id === id && + Object.entries(attributes).every(([key, value]) => snapshot.attributes?.[key] === value), + ); + +describe("withMetrics", () => { + it.effect("supports pipe-style usage", () => + Effect.gen(function* () { + const counter = Metric.counter("with_metrics_pipe_total"); + const timer = Metric.timer("with_metrics_pipe_duration"); + + const result = yield* Effect.succeed("ok").pipe( + withMetrics({ + counter, + timer, + attributes: { + operation: "pipe", + }, + }), + ); + + assert.equal(result, "ok"); + + const snapshots = yield* Metric.snapshot; + assert.equal( + hasMetricSnapshot(snapshots, "with_metrics_pipe_total", { + operation: "pipe", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "with_metrics_pipe_duration", { + operation: "pipe", + }), + true, + ); + }), + ); + + it.effect("supports direct invocation", () => + Effect.gen(function* () { + const counter = Metric.counter("with_metrics_direct_total"); + + yield* withMetrics(Effect.fail("boom"), { + counter, + attributes: { + operation: "direct", + }, + }).pipe(Effect.exit); + + const snapshots = yield* Metric.snapshot; + assert.equal( + hasMetricSnapshot(snapshots, "with_metrics_direct_total", { + operation: "direct", + outcome: "failure", + }), + true, + ); + }), + ); +}); diff --git a/apps/server/src/observability/Metrics.ts b/apps/server/src/observability/Metrics.ts new file mode 100644 
index 0000000000..b6d1abb20a --- /dev/null +++ b/apps/server/src/observability/Metrics.ts @@ -0,0 +1,159 @@ +import { Duration, Effect, Exit, Metric } from "effect"; +import { dual } from "effect/Function"; + +import { + compactMetricAttributes, + normalizeModelMetricLabel, + outcomeFromExit, +} from "./Attributes.ts"; + +export const rpcRequestsTotal = Metric.counter("t3_rpc_requests_total", { + description: "Total RPC requests handled by the websocket RPC server.", +}); + +export const rpcRequestDuration = Metric.timer("t3_rpc_request_duration", { + description: "RPC request handling duration.", +}); + +export const orchestrationCommandsTotal = Metric.counter("t3_orchestration_commands_total", { + description: "Total orchestration commands dispatched.", +}); + +export const orchestrationCommandDuration = Metric.timer("t3_orchestration_command_duration", { + description: "Orchestration command dispatch duration.", +}); + +export const orchestrationEventsProcessedTotal = Metric.counter( + "t3_orchestration_events_processed_total", + { + description: "Total orchestration intent events processed by runtime reactors.", + }, +); + +export const providerSessionsTotal = Metric.counter("t3_provider_sessions_total", { + description: "Total provider session lifecycle operations.", +}); + +export const providerTurnsTotal = Metric.counter("t3_provider_turns_total", { + description: "Total provider turn lifecycle operations.", +}); + +export const providerTurnDuration = Metric.timer("t3_provider_turn_duration", { + description: "Provider turn request duration.", +}); + +export const providerRuntimeEventsTotal = Metric.counter("t3_provider_runtime_events_total", { + description: "Total canonical provider runtime events processed.", +}); + +export const gitCommandsTotal = Metric.counter("t3_git_commands_total", { + description: "Total git commands executed by the server runtime.", +}); + +export const gitCommandDuration = Metric.timer("t3_git_command_duration", { + description: 
"Git command execution duration.", +}); + +export const terminalSessionsTotal = Metric.counter("t3_terminal_sessions_total", { + description: "Total terminal sessions started.", +}); + +export const terminalRestartsTotal = Metric.counter("t3_terminal_restarts_total", { + description: "Total terminal restart requests handled.", +}); + +export const dbQueriesTotal = Metric.counter("t3_db_queries_total", { + description: "Total sqlite queries executed by the server runtime.", +}); + +export const dbQueryDuration = Metric.timer("t3_db_query_duration", { + description: "Sqlite query execution duration.", +}); + +export const metricAttributes = ( + attributes: Readonly>, +): ReadonlyArray<[string, string]> => Object.entries(compactMetricAttributes(attributes)); + +export const withMetricAttributes = Metric.withAttributes; + +export const increment = ( + metric: Metric.Metric, + attributes: Readonly>, + amount = 1, +) => Metric.update(Metric.withAttributes(metric, metricAttributes(attributes)), amount); + +export interface WithMetricsOptions { + readonly counter?: Metric.Metric; + readonly timer?: Metric.Metric; + readonly attributes?: Readonly>; + readonly outcomeAttributes?: ( + outcome: ReturnType, + ) => Readonly>; +} + +const withMetricsImpl = ( + effect: Effect.Effect, + options: WithMetricsOptions, +): Effect.Effect => + Effect.gen(function* () { + const startedAt = Date.now(); + const exit = yield* Effect.exit(effect); + const duration = Duration.millis(Math.max(0, Date.now() - startedAt)); + const baseAttributes = options.attributes ?? {}; + + if (options.timer) { + yield* Metric.update( + Metric.withAttributes(options.timer, metricAttributes(baseAttributes)), + duration, + ); + } + + if (options.counter) { + const outcome = outcomeFromExit(exit); + yield* Metric.update( + Metric.withAttributes( + options.counter, + metricAttributes({ + ...baseAttributes, + outcome, + ...(options.outcomeAttributes ? 
options.outcomeAttributes(outcome) : {}), + }), + ), + 1, + ); + } + + if (Exit.isSuccess(exit)) { + return exit.value; + } + return yield* Effect.failCause(exit.cause); + }); + +export const withMetrics: { + ( + options: WithMetricsOptions, + ): (effect: Effect.Effect) => Effect.Effect; + (effect: Effect.Effect, options: WithMetricsOptions): Effect.Effect; +} = dual(2, withMetricsImpl); + +export const providerMetricAttributes = ( + provider: string, + extra?: Readonly>, +) => + compactMetricAttributes({ + provider, + ...extra, + }); + +export const providerTurnMetricAttributes = (input: { + readonly provider: string; + readonly model: string | null | undefined; + readonly extra?: Readonly>; +}) => + compactMetricAttributes({ + provider: input.provider, + ...(normalizeModelMetricLabel(input.model) + ? { modelFamily: normalizeModelMetricLabel(input.model) } + : {}), + ...input.extra, + }); diff --git a/apps/server/src/observability/RpcInstrumentation.test.ts b/apps/server/src/observability/RpcInstrumentation.test.ts new file mode 100644 index 0000000000..c2a25170ed --- /dev/null +++ b/apps/server/src/observability/RpcInstrumentation.test.ts @@ -0,0 +1,97 @@ +import { assert, describe, it } from "@effect/vitest"; +import { Effect, Metric, Stream } from "effect"; + +import { observeRpcEffect, observeRpcStreamEffect } from "./RpcInstrumentation.ts"; + +const hasMetricSnapshot = ( + snapshots: ReadonlyArray, + id: string, + attributes: Readonly>, +) => + snapshots.some( + (snapshot) => + snapshot.id === id && + Object.entries(attributes).every(([key, value]) => snapshot.attributes?.[key] === value), + ); + +describe("RpcInstrumentation", () => { + it.effect("records success metrics for unary RPC handlers", () => + Effect.gen(function* () { + yield* observeRpcEffect("rpc.instrumentation.success", Effect.succeed("ok"), { + "rpc.aggregate": "test", + }).pipe(Effect.withSpan("rpc.instrumentation.success.span")); + + const snapshots = yield* Metric.snapshot; + + 
assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_requests_total", { + method: "rpc.instrumentation.success", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_request_duration", { + method: "rpc.instrumentation.success", + }), + true, + ); + }), + ); + + it.effect("records failure outcomes for unary RPC handlers", () => + Effect.gen(function* () { + yield* Effect.exit( + observeRpcEffect("rpc.instrumentation.failure", Effect.fail("boom"), { + "rpc.aggregate": "test", + }).pipe(Effect.withSpan("rpc.instrumentation.failure.span")), + ); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_requests_total", { + method: "rpc.instrumentation.failure", + outcome: "failure", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_request_duration", { + method: "rpc.instrumentation.failure", + }), + true, + ); + }), + ); + + it.effect("records subscription activation metrics for stream RPC handlers", () => + Effect.gen(function* () { + const events = yield* Stream.runCollect( + observeRpcStreamEffect( + "rpc.instrumentation.stream", + Effect.succeed(Stream.make("a", "b")), + { "rpc.aggregate": "test" }, + ).pipe(Stream.withSpan("rpc.instrumentation.stream.span")), + ); + + assert.deepStrictEqual(Array.from(events), ["a", "b"]); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_requests_total", { + method: "rpc.instrumentation.stream", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_request_duration", { + method: "rpc.instrumentation.stream", + }), + true, + ); + }), + ); +}); diff --git a/apps/server/src/observability/RpcInstrumentation.ts b/apps/server/src/observability/RpcInstrumentation.ts new file mode 100644 index 0000000000..9595de81c6 --- /dev/null +++ b/apps/server/src/observability/RpcInstrumentation.ts @@ -0,0 +1,37 @@ +import { Effect, Stream } 
from "effect"; + +import { rpcRequestDuration, rpcRequestsTotal, withMetrics } from "./Metrics.ts"; + +export const observeRpcEffect = ( + method: string, + effect: Effect.Effect, + traceAttributes?: Readonly>, +): Effect.Effect => + Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "rpc.method": method, + ...traceAttributes, + }); + + return yield* effect.pipe( + withMetrics({ + counter: rpcRequestsTotal, + timer: rpcRequestDuration, + attributes: { + method, + }, + }), + ); + }); + +export const observeRpcStream = ( + method: string, + stream: Stream.Stream, + traceAttributes?: Readonly>, +) => Stream.unwrap(observeRpcEffect(method, Effect.succeed(stream), traceAttributes)); + +export const observeRpcStreamEffect = ( + method: string, + effect: Effect.Effect, EffectError, EffectContext>, + traceAttributes?: Readonly>, +) => Stream.unwrap(observeRpcEffect(method, effect, traceAttributes)); diff --git a/apps/server/src/observability/TraceRecord.ts b/apps/server/src/observability/TraceRecord.ts new file mode 100644 index 0000000000..3aee1b267c --- /dev/null +++ b/apps/server/src/observability/TraceRecord.ts @@ -0,0 +1,100 @@ +import { Cause, Exit, Option, Tracer } from "effect"; + +import { compactTraceAttributes } from "./Attributes.ts"; + +export interface TraceRecord { + readonly type: "effect-span"; + readonly name: string; + readonly traceId: string; + readonly spanId: string; + readonly parentSpanId?: string; + readonly sampled: boolean; + readonly kind: Tracer.SpanKind; + readonly startTimeUnixNano: string; + readonly endTimeUnixNano: string; + readonly durationMs: number; + readonly attributes: Readonly>; + readonly events: ReadonlyArray<{ + readonly name: string; + readonly timeUnixNano: string; + readonly attributes: Readonly>; + }>; + readonly links: ReadonlyArray<{ + readonly traceId: string; + readonly spanId: string; + readonly attributes: Readonly>; + }>; + readonly exit: + | { + readonly _tag: "Success"; + } + | { + readonly _tag: 
"Interrupted"; + readonly cause: string; + } + | { + readonly _tag: "Failure"; + readonly cause: string; + }; +} + +interface SerializableSpan { + readonly name: string; + readonly traceId: string; + readonly spanId: string; + readonly parent: Option.Option; + readonly status: Tracer.SpanStatus; + readonly sampled: boolean; + readonly kind: Tracer.SpanKind; + readonly attributes: ReadonlyMap; + readonly links: ReadonlyArray; + readonly events: ReadonlyArray< + readonly [name: string, startTime: bigint, attributes: Record] + >; +} + +function formatTraceExit(exit: Exit.Exit): TraceRecord["exit"] { + if (Exit.isSuccess(exit)) { + return { _tag: "Success" }; + } + if (Cause.hasInterruptsOnly(exit.cause)) { + return { + _tag: "Interrupted", + cause: Cause.pretty(exit.cause), + }; + } + return { + _tag: "Failure", + cause: Cause.pretty(exit.cause), + }; +} + +export function spanToTraceRecord(span: SerializableSpan): TraceRecord { + const status = span.status as Extract; + const parentSpanId = Option.getOrUndefined(span.parent)?.spanId; + + return { + type: "effect-span", + name: span.name, + traceId: span.traceId, + spanId: span.spanId, + ...(parentSpanId ? 
{ parentSpanId } : {}), + sampled: span.sampled, + kind: span.kind, + startTimeUnixNano: String(status.startTime), + endTimeUnixNano: String(status.endTime), + durationMs: Number(status.endTime - status.startTime) / 1_000_000, + attributes: compactTraceAttributes(Object.fromEntries(span.attributes)), + events: span.events.map(([name, startTime, attributes]) => ({ + name, + timeUnixNano: String(startTime), + attributes: compactTraceAttributes(attributes), + })), + links: span.links.map((link) => ({ + traceId: link.span.traceId, + spanId: link.span.spanId, + attributes: compactTraceAttributes(link.attributes), + })), + exit: formatTraceExit(status.exit), + }; +} diff --git a/apps/server/src/observability/TraceSink.test.ts b/apps/server/src/observability/TraceSink.test.ts new file mode 100644 index 0000000000..c4021b3560 --- /dev/null +++ b/apps/server/src/observability/TraceSink.test.ts @@ -0,0 +1,108 @@ +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; + +import { assert, describe, it } from "@effect/vitest"; +import { Effect } from "effect"; + +import type { TraceRecord } from "./TraceRecord.ts"; +import { makeTraceSink } from "./TraceSink.ts"; + +const makeRecord = (name: string, suffix = ""): TraceRecord => ({ + type: "effect-span", + name, + traceId: `trace-${name}-${suffix}`, + spanId: `span-${name}-${suffix}`, + sampled: true, + kind: "internal", + startTimeUnixNano: "1", + endTimeUnixNano: "2", + durationMs: 1, + attributes: { + payload: suffix, + }, + events: [], + links: [], + exit: { + _tag: "Success", + }, +}); + +describe("TraceSink", () => { + it.effect("flushes buffered records on close", () => + Effect.scoped( + Effect.gen(function* () { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "t3-trace-sink-")); + const tracePath = path.join(tempDir, "server.trace.ndjson"); + + try { + const sink = yield* makeTraceSink({ + filePath: tracePath, + maxBytes: 1024, + maxFiles: 2, + batchWindowMs: 10_000, + }); + + 
sink.push(makeRecord("alpha")); + sink.push(makeRecord("beta")); + yield* sink.close(); + + const lines = fs + .readFileSync(tracePath, "utf8") + .trim() + .split("\n") + .map((line) => JSON.parse(line) as TraceRecord); + + assert.equal(lines.length, 2); + assert.equal(lines[0]?.name, "alpha"); + assert.equal(lines[1]?.name, "beta"); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }), + ), + ); + + it.effect("rotates the trace file when the configured max size is exceeded", () => + Effect.scoped( + Effect.gen(function* () { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "t3-trace-sink-")); + const tracePath = path.join(tempDir, "server.trace.ndjson"); + + try { + const sink = yield* makeTraceSink({ + filePath: tracePath, + maxBytes: 180, + maxFiles: 2, + batchWindowMs: 10_000, + }); + + for (let index = 0; index < 8; index += 1) { + sink.push(makeRecord("rotate", `${index}-${"x".repeat(48)}`)); + yield* sink.flush; + } + yield* sink.close(); + + const matchingFiles = fs + .readdirSync(tempDir) + .filter( + (entry) => + entry === "server.trace.ndjson" || entry.startsWith("server.trace.ndjson."), + ) + .toSorted(); + + assert.equal( + matchingFiles.some((entry) => entry === "server.trace.ndjson.1"), + true, + ); + assert.equal( + matchingFiles.some((entry) => entry === "server.trace.ndjson.3"), + false, + ); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }), + ), + ); +}); diff --git a/apps/server/src/observability/TraceSink.ts b/apps/server/src/observability/TraceSink.ts new file mode 100644 index 0000000000..8989a0b640 --- /dev/null +++ b/apps/server/src/observability/TraceSink.ts @@ -0,0 +1,68 @@ +import { RotatingFileSink } from "@t3tools/shared/logging"; +import { Effect } from "effect"; + +import type { TraceRecord } from "./TraceRecord.ts"; + +const FLUSH_BUFFER_THRESHOLD = 32; + +export interface TraceSinkOptions { + readonly filePath: string; + readonly maxBytes: number; + readonly maxFiles: 
number; + readonly batchWindowMs: number; +} + +export interface TraceSink { + readonly filePath: string; + push: (record: TraceRecord) => void; + flush: Effect.Effect; + close: () => Effect.Effect; +} + +export const makeTraceSink = Effect.fn("makeTraceSink")(function* (options: TraceSinkOptions) { + const sink = new RotatingFileSink({ + filePath: options.filePath, + maxBytes: options.maxBytes, + maxFiles: options.maxFiles, + }); + + let buffer: Array = []; + + const flushUnsafe = () => { + if (buffer.length === 0) { + return; + } + + const chunk = buffer.join(""); + buffer = []; + + try { + sink.write(chunk); + } catch { + buffer = []; + } + }; + + const flush = Effect.sync(flushUnsafe).pipe(Effect.withTracerEnabled(false)); + + yield* Effect.addFinalizer(() => flush.pipe(Effect.ignore)); + yield* Effect.forkScoped( + Effect.sleep(`${options.batchWindowMs} millis`).pipe(Effect.andThen(flush), Effect.forever), + ); + + return { + filePath: options.filePath, + push(record) { + try { + buffer.push(`${JSON.stringify(record)}\n`); + if (buffer.length >= FLUSH_BUFFER_THRESHOLD) { + flushUnsafe(); + } + } catch { + buffer = []; + } + }, + flush, + close: () => flush, + } satisfies TraceSink; +}); diff --git a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts index 5c52379f47..3bde1a1e7a 100644 --- a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts +++ b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts @@ -8,6 +8,11 @@ import { OrchestrationCommand } from "@t3tools/contracts"; import { Deferred, Effect, Layer, Option, PubSub, Queue, Schema, Stream } from "effect"; import * as SqlClient from "effect/unstable/sql/SqlClient"; +import { + orchestrationCommandsTotal, + orchestrationCommandDuration, + withMetrics, +} from "../../observability/Metrics.ts"; import { toPersistenceSqlError } from "../../persistence/Errors.ts"; import { OrchestrationEventStore } from 
"../../persistence/Services/OrchestrationEventStore.ts"; import { OrchestrationCommandReceiptRepository } from "../../persistence/Services/OrchestrationCommandReceipts.ts"; @@ -62,6 +67,7 @@ const makeOrchestrationEngine = Effect.gen(function* () { const processEnvelope = (envelope: CommandEnvelope): Effect.Effect => { const dispatchStartSequence = readModel.snapshotSequence; + const aggregateRef = commandToAggregateRef(envelope.command); const reconcileReadModelAfterDispatchFailure = Effect.gen(function* () { const persistedEvents = yield* Stream.runCollect( eventStore.readFromSequence(dispatchStartSequence), @@ -82,6 +88,13 @@ const makeOrchestrationEngine = Effect.gen(function* () { }); return Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "orchestration.command_id": envelope.command.commandId, + "orchestration.command_type": envelope.command.type, + "orchestration.aggregate_kind": aggregateRef.aggregateKind, + "orchestration.aggregate_id": aggregateRef.aggregateId, + }); + const existingReceipt = yield* commandReceiptRepository.getByCommandId({ commandId: envelope.command.commandId, }); @@ -175,7 +188,6 @@ const makeOrchestrationEngine = Effect.gen(function* () { ); if (Schema.is(OrchestrationCommandInvariantError)(error)) { - const aggregateRef = commandToAggregateRef(envelope.command); yield* commandReceiptRepository .upsert({ commandId: envelope.command.commandId, @@ -191,6 +203,16 @@ const makeOrchestrationEngine = Effect.gen(function* () { yield* Deferred.fail(envelope.result, error); }), ), + Effect.withSpan(`orchestration.command.${envelope.command.type}`), + Effect.asVoid, + withMetrics({ + counter: orchestrationCommandsTotal, + timer: orchestrationCommandDuration, + attributes: { + commandType: envelope.command.type, + aggregateKind: aggregateRef.aggregateKind, + }, + }), ); }; diff --git a/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts b/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts index 
8e2a19bcd1..2fcbe7ca24 100644 --- a/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts +++ b/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts @@ -16,6 +16,7 @@ import { makeDrainableWorker } from "@t3tools/shared/DrainableWorker"; import { resolveThreadWorkspaceCwd } from "../../checkpointing/Utils.ts"; import { GitCore } from "../../git/Services/GitCore.ts"; +import { increment, orchestrationEventsProcessedTotal } from "../../observability/Metrics.ts"; import { ProviderAdapterRequestError, ProviderServiceError } from "../../provider/Errors.ts"; import { TextGeneration } from "../../git/Services/TextGeneration.ts"; import { ProviderService } from "../../provider/Services/ProviderService.ts"; @@ -727,6 +728,14 @@ const make = Effect.gen(function* () { const processDomainEvent = Effect.fn("processDomainEvent")(function* ( event: ProviderIntentEvent, ) { + yield* Effect.annotateCurrentSpan({ + "orchestration.event_type": event.type, + "orchestration.thread_id": event.payload.threadId, + ...(event.commandId ? 
{ "orchestration.command_id": event.commandId } : {}), + }); + yield* increment(orchestrationEventsProcessedTotal, { + eventType: event.type, + }); switch (event.type) { case "thread.runtime-mode-set": { const thread = yield* resolveThread(event.payload.threadId); diff --git a/apps/server/src/persistence/Layers/Sqlite.ts b/apps/server/src/persistence/Layers/Sqlite.ts index 07cbf44794..58556099db 100644 --- a/apps/server/src/persistence/Layers/Sqlite.ts +++ b/apps/server/src/persistence/Layers/Sqlite.ts @@ -6,6 +6,7 @@ import { ServerConfig } from "../../config.ts"; type RuntimeSqliteLayerConfig = { readonly filename: string; + readonly spanAttributes?: Record; }; type Loader = { @@ -41,7 +42,16 @@ export const makeSqlitePersistenceLive = Effect.fn("makeSqlitePersistenceLive")( const path = yield* Path.Path; yield* fs.makeDirectory(path.dirname(dbPath), { recursive: true }); - return Layer.provideMerge(setup, makeRuntimeSqliteLayer({ filename: dbPath })); + return Layer.provideMerge( + setup, + makeRuntimeSqliteLayer({ + filename: dbPath, + spanAttributes: { + "db.name": path.basename(dbPath), + "service.name": "t3-server", + }, + }), + ); }, Layer.unwrap); export const SqlitePersistenceMemory = Layer.provideMerge( diff --git a/apps/server/src/persistence/NodeSqliteClient.ts b/apps/server/src/persistence/NodeSqliteClient.ts index 77c316e8ca..803e5afe62 100644 --- a/apps/server/src/persistence/NodeSqliteClient.ts +++ b/apps/server/src/persistence/NodeSqliteClient.ts @@ -23,6 +23,8 @@ import type { Connection } from "effect/unstable/sql/SqlConnection"; import { SqlError, classifySqliteError } from "effect/unstable/sql/SqlError"; import * as Statement from "effect/unstable/sql/Statement"; +import { dbQueriesTotal, dbQueryDuration, withMetrics } from "../observability/Metrics.ts"; + const ATTR_DB_SYSTEM_NAME = "db.system.name"; export const TypeId: TypeId = "~local/sqlite-node/SqliteClient"; @@ -82,6 +84,9 @@ const makeWithDatabase = 
Effect.fn("makeWithDatabase")(function* ( const transformRows = options.transformResultNames ? Statement.defaultTransforms(options.transformResultNames).array : undefined; + const databaseMetricAttributes = { + database: options.filename === ":memory:" ? "memory" : "disk", + } as const; const makeConnection = Effect.gen(function* () { const scope = yield* Effect.scope; @@ -137,7 +142,13 @@ const makeWithDatabase = Effect.fn("makeWithDatabase")(function* ( }), ); } - }); + }).pipe( + withMetrics({ + counter: dbQueriesTotal, + timer: dbQueryDuration, + attributes: databaseMetricAttributes, + }), + ); const run = (sql: string, params: ReadonlyArray, raw = false) => Effect.flatMap(Cache.get(prepareCache, sql), (s) => runStatement(s, params, raw)); @@ -172,6 +183,12 @@ const makeWithDatabase = Effect.fn("makeWithDatabase")(function* ( statement.setReturnArrays(false); } }), + ).pipe( + withMetrics({ + counter: dbQueriesTotal, + timer: dbQueryDuration, + attributes: databaseMetricAttributes, + }), ); return identity({ diff --git a/apps/server/src/provider/Layers/ProviderService.ts b/apps/server/src/provider/Layers/ProviderService.ts index f348829598..c3947805a6 100644 --- a/apps/server/src/provider/Layers/ProviderService.ts +++ b/apps/server/src/provider/Layers/ProviderService.ts @@ -24,6 +24,16 @@ import { } from "@t3tools/contracts"; import { Effect, Layer, Option, PubSub, Queue, Schema, SchemaIssue, Stream } from "effect"; +import { + increment, + providerMetricAttributes, + providerRuntimeEventsTotal, + providerSessionsTotal, + providerTurnDuration, + providerTurnsTotal, + providerTurnMetricAttributes, + withMetrics, +} from "../../observability/Metrics.ts"; import { ProviderValidationError } from "../Errors.ts"; import { ProviderAdapterRegistry } from "../Services/ProviderAdapterRegistry.ts"; import { ProviderService, type ProviderServiceShape } from "../Services/ProviderService.ts"; @@ -179,7 +189,10 @@ const makeProviderService = 
Effect.fn("makeProviderService")(function* ( const providers = yield* registry.listProviders(); const adapters = yield* Effect.forEach(providers, (provider) => registry.getByProvider(provider)); const processRuntimeEvent = (event: ProviderRuntimeEvent): Effect.Effect => - publishRuntimeEvent(event); + increment(providerRuntimeEventsTotal, { + provider: event.provider, + eventType: event.type, + }).pipe(Effect.andThen(publishRuntimeEvent(event))); const worker = Effect.forever( Queue.take(runtimeEventQueue).pipe(Effect.flatMap(processRuntimeEvent)), @@ -196,58 +209,72 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( readonly binding: ProviderRuntimeBinding; readonly operation: string; }) { - const adapter = yield* registry.getByProvider(input.binding.provider); - const hasResumeCursor = - input.binding.resumeCursor !== null && input.binding.resumeCursor !== undefined; - const hasActiveSession = yield* adapter.hasSession(input.binding.threadId); - if (hasActiveSession) { - const activeSessions = yield* adapter.listSessions(); - const existing = activeSessions.find( - (session) => session.threadId === input.binding.threadId, - ); - if (existing) { - yield* upsertSessionBinding(existing, input.binding.threadId); - yield* analytics.record("provider.session.recovered", { - provider: existing.provider, - strategy: "adopt-existing", - hasResumeCursor: existing.resumeCursor !== undefined, - }); - return { adapter, session: existing } as const; + yield* Effect.annotateCurrentSpan({ + "provider.operation": "recover-session", + "provider.kind": input.binding.provider, + "provider.thread_id": input.binding.threadId, + }); + return yield* Effect.gen(function* () { + const adapter = yield* registry.getByProvider(input.binding.provider); + const hasResumeCursor = + input.binding.resumeCursor !== null && input.binding.resumeCursor !== undefined; + const hasActiveSession = yield* adapter.hasSession(input.binding.threadId); + if (hasActiveSession) { + const 
activeSessions = yield* adapter.listSessions(); + const existing = activeSessions.find( + (session) => session.threadId === input.binding.threadId, + ); + if (existing) { + yield* upsertSessionBinding(existing, input.binding.threadId); + yield* analytics.record("provider.session.recovered", { + provider: existing.provider, + strategy: "adopt-existing", + hasResumeCursor: existing.resumeCursor !== undefined, + }); + return { adapter, session: existing } as const; + } } - } - if (!hasResumeCursor) { - return yield* toValidationError( - input.operation, - `Cannot recover thread '${input.binding.threadId}' because no provider resume state is persisted.`, - ); - } + if (!hasResumeCursor) { + return yield* toValidationError( + input.operation, + `Cannot recover thread '${input.binding.threadId}' because no provider resume state is persisted.`, + ); + } - const persistedCwd = readPersistedCwd(input.binding.runtimePayload); - const persistedModelSelection = readPersistedModelSelection(input.binding.runtimePayload); + const persistedCwd = readPersistedCwd(input.binding.runtimePayload); + const persistedModelSelection = readPersistedModelSelection(input.binding.runtimePayload); - const resumed = yield* adapter.startSession({ - threadId: input.binding.threadId, - provider: input.binding.provider, - ...(persistedCwd ? { cwd: persistedCwd } : {}), - ...(persistedModelSelection ? { modelSelection: persistedModelSelection } : {}), - ...(hasResumeCursor ? { resumeCursor: input.binding.resumeCursor } : {}), - runtimeMode: input.binding.runtimeMode ?? "full-access", - }); - if (resumed.provider !== adapter.provider) { - return yield* toValidationError( - input.operation, - `Adapter/provider mismatch while recovering thread '${input.binding.threadId}'. Expected '${adapter.provider}', received '${resumed.provider}'.`, - ); - } + const resumed = yield* adapter.startSession({ + threadId: input.binding.threadId, + provider: input.binding.provider, + ...(persistedCwd ? 
{ cwd: persistedCwd } : {}), + ...(persistedModelSelection ? { modelSelection: persistedModelSelection } : {}), + ...(hasResumeCursor ? { resumeCursor: input.binding.resumeCursor } : {}), + runtimeMode: input.binding.runtimeMode ?? "full-access", + }); + if (resumed.provider !== adapter.provider) { + return yield* toValidationError( + input.operation, + `Adapter/provider mismatch while recovering thread '${input.binding.threadId}'. Expected '${adapter.provider}', received '${resumed.provider}'.`, + ); + } - yield* upsertSessionBinding(resumed, input.binding.threadId); - yield* analytics.record("provider.session.recovered", { - provider: resumed.provider, - strategy: "resume-thread", - hasResumeCursor: resumed.resumeCursor !== undefined, - }); - return { adapter, session: resumed } as const; + yield* upsertSessionBinding(resumed, input.binding.threadId); + yield* analytics.record("provider.session.recovered", { + provider: resumed.provider, + strategy: "resume-thread", + hasResumeCursor: resumed.resumeCursor !== undefined, + }); + return { adapter, session: resumed } as const; + }).pipe( + withMetrics({ + counter: providerSessionsTotal, + attributes: providerMetricAttributes(input.binding.provider, { + operation: "recover", + }), + }), + ); }); const resolveRoutableSession = Effect.fn("resolveRoutableSession")(function* (input: { @@ -291,52 +318,69 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( threadId, provider: parsed.provider ?? 
"codex", }; - const settings = yield* serverSettings.getSettings.pipe( - Effect.mapError((error) => - toValidationError( - "ProviderService.startSession", - `Failed to load provider settings: ${error.message}`, - error, + yield* Effect.annotateCurrentSpan({ + "provider.operation": "start-session", + "provider.kind": input.provider, + "provider.thread_id": threadId, + "provider.runtime_mode": input.runtimeMode, + }); + return yield* Effect.gen(function* () { + const settings = yield* serverSettings.getSettings.pipe( + Effect.mapError((error) => + toValidationError( + "ProviderService.startSession", + `Failed to load provider settings: ${error.message}`, + error, + ), ), - ), - ); - if (!settings.providers[input.provider].enabled) { - return yield* toValidationError( - "ProviderService.startSession", - `Provider '${input.provider}' is disabled in T3 Code settings.`, ); - } - const persistedBinding = Option.getOrUndefined(yield* directory.getBinding(threadId)); - const effectiveResumeCursor = - input.resumeCursor ?? - (persistedBinding?.provider === input.provider ? persistedBinding.resumeCursor : undefined); - const adapter = yield* registry.getByProvider(input.provider); - const session = yield* adapter.startSession({ - ...input, - ...(effectiveResumeCursor !== undefined ? { resumeCursor: effectiveResumeCursor } : {}), - }); + if (!settings.providers[input.provider].enabled) { + return yield* toValidationError( + "ProviderService.startSession", + `Provider '${input.provider}' is disabled in T3 Code settings.`, + ); + } + const persistedBinding = Option.getOrUndefined(yield* directory.getBinding(threadId)); + const effectiveResumeCursor = + input.resumeCursor ?? + (persistedBinding?.provider === input.provider + ? persistedBinding.resumeCursor + : undefined); + const adapter = yield* registry.getByProvider(input.provider); + const session = yield* adapter.startSession({ + ...input, + ...(effectiveResumeCursor !== undefined ? 
{ resumeCursor: effectiveResumeCursor } : {}), + }); - if (session.provider !== adapter.provider) { - return yield* toValidationError( - "ProviderService.startSession", - `Adapter/provider mismatch: requested '${adapter.provider}', received '${session.provider}'.`, - ); - } + if (session.provider !== adapter.provider) { + return yield* toValidationError( + "ProviderService.startSession", + `Adapter/provider mismatch: requested '${adapter.provider}', received '${session.provider}'.`, + ); + } - yield* upsertSessionBinding(session, threadId, { - modelSelection: input.modelSelection, - }); - yield* analytics.record("provider.session.started", { - provider: session.provider, - runtimeMode: input.runtimeMode, - hasResumeCursor: session.resumeCursor !== undefined, - hasCwd: typeof input.cwd === "string" && input.cwd.trim().length > 0, - hasModel: - typeof input.modelSelection?.model === "string" && - input.modelSelection.model.trim().length > 0, - }); + yield* upsertSessionBinding(session, threadId, { + modelSelection: input.modelSelection, + }); + yield* analytics.record("provider.session.started", { + provider: session.provider, + runtimeMode: input.runtimeMode, + hasResumeCursor: session.resumeCursor !== undefined, + hasCwd: typeof input.cwd === "string" && input.cwd.trim().length > 0, + hasModel: + typeof input.modelSelection?.model === "string" && + input.modelSelection.model.trim().length > 0, + }); - return session; + return session; + }).pipe( + withMetrics({ + counter: providerSessionsTotal, + attributes: providerMetricAttributes(input.provider, { + operation: "start", + }), + }), + ); }, ); @@ -357,32 +401,56 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( "Either input text or at least one attachment is required", ); } - const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.sendTurn", - allowRecovery: true, - }); - const turn = yield* routed.adapter.sendTurn(input); - yield* 
directory.upsert({ - threadId: input.threadId, - provider: routed.adapter.provider, - status: "running", - ...(turn.resumeCursor !== undefined ? { resumeCursor: turn.resumeCursor } : {}), - runtimePayload: { - ...(input.modelSelection !== undefined ? { modelSelection: input.modelSelection } : {}), - activeTurnId: turn.turnId, - lastRuntimeEvent: "provider.sendTurn", - lastRuntimeEventAt: new Date().toISOString(), - }, + yield* Effect.annotateCurrentSpan({ + "provider.operation": "send-turn", + "provider.thread_id": input.threadId, + "provider.interaction_mode": input.interactionMode, + "provider.attachment_count": input.attachments.length, }); - yield* analytics.record("provider.turn.sent", { - provider: routed.adapter.provider, - model: input.modelSelection?.model, - interactionMode: input.interactionMode, - attachmentCount: input.attachments.length, - hasInput: typeof input.input === "string" && input.input.trim().length > 0, - }); - return turn; + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.sendTurn", + allowRecovery: true, + }); + yield* Effect.annotateCurrentSpan({ + "provider.kind": routed.adapter.provider, + ...(input.modelSelection?.model ? { "provider.model": input.modelSelection.model } : {}), + }); + const turn = yield* routed.adapter.sendTurn(input); + yield* directory.upsert({ + threadId: input.threadId, + provider: routed.adapter.provider, + status: "running", + ...(turn.resumeCursor !== undefined ? { resumeCursor: turn.resumeCursor } : {}), + runtimePayload: { + ...(input.modelSelection !== undefined ? 
{ modelSelection: input.modelSelection } : {}), + activeTurnId: turn.turnId, + lastRuntimeEvent: "provider.sendTurn", + lastRuntimeEventAt: new Date().toISOString(), + }, + }); + yield* analytics.record("provider.turn.sent", { + provider: routed.adapter.provider, + model: input.modelSelection?.model, + interactionMode: input.interactionMode, + attachmentCount: input.attachments.length, + hasInput: typeof input.input === "string" && input.input.trim().length > 0, + }); + return turn; + }).pipe( + withMetrics({ + counter: providerTurnsTotal, + timer: providerTurnDuration, + attributes: providerTurnMetricAttributes({ + provider: input.modelSelection?.provider ?? "unknown", + model: input.modelSelection?.model, + extra: { + operation: "send", + }, + }), + }), + ); }); const interruptTurn: ProviderServiceShape["interruptTurn"] = Effect.fn("interruptTurn")( @@ -392,15 +460,30 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderInterruptTurnInput, payload: rawInput, }); - const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.interruptTurn", - allowRecovery: true, - }); - yield* routed.adapter.interruptTurn(routed.threadId, input.turnId); - yield* analytics.record("provider.turn.interrupted", { - provider: routed.adapter.provider, - }); + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.interruptTurn", + allowRecovery: true, + }); + yield* Effect.annotateCurrentSpan({ + "provider.operation": "interrupt-turn", + "provider.kind": routed.adapter.provider, + "provider.thread_id": input.threadId, + "provider.turn_id": input.turnId, + }); + yield* routed.adapter.interruptTurn(routed.threadId, input.turnId); + yield* analytics.record("provider.turn.interrupted", { + provider: routed.adapter.provider, + }); + }).pipe( + withMetrics({ + counter: providerTurnsTotal, + attributes: 
providerMetricAttributes("unknown", { + operation: "interrupt", + }), + }), + ); }, ); @@ -411,16 +494,31 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderRespondToRequestInput, payload: rawInput, }); - const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.respondToRequest", - allowRecovery: true, - }); - yield* routed.adapter.respondToRequest(routed.threadId, input.requestId, input.decision); - yield* analytics.record("provider.request.responded", { - provider: routed.adapter.provider, - decision: input.decision, - }); + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.respondToRequest", + allowRecovery: true, + }); + yield* Effect.annotateCurrentSpan({ + "provider.operation": "respond-to-request", + "provider.kind": routed.adapter.provider, + "provider.thread_id": input.threadId, + "provider.request_id": input.requestId, + }); + yield* routed.adapter.respondToRequest(routed.threadId, input.requestId, input.decision); + yield* analytics.record("provider.request.responded", { + provider: routed.adapter.provider, + decision: input.decision, + }); + }).pipe( + withMetrics({ + counter: providerTurnsTotal, + attributes: providerMetricAttributes("unknown", { + operation: "approval-response", + }), + }), + ); }, ); @@ -432,12 +530,27 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderRespondToUserInputInput, payload: rawInput, }); - const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.respondToUserInput", - allowRecovery: true, - }); - yield* routed.adapter.respondToUserInput(routed.threadId, input.requestId, input.answers); + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.respondToUserInput", 
+ allowRecovery: true, + }); + yield* Effect.annotateCurrentSpan({ + "provider.operation": "respond-to-user-input", + "provider.kind": routed.adapter.provider, + "provider.thread_id": input.threadId, + "provider.request_id": input.requestId, + }); + yield* routed.adapter.respondToUserInput(routed.threadId, input.requestId, input.answers); + }).pipe( + withMetrics({ + counter: providerTurnsTotal, + attributes: providerMetricAttributes("unknown", { + operation: "user-input-response", + }), + }), + ); }); const stopSession: ProviderServiceShape["stopSession"] = Effect.fn("stopSession")( @@ -447,18 +560,32 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderStopSessionInput, payload: rawInput, }); - const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.stopSession", - allowRecovery: false, - }); - if (routed.isActive) { - yield* routed.adapter.stopSession(routed.threadId); - } - yield* directory.remove(input.threadId); - yield* analytics.record("provider.session.stopped", { - provider: routed.adapter.provider, - }); + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.stopSession", + allowRecovery: false, + }); + yield* Effect.annotateCurrentSpan({ + "provider.operation": "stop-session", + "provider.kind": routed.adapter.provider, + "provider.thread_id": input.threadId, + }); + if (routed.isActive) { + yield* routed.adapter.stopSession(routed.threadId); + } + yield* directory.remove(input.threadId); + yield* analytics.record("provider.session.stopped", { + provider: routed.adapter.provider, + }); + }).pipe( + withMetrics({ + counter: providerSessionsTotal, + attributes: providerMetricAttributes("unknown", { + operation: "stop", + }), + }), + ); }, ); @@ -524,16 +651,31 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( if (input.numTurns === 0) { return; } - 
const routed = yield* resolveRoutableSession({ - threadId: input.threadId, - operation: "ProviderService.rollbackConversation", - allowRecovery: true, - }); - yield* routed.adapter.rollbackThread(routed.threadId, input.numTurns); - yield* analytics.record("provider.conversation.rolled_back", { - provider: routed.adapter.provider, - turns: input.numTurns, - }); + return yield* Effect.gen(function* () { + const routed = yield* resolveRoutableSession({ + threadId: input.threadId, + operation: "ProviderService.rollbackConversation", + allowRecovery: true, + }); + yield* Effect.annotateCurrentSpan({ + "provider.operation": "rollback-conversation", + "provider.kind": routed.adapter.provider, + "provider.thread_id": input.threadId, + "provider.rollback_turns": input.numTurns, + }); + yield* routed.adapter.rollbackThread(routed.threadId, input.numTurns); + yield* analytics.record("provider.conversation.rolled_back", { + provider: routed.adapter.provider, + turns: input.numTurns, + }); + }).pipe( + withMetrics({ + counter: providerTurnsTotal, + attributes: providerMetricAttributes("unknown", { + operation: "rollback", + }), + }), + ); }); const runStopAll = Effect.fn("runStopAll")(function* () { diff --git a/apps/server/src/server.test.ts b/apps/server/src/server.test.ts index 0e53eae43e..a600f2043e 100644 --- a/apps/server/src/server.test.ts +++ b/apps/server/src/server.test.ts @@ -139,8 +139,17 @@ const buildAppUnderTest = (options?: { const baseDir = options?.config?.baseDir ?? 
tempBaseDir; const devUrl = options?.config?.devUrl; const derivedPaths = yield* deriveServerPaths(baseDir, devUrl); - const config = { + const config: ServerConfigShape = { logLevel: "Info", + traceMinLevel: "Info", + traceTimingEnabled: true, + traceBatchWindowMs: 200, + traceMaxBytes: 10 * 1024 * 1024, + traceMaxFiles: 10, + otlpTracesUrl: undefined, + otlpMetricsUrl: undefined, + otlpExportIntervalMs: 10_000, + otlpServiceName: "t3-server", mode: "web", port: 0, host: "127.0.0.1", @@ -154,7 +163,7 @@ const buildAppUnderTest = (options?: { autoBootstrapProjectFromCwd: false, logWebSocketEvents: false, ...options?.config, - } satisfies ServerConfigShape; + }; const layerConfig = Layer.succeed(ServerConfig, config); const appLayer = HttpRouter.serve(makeRoutesLayer, { diff --git a/apps/server/src/server.ts b/apps/server/src/server.ts index 40a8eb09bc..a6d111dd64 100644 --- a/apps/server/src/server.ts +++ b/apps/server/src/server.ts @@ -29,7 +29,6 @@ import { RoutingTextGenerationLive } from "./git/Layers/RoutingTextGeneration"; import { TerminalManagerLive } from "./terminal/Layers/Manager"; import { GitManagerLive } from "./git/Layers/GitManager"; import { KeybindingsLive } from "./keybindings"; -import { ServerLoggerLive } from "./serverLogger"; import { ServerRuntimeStartup, ServerRuntimeStartupLive } from "./serverRuntimeStartup"; import { OrchestrationReactorLive } from "./orchestration/Layers/OrchestrationReactor"; import { RuntimeReceiptBusLive } from "./orchestration/Layers/RuntimeReceiptBus"; @@ -42,6 +41,7 @@ import { ProjectFaviconResolverLive } from "./project/Layers/ProjectFaviconResol import { WorkspaceEntriesLive } from "./workspace/Layers/WorkspaceEntries"; import { WorkspaceFileSystemLive } from "./workspace/Layers/WorkspaceFileSystem"; import { WorkspacePathsLive } from "./workspace/Layers/WorkspacePaths"; +import { ObservabilityLive } from "./observability/Layers/Observability"; const PtyAdapterLive = Layer.unwrap( Effect.gen(function* () { @@ 
-219,7 +219,7 @@ export const makeServerLayer = Layer.unwrap( return serverApplicationLayer.pipe( Layer.provideMerge(RuntimeServicesLive), Layer.provideMerge(HttpServerLive), - Layer.provide(ServerLoggerLive), + Layer.provide(ObservabilityLive), Layer.provideMerge(FetchHttpClient.layer), Layer.provideMerge(PlatformServicesLive), ); diff --git a/apps/server/src/serverLogger.ts b/apps/server/src/serverLogger.ts index aea53aacfb..ea098dcbbe 100644 --- a/apps/server/src/serverLogger.ts +++ b/apps/server/src/serverLogger.ts @@ -4,11 +4,8 @@ import { ServerConfig } from "./config"; export const ServerLoggerLive = Effect.gen(function* () { const config = yield* ServerConfig; - const { serverLogPath } = config; - - const fileLogger = Logger.formatSimple.pipe(Logger.toFile(serverLogPath)); const minimumLogLevelLayer = Layer.succeed(References.MinimumLogLevel, config.logLevel); - const loggerLayer = Logger.layer([Logger.consolePretty(), fileLogger], { + const loggerLayer = Logger.layer([Logger.consolePretty(), Logger.tracerLogger], { mergeWithExisting: false, }); diff --git a/apps/server/src/serverRuntimeStartup.ts b/apps/server/src/serverRuntimeStartup.ts index 2457f6ffe8..2749d36991 100644 --- a/apps/server/src/serverRuntimeStartup.ts +++ b/apps/server/src/serverRuntimeStartup.ts @@ -234,7 +234,14 @@ const maybeOpenBrowser = Effect.gen(function* () { ); }); +const runStartupPhase = (phase: string, effect: Effect.Effect) => + effect.pipe( + Effect.annotateSpans({ "startup.phase": phase }), + Effect.withSpan(`server.startup.${phase}`), + ); + const makeServerRuntimeStartup = Effect.gen(function* () { + const serverConfig = yield* ServerConfig; const keybindings = yield* Keybindings; const orchestrationReactor = yield* OrchestrationReactor; const lifecycleEvents = yield* ServerLifecycleEvents; @@ -248,46 +255,65 @@ const makeServerRuntimeStartup = Effect.gen(function* () { const startup = Effect.gen(function* () { yield* Effect.logDebug("startup phase: starting keybindings 
runtime"); - yield* keybindings.start.pipe( - Effect.catch((error) => - Effect.logWarning("failed to start keybindings runtime", { - path: error.configPath, - detail: error.detail, - cause: error.cause, - }), + yield* runStartupPhase( + "keybindings.start", + keybindings.start.pipe( + Effect.catch((error) => + Effect.logWarning("failed to start keybindings runtime", { + path: error.configPath, + detail: error.detail, + cause: error.cause, + }), + ), + Effect.forkScoped, ), - Effect.forkScoped, ); yield* Effect.logDebug("startup phase: starting server settings runtime"); - yield* serverSettings.start.pipe( - Effect.catch((error) => - Effect.logWarning("failed to start server settings runtime", { - path: error.settingsPath, - detail: error.detail, - cause: error.cause, - }), + yield* runStartupPhase( + "settings.start", + serverSettings.start.pipe( + Effect.catch((error) => + Effect.logWarning("failed to start server settings runtime", { + path: error.settingsPath, + detail: error.detail, + cause: error.cause, + }), + ), + Effect.forkScoped, ), - Effect.forkScoped, ); yield* Effect.logDebug("startup phase: starting orchestration reactors"); - yield* orchestrationReactor.start().pipe(Scope.provide(reactorScope)); + yield* runStartupPhase( + "reactors.start", + orchestrationReactor.start().pipe(Scope.provide(reactorScope)), + ); yield* Effect.logDebug("startup phase: preparing welcome payload"); - const welcome = yield* autoBootstrapWelcome; + const welcome = yield* runStartupPhase("welcome.prepare", autoBootstrapWelcome); yield* Effect.logDebug("startup phase: publishing welcome event", { cwd: welcome.cwd, projectName: welcome.projectName, bootstrapProjectId: welcome.bootstrapProjectId, bootstrapThreadId: welcome.bootstrapThreadId, }); - yield* lifecycleEvents.publish({ - version: 1, - type: "welcome", - payload: welcome, - }); - }); + yield* runStartupPhase( + "welcome.publish", + lifecycleEvents.publish({ + version: 1, + type: "welcome", + payload: welcome, + }), + 
); + }).pipe( + Effect.annotateSpans({ + "server.mode": serverConfig.mode, + "server.port": serverConfig.port, + "server.host": serverConfig.host ?? "default", + }), + Effect.withSpan("server.startup", { kind: "server", root: true }), + ); yield* Effect.forkScoped( Effect.gen(function* () { @@ -305,18 +331,21 @@ const makeServerRuntimeStartup = Effect.gen(function* () { yield* Effect.logDebug("Accepting commands"); yield* commandGate.signalCommandReady; yield* Effect.logDebug("startup phase: waiting for http listener"); - yield* Deferred.await(httpListening); + yield* runStartupPhase("http.wait", Deferred.await(httpListening)); yield* Effect.logDebug("startup phase: publishing ready event"); - yield* lifecycleEvents.publish({ - version: 1, - type: "ready", - payload: { at: new Date().toISOString() }, - }); + yield* runStartupPhase( + "ready.publish", + lifecycleEvents.publish({ + version: 1, + type: "ready", + payload: { at: new Date().toISOString() }, + }), + ); yield* Effect.logDebug("startup phase: recording startup heartbeat"); - yield* recordStartupHeartbeat; + yield* runStartupPhase("heartbeat.record", recordStartupHeartbeat); yield* Effect.logDebug("startup phase: browser open check"); - yield* maybeOpenBrowser; + yield* runStartupPhase("browser.open", maybeOpenBrowser); yield* Effect.logDebug("startup phase: complete"); }), ); diff --git a/apps/server/src/terminal/Layers/Manager.ts b/apps/server/src/terminal/Layers/Manager.ts index 7ef18b1655..a076e7600b 100644 --- a/apps/server/src/terminal/Layers/Manager.ts +++ b/apps/server/src/terminal/Layers/Manager.ts @@ -23,6 +23,11 @@ import { } from "effect"; import { ServerConfig } from "../../config"; +import { + increment, + terminalRestartsTotal, + terminalSessionsTotal, +} from "../../observability/Metrics"; import { runProcess } from "../../processRunner"; import { TerminalCwdError, @@ -1294,6 +1299,12 @@ export const makeTerminalManagerWithOptions = Effect.fn("makeTerminalManagerWith eventType: "started" | 
"restarted", ) { yield* stopProcess(session); + yield* Effect.annotateCurrentSpan({ + "terminal.thread_id": session.threadId, + "terminal.id": session.terminalId, + "terminal.event_type": eventType, + "terminal.cwd": input.cwd, + }); yield* modifyManagerState((state) => { session.status = "starting"; @@ -1314,45 +1325,49 @@ export const makeTerminalManagerWithOptions = Effect.fn("makeTerminalManagerWith let startedShell: string | null = null; const startResult = yield* Effect.result( - Effect.gen(function* () { - const shellCandidates = resolveShellCandidates(shellResolver); - const terminalEnv = createTerminalSpawnEnv(process.env, session.runtimeEnv); - const spawnResult = yield* trySpawn(shellCandidates, terminalEnv, session); - ptyProcess = spawnResult.process; - startedShell = spawnResult.shellLabel; - - const processPid = ptyProcess.pid; - const unsubscribeData = ptyProcess.onData((data) => { - if (!enqueueProcessEvent(session, processPid, { type: "output", data })) { - return; - } - runFork(drainProcessEvents(session, processPid)); - }); - const unsubscribeExit = ptyProcess.onExit((event) => { - if (!enqueueProcessEvent(session, processPid, { type: "exit", event })) { - return; - } - runFork(drainProcessEvents(session, processPid)); - }); - - yield* modifyManagerState((state) => { - session.process = ptyProcess; - session.pid = processPid; - session.status = "running"; - session.updatedAt = new Date().toISOString(); - session.unsubscribeData = unsubscribeData; - session.unsubscribeExit = unsubscribeExit; - return [undefined, state] as const; - }); - - yield* publishEvent({ - type: eventType, - threadId: session.threadId, - terminalId: session.terminalId, - createdAt: new Date().toISOString(), - snapshot: snapshot(session), - }); - }), + increment(terminalSessionsTotal, { lifecycle: eventType }).pipe( + Effect.andThen( + Effect.gen(function* () { + const shellCandidates = resolveShellCandidates(shellResolver); + const terminalEnv = 
createTerminalSpawnEnv(process.env, session.runtimeEnv); + const spawnResult = yield* trySpawn(shellCandidates, terminalEnv, session); + ptyProcess = spawnResult.process; + startedShell = spawnResult.shellLabel; + + const processPid = ptyProcess.pid; + const unsubscribeData = ptyProcess.onData((data) => { + if (!enqueueProcessEvent(session, processPid, { type: "output", data })) { + return; + } + runFork(drainProcessEvents(session, processPid)); + }); + const unsubscribeExit = ptyProcess.onExit((event) => { + if (!enqueueProcessEvent(session, processPid, { type: "exit", event })) { + return; + } + runFork(drainProcessEvents(session, processPid)); + }); + + yield* modifyManagerState((state) => { + session.process = ptyProcess; + session.pid = processPid; + session.status = "running"; + session.updatedAt = new Date().toISOString(); + session.unsubscribeData = unsubscribeData; + session.unsubscribeExit = unsubscribeExit; + return [undefined, state] as const; + }); + + yield* publishEvent({ + type: eventType, + threadId: session.threadId, + terminalId: session.terminalId, + createdAt: new Date().toISOString(), + snapshot: snapshot(session), + }); + }), + ), + ), ); if (startResult._tag === "Success") { @@ -1722,6 +1737,7 @@ export const makeTerminalManagerWithOptions = Effect.fn("makeTerminalManagerWith withThreadLock( input.threadId, Effect.gen(function* () { + yield* increment(terminalRestartsTotal, { scope: "thread" }); const terminalId = input.terminalId ?? 
DEFAULT_TERMINAL_ID; yield* assertValidCwd(input.cwd); diff --git a/apps/server/src/ws.ts b/apps/server/src/ws.ts index cff7e26efa..ebfc7a676c 100644 --- a/apps/server/src/ws.ts +++ b/apps/server/src/ws.ts @@ -28,6 +28,11 @@ import { Open, resolveAvailableEditors } from "./open"; import { normalizeDispatchCommand } from "./orchestration/Normalizer"; import { OrchestrationEngineService } from "./orchestration/Services/OrchestrationEngine"; import { ProjectionSnapshotQuery } from "./orchestration/Services/ProjectionSnapshotQuery"; +import { + observeRpcEffect, + observeRpcStream, + observeRpcStreamEffect, +} from "./observability/RpcInstrumentation"; import { ProviderRegistry } from "./provider/Services/ProviderRegistry"; import { ServerLifecycleEvents } from "./serverLifecycleEvents"; import { ServerRuntimeStartup } from "./serverRuntimeStartup"; @@ -71,68 +76,111 @@ const WsRpcLayer = WsRpcGroup.toLayer( }; }); + const rpcEffect = ( + method: string, + effect: Effect.Effect, + traceAttributes?: Readonly>, + ) => observeRpcEffect(method, effect, traceAttributes); + + const rpcStream = ( + method: string, + stream: Stream.Stream, + traceAttributes?: Readonly>, + ) => observeRpcStream(method, stream, traceAttributes); + + const rpcStreamEffect = ( + method: string, + effect: Effect.Effect< + Stream.Stream, + EffectError, + EffectContext + >, + traceAttributes?: Readonly>, + ) => observeRpcStreamEffect(method, effect, traceAttributes); + return WsRpcGroup.of({ [ORCHESTRATION_WS_METHODS.getSnapshot]: (_input) => - projectionSnapshotQuery.getSnapshot().pipe( - Effect.mapError( - (cause) => - new OrchestrationGetSnapshotError({ - message: "Failed to load orchestration snapshot", - cause, - }), + rpcEffect( + ORCHESTRATION_WS_METHODS.getSnapshot, + projectionSnapshotQuery.getSnapshot().pipe( + Effect.mapError( + (cause) => + new OrchestrationGetSnapshotError({ + message: "Failed to load orchestration snapshot", + cause, + }), + ), ), + { "rpc.aggregate": "orchestration" }, 
), [ORCHESTRATION_WS_METHODS.dispatchCommand]: (command) => - Effect.gen(function* () { - const normalizedCommand = yield* normalizeDispatchCommand(command); - return yield* startup.enqueueCommand(orchestrationEngine.dispatch(normalizedCommand)); - }).pipe( - Effect.mapError((cause) => - Schema.is(OrchestrationDispatchCommandError)(cause) - ? cause - : new OrchestrationDispatchCommandError({ - message: "Failed to dispatch orchestration command", - cause, - }), + rpcEffect( + ORCHESTRATION_WS_METHODS.dispatchCommand, + Effect.gen(function* () { + const normalizedCommand = yield* normalizeDispatchCommand(command); + return yield* startup.enqueueCommand(orchestrationEngine.dispatch(normalizedCommand)); + }).pipe( + Effect.mapError((cause) => + Schema.is(OrchestrationDispatchCommandError)(cause) + ? cause + : new OrchestrationDispatchCommandError({ + message: "Failed to dispatch orchestration command", + cause, + }), + ), ), + { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.getTurnDiff]: (input) => - checkpointDiffQuery.getTurnDiff(input).pipe( - Effect.mapError( - (cause) => - new OrchestrationGetTurnDiffError({ - message: "Failed to load turn diff", - cause, - }), + rpcEffect( + ORCHESTRATION_WS_METHODS.getTurnDiff, + checkpointDiffQuery.getTurnDiff(input).pipe( + Effect.mapError( + (cause) => + new OrchestrationGetTurnDiffError({ + message: "Failed to load turn diff", + cause, + }), + ), ), + { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.getFullThreadDiff]: (input) => - checkpointDiffQuery.getFullThreadDiff(input).pipe( - Effect.mapError( - (cause) => - new OrchestrationGetFullThreadDiffError({ - message: "Failed to load full thread diff", - cause, - }), + rpcEffect( + ORCHESTRATION_WS_METHODS.getFullThreadDiff, + checkpointDiffQuery.getFullThreadDiff(input).pipe( + Effect.mapError( + (cause) => + new OrchestrationGetFullThreadDiffError({ + message: "Failed to load full thread diff", + cause, + }), + ), ), + { "rpc.aggregate": 
"orchestration" }, ), [ORCHESTRATION_WS_METHODS.replayEvents]: (input) => - Stream.runCollect( - orchestrationEngine.readEvents( - clamp(input.fromSequenceExclusive, { maximum: Number.MAX_SAFE_INTEGER, minimum: 0 }), - ), - ).pipe( - Effect.map((events) => Array.from(events)), - Effect.mapError( - (cause) => - new OrchestrationReplayEventsError({ - message: "Failed to replay orchestration events", - cause, - }), + rpcEffect( + ORCHESTRATION_WS_METHODS.replayEvents, + Stream.runCollect( + orchestrationEngine.readEvents( + clamp(input.fromSequenceExclusive, { maximum: Number.MAX_SAFE_INTEGER, minimum: 0 }), + ), + ).pipe( + Effect.map((events) => Array.from(events)), + Effect.mapError( + (cause) => + new OrchestrationReplayEventsError({ + message: "Failed to replay orchestration events", + cause, + }), + ), ), + { "rpc.aggregate": "orchestration" }, ), [WS_METHODS.subscribeOrchestrationDomainEvents]: (_input) => - Stream.unwrap( + rpcStreamEffect( + WS_METHODS.subscribeOrchestrationDomainEvents, Effect.gen(function* () { const snapshot = yield* orchestrationEngine.getReadModel(); const fromSequenceExclusive = snapshot.snapshotSequence; @@ -187,82 +235,159 @@ const WsRpcLayer = WsRpcGroup.toLayer( Stream.flatMap((events) => Stream.fromIterable(events)), ); }), + { "rpc.aggregate": "orchestration" }, ), - [WS_METHODS.serverGetConfig]: (_input) => loadServerConfig, + [WS_METHODS.serverGetConfig]: (_input) => + rpcEffect(WS_METHODS.serverGetConfig, loadServerConfig, { "rpc.aggregate": "server" }), [WS_METHODS.serverRefreshProviders]: (_input) => - providerRegistry.refresh().pipe(Effect.map((providers) => ({ providers }))), + rpcEffect( + WS_METHODS.serverRefreshProviders, + providerRegistry.refresh().pipe(Effect.map((providers) => ({ providers }))), + { "rpc.aggregate": "server" }, + ), [WS_METHODS.serverUpsertKeybinding]: (rule) => - Effect.gen(function* () { - const keybindingsConfig = yield* keybindings.upsertKeybindingRule(rule); - return { keybindings: 
keybindingsConfig, issues: [] }; + rpcEffect( + WS_METHODS.serverUpsertKeybinding, + Effect.gen(function* () { + const keybindingsConfig = yield* keybindings.upsertKeybindingRule(rule); + return { keybindings: keybindingsConfig, issues: [] }; + }), + { "rpc.aggregate": "server" }, + ), + [WS_METHODS.serverGetSettings]: (_input) => + rpcEffect(WS_METHODS.serverGetSettings, serverSettings.getSettings, { + "rpc.aggregate": "server", + }), + [WS_METHODS.serverUpdateSettings]: ({ patch }) => + rpcEffect(WS_METHODS.serverUpdateSettings, serverSettings.updateSettings(patch), { + "rpc.aggregate": "server", }), - [WS_METHODS.serverGetSettings]: (_input) => serverSettings.getSettings, - [WS_METHODS.serverUpdateSettings]: ({ patch }) => serverSettings.updateSettings(patch), [WS_METHODS.projectsSearchEntries]: (input) => - workspaceEntries.search(input).pipe( - Effect.mapError( - (cause) => - new ProjectSearchEntriesError({ - message: `Failed to search workspace entries: ${cause.detail}`, - cause, - }), + rpcEffect( + WS_METHODS.projectsSearchEntries, + workspaceEntries.search(input).pipe( + Effect.mapError( + (cause) => + new ProjectSearchEntriesError({ + message: `Failed to search workspace entries: ${cause.detail}`, + cause, + }), + ), ), + { "rpc.aggregate": "workspace" }, ), [WS_METHODS.projectsWriteFile]: (input) => - workspaceFileSystem.writeFile(input).pipe( - Effect.mapError((cause) => { - const message = Schema.is(WorkspacePathOutsideRootError)(cause) - ? "Workspace file path must stay within the project root." - : "Failed to write workspace file"; - return new ProjectWriteFileError({ - message, - cause, - }); - }), + rpcEffect( + WS_METHODS.projectsWriteFile, + workspaceFileSystem.writeFile(input).pipe( + Effect.mapError((cause) => { + const message = Schema.is(WorkspacePathOutsideRootError)(cause) + ? "Workspace file path must stay within the project root." 
+ : "Failed to write workspace file"; + return new ProjectWriteFileError({ + message, + cause, + }); + }), + ), + { "rpc.aggregate": "workspace" }, ), - [WS_METHODS.shellOpenInEditor]: (input) => open.openInEditor(input), - [WS_METHODS.gitStatus]: (input) => gitManager.status(input), - [WS_METHODS.gitPull]: (input) => git.pullCurrentBranch(input.cwd), + [WS_METHODS.shellOpenInEditor]: (input) => + rpcEffect(WS_METHODS.shellOpenInEditor, open.openInEditor(input), { + "rpc.aggregate": "workspace", + }), + [WS_METHODS.gitStatus]: (input) => + rpcEffect(WS_METHODS.gitStatus, gitManager.status(input), { "rpc.aggregate": "git" }), + [WS_METHODS.gitPull]: (input) => + rpcEffect(WS_METHODS.gitPull, git.pullCurrentBranch(input.cwd), { "rpc.aggregate": "git" }), [WS_METHODS.gitRunStackedAction]: (input) => - Stream.callback((queue) => - gitManager - .runStackedAction(input, { - actionId: input.actionId, - progressReporter: { - publish: (event) => Queue.offer(queue, event).pipe(Effect.asVoid), - }, - }) - .pipe( - Effect.matchCauseEffect({ - onFailure: (cause) => Queue.failCause(queue, cause), - onSuccess: () => Queue.end(queue).pipe(Effect.asVoid), - }), - ), + rpcStream( + WS_METHODS.gitRunStackedAction, + Stream.callback((queue) => + gitManager + .runStackedAction(input, { + actionId: input.actionId, + progressReporter: { + publish: (event) => Queue.offer(queue, event).pipe(Effect.asVoid), + }, + }) + .pipe( + Effect.matchCauseEffect({ + onFailure: (cause) => Queue.failCause(queue, cause), + onSuccess: () => Queue.end(queue).pipe(Effect.asVoid), + }), + ), + ), + { "rpc.aggregate": "git" }, ), - [WS_METHODS.gitResolvePullRequest]: (input) => gitManager.resolvePullRequest(input), + [WS_METHODS.gitResolvePullRequest]: (input) => + rpcEffect(WS_METHODS.gitResolvePullRequest, gitManager.resolvePullRequest(input), { + "rpc.aggregate": "git", + }), [WS_METHODS.gitPreparePullRequestThread]: (input) => - gitManager.preparePullRequestThread(input), - [WS_METHODS.gitListBranches]: 
(input) => git.listBranches(input), - [WS_METHODS.gitCreateWorktree]: (input) => git.createWorktree(input), - [WS_METHODS.gitRemoveWorktree]: (input) => git.removeWorktree(input), - [WS_METHODS.gitCreateBranch]: (input) => git.createBranch(input), - [WS_METHODS.gitCheckout]: (input) => Effect.scoped(git.checkoutBranch(input)), - [WS_METHODS.gitInit]: (input) => git.initRepo(input), - [WS_METHODS.terminalOpen]: (input) => terminalManager.open(input), - [WS_METHODS.terminalWrite]: (input) => terminalManager.write(input), - [WS_METHODS.terminalResize]: (input) => terminalManager.resize(input), - [WS_METHODS.terminalClear]: (input) => terminalManager.clear(input), - [WS_METHODS.terminalRestart]: (input) => terminalManager.restart(input), - [WS_METHODS.terminalClose]: (input) => terminalManager.close(input), + rpcEffect( + WS_METHODS.gitPreparePullRequestThread, + gitManager.preparePullRequestThread(input), + { "rpc.aggregate": "git" }, + ), + [WS_METHODS.gitListBranches]: (input) => + rpcEffect(WS_METHODS.gitListBranches, git.listBranches(input), { "rpc.aggregate": "git" }), + [WS_METHODS.gitCreateWorktree]: (input) => + rpcEffect(WS_METHODS.gitCreateWorktree, git.createWorktree(input), { + "rpc.aggregate": "git", + }), + [WS_METHODS.gitRemoveWorktree]: (input) => + rpcEffect(WS_METHODS.gitRemoveWorktree, git.removeWorktree(input), { + "rpc.aggregate": "git", + }), + [WS_METHODS.gitCreateBranch]: (input) => + rpcEffect(WS_METHODS.gitCreateBranch, git.createBranch(input), { + "rpc.aggregate": "git", + }), + [WS_METHODS.gitCheckout]: (input) => + rpcEffect(WS_METHODS.gitCheckout, Effect.scoped(git.checkoutBranch(input)), { + "rpc.aggregate": "git", + }), + [WS_METHODS.gitInit]: (input) => + rpcEffect(WS_METHODS.gitInit, git.initRepo(input), { "rpc.aggregate": "git" }), + [WS_METHODS.terminalOpen]: (input) => + rpcEffect(WS_METHODS.terminalOpen, terminalManager.open(input), { + "rpc.aggregate": "terminal", + }), + [WS_METHODS.terminalWrite]: (input) => + 
rpcEffect(WS_METHODS.terminalWrite, terminalManager.write(input), { + "rpc.aggregate": "terminal", + }), + [WS_METHODS.terminalResize]: (input) => + rpcEffect(WS_METHODS.terminalResize, terminalManager.resize(input), { + "rpc.aggregate": "terminal", + }), + [WS_METHODS.terminalClear]: (input) => + rpcEffect(WS_METHODS.terminalClear, terminalManager.clear(input), { + "rpc.aggregate": "terminal", + }), + [WS_METHODS.terminalRestart]: (input) => + rpcEffect(WS_METHODS.terminalRestart, terminalManager.restart(input), { + "rpc.aggregate": "terminal", + }), + [WS_METHODS.terminalClose]: (input) => + rpcEffect(WS_METHODS.terminalClose, terminalManager.close(input), { + "rpc.aggregate": "terminal", + }), [WS_METHODS.subscribeTerminalEvents]: (_input) => - Stream.callback((queue) => - Effect.acquireRelease( - terminalManager.subscribe((event) => Queue.offer(queue, event)), - (unsubscribe) => Effect.sync(unsubscribe), + rpcStream( + WS_METHODS.subscribeTerminalEvents, + Stream.callback((queue) => + Effect.acquireRelease( + terminalManager.subscribe((event) => Queue.offer(queue, event)), + (unsubscribe) => Effect.sync(unsubscribe), + ), ), + { "rpc.aggregate": "terminal" }, ), [WS_METHODS.subscribeServerConfig]: (_input) => - Stream.unwrap( + rpcStreamEffect( + WS_METHODS.subscribeServerConfig, Effect.gen(function* () { const keybindingsUpdates = keybindings.streamChanges.pipe( Stream.map((event) => ({ @@ -297,9 +422,11 @@ const WsRpcLayer = WsRpcGroup.toLayer( Stream.merge(keybindingsUpdates, Stream.merge(providerStatuses, settingsUpdates)), ); }), + { "rpc.aggregate": "server" }, ), [WS_METHODS.subscribeServerLifecycle]: (_input) => - Stream.unwrap( + rpcStreamEffect( + WS_METHODS.subscribeServerLifecycle, Effect.gen(function* () { const snapshot = yield* lifecycleEvents.snapshot; const snapshotEvents = Array.from(snapshot.events).toSorted( @@ -310,6 +437,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( ); return Stream.concat(Stream.fromIterable(snapshotEvents), liveEvents); 
}), + { "rpc.aggregate": "server" }, ), }); }), @@ -317,9 +445,13 @@ const WsRpcLayer = WsRpcGroup.toLayer( export const websocketRpcRouteLayer = Layer.unwrap( Effect.gen(function* () { - const rpcWebSocketHttpEffect = yield* RpcServer.toHttpEffectWebsocket(WsRpcGroup).pipe( - Effect.provide(Layer.mergeAll(WsRpcLayer, RpcSerialization.layerJson)), - ); + const rpcWebSocketHttpEffect = yield* RpcServer.toHttpEffectWebsocket(WsRpcGroup, { + spanPrefix: "ws.rpc", + spanAttributes: { + "rpc.transport": "websocket", + "rpc.system": "effect-rpc", + }, + }).pipe(Effect.provide(Layer.mergeAll(WsRpcLayer, RpcSerialization.layerJson))); return HttpRouter.add( "GET", "/ws", diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000000..edf8d44bd9 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,468 @@ +# Observability + +This server now has one observability model: + +- pretty logs to stdout for humans +- completed spans written to a local NDJSON trace file +- optional OTLP export for traces and metrics + +The local trace file is the source of truth for persisted diagnostics. + +## Quick Start + +If you just want a local Grafana stack and working telemetry in a few minutes, use `grafana/otel-lgtm`. + +### 1. Start a local LGTM stack + +Grafana publishes a single-container dev/test image for local OpenTelemetry work: + +```bash +docker run --name lgtm \ + -p 3000:3000 \ + -p 4317:4317 \ + -p 4318:4318 \ + --rm -ti \ + grafana/otel-lgtm +``` + +Wait for the container to report that the collector and stack are ready, then open `http://localhost:3000`. + +Grafana login: + +- username: `admin` +- password: `admin` + +### 2. 
Point T3 Code at the local collector + +Our server exporters use OTLP HTTP, so set full OTLP HTTP endpoints: + +```bash +export T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces +export T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics +``` + +Optional but useful: + +```bash +export T3CODE_OTLP_SERVICE_NAME=t3-local +export T3CODE_TRACE_MIN_LEVEL=Info +``` + +### 3. Run the app you care about + +#### Published CLI + +```bash +npx t3 +``` + +#### Monorepo web/server dev + +```bash +bun dev +``` + +#### Monorepo desktop dev + +```bash +bun dev:desktop +``` + +#### Packaged desktop app + +Launch it from a terminal in the same environment so the desktop process inherits `T3CODE_OTLP_*`. + +Important: + +- the desktop app forwards these env vars to the embedded backend process +- launching from Finder / Spotlight / Start Menu / dock icons usually will not inject new shell env vars + +### 4. Where to look in Grafana + +For this app today: + +- traces: use the `Tempo` data source +- metrics: use the `Prometheus` data source +- logs: do not expect much in `Loki` yet, because T3 Code does not export OTLP logs as a separate signal + +The useful first stop is usually `Explore`: + +1. Open `Explore` +2. Pick `Tempo` to inspect traces +3. Search for spans by service name, span name, or span attributes +4. Switch to `Prometheus` to inspect counters and latency metrics + +### 5. Keep the local trace file enabled too + +Even when exporting to LGTM, the server still writes the local NDJSON trace file. That is useful for: + +- quick local `jq` inspection +- debugging before Grafana is open +- checking raw span/event payloads + +So the normal local-dev setup is: + +- pretty stdout for live debugging +- local trace file for raw inspection +- LGTM for search, aggregation, and dashboards + +## What Gets Recorded + +### Traces + +Every completed span is written as one NDJSON record to `serverTracePath`. 
+
+Default path:
+
+```txt
+<T3CODE_HOME>/<dev|userdata>/logs/server.trace.ndjson
+```
+
+In normal runs that resolves from `T3CODE_HOME`. In dev mode it uses `dev/logs`; otherwise it uses `userdata/logs`.
+
+The trace record shape lives in `apps/server/src/observability/TraceRecord.ts`.
+
+Useful fields:
+
+- `name`: span name
+- `traceId`, `spanId`, `parentSpanId`: correlation
+- `durationMs`: elapsed time
+- `attributes`: structured context
+- `events`: embedded log events and custom events
+- `exit`: `Success`, `Failure`, or `Interrupted`
+
+### Logs
+
+Application logs are not persisted as a separate file anymore.
+
+- `Logger.consolePretty()` writes human-readable logs to stdout
+- `Logger.tracerLogger` turns `Effect.log...` calls inside an active span into span events
+- logs emitted outside a span are stdout-only
+
+That means if you want a log message to survive into the trace file, it needs to happen inside a traced effect.
+
+### Metrics
+
+Metrics are first-class in code, but they are not written to a local file.
+
+- local persistence: none
+- remote export: OTLP only, when configured
+
+Metric definitions live in `apps/server/src/observability/Metrics.ts`.
+
+## Runtime Wiring
+
+The server observability layer is assembled in `apps/server/src/observability/Layers/Observability.ts`.
+ +It provides: + +- pretty stdout logger +- `Logger.tracerLogger` +- local NDJSON tracer +- optional OTLP trace exporter +- optional OTLP metrics exporter +- Effect trace-level and timing refs + +## Config + +### Local Trace File + +- `T3CODE_TRACE_FILE`: override the trace file path +- `T3CODE_TRACE_MAX_BYTES`: per-file rotation size, default `10485760` +- `T3CODE_TRACE_MAX_FILES`: rotated file count, default `10` +- `T3CODE_TRACE_BATCH_WINDOW_MS`: flush window, default `200` +- `T3CODE_TRACE_MIN_LEVEL`: minimum trace level, default `Info` +- `T3CODE_TRACE_TIMING_ENABLED`: enable timing metadata, default `true` + +### OTLP Export + +- `T3CODE_OTLP_TRACES_URL`: OTLP trace endpoint +- `T3CODE_OTLP_METRICS_URL`: OTLP metric endpoint +- `T3CODE_OTLP_EXPORT_INTERVAL_MS`: export interval, default `10000` +- `T3CODE_OTLP_SERVICE_NAME`: service name, default `t3-server` + +If the OTLP URLs are unset, local tracing still works and metrics simply stay in-process. + +## Common Run Modes + +### CLI or `npx t3` + +This is the easiest way to test observability outside the monorepo: + +```bash +export T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces +export T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics +export T3CODE_OTLP_SERVICE_NAME=t3-cli + +npx t3 +``` + +### Monorepo `bun dev` + +`bun dev` forwards your shell environment to the server process, so the same OTLP vars work: + +```bash +T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \ +T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \ +T3CODE_OTLP_SERVICE_NAME=t3-dev \ +bun dev +``` + +### Monorepo `bun dev:desktop` + +Desktop dev mode also inherits the shell environment, and the Electron main process forwards OTLP env vars to the child backend: + +```bash +T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \ +T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \ +T3CODE_OTLP_SERVICE_NAME=t3-desktop-dev \ +bun dev:desktop +``` + +### Packaged desktop app + +For packaged builds, start 
the app from a terminal if you want one-off local OTLP config:
+
+```bash
+T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \
+T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \
+T3CODE_OTLP_SERVICE_NAME=t3-desktop \
+/path/to/the-packaged-t3-app-binary # replace with your installed app binary
+```
+
+If you launch the app from the OS UI instead, the env vars usually will not be present.
+
+## How To Read The Trace File
+
+The trace file is NDJSON, so `jq` is the easiest way to explore it.
+
+### Tail everything
+
+```bash
+tail -f "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Show failures only
+
+```bash
+jq -c 'select(.exit._tag != "Success") | {
+  name,
+  durationMs,
+  exit,
+  attributes
+}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Find slow spans
+
+```bash
+jq -c 'select(.durationMs > 1000) | {
+  name,
+  durationMs,
+  traceId,
+  spanId
+}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Inspect embedded log events
+
+```bash
+jq -c 'select(any(.events[]?; .attributes["effect.logLevel"] != null)) | {
+  name,
+  durationMs,
+  events: [
+    .events[]
+    | select(.attributes["effect.logLevel"] != null)
+    | {
+      message: .name,
+      level: .attributes["effect.logLevel"]
+    }
+  ]
+}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Follow one trace
+
+```bash
+jq -r 'select(.traceId == "TRACE_ID_HERE") | [
+  .name,
+  .spanId,
+  (.parentSpanId // "-"),
+  .durationMs
+] | @tsv' "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Filter provider work
+
+```bash
+jq -c 'select(.attributes["provider.thread_id"] == "thread_123") | {
+  name,
+  durationMs,
+  provider: .attributes["provider.kind"],
+  model: .attributes["provider.model"],
+  exit: .exit._tag
+}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson"
+```
+
+### Filter orchestration commands
+
+```bash
+jq -c 'select(.attributes["orchestration.command_type"] != null) | {
+  name,
+  durationMs,
+  commandType: .attributes["orchestration.command_type"],
+  aggregateKind: .attributes["orchestration.aggregate_kind"]
+}' 
"$T3CODE_HOME/userdata/logs/server.trace.ndjson" +``` + +### Filter git activity + +```bash +jq -c 'select(.attributes["git.operation"] != null) | { + name, + durationMs, + operation: .attributes["git.operation"], + cwd: .attributes["git.cwd"], + hookEvents: [ + .events[] + | select(.name == "git.hook.started" or .name == "git.hook.finished") + ] +}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" +``` + +## What Is Instrumented Today + +Current high-value span and metric boundaries include: + +- Effect RPC websocket request spans from `effect/rpc` +- RPC request metrics in `apps/server/src/observability/RpcInstrumentation.ts` +- startup phases +- orchestration command processing +- provider session and turn operations +- git command execution and git hook events +- terminal session lifecycle +- sqlite query execution + +Provider event NDJSON logging still exists separately for provider-runtime event streams. That is not the same artifact as the server trace file. + +## How To Instrument New Code + +### 1. Create or reuse a span + +Prefer existing `Effect.fn("name")` boundaries where possible. For ad hoc work, wrap it: + +```ts +import { Effect } from "effect"; + +const runThing = Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "thing.id": "abc123", + "thing.kind": "example", + }); + + yield* Effect.logInfo("starting thing"); + return yield* doWork(); +}).pipe(Effect.withSpan("thing.run")); +``` + +### 2. Add metrics with the pipeable API + +Use `withMetrics(...)` from `apps/server/src/observability/Metrics.ts`. + +```ts +import { Effect } from "effect"; + +import { someCounter, someDuration, withMetrics } from "../observability/Metrics.ts"; + +const program = doWork().pipe( + withMetrics({ + counter: someCounter, + timer: someDuration, + attributes: { + operation: "work", + }, + }), +); +``` + +Use low-cardinality metric attributes only. 
+ +Good metric labels: + +- operation kind +- method name +- provider kind +- outcome + +Bad metric labels: + +- raw thread IDs +- command IDs +- file paths +- cwd +- full prompts +- high-cardinality model strings when a normalized family label would do + +Put that detailed context on spans instead. + +### 3. Use span annotations for high-cardinality detail + +```ts +yield * + Effect.annotateCurrentSpan({ + "provider.thread_id": input.threadId, + "provider.request_id": input.requestId, + "git.cwd": input.cwd, + }); +``` + +### 4. Use logs as span events + +If you want a trace to tell the story of what happened, log inside the span: + +```ts +yield * Effect.logInfo("starting provider turn"); +yield * Effect.logDebug("waiting for approval response"); +``` + +Those messages will show up as span events in the trace file because `Logger.tracerLogger` is installed. + +## Practical Workflows + +### “Why did this request fail?” + +1. Find failed spans with `exit._tag != "Success"`. +2. Group by `traceId`. +3. Inspect sibling spans and embedded log events. +4. Look at attributes like `provider.thread_id`, `orchestration.command_type`, or `git.operation`. + +### “Why is the UI feeling slow?” + +1. Sort spans by `durationMs`. +2. Look at top-level RPC request spans. +3. Check child spans for provider, git, terminal, or sqlite work. +4. Compare slow traces against metrics in your OTLP backend once metrics export is enabled. + +### “Are git hooks causing latency?” + +1. Filter spans with `git.operation`. +2. Inspect `git.hook.started` and `git.hook.finished` events. +3. Compare hook event timing to the enclosing git span duration. + +### “Where should I add more instrumentation?” + +Add spans or annotations at boundaries: + +- queue handoff +- provider adapter calls +- external process calls +- persistence writes +- RPC methods that fan out into multiple subsystems + +Avoid tracing every tiny helper. The useful unit is usually a boundary or a phase. 
+ +## Current Constraints + +- logs outside spans are not persisted +- metrics are not snapshotted locally +- the old `serverLogPath` still exists in config for compatibility, but the trace file is the persisted observability artifact that matters diff --git a/turbo.json b/turbo.json index 9359376f59..47d7091385 100644 --- a/turbo.json +++ b/turbo.json @@ -11,7 +11,17 @@ "T3CODE_NO_BROWSER", "T3CODE_HOME", "T3CODE_AUTH_TOKEN", - "T3CODE_DESKTOP_WS_URL" + "T3CODE_DESKTOP_WS_URL", + "T3CODE_TRACE_MIN_LEVEL", + "T3CODE_TRACE_TIMING_ENABLED", + "T3CODE_TRACE_FILE", + "T3CODE_TRACE_MAX_BYTES", + "T3CODE_TRACE_MAX_FILES", + "T3CODE_TRACE_BATCH_WINDOW_MS", + "T3CODE_OTLP_TRACES_URL", + "T3CODE_OTLP_METRICS_URL", + "T3CODE_OTLP_EXPORT_INTERVAL_MS", + "T3CODE_OTLP_SERVICE_NAME" ], "tasks": { "build": { From 067241843a51becc11ba263e38b322aee1f851be Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 16:26:35 -0700 Subject: [PATCH 2/8] Improve observability metrics and trace handling - Record orchestration command ack latency and route provider metrics by resolved provider - Capture RPC stream failures and keep trace sink writing past bad records - Add circular attribute normalization coverage --- .../src/observability/Attributes.test.ts | 29 ++++++ apps/server/src/observability/Attributes.ts | 20 ++++- apps/server/src/observability/Metrics.ts | 10 ++- .../observability/RpcInstrumentation.test.ts | 68 +++++++++++++- .../src/observability/RpcInstrumentation.ts | 68 ++++++++++++-- .../src/observability/TraceSink.test.ts | 44 ++++++++++ apps/server/src/observability/TraceSink.ts | 4 +- .../Layers/OrchestrationEngine.test.ts | 64 +++++++++++++- .../Layers/OrchestrationEngine.ts | 33 ++++++- .../provider/Layers/ProviderService.test.ts | 88 ++++++++++++++++++- .../src/provider/Layers/ProviderService.ts | 45 ++++++---- 11 files changed, 437 insertions(+), 36 deletions(-) create mode 100644 apps/server/src/observability/Attributes.test.ts diff --git 
a/apps/server/src/observability/Attributes.test.ts b/apps/server/src/observability/Attributes.test.ts new file mode 100644 index 0000000000..de08bac225 --- /dev/null +++ b/apps/server/src/observability/Attributes.test.ts @@ -0,0 +1,29 @@ +import { assert, describe, it } from "@effect/vitest"; + +import { compactTraceAttributes } from "./Attributes.ts"; + +describe("Attributes", () => { + it("normalizes circular arrays, maps, and sets without recursing forever", () => { + const array: Array = ["alpha"]; + array.push(array); + + const map = new Map(); + map.set("self", map); + + const set = new Set(); + set.add(set); + + assert.deepStrictEqual( + compactTraceAttributes({ + array, + map, + set, + }), + { + array: ["alpha", "[Circular]"], + map: { self: "[Circular]" }, + set: ["[Circular]"], + }, + ); + }); +}); diff --git a/apps/server/src/observability/Attributes.ts b/apps/server/src/observability/Attributes.ts index 831030f916..ab003f6ebd 100644 --- a/apps/server/src/observability/Attributes.ts +++ b/apps/server/src/observability/Attributes.ts @@ -9,6 +9,14 @@ function isPlainObject(value: unknown): value is Record { return typeof value === "object" && value !== null && !Array.isArray(value); } +function markSeen(value: object, seen: WeakSet): boolean { + if (seen.has(value)) { + return true; + } + seen.add(value); + return false; +} + function normalizeJsonValue(value: unknown, seen: WeakSet = new WeakSet()): unknown { if ( value === null || @@ -33,9 +41,15 @@ function normalizeJsonValue(value: unknown, seen: WeakSet = new WeakSet( }; } if (Array.isArray(value)) { + if (markSeen(value, seen)) { + return "[Circular]"; + } return value.map((entry) => normalizeJsonValue(entry, seen)); } if (value instanceof Map) { + if (markSeen(value, seen)) { + return "[Circular]"; + } return Object.fromEntries( Array.from(value.entries(), ([key, entryValue]) => [ String(key), @@ -44,15 +58,17 @@ function normalizeJsonValue(value: unknown, seen: WeakSet = new WeakSet( ); } if (value 
instanceof Set) { + if (markSeen(value, seen)) { + return "[Circular]"; + } return Array.from(value.values(), (entry) => normalizeJsonValue(entry, seen)); } if (!isPlainObject(value)) { return String(value); } - if (seen.has(value)) { + if (markSeen(value, seen)) { return "[Circular]"; } - seen.add(value); return Object.fromEntries( Object.entries(value).map(([key, entryValue]) => [key, normalizeJsonValue(entryValue, seen)]), ); diff --git a/apps/server/src/observability/Metrics.ts b/apps/server/src/observability/Metrics.ts index b6d1abb20a..a9530ad2c8 100644 --- a/apps/server/src/observability/Metrics.ts +++ b/apps/server/src/observability/Metrics.ts @@ -23,6 +23,14 @@ export const orchestrationCommandDuration = Metric.timer("t3_orchestration_comma description: "Orchestration command dispatch duration.", }); +export const orchestrationCommandAckDuration = Metric.timer( + "t3_orchestration_command_ack_duration", + { + description: + "Time from orchestration command dispatch to the first committed domain event emitted for that command.", + }, +); + export const orchestrationEventsProcessedTotal = Metric.counter( "t3_orchestration_events_processed_total", { @@ -74,8 +82,6 @@ export const metricAttributes = ( attributes: Readonly>, ): ReadonlyArray<[string, string]> => Object.entries(compactMetricAttributes(attributes)); -export const withMetricAttributes = Metric.withAttributes; - export const increment = ( metric: Metric.Metric, attributes: Readonly>, diff --git a/apps/server/src/observability/RpcInstrumentation.test.ts b/apps/server/src/observability/RpcInstrumentation.test.ts index c2a25170ed..d29b05f3c2 100644 --- a/apps/server/src/observability/RpcInstrumentation.test.ts +++ b/apps/server/src/observability/RpcInstrumentation.test.ts @@ -1,7 +1,11 @@ import { assert, describe, it } from "@effect/vitest"; -import { Effect, Metric, Stream } from "effect"; +import { Effect, Exit, Metric, Stream } from "effect"; -import { observeRpcEffect, observeRpcStreamEffect } 
from "./RpcInstrumentation.ts"; +import { + observeRpcEffect, + observeRpcStream, + observeRpcStreamEffect, +} from "./RpcInstrumentation.ts"; const hasMetricSnapshot = ( snapshots: ReadonlyArray, @@ -94,4 +98,64 @@ describe("RpcInstrumentation", () => { ); }), ); + + it.effect("records failure outcomes for direct stream RPC handlers during consumption", () => + Effect.gen(function* () { + const exit = yield* Stream.runCollect( + observeRpcStream( + "rpc.instrumentation.stream.failure", + Stream.make("a").pipe(Stream.concat(Stream.fail("boom"))), + { "rpc.aggregate": "test" }, + ).pipe(Stream.withSpan("rpc.instrumentation.stream.failure.span")), + ).pipe(Effect.exit); + + assert.equal(Exit.isFailure(exit), true); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_requests_total", { + method: "rpc.instrumentation.stream.failure", + outcome: "failure", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_request_duration", { + method: "rpc.instrumentation.stream.failure", + }), + true, + ); + }), + ); + + it.effect("records failure outcomes when a stream RPC effect produces a failing stream", () => + Effect.gen(function* () { + const exit = yield* Stream.runCollect( + observeRpcStreamEffect( + "rpc.instrumentation.stream.effect.failure", + Effect.succeed(Stream.fail("boom")), + { "rpc.aggregate": "test" }, + ).pipe(Stream.withSpan("rpc.instrumentation.stream.effect.failure.span")), + ).pipe(Effect.exit); + + assert.equal(Exit.isFailure(exit), true); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_requests_total", { + method: "rpc.instrumentation.stream.effect.failure", + outcome: "failure", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_rpc_request_duration", { + method: "rpc.instrumentation.stream.effect.failure", + }), + true, + ); + }), + ); }); diff --git a/apps/server/src/observability/RpcInstrumentation.ts 
b/apps/server/src/observability/RpcInstrumentation.ts index 9595de81c6..a3ac29aa02 100644 --- a/apps/server/src/observability/RpcInstrumentation.ts +++ b/apps/server/src/observability/RpcInstrumentation.ts @@ -1,6 +1,38 @@ -import { Effect, Stream } from "effect"; +import { Duration, Effect, Exit, Metric, Stream } from "effect"; -import { rpcRequestDuration, rpcRequestsTotal, withMetrics } from "./Metrics.ts"; +import { outcomeFromExit } from "./Attributes.ts"; +import { metricAttributes, rpcRequestDuration, rpcRequestsTotal, withMetrics } from "./Metrics.ts"; + +const annotateRpcSpan = ( + method: string, + traceAttributes?: Readonly>, +): Effect.Effect => + Effect.annotateCurrentSpan({ + "rpc.method": method, + ...traceAttributes, + }); + +const recordRpcStreamMetrics = ( + method: string, + startedAt: number, + exit: Exit.Exit, +): Effect.Effect => + Effect.gen(function* () { + yield* Metric.update( + Metric.withAttributes(rpcRequestDuration, metricAttributes({ method })), + Duration.millis(Math.max(0, Date.now() - startedAt)), + ); + yield* Metric.update( + Metric.withAttributes( + rpcRequestsTotal, + metricAttributes({ + method, + outcome: outcomeFromExit(exit), + }), + ), + 1, + ); + }); export const observeRpcEffect = ( method: string, @@ -8,10 +40,7 @@ export const observeRpcEffect = ( traceAttributes?: Readonly>, ): Effect.Effect => Effect.gen(function* () { - yield* Effect.annotateCurrentSpan({ - "rpc.method": method, - ...traceAttributes, - }); + yield* annotateRpcSpan(method, traceAttributes); return yield* effect.pipe( withMetrics({ @@ -28,10 +57,33 @@ export const observeRpcStream = ( method: string, stream: Stream.Stream, traceAttributes?: Readonly>, -) => Stream.unwrap(observeRpcEffect(method, Effect.succeed(stream), traceAttributes)); +): Stream.Stream => + Stream.unwrap( + Effect.gen(function* () { + yield* annotateRpcSpan(method, traceAttributes); + const startedAt = Date.now(); + return stream.pipe(Stream.onExit((exit) => 
recordRpcStreamMetrics(method, startedAt, exit))); + }), + ); export const observeRpcStreamEffect = ( method: string, effect: Effect.Effect, EffectError, EffectContext>, traceAttributes?: Readonly>, -) => Stream.unwrap(observeRpcEffect(method, effect, traceAttributes)); +): Stream.Stream => + Stream.unwrap( + Effect.gen(function* () { + yield* annotateRpcSpan(method, traceAttributes); + const startedAt = Date.now(); + const exit = yield* Effect.exit(effect); + + if (Exit.isFailure(exit)) { + yield* recordRpcStreamMetrics(method, startedAt, exit); + return yield* Effect.failCause(exit.cause); + } + + return exit.value.pipe( + Stream.onExit((streamExit) => recordRpcStreamMetrics(method, startedAt, streamExit)), + ); + }), + ); diff --git a/apps/server/src/observability/TraceSink.test.ts b/apps/server/src/observability/TraceSink.test.ts index c4021b3560..f4db90516b 100644 --- a/apps/server/src/observability/TraceSink.test.ts +++ b/apps/server/src/observability/TraceSink.test.ts @@ -105,4 +105,48 @@ describe("TraceSink", () => { }), ), ); + + it.effect("drops only the invalid record when serialization fails", () => + Effect.scoped( + Effect.gen(function* () { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "t3-trace-sink-")); + const tracePath = path.join(tempDir, "server.trace.ndjson"); + + try { + const sink = yield* makeTraceSink({ + filePath: tracePath, + maxBytes: 1024, + maxFiles: 2, + batchWindowMs: 10_000, + }); + + const circular: Array = []; + circular.push(circular); + + sink.push(makeRecord("alpha")); + sink.push({ + ...makeRecord("invalid"), + attributes: { + circular, + }, + } as TraceRecord); + sink.push(makeRecord("beta")); + yield* sink.close(); + + const lines = fs + .readFileSync(tracePath, "utf8") + .trim() + .split("\n") + .map((line) => JSON.parse(line) as TraceRecord); + + assert.deepStrictEqual( + lines.map((line) => line.name), + ["alpha", "beta"], + ); + } finally { + fs.rmSync(tempDir, { recursive: true, force: true }); + } + }), + ), 
+ ); }); diff --git a/apps/server/src/observability/TraceSink.ts b/apps/server/src/observability/TraceSink.ts index 8989a0b640..1bd00b4734 100644 --- a/apps/server/src/observability/TraceSink.ts +++ b/apps/server/src/observability/TraceSink.ts @@ -39,7 +39,7 @@ export const makeTraceSink = Effect.fn("makeTraceSink")(function* (options: Trac try { sink.write(chunk); } catch { - buffer = []; + buffer.unshift(chunk); } }; @@ -59,7 +59,7 @@ export const makeTraceSink = Effect.fn("makeTraceSink")(function* (options: Trac flushUnsafe(); } } catch { - buffer = []; + return; } }, flush, diff --git a/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts b/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts index 16bbd98d6b..dd945cc9e7 100644 --- a/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts +++ b/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts @@ -8,7 +8,7 @@ import { TurnId, type OrchestrationEvent, } from "@t3tools/contracts"; -import { Effect, Layer, ManagedRuntime, Option, Queue, Stream } from "effect"; +import { Effect, Layer, ManagedRuntime, Metric, Option, Queue, Stream } from "effect"; import { describe, expect, it } from "vitest"; import { PersistenceSqlError } from "../../persistence/Errors.ts"; @@ -62,6 +62,17 @@ function now() { return new Date().toISOString(); } +const hasMetricSnapshot = ( + snapshots: ReadonlyArray, + id: string, + attributes: Readonly>, +) => + snapshots.some( + (snapshot) => + snapshot.id === id && + Object.entries(attributes).every(([key, value]) => snapshot.attributes?.[key] === value), + ); + describe("OrchestrationEngine", () => { it("bootstraps the in-memory read model from persisted projections", async () => { const failOnHistoricalReplayStore: OrchestrationEventStoreShape = { @@ -408,6 +419,57 @@ describe("OrchestrationEngine", () => { await system.dispose(); }); + it("records command ack duration using the first committed event type", async () => { + const system = await 
createOrchestrationSystem(); + const { engine } = system; + const createdAt = now(); + + await system.run( + engine.dispatch({ + type: "project.create", + commandId: CommandId.makeUnsafe("cmd-project-ack-create"), + projectId: asProjectId("project-ack"), + title: "Ack Project", + workspaceRoot: "/tmp/project-ack", + defaultModelSelection: { + provider: "codex", + model: "gpt-5-codex", + }, + createdAt, + }), + ); + + await system.run( + engine.dispatch({ + type: "thread.create", + commandId: CommandId.makeUnsafe("cmd-thread-ack-create"), + threadId: ThreadId.makeUnsafe("thread-ack"), + projectId: asProjectId("project-ack"), + title: "Ack Thread", + modelSelection: { + provider: "codex", + model: "gpt-5-codex", + }, + interactionMode: DEFAULT_PROVIDER_INTERACTION_MODE, + runtimeMode: "full-access", + branch: null, + worktreePath: null, + createdAt, + }), + ); + + const snapshots = await system.run(Metric.snapshot); + expect( + hasMetricSnapshot(snapshots, "t3_orchestration_command_ack_duration", { + commandType: "thread.create", + aggregateKind: "thread", + ackEventType: "thread.created", + }), + ).toBe(true); + + await system.dispose(); + }); + it("stores completed checkpoint summaries even when no files changed", async () => { const system = await createOrchestrationSystem(); const { engine } = system; diff --git a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts index 4266bb0d61..15e3b45ad0 100644 --- a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts +++ b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts @@ -5,10 +5,23 @@ import type { ThreadId, } from "@t3tools/contracts"; import { OrchestrationCommand } from "@t3tools/contracts"; -import { Deferred, Effect, Layer, Option, PubSub, Queue, Schema, Stream } from "effect"; +import { + Deferred, + Duration, + Effect, + Layer, + Metric, + Option, + PubSub, + Queue, + Schema, + Stream, +} from "effect"; import * as SqlClient 
from "effect/unstable/sql/SqlClient"; import { + metricAttributes, + orchestrationCommandAckDuration, orchestrationCommandsTotal, orchestrationCommandDuration, withMetrics, @@ -33,6 +46,7 @@ import { interface CommandEnvelope { command: OrchestrationCommand; result: Deferred.Deferred<{ sequence: number }, OrchestrationDispatchError>; + startedAtMs: number; } function commandToAggregateRef(command: OrchestrationCommand): { @@ -169,8 +183,21 @@ const makeOrchestrationEngine = Effect.gen(function* () { ); readModel = committedCommand.nextReadModel; - for (const event of committedCommand.committedEvents) { + for (const [index, event] of committedCommand.committedEvents.entries()) { yield* PubSub.publish(eventPubSub, event); + if (index === 0) { + yield* Metric.update( + Metric.withAttributes( + orchestrationCommandAckDuration, + metricAttributes({ + commandType: envelope.command.type, + aggregateKind: aggregateRef.aggregateKind, + ackEventType: event.type, + }), + ), + Duration.millis(Math.max(0, Date.now() - envelope.startedAtMs)), + ); + } } yield* Deferred.succeed(envelope.result, { sequence: committedCommand.lastSequence }); }).pipe( @@ -236,7 +263,7 @@ const makeOrchestrationEngine = Effect.gen(function* () { const dispatch: OrchestrationEngineShape["dispatch"] = (command) => Effect.gen(function* () { const result = yield* Deferred.make<{ sequence: number }, OrchestrationDispatchError>(); - yield* Queue.offer(commandQueue, { command, result }); + yield* Queue.offer(commandQueue, { command, result, startedAtMs: Date.now() }); return yield* Deferred.await(result); }); diff --git a/apps/server/src/provider/Layers/ProviderService.test.ts b/apps/server/src/provider/Layers/ProviderService.test.ts index 651a611649..b23c113618 100644 --- a/apps/server/src/provider/Layers/ProviderService.test.ts +++ b/apps/server/src/provider/Layers/ProviderService.test.ts @@ -20,7 +20,7 @@ import { import { it, assert, vi } from "@effect/vitest"; import { assertFailure } from 
"@effect/vitest/utils"; -import { Effect, Fiber, Layer, Option, PubSub, Ref, Stream } from "effect"; +import { Effect, Fiber, Layer, Metric, Option, PubSub, Ref, Stream } from "effect"; import * as SqlClient from "effect/unstable/sql/SqlClient"; import { @@ -230,6 +230,17 @@ function makeFakeCodexAdapter(provider: ProviderKind = "codex") { const sleep = (ms: number) => Effect.promise(() => new Promise((resolve) => setTimeout(resolve, ms))); +const hasMetricSnapshot = ( + snapshots: ReadonlyArray, + id: string, + attributes: Readonly>, +) => + snapshots.some( + (snapshot) => + snapshot.id === id && + Object.entries(attributes).every(([key, value]) => snapshot.attributes?.[key] === value), + ); + function makeProviderServiceLayer() { const codex = makeFakeCodexAdapter(); const claude = makeFakeCodexAdapter("claudeAgent"); @@ -1060,6 +1071,81 @@ fanout.layer("ProviderServiceLive fanout", (it) => { ); }), ); + + it.effect("records provider metrics with the routed provider label", () => + Effect.gen(function* () { + const provider = yield* ProviderService; + + const session = yield* provider.startSession(asThreadId("thread-metrics"), { + provider: "claudeAgent", + threadId: asThreadId("thread-metrics"), + cwd: "/tmp/project", + runtimeMode: "full-access", + }); + + yield* provider.interruptTurn({ threadId: session.threadId }); + yield* provider.respondToRequest({ + threadId: session.threadId, + requestId: asRequestId("req-metrics-1"), + decision: "accept", + }); + yield* provider.respondToUserInput({ + threadId: session.threadId, + requestId: asRequestId("req-metrics-2"), + answers: { + sandbox_mode: "workspace-write", + }, + }); + yield* provider.rollbackConversation({ + threadId: session.threadId, + numTurns: 1, + }); + yield* provider.stopSession({ threadId: session.threadId }); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turns_total", { + provider: "claudeAgent", + operation: "interrupt", + outcome: 
"success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turns_total", { + provider: "claudeAgent", + operation: "approval-response", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turns_total", { + provider: "claudeAgent", + operation: "user-input-response", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turns_total", { + provider: "claudeAgent", + operation: "rollback", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_sessions_total", { + provider: "claudeAgent", + operation: "stop", + outcome: "success", + }), + true, + ); + }), + ); }); const validation = makeProviderServiceLayer(); diff --git a/apps/server/src/provider/Layers/ProviderService.ts b/apps/server/src/provider/Layers/ProviderService.ts index c3947805a6..6c18849c44 100644 --- a/apps/server/src/provider/Layers/ProviderService.ts +++ b/apps/server/src/provider/Layers/ProviderService.ts @@ -460,12 +460,14 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderInterruptTurnInput, payload: rawInput, }); + let metricProvider = "unknown"; return yield* Effect.gen(function* () { const routed = yield* resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.interruptTurn", allowRecovery: true, }); + metricProvider = routed.adapter.provider; yield* Effect.annotateCurrentSpan({ "provider.operation": "interrupt-turn", "provider.kind": routed.adapter.provider, @@ -479,9 +481,10 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( }).pipe( withMetrics({ counter: providerTurnsTotal, - attributes: providerMetricAttributes("unknown", { - operation: "interrupt", - }), + outcomeAttributes: () => + providerMetricAttributes(metricProvider, { + operation: "interrupt", + }), }), ); }, @@ -494,12 +497,14 @@ const makeProviderService = 
Effect.fn("makeProviderService")(function* ( schema: ProviderRespondToRequestInput, payload: rawInput, }); + let metricProvider = "unknown"; return yield* Effect.gen(function* () { const routed = yield* resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.respondToRequest", allowRecovery: true, }); + metricProvider = routed.adapter.provider; yield* Effect.annotateCurrentSpan({ "provider.operation": "respond-to-request", "provider.kind": routed.adapter.provider, @@ -514,9 +519,10 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( }).pipe( withMetrics({ counter: providerTurnsTotal, - attributes: providerMetricAttributes("unknown", { - operation: "approval-response", - }), + outcomeAttributes: () => + providerMetricAttributes(metricProvider, { + operation: "approval-response", + }), }), ); }, @@ -530,12 +536,14 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderRespondToUserInputInput, payload: rawInput, }); + let metricProvider = "unknown"; return yield* Effect.gen(function* () { const routed = yield* resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.respondToUserInput", allowRecovery: true, }); + metricProvider = routed.adapter.provider; yield* Effect.annotateCurrentSpan({ "provider.operation": "respond-to-user-input", "provider.kind": routed.adapter.provider, @@ -546,9 +554,10 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( }).pipe( withMetrics({ counter: providerTurnsTotal, - attributes: providerMetricAttributes("unknown", { - operation: "user-input-response", - }), + outcomeAttributes: () => + providerMetricAttributes(metricProvider, { + operation: "user-input-response", + }), }), ); }); @@ -560,12 +569,14 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( schema: ProviderStopSessionInput, payload: rawInput, }); + let metricProvider = "unknown"; return yield* Effect.gen(function* () { const 
routed = yield* resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.stopSession", allowRecovery: false, }); + metricProvider = routed.adapter.provider; yield* Effect.annotateCurrentSpan({ "provider.operation": "stop-session", "provider.kind": routed.adapter.provider, @@ -581,9 +592,10 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( }).pipe( withMetrics({ counter: providerSessionsTotal, - attributes: providerMetricAttributes("unknown", { - operation: "stop", - }), + outcomeAttributes: () => + providerMetricAttributes(metricProvider, { + operation: "stop", + }), }), ); }, @@ -651,12 +663,14 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( if (input.numTurns === 0) { return; } + let metricProvider = "unknown"; return yield* Effect.gen(function* () { const routed = yield* resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.rollbackConversation", allowRecovery: true, }); + metricProvider = routed.adapter.provider; yield* Effect.annotateCurrentSpan({ "provider.operation": "rollback-conversation", "provider.kind": routed.adapter.provider, @@ -671,9 +685,10 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( }).pipe( withMetrics({ counter: providerTurnsTotal, - attributes: providerMetricAttributes("unknown", { - operation: "rollback", - }), + outcomeAttributes: () => + providerMetricAttributes(metricProvider, { + operation: "rollback", + }), }), ); }); From 988862989dfe85e23d8fad8016dba0dc931d4ce3 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 17:24:21 -0700 Subject: [PATCH 3/8] Surface observability config in the settings UI - expose logs folder and OTLP tracing state in server config - add a Diagnostics section with an open-logs-folder action - update tests and observability docs for the new config shape --- apps/server/src/server.test.ts | 10 + apps/server/src/ws.ts | 8 + apps/web/src/components/ChatView.browser.tsx | 
6 + .../components/KeybindingsToast.browser.tsx | 6 + .../settings/SettingsPanels.browser.tsx | 91 +++ .../components/settings/SettingsPanels.tsx | 109 +++- apps/web/src/rpc/serverState.test.ts | 6 + apps/web/src/rpc/serverState.ts | 5 + apps/web/src/wsNativeApi.test.ts | 6 + docs/observability.md | 561 ++++++++++-------- packages/contracts/src/server.ts | 11 + 11 files changed, 542 insertions(+), 277 deletions(-) create mode 100644 apps/web/src/components/settings/SettingsPanels.browser.tsx diff --git a/apps/server/src/server.test.ts b/apps/server/src/server.test.ts index 2c7a8eabac..0e8b909c99 100644 --- a/apps/server/src/server.test.ts +++ b/apps/server/src/server.test.ts @@ -542,6 +542,10 @@ it.layer(NodeServices.layer)("server router seam", (it) => { } as const; yield* buildAppUnderTest({ + config: { + otlpTracesUrl: "http://localhost:4318/v1/traces", + otlpMetricsUrl: "http://localhost:4318/v1/metrics", + }, layers: { keybindings: { loadConfigState: Effect.succeed({ @@ -570,6 +574,12 @@ it.layer(NodeServices.layer)("server router seam", (it) => { assert.deepEqual(first.config.keybindings, []); assert.deepEqual(first.config.issues, []); assert.deepEqual(first.config.providers, providers); + assert.equal(first.config.observability.logsDirectoryPath.endsWith("/logs"), true); + assert.equal(first.config.observability.localTracingEnabled, true); + assert.equal(first.config.observability.otlpTracesUrl, "http://localhost:4318/v1/traces"); + assert.equal(first.config.observability.otlpTracesEnabled, true); + assert.equal(first.config.observability.otlpMetricsUrl, "http://localhost:4318/v1/metrics"); + assert.equal(first.config.observability.otlpMetricsEnabled, true); assert.deepEqual(first.config.settings, DEFAULT_SERVER_SETTINGS); } assert.deepEqual(second, { diff --git a/apps/server/src/ws.ts b/apps/server/src/ws.ts index 3fab1850ca..5210d3748b 100644 --- a/apps/server/src/ws.ts +++ b/apps/server/src/ws.ts @@ -72,6 +72,14 @@ const WsRpcLayer = WsRpcGroup.toLayer( 
issues: keybindingsConfig.issues, providers, availableEditors: resolveAvailableEditors(), + observability: { + logsDirectoryPath: config.logsDir, + localTracingEnabled: true, + ...(config.otlpTracesUrl !== undefined ? { otlpTracesUrl: config.otlpTracesUrl } : {}), + otlpTracesEnabled: config.otlpTracesUrl !== undefined, + ...(config.otlpMetricsUrl !== undefined ? { otlpMetricsUrl: config.otlpMetricsUrl } : {}), + otlpMetricsEnabled: config.otlpMetricsUrl !== undefined, + }, settings, }; }); diff --git a/apps/web/src/components/ChatView.browser.tsx b/apps/web/src/components/ChatView.browser.tsx index b0263cbf40..f9f1362d0e 100644 --- a/apps/web/src/components/ChatView.browser.tsx +++ b/apps/web/src/components/ChatView.browser.tsx @@ -135,6 +135,12 @@ function createBaseServerConfig(): ServerConfig { }, ], availableEditors: [], + observability: { + logsDirectoryPath: "/repo/project/.t3/logs", + localTracingEnabled: true, + otlpTracesEnabled: false, + otlpMetricsEnabled: false, + }, settings: { ...DEFAULT_SERVER_SETTINGS, ...DEFAULT_CLIENT_SETTINGS, diff --git a/apps/web/src/components/KeybindingsToast.browser.tsx b/apps/web/src/components/KeybindingsToast.browser.tsx index b8398e7b8f..de7d84b4ec 100644 --- a/apps/web/src/components/KeybindingsToast.browser.tsx +++ b/apps/web/src/components/KeybindingsToast.browser.tsx @@ -56,6 +56,12 @@ function createBaseServerConfig(): ServerConfig { }, ], availableEditors: [], + observability: { + logsDirectoryPath: "/repo/project/.t3/logs", + localTracingEnabled: true, + otlpTracesEnabled: false, + otlpMetricsEnabled: false, + }, settings: { enableAssistantStreaming: false, defaultThreadEnvMode: "local" as const, diff --git a/apps/web/src/components/settings/SettingsPanels.browser.tsx b/apps/web/src/components/settings/SettingsPanels.browser.tsx new file mode 100644 index 0000000000..090f6f12ad --- /dev/null +++ b/apps/web/src/components/settings/SettingsPanels.browser.tsx @@ -0,0 +1,91 @@ +import "../../index.css"; + +import { 
DEFAULT_SERVER_SETTINGS, type NativeApi, type ServerConfig } from "@t3tools/contracts"; +import { page } from "vitest/browser"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { render } from "vitest-browser-react"; + +import { __resetNativeApiForTests } from "../../nativeApi"; +import { AppAtomRegistryProvider } from "../../rpc/atomRegistry"; +import { resetServerStateForTests, setServerConfigSnapshot } from "../../rpc/serverState"; +import { GeneralSettingsPanel } from "./SettingsPanels"; + +function createBaseServerConfig(): ServerConfig { + return { + cwd: "/repo/project", + keybindingsConfigPath: "/repo/project/.t3code-keybindings.json", + keybindings: [], + issues: [], + providers: [], + availableEditors: ["cursor"], + observability: { + logsDirectoryPath: "/repo/project/.t3/logs", + localTracingEnabled: true, + otlpTracesUrl: "http://localhost:4318/v1/traces", + otlpTracesEnabled: true, + otlpMetricsEnabled: false, + }, + settings: DEFAULT_SERVER_SETTINGS, + }; +} + +describe("GeneralSettingsPanel observability", () => { + beforeEach(() => { + resetServerStateForTests(); + __resetNativeApiForTests(); + localStorage.clear(); + document.body.innerHTML = ""; + }); + + afterEach(() => { + resetServerStateForTests(); + __resetNativeApiForTests(); + document.body.innerHTML = ""; + }); + + it("shows diagnostics inside About with a single logs-folder action", async () => { + setServerConfigSnapshot(createBaseServerConfig()); + + await render( + + + , + ); + + await expect.element(page.getByText("About")).toBeInTheDocument(); + await expect.element(page.getByText("Diagnostics")).toBeInTheDocument(); + await expect.element(page.getByText("Open logs folder")).toBeInTheDocument(); + await expect + .element(page.getByText("/repo/project/.t3/logs", { exact: true })) + .toBeInTheDocument(); + await expect + .element( + page.getByText( + "Local trace file. 
OTLP exporting traces to http://localhost:4318/v1/traces.", + ), + ) + .toBeInTheDocument(); + }); + + it("opens the logs folder in the preferred editor", async () => { + const openInEditor = vi.fn().mockResolvedValue(undefined); + window.nativeApi = { + shell: { + openInEditor, + }, + } as unknown as NativeApi; + + setServerConfigSnapshot(createBaseServerConfig()); + + await render( + + + , + ); + + const openLogsButton = page.getByText("Open logs folder"); + await openLogsButton.click(); + + expect(openInEditor).toHaveBeenCalledWith("/repo/project/.t3/logs", "cursor"); + }); +}); diff --git a/apps/web/src/components/settings/SettingsPanels.tsx b/apps/web/src/components/settings/SettingsPanels.tsx index e3e620030b..d534eefaa4 100644 --- a/apps/web/src/components/settings/SettingsPanels.tsx +++ b/apps/web/src/components/settings/SettingsPanels.tsx @@ -61,6 +61,7 @@ import { ProjectFavicon } from "../ProjectFavicon"; import { useServerAvailableEditors, useServerKeybindingsConfigPath, + useServerObservability, useServerProviders, } from "../../rpc/serverState"; @@ -518,8 +519,13 @@ export function GeneralSettingsPanel() { const { theme, setTheme } = useTheme(); const settings = useSettings(); const { updateSettings } = useUpdateSettings(); - const [isOpeningKeybindings, setIsOpeningKeybindings] = useState(false); - const [openKeybindingsError, setOpenKeybindingsError] = useState(null); + const [openingPathByTarget, setOpeningPathByTarget] = useState({ + keybindings: false, + logsDirectory: false, + }); + const [openPathErrorByTarget, setOpenPathErrorByTarget] = useState< + Partial> + >({}); const [openProviderDetails, setOpenProviderDetails] = useState>({ codex: Boolean( settings.providers.codex.binaryPath !== DEFAULT_UNIFIED_SETTINGS.providers.codex.binaryPath || @@ -561,8 +567,21 @@ export function GeneralSettingsPanel() { const keybindingsConfigPath = useServerKeybindingsConfigPath(); const availableEditors = useServerAvailableEditors(); + const observability = 
useServerObservability(); const serverProviders = useServerProviders(); const codexHomePath = settings.providers.codex.homePath; + const logsDirectoryPath = observability?.logsDirectoryPath ?? null; + const diagnosticsDescription = (() => { + const exports: string[] = []; + if (observability?.otlpTracesEnabled && observability.otlpTracesUrl) { + exports.push(`traces to ${observability.otlpTracesUrl}`); + } + if (observability?.otlpMetricsEnabled && observability.otlpMetricsUrl) { + exports.push(`metrics to ${observability.otlpMetricsUrl}`); + } + const mode = observability?.localTracingEnabled ? "Local trace file" : "Terminal logs only"; + return exports.length > 0 ? `${mode}. OTLP exporting ${exports.join(" and ")}.` : `${mode}.`; + })(); const textGenerationModelSelection = resolveAppModelSelectionState(settings, serverProviders); const textGenProvider = textGenerationModelSelection.provider; @@ -579,27 +598,49 @@ export function GeneralSettingsPanel() { DEFAULT_UNIFIED_SETTINGS.textGenerationModelSelection ?? null, ); + const openInPreferredEditor = useCallback( + (target: "keybindings" | "logsDirectory", path: string | null, failureMessage: string) => { + if (!path) return; + setOpenPathErrorByTarget((existing) => ({ ...existing, [target]: null })); + setOpeningPathByTarget((existing) => ({ ...existing, [target]: true })); + + const editor = resolveAndPersistPreferredEditor(availableEditors ?? []); + if (!editor) { + setOpenPathErrorByTarget((existing) => ({ + ...existing, + [target]: "No available editors found.", + })); + setOpeningPathByTarget((existing) => ({ ...existing, [target]: false })); + return; + } + + void ensureNativeApi() + .shell.openInEditor(path, editor) + .catch((error) => { + setOpenPathErrorByTarget((existing) => ({ + ...existing, + [target]: error instanceof Error ? 
error.message : failureMessage, + })); + }) + .finally(() => { + setOpeningPathByTarget((existing) => ({ ...existing, [target]: false })); + }); + }, + [availableEditors], + ); + const openKeybindingsFile = useCallback(() => { - if (!keybindingsConfigPath) return; - setOpenKeybindingsError(null); - setIsOpeningKeybindings(true); - const editor = resolveAndPersistPreferredEditor(availableEditors ?? []); - if (!editor) { - setOpenKeybindingsError("No available editors found."); - setIsOpeningKeybindings(false); - return; - } - void ensureNativeApi() - .shell.openInEditor(keybindingsConfigPath, editor) - .catch((error) => { - setOpenKeybindingsError( - error instanceof Error ? error.message : "Unable to open keybindings file.", - ); - }) - .finally(() => { - setIsOpeningKeybindings(false); - }); - }, [availableEditors, keybindingsConfigPath]); + openInPreferredEditor("keybindings", keybindingsConfigPath, "Unable to open keybindings file."); + }, [keybindingsConfigPath, openInPreferredEditor]); + + const openLogsDirectory = useCallback(() => { + openInPreferredEditor("logsDirectory", logsDirectoryPath, "Unable to open logs folder."); + }, [logsDirectoryPath, openInPreferredEditor]); + + const openKeybindingsError = openPathErrorByTarget.keybindings ?? null; + const openDiagnosticsError = openPathErrorByTarget.logsDirectory ?? null; + const isOpeningKeybindings = openingPathByTarget.keybindings; + const isOpeningLogsDirectory = openingPathByTarget.logsDirectory; const addCustomModel = useCallback( (provider: ProviderKind) => { @@ -1408,6 +1449,30 @@ export function GeneralSettingsPanel() { description="Current version of the application." /> )} + + + {logsDirectoryPath ?? "Resolving logs directory..."} + + {openDiagnosticsError ? 
( + {openDiagnosticsError} + ) : null} + + } + control={ + + } + /> ); diff --git a/apps/web/src/rpc/serverState.test.ts b/apps/web/src/rpc/serverState.test.ts index 8da5846fe6..721ce25fb5 100644 --- a/apps/web/src/rpc/serverState.test.ts +++ b/apps/web/src/rpc/serverState.test.ts @@ -57,6 +57,12 @@ const baseServerConfig: ServerConfig = { issues: [], providers: defaultProviders, availableEditors: ["cursor"], + observability: { + logsDirectoryPath: "/tmp/workspace/.config/logs", + localTracingEnabled: true, + otlpTracesEnabled: false, + otlpMetricsEnabled: false, + }, settings: DEFAULT_SERVER_SETTINGS, }; diff --git a/apps/web/src/rpc/serverState.ts b/apps/web/src/rpc/serverState.ts index 2a69394062..3e8dbab28c 100644 --- a/apps/web/src/rpc/serverState.ts +++ b/apps/web/src/rpc/serverState.ts @@ -50,6 +50,7 @@ const selectAvailableEditors = (config: ServerConfig | null): ReadonlyArray config?.keybindings ?? EMPTY_KEYBINDINGS; const selectKeybindingsConfigPath = (config: ServerConfig | null) => config?.keybindingsConfigPath ?? null; +const selectObservability = (config: ServerConfig | null) => config?.observability ?? null; const selectProviders = (config: ServerConfig | null) => config?.providers ?? 
EMPTY_SERVER_PROVIDERS; const selectSettings = (config: ServerConfig | null): ServerSettings => @@ -281,6 +282,10 @@ export function useServerKeybindingsConfigPath(): string | null { return useAtomValue(serverConfigAtom, selectKeybindingsConfigPath); } +export function useServerObservability(): ServerConfig["observability"] | null { + return useAtomValue(serverConfigAtom, selectObservability); +} + export function useServerWelcomeSubscription( listener: (payload: ServerLifecycleWelcomePayload) => void, ): void { diff --git a/apps/web/src/wsNativeApi.test.ts b/apps/web/src/wsNativeApi.test.ts index c9bcc7ffd2..47f78f967f 100644 --- a/apps/web/src/wsNativeApi.test.ts +++ b/apps/web/src/wsNativeApi.test.ts @@ -159,6 +159,12 @@ const baseServerConfig: ServerConfig = { issues: [], providers: defaultProviders, availableEditors: ["cursor"], + observability: { + logsDirectoryPath: "/tmp/workspace/.config/logs", + localTracingEnabled: true, + otlpTracesEnabled: false, + otlpMetricsEnabled: false, + }, settings: DEFAULT_SERVER_SETTINGS, }; diff --git a/docs/observability.md b/docs/observability.md index edf8d44bd9..dde1093570 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -1,248 +1,183 @@ # Observability -This server now has one observability model: +T3 Code has one server-side observability model: -- pretty logs to stdout for humans -- completed spans written to a local NDJSON trace file -- optional OTLP export for traces and metrics +- pretty logs go to stdout for humans +- completed spans go to a local NDJSON trace file +- traces and metrics can also be exported over OTLP to a real backend like Grafana LGTM -The local trace file is the source of truth for persisted diagnostics. +The local trace file is the persisted source of truth. There is no separate persisted server log file anymore. -## Quick Start +## Where To Find Things -If you just want a local Grafana stack and working telemetry in a few minutes, use `grafana/otel-lgtm`. +### Logs -### 1. 
Start a local LGTM stack +Logs are human-facing only: -Grafana publishes a single-container dev/test image for local OpenTelemetry work: +- destination: stdout +- format: `Logger.consolePretty()` +- persistence: none -```bash -docker run --name lgtm \ - -p 3000:3000 \ - -p 4317:4317 \ - -p 4318:4318 \ - --rm -ti \ - grafana/otel-lgtm -``` +If you want a log message to show up in the trace file, emit it inside an active span with `Effect.log...`. `Logger.tracerLogger` will attach it as a span event. -Wait for the container to report that the collector and stack are ready, then open `http://localhost:3000`. +### Traces -Grafana login: +Completed spans are written as NDJSON records to `serverTracePath` (by default, `~/.t3/userdata/logs/server.trace.ndjson`). -- username: `admin` -- password: `admin` +Important fields in each record: + +- `name`: span name +- `traceId`, `spanId`, `parentSpanId`: correlation +- `durationMs`: elapsed time +- `attributes`: structured context +- `events`: embedded logs and custom events +- `exit`: `Success`, `Failure`, or `Interrupted` -### 2. Point T3 Code at the local collector +The schema lives in `apps/server/src/observability/TraceRecord.ts`. -Our server exporters use OTLP HTTP, so set full OTLP HTTP endpoints: +### Metrics -```bash -export T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces -export T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics -``` +Metrics are not written to a local file. -Optional but useful: +- local persistence: none +- remote export: OTLP only, when configured +- current definitions: `apps/server/src/observability/Metrics.ts` -```bash -export T3CODE_OTLP_SERVICE_NAME=t3-local -export T3CODE_TRACE_MIN_LEVEL=Info -``` +If OTLP is not configured, metrics still exist in-process, but you will not have a local artifact to inspect. + +### Related Artifacts + +Provider event NDJSON files still exist for provider runtime streams. Those are separate from the main server trace file. 
+ +## Run The Server In Instrumented Mode + +There are two useful modes: + +- local-only: stdout + local `server.trace.ndjson` +- full local observability: stdout + local trace file + OTLP export to Grafana/Tempo/Prometheus -### 3. Run the app you care about +The local trace file is always on. OTLP export is opt-in. -#### Published CLI +### Option 1: Local Traces Only + +You do not need any extra env vars. Just run the app normally and inspect `server.trace.ndjson`. + +Examples: ```bash npx t3 ``` -#### Monorepo web/server dev - ```bash bun dev ``` -#### Monorepo desktop dev - ```bash bun dev:desktop ``` -#### Packaged desktop app - -Launch it from a terminal in the same environment so the desktop process inherits `T3CODE_OTLP_*`. - -Important: - -- the desktop app forwards these env vars to the embedded backend process -- launching from Finder / Spotlight / Start Menu / dock icons usually will not inject new shell env vars - -### 4. Where to look in Grafana - -For this app today: - -- traces: use the `Tempo` data source -- metrics: use the `Prometheus` data source -- logs: do not expect much in `Loki` yet, because T3 Code does not export OTLP logs as a separate signal - -The useful first stop is usually `Explore`: - -1. Open `Explore` -2. Pick `Tempo` to inspect traces -3. Search for spans by service name, span name, or span attributes -4. Switch to `Prometheus` to inspect counters and latency metrics - -### 5. Keep the local trace file enabled too - -Even when exporting to LGTM, the server still writes the local NDJSON trace file. That is useful for: - -- quick local `jq` inspection -- debugging before Grafana is open -- checking raw span/event payloads - -So the normal local-dev setup is: - -- pretty stdout for live debugging -- local trace file for raw inspection -- LGTM for search, aggregation, and dashboards +### Option 2: Run With A Local LGTM Stack -## What Gets Recorded +#### 1. 
Start Grafana LGTM -### Traces - -Every completed span is written as one NDJSON record to `serverTracePath`. - -Default path: - -```txt -//logs/server.trace.ndjson +```bash +docker run --name lgtm \ + -p 3000:3000 \ + -p 4317:4317 \ + -p 4318:4318 \ + --rm -ti \ + grafana/otel-lgtm ``` -In normal runs that resolves from `T3CODE_HOME`. In dev mode it uses `dev/logs`; otherwise it uses `userdata/logs`. - -The trace record shape lives in `apps/server/src/observability/TraceRecord.ts`. - -Useful fields: - -- `name`: span name -- `traceId`, `spanId`, `parentSpanId`: correlation -- `durationMs`: elapsed time -- `attributes`: structured context -- `events`: embedded log events and custom events -- `exit`: `Success`, `Failure`, or `Interrupted` - -### Logs - -Application logs are not persisted as a separate file anymore. - -- `Logger.consolePretty()` writes human-readable logs to stdout -- `Logger.tracerLogger` turns `Effect.log...` calls inside an active span into span events -- logs emitted outside a span are stdout-only - -That means if you want a log message to survive into the trace file, it needs to happen inside a traced effect. - -### Metrics - -Metrics are first-class in code, but they are not written to a local file. - -- local persistence: none -- remote export: OTLP only, when configured - -Metric definitions live in `apps/server/src/observability/Metrics.ts`. - -## Runtime Wiring - -The server observability layer is assembled in `apps/server/src/observability/Layers/Observability.ts`. - -It provides: +Then open `http://localhost:3000`. 
-- pretty stdout logger -- `Logger.tracerLogger` -- local NDJSON tracer -- optional OTLP trace exporter -- optional OTLP metrics exporter -- Effect trace-level and timing refs +Default Grafana login: -## Config - -### Local Trace File - -- `T3CODE_TRACE_FILE`: override the trace file path -- `T3CODE_TRACE_MAX_BYTES`: per-file rotation size, default `10485760` -- `T3CODE_TRACE_MAX_FILES`: rotated file count, default `10` -- `T3CODE_TRACE_BATCH_WINDOW_MS`: flush window, default `200` -- `T3CODE_TRACE_MIN_LEVEL`: minimum trace level, default `Info` -- `T3CODE_TRACE_TIMING_ENABLED`: enable timing metadata, default `true` +- username: `admin` +- password: `admin` -### OTLP Export +#### 2. Export OTLP env vars -- `T3CODE_OTLP_TRACES_URL`: OTLP trace endpoint -- `T3CODE_OTLP_METRICS_URL`: OTLP metric endpoint -- `T3CODE_OTLP_EXPORT_INTERVAL_MS`: export interval, default `10000` -- `T3CODE_OTLP_SERVICE_NAME`: service name, default `t3-server` +```bash +export T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces +export T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics +export T3CODE_OTLP_SERVICE_NAME=t3-local +``` -If the OTLP URLs are unset, local tracing still works and metrics simply stay in-process. +Optional: -## Common Run Modes +```bash +export T3CODE_TRACE_MIN_LEVEL=Info +export T3CODE_TRACE_TIMING_ENABLED=true +``` -### CLI or `npx t3` +#### 3. 
Launch the app from that same shell -This is the easiest way to test observability outside the monorepo: +CLI: ```bash -export T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces -export T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics -export T3CODE_OTLP_SERVICE_NAME=t3-cli - npx t3 ``` -### Monorepo `bun dev` - -`bun dev` forwards your shell environment to the server process, so the same OTLP vars work: +Monorepo web/server dev: ```bash -T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \ -T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \ -T3CODE_OTLP_SERVICE_NAME=t3-dev \ bun dev ``` -### Monorepo `bun dev:desktop` +Monorepo desktop dev: + +```bash +bun dev:desktop +``` + +Packaged desktop app: + +Launch the actual app executable from the same shell so the desktop app and embedded backend inherit `T3CODE_OTLP_*`. -Desktop dev mode also inherits the shell environment, and the Electron main process forwards OTLP env vars to the child backend: +macOS app bundle example: ```bash T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \ T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \ -T3CODE_OTLP_SERVICE_NAME=t3-desktop-dev \ -bun dev:desktop +T3CODE_OTLP_SERVICE_NAME=t3-desktop \ +"/Applications/T3 Code.app/Contents/MacOS/T3 Code" ``` -### Packaged desktop app - -For packaged builds, start the app from a terminal if you want one-off local OTLP config: +Direct binary example: ```bash T3CODE_OTLP_TRACES_URL=http://localhost:4318/v1/traces \ T3CODE_OTLP_METRICS_URL=http://localhost:4318/v1/metrics \ T3CODE_OTLP_SERVICE_NAME=t3-desktop \ - +./path/to/your/desktop-app-binary ``` -If you launch the app from the OS UI instead, the env vars usually will not be present. +Do not rely on launching from Finder, Spotlight, the dock, or the Start menu after setting shell env vars. Those launches usually will not pick them up. + +#### 4. 
Fully restart after changing env -## How To Read The Trace File +The backend reads observability config at process start. If you change OTLP env vars, stop the app completely and start it again. -The trace file is NDJSON, so `jq` is the easiest way to explore it. +## How To Use Traces And Metrics To Debug The Server -### Tail everything +### Start With The Local Trace File + +The trace file is the fastest way to inspect raw span data. + +Tail it: ```bash tail -f "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Show failures only +In monorepo dev, use: + +```bash +tail -f ./dev/logs/server.trace.ndjson +``` + +Show failed spans: ```bash jq -c 'select(.exit._tag != "Success") | { @@ -253,7 +188,7 @@ jq -c 'select(.exit._tag != "Success") | { }' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Find slow spans +Show slow spans: ```bash jq -c 'select(.durationMs > 1000) | { @@ -264,7 +199,7 @@ jq -c 'select(.durationMs > 1000) | { }' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Inspect embedded log events +Inspect embedded log events: ```bash jq -c 'select(any(.events[]?; .attributes["effect.logLevel"] != null)) | { @@ -281,7 +216,7 @@ jq -c 'select(any(.events[]?; .attributes["effect.logLevel"] != null)) | { }' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Follow one trace +Follow one trace: ```bash jq -r 'select(.traceId == "TRACE_ID_HERE") | [ @@ -292,19 +227,7 @@ jq -r 'select(.traceId == "TRACE_ID_HERE") | [ ] | @tsv' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Filter provider work - -```bash -jq -c 'select(.attributes["provider.thread_id"] == "thread_123") | { - name, - durationMs, - provider: .attributes["provider.kind"], - model: .attributes["provider.model"], - exit: .exit._tag -}' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" -``` - -### Filter orchestration commands +Filter orchestration commands: ```bash jq -c 'select(.attributes["orchestration.command_type"] != null) | { @@ -315,7 +238,7 @@ jq -c 
'select(.attributes["orchestration.command_type"] != null) | { }' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -### Filter git activity +Filter git activity: ```bash jq -c 'select(.attributes["git.operation"] != null) | { @@ -330,26 +253,138 @@ jq -c 'select(.attributes["git.operation"] != null) | { }' "$T3CODE_HOME/userdata/logs/server.trace.ndjson" ``` -## What Is Instrumented Today +### Use Tempo When You Need A Real Trace Viewer -Current high-value span and metric boundaries include: +Tempo is better than raw NDJSON when you want to: -- Effect RPC websocket request spans from `effect/rpc` -- RPC request metrics in `apps/server/src/observability/RpcInstrumentation.ts` -- startup phases -- orchestration command processing -- provider session and turn operations -- git command execution and git hook events -- terminal session lifecycle -- sqlite query execution +- search across many traces +- inspect parent/child relationships visually +- compare many slow traces +- drill into one failing request without hand-joining by `traceId` + +Recommended flow in Grafana: + +1. Open `Explore`. +2. Pick the `Tempo` data source. +3. Set the time range to something recent like `Last 15 minutes`. +4. Start broad. Do not begin with a very narrow query. +5. Look for spans from your configured service name, then narrow by span name or attributes. + +Good first searches: + +- service name such as `t3-local`, `t3-dev`, or `t3-desktop` +- span names like `sql.execute`, `git.runCommand`, `provider.sendTurn` +- orchestration spans with attributes like `orchestration.command_type` + +Once you know traces are arriving, narrower TraceQL queries like `name = "sql.execute"` become useful. + +### Use Metrics To See Systemic Problems + +Traces are best for one request. Metrics are best for trends. 
+ +Good metric families to watch: + +- `t3_rpc_request_duration` +- `t3_orchestration_command_duration` +- `t3_orchestration_command_ack_duration` +- `t3_provider_turn_duration` +- `t3_git_command_duration` +- `t3_db_query_duration` + +Counters tell you volume and failure rate: + +- `t3_rpc_requests_total` +- `t3_orchestration_commands_total` +- `t3_provider_turns_total` +- `t3_git_commands_total` +- `t3_db_queries_total` + +Use metrics when the question is: + +- "is this always slow?" +- "did this get worse after a change?" +- "which command type is failing most often?" + +Use traces when the question is: + +- "what happened in this specific request?" +- "which child span caused this one slow interaction?" +- "what logs were emitted inside the failing flow?" + +### What The New Ack Metric Means -Provider event NDJSON logging still exists separately for provider-runtime event streams. That is not the same artifact as the server trace file. +`t3_orchestration_command_ack_duration` measures: -## How To Instrument New Code +- start: command dispatch enters the orchestration engine +- end: the first committed domain event for that command is published by the server -### 1. Create or reuse a span +That is a server-side acknowledgment metric. It does not measure: -Prefer existing `Effect.fn("name")` boundaries where possible. For ad hoc work, wrap it: +- websocket transit to the browser +- client receipt +- React render time + +If you need those later, add client-side instrumentation or a dedicated server fanout metric. + +## Common Workflows + +### "Why did this request fail?" + +1. Start with the local NDJSON file. +2. Find spans where `exit._tag != "Success"`. +3. Group by `traceId`. +4. Inspect sibling spans and span events. +5. If needed, move to Tempo for the full trace tree. + +### "Why is the UI feeling slow?" + +1. Search for slow top-level spans in the trace file or Tempo. +2. Check child spans for sqlite, git, provider, or terminal work. +3. 
Look at the matching duration metrics to see whether the slowness is systemic. + +### "Did this command take too long to acknowledge?" + +1. Check `t3_orchestration_command_ack_duration` by `commandType`. +2. If it is high, inspect the corresponding orchestration trace. +3. Look at child spans for projection, sqlite, provider, or git work. + +### "Are git hooks causing latency?" + +1. Filter `git.operation` spans. +2. Inspect `git.hook.started` and `git.hook.finished` events. +3. Compare hook timing to the enclosing git span duration. + +### "Why do I have spans locally but nothing in Grafana?" + +Usually one of these is true: + +- `T3CODE_OTLP_TRACES_URL` was not set +- the app was launched from a different environment than the one where you exported the vars +- the app was not fully restarted after changing env +- Grafana is looking at the wrong time range or service name + +If the local NDJSON file is updating, local tracing is working. The problem is almost always OTLP export configuration or process startup. + +## How To Think About Adding Tracing To Future Code + +### Prefer Boundaries Over Tiny Helpers + +Good span boundaries: + +- RPC methods +- orchestration command handling +- provider adapter calls +- external process calls +- persistence writes +- queue handoffs + +Avoid tracing every tiny helper. Most helpers should inherit the active span rather than create a new one. + +### Reuse `Effect.fn(...)` Where It Already Exists + +The codebase already uses `Effect.fn("name")` heavily. That should usually be your first tracing boundary. + +For ad hoc work: ```ts import { Effect } from "effect"; @@ -365,33 +400,27 @@ const runThing = Effect.gen(function* () { }).pipe(Effect.withSpan("thing.run")); ``` -### 2. Add metrics with the pipeable API +### Put High-Cardinality Detail On Spans -Use `withMetrics(...)` from `apps/server/src/observability/Metrics.ts`. 
+Use span annotations for IDs, paths, and other detailed context: ```ts -import { Effect } from "effect"; - -import { someCounter, someDuration, withMetrics } from "../observability/Metrics.ts"; - -const program = doWork().pipe( - withMetrics({ - counter: someCounter, - timer: someDuration, - attributes: { - operation: "work", - }, - }), -); +yield * + Effect.annotateCurrentSpan({ + "provider.thread_id": input.threadId, + "provider.request_id": input.requestId, + "git.cwd": input.cwd, + }); ``` -Use low-cardinality metric attributes only. +### Keep Metric Labels Low Cardinality Good metric labels: - operation kind - method name - provider kind +- aggregate kind - outcome Bad metric labels: @@ -401,68 +430,90 @@ Bad metric labels: - file paths - cwd - full prompts -- high-cardinality model strings when a normalized family label would do +- full model strings when a normalized family label would do + +Detailed context belongs on spans, not metrics. -Put that detailed context on spans instead. +### Use Logs As Span Events -### 3. Use span annotations for high-cardinality detail +Logs inside a span become part of the trace story: ```ts -yield * - Effect.annotateCurrentSpan({ - "provider.thread_id": input.threadId, - "provider.request_id": input.requestId, - "git.cwd": input.cwd, - }); +yield * Effect.logInfo("starting provider turn"); +yield * Effect.logDebug("waiting for approval response"); ``` -### 4. Use logs as span events +Those messages show up as span events because `Logger.tracerLogger` is installed. 
+ +### Use The Pipeable Metrics API -If you want a trace to tell the story of what happened, log inside the span: +`withMetrics(...)` is the default way to attach a counter and timer to an effect: ```ts -yield * Effect.logInfo("starting provider turn"); -yield * Effect.logDebug("waiting for approval response"); +import { someCounter, someDuration, withMetrics } from "../observability/Metrics.ts"; + +const program = doWork().pipe( + withMetrics({ + counter: someCounter, + timer: someDuration, + attributes: { + operation: "work", + }, + }), +); ``` -Those messages will show up as span events in the trace file because `Logger.tracerLogger` is installed. +## Detailed API Reference -## Practical Workflows +### Runtime Wiring -### “Why did this request fail?” +The server observability layer is assembled in `apps/server/src/observability/Layers/Observability.ts`. -1. Find failed spans with `exit._tag != "Success"`. -2. Group by `traceId`. -3. Inspect sibling spans and embedded log events. -4. Look at attributes like `provider.thread_id`, `orchestration.command_type`, or `git.operation`. +It provides: -### “Why is the UI feeling slow?” +- pretty stdout logger +- `Logger.tracerLogger` +- local NDJSON tracer +- optional OTLP trace exporter +- optional OTLP metrics exporter +- Effect trace-level and timing refs -1. Sort spans by `durationMs`. -2. Look at top-level RPC request spans. -3. Check child spans for provider, git, terminal, or sqlite work. -4. Compare slow traces against metrics in your OTLP backend once metrics export is enabled. +### Env Vars -### “Are git hooks causing latency?” +Local trace file: -1. Filter spans with `git.operation`. -2. Inspect `git.hook.started` and `git.hook.finished` events. -3. Compare hook event timing to the enclosing git span duration. 
+- `T3CODE_TRACE_FILE`: override trace file path +- `T3CODE_TRACE_MAX_BYTES`: per-file rotation size, default `10485760` +- `T3CODE_TRACE_MAX_FILES`: rotated file count, default `10` +- `T3CODE_TRACE_BATCH_WINDOW_MS`: flush window, default `200` +- `T3CODE_TRACE_MIN_LEVEL`: minimum trace level, default `Info` +- `T3CODE_TRACE_TIMING_ENABLED`: enable timing metadata, default `true` -### “Where should I add more instrumentation?” +OTLP export: -Add spans or annotations at boundaries: +- `T3CODE_OTLP_TRACES_URL`: OTLP trace endpoint +- `T3CODE_OTLP_METRICS_URL`: OTLP metric endpoint +- `T3CODE_OTLP_EXPORT_INTERVAL_MS`: export interval, default `10000` +- `T3CODE_OTLP_SERVICE_NAME`: service name, default `t3-server` -- queue handoff -- provider adapter calls -- external process calls -- persistence writes -- RPC methods that fan out into multiple subsystems +If the OTLP URLs are unset, local tracing still works and metrics stay in-process only. + +### What Is Instrumented Today + +Current high-value span and metric boundaries include: -Avoid tracing every tiny helper. The useful unit is usually a boundary or a phase. 
+- Effect RPC websocket request spans from `effect/rpc` +- RPC request metrics in `apps/server/src/observability/RpcInstrumentation.ts` +- startup phases +- orchestration command processing +- orchestration command acknowledgment latency +- provider session and turn operations +- git command execution and git hook events +- terminal session lifecycle +- sqlite query execution -## Current Constraints +### Current Constraints - logs outside spans are not persisted - metrics are not snapshotted locally -- the old `serverLogPath` still exists in config for compatibility, but the trace file is the persisted observability artifact that matters +- the old `serverLogPath` still exists in config for compatibility, but the trace file is the persisted artifact that matters diff --git a/packages/contracts/src/server.ts b/packages/contracts/src/server.ts index d4038deb3a..776a0a89e9 100644 --- a/packages/contracts/src/server.ts +++ b/packages/contracts/src/server.ts @@ -72,6 +72,16 @@ export type ServerProvider = typeof ServerProvider.Type; export const ServerProviders = Schema.Array(ServerProvider); export type ServerProviders = typeof ServerProviders.Type; +export const ServerObservability = Schema.Struct({ + logsDirectoryPath: TrimmedNonEmptyString, + localTracingEnabled: Schema.Boolean, + otlpTracesUrl: Schema.optional(TrimmedNonEmptyString), + otlpTracesEnabled: Schema.Boolean, + otlpMetricsUrl: Schema.optional(TrimmedNonEmptyString), + otlpMetricsEnabled: Schema.Boolean, +}); +export type ServerObservability = typeof ServerObservability.Type; + export const ServerConfig = Schema.Struct({ cwd: TrimmedNonEmptyString, keybindingsConfigPath: TrimmedNonEmptyString, @@ -79,6 +89,7 @@ export const ServerConfig = Schema.Struct({ issues: ServerConfigIssues, providers: ServerProviders, availableEditors: Schema.Array(EditorId), + observability: ServerObservability, settings: ServerSettings, }); export type ServerConfig = typeof ServerConfig.Type; From 
3e9d5fcfc3218eae18a8f2166bfd90c4e7a01aad Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 17:30:26 -0700 Subject: [PATCH 4/8] Inline RPC tracing helpers in websocket handlers - Call `observeRpc*` directly for each WS method - Remove the local `rpcEffect`/`rpcStream` aliases --- apps/server/src/ws.ts | 98 +++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 56 deletions(-) diff --git a/apps/server/src/ws.ts b/apps/server/src/ws.ts index 5210d3748b..de263ecb92 100644 --- a/apps/server/src/ws.ts +++ b/apps/server/src/ws.ts @@ -84,31 +84,9 @@ const WsRpcLayer = WsRpcGroup.toLayer( }; }); - const rpcEffect = ( - method: string, - effect: Effect.Effect, - traceAttributes?: Readonly>, - ) => observeRpcEffect(method, effect, traceAttributes); - - const rpcStream = ( - method: string, - stream: Stream.Stream, - traceAttributes?: Readonly>, - ) => observeRpcStream(method, stream, traceAttributes); - - const rpcStreamEffect = ( - method: string, - effect: Effect.Effect< - Stream.Stream, - EffectError, - EffectContext - >, - traceAttributes?: Readonly>, - ) => observeRpcStreamEffect(method, effect, traceAttributes); - return WsRpcGroup.of({ [ORCHESTRATION_WS_METHODS.getSnapshot]: (_input) => - rpcEffect( + observeRpcEffect( ORCHESTRATION_WS_METHODS.getSnapshot, projectionSnapshotQuery.getSnapshot().pipe( Effect.mapError( @@ -122,7 +100,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.dispatchCommand]: (command) => - rpcEffect( + observeRpcEffect( ORCHESTRATION_WS_METHODS.dispatchCommand, Effect.gen(function* () { const normalizedCommand = yield* normalizeDispatchCommand(command); @@ -153,7 +131,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.getTurnDiff]: (input) => - rpcEffect( + observeRpcEffect( ORCHESTRATION_WS_METHODS.getTurnDiff, checkpointDiffQuery.getTurnDiff(input).pipe( Effect.mapError( @@ -167,7 
+145,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.getFullThreadDiff]: (input) => - rpcEffect( + observeRpcEffect( ORCHESTRATION_WS_METHODS.getFullThreadDiff, checkpointDiffQuery.getFullThreadDiff(input).pipe( Effect.mapError( @@ -181,7 +159,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [ORCHESTRATION_WS_METHODS.replayEvents]: (input) => - rpcEffect( + observeRpcEffect( ORCHESTRATION_WS_METHODS.replayEvents, Stream.runCollect( orchestrationEngine.readEvents( @@ -200,7 +178,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [WS_METHODS.subscribeOrchestrationDomainEvents]: (_input) => - rpcStreamEffect( + observeRpcStreamEffect( WS_METHODS.subscribeOrchestrationDomainEvents, Effect.gen(function* () { const snapshot = yield* orchestrationEngine.getReadModel(); @@ -259,15 +237,17 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "orchestration" }, ), [WS_METHODS.serverGetConfig]: (_input) => - rpcEffect(WS_METHODS.serverGetConfig, loadServerConfig, { "rpc.aggregate": "server" }), + observeRpcEffect(WS_METHODS.serverGetConfig, loadServerConfig, { + "rpc.aggregate": "server", + }), [WS_METHODS.serverRefreshProviders]: (_input) => - rpcEffect( + observeRpcEffect( WS_METHODS.serverRefreshProviders, providerRegistry.refresh().pipe(Effect.map((providers) => ({ providers }))), { "rpc.aggregate": "server" }, ), [WS_METHODS.serverUpsertKeybinding]: (rule) => - rpcEffect( + observeRpcEffect( WS_METHODS.serverUpsertKeybinding, Effect.gen(function* () { const keybindingsConfig = yield* keybindings.upsertKeybindingRule(rule); @@ -276,15 +256,15 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "server" }, ), [WS_METHODS.serverGetSettings]: (_input) => - rpcEffect(WS_METHODS.serverGetSettings, serverSettings.getSettings, { + observeRpcEffect(WS_METHODS.serverGetSettings, serverSettings.getSettings, { "rpc.aggregate": 
"server", }), [WS_METHODS.serverUpdateSettings]: ({ patch }) => - rpcEffect(WS_METHODS.serverUpdateSettings, serverSettings.updateSettings(patch), { + observeRpcEffect(WS_METHODS.serverUpdateSettings, serverSettings.updateSettings(patch), { "rpc.aggregate": "server", }), [WS_METHODS.projectsSearchEntries]: (input) => - rpcEffect( + observeRpcEffect( WS_METHODS.projectsSearchEntries, workspaceEntries.search(input).pipe( Effect.mapError( @@ -298,7 +278,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "workspace" }, ), [WS_METHODS.projectsWriteFile]: (input) => - rpcEffect( + observeRpcEffect( WS_METHODS.projectsWriteFile, workspaceFileSystem.writeFile(input).pipe( Effect.mapError((cause) => { @@ -314,15 +294,19 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "workspace" }, ), [WS_METHODS.shellOpenInEditor]: (input) => - rpcEffect(WS_METHODS.shellOpenInEditor, open.openInEditor(input), { + observeRpcEffect(WS_METHODS.shellOpenInEditor, open.openInEditor(input), { "rpc.aggregate": "workspace", }), [WS_METHODS.gitStatus]: (input) => - rpcEffect(WS_METHODS.gitStatus, gitManager.status(input), { "rpc.aggregate": "git" }), + observeRpcEffect(WS_METHODS.gitStatus, gitManager.status(input), { + "rpc.aggregate": "git", + }), [WS_METHODS.gitPull]: (input) => - rpcEffect(WS_METHODS.gitPull, git.pullCurrentBranch(input.cwd), { "rpc.aggregate": "git" }), + observeRpcEffect(WS_METHODS.gitPull, git.pullCurrentBranch(input.cwd), { + "rpc.aggregate": "git", + }), [WS_METHODS.gitRunStackedAction]: (input) => - rpcStream( + observeRpcStream( WS_METHODS.gitRunStackedAction, Stream.callback((queue) => gitManager @@ -342,61 +326,63 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "git" }, ), [WS_METHODS.gitResolvePullRequest]: (input) => - rpcEffect(WS_METHODS.gitResolvePullRequest, gitManager.resolvePullRequest(input), { + observeRpcEffect(WS_METHODS.gitResolvePullRequest, gitManager.resolvePullRequest(input), { "rpc.aggregate": "git", }), 
[WS_METHODS.gitPreparePullRequestThread]: (input) => - rpcEffect( + observeRpcEffect( WS_METHODS.gitPreparePullRequestThread, gitManager.preparePullRequestThread(input), { "rpc.aggregate": "git" }, ), [WS_METHODS.gitListBranches]: (input) => - rpcEffect(WS_METHODS.gitListBranches, git.listBranches(input), { "rpc.aggregate": "git" }), + observeRpcEffect(WS_METHODS.gitListBranches, git.listBranches(input), { + "rpc.aggregate": "git", + }), [WS_METHODS.gitCreateWorktree]: (input) => - rpcEffect(WS_METHODS.gitCreateWorktree, git.createWorktree(input), { + observeRpcEffect(WS_METHODS.gitCreateWorktree, git.createWorktree(input), { "rpc.aggregate": "git", }), [WS_METHODS.gitRemoveWorktree]: (input) => - rpcEffect(WS_METHODS.gitRemoveWorktree, git.removeWorktree(input), { + observeRpcEffect(WS_METHODS.gitRemoveWorktree, git.removeWorktree(input), { "rpc.aggregate": "git", }), [WS_METHODS.gitCreateBranch]: (input) => - rpcEffect(WS_METHODS.gitCreateBranch, git.createBranch(input), { + observeRpcEffect(WS_METHODS.gitCreateBranch, git.createBranch(input), { "rpc.aggregate": "git", }), [WS_METHODS.gitCheckout]: (input) => - rpcEffect(WS_METHODS.gitCheckout, Effect.scoped(git.checkoutBranch(input)), { + observeRpcEffect(WS_METHODS.gitCheckout, Effect.scoped(git.checkoutBranch(input)), { "rpc.aggregate": "git", }), [WS_METHODS.gitInit]: (input) => - rpcEffect(WS_METHODS.gitInit, git.initRepo(input), { "rpc.aggregate": "git" }), + observeRpcEffect(WS_METHODS.gitInit, git.initRepo(input), { "rpc.aggregate": "git" }), [WS_METHODS.terminalOpen]: (input) => - rpcEffect(WS_METHODS.terminalOpen, terminalManager.open(input), { + observeRpcEffect(WS_METHODS.terminalOpen, terminalManager.open(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.terminalWrite]: (input) => - rpcEffect(WS_METHODS.terminalWrite, terminalManager.write(input), { + observeRpcEffect(WS_METHODS.terminalWrite, terminalManager.write(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.terminalResize]: (input) 
=> - rpcEffect(WS_METHODS.terminalResize, terminalManager.resize(input), { + observeRpcEffect(WS_METHODS.terminalResize, terminalManager.resize(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.terminalClear]: (input) => - rpcEffect(WS_METHODS.terminalClear, terminalManager.clear(input), { + observeRpcEffect(WS_METHODS.terminalClear, terminalManager.clear(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.terminalRestart]: (input) => - rpcEffect(WS_METHODS.terminalRestart, terminalManager.restart(input), { + observeRpcEffect(WS_METHODS.terminalRestart, terminalManager.restart(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.terminalClose]: (input) => - rpcEffect(WS_METHODS.terminalClose, terminalManager.close(input), { + observeRpcEffect(WS_METHODS.terminalClose, terminalManager.close(input), { "rpc.aggregate": "terminal", }), [WS_METHODS.subscribeTerminalEvents]: (_input) => - rpcStream( + observeRpcStream( WS_METHODS.subscribeTerminalEvents, Stream.callback((queue) => Effect.acquireRelease( @@ -407,7 +393,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "terminal" }, ), [WS_METHODS.subscribeServerConfig]: (_input) => - rpcStreamEffect( + observeRpcStreamEffect( WS_METHODS.subscribeServerConfig, Effect.gen(function* () { const keybindingsUpdates = keybindings.streamChanges.pipe( @@ -446,7 +432,7 @@ const WsRpcLayer = WsRpcGroup.toLayer( { "rpc.aggregate": "server" }, ), [WS_METHODS.subscribeServerLifecycle]: (_input) => - rpcStreamEffect( + observeRpcStreamEffect( WS_METHODS.subscribeServerLifecycle, Effect.gen(function* () { const snapshot = yield* lifecycleEvents.snapshot; From ad971dbdd30a9a3c3f8343c23a64aadcd03f2e31 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 18:15:27 -0700 Subject: [PATCH 5/8] Track orchestration command outcomes in metrics - Record success, failure, and interrupt outcomes from command dispatch - Normalize invalid dates and group GPT models under a shared label - Add coverage for failed 
dispatch metric accounting --- .../src/observability/Attributes.test.ts | 19 +- apps/server/src/observability/Attributes.ts | 6 +- apps/server/src/observability/Metrics.ts | 10 +- .../Layers/OrchestrationEngine.test.ts | 38 +++ .../Layers/OrchestrationEngine.ts | 260 ++++++++++-------- 5 files changed, 206 insertions(+), 127 deletions(-) diff --git a/apps/server/src/observability/Attributes.test.ts b/apps/server/src/observability/Attributes.test.ts index de08bac225..4b495598ea 100644 --- a/apps/server/src/observability/Attributes.test.ts +++ b/apps/server/src/observability/Attributes.test.ts @@ -1,6 +1,6 @@ import { assert, describe, it } from "@effect/vitest"; -import { compactTraceAttributes } from "./Attributes.ts"; +import { compactTraceAttributes, normalizeModelMetricLabel } from "./Attributes.ts"; describe("Attributes", () => { it("normalizes circular arrays, maps, and sets without recursing forever", () => { @@ -26,4 +26,21 @@ describe("Attributes", () => { }, ); }); + + it("normalizes invalid dates without throwing", () => { + assert.deepStrictEqual( + compactTraceAttributes({ + invalidDate: new Date("not-a-real-date"), + }), + { + invalidDate: "Invalid Date", + }, + ); + }); + + it("groups GPT-family models under a shared metric label", () => { + assert.strictEqual(normalizeModelMetricLabel("gpt-4o"), "gpt"); + assert.strictEqual(normalizeModelMetricLabel("gpt-5.4"), "gpt"); + assert.strictEqual(normalizeModelMetricLabel("claude-sonnet-4"), "claude"); + }); }); diff --git a/apps/server/src/observability/Attributes.ts b/apps/server/src/observability/Attributes.ts index ab003f6ebd..2251fcfea6 100644 --- a/apps/server/src/observability/Attributes.ts +++ b/apps/server/src/observability/Attributes.ts @@ -31,7 +31,7 @@ function normalizeJsonValue(value: unknown, seen: WeakSet = new WeakSet( return value.toString(); } if (value instanceof Date) { - return value.toISOString(); + return Number.isNaN(value.getTime()) ? 
"Invalid Date" : value.toISOString(); } if (value instanceof Error) { return { @@ -115,8 +115,8 @@ export function normalizeModelMetricLabel(model: string | null | undefined): str if (!normalized) { return undefined; } - if (normalized.includes("gpt-5")) { - return "gpt-5"; + if (normalized.includes("gpt")) { + return "gpt"; } if (normalized.includes("claude")) { return "claude"; diff --git a/apps/server/src/observability/Metrics.ts b/apps/server/src/observability/Metrics.ts index a9530ad2c8..115859fe15 100644 --- a/apps/server/src/observability/Metrics.ts +++ b/apps/server/src/observability/Metrics.ts @@ -155,11 +155,11 @@ export const providerTurnMetricAttributes = (input: { readonly provider: string; readonly model: string | null | undefined; readonly extra?: Readonly>; -}) => - compactMetricAttributes({ +}) => { + const modelFamily = normalizeModelMetricLabel(input.model); + return compactMetricAttributes({ provider: input.provider, - ...(normalizeModelMetricLabel(input.model) - ? { modelFamily: normalizeModelMetricLabel(input.model) } - : {}), + ...(modelFamily ? 
{ modelFamily } : {}), ...input.extra, }); +}; diff --git a/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts b/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts index dd945cc9e7..5a0a6113f0 100644 --- a/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts +++ b/apps/server/src/orchestration/Layers/OrchestrationEngine.test.ts @@ -470,6 +470,44 @@ describe("OrchestrationEngine", () => { await system.dispose(); }); + it("records failed command dispatches as metric failures", async () => { + const system = await createOrchestrationSystem(); + const { engine } = system; + const createdAt = now(); + + await expect( + system.run( + engine.dispatch({ + type: "thread.create", + commandId: CommandId.makeUnsafe("cmd-thread-missing-project"), + threadId: ThreadId.makeUnsafe("thread-missing-project"), + projectId: asProjectId("project-missing"), + title: "Missing Project Thread", + modelSelection: { + provider: "codex", + model: "gpt-5-codex", + }, + interactionMode: DEFAULT_PROVIDER_INTERACTION_MODE, + runtimeMode: "full-access", + branch: null, + worktreePath: null, + createdAt, + }), + ), + ).rejects.toThrow("does not exist"); + + const snapshots = await system.run(Metric.snapshot); + expect( + hasMetricSnapshot(snapshots, "t3_orchestration_commands_total", { + commandType: "thread.create", + aggregateKind: "thread", + outcome: "failure", + }), + ).toBe(true); + + await system.dispose(); + }); + it("stores completed checkpoint summaries even when no files changed", async () => { const system = await createOrchestrationSystem(); const { engine } = system; diff --git a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts index 15e3b45ad0..ddd1718faf 100644 --- a/apps/server/src/orchestration/Layers/OrchestrationEngine.ts +++ b/apps/server/src/orchestration/Layers/OrchestrationEngine.ts @@ -6,9 +6,11 @@ import type { } from "@t3tools/contracts"; import { 
OrchestrationCommand } from "@t3tools/contracts"; import { + Cause, Deferred, Duration, Effect, + Exit, Layer, Metric, Option, @@ -24,7 +26,6 @@ import { orchestrationCommandAckDuration, orchestrationCommandsTotal, orchestrationCommandDuration, - withMetrics, } from "../../observability/Metrics.ts"; import { toPersistenceSqlError } from "../../persistence/Errors.ts"; import { OrchestrationEventStore } from "../../persistence/Services/OrchestrationEventStore.ts"; @@ -83,7 +84,12 @@ const makeOrchestrationEngine = Effect.gen(function* () { const processEnvelope = (envelope: CommandEnvelope): Effect.Effect => { const dispatchStartSequence = readModel.snapshotSequence; + const processingStartedAtMs = Date.now(); const aggregateRef = commandToAggregateRef(envelope.command); + const baseMetricAttributes = { + commandType: envelope.command.type, + aggregateKind: aggregateRef.aggregateKind, + } as const; const reconcileReadModelAfterDispatchFailure = Effect.gen(function* () { const persistedEvents = yield* Stream.runCollect( eventStore.readFromSequence(dispatchStartSequence), @@ -103,145 +109,163 @@ const makeOrchestrationEngine = Effect.gen(function* () { } }); - return Effect.gen(function* () { - yield* Effect.annotateCurrentSpan({ - "orchestration.command_id": envelope.command.commandId, - "orchestration.command_type": envelope.command.type, - "orchestration.aggregate_kind": aggregateRef.aggregateKind, - "orchestration.aggregate_id": aggregateRef.aggregateId, - }); + return Effect.exit( + Effect.gen(function* () { + yield* Effect.annotateCurrentSpan({ + "orchestration.command_id": envelope.command.commandId, + "orchestration.command_type": envelope.command.type, + "orchestration.aggregate_kind": aggregateRef.aggregateKind, + "orchestration.aggregate_id": aggregateRef.aggregateId, + }); - const existingReceipt = yield* commandReceiptRepository.getByCommandId({ - commandId: envelope.command.commandId, - }); - if (Option.isSome(existingReceipt)) { - if 
(existingReceipt.value.status === "accepted") { - yield* Deferred.succeed(envelope.result, { - sequence: existingReceipt.value.resultSequence, - }); - return; - } - yield* Deferred.fail( - envelope.result, - new OrchestrationCommandPreviouslyRejectedError({ + const existingReceipt = yield* commandReceiptRepository.getByCommandId({ + commandId: envelope.command.commandId, + }); + if (Option.isSome(existingReceipt)) { + if (existingReceipt.value.status === "accepted") { + return { + sequence: existingReceipt.value.resultSequence, + }; + } + return yield* new OrchestrationCommandPreviouslyRejectedError({ commandId: envelope.command.commandId, detail: existingReceipt.value.error ?? "Previously rejected.", - }), - ); - return; - } + }); + } - const eventBase = yield* decideOrchestrationCommand({ - command: envelope.command, - readModel, - }); - const eventBases = Array.isArray(eventBase) ? eventBase : [eventBase]; - const committedCommand = yield* sql - .withTransaction( - Effect.gen(function* () { - const committedEvents: OrchestrationEvent[] = []; - let nextReadModel = readModel; + const eventBase = yield* decideOrchestrationCommand({ + command: envelope.command, + readModel, + }); + const eventBases = Array.isArray(eventBase) ? 
eventBase : [eventBase]; + const committedCommand = yield* sql + .withTransaction( + Effect.gen(function* () { + const committedEvents: OrchestrationEvent[] = []; + let nextReadModel = readModel; - for (const nextEvent of eventBases) { - const savedEvent = yield* eventStore.append(nextEvent); - nextReadModel = yield* projectEvent(nextReadModel, savedEvent); - yield* projectionPipeline.projectEvent(savedEvent); - committedEvents.push(savedEvent); - } + for (const nextEvent of eventBases) { + const savedEvent = yield* eventStore.append(nextEvent); + nextReadModel = yield* projectEvent(nextReadModel, savedEvent); + yield* projectionPipeline.projectEvent(savedEvent); + committedEvents.push(savedEvent); + } - const lastSavedEvent = committedEvents.at(-1) ?? null; - if (lastSavedEvent === null) { - return yield* new OrchestrationCommandInvariantError({ - commandType: envelope.command.type, - detail: "Command produced no events.", - }); - } + const lastSavedEvent = committedEvents.at(-1) ?? null; + if (lastSavedEvent === null) { + return yield* new OrchestrationCommandInvariantError({ + commandType: envelope.command.type, + detail: "Command produced no events.", + }); + } - yield* commandReceiptRepository.upsert({ - commandId: envelope.command.commandId, - aggregateKind: lastSavedEvent.aggregateKind, - aggregateId: lastSavedEvent.aggregateId, - acceptedAt: lastSavedEvent.occurredAt, - resultSequence: lastSavedEvent.sequence, - status: "accepted", - error: null, - }); + yield* commandReceiptRepository.upsert({ + commandId: envelope.command.commandId, + aggregateKind: lastSavedEvent.aggregateKind, + aggregateId: lastSavedEvent.aggregateId, + acceptedAt: lastSavedEvent.occurredAt, + resultSequence: lastSavedEvent.sequence, + status: "accepted", + error: null, + }); - return { - committedEvents, - lastSequence: lastSavedEvent.sequence, - nextReadModel, - } as const; - }), - ) - .pipe( - Effect.catchTag("SqlError", (sqlError) => - Effect.fail( - 
toPersistenceSqlError("OrchestrationEngine.processEnvelope:transaction")(sqlError), + return { + committedEvents, + lastSequence: lastSavedEvent.sequence, + nextReadModel, + } as const; + }), + ) + .pipe( + Effect.catchTag("SqlError", (sqlError) => + Effect.fail( + toPersistenceSqlError("OrchestrationEngine.processEnvelope:transaction")(sqlError), + ), ), - ), - ); + ); - readModel = committedCommand.nextReadModel; - for (const [index, event] of committedCommand.committedEvents.entries()) { - yield* PubSub.publish(eventPubSub, event); - if (index === 0) { + readModel = committedCommand.nextReadModel; + for (const [index, event] of committedCommand.committedEvents.entries()) { + yield* PubSub.publish(eventPubSub, event); + if (index === 0) { + yield* Metric.update( + Metric.withAttributes( + orchestrationCommandAckDuration, + metricAttributes({ + ...baseMetricAttributes, + ackEventType: event.type, + }), + ), + Duration.millis(Math.max(0, Date.now() - envelope.startedAtMs)), + ); + } + } + return { sequence: committedCommand.lastSequence }; + }).pipe(Effect.withSpan(`orchestration.command.${envelope.command.type}`)), + ).pipe( + Effect.flatMap((exit) => + Effect.gen(function* () { + const outcome = Exit.isSuccess(exit) + ? "success" + : Cause.hasInterruptsOnly(exit.cause) + ? 
"interrupt" + : "failure"; + yield* Metric.update( + Metric.withAttributes( + orchestrationCommandDuration, + metricAttributes(baseMetricAttributes), + ), + Duration.millis(Math.max(0, Date.now() - processingStartedAtMs)), + ); yield* Metric.update( Metric.withAttributes( - orchestrationCommandAckDuration, + orchestrationCommandsTotal, metricAttributes({ - commandType: envelope.command.type, - aggregateKind: aggregateRef.aggregateKind, - ackEventType: event.type, + ...baseMetricAttributes, + outcome, }), ), - Duration.millis(Math.max(0, Date.now() - envelope.startedAtMs)), + 1, ); - } - } - yield* Deferred.succeed(envelope.result, { sequence: committedCommand.lastSequence }); - }).pipe( - Effect.catch((error) => - Effect.gen(function* () { - yield* reconcileReadModelAfterDispatchFailure.pipe( - Effect.catch(() => - Effect.logWarning( - "failed to reconcile orchestration read model after dispatch failure", - ).pipe( - Effect.annotateLogs({ - commandId: envelope.command.commandId, - snapshotSequence: readModel.snapshotSequence, - }), + + if (Exit.isSuccess(exit)) { + yield* Deferred.succeed(envelope.result, exit.value); + return; + } + + const error = Cause.squash(exit.cause) as OrchestrationDispatchError; + if (!Schema.is(OrchestrationCommandPreviouslyRejectedError)(error)) { + yield* reconcileReadModelAfterDispatchFailure.pipe( + Effect.catch(() => + Effect.logWarning( + "failed to reconcile orchestration read model after dispatch failure", + ).pipe( + Effect.annotateLogs({ + commandId: envelope.command.commandId, + snapshotSequence: readModel.snapshotSequence, + }), + ), ), - ), - ); + ); - if (Schema.is(OrchestrationCommandInvariantError)(error)) { - yield* commandReceiptRepository - .upsert({ - commandId: envelope.command.commandId, - aggregateKind: aggregateRef.aggregateKind, - aggregateId: aggregateRef.aggregateId, - acceptedAt: new Date().toISOString(), - resultSequence: readModel.snapshotSequence, - status: "rejected", - error: error.message, - }) - 
.pipe(Effect.catch(() => Effect.void)); + if (Schema.is(OrchestrationCommandInvariantError)(error)) { + yield* commandReceiptRepository + .upsert({ + commandId: envelope.command.commandId, + aggregateKind: aggregateRef.aggregateKind, + aggregateId: aggregateRef.aggregateId, + acceptedAt: new Date().toISOString(), + resultSequence: readModel.snapshotSequence, + status: "rejected", + error: error.message, + }) + .pipe(Effect.catch(() => Effect.void)); + } } + yield* Deferred.fail(envelope.result, error); }), ), - Effect.withSpan(`orchestration.command.${envelope.command.type}`), - Effect.asVoid, - withMetrics({ - counter: orchestrationCommandsTotal, - timer: orchestrationCommandDuration, - attributes: { - commandType: envelope.command.type, - aggregateKind: aggregateRef.aggregateKind, - }, - }), ); }; From 5fada5818582402525ae1e4a64516dbc9d967186 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 18:21:00 -0700 Subject: [PATCH 6/8] Improve effect tracing across orchestration and telemetry - name previously untraced effects - annotate editor launch spans with context --- apps/server/src/open.ts | 7 +- .../orchestration/Layers/CheckpointReactor.ts | 228 +++++++++--------- .../Layers/ProviderCommandReactor.ts | 182 +++++++------- .../Layers/ProviderRuntimeIngestion.ts | 85 +++---- .../src/telemetry/Layers/AnalyticsService.ts | 24 +- 5 files changed, 273 insertions(+), 253 deletions(-) diff --git a/apps/server/src/open.ts b/apps/server/src/open.ts index 212db36c01..fd90e7b6bb 100644 --- a/apps/server/src/open.ts +++ b/apps/server/src/open.ts @@ -198,10 +198,15 @@ export class Open extends ServiceMap.Service()("t3/open") {} // Implementations // ============================== -export const resolveEditorLaunch = Effect.fnUntraced(function* ( +export const resolveEditorLaunch = Effect.fn("resolveEditorLaunch")(function* ( input: OpenInEditorInput, platform: NodeJS.Platform = process.platform, ): Effect.fn.Return { + yield* Effect.annotateCurrentSpan({ + 
"open.editor": input.editor, + "open.cwd": input.cwd, + "open.platform": platform, + }); const editorDef = EDITORS.find((editor) => editor.id === input.editor); if (!editorDef) { return yield* new OpenError({ message: `Unknown editor: ${input.editor}` }); diff --git a/apps/server/src/orchestration/Layers/CheckpointReactor.ts b/apps/server/src/orchestration/Layers/CheckpointReactor.ts index 28609cecb3..03abebaf3a 100644 --- a/apps/server/src/orchestration/Layers/CheckpointReactor.ts +++ b/apps/server/src/orchestration/Layers/CheckpointReactor.ts @@ -119,7 +119,7 @@ const make = Effect.gen(function* () { createdAt: input.createdAt, }); - const resolveSessionRuntimeForThread = Effect.fnUntraced(function* ( + const resolveSessionRuntimeForThread = Effect.fn("resolveSessionRuntimeForThread")(function* ( threadId: ThreadId, ): Effect.fn.Return> { const readModel = yield* orchestrationEngine.getReadModel(); @@ -153,7 +153,7 @@ const make = Effect.gen(function* () { // active provider session CWD and falling back to the thread/project config. // Returns undefined when no CWD can be determined or the workspace is not // a git repository. - const resolveCheckpointCwd = Effect.fnUntraced(function* (input: { + const resolveCheckpointCwd = Effect.fn("resolveCheckpointCwd")(function* (input: { readonly threadId: ThreadId; readonly thread: { readonly projectId: ProjectId; readonly worktreePath: string | null }; readonly projects: ReadonlyArray<{ readonly id: ProjectId; readonly workspaceRoot: string }>; @@ -188,7 +188,7 @@ const make = Effect.gen(function* () { // Shared tail for both capture paths: creates the git checkpoint ref, diffs // it against the previous turn, then dispatches the domain events to update // the orchestration read model. 
- const captureAndDispatchCheckpoint = Effect.fnUntraced(function* (input: { + const captureAndDispatchCheckpoint = Effect.fn("captureAndDispatchCheckpoint")(function* (input: { readonly threadId: ThreadId; readonly turnId: TurnId; readonly thread: { @@ -321,70 +321,70 @@ const make = Effect.gen(function* () { }); // Captures a real git checkpoint when a turn completes via a runtime event. - const captureCheckpointFromTurnCompletion = Effect.fnUntraced(function* ( - event: Extract, - ) { - const turnId = toTurnId(event.turnId); - if (!turnId) { - return; - } - - const readModel = yield* orchestrationEngine.getReadModel(); - const thread = readModel.threads.find((entry) => entry.id === event.threadId); - if (!thread) { - return; - } + const captureCheckpointFromTurnCompletion = Effect.fn("captureCheckpointFromTurnCompletion")( + function* (event: Extract) { + const turnId = toTurnId(event.turnId); + if (!turnId) { + return; + } - // When a primary turn is active, only that turn may produce completion checkpoints. - if (thread.session?.activeTurnId && !sameId(thread.session.activeTurnId, turnId)) { - return; - } + const readModel = yield* orchestrationEngine.getReadModel(); + const thread = readModel.threads.find((entry) => entry.id === event.threadId); + if (!thread) { + return; + } - // Only skip if a real (non-placeholder) checkpoint already exists for this turn. - // ProviderRuntimeIngestion may insert placeholder entries with status "missing" - // before this reactor runs; those must not prevent real git capture. - if ( - thread.checkpoints.some( - (checkpoint) => checkpoint.turnId === turnId && checkpoint.status !== "missing", - ) - ) { - return; - } + // When a primary turn is active, only that turn may produce completion checkpoints. 
+ if (thread.session?.activeTurnId && !sameId(thread.session.activeTurnId, turnId)) { + return; + } - const checkpointCwd = yield* resolveCheckpointCwd({ - threadId: thread.id, - thread, - projects: readModel.projects, - preferSessionRuntime: true, - }); - if (!checkpointCwd) { - return; - } + // Only skip if a real (non-placeholder) checkpoint already exists for this turn. + // ProviderRuntimeIngestion may insert placeholder entries with status "missing" + // before this reactor runs; those must not prevent real git capture. + if ( + thread.checkpoints.some( + (checkpoint) => checkpoint.turnId === turnId && checkpoint.status !== "missing", + ) + ) { + return; + } - // If a placeholder checkpoint exists for this turn, reuse its turn count - // instead of incrementing past it. - const existingPlaceholder = thread.checkpoints.find( - (checkpoint) => checkpoint.turnId === turnId && checkpoint.status === "missing", - ); - const currentTurnCount = thread.checkpoints.reduce( - (maxTurnCount, checkpoint) => Math.max(maxTurnCount, checkpoint.checkpointTurnCount), - 0, - ); - const nextTurnCount = existingPlaceholder - ? existingPlaceholder.checkpointTurnCount - : currentTurnCount + 1; + const checkpointCwd = yield* resolveCheckpointCwd({ + threadId: thread.id, + thread, + projects: readModel.projects, + preferSessionRuntime: true, + }); + if (!checkpointCwd) { + return; + } - yield* captureAndDispatchCheckpoint({ - threadId: thread.id, - turnId, - thread, - cwd: checkpointCwd, - turnCount: nextTurnCount, - status: checkpointStatusFromRuntime(event.payload.state), - assistantMessageId: undefined, - createdAt: event.createdAt, - }); - }); + // If a placeholder checkpoint exists for this turn, reuse its turn count + // instead of incrementing past it. 
+ const existingPlaceholder = thread.checkpoints.find( + (checkpoint) => checkpoint.turnId === turnId && checkpoint.status === "missing", + ); + const currentTurnCount = thread.checkpoints.reduce( + (maxTurnCount, checkpoint) => Math.max(maxTurnCount, checkpoint.checkpointTurnCount), + 0, + ); + const nextTurnCount = existingPlaceholder + ? existingPlaceholder.checkpointTurnCount + : currentTurnCount + 1; + + yield* captureAndDispatchCheckpoint({ + threadId: thread.id, + turnId, + thread, + cwd: checkpointCwd, + turnCount: nextTurnCount, + status: checkpointStatusFromRuntime(event.payload.state), + assistantMessageId: undefined, + createdAt: event.createdAt, + }); + }, + ); // Captures a real git checkpoint when a placeholder checkpoint (status "missing") // is detected via a domain event. This replaces the placeholder with a real @@ -394,7 +394,7 @@ const make = Effect.gen(function* () { // events from the Codex runtime. This handler fires when the corresponding // domain event arrives, allowing the reactor to capture the actual filesystem // state into a git ref and dispatch a replacement checkpoint. 
- const captureCheckpointFromPlaceholder = Effect.fnUntraced(function* ( + const captureCheckpointFromPlaceholder = Effect.fn("captureCheckpointFromPlaceholder")(function* ( event: Extract, ) { const { threadId, turnId, checkpointTurnCount, status } = event.payload; @@ -448,57 +448,59 @@ const make = Effect.gen(function* () { }); }); - const ensurePreTurnBaselineFromTurnStart = Effect.fnUntraced(function* ( - event: Extract, - ) { - const turnId = toTurnId(event.turnId); - if (!turnId) { - return; - } + const ensurePreTurnBaselineFromTurnStart = Effect.fn("ensurePreTurnBaselineFromTurnStart")( + function* (event: Extract) { + const turnId = toTurnId(event.turnId); + if (!turnId) { + return; + } - const readModel = yield* orchestrationEngine.getReadModel(); - const thread = readModel.threads.find((entry) => entry.id === event.threadId); - if (!thread) { - return; - } + const readModel = yield* orchestrationEngine.getReadModel(); + const thread = readModel.threads.find((entry) => entry.id === event.threadId); + if (!thread) { + return; + } - const checkpointCwd = yield* resolveCheckpointCwd({ - threadId: thread.id, - thread, - projects: readModel.projects, - preferSessionRuntime: false, - }); - if (!checkpointCwd) { - return; - } + const checkpointCwd = yield* resolveCheckpointCwd({ + threadId: thread.id, + thread, + projects: readModel.projects, + preferSessionRuntime: false, + }); + if (!checkpointCwd) { + return; + } - const currentTurnCount = thread.checkpoints.reduce( - (maxTurnCount, checkpoint) => Math.max(maxTurnCount, checkpoint.checkpointTurnCount), - 0, - ); - const baselineCheckpointRef = checkpointRefForThreadTurn(thread.id, currentTurnCount); - const baselineExists = yield* checkpointStore.hasCheckpointRef({ - cwd: checkpointCwd, - checkpointRef: baselineCheckpointRef, - }); - if (baselineExists) { - return; - } + const currentTurnCount = thread.checkpoints.reduce( + (maxTurnCount, checkpoint) => Math.max(maxTurnCount, checkpoint.checkpointTurnCount), + 
0, + ); + const baselineCheckpointRef = checkpointRefForThreadTurn(thread.id, currentTurnCount); + const baselineExists = yield* checkpointStore.hasCheckpointRef({ + cwd: checkpointCwd, + checkpointRef: baselineCheckpointRef, + }); + if (baselineExists) { + return; + } - yield* checkpointStore.captureCheckpoint({ - cwd: checkpointCwd, - checkpointRef: baselineCheckpointRef, - }); - yield* receiptBus.publish({ - type: "checkpoint.baseline.captured", - threadId: thread.id, - checkpointTurnCount: currentTurnCount, - checkpointRef: baselineCheckpointRef, - createdAt: event.createdAt, - }); - }); + yield* checkpointStore.captureCheckpoint({ + cwd: checkpointCwd, + checkpointRef: baselineCheckpointRef, + }); + yield* receiptBus.publish({ + type: "checkpoint.baseline.captured", + threadId: thread.id, + checkpointTurnCount: currentTurnCount, + checkpointRef: baselineCheckpointRef, + createdAt: event.createdAt, + }); + }, + ); - const ensurePreTurnBaselineFromDomainTurnStart = Effect.fnUntraced(function* ( + const ensurePreTurnBaselineFromDomainTurnStart = Effect.fn( + "ensurePreTurnBaselineFromDomainTurnStart", + )(function* ( event: Extract< OrchestrationEvent, { type: "thread.turn-start-requested" | "thread.message-sent" } @@ -557,7 +559,7 @@ const make = Effect.gen(function* () { }); }); - const handleRevertRequested = Effect.fnUntraced(function* ( + const handleRevertRequested = Effect.fn("handleRevertRequested")(function* ( event: Extract, ) { const now = new Date().toISOString(); @@ -685,7 +687,7 @@ const make = Effect.gen(function* () { ); }); - const processDomainEvent = Effect.fnUntraced(function* (event: OrchestrationEvent) { + const processDomainEvent = Effect.fn("processDomainEvent")(function* (event: OrchestrationEvent) { if (event.type === "thread.turn-start-requested" || event.type === "thread.message-sent") { yield* ensurePreTurnBaselineFromDomainTurnStart(event); return; @@ -724,7 +726,9 @@ const make = Effect.gen(function* () { } }); - const 
processRuntimeEvent = Effect.fnUntraced(function* (event: ProviderRuntimeEvent) { + const processRuntimeEvent = Effect.fn("processRuntimeEvent")(function* ( + event: ProviderRuntimeEvent, + ) { if (event.type === "turn.started") { yield* ensurePreTurnBaselineFromTurnStart(event); return; diff --git a/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts b/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts index 2fcbe7ca24..419e3f3bf2 100644 --- a/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts +++ b/apps/server/src/orchestration/Layers/ProviderCommandReactor.ts @@ -213,12 +213,12 @@ const make = Effect.gen(function* () { createdAt: input.createdAt, }); - const resolveThread = Effect.fnUntraced(function* (threadId: ThreadId) { + const resolveThread = Effect.fn("resolveThread")(function* (threadId: ThreadId) { const readModel = yield* orchestrationEngine.getReadModel(); return readModel.threads.find((entry) => entry.id === threadId); }); - const ensureSessionForThread = Effect.fnUntraced(function* ( + const ensureSessionForThread = Effect.fn("ensureSessionForThread")(function* ( threadId: ThreadId, createdAt: string, options?: { @@ -358,7 +358,7 @@ const make = Effect.gen(function* () { return startedSession.threadId; }); - const sendTurnForThread = Effect.fnUntraced(function* (input: { + const sendTurnForThread = Effect.fn("sendTurnForThread")(function* (input: { readonly threadId: ThreadId; readonly messageText: string; readonly attachments?: ReadonlyArray; @@ -410,7 +410,9 @@ const make = Effect.gen(function* () { }); }); - const maybeGenerateAndRenameWorktreeBranchForFirstTurn = Effect.fnUntraced(function* (input: { + const maybeGenerateAndRenameWorktreeBranchForFirstTurn = Effect.fn( + "maybeGenerateAndRenameWorktreeBranchForFirstTurn", + )(function* (input: { readonly threadId: ThreadId; readonly branch: string | null; readonly worktreePath: string | null; @@ -462,50 +464,52 @@ const make = Effect.gen(function* () { ); }); - 
const maybeGenerateThreadTitleForFirstTurn = Effect.fnUntraced(function* (input: { - readonly threadId: ThreadId; - readonly cwd: string; - readonly messageText: string; - readonly attachments?: ReadonlyArray; - readonly titleSeed?: string; - }) { - const attachments = input.attachments ?? []; - yield* Effect.gen(function* () { - const { textGenerationModelSelection: modelSelection } = - yield* serverSettingsService.getSettings; - - const generated = yield* textGeneration.generateThreadTitle({ - cwd: input.cwd, - message: input.messageText, - ...(attachments.length > 0 ? { attachments } : {}), - modelSelection, - }); - if (!generated) return; + const maybeGenerateThreadTitleForFirstTurn = Effect.fn("maybeGenerateThreadTitleForFirstTurn")( + function* (input: { + readonly threadId: ThreadId; + readonly cwd: string; + readonly messageText: string; + readonly attachments?: ReadonlyArray; + readonly titleSeed?: string; + }) { + const attachments = input.attachments ?? []; + yield* Effect.gen(function* () { + const { textGenerationModelSelection: modelSelection } = + yield* serverSettingsService.getSettings; + + const generated = yield* textGeneration.generateThreadTitle({ + cwd: input.cwd, + message: input.messageText, + ...(attachments.length > 0 ? 
{ attachments } : {}), + modelSelection, + }); + if (!generated) return; - const thread = yield* resolveThread(input.threadId); - if (!thread) return; - if (!canReplaceThreadTitle(thread.title, input.titleSeed)) { - return; - } + const thread = yield* resolveThread(input.threadId); + if (!thread) return; + if (!canReplaceThreadTitle(thread.title, input.titleSeed)) { + return; + } - yield* orchestrationEngine.dispatch({ - type: "thread.meta.update", - commandId: serverCommandId("thread-title-rename"), - threadId: input.threadId, - title: generated.title, - }); - }).pipe( - Effect.catchCause((cause) => - Effect.logWarning("provider command reactor failed to generate or rename thread title", { + yield* orchestrationEngine.dispatch({ + type: "thread.meta.update", + commandId: serverCommandId("thread-title-rename"), threadId: input.threadId, - cwd: input.cwd, - cause: Cause.pretty(cause), - }), - ), - ); - }); + title: generated.title, + }); + }).pipe( + Effect.catchCause((cause) => + Effect.logWarning("provider command reactor failed to generate or rename thread title", { + threadId: input.threadId, + cwd: input.cwd, + cause: Cause.pretty(cause), + }), + ), + ); + }, + ); - const processTurnStartRequested = Effect.fnUntraced(function* ( + const processTurnStartRequested = Effect.fn("processTurnStartRequested")(function* ( event: Extract, ) { const key = turnStartKeyForEvent(event); @@ -584,7 +588,7 @@ const make = Effect.gen(function* () { ); }); - const processTurnInterruptRequested = Effect.fnUntraced(function* ( + const processTurnInterruptRequested = Effect.fn("processTurnInterruptRequested")(function* ( event: Extract, ) { const thread = yield* resolveThread(event.payload.threadId); @@ -607,7 +611,7 @@ const make = Effect.gen(function* () { yield* providerService.interruptTurn({ threadId: event.payload.threadId }); }); - const processApprovalResponseRequested = Effect.fnUntraced(function* ( + const processApprovalResponseRequested = 
Effect.fn("processApprovalResponseRequested")(function* ( event: Extract, ) { const thread = yield* resolveThread(event.payload.threadId); @@ -654,50 +658,52 @@ const make = Effect.gen(function* () { ); }); - const processUserInputResponseRequested = Effect.fnUntraced(function* ( - event: Extract, - ) { - const thread = yield* resolveThread(event.payload.threadId); - if (!thread) { - return; - } - const hasSession = thread.session && thread.session.status !== "stopped"; - if (!hasSession) { - return yield* appendProviderFailureActivity({ - threadId: event.payload.threadId, - kind: "provider.user-input.respond.failed", - summary: "Provider user input response failed", - detail: "No active provider session is bound to this thread.", - turnId: null, - createdAt: event.payload.createdAt, - requestId: event.payload.requestId, - }); - } + const processUserInputResponseRequested = Effect.fn("processUserInputResponseRequested")( + function* ( + event: Extract, + ) { + const thread = yield* resolveThread(event.payload.threadId); + if (!thread) { + return; + } + const hasSession = thread.session && thread.session.status !== "stopped"; + if (!hasSession) { + return yield* appendProviderFailureActivity({ + threadId: event.payload.threadId, + kind: "provider.user-input.respond.failed", + summary: "Provider user input response failed", + detail: "No active provider session is bound to this thread.", + turnId: null, + createdAt: event.payload.createdAt, + requestId: event.payload.requestId, + }); + } - yield* providerService - .respondToUserInput({ - threadId: event.payload.threadId, - requestId: event.payload.requestId, - answers: event.payload.answers, - }) - .pipe( - Effect.catchCause((cause) => - appendProviderFailureActivity({ - threadId: event.payload.threadId, - kind: "provider.user-input.respond.failed", - summary: "Provider user input response failed", - detail: isUnknownPendingUserInputRequestError(cause) - ? 
stalePendingRequestDetail("user-input", event.payload.requestId) - : Cause.pretty(cause), - turnId: null, - createdAt: event.payload.createdAt, - requestId: event.payload.requestId, - }), - ), - ); - }); + yield* providerService + .respondToUserInput({ + threadId: event.payload.threadId, + requestId: event.payload.requestId, + answers: event.payload.answers, + }) + .pipe( + Effect.catchCause((cause) => + appendProviderFailureActivity({ + threadId: event.payload.threadId, + kind: "provider.user-input.respond.failed", + summary: "Provider user input response failed", + detail: isUnknownPendingUserInputRequestError(cause) + ? stalePendingRequestDetail("user-input", event.payload.requestId) + : Cause.pretty(cause), + turnId: null, + createdAt: event.payload.createdAt, + requestId: event.payload.requestId, + }), + ), + ); + }, + ); - const processSessionStopRequested = Effect.fnUntraced(function* ( + const processSessionStopRequested = Effect.fn("processSessionStopRequested")(function* ( event: Extract, ) { const thread = yield* resolveThread(event.payload.threadId); diff --git a/apps/server/src/orchestration/Layers/ProviderRuntimeIngestion.ts b/apps/server/src/orchestration/Layers/ProviderRuntimeIngestion.ts index b479d1c28a..8d3fb5d752 100644 --- a/apps/server/src/orchestration/Layers/ProviderRuntimeIngestion.ts +++ b/apps/server/src/orchestration/Layers/ProviderRuntimeIngestion.ts @@ -524,7 +524,7 @@ const make = Effect.fn("make")(function* () { lookup: () => Effect.succeed({ text: "", createdAt: "" }), }); - const isGitRepoForThread = Effect.fnUntraced(function* (threadId: ThreadId) { + const isGitRepoForThread = Effect.fn("isGitRepoForThread")(function* (threadId: ThreadId) { const readModel = yield* orchestrationEngine.getReadModel(); const thread = readModel.threads.find((entry) => entry.id === threadId); if (!thread) { @@ -796,9 +796,9 @@ const make = Effect.fn("make")(function* () { ).pipe(Effect.asVoid); }); - const 
getSourceProposedPlanReferenceForPendingTurnStart = Effect.fnUntraced(function* ( - threadId: ThreadId, - ) { + const getSourceProposedPlanReferenceForPendingTurnStart = Effect.fn( + "getSourceProposedPlanReferenceForPendingTurnStart", + )(function* (threadId: ThreadId) { const pendingTurnStart = yield* projectionTurnRepository.getPendingTurnStartByThreadId({ threadId, }); @@ -818,16 +818,17 @@ const make = Effect.fn("make")(function* () { } as const; }); - const getExpectedProviderTurnIdForThread = Effect.fnUntraced(function* (threadId: ThreadId) { - const sessions = yield* providerService.listSessions(); - const session = sessions.find((entry) => entry.threadId === threadId); - return session?.activeTurnId; - }); - - const getSourceProposedPlanReferenceForAcceptedTurnStart = Effect.fnUntraced(function* ( - threadId: ThreadId, - eventTurnId: TurnId | undefined, - ) { + const getExpectedProviderTurnIdForThread = Effect.fn("getExpectedProviderTurnIdForThread")( + function* (threadId: ThreadId) { + const sessions = yield* providerService.listSessions(); + const session = sessions.find((entry) => entry.threadId === threadId); + return session?.activeTurnId; + }, + ); + + const getSourceProposedPlanReferenceForAcceptedTurnStart = Effect.fn( + "getSourceProposedPlanReferenceForAcceptedTurnStart", + )(function* (threadId: ThreadId, eventTurnId: TurnId | undefined) { if (eventTurnId === undefined) { return null; } @@ -840,34 +841,36 @@ const make = Effect.fn("make")(function* () { return yield* getSourceProposedPlanReferenceForPendingTurnStart(threadId); }); - const markSourceProposedPlanImplemented = Effect.fnUntraced(function* ( - sourceThreadId: ThreadId, - sourcePlanId: OrchestrationProposedPlanId, - implementationThreadId: ThreadId, - implementedAt: string, - ) { - const readModel = yield* orchestrationEngine.getReadModel(); - const sourceThread = readModel.threads.find((entry) => entry.id === sourceThreadId); - const sourcePlan = 
sourceThread?.proposedPlans.find((entry) => entry.id === sourcePlanId); - if (!sourceThread || !sourcePlan || sourcePlan.implementedAt !== null) { - return; - } + const markSourceProposedPlanImplemented = Effect.fn("markSourceProposedPlanImplemented")( + function* ( + sourceThreadId: ThreadId, + sourcePlanId: OrchestrationProposedPlanId, + implementationThreadId: ThreadId, + implementedAt: string, + ) { + const readModel = yield* orchestrationEngine.getReadModel(); + const sourceThread = readModel.threads.find((entry) => entry.id === sourceThreadId); + const sourcePlan = sourceThread?.proposedPlans.find((entry) => entry.id === sourcePlanId); + if (!sourceThread || !sourcePlan || sourcePlan.implementedAt !== null) { + return; + } - yield* orchestrationEngine.dispatch({ - type: "thread.proposed-plan.upsert", - commandId: CommandId.makeUnsafe( - `provider:source-proposed-plan-implemented:${implementationThreadId}:${crypto.randomUUID()}`, - ), - threadId: sourceThread.id, - proposedPlan: { - ...sourcePlan, - implementedAt, - implementationThreadId, - updatedAt: implementedAt, - }, - createdAt: implementedAt, - }); - }); + yield* orchestrationEngine.dispatch({ + type: "thread.proposed-plan.upsert", + commandId: CommandId.makeUnsafe( + `provider:source-proposed-plan-implemented:${implementationThreadId}:${crypto.randomUUID()}`, + ), + threadId: sourceThread.id, + proposedPlan: { + ...sourcePlan, + implementedAt, + implementationThreadId, + updatedAt: implementedAt, + }, + createdAt: implementedAt, + }); + }, + ); const processRuntimeEvent = Effect.fn("processRuntimeEvent")(function* ( event: ProviderRuntimeEvent, diff --git a/apps/server/src/telemetry/Layers/AnalyticsService.ts b/apps/server/src/telemetry/Layers/AnalyticsService.ts index 06ecfd0237..e933576dff 100644 --- a/apps/server/src/telemetry/Layers/AnalyticsService.ts +++ b/apps/server/src/telemetry/Layers/AnalyticsService.ts @@ -125,17 +125,19 @@ const makeAnalyticsService = Effect.gen(function* () { } 
}).pipe(Effect.catch((cause) => Effect.logError("Failed to flush telemetry", { cause }))); - const record: AnalyticsServiceShape["record"] = Effect.fnUntraced(function* (event, properties) { - if (!telemetryConfig.enabled || !identifier) return; - - const enqueueResult = yield* enqueueBufferedEvent(event, properties); - if (enqueueResult.dropped) { - yield* Effect.logDebug("analytics buffer full; dropping oldest event", { - size: enqueueResult.size, - event, - }); - } - }); + const record: AnalyticsServiceShape["record"] = Effect.fn("record")( + function* (event, properties) { + if (!telemetryConfig.enabled || !identifier) return; + + const enqueueResult = yield* enqueueBufferedEvent(event, properties); + if (enqueueResult.dropped) { + yield* Effect.logDebug("analytics buffer full; dropping oldest event", { + size: enqueueResult.size, + event, + }); + } + }, + ); yield* Effect.forever(Effect.sleep(1000).pipe(Effect.flatMap(() => flush)), { disableYield: true, From 8cc42335ed1acaf8a8f04fd9a5bd37fbc4bd31fb Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 19:19:19 -0700 Subject: [PATCH 7/8] Defer provider metric attributes until routing resolves - Allow withMetrics attributes to be computed lazily - Record sendTurn metrics with the resolved provider --- apps/server/src/observability/Metrics.test.ts | 38 ++++++++++++++++++ apps/server/src/observability/Metrics.ts | 7 +++- .../provider/Layers/ProviderService.test.ts | 39 +++++++++++++++++++ .../src/provider/Layers/ProviderService.ts | 19 +++++---- 4 files changed, 94 insertions(+), 9 deletions(-) diff --git a/apps/server/src/observability/Metrics.test.ts b/apps/server/src/observability/Metrics.test.ts index 148431b7b4..4604f43b63 100644 --- a/apps/server/src/observability/Metrics.test.ts +++ b/apps/server/src/observability/Metrics.test.ts @@ -70,4 +70,42 @@ describe("withMetrics", () => { ); }), ); + + it.effect("evaluates attributes lazily after the wrapped effect runs", () => + Effect.gen(function* 
() { + const counter = Metric.counter("with_metrics_lazy_total"); + const timer = Metric.timer("with_metrics_lazy_duration"); + let provider = "unknown"; + + yield* Effect.sync(() => { + provider = "codex"; + }).pipe( + withMetrics({ + counter, + timer, + attributes: () => ({ + provider, + operation: "lazy", + }), + }), + ); + + const snapshots = yield* Metric.snapshot; + assert.equal( + hasMetricSnapshot(snapshots, "with_metrics_lazy_total", { + provider: "codex", + operation: "lazy", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "with_metrics_lazy_duration", { + provider: "codex", + operation: "lazy", + }), + true, + ); + }), + ); }); diff --git a/apps/server/src/observability/Metrics.ts b/apps/server/src/observability/Metrics.ts index 115859fe15..4de4b93f2a 100644 --- a/apps/server/src/observability/Metrics.ts +++ b/apps/server/src/observability/Metrics.ts @@ -91,7 +91,9 @@ export const increment = ( export interface WithMetricsOptions { readonly counter?: Metric.Metric; readonly timer?: Metric.Metric; - readonly attributes?: Readonly>; + readonly attributes?: + | Readonly> + | (() => Readonly>); readonly outcomeAttributes?: ( outcome: ReturnType, ) => Readonly>; @@ -105,7 +107,8 @@ const withMetricsImpl = ( const startedAt = Date.now(); const exit = yield* Effect.exit(effect); const duration = Duration.millis(Math.max(0, Date.now() - startedAt)); - const baseAttributes = options.attributes ?? {}; + const baseAttributes = + typeof options.attributes === "function" ? options.attributes() : (options.attributes ?? 
{}); if (options.timer) { yield* Metric.update( diff --git a/apps/server/src/provider/Layers/ProviderService.test.ts b/apps/server/src/provider/Layers/ProviderService.test.ts index b23c113618..cd6c81405b 100644 --- a/apps/server/src/provider/Layers/ProviderService.test.ts +++ b/apps/server/src/provider/Layers/ProviderService.test.ts @@ -1146,6 +1146,45 @@ fanout.layer("ProviderServiceLive fanout", (it) => { ); }), ); + + it.effect( + "records sendTurn metrics with the resolved provider when modelSelection is omitted", + () => + Effect.gen(function* () { + const provider = yield* ProviderService; + + const session = yield* provider.startSession(asThreadId("thread-send-metrics"), { + provider: "claudeAgent", + threadId: asThreadId("thread-send-metrics"), + cwd: "/tmp/project-send-metrics", + runtimeMode: "full-access", + }); + + yield* provider.sendTurn({ + threadId: session.threadId, + input: "hello", + attachments: [], + }); + + const snapshots = yield* Metric.snapshot; + + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turns_total", { + provider: "claudeAgent", + operation: "send", + outcome: "success", + }), + true, + ); + assert.equal( + hasMetricSnapshot(snapshots, "t3_provider_turn_duration", { + provider: "claudeAgent", + operation: "send", + }), + true, + ); + }), + ); }); const validation = makeProviderServiceLayer(); diff --git a/apps/server/src/provider/Layers/ProviderService.ts b/apps/server/src/provider/Layers/ProviderService.ts index 6c18849c44..da2c293fac 100644 --- a/apps/server/src/provider/Layers/ProviderService.ts +++ b/apps/server/src/provider/Layers/ProviderService.ts @@ -407,12 +407,16 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( "provider.interaction_mode": input.interactionMode, "provider.attachment_count": input.attachments.length, }); + let metricProvider = "unknown"; + let metricModel = input.modelSelection?.model; return yield* Effect.gen(function* () { const routed = yield* 
resolveRoutableSession({ threadId: input.threadId, operation: "ProviderService.sendTurn", allowRecovery: true, }); + metricProvider = routed.adapter.provider; + metricModel = input.modelSelection?.model; yield* Effect.annotateCurrentSpan({ "provider.kind": routed.adapter.provider, ...(input.modelSelection?.model ? { "provider.model": input.modelSelection.model } : {}), @@ -442,13 +446,14 @@ const makeProviderService = Effect.fn("makeProviderService")(function* ( withMetrics({ counter: providerTurnsTotal, timer: providerTurnDuration, - attributes: providerTurnMetricAttributes({ - provider: input.modelSelection?.provider ?? "unknown", - model: input.modelSelection?.model, - extra: { - operation: "send", - }, - }), + attributes: () => + providerTurnMetricAttributes({ + provider: metricProvider, + model: metricModel, + extra: { + operation: "send", + }, + }), }), ); }); From c6b70828ba6eab27584b5a1546ab7902db159343 Mon Sep 17 00:00:00 2001 From: Julius Marminge Date: Thu, 2 Apr 2026 19:28:07 -0700 Subject: [PATCH 8/8] remove metrics from db queries --- apps/server/src/observability/Metrics.ts | 8 -------- .../src/persistence/NodeSqliteClient.ts | 19 +------------------ 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/apps/server/src/observability/Metrics.ts b/apps/server/src/observability/Metrics.ts index 4de4b93f2a..3e527c7cb4 100644 --- a/apps/server/src/observability/Metrics.ts +++ b/apps/server/src/observability/Metrics.ts @@ -70,14 +70,6 @@ export const terminalRestartsTotal = Metric.counter("t3_terminal_restarts_total" description: "Total terminal restart requests handled.", }); -export const dbQueriesTotal = Metric.counter("t3_db_queries_total", { - description: "Total sqlite queries executed by the server runtime.", -}); - -export const dbQueryDuration = Metric.timer("t3_db_query_duration", { - description: "Sqlite query execution duration.", -}); - export const metricAttributes = ( attributes: Readonly>, ): ReadonlyArray<[string, string]> => 
Object.entries(compactMetricAttributes(attributes)); diff --git a/apps/server/src/persistence/NodeSqliteClient.ts b/apps/server/src/persistence/NodeSqliteClient.ts index 803e5afe62..77c316e8ca 100644 --- a/apps/server/src/persistence/NodeSqliteClient.ts +++ b/apps/server/src/persistence/NodeSqliteClient.ts @@ -23,8 +23,6 @@ import type { Connection } from "effect/unstable/sql/SqlConnection"; import { SqlError, classifySqliteError } from "effect/unstable/sql/SqlError"; import * as Statement from "effect/unstable/sql/Statement"; -import { dbQueriesTotal, dbQueryDuration, withMetrics } from "../observability/Metrics.ts"; - const ATTR_DB_SYSTEM_NAME = "db.system.name"; export const TypeId: TypeId = "~local/sqlite-node/SqliteClient"; @@ -84,9 +82,6 @@ const makeWithDatabase = Effect.fn("makeWithDatabase")(function* ( const transformRows = options.transformResultNames ? Statement.defaultTransforms(options.transformResultNames).array : undefined; - const databaseMetricAttributes = { - database: options.filename === ":memory:" ? "memory" : "disk", - } as const; const makeConnection = Effect.gen(function* () { const scope = yield* Effect.scope; @@ -142,13 +137,7 @@ const makeWithDatabase = Effect.fn("makeWithDatabase")(function* ( }), ); } - }).pipe( - withMetrics({ - counter: dbQueriesTotal, - timer: dbQueryDuration, - attributes: databaseMetricAttributes, - }), - ); + }); const run = (sql: string, params: ReadonlyArray, raw = false) => Effect.flatMap(Cache.get(prepareCache, sql), (s) => runStatement(s, params, raw)); @@ -183,12 +172,6 @@ const makeWithDatabase = Effect.fn("makeWithDatabase")(function* ( statement.setReturnArrays(false); } }), - ).pipe( - withMetrics({ - counter: dbQueriesTotal, - timer: dbQueryDuration, - attributes: databaseMetricAttributes, - }), ); return identity({