Skip to content

Commit 0ecc385

Browse files
authored
Sample high-volume analytics events (#653)
1 parent 361e2df commit 0ecc385

9 files changed

Lines changed: 470 additions & 10 deletions

File tree

cli/src/utils/analytics.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import {
99
IS_PROD as defaultIsProd,
1010
DEBUG_ANALYTICS,
1111
} from '@codebuff/common/env'
12+
import { shouldTrackAnalyticsEvent } from '@codebuff/common/util/analytics-sampling'
1213

1314
import type { AnalyticsEvent } from '@codebuff/common/constants/analytics-events'
1415

@@ -211,6 +212,10 @@ export function trackEvent(
211212
return
212213
}
213214

215+
if (!shouldTrackAnalyticsEvent({ event, distinctId, properties })) {
216+
return
217+
}
218+
214219
try {
215220
client.capture({
216221
distinctId,

cli/src/utils/logger.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events'
77
import { env, IS_DEV, IS_TEST, IS_CI } from '@codebuff/common/env'
88
import { createAnalyticsDispatcher } from '@codebuff/common/util/analytics-dispatcher'
99
import { getAnalyticsEventId } from '@codebuff/common/util/analytics-log'
10+
import {
11+
isFullTelemetryEnabled,
12+
summarizeAnalyticsValue,
13+
} from '@codebuff/common/util/analytics-sampling'
1014
import { pino } from 'pino'
1115

1216
import {
@@ -169,10 +173,23 @@ function sendAnalyticsAndLog(
169173
// Skip if the log already has an eventId (to avoid duplicate tracking)
170174
const hasEventId = includeData && getAnalyticsEventId(normalizedData) !== null
171175
if (!IS_DEV && !IS_TEST && !IS_CI && !hasEventId) {
176+
const fullTelemetry = isFullTelemetryEnabled({
177+
distinctId: loggerContext.userId,
178+
properties: loggerContext,
179+
})
180+
const includeRawData =
181+
fullTelemetry || level === 'error' || level === 'fatal'
182+
const dataProperties =
183+
includeData && includeRawData
184+
? { data: normalizedData }
185+
: includeData
186+
? { dataSummary: summarizeAnalyticsValue(normalizedData) }
187+
: {}
188+
172189
trackEvent(AnalyticsEvent.CLI_LOG, {
173190
level,
174191
msg: stringFormat(normalizedMsg ?? '', ...args),
175-
...(includeData ? { data: normalizedData } : {}),
192+
...dataProperties,
176193
...loggerContext,
177194
})
178195
}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import { afterEach, describe, expect, it } from 'bun:test'
2+
3+
import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events'
4+
5+
import {
6+
isFullTelemetryEnabled,
7+
shouldTrackAnalyticsEvent,
8+
summarizeAnalyticsValue,
9+
} from '../analytics-sampling'
10+
11+
// Snapshot of the telemetry-related env vars, captured once at module load so
// that tests which mutate `process.env` can be rolled back afterwards.
const ORIGINAL_ENV = Object.fromEntries(
  [
    'CODEBUFF_FULL_TELEMETRY',
    'CODEBUFF_FULL_TELEMETRY_IDS',
    'CODEBUFF_FULL_TELEMETRY_USER_IDS',
  ].map((name) => [name, process.env[name]]),
)
17+
18+
/**
 * Puts every variable recorded in `ORIGINAL_ENV` back to its captured value.
 * Variables that were absent when the snapshot was taken are deleted again
 * rather than being assigned `undefined`.
 */
function restoreEnv() {
  Object.entries(ORIGINAL_ENV).forEach(([name, saved]) => {
    if (saved !== undefined) {
      process.env[name] = saved
      return
    }
    delete process.env[name]
  })
}
27+
28+
describe('analytics sampling', () => {
  // Roll back any env vars a test changed so the cases stay independent.
  afterEach(restoreEnv)

  it('always tracks core CLI lifecycle events', () => {
    const lifecycleEvents = [
      AnalyticsEvent.APP_LAUNCHED,
      AnalyticsEvent.USER_INPUT_COMPLETE,
    ]

    for (const event of lifecycleEvents) {
      const tracked = shouldTrackAnalyticsEvent({
        event,
        distinctId: 'user-1',
      })
      expect(tracked).toBe(true)
    }
  })

  it('always tracks CLI error logs', () => {
    const tracked = shouldTrackAnalyticsEvent({
      event: AnalyticsEvent.CLI_LOG,
      distinctId: 'user-1',
      properties: { level: 'error' },
    })

    expect(tracked).toBe(true)
  })

  it('samples high-volume events deterministically', () => {
    // Same event + same distinctId must give the same decision on every call.
    const decideToolUse = () =>
      shouldTrackAnalyticsEvent({
        event: AnalyticsEvent.TOOL_USE,
        distinctId: 'user-1',
      })

    const first = decideToolUse()
    const second = decideToolUse()
    const otherEvent = shouldTrackAnalyticsEvent({
      event: AnalyticsEvent.AGENT_STEP,
      distinctId: 'user-1',
    })

    expect(second).toBe(first)
    expect(typeof otherEvent).toBe('boolean')
  })

  it('honors full telemetry env flags and allowlists', () => {
    // Global flag: enabled for any caller.
    process.env.CODEBUFF_FULL_TELEMETRY = 'true'
    const enabledGlobally = isFullTelemetryEnabled({ distinctId: 'anyone' })
    expect(enabledGlobally).toBe(true)

    // Allowlist only: enabled for listed ids/emails, disabled for everyone else.
    delete process.env.CODEBUFF_FULL_TELEMETRY
    process.env.CODEBUFF_FULL_TELEMETRY_IDS = 'user-2,person@example.com'

    const listedById = isFullTelemetryEnabled({ distinctId: 'user-2' })
    expect(listedById).toBe(true)

    const listedByEmail = isFullTelemetryEnabled({
      properties: { userEmail: 'person@example.com' },
    })
    expect(listedByEmail).toBe(true)

    const unlisted = isFullTelemetryEnabled({ distinctId: 'user-3' })
    expect(unlisted).toBe(false)
  })

  it('summarizes values without retaining raw contents', () => {
    const stringSummary = summarizeAnalyticsValue('secret text')
    expect(stringSummary).toEqual({
      kind: 'string',
      length: 11,
    })

    const arraySummary = summarizeAnalyticsValue(['a', 'b'])
    expect(arraySummary).toEqual({
      kind: 'array',
      length: 2,
    })

    const objectSummary = summarizeAnalyticsValue({ prompt: 'secret', count: 1 })
    expect(objectSummary).toEqual({
      kind: 'object',
      keyCount: 2,
      keys: ['prompt', 'count'],
    })
  })
})
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import { AnalyticsEvent } from '../constants/analytics-events'
2+
3+
/** Sample rate shared by every entry in `SAMPLED_EVENT_RATES` below. */
const DEFAULT_SAMPLED_RATE = 0.01

/**
 * Per-event sample rates. Events that appear here are sampled at the given
 * rate; events that do not appear are looked up as `undefined`.
 */
const SAMPLED_EVENT_RATES: Partial<Record<AnalyticsEvent, number>> = {
  [AnalyticsEvent.AGENT_STEP]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.CHATGPT_OAUTH_REQUEST]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.CLI_LOG]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.FEEDBACK_BUTTON_HOVERED]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.FOLLOWUP_CLICKED]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.SLASH_COMMAND_USED]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.SLASH_MENU_ACTIVATED]: DEFAULT_SAMPLED_RATE,
  [AnalyticsEvent.TOOL_USE]: DEFAULT_SAMPLED_RATE,
}

// Events listed here are members of the always-track set.
const ALWAYS_TRACKED: AnalyticsEvent[] = [
  AnalyticsEvent.APP_LAUNCHED,
  AnalyticsEvent.CHANGE_DIRECTORY,
  AnalyticsEvent.CHATGPT_OAUTH_AUTH_ERROR,
  AnalyticsEvent.CHATGPT_OAUTH_RATE_LIMITED,
  AnalyticsEvent.FINGERPRINT_GENERATED,
  AnalyticsEvent.INVALID_COMMAND,
  AnalyticsEvent.KNOWLEDGE_FILE_UPDATED,
  AnalyticsEvent.LOGIN,
  AnalyticsEvent.TERMINAL_COMMAND_COMPLETED,
  AnalyticsEvent.UPDATE_CODEBUFF_FAILED,
  AnalyticsEvent.USER_INPUT,
  AnalyticsEvent.USER_INPUT_COMPLETE,
]

/** Set form of the list above, for constant-time membership checks. */
const ALWAYS_TRACK_EVENTS = new Set<AnalyticsEvent>(ALWAYS_TRACKED)

/** Loose property bag attached to an analytics event; may be absent. */
type AnalyticsProperties = Record<string, unknown> | undefined
32+
33+
/**
 * Reads `key` from `properties` and returns it only when it is a string with
 * at least one non-whitespace character.
 *
 * @param properties - Property bag to read from; may be absent.
 * @param key - Name of the property to look up.
 * @returns The stored string exactly as found (not trimmed), or `undefined`
 *   when the bag is missing, the value is not a string, or it is blank.
 */
function getStringProperty(
  properties: AnalyticsProperties,
  key: string,
): string | undefined {
  if (properties == null) {
    return undefined
  }

  const candidate = properties[key]
  if (typeof candidate !== 'string') {
    return undefined
  }

  // Whitespace-only strings count as "not provided".
  return candidate.trim().length > 0 ? candidate : undefined
}
40+
41+
/**
 * Looks for a user identifier inside an event's properties.
 *
 * The flat keys `userId`, `user_id` and `distinct_id` are tried in that order;
 * if none holds a non-blank string, a nested `user.id` string is used instead.
 *
 * @param properties - Property bag to inspect; may be absent.
 * @returns The first non-blank identifier found, or `undefined`.
 */
function getPropertyUserId(properties: AnalyticsProperties): string | undefined {
  for (const key of ['userId', 'user_id', 'distinct_id']) {
    const flatId = getStringProperty(properties, key)
    if (flatId !== undefined) {
      return flatId
    }
  }

  const nestedUser = properties?.user
  if (!nestedUser || typeof nestedUser !== 'object') {
    return undefined
  }

  const nestedId = (nestedUser as { id?: unknown }).id
  if (typeof nestedId === 'string' && nestedId.trim()) {
    return nestedId
  }
  return undefined
}
58+
59+
/**
 * Parses a comma-separated env value into a set of entries.
 *
 * @param value - Raw env value; `undefined` is treated as an empty list.
 * @returns The trimmed, non-empty entries, de-duplicated.
 */
function splitEnvList(value: string | undefined): Set<string> {
  const entries = new Set<string>()
  if (value == null) {
    return entries
  }

  for (const piece of value.split(',')) {
    const cleaned = piece.trim()
    if (cleaned) {
      entries.add(cleaned)
    }
  }
  return entries
}
67+
68+
/**
 * Interprets an env value as an on/off flag.
 *
 * Accepts `1`, `true` and `yes`. Surrounding whitespace and letter case are
 * ignored, so `TRUE` or ` yes ` also enable the flag instead of silently
 * reading as "off".
 *
 * @param value - Raw env value; `undefined` means the variable is not set.
 * @returns `true` only for one of the accepted spellings.
 */
function isTruthyEnv(value: string | undefined): boolean {
  const normalized = value?.trim().toLowerCase()
  return normalized === '1' || normalized === 'true' || normalized === 'yes'
}
71+
72+
/**
 * Reports whether full telemetry is switched on for this caller.
 *
 * It is on when `CODEBUFF_FULL_TELEMETRY` is a truthy flag, or when one of the
 * caller's identifiers appears in the comma-separated allowlist read from
 * `CODEBUFF_FULL_TELEMETRY_IDS` (falling back to
 * `CODEBUFF_FULL_TELEMETRY_USER_IDS`).
 *
 * @param params.distinctId - Analytics distinct id of the caller, if known.
 * @param params.properties - Event properties; a user id (`userId`, `user_id`,
 *   `distinct_id`, `user.id`) and `userEmail` / `email` are checked as well.
 * @returns `true` when the global flag is set or any identifier is allowlisted.
 */
export function isFullTelemetryEnabled(params: {
  distinctId?: string
  properties?: AnalyticsProperties
}): boolean {
  if (isTruthyEnv(process.env.CODEBUFF_FULL_TELEMETRY)) {
    return true
  }

  // Fall back whenever the primary variable yields no entries, not only when
  // it is undefined: a variable that is set but blank (e.g. `FOO=` in a .env
  // file) must not mask the alternate variable.
  let ids = splitEnvList(process.env.CODEBUFF_FULL_TELEMETRY_IDS)
  if (ids.size === 0) {
    ids = splitEnvList(process.env.CODEBUFF_FULL_TELEMETRY_USER_IDS)
  }
  if (ids.size === 0) {
    return false
  }

  const candidates = [
    params.distinctId,
    getPropertyUserId(params.properties),
    getStringProperty(params.properties, 'userEmail'),
    getStringProperty(params.properties, 'email'),
  ].filter(
    (value): value is string =>
      typeof value === 'string' && value.length > 0,
  )

  return candidates.some((candidate) => ids.has(candidate))
}
100+
101+
/**
 * Resolves the sample rate for an event.
 *
 * @param event - Event being considered.
 * @param properties - Event properties; only `level` is consulted, and only
 *   for `CLI_LOG` events.
 * @returns `1` for always-tracked events and for `CLI_LOG` events whose level
 *   is `error` or `fatal` (case-insensitive); otherwise the configured rate
 *   from `SAMPLED_EVENT_RATES`, defaulting to `1` when none is configured.
 */
function getEventSampleRate(
  event: AnalyticsEvent,
  properties: AnalyticsProperties,
): number {
  if (ALWAYS_TRACK_EVENTS.has(event)) {
    return 1
  }

  if (event === AnalyticsEvent.CLI_LOG) {
    const severity = getStringProperty(properties, 'level')?.toLowerCase()
    if (severity === 'error' || severity === 'fatal') {
      return 1
    }
  }

  const configuredRate = SAMPLED_EVENT_RATES[event]
  return configuredRate ?? 1
}
119+
120+
/**
 * 32-bit FNV-1a hash over the UTF-16 code units of `input`.
 *
 * @param input - String to hash.
 * @returns The hash as an unsigned 32-bit integer.
 */
function hashString(input: string): number {
  const FNV_OFFSET_BASIS = 0x811c9dc5
  const FNV_PRIME = 0x01000193

  let acc = FNV_OFFSET_BASIS
  let index = 0
  while (index < input.length) {
    // XOR the code unit in first, then multiply; Math.imul keeps the product
    // within 32 bits.
    acc = Math.imul(acc ^ input.charCodeAt(index), FNV_PRIME)
    index += 1
  }

  // Reinterpret the signed 32-bit result as unsigned.
  return acc >>> 0
}
128+
129+
/**
 * Picks the string that identifies the caller for sampling purposes.
 *
 * Preference order: `distinctId`, a user id found in the properties,
 * `clientSessionId`, `userInputId`, and finally the event name itself.
 *
 * @param params - Event, optional distinct id and optional properties.
 * @returns The first of the above that is not `null`/`undefined`.
 */
function getSamplingKey(params: {
  event: AnalyticsEvent
  distinctId?: string
  properties?: AnalyticsProperties
}): string {
  const { distinctId, properties, event } = params
  if (distinctId != null) {
    return distinctId
  }

  const fromUser = getPropertyUserId(properties)
  if (fromUser != null) {
    return fromUser
  }

  for (const key of ['clientSessionId', 'userInputId']) {
    const fromProperty = getStringProperty(properties, key)
    if (fromProperty != null) {
      return fromProperty
    }
  }

  return event
}
142+
143+
/**
 * Decides whether an analytics event should be sent.
 *
 * Events are always sent when full telemetry is enabled for the caller or the
 * event's sample rate is at least 1, and never sent when the rate is 0 or
 * below. Otherwise the event name and sampling key are hashed to a value
 * between 0 and 1 and the event is sent when that value is below the rate, so
 * the same event/key pair always gets the same decision.
 *
 * @param params.event - Event being considered.
 * @param params.distinctId - Analytics distinct id of the caller, if known.
 * @param params.properties - Event properties.
 * @returns `true` when the event should be tracked.
 */
export function shouldTrackAnalyticsEvent(params: {
  event: AnalyticsEvent
  distinctId?: string
  properties?: AnalyticsProperties
}): boolean {
  if (isFullTelemetryEnabled(params)) {
    return true
  }

  const { event, properties } = params
  const rate = getEventSampleRate(event, properties)
  if (rate >= 1) {
    return true
  }
  if (rate <= 0) {
    return false
  }

  const hashInput = `${event}:${getSamplingKey(params)}`
  const bucket = hashString(hashInput) / 0xffffffff
  return bucket < rate
}
164+
165+
/**
 * Names the kind of a value: like `typeof`, except that arrays report
 * `'array'` and `null` reports `'null'` instead of `'object'`.
 *
 * @param value - Any value.
 * @returns `'array'`, `'null'`, or the `typeof` result.
 */
function valueKind(value: unknown): string {
  if (value === null) {
    return 'null'
  }
  return Array.isArray(value) ? 'array' : typeof value
}
174+
175+
/**
 * Builds a small structural description of a value that omits its contents.
 *
 * - strings: `{ kind: 'string', length }`
 * - arrays: `{ kind: 'array', length }`
 * - other objects: `{ kind: 'object', keyCount, keys }`, where `keys` holds at
 *   most the first 25 own enumerable key names
 * - everything else (including `null` and `undefined`): `{ kind }`
 *
 * Note that object key names are included in the summary.
 *
 * @param value - Value to describe.
 * @returns The summary record.
 */
export function summarizeAnalyticsValue(
  value: unknown,
): Record<string, unknown> {
  if (typeof value === 'string') {
    return { kind: 'string', length: value.length }
  }

  if (Array.isArray(value)) {
    return { kind: 'array', length: value.length }
  }

  if (value !== null && typeof value === 'object') {
    const keyNames = Object.keys(value)
    const keyCount = keyNames.length
    return { kind: 'object', keyCount, keys: keyNames.slice(0, 25) }
  }

  // null, undefined and the remaining primitives/functions carry only a kind.
  return { kind: valueKind(value) }
}

docs/environment-variables.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
- Server secrets: validated in `packages/internal/src/env-schema.ts` (used via `@codebuff/internal/env`).
77
- Runtime/OS env: pass typed snapshots instead of reading `process.env` throughout the codebase.
88
- `IPINFO_TOKEN` is required; free-mode country gating uses it to check IPinfo privacy signals for VPN/proxy/Tor/relay/hosting traffic.
9+
- `CODEBUFF_FULL_TELEMETRY=true` or `CODEBUFF_FULL_TELEMETRY_IDS=user-id,email@example.com`
10+
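disables client analytics sampling for targeted debugging (`1` and `yes` are accepted as well as `true`; `CODEBUFF_FULL_TELEMETRY_USER_IDS` is read as a fallback when `CODEBUFF_FULL_TELEMETRY_IDS` is not set). Use sparingly because it can send full CLI log payloads.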
911

1012
## Env DI Helpers
1113

0 commit comments

Comments
 (0)