From 2ddb2e5a4ffe714958f97987228515a192f4da02 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 5 May 2026 05:34:20 +0000 Subject: [PATCH] Track bundle resource counts and state file sizes in telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new typed BundleResourcesMetadata struct on DatabricksBundleDeployEvent, capturing per-resource-type metadata for a bundle deploy: - count of resources of each type declared in the bundle configuration - max, mean, median state size in bytes across resources of that type - whole state file size on disk - deployment engine ("direct" or "terraform") For Terraform deployments the tfstate is translated to the direct- engine representation (via the existing TerraformToGroupName map) before sizing so per-type stats are comparable across engines. The new count field replaces the deprecated DatabricksBundleDeployEvent .resource_*_count fields; both are populated during the transition. The Go mirror marks the deprecated Resource*Count fields with a "// Deprecated:" comment. Measurement is performed at telemetry-emission time by reading the on-disk state file once, so this lands as a single isolated module (bundle/phases/resources_metadata.go) with one new line at the call site — no instrumentation in deploy mutators, state-mgmt code, or bundle.Metrics. To remove: delete the new module and revert one line in telemetry.go plus the proto/Go field. Requires the new resources_metadata field on DatabricksBundleDeployEvent from the universe PR. Lumberjack drops unknown fields, so the two PRs can land in either order. 
--- bundle/phases/resources_metadata.go | 250 +++++++++++++++++++++++ bundle/phases/resources_metadata_test.go | 158 ++++++++++++++ bundle/phases/telemetry.go | 2 + libs/telemetry/protos/bundle_deploy.go | 37 ++++ 4 files changed, 447 insertions(+) create mode 100644 bundle/phases/resources_metadata.go create mode 100644 bundle/phases/resources_metadata_test.go diff --git a/bundle/phases/resources_metadata.go b/bundle/phases/resources_metadata.go new file mode 100644 index 00000000000..3bb314daec3 --- /dev/null +++ b/bundle/phases/resources_metadata.go @@ -0,0 +1,250 @@ +package phases + +import ( + "cmp" + "context" + "encoding/json" + "errors" + "io/fs" + "os" + "path/filepath" + "slices" + "strings" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/deploy/terraform" + "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/telemetry/protos" + tfjson "github.com/hashicorp/terraform-json" +) + +// collectResourcesMetadata builds a BundleResourcesMetadata for the deploy: +// per-resource-type counts come from the bundle configuration (matching the +// semantics of the deprecated DatabricksBundleDeployEvent.resource_*_count +// fields), and state-size statistics come from the on-disk deployment state +// file. For Terraform deployments the tfstate is translated to the direct- +// engine representation before sizing so per-type stats are comparable across +// engines. +// +// Returns nil only on a complete absence of signal (no resources declared and +// no readable state). Telemetry must never fail a deploy — all parse errors +// are logged at debug level and treated as missing data. +// +// This file is the sole site of resource-metadata telemetry logic. 
To remove +// the feature: delete this file and its companion test, revert the call site +// in telemetry.go, and revert the ResourcesMetadata field in +// libs/telemetry/protos/bundle_deploy.go. +func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata { + counts := countResourcesByType(ctx, b) + + engine, fileSize, sizesByType := readStateForMetadata(ctx, b) + + if len(counts) == 0 && len(sizesByType) == 0 && fileSize == 0 { + return nil + } + + types := unionKeys(counts, sizesByType) + slices.Sort(types) + + resources := make([]protos.ResourceMetadata, 0, len(types)) + for _, t := range types { + sizes := sizesByType[t] + slices.SortFunc(sizes, func(a, b int64) int { return cmp.Compare(a, b) }) + resources = append(resources, protos.ResourceMetadata{ + ResourceType: t, + Count: counts[t], + StateSizeMaxBytes: statMax(sizes), + StateSizeMeanBytes: statMean(sizes), + StateSizeMedianBytes: statMedian(sizes), + }) + } + + return &protos.BundleResourcesMetadata{ + StateEngine: engine, + StateFileSizeBytes: fileSize, + Resources: resources, + } +} + +// countResourcesByType walks the bundle config and counts top-level resources +// at "resources..". Returns map[type]count. +func countResourcesByType(ctx context.Context, b *bundle.Bundle) map[string]int64 { + out := make(map[string]int64) + pattern := dyn.NewPattern(dyn.Key("resources"), dyn.AnyKey(), dyn.AnyKey()) + _, err := dyn.MapByPattern(b.Config.Value(), pattern, func(p dyn.Path, v dyn.Value) (dyn.Value, error) { + if len(p) >= 2 { + out[p[1].Key()]++ + } + return v, nil + }) + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to walk config resources: %s", err) + } + return out +} + +// readStateForMetadata reads whichever local state file exists (direct +// preferred, then terraform) and returns engine name, whole-file size, and +// per-resource-type sizes. 
Returns ("", 0, nil) if no state is present or if +// the bundle isn't far enough through initialization to have a target +// selected (which is required to compute state file paths). +func readStateForMetadata(ctx context.Context, b *bundle.Bundle) (string, int64, map[string][]int64) { + if b.Target == nil { + return "", 0, nil + } + + if _, localPath := b.StateFilenameDirect(ctx); localPath != "" { + raw, err := readStateFile(localPath) + if err == nil && raw != nil { + return "direct", int64(len(raw)), parseDirectStateSizes(ctx, raw) + } + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: skipping direct state at %s: %s", localPath, err) + } + } + + if _, localPath := b.StateFilenameTerraform(ctx); localPath != "" { + raw, err := readStateFile(localPath) + if errors.Is(err, fs.ErrNotExist) { + altPath := terraformCacheStatePath(ctx, b) + if altPath != localPath && altPath != "" { + raw, err = readStateFile(altPath) + } + } + if err == nil && raw != nil { + return "terraform", int64(len(raw)), parseTerraformStateSizes(ctx, raw) + } + if err != nil { + log.Debugf(ctx, "resources-metadata telemetry: skipping terraform state at %s: %s", localPath, err) + } + } + + return "", 0, nil +} + +func readStateFile(path string) ([]byte, error) { + if path == "" { + return nil, nil + } + raw, err := os.ReadFile(path) + if errors.Is(err, fs.ErrNotExist) { + return nil, nil + } + return raw, err +} + +func terraformCacheStatePath(ctx context.Context, b *bundle.Bundle) string { + dir, err := terraform.Dir(ctx, b) + if err != nil { + return "" + } + return filepath.Join(dir, "terraform.tfstate") +} + +func parseDirectStateSizes(ctx context.Context, raw []byte) map[string][]int64 { + var db dstate.Database + if err := json.Unmarshal(raw, &db); err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to parse direct state: %s", err) + return nil + } + byType := make(map[string][]int64) + for key, entry := range db.State { + t := resourceTypeFromKey(key) + 
if t == "" { + continue + } + byType[t] = append(byType[t], int64(len(entry.State))) + } + return byType +} + +func parseTerraformStateSizes(ctx context.Context, raw []byte) map[string][]int64 { + var state struct { + Version int `json:"version"` + Resources []struct { + Type string `json:"type"` + Mode tfjson.ResourceMode `json:"mode"` + Instances []struct { + Attributes json.RawMessage `json:"attributes"` + } `json:"instances"` + } `json:"resources"` + } + if err := json.Unmarshal(raw, &state); err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to parse terraform state: %s", err) + return nil + } + byType := make(map[string][]int64) + for _, resource := range state.Resources { + if resource.Mode != tfjson.ManagedResourceMode { + continue + } + groupName, ok := terraform.TerraformToGroupName[resource.Type] + if !ok { + continue + } + for _, instance := range resource.Instances { + byType[groupName] = append(byType[groupName], int64(len(instance.Attributes))) + } + } + return byType +} + +// resourceTypeFromKey extracts the resource type from a direct-engine state +// key. Direct-engine keys are of the form "resources.." or +// "resources..." (for permissions/grants/secret_acls). +// Returns "" for keys that don't match. +func resourceTypeFromKey(key string) string { + parts := strings.SplitN(key, ".", 4) + if len(parts) < 3 || parts[0] != "resources" { + return "" + } + if len(parts) == 4 { + // Sub-resources like permissions / grants / secret_acls live at + // "resources...". Track them under the sub-resource + // type so they aggregate across resource families. 
+ return parts[3] + } + return parts[1] +} + +func unionKeys(a map[string]int64, b map[string][]int64) []string { + seen := make(map[string]struct{}, len(a)+len(b)) + for k := range a { + seen[k] = struct{}{} + } + for k := range b { + seen[k] = struct{}{} + } + out := make([]string, 0, len(seen)) + for k := range seen { + out = append(out, k) + } + return out +} + +func statMax(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[len(sortedSizes)-1] +} + +func statMean(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + var total int64 + for _, s := range sortedSizes { + total += s + } + return total / int64(len(sortedSizes)) +} + +func statMedian(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[(len(sortedSizes)-1)/2] +} diff --git a/bundle/phases/resources_metadata_test.go b/bundle/phases/resources_metadata_test.go new file mode 100644 index 00000000000..b67454852b6 --- /dev/null +++ b/bundle/phases/resources_metadata_test.go @@ -0,0 +1,158 @@ +package phases + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/databricks/cli/bundle/direct/dstate" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseDirectStateSizes_GroupsByResourceType(t *testing.T) { + db := dstate.Database{ + State: map[string]dstate.ResourceEntry{ + "resources.jobs.foo": {State: json.RawMessage(`{"name":"foo","x":1}`)}, // 20 + "resources.jobs.bar": {State: json.RawMessage(`{"n":"bar"}`)}, // 11 + "resources.jobs.baz": {State: json.RawMessage(`{"name":"baz","y":42}`)}, // 21 + "resources.pipelines.qux": {State: json.RawMessage(`{"name":"qux"}`)}, // 14 + }, + } + raw, err := json.Marshal(db) + require.NoError(t, err) + + got := parseDirectStateSizes(t.Context(), raw) + assert.ElementsMatch(t, []int64{20, 11, 21}, got["jobs"]) + assert.ElementsMatch(t, []int64{14}, got["pipelines"]) +} + +func 
TestParseDirectStateSizes_SubResources(t *testing.T) { + db := dstate.Database{ + State: map[string]dstate.ResourceEntry{ + "resources.jobs.foo": {State: json.RawMessage(`{}`)}, + "resources.jobs.foo.permissions": {State: json.RawMessage(`[]`)}, + }, + } + raw, _ := json.Marshal(db) + got := parseDirectStateSizes(t.Context(), raw) + assert.Len(t, got["jobs"], 1) + assert.Len(t, got["permissions"], 1) +} + +func TestParseDirectStateSizes_Malformed(t *testing.T) { + assert.Nil(t, parseDirectStateSizes(t.Context(), []byte("not json"))) +} + +func TestParseTerraformStateSizes_TranslatesAndGroups(t *testing.T) { + tfstate := map[string]any{ + "version": 4, + "resources": []any{ + map[string]any{ + "type": "databricks_job", + "mode": "managed", + "instances": []any{ + map[string]any{"attributes": map[string]any{"id": "1001", "name": "foo"}}, + map[string]any{"attributes": map[string]any{"id": "1002", "name": "bar"}}, + }, + }, + map[string]any{ + "type": "databricks_pipeline", + "mode": "managed", + "instances": []any{ + map[string]any{"attributes": map[string]any{"id": "abc"}}, + }, + }, + map[string]any{ + "type": "databricks_unknown_provider_type", + "mode": "managed", + "instances": []any{ + map[string]any{"attributes": map[string]any{"id": "xyz"}}, + }, + }, + }, + } + raw, err := json.Marshal(tfstate) + require.NoError(t, err) + + got := parseTerraformStateSizes(t.Context(), raw) + assert.Len(t, got["jobs"], 2) + assert.Len(t, got["pipelines"], 1) + + for k := range got { + assert.NotContains(t, k, "unknown") + } +} + +func TestParseTerraformStateSizes_SkipsDataSources(t *testing.T) { + tfstate := map[string]any{ + "version": 4, + "resources": []any{ + map[string]any{ + "type": "databricks_job", + "mode": "data", + "instances": []any{ + map[string]any{"attributes": map[string]any{"id": "1001"}}, + }, + }, + }, + } + raw, _ := json.Marshal(tfstate) + got := parseTerraformStateSizes(t.Context(), raw) + assert.Empty(t, got["jobs"]) +} + +func 
TestParseTerraformStateSizes_Malformed(t *testing.T) { + assert.Nil(t, parseTerraformStateSizes(t.Context(), []byte("not json"))) +} + +func TestResourceTypeFromKey(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"resources.jobs.foo", "jobs"}, + {"resources.pipelines.bar", "pipelines"}, + {"resources.jobs.foo.permissions", "permissions"}, + {"resources.secret_scopes.s.permissions", "permissions"}, + {"not-a-state-key", ""}, + {"resources.jobs", ""}, + } + for _, c := range cases { + assert.Equal(t, c.want, resourceTypeFromKey(c.in), "key=%q", c.in) + } +} + +func TestReadStateFile_MissingReturnsNilNil(t *testing.T) { + tmp := filepath.Join(t.TempDir(), "missing.json") + raw, err := readStateFile(tmp) + assert.NoError(t, err) + assert.Nil(t, raw) +} + +func TestReadStateFile_ReadsExistingFile(t *testing.T) { + tmp := filepath.Join(t.TempDir(), "state.json") + require.NoError(t, os.WriteFile(tmp, []byte("hello"), 0o600)) + raw, err := readStateFile(tmp) + require.NoError(t, err) + assert.Equal(t, []byte("hello"), raw) +} + +func TestStatHelpers(t *testing.T) { + // Helpers expect a sorted slice (collectResourcesMetadata sorts before calling). + assert.Equal(t, int64(3), statMax([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMean([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3})) + // Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2. + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4})) + // Empty slices are zero. 
+ assert.Equal(t, int64(0), statMax(nil)) + assert.Equal(t, int64(0), statMean(nil)) + assert.Equal(t, int64(0), statMedian(nil)) +} + +func TestUnionKeys(t *testing.T) { + got := unionKeys(map[string]int64{"a": 1, "b": 2}, map[string][]int64{"b": nil, "c": nil}) + assert.ElementsMatch(t, []string{"a", "b", "c"}, got) +} diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index bb9a7d7e6b7..ac8adc102a6 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -180,6 +180,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { ResourceClusterIDs: clusterIds, ResourceDashboardIDs: dashboardIds, + ResourcesMetadata: collectResourcesMetadata(ctx, b), + Experimental: &protos.BundleDeployExperimental{ BundleMode: mode, ConfigurationFileCount: b.Metrics.ConfigurationFileCount, diff --git a/libs/telemetry/protos/bundle_deploy.go b/libs/telemetry/protos/bundle_deploy.go index d9439437d9b..7d1419a6e46 100644 --- a/libs/telemetry/protos/bundle_deploy.go +++ b/libs/telemetry/protos/bundle_deploy.go @@ -10,6 +10,9 @@ type BundleDeployEvent struct { // Error message encountered during the bundle deploy command, if any. ErrorMessage string `json:"error_message,omitempty"` + // Deprecated: use ResourcesMetadata.Resources[*].Count instead. + // Per-resource-type counts are derived from the bundle configuration at + // telemetry-emission time and tracked under ResourcesMetadata. ResourceCount int64 `json:"resource_count"` ResourceJobCount int64 `json:"resource_job_count"` ResourcePipelineCount int64 `json:"resource_pipeline_count"` @@ -32,6 +35,9 @@ type BundleDeployEvent struct { ResourceClusterIDs []string `json:"resource_cluster_ids,omitempty"` ResourceDashboardIDs []string `json:"resource_dashboard_ids,omitempty"` + // Per-resource-type metadata (counts and state-size statistics). 
+ ResourcesMetadata *BundleResourcesMetadata `json:"resources_metadata,omitempty"` + Experimental *BundleDeployExperimental `json:"experimental,omitempty"` } @@ -88,6 +94,37 @@ type BundleDeployExperimental struct { LocalCacheMeasurementsMs []IntMapEntry `json:"local_cache_measurements_ms,omitempty"` } +// BundleResourcesMetadata mirrors the universe proto. Per-resource-type +// metadata for one bundle deployment, including counts (which replace the +// deprecated DatabricksBundleDeployEvent.resource_*_count fields) and +// state-size statistics. +type BundleResourcesMetadata struct { + // "direct" or "terraform" + StateEngine string `json:"state_engine,omitempty"` + + // Size in bytes of the entire deployment state file on disk. + StateFileSizeBytes int64 `json:"state_file_size_bytes,omitempty"` + + // One entry per resource type present in the bundle. + Resources []ResourceMetadata `json:"resources,omitempty"` +} + +// ResourceMetadata holds metadata about resources of a single type within one +// bundle deployment. +type ResourceMetadata struct { + // Resource type name: "jobs", "pipelines", "schemas", ... + ResourceType string `json:"resource_type,omitempty"` + + // Number of resources of this type declared in the bundle configuration. + Count int64 `json:"count,omitempty"` + + // State-size statistics across resources of this type tracked in the + // deployment state. Zero when no resources of this type are in state. + StateSizeMaxBytes int64 `json:"state_size_max_bytes,omitempty"` + StateSizeMeanBytes int64 `json:"state_size_mean_bytes,omitempty"` + StateSizeMedianBytes int64 `json:"state_size_median_bytes,omitempty"` +} + type BoolMapEntry struct { Key string `json:"key,omitempty"` Value bool `json:"value"`