diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 1459df2947..13efb33eb7 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -8,6 +8,8 @@ ### Bundles + * direct: Cluster resize now falls back to regular update if resize fails due to `INVALID_STATE` ([#5716](https://github.com/databricks/cli/pull/5716)). + ### Dependency updates ### API Changes diff --git a/acceptance/bundle/resources/clusters/resize-terminated-fallback/databricks.yml.tmpl b/acceptance/bundle/resources/clusters/resize-terminated-fallback/databricks.yml.tmpl new file mode 100644 index 0000000000..ff3fa1211a --- /dev/null +++ b/acceptance/bundle/resources/clusters/resize-terminated-fallback/databricks.yml.tmpl @@ -0,0 +1,14 @@ +bundle: + name: test-bundle + +workspace: + root_path: ~/.bundle/$UNIQUE_NAME + +resources: + clusters: + test_cluster: + cluster_name: test-cluster-$UNIQUE_NAME + spark_version: $DEFAULT_SPARK_VERSION + node_type_id: $NODE_TYPE_ID + instance_pool_id: $TEST_INSTANCE_POOL_ID + num_workers: 2 diff --git a/acceptance/bundle/resources/clusters/resize-terminated-fallback/out.test.toml b/acceptance/bundle/resources/clusters/resize-terminated-fallback/out.test.toml new file mode 100644 index 0000000000..9cfad3fb0d --- /dev/null +++ b/acceptance/bundle/resources/clusters/resize-terminated-fallback/out.test.toml @@ -0,0 +1,3 @@ +Local = true +Cloud = true +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/resources/clusters/resize-terminated-fallback/output.txt b/acceptance/bundle/resources/clusters/resize-terminated-fallback/output.txt new file mode 100644 index 0000000000..61c620e223 --- /dev/null +++ b/acceptance/bundle/resources/clusters/resize-terminated-fallback/output.txt @@ -0,0 +1,66 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/[UNIQUE_NAME]/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== Create a plan while cluster is running: should show resize + +>>> [CLI] bundle plan -o json + +=== Terminate the cluster before applying the saved plan + +>>> [CLI] clusters get [CLUSTER_ID] +{ + "cluster_name": "test-cluster-[UNIQUE_NAME]", + "num_workers": 2, + "state": "TERMINATED" +} + +=== Apply saved plan: resize fails with INVALID_STATE, falls back to edit + +>>> [CLI] bundle deploy --plan plan.json +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/[UNIQUE_NAME]/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> print_requests.py //clusters/resize //clusters/edit +{ + "method": "POST", + "path": "/api/2.1/clusters/resize", + "body": { + "cluster_id": "[CLUSTER_ID]", + "num_workers": 3 + } +} +{ + "method": "POST", + "path": "/api/2.1/clusters/edit", + "body": { + "autotermination_minutes": 60, + "cluster_id": "[CLUSTER_ID]", + "cluster_name": "test-cluster-[UNIQUE_NAME]", + "instance_pool_id": "[TEST_INSTANCE_POOL_ID]", + "num_workers": 3, + "spark_version": "13.3.x-snapshot-scala2.12" + } +} + +=== Cluster should have new num_workers + +>>> [CLI] clusters get [CLUSTER_ID] +{ + "cluster_name": "test-cluster-[UNIQUE_NAME]", + "num_workers": 3 +} + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.clusters.test_cluster + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/[UNIQUE_NAME] + +Deleting files... +Destroy complete! diff --git a/acceptance/bundle/resources/clusters/resize-terminated-fallback/script b/acceptance/bundle/resources/clusters/resize-terminated-fallback/script new file mode 100644 index 0000000000..258d8e8d37 --- /dev/null +++ b/acceptance/bundle/resources/clusters/resize-terminated-fallback/script @@ -0,0 +1,27 @@ +envsubst < databricks.yml.tmpl > databricks.yml + +cleanup() { + trace $CLI bundle destroy --auto-approve + rm -f out.requests.txt +} +trap cleanup EXIT + +trace $CLI bundle deploy + +CLUSTER_ID=$($CLI bundle summary -o json | jq -r '.resources.clusters.test_cluster.id') +echo "$CLUSTER_ID:CLUSTER_ID" >> ACC_REPLS + +title "Create a plan while cluster is running: should show resize\n" +update_file.py databricks.yml "num_workers: 2" "num_workers: 3" +trace $CLI bundle plan -o json > plan.json + +title "Terminate the cluster before applying the saved plan\n" +$CLI clusters delete "$CLUSTER_ID" > /dev/null +trace $CLI clusters get "$CLUSTER_ID" | jq '{cluster_name,num_workers,state}' + +title "Apply saved plan: resize fails with INVALID_STATE, falls back to edit\n" +trace $CLI bundle deploy --plan plan.json +trace print_requests.py //clusters/resize //clusters/edit + +title "Cluster should have new num_workers\n" +trace $CLI clusters get "$CLUSTER_ID" | jq '{cluster_name,num_workers}' diff --git a/acceptance/bundle/resources/clusters/resize-terminated-fallback/test.toml b/acceptance/bundle/resources/clusters/resize-terminated-fallback/test.toml new file mode 100644 index 0000000000..3d30b20670 --- /dev/null +++ b/acceptance/bundle/resources/clusters/resize-terminated-fallback/test.toml @@ -0,0 +1,8 @@ +Local = true +Cloud = true +RecordRequests = true + +Ignore = [".databricks", "databricks.yml", "plan.json"] + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/bundle/direct/apply.go b/bundle/direct/apply.go index 7b6d2b3f05..d227d7f564 100644 --- a/bundle/direct/apply.go +++ b/bundle/direct/apply.go @@ -44,7 +44,7 @@ func (d *DeploymentUnit) Deploy(ctx context.Context, db *dstate.DeploymentState, case deployplan.UpdateWithID: return d.UpdateWithID(ctx, db, oldID, newState) case deployplan.Resize: - return d.Resize(ctx, db, oldID, newState) + return d.Resize(ctx, db, oldID, newState, planEntry) default: return fmt.Errorf("internal error: unexpected actionType: %#v", actionType) } @@ -246,8 +246,8 @@ func (d *DeploymentUnit) Delete(ctx context.Context, db *dstate.DeploymentState, return nil } -func (d *DeploymentUnit) Resize(ctx context.Context, db *dstate.DeploymentState, id string, newState any) error { - err := retryOnTransientErr(ctx, func() error { return d.Adapter.DoResize(ctx, id, newState) }) +func (d *DeploymentUnit) Resize(ctx context.Context, db *dstate.DeploymentState, id string, newState any, entry *deployplan.PlanEntry) error { + err := retryOnTransientErr(ctx, func() error { return d.Adapter.DoResize(ctx, id, newState, entry) }) if err != nil { return fmt.Errorf("resizing id=%s: %w", id, err) } diff --git a/bundle/direct/dresources/adapter.go b/bundle/direct/dresources/adapter.go index e38bb61107..70a9f1f5e3 100644 --- a/bundle/direct/dresources/adapter.go +++ b/bundle/direct/dresources/adapter.go @@ -65,7 +65,7 @@ type IResource interface { DoUpdateWithID(ctx context.Context, id string, newState any) (newID string, remoteState any, e error) // [Optional] DoResize resizes the resource. Only supported by clusters - DoResize(ctx context.Context, id string, newState any) error + DoResize(ctx context.Context, id string, newState any, entry *PlanEntry) error // [Optional] WaitAfterCreate waits for the resource to become ready after creation. Returns optionally updated remote state. // TODO: wait status should be persisted in the state. @@ -494,12 +494,12 @@ func (a *Adapter) DoUpdateWithID(ctx context.Context, oldID string, newState any return id, remoteState, nil } -func (a *Adapter) DoResize(ctx context.Context, id string, newState any) error { +func (a *Adapter) DoResize(ctx context.Context, id string, newState any, entry *PlanEntry) error { if a.doResize == nil { return errors.New("internal error: DoResize not found") } - _, err := a.doResize.Call(ctx, id, newState) + _, err := a.doResize.Call(ctx, id, newState, entry) return err } diff --git a/bundle/direct/dresources/cluster.go b/bundle/direct/dresources/cluster.go index b41de45301..6ded283c9f 100644 --- a/bundle/direct/dresources/cluster.go +++ b/bundle/direct/dresources/cluster.go @@ -8,6 +8,7 @@ import ( "github.com/databricks/cli/bundle/config/resources" "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/structs/structpath" "github.com/databricks/cli/libs/utils" "github.com/databricks/databricks-sdk-go" @@ -241,13 +242,25 @@ func (r *ResourceCluster) WaitAfterCreate(ctx context.Context, id string, config return nil, nil } -func (r *ResourceCluster) DoResize(ctx context.Context, id string, config *ClusterState) error { +func (r *ResourceCluster) DoResize(ctx context.Context, id string, config *ClusterState, entry *PlanEntry) error { _, err := r.client.Clusters.Resize(ctx, compute.ResizeCluster{ ClusterId: id, NumWorkers: config.NumWorkers, Autoscale: config.Autoscale, ForceSendFields: utils.FilterFields[compute.ResizeCluster](config.ForceSendFields), }) + if err == nil { + return nil + } + + apiErr, ok := errors.AsType[*apierr.APIError](err) + if !ok || apiErr.ErrorCode != "INVALID_STATE" { + return err + } + + // Cluster is not running; fall back to the full clusters/edit path. + log.Debugf(ctx, "cluster %s: resize returned INVALID_STATE (%s), falling back to edit", id, err) + _, err = r.DoUpdate(ctx, id, config, entry) return err } diff --git a/libs/testserver/clusters.go b/libs/testserver/clusters.go index b6e1c6c171..178c240a9e 100644 --- a/libs/testserver/clusters.go +++ b/libs/testserver/clusters.go @@ -58,6 +58,14 @@ func (s *FakeWorkspace) ClustersResize(req Request) any { return Response{StatusCode: 404} } + // Only running clusters can be resized; match the real API behavior. + if cluster.State != compute.StateRunning { + return Response{ + StatusCode: 400, + Body: map[string]string{"error_code": "INVALID_STATE", "message": "Cluster is not running"}, + } + } + cluster.NumWorkers = request.NumWorkers cluster.Autoscale = request.Autoscale s.Clusters[request.ClusterId] = cluster