From 6c36986edc23b569713864cc21738fd11efc86d3 Mon Sep 17 00:00:00 2001
From: Michael Choi <michael.choi@scale.com>
Date: Fri, 23 Jan 2026 09:54:13 -0800
Subject: [PATCH] Add new parameters to LaunchClient methods

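- Add optional concurrent_requests_per_worker (default None) to create_model_endpoint and
  edit_model_endpoint, and pass it through unchanged
- Add optional forwarder_type, routes, extra_routes, worker_command, and worker_env (all
  default None) to create_model_bundle_from_runnable_image_v2, and pass them through unchanged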

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 launch/client.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/launch/client.py b/launch/client.py
index 89a686a9..db96c54e 100644
--- a/launch/client.py
+++ b/launch/client.py
@@ -679,6 +679,11 @@ def create_model_bundle_from_runnable_image_v2(
         env: Dict[str, str],
         readiness_initial_delay_seconds: int,
         metadata: Optional[Dict[str, Any]] = None,
+        forwarder_type: Optional[str] = None,
+        routes: Optional[List[str]] = None,
+        extra_routes: Optional[List[str]] = None,
+        worker_command: Optional[List[str]] = None,
+        worker_env: Optional[Dict[str, str]] = None,
     ) -> CreateModelBundleV2Response:
         """
         Create a model bundle from a runnable image. The specified ``command`` must start a process
@@ -711,6 +716,16 @@ def create_model_bundle_from_runnable_image_v2(
 
             metadata: Metadata to record with the bundle.
 
+            forwarder_type: The type of forwarder to use for the bundle.
+
+            routes: A list of routes that the bundle will serve.
+
+            extra_routes: A list of additional routes that the bundle will serve.
+
+            worker_command: The command to start worker processes.
+
+            worker_env: A dictionary of environment variables for worker processes.
+
         Returns:
             An object containing the following keys:
 
@@ -728,6 +743,11 @@ def create_model_bundle_from_runnable_image_v2(
                 env=env,
                 protocol="http",
                 readiness_initial_delay_seconds=readiness_initial_delay_seconds,
+                forwarder_type=forwarder_type,
+                routes=routes,
+                extra_routes=extra_routes,
+                worker_command=worker_command,
+                worker_env=worker_env,
             )
         )
         create_model_bundle_request = CreateModelBundleV2Request(
@@ -1400,6 +1420,7 @@ def create_model_endpoint(
         min_workers: int = 1,
         max_workers: int = 1,
         per_worker: int = 10,
+        concurrent_requests_per_worker: Optional[int] = None,
         gpu_type: Optional[str] = None,
         endpoint_type: str = "sync",
         high_priority: Optional[bool] = False,
@@ -1464,6 +1485,9 @@ def create_model_endpoint(
                 concurrent requests in the workload. Divide this number by ``max_workers``. Doing
                 this ensures that the number of workers will "climb" to ``max_workers``.
 
+            concurrent_requests_per_worker: The maximum number of concurrent requests that each
+                worker can handle. If not specified, the server will use a default value.
+
             gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
                 requested. Here are the supported values:
 
@@ -1530,6 +1554,7 @@ def create_model_endpoint(
                 min_workers=min_workers,
                 max_workers=max_workers,
                 per_worker=per_worker,
+                concurrent_requests_per_worker=concurrent_requests_per_worker,
                 gpu_type=gpu_type,
                 high_priority=high_priority,
                 default_callback_url=default_callback_url,
@@ -1584,6 +1609,7 @@ def create_model_endpoint(
                     model_bundle_id=model_bundle.id,
                     name=endpoint_name,
                     per_worker=per_worker,
+                    concurrent_requests_per_worker=concurrent_requests_per_worker,
                     high_priority=high_priority,
                     post_inference_hooks=post_inference_hooks_strs,
                     default_callback_url=default_callback_url,
@@ -1621,6 +1647,7 @@ def edit_model_endpoint(
         min_workers: Optional[int] = None,
         max_workers: Optional[int] = None,
         per_worker: Optional[int] = None,
+        concurrent_requests_per_worker: Optional[int] = None,
         gpu_type: Optional[str] = None,
         high_priority: Optional[bool] = None,
         post_inference_hooks: Optional[List[PostInferenceHooks]] = None,
@@ -1671,6 +1698,9 @@ def edit_model_endpoint(
                 ``per_worker``, then the number of workers will be increased to meet the elevated
                 traffic.
 
+            concurrent_requests_per_worker: The maximum number of concurrent requests that each
+                worker can handle. If not specified, the server will use a default value.
+
             gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
                 requested. Here are the supported values:
 
@@ -1764,6 +1794,7 @@ def edit_model_endpoint(
                 min_workers=min_workers,
                 model_bundle_id=model_bundle_id,
                 per_worker=per_worker,
+                concurrent_requests_per_worker=concurrent_requests_per_worker,
                 high_priority=high_priority,
                 post_inference_hooks=post_inference_hooks_strs,
                 default_callback_url=default_callback_url,