scaleapi · dmchoiboi · Jan 23, 2026
diff --git a/launch/client.py b/launch/client.py
@@ -679,6 +679,11 @@ def create_model_bundle_from_runnable_image_v2(
         env: Dict[str, str],
         readiness_initial_delay_seconds: int,
         metadata: Optional[Dict[str, Any]] = None,
+        forwarder_type: Optional[str] = None,
+        routes: Optional[List[str]] = None,
+        extra_routes: Optional[List[str]] = None,
+        worker_command: Optional[List[str]] = None,
+        worker_env: Optional[Dict[str, str]] = None,
     ) -> CreateModelBundleV2Response:
         """
         Create a model bundle from a runnable image. The specified ``command`` must start a process
@@ -711,6 +716,16 @@ def create_model_bundle_from_runnable_image_v2(
 
             metadata: Metadata to record with the bundle.
 
+            forwarder_type: The type of forwarder to use for the bundle.
+
+            routes: A list of routes that the bundle will serve.
+
+            extra_routes: A list of additional routes that the bundle will serve.
+
+            worker_command: The command to start worker processes.
+
+            worker_env: A dictionary of environment variables for worker processes.
+
         Returns:
             An object containing the following keys:
 
@@ -728,6 +743,11 @@ def create_model_bundle_from_runnable_image_v2(
                 env=env,
                 protocol="http",
                 readiness_initial_delay_seconds=readiness_initial_delay_seconds,
+                forwarder_type=forwarder_type,
+                routes=routes,
+                extra_routes=extra_routes,
+                worker_command=worker_command,
+                worker_env=worker_env,
             )
         )
         create_model_bundle_request = CreateModelBundleV2Request(
@@ -1400,6 +1420,7 @@ def create_model_endpoint(
         min_workers: int = 1,
         max_workers: int = 1,
         per_worker: int = 10,
+        concurrent_requests_per_worker: Optional[int] = None,
         gpu_type: Optional[str] = None,
         endpoint_type: str = "sync",
         high_priority: Optional[bool] = False,
@@ -1464,6 +1485,9 @@ def create_model_endpoint(
                 concurrent requests in the workload. Divide this number by ``max_workers``. Doing
                 this ensures that the number of workers will "climb" to ``max_workers``.
 
+            concurrent_requests_per_worker: The maximum number of concurrent requests that each
+                worker can handle. If not specified, the server will use a default value.
+
             gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
                 requested. Here are the supported values:
 
@@ -1530,6 +1554,7 @@ def create_model_endpoint(
                 min_workers=min_workers,
                 max_workers=max_workers,
                 per_worker=per_worker,
+                concurrent_requests_per_worker=concurrent_requests_per_worker,
                 gpu_type=gpu_type,
                 high_priority=high_priority,
                 default_callback_url=default_callback_url,
@@ -1584,6 +1609,7 @@ def create_model_endpoint(
                     model_bundle_id=model_bundle.id,
                     name=endpoint_name,
                     per_worker=per_worker,
+                    concurrent_requests_per_worker=concurrent_requests_per_worker,
                     high_priority=high_priority,
                     post_inference_hooks=post_inference_hooks_strs,
                     default_callback_url=default_callback_url,
@@ -1621,6 +1647,7 @@ def edit_model_endpoint(
         min_workers: Optional[int] = None,
         max_workers: Optional[int] = None,
         per_worker: Optional[int] = None,
+        concurrent_requests_per_worker: Optional[int] = None,
         gpu_type: Optional[str] = None,
         high_priority: Optional[bool] = None,
         post_inference_hooks: Optional[List[PostInferenceHooks]] = None,
@@ -1671,6 +1698,9 @@ def edit_model_endpoint(
                 ``per_worker``, then the number of workers will be increased to meet the elevated
                 traffic.
 
+            concurrent_requests_per_worker: The maximum number of concurrent requests that each
+                worker can handle. If not specified, the server will use a default value.
+
             gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
                 requested. Here are the supported values:
 
@@ -1764,6 +1794,7 @@ def edit_model_endpoint(
                 min_workers=min_workers,
                 model_bundle_id=model_bundle_id,
                 per_worker=per_worker,
+                concurrent_requests_per_worker=concurrent_requests_per_worker,
                 high_priority=high_priority,
                 post_inference_hooks=post_inference_hooks_strs,
                 default_callback_url=default_callback_url,