Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions launch/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,11 @@ def create_model_bundle_from_runnable_image_v2(
env: Dict[str, str],
readiness_initial_delay_seconds: int,
metadata: Optional[Dict[str, Any]] = None,
forwarder_type: Optional[str] = None,
routes: Optional[List[str]] = None,
extra_routes: Optional[List[str]] = None,
worker_command: Optional[List[str]] = None,
worker_env: Optional[Dict[str, str]] = None,
) -> CreateModelBundleV2Response:
"""
Create a model bundle from a runnable image. The specified ``command`` must start a process
Expand Down Expand Up @@ -711,6 +716,16 @@ def create_model_bundle_from_runnable_image_v2(

metadata: Metadata to record with the bundle.

forwarder_type: The type of forwarder to use for the bundle.

routes: A list of routes that the bundle will serve.

extra_routes: A list of additional routes that the bundle will serve.

worker_command: The command to start worker processes.

worker_env: A dictionary of environment variables for worker processes.

Returns:
An object containing the following keys:

Expand All @@ -728,6 +743,11 @@ def create_model_bundle_from_runnable_image_v2(
env=env,
protocol="http",
readiness_initial_delay_seconds=readiness_initial_delay_seconds,
forwarder_type=forwarder_type,
routes=routes,
extra_routes=extra_routes,
worker_command=worker_command,
worker_env=worker_env,
)
)
create_model_bundle_request = CreateModelBundleV2Request(
Expand Down Expand Up @@ -1400,6 +1420,7 @@ def create_model_endpoint(
min_workers: int = 1,
max_workers: int = 1,
per_worker: int = 10,
concurrent_requests_per_worker: Optional[int] = None,
gpu_type: Optional[str] = None,
endpoint_type: str = "sync",
high_priority: Optional[bool] = False,
Expand Down Expand Up @@ -1464,6 +1485,9 @@ def create_model_endpoint(
concurrent requests in the workload. Divide this number by ``max_workers``. Doing
this ensures that the number of workers will "climb" to ``max_workers``.

concurrent_requests_per_worker: The maximum number of concurrent requests that each
worker can handle. If not specified, the server will use a default value.

gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
requested. Here are the supported values:

Expand Down Expand Up @@ -1530,6 +1554,7 @@ def create_model_endpoint(
min_workers=min_workers,
max_workers=max_workers,
per_worker=per_worker,
concurrent_requests_per_worker=concurrent_requests_per_worker,
gpu_type=gpu_type,
high_priority=high_priority,
default_callback_url=default_callback_url,
Expand Down Expand Up @@ -1584,6 +1609,7 @@ def create_model_endpoint(
model_bundle_id=model_bundle.id,
name=endpoint_name,
per_worker=per_worker,
concurrent_requests_per_worker=concurrent_requests_per_worker,
high_priority=high_priority,
post_inference_hooks=post_inference_hooks_strs,
default_callback_url=default_callback_url,
Expand Down Expand Up @@ -1621,6 +1647,7 @@ def edit_model_endpoint(
min_workers: Optional[int] = None,
max_workers: Optional[int] = None,
per_worker: Optional[int] = None,
concurrent_requests_per_worker: Optional[int] = None,
gpu_type: Optional[str] = None,
high_priority: Optional[bool] = None,
post_inference_hooks: Optional[List[PostInferenceHooks]] = None,
Expand Down Expand Up @@ -1671,6 +1698,9 @@ def edit_model_endpoint(
``per_worker``, then the number of workers will be increased to meet the elevated
traffic.

concurrent_requests_per_worker: The maximum number of concurrent requests that each
worker can handle. If not specified, the server will use a default value.

gpu_type: If specifying a non-zero number of gpus, this controls the type of gpu
requested. Here are the supported values:

Expand Down Expand Up @@ -1764,6 +1794,7 @@ def edit_model_endpoint(
min_workers=min_workers,
model_bundle_id=model_bundle_id,
per_worker=per_worker,
concurrent_requests_per_worker=concurrent_requests_per_worker,
high_priority=high_priority,
post_inference_hooks=post_inference_hooks_strs,
default_callback_url=default_callback_url,
Expand Down