From 4be8ffa318e8a74a0a8170b0987ca7701fd93660 Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Wed, 8 Apr 2026 11:17:25 +0800 Subject: [PATCH 1/5] add turn detection protobufs --- .../agent/livekit_agent_turn_detector.proto | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 protobufs/agent/livekit_agent_turn_detector.proto diff --git a/protobufs/agent/livekit_agent_turn_detector.proto b/protobufs/agent/livekit_agent_turn_detector.proto new file mode 100644 index 000000000..de17b08ae --- /dev/null +++ b/protobufs/agent/livekit_agent_turn_detector.proto @@ -0,0 +1,142 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +syntax = "proto3"; + +package livekit.agent; + +option go_package = "github.com/livekit/protocol/livekit/agent"; +option csharp_namespace = "LiveKit.Proto"; +option ruby_package = "LiveKit::Proto"; +option optimize_for = SPEED; + +import "agent/livekit_agent_session.proto"; +import "google/protobuf/duration.proto"; +import "google/protobuf/timestamp.proto"; + +enum TdAudioEncoding { + TD_AUDIO_ENCODING_OPUS = 0; +} + +message TdSessionSettings { + uint32 sample_rate = 1; + TdAudioEncoding encoding = 2; +} + +message TdInferenceStats { + google.protobuf.Duration e2e_latency = 1; + google.protobuf.Duration preprocessing_duration = 2; + google.protobuf.Duration inference_duration = 3; +} + +message TdError { + string message = 1; + // error code follows the HTTP status code convention + // 4xx for client errors + // 5xx for server errors + uint32 code = 2; +} + +// --- Client -> Server --- + +message TdSessionCreate { + TdSessionSettings settings = 1; +} + +message TdInputAudio { + bytes audio = 1; + google.protobuf.Timestamp created_at = 2; +} + +message TdInputChatContext { + repeated ChatMessage messages = 1; +} + +message TdSessionFlush {} + +message TdSessionClose {} + +message TdInferenceStart { + string request_id = 1; +} + +message TdInferenceStop { + string request_id = 1; +} + +message TdClientMessage { + oneof message { + TdSessionCreate session_create = 1; + TdInputAudio input_audio = 2; + TdInputChatContext input_chat_context = 3; + TdSessionFlush session_flush = 4; + TdSessionClose session_close = 5; + TdInferenceStart inference_start = 6; + TdInferenceStop inference_stop = 7; + } + google.protobuf.Timestamp created_at = 8; +} + +// --- Server -> Model --- + +message TdInferenceRequest { + bytes audio = 1; + string assistant_text = 2; + TdAudioEncoding encoding = 3; + uint32 sample_rate = 4; +} + +// --- Model -> Server --- + +message TdInferenceResponse { + float probability = 1; + TdInferenceStats stats = 2; +} + +// --- Server -> Client --- + +message TdSessionCreated {} + +message TdProcessingStats { + google.protobuf.Timestamp earliest_client_created_at = 1; + google.protobuf.Timestamp latest_client_created_at = 2; + // server-side E2E latency + google.protobuf.Duration e2e_latency = 3; + // stats including model-side E2E latency + TdInferenceStats inference_stats = 4; +} + +message TdEouPrediction { + float probability = 1; + TdProcessingStats processing_stats = 2; +} + +message TdInferenceStarted {} + +message TdInferenceStopped {} + +message TdSessionClosed {} + +message TdServerMessage { + oneof message { + TdSessionCreated session_created = 1; + TdInferenceStarted inference_started = 2; + TdInferenceStopped inference_stopped = 3; + TdEouPrediction eou_prediction = 4; + TdSessionClosed session_closed = 5; + TdError error = 6; + } + optional string request_id = 7; + google.protobuf.Timestamp server_created_at = 8; + optional google.protobuf.Timestamp client_created_at = 9; +} From 073003beb279edf537c0210405617b289d78148f Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Thu, 9 Apr 2026 16:50:30 +0800 Subject: [PATCH 2/5] Create many-seas-fry.md --- .changeset/many-seas-fry.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/many-seas-fry.md diff --git a/.changeset/many-seas-fry.md b/.changeset/many-seas-fry.md new file mode 100644 index 000000000..a31f49c77 --- /dev/null +++ b/.changeset/many-seas-fry.md @@ -0,0 +1,5 @@ +--- +"@fake-scope/fake-pkg": patch +--- + +Add turn detection protobufs From d8da1d27ef6723c303a2e5d843ab5dc6c70d8f5d Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Thu, 9 Apr 2026 22:21:47 +0800 Subject: [PATCH 3/5] add num_samples in each audio frame --- protobufs/agent/livekit_agent_turn_detector.proto | 2 ++ 1 file changed, 2 insertions(+) diff --git a/protobufs/agent/livekit_agent_turn_detector.proto b/protobufs/agent/livekit_agent_turn_detector.proto index de17b08ae..ac4c08cca 100644 --- a/protobufs/agent/livekit_agent_turn_detector.proto +++ b/protobufs/agent/livekit_agent_turn_detector.proto @@ -56,6 +56,8 @@ message TdSessionCreate { message TdInputAudio { bytes audio = 1; google.protobuf.Timestamp created_at = 2; + // used for buffer size calculation + uint32 num_samples = 3; } message TdInputChatContext { From 8cc28aa1ee57739e5f803b9393bff61e53f4bbee Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Fri, 10 Apr 2026 15:02:03 +0800 Subject: [PATCH 4/5] add barge in --- protobufs/agent/livekit_agent_inference.proto | 190 ++++++++++++++++++ .../agent/livekit_agent_turn_detector.proto | 144 ------------- 2 files changed, 190 insertions(+), 144 deletions(-) create mode 100644 protobufs/agent/livekit_agent_inference.proto delete mode 100644 protobufs/agent/livekit_agent_turn_detector.proto diff --git a/protobufs/agent/livekit_agent_inference.proto b/protobufs/agent/livekit_agent_inference.proto new file mode 100644 index 000000000..1b27fe37e --- /dev/null +++ b/protobufs/agent/livekit_agent_inference.proto @@ -0,0 +1,190 @@ +// Copyright 2026 LiveKit, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +syntax = "proto3"; + +package livekit.agent; + +option go_package = "github.com/livekit/protocol/livekit/agent"; +option csharp_namespace = "LiveKit.Proto"; +option ruby_package = "LiveKit::Proto"; +option optimize_for = SPEED; + +import "agent/livekit_agent_session.proto"; +import "google/protobuf/duration.proto"; +import "google/protobuf/timestamp.proto"; + +// --- Shared Types --- + +enum AudioEncoding { + AUDIO_ENCODING_PCM_S16LE = 0; + AUDIO_ENCODING_OPUS = 1; +} + +message SessionSettings { + uint32 sample_rate = 1; + AudioEncoding encoding = 2; + oneof type_settings { + TdSettings td_settings = 3; + BiSettings bi_settings = 4; + } +} + +message InferenceStats { + google.protobuf.Duration e2e_latency = 1; + google.protobuf.Duration preprocessing_duration = 2; + google.protobuf.Duration inference_duration = 3; +} + +message Error { + string message = 1; + // error code follows the HTTP status code convention + // 4xx for client errors + // 5xx for server errors + uint32 code = 2; +} + +message ProcessingStats { + google.protobuf.Timestamp earliest_client_created_at = 1; + google.protobuf.Timestamp latest_client_created_at = 2; + google.protobuf.Duration e2e_latency = 3; + InferenceStats inference_stats = 4; +} + +// --- Turn Detection Settings --- + +message TdSettings { + float detection_interval = 1; +} + +// --- Barge-in Settings --- + +message BiSettings { + float threshold = 1; + uint32 min_frames = 2; + float max_audio_duration = 3; + float audio_prefix_duration = 4; + float detection_interval = 5; +} + +// --- Client -> Server --- + +message SessionCreate { + SessionSettings settings = 1; +} + +message InputAudio { + bytes audio = 1; + google.protobuf.Timestamp created_at = 2; + uint32 num_samples = 3; +} + +message TdInputChatContext { + repeated ChatMessage messages = 1; +} + +message SessionFlush {} + +message SessionClose {} + +message InferenceStart { + string request_id = 1; +} + +message InferenceStop { + string request_id = 1; +} + +message BufferStart {} + +message BufferStop {} + +message ClientMessage { + google.protobuf.Timestamp created_at = 1; + oneof message { + SessionCreate session_create = 2; + InputAudio input_audio = 3; + SessionFlush session_flush = 4; + SessionClose session_close = 5; + InferenceStart inference_start = 6; + InferenceStop inference_stop = 7; + BufferStart buffer_start = 8; + BufferStop buffer_stop = 9; + // only for turn detection + TdInputChatContext td_input_chat_context = 10; + } +} + +// --- Server -> Model --- + +message TdInferenceRequest { + bytes audio = 1; + string assistant_text = 2; + AudioEncoding encoding = 3; + uint32 sample_rate = 4; +} + +message TdInferenceResponse { + float probability = 1; + InferenceStats stats = 2; +} + +message BiInferenceRequest { + bytes audio = 1; + AudioEncoding encoding = 2; + uint32 sample_rate = 3; +} + +message BiInferenceResponse { + bool is_bargein = 1; + repeated float probabilities = 2; + InferenceStats stats = 3; +} + +// --- Server -> Client --- + +message SessionCreated {} + +message InferenceStarted {} + +message InferenceStopped {} + +message SessionClosed {} + +message TdPrediction { + float probability = 1; + ProcessingStats processing_stats = 2; +} + +message BiPrediction { + bool is_bargein = 1; + repeated float probabilities = 2; + ProcessingStats processing_stats = 3; + int64 created_at = 4; + float prediction_duration = 5; +} + +message ServerMessage { + google.protobuf.Timestamp server_created_at = 1; + optional string request_id = 2; + optional google.protobuf.Timestamp client_created_at = 3; + oneof message { + SessionCreated session_created = 4; + InferenceStarted inference_started = 5; + InferenceStopped inference_stopped = 6; + SessionClosed session_closed = 7; + Error error = 8; + TdPrediction td_prediction = 9; + BiPrediction bi_prediction = 10; + } +} diff --git a/protobufs/agent/livekit_agent_turn_detector.proto b/protobufs/agent/livekit_agent_turn_detector.proto deleted file mode 100644 index ac4c08cca..000000000 --- a/protobufs/agent/livekit_agent_turn_detector.proto +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2026 LiveKit, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -syntax = "proto3"; - -package livekit.agent; - -option go_package = "github.com/livekit/protocol/livekit/agent"; -option csharp_namespace = "LiveKit.Proto"; -option ruby_package = "LiveKit::Proto"; -option optimize_for = SPEED; - -import "agent/livekit_agent_session.proto"; -import "google/protobuf/duration.proto"; -import "google/protobuf/timestamp.proto"; - -enum TdAudioEncoding { - TD_AUDIO_ENCODING_OPUS = 0; -} - -message TdSessionSettings { - uint32 sample_rate = 1; - TdAudioEncoding encoding = 2; -} - -message TdInferenceStats { - google.protobuf.Duration e2e_latency = 1; - google.protobuf.Duration preprocessing_duration = 2; - google.protobuf.Duration inference_duration = 3; -} - -message TdError { - string message = 1; - // error code follows the HTTP status code convention - // 4xx for client errors - // 5xx for server errors - uint32 code = 2; -} - -// --- Client -> Server --- - -message TdSessionCreate { - TdSessionSettings settings = 1; -} - -message TdInputAudio { - bytes audio = 1; - google.protobuf.Timestamp created_at = 2; - // used for buffer size calculation - uint32 num_samples = 3; -} - -message TdInputChatContext { - repeated ChatMessage messages = 1; -} - -message TdSessionFlush {} - -message TdSessionClose {} - -message TdInferenceStart { - string request_id = 1; -} - -message TdInferenceStop { - string request_id = 1; -} - -message TdClientMessage { - oneof message { - TdSessionCreate session_create = 1; - TdInputAudio input_audio = 2; - TdInputChatContext input_chat_context = 3; - TdSessionFlush session_flush = 4; - TdSessionClose session_close = 5; - TdInferenceStart inference_start = 6; - TdInferenceStop inference_stop = 7; - } - google.protobuf.Timestamp created_at = 8; -} - -// --- Server -> Model --- - -message TdInferenceRequest { - bytes audio = 1; - string assistant_text = 2; - TdAudioEncoding encoding = 3; - uint32 sample_rate = 4; -} - -// --- Model -> Server --- - -message TdInferenceResponse { - float probability = 1; - TdInferenceStats stats = 2; -} - -// --- Server -> Client --- - -message TdSessionCreated {} - -message TdProcessingStats { - google.protobuf.Timestamp earliest_client_created_at = 1; - google.protobuf.Timestamp latest_client_created_at = 2; - // server-side E2E latency - google.protobuf.Duration e2e_latency = 3; - // stats including model-side E2E latency - TdInferenceStats inference_stats = 4; -} - -message TdEouPrediction { - float probability = 1; - TdProcessingStats processing_stats = 2; -} - -message TdInferenceStarted {} - -message TdInferenceStopped {} - -message TdSessionClosed {} - -message TdServerMessage { - oneof message { - TdSessionCreated session_created = 1; - TdInferenceStarted inference_started = 2; - TdInferenceStopped inference_stopped = 3; - TdEouPrediction eou_prediction = 4; - TdSessionClosed session_closed = 5; - TdError error = 6; - } - optional string request_id = 7; - google.protobuf.Timestamp server_created_at = 8; - optional google.protobuf.Timestamp client_created_at = 9; -} From f4d080fb25a80260772f656f4656f94b4df16b46 Mon Sep 17 00:00:00 2001 From: Chenghao Mou Date: Fri, 10 Apr 2026 15:16:16 +0800 Subject: [PATCH 5/5] add request, response, and prediction --- protobufs/agent/livekit_agent_inference.proto | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/protobufs/agent/livekit_agent_inference.proto b/protobufs/agent/livekit_agent_inference.proto index 1b27fe37e..aa0479036 100644 --- a/protobufs/agent/livekit_agent_inference.proto +++ b/protobufs/agent/livekit_agent_inference.proto @@ -134,23 +134,38 @@ message TdInferenceRequest { uint32 sample_rate = 4; } -message TdInferenceResponse { - float probability = 1; - InferenceStats stats = 2; -} - message BiInferenceRequest { bytes audio = 1; AudioEncoding encoding = 2; uint32 sample_rate = 3; } +message InferenceRequest { + oneof request { + TdInferenceRequest td_inference_request = 1; + BiInferenceRequest bi_inference_request = 2; + } +} + + +message TdInferenceResponse { + float probability = 1; + InferenceStats stats = 2; +} + message BiInferenceResponse { bool is_bargein = 1; repeated float probabilities = 2; InferenceStats stats = 3; } +message InferenceResponse { + oneof response { + TdInferenceResponse td_inference_response = 1; + BiInferenceResponse bi_inference_response = 2; + } +} + // --- Server -> Client --- message SessionCreated {} @@ -174,6 +189,13 @@ message BiPrediction { float prediction_duration = 5; } +message Prediction { + oneof prediction { + TdPrediction td_prediction = 1; + BiPrediction bi_prediction = 2; + } +} + message ServerMessage { google.protobuf.Timestamp server_created_at = 1; optional string request_id = 2; @@ -184,7 +206,6 @@ message ServerMessage { InferenceStopped inference_stopped = 6; SessionClosed session_closed = 7; Error error = 8; - TdPrediction td_prediction = 9; - BiPrediction bi_prediction = 10; + Prediction prediction = 9; } }