Skip to content

Commit 0ab79b9

Browse files
hangfeicopybara-github
authored andcommitted
fix: Deprecate save_live_audio in favor of save_live_blob and fix audio event saving
This change: * Deprecates the `save_live_audio` configuration option in `RunConfig`, introducing `save_live_blob` as its replacement. A warning is issued if `save_live_audio` is used. * Updates `base_llm_flow.py` to use the new `save_live_blob` flag. * Ensures that audio events generated by `audio_cache_manager.py` are properly appended to the session service. * Adds a utility script `pcm_audio_player.py` for playing raw PCM audio files. Input sample event: 14a5859f-6b6c-46ed-9f28-e5008793b1c6|live_bidi_streaming_multi_agent|user|1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c|e-7a28a060-29bf-4483-bc5c-17248698a897|1762916981.5932|{"content":{"parts":[{"file_data":{"file_uri":"artifact://live_bidi_streaming_multi_agent/user/1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c/_adk_live/adk_live_audio_storage_input_audio_1762916981593.pcm#0","mime_type":"audio/pcm"}}],"role":"user"},"invocation_id":"e-7a28a060-29bf-4483-bc5c-17248698a897","author":"user","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{},"requested_tool_confirmations":{}},"id":"14a5859f-6b6c-46ed-9f28-e5008793b1c6","timestamp":1762916981.5932002} output sample event: 506c9df4-e143-4ebc-90a5-2f5b2eb26754|live_bidi_streaming_multi_agent|user|1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c|e-7a28a060-29bf-4483-bc5c-17248698a897|1762916986.10579|{"content":{"parts":[{"file_data":{"file_uri":"artifact://live_bidi_streaming_multi_agent/user/1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c/_adk_live/adk_live_audio_storage_output_audio_1762916986105.pcm;rate=24000#0","mime_type":"audio/pcm;rate=24000"}}],"role":"model"},"invocation_id":"e-7a28a060-29bf-4483-bc5c-17248698a897","author":"model","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{},"requested_tool_confirmations":{}},"id":"506c9df4-e143-4ebc-90a5-2f5b2eb26754","timestamp":1762916986.105794} Co-authored-by: Hangfei Lin <hangfei@google.com> PiperOrigin-RevId: 831512074
1 parent 675ecaa commit 0ab79b9

7 files changed

Lines changed: 137 additions & 50 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
import numpy as np
15+
import sounddevice as sd
16+
17+
# Utility script: plays a headerless (raw) PCM file through the default audio
# output device. Point FILE_PATH at the recording you want to listen to.

# Input audio example. Replace with the input audio you want to test.
INPUT_FILE_PATH = 'adk_live_audio_storage_input_audio_1762910896736.pcm'
# Output audio example. Replace with the output audio you want to test.
OUTPUT_FILE_PATH = (
    'adk_live_audio_storage_output_audio_1762910893258.pcm;rate=24000'
)
# The file that is actually played. Defaults to the output audio example.
FILE_PATH = OUTPUT_FILE_PATH

# Playback sample rate. The original note states the PCM rate is always 24,000
# for input and output.
# NOTE(review): the output file name carries `rate=24000`, the input one
# carries no rate -- confirm the input rate before relying on this.
SAMPLE_RATE = 24000
CHANNELS = 1
DTYPE = np.int16  # Common types: int16, float32

# Read the raw bytes; the file can be closed before decoding and playback.
with open(FILE_PATH, 'rb') as f:
  raw_data = f.read()

# Raw PCM has no header, so a truncated recording may end in the middle of a
# frame. Drop the trailing partial frame: otherwise np.frombuffer raises when
# the byte count is not a multiple of the sample size, and reshape raises when
# the sample count is not a multiple of CHANNELS.
frame_size = np.dtype(DTYPE).itemsize * CHANNELS
usable_bytes = len(raw_data) - len(raw_data) % frame_size
audio_array = np.frombuffer(raw_data[:usable_bytes], dtype=DTYPE)

# Reshape if stereo (interleaved)
if CHANNELS > 1:
  audio_array = audio_array.reshape((-1, CHANNELS))

# Play, then block until playback has finished.
print('Playing...')
sd.play(audio_array, SAMPLE_RATE)
sd.wait()

contributing/samples/live_bidi_streaming_single_agent/agent.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ async def check_prime(nums: list[int]) -> str:
6565

6666

6767
root_agent = Agent(
68-
model='gemini-live-2.5-flash-preview-native-audio-09-2025', # vertex
69-
# model='gemini-2.5-flash-native-audio-preview-09-2025', # for AI studio
68+
# model='gemini-live-2.5-flash-preview-native-audio-09-2025', # vertex
69+
model='gemini-2.5-flash-native-audio-preview-09-2025', # for AI studio
7070
# key
7171
name='roll_dice_agent',
7272
description=(

src/google/adk/agents/run_config.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
import sys
2020
from typing import Any
2121
from typing import Optional
22+
import warnings
2223

2324
from google.genai import types
2425
from pydantic import BaseModel
2526
from pydantic import ConfigDict
2627
from pydantic import Field
2728
from pydantic import field_validator
29+
from pydantic import model_validator
2830

2931
logger = logging.getLogger('google_adk.' + __name__)
3032

@@ -103,11 +105,17 @@ class RunConfig(BaseModel):
103105
)
104106
"""Configuration for context window compression. If set, this will enable context window compression for LLM input."""
105107

106-
save_live_audio: bool = False
107-
"""Saves live video and audio data to session and artifact service.
108+
save_live_blob: bool = False
109+
"""Saves live video and audio data to session and artifact service."""
108110

109-
Right now, only audio is supported.
110-
"""
111+
save_live_audio: bool = Field(
112+
default=False,
113+
deprecated=True,
114+
description=(
115+
'DEPRECATED: Use save_live_blob instead. If set to True, it saves'
116+
' live video and audio data to session and artifact service.'
117+
),
118+
)
111119

112120
max_llm_calls: int = 500
113121
"""
@@ -122,6 +130,21 @@ class RunConfig(BaseModel):
122130
custom_metadata: Optional[dict[str, Any]] = None
123131
"""Custom metadata for the current invocation."""
124132

133+
@model_validator(mode='before')
@classmethod
def check_for_deprecated_save_live_audio(cls, data: Any) -> Any:
  """Maps the deprecated `save_live_audio` option onto `save_live_blob`.

  Runs before field validation. Whenever `save_live_audio` is present in the
  raw input a `DeprecationWarning` is issued; if its value is truthy,
  `save_live_blob` is switched on as well.

  Args:
    data: The raw input being validated. Only `dict` input is inspected; any
      other input is returned untouched.

  Returns:
    The data to validate. When `save_live_audio` is truthy this is a shallow
    copy of the input with `save_live_blob` set to True; otherwise it is the
    input object itself.
  """
  if not isinstance(data, dict) or 'save_live_audio' not in data:
    return data

  warnings.warn(
      'The `save_live_audio` config is deprecated and will be removed in a'
      ' future release. Please use `save_live_blob` instead.',
      DeprecationWarning,
      stacklevel=2,
  )
  if data['save_live_audio']:
    # Build a new dict instead of writing into the caller's mapping, so that
    # e.g. `RunConfig.model_validate(config_dict)` leaves `config_dict` as is.
    return {**data, 'save_live_blob': True}
  return data
147+
125148
@field_validator('max_llm_calls', mode='after')
126149
@classmethod
127150
def validate_max_llm_calls(cls, value: int) -> int:

src/google/adk/flows/llm_flows/audio_cache_manager.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ async def flush_caches(
8686
invocation_context: InvocationContext,
8787
flush_user_audio: bool = True,
8888
flush_model_audio: bool = True,
89-
) -> None:
89+
) -> list[Event]:
9090
"""Flush audio caches to artifact services.
9191
9292
The multimodality data is saved in artifact service in the format of
@@ -101,32 +101,40 @@ async def flush_caches(
101101
invocation_context: The invocation context containing audio caches.
102102
flush_user_audio: Whether to flush the input (user) audio cache.
103103
flush_model_audio: Whether to flush the output (model) audio cache.
104+
105+
Returns:
106+
A list of Event objects created from the flushed caches.
104107
"""
108+
flushed_events = []
105109
if flush_user_audio and invocation_context.input_realtime_cache:
106-
flush_success = await self._flush_cache_to_services(
110+
audio_event = await self._flush_cache_to_services(
107111
invocation_context,
108112
invocation_context.input_realtime_cache,
109113
'input_audio',
110114
)
111-
if flush_success:
115+
if audio_event:
116+
flushed_events.append(audio_event)
112117
invocation_context.input_realtime_cache = []
113118

114119
if flush_model_audio and invocation_context.output_realtime_cache:
115120
logger.debug('Flushed output audio cache')
116-
flush_success = await self._flush_cache_to_services(
121+
audio_event = await self._flush_cache_to_services(
117122
invocation_context,
118123
invocation_context.output_realtime_cache,
119124
'output_audio',
120125
)
121-
if flush_success:
126+
if audio_event:
127+
flushed_events.append(audio_event)
122128
invocation_context.output_realtime_cache = []
123129

130+
return flushed_events
131+
124132
async def _flush_cache_to_services(
125133
self,
126134
invocation_context: InvocationContext,
127135
audio_cache: list[RealtimeCacheEntry],
128136
cache_type: str,
129-
) -> bool:
137+
) -> Event | None:
130138
"""Flush a list of audio cache entries to artifact services.
131139
132140
The artifact service stores the actual blob. The session stores the
@@ -138,11 +146,11 @@ async def _flush_cache_to_services(
138146
cache_type: Type identifier for the cache ('input_audio' or 'output_audio').
139147
140148
Returns:
141-
True if the cache was successfully flushed, False otherwise.
149+
The created Event if the cache was successfully flushed, None otherwise.
142150
"""
143151
if not invocation_context.artifact_service or not audio_cache:
144152
logger.debug('Skipping cache flush: no artifact service or empty cache')
145-
return False
153+
return None
146154

147155
try:
148156
# Combine audio chunks into a single file
@@ -201,7 +209,7 @@ async def _flush_cache_to_services(
201209

202210
except Exception as e:
203211
logger.error('Failed to flush %s cache: %s', cache_type, e)
204-
return False
212+
return None
205213

206214
def get_cache_stats(
207215
self, invocation_context: InvocationContext

src/google/adk/flows/llm_flows/base_llm_flow.py

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
from ...tools.tool_context import ToolContext
5050
from ...utils.context_utils import Aclosing
5151
from .audio_cache_manager import AudioCacheManager
52-
from .transcription_manager import TranscriptionManager
5352

5453
if TYPE_CHECKING:
5554
from ...agents.llm_agent import LlmAgent
@@ -82,7 +81,6 @@ def __init__(self):
8281

8382
# Initialize configuration and managers
8483
self.audio_cache_manager = AudioCacheManager()
85-
self.transcription_manager = TranscriptionManager()
8684

8785
async def run_live(
8886
self,
@@ -246,16 +244,6 @@ async def _send_to_model(
246244
elif live_request.activity_end:
247245
await llm_connection.send_realtime(types.ActivityEnd())
248246
elif live_request.blob:
249-
# Cache audio data here for transcription
250-
if not invocation_context.transcription_cache:
251-
invocation_context.transcription_cache = []
252-
if not invocation_context.run_config.input_audio_transcription:
253-
# if the live model's input transcription is not enabled, then
254-
# we use our own audio transcriber to achieve that.
255-
invocation_context.transcription_cache.append(
256-
TranscriptionEntry(role='user', data=live_request.blob)
257-
)
258-
259247
# Cache input audio chunks before flushing
260248
self.audio_cache_manager.cache_audio(
261249
invocation_context, live_request.blob, cache_type='input'
@@ -324,7 +312,7 @@ def get_author_for_event(llm_response):
324312
# Cache output audio chunks from model responses
325313
# TODO: support video data
326314
if (
327-
invocation_context.run_config.save_live_audio
315+
invocation_context.run_config.save_live_blob
328316
and event.content
329317
and event.content.parts
330318
and event.content.parts[0].inline_data
@@ -603,14 +591,13 @@ async def _postprocess_live(
603591
return
604592

605593
# Flush audio caches based on control events using configurable settings
606-
if invocation_context.run_config.save_live_audio:
607-
_handle_control_event_flush_event = (
608-
await self._handle_control_event_flush(
609-
invocation_context, llm_response
610-
)
594+
if invocation_context.run_config.save_live_blob:
595+
flushed_events = await self._handle_control_event_flush(
596+
invocation_context, llm_response
611597
)
612-
if _handle_control_event_flush_event:
613-
yield _handle_control_event_flush_event
598+
for event in flushed_events:
599+
yield event
600+
if flushed_events:
614601
return
615602

616603
# Builds the event.
@@ -925,12 +912,15 @@ def _finalize_model_response_event(
925912

926913
async def _handle_control_event_flush(
927914
self, invocation_context: InvocationContext, llm_response: LlmResponse
928-
) -> None:
915+
) -> list[Event]:
929916
"""Handle audio cache flushing based on control events.
930917
931918
Args:
932919
invocation_context: The invocation context containing audio caches.
933920
llm_response: The LLM response containing control event information.
921+
922+
Returns:
923+
A list of Event objects created from the flushed caches.
934924
"""
935925

936926
# Log cache statistics if enabled
@@ -959,6 +949,7 @@ async def _handle_control_event_flush(
959949
flush_user_audio=False,
960950
flush_model_audio=True,
961951
)
952+
return []
962953

963954
async def _run_and_handle_error(
964955
self,

src/google/adk/flows/llm_flows/contents.py

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -648,10 +648,10 @@ def _is_request_confirmation_event(event: Event) -> bool:
648648
return _is_function_call_event(event, REQUEST_CONFIRMATION_FUNCTION_CALL_NAME)
649649

650650

651-
def _is_live_model_audio_event(event: Event) -> bool:
652-
"""Check if the event is an audio event produced by live/bidi models
651+
def _is_live_model_audio_event_with_inline_data(event: Event) -> bool:
652+
"""Check if the event is a live/bidi audio event with inline data.
653653
654-
There are two possible cases:
654+
There are two possible cases and we only care about the second case:
655655
content=Content(
656656
parts=[
657657
Part(
@@ -676,24 +676,15 @@ def _is_live_model_audio_event(event: Event) -> bool:
676676
) grounding_metadata=None partial=None turn_complete=None finish_reason=None
677677
error_code=None error_message=None ...
678678
"""
679-
if not event.content:
680-
return False
681-
if not event.content.parts:
679+
if not event.content or not event.content.parts:
682680
return False
683-
# If it's audio data, then one event only has one part of audio.
684681
for part in event.content.parts:
685682
if (
686683
part.inline_data
687684
and part.inline_data.mime_type
688685
and part.inline_data.mime_type.startswith('audio/')
689686
):
690687
return True
691-
if (
692-
part.file_data
693-
and part.file_data.mime_type
694-
and part.file_data.mime_type.startswith('audio/')
695-
):
696-
return True
697688
return False
698689

699690

src/google/adk/runners.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,12 @@ def _should_append_event(self, event: Event, is_live_call: bool) -> bool:
596596
# transcription events should not be appended.
597597
# Function call and function response events should be appended.
598598
# Other control events should be appended.
599-
if is_live_call and contents._is_live_model_audio_event(event):
599+
if is_live_call and contents._is_live_model_audio_event_with_inline_data(
600+
event
601+
):
602+
# We don't append live model audio events with inline data to avoid
603+
# storing large blobs in the session. However, events with file_data
604+
# (references to artifacts) should be appended.
600605
return False
601606
return True
602607

@@ -738,6 +743,36 @@ async def run_live(
738743
) -> AsyncGenerator[Event, None]:
739744
"""Runs the agent in live mode (experimental feature).
740745
746+
The `run_live` method yields a stream of `Event` objects, but not all
747+
yielded events are saved to the session. Here's a breakdown:
748+
749+
**Events Yielded to Callers:**
750+
* **Live Model Audio Events with Inline Data:** Events containing raw
751+
audio `Blob` data (`inline_data`).
752+
* **Live Model Audio Events with File Data:** Both input and output audio
753+
data are aggregated into an audio file saved into artifacts. The
754+
reference to the file is saved in the event as `file_data`.
755+
* **Usage Metadata:** Events containing token usage.
756+
* **Transcription Events:** Both partial and non-partial transcription
757+
events are yielded.
758+
* **Function Call and Response Events:** Always yielded.
759+
* **Other Control Events:** Most control events are yielded.
760+
761+
**Events Saved to the Session:**
762+
* **Live Model Audio Events with File Data:** Both input and output audio
763+
data are aggregated into an audio file saved into artifacts. The
764+
reference to the file is saved to the session as an event with `file_data`
765+
if `RunConfig.save_live_blob` is True.
766+
* **Usage Metadata Events:** Saved to the session.
767+
* **Non-Partial Transcription Events:** Non-partial transcription events
768+
are saved.
769+
* **Function Call and Response Events:** Always saved.
770+
* **Other Control Events:** Most control events are saved.
771+
772+
**Events Not Saved to the Session:**
773+
* **Live Model Audio Events with Inline Data:** Events containing raw
774+
audio `Blob` data are **not** saved to the session.
775+
741776
Args:
742777
user_id: The user ID for the session. Required if `session` is None.
743778
session_id: The session ID for the session. Required if `session` is

0 commit comments

Comments
 (0)