fix: Deprecate save_live_audio in favor of save_live_blob and fix audio event saving

hangfei · copybara-github · commit 0ab79b950291 · 2025-11-12T13:16:13.000-08:00
This change:
*   Deprecates the `save_live_audio` configuration option in `RunConfig`, introducing `save_live_blob` as its replacement. A warning is issued if `save_live_audio` is used.
*   Updates `base_llm_flow.py` to use the new `save_live_blob` flag.
*   Ensures that audio events generated by `audio_cache_manager.py` are properly appended to the session service.
*   Adds a utility script `pcm_audio_player.py` for playing raw PCM audio files.

Input sample event: 14a5859f-6b6c-46ed-9f28-e5008793b1c6|live_bidi_streaming_multi_agent|user|1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c|e-7a28a060-29bf-4483-bc5c-17248698a897|1762916981.5932|{"content":{"parts":[{"file_data":{"file_uri":"artifact://live_bidi_streaming_multi_agent/user/1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c/_adk_live/adk_live_audio_storage_input_audio_1762916981593.pcm#0","mime_type":"audio/pcm"}}],"role":"user"},"invocation_id":"e-7a28a060-29bf-4483-bc5c-17248698a897","author":"user","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{},"requested_tool_confirmations":{}},"id":"14a5859f-6b6c-46ed-9f28-e5008793b1c6","timestamp":1762916981.5932002}

Output sample event:
506c9df4-e143-4ebc-90a5-2f5b2eb26754|live_bidi_streaming_multi_agent|user|1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c|e-7a28a060-29bf-4483-bc5c-17248698a897|1762916986.10579|{"content":{"parts":[{"file_data":{"file_uri":"artifact://live_bidi_streaming_multi_agent/user/1e867c7f-dbe1-4268-a7bc-7a9fa5fbd16c/_adk_live/adk_live_audio_storage_output_audio_1762916986105.pcm;rate=24000#0","mime_type":"audio/pcm;rate=24000"}}],"role":"model"},"invocation_id":"e-7a28a060-29bf-4483-bc5c-17248698a897","author":"model","actions":{"state_delta":{},"artifact_delta":{},"requested_auth_configs":{},"requested_tool_confirmations":{}},"id":"506c9df4-e143-4ebc-90a5-2f5b2eb26754","timestamp":1762916986.105794}

Co-authored-by: Hangfei Lin <hangfei@google.com>
PiperOrigin-RevId: 831512074
diff --git a/contributing/samples/live_bidi_debug_utils/pcm_audio_player.py b/contributing/samples/live_bidi_debug_utils/pcm_audio_player.py
@@ -0,0 +1,39 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import sounddevice as sd
+
+# Input audio example. Replace with the input audio you want to test.
+FILE_PATH = 'adk_live_audio_storage_input_audio_1762910896736.pcm'
+# Output audio example (overrides the line above); replace with your own file.
+FILE_PATH = 'adk_live_audio_storage_output_audio_1762910893258.pcm;rate=24000'
+# Sample rate (Hz); assumed 24,000 for input and output. Adjust if needed.
+SAMPLE_RATE = 24000
+CHANNELS = 1
+DTYPE = np.int16  # Common types: int16, float32
+
+# Read and play
+with open(FILE_PATH, 'rb') as f:
+  # Load raw data into numpy array
+  raw_data = f.read()
+  audio_array = np.frombuffer(raw_data, dtype=DTYPE)
+
+  # Reshape if stereo (interleaved)
+  if CHANNELS > 1:
+    audio_array = audio_array.reshape((-1, CHANNELS))
+
+  # Play
+  print('Playing...')
+  sd.play(audio_array, SAMPLE_RATE)
+  sd.wait()
diff --git a/contributing/samples/live_bidi_streaming_single_agent/agent.py b/contributing/samples/live_bidi_streaming_single_agent/agent.py
@@ -65,8 +65,8 @@ async def check_prime(nums: list[int]) -> str:
 
 
 root_agent = Agent(
-    model='gemini-live-2.5-flash-preview-native-audio-09-2025',  # vertex
-    # model='gemini-2.5-flash-native-audio-preview-09-2025',  # for AI studio
+    # model='gemini-live-2.5-flash-preview-native-audio-09-2025',  # vertex
+    model='gemini-2.5-flash-native-audio-preview-09-2025',  # for AI studio
     # key
     name='roll_dice_agent',
     description=(
diff --git a/src/google/adk/agents/run_config.py b/src/google/adk/agents/run_config.py
@@ -19,12 +19,14 @@
 import sys
 from typing import Any
 from typing import Optional
+import warnings
 
 from google.genai import types
 from pydantic import BaseModel
 from pydantic import ConfigDict
 from pydantic import Field
 from pydantic import field_validator
+from pydantic import model_validator
 
 logger = logging.getLogger('google_adk.' + __name__)
 
@@ -103,11 +105,17 @@ class RunConfig(BaseModel):
   )
   """Configuration for context window compression. If set, this will enable context window compression for LLM input."""
 
-  save_live_audio: bool = False
-  """Saves live video and audio data to session and artifact service.
+  save_live_blob: bool = False
+  """Saves live video and audio data to session and artifact service."""
 
-  Right now, only audio is supported.
-  """
+  save_live_audio: bool = Field(
+      default=False,
+      deprecated=True,
+      description=(
+          'DEPRECATED: Use save_live_blob instead. If set to True, it saves'
+          ' live video and audio data to session and artifact service.'
+      ),
+  )
 
   max_llm_calls: int = 500
   """
@@ -122,6 +130,21 @@ class RunConfig(BaseModel):
   custom_metadata: Optional[dict[str, Any]] = None
   """Custom metadata for the current invocation."""
 
+  @model_validator(mode='before')
+  @classmethod
+  def check_for_deprecated_save_live_audio(cls, data: Any) -> Any:
+    """If save_live_audio is passed, use it to set save_live_blob."""
+    if isinstance(data, dict) and 'save_live_audio' in data:
+      warnings.warn(
+          'The `save_live_audio` config is deprecated and will be removed in a'
+          ' future release. Please use `save_live_blob` instead.',
+          DeprecationWarning,
+          stacklevel=2,
+      )
+      if data['save_live_audio']:
+        data['save_live_blob'] = True
+    return data
+
   @field_validator('max_llm_calls', mode='after')
   @classmethod
   def validate_max_llm_calls(cls, value: int) -> int:
diff --git a/src/google/adk/flows/llm_flows/audio_cache_manager.py b/src/google/adk/flows/llm_flows/audio_cache_manager.py
@@ -86,7 +86,7 @@ async def flush_caches(
       invocation_context: InvocationContext,
       flush_user_audio: bool = True,
       flush_model_audio: bool = True,
-  ) -> None:
+  ) -> list[Event]:
     """Flush audio caches to artifact services.
 
     The multimodality data is saved in artifact service in the format of
@@ -101,32 +101,40 @@ async def flush_caches(
       invocation_context: The invocation context containing audio caches.
       flush_user_audio: Whether to flush the input (user) audio cache.
       flush_model_audio: Whether to flush the output (model) audio cache.
+
+    Returns:
+      A list of Event objects created from the flushed caches.
     """
+    flushed_events = []
     if flush_user_audio and invocation_context.input_realtime_cache:
-      flush_success = await self._flush_cache_to_services(
+      audio_event = await self._flush_cache_to_services(
           invocation_context,
           invocation_context.input_realtime_cache,
           'input_audio',
       )
-      if flush_success:
+      if audio_event:
+        flushed_events.append(audio_event)
         invocation_context.input_realtime_cache = []
 
     if flush_model_audio and invocation_context.output_realtime_cache:
       logger.debug('Flushed output audio cache')
-      flush_success = await self._flush_cache_to_services(
+      audio_event = await self._flush_cache_to_services(
           invocation_context,
           invocation_context.output_realtime_cache,
           'output_audio',
       )
-      if flush_success:
+      if audio_event:
+        flushed_events.append(audio_event)
         invocation_context.output_realtime_cache = []
 
+    return flushed_events
+
   async def _flush_cache_to_services(
       self,
       invocation_context: InvocationContext,
       audio_cache: list[RealtimeCacheEntry],
       cache_type: str,
-  ) -> bool:
+  ) -> Event | None:
     """Flush a list of audio cache entries to artifact services.
 
     The artifact service stores the actual blob. The session stores the
@@ -138,11 +146,11 @@ async def _flush_cache_to_services(
       cache_type: Type identifier for the cache ('input_audio' or 'output_audio').
 
     Returns:
-      True if the cache was successfully flushed, False otherwise.
+      The created Event if the cache was successfully flushed, None otherwise.
     """
     if not invocation_context.artifact_service or not audio_cache:
       logger.debug('Skipping cache flush: no artifact service or empty cache')
-      return False
+      return None
 
     try:
       # Combine audio chunks into a single file
@@ -201,7 +209,7 @@ async def _flush_cache_to_services(
 
     except Exception as e:
       logger.error('Failed to flush %s cache: %s', cache_type, e)
-      return False
+      return None
 
   def get_cache_stats(
       self, invocation_context: InvocationContext
diff --git a/src/google/adk/flows/llm_flows/base_llm_flow.py b/src/google/adk/flows/llm_flows/base_llm_flow.py
@@ -49,7 +49,6 @@
 from ...tools.tool_context import ToolContext
 from ...utils.context_utils import Aclosing
 from .audio_cache_manager import AudioCacheManager
-from .transcription_manager import TranscriptionManager
 
 if TYPE_CHECKING:
   from ...agents.llm_agent import LlmAgent
@@ -82,7 +81,6 @@ def __init__(self):
 
     # Initialize configuration and managers
     self.audio_cache_manager = AudioCacheManager()
-    self.transcription_manager = TranscriptionManager()
 
   async def run_live(
       self,
@@ -246,16 +244,6 @@ async def _send_to_model(
       elif live_request.activity_end:
         await llm_connection.send_realtime(types.ActivityEnd())
       elif live_request.blob:
-        # Cache audio data here for transcription
-        if not invocation_context.transcription_cache:
-          invocation_context.transcription_cache = []
-        if not invocation_context.run_config.input_audio_transcription:
-          # if the live model's input transcription is not enabled, then
-          # we use our own audio transcriber to achieve that.
-          invocation_context.transcription_cache.append(
-              TranscriptionEntry(role='user', data=live_request.blob)
-          )
-
         # Cache input audio chunks before flushing
         self.audio_cache_manager.cache_audio(
             invocation_context, live_request.blob, cache_type='input'
@@ -324,7 +312,7 @@ def get_author_for_event(llm_response):
                 # Cache output audio chunks from model responses
                 # TODO: support video data
                 if (
-                    invocation_context.run_config.save_live_audio
+                    invocation_context.run_config.save_live_blob
                     and event.content
                     and event.content.parts
                     and event.content.parts[0].inline_data
@@ -603,14 +591,13 @@ async def _postprocess_live(
       return
 
     # Flush audio caches based on control events using configurable settings
-    if invocation_context.run_config.save_live_audio:
-      _handle_control_event_flush_event = (
-          await self._handle_control_event_flush(
-              invocation_context, llm_response
-          )
+    if invocation_context.run_config.save_live_blob:
+      flushed_events = await self._handle_control_event_flush(
+          invocation_context, llm_response
       )
-      if _handle_control_event_flush_event:
-        yield _handle_control_event_flush_event
+      for event in flushed_events:
+        yield event
+      if flushed_events:
         return
 
     # Builds the event.
@@ -925,12 +912,15 @@ def _finalize_model_response_event(
 
   async def _handle_control_event_flush(
       self, invocation_context: InvocationContext, llm_response: LlmResponse
-  ) -> None:
+  ) -> list[Event]:
     """Handle audio cache flushing based on control events.
 
     Args:
       invocation_context: The invocation context containing audio caches.
       llm_response: The LLM response containing control event information.
+
+    Returns:
+      A list of Event objects created from the flushed caches.
     """
 
     # Log cache statistics if enabled
@@ -959,6 +949,7 @@ async def _handle_control_event_flush(
           flush_user_audio=False,
           flush_model_audio=True,
       )
+    return []
 
   async def _run_and_handle_error(
       self,
diff --git a/src/google/adk/flows/llm_flows/contents.py b/src/google/adk/flows/llm_flows/contents.py
@@ -648,10 +648,10 @@ def _is_request_confirmation_event(event: Event) -> bool:
   return _is_function_call_event(event, REQUEST_CONFIRMATION_FUNCTION_CALL_NAME)
 
 
-def _is_live_model_audio_event(event: Event) -> bool:
-  """Check if the event is an audio event produced by live/bidi models
+def _is_live_model_audio_event_with_inline_data(event: Event) -> bool:
+  """Check if the event is a live/bidi audio event with inline data.
 
-  There are two possible cases:
+  There are two possible cases and we only care about the second case:
   content=Content(
     parts=[
       Part(
@@ -676,24 +676,15 @@ def _is_live_model_audio_event(event: Event) -> bool:
   ) grounding_metadata=None partial=None turn_complete=None finish_reason=None
   error_code=None error_message=None ...
   """
-  if not event.content:
-    return False
-  if not event.content.parts:
+  if not event.content or not event.content.parts:
     return False
-  # If it's audio data, then one event only has one part of audio.
   for part in event.content.parts:
     if (
         part.inline_data
         and part.inline_data.mime_type
         and part.inline_data.mime_type.startswith('audio/')
     ):
       return True
-    if (
-        part.file_data
-        and part.file_data.mime_type
-        and part.file_data.mime_type.startswith('audio/')
-    ):
-      return True
   return False
 
 
diff --git a/src/google/adk/runners.py b/src/google/adk/runners.py
@@ -596,7 +596,12 @@ def _should_append_event(self, event: Event, is_live_call: bool) -> bool:
     # transcription events should not be appended.
     # Function call and function response events should be appended.
     # Other control events should be appended.
-    if is_live_call and contents._is_live_model_audio_event(event):
+    if is_live_call and contents._is_live_model_audio_event_with_inline_data(
+        event
+    ):
+      # We don't append live model audio events with inline data to avoid
+      # storing large blobs in the session. However, events with file_data
+      # (references to artifacts) should be appended.
       return False
     return True
 
@@ -738,6 +743,36 @@ async def run_live(
   ) -> AsyncGenerator[Event, None]:
     """Runs the agent in live mode (experimental feature).
 
+    The `run_live` method yields a stream of `Event` objects, but not all
+    yielded events are saved to the session. Here's a breakdown:
+
+    **Events Yielded to Callers:**
+    *   **Live Model Audio Events with Inline Data:** Events containing raw
+        audio `Blob` data (`inline_data`).
+    *   **Live Model Audio Events with File Data:** Both input and output audio
+        data are aggregated into an audio file saved into artifacts. The
+        reference to the file is saved in the event as `file_data`.
+    *   **Usage Metadata:** Events containing token usage.
+    *   **Transcription Events:** Both partial and non-partial transcription
+        events are yielded.
+    *   **Function Call and Response Events:** Always yielded.
+    *   **Other Control Events:** Most control events are yielded.
+
+    **Events Saved to the Session:**
+    *   **Live Model Audio Events with File Data:** Both input and output audio
+        data are aggregated into an audio file saved into artifacts. The
+        reference to the file is saved to the session as an event's
+        `file_data` if `RunConfig.save_live_blob` is True.
+    *   **Usage Metadata Events:** Saved to the session.
+    *   **Non-Partial Transcription Events:** Non-partial transcription events
+        are saved.
+    *   **Function Call and Response Events:** Always saved.
+    *   **Other Control Events:** Most control events are saved.
+
+    **Events Not Saved to the Session:**
+    *   **Live Model Audio Events with Inline Data:** Events containing raw
+        audio `Blob` data are **not** saved to the session.
+
     Args:
         user_id: The user ID for the session. Required if `session` is None.
         session_id: The session ID for the session. Required if `session` is