Skip to content

Commit c1a87d9

Browse files
feat(api): custom voices
1 parent 0a4ca53 commit c1a87d9

12 files changed

Lines changed: 141 additions & 70 deletions

.stats.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 152
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-3e207c26eea3b15837c78ef2fe0e1c68937708fd0763971ce749c0bdb7db6376.yml
3-
openapi_spec_hash: 626982004d5a594a822fa7883422efb4
4-
config_hash: 0dda4b3af379312c9c55467a5e1e1ec0
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-cb3e4451108eed58d59cff25bf77ec0dc960ec9c6f3dba68f90e7a9847c09d21.yml
3+
openapi_spec_hash: dec6d9be64a5ba8f474a1f2a7a4fafef
4+
config_hash: e922f01e25accd07d8fd3641c37fbd62

src/openai/resources/audio/speech.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,7 @@ def create(
5252
*,
5353
input: str,
5454
model: Union[str, SpeechModel],
55-
voice: Union[
56-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]
57-
],
55+
voice: speech_create_params.Voice,
5856
instructions: str | Omit = omit,
5957
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | Omit = omit,
6058
speed: float | Omit = omit,
@@ -80,8 +78,9 @@ def create(
8078
8179
voice: The voice to use when generating the audio. Supported built-in voices are
8280
`alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
83-
`shimmer`, `verse`, `marin`, and `cedar`. Previews of the voices are available
84-
in the
81+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
82+
object with an `id`, for example `{ "id": "voice_1234" }`. Previews of the
83+
voices are available in the
8584
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
8685
8786
instructions: Control the voice of your generated audio with additional instructions. Does not
@@ -153,9 +152,7 @@ async def create(
153152
*,
154153
input: str,
155154
model: Union[str, SpeechModel],
156-
voice: Union[
157-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]
158-
],
155+
voice: speech_create_params.Voice,
159156
instructions: str | Omit = omit,
160157
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | Omit = omit,
161158
speed: float | Omit = omit,
@@ -181,8 +178,9 @@ async def create(
181178
182179
voice: The voice to use when generating the audio. Supported built-in voices are
183180
`alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
184-
`shimmer`, `verse`, `marin`, and `cedar`. Previews of the voices are available
185-
in the
181+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
182+
object with an `id`, for example `{ "id": "voice_1234" }`. Previews of the
183+
voices are available in the
186184
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
187185
188186
instructions: Control the voice of your generated audio with additional instructions. Does not

src/openai/types/audio/speech_create_params.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, Required, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .speech_model import SpeechModel
99

10-
__all__ = ["SpeechCreateParams"]
10+
__all__ = ["SpeechCreateParams", "Voice", "VoiceID"]
1111

1212

1313
class SpeechCreateParams(TypedDict, total=False):
@@ -20,14 +20,13 @@ class SpeechCreateParams(TypedDict, total=False):
2020
`tts-1`, `tts-1-hd`, `gpt-4o-mini-tts`, or `gpt-4o-mini-tts-2025-12-15`.
2121
"""
2222

23-
voice: Required[
24-
Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
25-
]
23+
voice: Required[Voice]
2624
"""The voice to use when generating the audio.
2725
2826
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`,
29-
`fable`, `onyx`, `nova`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
30-
Previews of the voices are available in the
27+
`fable`, `onyx`, `nova`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. You
28+
may also provide a custom voice object with an `id`, for example
29+
`{ "id": "voice_1234" }`. Previews of the voices are available in the
3130
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
3231
"""
3332

@@ -55,3 +54,15 @@ class SpeechCreateParams(TypedDict, total=False):
5554
Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or
5655
`tts-1-hd`.
5756
"""
57+
58+
59+
class VoiceID(TypedDict, total=False):
60+
"""Custom voice reference."""
61+
62+
id: Required[str]
63+
"""The custom voice ID, e.g. `voice_1234`."""
64+
65+
66+
Voice: TypeAlias = Union[
67+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
68+
]

src/openai/types/chat/chat_completion_audio_param.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,21 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, Required, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

8-
__all__ = ["ChatCompletionAudioParam"]
8+
__all__ = ["ChatCompletionAudioParam", "Voice", "VoiceID"]
9+
10+
11+
class VoiceID(TypedDict, total=False):
12+
"""Custom voice reference."""
13+
14+
id: Required[str]
15+
"""The custom voice ID, e.g. `voice_1234`."""
16+
17+
18+
Voice: TypeAlias = Union[
19+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
20+
]
921

1022

1123
class ChatCompletionAudioParam(TypedDict, total=False):
@@ -21,11 +33,11 @@ class ChatCompletionAudioParam(TypedDict, total=False):
2133
Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
2234
"""
2335

24-
voice: Required[
25-
Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
26-
]
36+
voice: Required[Voice]
2737
"""The voice the model uses to respond.
2838
2939
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`,
30-
`fable`, `nova`, `onyx`, `sage`, `shimmer`, `marin`, and `cedar`.
40+
`fable`, `nova`, `onyx`, `sage`, `shimmer`, `marin`, and `cedar`. You may also
41+
provide a custom voice object with an `id`, for example
42+
`{ "id": "voice_1234" }`.
3143
"""

src/openai/types/realtime/realtime_audio_config_output.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

33
from typing import Union, Optional
4-
from typing_extensions import Literal
4+
from typing_extensions import Literal, TypeAlias
55

66
from ..._models import BaseModel
77
from .realtime_audio_formats import RealtimeAudioFormats
88

9-
__all__ = ["RealtimeAudioConfigOutput"]
9+
__all__ = ["RealtimeAudioConfigOutput", "Voice", "VoiceID"]
10+
11+
12+
class VoiceID(BaseModel):
13+
"""Custom voice reference."""
14+
15+
id: str
16+
"""The custom voice ID, e.g. `voice_1234`."""
17+
18+
19+
Voice: TypeAlias = Union[
20+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
21+
]
1022

1123

1224
class RealtimeAudioConfigOutput(BaseModel):
@@ -24,13 +36,12 @@ class RealtimeAudioConfigOutput(BaseModel):
2436
generated, it's also possible to prompt the model to speak faster or slower.
2537
"""
2638

27-
voice: Union[
28-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
29-
] = None
39+
voice: Optional[Voice] = None
3040
"""The voice the model uses to respond.
3141
3242
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
33-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
34-
session once the model has responded with audio at least once. We recommend
35-
`marin` and `cedar` for best quality.
43+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
44+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
45+
changed during the session once the model has responded with audio at least
46+
once. We recommend `marin` and `cedar` for best quality.
3647
"""

src/openai/types/realtime/realtime_audio_config_output_param.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,23 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .realtime_audio_formats_param import RealtimeAudioFormatsParam
99

10-
__all__ = ["RealtimeAudioConfigOutputParam"]
10+
__all__ = ["RealtimeAudioConfigOutputParam", "Voice", "VoiceID"]
11+
12+
13+
class VoiceID(TypedDict, total=False):
14+
"""Custom voice reference."""
15+
16+
id: Required[str]
17+
"""The custom voice ID, e.g. `voice_1234`."""
18+
19+
20+
Voice: TypeAlias = Union[
21+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
22+
]
1123

1224

1325
class RealtimeAudioConfigOutputParam(TypedDict, total=False):
@@ -25,11 +37,12 @@ class RealtimeAudioConfigOutputParam(TypedDict, total=False):
2537
generated, it's also possible to prompt the model to speak faster or slower.
2638
"""
2739

28-
voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
40+
voice: Voice
2941
"""The voice the model uses to respond.
3042
3143
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
32-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
33-
session once the model has responded with audio at least once. We recommend
34-
`marin` and `cedar` for best quality.
44+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
45+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
46+
changed during the session once the model has responded with audio at least
47+
once. We recommend `marin` and `cedar` for best quality.
3548
"""

src/openai/types/realtime/realtime_response_create_audio_output.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,38 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

33
from typing import Union, Optional
4-
from typing_extensions import Literal
4+
from typing_extensions import Literal, TypeAlias
55

66
from ..._models import BaseModel
77
from .realtime_audio_formats import RealtimeAudioFormats
88

9-
__all__ = ["RealtimeResponseCreateAudioOutput", "Output"]
9+
__all__ = ["RealtimeResponseCreateAudioOutput", "Output", "OutputVoice", "OutputVoiceID"]
10+
11+
12+
class OutputVoiceID(BaseModel):
13+
"""Custom voice reference."""
14+
15+
id: str
16+
"""The custom voice ID, e.g. `voice_1234`."""
17+
18+
19+
OutputVoice: TypeAlias = Union[
20+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], OutputVoiceID
21+
]
1022

1123

1224
class Output(BaseModel):
1325
format: Optional[RealtimeAudioFormats] = None
1426
"""The format of the output audio."""
1527

16-
voice: Union[
17-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
18-
] = None
28+
voice: Optional[OutputVoice] = None
1929
"""The voice the model uses to respond.
2030
2131
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
22-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
23-
session once the model has responded with audio at least once.
32+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
33+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
34+
changed during the session once the model has responded with audio at least
35+
once. We recommend `marin` and `cedar` for best quality.
2436
"""
2537

2638

src/openai/types/realtime/realtime_response_create_audio_output_param.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,37 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .realtime_audio_formats_param import RealtimeAudioFormatsParam
99

10-
__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output"]
10+
__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output", "OutputVoice", "OutputVoiceID"]
11+
12+
13+
class OutputVoiceID(TypedDict, total=False):
14+
"""Custom voice reference."""
15+
16+
id: Required[str]
17+
"""The custom voice ID, e.g. `voice_1234`."""
18+
19+
20+
OutputVoice: TypeAlias = Union[
21+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], OutputVoiceID
22+
]
1123

1224

1325
class Output(TypedDict, total=False):
1426
format: RealtimeAudioFormatsParam
1527
"""The format of the output audio."""
1628

17-
voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
29+
voice: OutputVoice
1830
"""The voice the model uses to respond.
1931
2032
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
21-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
22-
session once the model has responded with audio at least once.
33+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
34+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
35+
changed during the session once the model has responded with audio at least
36+
once. We recommend `marin` and `cedar` for best quality.
2337
"""
2438

2539

tests/api_resources/audio/test_speech.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_method_create(self, client: OpenAI, respx_mock: MockRouter) -> None:
2828
speech = client.audio.speech.create(
2929
input="string",
3030
model="string",
31-
voice="ash",
31+
voice="string",
3232
)
3333
assert isinstance(speech, _legacy_response.HttpxBinaryResponseContent)
3434
assert speech.json() == {"foo": "bar"}
@@ -40,7 +40,7 @@ def test_method_create_with_all_params(self, client: OpenAI, respx_mock: MockRou
4040
speech = client.audio.speech.create(
4141
input="string",
4242
model="string",
43-
voice="ash",
43+
voice="string",
4444
instructions="instructions",
4545
response_format="mp3",
4646
speed=0.25,
@@ -57,7 +57,7 @@ def test_raw_response_create(self, client: OpenAI, respx_mock: MockRouter) -> No
5757
response = client.audio.speech.with_raw_response.create(
5858
input="string",
5959
model="string",
60-
voice="ash",
60+
voice="string",
6161
)
6262

6363
assert response.is_closed is True
@@ -72,7 +72,7 @@ def test_streaming_response_create(self, client: OpenAI, respx_mock: MockRouter)
7272
with client.audio.speech.with_streaming_response.create(
7373
input="string",
7474
model="string",
75-
voice="ash",
75+
voice="string",
7676
) as response:
7777
assert not response.is_closed
7878
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -95,7 +95,7 @@ async def test_method_create(self, async_client: AsyncOpenAI, respx_mock: MockRo
9595
speech = await async_client.audio.speech.create(
9696
input="string",
9797
model="string",
98-
voice="ash",
98+
voice="string",
9999
)
100100
assert isinstance(speech, _legacy_response.HttpxBinaryResponseContent)
101101
assert speech.json() == {"foo": "bar"}
@@ -107,7 +107,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI, re
107107
speech = await async_client.audio.speech.create(
108108
input="string",
109109
model="string",
110-
voice="ash",
110+
voice="string",
111111
instructions="instructions",
112112
response_format="mp3",
113113
speed=0.25,
@@ -124,7 +124,7 @@ async def test_raw_response_create(self, async_client: AsyncOpenAI, respx_mock:
124124
response = await async_client.audio.speech.with_raw_response.create(
125125
input="string",
126126
model="string",
127-
voice="ash",
127+
voice="string",
128128
)
129129

130130
assert response.is_closed is True
@@ -139,7 +139,7 @@ async def test_streaming_response_create(self, async_client: AsyncOpenAI, respx_
139139
async with async_client.audio.speech.with_streaming_response.create(
140140
input="string",
141141
model="string",
142-
voice="ash",
142+
voice="string",
143143
) as response:
144144
assert not response.is_closed
145145
assert response.http_request.headers.get("X-Stainless-Lang") == "python"

0 commit comments

Comments (0)