Skip to content

Commit c1a87d9

Browse files
feat(api): custom voices
1 parent 0a4ca53 commit c1a87d9

12 files changed

Lines changed: 141 additions & 70 deletions

.stats.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 152
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-3e207c26eea3b15837c78ef2fe0e1c68937708fd0763971ce749c0bdb7db6376.yml
3-
openapi_spec_hash: 626982004d5a594a822fa7883422efb4
4-
config_hash: 0dda4b3af379312c9c55467a5e1e1ec0
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-cb3e4451108eed58d59cff25bf77ec0dc960ec9c6f3dba68f90e7a9847c09d21.yml
3+
openapi_spec_hash: dec6d9be64a5ba8f474a1f2a7a4fafef
4+
config_hash: e922f01e25accd07d8fd3641c37fbd62

src/openai/resources/audio/speech.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,7 @@ def create(
5252
*,
5353
input: str,
5454
model: Union[str, SpeechModel],
55-
voice: Union[
56-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]
57-
],
55+
voice: speech_create_params.Voice,
5856
instructions: str | Omit = omit,
5957
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | Omit = omit,
6058
speed: float | Omit = omit,
@@ -80,8 +78,9 @@ def create(
8078
8179
voice: The voice to use when generating the audio. Supported built-in voices are
8280
`alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
83-
`shimmer`, `verse`, `marin`, and `cedar`. Previews of the voices are available
84-
in the
81+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
82+
object with an `id`, for example `{ "id": "voice_1234" }`. Previews of the
83+
voices are available in the
8584
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
8685
8786
instructions: Control the voice of your generated audio with additional instructions. Does not
@@ -153,9 +152,7 @@ async def create(
153152
*,
154153
input: str,
155154
model: Union[str, SpeechModel],
156-
voice: Union[
157-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]
158-
],
155+
voice: speech_create_params.Voice,
159156
instructions: str | Omit = omit,
160157
response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] | Omit = omit,
161158
speed: float | Omit = omit,
@@ -181,8 +178,9 @@ async def create(
181178
182179
voice: The voice to use when generating the audio. Supported built-in voices are
183180
`alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`,
184-
`shimmer`, `verse`, `marin`, and `cedar`. Previews of the voices are available
185-
in the
181+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
182+
object with an `id`, for example `{ "id": "voice_1234" }`. Previews of the
183+
voices are available in the
186184
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
187185
188186
instructions: Control the voice of your generated audio with additional instructions. Does not

src/openai/types/audio/speech_create_params.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, Required, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .speech_model import SpeechModel
99

10-
__all__ = ["SpeechCreateParams"]
10+
__all__ = ["SpeechCreateParams", "Voice", "VoiceID"]
1111

1212

1313
class SpeechCreateParams(TypedDict, total=False):
@@ -20,14 +20,13 @@ class SpeechCreateParams(TypedDict, total=False):
2020
`tts-1`, `tts-1-hd`, `gpt-4o-mini-tts`, or `gpt-4o-mini-tts-2025-12-15`.
2121
"""
2222

23-
voice: Required[
24-
Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
25-
]
23+
voice: Required[Voice]
2624
"""The voice to use when generating the audio.
2725
2826
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`,
29-
`fable`, `onyx`, `nova`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
30-
Previews of the voices are available in the
27+
`fable`, `onyx`, `nova`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. You
28+
may also provide a custom voice object with an `id`, for example
29+
`{ "id": "voice_1234" }`. Previews of the voices are available in the
3130
[Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech#voice-options).
3231
"""
3332

@@ -55,3 +54,15 @@ class SpeechCreateParams(TypedDict, total=False):
5554
Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or
5655
`tts-1-hd`.
5756
"""
57+
58+
59+
class VoiceID(TypedDict, total=False):
60+
"""Custom voice reference."""
61+
62+
id: Required[str]
63+
"""The custom voice ID, e.g. `voice_1234`."""
64+
65+
66+
Voice: TypeAlias = Union[
67+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
68+
]

src/openai/types/chat/chat_completion_audio_param.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,21 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, Required, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

8-
__all__ = ["ChatCompletionAudioParam"]
8+
__all__ = ["ChatCompletionAudioParam", "Voice", "VoiceID"]
9+
10+
11+
class VoiceID(TypedDict, total=False):
12+
"""Custom voice reference."""
13+
14+
id: Required[str]
15+
"""The custom voice ID, e.g. `voice_1234`."""
16+
17+
18+
Voice: TypeAlias = Union[
19+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
20+
]
921

1022

1123
class ChatCompletionAudioParam(TypedDict, total=False):
@@ -21,11 +33,11 @@ class ChatCompletionAudioParam(TypedDict, total=False):
2133
Must be one of `wav`, `mp3`, `flac`, `opus`, or `pcm16`.
2234
"""
2335

24-
voice: Required[
25-
Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
26-
]
36+
voice: Required[Voice]
2737
"""The voice the model uses to respond.
2838
2939
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`,
30-
`fable`, `nova`, `onyx`, `sage`, `shimmer`, `marin`, and `cedar`.
40+
`fable`, `nova`, `onyx`, `sage`, `shimmer`, `marin`, and `cedar`. You may also
41+
provide a custom voice object with an `id`, for example
42+
`{ "id": "voice_1234" }`.
3143
"""

src/openai/types/realtime/realtime_audio_config_output.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,24 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

33
from typing import Union, Optional
4-
from typing_extensions import Literal
4+
from typing_extensions import Literal, TypeAlias
55

66
from ..._models import BaseModel
77
from .realtime_audio_formats import RealtimeAudioFormats
88

9-
__all__ = ["RealtimeAudioConfigOutput"]
9+
__all__ = ["RealtimeAudioConfigOutput", "Voice", "VoiceID"]
10+
11+
12+
class VoiceID(BaseModel):
13+
"""Custom voice reference."""
14+
15+
id: str
16+
"""The custom voice ID, e.g. `voice_1234`."""
17+
18+
19+
Voice: TypeAlias = Union[
20+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
21+
]
1022

1123

1224
class RealtimeAudioConfigOutput(BaseModel):
@@ -24,13 +36,12 @@ class RealtimeAudioConfigOutput(BaseModel):
2436
generated, it's also possible to prompt the model to speak faster or slower.
2537
"""
2638

27-
voice: Union[
28-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
29-
] = None
39+
voice: Optional[Voice] = None
3040
"""The voice the model uses to respond.
3141
3242
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
33-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
34-
session once the model has responded with audio at least once. We recommend
35-
`marin` and `cedar` for best quality.
43+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
44+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
45+
changed during the session once the model has responded with audio at least
46+
once. We recommend `marin` and `cedar` for best quality.
3647
"""

src/openai/types/realtime/realtime_audio_config_output_param.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,23 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .realtime_audio_formats_param import RealtimeAudioFormatsParam
99

10-
__all__ = ["RealtimeAudioConfigOutputParam"]
10+
__all__ = ["RealtimeAudioConfigOutputParam", "Voice", "VoiceID"]
11+
12+
13+
class VoiceID(TypedDict, total=False):
14+
"""Custom voice reference."""
15+
16+
id: Required[str]
17+
"""The custom voice ID, e.g. `voice_1234`."""
18+
19+
20+
Voice: TypeAlias = Union[
21+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], VoiceID
22+
]
1123

1224

1325
class RealtimeAudioConfigOutputParam(TypedDict, total=False):
@@ -25,11 +37,12 @@ class RealtimeAudioConfigOutputParam(TypedDict, total=False):
2537
generated, it's also possible to prompt the model to speak faster or slower.
2638
"""
2739

28-
voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
40+
voice: Voice
2941
"""The voice the model uses to respond.
3042
3143
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
32-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
33-
session once the model has responded with audio at least once. We recommend
34-
`marin` and `cedar` for best quality.
44+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
45+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
46+
changed during the session once the model has responded with audio at least
47+
once. We recommend `marin` and `cedar` for best quality.
3548
"""

src/openai/types/realtime/realtime_response_create_audio_output.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,38 @@
11
# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
22

33
from typing import Union, Optional
4-
from typing_extensions import Literal
4+
from typing_extensions import Literal, TypeAlias
55

66
from ..._models import BaseModel
77
from .realtime_audio_formats import RealtimeAudioFormats
88

9-
__all__ = ["RealtimeResponseCreateAudioOutput", "Output"]
9+
__all__ = ["RealtimeResponseCreateAudioOutput", "Output", "OutputVoice", "OutputVoiceID"]
10+
11+
12+
class OutputVoiceID(BaseModel):
13+
"""Custom voice reference."""
14+
15+
id: str
16+
"""The custom voice ID, e.g. `voice_1234`."""
17+
18+
19+
OutputVoice: TypeAlias = Union[
20+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], OutputVoiceID
21+
]
1022

1123

1224
class Output(BaseModel):
1325
format: Optional[RealtimeAudioFormats] = None
1426
"""The format of the output audio."""
1527

16-
voice: Union[
17-
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], None
18-
] = None
28+
voice: Optional[OutputVoice] = None
1929
"""The voice the model uses to respond.
2030
2131
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
22-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
23-
session once the model has responded with audio at least once.
32+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
33+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
34+
changed during the session once the model has responded with audio at least
35+
once. We recommend `marin` and `cedar` for best quality.
2436
"""
2537

2638

src/openai/types/realtime/realtime_response_create_audio_output_param.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,37 @@
33
from __future__ import annotations
44

55
from typing import Union
6-
from typing_extensions import Literal, TypedDict
6+
from typing_extensions import Literal, Required, TypeAlias, TypedDict
77

88
from .realtime_audio_formats_param import RealtimeAudioFormatsParam
99

10-
__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output"]
10+
__all__ = ["RealtimeResponseCreateAudioOutputParam", "Output", "OutputVoice", "OutputVoiceID"]
11+
12+
13+
class OutputVoiceID(TypedDict, total=False):
14+
"""Custom voice reference."""
15+
16+
id: Required[str]
17+
"""The custom voice ID, e.g. `voice_1234`."""
18+
19+
20+
OutputVoice: TypeAlias = Union[
21+
str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"], OutputVoiceID
22+
]
1123

1224

1325
class Output(TypedDict, total=False):
1426
format: RealtimeAudioFormatsParam
1527
"""The format of the output audio."""
1628

17-
voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]]
29+
voice: OutputVoice
1830
"""The voice the model uses to respond.
1931
2032
Supported built-in voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`,
21-
`shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during the
22-
session once the model has responded with audio at least once.
33+
`shimmer`, `verse`, `marin`, and `cedar`. You may also provide a custom voice
34+
object with an `id`, for example `{ "id": "voice_1234" }`. Voice cannot be
35+
changed during the session once the model has responded with audio at least
36+
once. We recommend `marin` and `cedar` for best quality.
2337
"""
2438

2539

tests/api_resources/audio/test_speech.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def test_method_create(self, client: OpenAI, respx_mock: MockRouter) -> None:
2828
speech = client.audio.speech.create(
2929
input="string",
3030
model="string",
31-
voice="ash",
31+
voice="string",
3232
)
3333
assert isinstance(speech, _legacy_response.HttpxBinaryResponseContent)
3434
assert speech.json() == {"foo": "bar"}
@@ -40,7 +40,7 @@ def test_method_create_with_all_params(self, client: OpenAI, respx_mock: MockRou
4040
speech = client.audio.speech.create(
4141
input="string",
4242
model="string",
43-
voice="ash",
43+
voice="string",
4444
instructions="instructions",
4545
response_format="mp3",
4646
speed=0.25,
@@ -57,7 +57,7 @@ def test_raw_response_create(self, client: OpenAI, respx_mock: MockRouter) -> No
5757
response = client.audio.speech.with_raw_response.create(
5858
input="string",
5959
model="string",
60-
voice="ash",
60+
voice="string",
6161
)
6262

6363
assert response.is_closed is True
@@ -72,7 +72,7 @@ def test_streaming_response_create(self, client: OpenAI, respx_mock: MockRouter)
7272
with client.audio.speech.with_streaming_response.create(
7373
input="string",
7474
model="string",
75-
voice="ash",
75+
voice="string",
7676
) as response:
7777
assert not response.is_closed
7878
assert response.http_request.headers.get("X-Stainless-Lang") == "python"
@@ -95,7 +95,7 @@ async def test_method_create(self, async_client: AsyncOpenAI, respx_mock: MockRo
9595
speech = await async_client.audio.speech.create(
9696
input="string",
9797
model="string",
98-
voice="ash",
98+
voice="string",
9999
)
100100
assert isinstance(speech, _legacy_response.HttpxBinaryResponseContent)
101101
assert speech.json() == {"foo": "bar"}
@@ -107,7 +107,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncOpenAI, re
107107
speech = await async_client.audio.speech.create(
108108
input="string",
109109
model="string",
110-
voice="ash",
110+
voice="string",
111111
instructions="instructions",
112112
response_format="mp3",
113113
speed=0.25,
@@ -124,7 +124,7 @@ async def test_raw_response_create(self, async_client: AsyncOpenAI, respx_mock:
124124
response = await async_client.audio.speech.with_raw_response.create(
125125
input="string",
126126
model="string",
127-
voice="ash",
127+
voice="string",
128128
)
129129

130130
assert response.is_closed is True
@@ -139,7 +139,7 @@ async def test_streaming_response_create(self, async_client: AsyncOpenAI, respx_
139139
async with async_client.audio.speech.with_streaming_response.create(
140140
input="string",
141141
model="string",
142-
voice="ash",
142+
voice="string",
143143
) as response:
144144
assert not response.is_closed
145145
assert response.http_request.headers.get("X-Stainless-Lang") == "python"

0 commit comments

Comments (0)