From 67f24ce39702b91c607c55b5175b62e32c49089a Mon Sep 17 00:00:00 2001
From: Rajas Bansal <rbansal@together.ai>
Date: Thu, 30 Apr 2026 20:23:58 -0700
Subject: [PATCH] add the pronunciation dict

---
 openapi.yaml | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/openapi.yaml b/openapi.yaml
index d446f7c..2cf5f37 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -3866,12 +3866,15 @@ paths:
         - Parameters: Sent as query parameters (model, voice, max_partial_length, language)
 
         **Client Events:**
-        - `tts_session.updated`: Update session parameters like voice
+        - `tts_session.updated`: Update session parameters like voice. The `session` object also accepts an `extra_params` field for additional model-specific parameters that fine-tune speech generation behavior, such as `pronunciation_dict`, a list of pronunciation rules for specific characters or symbols. Each entry uses the format `"<source>/<replacement>"` (e.g., `["omg/oh my god"]`) to override how the model pronounces matching tokens.
           ```json
           {
             "type": "tts_session.updated",
             "session": {
-              "voice": "tara"
+              "voice": "tara",
+              "extra_params": {
+                "pronunciation_dict": ["omg/oh my god"]
+              }
             }
           }
           ```
@@ -10128,6 +10131,16 @@ components:
           type: boolean
           default: false
           description: 'If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream'
+        extra_params:  # model-specific speech generation options
+          type: object
+          description: 'Additional model-specific parameters that fine-tune speech generation behavior.'
+          properties:
+            pronunciation_dict:
+              type: array
+              items:  # each item is one "<source>/<replacement>" rule string
+                type: string
+              description: 'A list of pronunciation rules for specific characters or symbols. Each entry uses the format `"<source>/<replacement>"` (e.g., `["omg/oh my god"]`) to override how the model pronounces matching tokens.'
+              example: ['omg/oh my god']
 
     AudioTranscriptionRequest:
       type: object