Skip to content

Commit 7104e3b

Browse files
committed
Realtime GA models working
1 parent 54a653a commit 7104e3b

File tree

11 files changed

+201
-71
lines changed

11 files changed

+201
-71
lines changed

python/samples/concepts/realtime/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ These samples are more complex than most because of the nature of these API's. T
55
To run these samples, you will need to have the following setup:
66

77
- Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set.
8-
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`.
8+
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2025-08-28`.
99
- To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment:
1010
- semantic-kernel[realtime]
1111
- pyaudio

python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
from datetime import datetime
66
from random import randint
77

8-
from azure.identity import AzureCliCredential
9-
108
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
119
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
1210
from semantic_kernel.connectors.ai.open_ai import (
@@ -81,8 +79,12 @@ async def main() -> None:
8179
# and can also be passed in the receive method
8280
# You can also pass in kernel, plugins, chat_history or settings here.
8381
# For WebRTC the audio_track is required
82+
83+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
84+
# for Azure OpenAI realtime deployments.
8485
realtime_agent = AzureRealtimeWebRTC(
85-
audio_track=AudioRecorderWebRTC(), region="swedencentral", plugins=[Helpers()], credential=AzureCliCredential()
86+
audio_track=AudioRecorderWebRTC(),
87+
plugins=[Helpers()],
8688
)
8789

8890
# Create the settings for the session

python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ async def main() -> None:
8282
# to signal the end of the user's turn and start the response.
8383
# manual VAD is not part of this sample
8484
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
85+
86+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
87+
# for Azure OpenAI realtime deployments.
8588
settings = AzureRealtimeExecutionSettings(
8689
instructions="""
8790
You are a chat bot. Your name is Mosscap and

python/samples/concepts/realtime/simple_realtime_chat_webrtc.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55

66
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
77
from semantic_kernel.connectors.ai.open_ai import (
8+
AzureRealtimeExecutionSettings,
89
ListenEvents,
9-
OpenAIRealtimeExecutionSettings,
10-
OpenAIRealtimeWebRTC,
1110
)
11+
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC
1212
from semantic_kernel.contents import RealtimeTextEvent
1313

1414
logging.basicConfig(level=logging.WARNING)
@@ -43,7 +43,7 @@ async def main() -> None:
4343
# create the realtime client and optionally add the audio output function, this is optional
4444
# you can define the protocol to use, either "websocket" or "webrtc"
4545
# they will behave the same way, even though the underlying protocol is quite different
46-
settings = OpenAIRealtimeExecutionSettings(
46+
settings = AzureRealtimeExecutionSettings(
4747
instructions="""
4848
You are a chat bot. Your name is Mosscap and
4949
you have one goal: figure out what people need.
@@ -59,7 +59,12 @@ async def main() -> None:
5959
# Enable both text and audio output to get transcripts
6060
output_modalities=["text", "audio"],
6161
)
62-
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
62+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
63+
# for Azure OpenAI realtime deployments.
64+
realtime_client = AzureRealtimeWebRTC(
65+
audio_track=AudioRecorderWebRTC(),
66+
settings=settings,
67+
)
6368
# Create the settings for the session
6469
audio_player = AudioPlayerWebRTC()
6570
# the context manager calls the create_session method on the client and starts listening to the audio stream
@@ -84,7 +89,7 @@ async def main() -> None:
8489

8590
if __name__ == "__main__":
8691
print(
87-
"Instructions: start speaking. "
92+
"Instructions: start speaking when you see 'Session updated.' "
8893
"The model will detect when you stop and automatically start responding. "
8994
"Press ctrl + c to stop the program."
9095
)

python/samples/concepts/realtime/simple_realtime_chat_websocket.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import asyncio
44
import logging
55

6-
from azure.identity import AzureCliCredential
7-
86
from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
97
from semantic_kernel.connectors.ai.open_ai import (
108
AzureRealtimeExecutionSettings,
@@ -59,7 +57,11 @@ async def main() -> None:
5957
# for more details.
6058
voice="shimmer",
6159
)
62-
realtime_client = AzureRealtimeWebsocket(settings=settings, credential=AzureCliCredential())
60+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
61+
# for Azure OpenAI realtime deployments.
62+
realtime_client = AzureRealtimeWebsocket(
63+
settings=settings,
64+
)
6365
audio_player = AudioPlayerWebsocket()
6466
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
6567
# Create the settings for the session
@@ -84,7 +86,7 @@ async def main() -> None:
8486

8587
if __name__ == "__main__":
8688
print(
87-
"Instructions: Start speaking. "
89+
"Instructions: Start speaking when you see 'Session updated.' "
8890
"The model will detect when you stop and automatically start responding. "
8991
"Press ctrl + c to stop the program."
9092
)

python/samples/concepts/realtime/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ def _sounddevice_callback(self, outdata, frames, time, status):
321321
logger.debug(f"Audio output status: {status}")
322322
if self._queue:
323323
if self._queue.empty():
324+
outdata[:] = 0
324325
return
325326
data = self._queue.get_nowait()
326327
outdata[:] = data.reshape(outdata.shape)

python/semantic_kernel/connectors/ai/open_ai/const.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
from typing import Final
44

5-
DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21"
5+
DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28"

python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ class TurnDetection(KernelBaseModel):
5454
class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
5555
"""Request settings for OpenAI realtime services."""
5656

57-
modalities: Sequence[Literal["audio", "text"]] | None = None
5857
output_modalities: Sequence[Literal["audio", "text"]] | None = None
5958
ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
6059
instructions: str | None = None
@@ -77,8 +76,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
7776
"on the function choice configuration.",
7877
),
7978
] = None
80-
temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None
81-
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
79+
max_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
8280
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None
8381

8482
def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:

python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ class ListenEvents(str, Enum):
255255
RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done"
256256
RESPONSE_CONTENT_PART_ADDED = "response.content_part.added"
257257
RESPONSE_CONTENT_PART_DONE = "response.content_part.done"
258-
RESPONSE_TEXT_DELTA = "response.text.delta"
259-
RESPONSE_TEXT_DONE = "response.text.done"
258+
RESPONSE_TEXT_DELTA = "response.output_text.delta"
259+
RESPONSE_TEXT_DONE = "response.output_text.done"
260260
RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.output_audio_transcript.delta"
261261
RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.output_audio_transcript.done"
262262
RESPONSE_AUDIO_DELTA = "response.output_audio.delta"
@@ -302,7 +302,12 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt
302302
might be of different types.
303303
"""
304304
match event.type:
305-
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value | "response.audio_transcript.delta":
305+
case (
306+
ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value
307+
| "response.audio_transcript.delta"
308+
| ListenEvents.RESPONSE_TEXT_DELTA.value
309+
| "response.text.delta"
310+
):
306311
yield RealtimeTextEvent(
307312
service_type=event.type,
308313
service_event=event,
@@ -312,15 +317,16 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt
312317
choice_index=0,
313318
),
314319
)
315-
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value | "response.audio_transcript.done":
316-
yield RealtimeTextEvent(
317-
service_type=event.type,
318-
service_event=event,
319-
text=TextContent(
320-
inner_content=event,
321-
text=event.transcript, # type: ignore
322-
),
323-
)
320+
case (
321+
ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value
322+
| "response.audio_transcript.done"
323+
| ListenEvents.RESPONSE_TEXT_DONE.value
324+
| "response.text.done"
325+
):
326+
# Don't yield RealtimeTextEvent here — the deltas already streamed all
327+
# the text. Emitting the full text again would cause duplicate output
328+
# for any consumer that prints every RealtimeTextEvent.
329+
yield RealtimeEvent(service_type=event.type, service_event=event)
324330
case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value:
325331
if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore
326332
self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore
@@ -723,24 +729,19 @@ async def _send(self, event: RealtimeClientEvent) -> None:
723729
# Only keep fields that are allowed in session updates
724730
# Note: output_modalities is not allowed in WebRTC session updates
725731
allowed_fields = {
732+
"type",
726733
"instructions",
727734
"model",
728735
"max_output_tokens",
729736
"tools",
730737
"tool_choice",
731-
"temperature",
732738
"prompt",
733739
"tracing",
734740
"truncation",
735741
}
736742
event_dict["session"] = {k: v for k, v in session_dict.items() if k in allowed_fields}
737743

738-
# Debug: Log what we're sending to see the structure
739-
import json
740-
741-
json_data = json.dumps(event_dict)
742-
logger.debug(f"Sending WebRTC session.update: {json_data}")
743-
self.data_channel.send(json_data)
744+
self.data_channel.send(json.dumps(event_dict))
744745
else:
745746
self.data_channel.send(event.model_dump_json(exclude_none=True))
746747
except Exception as e:
@@ -860,8 +861,18 @@ async def _on_data(self, data: str) -> None:
860861
await self._receive_buffer.put(parsed_event)
861862

862863
async def _get_ephemeral_token(self) -> str:
863-
"""Get an ephemeral token from OpenAI."""
864-
data = {"model": self.ai_model_id}
864+
"""Get an ephemeral token from OpenAI.
865+
866+
GA endpoint: POST /v1/realtime/client_secrets
867+
Request body: {"session": {"type": "realtime", "model": "<model>"}}
868+
Response: {"value": "<token>", "expires_at": ..., "session": {...}}
869+
"""
870+
data = {
871+
"session": {
872+
"type": "realtime",
873+
"model": self.ai_model_id,
874+
}
875+
}
865876
headers, url = self._get_ephemeral_token_headers_and_url()
866877
headers = prepend_semantic_kernel_to_user_agent(headers)
867878
try:
@@ -874,22 +885,25 @@ async def _get_ephemeral_token(self) -> str:
874885
raise Exception(f"Failed to get ephemeral token: {error_text}")
875886

876887
result = await response.json()
877-
return result["client_secret"]["value"]
888+
return result["value"]
878889

879890
except Exception as e:
880891
logger.error(f"Failed to get ephemeral token: {e!s}")
881892
raise
882893

883894
def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]:
884-
"""Get the headers for the ephemeral token."""
895+
"""Get the headers and URL for the ephemeral token."""
885896
return {
886897
"Authorization": f"Bearer {self.client.api_key}",
887898
"Content-Type": "application/json",
888-
}, f"{self.client.realtime._client.base_url}/realtime/sessions"
899+
}, f"{self.client.realtime._client.base_url}/realtime/client_secrets"
889900

890901
def _get_webrtc_url(self) -> str:
891-
"""Get the WebRTC URL."""
892-
return f"{self.client.realtime._client.base_url}/realtime?model={self.ai_model_id}"
902+
"""Get the WebRTC URL.
903+
904+
GA endpoint: POST /v1/realtime/calls?model=<model>
905+
"""
906+
return f"{self.client.realtime._client.base_url}/realtime/calls?model={self.ai_model_id}"
893907

894908

895909
# region Websocket
@@ -933,9 +947,6 @@ async def _send(self, event: RealtimeClientEvent) -> None:
933947
if not self.connection:
934948
raise ValueError("Connection is not established.")
935949
try:
936-
# Debug logging to see what we're actually sending
937-
if hasattr(event, "type") and event.type == "session.update":
938-
logger.debug(f"Sending session.update event: {event.model_dump()}")
939950
await self.connection.send(event)
940951
except Exception as e:
941952
logger.error(f"Error sending response: {e!s}")

0 commit comments

Comments
 (0)