Skip to content

Commit 7104e3b

Browse files
committed
Realtime GA models working
1 parent 54a653a commit 7104e3b

File tree

11 files changed

+201
-71
lines changed

11 files changed

+201
-71
lines changed

python/samples/concepts/realtime/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ These samples are more complex than most because of the nature of these API's. T
55
To run these samples, you will need to have the following setup:
66

77
- Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set.
8-
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`.
8+
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2025-08-28`.
99
- To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment:
1010
- semantic-kernel[realtime]
1111
- pyaudio

python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
from datetime import datetime
66
from random import randint
77

8-
from azure.identity import AzureCliCredential
9-
108
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
119
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
1210
from semantic_kernel.connectors.ai.open_ai import (
@@ -81,8 +79,12 @@ async def main() -> None:
8179
# and can also be passed in the receive method
8280
# You can also pass in kernel, plugins, chat_history or settings here.
8381
# For WebRTC the audio_track is required
82+
83+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
84+
# for Azure OpenAI realtime deployments.
8485
realtime_agent = AzureRealtimeWebRTC(
85-
audio_track=AudioRecorderWebRTC(), region="swedencentral", plugins=[Helpers()], credential=AzureCliCredential()
86+
audio_track=AudioRecorderWebRTC(),
87+
plugins=[Helpers()],
8688
)
8789

8890
# Create the settings for the session

python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,9 @@ async def main() -> None:
8282
# to signal the end of the user's turn and start the response.
8383
# manual VAD is not part of this sample
8484
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
85+
86+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
87+
# for Azure OpenAI realtime deployments.
8588
settings = AzureRealtimeExecutionSettings(
8689
instructions="""
8790
You are a chat bot. Your name is Mosscap and

python/samples/concepts/realtime/simple_realtime_chat_webrtc.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55

66
from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
77
from semantic_kernel.connectors.ai.open_ai import (
8+
AzureRealtimeExecutionSettings,
89
ListenEvents,
9-
OpenAIRealtimeExecutionSettings,
10-
OpenAIRealtimeWebRTC,
1110
)
11+
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC
1212
from semantic_kernel.contents import RealtimeTextEvent
1313

1414
logging.basicConfig(level=logging.WARNING)
@@ -43,7 +43,7 @@ async def main() -> None:
4343
# create the realtime client and optionally add the audio output function, this is optional
4444
# you can define the protocol to use, either "websocket" or "webrtc"
4545
# they will behave the same way, even though the underlying protocol is quite different
46-
settings = OpenAIRealtimeExecutionSettings(
46+
settings = AzureRealtimeExecutionSettings(
4747
instructions="""
4848
You are a chat bot. Your name is Mosscap and
4949
you have one goal: figure out what people need.
@@ -59,7 +59,12 @@ async def main() -> None:
5959
# Enable both text and audio output to get transcripts
6060
output_modalities=["text", "audio"],
6161
)
62-
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
62+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
63+
# for Azure OpenAI realtime deployments.
64+
realtime_client = AzureRealtimeWebRTC(
65+
audio_track=AudioRecorderWebRTC(),
66+
settings=settings,
67+
)
6368
# Create the settings for the session
6469
audio_player = AudioPlayerWebRTC()
6570
# the context manager calls the create_session method on the client and starts listening to the audio stream
@@ -84,7 +89,7 @@ async def main() -> None:
8489

8590
if __name__ == "__main__":
8691
print(
87-
"Instructions: start speaking. "
92+
"Instructions: start speaking when you see 'Session updated.' "
8893
"The model will detect when you stop and automatically start responding. "
8994
"Press ctrl + c to stop the program."
9095
)

python/samples/concepts/realtime/simple_realtime_chat_websocket.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import asyncio
44
import logging
55

6-
from azure.identity import AzureCliCredential
7-
86
from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
97
from semantic_kernel.connectors.ai.open_ai import (
108
AzureRealtimeExecutionSettings,
@@ -59,7 +57,11 @@ async def main() -> None:
5957
# for more details.
6058
voice="shimmer",
6159
)
62-
realtime_client = AzureRealtimeWebsocket(settings=settings, credential=AzureCliCredential())
60+
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
61+
# for Azure OpenAI realtime deployments.
62+
realtime_client = AzureRealtimeWebsocket(
63+
settings=settings,
64+
)
6365
audio_player = AudioPlayerWebsocket()
6466
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
6567
# Create the settings for the session
@@ -84,7 +86,7 @@ async def main() -> None:
8486

8587
if __name__ == "__main__":
8688
print(
87-
"Instructions: Start speaking. "
89+
"Instructions: Start speaking when you see 'Session updated.' "
8890
"The model will detect when you stop and automatically start responding. "
8991
"Press ctrl + c to stop the program."
9092
)

python/samples/concepts/realtime/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ def _sounddevice_callback(self, outdata, frames, time, status):
321321
logger.debug(f"Audio output status: {status}")
322322
if self._queue:
323323
if self._queue.empty():
324+
outdata[:] = 0
324325
return
325326
data = self._queue.get_nowait()
326327
outdata[:] = data.reshape(outdata.shape)

python/semantic_kernel/connectors/ai/open_ai/const.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33
from typing import Final
44

5-
DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21"
5+
DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28"

python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ class TurnDetection(KernelBaseModel):
5454
class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
5555
"""Request settings for OpenAI realtime services."""
5656

57-
modalities: Sequence[Literal["audio", "text"]] | None = None
5857
output_modalities: Sequence[Literal["audio", "text"]] | None = None
5958
ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
6059
instructions: str | None = None
@@ -77,8 +76,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
7776
"on the function choice configuration.",
7877
),
7978
] = None
80-
temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None
81-
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
79+
max_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
8280
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None
8381

8482
def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:

python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py

Lines changed: 40 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ class ListenEvents(str, Enum):
255255
RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done"
256256
RESPONSE_CONTENT_PART_ADDED = "response.content_part.added"
257257
RESPONSE_CONTENT_PART_DONE = "response.content_part.done"
258-
RESPONSE_TEXT_DELTA = "response.text.delta"
259-
RESPONSE_TEXT_DONE = "response.text.done"
258+
RESPONSE_TEXT_DELTA = "response.output_text.delta"
259+
RESPONSE_TEXT_DONE = "response.output_text.done"
260260
RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.output_audio_transcript.delta"
261261
RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.output_audio_transcript.done"
262262
RESPONSE_AUDIO_DELTA = "response.output_audio.delta"
@@ -302,7 +302,12 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt
302302
might be of different types.
303303
"""
304304
match event.type:
305-
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value | "response.audio_transcript.delta":
305+
case (
306+
ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value
307+
| "response.audio_transcript.delta"
308+
| ListenEvents.RESPONSE_TEXT_DELTA.value
309+
| "response.text.delta"
310+
):
306311
yield RealtimeTextEvent(
307312
service_type=event.type,
308313
service_event=event,
@@ -312,15 +317,16 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt
312317
choice_index=0,
313318
),
314319
)
315-
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value | "response.audio_transcript.done":
316-
yield RealtimeTextEvent(
317-
service_type=event.type,
318-
service_event=event,
319-
text=TextContent(
320-
inner_content=event,
321-
text=event.transcript, # type: ignore
322-
),
323-
)
320+
case (
321+
ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value
322+
| "response.audio_transcript.done"
323+
| ListenEvents.RESPONSE_TEXT_DONE.value
324+
| "response.text.done"
325+
):
326+
# Don't yield RealtimeTextEvent here — the deltas already streamed all
327+
# the text. Emitting the full text again would cause duplicate output
328+
# for any consumer that prints every RealtimeTextEvent.
329+
yield RealtimeEvent(service_type=event.type, service_event=event)
324330
case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value:
325331
if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore
326332
self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore
@@ -723,24 +729,19 @@ async def _send(self, event: RealtimeClientEvent) -> None:
723729
# Only keep fields that are allowed in session updates
724730
# Note: output_modalities is not allowed in WebRTC session updates
725731
allowed_fields = {
732+
"type",
726733
"instructions",
727734
"model",
728735
"max_output_tokens",
729736
"tools",
730737
"tool_choice",
731-
"temperature",
732738
"prompt",
733739
"tracing",
734740
"truncation",
735741
}
736742
event_dict["session"] = {k: v for k, v in session_dict.items() if k in allowed_fields}
737743

738-
# Debug: Log what we're sending to see the structure
739-
import json
740-
741-
json_data = json.dumps(event_dict)
742-
logger.debug(f"Sending WebRTC session.update: {json_data}")
743-
self.data_channel.send(json_data)
744+
self.data_channel.send(json.dumps(event_dict))
744745
else:
745746
self.data_channel.send(event.model_dump_json(exclude_none=True))
746747
except Exception as e:
@@ -860,8 +861,18 @@ async def _on_data(self, data: str) -> None:
860861
await self._receive_buffer.put(parsed_event)
861862

862863
async def _get_ephemeral_token(self) -> str:
863-
"""Get an ephemeral token from OpenAI."""
864-
data = {"model": self.ai_model_id}
864+
"""Get an ephemeral token from OpenAI.
865+
866+
GA endpoint: POST /v1/realtime/client_secrets
867+
Request body: {"session": {"type": "realtime", "model": "<model>"}}
868+
Response: {"value": "<token>", "expires_at": ..., "session": {...}}
869+
"""
870+
data = {
871+
"session": {
872+
"type": "realtime",
873+
"model": self.ai_model_id,
874+
}
875+
}
865876
headers, url = self._get_ephemeral_token_headers_and_url()
866877
headers = prepend_semantic_kernel_to_user_agent(headers)
867878
try:
@@ -874,22 +885,25 @@ async def _get_ephemeral_token(self) -> str:
874885
raise Exception(f"Failed to get ephemeral token: {error_text}")
875886

876887
result = await response.json()
877-
return result["client_secret"]["value"]
888+
return result["value"]
878889

879890
except Exception as e:
880891
logger.error(f"Failed to get ephemeral token: {e!s}")
881892
raise
882893

883894
def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]:
884-
"""Get the headers for the ephemeral token."""
895+
"""Get the headers and URL for the ephemeral token."""
885896
return {
886897
"Authorization": f"Bearer {self.client.api_key}",
887898
"Content-Type": "application/json",
888-
}, f"{self.client.realtime._client.base_url}/realtime/sessions"
899+
}, f"{self.client.realtime._client.base_url}/realtime/client_secrets"
889900

890901
def _get_webrtc_url(self) -> str:
891-
"""Get the WebRTC URL."""
892-
return f"{self.client.realtime._client.base_url}/realtime?model={self.ai_model_id}"
902+
"""Get the WebRTC URL.
903+
904+
GA endpoint: POST /v1/realtime/calls?model=<model>
905+
"""
906+
return f"{self.client.realtime._client.base_url}/realtime/calls?model={self.ai_model_id}"
893907

894908

895909
# region Websocket
@@ -933,9 +947,6 @@ async def _send(self, event: RealtimeClientEvent) -> None:
933947
if not self.connection:
934948
raise ValueError("Connection is not established.")
935949
try:
936-
# Debug logging to see what we're actually sending
937-
if hasattr(event, "type") and event.type == "session.update":
938-
logger.debug(f"Sending session.update event: {event.model_dump()}")
939950
await self.connection.send(event)
940951
except Exception as e:
941952
logger.error(f"Error sending response: {e!s}")

0 commit comments

Comments
 (0)