forked from deepgram/deepgram-api-specs
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathasyncapi.yml
More file actions
1782 lines (1778 loc) · 56.9 KB
/
asyncapi.yml
File metadata and controls
1782 lines (1778 loc) · 56.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
asyncapi: 3.0.0
# Document metadata. In AsyncAPI 3.0 `tags` and `externalDocs` are fields of
# the Info Object, so they are nested under `info` rather than at the root.
info:
  title: Deepgram Websocket Specification
  version: 1.0.0
  # Literal block scalar (`|`): the text is Markdown, and a folded scalar
  # would merge the bullet list below into a single run-on sentence.
  description: |
    Deepgram's voice AI platform provides APIs for speech-to-text,
    text-to-speech, and language understanding. From medical transcription to
    autonomous agents, Deepgram is the go-to choice for developers of voice AI
    experiences.

    The Deepgram API allows you to interact with Deepgram programmatically.

    You can use our Websockets to:

    - Transcribe speech to text.
    - Transform text to speech.
    - Build a Voice Agent.
  tags:
    - name: listen
      description: Speech to Text WebSocket API
    - name: speak
      description: Text to Speech WebSocket API
    - name: agent
      description: Voice Agent WebSocket API
  termsOfService: https://deepgram.com/terms/
  contact:
    name: Deepgram Developer Relations
    email: devrel@deepgram.com
    url: https://community.deepgram.com
  license:
    name: Privacy Notice
    url: https://deepgram.com/privacy/
  externalDocs:
    description: Learn more about using Deepgram on our docs.
    # https like every other URL in this section.
    url: https://developers.deepgram.com
# Server map: a single entry named `public`, reached over the `wss` protocol.
servers:
  public:
    host: api.deepgram.com
    protocol: wss
channels:
# Text to Speech channel. Each parameter names where its value is read from
# through a `location` runtime expression.
speak:
  address: /v1/speak
  description: Deepgram Text to Speech WebSocket
  parameters:
    # Read from the Authorization header.
    ApiKey:
      description: >-
        API key for authentication. Format should be 'token
        <DEEPGRAM_API_KEY>'
      location: $message.header#/Authorization
      examples:
        - token YOUR_DEEPGRAM_API_KEY
    encoding:
      description: >-
        Encoding allows you to specify the expected encoding of your audio
        output
      location: $message.payload#/encoding
      default: mp3
      enum:
        - linear16
        - mulaw
        - alaw
        - opus
        - mp3
        - flac
        - aac
      examples:
        - linear16
    model:
      description: AI model used to process submitted text
      location: $message.payload#/model
      default: aura-asteria-en
      enum:
        - aura-asteria-en
        - aura-luna-en
        - aura-stella-en
        - aura-athena-en
        - aura-hera-en
        - aura-orion-en
        - aura-arcas-en
        - aura-perseus-en
        - aura-angus-en
        - aura-orpheus-en
        - aura-helios-en
        - aura-zeus-en
      examples:
        - aura-asteria-en
    # Numeric values are quoted so they stay strings.
    sample_rate:
      description: >-
        Sample Rate specifies the sample rate for the output audio. Based on
        encoding 8000 or 24000 are possible defaults. For some encodings
        sample rate is not configurable.
      location: $message.payload#/sample_rate
      default: '24000'
      enum:
        - '8000'
        - '16000'
        - '24000'
        - '44100'
        - '48000'
      examples:
        - '24000'
messages:
# Message whose payload is an object with `type: Speak` and the `text` to
# convert; both properties are required.
textToSpeechRequest:
  description: Request to convert text to speech
  payload:
    type: object
    properties:
      type:
        type: string
        enum:
          - Speak
        description: Message type indicating a text-to-speech request
      text:
        type: string
        description: The input text to be converted to speech
    required:
      - type
      - text
  # Message-level examples: each item wraps a sample payload.
  examples:
    - payload:
        type: Speak
        text: Hello, world!
    - payload:
        type: Speak
        text: Convert this text to speech
# Control message: an object whose required `type` is one of Flush, Clear or
# Close.
controlMessagesRequest:
  description: Control messages for managing the Text to Speech WebSocket connection
  payload:
    type: object
    properties:
      type:
        type: string
        enum:
          - Flush
          - Clear
          - Close
    required:
      - type
  # One example per allowed `type` value.
  examples:
    - payload:
        type: Flush
    - payload:
        type: Clear
    - payload:
        type: Close
# Binary message (application/octet-stream) carrying audio data.
textToSpeechResponse:
  contentType: application/octet-stream
  payload:
    type: string
    format: binary
    description: Audio data in the format specified by the request parameters
# JSON response to a control message; only `type` is required.
# NOTE(review): the request enum uses capitalised values (Flush/Clear/Close)
# while this enum is lowercase — confirm the casing the server actually emits.
controlMessagesResponse:
  contentType: application/json
  payload:
    type: object
    required:
      - type
    properties:
      type:
        type: string
        enum:
          - flush
          - clear
          - close
        description: The type of control message response
      sequence_id:
        type: integer
        description: The sequence ID of the response
# JSON metadata message; no property is marked required.
metadataResponse:
  contentType: application/json
  payload:
    type: object
    description: >-
      Text to Speech Metadata information sent immediately after
      completing the WebSocket handshake
    properties:
      type:
        type: string
        description: Message type identifier
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
      model_name:
        type: string
        description: Name of the model being used
      model_version:
        type: string
        description: Version of the model being used
      model_uuid:
        type: string
        format: uuid
        description: Unique identifier for the model
# JSON error message; all three properties are required.
errorResponse:
  contentType: application/json
  description: Error information for failed requests
  payload:
    type: object
    required:
      - error_code
      - error_message
      - request_id
    properties:
      error_code:
        type: integer
        enum:
          - 400
          - 401
          - 402
          - 403
          - 429
          - 503
        description: HTTP status code equivalent
      error_message:
        type: string
        description: Error message
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
# JSON success message; `success_code` only admits 200.
successResponse:
  contentType: application/json
  description: Sent when a request completes successfully
  payload:
    type: object
    required:
      - success_code
      - success_message
      - request_id
    properties:
      success_code:
        type: integer
        enum:
          - 200
        description: HTTP status code equivalent
      success_message:
        type: string
        description: Success message
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
# WebSocket Close frame description for the Text to Speech channel: a
# required status `code` plus a required `payload` reason string.
# NOTE(review): indentation is not visible in this view, so it cannot be told
# whether `examples` below belongs to the payload schema or to the message —
# confirm before re-nesting. Each example pairs a `code` with a `payload`
# value drawn from the enums above and adds a human-readable `description`.
closeFrame:
payload:
type: object
description: >
When Deepgram encounters an error during streaming text to speech, a
WebSocket Close frame is sent. The frame contains a status code and
UTF-8-encoded payload describing the error reason
required:
- code
- payload
properties:
code:
type: integer
enum:
- 1000
- 1003
- 1008
- 1009
- 1011
description: WebSocket close status code
payload:
type: string
# `None` here is the literal string "None", not a YAML null.
enum:
- None
- MESSAGE-0000
- DATA-0000
- BIG-0000
- BIG-0001
- NET-0000
- NET-0001
- NET-0002
- NET-0003
description: Error reason code
examples:
- code: 1000
payload: None
description: Normal closure
- code: 1003
payload: MESSAGE-0000
description: Input message isn't a supported websocket message type
- code: 1008
payload: DATA-0000
description: Input message isn't recognized as a valid command
- code: 1009
payload: BIG-0000
description: Input message is too large
- code: 1009
payload: BIG-0001
description: Input text has too many characters
- code: 1011
payload: NET-0000
description: Internal server error
- code: 1011
payload: NET-0001
description: Failed to receive message
- code: 1011
payload: NET-0002
description: Failed to send message
- code: 1011
payload: NET-0003
description: Time limit exceeded
# Speech to Text channel. Boolean-like and numeric parameter values are
# written as quoted strings throughout.
listen:
  address: /v1/listen
  description: Deepgram Speech to Text WebSocket
  parameters:
    # Read from the Authorization header.
    ApiKey:
      description: >-
        API key for authentication. Format should be 'token
        <DEEPGRAM_API_KEY>'
      location: $message.header#/Authorization
      examples:
        - token YOUR_DEEPGRAM_API_KEY
    callback:
      description: URL to which we'll make the callback request
      location: $message.payload#/callback
      examples:
        - https://example.com
    callback_method:
      description: HTTP method by which the callback request will be made
      location: $message.payload#/callback_method
      default: POST
      enum:
        - POST
        - GET
        - PUT
        - DELETE
      examples:
        - POST
        - GET
        - PUT
        - DELETE
    channels:
      description: The number of channels in the submitted audio
      location: $message.payload#/channels
      default: '1'
      examples:
        - '1'
    diarize:
      description: >-
        Recognize speaker changes. Each word in the transcript will be
        assigned a speaker number starting at 0
      location: $message.payload#/diarize
      default: 'false'
      enum:
        - 'true'
        - 'false'
      examples:
        - 'true'
    diarize_version:
      description: >-
        Version of the diarization feature to use. Only used when the
        diarization feature is enabled (`diarize=true` is passed to the API)
      location: $message.payload#/diarize_version
      default: v2
      examples:
        - v1
        - v2
# Dictation toggle. The previous description was the entity-detection text
# and did not describe this parameter.
dictation:
  description: >-
    Convert spoken dictation commands for punctuation into their
    corresponding punctuation marks
  location: $message.payload#/dictation
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
# Encoding of the submitted audio; no default is declared.
encoding:
  description: Specify the expected encoding of your submitted audio
  location: $message.payload#/encoding
  enum:
    - linear16
    - flac
    - mulaw
    - amr-nb
    - amr-wb
    - opus
    - speex
    - g729
  examples:
    - linear16
# Endpointing accepts either a numeric value or the string 'false'.
# Both examples are quoted so they stay strings.
endpointing:
  description: >-
    Indicates how long Deepgram will wait to detect whether a speaker has
    finished speaking or pauses for a significant period of time. When set
    to a value, the streaming endpoint immediately finalizes the
    transcription for the processed time range and returns the transcript
    with a speech_final parameter set to true. Can also be set to false to
    disable endpointing
  location: $message.payload#/endpointing
  default: '10'
  examples:
    - '300'
    # Was `false"` with a stray trailing double quote.
    - 'false'
# Free-form and boosting parameters. Examples containing `:` are quoted
# defensively; their string values are unchanged.
extra:
  description: >-
    Arbitrary key-value pairs that are attached to the API response for
    usage in downstream processing
  location: $message.payload#/extra
  examples:
    - 'key:value'
filler_words:
  description: >-
    Filler Words can help transcribe interruptions in your audio, like
    "uh" and "um"
  location: $message.payload#/filler_words
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
interim_results:
  description: >-
    Specifies whether the streaming endpoint should provide ongoing
    transcription updates as more audio is received. When set to true, the
    endpoint sends continuous updates, meaning transcription results may
    evolve over time
  location: $message.payload#/interim_results
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
keyterm:
  description: >-
    Key term prompting can boost or suppress specialized terminology and
    brands. Only compatible with Nova-3
  location: $message.payload#/keyterm
  examples:
    - Snuffleupagus
keywords:
  description: Keywords can boost or suppress specialized terminology and brands
  location: $message.payload#/keywords
  examples:
    - 'Twilio:2'
# Language hint. `'no'` stays quoted so YAML 1.1 parsers do not read it as
# the boolean false.
language:
  description: >-
    The [BCP-47 language tag](https://tools.ietf.org/html/bcp47) that
    hints at the primary spoken language. Depending on the Model you
    choose only certain languages are available
  location: $message.payload#/language
  default: en
  enum:
    - bg
    - ca
    - cs
    - da
    - da-DK
    - de
    - de-CH
    - el
    - en
    - en-AU
    - en-GB
    - en-IN
    - en-NZ
    - en-US
    - es
    - es-419
    - es-LATAM
    - et
    - fi
    - fr
    - fr-CA
    - hi
    - hi-Latn
    - hu
    - id
    - it
    - ja
    - ko
    - ko-KR
    - lt
    - lv
    - ms
    - nl
    - nl-BE
    - 'no'
    - pl
    - pt
    - pt-BR
    - pt-PT
    - ro
    - ru
    - sk
    - sv
    - sv-SE
    - taq
    - th
    - th-TH
    - tr
    - uk
    - vi
    - zh
    - zh-CN
    - zh-HK
    - zh-Hans
    - zh-Hant
    - zh-TW
  examples:
    - en
# Transcription model selector; no default is declared.
# NOTE(review): the second example ("custom model name") is not one of the
# enum values — presumably a placeholder for a custom model id; confirm.
model:
  description: AI model to use for the transcription
  location: $message.payload#/model
  enum:
    - nova-3
    - nova-3-general
    - nova-2
    - nova-2-general
    - nova-2-meeting
    - nova-2-finance
    - nova-2-conversationalai
    - nova-2-voicemail
    - nova-2-video
    - nova-2-medical
    - nova-2-drivethru
    - nova-2-automotive
    - nova
    - nova-general
    - nova-phonecall
    - nova-medical
    - enhanced
    - enhanced-general
    - enhanced-meeting
    - enhanced-phonecall
    - enhanced-finance
    - base
    - meeting
    - phonecall
    - finance
    - conversationalai
    - voicemail
    - video
    - custom
  examples:
    - nova-2
    - custom model name
# Remaining listen parameters. Toggle parameters share the same shape:
# default 'false', enum 'true'/'false', example 'true'.
multichannel:
  description: Transcribe each audio channel independently
  location: $message.payload#/multichannel
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
numerals:
  description: Convert numbers from written format to numerical format
  location: $message.payload#/numerals
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
profanity_filter:
  description: >-
    Profanity Filter looks for recognized profanity and converts it to the
    nearest recognized non-profane word or removes it from the transcript
    completely
  location: $message.payload#/profanity_filter
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
punctuate:
  description: Add punctuation and capitalization to the transcript
  location: $message.payload#/punctuate
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
redact:
  description: Redaction removes sensitive information from your transcripts
  location: $message.payload#/redact
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
replace:
  description: Search for terms or phrases in submitted audio and replaces them
  location: $message.payload#/replace
  examples:
    - 'monika:Monica'
sample_rate:
  description: >-
    Sample rate of submitted audio. Required (and only read) when a value
    is provided for encoding
  location: $message.payload#/sample_rate
  examples:
    - '8000'
search:
  description: Search for terms or phrases in submitted audio
  location: $message.payload#/search
  examples:
    - Deepgram
    - Text to Speech
smart_format:
  description: >-
    Apply formatting to transcript output. When set to true, additional
    formatting will be applied to transcripts to improve readability
  location: $message.payload#/smart_format
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
tag:
  description: >-
    Label your requests for the purpose of identification during usage
    reporting
  location: $message.payload#/tag
  examples:
    - my-team
    - marketing%20team
utterance_end:
  description: >-
    Indicates how long Deepgram will wait to send an UtteranceEnd message
    after a word has been transcribed. Use with interim_results
  location: $message.payload#/utterance_end
  examples:
    - '1000'
vad_events:
  description: >-
    Indicates that speech has started. You'll begin receiving Speech
    Started messages upon speech starting
  location: $message.payload#/vad_events
  default: 'false'
  enum:
    - 'true'
    - 'false'
  examples:
    - 'true'
version:
  description: Version of an AI model to use
  location: $message.payload#/version
  default: latest
  examples:
    - MODEL_VERSION
messages:
# Audio message: the payload is a binary string, not a JSON object.
transcriptionRequest:
  description: >-
    Request to convert speech to text. Audio data is transmitted as raw
    binary WebSocket messages
  payload:
    type: string
    format: binary
    description: >-
      Raw audio data to be transcribed. Should be sent as a binary
      WebSocket message without base64 encoding
  # The example payload is a textual placeholder, not real audio bytes.
  examples:
    - payload: <binary audio data>
# Control message: exactly one of three object shapes, distinguished by the
# single value allowed for `type` (Finalize, CloseStream or KeepAlive).
controlMessagesRequest:
  description: Control messages for managing the Speech to Text WebSocket connection
  payload:
    type: object
    oneOf:
      - type: object
        properties:
          type:
            type: string
            enum:
              - Finalize
            description: >-
              Used to handle specific scenarios where you need to force
              the server to process all unprocessed audio data and
              immediately return the final results
      - type: object
        properties:
          type:
            type: string
            enum:
              - CloseStream
            description: Close the websocket connection
      - type: object
        properties:
          type:
            type: string
            enum:
              - KeepAlive
            description: Used to keep the websocket connection alive
    # NOTE(review): placed at payload level so it applies to every variant;
    # the flattened source does not show its depth — confirm.
    required:
      - type
  # One example per variant.
  examples:
    - payload:
        type: Finalize
    - payload:
        type: CloseStream
    - payload:
        type: KeepAlive
# JSON transcription result. `channel.alternatives` is an array of candidate
# transcripts, each with a `transcript`, a `confidence` and a `words` array
# of per-word timing entries.
# NOTE(review): indentation is not visible in this view. `metadata` and the
# trailing `type` … `speech_final` keys are presumably properties of the
# payload object (siblings of `channel`) rather than of the word items —
# confirm against the upstream file before re-nesting.
transcriptionResponse:
contentType: application/json
payload:
type: object
properties:
channel:
type: object
properties:
alternatives:
type: array
items:
type: object
properties:
transcript:
type: string
description: Complete transcribed text
confidence:
type: number
format: float
description: Overall confidence score
words:
type: array
items:
type: object
properties:
word:
type: string
start:
type: number
format: float
end:
type: number
format: float
confidence:
type: number
format: float
punctuated_word:
type: string
metadata:
type: object
properties:
model_info:
type: object
properties:
name:
type: string
version:
type: string
arch:
type: string
request_id:
type: string
format: uuid
model_uuid:
type: string
format: uuid
# The outer `type` below is a property name; the inner one is its schema type.
type:
type: string
channel_index:
type: array
items:
type: integer
duration:
type: number
format: float
start:
type: number
format: float
is_final:
type: boolean
from_finalize:
type: boolean
speech_final:
type: boolean
# JSON control response: one of three titled variants. Each variant is an
# `allOf` of a property list and the list of properties it requires.
controlMessageResponse:
  contentType: application/json
  payload:
    type: object
    required:
      - type
    oneOf:
      - title: FinalizeResponse
        description: >
          The server will process all remaining audio data and return the
          final results. You may receive a response with the from_finalize
          attribute set to true, indicating that the finalization process
          is complete. This response typically occurs when there is a
          noticeable amount of audio buffered in the server.
        allOf:
          - properties:
              type:
                type: string
                enum:
                  - Finalize
              channel:
                type: integer
                description: The channel number being finalized
                minimum: 0
          - required:
              - type
              - channel
      - title: MetadataResponse
        # Folded scalar: re-wrapping the lines does not change the value.
        description: >
          Provides real-time metadata during audio streaming, including audio
          characteristics and processing details. This response is sent
          periodically during streaming to provide updates about the audio
          being processed.
        allOf:
          - properties:
              type:
                type: string
                enum:
                  - Metadata
              transaction_key:
                type: string
                description: Deprecated field
              request_id:
                type: string
                format: uuid
                description: Unique identifier for the request
              sha256:
                type: string
                pattern: '^[a-fA-F0-9]{64}$'
                description: SHA-256 hash of the audio content
              created:
                type: string
                format: date-time
                description: Timestamp when the response was created
              duration:
                type: number
                format: float
                description: Duration of the audio in seconds
              channels:
                type: integer
                minimum: 0
                description: Number of audio channels
          - required:
              - type
              - request_id
              - sha256
              - created
              - duration
              - channels
      # Same property set as MetadataResponse; only the `type` value differs.
      - title: CloseStreamResponse
        description: >
          Indicates that the server has closed the WebSocket connection
          and the server will process all remaining audio data.
        allOf:
          - properties:
              type:
                type: string
                enum:
                  - CloseStream
              transaction_key:
                type: string
                description: Deprecated field
              request_id:
                type: string
                format: uuid
                description: Unique identifier for the request
              sha256:
                type: string
                pattern: '^[a-fA-F0-9]{64}$'
                description: SHA-256 hash of the audio content
              created:
                type: string
                format: date-time
                description: Timestamp when the response was created
              duration:
                type: number
                format: float
                description: Duration of the audio in seconds
              channels:
                type: integer
                minimum: 0
                description: Number of audio channels
          - required:
              - type
              - request_id
              - sha256
              - created
              - duration
              - channels
  # One example per variant; `created` is quoted so it stays a string.
  examples:
    - payload:
        type: Finalize
        channel: 0
    - payload:
        type: Metadata
        transaction_key: deprecated
        request_id: 8c8ebea9-dbec-45fa-a035-e4632cb05b5f
        sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        created: '2024-08-29T22:37:55.202Z'
        duration: 0
        channels: 0
    - payload:
        type: CloseStream
        transaction_key: deprecated
        request_id: 8c8ebea9-dbec-45fa-a035-e4632cb05b5f
        sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
        created: '2024-08-29T22:37:55.202Z'
        duration: 0
        channels: 0
# JSON metadata message; no property is marked required.
metadataResponse:
  contentType: application/json
  payload:
    type: object
    description: Sent immediately after completing the WebSocket handshake
    properties:
      type:
        type: string
        description: Message type identifier
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
      model_name:
        type: string
        description: Name of the model being used
      model_version:
        type: string
        description: Version of the model being used
      model_uuid:
        type: string
        format: uuid
        description: Unique identifier for the model
# JSON error message; all three properties are required.
errorResponse:
  contentType: application/json
  description: Error information for failed requests
  payload:
    type: object
    required:
      - error_code
      - error_message
      - request_id
    properties:
      error_code:
        type: integer
        enum:
          - 400
          - 401
          - 402
          - 403
          - 429
          - 503
        description: HTTP status code equivalent
      error_message:
        type: string
        description: Error message
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
# JSON success message; `success_code` only admits 200.
successResponse:
  contentType: application/json
  description: Sent when a request completes successfully
  payload:
    type: object
    required:
      - success_code
      - success_message
      - request_id
    properties:
      success_code:
        type: integer
        enum:
          - 200
        description: HTTP status code equivalent
      success_message:
        type: string
        description: Success message
      request_id:
        type: string
        format: uuid
        description: Unique identifier for the request
closeFrame:
payload:
type: object
description: >
When Deepgram encounters an error during streaming speech to text, a
WebSocket Close frame is sent. The frame contains a status code and
UTF-8-encoded payload describing the error reason
required:
- code
- payload
properties:
code:
type: integer
enum:
- 1000
- 1008
- 1011
description: WebSocket close status code
payload:
type: string
enum:
- None
- DATA-0000
- NET-0000
- NET-0001