Seedance 2.0 is one of the first AI video generation models to produce synchronized audio in a single forward pass, without separate audio post-processing.
Access Seedance 2.0's native audio capabilities via Atlas Cloud.
Most video generation pipelines produce video and audio in separate stages:
Prompt → Video model → Silent video → Audio model → Audio → Merge → Final video
Seedance 2.0 uses a Dual-Branch Diffusion Transformer that processes both modalities in parallel:
Prompt → [Video branch ←→ Audio branch] → Video + Audio simultaneously
The two branches share cross-modal attention layers, allowing audio timing to directly influence video motion (lip sync, footstep timing, beat-synchronized visuals) and vice versa.
Describe the audio in your text prompt and Seedance 2.0 will generate matching sound:
import requests
import time
import os
# Atlas Cloud API key, read from the ATLASCLOUD_API_KEY environment variable.
# Indexing os.environ raises KeyError right here if the variable is not set.
API_KEY = os.environ["ATLASCLOUD_API_KEY"]
# Root URL that the endpoint paths in the visible code are appended to.
BASE_URL = "https://api.atlascloud.ai"
def generate_with_audio(prompt: str, **kwargs) -> str:
    """Generate a video with native audio from a text prompt.

    Submits a text-to-video job and then polls the prediction endpoint
    every 3 seconds until the job reports a terminal status.

    Args:
        prompt: Text description of the scene, including the sound it
            should have.
        **kwargs: Extra fields merged into the request's ``input`` object
            next to ``prompt``.

    Returns:
        The first entry of the finished prediction's ``outputs`` list
        (presumably the URL of the generated video).

    Raises:
        requests.HTTPError: If the submit request or a poll request comes
            back with an HTTP error status.
        requests.Timeout: If a single request gets no response within
            30 seconds.
        RuntimeError: If the prediction reports status ``"failed"``; the
            message is the response's ``error`` field.
    """
    auth_header = {"Authorization": f"Bearer {API_KEY}"}
    payload = {
        "model": "bytedance/seedance-2.0/text-to-video",
        "input": {"prompt": prompt, **kwargs},
    }

    resp = requests.post(
        f"{BASE_URL}/api/v1/model/generateVideo",
        headers={"Content-Type": "application/json", **auth_header},
        json=payload,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    # Surface 4xx/5xx responses as HTTPError instead of a confusing KeyError
    # when the error body has no "data" field.
    resp.raise_for_status()
    prediction_id = resp.json()["data"]["id"]

    poll_url = f"{BASE_URL}/api/v1/model/prediction/{prediction_id}"
    # NOTE: there is no overall deadline; the loop runs until the service
    # reports success or failure.
    while True:
        poll = requests.get(poll_url, headers=auth_header, timeout=30)
        poll.raise_for_status()
        result = poll.json()["data"]
        status = result["status"]
        if status in ("completed", "succeeded"):
            return result["outputs"][0]
        if status == "failed":
            raise RuntimeError(result.get("error"))
        time.sleep(3)
# Rain and thunder: the ambient sound is described directly in the prompt.
video_url = generate_with_audio(
    "Heavy rain falling on city streets at night, thunder rumbling in the distance, "
    "neon lights reflecting in puddles, cinematic wide shot"
)

# Music performance: the prompt names the instrument and the style of audio.
video_url = generate_with_audio(
    "A jazz pianist playing an upbeat melody in a dimly lit club, "
    "close-up on hands moving across keys, warm amber lighting, smooth jazz audio"
)

# Nature soundscape: several layered ambient sounds in one prompt.
video_url = generate_with_audio(
    "A peaceful forest stream flowing over mossy rocks, birds chirping, "
    "leaves rustling in gentle breeze, morning sunlight filtering through trees"
)

# Provide a reference audio track and Seedance 2.0 will generate video synchronized to it:
def generate_from_audio(audio_url: str, prompt: str, **kwargs) -> str:
    """Generate video synchronized to a reference audio track.

    Args:
        audio_url: URL of the reference audio, sent as the ``audio`` field
            of the request's ``input`` object.
        prompt: Text description of the visuals to generate.
        **kwargs: Extra fields merged into the request's ``input`` object.

    Returns:
        Whatever ``generate_with_audio`` returns for the finished job: the
        first entry of the prediction's ``outputs`` list.

    Raises:
        RuntimeError: If the prediction reports status ``"failed"``.
    """
    # This job uses the same model, endpoints and polling as
    # generate_with_audio; the only difference is the extra "audio" input
    # field, so delegate instead of repeating the submit/poll code.
    # Building the dict first keeps the original merge rule: an "audio" key
    # in kwargs overrides audio_url rather than raising a duplicate-keyword
    # TypeError.
    return generate_with_audio(prompt, **{"audio": audio_url, **kwargs})
# Upload the local audio file; the response's data.url is used below as the
# reference track.
with open("my_music.mp3", "rb") as f:
    upload_resp = requests.post(
        f"{BASE_URL}/api/v1/model/uploadMedia",
        headers={"Authorization": f"Bearer {API_KEY}"},
        files={"file": f},
        timeout=60,  # requests waits indefinitely unless a timeout is given
    )
# Fail with HTTPError on a rejected upload instead of a KeyError on "data".
upload_resp.raise_for_status()
audio_upload = upload_resp.json()["data"]["url"]

# Generate music video synchronized to the track
video_url = generate_from_audio(
    audio_url=audio_upload,
    prompt="Abstract colorful geometric shapes pulsing and transforming in rhythm, "
    "dark background, vibrant neon colors, music visualizer aesthetic"
)

# Seedance 2.0 supports phoneme-level lip synchronization across 8+ languages. Provide a portrait image + dialogue audio:
def generate_talking_head(
    portrait_url: str,
    dialogue_audio_url: str,
    prompt: str = "The person speaks naturally, realistic head movement, professional lighting",
) -> str:
    """Generate a lip-synced talking-head video from a portrait and dialogue audio.

    Submits an image-to-video job and then polls the prediction endpoint
    every 3 seconds until the job reports a terminal status.

    Args:
        portrait_url: URL of the portrait image, sent as the ``image`` input.
        dialogue_audio_url: URL of the dialogue audio, sent as the ``audio``
            input.
        prompt: Text guidance for the motion and look; defaults to the
            prompt this function always used before it became a parameter.

    Returns:
        The first entry of the finished prediction's ``outputs`` list
        (presumably the URL of the generated video).

    Raises:
        requests.HTTPError: If the submit request or a poll request comes
            back with an HTTP error status.
        requests.Timeout: If a single request gets no response within
            30 seconds.
        RuntimeError: If the prediction reports status ``"failed"``; the
            message is the response's ``error`` field.
    """
    auth_header = {"Authorization": f"Bearer {API_KEY}"}
    payload = {
        "model": "bytedance/seedance-2.0/image-to-video",
        "input": {
            "image": portrait_url,
            "audio": dialogue_audio_url,
            "prompt": prompt,
        },
    }

    resp = requests.post(
        f"{BASE_URL}/api/v1/model/generateVideo",
        headers={"Content-Type": "application/json", **auth_header},
        json=payload,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    # Surface 4xx/5xx responses as HTTPError instead of a confusing KeyError
    # when the error body has no "data" field.
    resp.raise_for_status()
    prediction_id = resp.json()["data"]["id"]

    poll_url = f"{BASE_URL}/api/v1/model/prediction/{prediction_id}"
    # NOTE: there is no overall deadline; the loop runs until the service
    # reports success or failure.
    while True:
        poll = requests.get(poll_url, headers=auth_header, timeout=30)
        poll.raise_for_status()
        result = poll.json()["data"]
        status = result["status"]
        if status in ("completed", "succeeded"):
            return result["outputs"][0]
        if status == "failed":
            raise RuntimeError(result.get("error"))
        time.sleep(3)


# Supported languages for lip sync: English, Mandarin Chinese, Spanish, French, German, Japanese, Korean, Portuguese, and more.
| Use Case | Approach |
|---|---|
| Music video | Reference audio track + abstract/narrative prompt |
| Talking head / avatar | Portrait image + dialogue audio |
| Product ad with voiceover | Product image + VO audio + motion prompt |
| Nature/ambient ASMR | Text prompt with detailed soundscape description |
| Presentation slides | Slide image + narration audio |