meituan-longcat · rsxdalv · Apr 5, 2026 · Apr 5, 2026
diff --git a/README.md b/README.md
@@ -90,7 +90,15 @@ LongCat-AudioDiT obtains state-of-the-art (SOTA) voice cloning performance on th
 
 ## Installation
 
+### As a Python Package
 ```bash
+pip install git+https://github.com/meituan-longcat/LongCat-AudioDiT
+```
+
+### Development
+
+```bash
+git clone https://github.com/meituan-longcat/LongCat-AudioDiT && cd LongCat-AudioDiT
 pip install -r requirements.txt
 ```
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["setuptools>=61.0"]  # "wheel" not listed: setuptools.build_meta no longer needs it declared, and listing it forces it onto sdist builds
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "audiodit"
+version = "0.1.0"
+description = "AudioDiT — Conditional Flow Matching TTS with DiT backbone"
+readme = "README.md"
+requires-python = ">=3.10"  # open-ended; the classifiers below name 3.10 and 3.11 only
+license = { text = "Apache-2.0" }  # keep in sync with the License classifier below
+authors = [
+    { name = "AudioDiT Authors" },
+]
+keywords = ["tts", "diffusion", "transformer", "huggingface"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+]
+dependencies = [  # runtime requirements (PEP 508 strings), alphabetical
+    "einops>=0.8.0",
+    "librosa>=0.10.0",
+    "numpy>=1.23.5",
+    "safetensors>=0.4.0",
+    "soundfile>=0.12.0",
+    "torch>=2.0.0",
+    "torchaudio>=2.0.0",
+    "transformers>=5.3.0",
+]
+
+[project.optional-dependencies]
+dev = [  # tooling for the "dev" extra (formatter, test runner, linter), alphabetical
+    "black>=24.0",
+    "pytest>=8.0",
+    "ruff>=0.4.0",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]  # start package discovery at the project root
+include = ["audiodit*"]  # glob: only package names beginning with "audiodit" are picked up
+
+[tool.setuptools.package-data]
+audiodit = ["py.typed"]  # NOTE(review): assumes audiodit/py.typed exists in the repo; it is not part of this diff — confirm