yottalabsai · dadaism · Feb 19, 2026 · Feb 8, 2026
diff --git a/skills/yotta-agent-skills/SKILL.md b/skills/yotta-agent-skills/SKILL.md
@@ -87,3 +87,138 @@ pod_create:
   gpuType: "H100_80G"
   gpuCount: 2
 ```
+
+---
+
+## Launch Pod
+
+Help the user configure and launch a GPU pod on Yotta Platform. A pod is an interactive GPU instance (like a VM with GPUs attached) for development, training, or batch processing.
+
+### Gather Requirements
+
+If not already clear from the conversation, ask the user for:
+
+1. **Template** (required): pytorch, unsloth, skyrl, or comfyui
+2. **Development mode**: whether to expose Jupyter (8888) and TensorBoard (6006) ports
+3. **Storage**: small (20 GB), medium (100 GB), or large (500 GB)
+
+### Pod Templates
+
+| Template | Image | Best For |
+|----------|-------|----------|
+| pytorch | `yottalabsai/pytorch:2.9.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04` | General deep learning: training, fine-tuning, research |
+| unsloth | `yottalabsai/unsloth:0.6.9-py3.11-cuda12.1-cudnn-devel-ubuntu22.04` | Fast LoRA/QLoRA fine-tuning of LLMs (2-5x speedup) |
+| skyrl | `yottalabsai/skyrl:ray2.51-py3.11-cuda12.1-cudnn-devel-ubuntu22.04` | Reinforcement learning (RLHF, PPO, GRPO) |
+| comfyui | `yottalabsai/comfyui:cuda12.8.1-ubuntu22.04-2025102101` | Image generation (Stable Diffusion, SDXL, Flux) |
+
+### Configuration Process
+
+1. **Resolve template** to Docker image from the table above.
+2. **Choose GPU:** Ask the user what model/workload they'll run, then select a GPU type and count from the catalog. GPU count must be a power of 2 (1, 2, 4, 8).
+   - pytorch / unsloth / skyrl: estimate VRAM based on model size (see GPU Selector heuristics above).
+   - comfyui: a single RTX 4090 (24 GB) or RTX 5090 (32 GB) is usually sufficient.
+3. **Set storage:** small=20 GB, medium=100 GB, large=500 GB.
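+4. **Configure ports:** Include the template's default ports (pytorch and unsloth: 8888; skyrl: 8888 and 8265; comfyui: 8188). In development mode, also add 8888 (Jupyter) and 6006 (TensorBoard) if they are not already included.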
+5. **Environment variables:** Remind the user about `HF_TOKEN` (Hugging Face) and `WANDB_API_KEY` (Weights & Biases).
+
+### Output Format
+
+Show the exact `pod_create` tool parameters. For example:
+
+```
+pod_create:
+  name: "my-unsloth-pod"
+  image: "yottalabsai/unsloth:0.6.9-py3.11-cuda12.1-cudnn-devel-ubuntu22.04"
+  gpuType: "A100_80G"
+  gpuCount: 1
+  containerVolumeInGb: 100
+  ports: [8888, 6006]
+  envVars: [{"key": "HF_TOKEN", "value": "<user's token>"}]
+```
+
+---
+
+## Serve Model
+
+Help the user deploy a model for inference on Yotta Platform — either as a pod or a serverless endpoint.
+
+### Gather Requirements
+
+If not already clear from the conversation, ask the user for:
+
+1. **Model** (required): model name or HuggingFace ID (e.g. "meta-llama/Llama-3-70B-Instruct")
+2. **Serving framework**: vLLM, TGI, Triton, or custom
+3. **Service mode** (required): POD, ALB, QUEUE, or CUSTOM
+4. **Quantization**: FP16, INT8, INT4, AWQ, or GPTQ
+
+### Serving Frameworks
+
+| Framework | Image | Best For | Port |
+|-----------|-------|----------|------|
+| vLLM | `vllm/vllm-openai:v0.7.3` | LLM inference, chat, text/code generation | 8000 |
+| TGI | `ghcr.io/huggingface/text-generation-inference:3.1.1` | LLM inference (HuggingFace ecosystem) | 80 |
+| Triton | `nvcr.io/nvidia/tritonserver:25.01-py3` | Multi-model, non-LLM, ensemble pipelines | 8000 |
+| Custom | `nvidia/cuda:12.6.3-runtime-ubuntu22.04` | Custom models, proprietary serving code | 8080 |
+
+**Selection guidance:**
+- LLMs (text generation, chat, code): use **vLLM** for best throughput or **TGI** for HuggingFace ecosystem
+- Multi-model or non-LLM (vision, audio, ensembles): use **Triton**
+- Custom inference code: use **custom** base image
+
+### Service Modes
+
+| Mode | Deploy Via | Description |
+|------|-----------|-------------|
+| POD | `pod_create` | Interactive GPU instance. Good for dev, testing, or single-user serving. |
+| ALB | `endpoint_create` | HTTP load balancer with round-robin. Real-time inference at scale. |
+| QUEUE | `endpoint_create` | Async job queue. Results via webhook. Ideal for batch/long jobs. |
+| CUSTOM | `endpoint_create` | Raw container, no built-in routing. For gRPC or custom protocols. |
+
+### VRAM Estimation for Inference
+
+Base VRAM = number of model parameters × bytes per parameter at the chosen precision. Multiply the result by 1.1-1.2x to leave headroom for the KV cache.
+
+| Precision | Bytes/param | 7B | 13B | 70B | 405B |
+|-----------|-------------|------|------|------|-------|
+| FP16/BF16 | 2 | 14 GB | 26 GB | 140 GB | 810 GB |
+| INT8 | 1 | 7 GB | 13 GB | 70 GB | 405 GB |
+| INT4/AWQ/GPTQ | 0.5 | 3.5 GB | 6.5 GB | 35 GB | 203 GB |
+
+### Configuration Process
+
+1. **Select framework** based on model type (see guidance above).
+2. **Estimate VRAM** at the chosen quantization (default FP16). Apply 1.1-1.2x overhead.
+3. **Choose GPU** type and count from the catalog. GPU count must be a power of 2.
+4. **Configure deployment** based on service mode:
+   - **POD:** Use `pod_create`. Expose the serving port. Set storage for model weights.
+   - **ALB/QUEUE/CUSTOM:** Use `endpoint_create`. Set `resources`, `workers`, `expose`, and `serviceMode`. The endpoint name must be at most 20 characters.
+5. **Set env vars** based on framework:
+   - vLLM: `MODEL_NAME`, `HF_TOKEN`
+   - TGI: `MODEL_ID`, `HUGGING_FACE_HUB_TOKEN`
+
+### Output Format — POD mode
+
+```
+pod_create:
+  name: "serve-llama3"
+  image: "vllm/vllm-openai:v0.7.3"
+  gpuType: "H100_80G"
+  gpuCount: 2
+  containerVolumeInGb: 100
+  ports: [8000]
+  envVars: [{"key": "MODEL_NAME", "value": "meta-llama/Llama-3-70B-Instruct"}]
+```
+
+### Output Format — Endpoint mode (ALB/QUEUE/CUSTOM)
+
+```
+endpoint_create:
+  name: "llama3-70b-ep"
+  image: "vllm/vllm-openai:v0.7.3"
+  resources: [{"region": "us-east-1", "gpuType": "H100_80G", "gpuCount": 2}]
+  workers: 1
+  containerVolumeInGb: 100
+  serviceMode: "ALB"
+  expose: {"port": 8000, "protocol": "HTTP"}
+  envVars: [{"key": "MODEL_NAME", "value": "meta-llama/Llama-3-70B-Instruct"}]
+```
diff --git a/src/prompts/index.ts b/src/prompts/index.ts
@@ -1,6 +1,10 @@
 import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { registerGpuSelectorPrompt } from "./gpu-selector.js";
+import { registerLaunchPodPrompt } from "./launch-pod.js";
+import { registerServeModelPrompt } from "./serve-model.js";
 
+/**
+ * Registers every prompt provided by this package on the given MCP server:
+ * the GPU selector, launch-pod, and serve-model prompts, in that order.
+ */
 export function registerPrompts(server: McpServer): void {
   registerGpuSelectorPrompt(server);
+  registerLaunchPodPrompt(server);
+  registerServeModelPrompt(server);
 }
diff --git a/src/prompts/launch-pod.ts b/src/prompts/launch-pod.ts
@@ -0,0 +1,155 @@
+import { createRequire } from "node:module";
+import { z } from "zod";
+import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
+import type { GpuType } from "../api/types.js";
+
+// Build a `require` function scoped to this module's URL and use it to load
+// the GPU catalog JSON once, when this module is first imported. The catalog
+// is later embedded verbatim in the generated prompt text.
+const require = createRequire(import.meta.url);
+const gpus: GpuType[] = require("../resources/gpus.json");
+
+/** A preset pod configuration that the launch-pod prompt can resolve by id. */
+interface PodTemplate {
+  /** Identifier matched against the prompt's `template` argument. */
+  id: string;
+  /** Container image reference used for the pod. */
+  image: string;
+  /** Short summary shown in the "Description" column of the template table. */
+  description: string;
+  /** Workloads the template targets; shown in the "Best For" column. */
+  bestFor: string;
+  /** Ports exposed for this template before any development-mode ports are added. */
+  defaultPorts: number[];
+}
+
+// Templates offered by the launch-pod prompt. The ids mirror the `template`
+// enum in the prompt's args schema; each entry is rendered as a row of the
+// "Pod Templates" table and is used to resolve the image and default ports.
+// NOTE(review): 8888 is described elsewhere in this file as the Jupyter port;
+// 8265 and 8188 are presumably the Ray dashboard and ComfyUI UI — confirm.
+const POD_TEMPLATES: PodTemplate[] = [
+  {
+    id: "pytorch",
+    image: "yottalabsai/pytorch:2.9.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04",
+    description: "PyTorch 2.9 with CUDA 12.8, cuDNN, Python 3.11",
+    bestFor: "General-purpose deep learning: training, fine-tuning, research",
+    defaultPorts: [8888],
+  },
+  {
+    id: "unsloth",
+    image: "yottalabsai/unsloth:0.6.9-py3.11-cuda12.1-cudnn-devel-ubuntu22.04",
+    description: "Unsloth 0.6 for fast LLM fine-tuning with CUDA 12.1",
+    bestFor: "Fast LoRA/QLoRA fine-tuning of LLMs (2-5x speedup over standard)",
+    defaultPorts: [8888],
+  },
+  {
+    id: "skyrl",
+    image: "yottalabsai/skyrl:ray2.51-py3.11-cuda12.1-cudnn-devel-ubuntu22.04",
+    description: "SkyRL on Ray 2.51 with CUDA 12.1, Python 3.11",
+    bestFor: "Reinforcement learning from human feedback (RLHF), PPO, GRPO",
+    defaultPorts: [8888, 8265],
+  },
+  {
+    id: "comfyui",
+    image: "yottalabsai/comfyui:cuda12.8.1-ubuntu22.04-2025102101",
+    description: "ComfyUI node-based Stable Diffusion interface with CUDA 12.8",
+    bestFor: "Image generation workflows, Stable Diffusion, SDXL, Flux",
+    defaultPorts: [8188],
+  },
+];
+
+/**
+ * Builds the prompt text returned by the `launch-pod` prompt.
+ *
+ * The text contains the caller's stated requirements, the template table, the
+ * GPU catalog (embedded as JSON), storage guidelines, and numbered
+ * instructions ending in an example `pod_create` call.
+ *
+ * @param args.template - Template id. An id that is not in `POD_TEMPLATES`
+ *   falls back to port 8888 and an instruction to pick a template from the
+ *   table, instead of a fabricated image name.
+ * @param args.forDevelopment - When true, ports 8888 and 6006 are appended to
+ *   the template's default ports (without duplicates).
+ * @param args.storage - "small", "medium", or "large". Any other value, or an
+ *   omitted one, resolves to the 20 GB default and is reported as a default
+ *   rather than as the user's preference.
+ * @returns The prompt sections joined by blank lines.
+ */
+function buildPromptText(args: {
+  template: string;
+  forDevelopment?: boolean;
+  storage?: string;
+}): string {
+  const sections: string[] = [];
+
+  sections.push(`# Pod Launch Assistant
+
+You are a GPU cloud infrastructure advisor for Yotta Platform. Help the user configure and launch a GPU pod. A pod is an interactive GPU instance (like a VM with GPUs attached) suitable for development, training, or batch processing.`);
+
+  // User requirements: only list what the caller actually supplied.
+  const reqs: string[] = [`- **Template:** ${args.template}`];
+  if (args.forDevelopment !== undefined)
+    reqs.push(`- **Development mode:** ${args.forDevelopment ? "yes (expose Jupyter & TensorBoard)" : "no"}`);
+  if (args.storage) reqs.push(`- **Storage:** ${args.storage}`);
+  sections.push(`## User Requirements\n${reqs.join("\n")}`);
+
+  // Template details
+  const templateTable = POD_TEMPLATES.map(
+    (t) => `| ${t.id} | \`${t.image}\` | ${t.description} | ${t.bestFor} | ${t.defaultPorts.join(", ")} |`
+  ).join("\n");
+  sections.push(`## Pod Templates
+
+| Template | Image | Description | Best For | Default Ports |
+|----------|-------|-------------|----------|---------------|
+${templateTable}`);
+
+  // GPU catalog
+  sections.push(`## Available GPUs on Yotta Platform\n\`\`\`json\n${JSON.stringify(gpus, null, 2)}\n\`\`\``);
+
+  // Storage guidelines
+  sections.push(`## Storage Guidelines
+
+| Size | Volume (GB) | Use Case |
+|------|-------------|----------|
+| small | 20 | Small models, code-only development |
+| medium | 100 | Medium models (7B-13B), datasets up to 50 GB |
+| large | 500 | Large models (70B+), large datasets, checkpoints |`);
+
+  // Resolve selected template
+  const selected = POD_TEMPLATES.find((t) => t.id === args.template);
+
+  // Resolve storage. A Map is used so that arbitrary strings can never hit
+  // inherited object properties; unknown or missing sizes use the 20 GB default.
+  const storageSizes = new Map<string, number>([
+    ["small", 20],
+    ["medium", 100],
+    ["large", 500],
+  ]);
+  const requestedGb = args.storage === undefined ? undefined : storageSizes.get(args.storage);
+  const storageGb = requestedGb ?? 20;
+
+  const ports = selected ? [...selected.defaultPorts] : [8888];
+  if (args.forDevelopment) {
+    if (!ports.includes(8888)) ports.push(8888);
+    if (!ports.includes(6006)) ports.push(6006);
+  }
+
+  // Word the image and storage steps according to what could be resolved, so
+  // the prompt never presents a placeholder as an image or a default as a preference.
+  const imageStep = selected
+    ? `Use \`${selected.image}\` for the ${args.template} template.`
+    : `"${args.template}" is not a known template. Ask the user to pick one of the templates above and use its image.`;
+  const storageStep =
+    requestedGb !== undefined
+      ? `Set \`containerVolumeInGb\` to ${storageGb} based on the user's storage preference.`
+      : `No storage size was specified; default \`containerVolumeInGb\` to ${storageGb} and confirm it with the user (see Storage Guidelines).`;
+  const exampleImage = selected?.image ?? "<image from the templates table>";
+
+  sections.push(`## Instructions
+
+1. **Image:** ${imageStep}
+2. **GPU selection:** Based on the template's use case, recommend an appropriate GPU type and count from the catalog. GPU count must be a power of 2 (1, 2, 4, 8).
+   - **pytorch / unsloth / skyrl:** Ask the user what model they plan to work with, then estimate VRAM to pick the right GPU.
+   - **comfyui:** A single RTX 4090 (24 GB) or RTX 5090 (32 GB) is usually sufficient for SDXL/Flux image generation.
+3. **Storage:** ${storageStep}
+4. **Ports:** Expose ports ${JSON.stringify(ports)}.${args.forDevelopment ? " Development mode: includes Jupyter (8888) and TensorBoard (6006)." : ""}
+5. **Environment variables:** Remind the user about common env vars:
+   - \`HF_TOKEN\` — Hugging Face access token (for gated models)
+   - \`WANDB_API_KEY\` — Weights & Biases experiment tracking
+6. **Output the final \`pod_create\` call.** Example:
+
+\`\`\`
+pod_create:
+  name: "my-${args.template}-pod"
+  image: "${exampleImage}"
+  gpuType: "H100_80G"
+  gpuCount: 1
+  containerVolumeInGb: ${storageGb}
+  ports: ${JSON.stringify(ports)}
+\`\`\``);
+
+  return sections.join("\n\n");
+}
+
+/**
+ * Registers the `launch-pod` prompt on the given MCP server.
+ *
+ * MCP prompt arguments are transmitted as strings, so `forDevelopment` is
+ * declared as the literal strings "true" / "false" (a boolean schema could
+ * never validate a spec-compliant request) and is converted to a boolean
+ * before the prompt text is built.
+ *
+ * @param server - MCP server on which the prompt is registered.
+ */
+export function registerLaunchPodPrompt(server: McpServer): void {
+  server.registerPrompt(
+    "launch-pod",
+    {
+      description:
+        "Configure and launch a GPU pod from a preset template (PyTorch, Unsloth, SkyRL, ComfyUI)",
+      argsSchema: {
+        template: z
+          .enum(["pytorch", "unsloth", "skyrl", "comfyui"])
+          .describe("Image preset: pytorch, unsloth (LLM fine-tuning), skyrl (RL/RLHF), comfyui (image gen)"),
+        forDevelopment: z
+          .enum(["true", "false"])
+          .optional()
+          .describe('If "true", expose Jupyter (8888) and TensorBoard (6006) ports'),
+        storage: z
+          .enum(["small", "medium", "large"])
+          .optional()
+          .describe("Storage size: small (20 GB), medium (100 GB), large (500 GB)"),
+      },
+    },
+    async ({ template, forDevelopment, storage }) => ({
+      messages: [
+        {
+          role: "user" as const,
+          content: {
+            type: "text" as const,
+            text: buildPromptText({
+              template,
+              storage,
+              // Keep `undefined` when the argument is omitted so the
+              // "Development mode" requirement line is skipped entirely.
+              forDevelopment: forDevelopment === undefined ? undefined : forDevelopment === "true",
+            }),
+          },
+        },
+      ],
+    })
+  );
+}
diff --git a/src/prompts/prompts.test.ts b/src/prompts/prompts.test.ts
@@ -15,6 +15,34 @@ describe("prompt registration", () => {
     expect(promptNames).toEqual(["gpu-selector"]);
   });
 
+  // The launch-pod module must register exactly one prompt, named "launch-pod".
+  it("registers launch-pod prompt", async () => {
+    const registerPrompt = vi.fn();
+    const { registerLaunchPodPrompt } = await import("./launch-pod.js");
+
+    registerLaunchPodPrompt({ registerPrompt } as any);
+
+    // The prompt name is the first positional argument of each call.
+    const registeredNames = registerPrompt.mock.calls.map((callArgs) => callArgs[0] as string);
+    expect(registeredNames).toEqual(["launch-pod"]);
+  });
+
+  // The serve-model module must register exactly one prompt, named "serve-model".
+  it("registers serve-model prompt", async () => {
+    const registerPrompt = vi.fn();
+    const { registerServeModelPrompt } = await import("./serve-model.js");
+
+    registerServeModelPrompt({ registerPrompt } as any);
+
+    // The prompt name is the first positional argument of each call.
+    const registeredNames = registerPrompt.mock.calls.map((callArgs) => callArgs[0] as string);
+    expect(registeredNames).toEqual(["serve-model"]);
+  });
+
   it("registerPrompts registers all prompts", async () => {
     const promptNames: string[] = [];
     const mockServer = {
@@ -26,7 +54,9 @@ describe("prompt registration", () => {
     const { registerPrompts } = await import("./index.js");
     registerPrompts(mockServer as any);
 
-    expect(promptNames).toHaveLength(1);
+    expect(promptNames).toHaveLength(3);
     expect(promptNames).toContain("gpu-selector");
+    expect(promptNames).toContain("launch-pod");
+    expect(promptNames).toContain("serve-model");
   });
 });