Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions .airstack/modules/osmo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -686,13 +686,207 @@ function cmd_osmo_foxglove {
--connect-timeout "$OSMO_PF_TIMEOUT"
}

# osmo:mission — submit airstack-mission.yaml with a mission spec selected.
#
# Usage: airstack osmo:mission <mission.yaml> [--pool POOL] [--key PATH]
#                              [--branch BRANCH] [--no-keep-alive]
#
# Arguments:
#   <mission.yaml>   Mission spec; an absolute path under PROJECT_ROOT or a
#                    repo-relative path (e.g. osmo/missions/example_takeoff_land.yaml).
#                    The pod clones the branch and runs the mission spec from
#                    that clone, so the file must be committed and pushed.
#   --pool POOL      OSMO pool to submit to (defaults to $OSMO_POOL, else the
#                    osmo profile's default pool).
#   --key PATH       SSH public key file to inject (defaults to the key
#                    chosen by _osmo_pick_pubkey).
#   --branch BRANCH  Branch the pod clones (defaults to the local branch).
#   --no-keep-alive  Make the task exit when the mission ends (frees the GPU,
#                    triggers the workflow's `outputs:` upload) instead of
#                    sleeping for `airstack osmo:fetch`.
#   Any other argument is passed through to `osmo workflow submit`.
#
# Returns: 0 on a successful submit (even if the workflow id could not be
#          parsed from the output), 1 on any usage or submit error.
function cmd_osmo_mission {
    _osmo_check_cli || return 1

    local mission=""
    local pool="${OSMO_POOL:-}"
    local pubkey_file=""
    local branch=""
    local branch_explicit=false
    local keep_alive="true"
    local extra_args=()

    while [ $# -gt 0 ]; do
        case "$1" in
            --pool|--key|--branch)
                # `shift 2` with a single remaining argument fails without
                # shifting anything, which would spin this loop forever.
                if [ $# -lt 2 ]; then
                    log_error "Option $1 requires a value."
                    return 1
                fi
                case "$1" in
                    --pool)   pool="$2" ;;
                    --key)    pubkey_file="$2" ;;
                    --branch) branch="$2"; branch_explicit=true ;;
                esac
                shift 2 ;;
            --no-keep-alive) keep_alive="false"; shift ;;
            -*) extra_args+=("$1"); shift ;;
            *)
                # First bare word is the mission file; the rest pass through.
                if [ -z "$mission" ]; then mission="$1"; else extra_args+=("$1"); fi
                shift ;;
        esac
    done

    if [ -z "$mission" ]; then
        log_error "Usage: airstack osmo:mission <mission.yaml> [--pool POOL] [--key PATH] [--branch BRANCH] [--no-keep-alive]"
        log_error "Available missions:"
        # Glob instead of parsing `ls`; print repo-relative paths on stderr.
        local candidate
        for candidate in "${PROJECT_ROOT}/osmo/missions/"*.yaml; do
            [ -e "$candidate" ] || continue
            printf ' %s\n' "${candidate#"${PROJECT_ROOT}"/}" >&2
        done
        return 1
    fi
    # Normalize to a repo-relative path — that's what the pod resolves
    # against its clone of the branch.
    mission="${mission#"${PROJECT_ROOT}"/}"
    if [ ! -f "${PROJECT_ROOT}/${mission}" ]; then
        log_error "Mission file not found locally: ${PROJECT_ROOT}/${mission}"
        return 1
    fi

    if [ -z "$pubkey_file" ]; then
        if ! pubkey_file="$(_osmo_pick_pubkey)"; then
            log_error "No SSH public key found in ~/.ssh. Generate one with: ssh-keygen -t ed25519"
            return 1
        fi
    fi
    # Fail loudly here rather than submitting a workflow with an empty
    # SSH_PUB_KEY when the key path is wrong.
    if [ ! -r "$pubkey_file" ]; then
        log_error "SSH public key file not readable: ${pubkey_file}"
        return 1
    fi
    local pubkey
    pubkey="$(cat "$pubkey_file")" || return 1

    local workflow_yaml="${PROJECT_ROOT}/osmo/workflows/airstack-mission.yaml"
    if [ ! -f "$workflow_yaml" ]; then
        log_error "Workflow file not found: ${workflow_yaml}"
        return 1
    fi

    # The pod runs the mission file from its clone of origin/<branch>, so an
    # unpushed mission spec is the most common "why is it running the wrong
    # thing" failure — same auto-pin + pushed check as osmo:up.
    if [ "$branch_explicit" = false ] && [ -z "$branch" ]; then
        branch="$(_osmo_local_branch)"
        if [ -n "$branch" ]; then
            log_info "Auto-detected local branch '${branch}'; pod will clone from origin/${branch} (override with --branch main)."
        fi
    fi
    if [ -n "$branch" ]; then
        _osmo_check_branch_pushed "$branch"
    fi

    local cmd=(osmo workflow submit "$workflow_yaml")
    if [ -n "$pool" ]; then
        cmd+=(--pool "$pool")
    else
        log_warn "No --pool provided and OSMO_POOL is unset; using your osmo profile's default pool."
    fi
    # Single --set-env: the flag is variadic and a second occurrence silently
    # drops the first (see cmd_osmo_up).
    local env_kvs=(
        "SSH_PUB_KEY=${pubkey}"
        "OSMO_MISSION_FILE=${mission}"
        "OSMO_MISSION_KEEP_ALIVE=${keep_alive}"
    )
    if [ -n "$branch" ]; then
        env_kvs+=("AIRSTACK_BRANCH=${branch}")
    fi
    cmd+=(--set-env "${env_kvs[@]}")
    # Length guard keeps the empty-array expansion out of the command line.
    if [ ${#extra_args[@]} -gt 0 ]; then
        cmd+=("${extra_args[@]}")
    fi

    log_info "Submitting mission '${mission}' (keep_alive=${keep_alive}): ${cmd[*]}"
    local output
    if ! output="$("${cmd[@]}" 2>&1)"; then
        printf '%s\n' "$output" >&2
        log_error "osmo workflow submit failed."
        return 1
    fi
    printf '%s\n' "$output"

    # The id is taken from the first line starting with "Workflow ID",
    # i.e. whatever follows the "- " separator on that line.
    local wf_id
    wf_id="$(printf '%s\n' "$output" | awk -F'- ' '/^Workflow ID/ {print $2; exit}' | tr -d ' \r\n')"
    if [ -z "$wf_id" ]; then
        log_warn "Could not parse workflow id from submit output. Set it manually:"
        log_warn " echo <wf-id> > ${OSMO_STATE_FILE}"
        return 0
    fi
    _osmo_save_wf_id "$wf_id"

    log_info "Next steps:"
    log_info " airstack osmo:logs # follow mission progress"
    log_info " airstack osmo:fetch # download bags + results (keep-alive mode)"
    log_info " airstack osmo:down # cancel when done (results die with the pod!)"
}

# osmo:fetch — download mission results (mcap bags, logs, summaries) from the
# pod to the laptop over the authenticated ssh port-forward.
#
# Usage: airstack osmo:fetch [dest-dir]
#
# Arguments:
#   dest-dir   Local destination directory (default: ./osmo-results); created
#              if missing.
#
# Returns: 0 when the transfer succeeds, 1 if the workflow id is unknown, the
#          port-forward cannot be established, or the transfer fails.
#
# Incremental and resumable (rsync): safe to run mid-mission to pull finished
# iterations while the next one flies, and again later to top up. Falls back
# to scp -r (non-incremental) if rsync isn't installed.
#
# Note: the osmo CLI also ships `osmo workflow rsync download`, which could
# replace the port-forward + ssh below; we use the ssh path because the
# sshd + port-forward channel is already validated infrastructure for this
# workflow (osmo:ide) and works uniformly across osmo CLI versions.
function cmd_osmo_fetch {
    _osmo_check_cli || return 1
    local wf
    wf="$(_osmo_wf_id)" || return 1

    local dest="${1:-./osmo-results}"
    # Trailing slash: rsync follows the symlink to /osmo/output/... on pods.
    local remote_path="/root/AirStack/osmo/results/"
    # OSMO_SSH_PORT is "<local>:<remote>"; only the local half is dialed.
    local local_port="${OSMO_SSH_PORT%%:*}"
    # Port-independent options shared by ssh and scp. The port is added per
    # tool below because ssh spells it -p while scp spells it -P (scp's -p
    # means "preserve times/modes" and takes no argument).
    local ssh_opts=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR)

    # Reuse an existing ssh port-forward (e.g. from osmo:ide) or spawn one
    # for the duration of the fetch — same pattern as osmo:ide.
    local pf_pid=""
    if ! nc -z localhost "$local_port" 2>/dev/null; then
        log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_SSH_PORT} (for the duration of the fetch)"
        osmo workflow port-forward "$wf" workspace --port "$OSMO_SSH_PORT" --connect-timeout 600 \
            > "${OSMO_STATE_DIR}/fetch-pf.log" 2>&1 &
        pf_pid=$!
        # The pid is baked into the trap text now, since the local variable
        # is out of scope by the time an EXIT trap fires.
        trap "kill ${pf_pid} 2>/dev/null; trap - EXIT INT TERM" EXIT INT TERM
        local waited=0
        until nc -z localhost "$local_port" 2>/dev/null; do
            sleep 1
            waited=$((waited + 1))
            if [ "$waited" -ge 30 ]; then
                log_error "Timed out waiting for port-forward on :${local_port}. Log: ${OSMO_STATE_DIR}/fetch-pf.log"
                # Don't leave the half-started forward (or our trap) behind.
                kill "$pf_pid" 2>/dev/null
                trap - EXIT INT TERM
                return 1
            fi
            if ! kill -0 "$pf_pid" 2>/dev/null; then
                log_error "port-forward exited early. Tail:"
                tail -10 "${OSMO_STATE_DIR}/fetch-pf.log" >&2
                trap - EXIT INT TERM
                return 1
            fi
        done
    fi

    mkdir -p "$dest"
    log_info "Fetching ${remote_path} → ${dest}"
    local rc
    if command -v rsync >/dev/null 2>&1; then
        # rsync takes the remote shell as one string; none of the options
        # contain whitespace, so joining with spaces is safe.
        rsync -az --partial --info=progress2 -e "ssh -p ${local_port} ${ssh_opts[*]}" \
            "root@localhost:${remote_path}" "$dest/"
        rc=$?
    else
        log_warn "rsync not found; falling back to scp -r (non-incremental)."
        scp -P "$local_port" "${ssh_opts[@]}" -r "root@localhost:${remote_path}." "$dest/"
        rc=$?
    fi

    # Only tear down a port-forward this invocation started.
    if [ -n "$pf_pid" ]; then
        kill "$pf_pid" 2>/dev/null
        trap - EXIT INT TERM
    fi

    if [ "$rc" -ne 0 ]; then
        log_error "Fetch failed (exit ${rc}). Is the workflow still running, and has the mission produced results yet?"
        return 1
    fi
    log_info "Done. Open any .mcap under ${dest} directly in Foxglove (Open local file)."
}

# osmo:down — cancel the active workflow. Reminds you to push first.
function cmd_osmo_down {
_osmo_check_cli || return 1
local wf; wf="$(_osmo_wf_id)" || return 1

log_warn "About to cancel workflow '${wf}'."
log_warn "Anything not pushed to git in /root/AirStack inside the pod will be LOST."
log_warn "Mission results (bags/logs) on the pod are lost too — run 'airstack osmo:fetch' first."
log_warn "Hit Ctrl-C in the next 5 seconds to abort."
sleep 5
osmo workflow cancel "$wf"
Expand All @@ -703,6 +897,8 @@ function cmd_osmo_down {
function register_osmo_commands {
COMMANDS["osmo:setup"]="cmd_osmo_setup"
COMMANDS["osmo:up"]="cmd_osmo_up"
COMMANDS["osmo:mission"]="cmd_osmo_mission"
COMMANDS["osmo:fetch"]="cmd_osmo_fetch"
COMMANDS["osmo:logs"]="cmd_osmo_logs"
COMMANDS["osmo:ide"]="cmd_osmo_ide"
COMMANDS["osmo:webrtc"]="cmd_osmo_webrtc"
Expand All @@ -711,6 +907,8 @@ function register_osmo_commands {

COMMAND_HELP["osmo:setup"]="One-time per-user OSMO credential setup (airlab-docker-registry, airlab-docker-login, airlab-nucleus)"
COMMAND_HELP["osmo:up"]="Submit osmo/workflows/airstack-dev.yaml with your SSH pubkey injected (--pool POOL, --key PATH, --branch BRANCH)"
COMMAND_HELP["osmo:mission"]="Submit a batch mission (osmo/missions/*.yaml): repeated up→fly→record→down cycles (--pool POOL, --branch BRANCH, --no-keep-alive)"
COMMAND_HELP["osmo:fetch"]="Download mission results (mcap bags, logs, summaries) from the pod over ssh — incremental, safe to run mid-mission (osmo:fetch [dest-dir])"
COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (osmo workflow logs <id> -t workspace -n 500; OSMO_LOGS_TASK / OSMO_LOGS_TAIL override)"
COMMAND_HELP["osmo:ide"]="Port-forward sshd (2200:22) and open VS Code/Cursor on Host airstack-osmo"
COMMAND_HELP["osmo:webrtc"]="Port-forward Isaac Sim WebRTC ranges (TCP foreground + UDP background)"
Expand Down
2 changes: 1 addition & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ PROJECT_NAME="airstack"
# If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made
# to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version.
# auto-generated from git commit hash
VERSION="0.19.0-alpha.3"
VERSION="8b927e46"
# Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live. "prebuilt" is for built ros_ws baked into the image
DOCKER_IMAGE_BUILD_MODE="dev"
# Where to push and pull images from. Can replace with your docker hub username if using docker hub.
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@ simulation/ms-airsim/assets/scenes/*
# Test results
tests/results/

# OSMO mission results: local runs of osmo/workspace/mission_runner.py write
# to osmo/results/; `airstack osmo:fetch` downloads to ./osmo-results/.
osmo/results/
osmo-results/

# Local-only — embedded sibling repo, not part of this branch
common/rayfronts/

Expand Down
42 changes: 41 additions & 1 deletion docs/tutorials/airstack_on_osmo.md
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,45 @@ osmo workflow cancel $WF

</details>

## Batch missions (unattended runs)

Everything above is the *interactive* workflow. The same pod can instead run
**missions**: declarative YAML files (in
[`osmo/missions/`](https://github.com/castacks/AirStack/blob/main/osmo/missions/))
that script repeated cycles of bring-up → fly → record → tear-down with no
human attached. Each iteration restarts the containers, records mcap bag
files (Foxglove's native format — open the `.mcap` directly, no conversion),
and snapshots container logs and per-step results.

```bash
# Submit a mission (auto-pins your current branch, like osmo:up):
./airstack.sh osmo:mission osmo/missions/example_takeoff_land.yaml --pool airstack

# Watch it fly:
./airstack.sh osmo:logs

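# Pull bags + logs + summaries to your laptop — incremental, safe to run
# mid-mission and again later to top up. The destination is optional; with
# no argument it defaults to ./osmo-results/ (which is gitignored):
./airstack.sh osmo:fetch ./results/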

# When you have everything (results die with the pod!):
./airstack.sh osmo:down
```

A mission step can be any robot task action (`takeoff`, `land`, `navigate`,
`semantic_search`, `exploration`, `coverage`, …), a timed wait, a topic pub,
a service call, or an arbitrary `ros2`/shell command. Spec schema and step
reference:
[`osmo/missions/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/missions/README.md).

By default the pod **stays alive after the mission ends** so you can
`osmo:fetch` whenever you're ready (mind the workflow's 24h `exec_timeout`).
For fire-and-forget batches, submit with `--no-keep-alive`: the pod exits
cleanly when the mission ends, freeing the GPU — and uploading the results
directory to object storage automatically if the workflow's `outputs:`
block is configured (lab-admin setup; see
[`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md)).

## Troubleshooting

| Symptom | Likely cause | Fix |
Expand All @@ -572,7 +611,8 @@ osmo workflow cancel $WF
| Uncommitted edits in the IDE | Pod-local working tree | **No** |
| `colcon build` outputs (`build/`, `install/`, `log/`) | `/root/AirStack/**/ros_ws/...` | **No** (gitignored Linux x86_64 binaries; rebuild trivially) |
| Inner-dockerd image cache | Pod-local Docker layer cache | **No** |
| Bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download "$(cat ~/.airstack/osmo-state)" <pod-path>:<local-path>` *before* tearing down |
| Mission results (mcap bags, logs, summaries) | `/root/AirStack/osmo/results/` | **No** — run `./airstack.sh osmo:fetch` *before* tearing down |
| Other bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download "$(cat ~/.airstack/osmo-state)" <pod-path>:<local-path>` *before* tearing down |

The rule of thumb: **commit + push every time you'd save a file in a
git-tracked sense.** The Source Control panel is the persistence boundary.
Expand Down
59 changes: 54 additions & 5 deletions osmo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@ through [NVIDIA OSMO](https://github.com/NVIDIA/OSMO):
osmo/
├── README.md # This file (admin / operator reference)
├── workflows/
│ └── airstack-dev.yaml # The OSMO workflow students submit
│ ├── airstack-dev.yaml # Interactive dev workflow (IDE over Remote-SSH)
│ └── airstack-mission.yaml # Batch mission workflow (unattended flights)
├── missions/
│ ├── README.md # Mission spec schema reference
│ └── example_takeoff_land.yaml # Reference mission: takeoff → hover → land ×3
└── workspace/
├── Dockerfile # The airstack-osmo-workspace image
├── sshd_config # Pubkey-only sshd config baked into the image
└── entrypoint.sh # Pod startup: sshd, dockerd, clone, airstack up
├── entrypoint.sh # Pod startup: sshd, dockerd, clone, then
│ # dev mode (airstack up) or mission mode
└── mission_runner.py # Batch executor (run from the clone, not the image)
```

The student-facing walkthrough lives in
Expand All @@ -21,9 +27,9 @@ README is the **lab admin / operator** reference: pool requirements,
workspace image build & push, validation stages, plus a credential summary
for context.

> **Scope:** developer workflow only. CI/CD on OSMO is **not** part of this
> integration — the existing `system-tests.yml` + OpenStack orchestrator path
> is unchanged.
> **Scope:** developer workflow + batch missions. CI/CD on OSMO is **not**
> part of this integration — the existing `system-tests.yml` + OpenStack
> orchestrator path is unchanged.

## Architecture in one minute

Expand All @@ -47,6 +53,49 @@ app.foxglove.dev ── ws ────► port-forward 8766 ────►
airstack.sh up brings these 3 up
```

## Mission mode (batch runs)

`airstack-mission.yaml` reuses the same workspace image and DinD pod, but
instead of one interactive `airstack up`, the entrypoint hands off to
[`workspace/mission_runner.py`](workspace/mission_runner.py), which executes
a declarative mission spec from [`missions/`](missions/) — repeated cycles of:

```
airstack down → airstack up → wait for PX4 ready → record mcap bags
→ run steps (takeoff / land / navigate / semantic search / any ros2 command)
→ collect bags + container logs → airstack down
```

Submit, monitor, and download:

```bash
airstack osmo:mission osmo/missions/example_takeoff_land.yaml --pool <gpu-pool>
airstack osmo:logs # follow mission progress
airstack osmo:fetch ./results/ # rsync bags/logs/summaries to the laptop
airstack osmo:down # cancel (fetch first — results die with the pod)
```

Key behaviors:

- **The mission spec and runner come from the clone**, not the image — what
you push on your branch is what runs. The workspace image only needs a
rebuild when `Dockerfile`, `sshd_config`, or `entrypoint.sh` change.
- **Bags are mcap** (`ros2 bag record -s mcap`) — open the `.mcap` files
directly in Foxglove, no conversion or local ROS install.
- **Results location:** `/osmo/output/airstack-mission-results/<mission>/<stamp>/`
with a symlink at `/root/AirStack/osmo/results` (the path `osmo:fetch`
pulls). Artifacts are collected even for failed iterations.
- **`OSMO_MISSION_KEEP_ALIVE`** (default `true`): the pod sleeps after the
mission so you can fetch over ssh. Set `false` (or submit with
`osmo:mission --no-keep-alive`) for fire-and-forget: the task exits
cleanly when the mission ends, freeing the GPU — and if the workflow's
`outputs:` block is configured with a destination bucket, OSMO uploads
`/osmo/output` automatically on that exit. A **canceled** workflow does
not upload outputs, so in keep-alive mode `osmo:fetch` is the retrieval
path.

Mission spec schema and step types: [`missions/README.md`](missions/README.md).

## Pool requirements

The OSMO pool the workflow runs on must satisfy:
Expand Down
Loading
Loading