Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/10971.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add optional --with-sftp-agent option to the dev-mode TUI installer that provisions a dedicated SFTP agent alongside the regular compute agent using the multi-agent-per-node feature, and register the 'upload' scaling group for routing SFTP upload sessions.
137 changes: 137 additions & 0 deletions configs/agent/halfstack-sftp.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
[etcd]
# etcd connection for this agent: the "local" namespace on a loopback
# endpoint, with empty credentials.
# NOTE(review): presumably these must match the primary agent's [etcd]
# settings so that both agents register with the same cluster — confirm.
namespace = "local"
addr = { host = "127.0.0.1", port = 8120 }
user = ""
password = ""


[agent]
mode = "docker"
# The SFTP agent uses its own RPC port. The primary agent's template value
# is 6001, which the dev installer rewrites to 6011.
rpc-listen-addr = { host = "127.0.0.1", port = 6013 }
# Metric API service address; must not collide with the primary agent's 6003.
service-addr = { host = "0.0.0.0", port = 6023 }
ssl-enabled = false
#ssl-cert = ""
#ssl-key = ""
# Distinct from the primary agent's default 6007.
agent-sock-port = 6017
# Distinguishable agent id so that the manager treats this as a separate node
# from the primary agent.
id = "i-local-sftp"
# Dedicated scaling group for SFTP upload sessions.
scaling-group = "upload"
# The `./dev` helper script reads `./agent-sftp.pid` to check on and stop the
# SFTP agent, so keep this path in sync with its `_service_pidfile` function.
pid-file = "./agent-sftp.pid"
event-loop = "uvloop"
# IPC and var base directories carry an `agent-sftp` suffix.
# NOTE(review): presumably this keeps them apart from the primary agent's
# directories — confirm against the primary agent's config.
ipc-base-path = "ipc/agent-sftp"
var-base-path = "var/agent-sftp"
# The SFTP agent must NOT load any accelerator plugin; otherwise it would
# try to import libraries that may not be installed in a dev environment
# (e.g. Tenstorrent's `tt_tools_common`). The allow-list is therefore set
# explicitly to an empty list instead of relying on the default
# (None → load all discovered plugins).
allow-compute-plugins = []
# block-compute-plugins = []
image-commit-path = "./tmp/backend.ai/commit/"
# Share the same vfolder root as the primary agent so that files uploaded
# via SFTP are visible to compute sessions running on the primary agent.
# This path is rewritten by the installer to an absolute path under
# `<base>/vfolder/local/volume1`.
mount-path = "./vfroot/local"
cohabiting-storage-proxy = true
# aiomonitor / metadata-server ports — must not collide with the primary
# agent's defaults (38200 / 39200 / 40128).
aiomonitor-termui-port = 38201
aiomonitor-webui-port = 39201
metadata-server-port = 40129


[container]
# Must not overlap the primary agent's default range of 30000-31000. SFTP
# upload sessions don't typically allocate compute kernels, but a separate
# range is still configured to be safe.
port-range = [31100, 31200]
kernel-uid = -1
kernel-gid = -1
bind-host = "127.0.0.1"
sandbox-type = "docker"
scratch-type = "hostdir"
# NOTE(review): relative path with no SFTP-specific suffix; presumably the
# same scratch directory the primary agent uses — confirm that sharing it
# is intended.
scratch-root = "./scratches"
scratch-size = "1G"


[watcher]
# Distinct watcher service-addr (primary is 6009).
service-addr = { host = "127.0.0.1", port = 6015 }
ssl-enabled = false
#ssl-cert = ""
#ssl-key = ""
# NOTE(review): this is a generic agent unit name rather than an
# SFTP-specific one; presumably carried over from the primary agent's
# config — confirm which unit this watcher is supposed to control.
target-service = "backendai-agent.service"
soft-reset-available = false


[pyroscope]
# Pyroscope reporting is switched on and targets a server on localhost:4040.
enabled = true
# Differentiated pyroscope app-name so the two agents show up separately
# in dashboards.
app-name = "backendai-half-sftp-agent"
server-addr = "http://localhost:4040"
# NOTE(review): the unit of sample-rate is not stated in this file
# (presumably samples per second) — confirm.
sample-rate = 100


[logging]
level = "INFO"
# Only the console driver is listed. The [logging.file], [logging.logstash]
# and [logging.graylog] tables below are presumably ignored unless their
# driver is added to this list — confirm.
drivers = ["console"]

[logging.pkg-ns]
# Per-package log levels.
# NOTE(review): the empty key "" is presumably the fallback for packages
# not listed here — confirm.
"" = "WARNING"
"aiodocker" = "INFO"
"aiotools" = "INFO"
"aiohttp" = "INFO"
"ai.backend" = "INFO"

[logging.console]
colored = true
format = "verbose"

[logging.file]
# File name is specific to the SFTP agent.
path = "./logs"
filename = "agent-sftp.log"
rotation-size = "10M"

[logging.logstash]
endpoint = { host = "localhost", port = 9300 }
protocol = "tcp"
# NOTE(review): TLS is enabled while certificate verification is disabled;
# acceptable for a local dev stack only — do not copy to other environments.
ssl-enabled = true
ssl-verify = false

[logging.graylog]
host = "127.0.0.1"
port = 12201
ssl-verify = false
ca-certs = ""
keyfile = ""
certfile = ""

[resource]
reserved-cpu = 1
reserved-mem = "1G"
reserved-disk = "8G"
# NOTE(review): this order names accelerator slots (cuda, rocm, tpu) although
# [agent].allow-compute-plugins is an empty list in this file; presumably
# harmless leftovers from the primary agent's config — confirm.
allocation-order = ["cuda", "rocm", "tpu", "cpu", "mem"]
affinity-policy = "INTERLEAVED"


[debug]
# Debug mode is on for this dev configuration, while skip-container-deletion
# stays off.
enabled = true
skip-container-deletion = false

[debug.coredump]
# Core dump handling is off.
# NOTE(review): the remaining keys presumably only take effect once
# `enabled` is set to true — confirm.
enabled = false
path = "./coredumps"
backup-count = 10
size-limit = "64M"

[otel]
# Telemetry export (otel) is enabled and targets the local endpoint
# 127.0.0.1:4317.
enabled = true
log-level = "INFO"
endpoint = "http://127.0.0.1:4317"
116 changes: 106 additions & 10 deletions dev
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ _service_cmd() {
case $1 in
mgr) echo "./backend.ai mgr start-server --debug" ;;
agent) echo "./backend.ai ag start-server --debug" ;;
sftp-agent) echo "./backend.ai ag start-server -f agent-sftp.toml --debug" ;;
storage) echo "./backend.ai storage start-server --debug" ;;
web) echo "./backend.ai web start-server --debug" ;;
proxy-coordinator) echo "./backend.ai app-proxy-coordinator start-server --debug" ;;
Expand All @@ -19,6 +20,7 @@ _service_process() {
case $1 in
mgr) echo "backend.ai: manager" ;;
agent) echo "backend.ai: agent" ;;
sftp-agent) echo "agent-sftp.toml" ;;
storage) echo "backend.ai: storage-proxy" ;;
web) echo "backend.ai: webserver" ;;
proxy-coordinator) echo "backend.ai: proxy-coordinator" ;;
Expand All @@ -35,7 +37,11 @@ _tmux_window_name() {
esac
}

# Core services: started by `./dev start all` and always listed by
# `./dev status`.
# sftp-agent is intentionally kept out of this list: it is an optional dev
# addon that must be started explicitly via `./dev start sftp-agent`, and
# `./dev status` lists it only when ./agent-sftp.toml exists.
ALL_SERVICES="mgr agent storage web proxy-coordinator proxy-worker"
# Optional services; shown next to ALL_SERVICES in the "Valid services" hint
# printed for an unknown service name.
OPTIONAL_SERVICES="sftp-agent"

_color() {
local color=$1; shift
Expand All @@ -48,23 +54,88 @@ _color() {
esac
}

# Print the pid-file path of a service that is tracked through a pid-file.
#
# setproctitle rewrites the argv of agent workers to just
# "backend.ai: agent", so `pgrep -f` alone cannot tell the primary compute
# agent and the optional SFTP agent apart. Each agent writes its own
# pid-file (configured in agent.toml / agent-sftp.toml), and those files
# are the source of truth for process tracking.
#
# Arguments: $1 - service name.
# Returns 1 without printing anything for every other service name.
_service_pidfile() {
    local svc=$1
    if [ "$svc" = "agent" ]; then
        echo "./agent.pid"
    elif [ "$svc" = "sftp-agent" ]; then
        echo "./agent-sftp.pid"
    else
        return 1
    fi
}

# Check whether the pid given as $1 belongs to a live process.
#
# Only positive decimal integers are accepted. An empty, non-numeric, zero
# or negative value is reported as "not alive": `kill -0 0` and
# `kill -0 -N` address process groups rather than a single process, so a
# corrupt pid-file containing such a value would otherwise be reported as
# a running service.
#
# Returns 0 when the process exists and can be signalled, non-zero otherwise.
_pid_alive() {
    local pid=$1
    case "$pid" in
        ''|*[!0-9]*) return 1 ;;
    esac
    # stderr is silenced so that an out-of-range number fails quietly.
    [ "$pid" -gt 0 ] 2>/dev/null && kill -0 "$pid" 2>/dev/null
}

# Report whether a service is running (exit status 0 means running).
#
# The two agents are checked through the pid recorded in their pid-file,
# because both run under the same process title and cannot be told apart
# with `pgrep -f`. Every other service is matched by the process pattern
# from _service_process.
#
# Arguments: $1 - service name.
_is_running() {
    local svc=$1
    case "$svc" in
        agent|sftp-agent)
            local pidfile pid
            pidfile=$(_service_pidfile "$svc")
            # No pid-file: treat the agent as not running.
            [ -f "$pidfile" ] || return 1
            pid=$(cat "$pidfile" 2>/dev/null || true)
            _pid_alive "$pid"
            ;;
        *)
            pgrep -f "$(_service_process "$svc")" > /dev/null 2>&1
            ;;
    esac
}

# Stop a service if it is running: send SIGTERM, poll up to 10 times at
# 0.5 s intervals, then escalate to SIGKILL if it is still alive.
#
# The agents are signalled only through the pid recorded in their pid-file.
# They must not be matched by process title: both agents run under the
# same "backend.ai: agent" title, so a pattern-based pkill for one would
# take the other down as well.
#
# Arguments: $1 - service name.
_stop_service() {
    local svc=$1
    # Send signal $1 to the pid recorded in the pid-file of $svc, if any.
    _kill_by_pidfile() {
        local signal=$1
        local pidfile
        pidfile=$(_service_pidfile "$svc")
        if [ -f "$pidfile" ]; then
            local pid
            pid=$(cat "$pidfile" 2>/dev/null || true)
            if [ -n "$pid" ]; then
                kill "-${signal}" "$pid" 2>/dev/null || true
            fi
        fi
    }
    # Send signal $1 to $svc: by pid-file for agents, by pattern otherwise.
    _kill_matching() {
        local signal=$1
        case "$svc" in
            agent|sftp-agent)
                _kill_by_pidfile "$signal"
                ;;
            *)
                pkill "-${signal}" -f "$(_service_process "$svc")" 2>/dev/null || true
                ;;
        esac
    }
    if _is_running "$svc"; then
        echo " Stopping $svc..."
        _kill_matching TERM
        for _ in $(seq 1 10); do
            _is_running "$svc" || break
            sleep 0.5
        done
        if _is_running "$svc"; then
            _kill_matching KILL
            sleep 1
        fi
        # Remove stale pid file on stop so the next status check is accurate.
        case "$svc" in
            agent|sftp-agent)
                rm -f "$(_service_pidfile "$svc")"
                ;;
        esac
    fi
}

Expand All @@ -88,19 +159,35 @@ _start_service() {
echo " Started $svc"
}

# Print one status row per service named in the arguments.
#
# Running services are shown with a pid: the agents report the pid stored
# in their pid-file, every other service the first pid matched by its
# process pattern.
_status_services() {
    local services="$*"
    # Keep the loop variable local so that callers' own `svc` is not clobbered.
    local svc pid
    for svc in $services; do
        if _is_running "$svc"; then
            case "$svc" in
                agent|sftp-agent)
                    pid=$(cat "$(_service_pidfile "$svc")" 2>/dev/null || true)
                    ;;
                *)
                    pid=$(pgrep -f "$(_service_process "$svc")" | head -1)
                    ;;
            esac
            printf " %-20s %s (pid: %s)\n" "$svc" "$(_color green running)" "$pid"
        else
            printf " %-20s %s\n" "$svc" "$(_color red stopped)"
        fi
    done
}

# Print the status table: all core services, followed by the optional SFTP
# agent when its config file is present.
cmd_status() {
    echo ""
    printf " %-20s %s\n" "SERVICE" "STATUS"
    printf " %-20s %s\n" "-------" "------"
    # Word splitting of the space-separated service list is intentional.
    _status_services $ALL_SERVICES
    # Only show optional services that are actually installed (config exists).
    if [ -f "./agent-sftp.toml" ]; then
        _status_services sftp-agent
    fi
    echo ""
}

Expand Down Expand Up @@ -193,11 +280,15 @@ Commands:
Services:
mgr, agent, storage, web, proxy-coordinator, proxy-worker, all

Optional services (must be started explicitly, not included in 'all'):
sftp-agent Dedicated SFTP upload agent (needs agent-sftp.toml)

Examples:
./dev status
./dev restart mgr
./dev restart all
./dev log mgr
./dev start sftp-agent

EOF
}
Expand All @@ -207,9 +298,14 @@ _validate_service() {
[ "$svc" = "all" ] && return 0
_service_cmd "$svc" > /dev/null 2>&1 || {
echo "$(_color red "Unknown service: $svc")"
echo "Valid services: $ALL_SERVICES all"
echo "Valid services: $ALL_SERVICES $OPTIONAL_SERVICES all"
exit 1
Comment on lines 298 to 302
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sftp-agent is accepted as a valid service, but there’s no explicit check that ./agent-sftp.toml exists before attempting ./dev start/restart sftp-agent. Since cmd_status already hides the optional service unless the config exists, consider failing fast in _validate_service (or start path) with a clear message when the config is missing.

Copilot uses AI. Check for mistakes.
}
# Fail fast if sftp-agent config is missing
if [ "$svc" = "sftp-agent" ] && [ ! -f "./agent-sftp.toml" ]; then
echo "$(_color red "agent-sftp.toml not found. Run the installer with --with-sftp-agent first.")"
exit 1
fi
}

if [ $# -lt 1 ]; then
Expand Down
32 changes: 32 additions & 0 deletions src/ai/backend/install/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,37 @@ def compose(self) -> ComposeResult:
"""
)
)
if service.sftp_agent_enabled:
with TabPane("SFTP Agent", id="sftp-agent"):
yield Markdown(
textwrap.dedent(
f"""
A dedicated SFTP agent has been configured alongside the
regular compute agent, assigned to the
`{service.sftp_agent_scaling_group}` scaling group.

Start it in a separate shell (or via the `./dev` helper):
```console
$ cd {self.install_info.base_path.resolve()}
$ ./dev start sftp-agent
```

Or run the agent process directly against the SFTP config:
```console
$ cd {self.install_info.base_path.resolve()}
$ ./backendai-agent ag start-server -f agent-sftp.toml
```

The SFTP agent listens on:
- RPC: `{service.sftp_agent_rpc_addr.bind.host}:{service.sftp_agent_rpc_addr.bind.port}`
- Watcher: `{service.sftp_agent_watcher_addr.bind.host}:{service.sftp_agent_watcher_addr.bind.port}`

SFTP upload sessions created via the web UI will be routed
to this agent; regular compute sessions continue to run on
the primary agent.
"""
)
)


class ModeMenu(Static):
Expand Down Expand Up @@ -645,6 +676,7 @@ def __init__(
use_wildcard_binding=args.use_wildcard_binding,
otel_endpoint=args.otel_endpoint,
metric_access_cidr=args.metric_access_cidr,
with_sftp_agent=args.with_sftp_agent,
)

def compose(self) -> ComposeResult:
Expand Down
Loading
Loading