-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbackup.sh
More file actions
215 lines (179 loc) · 7.74 KB
/
backup.sh
File metadata and controls
215 lines (179 loc) · 7.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env bash
# Full old-stack backup runbook.
# You can either:
# 1. run this file line by line, or
# 2. execute it as a script after reviewing the commands.
# Fail fast: abort on any error, unset variable, or mid-pipeline failure.
set -euo pipefail
# Report the failing line number to stderr before the shell exits on error.
trap 'printf "backup.sh failed at line %s\n" "$LINENO" >&2' ERR
# Prefix each traced command with its line number for readable -x output.
PS4='+ [${LINENO}] '
# Trace every command; combined with the log tee below this gives a full
# audit trail of the run.
set -x
# Move into the repo root first.
cd /Users/j.w.jonkers/IDEAProjects/personal-stack-2
# Define one backup output directory per local calendar day so repeated reruns
# on the same day reuse the same folder instead of creating many timestamped
# runs. Override RUN_DIR manually if you want a separate run folder.
export RUN_DIR="${RUN_DIR:-$PWD/backups/run-$(date +%Y%m%d)}"
# Cloud/VPS backup connection settings.
export BACKUP_CLOUD_SSH_HOST=100.64.0.1
export BACKUP_CLOUD_SSH_USER=deploy
export BACKUP_CLOUD_SSH_PORT=2222
export BACKUP_CLOUD_SSH_IDENTITY_FILE="$HOME/.ssh/ps-vps-1"
# Home/GTX960M backup connection settings.
export BACKUP_HOME_SSH_HOST=100.64.0.2
export BACKUP_HOME_SSH_USER=extratoast
export BACKUP_HOME_SSH_PORT=22
export BACKUP_HOME_SSH_IDENTITY_FILE="$HOME/.ssh/ps-gtx960m"
# Force both backup scripts to write into the same run directory.
export BACKUP_OUTPUT_DIR="$RUN_DIR"
# Mirror all output to a daily log file while still printing to the terminal.
# The run directory must exist before exec opens the log file through tee.
mkdir -p "$RUN_DIR"
exec > >(tee -a "$RUN_DIR/backup.log") 2>&1
# Print a blank-line-separated "==> <heading>" banner so each runbook phase
# stands out in the terminal and in backup.log.
step() {
  local heading=$1
  printf '\n==> %s\n' "$heading"
}
# Convenience SSH wrapper so the remote VPS commands stay readable.
# Reuses the exported BACKUP_CLOUD_SSH_* settings defined at the top of this
# runbook so changing them in one place also updates the wrapper (previously
# the host, user, port, and identity file were duplicated here as literals).
vps() {
  ssh -i "$BACKUP_CLOUD_SSH_IDENTITY_FILE" \
    -p "$BACKUP_CLOUD_SSH_PORT" \
    "$BACKUP_CLOUD_SSH_USER@$BACKUP_CLOUD_SSH_HOST" "$@"
}
# Convenience SSH wrapper for the home/GTX960M node.
# Reuses the exported BACKUP_HOME_SSH_* settings defined at the top of this
# runbook instead of duplicating the host, user, port, and identity file as
# hard-coded literals.
homehost() {
  ssh -i "$BACKUP_HOME_SSH_IDENTITY_FILE" \
    -p "$BACKUP_HOME_SSH_PORT" \
    "$BACKUP_HOME_SSH_USER@$BACKUP_HOME_SSH_HOST" "$@"
}
step "Using backup run directory: $RUN_DIR"
# Step 0: restore the old environment so a fresh rerun can capture live state.
# This uses the VPS's own checked-out repo under /opt/personal-stack so the
# backup reflects the currently deployed old stack, not newer local changes.
step "Starting prerequisite services on both source hosts"
vps 'sudo systemctl start consul vault nomad'
homehost 'sudo systemctl start consul nomad smbd adguard-home'
# Unseal Vault on the VPS before any Vault-backed checks or job restores.
step "Waiting for Vault to become reachable and unsealed"
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.vault-keys
export VAULT_ADDR=http://127.0.0.1:8200
tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT
seal_status_url="${VAULT_ADDR}/v1/sys/seal-status"
# Poll the seal-status endpoint for up to ~60s (30 attempts x 2s sleep).
reachable=0
for _ in $(seq 1 30); do
  if curl -fsS --max-time 5 "$seal_status_url" >"$tmp/status.json" 2>/dev/null; then
    echo "Vault API reachable"
    reachable=1
    break
  fi
  echo "Vault API not reachable yet, sleeping 2s"
  sleep 2
done
# Fail loudly on timeout; previously the loop fell through silently and the
# next bare curl died with an opaque error instead of a clear message.
if [[ "$reachable" -ne 1 ]]; then
  echo "Vault API did not become reachable within 60s" >&2
  exit 1
fi
# Re-fetch a fresh status before deciding whether an unseal is needed.
curl -fsS --max-time 5 "$seal_status_url" >"$tmp/status.json"
if jq -e '.sealed == true' "$tmp/status.json" >/dev/null; then
  if [[ -z "${VAULT_UNSEAL_KEY:-}" ]]; then
    echo "Vault is sealed but VAULT_UNSEAL_KEY is missing from /opt/personal-stack/.vault-keys" >&2
    exit 1
  fi
  echo "Vault is sealed, running unseal"
  timeout 10 vault operator unseal "$VAULT_UNSEAL_KEY" >/dev/null
else
  echo "Vault already unsealed"
fi
# Print the final state so the log records what the backup ran against.
curl -fsS --max-time 5 "$seal_status_url" >"$tmp/status.json"
jq -r '"Vault state: initialized=\(.initialized) sealed=\(.sealed) standby=\(.standby // false)"' "$tmp/status.json"
echo VAULT_UNSEALED
EOF
# Re-submit only the RabbitMQ job so the live definitions export can run again.
# Avoid touching PostgreSQL or the rest of the old stack during a backup rerun.
step "Ensuring RabbitMQ is up for the live definitions export"
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.nomad-keys
export NOMAD_ADDR=http://127.0.0.1:4646 NOMAD_TOKEN="$NOMAD_BOOTSTRAP_TOKEN"
cd /opt/personal-stack
nomad job run -detach infra/nomad/jobs/data/rabbitmq.nomad.hcl
source /opt/personal-stack/.vault-keys
export VAULT_ADDR=http://127.0.0.1:8200 VAULT_TOKEN="$VAULT_ROOT_TOKEN"
# Vault is already unsealed at this point, so resolve the management
# credentials once instead of re-reading them on every poll iteration.
rmq_user="$(vault kv get -field=rabbitmq.user secret/platform/rabbitmq)"
rmq_password="$(vault kv get -field=rabbitmq.password secret/platform/rabbitmq)"
# Poll the management API for up to ~180s (60 attempts x 3s sleep).
for _ in $(seq 1 60); do
  if curl -fsS --max-time 5 --user "$rmq_user:$rmq_password" http://127.0.0.1:15672/api/overview >/dev/null; then
    echo RABBITMQ_RESTORED
    exit 0
  fi
  echo "RabbitMQ management API not ready yet, sleeping 3s"
  sleep 3
done
# Timed out: surface the job status for debugging, then fail the runbook.
echo "RabbitMQ did not become ready in time" >&2
nomad job status rabbitmq || true
exit 1
EOF
# Step 1: audit manifest coverage.
# Check that the manifest still covers every declared Nomad host volume.
# Note: the script will also print system-service paths such as /opt/consul,
# /opt/nomad, /opt/vault/data, /opt/adguard-home, and /var/lib/samba.
# Those are expected because they are backup paths, but not Nomad host_volume
# declarations. Paths listed in backups/audit-excluded-paths.txt are also
# intentionally ignored by the audit. The failure condition is only missing
# declared host volumes outside that explicit exclusion list.
step "Auditing backup manifest coverage"
# A non-zero exit from the audit aborts the whole run via set -e.
infra/scripts/audit-backup-scope.sh
# Step 2: confirm remote sudo and control-plane auth.
# Check passwordless sudo on both source hosts.
step "Running sudo and control-plane auth preflight checks"
vps 'sudo -n true'
homehost 'sudo -n true'
# Each preflight heredoc runs under `set -euo pipefail` (matching the earlier
# remote heredocs) so a failing check aborts the remote shell with a non-zero
# status. Previously a failed `vault status`, `curl`, or `nomad status` fell
# through to the OK marker and exited 0, silently passing the preflight.
# Check that the current Vault token on the VPS still works.
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.vault-keys
export VAULT_ADDR=http://127.0.0.1:8200 VAULT_TOKEN="$VAULT_ROOT_TOKEN"
vault status >/dev/null
echo VAULT_OK
EOF
# Check that RabbitMQ credentials can be resolved from Vault and authenticate locally.
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.vault-keys
export VAULT_ADDR=http://127.0.0.1:8200 VAULT_TOKEN="$VAULT_ROOT_TOKEN"
rmq_user="$(vault kv get -field=rabbitmq.user secret/platform/rabbitmq)"
rmq_password="$(vault kv get -field=rabbitmq.password secret/platform/rabbitmq)"
curl -fsS --user "$rmq_user:$rmq_password" http://127.0.0.1:15672/api/overview >/dev/null
echo RABBITMQ_OK
EOF
# Check that the current Nomad management token on the VPS still works.
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.nomad-keys
export NOMAD_ADDR=http://127.0.0.1:4646 NOMAD_TOKEN="$NOMAD_BOOTSTRAP_TOKEN"
nomad status >/dev/null
echo NOMAD_OK
EOF
# Step 3: capture live snapshots and exports while services are still up.
# Capture live service-native snapshots before stopping anything.
# NOTE(review): presumably writes into BACKUP_OUTPUT_DIR exported above --
# confirm in infra/scripts/backup-service-snapshots.sh.
step "Capturing live service snapshots"
infra/scripts/backup-service-snapshots.sh
# Step 4: stop old workloads.
# Stop all Nomad jobs on the VPS before stopping Nomad itself.
step "Stopping Nomad jobs on the VPS"
# The heredoc runs under `set -euo pipefail` (matching the earlier remote
# heredocs). Previously a failed curl/jq or `nomad job stop` was swallowed --
# the trailing `while` loop's zero status won -- and the runbook went on to
# stop Vault/Consul with workloads still running.
vps 'sudo bash -s' <<'EOF'
set -euo pipefail
source /opt/personal-stack/.nomad-keys
export NOMAD_ADDR=http://127.0.0.1:4646 NOMAD_TOKEN="$NOMAD_BOOTSTRAP_TOKEN"
curl -fsS -H "X-Nomad-Token: $NOMAD_TOKEN" "$NOMAD_ADDR/v1/jobs" | jq -r '.[].ID' | while read -r job; do
  nomad job stop -purge "$job"
done
EOF
# Stop remaining stateful host services on the VPS.
step "Stopping remaining VPS host services"
vps 'sudo systemctl stop vault nomad consul'
# Stop stateful host services on the home node.
step "Stopping remaining home host services"
homehost 'sudo systemctl stop nomad consul smbd adguard-home'
# Step 5: pull filesystem archives.
# Pull filesystem backups from both hosts.
# Services were stopped in step 4, so these archives capture quiesced state.
step "Streaming filesystem archives into $RUN_DIR"
infra/scripts/backup-service-state.sh
# Step 6: verify the completed run.
# Verify the run metadata, required artifacts, and recorded checksums.
# A verification failure aborts the runbook via set -e.
step "Verifying backup artifacts"
infra/scripts/verify-backup-run.sh "$RUN_DIR"
# Verify every compressed filesystem archive is readable.
# `IFS= read -r` keeps each archive path byte-exact (no trimming of leading or
# trailing whitespace, no backslash mangling). The plain pipe is kept so
# pipefail still aborts the run if archives.tsv is missing or awk fails.
awk -F '\t' 'NR > 1 && $5 == "backed-up" { print $4 }' "$RUN_DIR/archives.tsv" | while IFS= read -r archive; do
  gzip -t "$archive"
done
# Optional: inspect the resulting snapshot/export table.
column -t -s $'\t' "$RUN_DIR/service-snapshots.tsv"
# Optional: inspect the resulting filesystem-archive table.
column -t -s $'\t' "$RUN_DIR/archives.tsv"
# Optional: print the run directory so it is easy to find afterwards.
printf 'Backup run saved in %s\n' "$RUN_DIR"