#!/usr/bin/env bash
#
# Redeploy to all nodes in a given environment (devnet or testnet).
# Reads node credentials from scripts/remote-nodes.conf.
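# Conf format: pipe-delimited, one node per line; '#' starts a comment:
#   env|host|pass|role|key     (role defaults to "node"; key is optional)
# Example line (values are illustrative only):
#   devnet|ubuntu@10.0.0.11|s3cret|leader|~/.ssh/id_ed25519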
#
# Flow:
# 1) make build-linux
# 2) scripts/generate-source-archive.sh -> /tmp/network-source.tar.gz
# 3) scp archive + extract-deploy.sh + conf to hub node
# 4) from hub: sshpass scp to all other nodes + sudo bash /tmp/extract-deploy.sh
# 5) rolling upgrade: followers first, leader last
#    per node: pre-upgrade -> stop -> extract binary -> post-upgrade
#
# Usage:
# scripts/redeploy.sh --devnet
# scripts/redeploy.sh --testnet
# scripts/redeploy.sh --devnet --no-build      (--skip-build is an alias)
#
set -euo pipefail
# ── Parse flags ──────────────────────────────────────────────────────────────
ENV=""
NO_BUILD=0
for arg in "$@"; do
  case "$arg" in
    --devnet)  ENV="devnet" ;;
    --testnet) ENV="testnet" ;;
    --no-build|--skip-build) NO_BUILD=1 ;;
    -h|--help)
      echo "Usage: scripts/redeploy.sh --devnet|--testnet [--no-build|--skip-build]"
      exit 0
      ;;
    *)
      echo "Unknown flag: $arg" >&2
      echo "Usage: scripts/redeploy.sh --devnet|--testnet [--no-build|--skip-build]" >&2
      exit 1
      ;;
  esac
done
if [[ -z "$ENV" ]]; then
  echo "ERROR: specify --devnet or --testnet" >&2
  exit 1
fi
# ── Paths ────────────────────────────────────────────────────────────────────
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
CONF="$ROOT_DIR/scripts/remote-nodes.conf"
ARCHIVE="/tmp/network-source.tar.gz"
EXTRACT_SCRIPT="$ROOT_DIR/scripts/extract-deploy.sh"
die() { echo "ERROR: $*" >&2; exit 1; }
need_file() { [[ -f "$1" ]] || die "Missing file: $1"; }
need_file "$CONF"
need_file "$EXTRACT_SCRIPT"
# ── Load nodes from conf ────────────────────────────────────────────────────
HOSTS=()
PASSES=()
ROLES=()
SSH_KEYS=()
while IFS='|' read -r env host pass role key; do
  [[ -z "$env" || "$env" == \#* ]] && continue
  env="${env%%#*}"
  env="$(echo "$env" | xargs)"
  [[ "$env" != "$ENV" ]] && continue
  HOSTS+=("$host")
  PASSES+=("$pass")
  ROLES+=("${role:-node}")
  SSH_KEYS+=("${key:-}")
done < "$CONF"
if [[ ${#HOSTS[@]} -eq 0 ]]; then
  die "No nodes found for environment '$ENV' in $CONF"
fi
echo "== redeploy.sh ($ENV) — ${#HOSTS[@]} nodes =="
for i in "${!HOSTS[@]}"; do
  echo "  [$i] ${HOSTS[$i]} (${ROLES[$i]})"
done
# ── Pick hub node ────────────────────────────────────────────────────────────
# Hub = first node that has an SSH key configured (direct SCP from local).
# If none have a key, use the first node (via sshpass).
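# NOTE: HUB_IDX indexes the env-filtered node list, not raw conf lines; the remote
# side re-parses the conf with the same filter, so both sides agree on ordering.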
HUB_IDX=0
HUB_KEY=""
for i in "${!HOSTS[@]}"; do
  if [[ -n "${SSH_KEYS[$i]}" ]]; then
    expanded_key="${SSH_KEYS[$i]/#\~/$HOME}"
    if [[ -f "$expanded_key" ]]; then
      HUB_IDX=$i
      HUB_KEY="$expanded_key"
      break
    fi
  fi
done
HUB_HOST="${HOSTS[$HUB_IDX]}"
HUB_PASS="${PASSES[$HUB_IDX]}"
echo "Hub: $HUB_HOST (idx=$HUB_IDX, key=${HUB_KEY:-none})"
# ── Build ────────────────────────────────────────────────────────────────────
if [[ "$NO_BUILD" -eq 0 ]]; then
echo "== build-linux =="
(cd "$ROOT_DIR" && make build-linux) || {
echo "WARN: make build-linux failed; continuing if existing bin-linux is acceptable."
}
else
echo "== skipping build (--no-build) =="
fi
# ── Generate source archive ─────────────────────────────────────────────────
echo "== generate source archive =="
(cd "$ROOT_DIR" && ./scripts/generate-source-archive.sh)
need_file "$ARCHIVE"
# ── Helper: SSH/SCP to hub ───────────────────────────────────────────────────
SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null)
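# Fail fast if we will need password auth to reach the hub but sshpass is missing
# locally (otherwise the first hub_scp below dies with a less obvious error).
if [[ -z "$HUB_KEY" ]] && ! command -v sshpass >/dev/null 2>&1; then
  die "sshpass is required locally when the hub has no usable SSH key"
fi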
hub_scp() {
  if [[ -n "$HUB_KEY" ]]; then
    scp -i "$HUB_KEY" "${SSH_OPTS[@]}" "$@"
  else
    sshpass -p "$HUB_PASS" scp "${SSH_OPTS[@]}" "$@"
  fi
}
hub_ssh() {
  if [[ -n "$HUB_KEY" ]]; then
    ssh -i "$HUB_KEY" "${SSH_OPTS[@]}" "$@"
  else
    sshpass -p "$HUB_PASS" ssh "${SSH_OPTS[@]}" "$@"
  fi
}
# ── Upload to hub ────────────────────────────────────────────────────────────
echo "== upload archive + extract script + conf to hub ($HUB_HOST) =="
hub_scp "$ARCHIVE" "$EXTRACT_SCRIPT" "$CONF" "$HUB_HOST":/tmp/
# ── Remote: fan-out + extract + rolling upgrade ─────────────────────────────
echo "== fan-out + extract + rolling upgrade from hub =="
hub_ssh "$HUB_HOST" "DEPLOY_ENV=$ENV HUB_IDX=$HUB_IDX bash -s" <<'REMOTE'
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive
TAR=/tmp/network-source.tar.gz
EX=/tmp/extract-deploy.sh
CONF=/tmp/remote-nodes.conf
[[ -f "$TAR" ]] || { echo "Missing $TAR on hub"; exit 2; }
[[ -f "$EX" ]] || { echo "Missing $EX on hub"; exit 2; }
[[ -f "$CONF" ]] || { echo "Missing $CONF on hub"; exit 2; }
chmod +x "$EX" || true
# Parse conf file on the hub — same format as local
hosts=()
passes=()
idx=0
hub_host=""
hub_pass=""
while IFS='|' read -r env host pass role key; do
  [[ -z "$env" || "$env" == \#* ]] && continue
  env="${env%%#*}"
  env="$(echo "$env" | xargs)"
  [[ "$env" != "$DEPLOY_ENV" ]] && continue
  if [[ $idx -eq $HUB_IDX ]]; then
    hub_host="$host"
    hub_pass="$pass"
  else
    hosts+=("$host")
    passes+=("$pass")
  fi
  ((idx++)) || true
done < "$CONF"
echo "Hub: $hub_host (this machine)"
echo "Fan-out nodes: ${#hosts[@]}"
# Install sshpass on hub if needed
if [[ ${#hosts[@]} -gt 0 ]] && ! command -v sshpass >/dev/null 2>&1; then
  echo "Installing sshpass on hub..."
  printf '%s\n' "$hub_pass" | sudo -S apt-get update -y >/dev/null
  printf '%s\n' "$hub_pass" | sudo -S apt-get install -y sshpass >/dev/null
fi
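# NOTE: node passwords are interpolated into singly-quoted remote command strings
# below, so this flow assumes passwords contain no single-quote characters.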
echo "== fan-out: upload to ${#hosts[@]} nodes =="
upload_failed=()
for i in "${!hosts[@]}"; do
  h="${hosts[$i]}"
  p="${passes[$i]}"
  echo "  -> $h"
  if ! sshpass -p "$p" scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$TAR" "$EX" "$h":/tmp/; then
    echo "  !! UPLOAD FAILED: $h"
    upload_failed+=("$h")
  fi
done
echo "== extract on all fan-out nodes =="
for i in "${!hosts[@]}"; do
  h="${hosts[$i]}"
  p="${passes[$i]}"
  echo "  -> $h"
  if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$h" "printf '%s\n' '$p' | sudo -S bash /tmp/extract-deploy.sh >/tmp/extract.log 2>&1 && echo OK"; then
    echo "  !! EXTRACT FAILED: $h"
    upload_failed+=("$h")
  fi
done
if [[ ${#upload_failed[@]} -gt 0 ]]; then
  echo ""
  echo "WARNING: ${#upload_failed[@]} nodes had upload/extract failures:"
  for uf in "${upload_failed[@]}"; do
    echo "  - $uf"
  done
  echo "Continuing with rolling upgrade..."
fi
echo "== extract on hub =="
printf '%s\n' "$hub_pass" | sudo -S bash "$EX" >/tmp/extract.log 2>&1
# ── Raft state detection ──
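# Prints "<state> <num_peers> <voter>" for a node, e.g. "Leader 2 True"; empty on
# error. (Assumes the /status payload nests raft info under store.raft, per the
# parse below; the example values are illustrative.)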
raft_state() {
  local h="$1" p="$2"
  local cmd="curl -s http://localhost:5001/status"
  local parse_py='import sys,json; j=json.load(sys.stdin); r=j.get("store",{}).get("raft",{}); print((r.get("state") or ""), (r.get("num_peers") or 0), (r.get("voter") is True))'
  sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
    "$h" "$cmd | python3 -c '$parse_py'" 2>/dev/null || true
}
echo "== detect leader =="
leader=""
leader_pass=""
for i in "${!hosts[@]}"; do
h="${hosts[$i]}"
p="${passes[$i]}"
out="$(raft_state "$h" "$p")"
echo " $h -> ${out:-NO_OUTPUT}"
if [[ "$out" == Leader* ]]; then
leader="$h"
leader_pass="$p"
break
fi
done
# Check hub itself
if [[ -z "$leader" ]]; then
hub_out="$(curl -s http://localhost:5001/status | python3 -c 'import sys,json; j=json.load(sys.stdin); r=j.get("store",{}).get("raft",{}); print((r.get("state") or ""), (r.get("num_peers") or 0), (r.get("voter") is True))' 2>/dev/null || true)"
echo " hub(localhost) -> ${hub_out:-NO_OUTPUT}"
if [[ "$hub_out" == Leader* ]]; then
leader="HUB"
leader_pass="$hub_pass"
fi
fi
if [[ -z "$leader" ]]; then
echo "No leader detected. Aborting before upgrades."
exit 3
fi
echo "Leader: $leader"
failed_nodes=()
# ── Per-node upgrade flow ──
# Uses pre-upgrade (maintenance + leadership transfer + propagation wait)
# then stops, deploys binary, and post-upgrade (start + health verification).
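# Failure semantics: a pre-upgrade failure is tolerated (the stop still proceeds);
# any failure from step 2 onward records the node in failed_nodes and aborts the
# remaining steps for that node.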
upgrade_one() {
  local h="$1" p="$2"
  echo "== upgrade $h =="
  # 1. Pre-upgrade: enter maintenance, transfer leadership, wait for propagation
  echo "  [1/4] pre-upgrade..."
  if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$h" "printf '%s\n' '$p' | sudo -S orama prod pre-upgrade" 2>&1; then
    echo "  !! pre-upgrade failed on $h (continuing with stop)"
  fi
  # 2. Stop all services
  echo "  [2/4] stopping services..."
  if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$h" "printf '%s\n' '$p' | sudo -S systemctl stop 'orama-*'" 2>&1; then
    echo "  !! stop failed on $h"
    failed_nodes+=("$h")
    return 1
  fi
  # 3. Deploy new binary
  echo "  [3/4] deploying binary..."
  if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$h" "printf '%s\n' '$p' | sudo -S bash /tmp/extract-deploy.sh >/tmp/extract.log 2>&1 && echo OK" 2>&1; then
    echo "  !! extract failed on $h"
    failed_nodes+=("$h")
    return 1
  fi
  # 4. Post-upgrade: start services, verify health, exit maintenance
  echo "  [4/4] post-upgrade..."
  if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      "$h" "printf '%s\n' '$p' | sudo -S orama prod post-upgrade" 2>&1; then
    echo "  !! post-upgrade failed on $h"
    failed_nodes+=("$h")
    return 1
  fi
  echo "  OK: $h"
}
upgrade_hub() {
  echo "== upgrade hub (localhost) =="
  # 1. Pre-upgrade
  echo "  [1/4] pre-upgrade..."
  if ! (printf '%s\n' "$hub_pass" | sudo -S orama prod pre-upgrade) 2>&1; then
    echo "  !! pre-upgrade failed on hub (continuing with stop)"
  fi
  # 2. Stop all services
  echo "  [2/4] stopping services..."
  if ! (printf '%s\n' "$hub_pass" | sudo -S systemctl stop 'orama-*') 2>&1; then
    echo "  !! stop failed on hub ($hub_host)"
    failed_nodes+=("$hub_host (hub)")
    return 1
  fi
  # 3. Deploy new binary
  echo "  [3/4] deploying binary..."
  if ! (printf '%s\n' "$hub_pass" | sudo -S bash "$EX" >/tmp/extract.log 2>&1); then
    echo "  !! extract failed on hub ($hub_host)"
    failed_nodes+=("$hub_host (hub)")
    return 1
  fi
  # 4. Post-upgrade
  echo "  [4/4] post-upgrade..."
  if ! (printf '%s\n' "$hub_pass" | sudo -S orama prod post-upgrade) 2>&1; then
    echo "  !! post-upgrade failed on hub ($hub_host)"
    failed_nodes+=("$hub_host (hub)")
    return 1
  fi
  echo "  OK: hub ($hub_host)"
}
echo "== rolling upgrade (followers first, leader last) =="
for i in "${!hosts[@]}"; do
  h="${hosts[$i]}"
  p="${passes[$i]}"
  [[ "$h" == "$leader" ]] && continue
  upgrade_one "$h" "$p" || true
done
# Upgrade hub if not the leader
if [[ "$leader" != "HUB" ]]; then
  upgrade_hub || true
fi
# Upgrade leader last
echo "== upgrade leader last =="
if [[ "$leader" == "HUB" ]]; then
  upgrade_hub || true
else
  upgrade_one "$leader" "$leader_pass" || true
fi
# Remove the uploaded conf (it contains credentials) from the hub; the archive and
# extract script are left in /tmp.
rm -f "$CONF"
# ── Report results ──
echo ""
echo "========================================"
if [[ ${#failed_nodes[@]} -gt 0 ]]; then
  echo "UPGRADE COMPLETED WITH FAILURES (${#failed_nodes[@]} nodes failed):"
  for fn in "${failed_nodes[@]}"; do
    echo "  FAILED: $fn"
  done
  echo ""
  echo "Recommended actions:"
  echo "  1. SSH into the failed node(s)"
  echo "  2. Check logs: sudo orama prod logs node --follow"
  echo "  3. Manually run: sudo orama prod post-upgrade"
  echo "========================================"
  exit 1
else
  echo "All nodes upgraded successfully."
  echo "========================================"
fi
REMOTE
echo "== complete =="