From c6998b6ac23ca51cd2eb835fb22cf228ab43c676 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 24 Feb 2026 14:24:25 +0200 Subject: [PATCH 01/13] Remove legacy deployment and upgrade scripts - Deleted redeploy.sh, which handled redeployment to nodes in devnet/testnet environments. - Removed upgrade-nodes.sh, responsible for rolling upgrades of nodes. - Eliminated upload-source-fanout.sh, which uploaded source archives to nodes in parallel. - Removed upload-source.sh, used for uploading and extracting source archives to VPS nodes. --- Makefile | 64 +- cmd/cli/root.go | 4 + docs/DEV_DEPLOY.md | 195 ++--- pkg/cli/build/archive.go | 269 +++++++ pkg/cli/build/builder.go | 690 ++++++++++++++++++ pkg/cli/build/command.go | 80 ++ pkg/cli/cmd/buildcmd/build.go | 24 + pkg/cli/cmd/node/clean.go | 25 + pkg/cli/cmd/node/node.go | 4 + pkg/cli/cmd/node/push.go | 24 + pkg/cli/cmd/node/recover_raft.go | 31 + pkg/cli/cmd/node/rollout.go | 22 + pkg/cli/production/clean/clean.go | 183 +++++ pkg/cli/production/install/remote.go | 68 +- pkg/cli/production/push/push.go | 248 +++++++ pkg/cli/production/recover/recover.go | 306 ++++++++ pkg/cli/production/rollout/rollout.go | 102 +++ pkg/cli/production/upgrade/command.go | 12 +- pkg/cli/production/upgrade/flags.go | 10 + pkg/cli/production/upgrade/orchestrator.go | 6 +- pkg/cli/production/upgrade/remote.go | 69 ++ pkg/cli/remotessh/config.go | 77 ++ pkg/cli/remotessh/ssh.go | 86 +++ pkg/constants/versions.go | 13 + .../production/installers/caddy.go | 9 +- .../production/installers/coredns.go | 7 +- .../production/installers/gateway.go | 4 +- .../production/installers/ipfs.go | 4 +- .../production/installers/ipfs_cluster.go | 4 +- .../production/installers/olric.go | 4 +- .../production/installers/rqlite.go | 4 +- pkg/environments/production/orchestrator.go | 51 +- pkg/environments/production/paths.go | 6 + pkg/environments/production/prebuilt.go | 232 ++++++ scripts/build-linux-caddy.sh | 223 ------ scripts/build-linux-coredns.sh | 91 --- scripts/check-node-health.sh | 143 ---- scripts/clean-testnet.sh | 249 ------- scripts/extract-deploy.sh | 90 ++- scripts/generate-source-archive.sh | 48 -- scripts/recover-rqlite.sh | 289 -------- scripts/redeploy.sh | 400 ---------- scripts/upgrade-nodes.sh | 85 --- scripts/upload-source-fanout.sh | 210 ------ scripts/upload-source.sh | 103 --- 45 files changed, 2871 insertions(+), 1997 deletions(-) create mode 100644 pkg/cli/build/archive.go create mode 100644 pkg/cli/build/builder.go create mode 100644 pkg/cli/build/command.go create mode 100644 pkg/cli/cmd/buildcmd/build.go create mode 100644 pkg/cli/cmd/node/clean.go create mode 100644 pkg/cli/cmd/node/push.go create mode 100644 pkg/cli/cmd/node/recover_raft.go create mode 100644 pkg/cli/cmd/node/rollout.go create mode 100644 pkg/cli/production/clean/clean.go create mode 100644 pkg/cli/production/push/push.go create mode 100644 pkg/cli/production/recover/recover.go create mode 100644 pkg/cli/production/rollout/rollout.go create mode 100644 pkg/cli/production/upgrade/remote.go create mode 100644 pkg/cli/remotessh/config.go create mode 100644 pkg/cli/remotessh/ssh.go create mode 100644 pkg/constants/versions.go create mode 100644 pkg/environments/production/prebuilt.go delete mode 100755 scripts/build-linux-caddy.sh delete mode 100755 scripts/build-linux-coredns.sh delete mode 100755 scripts/check-node-health.sh delete mode 100755 scripts/clean-testnet.sh delete mode 100755 scripts/generate-source-archive.sh delete mode 100644 scripts/recover-rqlite.sh delete mode 100755 scripts/redeploy.sh delete mode 100755 scripts/upgrade-nodes.sh delete mode 100755 scripts/upload-source-fanout.sh delete mode 100755 scripts/upload-source.sh diff --git a/Makefile b/Makefile index bc029e3..0c84c9c 100644 --- a/Makefile +++ b/Makefile @@ -61,9 +61,9 @@ test-e2e-quick: # Network - Distributed P2P Database System # Makefile for development and build tasks -.PHONY: build clean test deps tidy fmt vet lint install-hooks upload-devnet upload-testnet redeploy-devnet redeploy-testnet release health +.PHONY: build clean test deps tidy fmt vet lint install-hooks push-devnet push-testnet rollout-devnet rollout-testnet release -VERSION := 0.112.7 +VERSION := 0.115.0 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' @@ -89,9 +89,13 @@ build-linux: deps GOOS=linux GOARCH=amd64 go build -ldflags "$(LDFLAGS_LINUX)" -trimpath -o bin-linux/orama ./cmd/cli/ @echo "✓ CLI built at bin-linux/orama" @echo "" - @echo "Next steps:" - @echo " ./scripts/generate-source-archive.sh" - @echo " ./bin/orama install --vps-ip --nameserver --domain ..." + @echo "Prefer 'make build-archive' for full pre-built binary archive." + +# Build pre-compiled binary archive for deployment (all binaries + deps) +build-archive: deps + @echo "Building binary archive (version=$(VERSION))..." + go build -ldflags "$(LDFLAGS)" -o bin/orama ./cmd/cli/ + ./bin/orama build --output /tmp/orama-$(VERSION)-linux-amd64.tar.gz # Install git hooks install-hooks: @@ -105,29 +109,21 @@ clean: rm -rf data/ @echo "Clean complete!" -# Upload source to devnet using fanout (upload to 1 node, parallel distribute to rest) -upload-devnet: - @bash scripts/upload-source-fanout.sh --env devnet +# Push binary archive to devnet nodes (fanout distribution) +push-devnet: + ./bin/orama node push --env devnet -# Upload source to testnet using fanout -upload-testnet: - @bash scripts/upload-source-fanout.sh --env testnet +# Push binary archive to testnet nodes (fanout distribution) +push-testnet: + ./bin/orama node push --env testnet -# Deploy to devnet (build + rolling upgrade all nodes) -redeploy-devnet: - @bash scripts/redeploy.sh --devnet +# Full rollout to devnet (build + push + rolling upgrade) +rollout-devnet: + ./bin/orama node rollout --env devnet --yes -# Deploy to devnet without rebuilding -redeploy-devnet-quick: - @bash scripts/redeploy.sh --devnet --no-build - -# Deploy to testnet (build + rolling upgrade all nodes) -redeploy-testnet: - @bash scripts/redeploy.sh --testnet - -# Deploy to testnet without rebuilding -redeploy-testnet-quick: - @bash scripts/redeploy.sh --testnet --no-build +# Full rollout to testnet (build + push + rolling upgrade) +rollout-testnet: + ./bin/orama node rollout --env testnet --yes # Interactive release workflow (tag + push) release: @@ -140,14 +136,7 @@ health: echo "Usage: make health ENV=devnet|testnet"; \ exit 1; \ fi - @while IFS='|' read -r env host pass role key; do \ - [ -z "$$env" ] && continue; \ - case "$$env" in \#*) continue;; esac; \ - env="$$(echo "$$env" | xargs)"; \ - [ "$$env" != "$(ENV)" ] && continue; \ - role="$$(echo "$$role" | xargs)"; \ - bash scripts/check-node-health.sh "$$host" "$$pass" "$$host ($$role)"; \ - done < scripts/remote-nodes.conf + ./bin/orama monitor report --env $(ENV) # Help help: @@ -170,10 +159,11 @@ help: @echo " ORAMA_GATEWAY_URL=https://orama-devnet.network make test-e2e-prod" @echo "" @echo "Deployment:" - @echo " make redeploy-devnet - Build + rolling deploy to all devnet nodes" - @echo " make redeploy-devnet-quick - Deploy to devnet without rebuilding" - @echo " make redeploy-testnet - Build + rolling deploy to all testnet nodes" - @echo " make redeploy-testnet-quick- Deploy to testnet without rebuilding" + @echo " make build-archive - Build pre-compiled binary archive for deployment" + @echo " make push-devnet - Push binary archive to devnet nodes" + @echo " make push-testnet - Push binary archive to testnet nodes" + @echo " make rollout-devnet - Full rollout: build + push + rolling upgrade (devnet)" + @echo " make rollout-testnet - Full rollout: build + push + rolling upgrade (testnet)" @echo " make health ENV=devnet - Check health of all nodes in an environment" @echo " make release - Interactive release workflow (tag + push)" @echo "" diff --git a/cmd/cli/root.go b/cmd/cli/root.go index 93aba5d..266fc9b 100644 --- a/cmd/cli/root.go +++ b/cmd/cli/root.go @@ -9,6 +9,7 @@ import ( // Command groups "github.com/DeBrosOfficial/network/pkg/cli/cmd/app" "github.com/DeBrosOfficial/network/pkg/cli/cmd/authcmd" + "github.com/DeBrosOfficial/network/pkg/cli/cmd/buildcmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/dbcmd" deploycmd "github.com/DeBrosOfficial/network/pkg/cli/cmd/deploy" "github.com/DeBrosOfficial/network/pkg/cli/cmd/envcmd" @@ -83,6 +84,9 @@ and interacting with the Orama distributed network.`, // Serverless function commands rootCmd.AddCommand(functioncmd.Cmd) + // Build command (cross-compile binary archive) + rootCmd.AddCommand(buildcmd.Cmd) + return rootCmd } diff --git a/docs/DEV_DEPLOY.md b/docs/DEV_DEPLOY.md index c33537d..07265a4 100644 --- a/docs/DEV_DEPLOY.md +++ b/docs/DEV_DEPLOY.md @@ -27,87 +27,64 @@ make test ## Deploying to VPS -Source is always deployed via SCP (no git on VPS). The CLI is the only binary cross-compiled locally; everything else is built from source on the VPS. +All binaries are pre-compiled locally and shipped as a binary archive. Zero compilation on the VPS. ### Deploy Workflow ```bash -# 1. Cross-compile the CLI for Linux -make build-linux +# One-command: build + push + rolling upgrade +orama node rollout --env testnet -# 2. Generate a source archive (includes CLI binary + full source) -./scripts/generate-source-archive.sh -# Creates: /tmp/network-source.tar.gz +# Or step by step: -# 3. Install on a new VPS (handles SCP, extract, and remote install automatically) -./bin/orama node install --vps-ip --nameserver --domain --base-domain +# 1. Build binary archive (cross-compiles all binaries for linux/amd64) +orama build +# Creates: /tmp/orama--linux-amd64.tar.gz -# Or upgrade an existing VPS -./bin/orama node upgrade --restart +# 2. Push archive to all nodes (fanout via hub node) +orama node push --env testnet + +# 3. Rolling upgrade (one node at a time, followers first, leader last) +orama node upgrade --env testnet ``` -The `orama node install` command automatically: -1. Uploads the source archive via SCP -2. Extracts source to `/opt/orama/src` and installs the CLI to `/usr/local/bin/orama` -3. Runs `orama node install` on the VPS which builds all binaries from source (Go, CoreDNS, Caddy, Olric, etc.) +### Fresh Node Install + +```bash +# Build the archive first (if not already built) +orama build + +# Install on a new VPS (auto-uploads binary archive, zero compilation) +orama node install --vps-ip --nameserver --domain --base-domain +``` + +The installer auto-detects the binary archive at `/opt/orama/manifest.json` and copies pre-built binaries instead of compiling from source. ### Upgrading a Multi-Node Cluster (CRITICAL) -**NEVER restart all nodes simultaneously.** RQLite uses Raft consensus and requires a majority (quorum) to function. Restarting all nodes at once can cause cluster splits where nodes elect different leaders or form isolated clusters. +**NEVER restart all nodes simultaneously.** RQLite uses Raft consensus and requires a majority (quorum) to function. -#### Safe Upgrade Procedure (Rolling Restart) - -Always upgrade nodes **one at a time**, waiting for each to rejoin before proceeding: +#### Safe Upgrade Procedure ```bash -# 1. Build CLI + generate archive -make build-linux -./scripts/generate-source-archive.sh -# Creates: /tmp/network-source.tar.gz +# Full rollout (build + push + rolling upgrade, one command) +orama node rollout --env testnet -# 2. Upload to ONE node first (the "hub" node) -sshpass -p '' scp /tmp/network-source.tar.gz ubuntu@:/tmp/ +# Or with more control: +orama node push --env testnet # Push archive to all nodes +orama node upgrade --env testnet # Rolling upgrade (auto-detects leader) +orama node upgrade --env testnet --node 1.2.3.4 # Single node only +orama node upgrade --env testnet --delay 60 # 60s between nodes +``` -# 3. Fan out from hub to all other nodes (server-to-server is faster) -ssh ubuntu@ -for ip in ; do - scp /tmp/network-source.tar.gz ubuntu@$ip:/tmp/ -done -exit +The rolling upgrade automatically: +1. Upgrades **follower** nodes first +2. Upgrades the **leader** last +3. Waits a configurable delay between nodes (default: 30s) -# 4. Extract on ALL nodes (can be done in parallel, no restart yet) -for ip in ; do - ssh ubuntu@$ip 'sudo bash -s' < scripts/extract-deploy.sh -done - -# 5. Find the RQLite leader (upgrade this one LAST) -orama monitor report --env -# Check "rqlite_leader" in summary output - -# 6. Upgrade FOLLOWER nodes one at a time -ssh ubuntu@ 'sudo orama node stop && sudo orama node upgrade --restart' - -# IMPORTANT: Verify FULL health before proceeding to next node: -orama monitor report --env --node -# Check: -# - All services active, 0 restart loops -# - RQLite: Follower state, applied_index matches cluster -# - All RQLite peers reachable (no partition alerts) -# - WireGuard peers connected with recent handshakes -# Only proceed to next node after ALL checks pass. -# -# NOTE: After restarting a node, other nodes may briefly report it as -# "unreachable" with "broken pipe" errors. This is normal — Raft TCP -# connections need ~1-2 minutes to re-establish. Wait and re-check -# before escalating. - -# Repeat for each follower... - -# 7. Upgrade the LEADER node last -ssh ubuntu@ 'sudo orama node stop && sudo orama node upgrade --restart' - -# Verify the new leader was elected and cluster is fully healthy: -orama monitor report --env +After each node, verify health: +```bash +orama monitor report --env testnet ``` #### What NOT to Do @@ -121,31 +98,38 @@ orama monitor report --env If nodes get stuck in "Candidate" state or show "leader not found" errors: -1. Identify which node has the most recent data (usually the old leader) -2. Keep that node running as the new leader -3. On each other node, clear RQLite data and restart: - ```bash - sudo orama node stop - sudo rm -rf /opt/orama/.orama/data/rqlite - sudo systemctl start orama-node - ``` -4. The node should automatically rejoin using its configured `rqlite_join_address` - -If automatic rejoin fails, the node may have started without the `-join` flag. Check: ```bash -ps aux | grep rqlited -# Should include: -join 10.0.0.1:7001 (or similar) +# Recover the Raft cluster (specify the node with highest commit index as leader) +orama node recover-raft --env testnet --leader 1.2.3.4 ``` -If `-join` is missing, the node bootstrapped standalone. You'll need to either: -- Restart orama-node (it should detect empty data and use join) -- Or do a full cluster rebuild from CLEAN_NODE.md +This will: +1. Stop orama-node on ALL nodes +2. Backup + delete raft/ on non-leader nodes +3. Start the leader, wait for Leader state +4. Start remaining nodes in batches +5. Verify cluster health -### Deploying to Multiple Nodes +### Cleaning Nodes for Reinstallation -To deploy to all nodes, repeat steps 3-5 (dev) or 3-4 (production) for each VPS IP. +```bash +# Wipe all data and services (preserves Anyone relay keys) +orama node clean --env testnet --force -**Important:** When using `--restart`, do nodes one at a time (see "Upgrading a Multi-Node Cluster" above). +# Also remove shared binaries (rqlited, ipfs, caddy, etc.) +orama node clean --env testnet --nuclear --force + +# Single node only +orama node clean --env testnet --node 1.2.3.4 --force +``` + +### Push Options + +```bash +orama node push --env devnet # Fanout via hub (default, fastest) +orama node push --env testnet --node 1.2.3.4 # Single node +orama node push --env testnet --direct # Sequential, no fanout +``` ### CLI Flags Reference @@ -189,11 +173,56 @@ To deploy to all nodes, repeat steps 3-5 (dev) or 3-4 (production) for each VPS | Flag | Description | |------|-------------| -| `--restart` | Restart all services after upgrade | +| `--restart` | Restart all services after upgrade (local mode) | +| `--env ` | Target environment for remote rolling upgrade | +| `--node ` | Upgrade a single node only | +| `--delay ` | Delay between nodes during rolling upgrade (default: 30) | | `--anyone-relay` | Enable Anyone relay (same flags as install) | | `--anyone-bandwidth ` | Limit relay to N% of VPS bandwidth (default: 30, 0=unlimited) | | `--anyone-accounting ` | Monthly data cap for relay in GB (0=unlimited) | +#### `orama build` + +| Flag | Description | +|------|-------------| +| `--arch ` | Target architecture (default: amd64) | +| `--output ` | Output archive path | +| `--verbose` | Verbose build output | + +#### `orama node push` + +| Flag | Description | +|------|-------------| +| `--env ` | Target environment (required) | +| `--node ` | Push to a single node only | +| `--direct` | Sequential upload (no hub fanout) | + +#### `orama node rollout` + +| Flag | Description | +|------|-------------| +| `--env ` | Target environment (required) | +| `--no-build` | Skip the build step | +| `--yes` | Skip confirmation | +| `--delay ` | Delay between nodes (default: 30) | + +#### `orama node clean` + +| Flag | Description | +|------|-------------| +| `--env ` | Target environment (required) | +| `--node ` | Clean a single node only | +| `--nuclear` | Also remove shared binaries | +| `--force` | Skip confirmation (DESTRUCTIVE) | + +#### `orama node recover-raft` + +| Flag | Description | +|------|-------------| +| `--env ` | Target environment (required) | +| `--leader ` | Leader node IP — highest commit index (required) | +| `--force` | Skip confirmation (DESTRUCTIVE) | + #### `orama node` (Service Management) Use these commands to manage services on production nodes: diff --git a/pkg/cli/build/archive.go b/pkg/cli/build/archive.go new file mode 100644 index 0000000..25d8dd7 --- /dev/null +++ b/pkg/cli/build/archive.go @@ -0,0 +1,269 @@ +package build + +import ( + "archive/tar" + "compress/gzip" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "strings" + "time" +) + +// Manifest describes the contents of a binary archive. +type Manifest struct { + Version string `json:"version"` + Commit string `json:"commit"` + Date string `json:"date"` + Arch string `json:"arch"` + Checksums map[string]string `json:"checksums"` // filename -> sha256 +} + +// generateManifest creates the manifest with SHA256 checksums of all binaries. +func (b *Builder) generateManifest() (*Manifest, error) { + m := &Manifest{ + Version: b.version, + Commit: b.commit, + Date: b.date, + Arch: b.flags.Arch, + Checksums: make(map[string]string), + } + + entries, err := os.ReadDir(b.binDir) + if err != nil { + return nil, err + } + + for _, entry := range entries { + if entry.IsDir() { + continue + } + path := filepath.Join(b.binDir, entry.Name()) + hash, err := sha256File(path) + if err != nil { + return nil, fmt.Errorf("failed to hash %s: %w", entry.Name(), err) + } + m.Checksums[entry.Name()] = hash + } + + return m, nil +} + +// createArchive creates the tar.gz archive from the build directory. +func (b *Builder) createArchive(outputPath string, manifest *Manifest) error { + fmt.Printf("\nCreating archive: %s\n", outputPath) + + // Write manifest.json to tmpDir + manifestData, err := json.MarshalIndent(manifest, "", " ") + if err != nil { + return err + } + if err := os.WriteFile(filepath.Join(b.tmpDir, "manifest.json"), manifestData, 0644); err != nil { + return err + } + + // Create output file + f, err := os.Create(outputPath) + if err != nil { + return err + } + defer f.Close() + + gw := gzip.NewWriter(f) + defer gw.Close() + + tw := tar.NewWriter(gw) + defer tw.Close() + + // Add bin/ directory + if err := addDirToTar(tw, b.binDir, "bin"); err != nil { + return err + } + + // Add systemd/ directory + systemdDir := filepath.Join(b.tmpDir, "systemd") + if _, err := os.Stat(systemdDir); err == nil { + if err := addDirToTar(tw, systemdDir, "systemd"); err != nil { + return err + } + } + + // Add packages/ directory if it exists + packagesDir := filepath.Join(b.tmpDir, "packages") + if _, err := os.Stat(packagesDir); err == nil { + if err := addDirToTar(tw, packagesDir, "packages"); err != nil { + return err + } + } + + // Add manifest.json + if err := addFileToTar(tw, filepath.Join(b.tmpDir, "manifest.json"), "manifest.json"); err != nil { + return err + } + + // Print summary + fmt.Printf(" bin/: %d binaries\n", len(manifest.Checksums)) + fmt.Printf(" systemd/: namespace templates\n") + fmt.Printf(" manifest: v%s (%s) linux/%s\n", manifest.Version, manifest.Commit, manifest.Arch) + + info, err := f.Stat() + if err == nil { + fmt.Printf(" size: %s\n", formatBytes(info.Size())) + } + + return nil +} + +// addDirToTar adds all files in a directory to the tar archive under the given prefix. +func addDirToTar(tw *tar.Writer, srcDir, prefix string) error { + return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + // Calculate relative path + relPath, err := filepath.Rel(srcDir, path) + if err != nil { + return err + } + tarPath := filepath.Join(prefix, relPath) + + if info.IsDir() { + header := &tar.Header{ + Name: tarPath + "/", + Mode: 0755, + Typeflag: tar.TypeDir, + } + return tw.WriteHeader(header) + } + + return addFileToTar(tw, path, tarPath) + }) +} + +// addFileToTar adds a single file to the tar archive. +func addFileToTar(tw *tar.Writer, srcPath, tarPath string) error { + f, err := os.Open(srcPath) + if err != nil { + return err + } + defer f.Close() + + info, err := f.Stat() + if err != nil { + return err + } + + header := &tar.Header{ + Name: tarPath, + Size: info.Size(), + Mode: int64(info.Mode()), + } + + if err := tw.WriteHeader(header); err != nil { + return err + } + + _, err = io.Copy(tw, f) + return err +} + +// sha256File computes the SHA256 hash of a file. +func sha256File(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + h := sha256.New() + if _, err := io.Copy(h, f); err != nil { + return "", err + } + return hex.EncodeToString(h.Sum(nil)), nil +} + +// downloadFile downloads a URL to a local file path. +func downloadFile(url, destPath string) error { + client := &http.Client{Timeout: 5 * time.Minute} + resp, err := client.Get(url) + if err != nil { + return fmt.Errorf("failed to download %s: %w", url, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("download %s returned status %d", url, resp.StatusCode) + } + + f, err := os.Create(destPath) + if err != nil { + return err + } + defer f.Close() + + _, err = io.Copy(f, resp.Body) + return err +} + +// extractFileFromTarball extracts a single file from a tar.gz archive. +func extractFileFromTarball(tarPath, targetFile, destPath string) error { + f, err := os.Open(tarPath) + if err != nil { + return err + } + defer f.Close() + + gr, err := gzip.NewReader(f) + if err != nil { + return err + } + defer gr.Close() + + tr := tar.NewReader(gr) + for { + header, err := tr.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + // Match the target file (strip leading ./ if present) + name := strings.TrimPrefix(header.Name, "./") + if name == targetFile { + out, err := os.OpenFile(destPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0755) + if err != nil { + return err + } + defer out.Close() + + if _, err := io.Copy(out, tr); err != nil { + return err + } + return nil + } + } + + return fmt.Errorf("file %s not found in archive %s", targetFile, tarPath) +} + +// formatBytes formats bytes into a human-readable string. +func formatBytes(b int64) string { + const unit = 1024 + if b < unit { + return fmt.Sprintf("%d B", b) + } + div, exp := int64(unit), 0 + for n := b / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp]) +} diff --git a/pkg/cli/build/builder.go b/pkg/cli/build/builder.go new file mode 100644 index 0000000..de82016 --- /dev/null +++ b/pkg/cli/build/builder.go @@ -0,0 +1,690 @@ +package build + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/constants" +) + +// oramaBinary defines a binary to cross-compile from the project source. +type oramaBinary struct { + Name string // output binary name + Package string // Go package path relative to project root + // Extra ldflags beyond the standard ones + ExtraLDFlags string +} + +// Builder orchestrates the entire build process. +type Builder struct { + flags *Flags + projectDir string + tmpDir string + binDir string + version string + commit string + date string +} + +// NewBuilder creates a new Builder. +func NewBuilder(flags *Flags) *Builder { + return &Builder{flags: flags} +} + +// Build runs the full build pipeline. +func (b *Builder) Build() error { + start := time.Now() + + // Find project root + projectDir, err := findProjectRoot() + if err != nil { + return err + } + b.projectDir = projectDir + + // Read version from Makefile or use "dev" + b.version = b.readVersion() + b.commit = b.readCommit() + b.date = time.Now().UTC().Format("2006-01-02T15:04:05Z") + + // Create temp build directory + b.tmpDir, err = os.MkdirTemp("", "orama-build-*") + if err != nil { + return fmt.Errorf("failed to create temp dir: %w", err) + } + defer os.RemoveAll(b.tmpDir) + + b.binDir = filepath.Join(b.tmpDir, "bin") + if err := os.MkdirAll(b.binDir, 0755); err != nil { + return fmt.Errorf("failed to create bin dir: %w", err) + } + + fmt.Printf("Building orama %s for linux/%s\n", b.version, b.flags.Arch) + fmt.Printf("Project: %s\n\n", b.projectDir) + + // Step 1: Cross-compile Orama binaries + if err := b.buildOramaBinaries(); err != nil { + return fmt.Errorf("failed to build orama binaries: %w", err) + } + + // Step 2: Cross-compile Olric + if err := b.buildOlric(); err != nil { + return fmt.Errorf("failed to build olric: %w", err) + } + + // Step 3: Cross-compile IPFS Cluster + if err := b.buildIPFSCluster(); err != nil { + return fmt.Errorf("failed to build ipfs-cluster: %w", err) + } + + // Step 4: Build CoreDNS with RQLite plugin + if err := b.buildCoreDNS(); err != nil { + return fmt.Errorf("failed to build coredns: %w", err) + } + + // Step 5: Build Caddy with Orama DNS module + if err := b.buildCaddy(); err != nil { + return fmt.Errorf("failed to build caddy: %w", err) + } + + // Step 6: Download pre-built IPFS Kubo + if err := b.downloadIPFS(); err != nil { + return fmt.Errorf("failed to download ipfs: %w", err) + } + + // Step 7: Download pre-built RQLite + if err := b.downloadRQLite(); err != nil { + return fmt.Errorf("failed to download rqlite: %w", err) + } + + // Step 8: Copy systemd templates + if err := b.copySystemdTemplates(); err != nil { + return fmt.Errorf("failed to copy systemd templates: %w", err) + } + + // Step 9: Generate manifest + manifest, err := b.generateManifest() + if err != nil { + return fmt.Errorf("failed to generate manifest: %w", err) + } + + // Step 10: Create archive + outputPath := b.flags.Output + if outputPath == "" { + outputPath = fmt.Sprintf("/tmp/orama-%s-linux-%s.tar.gz", b.version, b.flags.Arch) + } + + if err := b.createArchive(outputPath, manifest); err != nil { + return fmt.Errorf("failed to create archive: %w", err) + } + + elapsed := time.Since(start).Round(time.Second) + fmt.Printf("\nBuild complete in %s\n", elapsed) + fmt.Printf("Archive: %s\n", outputPath) + + return nil +} + +func (b *Builder) buildOramaBinaries() error { + fmt.Println("[1/7] Cross-compiling Orama binaries...") + + ldflags := fmt.Sprintf("-s -w -X 'main.version=%s' -X 'main.commit=%s' -X 'main.date=%s'", + b.version, b.commit, b.date) + + gatewayLDFlags := fmt.Sprintf("%s -X 'github.com/DeBrosOfficial/network/pkg/gateway.BuildVersion=%s' -X 'github.com/DeBrosOfficial/network/pkg/gateway.BuildCommit=%s' -X 'github.com/DeBrosOfficial/network/pkg/gateway.BuildTime=%s'", + ldflags, b.version, b.commit, b.date) + + binaries := []oramaBinary{ + {Name: "orama", Package: "./cmd/cli/"}, + {Name: "orama-node", Package: "./cmd/node/"}, + {Name: "gateway", Package: "./cmd/gateway/", ExtraLDFlags: gatewayLDFlags}, + {Name: "identity", Package: "./cmd/identity/"}, + {Name: "sfu", Package: "./cmd/sfu/"}, + {Name: "turn", Package: "./cmd/turn/"}, + } + + for _, bin := range binaries { + flags := ldflags + if bin.ExtraLDFlags != "" { + flags = bin.ExtraLDFlags + } + + output := filepath.Join(b.binDir, bin.Name) + cmd := exec.Command("go", "build", + "-ldflags", flags, + "-trimpath", + "-o", output, + bin.Package) + cmd.Dir = b.projectDir + cmd.Env = b.crossEnv() + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if b.flags.Verbose { + fmt.Printf(" go build -o %s %s\n", bin.Name, bin.Package) + } + + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to build %s: %w", bin.Name, err) + } + fmt.Printf(" ✓ %s\n", bin.Name) + } + + return nil +} + +func (b *Builder) buildOlric() error { + fmt.Printf("[2/7] Cross-compiling Olric %s...\n", constants.OlricVersion) + + cmd := exec.Command("go", "install", + fmt.Sprintf("github.com/olric-data/olric/cmd/olric-server@%s", constants.OlricVersion)) + cmd.Env = append(b.crossEnv(), + "GOBIN="+b.binDir, + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return err + } + fmt.Println(" ✓ olric-server") + return nil +} + +func (b *Builder) buildIPFSCluster() error { + fmt.Printf("[3/7] Cross-compiling IPFS Cluster %s...\n", constants.IPFSClusterVersion) + + cmd := exec.Command("go", "install", + fmt.Sprintf("github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service@%s", constants.IPFSClusterVersion)) + cmd.Env = append(b.crossEnv(), + "GOBIN="+b.binDir, + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return err + } + fmt.Println(" ✓ ipfs-cluster-service") + return nil +} + +func (b *Builder) buildCoreDNS() error { + fmt.Printf("[4/7] Building CoreDNS %s with RQLite plugin...\n", constants.CoreDNSVersion) + + buildDir := filepath.Join(b.tmpDir, "coredns-build") + + // Clone CoreDNS + fmt.Println(" Cloning CoreDNS...") + cmd := exec.Command("git", "clone", "--depth", "1", + "--branch", "v"+constants.CoreDNSVersion, + "https://github.com/coredns/coredns.git", buildDir) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to clone coredns: %w", err) + } + + // Copy RQLite plugin from local source + pluginSrc := filepath.Join(b.projectDir, "pkg", "coredns", "rqlite") + pluginDst := filepath.Join(buildDir, "plugin", "rqlite") + if err := os.MkdirAll(pluginDst, 0755); err != nil { + return err + } + + entries, err := os.ReadDir(pluginSrc) + if err != nil { + return fmt.Errorf("failed to read rqlite plugin source at %s: %w", pluginSrc, err) + } + for _, entry := range entries { + if entry.IsDir() || filepath.Ext(entry.Name()) != ".go" { + continue + } + data, err := os.ReadFile(filepath.Join(pluginSrc, entry.Name())) + if err != nil { + return err + } + if err := os.WriteFile(filepath.Join(pluginDst, entry.Name()), data, 0644); err != nil { + return err + } + } + + // Write plugin.cfg (same as build-linux-coredns.sh) + pluginCfg := `metadata:metadata +cancel:cancel +tls:tls +reload:reload +nsid:nsid +bufsize:bufsize +root:root +bind:bind +debug:debug +trace:trace +ready:ready +health:health +pprof:pprof +prometheus:metrics +errors:errors +log:log +dnstap:dnstap +local:local +dns64:dns64 +acl:acl +any:any +chaos:chaos +loadbalance:loadbalance +cache:cache +rewrite:rewrite +header:header +dnssec:dnssec +autopath:autopath +minimal:minimal +template:template +transfer:transfer +hosts:hosts +file:file +auto:auto +secondary:secondary +loop:loop +forward:forward +grpc:grpc +erratic:erratic +whoami:whoami +on:github.com/coredns/caddy/onevent +sign:sign +view:view +rqlite:rqlite +` + if err := os.WriteFile(filepath.Join(buildDir, "plugin.cfg"), []byte(pluginCfg), 0644); err != nil { + return err + } + + // Add dependencies + fmt.Println(" Adding dependencies...") + goPath := os.Getenv("PATH") + baseEnv := append(os.Environ(), + "PATH="+goPath, + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + + for _, dep := range []string{"github.com/miekg/dns@latest", "go.uber.org/zap@latest"} { + cmd := exec.Command("go", "get", dep) + cmd.Dir = buildDir + cmd.Env = baseEnv + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to get %s: %w", dep, err) + } + } + + cmd = exec.Command("go", "mod", "tidy") + cmd.Dir = buildDir + cmd.Env = baseEnv + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("go mod tidy failed: %w", err) + } + + // Generate plugin code + fmt.Println(" Generating plugin code...") + cmd = exec.Command("go", "generate") + cmd.Dir = buildDir + cmd.Env = baseEnv + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("go generate failed: %w", err) + } + + // Cross-compile + fmt.Println(" Building binary...") + cmd = exec.Command("go", "build", + "-ldflags", "-s -w", + "-trimpath", + "-o", filepath.Join(b.binDir, "coredns")) + cmd.Dir = buildDir + cmd.Env = append(baseEnv, + "GOOS=linux", + fmt.Sprintf("GOARCH=%s", b.flags.Arch), + "CGO_ENABLED=0") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("build failed: %w", err) + } + + fmt.Println(" ✓ coredns") + return nil +} + +func (b *Builder) buildCaddy() error { + fmt.Printf("[5/7] Building Caddy %s with Orama DNS module...\n", constants.CaddyVersion) + + // Ensure xcaddy is available + if _, err := exec.LookPath("xcaddy"); err != nil { + return fmt.Errorf("xcaddy not found in PATH — install with: go install github.com/caddyserver/xcaddy/cmd/xcaddy@latest") + } + + moduleDir := filepath.Join(b.tmpDir, "caddy-dns-orama") + if err := os.MkdirAll(moduleDir, 0755); err != nil { + return err + } + + // Write go.mod + goMod := fmt.Sprintf(`module github.com/DeBrosOfficial/caddy-dns-orama + +go 1.22 + +require ( + github.com/caddyserver/caddy/v2 v2.%s + github.com/libdns/libdns v1.1.0 +) +`, constants.CaddyVersion[2:]) + if err := os.WriteFile(filepath.Join(moduleDir, "go.mod"), []byte(goMod), 0644); err != nil { + return err + } + + // Write provider.go — read from the caddy installer's generated code + // We inline the same provider code used by the VPS-side caddy installer + providerCode := generateCaddyProviderCode() + if err := os.WriteFile(filepath.Join(moduleDir, "provider.go"), []byte(providerCode), 0644); err != nil { + return err + } + + // go mod tidy + cmd := exec.Command("go", "mod", "tidy") + cmd.Dir = moduleDir + cmd.Env = append(os.Environ(), + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("go mod tidy failed: %w", err) + } + + // Build with xcaddy + fmt.Println(" Building binary...") + cmd = exec.Command("xcaddy", "build", + "v"+constants.CaddyVersion, + "--with", "github.com/DeBrosOfficial/caddy-dns-orama="+moduleDir, + "--output", filepath.Join(b.binDir, "caddy")) + cmd.Env = append(os.Environ(), + "GOOS=linux", + fmt.Sprintf("GOARCH=%s", b.flags.Arch), + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("xcaddy build failed: %w", err) + } + + fmt.Println(" ✓ caddy") + return nil +} + +func (b *Builder) downloadIPFS() error { + fmt.Printf("[6/7] Downloading IPFS Kubo %s...\n", constants.IPFSKuboVersion) + + arch := b.flags.Arch + tarball := fmt.Sprintf("kubo_%s_linux-%s.tar.gz", constants.IPFSKuboVersion, arch) + url := fmt.Sprintf("https://dist.ipfs.tech/kubo/%s/%s", constants.IPFSKuboVersion, tarball) + tarPath := filepath.Join(b.tmpDir, tarball) + + if err := downloadFile(url, tarPath); err != nil { + return err + } + + // Extract ipfs binary from kubo/ipfs + if err := extractFileFromTarball(tarPath, "kubo/ipfs", filepath.Join(b.binDir, "ipfs")); err != nil { + return err + } + + fmt.Println(" ✓ ipfs") + return nil +} + +func (b *Builder) downloadRQLite() error { + fmt.Printf("[7/7] Downloading RQLite %s...\n", constants.RQLiteVersion) + + arch := b.flags.Arch + tarball := fmt.Sprintf("rqlite-v%s-linux-%s.tar.gz", constants.RQLiteVersion, arch) + url := fmt.Sprintf("https://github.com/rqlite/rqlite/releases/download/v%s/%s", constants.RQLiteVersion, tarball) + tarPath := filepath.Join(b.tmpDir, tarball) + + if err := downloadFile(url, tarPath); err != nil { + return err + } + + // Extract rqlited binary + extractDir := fmt.Sprintf("rqlite-v%s-linux-%s", constants.RQLiteVersion, arch) + if err := extractFileFromTarball(tarPath, extractDir+"/rqlited", filepath.Join(b.binDir, "rqlited")); err != nil { + return err + } + + fmt.Println(" ✓ rqlited") + return nil +} + +func (b *Builder) copySystemdTemplates() error { + systemdSrc := filepath.Join(b.projectDir, "systemd") + systemdDst := filepath.Join(b.tmpDir, "systemd") + if err := os.MkdirAll(systemdDst, 0755); err != nil { + return err + } + + entries, err := os.ReadDir(systemdSrc) + if err != nil { + return fmt.Errorf("failed to read systemd dir: %w", err) + } + + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".service") { + continue + } + data, err := os.ReadFile(filepath.Join(systemdSrc, entry.Name())) + if err != nil { + return err + } + if err := os.WriteFile(filepath.Join(systemdDst, entry.Name()), data, 0644); err != nil { + return err + } + } + + return nil +} + +// crossEnv returns the environment for cross-compilation. +func (b *Builder) crossEnv() []string { + return append(os.Environ(), + "GOOS=linux", + fmt.Sprintf("GOARCH=%s", b.flags.Arch), + "CGO_ENABLED=0") +} + +func (b *Builder) readVersion() string { + // Try to read from Makefile + data, err := os.ReadFile(filepath.Join(b.projectDir, "Makefile")) + if err != nil { + return "dev" + } + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "VERSION") { + parts := strings.SplitN(line, ":=", 2) + if len(parts) == 2 { + return strings.TrimSpace(parts[1]) + } + } + } + return "dev" +} + +func (b *Builder) readCommit() string { + cmd := exec.Command("git", "rev-parse", "--short", "HEAD") + cmd.Dir = b.projectDir + out, err := cmd.Output() + if err != nil { + return "unknown" + } + return strings.TrimSpace(string(out)) +} + +// generateCaddyProviderCode returns the Caddy DNS provider Go source. +// This is the same code used by the VPS-side caddy installer. +func generateCaddyProviderCode() string { + return `// Package orama implements a DNS provider for Caddy that uses the Orama Network +// gateway's internal ACME API for DNS-01 challenge validation. +package orama + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/caddyserver/caddy/v2" + "github.com/caddyserver/caddy/v2/caddyconfig/caddyfile" + "github.com/libdns/libdns" +) + +func init() { + caddy.RegisterModule(Provider{}) +} + +// Provider wraps the Orama DNS provider for Caddy. +type Provider struct { + // Endpoint is the URL of the Orama gateway's ACME API + // Default: http://localhost:6001/v1/internal/acme + Endpoint string ` + "`json:\"endpoint,omitempty\"`" + ` +} + +// CaddyModule returns the Caddy module information. +func (Provider) CaddyModule() caddy.ModuleInfo { + return caddy.ModuleInfo{ + ID: "dns.providers.orama", + New: func() caddy.Module { return new(Provider) }, + } +} + +// Provision sets up the module. +func (p *Provider) Provision(ctx caddy.Context) error { + if p.Endpoint == "" { + p.Endpoint = "http://localhost:6001/v1/internal/acme" + } + return nil +} + +// UnmarshalCaddyfile parses the Caddyfile configuration. +func (p *Provider) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { + for d.Next() { + for d.NextBlock(0) { + switch d.Val() { + case "endpoint": + if !d.NextArg() { + return d.ArgErr() + } + p.Endpoint = d.Val() + default: + return d.Errf("unrecognized option: %s", d.Val()) + } + } + } + return nil +} + +// AppendRecords adds records to the zone. +func (p *Provider) AppendRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { + var added []libdns.Record + for _, rec := range records { + rr := rec.RR() + if rr.Type != "TXT" { + continue + } + fqdn := rr.Name + "." + zone + payload := map[string]string{"fqdn": fqdn, "value": rr.Data} + body, err := json.Marshal(payload) + if err != nil { + return added, fmt.Errorf("failed to marshal request: %w", err) + } + req, err := http.NewRequestWithContext(ctx, "POST", p.Endpoint+"/present", bytes.NewReader(body)) + if err != nil { + return added, fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return added, fmt.Errorf("failed to present challenge: %w", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return added, fmt.Errorf("present failed with status %d", resp.StatusCode) + } + added = append(added, rec) + } + return added, nil +} + +// DeleteRecords removes records from the zone. +func (p *Provider) DeleteRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { + var deleted []libdns.Record + for _, rec := range records { + rr := rec.RR() + if rr.Type != "TXT" { + continue + } + fqdn := rr.Name + "." + zone + payload := map[string]string{"fqdn": fqdn, "value": rr.Data} + body, err := json.Marshal(payload) + if err != nil { + return deleted, fmt.Errorf("failed to marshal request: %w", err) + } + req, err := http.NewRequestWithContext(ctx, "POST", p.Endpoint+"/cleanup", bytes.NewReader(body)) + if err != nil { + return deleted, fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Do(req) + if err != nil { + return deleted, fmt.Errorf("failed to cleanup challenge: %w", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return deleted, fmt.Errorf("cleanup failed with status %d", resp.StatusCode) + } + deleted = append(deleted, rec) + } + return deleted, nil +} + +// GetRecords returns the records in the zone. Not used for ACME. +func (p *Provider) GetRecords(ctx context.Context, zone string) ([]libdns.Record, error) { + return nil, nil +} + +// SetRecords sets the records in the zone. Not used for ACME. +func (p *Provider) SetRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { + return nil, nil +} + +// Interface guards +var ( + _ caddy.Module = (*Provider)(nil) + _ caddy.Provisioner = (*Provider)(nil) + _ caddyfile.Unmarshaler = (*Provider)(nil) + _ libdns.RecordAppender = (*Provider)(nil) + _ libdns.RecordDeleter = (*Provider)(nil) + _ libdns.RecordGetter = (*Provider)(nil) + _ libdns.RecordSetter = (*Provider)(nil) +) +` +} diff --git a/pkg/cli/build/command.go b/pkg/cli/build/command.go new file mode 100644 index 0000000..97fe0f4 --- /dev/null +++ b/pkg/cli/build/command.go @@ -0,0 +1,80 @@ +package build + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "runtime" +) + +// Flags represents build command flags. +type Flags struct { + Arch string + Output string + Verbose bool +} + +// Handle is the entry point for the build command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + b := NewBuilder(flags) + if err := b.Build(); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("build", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + + fs.StringVar(&flags.Arch, "arch", "amd64", "Target architecture (amd64, arm64)") + fs.StringVar(&flags.Output, "output", "", "Output archive path (default: /tmp/orama--linux-.tar.gz)") + fs.BoolVar(&flags.Verbose, "verbose", false, "Verbose output") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + return flags, nil +} + +// findProjectRoot walks up from the current directory looking for go.mod. +func findProjectRoot() (string, error) { + dir, err := os.Getwd() + if err != nil { + return "", err + } + + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + // Verify it's the network project + if _, err := os.Stat(filepath.Join(dir, "cmd", "cli")); err == nil { + return dir, nil + } + } + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + + return "", fmt.Errorf("could not find project root (no go.mod with cmd/cli found)") +} + +// detectHostArch returns the host architecture in Go naming convention. +func detectHostArch() string { + return runtime.GOARCH +} diff --git a/pkg/cli/cmd/buildcmd/build.go b/pkg/cli/cmd/buildcmd/build.go new file mode 100644 index 0000000..dd7b5db --- /dev/null +++ b/pkg/cli/cmd/buildcmd/build.go @@ -0,0 +1,24 @@ +package buildcmd + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/build" + "github.com/spf13/cobra" +) + +// Cmd is the top-level build command. +var Cmd = &cobra.Command{ + Use: "build", + Short: "Build pre-compiled binary archive for deployment", + Long: `Cross-compile all Orama binaries and dependencies for Linux, +then package them into a deployment archive. The archive includes: + - Orama binaries (CLI, node, gateway, identity, SFU, TURN) + - Olric, IPFS Kubo, IPFS Cluster, RQLite, CoreDNS, Caddy + - Systemd namespace templates + - manifest.json with checksums + +The resulting archive can be pushed to nodes with 'orama node push'.`, + Run: func(cmd *cobra.Command, args []string) { + build.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/clean.go b/pkg/cli/cmd/node/clean.go new file mode 100644 index 0000000..65c80a3 --- /dev/null +++ b/pkg/cli/cmd/node/clean.go @@ -0,0 +1,25 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/clean" + "github.com/spf13/cobra" +) + +var cleanCmd = &cobra.Command{ + Use: "clean", + Short: "Clean (wipe) remote nodes for reinstallation", + Long: `Remove all Orama data, services, and configuration from remote nodes. +Anyone relay keys at /var/lib/anon/ are preserved. + +This is a DESTRUCTIVE operation. Use --force to skip confirmation. + +Examples: + orama node clean --env testnet # Clean all testnet nodes + orama node clean --env testnet --node 1.2.3.4 # Clean specific node + orama node clean --env testnet --nuclear # Also remove shared binaries + orama node clean --env testnet --force # Skip confirmation`, + Run: func(cmd *cobra.Command, args []string) { + clean.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/node.go b/pkg/cli/cmd/node/node.go index 400f7fb..5520571 100644 --- a/pkg/cli/cmd/node/node.go +++ b/pkg/cli/cmd/node/node.go @@ -26,4 +26,8 @@ func init() { Cmd.AddCommand(migrateCmd) Cmd.AddCommand(doctorCmd) Cmd.AddCommand(reportCmd) + Cmd.AddCommand(pushCmd) + Cmd.AddCommand(rolloutCmd) + Cmd.AddCommand(cleanCmd) + Cmd.AddCommand(recoverRaftCmd) } diff --git a/pkg/cli/cmd/node/push.go b/pkg/cli/cmd/node/push.go new file mode 100644 index 0000000..3c1b159 --- /dev/null +++ b/pkg/cli/cmd/node/push.go @@ -0,0 +1,24 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/push" + "github.com/spf13/cobra" +) + +var pushCmd = &cobra.Command{ + Use: "push", + Short: "Push binary archive to remote nodes", + Long: `Upload a pre-built binary archive to remote nodes. + +By default, uses fanout distribution: uploads to one hub node, +then distributes to all others via server-to-server SCP. + +Examples: + orama node push --env devnet # Fanout to all devnet nodes + orama node push --env testnet --node 1.2.3.4 # Single node + orama node push --env testnet --direct # Sequential upload to each node`, + Run: func(cmd *cobra.Command, args []string) { + push.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/recover_raft.go b/pkg/cli/cmd/node/recover_raft.go new file mode 100644 index 0000000..a6499df --- /dev/null +++ b/pkg/cli/cmd/node/recover_raft.go @@ -0,0 +1,31 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/recover" + "github.com/spf13/cobra" +) + +var recoverRaftCmd = &cobra.Command{ + Use: "recover-raft", + Short: "Recover RQLite cluster from split-brain", + Long: `Recover the RQLite Raft cluster from split-brain failure. + +Strategy: + 1. Stop orama-node on ALL nodes simultaneously + 2. Backup and delete raft/ on non-leader nodes + 3. Start leader node, wait for Leader state + 4. Start remaining nodes in batches + 5. Verify cluster health + +The --leader flag must point to the node with the highest commit index. + +This is a DESTRUCTIVE operation. Use --force to skip confirmation. + +Examples: + orama node recover-raft --env testnet --leader 1.2.3.4 + orama node recover-raft --env devnet --leader 1.2.3.4 --force`, + Run: func(cmd *cobra.Command, args []string) { + recover.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/rollout.go b/pkg/cli/cmd/node/rollout.go new file mode 100644 index 0000000..d2a2c59 --- /dev/null +++ b/pkg/cli/cmd/node/rollout.go @@ -0,0 +1,22 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/rollout" + "github.com/spf13/cobra" +) + +var rolloutCmd = &cobra.Command{ + Use: "rollout", + Short: "Build, push, and rolling upgrade all nodes in an environment", + Long: `Full deployment pipeline: build binary archive locally, push to all nodes, +then perform a rolling upgrade (one node at a time). + +Examples: + orama node rollout --env testnet # Full: build + push + rolling upgrade + orama node rollout --env testnet --no-build # Skip build, use existing archive + orama node rollout --env testnet --yes # Skip confirmation`, + Run: func(cmd *cobra.Command, args []string) { + rollout.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/production/clean/clean.go b/pkg/cli/production/clean/clean.go new file mode 100644 index 0000000..65d1435 --- /dev/null +++ b/pkg/cli/production/clean/clean.go @@ -0,0 +1,183 @@ +package clean + +import ( + "bufio" + "flag" + "fmt" + "os" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// Flags holds clean command flags. +type Flags struct { + Env string // Target environment + Node string // Single node IP + Nuclear bool // Also remove shared binaries + Force bool // Skip confirmation +} + +// Handle is the entry point for the clean command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if err := execute(flags); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("clean", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.Env, "env", "", "Target environment (devnet, testnet) [required]") + fs.StringVar(&flags.Node, "node", "", "Clean a single node IP only") + fs.BoolVar(&flags.Nuclear, "nuclear", false, "Also remove shared binaries (rqlited, ipfs, caddy, etc.)") + fs.BoolVar(&flags.Force, "force", false, "Skip confirmation (DESTRUCTIVE)") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.Env == "" { + return nil, fmt.Errorf("--env is required\nUsage: orama node clean --env --force") + } + + return flags, nil +} + +func execute(flags *Flags) error { + nodes, err := remotessh.LoadEnvNodes(flags.Env) + if err != nil { + return err + } + + if flags.Node != "" { + nodes = remotessh.FilterByIP(nodes, flags.Node) + if len(nodes) == 0 { + return fmt.Errorf("node %s not found in %s environment", flags.Node, flags.Env) + } + } + + fmt.Printf("Clean %s: %d node(s)\n", flags.Env, len(nodes)) + if flags.Nuclear { + fmt.Printf(" Mode: NUCLEAR (removes binaries too)\n") + } + for _, n := range nodes { + fmt.Printf(" - %s (%s)\n", n.Host, n.Role) + } + fmt.Println() + + // Confirm unless --force + if !flags.Force { + fmt.Printf("This will DESTROY all data on these nodes. Anyone relay keys are preserved.\n") + fmt.Printf("Type 'yes' to confirm: ") + reader := bufio.NewReader(os.Stdin) + input, _ := reader.ReadString('\n') + if strings.TrimSpace(input) != "yes" { + fmt.Println("Aborted.") + return nil + } + fmt.Println() + } + + // Clean each node + var failed []string + for i, node := range nodes { + fmt.Printf("[%d/%d] Cleaning %s...\n", i+1, len(nodes), node.Host) + if err := cleanNode(node, flags.Nuclear); err != nil { + fmt.Fprintf(os.Stderr, " ✗ %s: %v\n", node.Host, err) + failed = append(failed, node.Host) + continue + } + fmt.Printf(" ✓ %s cleaned\n\n", node.Host) + } + + if len(failed) > 0 { + return fmt.Errorf("clean failed on %d node(s): %s", len(failed), strings.Join(failed, ", ")) + } + + fmt.Printf("✓ Clean complete (%d nodes)\n", len(nodes)) + fmt.Printf(" Anyone relay keys preserved at /var/lib/anon/\n") + fmt.Printf(" To reinstall: orama node install --vps-ip ...\n") + return nil +} + +func cleanNode(node inspector.Node, nuclear bool) error { + sudo := remotessh.SudoPrefix(node) + + nuclearFlag := "" + if nuclear { + nuclearFlag = "NUCLEAR=1" + } + + // The cleanup script runs on the remote node + script := fmt.Sprintf(`%sbash -c ' +%s + +# Stop services +for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-anyone-relay orama-anyone-client; do + systemctl stop "$svc" 2>/dev/null + systemctl disable "$svc" 2>/dev/null +done + +# Kill stragglers +pkill -9 -f "orama-node" 2>/dev/null || true +pkill -9 -f "olric-server" 2>/dev/null || true +pkill -9 -f "ipfs" 2>/dev/null || true + +# Remove systemd units +rm -f /etc/systemd/system/orama-*.service +rm -f /etc/systemd/system/coredns.service +rm -f /etc/systemd/system/caddy.service +systemctl daemon-reload 2>/dev/null + +# Tear down WireGuard +ip link delete wg0 2>/dev/null || true +rm -f /etc/wireguard/wg0.conf + +# Reset firewall +ufw --force reset 2>/dev/null || true +ufw default deny incoming 2>/dev/null || true +ufw default allow outgoing 2>/dev/null || true +ufw allow 22/tcp 2>/dev/null || true +ufw --force enable 2>/dev/null || true + +# Remove data +rm -rf /opt/orama + +# Clean configs +rm -rf /etc/coredns +rm -rf /etc/caddy +rm -f /tmp/orama-*.sh /tmp/network-source.tar.gz /tmp/orama-*.tar.gz + +# Nuclear: remove binaries +if [ -n "$NUCLEAR" ]; then + rm -f /usr/local/bin/orama /usr/local/bin/orama-node /usr/local/bin/gateway + rm -f /usr/local/bin/identity /usr/local/bin/sfu /usr/local/bin/turn + rm -f /usr/local/bin/olric-server /usr/local/bin/ipfs /usr/local/bin/ipfs-cluster-service + rm -f /usr/local/bin/rqlited /usr/local/bin/coredns + rm -f /usr/bin/caddy +fi + +# Verify Anyone keys preserved +if [ -d /var/lib/anon ]; then + echo " Anyone relay keys preserved at /var/lib/anon/" +fi + +echo " Node cleaned successfully" +'`, sudo, nuclearFlag) + + return remotessh.RunSSHStreaming(node, script) +} diff --git a/pkg/cli/production/install/remote.go b/pkg/cli/production/install/remote.go index b4ca02b..b5b10a5 100644 --- a/pkg/cli/production/install/remote.go +++ b/pkg/cli/production/install/remote.go @@ -2,6 +2,8 @@ package install import ( "fmt" + "os" + "path/filepath" "strconv" "strings" @@ -36,10 +38,18 @@ func NewRemoteOrchestrator(flags *Flags) (*RemoteOrchestrator, error) { } // Execute runs the remote install process. -// Source must already be uploaded via: ./scripts/upload-source.sh +// If a binary archive exists locally, uploads and extracts it on the VPS +// so Phase2b auto-detects pre-built mode. Otherwise, source must already +// be uploaded via: ./scripts/upload-source.sh func (r *RemoteOrchestrator) Execute() error { fmt.Printf("Installing on %s via SSH (%s@%s)...\n\n", r.flags.VpsIP, r.node.User, r.node.Host) + // Try to upload a binary archive if one exists locally + if err := r.uploadBinaryArchive(); err != nil { + fmt.Printf(" ⚠️ Binary archive upload skipped: %v\n", err) + fmt.Printf(" Proceeding with source mode (source must already be on VPS)\n\n") + } + // Run remote install fmt.Printf("Running install on VPS...\n\n") if err := r.runRemoteInstall(); err != nil { @@ -49,6 +59,62 @@ func (r *RemoteOrchestrator) Execute() error { return nil } +// uploadBinaryArchive finds a local binary archive and uploads + extracts it on the VPS. +// Returns nil on success, error if no archive found or upload failed. +func (r *RemoteOrchestrator) uploadBinaryArchive() error { + archivePath := r.findLocalArchive() + if archivePath == "" { + return fmt.Errorf("no binary archive found locally") + } + + fmt.Printf("Uploading binary archive: %s\n", filepath.Base(archivePath)) + + // Upload to /tmp/ on VPS + remoteTmp := "/tmp/" + filepath.Base(archivePath) + if err := uploadFile(r.node, archivePath, remoteTmp); err != nil { + return fmt.Errorf("failed to upload archive: %w", err) + } + + // Extract to /opt/orama/ on VPS + fmt.Printf("Extracting archive on VPS...\n") + extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && echo ' ✓ Archive extracted to /opt/orama/'", + r.sudoPrefix(), remoteTmp, remoteTmp) + if err := runSSHStreaming(r.node, extractCmd); err != nil { + return fmt.Errorf("failed to extract archive on VPS: %w", err) + } + + fmt.Println() + return nil +} + +// findLocalArchive searches for a binary archive in common locations. +func (r *RemoteOrchestrator) findLocalArchive() string { + // Check /tmp/ for archives matching the naming pattern + entries, err := os.ReadDir("/tmp") + if err != nil { + return "" + } + + // Look for orama-*-linux-*.tar.gz, prefer newest + var best string + var bestMod int64 + for _, entry := range entries { + name := entry.Name() + if strings.HasPrefix(name, "orama-") && strings.Contains(name, "-linux-") && strings.HasSuffix(name, ".tar.gz") { + info, err := entry.Info() + if err != nil { + continue + } + if info.ModTime().Unix() > bestMod { + best = filepath.Join("/tmp", name) + bestMod = info.ModTime().Unix() + } + } + } + + return best +} + // runRemoteInstall executes `orama install` on the VPS. func (r *RemoteOrchestrator) runRemoteInstall() error { cmd := r.buildRemoteCommand() diff --git a/pkg/cli/production/push/push.go b/pkg/cli/production/push/push.go new file mode 100644 index 0000000..9cfebd9 --- /dev/null +++ b/pkg/cli/production/push/push.go @@ -0,0 +1,248 @@ +package push + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// Flags holds push command flags. +type Flags struct { + Env string // Target environment (devnet, testnet) + Node string // Single node IP (optional) + Direct bool // Sequential upload to each node (no fanout) +} + +// Handle is the entry point for the push command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if err := execute(flags); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("push", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.Env, "env", "", "Target environment (devnet, testnet) [required]") + fs.StringVar(&flags.Node, "node", "", "Push to a single node IP only") + fs.BoolVar(&flags.Direct, "direct", false, "Upload directly to each node (no hub fanout)") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.Env == "" { + return nil, fmt.Errorf("--env is required\nUsage: orama node push --env ") + } + + return flags, nil +} + +func execute(flags *Flags) error { + // Find archive + archivePath := findNewestArchive() + if archivePath == "" { + return fmt.Errorf("no binary archive found in /tmp/ (run `orama build` first)") + } + + info, _ := os.Stat(archivePath) + fmt.Printf("Archive: %s (%s)\n", filepath.Base(archivePath), formatBytes(info.Size())) + + // Resolve nodes + nodes, err := remotessh.LoadEnvNodes(flags.Env) + if err != nil { + return err + } + + // Filter to single node if specified + if flags.Node != "" { + nodes = remotessh.FilterByIP(nodes, flags.Node) + if len(nodes) == 0 { + return fmt.Errorf("node %s not found in %s environment", flags.Node, flags.Env) + } + } + + fmt.Printf("Environment: %s (%d nodes)\n\n", flags.Env, len(nodes)) + + if flags.Direct || len(nodes) == 1 { + return pushDirect(archivePath, nodes) + } + + return pushFanout(archivePath, nodes) +} + +// pushDirect uploads the archive to each node sequentially. +func pushDirect(archivePath string, nodes []inspector.Node) error { + remotePath := "/tmp/" + filepath.Base(archivePath) + + for i, node := range nodes { + fmt.Printf("[%d/%d] Pushing to %s...\n", i+1, len(nodes), node.Host) + + if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { + return fmt.Errorf("upload to %s failed: %w", node.Host, err) + } + + if err := extractOnNode(node, remotePath); err != nil { + return fmt.Errorf("extract on %s failed: %w", node.Host, err) + } + + fmt.Printf(" ✓ %s done\n\n", node.Host) + } + + fmt.Printf("✓ Push complete (%d nodes)\n", len(nodes)) + return nil +} + +// pushFanout uploads to a hub node, then fans out to all others via server-to-server SCP. +func pushFanout(archivePath string, nodes []inspector.Node) error { + hub := remotessh.PickHubNode(nodes) + remotePath := "/tmp/" + filepath.Base(archivePath) + + // Step 1: Upload to hub + fmt.Printf("[hub] Uploading to %s...\n", hub.Host) + if err := remotessh.UploadFile(hub, archivePath, remotePath); err != nil { + return fmt.Errorf("upload to hub %s failed: %w", hub.Host, err) + } + + if err := extractOnNode(hub, remotePath); err != nil { + return fmt.Errorf("extract on hub %s failed: %w", hub.Host, err) + } + fmt.Printf(" ✓ hub %s done\n\n", hub.Host) + + // Step 2: Fan out from hub to remaining nodes in parallel + remaining := make([]inspector.Node, 0, len(nodes)-1) + for _, n := range nodes { + if n.Host != hub.Host { + remaining = append(remaining, n) + } + } + + if len(remaining) == 0 { + fmt.Printf("✓ Push complete (1 node)\n") + return nil + } + + fmt.Printf("[fanout] Distributing from %s to %d nodes...\n", hub.Host, len(remaining)) + + var wg sync.WaitGroup + errors := make([]error, len(remaining)) + + for i, target := range remaining { + wg.Add(1) + go func(idx int, target inspector.Node) { + defer wg.Done() + + // SCP from hub to target, then extract + scpCmd := fmt.Sprintf("sshpass -p '%s' scp -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o PreferredAuthentications=password -o PubkeyAuthentication=no %s %s@%s:%s", + target.Password, remotePath, target.User, target.Host, remotePath) + + if err := remotessh.RunSSHStreaming(hub, scpCmd); err != nil { + errors[idx] = fmt.Errorf("fanout to %s failed: %w", target.Host, err) + return + } + + if err := extractOnNodeVia(hub, target, remotePath); err != nil { + errors[idx] = fmt.Errorf("extract on %s failed: %w", target.Host, err) + return + } + + fmt.Printf(" ✓ %s done\n", target.Host) + }(i, target) + } + + wg.Wait() + + // Check for errors + var failed []string + for i, err := range errors { + if err != nil { + fmt.Fprintf(os.Stderr, " ✗ %s: %v\n", remaining[i].Host, err) + failed = append(failed, remaining[i].Host) + } + } + + if len(failed) > 0 { + return fmt.Errorf("push failed on %d node(s): %s", len(failed), strings.Join(failed, ", ")) + } + + fmt.Printf("\n✓ Push complete (%d nodes)\n", len(nodes)) + return nil +} + +// extractOnNode extracts the archive on a remote node. +func extractOnNode(node inspector.Node, remotePath string) error { + sudo := remotessh.SudoPrefix(node) + cmd := fmt.Sprintf("%smkdir -p /opt/orama && %star xzf %s -C /opt/orama && %srm -f %s", + sudo, sudo, remotePath, sudo, remotePath) + return remotessh.RunSSHStreaming(node, cmd) +} + +// extractOnNodeVia extracts the archive on a target node by SSHing through the hub. +func extractOnNodeVia(hub, target inspector.Node, remotePath string) error { + sudo := remotessh.SudoPrefix(target) + extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && %star xzf %s -C /opt/orama && %srm -f %s", + sudo, sudo, remotePath, sudo, remotePath) + + // SSH from hub to target to extract + sshCmd := fmt.Sprintf("sshpass -p '%s' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o PreferredAuthentications=password -o PubkeyAuthentication=no %s@%s '%s'", + target.Password, target.User, target.Host, extractCmd) + + return remotessh.RunSSHStreaming(hub, sshCmd) +} + +// findNewestArchive finds the newest binary archive in /tmp/. +func findNewestArchive() string { + entries, err := os.ReadDir("/tmp") + if err != nil { + return "" + } + + var best string + var bestMod int64 + for _, entry := range entries { + name := entry.Name() + if strings.HasPrefix(name, "orama-") && strings.Contains(name, "-linux-") && strings.HasSuffix(name, ".tar.gz") { + info, err := entry.Info() + if err != nil { + continue + } + if info.ModTime().Unix() > bestMod { + best = filepath.Join("/tmp", name) + bestMod = info.ModTime().Unix() + } + } + } + + return best +} + +func formatBytes(b int64) string { + const unit = 1024 + if b < unit { + return fmt.Sprintf("%d B", b) + } + div, exp := int64(unit), 0 + for n := b / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp]) +} diff --git a/pkg/cli/production/recover/recover.go b/pkg/cli/production/recover/recover.go new file mode 100644 index 0000000..f697325 --- /dev/null +++ b/pkg/cli/production/recover/recover.go @@ -0,0 +1,306 @@ +package recover + +import ( + "bufio" + "flag" + "fmt" + "os" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// Flags holds recover-raft command flags. +type Flags struct { + Env string // Target environment + Leader string // Leader node IP (highest commit index) + Force bool // Skip confirmation +} + +const ( + raftDir = "/opt/orama/.orama/data/rqlite/raft" + backupDir = "/tmp/rqlite-raft-backup" +) + +// Handle is the entry point for the recover-raft command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if err := execute(flags); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("recover-raft", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.Env, "env", "", "Target environment (devnet, testnet) [required]") + fs.StringVar(&flags.Leader, "leader", "", "Leader node IP (node with highest commit index) [required]") + fs.BoolVar(&flags.Force, "force", false, "Skip confirmation (DESTRUCTIVE)") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.Env == "" { + return nil, fmt.Errorf("--env is required\nUsage: orama node recover-raft --env --leader ") + } + if flags.Leader == "" { + return nil, fmt.Errorf("--leader is required\nUsage: orama node recover-raft --env --leader ") + } + + return flags, nil +} + +func execute(flags *Flags) error { + nodes, err := remotessh.LoadEnvNodes(flags.Env) + if err != nil { + return err + } + + // Find leader node + leaderNodes := remotessh.FilterByIP(nodes, flags.Leader) + if len(leaderNodes) == 0 { + return fmt.Errorf("leader %s not found in %s environment", flags.Leader, flags.Env) + } + leader := leaderNodes[0] + + // Separate leader from followers + var followers []inspector.Node + for _, n := range nodes { + if n.Host != leader.Host { + followers = append(followers, n) + } + } + + // Print plan + fmt.Printf("Recover Raft: %s (%d nodes)\n", flags.Env, len(nodes)) + fmt.Printf(" Leader candidate: %s (%s) — raft/ data preserved\n", leader.Host, leader.Role) + for _, n := range followers { + fmt.Printf(" - %s (%s) — raft/ will be deleted\n", n.Host, n.Role) + } + fmt.Println() + + // Confirm unless --force + if !flags.Force { + fmt.Printf("⚠️ THIS WILL:\n") + fmt.Printf(" 1. Stop orama-node on ALL %d nodes\n", len(nodes)) + fmt.Printf(" 2. DELETE raft/ data on %d nodes (backup to %s)\n", len(followers), backupDir) + fmt.Printf(" 3. Keep raft/ data ONLY on %s (leader candidate)\n", leader.Host) + fmt.Printf(" 4. Restart all nodes to reform the cluster\n") + fmt.Printf("\nType 'yes' to confirm: ") + reader := bufio.NewReader(os.Stdin) + input, _ := reader.ReadString('\n') + if strings.TrimSpace(input) != "yes" { + fmt.Println("Aborted.") + return nil + } + fmt.Println() + } + + // Phase 1: Stop orama-node on ALL nodes + if err := phase1StopAll(nodes); err != nil { + return fmt.Errorf("phase 1 (stop all): %w", err) + } + + // Phase 2: Backup and delete raft/ on non-leader nodes + if err := phase2ClearFollowers(followers); err != nil { + return fmt.Errorf("phase 2 (clear followers): %w", err) + } + fmt.Printf(" Leader node %s raft/ data preserved.\n\n", leader.Host) + + // Phase 3: Start leader node and wait for Leader state + if err := phase3StartLeader(leader); err != nil { + return fmt.Errorf("phase 3 (start leader): %w", err) + } + + // Phase 4: Start remaining nodes in batches + if err := phase4StartFollowers(followers); err != nil { + return fmt.Errorf("phase 4 (start followers): %w", err) + } + + // Phase 5: Verify cluster health + phase5Verify(nodes, leader) + + return nil +} + +func phase1StopAll(nodes []inspector.Node) error { + fmt.Printf("== Phase 1: Stopping orama-node on all %d nodes ==\n", len(nodes)) + + var failed []inspector.Node + for _, node := range nodes { + sudo := remotessh.SudoPrefix(node) + fmt.Printf(" Stopping %s ... ", node.Host) + + cmd := fmt.Sprintf("%ssystemctl stop orama-node 2>&1 && echo STOPPED", sudo) + if err := remotessh.RunSSHStreaming(node, cmd); err != nil { + fmt.Printf("FAILED\n") + failed = append(failed, node) + continue + } + fmt.Println() + } + + // Kill stragglers + if len(failed) > 0 { + fmt.Printf("\n⚠️ %d nodes failed to stop. Attempting kill...\n", len(failed)) + for _, node := range failed { + sudo := remotessh.SudoPrefix(node) + cmd := fmt.Sprintf("%skillall -9 orama-node rqlited 2>/dev/null; echo KILLED", sudo) + _ = remotessh.RunSSHStreaming(node, cmd) + } + } + + fmt.Printf("\nWaiting 5s for processes to fully stop...\n") + time.Sleep(5 * time.Second) + fmt.Println() + + return nil +} + +func phase2ClearFollowers(followers []inspector.Node) error { + fmt.Printf("== Phase 2: Clearing raft state on %d non-leader nodes ==\n", len(followers)) + + for _, node := range followers { + sudo := remotessh.SudoPrefix(node) + fmt.Printf(" Clearing %s ... ", node.Host) + + script := fmt.Sprintf(`%sbash -c ' +rm -rf %s +if [ -d %s ]; then + cp -r %s %s 2>/dev/null || true + rm -rf %s + echo "CLEARED (backup at %s)" +else + echo "NO_RAFT_DIR (nothing to clear)" +fi +'`, sudo, backupDir, raftDir, raftDir, backupDir, raftDir, backupDir) + + if err := remotessh.RunSSHStreaming(node, script); err != nil { + fmt.Printf("FAILED: %v\n", err) + continue + } + fmt.Println() + } + + return nil +} + +func phase3StartLeader(leader inspector.Node) error { + fmt.Printf("== Phase 3: Starting leader node (%s) ==\n", leader.Host) + + sudo := remotessh.SudoPrefix(leader) + startCmd := fmt.Sprintf("%ssystemctl start orama-node", sudo) + if err := remotessh.RunSSHStreaming(leader, startCmd); err != nil { + return fmt.Errorf("failed to start leader node %s: %w", leader.Host, err) + } + + fmt.Printf(" Waiting for leader to become Leader...\n") + maxWait := 120 + elapsed := 0 + + for elapsed < maxWait { + // Check raft state via RQLite status endpoint + checkCmd := `curl -s --max-time 3 http://localhost:5001/status 2>/dev/null | python3 -c " +import sys,json +try: + d=json.load(sys.stdin) + print(d.get('store',{}).get('raft',{}).get('state','')) +except: + print('') +" 2>/dev/null || echo ""` + + // We can't easily capture output from RunSSHStreaming, so we use a simple approach + // Check via a combined command that prints a marker + stateCheckCmd := fmt.Sprintf(`state=$(%s); echo "RAFT_STATE=$state"`, checkCmd) + // Since RunSSHStreaming prints to stdout, we'll poll and let user see the state + fmt.Printf(" ... polling (%ds / %ds)\n", elapsed, maxWait) + + // Try to check state - the output goes to stdout via streaming + _ = remotessh.RunSSHStreaming(leader, stateCheckCmd) + + time.Sleep(5 * time.Second) + elapsed += 5 + } + + fmt.Printf(" Leader start complete. Check output above for state.\n\n") + return nil +} + +func phase4StartFollowers(followers []inspector.Node) error { + fmt.Printf("== Phase 4: Starting %d remaining nodes ==\n", len(followers)) + + batchSize := 3 + for i, node := range followers { + sudo := remotessh.SudoPrefix(node) + fmt.Printf(" Starting %s ... ", node.Host) + + cmd := fmt.Sprintf("%ssystemctl start orama-node && echo STARTED", sudo) + if err := remotessh.RunSSHStreaming(node, cmd); err != nil { + fmt.Printf("FAILED: %v\n", err) + continue + } + fmt.Println() + + // Batch delay for cluster stability + if (i+1)%batchSize == 0 && i+1 < len(followers) { + fmt.Printf(" (waiting 15s between batches for cluster stability)\n") + time.Sleep(15 * time.Second) + } + } + + fmt.Println() + return nil +} + +func phase5Verify(nodes []inspector.Node, leader inspector.Node) { + fmt.Printf("== Phase 5: Waiting for cluster to stabilize ==\n") + + // Wait in 30s increments + for _, s := range []int{30, 60, 90, 120} { + time.Sleep(30 * time.Second) + fmt.Printf(" ... %ds\n", s) + } + + fmt.Printf("\n== Cluster status ==\n") + for _, node := range nodes { + marker := "" + if node.Host == leader.Host { + marker = " ← LEADER" + } + + checkCmd := `curl -s --max-time 5 http://localhost:5001/status 2>/dev/null | python3 -c " +import sys,json +try: + d=json.load(sys.stdin) + r=d.get('store',{}).get('raft',{}) + n=d.get('store',{}).get('num_nodes','?') + print(f'state={r.get(\"state\",\"?\")} commit={r.get(\"commit_index\",\"?\")} leader={r.get(\"leader\",{}).get(\"node_id\",\"?\")} nodes={n}') +except: + print('NO_RESPONSE') +" 2>/dev/null || echo "SSH_FAILED"` + + fmt.Printf(" %s%s: ", node.Host, marker) + _ = remotessh.RunSSHStreaming(node, checkCmd) + fmt.Println() + } + + fmt.Printf("\n== Recovery complete ==\n\n") + fmt.Printf("Next steps:\n") + fmt.Printf(" 1. Run 'orama monitor report --env ' to verify full cluster health\n") + fmt.Printf(" 2. If some nodes show Candidate state, give them more time (up to 5 min)\n") + fmt.Printf(" 3. If nodes fail to join, check /opt/orama/.orama/logs/rqlite-node.log on the node\n") +} diff --git a/pkg/cli/production/rollout/rollout.go b/pkg/cli/production/rollout/rollout.go new file mode 100644 index 0000000..0ee5ffa --- /dev/null +++ b/pkg/cli/production/rollout/rollout.go @@ -0,0 +1,102 @@ +package rollout + +import ( + "flag" + "fmt" + "os" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/build" + "github.com/DeBrosOfficial/network/pkg/cli/production/push" + "github.com/DeBrosOfficial/network/pkg/cli/production/upgrade" +) + +// Flags holds rollout command flags. +type Flags struct { + Env string // Target environment (devnet, testnet) + NoBuild bool // Skip the build step + Yes bool // Skip confirmation + Delay int // Delay in seconds between nodes +} + +// Handle is the entry point for the rollout command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if err := execute(flags); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("rollout", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.Env, "env", "", "Target environment (devnet, testnet) [required]") + fs.BoolVar(&flags.NoBuild, "no-build", false, "Skip build step (use existing archive)") + fs.BoolVar(&flags.Yes, "yes", false, "Skip confirmation") + fs.IntVar(&flags.Delay, "delay", 30, "Delay in seconds between nodes during rolling upgrade") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.Env == "" { + return nil, fmt.Errorf("--env is required\nUsage: orama node rollout --env ") + } + + return flags, nil +} + +func execute(flags *Flags) error { + start := time.Now() + + fmt.Printf("Rollout to %s\n", flags.Env) + fmt.Printf(" Build: %s\n", boolStr(!flags.NoBuild, "yes", "skip")) + fmt.Printf(" Delay: %ds between nodes\n\n", flags.Delay) + + // Step 1: Build + if !flags.NoBuild { + fmt.Printf("Step 1/3: Building binary archive...\n\n") + buildFlags := &build.Flags{ + Arch: "amd64", + } + builder := build.NewBuilder(buildFlags) + if err := builder.Build(); err != nil { + return fmt.Errorf("build failed: %w", err) + } + fmt.Println() + } else { + fmt.Printf("Step 1/3: Build skipped (--no-build)\n\n") + } + + // Step 2: Push + fmt.Printf("Step 2/3: Pushing to all %s nodes...\n\n", flags.Env) + push.Handle([]string{"--env", flags.Env}) + + fmt.Println() + + // Step 3: Rolling upgrade + fmt.Printf("Step 3/3: Rolling upgrade across %s...\n\n", flags.Env) + upgrade.Handle([]string{"--env", flags.Env, "--delay", fmt.Sprintf("%d", flags.Delay)}) + + elapsed := time.Since(start).Round(time.Second) + fmt.Printf("\nRollout complete in %s\n", elapsed) + return nil +} + +func boolStr(b bool, trueStr, falseStr string) string { + if b { + return trueStr + } + return falseStr +} diff --git a/pkg/cli/production/upgrade/command.go b/pkg/cli/production/upgrade/command.go index f9d7793..3085c31 100644 --- a/pkg/cli/production/upgrade/command.go +++ b/pkg/cli/production/upgrade/command.go @@ -14,7 +14,17 @@ func Handle(args []string) { os.Exit(1) } - // Check root privileges + // Remote rolling upgrade when --env is specified + if flags.Env != "" { + remote := NewRemoteUpgrader(flags) + if err := remote.Execute(); err != nil { + fmt.Fprintf(os.Stderr, "❌ %v\n", err) + os.Exit(1) + } + return + } + + // Local upgrade: requires root if os.Geteuid() != 0 { fmt.Fprintf(os.Stderr, "❌ Production upgrade must be run as root (use sudo)\n") os.Exit(1) diff --git a/pkg/cli/production/upgrade/flags.go b/pkg/cli/production/upgrade/flags.go index dc2006e..ae2073f 100644 --- a/pkg/cli/production/upgrade/flags.go +++ b/pkg/cli/production/upgrade/flags.go @@ -13,6 +13,11 @@ type Flags struct { SkipChecks bool Nameserver *bool // Pointer so we can detect if explicitly set vs default + // Remote upgrade flags + Env string // Target environment for remote rolling upgrade + NodeFilter string // Single node IP to upgrade (optional) + Delay int // Delay in seconds between nodes during rolling upgrade + // Anyone flags AnyoneClient bool AnyoneRelay bool @@ -38,6 +43,11 @@ func ParseFlags(args []string) (*Flags, error) { fs.BoolVar(&flags.RestartServices, "restart", false, "Automatically restart services after upgrade") fs.BoolVar(&flags.SkipChecks, "skip-checks", false, "Skip minimum resource checks (RAM/CPU)") + // Remote upgrade flags + fs.StringVar(&flags.Env, "env", "", "Target environment for remote rolling upgrade (devnet, testnet)") + fs.StringVar(&flags.NodeFilter, "node", "", "Upgrade a single node IP only") + fs.IntVar(&flags.Delay, "delay", 30, "Delay in seconds between nodes during rolling upgrade") + // Nameserver flag - use pointer to detect if explicitly set nameserver := fs.Bool("nameserver", false, "Make this node a nameserver (uses saved preference if not specified)") diff --git a/pkg/cli/production/upgrade/orchestrator.go b/pkg/cli/production/upgrade/orchestrator.go index 455b563..459c12f 100644 --- a/pkg/cli/production/upgrade/orchestrator.go +++ b/pkg/cli/production/upgrade/orchestrator.go @@ -424,7 +424,11 @@ func (o *Orchestrator) stopAllNamespaceServices(serviceController *production.Sy // installNamespaceTemplates installs systemd template unit files for namespace services func (o *Orchestrator) installNamespaceTemplates() error { - sourceDir := filepath.Join(o.oramaHome, "src", "systemd") + // Check pre-built archive path first, fall back to source path + sourceDir := production.OramaSystemdDir + if _, err := os.Stat(sourceDir); os.IsNotExist(err) { + sourceDir = filepath.Join(o.oramaHome, "src", "systemd") + } systemdDir := "/etc/systemd/system" templates := []string{ diff --git a/pkg/cli/production/upgrade/remote.go b/pkg/cli/production/upgrade/remote.go new file mode 100644 index 0000000..e91096c --- /dev/null +++ b/pkg/cli/production/upgrade/remote.go @@ -0,0 +1,69 @@ +package upgrade + +import ( + "fmt" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// RemoteUpgrader handles rolling upgrades across remote nodes. +type RemoteUpgrader struct { + flags *Flags +} + +// NewRemoteUpgrader creates a new remote upgrader. +func NewRemoteUpgrader(flags *Flags) *RemoteUpgrader { + return &RemoteUpgrader{flags: flags} +} + +// Execute runs the remote rolling upgrade. +func (r *RemoteUpgrader) Execute() error { + nodes, err := remotessh.LoadEnvNodes(r.flags.Env) + if err != nil { + return err + } + + // Filter to single node if specified + if r.flags.NodeFilter != "" { + nodes = remotessh.FilterByIP(nodes, r.flags.NodeFilter) + if len(nodes) == 0 { + return fmt.Errorf("node %s not found in %s environment", r.flags.NodeFilter, r.flags.Env) + } + } + + fmt.Printf("Rolling upgrade: %s (%d nodes, %ds delay)\n\n", r.flags.Env, len(nodes), r.flags.Delay) + + // Print execution plan + for i, node := range nodes { + fmt.Printf(" %d. %s (%s)\n", i+1, node.Host, node.Role) + } + fmt.Println() + + for i, node := range nodes { + fmt.Printf("[%d/%d] Upgrading %s (%s)...\n", i+1, len(nodes), node.Host, node.Role) + + if err := r.upgradeNode(node); err != nil { + return fmt.Errorf("upgrade failed on %s: %w\nStopping rollout — remaining nodes not upgraded", node.Host, err) + } + + fmt.Printf(" ✓ %s upgraded\n", node.Host) + + // Wait between nodes (except after the last one) + if i < len(nodes)-1 && r.flags.Delay > 0 { + fmt.Printf(" Waiting %ds before next node...\n\n", r.flags.Delay) + time.Sleep(time.Duration(r.flags.Delay) * time.Second) + } + } + + fmt.Printf("\n✓ Rolling upgrade complete (%d nodes)\n", len(nodes)) + return nil +} + +// upgradeNode runs `orama node upgrade --restart` on a single remote node. +func (r *RemoteUpgrader) upgradeNode(node inspector.Node) error { + sudo := remotessh.SudoPrefix(node) + cmd := fmt.Sprintf("%sorama node upgrade --restart", sudo) + return remotessh.RunSSHStreaming(node, cmd) +} diff --git a/pkg/cli/remotessh/config.go b/pkg/cli/remotessh/config.go new file mode 100644 index 0000000..19ab610 --- /dev/null +++ b/pkg/cli/remotessh/config.go @@ -0,0 +1,77 @@ +package remotessh + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// FindRemoteNodesConf searches for the remote-nodes.conf file +// in common locations relative to the current directory or project root. +func FindRemoteNodesConf() string { + candidates := []string{ + "scripts/remote-nodes.conf", + "../scripts/remote-nodes.conf", + "network/scripts/remote-nodes.conf", + } + + // Also check from home dir + home, _ := os.UserHomeDir() + if home != "" { + candidates = append(candidates, filepath.Join(home, ".orama", "remote-nodes.conf")) + } + + for _, c := range candidates { + if _, err := os.Stat(c); err == nil { + return c + } + } + return "" +} + +// LoadEnvNodes loads all nodes for a given environment from remote-nodes.conf. +func LoadEnvNodes(env string) ([]inspector.Node, error) { + confPath := FindRemoteNodesConf() + if confPath == "" { + return nil, fmt.Errorf("remote-nodes.conf not found (checked scripts/, ../scripts/, network/scripts/)") + } + + nodes, err := inspector.LoadNodes(confPath) + if err != nil { + return nil, fmt.Errorf("failed to load %s: %w", confPath, err) + } + + filtered := inspector.FilterByEnv(nodes, env) + if len(filtered) == 0 { + return nil, fmt.Errorf("no nodes found for environment %q in %s", env, confPath) + } + + // Expand ~ in SSH key paths + home, _ := os.UserHomeDir() + for i := range filtered { + if filtered[i].SSHKey != "" && strings.HasPrefix(filtered[i].SSHKey, "~") { + filtered[i].SSHKey = filepath.Join(home, filtered[i].SSHKey[1:]) + } + } + + return filtered, nil +} + +// PickHubNode selects the first node as the hub for fanout distribution. +func PickHubNode(nodes []inspector.Node) inspector.Node { + return nodes[0] +} + +// FilterByIP returns nodes matching the given IP address. +func FilterByIP(nodes []inspector.Node, ip string) []inspector.Node { + var filtered []inspector.Node + for _, n := range nodes { + if n.Host == ip { + filtered = append(filtered, n) + } + } + return filtered +} diff --git a/pkg/cli/remotessh/ssh.go b/pkg/cli/remotessh/ssh.go new file mode 100644 index 0000000..e77d7e0 --- /dev/null +++ b/pkg/cli/remotessh/ssh.go @@ -0,0 +1,86 @@ +package remotessh + +import ( + "fmt" + "os" + "os/exec" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// UploadFile copies a local file to a remote host via SCP. +func UploadFile(node inspector.Node, localPath, remotePath string) error { + dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath) + + var cmd *exec.Cmd + if node.SSHKey != "" { + cmd = exec.Command("scp", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + "-i", node.SSHKey, + localPath, dest, + ) + } else { + if _, err := exec.LookPath("sshpass"); err != nil { + return fmt.Errorf("sshpass not found — install it: brew install hudochenkov/sshpass/sshpass") + } + cmd = exec.Command("sshpass", "-p", node.Password, + "scp", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + "-o", "PreferredAuthentications=password", + "-o", "PubkeyAuthentication=no", + localPath, dest, + ) + } + + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("SCP to %s failed: %w", node.Host, err) + } + return nil +} + +// RunSSHStreaming executes a command on a remote host via SSH, +// streaming stdout/stderr to the local terminal in real-time. +func RunSSHStreaming(node inspector.Node, command string) error { + var cmd *exec.Cmd + if node.SSHKey != "" { + cmd = exec.Command("ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + "-i", node.SSHKey, + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + ) + } else { + cmd = exec.Command("sshpass", "-p", node.Password, + "ssh", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + "-o", "PreferredAuthentications=password", + "-o", "PubkeyAuthentication=no", + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + ) + } + + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.Stdin = os.Stdin + + if err := cmd.Run(); err != nil { + return fmt.Errorf("SSH to %s failed: %w", node.Host, err) + } + return nil +} + +// SudoPrefix returns "sudo " for non-root users, empty for root. +func SudoPrefix(node inspector.Node) string { + if node.User == "root" { + return "" + } + return "sudo " +} diff --git a/pkg/constants/versions.go b/pkg/constants/versions.go new file mode 100644 index 0000000..8514135 --- /dev/null +++ b/pkg/constants/versions.go @@ -0,0 +1,13 @@ +package constants + +// External dependency versions used across the network. +// Single source of truth — all installer files and build scripts import from here. +const ( + GoVersion = "1.24.6" + OlricVersion = "v0.7.0" + IPFSKuboVersion = "v0.38.2" + IPFSClusterVersion = "v1.1.2" + RQLiteVersion = "8.43.0" + CoreDNSVersion = "1.12.0" + CaddyVersion = "2.10.2" +) diff --git a/pkg/environments/production/installers/caddy.go b/pkg/environments/production/installers/caddy.go index 449653b..d8f73e7 100644 --- a/pkg/environments/production/installers/caddy.go +++ b/pkg/environments/production/installers/caddy.go @@ -7,11 +7,12 @@ import ( "os/exec" "path/filepath" "strings" + + "github.com/DeBrosOfficial/network/pkg/constants" ) const ( - caddyVersion = "2.10.2" - xcaddyRepo = "github.com/caddyserver/xcaddy/cmd/xcaddy@latest" + xcaddyRepo = "github.com/caddyserver/xcaddy/cmd/xcaddy@latest" ) // CaddyInstaller handles Caddy installation with custom DNS module @@ -26,7 +27,7 @@ type CaddyInstaller struct { func NewCaddyInstaller(arch string, logWriter io.Writer, oramaHome string) *CaddyInstaller { return &CaddyInstaller{ BaseInstaller: NewBaseInstaller(arch, logWriter), - version: caddyVersion, + version: constants.CaddyVersion, oramaHome: oramaHome, dnsModule: filepath.Join(oramaHome, "src", "pkg", "caddy", "dns", "orama"), } @@ -356,7 +357,7 @@ func (ci *CaddyInstaller) generateGoMod() string { go 1.22 require ( - github.com/caddyserver/caddy/v2 v2.` + caddyVersion[2:] + ` + github.com/caddyserver/caddy/v2 v2.` + constants.CaddyVersion[2:] + ` github.com/libdns/libdns v1.1.0 ) ` diff --git a/pkg/environments/production/installers/coredns.go b/pkg/environments/production/installers/coredns.go index 348a447..7876517 100644 --- a/pkg/environments/production/installers/coredns.go +++ b/pkg/environments/production/installers/coredns.go @@ -10,11 +10,12 @@ import ( "os/exec" "path/filepath" "time" + + "github.com/DeBrosOfficial/network/pkg/constants" ) const ( - coreDNSVersion = "1.12.0" - coreDNSRepo = "https://github.com/coredns/coredns.git" + coreDNSRepo = "https://github.com/coredns/coredns.git" ) // CoreDNSInstaller handles CoreDNS installation with RQLite plugin @@ -29,7 +30,7 @@ type CoreDNSInstaller struct { func NewCoreDNSInstaller(arch string, logWriter io.Writer, oramaHome string) *CoreDNSInstaller { return &CoreDNSInstaller{ BaseInstaller: NewBaseInstaller(arch, logWriter), - version: coreDNSVersion, + version: constants.CoreDNSVersion, oramaHome: oramaHome, rqlitePlugin: filepath.Join(oramaHome, "src", "pkg", "coredns", "rqlite"), } diff --git a/pkg/environments/production/installers/gateway.go b/pkg/environments/production/installers/gateway.go index a8e0f03..a37981a 100644 --- a/pkg/environments/production/installers/gateway.go +++ b/pkg/environments/production/installers/gateway.go @@ -7,6 +7,8 @@ import ( "os/exec" "path/filepath" "strings" + + "github.com/DeBrosOfficial/network/pkg/constants" ) // GatewayInstaller handles Orama binary installation (including gateway) @@ -124,7 +126,7 @@ func (gi *GatewayInstaller) InstallDeBrosBinaries(oramaHome string) error { // InstallGo downloads and installs Go toolchain func (gi *GatewayInstaller) InstallGo() error { - requiredVersion := "1.24.6" + requiredVersion := constants.GoVersion if goPath, err := exec.LookPath("go"); err == nil { // Check version - upgrade if too old out, _ := exec.Command(goPath, "version").Output() diff --git a/pkg/environments/production/installers/ipfs.go b/pkg/environments/production/installers/ipfs.go index f1c32c6..3346d9f 100644 --- a/pkg/environments/production/installers/ipfs.go +++ b/pkg/environments/production/installers/ipfs.go @@ -7,6 +7,8 @@ import ( "os" "os/exec" "path/filepath" + + "github.com/DeBrosOfficial/network/pkg/constants" ) // IPFSInstaller handles IPFS (Kubo) installation @@ -19,7 +21,7 @@ type IPFSInstaller struct { func NewIPFSInstaller(arch string, logWriter io.Writer) *IPFSInstaller { return &IPFSInstaller{ BaseInstaller: NewBaseInstaller(arch, logWriter), - version: "v0.38.2", + version: constants.IPFSKuboVersion, } } diff --git a/pkg/environments/production/installers/ipfs_cluster.go b/pkg/environments/production/installers/ipfs_cluster.go index 23f695b..dfe5999 100644 --- a/pkg/environments/production/installers/ipfs_cluster.go +++ b/pkg/environments/production/installers/ipfs_cluster.go @@ -8,6 +8,8 @@ import ( "os/exec" "path/filepath" "strings" + + "github.com/DeBrosOfficial/network/pkg/constants" ) // IPFSClusterInstaller handles IPFS Cluster Service installation @@ -42,7 +44,7 @@ func (ici *IPFSClusterInstaller) Install() error { return fmt.Errorf("go not found - required to install IPFS Cluster. Please install Go first") } - cmd := exec.Command("go", "install", "github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service@latest") + cmd := exec.Command("go", "install", fmt.Sprintf("github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service@%s", constants.IPFSClusterVersion)) cmd.Env = append(os.Environ(), "GOBIN=/usr/local/bin", "GOPROXY=https://proxy.golang.org|direct", "GONOSUMDB=*") if err := cmd.Run(); err != nil { return fmt.Errorf("failed to install IPFS Cluster: %w", err) diff --git a/pkg/environments/production/installers/olric.go b/pkg/environments/production/installers/olric.go index 409b9c9..ad56066 100644 --- a/pkg/environments/production/installers/olric.go +++ b/pkg/environments/production/installers/olric.go @@ -5,6 +5,8 @@ import ( "io" "os" "os/exec" + + "github.com/DeBrosOfficial/network/pkg/constants" ) // OlricInstaller handles Olric server installation @@ -17,7 +19,7 @@ type OlricInstaller struct { func NewOlricInstaller(arch string, logWriter io.Writer) *OlricInstaller { return &OlricInstaller{ BaseInstaller: NewBaseInstaller(arch, logWriter), - version: "v0.7.0", + version: constants.OlricVersion, } } diff --git a/pkg/environments/production/installers/rqlite.go b/pkg/environments/production/installers/rqlite.go index ea2bed6..7d2bb5e 100644 --- a/pkg/environments/production/installers/rqlite.go +++ b/pkg/environments/production/installers/rqlite.go @@ -5,6 +5,8 @@ import ( "io" "os" "os/exec" + + "github.com/DeBrosOfficial/network/pkg/constants" ) // RQLiteInstaller handles RQLite installation @@ -17,7 +19,7 @@ type RQLiteInstaller struct { func NewRQLiteInstaller(arch string, logWriter io.Writer) *RQLiteInstaller { return &RQLiteInstaller{ BaseInstaller: NewBaseInstaller(arch, logWriter), - version: "8.43.0", + version: constants.RQLiteVersion, } } diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index b50930d..7e5d371 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -259,10 +259,47 @@ func (ps *ProductionSetup) Phase2ProvisionEnvironment() error { return nil } -// Phase2bInstallBinaries installs external binaries and Orama components +// Phase2bInstallBinaries installs external binaries and Orama components. +// Auto-detects pre-built mode if /opt/orama/manifest.json exists. func (ps *ProductionSetup) Phase2bInstallBinaries() error { ps.logf("Phase 2b: Installing binaries...") + // Auto-detect pre-built binary archive + if HasPreBuiltArchive() { + manifest, err := LoadPreBuiltManifest() + if err != nil { + ps.logf(" ⚠️ Pre-built manifest found but unreadable: %v", err) + ps.logf(" Falling back to source mode...") + if err := ps.installFromSource(); err != nil { + return err + } + } else { + if err := ps.installFromPreBuilt(manifest); err != nil { + return err + } + } + } else { + // Source mode: compile everything on the VPS (original behavior) + if err := ps.installFromSource(); err != nil { + return err + } + } + + // Anyone relay/client configuration runs after BOTH paths. + // Pre-built mode installs the anon binary via .deb/apt; + // source mode installs it via the relay installer's Install(). + // Configuration (anonrc, bandwidth, migration) is always needed. + if err := ps.configureAnyone(); err != nil { + ps.logf(" ⚠️ Anyone configuration warning: %v", err) + } + + ps.logf(" ✓ All binaries installed") + return nil +} + +// installFromSource installs binaries by compiling from source on the VPS. +// This is the original Phase2bInstallBinaries logic, preserved as fallback. +func (ps *ProductionSetup) installFromSource() error { // Install system dependencies (always needed for runtime libs) if err := ps.binaryInstaller.InstallSystemDependencies(); err != nil { ps.logf(" ⚠️ System dependencies warning: %v", err) @@ -307,7 +344,12 @@ func (ps *ProductionSetup) Phase2bInstallBinaries() error { ps.logf(" ⚠️ IPFS Cluster install warning: %v", err) } - // Install Anyone (client or relay based on configuration) — apt-based, not Go + return nil +} + +// configureAnyone handles Anyone relay/client installation and configuration. +// This runs after both pre-built and source mode binary installation. +func (ps *ProductionSetup) configureAnyone() error { if ps.IsAnyoneRelay() { ps.logf(" Installing Anyone relay (operator mode)...") relayConfig := installers.AnyoneRelayConfig{ @@ -351,7 +393,7 @@ func (ps *ProductionSetup) Phase2bInstallBinaries() error { } } - // Install the relay + // Install the relay (apt-based, not Go — idempotent if already installed via .deb) if err := relayInstaller.Install(); err != nil { ps.logf(" ⚠️ Anyone relay install warning: %v", err) } @@ -364,7 +406,7 @@ func (ps *ProductionSetup) Phase2bInstallBinaries() error { ps.logf(" Installing Anyone client-only mode (SOCKS5 proxy)...") clientInstaller := installers.NewAnyoneRelayInstaller(ps.arch, ps.logWriter, installers.AnyoneRelayConfig{}) - // Install the anon binary (same apt package as relay) + // Install the anon binary (same apt package as relay — idempotent) if err := clientInstaller.Install(); err != nil { ps.logf(" ⚠️ Anyone client install warning: %v", err) } @@ -375,7 +417,6 @@ func (ps *ProductionSetup) Phase2bInstallBinaries() error { } } - ps.logf(" ✓ All binaries installed") return nil } diff --git a/pkg/environments/production/paths.go b/pkg/environments/production/paths.go index 07e38a9..a2cd310 100644 --- a/pkg/environments/production/paths.go +++ b/pkg/environments/production/paths.go @@ -11,4 +11,10 @@ const ( OramaSecrets = "/opt/orama/.orama/secrets" OramaData = "/opt/orama/.orama/data" OramaLogs = "/opt/orama/.orama/logs" + + // Pre-built binary archive paths (created by `orama build`) + OramaManifest = "/opt/orama/manifest.json" + OramaArchiveBin = "/opt/orama/bin" // Pre-built binaries + OramaSystemdDir = "/opt/orama/systemd" // Namespace service templates + OramaPackagesDir = "/opt/orama/packages" // .deb packages (e.g., anon.deb) ) diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go new file mode 100644 index 0000000..689b8ba --- /dev/null +++ b/pkg/environments/production/prebuilt.go @@ -0,0 +1,232 @@ +package production + +import ( + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" +) + +// PreBuiltManifest describes the contents of a pre-built binary archive. +type PreBuiltManifest struct { + Version string `json:"version"` + Commit string `json:"commit"` + Date string `json:"date"` + Arch string `json:"arch"` + Checksums map[string]string `json:"checksums"` // filename -> sha256 +} + +// HasPreBuiltArchive checks if a pre-built binary archive has been extracted +// at /opt/orama/ by looking for the manifest.json file. +func HasPreBuiltArchive() bool { + _, err := os.Stat(OramaManifest) + return err == nil +} + +// LoadPreBuiltManifest loads and parses the pre-built manifest. +func LoadPreBuiltManifest() (*PreBuiltManifest, error) { + data, err := os.ReadFile(OramaManifest) + if err != nil { + return nil, fmt.Errorf("failed to read manifest: %w", err) + } + + var manifest PreBuiltManifest + if err := json.Unmarshal(data, &manifest); err != nil { + return nil, fmt.Errorf("failed to parse manifest: %w", err) + } + + return &manifest, nil +} + +// installFromPreBuilt installs all binaries from a pre-built archive. +// The archive must already be extracted at /opt/orama/ with: +// - /opt/orama/bin/ — all pre-compiled binaries +// - /opt/orama/systemd/ — namespace service templates +// - /opt/orama/packages/ — optional .deb packages +// - /opt/orama/manifest.json — archive metadata +func (ps *ProductionSetup) installFromPreBuilt(manifest *PreBuiltManifest) error { + ps.logf(" Using pre-built binary archive v%s (%s) linux/%s", manifest.Version, manifest.Commit, manifest.Arch) + + // Install minimal system dependencies (no build tools needed) + if err := ps.installMinimalSystemDeps(); err != nil { + ps.logf(" ⚠️ System dependencies warning: %v", err) + } + + // Copy binaries to runtime locations + if err := ps.deployPreBuiltBinaries(manifest); err != nil { + return fmt.Errorf("failed to deploy pre-built binaries: %w", err) + } + + // Set capabilities on binaries that need to bind privileged ports + if err := ps.setCapabilities(); err != nil { + return fmt.Errorf("failed to set capabilities: %w", err) + } + + // Disable systemd-resolved stub listener for nameserver nodes + // (needed even in pre-built mode so CoreDNS can bind port 53) + if ps.isNameserver { + if err := ps.disableResolvedStub(); err != nil { + ps.logf(" ⚠️ Failed to disable systemd-resolved stub: %v", err) + } + } + + // Install Anyone relay from .deb package if available + if ps.IsAnyoneRelay() || ps.IsAnyoneClient() { + if err := ps.installAnyonFromPreBuilt(); err != nil { + ps.logf(" ⚠️ Anyone install warning: %v", err) + } + } + + ps.logf(" ✓ All pre-built binaries installed") + return nil +} + +// installMinimalSystemDeps installs only runtime dependencies (no build tools). +func (ps *ProductionSetup) installMinimalSystemDeps() error { + ps.logf(" Installing minimal system dependencies...") + + cmd := exec.Command("apt-get", "update") + if err := cmd.Run(); err != nil { + ps.logf(" Warning: apt update failed") + } + + // Only install runtime deps — no build-essential, make, nodejs, npm needed + cmd = exec.Command("apt-get", "install", "-y", "curl", "wget", "unzip") + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to install minimal dependencies: %w", err) + } + + ps.logf(" ✓ Minimal system dependencies installed (no build tools needed)") + return nil +} + +// deployPreBuiltBinaries copies pre-built binaries to their runtime locations. +func (ps *ProductionSetup) deployPreBuiltBinaries(manifest *PreBuiltManifest) error { + ps.logf(" Deploying pre-built binaries...") + + // Binary → destination mapping + // Most go to /usr/local/bin/, caddy goes to /usr/bin/ + type binaryDest struct { + name string + dest string + } + + binaries := []binaryDest{ + {name: "orama", dest: "/usr/local/bin/orama"}, + {name: "orama-node", dest: "/usr/local/bin/orama-node"}, + {name: "gateway", dest: "/usr/local/bin/gateway"}, + {name: "identity", dest: "/usr/local/bin/identity"}, + {name: "sfu", dest: "/usr/local/bin/sfu"}, + {name: "turn", dest: "/usr/local/bin/turn"}, + {name: "olric-server", dest: "/usr/local/bin/olric-server"}, + {name: "ipfs", dest: "/usr/local/bin/ipfs"}, + {name: "ipfs-cluster-service", dest: "/usr/local/bin/ipfs-cluster-service"}, + {name: "rqlited", dest: "/usr/local/bin/rqlited"}, + {name: "coredns", dest: "/usr/local/bin/coredns"}, + {name: "caddy", dest: "/usr/bin/caddy"}, + } + + for _, bin := range binaries { + srcPath := filepath.Join(OramaArchiveBin, bin.name) + + // Skip optional binaries (e.g., coredns on non-nameserver nodes) + if _, ok := manifest.Checksums[bin.name]; !ok { + continue + } + + if _, err := os.Stat(srcPath); os.IsNotExist(err) { + ps.logf(" ⚠️ Binary %s not found in archive, skipping", bin.name) + continue + } + + if err := copyBinary(srcPath, bin.dest); err != nil { + return fmt.Errorf("failed to copy %s: %w", bin.name, err) + } + ps.logf(" ✓ %s → %s", bin.name, bin.dest) + } + + return nil +} + +// setCapabilities sets cap_net_bind_service on binaries that need to bind privileged ports. +// Both the /opt/orama/bin/ originals (used by systemd) and /usr/local/bin/ copies need caps. +func (ps *ProductionSetup) setCapabilities() error { + caps := []string{ + filepath.Join(OramaArchiveBin, "orama-node"), // systemd uses this path + "/usr/local/bin/orama-node", // PATH copy + "/usr/bin/caddy", // caddy's standard location + } + for _, binary := range caps { + if _, err := os.Stat(binary); os.IsNotExist(err) { + continue + } + cmd := exec.Command("setcap", "cap_net_bind_service=+ep", binary) + if err := cmd.Run(); err != nil { + return fmt.Errorf("setcap failed on %s: %w (node won't be able to bind port 443)", binary, err) + } + ps.logf(" ✓ setcap on %s", binary) + } + return nil +} + +// disableResolvedStub disables systemd-resolved's stub listener so CoreDNS can bind port 53. +func (ps *ProductionSetup) disableResolvedStub() error { + // Delegate to the coredns installer's method + return ps.binaryInstaller.coredns.DisableResolvedStubListener() +} + +// installAnyonFromPreBuilt installs the Anyone relay .deb from the packages dir, +// falling back to apt install if the .deb is not bundled. +func (ps *ProductionSetup) installAnyonFromPreBuilt() error { + debPath := filepath.Join(OramaPackagesDir, "anon.deb") + if _, err := os.Stat(debPath); err == nil { + ps.logf(" Installing Anyone from bundled .deb...") + cmd := exec.Command("dpkg", "-i", debPath) + if err := cmd.Run(); err != nil { + ps.logf(" ⚠️ dpkg -i failed, falling back to apt...") + cmd = exec.Command("apt-get", "install", "-y", "anon") + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to install anon: %w", err) + } + } + ps.logf(" ✓ Anyone installed from .deb") + return nil + } + + // No .deb bundled — fall back to apt (the existing path in source mode) + ps.logf(" Installing Anyone via apt (not bundled in archive)...") + cmd := exec.Command("apt-get", "install", "-y", "anon") + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to install anon via apt: %w", err) + } + ps.logf(" ✓ Anyone installed via apt") + return nil +} + +// copyBinary copies a file from src to dest, preserving executable permissions. +func copyBinary(src, dest string) error { + // Ensure parent directory exists + if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { + return err + } + + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + destFile, err := os.OpenFile(dest, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0755) + if err != nil { + return err + } + defer destFile.Close() + + if _, err := io.Copy(destFile, srcFile); err != nil { + return err + } + + return nil +} diff --git a/scripts/build-linux-caddy.sh b/scripts/build-linux-caddy.sh deleted file mode 100755 index 5a00ab4..0000000 --- a/scripts/build-linux-caddy.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/bin/bash -# Build Caddy with orama DNS module for linux/amd64 -# Outputs to bin-linux/caddy -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -OUTPUT_DIR="$PROJECT_ROOT/bin-linux" -BUILD_DIR="/tmp/caddy-build-linux" -MODULE_DIR="$BUILD_DIR/caddy-dns-orama" - -mkdir -p "$OUTPUT_DIR" - -# Ensure xcaddy is installed -if ! command -v xcaddy &> /dev/null; then - echo "Installing xcaddy..." - go install github.com/caddyserver/xcaddy/cmd/xcaddy@latest -fi - -# Clean up previous build -rm -rf "$BUILD_DIR" -mkdir -p "$MODULE_DIR" - -# Write go.mod -cat > "$MODULE_DIR/go.mod" << 'GOMOD' -module github.com/DeBrosOfficial/caddy-dns-orama - -go 1.22 - -require ( - github.com/caddyserver/caddy/v2 v2.10.2 - github.com/libdns/libdns v1.1.0 -) -GOMOD - -# Write provider.go (the orama DNS provider for ACME DNS-01 challenges) -cat > "$MODULE_DIR/provider.go" << 'PROVIDERGO' -// Package orama implements a DNS provider for Caddy that uses the Orama Network -// gateway's internal ACME API for DNS-01 challenge validation. -package orama - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "net/http" - "time" - - "github.com/caddyserver/caddy/v2" - "github.com/caddyserver/caddy/v2/caddyconfig/caddyfile" - "github.com/libdns/libdns" -) - -func init() { - caddy.RegisterModule(Provider{}) -} - -// Provider wraps the Orama DNS provider for Caddy. -type Provider struct { - // Endpoint is the URL of the Orama gateway's ACME API - // Default: http://localhost:6001/v1/internal/acme - Endpoint string `json:"endpoint,omitempty"` -} - -// CaddyModule returns the Caddy module information. -func (Provider) CaddyModule() caddy.ModuleInfo { - return caddy.ModuleInfo{ - ID: "dns.providers.orama", - New: func() caddy.Module { return new(Provider) }, - } -} - -// Provision sets up the module. -func (p *Provider) Provision(ctx caddy.Context) error { - if p.Endpoint == "" { - p.Endpoint = "http://localhost:6001/v1/internal/acme" - } - return nil -} - -// UnmarshalCaddyfile parses the Caddyfile configuration. -func (p *Provider) UnmarshalCaddyfile(d *caddyfile.Dispenser) error { - for d.Next() { - for d.NextBlock(0) { - switch d.Val() { - case "endpoint": - if !d.NextArg() { - return d.ArgErr() - } - p.Endpoint = d.Val() - default: - return d.Errf("unrecognized option: %s", d.Val()) - } - } - } - return nil -} - -// AppendRecords adds records to the zone. For ACME, this presents the challenge. -func (p *Provider) AppendRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { - var added []libdns.Record - - for _, rec := range records { - rr := rec.RR() - if rr.Type != "TXT" { - continue - } - - fqdn := rr.Name + "." + zone - - payload := map[string]string{ - "fqdn": fqdn, - "value": rr.Data, - } - - body, err := json.Marshal(payload) - if err != nil { - return added, fmt.Errorf("failed to marshal request: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, "POST", p.Endpoint+"/present", bytes.NewReader(body)) - if err != nil { - return added, fmt.Errorf("failed to create request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - client := &http.Client{Timeout: 30 * time.Second} - resp, err := client.Do(req) - if err != nil { - return added, fmt.Errorf("failed to present challenge: %w", err) - } - resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return added, fmt.Errorf("present failed with status %d", resp.StatusCode) - } - - added = append(added, rec) - } - - return added, nil -} - -// DeleteRecords removes records from the zone. For ACME, this cleans up the challenge. -func (p *Provider) DeleteRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { - var deleted []libdns.Record - - for _, rec := range records { - rr := rec.RR() - if rr.Type != "TXT" { - continue - } - - fqdn := rr.Name + "." + zone - - payload := map[string]string{ - "fqdn": fqdn, - "value": rr.Data, - } - - body, err := json.Marshal(payload) - if err != nil { - return deleted, fmt.Errorf("failed to marshal request: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, "POST", p.Endpoint+"/cleanup", bytes.NewReader(body)) - if err != nil { - return deleted, fmt.Errorf("failed to create request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - - client := &http.Client{Timeout: 30 * time.Second} - resp, err := client.Do(req) - if err != nil { - return deleted, fmt.Errorf("failed to cleanup challenge: %w", err) - } - resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return deleted, fmt.Errorf("cleanup failed with status %d", resp.StatusCode) - } - - deleted = append(deleted, rec) - } - - return deleted, nil -} - -// GetRecords returns the records in the zone. Not used for ACME. -func (p *Provider) GetRecords(ctx context.Context, zone string) ([]libdns.Record, error) { - return nil, nil -} - -// SetRecords sets the records in the zone. Not used for ACME. -func (p *Provider) SetRecords(ctx context.Context, zone string, records []libdns.Record) ([]libdns.Record, error) { - return nil, nil -} - -// Interface guards -var ( - _ caddy.Module = (*Provider)(nil) - _ caddy.Provisioner = (*Provider)(nil) - _ caddyfile.Unmarshaler = (*Provider)(nil) - _ libdns.RecordAppender = (*Provider)(nil) - _ libdns.RecordDeleter = (*Provider)(nil) - _ libdns.RecordGetter = (*Provider)(nil) - _ libdns.RecordSetter = (*Provider)(nil) -) -PROVIDERGO - -# Run go mod tidy -cd "$MODULE_DIR" && go mod tidy - -# Build with xcaddy -echo "Building Caddy binary..." -GOOS=linux GOARCH=amd64 xcaddy build v2.10.2 \ - --with "github.com/DeBrosOfficial/caddy-dns-orama=$MODULE_DIR" \ - --output "$OUTPUT_DIR/caddy" - -# Cleanup -rm -rf "$BUILD_DIR" -echo "✓ Caddy built: bin-linux/caddy" diff --git a/scripts/build-linux-coredns.sh b/scripts/build-linux-coredns.sh deleted file mode 100755 index e3d36ab..0000000 --- a/scripts/build-linux-coredns.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -# Build CoreDNS with rqlite plugin for linux/amd64 -# Outputs to bin-linux/coredns -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -OUTPUT_DIR="$PROJECT_ROOT/bin-linux" -BUILD_DIR="/tmp/coredns-build-linux" - -mkdir -p "$OUTPUT_DIR" - -# Clean up previous build -rm -rf "$BUILD_DIR" - -# Clone CoreDNS -echo "Cloning CoreDNS v1.12.0..." -git clone --depth 1 --branch v1.12.0 https://github.com/coredns/coredns.git "$BUILD_DIR" - -# Copy rqlite plugin -echo "Copying rqlite plugin..." -mkdir -p "$BUILD_DIR/plugin/rqlite" -cp "$PROJECT_ROOT/pkg/coredns/rqlite/"*.go "$BUILD_DIR/plugin/rqlite/" - -# Write plugin.cfg -cat > "$BUILD_DIR/plugin.cfg" << 'EOF' -metadata:metadata -cancel:cancel -tls:tls -reload:reload -nsid:nsid -bufsize:bufsize -root:root -bind:bind -debug:debug -trace:trace -ready:ready -health:health -pprof:pprof -prometheus:metrics -errors:errors -log:log -dnstap:dnstap -local:local -dns64:dns64 -acl:acl -any:any -chaos:chaos -loadbalance:loadbalance -cache:cache -rewrite:rewrite -header:header -dnssec:dnssec -autopath:autopath -minimal:minimal -template:template -transfer:transfer -hosts:hosts -file:file -auto:auto -secondary:secondary -loop:loop -forward:forward -grpc:grpc -erratic:erratic -whoami:whoami -on:github.com/coredns/caddy/onevent -sign:sign -view:view -rqlite:rqlite -EOF - -# Build -cd "$BUILD_DIR" -echo "Adding dependencies..." -go get github.com/miekg/dns@latest -go get go.uber.org/zap@latest -go mod tidy - -echo "Generating plugin code..." -go generate - -echo "Building CoreDNS binary..." -GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -ldflags "-s -w" -trimpath -o coredns - -# Copy output -cp "$BUILD_DIR/coredns" "$OUTPUT_DIR/coredns" - -# Cleanup -rm -rf "$BUILD_DIR" -echo "✓ CoreDNS built: bin-linux/coredns" diff --git a/scripts/check-node-health.sh b/scripts/check-node-health.sh deleted file mode 100755 index 765dc50..0000000 --- a/scripts/check-node-health.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash -# Check health of an Orama Network node via SSH -# -# Usage: ./scripts/check-node-health.sh [label] -# Example: ./scripts/check-node-health.sh ubuntu@57.128.223.92 '@5YnN5wIqYnyJ4' Hermes - -if [ $# -lt 2 ]; then - echo "Usage: $0 [label]" - echo "Example: $0 ubuntu@1.2.3.4 'mypassword' MyNode" - exit 1 -fi - -USERHOST="$1" -PASS="$2" -LABEL="${3:-$USERHOST}" - -echo "════════════════════════════════════════" -echo " Node Health: $LABEL ($USERHOST)" -echo "════════════════════════════════════════" -echo "" - -sshpass -p "$PASS" ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "$USERHOST" "bash -s" <<'REMOTE' - -WG_IP=$(ip -4 addr show wg0 2>/dev/null | grep -oP 'inet \K[0-9.]+' || true) - -# 1. Services -echo "── Services ──" -for svc in orama-node orama-ipfs orama-ipfs-cluster orama-olric orama-anyone-relay orama-anyone-client coredns caddy; do - status=$(systemctl is-active "$svc" 2>/dev/null || true) - case "$status" in - active) mark="✓";; - inactive) mark="·";; - activating) mark="~";; - *) mark="✗";; - esac - printf " %s %-25s %s\n" "$mark" "$svc" "$status" -done -echo "" - -# 2. WireGuard -echo "── WireGuard ──" -if [ -n "$WG_IP" ]; then - echo " IP: $WG_IP" - PEERS=$(sudo wg show wg0 2>/dev/null | grep -c '^peer:' || echo 0) - echo " Peers: $PEERS" - sudo wg show wg0 2>/dev/null | grep -A2 '^peer:' | grep -E 'endpoint|latest handshake' | while read -r line; do - echo " $line" - done -else - echo " not configured" -fi -echo "" - -# 3. RQLite (HTTP API on port 5001) -echo "── RQLite ──" -RQLITE_ADDR="" -for addr in "${WG_IP}:5001" "localhost:5001"; do - if curl -sf "http://${addr}/nodes" >/dev/null 2>&1; then - RQLITE_ADDR="$addr" - break - fi -done -if [ -n "$RQLITE_ADDR" ]; then - # Get node state from status - STATE=$(curl -sf "http://${RQLITE_ADDR}/status" 2>/dev/null | python3 -c " -import sys,json -d=json.load(sys.stdin) -print(d.get('store',{}).get('raft',{}).get('state','?')) -" 2>/dev/null || echo "?") - echo " This node: $STATE" - # Get cluster nodes - curl -sf "http://${RQLITE_ADDR}/nodes" 2>/dev/null | python3 -c " -import sys,json -d=json.load(sys.stdin) -for addr,info in sorted(d.items()): - r = 'ok' if info.get('reachable') else 'UNREACHABLE' - l = ' (LEADER)' if info.get('leader') else '' - v = 'voter' if info.get('voter') else 'non-voter' - print(' ' + addr + ': ' + r + ', ' + v + l) -print(' Total: ' + str(len(d)) + ' nodes') -" 2>/dev/null || echo " (parse error)" -else - echo " not responding" -fi -echo "" - -# 4. IPFS -echo "── IPFS ──" -PEERS=$(IPFS_PATH=/opt/orama/.orama/data/ipfs/repo /usr/local/bin/ipfs swarm peers 2>/dev/null) -if [ -n "$PEERS" ]; then - COUNT=$(echo "$PEERS" | wc -l) - echo " Connected peers: $COUNT" - echo "$PEERS" | while read -r addr; do echo " $addr"; done -else - echo " no peers connected" -fi -echo "" - -# 5. Gateway -echo "── Gateway ──" -GW=$(curl -sf http://localhost:6001/health 2>/dev/null) -if [ -n "$GW" ]; then - echo "$GW" | python3 -c " -import sys,json -d=json.load(sys.stdin) -print(' Status: ' + d.get('status','?')) -srv=d.get('server',{}) -print(' Uptime: ' + srv.get('uptime','?')) -cli=d.get('client',{}) -if cli: - checks=cli.get('checks',{}) - for k,v in checks.items(): - print(' ' + k + ': ' + str(v)) -" 2>/dev/null || echo " responding (parse error)" -else - echo " not responding" -fi -echo "" - -# 6. Olric -echo "── Olric ──" -if systemctl is-active orama-olric &>/dev/null; then - echo " service: active" - # Olric doesn't have a simple HTTP health endpoint; just check the process - OLRIC_PID=$(pgrep -f olric-server || true) - if [ -n "$OLRIC_PID" ]; then - echo " pid: $OLRIC_PID" - echo " listening: $(sudo ss -tlnp 2>/dev/null | grep olric | awk '{print $4}' | tr '\n' ' ')" - fi -else - echo " not running" -fi -echo "" - -# 7. Resources -echo "── Resources ──" -echo " RAM: $(free -h | awk '/Mem:/{print $3"/"$2}')" -echo " Disk: $(df -h / | awk 'NR==2{print $3"/"$2" ("$5" used)"}')" -echo "" - -REMOTE - -echo "════════════════════════════════════════" diff --git a/scripts/clean-testnet.sh b/scripts/clean-testnet.sh deleted file mode 100755 index 1b5ddbe..0000000 --- a/scripts/clean-testnet.sh +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/env bash -# -# Clean testnet nodes for fresh reinstall. -# Preserves Anyone relay keys (/var/lib/anon/) for --anyone-migrate. -# DOES NOT TOUCH DEVNET NODES. -# -# Usage: scripts/clean-testnet.sh [--nuclear] [IP ...] -# --nuclear Also remove shared binaries (rqlited, ipfs, coredns, caddy, etc.) -# IP ... Optional: only clean specific nodes by IP (e.g. 62.72.44.87 51.178.84.172) -# If no IPs given, cleans ALL testnet nodes. -# -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -CONF="$ROOT_DIR/scripts/remote-nodes.conf" - -[[ -f "$CONF" ]] || { echo "ERROR: Missing $CONF"; exit 1; } -command -v sshpass >/dev/null 2>&1 || { echo "ERROR: sshpass not installed (brew install sshpass / apt install sshpass)"; exit 1; } - -NUCLEAR=false -TARGET_IPS=() -for arg in "$@"; do - if [[ "$arg" == "--nuclear" ]]; then - NUCLEAR=true - else - TARGET_IPS+=("$arg") - fi -done - -SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -o LogLevel=ERROR -o PubkeyAuthentication=no) - -# ── Cleanup script (runs as root on each remote node) ───────────────────── -# Uses a quoted heredoc so NO local variable expansion happens. -# This script is uploaded to /tmp/orama-clean.sh and executed remotely. -CLEANUP_SCRIPT=$(cat <<'SCRIPT_END' -#!/bin/bash -set -e -export DEBIAN_FRONTEND=noninteractive - -echo " Stopping services..." -systemctl stop orama-node orama-gateway orama-ipfs orama-ipfs-cluster orama-olric orama-anyone-relay orama-anyone-client coredns caddy 2>/dev/null || true -systemctl disable orama-node orama-gateway orama-ipfs orama-ipfs-cluster orama-olric orama-anyone-relay orama-anyone-client coredns caddy 2>/dev/null || true -# Legacy debros-* services (pre-rename) -systemctl stop debros-anyone-relay debros-anyone-client 2>/dev/null || true -systemctl disable debros-anyone-relay debros-anyone-client 2>/dev/null || true - -echo " Killing leftover processes..." -# Kill any orama/ipfs/olric/rqlite/coredns/caddy processes that survived systemd stop -pkill -f orama-node 2>/dev/null || true -pkill -f orama-gateway 2>/dev/null || true -pkill -f ipfs-cluster-service 2>/dev/null || true -pkill -f "ipfs daemon" 2>/dev/null || true -pkill -f olric-server 2>/dev/null || true -pkill -f rqlited 2>/dev/null || true -pkill -f coredns 2>/dev/null || true -# Don't pkill caddy — it's a common system service -sleep 1 - -echo " Removing systemd service files..." -rm -f /etc/systemd/system/orama-*.service -rm -f /etc/systemd/system/debros-*.service -rm -f /etc/systemd/system/coredns.service -rm -f /etc/systemd/system/caddy.service -rm -f /etc/systemd/system/orama-deploy-*.service -systemctl daemon-reload - -echo " Tearing down WireGuard..." -systemctl stop wg-quick@wg0 2>/dev/null || true -wg-quick down wg0 2>/dev/null || true -systemctl disable wg-quick@wg0 2>/dev/null || true -rm -f /etc/wireguard/wg0.conf - -echo " Resetting UFW firewall..." -ufw --force reset -ufw allow 22/tcp -ufw --force enable - -echo " Removing orama data..." -rm -rf /opt/orama - -echo " Removing legacy user and data..." -userdel -r orama 2>/dev/null || true -rm -rf /home/orama - -echo " Removing sudoers files..." -rm -f /etc/sudoers.d/orama-access -rm -f /etc/sudoers.d/orama-deployments -rm -f /etc/sudoers.d/orama-wireguard - -echo " Removing CoreDNS and Caddy configs..." -rm -rf /etc/coredns -rm -rf /etc/caddy -rm -rf /var/lib/caddy - -echo " Cleaning temp files..." -rm -f /tmp/orama /tmp/network-source.tar.gz /tmp/network-source.zip -rm -rf /tmp/network-extract /tmp/coredns-build /tmp/caddy-build - -# Nuclear: also remove shared binaries -if [ "${1:-}" = "--nuclear" ]; then - echo " Removing shared binaries (nuclear)..." - rm -f /usr/local/bin/rqlited - rm -f /usr/local/bin/ipfs - rm -f /usr/local/bin/ipfs-cluster-service - rm -f /usr/local/bin/olric-server - rm -f /usr/local/bin/coredns - rm -f /usr/local/bin/xcaddy - rm -f /usr/bin/caddy - rm -f /usr/local/bin/orama -fi - -# Verify Anyone relay keys are preserved -if [ -d /var/lib/anon/keys ]; then - echo " Anyone relay keys PRESERVED at /var/lib/anon/keys" - if [ -f /var/lib/anon/fingerprint ]; then - fp=$(cat /var/lib/anon/fingerprint 2>/dev/null || true) - echo " Relay fingerprint: $fp" - fi - if [ -f /var/lib/anon/wallet ]; then - wallet=$(cat /var/lib/anon/wallet 2>/dev/null || true) - echo " Relay wallet: $wallet" - fi -else - echo " WARNING: No Anyone relay keys found at /var/lib/anon/" -fi - -echo " DONE" -SCRIPT_END -) - -# ── Parse testnet nodes only ────────────────────────────────────────────── -hosts=() -passes=() -users=() - -while IFS='|' read -r env hostspec pass role key; do - [[ -z "$env" || "$env" == \#* ]] && continue - env="${env%%#*}" - env="$(echo "$env" | xargs)" - [[ "$env" != "testnet" ]] && continue - - # If target IPs specified, only include matching nodes - if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then - node_ip="${hostspec#*@}" - matched=false - for tip in "${TARGET_IPS[@]}"; do - [[ "$tip" == "$node_ip" ]] && matched=true && break - done - $matched || continue - fi - - hosts+=("$hostspec") - passes+=("$pass") - users+=("${hostspec%%@*}") -done < "$CONF" - -if [[ ${#hosts[@]} -eq 0 ]]; then - if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then - echo "ERROR: No testnet nodes found matching: ${TARGET_IPS[*]}" - else - echo "ERROR: No testnet nodes found in $CONF" - fi - exit 1 -fi - -if [[ ${#TARGET_IPS[@]} -gt 0 ]]; then - echo "== clean-testnet.sh — ${#hosts[@]} selected node(s) ==" -else - echo "== clean-testnet.sh — ${#hosts[@]} testnet nodes (ALL) ==" -fi -for i in "${!hosts[@]}"; do - echo " [$((i+1))] ${hosts[$i]}" -done -echo "" -echo "This will CLEAN the above node(s) (stop services, remove data)." -echo "Anyone relay keys (/var/lib/anon/) will be PRESERVED." -$NUCLEAR && echo "Nuclear mode: shared binaries will also be removed." -echo "" -read -rp "Type 'yes' to continue: " confirm -if [[ "$confirm" != "yes" ]]; then - echo "Aborted." - exit 0 -fi - -# ── Execute cleanup on each node ────────────────────────────────────────── -failed=() -succeeded=0 -NUCLEAR_FLAG="" -$NUCLEAR && NUCLEAR_FLAG="--nuclear" - -for i in "${!hosts[@]}"; do - h="${hosts[$i]}" - p="${passes[$i]}" - u="${users[$i]}" - echo "" - echo "== [$((i+1))/${#hosts[@]}] Cleaning $h ==" - - # Step 1: Upload cleanup script - # No -n flag here — we're piping the script content via stdin - if ! echo "$CLEANUP_SCRIPT" | sshpass -p "$p" ssh "${SSH_OPTS[@]}" "$h" \ - "cat > /tmp/orama-clean.sh && chmod +x /tmp/orama-clean.sh" 2>&1; then - echo " !! FAILED to upload script to $h" - failed+=("$h") - continue - fi - - # Step 2: Execute the cleanup script as root - if [[ "$u" == "root" ]]; then - # Root: run directly - if ! sshpass -p "$p" ssh -n "${SSH_OPTS[@]}" "$h" \ - "bash /tmp/orama-clean.sh $NUCLEAR_FLAG; rm -f /tmp/orama-clean.sh" 2>&1; then - echo " !! FAILED: $h" - failed+=("$h") - continue - fi - else - # Non-root: escape password for single-quote embedding, pipe to sudo -S - escaped_p=$(printf '%s' "$p" | sed "s/'/'\\\\''/g") - if ! sshpass -p "$p" ssh -n "${SSH_OPTS[@]}" "$h" \ - "printf '%s\n' '${escaped_p}' | sudo -S bash /tmp/orama-clean.sh $NUCLEAR_FLAG; rm -f /tmp/orama-clean.sh" 2>&1; then - echo " !! FAILED: $h" - failed+=("$h") - continue - fi - fi - - echo " OK: $h cleaned" - ((succeeded++)) || true -done - -echo "" -echo "========================================" -echo "Cleanup complete: $succeeded succeeded, ${#failed[@]} failed" -if [[ ${#failed[@]} -gt 0 ]]; then - echo "" - echo "Failed nodes:" - for f in "${failed[@]}"; do - echo " - $f" - done - echo "" - echo "Troubleshooting:" - echo " 1. Check connectivity: ssh @" - echo " 2. Check password in remote-nodes.conf" - echo " 3. Try cleaning manually: docs/CLEAN_NODE.md" -fi -echo "" -echo "Anyone relay keys preserved at /var/lib/anon/ on all nodes." -echo "Use --anyone-migrate during install to reuse existing relay identity." -echo "========================================" diff --git a/scripts/extract-deploy.sh b/scripts/extract-deploy.sh index e4db333..484b400 100755 --- a/scripts/extract-deploy.sh +++ b/scripts/extract-deploy.sh @@ -1,5 +1,11 @@ #!/bin/bash -# Extracts /tmp/network-source.tar.gz and places the CLI binary. +# Extracts archives and places binaries on VPS nodes. +# +# Supports two archive formats: +# 1. Binary archive (from `orama build`): contains bin/, systemd/, manifest.json +# → Extracts to /opt/orama/, installs CLI from /opt/orama/bin/orama +# 2. Source archive (legacy): contains Go source code + bin-linux/orama +# → Extracts to /opt/orama/src/, installs CLI from bin-linux/orama # # Local mode (run directly on VPS): # sudo bash /opt/orama/src/scripts/extract-deploy.sh @@ -11,33 +17,85 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARCHIVE="/tmp/network-source.tar.gz" SRC_DIR="/opt/orama/src" BIN_DIR="/opt/orama/bin" CONF="$SCRIPT_DIR/remote-nodes.conf" +# Detect archive: binary archive has manifest.json at root +detect_archive() { + local archive="$1" + if tar tzf "$archive" 2>/dev/null | grep -q "^manifest\.json$"; then + echo "binary" + else + echo "source" + fi +} + +# Find archive: check for binary archive first, then source archive +find_archive() { + # Check for binary archive (newest orama-*-linux-*.tar.gz in /tmp) + local binary_archive + binary_archive=$(ls -t /tmp/orama-*-linux-*.tar.gz 2>/dev/null | head -1) + if [ -n "$binary_archive" ]; then + echo "$binary_archive" + return + fi + + # Fall back to source archive + if [ -f "/tmp/network-source.tar.gz" ]; then + echo "/tmp/network-source.tar.gz" + return + fi + + echo "" +} + # --- Local mode (no args) --- if [ $# -eq 0 ]; then - if [ ! -f "$ARCHIVE" ]; then - echo "Error: $ARCHIVE not found" + ARCHIVE=$(find_archive) + if [ -z "$ARCHIVE" ]; then + echo "Error: No archive found in /tmp/" + echo " Expected: /tmp/orama-*-linux-*.tar.gz (binary) or /tmp/network-source.tar.gz (source)" exit 1 fi - echo "Extracting source..." - rm -rf "$SRC_DIR" - mkdir -p "$SRC_DIR" "$BIN_DIR" - tar xzf "$ARCHIVE" -C "$SRC_DIR" + FORMAT=$(detect_archive "$ARCHIVE") + echo "Archive: $ARCHIVE (format: $FORMAT)" - # Install CLI binary - if [ -f "$SRC_DIR/bin-linux/orama" ]; then - cp "$SRC_DIR/bin-linux/orama" /usr/local/bin/orama - chmod +x /usr/local/bin/orama - echo " ✓ CLI installed: /usr/local/bin/orama" + if [ "$FORMAT" = "binary" ]; then + # Binary archive → extract to /opt/orama/ + echo "Extracting binary archive..." + mkdir -p /opt/orama + tar xzf "$ARCHIVE" -C /opt/orama + + # Install CLI binary + if [ -f "$BIN_DIR/orama" ]; then + cp "$BIN_DIR/orama" /usr/local/bin/orama + chmod +x /usr/local/bin/orama + echo " ✓ CLI installed: /usr/local/bin/orama" + else + echo " ⚠️ CLI binary not found in archive (bin/orama)" + fi + + echo "Done. Ready for: sudo orama node install --vps-ip ..." else - echo " ⚠️ CLI binary not found in archive (bin-linux/orama)" - fi + # Source archive → extract to /opt/orama/src/ (legacy) + echo "Extracting source archive..." + rm -rf "$SRC_DIR" + mkdir -p "$SRC_DIR" "$BIN_DIR" + tar xzf "$ARCHIVE" -C "$SRC_DIR" - echo "Done. Ready for: sudo orama install --vps-ip ..." + # Install CLI binary + if [ -f "$SRC_DIR/bin-linux/orama" ]; then + cp "$SRC_DIR/bin-linux/orama" /usr/local/bin/orama + chmod +x /usr/local/bin/orama + echo " ✓ CLI installed: /usr/local/bin/orama" + else + echo " ⚠️ CLI binary not found in archive (bin-linux/orama)" + fi + + echo "Done. Ready for: sudo orama node install --vps-ip ..." + fi exit 0 fi diff --git a/scripts/generate-source-archive.sh b/scripts/generate-source-archive.sh deleted file mode 100755 index 7f379b8..0000000 --- a/scripts/generate-source-archive.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# Generates a tarball of the current codebase for deployment -# Output: /tmp/network-source.tar.gz -# -# Includes bin-linux/orama (CLI binary cross-compiled via make build-linux). -# All other binaries are built from source on the VPS during install. -# -# Usage: -# make build-linux -# ./scripts/generate-source-archive.sh -# ./bin/orama install --vps-ip --nameserver --domain ... - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -OUTPUT="/tmp/network-source.tar.gz" - -cd "$PROJECT_ROOT" - -# Remove root-level binaries before archiving (they'll be rebuilt on VPS) -rm -f gateway cli node orama-cli-linux 2>/dev/null - -# Verify CLI binary exists -if [ ! -f "bin-linux/orama" ]; then - echo "Error: bin-linux/orama not found. Run 'make build-linux' first." - exit 1 -fi - -echo "Generating source archive (with CLI binary)..." - -tar czf "$OUTPUT" \ - --exclude='.git' \ - --exclude='node_modules' \ - --exclude='*.log' \ - --exclude='.DS_Store' \ - --exclude='bin/' \ - --exclude='dist/' \ - --exclude='coverage/' \ - --exclude='.claude/' \ - --exclude='testdata/' \ - --exclude='examples/' \ - --exclude='*.tar.gz' \ - . - -echo "Archive created: $OUTPUT" -echo "Size: $(du -h $OUTPUT | cut -f1)" -echo "Includes CLI binary: bin-linux/orama" diff --git a/scripts/recover-rqlite.sh b/scripts/recover-rqlite.sh deleted file mode 100644 index fdebc66..0000000 --- a/scripts/recover-rqlite.sh +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/env bash -# -# Recover RQLite cluster from split-brain. -# -# Strategy: -# 1. Stop orama-node on ALL nodes simultaneously -# 2. Keep raft/ data ONLY on the node with the highest commit index (leader candidate) -# 3. Delete raft/ on all other nodes (they'll join fresh via -join) -# 4. Start the leader candidate first, wait for it to become Leader -# 5. Start all other nodes — they discover the leader via LibP2P and join -# 6. Verify cluster health -# -# Usage: -# scripts/recover-rqlite.sh --devnet --leader 57.129.7.232 -# scripts/recover-rqlite.sh --testnet --leader -# -set -euo pipefail - -# ── Parse flags ────────────────────────────────────────────────────────────── -ENV="" -LEADER_HOST="" - -for arg in "$@"; do - case "$arg" in - --devnet) ENV="devnet" ;; - --testnet) ENV="testnet" ;; - --leader=*) LEADER_HOST="${arg#--leader=}" ;; - -h|--help) - echo "Usage: scripts/recover-rqlite.sh --devnet|--testnet --leader=" - exit 0 - ;; - *) - echo "Unknown flag: $arg" >&2 - exit 1 - ;; - esac -done - -if [[ -z "$ENV" ]]; then - echo "ERROR: specify --devnet or --testnet" >&2 - exit 1 -fi -if [[ -z "$LEADER_HOST" ]]; then - echo "ERROR: specify --leader= (the node with highest commit index)" >&2 - exit 1 -fi - -# ── Paths ──────────────────────────────────────────────────────────────────── -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -CONF="$ROOT_DIR/scripts/remote-nodes.conf" - -die() { echo "ERROR: $*" >&2; exit 1; } -[[ -f "$CONF" ]] || die "Missing $CONF" - -# ── Load nodes from conf ──────────────────────────────────────────────────── -HOSTS=() -PASSES=() -ROLES=() -SSH_KEYS=() - -while IFS='|' read -r env host pass role key; do - [[ -z "$env" || "$env" == \#* ]] && continue - env="${env%%#*}" - env="$(echo "$env" | xargs)" - [[ "$env" != "$ENV" ]] && continue - - HOSTS+=("$host") - PASSES+=("$pass") - ROLES+=("${role:-node}") - SSH_KEYS+=("${key:-}") -done < "$CONF" - -if [[ ${#HOSTS[@]} -eq 0 ]]; then - die "No nodes found for environment '$ENV' in $CONF" -fi - -echo "== recover-rqlite.sh ($ENV) — ${#HOSTS[@]} nodes ==" -echo "Leader candidate: $LEADER_HOST" -echo "" - -# Find leader index -LEADER_IDX=-1 -for i in "${!HOSTS[@]}"; do - if [[ "${HOSTS[$i]}" == *"$LEADER_HOST"* ]]; then - LEADER_IDX=$i - break - fi -done - -if [[ $LEADER_IDX -eq -1 ]]; then - die "Leader host '$LEADER_HOST' not found in node list" -fi - -echo "Nodes:" -for i in "${!HOSTS[@]}"; do - marker="" - [[ $i -eq $LEADER_IDX ]] && marker=" ← LEADER (keep data)" - echo " [$i] ${HOSTS[$i]} (${ROLES[$i]})$marker" -done -echo "" - -# ── SSH helpers ────────────────────────────────────────────────────────────── -SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10) - -node_ssh() { - local idx="$1" - shift - local h="${HOSTS[$idx]}" - local p="${PASSES[$idx]}" - local k="${SSH_KEYS[$idx]:-}" - - if [[ -n "$k" ]]; then - local expanded_key="${k/#\~/$HOME}" - if [[ -f "$expanded_key" ]]; then - ssh -i "$expanded_key" "${SSH_OPTS[@]}" "$h" "$@" 2>/dev/null - return $? - fi - fi - sshpass -p "$p" ssh -n "${SSH_OPTS[@]}" "$h" "$@" 2>/dev/null -} - -# ── Confirmation ───────────────────────────────────────────────────────────── -echo "⚠️ THIS WILL:" -echo " 1. Stop orama-node on ALL ${#HOSTS[@]} nodes" -echo " 2. DELETE raft/ data on ${#HOSTS[@]}-1 nodes (backup to /tmp/rqlite-raft-backup/)" -echo " 3. Keep raft/ data ONLY on ${HOSTS[$LEADER_IDX]} (leader candidate)" -echo " 4. Restart all nodes to reform the cluster" -echo "" -read -r -p "Continue? [y/N] " confirm -if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then - echo "Aborted." - exit 0 -fi -echo "" - -RAFT_DIR="/opt/orama/.orama/data/rqlite/raft" -BACKUP_DIR="/tmp/rqlite-raft-backup" - -# ── Phase 1: Stop orama-node on ALL nodes ─────────────────────────────────── -echo "== Phase 1: Stopping orama-node on all ${#HOSTS[@]} nodes ==" -failed=() -for i in "${!HOSTS[@]}"; do - h="${HOSTS[$i]}" - p="${PASSES[$i]}" - echo -n " Stopping $h ... " - if node_ssh "$i" "printf '%s\n' '$p' | sudo -S systemctl stop orama-node 2>&1 && echo STOPPED"; then - echo "" - else - echo "FAILED" - failed+=("$h") - fi -done - -if [[ ${#failed[@]} -gt 0 ]]; then - echo "" - echo "⚠️ ${#failed[@]} nodes failed to stop. Attempting kill..." - for i in "${!HOSTS[@]}"; do - h="${HOSTS[$i]}" - p="${PASSES[$i]}" - for fh in "${failed[@]}"; do - if [[ "$h" == "$fh" ]]; then - node_ssh "$i" "printf '%s\n' '$p' | sudo -S killall -9 orama-node rqlited 2>/dev/null; echo KILLED" || true - fi - done - done -fi - -echo "" -echo "Waiting 5s for processes to fully stop..." -sleep 5 - -# ── Phase 2: Backup and delete raft/ on non-leader nodes ──────────────────── -echo "== Phase 2: Clearing raft state on non-leader nodes ==" -for i in "${!HOSTS[@]}"; do - [[ $i -eq $LEADER_IDX ]] && continue - - h="${HOSTS[$i]}" - p="${PASSES[$i]}" - echo -n " Clearing $h ... " - if node_ssh "$i" " - printf '%s\n' '$p' | sudo -S bash -c ' - rm -rf $BACKUP_DIR - if [ -d $RAFT_DIR ]; then - cp -r $RAFT_DIR $BACKUP_DIR 2>/dev/null || true - rm -rf $RAFT_DIR - echo \"CLEARED (backup at $BACKUP_DIR)\" - else - echo \"NO_RAFT_DIR (nothing to clear)\" - fi - ' - "; then - true - else - echo "FAILED" - fi -done - -echo "" -echo "Leader node ${HOSTS[$LEADER_IDX]} raft/ data preserved." - -# ── Phase 3: Start leader node ────────────────────────────────────────────── -echo "" -echo "== Phase 3: Starting leader node (${HOSTS[$LEADER_IDX]}) ==" -lp="${PASSES[$LEADER_IDX]}" -node_ssh "$LEADER_IDX" "printf '%s\n' '$lp' | sudo -S systemctl start orama-node" || die "Failed to start leader node" - -echo " Waiting for leader to become Leader..." -max_wait=120 -elapsed=0 -while [[ $elapsed -lt $max_wait ]]; do - state=$(node_ssh "$LEADER_IDX" "curl -s --max-time 3 http://localhost:5001/status 2>/dev/null | python3 -c \"import sys,json; d=json.load(sys.stdin); print(d.get('store',{}).get('raft',{}).get('state',''))\" 2>/dev/null" || echo "") - if [[ "$state" == "Leader" ]]; then - echo " ✓ Leader node is Leader after ${elapsed}s" - break - fi - echo " ... state=$state (${elapsed}s / ${max_wait}s)" - sleep 5 - ((elapsed+=5)) -done - -if [[ "$state" != "Leader" ]]; then - echo " ⚠️ Leader did not become Leader within ${max_wait}s (state=$state)" - echo " The node may need more time. Continuing anyway..." -fi - -# ── Phase 4: Start all other nodes ────────────────────────────────────────── -echo "" -echo "== Phase 4: Starting remaining nodes ==" - -# Start non-leader nodes in batches of 3 with 15s between batches -batch_size=3 -batch_count=0 -for i in "${!HOSTS[@]}"; do - [[ $i -eq $LEADER_IDX ]] && continue - - h="${HOSTS[$i]}" - p="${PASSES[$i]}" - echo -n " Starting $h ... " - if node_ssh "$i" "printf '%s\n' '$p' | sudo -S systemctl start orama-node && echo STARTED"; then - true - else - echo "FAILED" - fi - - ((batch_count++)) - if [[ $((batch_count % batch_size)) -eq 0 ]]; then - echo " (waiting 15s between batches for cluster stability)" - sleep 15 - fi -done - -# ── Phase 5: Wait and verify ──────────────────────────────────────────────── -echo "" -echo "== Phase 5: Waiting for cluster to form (120s) ==" -sleep 30 -echo " ... 30s" -sleep 30 -echo " ... 60s" -sleep 30 -echo " ... 90s" -sleep 30 -echo " ... 120s" - -echo "" -echo "== Cluster status ==" -for i in "${!HOSTS[@]}"; do - h="${HOSTS[$i]}" - result=$(node_ssh "$i" "curl -s --max-time 5 http://localhost:5001/status 2>/dev/null | python3 -c \" -import sys,json -try: - d=json.load(sys.stdin) - r=d.get('store',{}).get('raft',{}) - n=d.get('store',{}).get('num_nodes','?') - print(f'state={r.get(\"state\",\"?\")} commit={r.get(\"commit_index\",\"?\")} leader={r.get(\"leader\",{}).get(\"node_id\",\"?\")} nodes={n}') -except: - print('NO_RESPONSE') -\" 2>/dev/null" || echo "SSH_FAILED") - marker="" - [[ $i -eq $LEADER_IDX ]] && marker=" ← LEADER" - echo " ${HOSTS[$i]}: $result$marker" -done - -echo "" -echo "== Recovery complete ==" -echo "" -echo "Next steps:" -echo " 1. Run 'scripts/inspect.sh --devnet' to verify full cluster health" -echo " 2. If some nodes show Candidate state, give them more time (up to 5 min)" -echo " 3. If nodes fail to join, check /opt/orama/.orama/logs/rqlite-node.log on the node" diff --git a/scripts/redeploy.sh b/scripts/redeploy.sh deleted file mode 100755 index 1add12e..0000000 --- a/scripts/redeploy.sh +++ /dev/null @@ -1,400 +0,0 @@ -#!/usr/bin/env bash -# -# Redeploy to all nodes in a given environment (devnet or testnet). -# Reads node credentials from scripts/remote-nodes.conf. -# -# Flow: -# 1) make build-linux -# 2) scripts/generate-source-archive.sh -> /tmp/network-source.tar.gz -# 3) scp archive + extract-deploy.sh + conf to hub node -# 4) from hub: sshpass scp to all other nodes + sudo bash /tmp/extract-deploy.sh -# 5) rolling upgrade: followers first, leader last -# per node: pre-upgrade -> stop -> extract binary -> post-upgrade -# -# Usage: -# scripts/redeploy.sh --devnet -# scripts/redeploy.sh --testnet -# scripts/redeploy.sh --devnet --no-build -# scripts/redeploy.sh --devnet --skip-build -# -set -euo pipefail - -# ── Parse flags ────────────────────────────────────────────────────────────── -ENV="" -NO_BUILD=0 - -for arg in "$@"; do - case "$arg" in - --devnet) ENV="devnet" ;; - --testnet) ENV="testnet" ;; - --no-build|--skip-build) NO_BUILD=1 ;; - -h|--help) - echo "Usage: scripts/redeploy.sh --devnet|--testnet [--no-build|--skip-build]" - exit 0 - ;; - *) - echo "Unknown flag: $arg" >&2 - echo "Usage: scripts/redeploy.sh --devnet|--testnet [--no-build|--skip-build]" >&2 - exit 1 - ;; - esac -done - -if [[ -z "$ENV" ]]; then - echo "ERROR: specify --devnet or --testnet" >&2 - exit 1 -fi - -# ── Paths ──────────────────────────────────────────────────────────────────── -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -CONF="$ROOT_DIR/scripts/remote-nodes.conf" -ARCHIVE="/tmp/network-source.tar.gz" -EXTRACT_SCRIPT="$ROOT_DIR/scripts/extract-deploy.sh" - -die() { echo "ERROR: $*" >&2; exit 1; } -need_file() { [[ -f "$1" ]] || die "Missing file: $1"; } - -need_file "$CONF" -need_file "$EXTRACT_SCRIPT" - -# ── Load nodes from conf ──────────────────────────────────────────────────── -HOSTS=() -PASSES=() -ROLES=() -SSH_KEYS=() - -while IFS='|' read -r env host pass role key; do - [[ -z "$env" || "$env" == \#* ]] && continue - env="${env%%#*}" - env="$(echo "$env" | xargs)" - [[ "$env" != "$ENV" ]] && continue - - HOSTS+=("$host") - PASSES+=("$pass") - ROLES+=("${role:-node}") - SSH_KEYS+=("${key:-}") -done < "$CONF" - -if [[ ${#HOSTS[@]} -eq 0 ]]; then - die "No nodes found for environment '$ENV' in $CONF" -fi - -echo "== redeploy.sh ($ENV) — ${#HOSTS[@]} nodes ==" -for i in "${!HOSTS[@]}"; do - echo " [$i] ${HOSTS[$i]} (${ROLES[$i]})" -done - -# ── Pick hub node ──────────────────────────────────────────────────────────── -# Hub = first node that has an SSH key configured (direct SCP from local). -# If none have a key, use the first node (via sshpass). -HUB_IDX=0 -HUB_KEY="" -for i in "${!HOSTS[@]}"; do - if [[ -n "${SSH_KEYS[$i]}" ]]; then - expanded_key="${SSH_KEYS[$i]/#\~/$HOME}" - if [[ -f "$expanded_key" ]]; then - HUB_IDX=$i - HUB_KEY="$expanded_key" - break - fi - fi -done - -HUB_HOST="${HOSTS[$HUB_IDX]}" -HUB_PASS="${PASSES[$HUB_IDX]}" - -echo "Hub: $HUB_HOST (idx=$HUB_IDX, key=${HUB_KEY:-none})" - -# ── Build ──────────────────────────────────────────────────────────────────── -if [[ "$NO_BUILD" -eq 0 ]]; then - echo "== build-linux ==" - (cd "$ROOT_DIR" && make build-linux) || { - echo "WARN: make build-linux failed; continuing if existing bin-linux is acceptable." - } -else - echo "== skipping build (--no-build) ==" -fi - -# ── Generate source archive ───────────────────────────────────────────────── -echo "== generate source archive ==" -(cd "$ROOT_DIR" && ./scripts/generate-source-archive.sh) -need_file "$ARCHIVE" - -# ── Helper: SSH/SCP to hub ─────────────────────────────────────────────────── -SSH_OPTS=(-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null) - -hub_scp() { - if [[ -n "$HUB_KEY" ]]; then - scp -i "$HUB_KEY" "${SSH_OPTS[@]}" "$@" - else - sshpass -p "$HUB_PASS" scp "${SSH_OPTS[@]}" "$@" - fi -} - -hub_ssh() { - if [[ -n "$HUB_KEY" ]]; then - ssh -i "$HUB_KEY" "${SSH_OPTS[@]}" "$@" - else - sshpass -p "$HUB_PASS" ssh "${SSH_OPTS[@]}" "$@" - fi -} - -# ── Upload to hub ──────────────────────────────────────────────────────────── -echo "== upload archive + extract script + conf to hub ($HUB_HOST) ==" -hub_scp "$ARCHIVE" "$EXTRACT_SCRIPT" "$CONF" "$HUB_HOST":/tmp/ - -# ── Remote: fan-out + extract + rolling upgrade ───────────────────────────── -echo "== fan-out + extract + rolling upgrade from hub ==" - -hub_ssh "$HUB_HOST" "DEPLOY_ENV=$ENV HUB_IDX=$HUB_IDX bash -s" <<'REMOTE' -set -euo pipefail -export DEBIAN_FRONTEND=noninteractive - -TAR=/tmp/network-source.tar.gz -EX=/tmp/extract-deploy.sh -CONF=/tmp/remote-nodes.conf - -[[ -f "$TAR" ]] || { echo "Missing $TAR on hub"; exit 2; } -[[ -f "$EX" ]] || { echo "Missing $EX on hub"; exit 2; } -[[ -f "$CONF" ]] || { echo "Missing $CONF on hub"; exit 2; } -chmod +x "$EX" || true - -# Parse conf file on the hub — same format as local -hosts=() -passes=() -idx=0 -hub_host="" -hub_pass="" - -while IFS='|' read -r env host pass role key; do - [[ -z "$env" || "$env" == \#* ]] && continue - env="${env%%#*}" - env="$(echo "$env" | xargs)" - [[ "$env" != "$DEPLOY_ENV" ]] && continue - - if [[ $idx -eq $HUB_IDX ]]; then - hub_host="$host" - hub_pass="$pass" - else - hosts+=("$host") - passes+=("$pass") - fi - ((idx++)) || true -done < "$CONF" - -echo "Hub: $hub_host (this machine)" -echo "Fan-out nodes: ${#hosts[@]}" - -# Install sshpass on hub if needed -if [[ ${#hosts[@]} -gt 0 ]] && ! command -v sshpass >/dev/null 2>&1; then - echo "Installing sshpass on hub..." - printf '%s\n' "$hub_pass" | sudo -S apt-get update -y >/dev/null - printf '%s\n' "$hub_pass" | sudo -S apt-get install -y sshpass >/dev/null -fi - -echo "== fan-out: upload to ${#hosts[@]} nodes ==" -upload_failed=() -for i in "${!hosts[@]}"; do - h="${hosts[$i]}" - p="${passes[$i]}" - echo " -> $h" - if ! sshpass -p "$p" scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$TAR" "$EX" "$h":/tmp/; then - echo " !! UPLOAD FAILED: $h" - upload_failed+=("$h") - fi -done - -echo "== extract on all fan-out nodes ==" -for i in "${!hosts[@]}"; do - h="${hosts[$i]}" - p="${passes[$i]}" - echo " -> $h" - if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "printf '%s\n' '$p' | sudo -S bash /tmp/extract-deploy.sh >/tmp/extract.log 2>&1 && echo OK"; then - echo " !! EXTRACT FAILED: $h" - upload_failed+=("$h") - fi -done - -if [[ ${#upload_failed[@]} -gt 0 ]]; then - echo "" - echo "WARNING: ${#upload_failed[@]} nodes had upload/extract failures:" - for uf in "${upload_failed[@]}"; do - echo " - $uf" - done - echo "Continuing with rolling restart..." -fi - -echo "== extract on hub ==" -printf '%s\n' "$hub_pass" | sudo -S bash "$EX" >/tmp/extract.log 2>&1 - -# ── Raft state detection ── -raft_state() { - local h="$1" p="$2" - local cmd="curl -s http://localhost:5001/status" - local parse_py='import sys,json; j=json.load(sys.stdin); r=j.get("store",{}).get("raft",{}); print((r.get("state") or ""), (r.get("num_peers") or 0), (r.get("voter") is True))' - sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "$cmd | python3 -c '$parse_py'" 2>/dev/null || true -} - -echo "== detect leader ==" -leader="" -leader_pass="" - -for i in "${!hosts[@]}"; do - h="${hosts[$i]}" - p="${passes[$i]}" - out="$(raft_state "$h" "$p")" - echo " $h -> ${out:-NO_OUTPUT}" - if [[ "$out" == Leader* ]]; then - leader="$h" - leader_pass="$p" - break - fi -done - -# Check hub itself -if [[ -z "$leader" ]]; then - hub_out="$(curl -s http://localhost:5001/status | python3 -c 'import sys,json; j=json.load(sys.stdin); r=j.get("store",{}).get("raft",{}); print((r.get("state") or ""), (r.get("num_peers") or 0), (r.get("voter") is True))' 2>/dev/null || true)" - echo " hub(localhost) -> ${hub_out:-NO_OUTPUT}" - if [[ "$hub_out" == Leader* ]]; then - leader="HUB" - leader_pass="$hub_pass" - fi -fi - -if [[ -z "$leader" ]]; then - echo "No leader detected. Aborting before upgrades." - exit 3 -fi -echo "Leader: $leader" - -failed_nodes=() - -# ── Per-node upgrade flow ── -# Uses pre-upgrade (maintenance + leadership transfer + propagation wait) -# then stops, deploys binary, and post-upgrade (start + health verification). -upgrade_one() { - local h="$1" p="$2" - echo "== upgrade $h ==" - - # 1. Pre-upgrade: enter maintenance, transfer leadership, wait for propagation - echo " [1/4] pre-upgrade..." - if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "printf '%s\n' '$p' | sudo -S orama prod pre-upgrade" 2>&1; then - echo " !! pre-upgrade failed on $h (continuing with stop)" - fi - - # 2. Stop all services - echo " [2/4] stopping services..." - if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "printf '%s\n' '$p' | sudo -S systemctl stop 'orama-*'" 2>&1; then - echo " !! stop failed on $h" - failed_nodes+=("$h") - return 1 - fi - - # 3. Deploy new binary - echo " [3/4] deploying binary..." - if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "printf '%s\n' '$p' | sudo -S bash /tmp/extract-deploy.sh >/tmp/extract.log 2>&1 && echo OK" 2>&1; then - echo " !! extract failed on $h" - failed_nodes+=("$h") - return 1 - fi - - # 4. Post-upgrade: start services, verify health, exit maintenance - echo " [4/4] post-upgrade..." - if ! sshpass -p "$p" ssh -n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ - "$h" "printf '%s\n' '$p' | sudo -S orama prod post-upgrade" 2>&1; then - echo " !! post-upgrade failed on $h" - failed_nodes+=("$h") - return 1 - fi - - echo " OK: $h" -} - -upgrade_hub() { - echo "== upgrade hub (localhost) ==" - - # 1. Pre-upgrade - echo " [1/4] pre-upgrade..." - if ! (printf '%s\n' "$hub_pass" | sudo -S orama prod pre-upgrade) 2>&1; then - echo " !! pre-upgrade failed on hub (continuing with stop)" - fi - - # 2. Stop all services - echo " [2/4] stopping services..." - if ! (printf '%s\n' "$hub_pass" | sudo -S systemctl stop 'orama-*') 2>&1; then - echo " !! stop failed on hub ($hub_host)" - failed_nodes+=("$hub_host (hub)") - return 1 - fi - - # 3. Deploy new binary - echo " [3/4] deploying binary..." - if ! (printf '%s\n' "$hub_pass" | sudo -S bash "$EX" >/tmp/extract.log 2>&1); then - echo " !! extract failed on hub ($hub_host)" - failed_nodes+=("$hub_host (hub)") - return 1 - fi - - # 4. Post-upgrade - echo " [4/4] post-upgrade..." - if ! (printf '%s\n' "$hub_pass" | sudo -S orama prod post-upgrade) 2>&1; then - echo " !! post-upgrade failed on hub ($hub_host)" - failed_nodes+=("$hub_host (hub)") - return 1 - fi - - echo " OK: hub ($hub_host)" -} - -echo "== rolling upgrade (followers first, leader last) ==" -for i in "${!hosts[@]}"; do - h="${hosts[$i]}" - p="${passes[$i]}" - [[ "$h" == "$leader" ]] && continue - upgrade_one "$h" "$p" || true -done - -# Upgrade hub if not the leader -if [[ "$leader" != "HUB" ]]; then - upgrade_hub || true -fi - -# Upgrade leader last -echo "== upgrade leader last ==" -if [[ "$leader" == "HUB" ]]; then - upgrade_hub || true -else - upgrade_one "$leader" "$leader_pass" || true -fi - -# Clean up conf from hub -rm -f "$CONF" - -# ── Report results ── -echo "" -echo "========================================" -if [[ ${#failed_nodes[@]} -gt 0 ]]; then - echo "UPGRADE COMPLETED WITH FAILURES (${#failed_nodes[@]} nodes failed):" - for fn in "${failed_nodes[@]}"; do - echo " FAILED: $fn" - done - echo "" - echo "Recommended actions:" - echo " 1. SSH into the failed node(s)" - echo " 2. Check logs: sudo orama prod logs node --follow" - echo " 3. Manually run: sudo orama prod post-upgrade" - echo "========================================" - exit 1 -else - echo "All nodes upgraded successfully." - echo "========================================" -fi -REMOTE - -echo "== complete ==" diff --git a/scripts/upgrade-nodes.sh b/scripts/upgrade-nodes.sh deleted file mode 100755 index 0c1d076..0000000 --- a/scripts/upgrade-nodes.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -# Rolling upgrade of nodes: runs `orama node upgrade --restart` one node at a time. -# -# Usage: -# ./scripts/upgrade-nodes.sh --env testnet -# ./scripts/upgrade-nodes.sh --env devnet -# ./scripts/upgrade-nodes.sh [ ...] - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CONF="$SCRIPT_DIR/remote-nodes.conf" - -resolve_nodes() { - if [ "$1" = "--env" ] && [ -n "$2" ] && [ -f "$CONF" ]; then - grep "^$2|" "$CONF" | while IFS='|' read -r env userhost pass role; do - local user="${userhost%%@*}" - local host="${userhost##*@}" - echo "$user|$host|$pass" - done - return - fi - - for ip in "$@"; do - if [ -f "$CONF" ]; then - local match - match=$(grep "|[^|]*@${ip}|" "$CONF" | head -1) - if [ -n "$match" ]; then - local userhost pass - userhost=$(echo "$match" | cut -d'|' -f2) - pass=$(echo "$match" | cut -d'|' -f3) - local user="${userhost%%@*}" - echo "$user|$ip|$pass" - continue - fi - fi - echo "ubuntu|$ip|" - done -} - -upgrade_node() { - local user="$1" host="$2" pass="$3" - - echo "→ Upgrading $user@$host..." - - local sudo_prefix="" - [ "$user" != "root" ] && sudo_prefix="sudo " - - local cmd="${sudo_prefix}orama node upgrade --restart" - - if [ -n "$pass" ]; then - sshpass -p "$pass" ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" "$cmd" - else - ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - "$user@$host" "$cmd" - fi -} - -if [ $# -eq 0 ]; then - echo "Usage: $0 --env " - echo " $0 [ ...]" - exit 1 -fi - -# Count nodes -node_count=$(resolve_nodes "$@" | wc -l | tr -d ' ') -echo "Rolling upgrade: $node_count nodes (serial)" -echo "" - -i=0 -resolve_nodes "$@" | while IFS='|' read -r user host pass; do - i=$((i + 1)) - echo "[$i/$node_count] $user@$host" - upgrade_node "$user" "$host" "$pass" - echo " ✓ Done" - if [ "$i" -lt "$node_count" ]; then - echo " Waiting 30s before next node..." - sleep 30 - fi - echo "" -done - -echo "Rolling upgrade complete." diff --git a/scripts/upload-source-fanout.sh b/scripts/upload-source-fanout.sh deleted file mode 100755 index 3ed4961..0000000 --- a/scripts/upload-source-fanout.sh +++ /dev/null @@ -1,210 +0,0 @@ -#!/bin/bash -# Upload source to one seed node, then fan out to all others in parallel. -# ~3x faster than sequential: one slow upload + fast parallel inter-node transfers. -# -# Usage: -# ./scripts/upload-source-fanout.sh --env devnet -# ./scripts/upload-source-fanout.sh --env testnet - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARCHIVE="/tmp/network-source.tar.gz" -CONF="$SCRIPT_DIR/remote-nodes.conf" -REMOTE_ARCHIVE="/tmp/network-source.tar.gz" - -if [ ! -f "$ARCHIVE" ]; then - echo "Error: $ARCHIVE not found" - echo "Run: make build-linux && ./scripts/generate-source-archive.sh" - exit 1 -fi - -if [ "$1" != "--env" ] || [ -z "$2" ]; then - echo "Usage: $0 --env " - exit 1 -fi - -ENV="$2" - -# Parse all nodes for this environment -declare -a USERS HOSTS PASSES KEYS -i=0 -while IFS='|' read -r env userhost pass role key; do - [ -z "$env" ] && continue - case "$env" in \#*) continue;; esac - env="$(echo "$env" | xargs)" - [ "$env" != "$ENV" ] && continue - - USERS[$i]="${userhost%%@*}" - HOSTS[$i]="${userhost##*@}" - PASSES[$i]="$pass" - KEYS[$i]="$(echo "${key:-}" | xargs)" - ((i++)) -done < "$CONF" - -TOTAL=${#HOSTS[@]} -if [ "$TOTAL" -eq 0 ]; then - echo "No nodes found for environment: $ENV" - exit 1 -fi - -echo "Source archive: $ARCHIVE ($(du -h "$ARCHIVE" | cut -f1))" -echo "Fanout: upload to 1 seed, then parallel to $((TOTAL - 1)) others" -echo "" - -# --- Helper functions --- - -run_ssh() { - local user="$1" host="$2" pass="$3" key="$4" - shift 4 - local opts="-o StrictHostKeyChecking=no -o ConnectTimeout=10" - if [ -n "$key" ]; then - ssh -n $opts -i "$key" "$user@$host" "$@" - elif [ -n "$pass" ]; then - sshpass -p "$pass" ssh -n $opts \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" "$@" - else - ssh -n $opts "$user@$host" "$@" - fi -} - -# Like run_ssh but without -n, so stdin can be piped through -run_ssh_stdin() { - local user="$1" host="$2" pass="$3" key="$4" - shift 4 - local opts="-o StrictHostKeyChecking=no -o ConnectTimeout=10" - if [ -n "$key" ]; then - ssh $opts -i "$key" "$user@$host" "$@" - elif [ -n "$pass" ]; then - sshpass -p "$pass" ssh $opts \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" "$@" - else - ssh $opts "$user@$host" "$@" - fi -} - -run_scp() { - local user="$1" host="$2" pass="$3" key="$4" src="$5" dst="$6" - local opts="-o StrictHostKeyChecking=no -o ConnectTimeout=10" - if [ -n "$key" ]; then - scp $opts -i "$key" "$src" "$user@$host:$dst" - elif [ -n "$pass" ]; then - sshpass -p "$pass" scp $opts \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$src" "$user@$host:$dst" - else - scp $opts "$src" "$user@$host:$dst" - fi -} - -extract_on_node() { - local user="$1" host="$2" pass="$3" key="$4" - local sudo_prefix="" - [ "$user" != "root" ] && sudo_prefix="sudo " - run_ssh "$user" "$host" "$pass" "$key" \ - "${sudo_prefix}bash -c 'rm -rf /opt/orama/src && mkdir -p /opt/orama/src /opt/orama/bin && tar xzf $REMOTE_ARCHIVE -C /opt/orama/src 2>/dev/null && if [ -f /opt/orama/src/bin-linux/orama ]; then cp /opt/orama/src/bin-linux/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama; fi && echo \"\$(ls /opt/orama/src/ | wc -l) files\"'" -} - -# --- Step 1: Upload to seed (first node) --- - -SEED_USER="${USERS[0]}" -SEED_HOST="${HOSTS[0]}" -SEED_PASS="${PASSES[0]}" -SEED_KEY="${KEYS[0]}" - -echo "=== Step 1/3: Upload to seed ($SEED_USER@$SEED_HOST) ===" -run_scp "$SEED_USER" "$SEED_HOST" "$SEED_PASS" "$SEED_KEY" "$ARCHIVE" "$REMOTE_ARCHIVE" -extract_on_node "$SEED_USER" "$SEED_HOST" "$SEED_PASS" "$SEED_KEY" -echo " ✓ Seed ready" -echo "" - -# --- Step 2: Install sshpass on seed if needed --- - -echo "=== Step 2/3: Prepare seed for fanout ===" -run_ssh "$SEED_USER" "$SEED_HOST" "$SEED_PASS" "$SEED_KEY" \ - "which sshpass >/dev/null 2>&1 || (sudo apt-get update -qq >/dev/null 2>&1 && sudo apt-get install -y -qq sshpass >/dev/null 2>&1)" -echo " ✓ sshpass available on seed" -echo "" - -# --- Step 3: Fan out from seed to all other nodes in parallel --- - -echo "=== Step 3/3: Fanout to $((TOTAL - 1)) nodes ===" - -# Collect nodes that need key-based auth (can't fanout, key is local) -declare -a KEY_NODES - -# Build a targets file for the seed: user|host|pass|is_root (one per line, base64-encoded passwords) -TARGETS_CONTENT="" -for ((j=1; j /tmp/fanout-targets.txt" <<< "$TARGETS_CONTENT" - -FANOUT='#!/bin/bash -ARCHIVE="/tmp/network-source.tar.gz" -PIDS=() -LABELS=() - -while IFS="|" read -r user host b64pass is_root; do - [ -z "$user" ] && continue - pass=$(echo "$b64pass" | base64 -d) - sudo_prefix="" - [ "$is_root" != "1" ] && sudo_prefix="sudo " - - ( - sshpass -p "$pass" scp \ - -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$ARCHIVE" "$user@$host:$ARCHIVE" && \ - sshpass -p "$pass" ssh -n \ - -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" \ - "${sudo_prefix}bash -c '\''rm -rf /opt/orama/src && mkdir -p /opt/orama/src /opt/orama/bin && tar xzf /tmp/network-source.tar.gz -C /opt/orama/src 2>/dev/null && if [ -f /opt/orama/src/bin-linux/orama ]; then cp /opt/orama/src/bin-linux/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama; fi'\''" && \ - echo " ✓ $user@$host" || \ - echo " ✗ $user@$host FAILED" - ) & - PIDS+=($!) - LABELS+=("$user@$host") -done < /tmp/fanout-targets.txt - -FAILED=0 -for i in "${!PIDS[@]}"; do - if ! wait "${PIDS[$i]}"; then - FAILED=1 - fi -done - -rm -f /tmp/fanout-targets.txt /tmp/fanout.sh -exit $FAILED -' - -run_ssh_stdin "$SEED_USER" "$SEED_HOST" "$SEED_PASS" "$SEED_KEY" "cat > /tmp/fanout.sh && chmod +x /tmp/fanout.sh" <<< "$FANOUT" - -# Run fanout (allocate tty for live output) -run_ssh "$SEED_USER" "$SEED_HOST" "$SEED_PASS" "$SEED_KEY" "bash /tmp/fanout.sh" - -# Handle key-based auth nodes directly from local (key isn't on seed) -for idx in "${KEY_NODES[@]}"; do - echo "" - echo "→ Direct upload to ${USERS[$idx]}@${HOSTS[$idx]} (SSH key auth)..." - run_scp "${USERS[$idx]}" "${HOSTS[$idx]}" "${PASSES[$idx]}" "${KEYS[$idx]}" "$ARCHIVE" "$REMOTE_ARCHIVE" - extract_on_node "${USERS[$idx]}" "${HOSTS[$idx]}" "${PASSES[$idx]}" "${KEYS[$idx]}" - echo " ✓ ${USERS[$idx]}@${HOSTS[$idx]}" -done - -echo "" -echo "Done. All $TOTAL nodes updated." -echo "Now run: ./bin/orama install --vps-ip ..." diff --git a/scripts/upload-source.sh b/scripts/upload-source.sh deleted file mode 100755 index 53c15f9..0000000 --- a/scripts/upload-source.sh +++ /dev/null @@ -1,103 +0,0 @@ -#!/bin/bash -# Upload and extract the source archive to one or more VPS nodes. -# -# Prerequisites: -# make build-linux -# ./scripts/generate-source-archive.sh -# -# Usage: -# ./scripts/upload-source.sh [ ...] -# ./scripts/upload-source.sh --env testnet # upload to all testnet nodes -# -# After uploading, run install: -# ./bin/orama install --vps-ip --nameserver --domain ... - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -ARCHIVE="/tmp/network-source.tar.gz" -CONF="$SCRIPT_DIR/remote-nodes.conf" - -if [ ! -f "$ARCHIVE" ]; then - echo "Error: $ARCHIVE not found" - echo "Run: make build-linux && ./scripts/generate-source-archive.sh" - exit 1 -fi - -# Resolve VPS list from --env flag or direct IPs -resolve_nodes() { - if [ "$1" = "--env" ] && [ -n "$2" ] && [ -f "$CONF" ]; then - grep "^$2|" "$CONF" | while IFS='|' read -r env userhost pass role; do - local user="${userhost%%@*}" - local host="${userhost##*@}" - echo "$user|$host|$pass" - done - return - fi - - # Direct IPs — look up credentials from conf - for ip in "$@"; do - if [ -f "$CONF" ]; then - local match - match=$(grep "|[^|]*@${ip}|" "$CONF" | head -1) - if [ -n "$match" ]; then - local userhost pass - userhost=$(echo "$match" | cut -d'|' -f2) - pass=$(echo "$match" | cut -d'|' -f3) - local user="${userhost%%@*}" - echo "$user|$ip|$pass" - continue - fi - fi - # Fallback: prompt for credentials - echo "ubuntu|$ip|" - done -} - -upload_to_node() { - local user="$1" host="$2" pass="$3" - - echo "→ Uploading to $user@$host..." - - # Upload archive - if [ -n "$pass" ]; then - sshpass -p "$pass" scp -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$ARCHIVE" "$user@$host:/tmp/network-source.tar.gz" - else - scp -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - "$ARCHIVE" "$user@$host:/tmp/network-source.tar.gz" - fi - - # Extract on VPS - local sudo_prefix="" - [ "$user" != "root" ] && sudo_prefix="sudo " - - local extract_cmd="${sudo_prefix}bash -c 'rm -rf /opt/orama/src && mkdir -p /opt/orama/src /opt/orama/bin && tar xzf /tmp/network-source.tar.gz -C /opt/orama/src 2>/dev/null && if [ -f /opt/orama/src/bin-linux/orama ]; then cp /opt/orama/src/bin-linux/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama; fi && echo \" ✓ Extracted (\$(ls /opt/orama/src/ | wc -l) files)\"'" - - if [ -n "$pass" ]; then - sshpass -p "$pass" ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" "$extract_cmd" - else - ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - "$user@$host" "$extract_cmd" - fi -} - -# Main -if [ $# -eq 0 ]; then - echo "Usage: $0 [ ...]" - echo " $0 --env testnet" - exit 1 -fi - -echo "Source archive: $ARCHIVE ($(du -h "$ARCHIVE" | cut -f1))" -echo "" - -resolve_nodes "$@" | while IFS='|' read -r user host pass; do - upload_to_node "$user" "$host" "$pass" - echo "" -done - -echo "Done. Now run: ./bin/orama install --vps-ip ..." From f0d26211992551226e402b99f512ad843ce3856e Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 24 Feb 2026 14:28:11 +0200 Subject: [PATCH 02/13] Removed extract deploy script --- pkg/cli/production/install/remote.go | 6 +- scripts/extract-deploy.sh | 156 --------------------------- 2 files changed, 3 insertions(+), 159 deletions(-) delete mode 100755 scripts/extract-deploy.sh diff --git a/pkg/cli/production/install/remote.go b/pkg/cli/production/install/remote.go index b5b10a5..de70744 100644 --- a/pkg/cli/production/install/remote.go +++ b/pkg/cli/production/install/remote.go @@ -40,7 +40,7 @@ func NewRemoteOrchestrator(flags *Flags) (*RemoteOrchestrator, error) { // Execute runs the remote install process. // If a binary archive exists locally, uploads and extracts it on the VPS // so Phase2b auto-detects pre-built mode. Otherwise, source must already -// be uploaded via: ./scripts/upload-source.sh +// be present on the VPS. func (r *RemoteOrchestrator) Execute() error { fmt.Printf("Installing on %s via SSH (%s@%s)...\n\n", r.flags.VpsIP, r.node.User, r.node.Host) @@ -75,9 +75,9 @@ func (r *RemoteOrchestrator) uploadBinaryArchive() error { return fmt.Errorf("failed to upload archive: %w", err) } - // Extract to /opt/orama/ on VPS + // Extract to /opt/orama/ and install CLI to PATH fmt.Printf("Extracting archive on VPS...\n") - extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && echo ' ✓ Archive extracted to /opt/orama/'", + extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama && echo ' ✓ Archive extracted, CLI installed'", r.sudoPrefix(), remoteTmp, remoteTmp) if err := runSSHStreaming(r.node, extractCmd); err != nil { return fmt.Errorf("failed to extract archive on VPS: %w", err) diff --git a/scripts/extract-deploy.sh b/scripts/extract-deploy.sh deleted file mode 100755 index 484b400..0000000 --- a/scripts/extract-deploy.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# Extracts archives and places binaries on VPS nodes. -# -# Supports two archive formats: -# 1. Binary archive (from `orama build`): contains bin/, systemd/, manifest.json -# → Extracts to /opt/orama/, installs CLI from /opt/orama/bin/orama -# 2. Source archive (legacy): contains Go source code + bin-linux/orama -# → Extracts to /opt/orama/src/, installs CLI from bin-linux/orama -# -# Local mode (run directly on VPS): -# sudo bash /opt/orama/src/scripts/extract-deploy.sh -# -# Remote mode (run from dev machine): -# ./scripts/extract-deploy.sh --env testnet -# ./scripts/extract-deploy.sh [ ...] - -set -e - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SRC_DIR="/opt/orama/src" -BIN_DIR="/opt/orama/bin" -CONF="$SCRIPT_DIR/remote-nodes.conf" - -# Detect archive: binary archive has manifest.json at root -detect_archive() { - local archive="$1" - if tar tzf "$archive" 2>/dev/null | grep -q "^manifest\.json$"; then - echo "binary" - else - echo "source" - fi -} - -# Find archive: check for binary archive first, then source archive -find_archive() { - # Check for binary archive (newest orama-*-linux-*.tar.gz in /tmp) - local binary_archive - binary_archive=$(ls -t /tmp/orama-*-linux-*.tar.gz 2>/dev/null | head -1) - if [ -n "$binary_archive" ]; then - echo "$binary_archive" - return - fi - - # Fall back to source archive - if [ -f "/tmp/network-source.tar.gz" ]; then - echo "/tmp/network-source.tar.gz" - return - fi - - echo "" -} - -# --- Local mode (no args) --- -if [ $# -eq 0 ]; then - ARCHIVE=$(find_archive) - if [ -z "$ARCHIVE" ]; then - echo "Error: No archive found in /tmp/" - echo " Expected: /tmp/orama-*-linux-*.tar.gz (binary) or /tmp/network-source.tar.gz (source)" - exit 1 - fi - - FORMAT=$(detect_archive "$ARCHIVE") - echo "Archive: $ARCHIVE (format: $FORMAT)" - - if [ "$FORMAT" = "binary" ]; then - # Binary archive → extract to /opt/orama/ - echo "Extracting binary archive..." - mkdir -p /opt/orama - tar xzf "$ARCHIVE" -C /opt/orama - - # Install CLI binary - if [ -f "$BIN_DIR/orama" ]; then - cp "$BIN_DIR/orama" /usr/local/bin/orama - chmod +x /usr/local/bin/orama - echo " ✓ CLI installed: /usr/local/bin/orama" - else - echo " ⚠️ CLI binary not found in archive (bin/orama)" - fi - - echo "Done. Ready for: sudo orama node install --vps-ip ..." - else - # Source archive → extract to /opt/orama/src/ (legacy) - echo "Extracting source archive..." - rm -rf "$SRC_DIR" - mkdir -p "$SRC_DIR" "$BIN_DIR" - tar xzf "$ARCHIVE" -C "$SRC_DIR" - - # Install CLI binary - if [ -f "$SRC_DIR/bin-linux/orama" ]; then - cp "$SRC_DIR/bin-linux/orama" /usr/local/bin/orama - chmod +x /usr/local/bin/orama - echo " ✓ CLI installed: /usr/local/bin/orama" - else - echo " ⚠️ CLI binary not found in archive (bin-linux/orama)" - fi - - echo "Done. Ready for: sudo orama node install --vps-ip ..." - fi - exit 0 -fi - -# --- Remote mode --- - -resolve_nodes() { - if [ "$1" = "--env" ] && [ -n "$2" ] && [ -f "$CONF" ]; then - grep "^$2|" "$CONF" | while IFS='|' read -r env userhost pass role; do - local user="${userhost%%@*}" - local host="${userhost##*@}" - echo "$user|$host|$pass" - done - return - fi - - for ip in "$@"; do - if [ -f "$CONF" ]; then - local match - match=$(grep "|[^|]*@${ip}|" "$CONF" | head -1) - if [ -n "$match" ]; then - local userhost pass - userhost=$(echo "$match" | cut -d'|' -f2) - pass=$(echo "$match" | cut -d'|' -f3) - local user="${userhost%%@*}" - echo "$user|$ip|$pass" - continue - fi - fi - echo "ubuntu|$ip|" - done -} - -extract_on_node() { - local user="$1" host="$2" pass="$3" - - echo "→ Extracting on $user@$host..." - - local sudo_prefix="" - [ "$user" != "root" ] && sudo_prefix="sudo " - - local extract_cmd="${sudo_prefix}bash -c 'rm -rf /opt/orama/src && mkdir -p /opt/orama/src /opt/orama/bin && tar xzf /tmp/network-source.tar.gz -C /opt/orama/src 2>/dev/null && if [ -f /opt/orama/src/bin-linux/orama ]; then cp /opt/orama/src/bin-linux/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama; fi && echo \" ✓ Extracted (\$(ls /opt/orama/src/ | wc -l) files)\"'" - - if [ -n "$pass" ]; then - sshpass -p "$pass" ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - -o PreferredAuthentications=password -o PubkeyAuthentication=no \ - "$user@$host" "$extract_cmd" - else - ssh -n -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ - "$user@$host" "$extract_cmd" - fi -} - -resolve_nodes "$@" | while IFS='|' read -r user host pass; do - extract_on_node "$user" "$host" "$pass" - echo "" -done - -echo "Done." From 6898f47e2e0fb6f31141222dfb5b8814f89af39f Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 24 Feb 2026 17:24:16 +0200 Subject: [PATCH 03/13] Replace sshpass password auth with RootWallet SSH keys Replaces plaintext password-based SSH authentication (sshpass) across the entire Go CLI with wallet-derived ed25519 keys via RootWallet. - Add `rw vault ssh agent-load` command to RootWallet CLI for SSH agent forwarding in push fanout - Create wallet.go bridge: PrepareNodeKeys resolves keys from `rw vault ssh get --priv`, writes temp PEMs (0600), zero-overwrites on cleanup - Remove Password field from Node struct, update config parser to new 3-field format (env|user@host|role) - Remove all sshpass branches from inspector/ssh.go and remotessh/ssh.go, require SSHKey on all SSH paths - Add WithAgentForward() option to RunSSHStreaming for hub fanout - Add PrepareNodeKeys + defer cleanup to all 7 entry points: inspect, monitor, push, upgrade, clean, recover, install - Update push fanout to use SSH agent forwarding instead of sshpass on hub - Delete install/ssh.go duplicate, replace with remotessh calls - Create nodes.conf from remote-nodes.conf (topology only, no secrets) - Update all config defaults and help text from remote-nodes.conf to nodes.conf - Use StrictHostKeyChecking=accept-new consistently everywhere --- go.mod | 8 +- pkg/cli/cluster/commands.go | 2 +- pkg/cli/cmd/monitorcmd/monitor.go | 2 +- pkg/cli/inspect_command.go | 11 +- pkg/cli/monitor/collector.go | 10 +- pkg/cli/production/clean/clean.go | 6 + pkg/cli/production/install/remote.go | 60 +++++++--- pkg/cli/production/install/ssh.go | 153 ------------------------- pkg/cli/production/push/push.go | 33 ++++-- pkg/cli/production/recover/recover.go | 6 + pkg/cli/production/upgrade/remote.go | 6 + pkg/cli/remotessh/config.go | 28 ++--- pkg/cli/remotessh/ssh.go | 85 +++++++------- pkg/cli/remotessh/wallet.go | 158 ++++++++++++++++++++++++++ pkg/inspector/checks/helpers_test.go | 1 - pkg/inspector/config.go | 25 ++-- pkg/inspector/config_test.go | 24 ++-- pkg/inspector/ssh.go | 39 +++---- scripts/nodes.conf | 42 +++++++ 19 files changed, 399 insertions(+), 300 deletions(-) delete mode 100644 pkg/cli/production/install/ssh.go create mode 100644 pkg/cli/remotessh/wallet.go create mode 100644 scripts/nodes.conf diff --git a/go.mod b/go.mod index bb89867..740f29a 100644 --- a/go.mod +++ b/go.mod @@ -20,6 +20,10 @@ require ( github.com/miekg/dns v1.1.70 github.com/multiformats/go-multiaddr v0.16.0 github.com/olric-data/olric v0.7.0 + github.com/pion/interceptor v0.1.40 + github.com/pion/rtcp v1.2.15 + github.com/pion/turn/v4 v4.0.2 + github.com/pion/webrtc/v4 v4.1.2 github.com/rqlite/gorqlite v0.0.0-20250609141355-ac86a4a1c9a8 github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 @@ -123,11 +127,9 @@ require ( github.com/pion/dtls/v2 v2.2.12 // indirect github.com/pion/dtls/v3 v3.0.6 // indirect github.com/pion/ice/v4 v4.0.10 // indirect - github.com/pion/interceptor v0.1.40 // indirect github.com/pion/logging v0.2.3 // indirect github.com/pion/mdns/v2 v2.0.7 // indirect github.com/pion/randutil v0.1.0 // indirect - github.com/pion/rtcp v1.2.15 // indirect github.com/pion/rtp v1.8.19 // indirect github.com/pion/sctp v1.8.39 // indirect github.com/pion/sdp/v3 v3.0.13 // indirect @@ -136,8 +138,6 @@ require ( github.com/pion/stun/v3 v3.0.0 // indirect github.com/pion/transport/v2 v2.2.10 // indirect github.com/pion/transport/v3 v3.0.7 // indirect - github.com/pion/turn/v4 v4.0.2 // indirect - github.com/pion/webrtc/v4 v4.1.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.0 // indirect diff --git a/pkg/cli/cluster/commands.go b/pkg/cli/cluster/commands.go index 68fc725..d4af9d0 100644 --- a/pkg/cli/cluster/commands.go +++ b/pkg/cli/cluster/commands.go @@ -61,7 +61,7 @@ func ShowHelp() { fmt.Printf("Subcommands:\n") fmt.Printf(" status - Show cluster node status (RQLite + Olric)\n") fmt.Printf(" Options:\n") - fmt.Printf(" --all - SSH into all nodes from remote-nodes.conf (TODO)\n") + fmt.Printf(" --all - SSH into all nodes from nodes.conf (TODO)\n") fmt.Printf(" health - Run cluster health checks\n") fmt.Printf(" rqlite - RQLite-specific commands\n") fmt.Printf(" status - Show detailed Raft state for local node\n") diff --git a/pkg/cli/cmd/monitorcmd/monitor.go b/pkg/cli/cmd/monitorcmd/monitor.go index f1a9495..9b77002 100644 --- a/pkg/cli/cmd/monitorcmd/monitor.go +++ b/pkg/cli/cmd/monitorcmd/monitor.go @@ -34,7 +34,7 @@ func init() { Cmd.PersistentFlags().StringVar(&flagEnv, "env", "", "Environment: devnet, testnet, mainnet (required)") Cmd.PersistentFlags().BoolVar(&flagJSON, "json", false, "Machine-readable JSON output") Cmd.PersistentFlags().StringVar(&flagNode, "node", "", "Filter to specific node host/IP") - Cmd.PersistentFlags().StringVar(&flagConfig, "config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf") + Cmd.PersistentFlags().StringVar(&flagConfig, "config", "scripts/nodes.conf", "Path to nodes.conf") Cmd.MarkPersistentFlagRequired("env") Cmd.AddCommand(liveCmd) diff --git a/pkg/cli/inspect_command.go b/pkg/cli/inspect_command.go index 9fedf66..d8251e6 100644 --- a/pkg/cli/inspect_command.go +++ b/pkg/cli/inspect_command.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" "github.com/DeBrosOfficial/network/pkg/inspector" // Import checks package so init() registers the checkers _ "github.com/DeBrosOfficial/network/pkg/inspector/checks" @@ -49,7 +50,7 @@ func HandleInspectCommand(args []string) { fs := flag.NewFlagSet("inspect", flag.ExitOnError) - configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf") + configPath := fs.String("config", "scripts/nodes.conf", "Path to nodes.conf") env := fs.String("env", "", "Environment to inspect (devnet, testnet)") subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,anyone,all)") format := fs.String("format", "table", "Output format (table, json)") @@ -98,6 +99,14 @@ func HandleInspectCommand(args []string) { os.Exit(1) } + // Prepare wallet-derived SSH keys + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + fmt.Fprintf(os.Stderr, "Error preparing SSH keys: %v\n", err) + os.Exit(1) + } + defer cleanup() + // Parse subsystems var subsystems []string if *subsystem != "all" { diff --git a/pkg/cli/monitor/collector.go b/pkg/cli/monitor/collector.go index 2adc726..1e7ec53 100644 --- a/pkg/cli/monitor/collector.go +++ b/pkg/cli/monitor/collector.go @@ -8,6 +8,7 @@ import ( "time" "github.com/DeBrosOfficial/network/pkg/cli/production/report" + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" "github.com/DeBrosOfficial/network/pkg/inspector" ) @@ -34,6 +35,13 @@ func CollectOnce(ctx context.Context, cfg CollectorConfig) (*ClusterSnapshot, er return nil, fmt.Errorf("no nodes found for env %q", cfg.Env) } + // Prepare wallet-derived SSH keys + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return nil, fmt.Errorf("prepare SSH keys: %w", err) + } + defer cleanup() + timeout := cfg.Timeout if timeout == 0 { timeout = 30 * time.Second @@ -87,7 +95,7 @@ func collectNodeReport(ctx context.Context, node inspector.Node, timeout time.Du return cs } - // Enrich with node metadata from remote-nodes.conf + // Enrich with node metadata from nodes.conf if rpt.Hostname == "" { rpt.Hostname = node.Host } diff --git a/pkg/cli/production/clean/clean.go b/pkg/cli/production/clean/clean.go index 65d1435..547a9a3 100644 --- a/pkg/cli/production/clean/clean.go +++ b/pkg/cli/production/clean/clean.go @@ -63,6 +63,12 @@ func execute(flags *Flags) error { return err } + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return err + } + defer cleanup() + if flags.Node != "" { nodes = remotessh.FilterByIP(nodes, flags.Node) if len(nodes) == 0 { diff --git a/pkg/cli/production/install/remote.go b/pkg/cli/production/install/remote.go index de70744..34a0d1f 100644 --- a/pkg/cli/production/install/remote.go +++ b/pkg/cli/production/install/remote.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" "github.com/DeBrosOfficial/network/pkg/inspector" ) @@ -14,39 +15,72 @@ import ( // It uploads the source archive, extracts it on the VPS, and runs // the actual install command remotely. type RemoteOrchestrator struct { - flags *Flags - node inspector.Node + flags *Flags + node inspector.Node + cleanup func() } // NewRemoteOrchestrator creates a new remote orchestrator. -// It resolves SSH credentials and checks prerequisites. +// Resolves SSH credentials via wallet-derived keys and checks prerequisites. func NewRemoteOrchestrator(flags *Flags) (*RemoteOrchestrator, error) { if flags.VpsIP == "" { return nil, fmt.Errorf("--vps-ip is required\nExample: orama install --vps-ip 1.2.3.4 --nameserver --domain orama-testnet.network") } - // Resolve SSH credentials - node, err := resolveSSHCredentials(flags.VpsIP) - if err != nil { - return nil, fmt.Errorf("failed to resolve SSH credentials: %w", err) + // Try to find this IP in nodes.conf for the correct user + user := resolveUser(flags.VpsIP) + + node := inspector.Node{ + User: user, + Host: flags.VpsIP, + Role: "node", } + // Prepare wallet-derived SSH key + nodes := []inspector.Node{node} + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return nil, fmt.Errorf("failed to prepare SSH key: %w\nEnsure you've run: rw vault ssh add %s/%s", err, flags.VpsIP, user) + } + // PrepareNodeKeys modifies nodes in place + node = nodes[0] + return &RemoteOrchestrator{ - flags: flags, - node: node, + flags: flags, + node: node, + cleanup: cleanup, }, nil } +// resolveUser looks up the SSH user for a VPS IP from nodes.conf. +// Falls back to "root" if not found. +func resolveUser(vpsIP string) string { + confPath := remotessh.FindNodesConf() + if confPath != "" { + nodes, err := inspector.LoadNodes(confPath) + if err == nil { + for _, n := range nodes { + if n.Host == vpsIP { + return n.User + } + } + } + } + return "root" +} + // Execute runs the remote install process. // If a binary archive exists locally, uploads and extracts it on the VPS // so Phase2b auto-detects pre-built mode. Otherwise, source must already // be present on the VPS. func (r *RemoteOrchestrator) Execute() error { + defer r.cleanup() + fmt.Printf("Installing on %s via SSH (%s@%s)...\n\n", r.flags.VpsIP, r.node.User, r.node.Host) // Try to upload a binary archive if one exists locally if err := r.uploadBinaryArchive(); err != nil { - fmt.Printf(" ⚠️ Binary archive upload skipped: %v\n", err) + fmt.Printf(" Binary archive upload skipped: %v\n", err) fmt.Printf(" Proceeding with source mode (source must already be on VPS)\n\n") } @@ -71,7 +105,7 @@ func (r *RemoteOrchestrator) uploadBinaryArchive() error { // Upload to /tmp/ on VPS remoteTmp := "/tmp/" + filepath.Base(archivePath) - if err := uploadFile(r.node, archivePath, remoteTmp); err != nil { + if err := remotessh.UploadFile(r.node, archivePath, remoteTmp); err != nil { return fmt.Errorf("failed to upload archive: %w", err) } @@ -79,7 +113,7 @@ func (r *RemoteOrchestrator) uploadBinaryArchive() error { fmt.Printf("Extracting archive on VPS...\n") extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama && echo ' ✓ Archive extracted, CLI installed'", r.sudoPrefix(), remoteTmp, remoteTmp) - if err := runSSHStreaming(r.node, extractCmd); err != nil { + if err := remotessh.RunSSHStreaming(r.node, extractCmd); err != nil { return fmt.Errorf("failed to extract archive on VPS: %w", err) } @@ -118,7 +152,7 @@ func (r *RemoteOrchestrator) findLocalArchive() string { // runRemoteInstall executes `orama install` on the VPS. func (r *RemoteOrchestrator) runRemoteInstall() error { cmd := r.buildRemoteCommand() - return runSSHStreaming(r.node, cmd) + return remotessh.RunSSHStreaming(r.node, cmd) } // buildRemoteCommand constructs the `sudo orama install` command string diff --git a/pkg/cli/production/install/ssh.go b/pkg/cli/production/install/ssh.go deleted file mode 100644 index 5ba1034..0000000 --- a/pkg/cli/production/install/ssh.go +++ /dev/null @@ -1,153 +0,0 @@ -package install - -import ( - "bufio" - "fmt" - "os" - "os/exec" - "path/filepath" - "strings" - - "github.com/DeBrosOfficial/network/pkg/inspector" - "golang.org/x/term" -) - -const sourceArchivePath = "/tmp/network-source.tar.gz" - -// resolveSSHCredentials finds SSH credentials for the given VPS IP. -// First checks remote-nodes.conf, then prompts interactively. -func resolveSSHCredentials(vpsIP string) (inspector.Node, error) { - confPath := findRemoteNodesConf() - if confPath != "" { - nodes, err := inspector.LoadNodes(confPath) - if err == nil { - for _, n := range nodes { - if n.Host == vpsIP { - // Expand ~ in SSH key path - if n.SSHKey != "" && strings.HasPrefix(n.SSHKey, "~") { - home, _ := os.UserHomeDir() - n.SSHKey = filepath.Join(home, n.SSHKey[1:]) - } - return n, nil - } - } - } - } - - // Not found in config — prompt interactively - return promptSSHCredentials(vpsIP), nil -} - -// findRemoteNodesConf searches for the remote-nodes.conf file. -func findRemoteNodesConf() string { - candidates := []string{ - "scripts/remote-nodes.conf", - "../scripts/remote-nodes.conf", - "network/scripts/remote-nodes.conf", - } - for _, c := range candidates { - if _, err := os.Stat(c); err == nil { - return c - } - } - return "" -} - -// promptSSHCredentials asks the user for SSH credentials interactively. -func promptSSHCredentials(vpsIP string) inspector.Node { - reader := bufio.NewReader(os.Stdin) - - fmt.Printf("\nSSH credentials for %s\n", vpsIP) - fmt.Print(" SSH user (default: ubuntu): ") - user, _ := reader.ReadString('\n') - user = strings.TrimSpace(user) - if user == "" { - user = "ubuntu" - } - - fmt.Print(" SSH password: ") - passwordBytes, err := term.ReadPassword(int(os.Stdin.Fd())) - fmt.Println() // newline after hidden input - if err != nil { - // Fall back to plain read if terminal is not available - password, _ := reader.ReadString('\n') - return inspector.Node{ - User: user, - Host: vpsIP, - Password: strings.TrimSpace(password), - } - } - password := string(passwordBytes) - - return inspector.Node{ - User: user, - Host: vpsIP, - Password: password, - } -} - -// uploadFile copies a local file to a remote host via SCP. -func uploadFile(node inspector.Node, localPath, remotePath string) error { - dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath) - - var cmd *exec.Cmd - if node.SSHKey != "" { - cmd = exec.Command("scp", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - localPath, dest, - ) - } else { - if _, err := exec.LookPath("sshpass"); err != nil { - return fmt.Errorf("sshpass not found — install it: brew install hudochenkov/sshpass/sshpass") - } - cmd = exec.Command("sshpass", "-p", node.Password, - "scp", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - localPath, dest, - ) - } - - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - if err := cmd.Run(); err != nil { - return fmt.Errorf("SCP failed: %w", err) - } - return nil -} - -// runSSHStreaming executes a command on a remote host via SSH, -// streaming stdout/stderr to the local terminal in real-time. -func runSSHStreaming(node inspector.Node, command string) error { - var cmd *exec.Cmd - if node.SSHKey != "" { - cmd = exec.Command("ssh", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - fmt.Sprintf("%s@%s", node.User, node.Host), - command, - ) - } else { - cmd = exec.Command("sshpass", "-p", node.Password, - "ssh", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - fmt.Sprintf("%s@%s", node.User, node.Host), - command, - ) - } - - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - cmd.Stdin = os.Stdin // Allow password prompts from remote sudo - - if err := cmd.Run(); err != nil { - return fmt.Errorf("SSH command failed: %w", err) - } - return nil -} - diff --git a/pkg/cli/production/push/push.go b/pkg/cli/production/push/push.go index 9cfebd9..ae54862 100644 --- a/pkg/cli/production/push/push.go +++ b/pkg/cli/production/push/push.go @@ -72,6 +72,13 @@ func execute(flags *Flags) error { return err } + // Prepare wallet-derived SSH keys + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return err + } + defer cleanup() + // Filter to single node if specified if flags.Node != "" { nodes = remotessh.FilterByIP(nodes, flags.Node) @@ -86,6 +93,11 @@ func execute(flags *Flags) error { return pushDirect(archivePath, nodes) } + // Load keys into ssh-agent for fanout forwarding + if err := remotessh.LoadAgentKeys(nodes); err != nil { + return fmt.Errorf("load agent keys for fanout: %w", err) + } + return pushFanout(archivePath, nodes) } @@ -111,7 +123,7 @@ func pushDirect(archivePath string, nodes []inspector.Node) error { return nil } -// pushFanout uploads to a hub node, then fans out to all others via server-to-server SCP. +// pushFanout uploads to a hub node, then fans out to all others via agent forwarding. func pushFanout(archivePath string, nodes []inspector.Node) error { hub := remotessh.PickHubNode(nodes) remotePath := "/tmp/" + filepath.Base(archivePath) @@ -127,7 +139,7 @@ func pushFanout(archivePath string, nodes []inspector.Node) error { } fmt.Printf(" ✓ hub %s done\n\n", hub.Host) - // Step 2: Fan out from hub to remaining nodes in parallel + // Step 2: Fan out from hub to remaining nodes in parallel (via agent forwarding) remaining := make([]inspector.Node, 0, len(nodes)-1) for _, n := range nodes { if n.Host != hub.Host { @@ -150,11 +162,11 @@ func pushFanout(archivePath string, nodes []inspector.Node) error { go func(idx int, target inspector.Node) { defer wg.Done() - // SCP from hub to target, then extract - scpCmd := fmt.Sprintf("sshpass -p '%s' scp -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o PreferredAuthentications=password -o PubkeyAuthentication=no %s %s@%s:%s", - target.Password, remotePath, target.User, target.Host, remotePath) + // SCP from hub to target (agent forwarding serves the key) + scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 %s %s@%s:%s", + remotePath, target.User, target.Host, remotePath) - if err := remotessh.RunSSHStreaming(hub, scpCmd); err != nil { + if err := remotessh.RunSSHStreaming(hub, scpCmd, remotessh.WithAgentForward()); err != nil { errors[idx] = fmt.Errorf("fanout to %s failed: %w", target.Host, err) return } @@ -196,16 +208,17 @@ func extractOnNode(node inspector.Node, remotePath string) error { } // extractOnNodeVia extracts the archive on a target node by SSHing through the hub. +// Uses agent forwarding so the hub can authenticate to the target. func extractOnNodeVia(hub, target inspector.Node, remotePath string) error { sudo := remotessh.SudoPrefix(target) extractCmd := fmt.Sprintf("%smkdir -p /opt/orama && %star xzf %s -C /opt/orama && %srm -f %s", sudo, sudo, remotePath, sudo, remotePath) - // SSH from hub to target to extract - sshCmd := fmt.Sprintf("sshpass -p '%s' ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o PreferredAuthentications=password -o PubkeyAuthentication=no %s@%s '%s'", - target.Password, target.User, target.Host, extractCmd) + // SSH from hub to target to extract (agent forwarding serves the key) + sshCmd := fmt.Sprintf("ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 %s@%s '%s'", + target.User, target.Host, extractCmd) - return remotessh.RunSSHStreaming(hub, sshCmd) + return remotessh.RunSSHStreaming(hub, sshCmd, remotessh.WithAgentForward()) } // findNewestArchive finds the newest binary archive in /tmp/. diff --git a/pkg/cli/production/recover/recover.go b/pkg/cli/production/recover/recover.go index f697325..62a84f4 100644 --- a/pkg/cli/production/recover/recover.go +++ b/pkg/cli/production/recover/recover.go @@ -70,6 +70,12 @@ func execute(flags *Flags) error { return err } + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return err + } + defer cleanup() + // Find leader node leaderNodes := remotessh.FilterByIP(nodes, flags.Leader) if len(leaderNodes) == 0 { diff --git a/pkg/cli/production/upgrade/remote.go b/pkg/cli/production/upgrade/remote.go index e91096c..9e8ec9a 100644 --- a/pkg/cli/production/upgrade/remote.go +++ b/pkg/cli/production/upgrade/remote.go @@ -25,6 +25,12 @@ func (r *RemoteUpgrader) Execute() error { return err } + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return err + } + defer cleanup() + // Filter to single node if specified if r.flags.NodeFilter != "" { nodes = remotessh.FilterByIP(nodes, r.flags.NodeFilter) diff --git a/pkg/cli/remotessh/config.go b/pkg/cli/remotessh/config.go index 19ab610..4556be9 100644 --- a/pkg/cli/remotessh/config.go +++ b/pkg/cli/remotessh/config.go @@ -4,24 +4,23 @@ import ( "fmt" "os" "path/filepath" - "strings" "github.com/DeBrosOfficial/network/pkg/inspector" ) -// FindRemoteNodesConf searches for the remote-nodes.conf file +// FindNodesConf searches for the nodes.conf file // in common locations relative to the current directory or project root. -func FindRemoteNodesConf() string { +func FindNodesConf() string { candidates := []string{ - "scripts/remote-nodes.conf", - "../scripts/remote-nodes.conf", - "network/scripts/remote-nodes.conf", + "scripts/nodes.conf", + "../scripts/nodes.conf", + "network/scripts/nodes.conf", } // Also check from home dir home, _ := os.UserHomeDir() if home != "" { - candidates = append(candidates, filepath.Join(home, ".orama", "remote-nodes.conf")) + candidates = append(candidates, filepath.Join(home, ".orama", "nodes.conf")) } for _, c := range candidates { @@ -32,11 +31,12 @@ func FindRemoteNodesConf() string { return "" } -// LoadEnvNodes loads all nodes for a given environment from remote-nodes.conf. +// LoadEnvNodes loads all nodes for a given environment from nodes.conf. +// SSHKey fields are NOT set — caller must call PrepareNodeKeys() after this. func LoadEnvNodes(env string) ([]inspector.Node, error) { - confPath := FindRemoteNodesConf() + confPath := FindNodesConf() if confPath == "" { - return nil, fmt.Errorf("remote-nodes.conf not found (checked scripts/, ../scripts/, network/scripts/)") + return nil, fmt.Errorf("nodes.conf not found (checked scripts/, ../scripts/, network/scripts/)") } nodes, err := inspector.LoadNodes(confPath) @@ -49,14 +49,6 @@ func LoadEnvNodes(env string) ([]inspector.Node, error) { return nil, fmt.Errorf("no nodes found for environment %q in %s", env, confPath) } - // Expand ~ in SSH key paths - home, _ := os.UserHomeDir() - for i := range filtered { - if filtered[i].SSHKey != "" && strings.HasPrefix(filtered[i].SSHKey, "~") { - filtered[i].SSHKey = filepath.Join(home, filtered[i].SSHKey[1:]) - } - } - return filtered, nil } diff --git a/pkg/cli/remotessh/ssh.go b/pkg/cli/remotessh/ssh.go index e77d7e0..803c384 100644 --- a/pkg/cli/remotessh/ssh.go +++ b/pkg/cli/remotessh/ssh.go @@ -8,31 +8,34 @@ import ( "github.com/DeBrosOfficial/network/pkg/inspector" ) +// SSHOption configures SSH command behavior. +type SSHOption func(*sshOptions) + +type sshOptions struct { + agentForward bool +} + +// WithAgentForward enables SSH agent forwarding (-A flag). +// Used by push fanout so the hub can reach targets via the forwarded agent. +func WithAgentForward() SSHOption { + return func(o *sshOptions) { o.agentForward = true } +} + // UploadFile copies a local file to a remote host via SCP. +// Requires node.SSHKey to be set (via PrepareNodeKeys). func UploadFile(node inspector.Node, localPath, remotePath string) error { + if node.SSHKey == "" { + return fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name()) + } + dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath) - var cmd *exec.Cmd - if node.SSHKey != "" { - cmd = exec.Command("scp", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - localPath, dest, - ) - } else { - if _, err := exec.LookPath("sshpass"); err != nil { - return fmt.Errorf("sshpass not found — install it: brew install hudochenkov/sshpass/sshpass") - } - cmd = exec.Command("sshpass", "-p", node.Password, - "scp", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-o", "PreferredAuthentications=password", - "-o", "PubkeyAuthentication=no", - localPath, dest, - ) - } + cmd := exec.Command("scp", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=10", + "-i", node.SSHKey, + localPath, dest, + ) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -45,28 +48,28 @@ func UploadFile(node inspector.Node, localPath, remotePath string) error { // RunSSHStreaming executes a command on a remote host via SSH, // streaming stdout/stderr to the local terminal in real-time. -func RunSSHStreaming(node inspector.Node, command string) error { - var cmd *exec.Cmd - if node.SSHKey != "" { - cmd = exec.Command("ssh", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - fmt.Sprintf("%s@%s", node.User, node.Host), - command, - ) - } else { - cmd = exec.Command("sshpass", "-p", node.Password, - "ssh", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-o", "PreferredAuthentications=password", - "-o", "PubkeyAuthentication=no", - fmt.Sprintf("%s@%s", node.User, node.Host), - command, - ) +// Requires node.SSHKey to be set (via PrepareNodeKeys). +func RunSSHStreaming(node inspector.Node, command string, opts ...SSHOption) error { + if node.SSHKey == "" { + return fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name()) } + var cfg sshOptions + for _, o := range opts { + o(&cfg) + } + + args := []string{ + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=10", + "-i", node.SSHKey, + } + if cfg.agentForward { + args = append(args, "-A") + } + args = append(args, fmt.Sprintf("%s@%s", node.User, node.Host), command) + + cmd := exec.Command("ssh", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr cmd.Stdin = os.Stdin diff --git a/pkg/cli/remotessh/wallet.go b/pkg/cli/remotessh/wallet.go new file mode 100644 index 0000000..bd8b0b5 --- /dev/null +++ b/pkg/cli/remotessh/wallet.go @@ -0,0 +1,158 @@ +package remotessh + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// PrepareNodeKeys resolves wallet-derived SSH keys for all nodes. +// Calls `rw vault ssh get / --priv` for each unique host/user, +// writes PEMs to temp files, and sets node.SSHKey for each node. +// +// The nodes slice is modified in place — each node.SSHKey is set to +// the path of the temporary key file. +// +// Returns a cleanup function that zero-overwrites and removes all temp files. +// Caller must defer cleanup(). +func PrepareNodeKeys(nodes []inspector.Node) (cleanup func(), err error) { + rw, err := rwBinary() + if err != nil { + return nil, err + } + + // Create temp dir for all keys + tmpDir, err := os.MkdirTemp("", "orama-ssh-") + if err != nil { + return nil, fmt.Errorf("create temp dir: %w", err) + } + + // Track resolved keys by host/user to avoid duplicate rw calls + keyPaths := make(map[string]string) // "host/user" → temp file path + var allKeyPaths []string + + for i := range nodes { + key := nodes[i].Host + "/" + nodes[i].User + if existing, ok := keyPaths[key]; ok { + nodes[i].SSHKey = existing + continue + } + + // Call rw to get the private key PEM + pem, err := resolveWalletKey(rw, nodes[i].Host, nodes[i].User) + if err != nil { + // Cleanup any keys already written before returning error + cleanupKeys(tmpDir, allKeyPaths) + return nil, fmt.Errorf("resolve key for %s: %w", nodes[i].Name(), err) + } + + // Write PEM to temp file with restrictive perms + keyFile := filepath.Join(tmpDir, fmt.Sprintf("id_%d", i)) + if err := os.WriteFile(keyFile, []byte(pem), 0600); err != nil { + cleanupKeys(tmpDir, allKeyPaths) + return nil, fmt.Errorf("write key for %s: %w", nodes[i].Name(), err) + } + + keyPaths[key] = keyFile + allKeyPaths = append(allKeyPaths, keyFile) + nodes[i].SSHKey = keyFile + } + + cleanup = func() { + cleanupKeys(tmpDir, allKeyPaths) + } + return cleanup, nil +} + +// LoadAgentKeys loads SSH keys for the given nodes into the system ssh-agent. +// Used by push fanout to enable agent forwarding. +// Calls `rw vault ssh agent-load ...` +func LoadAgentKeys(nodes []inspector.Node) error { + rw, err := rwBinary() + if err != nil { + return err + } + + // Deduplicate host/user pairs + seen := make(map[string]bool) + var targets []string + for _, n := range nodes { + key := n.Host + "/" + n.User + if seen[key] { + continue + } + seen[key] = true + targets = append(targets, key) + } + + if len(targets) == 0 { + return nil + } + + args := append([]string{"vault", "ssh", "agent-load"}, targets...) + cmd := exec.Command(rw, args...) + cmd.Stderr = os.Stderr + cmd.Stdout = os.Stderr // info messages go to stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("rw vault ssh agent-load failed: %w", err) + } + return nil +} + +// resolveWalletKey calls `rw vault ssh get / --priv` +// and returns the PEM string. Requires an active rw session. +func resolveWalletKey(rw string, host, user string) (string, error) { + target := host + "/" + user + cmd := exec.Command(rw, "vault", "ssh", "get", target, "--priv") + out, err := cmd.Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + stderr := strings.TrimSpace(string(exitErr.Stderr)) + if strings.Contains(stderr, "No SSH entry") { + return "", fmt.Errorf("no vault SSH entry for %s — run: rw vault ssh add %s", target, target) + } + if strings.Contains(stderr, "not unlocked") || strings.Contains(stderr, "session") { + return "", fmt.Errorf("wallet is locked — run: rw unlock") + } + return "", fmt.Errorf("%s", stderr) + } + return "", fmt.Errorf("rw command failed: %w", err) + } + pem := string(out) + if !strings.Contains(pem, "BEGIN OPENSSH PRIVATE KEY") { + return "", fmt.Errorf("rw returned invalid key for %s", target) + } + return pem, nil +} + +// rwBinary returns the path to the `rw` binary. +// Checks RW_PATH env var first, then PATH. +func rwBinary() (string, error) { + if p := os.Getenv("RW_PATH"); p != "" { + if _, err := os.Stat(p); err == nil { + return p, nil + } + return "", fmt.Errorf("RW_PATH=%q not found", p) + } + + p, err := exec.LookPath("rw") + if err != nil { + return "", fmt.Errorf("rw not found in PATH — install rootwallet CLI: https://github.com/DeBrosOfficial/rootwallet") + } + return p, nil +} + +// cleanupKeys zero-overwrites and removes all key files, then removes the temp dir. +func cleanupKeys(tmpDir string, keyPaths []string) { + zeros := make([]byte, 512) + for _, p := range keyPaths { + _ = os.WriteFile(p, zeros, 0600) // zero-overwrite + _ = os.Remove(p) + } + _ = os.Remove(tmpDir) +} diff --git a/pkg/inspector/checks/helpers_test.go b/pkg/inspector/checks/helpers_test.go index 7732028..8bbc923 100644 --- a/pkg/inspector/checks/helpers_test.go +++ b/pkg/inspector/checks/helpers_test.go @@ -13,7 +13,6 @@ func makeNode(host, role string) inspector.Node { Environment: "devnet", User: "ubuntu", Host: host, - Password: "test", Role: role, } } diff --git a/pkg/inspector/config.go b/pkg/inspector/config.go index 524f19e..cad33c7 100644 --- a/pkg/inspector/config.go +++ b/pkg/inspector/config.go @@ -7,14 +7,13 @@ import ( "strings" ) -// Node represents a remote node parsed from remote-nodes.conf. +// Node represents a remote node parsed from nodes.conf. type Node struct { Environment string // devnet, testnet User string // SSH user Host string // IP or hostname - Password string // SSH password Role string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3 - SSHKey string // optional path to SSH key + SSHKey string // populated at runtime by PrepareNodeKeys() } // Name returns a short display name for the node (user@host). @@ -27,8 +26,8 @@ func (n Node) IsNameserver() bool { return strings.HasPrefix(n.Role, "nameserver") } -// LoadNodes parses a remote-nodes.conf file into a slice of Nodes. -// Format: environment|user@host|password|role|ssh_key (ssh_key optional) +// LoadNodes parses a nodes.conf file into a slice of Nodes. +// Format: environment|user@host|role func LoadNodes(path string) ([]Node, error) { f, err := os.Open(path) if err != nil { @@ -46,20 +45,14 @@ func LoadNodes(path string) ([]Node, error) { continue } - parts := strings.SplitN(line, "|", 5) - if len(parts) < 4 { - return nil, fmt.Errorf("line %d: expected at least 4 pipe-delimited fields, got %d", lineNum, len(parts)) + parts := strings.SplitN(line, "|", 4) + if len(parts) < 3 { + return nil, fmt.Errorf("line %d: expected 3 pipe-delimited fields (env|user@host|role), got %d", lineNum, len(parts)) } env := parts[0] userHost := parts[1] - password := parts[2] - role := parts[3] - - var sshKey string - if len(parts) == 5 { - sshKey = parts[4] - } + role := parts[2] // Parse user@host at := strings.LastIndex(userHost, "@") @@ -73,9 +66,7 @@ func LoadNodes(path string) ([]Node, error) { Environment: env, User: user, Host: host, - Password: password, Role: role, - SSHKey: sshKey, }) } if err := scanner.Err(); err != nil { diff --git a/pkg/inspector/config_test.go b/pkg/inspector/config_test.go index 9d7d368..384b5a1 100644 --- a/pkg/inspector/config_test.go +++ b/pkg/inspector/config_test.go @@ -8,9 +8,9 @@ import ( func TestLoadNodes(t *testing.T) { content := `# Comment line -devnet|ubuntu@1.2.3.4|pass123|node -devnet|ubuntu@1.2.3.5|pass456|node -devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key +devnet|ubuntu@1.2.3.4|node +devnet|ubuntu@1.2.3.5|node +devnet|ubuntu@5.6.7.8|nameserver-ns1 ` path := writeTempFile(t, content) @@ -33,34 +33,28 @@ devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key if n.Host != "1.2.3.4" { t.Errorf("node[0].Host = %q, want 1.2.3.4", n.Host) } - if n.Password != "pass123" { - t.Errorf("node[0].Password = %q, want pass123", n.Password) - } if n.Role != "node" { t.Errorf("node[0].Role = %q, want node", n.Role) } if n.SSHKey != "" { - t.Errorf("node[0].SSHKey = %q, want empty", n.SSHKey) + t.Errorf("node[0].SSHKey = %q, want empty (set at runtime)", n.SSHKey) } - // Third node with SSH key + // Third node with nameserver role n3 := nodes[2] if n3.Role != "nameserver-ns1" { t.Errorf("node[2].Role = %q, want nameserver-ns1", n3.Role) } - if n3.SSHKey != "/path/to/key" { - t.Errorf("node[2].SSHKey = %q, want /path/to/key", n3.SSHKey) - } } func TestLoadNodes_EmptyLines(t *testing.T) { content := ` # Full line comment -devnet|ubuntu@1.2.3.4|pass|node +devnet|ubuntu@1.2.3.4|node # Another comment -devnet|ubuntu@1.2.3.5|pass|node +devnet|ubuntu@1.2.3.5|node ` path := writeTempFile(t, content) @@ -78,8 +72,8 @@ func TestLoadNodes_InvalidFormat(t *testing.T) { name string content string }{ - {"too few fields", "devnet|ubuntu@1.2.3.4|pass\n"}, - {"no @ in userhost", "devnet|localhost|pass|node\n"}, + {"too few fields", "devnet|ubuntu@1.2.3.4\n"}, + {"no @ in userhost", "devnet|localhost|node\n"}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/pkg/inspector/ssh.go b/pkg/inspector/ssh.go index e16ad74..f73d2f0 100644 --- a/pkg/inspector/ssh.go +++ b/pkg/inspector/ssh.go @@ -31,7 +31,7 @@ func (r SSHResult) OK() bool { } // RunSSH executes a command on a remote node via SSH with retry on connection failure. -// Uses sshpass for password auth, falls back to -i for key-based auth. +// Requires node.SSHKey to be set (via PrepareNodeKeys). // The -n flag is used to prevent SSH from reading stdin. func RunSSH(ctx context.Context, node Node, command string) SSHResult { var result SSHResult @@ -76,30 +76,23 @@ func RunSSH(ctx context.Context, node Node, command string) SSHResult { func runSSHOnce(ctx context.Context, node Node, command string) SSHResult { start := time.Now() - var args []string - if node.SSHKey != "" { - // Key-based auth - args = []string{ - "ssh", "-n", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - "-o", "BatchMode=yes", - "-i", node.SSHKey, - fmt.Sprintf("%s@%s", node.User, node.Host), - command, - } - } else { - // Password auth via sshpass - args = []string{ - "sshpass", "-p", node.Password, - "ssh", "-n", - "-o", "StrictHostKeyChecking=no", - "-o", "ConnectTimeout=10", - fmt.Sprintf("%s@%s", node.User, node.Host), - command, + if node.SSHKey == "" { + return SSHResult{ + Duration: 0, + Err: fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name()), } } + args := []string{ + "ssh", "-n", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=10", + "-o", "BatchMode=yes", + "-i", node.SSHKey, + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + } + cmd := exec.CommandContext(ctx, args[0], args[1:]...) var stdout, stderr bytes.Buffer @@ -130,8 +123,6 @@ func runSSHOnce(ctx context.Context, node Node, command string) SSHResult { // isSSHConnectionError returns true if the failure looks like an SSH connection // problem (timeout, refused, network unreachable) rather than a remote command error. func isSSHConnectionError(r SSHResult) bool { - // sshpass exit code 5 = invalid/incorrect password (not retriable) - // sshpass exit code 6 = host key verification failed (not retriable) // SSH exit code 255 = SSH connection error (retriable) if r.ExitCode == 255 { return true diff --git a/scripts/nodes.conf b/scripts/nodes.conf new file mode 100644 index 0000000..72e4c36 --- /dev/null +++ b/scripts/nodes.conf @@ -0,0 +1,42 @@ +# Orama Network node topology +# Format: environment|user@host|role +# Auth: wallet-derived SSH keys (rw vault ssh) +# +# environment: devnet, testnet +# role: node, nameserver-ns1, nameserver-ns2, nameserver-ns3 + +# --- Devnet nameservers --- +devnet|ubuntu@57.129.7.232|nameserver-ns1 +devnet|ubuntu@57.131.41.160|nameserver-ns2 +devnet|ubuntu@51.38.128.56|nameserver-ns3 + +# --- Devnet nodes --- +devnet|ubuntu@144.217.162.62|node +devnet|ubuntu@51.83.128.181|node +devnet|ubuntu@144.217.160.15|node +devnet|root@46.250.241.133|node +devnet|root@109.123.229.231|node +devnet|ubuntu@144.217.162.143|node +devnet|ubuntu@144.217.163.114|node +devnet|root@109.123.239.61|node +devnet|root@217.76.56.2|node +devnet|ubuntu@198.244.150.237|node +devnet|root@154.38.187.158|node + +# --- Testnet nameservers --- +testnet|ubuntu@51.195.109.238|nameserver-ns1 +testnet|ubuntu@57.131.41.159|nameserver-ns1 +testnet|ubuntu@51.38.130.69|nameserver-ns1 + +# --- Testnet nodes --- +testnet|root@178.212.35.184|node +testnet|root@62.72.44.87|node +testnet|ubuntu@51.178.84.172|node +testnet|ubuntu@135.125.175.236|node +testnet|ubuntu@57.128.223.149|node +testnet|root@38.242.221.178|node +testnet|root@194.61.28.7|node +testnet|root@83.171.248.66|node +testnet|ubuntu@141.227.165.168|node +testnet|ubuntu@141.227.165.154|node +testnet|ubuntu@141.227.156.51|node From fade8f89ed6917424e3a10f684bc0edd8deb6d57 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Wed, 25 Feb 2026 15:13:18 +0200 Subject: [PATCH 04/13] Added hatzhner support for clustering cli orama to spin up clusters --- docs/SANDBOX.md | 208 +++++++++++ pkg/cli/cmd/sandboxcmd/sandbox.go | 121 +++++++ pkg/cli/sandbox/config.go | 153 +++++++++ pkg/cli/sandbox/create.go | 554 ++++++++++++++++++++++++++++++ pkg/cli/sandbox/destroy.go | 122 +++++++ pkg/cli/sandbox/hetzner.go | 438 +++++++++++++++++++++++ pkg/cli/sandbox/hetzner_test.go | 303 ++++++++++++++++ pkg/cli/sandbox/names.go | 26 ++ pkg/cli/sandbox/rollout.go | 137 ++++++++ pkg/cli/sandbox/setup.go | 319 +++++++++++++++++ pkg/cli/sandbox/ssh_cmd.go | 56 +++ pkg/cli/sandbox/state.go | 211 ++++++++++++ pkg/cli/sandbox/state_test.go | 214 ++++++++++++ pkg/cli/sandbox/status.go | 160 +++++++++ 14 files changed, 3022 insertions(+) create mode 100644 docs/SANDBOX.md create mode 100644 pkg/cli/cmd/sandboxcmd/sandbox.go create mode 100644 pkg/cli/sandbox/config.go create mode 100644 pkg/cli/sandbox/create.go create mode 100644 pkg/cli/sandbox/destroy.go create mode 100644 pkg/cli/sandbox/hetzner.go create mode 100644 pkg/cli/sandbox/hetzner_test.go create mode 100644 pkg/cli/sandbox/names.go create mode 100644 pkg/cli/sandbox/rollout.go create mode 100644 pkg/cli/sandbox/setup.go create mode 100644 pkg/cli/sandbox/ssh_cmd.go create mode 100644 pkg/cli/sandbox/state.go create mode 100644 pkg/cli/sandbox/state_test.go create mode 100644 pkg/cli/sandbox/status.go diff --git a/docs/SANDBOX.md b/docs/SANDBOX.md new file mode 100644 index 0000000..a2df967 --- /dev/null +++ b/docs/SANDBOX.md @@ -0,0 +1,208 @@ +# Sandbox: Ephemeral Hetzner Cloud Clusters + +Spin up temporary 5-node Orama clusters on Hetzner Cloud for development and testing. Total cost: ~€0.04/hour. + +## Quick Start + +```bash +# One-time setup (API key, domain, floating IPs, SSH key) +orama sandbox setup + +# Create a cluster (~5 minutes) +orama sandbox create --name my-feature + +# Check health +orama sandbox status + +# SSH into a node +orama sandbox ssh 1 + +# Deploy code changes +orama sandbox rollout + +# Tear it down +orama sandbox destroy +``` + +## Prerequisites + +### 1. Hetzner Cloud Account + +Create a project at [console.hetzner.cloud](https://console.hetzner.cloud) and generate an API token with read/write permissions under **Security > API Tokens**. + +### 2. Domain with Glue Records + +You need a domain (or subdomain) that points to Hetzner Floating IPs. The `orama sandbox setup` wizard will guide you through this. + +**Example:** Using `sbx.dbrs.space` + +At your domain registrar: +1. Create glue records (Personal DNS Servers): + - `ns1.sbx.dbrs.space` → `` + - `ns2.sbx.dbrs.space` → `` +2. Set custom nameservers for `sbx.dbrs.space`: + - `ns1.sbx.dbrs.space` + - `ns2.sbx.dbrs.space` + +DNS propagation can take up to 48 hours. + +### 3. Binary Archive + +Build the binary archive before creating a cluster: + +```bash +orama build +``` + +This creates `/tmp/orama--linux-amd64.tar.gz` with all pre-compiled binaries. + +## Setup + +Run the interactive setup wizard: + +```bash +orama sandbox setup +``` + +This will: +1. Prompt for your Hetzner API token and validate it +2. Ask for your sandbox domain +3. Create or reuse 2 Hetzner Floating IPs (~$0.005/hr each) +4. Create a firewall with sandbox rules +5. Generate an SSH keypair at `~/.orama/sandbox_key` +6. Upload the public key to Hetzner +7. Display DNS configuration instructions + +Config is saved to `~/.orama/sandbox.yaml`. + +## Commands + +### `orama sandbox create [--name ]` + +Creates a new 5-node cluster. If `--name` is omitted, a random name is generated (e.g., "swift-falcon"). + +**Cluster layout:** +- Nodes 1-2: Nameservers (CoreDNS + Caddy + all services) +- Nodes 3-5: Regular nodes (all services except CoreDNS) + +**Phases:** +1. Provision 5 CX22 servers on Hetzner (parallel, ~90s) +2. Assign floating IPs to nameserver nodes (~10s) +3. Upload binary archive to all nodes (parallel, ~60s) +4. Install genesis node + generate invite tokens (~120s) +5. Join remaining 4 nodes (serial with health checks, ~180s) +6. Verify cluster health (~15s) + +**One sandbox at a time.** Since the floating IPs are shared, only one sandbox can own the nameservers. Destroy the active sandbox before creating a new one. + +### `orama sandbox destroy [--name ] [--force]` + +Tears down a cluster: +1. Unassigns floating IPs +2. Deletes all 5 servers (parallel) +3. Removes state file + +Use `--force` to skip confirmation. + +### `orama sandbox list` + +Lists all sandboxes with their status. Also checks Hetzner for orphaned servers that don't have a corresponding state file. + +### `orama sandbox status [--name ]` + +Shows per-node health including: +- Service status (active/inactive) +- RQLite role (Leader/Follower) +- Cluster summary (commit index, voter count) + +### `orama sandbox rollout [--name ]` + +Deploys code changes: +1. Uses the latest binary archive from `/tmp/` (run `orama build` first) +2. Pushes to all nodes +3. Rolling upgrade: followers first, leader last, 15s between nodes + +### `orama sandbox ssh ` + +Opens an interactive SSH session to a sandbox node (1-5). + +```bash +orama sandbox ssh 1 # SSH into node 1 (genesis/ns1) +orama sandbox ssh 3 # SSH into node 3 (regular node) +``` + +## Architecture + +### Floating IPs + +Hetzner Floating IPs are persistent IPv4 addresses that can be reassigned between servers. They solve the DNS chicken-and-egg problem: + +- Glue records at the registrar point to 2 Floating IPs (configured once) +- Each new sandbox assigns the Floating IPs to its nameserver nodes +- DNS works instantly — no propagation delay between clusters + +### SSH Authentication + +Sandbox uses a standalone ed25519 keypair at `~/.orama/sandbox_key`, separate from the production wallet-derived keys. The public key is uploaded to Hetzner during setup and injected into every server at creation time. + +### Server Naming + +Servers: `sbx--` (e.g., `sbx-swift-falcon-1` through `sbx-swift-falcon-5`) + +### State Files + +Sandbox state is stored at `~/.orama/sandboxes/.yaml`. This tracks server IDs, IPs, roles, and cluster status. + +## Cost + +| Resource | Cost | Qty | Total | +|----------|------|-----|-------| +| CX22 (2 vCPU, 4GB) | €0.006/hr | 5 | €0.03/hr | +| Floating IPv4 | €0.005/hr | 2 | €0.01/hr | +| **Total** | | | **~€0.04/hr** | + +Servers are billed per hour. Floating IPs are billed as long as they exist (even unassigned). Destroy the sandbox when not in use to save on server costs. + +## Troubleshooting + +### "sandbox not configured" + +Run `orama sandbox setup` first. + +### "no binary archive found" + +Run `orama build` to create the binary archive. + +### "sandbox X is already active" + +Only one sandbox can be active at a time. Destroy it first: +```bash +orama sandbox destroy --name +``` + +### Server creation fails + +Check: +- Hetzner API token is valid and has read/write permissions +- You haven't hit Hetzner's server limit (default: 10 per project) +- The selected location has CX22 capacity + +### Genesis install fails + +SSH into the node to debug: +```bash +orama sandbox ssh 1 +journalctl -u orama-node -f +``` + +The sandbox will be left in "error" state. You can destroy and recreate it. + +### DNS not resolving + +1. Verify glue records are configured at your registrar +2. Check propagation: `dig NS sbx.dbrs.space @8.8.8.8` +3. Propagation can take 24-48 hours for new domains + +### Orphaned servers + +If `orama sandbox list` shows orphaned servers, delete them manually at [console.hetzner.cloud](https://console.hetzner.cloud). Sandbox servers are labeled `orama-sandbox=` for easy identification. diff --git a/pkg/cli/cmd/sandboxcmd/sandbox.go b/pkg/cli/cmd/sandboxcmd/sandbox.go new file mode 100644 index 0000000..f922053 --- /dev/null +++ b/pkg/cli/cmd/sandboxcmd/sandbox.go @@ -0,0 +1,121 @@ +package sandboxcmd + +import ( + "fmt" + "os" + + "github.com/DeBrosOfficial/network/pkg/cli/sandbox" + "github.com/spf13/cobra" +) + +// Cmd is the root command for sandbox operations. +var Cmd = &cobra.Command{ + Use: "sandbox", + Short: "Manage ephemeral Hetzner Cloud clusters for testing", + Long: `Spin up temporary 5-node Orama clusters on Hetzner Cloud for development and testing. + +Setup (one-time): + orama sandbox setup + +Usage: + orama sandbox create [--name ] Create a new 5-node cluster + orama sandbox destroy [--name ] Tear down a cluster + orama sandbox list List active sandboxes + orama sandbox status [--name ] Show cluster health + orama sandbox rollout [--name ] Build + push + rolling upgrade + orama sandbox ssh SSH into a sandbox node (1-5)`, +} + +var setupCmd = &cobra.Command{ + Use: "setup", + Short: "Interactive setup: Hetzner API key, domain, floating IPs, SSH key", + RunE: func(cmd *cobra.Command, args []string) error { + return sandbox.Setup() + }, +} + +var createCmd = &cobra.Command{ + Use: "create", + Short: "Create a new 5-node sandbox cluster (~5 min)", + RunE: func(cmd *cobra.Command, args []string) error { + name, _ := cmd.Flags().GetString("name") + return sandbox.Create(name) + }, +} + +var destroyCmd = &cobra.Command{ + Use: "destroy", + Short: "Destroy a sandbox cluster and release resources", + RunE: func(cmd *cobra.Command, args []string) error { + name, _ := cmd.Flags().GetString("name") + force, _ := cmd.Flags().GetBool("force") + return sandbox.Destroy(name, force) + }, +} + +var listCmd = &cobra.Command{ + Use: "list", + Short: "List active sandbox clusters", + RunE: func(cmd *cobra.Command, args []string) error { + return sandbox.List() + }, +} + +var statusCmd = &cobra.Command{ + Use: "status", + Short: "Show cluster health report", + RunE: func(cmd *cobra.Command, args []string) error { + name, _ := cmd.Flags().GetString("name") + return sandbox.Status(name) + }, +} + +var rolloutCmd = &cobra.Command{ + Use: "rollout", + Short: "Build + push + rolling upgrade to sandbox cluster", + RunE: func(cmd *cobra.Command, args []string) error { + name, _ := cmd.Flags().GetString("name") + return sandbox.Rollout(name) + }, +} + +var sshCmd = &cobra.Command{ + Use: "ssh ", + Short: "SSH into a sandbox node (1-5)", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + name, _ := cmd.Flags().GetString("name") + var nodeNum int + if _, err := fmt.Sscanf(args[0], "%d", &nodeNum); err != nil { + fmt.Fprintf(os.Stderr, "Invalid node number: %s (expected 1-5)\n", args[0]) + os.Exit(1) + } + return sandbox.SSHInto(name, nodeNum) + }, +} + +func init() { + // create flags + createCmd.Flags().String("name", "", "Sandbox name (random if not specified)") + + // destroy flags + destroyCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") + destroyCmd.Flags().Bool("force", false, "Skip confirmation") + + // status flags + statusCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") + + // rollout flags + rolloutCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") + + // ssh flags + sshCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") + + Cmd.AddCommand(setupCmd) + Cmd.AddCommand(createCmd) + Cmd.AddCommand(destroyCmd) + Cmd.AddCommand(listCmd) + Cmd.AddCommand(statusCmd) + Cmd.AddCommand(rolloutCmd) + Cmd.AddCommand(sshCmd) +} diff --git a/pkg/cli/sandbox/config.go b/pkg/cli/sandbox/config.go new file mode 100644 index 0000000..f1ba9ca --- /dev/null +++ b/pkg/cli/sandbox/config.go @@ -0,0 +1,153 @@ +package sandbox + +import ( + "fmt" + "os" + "path/filepath" + + "gopkg.in/yaml.v3" +) + +// Config holds sandbox configuration, stored at ~/.orama/sandbox.yaml. +type Config struct { + HetznerAPIToken string `yaml:"hetzner_api_token"` + Domain string `yaml:"domain"` + Location string `yaml:"location"` // Hetzner datacenter (default: fsn1) + ServerType string `yaml:"server_type"` // Hetzner server type (default: cx22) + FloatingIPs []FloatIP `yaml:"floating_ips"` + SSHKey SSHKeyConfig `yaml:"ssh_key"` + FirewallID int64 `yaml:"firewall_id,omitempty"` // Hetzner firewall resource ID +} + +// FloatIP holds a Hetzner floating IP reference. +type FloatIP struct { + ID int64 `yaml:"id"` + IP string `yaml:"ip"` +} + +// SSHKeyConfig holds SSH key paths and the Hetzner resource ID. +type SSHKeyConfig struct { + HetznerID int64 `yaml:"hetzner_id"` + PrivateKeyPath string `yaml:"private_key_path"` + PublicKeyPath string `yaml:"public_key_path"` +} + +// configDir returns ~/.orama/, creating it if needed. +func configDir() (string, error) { + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("get home directory: %w", err) + } + dir := filepath.Join(home, ".orama") + if err := os.MkdirAll(dir, 0700); err != nil { + return "", fmt.Errorf("create config directory: %w", err) + } + return dir, nil +} + +// configPath returns the full path to ~/.orama/sandbox.yaml. +func configPath() (string, error) { + dir, err := configDir() + if err != nil { + return "", err + } + return filepath.Join(dir, "sandbox.yaml"), nil +} + +// LoadConfig reads the sandbox config from ~/.orama/sandbox.yaml. +// Returns an error if the file doesn't exist (user must run setup first). +func LoadConfig() (*Config, error) { + path, err := configPath() + if err != nil { + return nil, err + } + + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("sandbox not configured, run: orama sandbox setup") + } + return nil, fmt.Errorf("read config: %w", err) + } + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parse config %s: %w", path, err) + } + + if err := cfg.validate(); err != nil { + return nil, fmt.Errorf("invalid config: %w", err) + } + + cfg.Defaults() + + return &cfg, nil +} + +// SaveConfig writes the sandbox config to ~/.orama/sandbox.yaml. +func SaveConfig(cfg *Config) error { + path, err := configPath() + if err != nil { + return err + } + + data, err := yaml.Marshal(cfg) + if err != nil { + return fmt.Errorf("marshal config: %w", err) + } + + if err := os.WriteFile(path, data, 0600); err != nil { + return fmt.Errorf("write config: %w", err) + } + + return nil +} + +// validate checks that required fields are present. +func (c *Config) validate() error { + if c.HetznerAPIToken == "" { + return fmt.Errorf("hetzner_api_token is required") + } + if c.Domain == "" { + return fmt.Errorf("domain is required") + } + if len(c.FloatingIPs) < 2 { + return fmt.Errorf("2 floating IPs required, got %d", len(c.FloatingIPs)) + } + if c.SSHKey.PrivateKeyPath == "" { + return fmt.Errorf("ssh_key.private_key_path is required") + } + return nil +} + +// Defaults fills in default values for optional fields. +func (c *Config) Defaults() { + if c.Location == "" { + c.Location = "fsn1" + } + if c.ServerType == "" { + c.ServerType = "cx22" + } +} + +// ExpandedPrivateKeyPath returns the absolute path to the SSH private key. +func (c *Config) ExpandedPrivateKeyPath() string { + return expandHome(c.SSHKey.PrivateKeyPath) +} + +// ExpandedPublicKeyPath returns the absolute path to the SSH public key. +func (c *Config) ExpandedPublicKeyPath() string { + return expandHome(c.SSHKey.PublicKeyPath) +} + +// expandHome replaces a leading ~ with the user's home directory. +func expandHome(path string) string { + if len(path) < 2 || path[:2] != "~/" { + return path + } + home, err := os.UserHomeDir() + if err != nil { + return path + } + return filepath.Join(home, path[2:]) +} diff --git a/pkg/cli/sandbox/create.go b/pkg/cli/sandbox/create.go new file mode 100644 index 0000000..292e1d9 --- /dev/null +++ b/pkg/cli/sandbox/create.go @@ -0,0 +1,554 @@ +package sandbox + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// Create orchestrates the creation of a new sandbox cluster. +func Create(name string) error { + cfg, err := LoadConfig() + if err != nil { + return err + } + + // Check for existing active sandbox + active, err := FindActiveSandbox() + if err != nil { + return err + } + if active != nil { + return fmt.Errorf("sandbox %q is already active (status: %s)\nDestroy it first: orama sandbox destroy --name %s", + active.Name, active.Status, active.Name) + } + + // Generate name if not provided + if name == "" { + name = GenerateName() + } + + fmt.Printf("Creating sandbox %q (%s, %d nodes)\n\n", name, cfg.Domain, 5) + + client := NewHetznerClient(cfg.HetznerAPIToken) + + state := &SandboxState{ + Name: name, + CreatedAt: time.Now().UTC(), + Domain: cfg.Domain, + Status: StatusCreating, + } + + // Phase 1: Provision servers + fmt.Println("Phase 1: Provisioning servers...") + if err := phase1ProvisionServers(client, cfg, state); err != nil { + cleanupFailedCreate(client, state) + return fmt.Errorf("provision servers: %w", err) + } + SaveState(state) + + // Phase 2: Assign floating IPs + fmt.Println("\nPhase 2: Assigning floating IPs...") + if err := phase2AssignFloatingIPs(client, cfg, state); err != nil { + return fmt.Errorf("assign floating IPs: %w", err) + } + SaveState(state) + + // Phase 3: Upload binary archive + fmt.Println("\nPhase 3: Uploading binary archive...") + if err := phase3UploadArchive(cfg, state); err != nil { + return fmt.Errorf("upload archive: %w", err) + } + + // Phase 4: Install genesis node + fmt.Println("\nPhase 4: Installing genesis node...") + tokens, err := phase4InstallGenesis(cfg, state) + if err != nil { + state.Status = StatusError + SaveState(state) + return fmt.Errorf("install genesis: %w", err) + } + + // Phase 5: Join remaining nodes + fmt.Println("\nPhase 5: Joining remaining nodes...") + if err := phase5JoinNodes(cfg, state, tokens); err != nil { + state.Status = StatusError + SaveState(state) + return fmt.Errorf("join nodes: %w", err) + } + + // Phase 6: Verify cluster + fmt.Println("\nPhase 6: Verifying cluster...") + phase6Verify(cfg, state) + + state.Status = StatusRunning + SaveState(state) + + printCreateSummary(cfg, state) + return nil +} + +// phase1ProvisionServers creates 5 Hetzner servers in parallel. +func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxState) error { + type serverResult struct { + index int + server *HetznerServer + err error + } + + results := make(chan serverResult, 5) + + for i := 0; i < 5; i++ { + go func(idx int) { + role := "node" + if idx < 2 { + role = "nameserver" + } + + serverName := fmt.Sprintf("sbx-%s-%d", state.Name, idx+1) + labels := map[string]string{ + "orama-sandbox": state.Name, + "orama-sandbox-role": role, + } + + req := CreateServerRequest{ + Name: serverName, + ServerType: cfg.ServerType, + Image: "ubuntu-24.04", + Location: cfg.Location, + SSHKeys: []int64{cfg.SSHKey.HetznerID}, + Labels: labels, + } + if cfg.FirewallID > 0 { + req.Firewalls = []struct { + Firewall int64 `json:"firewall"` + }{{Firewall: cfg.FirewallID}} + } + + srv, err := client.CreateServer(req) + results <- serverResult{index: idx, server: srv, err: err} + }(i) + } + + servers := make([]ServerState, 5) + for i := 0; i < 5; i++ { + r := <-results + if r.err != nil { + return fmt.Errorf("server %d: %w", r.index+1, r.err) + } + fmt.Printf(" Created %s (ID: %d, initializing...)\n", r.server.Name, r.server.ID) + role := "node" + if r.index < 2 { + role = "nameserver" + } + servers[r.index] = ServerState{ + ID: r.server.ID, + Name: r.server.Name, + Role: role, + } + } + + // Wait for all servers to reach "running" + fmt.Print(" Waiting for servers to boot...") + for i := range servers { + srv, err := client.WaitForServer(servers[i].ID, 3*time.Minute) + if err != nil { + return fmt.Errorf("wait for %s: %w", servers[i].Name, err) + } + servers[i].IP = srv.PublicNet.IPv4.IP + fmt.Print(".") + } + fmt.Println(" OK") + + // Assign floating IPs to nameserver entries + if len(cfg.FloatingIPs) >= 2 { + servers[0].FloatingIP = cfg.FloatingIPs[0].IP + servers[1].FloatingIP = cfg.FloatingIPs[1].IP + } + + state.Servers = servers + + for _, srv := range servers { + fmt.Printf(" %s: %s (%s)\n", srv.Name, srv.IP, srv.Role) + } + + return nil +} + +// phase2AssignFloatingIPs assigns floating IPs and configures loopback. +func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxState) error { + sshKeyPath := cfg.ExpandedPrivateKeyPath() + + for i := 0; i < 2 && i < len(cfg.FloatingIPs) && i < len(state.Servers); i++ { + fip := cfg.FloatingIPs[i] + srv := state.Servers[i] + + // Unassign if currently assigned elsewhere (ignore "not assigned" errors) + fmt.Printf(" Assigning %s to %s...\n", fip.IP, srv.Name) + if err := client.UnassignFloatingIP(fip.ID); err != nil { + // Log but continue — may fail if not currently assigned, which is fine + fmt.Printf(" Note: unassign %s: %v (continuing)\n", fip.IP, err) + } + + if err := client.AssignFloatingIP(fip.ID, srv.ID); err != nil { + return fmt.Errorf("assign %s to %s: %w", fip.IP, srv.Name, err) + } + + // Configure floating IP on the server's loopback interface + // Hetzner floating IPs require this: ip addr add /32 dev lo + node := inspector.Node{ + User: "root", + Host: srv.IP, + SSHKey: sshKeyPath, + } + + // Wait for SSH to be ready on freshly booted servers + if err := waitForSSH(node, 2*time.Minute); err != nil { + return fmt.Errorf("SSH not ready on %s: %w", srv.Name, err) + } + + cmd := fmt.Sprintf("ip addr add %s/32 dev lo 2>/dev/null || true", fip.IP) + if err := remotessh.RunSSHStreaming(node, cmd); err != nil { + return fmt.Errorf("configure loopback on %s: %w", srv.Name, err) + } + } + + return nil +} + +// waitForSSH polls until SSH is responsive on the node. +func waitForSSH(node inspector.Node, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + _, err := runSSHOutput(node, "echo ok") + if err == nil { + return nil + } + time.Sleep(3 * time.Second) + } + return fmt.Errorf("timeout after %s", timeout) +} + +// phase3UploadArchive builds (if needed) and uploads the binary archive to all nodes. +func phase3UploadArchive(cfg *Config, state *SandboxState) error { + // Find existing archive + archivePath := findNewestArchive() + if archivePath == "" { + fmt.Println(" No binary archive found, run `orama build` first") + return fmt.Errorf("no binary archive found in /tmp/ (run `orama build` first)") + } + + info, _ := os.Stat(archivePath) + fmt.Printf(" Archive: %s (%s)\n", filepath.Base(archivePath), formatBytes(info.Size())) + + sshKeyPath := cfg.ExpandedPrivateKeyPath() + remotePath := "/tmp/" + filepath.Base(archivePath) + + // Upload to all 5 nodes in parallel + var wg sync.WaitGroup + errs := make([]error, len(state.Servers)) + + for i, srv := range state.Servers { + wg.Add(1) + go func(idx int, srv ServerState) { + defer wg.Done() + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + + if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { + errs[idx] = fmt.Errorf("upload to %s: %w", srv.Name, err) + return + } + + // Extract + install CLI + extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama", + remotePath, remotePath) + if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { + errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) + return + } + fmt.Printf(" Uploaded to %s\n", srv.Name) + }(i, srv) + } + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } + } + + return nil +} + +// phase4InstallGenesis installs the genesis node and generates invite tokens. +func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) { + genesis := state.GenesisServer() + sshKeyPath := cfg.ExpandedPrivateKeyPath() + node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} + + // Install genesis + installCmd := fmt.Sprintf("orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + genesis.IP, cfg.Domain, cfg.Domain) + fmt.Printf(" Installing on %s (%s)...\n", genesis.Name, genesis.IP) + if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + return nil, fmt.Errorf("install genesis: %w", err) + } + + // Wait for RQLite leader + fmt.Print(" Waiting for RQLite leader...") + if err := waitForRQLiteHealth(node, 3*time.Minute); err != nil { + return nil, fmt.Errorf("genesis health: %w", err) + } + fmt.Println(" OK") + + // Generate invite tokens (one per remaining node) + fmt.Print(" Generating invite tokens...") + remaining := len(state.Servers) - 1 + tokens := make([]string, remaining) + + for i := 0; i < remaining; i++ { + token, err := generateInviteToken(node) + if err != nil { + return nil, fmt.Errorf("generate invite token %d: %w", i+1, err) + } + tokens[i] = token + fmt.Print(".") + } + fmt.Println(" OK") + + return tokens, nil +} + +// phase5JoinNodes joins the remaining 4 nodes to the cluster (serial). +func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error { + genesisIP := state.GenesisServer().IP + sshKeyPath := cfg.ExpandedPrivateKeyPath() + + for i := 1; i < len(state.Servers); i++ { + srv := state.Servers[i] + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + token := tokens[i-1] + + var installCmd string + if srv.Role == "nameserver" { + installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + genesisIP, token, srv.IP, cfg.Domain, cfg.Domain) + } else { + installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", + genesisIP, token, srv.IP, cfg.Domain) + } + + fmt.Printf(" [%d/%d] Joining %s (%s, %s)...\n", i, len(state.Servers)-1, srv.Name, srv.IP, srv.Role) + if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + return fmt.Errorf("join %s: %w", srv.Name, err) + } + + // Wait for node health before proceeding + fmt.Printf(" Waiting for %s health...", srv.Name) + if err := waitForRQLiteHealth(node, 3*time.Minute); err != nil { + fmt.Printf(" WARN: %v\n", err) + } else { + fmt.Println(" OK") + } + } + + return nil +} + +// phase6Verify runs a basic cluster health check. +func phase6Verify(cfg *Config, state *SandboxState) { + sshKeyPath := cfg.ExpandedPrivateKeyPath() + genesis := state.GenesisServer() + node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} + + // Check RQLite cluster + out, err := runSSHOutput(node, "curl -s http://localhost:5001/status | grep -o '\"state\":\"[^\"]*\"' | head -1") + if err == nil { + fmt.Printf(" RQLite: %s\n", strings.TrimSpace(out)) + } + + // Check DNS (if floating IPs configured, only with safe domain names) + if len(cfg.FloatingIPs) > 0 && isSafeDNSName(cfg.Domain) { + out, err = runSSHOutput(node, fmt.Sprintf("dig +short @%s test.%s 2>/dev/null || echo 'DNS not responding'", + cfg.FloatingIPs[0].IP, cfg.Domain)) + if err == nil { + fmt.Printf(" DNS: %s\n", strings.TrimSpace(out)) + } + } +} + +// waitForRQLiteHealth polls RQLite until it reports Leader or Follower state. +func waitForRQLiteHealth(node inspector.Node, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + out, err := runSSHOutput(node, "curl -sf http://localhost:5001/status 2>/dev/null | grep -o '\"state\":\"[^\"]*\"'") + if err == nil { + result := strings.TrimSpace(out) + if strings.Contains(result, "Leader") || strings.Contains(result, "Follower") { + return nil + } + } + time.Sleep(5 * time.Second) + } + return fmt.Errorf("timeout waiting for RQLite health after %s", timeout) +} + +// generateInviteToken runs `orama node invite` on the node and parses the token. +func generateInviteToken(node inspector.Node) (string, error) { + out, err := runSSHOutput(node, "orama node invite --expiry 1h 2>&1") + if err != nil { + return "", fmt.Errorf("invite command failed: %w", err) + } + + // Parse token from output — the invite command outputs: + // "sudo orama install --join https://... --token <64-char-hex> --vps-ip ..." + // Look for the --token flag value first + fields := strings.Fields(out) + for i, field := range fields { + if field == "--token" && i+1 < len(fields) { + candidate := fields[i+1] + if len(candidate) == 64 && isHex(candidate) { + return candidate, nil + } + } + } + + // Fallback: look for any standalone 64-char hex string + for _, word := range fields { + if len(word) == 64 && isHex(word) { + return word, nil + } + } + + return "", fmt.Errorf("could not parse token from invite output:\n%s", out) +} + +// isSafeDNSName returns true if the string is safe to use in shell commands. +func isSafeDNSName(s string) bool { + for _, c := range s { + if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' || c == '-') { + return false + } + } + return len(s) > 0 +} + +// isHex returns true if s contains only hex characters. +func isHex(s string) bool { + for _, c := range s { + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + return false + } + } + return true +} + +// runSSHOutput runs a command via SSH and returns stdout as a string. +func runSSHOutput(node inspector.Node, command string) (string, error) { + args := []string{ + "ssh", "-n", + "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=10", + "-o", "BatchMode=yes", + "-i", node.SSHKey, + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + } + + out, err := execCommand(args[0], args[1:]...) + return string(out), err +} + +// execCommand runs a command and returns its output. +func execCommand(name string, args ...string) ([]byte, error) { + return exec.Command(name, args...).Output() +} + +// findNewestArchive finds the newest binary archive in /tmp/. +func findNewestArchive() string { + entries, err := os.ReadDir("/tmp") + if err != nil { + return "" + } + + var best string + var bestMod int64 + for _, entry := range entries { + name := entry.Name() + if strings.HasPrefix(name, "orama-") && strings.Contains(name, "-linux-") && strings.HasSuffix(name, ".tar.gz") { + info, err := entry.Info() + if err != nil { + continue + } + if info.ModTime().Unix() > bestMod { + best = filepath.Join("/tmp", name) + bestMod = info.ModTime().Unix() + } + } + } + + return best +} + +// formatBytes formats a byte count as human-readable. +func formatBytes(b int64) string { + const unit = 1024 + if b < unit { + return fmt.Sprintf("%d B", b) + } + div, exp := int64(unit), 0 + for n := b / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp]) +} + +// printCreateSummary prints the cluster summary after creation. +func printCreateSummary(cfg *Config, state *SandboxState) { + fmt.Printf("\nSandbox %q ready (%d nodes)\n", state.Name, len(state.Servers)) + fmt.Println() + + fmt.Println("Nameservers:") + for _, srv := range state.NameserverNodes() { + floating := "" + if srv.FloatingIP != "" { + floating = fmt.Sprintf(" (floating: %s)", srv.FloatingIP) + } + fmt.Printf(" %s: %s%s\n", srv.Name, srv.IP, floating) + } + + fmt.Println("Nodes:") + for _, srv := range state.RegularNodes() { + fmt.Printf(" %s: %s\n", srv.Name, srv.IP) + } + + fmt.Println() + fmt.Printf("Domain: %s\n", cfg.Domain) + fmt.Printf("Gateway: https://%s\n", cfg.Domain) + fmt.Println() + fmt.Println("SSH: orama sandbox ssh 1") + fmt.Println("Destroy: orama sandbox destroy") +} + +// cleanupFailedCreate deletes any servers that were created during a failed provision. +func cleanupFailedCreate(client *HetznerClient, state *SandboxState) { + if len(state.Servers) == 0 { + return + } + fmt.Println("\nCleaning up failed creation...") + for _, srv := range state.Servers { + if srv.ID > 0 { + client.DeleteServer(srv.ID) + fmt.Printf(" Deleted %s\n", srv.Name) + } + } + DeleteState(state.Name) +} diff --git a/pkg/cli/sandbox/destroy.go b/pkg/cli/sandbox/destroy.go new file mode 100644 index 0000000..b532a18 --- /dev/null +++ b/pkg/cli/sandbox/destroy.go @@ -0,0 +1,122 @@ +package sandbox + +import ( + "bufio" + "fmt" + "os" + "strings" + "sync" +) + +// Destroy tears down a sandbox cluster. +func Destroy(name string, force bool) error { + cfg, err := LoadConfig() + if err != nil { + return err + } + + // Resolve sandbox name + state, err := resolveSandbox(name) + if err != nil { + return err + } + + // Confirm destruction + if !force { + reader := bufio.NewReader(os.Stdin) + fmt.Printf("Destroy sandbox %q? This deletes %d servers. [y/N]: ", state.Name, len(state.Servers)) + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(strings.ToLower(choice)) + if choice != "y" && choice != "yes" { + fmt.Println("Aborted.") + return nil + } + } + + state.Status = StatusDestroying + SaveState(state) // best-effort status update + + client := NewHetznerClient(cfg.HetznerAPIToken) + + // Step 1: Unassign floating IPs from nameserver nodes + fmt.Println("Unassigning floating IPs...") + for _, srv := range state.NameserverNodes() { + if srv.FloatingIP == "" { + continue + } + // Find the floating IP ID from config + for _, fip := range cfg.FloatingIPs { + if fip.IP == srv.FloatingIP { + if err := client.UnassignFloatingIP(fip.ID); err != nil { + fmt.Fprintf(os.Stderr, " Warning: could not unassign floating IP %s: %v\n", fip.IP, err) + } else { + fmt.Printf(" Unassigned %s from %s\n", fip.IP, srv.Name) + } + break + } + } + } + + // Step 2: Delete all servers in parallel + fmt.Printf("Deleting %d servers...\n", len(state.Servers)) + var wg sync.WaitGroup + var mu sync.Mutex + var failed []string + + for _, srv := range state.Servers { + wg.Add(1) + go func(srv ServerState) { + defer wg.Done() + if err := client.DeleteServer(srv.ID); err != nil { + // Treat 404 as already deleted (idempotent) + if strings.Contains(err.Error(), "404") || strings.Contains(err.Error(), "not found") { + fmt.Printf(" %s (ID %d): already deleted\n", srv.Name, srv.ID) + } else { + mu.Lock() + failed = append(failed, fmt.Sprintf("%s (ID %d): %v", srv.Name, srv.ID, err)) + mu.Unlock() + fmt.Fprintf(os.Stderr, " Warning: failed to delete %s: %v\n", srv.Name, err) + } + } else { + fmt.Printf(" Deleted %s (ID %d)\n", srv.Name, srv.ID) + } + }(srv) + } + wg.Wait() + + if len(failed) > 0 { + fmt.Fprintf(os.Stderr, "\nFailed to delete %d server(s):\n", len(failed)) + for _, f := range failed { + fmt.Fprintf(os.Stderr, " %s\n", f) + } + fmt.Fprintf(os.Stderr, "\nManual cleanup: delete servers at https://console.hetzner.cloud\n") + state.Status = StatusError + SaveState(state) + return fmt.Errorf("failed to delete %d server(s)", len(failed)) + } + + // Step 3: Remove state file + if err := DeleteState(state.Name); err != nil { + return fmt.Errorf("delete state: %w", err) + } + + fmt.Printf("\nSandbox %q destroyed (%d servers deleted)\n", state.Name, len(state.Servers)) + return nil +} + +// resolveSandbox finds a sandbox by name or returns the active one. +func resolveSandbox(name string) (*SandboxState, error) { + if name != "" { + return LoadState(name) + } + + // Find the active sandbox + active, err := FindActiveSandbox() + if err != nil { + return nil, err + } + if active == nil { + return nil, fmt.Errorf("no active sandbox found, specify --name") + } + return active, nil +} diff --git a/pkg/cli/sandbox/hetzner.go b/pkg/cli/sandbox/hetzner.go new file mode 100644 index 0000000..742349e --- /dev/null +++ b/pkg/cli/sandbox/hetzner.go @@ -0,0 +1,438 @@ +package sandbox + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "strconv" + "time" +) + +const hetznerBaseURL = "https://api.hetzner.cloud/v1" + +// HetznerClient is a minimal Hetzner Cloud API client. +type HetznerClient struct { + token string + httpClient *http.Client +} + +// NewHetznerClient creates a new Hetzner API client. +func NewHetznerClient(token string) *HetznerClient { + return &HetznerClient{ + token: token, + httpClient: &http.Client{ + Timeout: 30 * time.Second, + }, + } +} + +// --- Request helpers --- + +func (c *HetznerClient) doRequest(method, path string, body interface{}) ([]byte, int, error) { + var bodyReader io.Reader + if body != nil { + data, err := json.Marshal(body) + if err != nil { + return nil, 0, fmt.Errorf("marshal request body: %w", err) + } + bodyReader = bytes.NewReader(data) + } + + req, err := http.NewRequest(method, hetznerBaseURL+path, bodyReader) + if err != nil { + return nil, 0, fmt.Errorf("create request: %w", err) + } + + req.Header.Set("Authorization", "Bearer "+c.token) + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, 0, fmt.Errorf("request %s %s: %w", method, path, err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, resp.StatusCode, fmt.Errorf("read response: %w", err) + } + + return respBody, resp.StatusCode, nil +} + +func (c *HetznerClient) get(path string) ([]byte, error) { + body, status, err := c.doRequest("GET", path, nil) + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, parseHetznerError(body, status) + } + return body, nil +} + +func (c *HetznerClient) post(path string, payload interface{}) ([]byte, error) { + body, status, err := c.doRequest("POST", path, payload) + if err != nil { + return nil, err + } + if status < 200 || status >= 300 { + return nil, parseHetznerError(body, status) + } + return body, nil +} + +func (c *HetznerClient) delete(path string) error { + _, status, err := c.doRequest("DELETE", path, nil) + if err != nil { + return err + } + if status < 200 || status >= 300 { + return fmt.Errorf("delete %s: HTTP %d", path, status) + } + return nil +} + +// --- API types --- + +// HetznerServer represents a Hetzner Cloud server. +type HetznerServer struct { + ID int64 `json:"id"` + Name string `json:"name"` + Status string `json:"status"` // initializing, running, off, ... + PublicNet HetznerPublicNet `json:"public_net"` + Labels map[string]string `json:"labels"` + ServerType struct { + Name string `json:"name"` + } `json:"server_type"` +} + +// HetznerPublicNet holds public networking info for a server. +type HetznerPublicNet struct { + IPv4 struct { + IP string `json:"ip"` + } `json:"ipv4"` +} + +// HetznerFloatingIP represents a Hetzner floating IP. +type HetznerFloatingIP struct { + ID int64 `json:"id"` + IP string `json:"ip"` + Server *int64 `json:"server"` // nil if unassigned + Labels map[string]string `json:"labels"` + Description string `json:"description"` + HomeLocation struct { + Name string `json:"name"` + } `json:"home_location"` +} + +// HetznerSSHKey represents a Hetzner SSH key. +type HetznerSSHKey struct { + ID int64 `json:"id"` + Name string `json:"name"` + Fingerprint string `json:"fingerprint"` + PublicKey string `json:"public_key"` +} + +// HetznerFirewall represents a Hetzner firewall. +type HetznerFirewall struct { + ID int64 `json:"id"` + Name string `json:"name"` + Rules []HetznerFWRule `json:"rules"` + Labels map[string]string `json:"labels"` +} + +// HetznerFWRule represents a firewall rule. +type HetznerFWRule struct { + Direction string `json:"direction"` + Protocol string `json:"protocol"` + Port string `json:"port"` + SourceIPs []string `json:"source_ips"` + Description string `json:"description,omitempty"` +} + +// HetznerError represents an API error response. +type HetznerError struct { + Error struct { + Code string `json:"code"` + Message string `json:"message"` + } `json:"error"` +} + +func parseHetznerError(body []byte, status int) error { + var he HetznerError + if err := json.Unmarshal(body, &he); err == nil && he.Error.Message != "" { + return fmt.Errorf("hetzner API error (HTTP %d): %s — %s", status, he.Error.Code, he.Error.Message) + } + return fmt.Errorf("hetzner API error: HTTP %d", status) +} + +// --- Server operations --- + +// CreateServerRequest holds parameters for server creation. +type CreateServerRequest struct { + Name string `json:"name"` + ServerType string `json:"server_type"` + Image string `json:"image"` + Location string `json:"location"` + SSHKeys []int64 `json:"ssh_keys"` + Labels map[string]string `json:"labels"` + Firewalls []struct { + Firewall int64 `json:"firewall"` + } `json:"firewalls,omitempty"` +} + +// CreateServer creates a new server and returns it. +func (c *HetznerClient) CreateServer(req CreateServerRequest) (*HetznerServer, error) { + body, err := c.post("/servers", req) + if err != nil { + return nil, fmt.Errorf("create server %q: %w", req.Name, err) + } + + var resp struct { + Server HetznerServer `json:"server"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse create server response: %w", err) + } + + return &resp.Server, nil +} + +// GetServer retrieves a server by ID. +func (c *HetznerClient) GetServer(id int64) (*HetznerServer, error) { + body, err := c.get("/servers/" + strconv.FormatInt(id, 10)) + if err != nil { + return nil, fmt.Errorf("get server %d: %w", id, err) + } + + var resp struct { + Server HetznerServer `json:"server"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse server response: %w", err) + } + + return &resp.Server, nil +} + +// DeleteServer deletes a server by ID. +func (c *HetznerClient) DeleteServer(id int64) error { + return c.delete("/servers/" + strconv.FormatInt(id, 10)) +} + +// ListServersByLabel lists servers filtered by a label selector. +func (c *HetznerClient) ListServersByLabel(selector string) ([]HetznerServer, error) { + body, err := c.get("/servers?label_selector=" + selector) + if err != nil { + return nil, fmt.Errorf("list servers: %w", err) + } + + var resp struct { + Servers []HetznerServer `json:"servers"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse servers response: %w", err) + } + + return resp.Servers, nil +} + +// WaitForServer polls until the server reaches "running" status. +func (c *HetznerClient) WaitForServer(id int64, timeout time.Duration) (*HetznerServer, error) { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + srv, err := c.GetServer(id) + if err != nil { + return nil, err + } + if srv.Status == "running" { + return srv, nil + } + time.Sleep(3 * time.Second) + } + return nil, fmt.Errorf("server %d did not reach running state within %s", id, timeout) +} + +// --- Floating IP operations --- + +// CreateFloatingIP creates a new floating IP. +func (c *HetznerClient) CreateFloatingIP(location, description string, labels map[string]string) (*HetznerFloatingIP, error) { + payload := map[string]interface{}{ + "type": "ipv4", + "home_location": location, + "description": description, + "labels": labels, + } + + body, err := c.post("/floating_ips", payload) + if err != nil { + return nil, fmt.Errorf("create floating IP: %w", err) + } + + var resp struct { + FloatingIP HetznerFloatingIP `json:"floating_ip"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse floating IP response: %w", err) + } + + return &resp.FloatingIP, nil +} + +// ListFloatingIPsByLabel lists floating IPs filtered by label. +func (c *HetznerClient) ListFloatingIPsByLabel(selector string) ([]HetznerFloatingIP, error) { + body, err := c.get("/floating_ips?label_selector=" + selector) + if err != nil { + return nil, fmt.Errorf("list floating IPs: %w", err) + } + + var resp struct { + FloatingIPs []HetznerFloatingIP `json:"floating_ips"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse floating IPs response: %w", err) + } + + return resp.FloatingIPs, nil +} + +// AssignFloatingIP assigns a floating IP to a server. +func (c *HetznerClient) AssignFloatingIP(floatingIPID, serverID int64) error { + payload := map[string]int64{"server": serverID} + _, err := c.post("/floating_ips/"+strconv.FormatInt(floatingIPID, 10)+"/actions/assign", payload) + if err != nil { + return fmt.Errorf("assign floating IP %d to server %d: %w", floatingIPID, serverID, err) + } + return nil +} + +// UnassignFloatingIP removes a floating IP assignment. +func (c *HetznerClient) UnassignFloatingIP(floatingIPID int64) error { + _, err := c.post("/floating_ips/"+strconv.FormatInt(floatingIPID, 10)+"/actions/unassign", struct{}{}) + if err != nil { + return fmt.Errorf("unassign floating IP %d: %w", floatingIPID, err) + } + return nil +} + +// --- SSH Key operations --- + +// UploadSSHKey uploads a public key to Hetzner. +func (c *HetznerClient) UploadSSHKey(name, publicKey string) (*HetznerSSHKey, error) { + payload := map[string]string{ + "name": name, + "public_key": publicKey, + } + + body, err := c.post("/ssh_keys", payload) + if err != nil { + return nil, fmt.Errorf("upload SSH key: %w", err) + } + + var resp struct { + SSHKey HetznerSSHKey `json:"ssh_key"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse SSH key response: %w", err) + } + + return &resp.SSHKey, nil +} + +// GetSSHKey retrieves an SSH key by ID. +func (c *HetznerClient) GetSSHKey(id int64) (*HetznerSSHKey, error) { + body, err := c.get("/ssh_keys/" + strconv.FormatInt(id, 10)) + if err != nil { + return nil, fmt.Errorf("get SSH key %d: %w", id, err) + } + + var resp struct { + SSHKey HetznerSSHKey `json:"ssh_key"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse SSH key response: %w", err) + } + + return &resp.SSHKey, nil +} + +// --- Firewall operations --- + +// CreateFirewall creates a firewall with the given rules. +func (c *HetznerClient) CreateFirewall(name string, rules []HetznerFWRule, labels map[string]string) (*HetznerFirewall, error) { + payload := map[string]interface{}{ + "name": name, + "rules": rules, + "labels": labels, + } + + body, err := c.post("/firewalls", payload) + if err != nil { + return nil, fmt.Errorf("create firewall: %w", err) + } + + var resp struct { + Firewall HetznerFirewall `json:"firewall"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse firewall response: %w", err) + } + + return &resp.Firewall, nil +} + +// ListFirewallsByLabel lists firewalls filtered by label. +func (c *HetznerClient) ListFirewallsByLabel(selector string) ([]HetznerFirewall, error) { + body, err := c.get("/firewalls?label_selector=" + selector) + if err != nil { + return nil, fmt.Errorf("list firewalls: %w", err) + } + + var resp struct { + Firewalls []HetznerFirewall `json:"firewalls"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse firewalls response: %w", err) + } + + return resp.Firewalls, nil +} + +// DeleteFirewall deletes a firewall by ID. +func (c *HetznerClient) DeleteFirewall(id int64) error { + return c.delete("/firewalls/" + strconv.FormatInt(id, 10)) +} + +// --- Validation --- + +// ValidateToken checks if the API token is valid by making a simple request. +func (c *HetznerClient) ValidateToken() error { + _, err := c.get("/servers?per_page=1") + if err != nil { + return fmt.Errorf("invalid Hetzner API token: %w", err) + } + return nil +} + +// --- Sandbox firewall rules --- + +// SandboxFirewallRules returns the standard firewall rules for sandbox nodes. +func SandboxFirewallRules() []HetznerFWRule { + allIPv4 := []string{"0.0.0.0/0"} + allIPv6 := []string{"::/0"} + allIPs := append(allIPv4, allIPv6...) + + return []HetznerFWRule{ + {Direction: "in", Protocol: "tcp", Port: "22", SourceIPs: allIPs, Description: "SSH"}, + {Direction: "in", Protocol: "tcp", Port: "53", SourceIPs: allIPs, Description: "DNS TCP"}, + {Direction: "in", Protocol: "udp", Port: "53", SourceIPs: allIPs, Description: "DNS UDP"}, + {Direction: "in", Protocol: "tcp", Port: "80", SourceIPs: allIPs, Description: "HTTP"}, + {Direction: "in", Protocol: "tcp", Port: "443", SourceIPs: allIPs, Description: "HTTPS"}, + {Direction: "in", Protocol: "udp", Port: "51820", SourceIPs: allIPs, Description: "WireGuard"}, + } +} diff --git a/pkg/cli/sandbox/hetzner_test.go b/pkg/cli/sandbox/hetzner_test.go new file mode 100644 index 0000000..a59f5e8 --- /dev/null +++ b/pkg/cli/sandbox/hetzner_test.go @@ -0,0 +1,303 @@ +package sandbox + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestValidateToken_Success(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Header.Get("Authorization") != "Bearer test-token" { + t.Errorf("unexpected auth header: %s", r.Header.Get("Authorization")) + } + w.WriteHeader(200) + json.NewEncoder(w).Encode(map[string]interface{}{"servers": []interface{}{}}) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + if err := client.ValidateToken(); err != nil { + t.Errorf("ValidateToken() error = %v, want nil", err) + } +} + +func TestValidateToken_InvalidToken(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(401) + json.NewEncoder(w).Encode(map[string]interface{}{ + "error": map[string]string{ + "code": "unauthorized", + "message": "unable to authenticate", + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "bad-token") + if err := client.ValidateToken(); err == nil { + t.Error("ValidateToken() expected error for invalid token") + } +} + +func TestCreateServer(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" || r.URL.Path != "/v1/servers" { + t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) + } + + var req CreateServerRequest + json.NewDecoder(r.Body).Decode(&req) + + if req.Name != "sbx-test-1" { + t.Errorf("unexpected server name: %s", req.Name) + } + if req.ServerType != "cx22" { + t.Errorf("unexpected server type: %s", req.ServerType) + } + + w.WriteHeader(201) + json.NewEncoder(w).Encode(map[string]interface{}{ + "server": map[string]interface{}{ + "id": 12345, + "name": req.Name, + "status": "initializing", + "public_net": map[string]interface{}{ + "ipv4": map[string]string{"ip": "1.2.3.4"}, + }, + "labels": req.Labels, + "server_type": map[string]string{"name": "cx22"}, + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + server, err := client.CreateServer(CreateServerRequest{ + Name: "sbx-test-1", + ServerType: "cx22", + Image: "ubuntu-24.04", + Location: "fsn1", + SSHKeys: []int64{1}, + Labels: map[string]string{"orama-sandbox": "test"}, + }) + + if err != nil { + t.Fatalf("CreateServer() error = %v", err) + } + if server.ID != 12345 { + t.Errorf("server ID = %d, want 12345", server.ID) + } + if server.Name != "sbx-test-1" { + t.Errorf("server name = %s, want sbx-test-1", server.Name) + } + if server.PublicNet.IPv4.IP != "1.2.3.4" { + t.Errorf("server IP = %s, want 1.2.3.4", server.PublicNet.IPv4.IP) + } +} + +func TestDeleteServer(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "DELETE" || r.URL.Path != "/v1/servers/12345" { + t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) + } + w.WriteHeader(200) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + if err := client.DeleteServer(12345); err != nil { + t.Errorf("DeleteServer() error = %v", err) + } +} + +func TestListServersByLabel(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Query().Get("label_selector") != "orama-sandbox=test" { + t.Errorf("unexpected label_selector: %s", r.URL.Query().Get("label_selector")) + } + w.WriteHeader(200) + json.NewEncoder(w).Encode(map[string]interface{}{ + "servers": []map[string]interface{}{ + {"id": 1, "name": "sbx-test-1", "status": "running", "public_net": map[string]interface{}{"ipv4": map[string]string{"ip": "1.1.1.1"}}, "server_type": map[string]string{"name": "cx22"}}, + {"id": 2, "name": "sbx-test-2", "status": "running", "public_net": map[string]interface{}{"ipv4": map[string]string{"ip": "2.2.2.2"}}, "server_type": map[string]string{"name": "cx22"}}, + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + servers, err := client.ListServersByLabel("orama-sandbox=test") + if err != nil { + t.Fatalf("ListServersByLabel() error = %v", err) + } + if len(servers) != 2 { + t.Errorf("got %d servers, want 2", len(servers)) + } +} + +func TestWaitForServer_AlreadyRunning(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(200) + json.NewEncoder(w).Encode(map[string]interface{}{ + "server": map[string]interface{}{ + "id": 1, + "name": "test", + "status": "running", + "public_net": map[string]interface{}{ + "ipv4": map[string]string{"ip": "1.1.1.1"}, + }, + "server_type": map[string]string{"name": "cx22"}, + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + server, err := client.WaitForServer(1, 5*time.Second) + if err != nil { + t.Fatalf("WaitForServer() error = %v", err) + } + if server.Status != "running" { + t.Errorf("server status = %s, want running", server.Status) + } +} + +func TestAssignFloatingIP(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" || r.URL.Path != "/v1/floating_ips/100/actions/assign" { + t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) + } + + var body map[string]int64 + json.NewDecoder(r.Body).Decode(&body) + if body["server"] != 200 { + t.Errorf("unexpected server ID: %d", body["server"]) + } + + w.WriteHeader(200) + json.NewEncoder(w).Encode(map[string]interface{}{"action": map[string]interface{}{"id": 1, "status": "running"}}) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + if err := client.AssignFloatingIP(100, 200); err != nil { + t.Errorf("AssignFloatingIP() error = %v", err) + } +} + +func TestUploadSSHKey(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" || r.URL.Path != "/v1/ssh_keys" { + t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) + } + w.WriteHeader(201) + json.NewEncoder(w).Encode(map[string]interface{}{ + "ssh_key": map[string]interface{}{ + "id": 42, + "name": "orama-sandbox", + "fingerprint": "aa:bb:cc:dd", + "public_key": "ssh-ed25519 AAAA...", + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + key, err := client.UploadSSHKey("orama-sandbox", "ssh-ed25519 AAAA...") + if err != nil { + t.Fatalf("UploadSSHKey() error = %v", err) + } + if key.ID != 42 { + t.Errorf("key ID = %d, want 42", key.ID) + } +} + +func TestCreateFirewall(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != "POST" || r.URL.Path != "/v1/firewalls" { + t.Errorf("unexpected request: %s %s", r.Method, r.URL.Path) + } + w.WriteHeader(201) + json.NewEncoder(w).Encode(map[string]interface{}{ + "firewall": map[string]interface{}{ + "id": 99, + "name": "orama-sandbox", + }, + }) + })) + defer srv.Close() + + client := newTestClient(srv, "test-token") + fw, err := client.CreateFirewall("orama-sandbox", SandboxFirewallRules(), map[string]string{"orama-sandbox": "infra"}) + if err != nil { + t.Fatalf("CreateFirewall() error = %v", err) + } + if fw.ID != 99 { + t.Errorf("firewall ID = %d, want 99", fw.ID) + } +} + +func TestSandboxFirewallRules(t *testing.T) { + rules := SandboxFirewallRules() + if len(rules) != 6 { + t.Errorf("got %d rules, want 6", len(rules)) + } + + expectedPorts := map[string]bool{"22": false, "53": false, "80": false, "443": false, "51820": false} + for _, r := range rules { + expectedPorts[r.Port] = true + if r.Direction != "in" { + t.Errorf("rule %s direction = %s, want in", r.Port, r.Direction) + } + } + for port, seen := range expectedPorts { + if !seen { + t.Errorf("missing firewall rule for port %s", port) + } + } +} + +func TestParseHetznerError(t *testing.T) { + body := `{"error":{"code":"uniqueness_error","message":"server name already used"}}` + err := parseHetznerError([]byte(body), 409) + if err == nil { + t.Fatal("expected error") + } + expected := "hetzner API error (HTTP 409): uniqueness_error — server name already used" + if err.Error() != expected { + t.Errorf("error = %q, want %q", err.Error(), expected) + } +} + +// newTestClient creates a HetznerClient pointing at a test server. +func newTestClient(ts *httptest.Server, token string) *HetznerClient { + client := NewHetznerClient(token) + // Override the base URL by using a custom transport + client.httpClient = ts.Client() + // We need to override the base URL — wrap the transport + origTransport := client.httpClient.Transport + client.httpClient.Transport = &testTransport{ + base: origTransport, + testURL: ts.URL, + } + return client +} + +// testTransport rewrites requests to point at the test server. +type testTransport struct { + base http.RoundTripper + testURL string +} + +func (t *testTransport) RoundTrip(req *http.Request) (*http.Response, error) { + // Rewrite the URL to point at the test server + req.URL.Scheme = "http" + req.URL.Host = t.testURL[len("http://"):] + if t.base != nil { + return t.base.RoundTrip(req) + } + return http.DefaultTransport.RoundTrip(req) +} diff --git a/pkg/cli/sandbox/names.go b/pkg/cli/sandbox/names.go new file mode 100644 index 0000000..81a54f8 --- /dev/null +++ b/pkg/cli/sandbox/names.go @@ -0,0 +1,26 @@ +package sandbox + +import ( + "math/rand" +) + +var adjectives = []string{ + "swift", "bright", "calm", "dark", "eager", + "fair", "gold", "hazy", "iron", "jade", + "keen", "lush", "mild", "neat", "opal", + "pure", "raw", "sage", "teal", "warm", +} + +var nouns = []string{ + "falcon", "beacon", "cedar", "delta", "ember", + "frost", "grove", "haven", "ivory", "jewel", + "knot", "latch", "maple", "nexus", "orbit", + "prism", "reef", "spark", "tide", "vault", +} + +// GenerateName produces a random adjective-noun name like "swift-falcon". +func GenerateName() string { + adj := adjectives[rand.Intn(len(adjectives))] + noun := nouns[rand.Intn(len(nouns))] + return adj + "-" + noun +} diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go new file mode 100644 index 0000000..8c7ccfd --- /dev/null +++ b/pkg/cli/sandbox/rollout.go @@ -0,0 +1,137 @@ +package sandbox + +import ( + "fmt" + "os" + "path/filepath" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// Rollout builds, pushes, and performs a rolling upgrade on a sandbox cluster. +func Rollout(name string) error { + cfg, err := LoadConfig() + if err != nil { + return err + } + + state, err := resolveSandbox(name) + if err != nil { + return err + } + + sshKeyPath := cfg.ExpandedPrivateKeyPath() + fmt.Printf("Rolling out to sandbox %q (%d nodes)\n\n", state.Name, len(state.Servers)) + + // Step 1: Find or require binary archive + archivePath := findNewestArchive() + if archivePath == "" { + return fmt.Errorf("no binary archive found in /tmp/ (run `orama build` first)") + } + + info, _ := os.Stat(archivePath) + fmt.Printf("Archive: %s (%s)\n\n", filepath.Base(archivePath), formatBytes(info.Size())) + + // Step 2: Push archive to all nodes + fmt.Println("Pushing archive to all nodes...") + remotePath := "/tmp/" + filepath.Base(archivePath) + + for i, srv := range state.Servers { + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + + fmt.Printf(" [%d/%d] Uploading to %s...\n", i+1, len(state.Servers), srv.Name) + if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { + return fmt.Errorf("upload to %s: %w", srv.Name, err) + } + + // Extract archive + extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", + remotePath, remotePath) + if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { + return fmt.Errorf("extract on %s: %w", srv.Name, err) + } + } + + // Step 3: Rolling upgrade — followers first, leader last + fmt.Println("\nRolling upgrade (followers first, leader last)...") + + // Find the leader + leaderIdx := findLeaderIndex(state, sshKeyPath) + if leaderIdx < 0 { + fmt.Fprintf(os.Stderr, " Warning: could not detect RQLite leader, upgrading in order\n") + } + + // Upgrade non-leaders first + for i, srv := range state.Servers { + if i == leaderIdx { + continue // skip leader, do it last + } + if err := upgradeNode(srv, sshKeyPath, i+1, len(state.Servers)); err != nil { + return err + } + // Wait between nodes + if i < len(state.Servers)-1 { + fmt.Printf(" Waiting 15s before next node...\n") + time.Sleep(15 * time.Second) + } + } + + // Upgrade leader last + if leaderIdx >= 0 { + srv := state.Servers[leaderIdx] + if err := upgradeNode(srv, sshKeyPath, len(state.Servers), len(state.Servers)); err != nil { + return err + } + } + + fmt.Printf("\nRollout complete for sandbox %q\n", state.Name) + return nil +} + +// findLeaderIndex returns the index of the RQLite leader node, or -1 if unknown. +func findLeaderIndex(state *SandboxState, sshKeyPath string) int { + for i, srv := range state.Servers { + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + out, err := runSSHOutput(node, "curl -sf http://localhost:5001/status 2>/dev/null | grep -o '\"state\":\"[^\"]*\"'") + if err == nil && contains(out, "Leader") { + return i + } + } + return -1 +} + +// upgradeNode performs `orama node upgrade --restart` on a single node. +func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + + fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP) + if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart"); err != nil { + return fmt.Errorf("upgrade %s: %w", srv.Name, err) + } + + // Wait for health + fmt.Printf(" Checking health...") + if err := waitForRQLiteHealth(node, 2*time.Minute); err != nil { + fmt.Printf(" WARN: %v\n", err) + } else { + fmt.Println(" OK") + } + + return nil +} + +// contains checks if s contains substr. +func contains(s, substr string) bool { + return len(s) >= len(substr) && findSubstring(s, substr) +} + +func findSubstring(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +} diff --git a/pkg/cli/sandbox/setup.go b/pkg/cli/sandbox/setup.go new file mode 100644 index 0000000..d4d07c1 --- /dev/null +++ b/pkg/cli/sandbox/setup.go @@ -0,0 +1,319 @@ +package sandbox + +import ( + "bufio" + "crypto/ed25519" + "crypto/rand" + "encoding/pem" + "fmt" + "os" + "os/exec" + "strings" + + "golang.org/x/crypto/ssh" +) + +// Setup runs the interactive sandbox setup wizard. +func Setup() error { + fmt.Println("Orama Sandbox Setup") + fmt.Println("====================") + fmt.Println() + + reader := bufio.NewReader(os.Stdin) + + // Step 1: Hetzner API token + fmt.Print("Hetzner Cloud API token: ") + token, err := reader.ReadString('\n') + if err != nil { + return fmt.Errorf("read token: %w", err) + } + token = strings.TrimSpace(token) + if token == "" { + return fmt.Errorf("API token is required") + } + + fmt.Print(" Validating token... ") + client := NewHetznerClient(token) + if err := client.ValidateToken(); err != nil { + fmt.Println("FAILED") + return fmt.Errorf("invalid token: %w", err) + } + fmt.Println("OK") + fmt.Println() + + // Step 2: Domain + fmt.Print("Sandbox domain (e.g., sbx.dbrs.space): ") + domain, err := reader.ReadString('\n') + if err != nil { + return fmt.Errorf("read domain: %w", err) + } + domain = strings.TrimSpace(domain) + if domain == "" { + return fmt.Errorf("domain is required") + } + + cfg := &Config{ + HetznerAPIToken: token, + Domain: domain, + } + cfg.Defaults() + + // Step 3: Floating IPs + fmt.Println() + fmt.Println("Checking floating IPs...") + floatingIPs, err := setupFloatingIPs(client, cfg.Location) + if err != nil { + return err + } + cfg.FloatingIPs = floatingIPs + + // Step 4: Firewall + fmt.Println() + fmt.Println("Checking firewall...") + fwID, err := setupFirewall(client) + if err != nil { + return err + } + cfg.FirewallID = fwID + + // Step 5: SSH key + fmt.Println() + fmt.Println("Setting up SSH key...") + sshKeyConfig, err := setupSSHKey(client) + if err != nil { + return err + } + cfg.SSHKey = sshKeyConfig + + // Step 6: Display DNS instructions + fmt.Println() + fmt.Println("DNS Configuration") + fmt.Println("-----------------") + fmt.Println("Configure the following at your domain registrar:") + fmt.Println() + fmt.Printf(" 1. Add glue records (Personal DNS Servers):\n") + fmt.Printf(" ns1.%s -> %s\n", domain, cfg.FloatingIPs[0].IP) + fmt.Printf(" ns2.%s -> %s\n", domain, cfg.FloatingIPs[1].IP) + fmt.Println() + fmt.Printf(" 2. Set custom nameservers for %s:\n", domain) + fmt.Printf(" ns1.%s\n", domain) + fmt.Printf(" ns2.%s\n", domain) + fmt.Println() + + // Step 7: Verify DNS (optional) + fmt.Print("Verify DNS now? [y/N]: ") + verifyChoice, _ := reader.ReadString('\n') + verifyChoice = strings.TrimSpace(strings.ToLower(verifyChoice)) + if verifyChoice == "y" || verifyChoice == "yes" { + verifyDNS(domain) + } + + // Save config + if err := SaveConfig(cfg); err != nil { + return fmt.Errorf("save config: %w", err) + } + + fmt.Println() + fmt.Println("Setup complete! Config saved to ~/.orama/sandbox.yaml") + fmt.Println() + fmt.Println("Next: orama sandbox create") + return nil +} + +// setupFloatingIPs checks for existing floating IPs or creates new ones. +func setupFloatingIPs(client *HetznerClient, location string) ([]FloatIP, error) { + existing, err := client.ListFloatingIPsByLabel("orama-sandbox-dns=true") + if err != nil { + return nil, fmt.Errorf("list floating IPs: %w", err) + } + + if len(existing) >= 2 { + fmt.Printf(" Found %d existing floating IPs:\n", len(existing)) + result := make([]FloatIP, 2) + for i := 0; i < 2; i++ { + fmt.Printf(" ns%d: %s (ID: %d)\n", i+1, existing[i].IP, existing[i].ID) + result[i] = FloatIP{ID: existing[i].ID, IP: existing[i].IP} + } + return result, nil + } + + // Need to create missing floating IPs + needed := 2 - len(existing) + fmt.Printf(" Need to create %d floating IP(s)...\n", needed) + + reader := bufio.NewReader(os.Stdin) + fmt.Printf(" Create %d floating IP(s) in %s? (~$0.005/hr each) [Y/n]: ", needed, location) + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(strings.ToLower(choice)) + if choice == "n" || choice == "no" { + return nil, fmt.Errorf("floating IPs required, aborting setup") + } + + result := make([]FloatIP, 0, 2) + for _, fip := range existing { + result = append(result, FloatIP{ID: fip.ID, IP: fip.IP}) + } + + for i := len(existing); i < 2; i++ { + desc := fmt.Sprintf("orama-sandbox-ns%d", i+1) + labels := map[string]string{"orama-sandbox-dns": "true"} + fip, err := client.CreateFloatingIP(location, desc, labels) + if err != nil { + return nil, fmt.Errorf("create floating IP %d: %w", i+1, err) + } + fmt.Printf(" Created ns%d: %s (ID: %d)\n", i+1, fip.IP, fip.ID) + result = append(result, FloatIP{ID: fip.ID, IP: fip.IP}) + } + + return result, nil +} + +// setupFirewall ensures a sandbox firewall exists. +func setupFirewall(client *HetznerClient) (int64, error) { + existing, err := client.ListFirewallsByLabel("orama-sandbox=infra") + if err != nil { + return 0, fmt.Errorf("list firewalls: %w", err) + } + + if len(existing) > 0 { + fmt.Printf(" Found existing firewall: %s (ID: %d)\n", existing[0].Name, existing[0].ID) + return existing[0].ID, nil + } + + fmt.Print(" Creating sandbox firewall... ") + fw, err := client.CreateFirewall( + "orama-sandbox", + SandboxFirewallRules(), + map[string]string{"orama-sandbox": "infra"}, + ) + if err != nil { + fmt.Println("FAILED") + return 0, fmt.Errorf("create firewall: %w", err) + } + fmt.Printf("OK (ID: %d)\n", fw.ID) + return fw.ID, nil +} + +// setupSSHKey generates an SSH keypair and uploads it to Hetzner. +func setupSSHKey(client *HetznerClient) (SSHKeyConfig, error) { + dir, err := configDir() + if err != nil { + return SSHKeyConfig{}, err + } + + privPath := dir + "/sandbox_key" + pubPath := privPath + ".pub" + + // Check for existing key + if _, err := os.Stat(privPath); err == nil { + fmt.Printf(" SSH key already exists: %s\n", privPath) + + // Read public key and check if it's on Hetzner + pubData, err := os.ReadFile(pubPath) + if err != nil { + return SSHKeyConfig{}, fmt.Errorf("read public key: %w", err) + } + + // Try to upload (will fail with uniqueness error if already exists) + key, err := client.UploadSSHKey("orama-sandbox", strings.TrimSpace(string(pubData))) + if err != nil { + // Key likely already exists on Hetzner — find it by listing + fmt.Printf(" SSH key may already be on Hetzner (upload: %v)\n", err) + fmt.Print(" Enter the Hetzner SSH key ID (or 0 to re-upload): ") + reader := bufio.NewReader(os.Stdin) + idStr, _ := reader.ReadString('\n') + idStr = strings.TrimSpace(idStr) + var hetznerID int64 + fmt.Sscanf(idStr, "%d", &hetznerID) + + if hetznerID == 0 { + return SSHKeyConfig{}, fmt.Errorf("could not resolve SSH key on Hetzner, try deleting and re-running setup") + } + + return SSHKeyConfig{ + HetznerID: hetznerID, + PrivateKeyPath: "~/.orama/sandbox_key", + PublicKeyPath: "~/.orama/sandbox_key.pub", + }, nil + } + + fmt.Printf(" Uploaded to Hetzner (ID: %d)\n", key.ID) + return SSHKeyConfig{ + HetznerID: key.ID, + PrivateKeyPath: "~/.orama/sandbox_key", + PublicKeyPath: "~/.orama/sandbox_key.pub", + }, nil + } + + // Generate new ed25519 keypair + fmt.Print(" Generating ed25519 keypair... ") + pubKey, privKey, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + fmt.Println("FAILED") + return SSHKeyConfig{}, fmt.Errorf("generate key: %w", err) + } + + // Marshal private key to OpenSSH format + pemBlock, err := ssh.MarshalPrivateKey(privKey, "") + if err != nil { + fmt.Println("FAILED") + return SSHKeyConfig{}, fmt.Errorf("marshal private key: %w", err) + } + + privPEM := pem.EncodeToMemory(pemBlock) + if err := os.WriteFile(privPath, privPEM, 0600); err != nil { + fmt.Println("FAILED") + return SSHKeyConfig{}, fmt.Errorf("write private key: %w", err) + } + + // Marshal public key to authorized_keys format + sshPubKey, err := ssh.NewPublicKey(pubKey) + if err != nil { + return SSHKeyConfig{}, fmt.Errorf("convert public key: %w", err) + } + pubStr := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(sshPubKey))) + + if err := os.WriteFile(pubPath, []byte(pubStr+"\n"), 0644); err != nil { + return SSHKeyConfig{}, fmt.Errorf("write public key: %w", err) + } + fmt.Println("OK") + + // Upload to Hetzner + fmt.Print(" Uploading to Hetzner... ") + key, err := client.UploadSSHKey("orama-sandbox", pubStr) + if err != nil { + fmt.Println("FAILED") + return SSHKeyConfig{}, fmt.Errorf("upload SSH key: %w", err) + } + fmt.Printf("OK (ID: %d)\n", key.ID) + + return SSHKeyConfig{ + HetznerID: key.ID, + PrivateKeyPath: "~/.orama/sandbox_key", + PublicKeyPath: "~/.orama/sandbox_key.pub", + }, nil +} + +// verifyDNS checks if the sandbox domain resolves. +func verifyDNS(domain string) { + fmt.Printf(" Checking NS records for %s...\n", domain) + out, err := exec.Command("dig", "+short", "NS", domain, "@8.8.8.8").Output() + if err != nil { + fmt.Printf(" Warning: dig failed: %v\n", err) + fmt.Println(" DNS verification skipped. You can verify later with:") + fmt.Printf(" dig NS %s @8.8.8.8\n", domain) + return + } + + result := strings.TrimSpace(string(out)) + if result == "" { + fmt.Println(" Warning: No NS records found yet.") + fmt.Println(" DNS propagation can take up to 48 hours.") + fmt.Println(" The sandbox will still work once DNS is configured.") + } else { + fmt.Printf(" NS records:\n") + for _, line := range strings.Split(result, "\n") { + fmt.Printf(" %s\n", line) + } + } +} diff --git a/pkg/cli/sandbox/ssh_cmd.go b/pkg/cli/sandbox/ssh_cmd.go new file mode 100644 index 0000000..ed3bf61 --- /dev/null +++ b/pkg/cli/sandbox/ssh_cmd.go @@ -0,0 +1,56 @@ +package sandbox + +import ( + "fmt" + "os" + "syscall" +) + +// SSHInto opens an interactive SSH session to a sandbox node. +func SSHInto(name string, nodeNum int) error { + cfg, err := LoadConfig() + if err != nil { + return err + } + + state, err := resolveSandbox(name) + if err != nil { + return err + } + + if nodeNum < 1 || nodeNum > len(state.Servers) { + return fmt.Errorf("node number must be between 1 and %d", len(state.Servers)) + } + + srv := state.Servers[nodeNum-1] + sshKeyPath := cfg.ExpandedPrivateKeyPath() + + fmt.Printf("Connecting to %s (%s, %s)...\n", srv.Name, srv.IP, srv.Role) + + // Find ssh binary + sshBin, err := findSSHBinary() + if err != nil { + return err + } + + // Replace current process with SSH + args := []string{ + "ssh", + "-o", "StrictHostKeyChecking=accept-new", + "-i", sshKeyPath, + fmt.Sprintf("root@%s", srv.IP), + } + + return syscall.Exec(sshBin, args, os.Environ()) +} + +// findSSHBinary locates the ssh binary in PATH. +func findSSHBinary() (string, error) { + paths := []string{"/usr/bin/ssh", "/usr/local/bin/ssh", "/opt/homebrew/bin/ssh"} + for _, p := range paths { + if _, err := os.Stat(p); err == nil { + return p, nil + } + } + return "", fmt.Errorf("ssh binary not found") +} diff --git a/pkg/cli/sandbox/state.go b/pkg/cli/sandbox/state.go new file mode 100644 index 0000000..34d4f87 --- /dev/null +++ b/pkg/cli/sandbox/state.go @@ -0,0 +1,211 @@ +package sandbox + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/inspector" + "gopkg.in/yaml.v3" +) + +// SandboxStatus represents the lifecycle state of a sandbox. +type SandboxStatus string + +const ( + StatusCreating SandboxStatus = "creating" + StatusRunning SandboxStatus = "running" + StatusDestroying SandboxStatus = "destroying" + StatusError SandboxStatus = "error" +) + +// SandboxState holds the full state of an active sandbox cluster. +type SandboxState struct { + Name string `yaml:"name"` + CreatedAt time.Time `yaml:"created_at"` + Domain string `yaml:"domain"` + Status SandboxStatus `yaml:"status"` + Servers []ServerState `yaml:"servers"` +} + +// ServerState holds the state of a single server in the sandbox. +type ServerState struct { + ID int64 `yaml:"id"` // Hetzner server ID + Name string `yaml:"name"` // e.g., sbx-feature-webrtc-1 + IP string `yaml:"ip"` // Public IPv4 + Role string `yaml:"role"` // "nameserver" or "node" + FloatingIP string `yaml:"floating_ip,omitempty"` // Only for nameserver nodes + WgIP string `yaml:"wg_ip,omitempty"` // WireGuard IP (populated after install) +} + +// sandboxesDir returns ~/.orama/sandboxes/, creating it if needed. +func sandboxesDir() (string, error) { + dir, err := configDir() + if err != nil { + return "", err + } + sbxDir := filepath.Join(dir, "sandboxes") + if err := os.MkdirAll(sbxDir, 0700); err != nil { + return "", fmt.Errorf("create sandboxes directory: %w", err) + } + return sbxDir, nil +} + +// statePath returns the path for a sandbox's state file. +func statePath(name string) (string, error) { + dir, err := sandboxesDir() + if err != nil { + return "", err + } + return filepath.Join(dir, name+".yaml"), nil +} + +// SaveState persists the sandbox state to disk. +func SaveState(state *SandboxState) error { + path, err := statePath(state.Name) + if err != nil { + return err + } + + data, err := yaml.Marshal(state) + if err != nil { + return fmt.Errorf("marshal state: %w", err) + } + + if err := os.WriteFile(path, data, 0600); err != nil { + return fmt.Errorf("write state: %w", err) + } + + return nil +} + +// LoadState reads a sandbox state from disk. +func LoadState(name string) (*SandboxState, error) { + path, err := statePath(name) + if err != nil { + return nil, err + } + + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("sandbox %q not found", name) + } + return nil, fmt.Errorf("read state: %w", err) + } + + var state SandboxState + if err := yaml.Unmarshal(data, &state); err != nil { + return nil, fmt.Errorf("parse state: %w", err) + } + + return &state, nil +} + +// DeleteState removes the sandbox state file. +func DeleteState(name string) error { + path, err := statePath(name) + if err != nil { + return err + } + + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("delete state: %w", err) + } + + return nil +} + +// ListStates returns all sandbox states from disk. +func ListStates() ([]*SandboxState, error) { + dir, err := sandboxesDir() + if err != nil { + return nil, err + } + + entries, err := os.ReadDir(dir) + if err != nil { + return nil, fmt.Errorf("read sandboxes directory: %w", err) + } + + var states []*SandboxState + for _, entry := range entries { + if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".yaml") { + continue + } + name := strings.TrimSuffix(entry.Name(), ".yaml") + state, err := LoadState(name) + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: could not load sandbox %q: %v\n", name, err) + continue + } + states = append(states, state) + } + + return states, nil +} + +// FindActiveSandbox returns the first sandbox in running or creating state. +// Returns nil if no active sandbox exists. +func FindActiveSandbox() (*SandboxState, error) { + states, err := ListStates() + if err != nil { + return nil, err + } + + for _, s := range states { + if s.Status == StatusRunning || s.Status == StatusCreating { + return s, nil + } + } + + return nil, nil +} + +// ToNodes converts sandbox servers to inspector.Node structs for SSH operations. +// Sets SSHKey to the provided key path on each node. +func (s *SandboxState) ToNodes(sshKeyPath string) []inspector.Node { + nodes := make([]inspector.Node, len(s.Servers)) + for i, srv := range s.Servers { + nodes[i] = inspector.Node{ + Environment: "sandbox", + User: "root", + Host: srv.IP, + Role: srv.Role, + SSHKey: sshKeyPath, + } + } + return nodes +} + +// NameserverNodes returns only the nameserver nodes. +func (s *SandboxState) NameserverNodes() []ServerState { + var ns []ServerState + for _, srv := range s.Servers { + if srv.Role == "nameserver" { + ns = append(ns, srv) + } + } + return ns +} + +// RegularNodes returns only the non-nameserver nodes. +func (s *SandboxState) RegularNodes() []ServerState { + var nodes []ServerState + for _, srv := range s.Servers { + if srv.Role == "node" { + nodes = append(nodes, srv) + } + } + return nodes +} + +// GenesisServer returns the first server (genesis node). +func (s *SandboxState) GenesisServer() ServerState { + if len(s.Servers) == 0 { + return ServerState{} + } + return s.Servers[0] +} diff --git a/pkg/cli/sandbox/state_test.go b/pkg/cli/sandbox/state_test.go new file mode 100644 index 0000000..e00adb4 --- /dev/null +++ b/pkg/cli/sandbox/state_test.go @@ -0,0 +1,214 @@ +package sandbox + +import ( + "os" + "path/filepath" + "testing" + "time" +) + +func TestSaveAndLoadState(t *testing.T) { + // Use temp dir for test + tmpDir := t.TempDir() + origHome := os.Getenv("HOME") + os.Setenv("HOME", tmpDir) + defer os.Setenv("HOME", origHome) + + state := &SandboxState{ + Name: "test-sandbox", + CreatedAt: time.Date(2026, 2, 25, 10, 0, 0, 0, time.UTC), + Domain: "test.example.com", + Status: StatusRunning, + Servers: []ServerState{ + {ID: 1, Name: "sbx-test-1", IP: "1.1.1.1", Role: "nameserver", FloatingIP: "10.0.0.1", WgIP: "10.0.0.1"}, + {ID: 2, Name: "sbx-test-2", IP: "2.2.2.2", Role: "nameserver", FloatingIP: "10.0.0.2", WgIP: "10.0.0.2"}, + {ID: 3, Name: "sbx-test-3", IP: "3.3.3.3", Role: "node", WgIP: "10.0.0.3"}, + {ID: 4, Name: "sbx-test-4", IP: "4.4.4.4", Role: "node", WgIP: "10.0.0.4"}, + {ID: 5, Name: "sbx-test-5", IP: "5.5.5.5", Role: "node", WgIP: "10.0.0.5"}, + }, + } + + if err := SaveState(state); err != nil { + t.Fatalf("SaveState() error = %v", err) + } + + // Verify file exists + expected := filepath.Join(tmpDir, ".orama", "sandboxes", "test-sandbox.yaml") + if _, err := os.Stat(expected); err != nil { + t.Fatalf("state file not created at %s: %v", expected, err) + } + + // Load back + loaded, err := LoadState("test-sandbox") + if err != nil { + t.Fatalf("LoadState() error = %v", err) + } + + if loaded.Name != "test-sandbox" { + t.Errorf("name = %s, want test-sandbox", loaded.Name) + } + if loaded.Domain != "test.example.com" { + t.Errorf("domain = %s, want test.example.com", loaded.Domain) + } + if loaded.Status != StatusRunning { + t.Errorf("status = %s, want running", loaded.Status) + } + if len(loaded.Servers) != 5 { + t.Errorf("servers = %d, want 5", len(loaded.Servers)) + } +} + +func TestLoadState_NotFound(t *testing.T) { + tmpDir := t.TempDir() + origHome := os.Getenv("HOME") + os.Setenv("HOME", tmpDir) + defer os.Setenv("HOME", origHome) + + _, err := LoadState("nonexistent") + if err == nil { + t.Error("LoadState() expected error for nonexistent sandbox") + } +} + +func TestDeleteState(t *testing.T) { + tmpDir := t.TempDir() + origHome := os.Getenv("HOME") + os.Setenv("HOME", tmpDir) + defer os.Setenv("HOME", origHome) + + state := &SandboxState{ + Name: "to-delete", + Status: StatusRunning, + } + if err := SaveState(state); err != nil { + t.Fatalf("SaveState() error = %v", err) + } + + if err := DeleteState("to-delete"); err != nil { + t.Fatalf("DeleteState() error = %v", err) + } + + _, err := LoadState("to-delete") + if err == nil { + t.Error("LoadState() should fail after DeleteState()") + } +} + +func TestListStates(t *testing.T) { + tmpDir := t.TempDir() + origHome := os.Getenv("HOME") + os.Setenv("HOME", tmpDir) + defer os.Setenv("HOME", origHome) + + // Create 2 sandboxes + for _, name := range []string{"sandbox-a", "sandbox-b"} { + if err := SaveState(&SandboxState{Name: name, Status: StatusRunning}); err != nil { + t.Fatalf("SaveState(%s) error = %v", name, err) + } + } + + states, err := ListStates() + if err != nil { + t.Fatalf("ListStates() error = %v", err) + } + if len(states) != 2 { + t.Errorf("ListStates() returned %d, want 2", len(states)) + } +} + +func TestFindActiveSandbox(t *testing.T) { + tmpDir := t.TempDir() + origHome := os.Getenv("HOME") + os.Setenv("HOME", tmpDir) + defer os.Setenv("HOME", origHome) + + // No sandboxes + active, err := FindActiveSandbox() + if err != nil { + t.Fatalf("FindActiveSandbox() error = %v", err) + } + if active != nil { + t.Error("expected nil when no sandboxes exist") + } + + // Add one running sandbox + if err := SaveState(&SandboxState{Name: "active-one", Status: StatusRunning}); err != nil { + t.Fatal(err) + } + if err := SaveState(&SandboxState{Name: "errored-one", Status: StatusError}); err != nil { + t.Fatal(err) + } + + active, err = FindActiveSandbox() + if err != nil { + t.Fatalf("FindActiveSandbox() error = %v", err) + } + if active == nil || active.Name != "active-one" { + t.Errorf("FindActiveSandbox() = %v, want active-one", active) + } +} + +func TestToNodes(t *testing.T) { + state := &SandboxState{ + Servers: []ServerState{ + {IP: "1.1.1.1", Role: "nameserver"}, + {IP: "2.2.2.2", Role: "node"}, + }, + } + + nodes := state.ToNodes("/tmp/key") + if len(nodes) != 2 { + t.Fatalf("ToNodes() returned %d nodes, want 2", len(nodes)) + } + if nodes[0].Host != "1.1.1.1" { + t.Errorf("node[0].Host = %s, want 1.1.1.1", nodes[0].Host) + } + if nodes[0].User != "root" { + t.Errorf("node[0].User = %s, want root", nodes[0].User) + } + if nodes[0].SSHKey != "/tmp/key" { + t.Errorf("node[0].SSHKey = %s, want /tmp/key", nodes[0].SSHKey) + } + if nodes[0].Environment != "sandbox" { + t.Errorf("node[0].Environment = %s, want sandbox", nodes[0].Environment) + } +} + +func TestNameserverAndRegularNodes(t *testing.T) { + state := &SandboxState{ + Servers: []ServerState{ + {Role: "nameserver"}, + {Role: "nameserver"}, + {Role: "node"}, + {Role: "node"}, + {Role: "node"}, + }, + } + + ns := state.NameserverNodes() + if len(ns) != 2 { + t.Errorf("NameserverNodes() = %d, want 2", len(ns)) + } + + regular := state.RegularNodes() + if len(regular) != 3 { + t.Errorf("RegularNodes() = %d, want 3", len(regular)) + } +} + +func TestGenesisServer(t *testing.T) { + state := &SandboxState{ + Servers: []ServerState{ + {Name: "first"}, + {Name: "second"}, + }, + } + if state.GenesisServer().Name != "first" { + t.Errorf("GenesisServer().Name = %s, want first", state.GenesisServer().Name) + } + + empty := &SandboxState{} + if empty.GenesisServer().Name != "" { + t.Error("GenesisServer() on empty state should return zero value") + } +} diff --git a/pkg/cli/sandbox/status.go b/pkg/cli/sandbox/status.go new file mode 100644 index 0000000..544ca60 --- /dev/null +++ b/pkg/cli/sandbox/status.go @@ -0,0 +1,160 @@ +package sandbox + +import ( + "encoding/json" + "fmt" + "strings" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// List prints all sandbox clusters. +func List() error { + states, err := ListStates() + if err != nil { + return err + } + + if len(states) == 0 { + fmt.Println("No sandboxes found.") + fmt.Println("Create one: orama sandbox create") + return nil + } + + fmt.Printf("%-20s %-10s %-5s %-25s %s\n", "NAME", "STATUS", "NODES", "CREATED", "DOMAIN") + for _, s := range states { + fmt.Printf("%-20s %-10s %-5d %-25s %s\n", + s.Name, s.Status, len(s.Servers), s.CreatedAt.Format("2006-01-02 15:04"), s.Domain) + } + + // Check for orphaned servers on Hetzner + cfg, err := LoadConfig() + if err != nil { + return nil // Config not set up, skip orphan check + } + + client := NewHetznerClient(cfg.HetznerAPIToken) + hetznerServers, err := client.ListServersByLabel("orama-sandbox") + if err != nil { + return nil // API error, skip orphan check + } + + // Build set of known server IDs + known := make(map[int64]bool) + for _, s := range states { + for _, srv := range s.Servers { + known[srv.ID] = true + } + } + + var orphans []string + for _, srv := range hetznerServers { + if !known[srv.ID] { + orphans = append(orphans, fmt.Sprintf("%s (ID: %d, IP: %s)", srv.Name, srv.ID, srv.PublicNet.IPv4.IP)) + } + } + + if len(orphans) > 0 { + fmt.Printf("\nWarning: %d orphaned server(s) on Hetzner (no state file):\n", len(orphans)) + for _, o := range orphans { + fmt.Printf(" %s\n", o) + } + fmt.Println("Delete manually at https://console.hetzner.cloud") + } + + return nil +} + +// Status prints the health report for a sandbox cluster. +func Status(name string) error { + cfg, err := LoadConfig() + if err != nil { + return err + } + + state, err := resolveSandbox(name) + if err != nil { + return err + } + + sshKeyPath := cfg.ExpandedPrivateKeyPath() + fmt.Printf("Sandbox: %s (status: %s)\n\n", state.Name, state.Status) + + for _, srv := range state.Servers { + node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + + fmt.Printf("%s (%s) — %s\n", srv.Name, srv.IP, srv.Role) + + // Get node report + out, err := runSSHOutput(node, "orama node report --json 2>/dev/null") + if err != nil { + fmt.Printf(" Status: UNREACHABLE (%v)\n", err) + fmt.Println() + continue + } + + printNodeReport(out) + fmt.Println() + } + + // Cluster summary + fmt.Println("Cluster Summary") + fmt.Println("---------------") + genesis := state.GenesisServer() + genesisNode := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} + + out, err := runSSHOutput(genesisNode, "curl -sf http://localhost:5001/status 2>/dev/null") + if err != nil { + fmt.Println(" RQLite: UNREACHABLE") + } else { + var status map[string]interface{} + if err := json.Unmarshal([]byte(out), &status); err == nil { + if store, ok := status["store"].(map[string]interface{}); ok { + if raft, ok := store["raft"].(map[string]interface{}); ok { + fmt.Printf(" RQLite state: %v\n", raft["state"]) + fmt.Printf(" Commit index: %v\n", raft["commit_index"]) + if nodes, ok := raft["nodes"].([]interface{}); ok { + fmt.Printf(" Nodes: %d\n", len(nodes)) + } + } + } + } + } + + return nil +} + +// printNodeReport parses and prints a node report JSON. +func printNodeReport(jsonStr string) { + var report map[string]interface{} + if err := json.Unmarshal([]byte(jsonStr), &report); err != nil { + fmt.Printf(" Report: (parse error)\n") + return + } + + // Print key fields + if services, ok := report["services"].(map[string]interface{}); ok { + var active, inactive []string + for name, info := range services { + if svc, ok := info.(map[string]interface{}); ok { + if state, ok := svc["active"].(bool); ok && state { + active = append(active, name) + } else { + inactive = append(inactive, name) + } + } + } + if len(active) > 0 { + fmt.Printf(" Active: %s\n", strings.Join(active, ", ")) + } + if len(inactive) > 0 { + fmt.Printf(" Inactive: %s\n", strings.Join(inactive, ", ")) + } + } + + if rqlite, ok := report["rqlite"].(map[string]interface{}); ok { + if state, ok := rqlite["state"].(string); ok { + fmt.Printf(" RQLite: %s\n", state) + } + } +} From f26676db2c7a0a771e77781fe6f8c199b99993da Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Fri, 27 Feb 2026 15:22:51 +0200 Subject: [PATCH 05/13] feat: add sandbox command and vault guardian build - integrate Zig-built vault-guardian into cross-compile process - add `orama sandbox` for ephemeral Hetzner Cloud clusters - update docs for `orama node` subcommands and new guides --- README.md | 14 +- cmd/cli/root.go | 4 + docs/COMMON_PROBLEMS.md | 16 +- pkg/cli/build/builder.go | 108 +++- pkg/environments/production/config.go | 24 + pkg/environments/production/orchestrator.go | 28 +- pkg/environments/production/prebuilt.go | 2 + pkg/environments/production/provisioner.go | 2 + pkg/environments/production/services.go | 37 ++ pkg/gateway/gateway.go | 5 + pkg/gateway/handlers/vault/handlers.go | 132 +++++ pkg/gateway/handlers/vault/health_handler.go | 116 +++++ pkg/gateway/handlers/vault/pull_handler.go | 183 +++++++ pkg/gateway/handlers/vault/push_handler.go | 168 +++++++ pkg/gateway/handlers/vault/rate_limiter.go | 120 +++++ pkg/gateway/middleware.go | 5 + pkg/gateway/routes.go | 8 + pkg/shamir/field.go | 82 +++ pkg/shamir/shamir.go | 150 ++++++ pkg/shamir/shamir_test.go | 501 +++++++++++++++++++ 20 files changed, 1669 insertions(+), 36 deletions(-) create mode 100644 pkg/gateway/handlers/vault/handlers.go create mode 100644 pkg/gateway/handlers/vault/health_handler.go create mode 100644 pkg/gateway/handlers/vault/pull_handler.go create mode 100644 pkg/gateway/handlers/vault/push_handler.go create mode 100644 pkg/gateway/handlers/vault/rate_limiter.go create mode 100644 pkg/shamir/field.go create mode 100644 pkg/shamir/shamir.go create mode 100644 pkg/shamir/shamir_test.go diff --git a/README.md b/README.md index 2b416c2..d8119d6 100644 --- a/README.md +++ b/README.md @@ -349,13 +349,13 @@ All configuration lives in `~/.orama/`: ```bash # Check status -systemctl status orama-node +sudo orama node status # View logs -journalctl -u orama-node -f +orama node logs node --follow # Check log files -tail -f /opt/orama/.orama/logs/node.log +sudo orama node doctor ``` ### Port Conflicts @@ -417,9 +417,11 @@ See `openapi/gateway.yaml` for complete API specification. - **[Deployment Guide](docs/DEPLOYMENT_GUIDE.md)** - Deploy React, Next.js, Go apps and manage databases - **[Architecture Guide](docs/ARCHITECTURE.md)** - System architecture and design patterns - **[Client SDK](docs/CLIENT_SDK.md)** - Go SDK documentation and examples -- **[Gateway API](docs/GATEWAY_API.md)** - Complete HTTP API reference -- **[Security Deployment](docs/SECURITY_DEPLOYMENT_GUIDE.md)** - Production security hardening -- **[Testing Plan](docs/TESTING_PLAN.md)** - Comprehensive testing strategy and implementation +- **[Monitoring](docs/MONITORING.md)** - Cluster monitoring and health checks +- **[Inspector](docs/INSPECTOR.md)** - Deep subsystem health inspection +- **[Serverless Functions](docs/SERVERLESS.md)** - WASM serverless with host functions +- **[WebRTC](docs/WEBRTC.md)** - Real-time communication setup +- **[Common Problems](docs/COMMON_PROBLEMS.md)** - Troubleshooting known issues ## Resources diff --git a/cmd/cli/root.go b/cmd/cli/root.go index 266fc9b..0f27fdd 100644 --- a/cmd/cli/root.go +++ b/cmd/cli/root.go @@ -18,6 +18,7 @@ import ( "github.com/DeBrosOfficial/network/pkg/cli/cmd/monitorcmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/namespacecmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/node" + "github.com/DeBrosOfficial/network/pkg/cli/cmd/sandboxcmd" ) // version metadata populated via -ldflags at build time @@ -87,6 +88,9 @@ and interacting with the Orama distributed network.`, // Build command (cross-compile binary archive) rootCmd.AddCommand(buildcmd.Cmd) + // Sandbox command (ephemeral Hetzner Cloud clusters) + rootCmd.AddCommand(sandboxcmd.Cmd) + return rootCmd } diff --git a/docs/COMMON_PROBLEMS.md b/docs/COMMON_PROBLEMS.md index f54c938..ae6d9ff 100644 --- a/docs/COMMON_PROBLEMS.md +++ b/docs/COMMON_PROBLEMS.md @@ -32,7 +32,7 @@ wg set wg0 peer remove wg set wg0 peer endpoint :51820 allowed-ips /32 persistent-keepalive 25 ``` -Then restart services: `sudo orama prod restart` +Then restart services: `sudo orama node restart` You can find peer public keys with `wg show wg0`. @@ -46,7 +46,7 @@ cat /opt/orama/.orama/data/namespaces//configs/olric-*.yaml If `bindAddr` is `0.0.0.0`, the node will try to bind to IPv6 on dual-stack hosts, breaking memberlist gossip. -**Fix:** Edit the YAML to use the node's WireGuard IP (run `ip addr show wg0` to find it), then restart: `sudo orama prod restart` +**Fix:** Edit the YAML to use the node's WireGuard IP (run `ip addr show wg0` to find it), then restart: `sudo orama node restart` This was fixed in code (BindAddr validation in `SpawnOlric`), so new namespaces won't have this issue. @@ -82,7 +82,7 @@ olric_servers: - "10.0.0.Z:10002" ``` -Then: `sudo orama prod restart` +Then: `sudo orama node restart` This was fixed in code, so new namespaces get the correct config. @@ -90,7 +90,7 @@ This was fixed in code, so new namespaces get the correct config. ## 3. Namespace not restoring after restart (missing cluster-state.json) -**Symptom:** After `orama prod restart`, the namespace services don't come back because `RestoreLocalClustersFromDisk` has no state file. +**Symptom:** After `orama node restart`, the namespace services don't come back because `RestoreLocalClustersFromDisk` has no state file. **Check:** @@ -117,9 +117,9 @@ This was fixed in code — `ProvisionCluster` now saves state to all nodes (incl ## 4. Namespace gateway processes not restarting after upgrade -**Symptom:** After `orama upgrade --restart` or `orama prod restart`, namespace gateway/olric/rqlite services don't start. +**Symptom:** After `orama upgrade --restart` or `orama node restart`, namespace gateway/olric/rqlite services don't start. -**Cause:** `orama prod stop` disables systemd template services (`orama-namespace-gateway@.service`). They have `PartOf=orama-node.service`, but that only propagates restart to **enabled** services. +**Cause:** `orama node stop` disables systemd template services (`orama-namespace-gateway@.service`). They have `PartOf=orama-node.service`, but that only propagates restart to **enabled** services. **Fix:** Re-enable the services before restarting: @@ -127,7 +127,7 @@ This was fixed in code — `ProvisionCluster` now saves state to all nodes (incl systemctl enable orama-namespace-rqlite@.service systemctl enable orama-namespace-olric@.service systemctl enable orama-namespace-gateway@.service -sudo orama prod restart +sudo orama node restart ``` This was fixed in code — the upgrade orchestrator now re-enables `@` services before restarting. @@ -152,7 +152,7 @@ ssh -n user@host 'command' ## General Debugging Tips -- **Always use `sudo orama prod restart`** instead of raw `systemctl` commands +- **Always use `sudo orama node restart`** instead of raw `systemctl` commands - **Namespace data lives at:** `/opt/orama/.orama/data/namespaces//` - **Check service logs:** `journalctl -u orama-namespace-olric@.service --no-pager -n 50` - **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes diff --git a/pkg/cli/build/builder.go b/pkg/cli/build/builder.go index de82016..4514f6b 100644 --- a/pkg/cli/build/builder.go +++ b/pkg/cli/build/builder.go @@ -71,48 +71,53 @@ func (b *Builder) Build() error { return fmt.Errorf("failed to build orama binaries: %w", err) } - // Step 2: Cross-compile Olric + // Step 2: Cross-compile Vault Guardian (Zig) + if err := b.buildVaultGuardian(); err != nil { + return fmt.Errorf("failed to build vault-guardian: %w", err) + } + + // Step 3: Cross-compile Olric if err := b.buildOlric(); err != nil { return fmt.Errorf("failed to build olric: %w", err) } - // Step 3: Cross-compile IPFS Cluster + // Step 4: Cross-compile IPFS Cluster if err := b.buildIPFSCluster(); err != nil { return fmt.Errorf("failed to build ipfs-cluster: %w", err) } - // Step 4: Build CoreDNS with RQLite plugin + // Step 5: Build CoreDNS with RQLite plugin if err := b.buildCoreDNS(); err != nil { return fmt.Errorf("failed to build coredns: %w", err) } - // Step 5: Build Caddy with Orama DNS module + // Step 6: Build Caddy with Orama DNS module if err := b.buildCaddy(); err != nil { return fmt.Errorf("failed to build caddy: %w", err) } - // Step 6: Download pre-built IPFS Kubo + // Step 7: Download pre-built IPFS Kubo if err := b.downloadIPFS(); err != nil { return fmt.Errorf("failed to download ipfs: %w", err) } - // Step 7: Download pre-built RQLite + // Step 8: Download pre-built RQLite if err := b.downloadRQLite(); err != nil { return fmt.Errorf("failed to download rqlite: %w", err) } - // Step 8: Copy systemd templates + // Step 9: Copy systemd templates if err := b.copySystemdTemplates(); err != nil { return fmt.Errorf("failed to copy systemd templates: %w", err) } - // Step 9: Generate manifest + // Step 10: Generate manifest manifest, err := b.generateManifest() if err != nil { return fmt.Errorf("failed to generate manifest: %w", err) } - // Step 10: Create archive + // Step 11: Create archive outputPath := b.flags.Output if outputPath == "" { outputPath = fmt.Sprintf("/tmp/orama-%s-linux-%s.tar.gz", b.version, b.flags.Arch) @@ -130,7 +135,7 @@ func (b *Builder) Build() error { } func (b *Builder) buildOramaBinaries() error { - fmt.Println("[1/7] Cross-compiling Orama binaries...") + fmt.Println("[1/8] Cross-compiling Orama binaries...") ldflags := fmt.Sprintf("-s -w -X 'main.version=%s' -X 'main.commit=%s' -X 'main.date=%s'", b.version, b.commit, b.date) @@ -177,8 +182,79 @@ func (b *Builder) buildOramaBinaries() error { return nil } +func (b *Builder) buildVaultGuardian() error { + fmt.Println("[2/8] Cross-compiling Vault Guardian (Zig)...") + + // Ensure zig is available + if _, err := exec.LookPath("zig"); err != nil { + return fmt.Errorf("zig not found in PATH — install from https://ziglang.org/download/") + } + + // Vault source is sibling to orama project + vaultDir := filepath.Join(b.projectDir, "..", "orama-vault") + if _, err := os.Stat(filepath.Join(vaultDir, "build.zig")); err != nil { + return fmt.Errorf("vault source not found at %s — expected orama-vault as sibling directory: %w", vaultDir, err) + } + + // Map Go arch to Zig target triple + var zigTarget string + switch b.flags.Arch { + case "amd64": + zigTarget = "x86_64-linux-musl" + case "arm64": + zigTarget = "aarch64-linux-musl" + default: + return fmt.Errorf("unsupported architecture for vault: %s", b.flags.Arch) + } + + if b.flags.Verbose { + fmt.Printf(" zig build -Dtarget=%s -Doptimize=ReleaseSafe\n", zigTarget) + } + + cmd := exec.Command("zig", "build", + fmt.Sprintf("-Dtarget=%s", zigTarget), + "-Doptimize=ReleaseSafe") + cmd.Dir = vaultDir + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("zig build failed: %w", err) + } + + // Copy output binary to build bin dir + src := filepath.Join(vaultDir, "zig-out", "bin", "vault-guardian") + dst := filepath.Join(b.binDir, "vault-guardian") + if err := copyFile(src, dst); err != nil { + return fmt.Errorf("failed to copy vault-guardian binary: %w", err) + } + + fmt.Println(" ✓ vault-guardian") + return nil +} + +// copyFile copies a file from src to dst, preserving executable permissions. +func copyFile(src, dst string) error { + srcFile, err := os.Open(src) + if err != nil { + return err + } + defer srcFile.Close() + + dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0755) + if err != nil { + return err + } + defer dstFile.Close() + + if _, err := srcFile.WriteTo(dstFile); err != nil { + return err + } + return nil +} + func (b *Builder) buildOlric() error { - fmt.Printf("[2/7] Cross-compiling Olric %s...\n", constants.OlricVersion) + fmt.Printf("[3/8] Cross-compiling Olric %s...\n", constants.OlricVersion) cmd := exec.Command("go", "install", fmt.Sprintf("github.com/olric-data/olric/cmd/olric-server@%s", constants.OlricVersion)) @@ -197,7 +273,7 @@ func (b *Builder) buildOlric() error { } func (b *Builder) buildIPFSCluster() error { - fmt.Printf("[3/7] Cross-compiling IPFS Cluster %s...\n", constants.IPFSClusterVersion) + fmt.Printf("[4/8] Cross-compiling IPFS Cluster %s...\n", constants.IPFSClusterVersion) cmd := exec.Command("go", "install", fmt.Sprintf("github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service@%s", constants.IPFSClusterVersion)) @@ -216,7 +292,7 @@ func (b *Builder) buildIPFSCluster() error { } func (b *Builder) buildCoreDNS() error { - fmt.Printf("[4/7] Building CoreDNS %s with RQLite plugin...\n", constants.CoreDNSVersion) + fmt.Printf("[5/8] Building CoreDNS %s with RQLite plugin...\n", constants.CoreDNSVersion) buildDir := filepath.Join(b.tmpDir, "coredns-build") @@ -363,7 +439,7 @@ rqlite:rqlite } func (b *Builder) buildCaddy() error { - fmt.Printf("[5/7] Building Caddy %s with Orama DNS module...\n", constants.CaddyVersion) + fmt.Printf("[6/8] Building Caddy %s with Orama DNS module...\n", constants.CaddyVersion) // Ensure xcaddy is available if _, err := exec.LookPath("xcaddy"); err != nil { @@ -429,7 +505,7 @@ require ( } func (b *Builder) downloadIPFS() error { - fmt.Printf("[6/7] Downloading IPFS Kubo %s...\n", constants.IPFSKuboVersion) + fmt.Printf("[7/8] Downloading IPFS Kubo %s...\n", constants.IPFSKuboVersion) arch := b.flags.Arch tarball := fmt.Sprintf("kubo_%s_linux-%s.tar.gz", constants.IPFSKuboVersion, arch) @@ -450,7 +526,7 @@ func (b *Builder) downloadIPFS() error { } func (b *Builder) downloadRQLite() error { - fmt.Printf("[7/7] Downloading RQLite %s...\n", constants.RQLiteVersion) + fmt.Printf("[8/8] Downloading RQLite %s...\n", constants.RQLiteVersion) arch := b.flags.Arch tarball := fmt.Sprintf("rqlite-v%s-linux-%s.tar.gz", constants.RQLiteVersion, arch) diff --git a/pkg/environments/production/config.go b/pkg/environments/production/config.go index 3bf6cd1..8de319a 100644 --- a/pkg/environments/production/config.go +++ b/pkg/environments/production/config.go @@ -194,6 +194,30 @@ func (cg *ConfigGenerator) GenerateNodeConfig(peerAddresses []string, vpsIP stri return templates.RenderNodeConfig(data) } +// GenerateVaultConfig generates vault.yaml configuration for the Vault Guardian. +// The vault config uses key=value format (not YAML, despite the file extension). +// Peer discovery is dynamic via RQLite — no static peer list needed. +func (cg *ConfigGenerator) GenerateVaultConfig(vpsIP string) string { + dataDir := filepath.Join(cg.oramaDir, "data", "vault") + + // Bind to WireGuard IP so vault is only accessible over the overlay network. + // If no WG IP is provided, bind to localhost as a safe default. + bindAddr := "127.0.0.1" + if vpsIP != "" { + bindAddr = vpsIP + } + + return fmt.Sprintf(`# Vault Guardian Configuration +# Generated by orama node install + +listen_address = %s +client_port = 7500 +peer_port = 7501 +data_dir = %s +rqlite_url = http://127.0.0.1:5001 +`, bindAddr, dataDir) +} + // GenerateGatewayConfig generates gateway.yaml configuration func (cg *ConfigGenerator) GenerateGatewayConfig(peerAddresses []string, enableHTTPS bool, domain string, olricServers []string) (string, error) { tlsCacheDir := "" diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index 7e5d371..65dd7c8 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -573,6 +573,14 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s } ps.logf(" ✓ Olric config generated") + // Vault Guardian config + vaultConfig := ps.configGenerator.GenerateVaultConfig(vpsIP) + vaultConfigPath := filepath.Join(ps.oramaDir, "data", "vault", "vault.yaml") + if err := os.WriteFile(vaultConfigPath, []byte(vaultConfig), 0644); err != nil { + return fmt.Errorf("failed to save vault config: %w", err) + } + ps.logf(" ✓ Vault config generated") + // Configure CoreDNS (if baseDomain is provided - this is the zone name) // CoreDNS uses baseDomain (e.g., "dbrs.space") as the authoritative zone dnsZone := baseDomain @@ -667,6 +675,13 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { } ps.logf(" ✓ Node service created: orama-node.service (with embedded gateway)") + // Vault Guardian service + vaultUnit := ps.serviceGenerator.GenerateVaultService() + if err := ps.serviceController.WriteServiceUnit("orama-vault.service", vaultUnit); err != nil { + return fmt.Errorf("failed to write Vault service: %w", err) + } + ps.logf(" ✓ Vault service created: orama-vault.service") + // Anyone Relay service (only created when --anyone-relay flag is used) // A node must run EITHER relay OR client, never both. When writing one // mode's service, we remove the other to prevent conflicts (they share @@ -725,7 +740,7 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { // Enable services (unified names - no bootstrap/node distinction) // Note: orama-gateway.service is no longer needed - each node has an embedded gateway // Note: orama-rqlite.service is NOT created - RQLite is managed by each node internally - services := []string{"orama-ipfs.service", "orama-ipfs-cluster.service", "orama-olric.service", "orama-node.service"} + services := []string{"orama-ipfs.service", "orama-ipfs-cluster.service", "orama-olric.service", "orama-vault.service", "orama-node.service"} // Add Anyone service if configured (relay or client) if ps.IsAnyoneRelay() { @@ -756,8 +771,8 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { // services pick up new configs even if already running from a previous install) ps.logf(" Starting services...") - // Start infrastructure first (IPFS, Olric, Anyone) - RQLite is managed internally by each node - infraServices := []string{"orama-ipfs.service", "orama-olric.service"} + // Start infrastructure first (IPFS, Olric, Vault, Anyone) - RQLite is managed internally by each node + infraServices := []string{"orama-ipfs.service", "orama-olric.service", "orama-vault.service"} // Add Anyone service if configured (relay or client) if ps.IsAnyoneRelay() { @@ -977,12 +992,13 @@ func (ps *ProductionSetup) LogSetupComplete(peerID string) { ps.logf(" %s/logs/olric.log", ps.oramaDir) ps.logf(" %s/logs/node.log", ps.oramaDir) ps.logf(" %s/logs/gateway.log", ps.oramaDir) + ps.logf(" %s/logs/vault.log", ps.oramaDir) // Anyone mode-specific logs and commands if ps.IsAnyoneRelay() { ps.logf(" /var/log/anon/notices.log (Anyone Relay)") ps.logf("\nStart All Services:") - ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-anyone-relay orama-node") + ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-vault orama-anyone-relay orama-node") ps.logf("\nAnyone Relay Operator:") ps.logf(" ORPort: %d", ps.anyoneRelayConfig.ORPort) ps.logf(" Wallet: %s", ps.anyoneRelayConfig.Wallet) @@ -991,10 +1007,10 @@ func (ps *ProductionSetup) LogSetupComplete(peerID string) { ps.logf(" IMPORTANT: You need 100 $ANYONE tokens in your wallet to receive rewards") } else if ps.IsAnyoneClient() { ps.logf("\nStart All Services:") - ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-anyone-client orama-node") + ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-vault orama-anyone-client orama-node") } else { ps.logf("\nStart All Services:") - ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-node") + ps.logf(" systemctl start orama-ipfs orama-ipfs-cluster orama-olric orama-vault orama-node") } ps.logf("\nVerify Installation:") diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go index 689b8ba..04d4233 100644 --- a/pkg/environments/production/prebuilt.go +++ b/pkg/environments/production/prebuilt.go @@ -127,6 +127,8 @@ func (ps *ProductionSetup) deployPreBuiltBinaries(manifest *PreBuiltManifest) er {name: "coredns", dest: "/usr/local/bin/coredns"}, {name: "caddy", dest: "/usr/bin/caddy"}, } + // Note: vault-guardian stays at /opt/orama/bin/ (from archive extraction) + // and is referenced by absolute path in the systemd service — no copy needed. for _, bin := range binaries { srcPath := filepath.Join(OramaArchiveBin, bin.name) diff --git a/pkg/environments/production/provisioner.go b/pkg/environments/production/provisioner.go index 259e213..97e3089 100644 --- a/pkg/environments/production/provisioner.go +++ b/pkg/environments/production/provisioner.go @@ -34,6 +34,7 @@ func (fp *FilesystemProvisioner) EnsureDirectoryStructure() error { filepath.Join(fp.oramaDir, "data", "ipfs", "repo"), filepath.Join(fp.oramaDir, "data", "ipfs-cluster"), filepath.Join(fp.oramaDir, "data", "rqlite"), + filepath.Join(fp.oramaDir, "data", "vault"), filepath.Join(fp.oramaDir, "logs"), filepath.Join(fp.oramaDir, "tls-cache"), filepath.Join(fp.oramaDir, "backups"), @@ -65,6 +66,7 @@ func (fp *FilesystemProvisioner) EnsureDirectoryStructure() error { "ipfs.log", "ipfs-cluster.log", "node.log", + "vault.log", "anyone-client.log", } diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 0070cb9..2e47f66 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -214,6 +214,43 @@ WantedBy=multi-user.target `, ssg.oramaHome, ssg.oramaDir, configFile, logFile) } +// GenerateVaultService generates the Orama Vault Guardian systemd unit. +// The vault guardian runs on every node, storing Shamir secret shares. +// It binds to the WireGuard overlay only (no public exposure). +func (ssg *SystemdServiceGenerator) GenerateVaultService() string { + logFile := filepath.Join(ssg.oramaDir, "logs", "vault.log") + dataDir := filepath.Join(ssg.oramaDir, "data", "vault") + + return fmt.Sprintf(`[Unit] +Description=Orama Vault Guardian +After=network-online.target wg-quick@wg0.service +Wants=network-online.target +Requires=wg-quick@wg0.service +PartOf=orama-node.service + +[Service] +Type=simple +ExecStart=%[1]s/bin/vault-guardian --config %[2]s/vault.yaml +Restart=on-failure +RestartSec=5 +StandardOutput=append:%[3]s +StandardError=append:%[3]s +SyslogIdentifier=orama-vault + +PrivateTmp=yes +ProtectSystem=strict +ReadWritePaths=%[2]s +NoNewPrivileges=yes +LimitMEMLOCK=67108864 +MemoryMax=512M +TimeoutStopSec=30 +KillMode=mixed + +[Install] +WantedBy=multi-user.target +`, ssg.oramaHome, dataDir, logFile) +} + // GenerateGatewayService generates the Orama Gateway systemd unit func (ssg *SystemdServiceGenerator) GenerateGatewayService() string { logFile := filepath.Join(ssg.oramaDir, "logs", "gateway.log") diff --git a/pkg/gateway/gateway.go b/pkg/gateway/gateway.go index c597343..d7088c3 100644 --- a/pkg/gateway/gateway.go +++ b/pkg/gateway/gateway.go @@ -31,6 +31,7 @@ import ( serverlesshandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/serverless" joinhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/join" webrtchandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/webrtc" + vaulthandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/vault" wireguardhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/wireguard" sqlitehandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/sqlite" "github.com/DeBrosOfficial/network/pkg/gateway/handlers/storage" @@ -162,6 +163,9 @@ type Gateway struct { // Shared HTTP transport for proxy connections (connection pooling) proxyTransport *http.Transport + // Vault proxy handlers + vaultHandlers *vaulthandlers.Handlers + // Namespace health state (local service probes + hourly reconciliation) nsHealth *namespaceHealthState } @@ -395,6 +399,7 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { if deps.ORMClient != nil { gw.wireguardHandler = wireguardhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.ClusterSecret) gw.joinHandler = joinhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.DataDir) + gw.vaultHandlers = vaulthandlers.NewHandlers(logger, deps.Client) } // Initialize deployment system diff --git a/pkg/gateway/handlers/vault/handlers.go b/pkg/gateway/handlers/vault/handlers.go new file mode 100644 index 0000000..ec80dcb --- /dev/null +++ b/pkg/gateway/handlers/vault/handlers.go @@ -0,0 +1,132 @@ +// Package vault provides HTTP handlers for vault proxy operations. +// +// The gateway acts as a smart proxy between RootWallet clients and +// vault guardian nodes on the WireGuard overlay network. It handles +// Shamir split/combine so clients make a single HTTPS call. +package vault + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/logging" +) + +const ( + // VaultGuardianPort is the port vault guardians listen on (client API). + VaultGuardianPort = 7500 + + // guardianTimeout is the per-guardian HTTP request timeout. + guardianTimeout = 5 * time.Second + + // overallTimeout is the maximum time for the full fan-out operation. + overallTimeout = 15 * time.Second + + // maxPushBodySize limits push request bodies (1 MiB). + maxPushBodySize = 1 << 20 + + // maxPullBodySize limits pull request bodies (4 KiB). + maxPullBodySize = 4 << 10 +) + +// Handlers provides HTTP handlers for vault proxy operations. +type Handlers struct { + logger *logging.ColoredLogger + dbClient client.NetworkClient + rateLimiter *IdentityRateLimiter + httpClient *http.Client +} + +// NewHandlers creates vault proxy handlers. +func NewHandlers(logger *logging.ColoredLogger, dbClient client.NetworkClient) *Handlers { + h := &Handlers{ + logger: logger, + dbClient: dbClient, + rateLimiter: NewIdentityRateLimiter( + 30, // 30 pushes per hour per identity + 120, // 120 pulls per hour per identity + ), + httpClient: &http.Client{ + Timeout: guardianTimeout, + Transport: &http.Transport{ + MaxIdleConns: 100, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + }, + }, + } + h.rateLimiter.StartCleanup(10*time.Minute, 1*time.Hour) + return h +} + +// guardian represents a reachable vault guardian node. +type guardian struct { + IP string + Port int +} + +// discoverGuardians queries dns_nodes for all active nodes. +// Every Orama node runs a vault guardian, so every active node is a guardian. +func (h *Handlers) discoverGuardians(ctx context.Context) ([]guardian, error) { + db := h.dbClient.Database() + internalCtx := client.WithInternalAuth(ctx) + + query := "SELECT COALESCE(internal_ip, ip_address) FROM dns_nodes WHERE status = 'active'" + result, err := db.Query(internalCtx, query) + if err != nil { + return nil, fmt.Errorf("vault: failed to query guardian nodes: %w", err) + } + if result == nil || len(result.Rows) == 0 { + return nil, fmt.Errorf("vault: no active guardian nodes found") + } + + guardians := make([]guardian, 0, len(result.Rows)) + for _, row := range result.Rows { + if len(row) == 0 { + continue + } + ip := getString(row[0]) + if ip == "" { + continue + } + guardians = append(guardians, guardian{IP: ip, Port: VaultGuardianPort}) + } + if len(guardians) == 0 { + return nil, fmt.Errorf("vault: no guardian nodes with valid IPs found") + } + return guardians, nil +} + +func writeJSON(w http.ResponseWriter, status int, v interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + json.NewEncoder(w).Encode(v) +} + +func writeError(w http.ResponseWriter, status int, msg string) { + writeJSON(w, status, map[string]string{"error": msg}) +} + +func getString(v interface{}) string { + if s, ok := v.(string); ok { + return s + } + return "" +} + +// isValidIdentity checks that identity is exactly 64 hex characters. +func isValidIdentity(identity string) bool { + if len(identity) != 64 { + return false + } + for _, c := range identity { + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + return false + } + } + return true +} diff --git a/pkg/gateway/handlers/vault/health_handler.go b/pkg/gateway/handlers/vault/health_handler.go new file mode 100644 index 0000000..e5dd702 --- /dev/null +++ b/pkg/gateway/handlers/vault/health_handler.go @@ -0,0 +1,116 @@ +package vault + +import ( + "context" + "fmt" + "io" + "net/http" + "sync" + "sync/atomic" + + "github.com/DeBrosOfficial/network/pkg/shamir" +) + +// HealthResponse is returned for GET /v1/vault/health. +type HealthResponse struct { + Status string `json:"status"` // "healthy", "degraded", "unavailable" +} + +// StatusResponse is returned for GET /v1/vault/status. +type StatusResponse struct { + Guardians int `json:"guardians"` // Total guardian nodes + Healthy int `json:"healthy"` // Reachable guardians + Threshold int `json:"threshold"` // Read quorum (K) + WriteQuorum int `json:"write_quorum"` // Write quorum (W) +} + +// HandleHealth processes GET /v1/vault/health. +func (h *Handlers) HandleHealth(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + guardians, err := h.discoverGuardians(r.Context()) + if err != nil { + writeJSON(w, http.StatusOK, HealthResponse{Status: "unavailable"}) + return + } + + n := len(guardians) + healthy := h.probeGuardians(r.Context(), guardians) + + k := shamir.AdaptiveThreshold(n) + wq := shamir.WriteQuorum(n) + + status := "healthy" + if healthy < wq { + if healthy >= k { + status = "degraded" + } else { + status = "unavailable" + } + } + + writeJSON(w, http.StatusOK, HealthResponse{Status: status}) +} + +// HandleStatus processes GET /v1/vault/status. +func (h *Handlers) HandleStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + guardians, err := h.discoverGuardians(r.Context()) + if err != nil { + writeJSON(w, http.StatusOK, StatusResponse{}) + return + } + + n := len(guardians) + healthy := h.probeGuardians(r.Context(), guardians) + + writeJSON(w, http.StatusOK, StatusResponse{ + Guardians: n, + Healthy: healthy, + Threshold: shamir.AdaptiveThreshold(n), + WriteQuorum: shamir.WriteQuorum(n), + }) +} + +// probeGuardians checks health of all guardians in parallel and returns the healthy count. +func (h *Handlers) probeGuardians(ctx context.Context, guardians []guardian) int { + ctx, cancel := context.WithTimeout(ctx, guardianTimeout) + defer cancel() + + var healthyCount atomic.Int32 + var wg sync.WaitGroup + wg.Add(len(guardians)) + + for _, g := range guardians { + go func(gd guardian) { + defer wg.Done() + + url := fmt.Sprintf("http://%s:%d/v1/vault/health", gd.IP, gd.Port) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return + } + + resp, err := h.httpClient.Do(req) + if err != nil { + return + } + defer resp.Body.Close() + io.Copy(io.Discard, resp.Body) + + if resp.StatusCode >= 200 && resp.StatusCode < 300 { + healthyCount.Add(1) + } + }(g) + } + + wg.Wait() + return int(healthyCount.Load()) +} diff --git a/pkg/gateway/handlers/vault/pull_handler.go b/pkg/gateway/handlers/vault/pull_handler.go new file mode 100644 index 0000000..2164487 --- /dev/null +++ b/pkg/gateway/handlers/vault/pull_handler.go @@ -0,0 +1,183 @@ +package vault + +import ( + "bytes" + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "net/http" + "sync" + + "github.com/DeBrosOfficial/network/pkg/logging" + "github.com/DeBrosOfficial/network/pkg/shamir" + "go.uber.org/zap" +) + +// PullRequest is the client-facing request body. +type PullRequest struct { + Identity string `json:"identity"` // 64 hex chars +} + +// PullResponse is returned to the client. +type PullResponse struct { + Envelope string `json:"envelope"` // base64-encoded reconstructed envelope + Collected int `json:"collected"` // Number of shares collected + Threshold int `json:"threshold"` // K threshold used +} + +// guardianPullRequest is sent to each vault guardian. +type guardianPullRequest struct { + Identity string `json:"identity"` +} + +// guardianPullResponse is the response from a guardian. +type guardianPullResponse struct { + Share string `json:"share"` // base64([x:1byte][y:rest]) +} + +// HandlePull processes POST /v1/vault/pull. +func (h *Handlers) HandlePull(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, maxPullBodySize)) + if err != nil { + writeError(w, http.StatusBadRequest, "failed to read request body") + return + } + + var req PullRequest + if err := json.Unmarshal(body, &req); err != nil { + writeError(w, http.StatusBadRequest, "invalid JSON") + return + } + + if !isValidIdentity(req.Identity) { + writeError(w, http.StatusBadRequest, "identity must be 64 hex characters") + return + } + + if !h.rateLimiter.AllowPull(req.Identity) { + w.Header().Set("Retry-After", "30") + writeError(w, http.StatusTooManyRequests, "pull rate limit exceeded for this identity") + return + } + + guardians, err := h.discoverGuardians(r.Context()) + if err != nil { + h.logger.ComponentError(logging.ComponentGeneral, "Vault pull: guardian discovery failed", zap.Error(err)) + writeError(w, http.StatusServiceUnavailable, "no guardian nodes available") + return + } + + n := len(guardians) + k := shamir.AdaptiveThreshold(n) + + // Fan out pull requests to all guardians. + ctx, cancel := context.WithTimeout(r.Context(), overallTimeout) + defer cancel() + + type shareResult struct { + share shamir.Share + ok bool + } + + results := make([]shareResult, n) + var wg sync.WaitGroup + wg.Add(n) + + for i, g := range guardians { + go func(idx int, gd guardian) { + defer wg.Done() + + guardianReq := guardianPullRequest{Identity: req.Identity} + reqBody, _ := json.Marshal(guardianReq) + + url := fmt.Sprintf("http://%s:%d/v1/vault/pull", gd.IP, gd.Port) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(reqBody)) + if err != nil { + return + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := h.httpClient.Do(httpReq) + if err != nil { + return + } + defer resp.Body.Close() + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + io.Copy(io.Discard, resp.Body) + return + } + + var pullResp guardianPullResponse + if err := json.NewDecoder(resp.Body).Decode(&pullResp); err != nil { + return + } + + shareBytes, err := base64.StdEncoding.DecodeString(pullResp.Share) + if err != nil || len(shareBytes) < 2 { + return + } + + results[idx] = shareResult{ + share: shamir.Share{ + X: shareBytes[0], + Y: shareBytes[1:], + }, + ok: true, + } + }(i, g) + } + + wg.Wait() + + // Collect successful shares. + shares := make([]shamir.Share, 0, n) + for _, r := range results { + if r.ok { + shares = append(shares, r.share) + } + } + + if len(shares) < k { + h.logger.ComponentError(logging.ComponentGeneral, "Vault pull: not enough shares", + zap.Int("collected", len(shares)), zap.Int("total", n), zap.Int("threshold", k)) + writeError(w, http.StatusServiceUnavailable, + fmt.Sprintf("not enough shares: collected %d of %d required (contacted %d guardians)", len(shares), k, n)) + return + } + + // Shamir combine to reconstruct envelope. + envelope, err := shamir.Combine(shares[:k]) + if err != nil { + h.logger.ComponentError(logging.ComponentGeneral, "Vault pull: Shamir combine failed", zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to reconstruct envelope") + return + } + + // Wipe collected shares. + for i := range shares { + for j := range shares[i].Y { + shares[i].Y[j] = 0 + } + } + + envelopeB64 := base64.StdEncoding.EncodeToString(envelope) + + // Wipe envelope. + for i := range envelope { + envelope[i] = 0 + } + + writeJSON(w, http.StatusOK, PullResponse{ + Envelope: envelopeB64, + Collected: len(shares), + Threshold: k, + }) +} diff --git a/pkg/gateway/handlers/vault/push_handler.go b/pkg/gateway/handlers/vault/push_handler.go new file mode 100644 index 0000000..b3e729d --- /dev/null +++ b/pkg/gateway/handlers/vault/push_handler.go @@ -0,0 +1,168 @@ +package vault + +import ( + "bytes" + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "net/http" + "sync" + "sync/atomic" + + "github.com/DeBrosOfficial/network/pkg/logging" + "github.com/DeBrosOfficial/network/pkg/shamir" + "go.uber.org/zap" +) + +// PushRequest is the client-facing request body. +type PushRequest struct { + Identity string `json:"identity"` // 64 hex chars (SHA-256) + Envelope string `json:"envelope"` // base64-encoded encrypted envelope + Version uint64 `json:"version"` // Anti-rollback version counter +} + +// PushResponse is returned to the client. +type PushResponse struct { + Status string `json:"status"` // "ok" or "partial" + AckCount int `json:"ack_count"` + Total int `json:"total"` + Quorum int `json:"quorum"` + Threshold int `json:"threshold"` +} + +// guardianPushRequest is sent to each vault guardian. +type guardianPushRequest struct { + Identity string `json:"identity"` + Share string `json:"share"` // base64([x:1byte][y:rest]) + Version uint64 `json:"version"` +} + +// HandlePush processes POST /v1/vault/push. +func (h *Handlers) HandlePush(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + + body, err := io.ReadAll(io.LimitReader(r.Body, maxPushBodySize)) + if err != nil { + writeError(w, http.StatusBadRequest, "failed to read request body") + return + } + + var req PushRequest + if err := json.Unmarshal(body, &req); err != nil { + writeError(w, http.StatusBadRequest, "invalid JSON") + return + } + + if !isValidIdentity(req.Identity) { + writeError(w, http.StatusBadRequest, "identity must be 64 hex characters") + return + } + + envelopeBytes, err := base64.StdEncoding.DecodeString(req.Envelope) + if err != nil { + writeError(w, http.StatusBadRequest, "invalid base64 envelope") + return + } + if len(envelopeBytes) == 0 { + writeError(w, http.StatusBadRequest, "envelope must not be empty") + return + } + + if !h.rateLimiter.AllowPush(req.Identity) { + w.Header().Set("Retry-After", "120") + writeError(w, http.StatusTooManyRequests, "push rate limit exceeded for this identity") + return + } + + guardians, err := h.discoverGuardians(r.Context()) + if err != nil { + h.logger.ComponentError(logging.ComponentGeneral, "Vault push: guardian discovery failed", zap.Error(err)) + writeError(w, http.StatusServiceUnavailable, "no guardian nodes available") + return + } + + n := len(guardians) + k := shamir.AdaptiveThreshold(n) + quorum := shamir.WriteQuorum(n) + + shares, err := shamir.Split(envelopeBytes, n, k) + if err != nil { + h.logger.ComponentError(logging.ComponentGeneral, "Vault push: Shamir split failed", zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to split envelope") + return + } + + // Fan out to guardians in parallel. + ctx, cancel := context.WithTimeout(r.Context(), overallTimeout) + defer cancel() + + var ackCount atomic.Int32 + var wg sync.WaitGroup + wg.Add(n) + + for i, g := range guardians { + go func(idx int, gd guardian) { + defer wg.Done() + + share := shares[idx] + // Serialize: [x:1byte][y:rest] + shareBytes := make([]byte, 1+len(share.Y)) + shareBytes[0] = share.X + copy(shareBytes[1:], share.Y) + shareB64 := base64.StdEncoding.EncodeToString(shareBytes) + + guardianReq := guardianPushRequest{ + Identity: req.Identity, + Share: shareB64, + Version: req.Version, + } + reqBody, _ := json.Marshal(guardianReq) + + url := fmt.Sprintf("http://%s:%d/v1/vault/push", gd.IP, gd.Port) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(reqBody)) + if err != nil { + return + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := h.httpClient.Do(httpReq) + if err != nil { + return + } + defer resp.Body.Close() + io.Copy(io.Discard, resp.Body) + + if resp.StatusCode >= 200 && resp.StatusCode < 300 { + ackCount.Add(1) + } + }(i, g) + } + + wg.Wait() + + // Wipe share data. + for i := range shares { + for j := range shares[i].Y { + shares[i].Y[j] = 0 + } + } + + ack := int(ackCount.Load()) + status := "ok" + if ack < quorum { + status = "partial" + } + + writeJSON(w, http.StatusOK, PushResponse{ + Status: status, + AckCount: ack, + Total: n, + Quorum: quorum, + Threshold: k, + }) +} diff --git a/pkg/gateway/handlers/vault/rate_limiter.go b/pkg/gateway/handlers/vault/rate_limiter.go new file mode 100644 index 0000000..9a69821 --- /dev/null +++ b/pkg/gateway/handlers/vault/rate_limiter.go @@ -0,0 +1,120 @@ +package vault + +import ( + "sync" + "time" +) + +// IdentityRateLimiter provides per-identity-hash rate limiting for vault operations. +// Push and pull have separate rate limits since push is more expensive. +type IdentityRateLimiter struct { + pushBuckets sync.Map // identity -> *tokenBucket + pullBuckets sync.Map // identity -> *tokenBucket + pushRate float64 // tokens per second + pushBurst int + pullRate float64 // tokens per second + pullBurst int + stopCh chan struct{} +} + +type tokenBucket struct { + mu sync.Mutex + tokens float64 + lastCheck time.Time +} + +// NewIdentityRateLimiter creates a per-identity rate limiter. +// pushPerHour and pullPerHour are sustained rates; burst is 1/6th of the hourly rate. +func NewIdentityRateLimiter(pushPerHour, pullPerHour int) *IdentityRateLimiter { + pushBurst := pushPerHour / 6 + if pushBurst < 1 { + pushBurst = 1 + } + pullBurst := pullPerHour / 6 + if pullBurst < 1 { + pullBurst = 1 + } + return &IdentityRateLimiter{ + pushRate: float64(pushPerHour) / 3600.0, + pushBurst: pushBurst, + pullRate: float64(pullPerHour) / 3600.0, + pullBurst: pullBurst, + } +} + +// AllowPush checks if a push for this identity is allowed. +func (rl *IdentityRateLimiter) AllowPush(identity string) bool { + return rl.allow(&rl.pushBuckets, identity, rl.pushRate, rl.pushBurst) +} + +// AllowPull checks if a pull for this identity is allowed. +func (rl *IdentityRateLimiter) AllowPull(identity string) bool { + return rl.allow(&rl.pullBuckets, identity, rl.pullRate, rl.pullBurst) +} + +func (rl *IdentityRateLimiter) allow(buckets *sync.Map, identity string, rate float64, burst int) bool { + val, _ := buckets.LoadOrStore(identity, &tokenBucket{ + tokens: float64(burst), + lastCheck: time.Now(), + }) + b := val.(*tokenBucket) + + b.mu.Lock() + defer b.mu.Unlock() + + now := time.Now() + elapsed := now.Sub(b.lastCheck).Seconds() + b.tokens += elapsed * rate + if b.tokens > float64(burst) { + b.tokens = float64(burst) + } + b.lastCheck = now + + if b.tokens >= 1 { + b.tokens-- + return true + } + return false +} + +// StartCleanup runs periodic cleanup of stale identity entries. +func (rl *IdentityRateLimiter) StartCleanup(interval, maxAge time.Duration) { + rl.stopCh = make(chan struct{}) + go func() { + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ticker.C: + rl.cleanup(maxAge) + case <-rl.stopCh: + return + } + } + }() +} + +// Stop terminates the background cleanup goroutine. +func (rl *IdentityRateLimiter) Stop() { + if rl.stopCh != nil { + close(rl.stopCh) + } +} + +func (rl *IdentityRateLimiter) cleanup(maxAge time.Duration) { + cutoff := time.Now().Add(-maxAge) + cleanMap := func(m *sync.Map) { + m.Range(func(key, value interface{}) bool { + b := value.(*tokenBucket) + b.mu.Lock() + stale := b.lastCheck.Before(cutoff) + b.mu.Unlock() + if stale { + m.Delete(key) + } + return true + }) + } + cleanMap(&rl.pushBuckets) + cleanMap(&rl.pullBuckets) +} diff --git a/pkg/gateway/middleware.go b/pkg/gateway/middleware.go index 8e25840..65567ae 100644 --- a/pkg/gateway/middleware.go +++ b/pkg/gateway/middleware.go @@ -417,6 +417,11 @@ func isPublicPath(p string) bool { return true } + // Vault proxy endpoints (no auth — rate-limited per identity hash within handler) + if strings.HasPrefix(p, "/v1/vault/") { + return true + } + // Phantom auth endpoints are public (session creation, status polling, completion) if strings.HasPrefix(p, "/v1/auth/phantom/") { return true diff --git a/pkg/gateway/routes.go b/pkg/gateway/routes.go index 4e49910..a791eda 100644 --- a/pkg/gateway/routes.go +++ b/pkg/gateway/routes.go @@ -114,6 +114,14 @@ func (g *Gateway) Routes() http.Handler { mux.HandleFunc("/v1/pubsub/presence", g.pubsubHandlers.PresenceHandler) } + // vault proxy (public, rate-limited per identity within handler) + if g.vaultHandlers != nil { + mux.HandleFunc("/v1/vault/push", g.vaultHandlers.HandlePush) + mux.HandleFunc("/v1/vault/pull", g.vaultHandlers.HandlePull) + mux.HandleFunc("/v1/vault/health", g.vaultHandlers.HandleHealth) + mux.HandleFunc("/v1/vault/status", g.vaultHandlers.HandleStatus) + } + // webrtc if g.webrtcHandlers != nil { mux.HandleFunc("/v1/webrtc/turn/credentials", g.webrtcHandlers.CredentialsHandler) diff --git a/pkg/shamir/field.go b/pkg/shamir/field.go new file mode 100644 index 0000000..2dd4d97 --- /dev/null +++ b/pkg/shamir/field.go @@ -0,0 +1,82 @@ +// Package shamir implements Shamir's Secret Sharing over GF(2^8). +// +// Uses the AES irreducible polynomial x^8 + x^4 + x^3 + x + 1 (0x11B) +// with generator 3. Precomputed log/exp tables for O(1) field arithmetic. +// +// Cross-platform compatible with the Zig (orama-vault) and TypeScript +// (network-ts-sdk) implementations using identical field parameters. +package shamir + +import "errors" + +// ErrDivisionByZero is returned when dividing by zero in GF(2^8). +var ErrDivisionByZero = errors.New("shamir: division by zero in GF(2^8)") + +// Irreducible polynomial: x^8 + x^4 + x^3 + x + 1. +const irreducible = 0x11B + +// expTable[i] = generator^i mod polynomial, for i in 0..511. +// Extended to 512 entries so Mul can use (logA + logB) without modular reduction. +var expTable [512]byte + +// logTable[a] = i where generator^i = a, for a in 1..255. +// logTable[0] is unused (log of zero is undefined). +var logTable [256]byte + +func init() { + x := uint16(1) + for i := 0; i < 512; i++ { + if i < 256 { + expTable[i] = byte(x) + logTable[byte(x)] = byte(i) + } else { + expTable[i] = expTable[i-255] + } + + if i < 255 { + // Multiply by generator (3): x*3 = x*2 XOR x + x2 := x << 1 + x3 := x2 ^ x + if x3&0x100 != 0 { + x3 ^= irreducible + } + x = x3 + } + } +} + +// Add returns a XOR b (addition in GF(2^8)). +func Add(a, b byte) byte { + return a ^ b +} + +// Mul returns a * b in GF(2^8) via log/exp tables. +func Mul(a, b byte) byte { + if a == 0 || b == 0 { + return 0 + } + logSum := uint16(logTable[a]) + uint16(logTable[b]) + return expTable[logSum] +} + +// Inv returns the multiplicative inverse of a in GF(2^8). +// Returns ErrDivisionByZero if a == 0. +func Inv(a byte) (byte, error) { + if a == 0 { + return 0, ErrDivisionByZero + } + return expTable[255-uint16(logTable[a])], nil +} + +// Div returns a / b in GF(2^8). +// Returns ErrDivisionByZero if b == 0. +func Div(a, b byte) (byte, error) { + if b == 0 { + return 0, ErrDivisionByZero + } + if a == 0 { + return 0, nil + } + logDiff := uint16(logTable[a]) + 255 - uint16(logTable[b]) + return expTable[logDiff], nil +} diff --git a/pkg/shamir/shamir.go b/pkg/shamir/shamir.go new file mode 100644 index 0000000..0ba260a --- /dev/null +++ b/pkg/shamir/shamir.go @@ -0,0 +1,150 @@ +package shamir + +import ( + "crypto/rand" + "errors" + "fmt" +) + +var ( + ErrThresholdTooSmall = errors.New("shamir: threshold K must be at least 2") + ErrShareCountTooSmall = errors.New("shamir: share count N must be >= threshold K") + ErrTooManyShares = errors.New("shamir: maximum 255 shares (GF(2^8) limit)") + ErrEmptySecret = errors.New("shamir: secret must not be empty") + ErrNotEnoughShares = errors.New("shamir: need at least 2 shares to reconstruct") + ErrMismatchedShareLen = errors.New("shamir: all shares must have the same data length") + ErrZeroShareIndex = errors.New("shamir: share index must not be 0") + ErrDuplicateShareIndex = errors.New("shamir: duplicate share indices") +) + +// Share represents a single Shamir share. +type Share struct { + X byte // Evaluation point (1..255, never 0) + Y []byte // Share data (same length as original secret) +} + +// Split divides secret into n shares with threshold k. +// Any k shares can reconstruct the secret; k-1 reveal nothing. +func Split(secret []byte, n, k int) ([]Share, error) { + if k < 2 { + return nil, ErrThresholdTooSmall + } + if n < k { + return nil, ErrShareCountTooSmall + } + if n > 255 { + return nil, ErrTooManyShares + } + if len(secret) == 0 { + return nil, ErrEmptySecret + } + + shares := make([]Share, n) + for i := range shares { + shares[i] = Share{ + X: byte(i + 1), + Y: make([]byte, len(secret)), + } + } + + // Temporary buffer for polynomial coefficients. + coeffs := make([]byte, k) + defer func() { + for i := range coeffs { + coeffs[i] = 0 + } + }() + + for byteIdx := 0; byteIdx < len(secret); byteIdx++ { + coeffs[0] = secret[byteIdx] + // Fill degrees 1..k-1 with random bytes. + if _, err := rand.Read(coeffs[1:]); err != nil { + return nil, fmt.Errorf("shamir: random generation failed: %w", err) + } + for i := range shares { + shares[i].Y[byteIdx] = evaluatePolynomial(coeffs, shares[i].X) + } + } + + return shares, nil +} + +// Combine reconstructs the secret from k or more shares via Lagrange interpolation. +func Combine(shares []Share) ([]byte, error) { + if len(shares) < 2 { + return nil, ErrNotEnoughShares + } + + secretLen := len(shares[0].Y) + seen := make(map[byte]bool, len(shares)) + for _, s := range shares { + if s.X == 0 { + return nil, ErrZeroShareIndex + } + if len(s.Y) != secretLen { + return nil, ErrMismatchedShareLen + } + if seen[s.X] { + return nil, ErrDuplicateShareIndex + } + seen[s.X] = true + } + + result := make([]byte, secretLen) + for byteIdx := 0; byteIdx < secretLen; byteIdx++ { + var value byte + for i, si := range shares { + // Lagrange basis polynomial L_i evaluated at 0: + // L_i(0) = product over j!=i of (0 - x_j)/(x_i - x_j) + // = product over j!=i of x_j / (x_i XOR x_j) + var basis byte = 1 + for j, sj := range shares { + if i == j { + continue + } + num := sj.X + den := Add(si.X, sj.X) // x_i - x_j = x_i XOR x_j in GF(2^8) + d, err := Div(num, den) + if err != nil { + return nil, err + } + basis = Mul(basis, d) + } + value = Add(value, Mul(si.Y[byteIdx], basis)) + } + result[byteIdx] = value + } + + return result, nil +} + +// AdaptiveThreshold returns max(3, floor(n/3)). +// This is the read quorum: minimum shares needed to reconstruct. +func AdaptiveThreshold(n int) int { + t := n / 3 + if t < 3 { + return 3 + } + return t +} + +// WriteQuorum returns ceil(2n/3). +// This is the write quorum: minimum ACKs needed for a successful push. +func WriteQuorum(n int) int { + if n == 0 { + return 0 + } + if n <= 2 { + return n + } + return (2*n + 2) / 3 +} + +// evaluatePolynomial evaluates p(x) = coeffs[0] + coeffs[1]*x + ... using Horner's method. +func evaluatePolynomial(coeffs []byte, x byte) byte { + var result byte + for i := len(coeffs) - 1; i >= 0; i-- { + result = Add(Mul(result, x), coeffs[i]) + } + return result +} diff --git a/pkg/shamir/shamir_test.go b/pkg/shamir/shamir_test.go new file mode 100644 index 0000000..2e57cc9 --- /dev/null +++ b/pkg/shamir/shamir_test.go @@ -0,0 +1,501 @@ +package shamir + +import ( + "testing" +) + +// ── GF(2^8) Field Tests ──────────────────────────────────────────────────── + +func TestExpTable_Cycle(t *testing.T) { + // g^0 = 1, g^255 = 1 (cyclic group of order 255) + if expTable[0] != 1 { + t.Errorf("exp[0] = %d, want 1", expTable[0]) + } + if expTable[255] != 1 { + t.Errorf("exp[255] = %d, want 1", expTable[255]) + } +} + +func TestExpTable_AllNonzeroAppear(t *testing.T) { + var seen [256]bool + for i := 0; i < 255; i++ { + v := expTable[i] + if seen[v] { + t.Fatalf("duplicate value %d at index %d", v, i) + } + seen[v] = true + } + for v := 1; v < 256; v++ { + if !seen[v] { + t.Errorf("value %d not seen in exp[0..255]", v) + } + } + if seen[0] { + t.Error("zero should not appear in exp[0..254]") + } +} + +// Cross-platform test vectors from orama-vault/src/sss/test_cross_platform.zig +func TestExpTable_CrossPlatform(t *testing.T) { + vectors := [][2]int{ + {0, 1}, {10, 114}, {20, 216}, {30, 102}, + {40, 106}, {50, 4}, {60, 211}, {70, 77}, + {80, 131}, {90, 179}, {100, 16}, {110, 97}, + {120, 47}, {130, 58}, {140, 250}, {150, 64}, + {160, 159}, {170, 188}, {180, 232}, {190, 197}, + {200, 27}, {210, 74}, {220, 198}, {230, 141}, + {240, 57}, {250, 108}, {254, 246}, {255, 1}, + } + for _, v := range vectors { + if got := expTable[v[0]]; got != byte(v[1]) { + t.Errorf("exp[%d] = %d, want %d", v[0], got, v[1]) + } + } +} + +func TestMul_CrossPlatform(t *testing.T) { + vectors := [][3]byte{ + {1, 1, 1}, {1, 2, 2}, {1, 3, 3}, + {1, 42, 42}, {1, 127, 127}, {1, 170, 170}, {1, 255, 255}, + {2, 1, 2}, {2, 2, 4}, {2, 3, 6}, + {2, 42, 84}, {2, 127, 254}, {2, 170, 79}, {2, 255, 229}, + {3, 1, 3}, {3, 2, 6}, {3, 3, 5}, + {3, 42, 126}, {3, 127, 129}, {3, 170, 229}, {3, 255, 26}, + {42, 1, 42}, {42, 2, 84}, {42, 3, 126}, + {42, 42, 40}, {42, 127, 82}, {42, 170, 244}, {42, 255, 142}, + {127, 1, 127}, {127, 2, 254}, {127, 3, 129}, + {127, 42, 82}, {127, 127, 137}, {127, 170, 173}, {127, 255, 118}, + {170, 1, 170}, {170, 2, 79}, {170, 3, 229}, + {170, 42, 244}, {170, 127, 173}, {170, 170, 178}, {170, 255, 235}, + {255, 1, 255}, {255, 2, 229}, {255, 3, 26}, + {255, 42, 142}, {255, 127, 118}, {255, 170, 235}, {255, 255, 19}, + } + for _, v := range vectors { + if got := Mul(v[0], v[1]); got != v[2] { + t.Errorf("Mul(%d, %d) = %d, want %d", v[0], v[1], got, v[2]) + } + } +} + +func TestMul_Zero(t *testing.T) { + for a := 0; a < 256; a++ { + if Mul(byte(a), 0) != 0 { + t.Errorf("Mul(%d, 0) != 0", a) + } + if Mul(0, byte(a)) != 0 { + t.Errorf("Mul(0, %d) != 0", a) + } + } +} + +func TestMul_Identity(t *testing.T) { + for a := 0; a < 256; a++ { + if Mul(byte(a), 1) != byte(a) { + t.Errorf("Mul(%d, 1) = %d", a, Mul(byte(a), 1)) + } + } +} + +func TestMul_Commutative(t *testing.T) { + for a := 1; a < 256; a += 7 { + for b := 1; b < 256; b += 11 { + ab := Mul(byte(a), byte(b)) + ba := Mul(byte(b), byte(a)) + if ab != ba { + t.Errorf("Mul(%d,%d)=%d != Mul(%d,%d)=%d", a, b, ab, b, a, ba) + } + } + } +} + +func TestInv_CrossPlatform(t *testing.T) { + vectors := [][2]byte{ + {1, 1}, {2, 141}, {3, 246}, {5, 82}, + {7, 209}, {16, 116}, {42, 152}, {127, 130}, + {128, 131}, {170, 18}, {200, 169}, {255, 28}, + } + for _, v := range vectors { + got, err := Inv(v[0]) + if err != nil { + t.Errorf("Inv(%d) returned error: %v", v[0], err) + continue + } + if got != v[1] { + t.Errorf("Inv(%d) = %d, want %d", v[0], got, v[1]) + } + } +} + +func TestInv_SelfInverse(t *testing.T) { + for a := 1; a < 256; a++ { + inv1, _ := Inv(byte(a)) + inv2, _ := Inv(inv1) + if inv2 != byte(a) { + t.Errorf("Inv(Inv(%d)) = %d, want %d", a, inv2, a) + } + } +} + +func TestInv_Product(t *testing.T) { + for a := 1; a < 256; a++ { + inv1, _ := Inv(byte(a)) + if Mul(byte(a), inv1) != 1 { + t.Errorf("Mul(%d, Inv(%d)) != 1", a, a) + } + } +} + +func TestInv_Zero(t *testing.T) { + _, err := Inv(0) + if err != ErrDivisionByZero { + t.Errorf("Inv(0) should return ErrDivisionByZero, got %v", err) + } +} + +func TestDiv_CrossPlatform(t *testing.T) { + vectors := [][3]byte{ + {1, 1, 1}, {1, 2, 141}, {1, 3, 246}, + {1, 42, 152}, {1, 127, 130}, {1, 170, 18}, {1, 255, 28}, + {2, 1, 2}, {2, 2, 1}, {2, 3, 247}, + {3, 1, 3}, {3, 2, 140}, {3, 3, 1}, + {42, 1, 42}, {42, 2, 21}, {42, 42, 1}, + {127, 1, 127}, {127, 127, 1}, + {170, 1, 170}, {170, 170, 1}, + {255, 1, 255}, {255, 255, 1}, + } + for _, v := range vectors { + got, err := Div(v[0], v[1]) + if err != nil { + t.Errorf("Div(%d, %d) returned error: %v", v[0], v[1], err) + continue + } + if got != v[2] { + t.Errorf("Div(%d, %d) = %d, want %d", v[0], v[1], got, v[2]) + } + } +} + +func TestDiv_ByZero(t *testing.T) { + _, err := Div(42, 0) + if err != ErrDivisionByZero { + t.Errorf("Div(42, 0) should return ErrDivisionByZero, got %v", err) + } +} + +// ── Polynomial evaluation ────────────────────────────────────────────────── + +func TestEvaluatePolynomial_CrossPlatform(t *testing.T) { + // p(x) = 42 + 5x + 7x^2 + coeffs0 := []byte{42, 5, 7} + vectors0 := [][2]byte{ + {1, 40}, {2, 60}, {3, 62}, {4, 78}, + {5, 76}, {10, 207}, {100, 214}, {255, 125}, + } + for _, v := range vectors0 { + if got := evaluatePolynomial(coeffs0, v[0]); got != v[1] { + t.Errorf("p(%d) = %d, want %d [coeffs: 42,5,7]", v[0], got, v[1]) + } + } + + // p(x) = 0 + 0xAB*x + 0xCD*x^2 + coeffs1 := []byte{0, 0xAB, 0xCD} + vectors1 := [][2]byte{ + {1, 102}, {3, 50}, {5, 152}, {7, 204}, {200, 96}, + } + for _, v := range vectors1 { + if got := evaluatePolynomial(coeffs1, v[0]); got != v[1] { + t.Errorf("p(%d) = %d, want %d [coeffs: 0,AB,CD]", v[0], got, v[1]) + } + } + + // p(x) = 0xFF (constant) + coeffs2 := []byte{0xFF} + for _, x := range []byte{1, 2, 255} { + if got := evaluatePolynomial(coeffs2, x); got != 0xFF { + t.Errorf("constant p(%d) = %d, want 255", x, got) + } + } + + // p(x) = 128 + 64x + 32x^2 + 16x^3 + coeffs3 := []byte{128, 64, 32, 16} + vectors3 := [][2]byte{ + {1, 240}, {2, 0}, {3, 16}, {4, 193}, {5, 234}, + } + for _, v := range vectors3 { + if got := evaluatePolynomial(coeffs3, v[0]); got != v[1] { + t.Errorf("p(%d) = %d, want %d [coeffs: 128,64,32,16]", v[0], got, v[1]) + } + } +} + +// ── Lagrange combine (cross-platform) ───────────────────────────────────── + +func TestCombine_CrossPlatform_SingleByte(t *testing.T) { + // p(x) = 42 + 5x + 7x^2, secret = 42 + // Shares: (1,40) (2,60) (3,62) (4,78) (5,76) + allShares := []Share{ + {X: 1, Y: []byte{40}}, + {X: 2, Y: []byte{60}}, + {X: 3, Y: []byte{62}}, + {X: 4, Y: []byte{78}}, + {X: 5, Y: []byte{76}}, + } + + subsets := [][]int{ + {0, 1, 2}, // {1,2,3} + {0, 2, 4}, // {1,3,5} + {1, 3, 4}, // {2,4,5} + {2, 3, 4}, // {3,4,5} + } + + for _, subset := range subsets { + shares := make([]Share, len(subset)) + for i, idx := range subset { + shares[i] = allShares[idx] + } + result, err := Combine(shares) + if err != nil { + t.Fatalf("Combine failed for subset %v: %v", subset, err) + } + if result[0] != 42 { + t.Errorf("Combine(subset %v) = %d, want 42", subset, result[0]) + } + } +} + +func TestCombine_CrossPlatform_MultiByte(t *testing.T) { + // 2-byte secret [42, 0] + // byte0: 42 + 5x + 7x^2 → shares at x=1,3,5: 40, 62, 76 + // byte1: 0 + 0xAB*x + 0xCD*x^2 → shares at x=1,3,5: 102, 50, 152 + shares := []Share{ + {X: 1, Y: []byte{40, 102}}, + {X: 3, Y: []byte{62, 50}}, + {X: 5, Y: []byte{76, 152}}, + } + result, err := Combine(shares) + if err != nil { + t.Fatalf("Combine failed: %v", err) + } + if result[0] != 42 || result[1] != 0 { + t.Errorf("Combine = %v, want [42, 0]", result) + } +} + +// ── Split/Combine round-trip ────────────────────────────────────────────── + +func TestSplitCombine_RoundTrip_2of3(t *testing.T) { + secret := []byte("hello world") + shares, err := Split(secret, 3, 2) + if err != nil { + t.Fatalf("Split: %v", err) + } + if len(shares) != 3 { + t.Fatalf("got %d shares, want 3", len(shares)) + } + + // Any 2 shares should reconstruct + for i := 0; i < 3; i++ { + for j := i + 1; j < 3; j++ { + result, err := Combine([]Share{shares[i], shares[j]}) + if err != nil { + t.Fatalf("Combine(%d,%d): %v", i, j, err) + } + if string(result) != string(secret) { + t.Errorf("Combine(%d,%d) = %q, want %q", i, j, result, secret) + } + } + } +} + +func TestSplitCombine_RoundTrip_3of5(t *testing.T) { + secret := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} + shares, err := Split(secret, 5, 3) + if err != nil { + t.Fatalf("Split: %v", err) + } + + // All C(5,3)=10 subsets should reconstruct + count := 0 + for i := 0; i < 5; i++ { + for j := i + 1; j < 5; j++ { + for k := j + 1; k < 5; k++ { + result, err := Combine([]Share{shares[i], shares[j], shares[k]}) + if err != nil { + t.Fatalf("Combine(%d,%d,%d): %v", i, j, k, err) + } + for idx := range secret { + if result[idx] != secret[idx] { + t.Errorf("Combine(%d,%d,%d)[%d] = %d, want %d", i, j, k, idx, result[idx], secret[idx]) + } + } + count++ + } + } + } + if count != 10 { + t.Errorf("tested %d subsets, want 10", count) + } +} + +func TestSplitCombine_RoundTrip_LargeSecret(t *testing.T) { + secret := make([]byte, 256) + for i := range secret { + secret[i] = byte(i) + } + shares, err := Split(secret, 10, 5) + if err != nil { + t.Fatalf("Split: %v", err) + } + + // Use first 5 shares + result, err := Combine(shares[:5]) + if err != nil { + t.Fatalf("Combine: %v", err) + } + for i := range secret { + if result[i] != secret[i] { + t.Errorf("result[%d] = %d, want %d", i, result[i], secret[i]) + break + } + } +} + +func TestSplitCombine_AllZeros(t *testing.T) { + secret := make([]byte, 10) + shares, err := Split(secret, 5, 3) + if err != nil { + t.Fatalf("Split: %v", err) + } + result, err := Combine(shares[:3]) + if err != nil { + t.Fatalf("Combine: %v", err) + } + for i, b := range result { + if b != 0 { + t.Errorf("result[%d] = %d, want 0", i, b) + } + } +} + +func TestSplitCombine_AllOnes(t *testing.T) { + secret := make([]byte, 10) + for i := range secret { + secret[i] = 0xFF + } + shares, err := Split(secret, 5, 3) + if err != nil { + t.Fatalf("Split: %v", err) + } + result, err := Combine(shares[:3]) + if err != nil { + t.Fatalf("Combine: %v", err) + } + for i, b := range result { + if b != 0xFF { + t.Errorf("result[%d] = %d, want 255", i, b) + } + } +} + +// ── Share indices ───────────────────────────────────────────────────────── + +func TestSplit_ShareIndices(t *testing.T) { + shares, err := Split([]byte{42}, 5, 3) + if err != nil { + t.Fatalf("Split: %v", err) + } + for i, s := range shares { + if s.X != byte(i+1) { + t.Errorf("shares[%d].X = %d, want %d", i, s.X, i+1) + } + } +} + +// ── Error cases ─────────────────────────────────────────────────────────── + +func TestSplit_Errors(t *testing.T) { + tests := []struct { + name string + secret []byte + n, k int + want error + }{ + {"k < 2", []byte{1}, 3, 1, ErrThresholdTooSmall}, + {"n < k", []byte{1}, 2, 3, ErrShareCountTooSmall}, + {"n > 255", []byte{1}, 256, 3, ErrTooManyShares}, + {"empty secret", []byte{}, 3, 2, ErrEmptySecret}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := Split(tt.secret, tt.n, tt.k) + if err != tt.want { + t.Errorf("Split() error = %v, want %v", err, tt.want) + } + }) + } +} + +func TestCombine_Errors(t *testing.T) { + t.Run("not enough shares", func(t *testing.T) { + _, err := Combine([]Share{{X: 1, Y: []byte{1}}}) + if err != ErrNotEnoughShares { + t.Errorf("got %v, want ErrNotEnoughShares", err) + } + }) + + t.Run("zero index", func(t *testing.T) { + _, err := Combine([]Share{ + {X: 0, Y: []byte{1}}, + {X: 1, Y: []byte{2}}, + }) + if err != ErrZeroShareIndex { + t.Errorf("got %v, want ErrZeroShareIndex", err) + } + }) + + t.Run("mismatched lengths", func(t *testing.T) { + _, err := Combine([]Share{ + {X: 1, Y: []byte{1, 2}}, + {X: 2, Y: []byte{3}}, + }) + if err != ErrMismatchedShareLen { + t.Errorf("got %v, want ErrMismatchedShareLen", err) + } + }) + + t.Run("duplicate indices", func(t *testing.T) { + _, err := Combine([]Share{ + {X: 1, Y: []byte{1}}, + {X: 1, Y: []byte{2}}, + }) + if err != ErrDuplicateShareIndex { + t.Errorf("got %v, want ErrDuplicateShareIndex", err) + } + }) +} + +// ── Threshold / Quorum ──────────────────────────────────────────────────── + +func TestAdaptiveThreshold(t *testing.T) { + tests := [][2]int{ + {1, 3}, {2, 3}, {3, 3}, {5, 3}, {8, 3}, {9, 3}, + {10, 3}, {12, 4}, {15, 5}, {30, 10}, {100, 33}, + } + for _, tt := range tests { + if got := AdaptiveThreshold(tt[0]); got != tt[1] { + t.Errorf("AdaptiveThreshold(%d) = %d, want %d", tt[0], got, tt[1]) + } + } +} + +func TestWriteQuorum(t *testing.T) { + tests := [][2]int{ + {0, 0}, {1, 1}, {2, 2}, {3, 2}, {4, 3}, {5, 4}, + {6, 4}, {10, 7}, {14, 10}, {100, 67}, + } + for _, tt := range tests { + if got := WriteQuorum(tt[0]); got != tt[1] { + t.Errorf("WriteQuorum(%d) = %d, want %d", tt[0], got, tt[1]) + } + } +} From 2f5718146ae8e71900b85bc2556d1d586ecb36d7 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Fri, 27 Feb 2026 15:56:22 +0200 Subject: [PATCH 06/13] Fixed builder bug --- pkg/cli/build/builder.go | 64 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/pkg/cli/build/builder.go b/pkg/cli/build/builder.go index 4514f6b..60737dd 100644 --- a/pkg/cli/build/builder.go +++ b/pkg/cli/build/builder.go @@ -256,10 +256,39 @@ func copyFile(src, dst string) error { func (b *Builder) buildOlric() error { fmt.Printf("[3/8] Cross-compiling Olric %s...\n", constants.OlricVersion) - cmd := exec.Command("go", "install", + // go install doesn't support cross-compilation with GOBIN set, + // so we create a temporary module and use go build -o instead. + tmpDir, err := os.MkdirTemp("", "olric-build-*") + if err != nil { + return fmt.Errorf("create temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + + modInit := exec.Command("go", "mod", "init", "olric-build") + modInit.Dir = tmpDir + modInit.Stderr = os.Stderr + if err := modInit.Run(); err != nil { + return fmt.Errorf("go mod init: %w", err) + } + + modGet := exec.Command("go", "get", fmt.Sprintf("github.com/olric-data/olric/cmd/olric-server@%s", constants.OlricVersion)) + modGet.Dir = tmpDir + modGet.Env = append(os.Environ(), + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + modGet.Stderr = os.Stderr + if err := modGet.Run(); err != nil { + return fmt.Errorf("go get olric: %w", err) + } + + cmd := exec.Command("go", "build", + "-ldflags", "-s -w", + "-trimpath", + "-o", filepath.Join(b.binDir, "olric-server"), + fmt.Sprintf("github.com/olric-data/olric/cmd/olric-server")) + cmd.Dir = tmpDir cmd.Env = append(b.crossEnv(), - "GOBIN="+b.binDir, "GOPROXY=https://proxy.golang.org|direct", "GONOSUMDB=*") cmd.Stdout = os.Stdout @@ -275,10 +304,37 @@ func (b *Builder) buildOlric() error { func (b *Builder) buildIPFSCluster() error { fmt.Printf("[4/8] Cross-compiling IPFS Cluster %s...\n", constants.IPFSClusterVersion) - cmd := exec.Command("go", "install", + tmpDir, err := os.MkdirTemp("", "ipfs-cluster-build-*") + if err != nil { + return fmt.Errorf("create temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + + modInit := exec.Command("go", "mod", "init", "ipfs-cluster-build") + modInit.Dir = tmpDir + modInit.Stderr = os.Stderr + if err := modInit.Run(); err != nil { + return fmt.Errorf("go mod init: %w", err) + } + + modGet := exec.Command("go", "get", fmt.Sprintf("github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service@%s", constants.IPFSClusterVersion)) + modGet.Dir = tmpDir + modGet.Env = append(os.Environ(), + "GOPROXY=https://proxy.golang.org|direct", + "GONOSUMDB=*") + modGet.Stderr = os.Stderr + if err := modGet.Run(); err != nil { + return fmt.Errorf("go get ipfs-cluster: %w", err) + } + + cmd := exec.Command("go", "build", + "-ldflags", "-s -w", + "-trimpath", + "-o", filepath.Join(b.binDir, "ipfs-cluster-service"), + "github.com/ipfs-cluster/ipfs-cluster/cmd/ipfs-cluster-service") + cmd.Dir = tmpDir cmd.Env = append(b.crossEnv(), - "GOBIN="+b.binDir, "GOPROXY=https://proxy.golang.org|direct", "GONOSUMDB=*") cmd.Stdout = os.Stdout From a0468461ab77cc31d9937030c1a34fae7c0c2ce1 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 28 Feb 2026 10:14:02 +0200 Subject: [PATCH 07/13] feat(sandbox): add reset command and interactive setup - new `orama sandbox reset` deletes Hetzner resources (IPs, firewall, SSH key) and local config - interactive location/server type selection during `setup` - add Hetzner API methods for listing locations/types, deleting resources - update defaults to nbg1/cx23 --- pkg/cli/cmd/sandboxcmd/sandbox.go | 17 +- pkg/cli/sandbox/config.go | 4 +- pkg/cli/sandbox/hetzner.go | 96 ++++++++ pkg/cli/sandbox/reset.go | 129 +++++++++++ pkg/cli/sandbox/setup.go | 362 ++++++++++++++++++++++++++---- 5 files changed, 566 insertions(+), 42 deletions(-) create mode 100644 pkg/cli/sandbox/reset.go diff --git a/pkg/cli/cmd/sandboxcmd/sandbox.go b/pkg/cli/cmd/sandboxcmd/sandbox.go index f922053..42043a0 100644 --- a/pkg/cli/cmd/sandboxcmd/sandbox.go +++ b/pkg/cli/cmd/sandboxcmd/sandbox.go @@ -23,7 +23,8 @@ Usage: orama sandbox list List active sandboxes orama sandbox status [--name ] Show cluster health orama sandbox rollout [--name ] Build + push + rolling upgrade - orama sandbox ssh SSH into a sandbox node (1-5)`, + orama sandbox ssh SSH into a sandbox node (1-5) + orama sandbox reset Delete all infra and config to start fresh`, } var setupCmd = &cobra.Command{ @@ -79,6 +80,19 @@ var rolloutCmd = &cobra.Command{ }, } +var resetCmd = &cobra.Command{ + Use: "reset", + Short: "Delete all sandbox infrastructure and config to start fresh", + Long: `Deletes floating IPs, firewall, and SSH key from Hetzner Cloud, +then removes the local config (~/.orama/sandbox.yaml) and SSH keys. + +Use this when you need to switch datacenter locations (floating IPs are +location-bound) or to completely start over with sandbox setup.`, + RunE: func(cmd *cobra.Command, args []string) error { + return sandbox.Reset() + }, +} + var sshCmd = &cobra.Command{ Use: "ssh ", Short: "SSH into a sandbox node (1-5)", @@ -118,4 +132,5 @@ func init() { Cmd.AddCommand(statusCmd) Cmd.AddCommand(rolloutCmd) Cmd.AddCommand(sshCmd) + Cmd.AddCommand(resetCmd) } diff --git a/pkg/cli/sandbox/config.go b/pkg/cli/sandbox/config.go index f1ba9ca..11eb410 100644 --- a/pkg/cli/sandbox/config.go +++ b/pkg/cli/sandbox/config.go @@ -123,10 +123,10 @@ func (c *Config) validate() error { // Defaults fills in default values for optional fields. func (c *Config) Defaults() { if c.Location == "" { - c.Location = "fsn1" + c.Location = "nbg1" } if c.ServerType == "" { - c.ServerType = "cx22" + c.ServerType = "cx23" } } diff --git a/pkg/cli/sandbox/hetzner.go b/pkg/cli/sandbox/hetzner.go index 742349e..51d62a0 100644 --- a/pkg/cli/sandbox/hetzner.go +++ b/pkg/cli/sandbox/hetzner.go @@ -344,6 +344,23 @@ func (c *HetznerClient) UploadSSHKey(name, publicKey string) (*HetznerSSHKey, er return &resp.SSHKey, nil } +// ListSSHKeysByFingerprint finds SSH keys matching a fingerprint. +func (c *HetznerClient) ListSSHKeysByFingerprint(fingerprint string) ([]HetznerSSHKey, error) { + body, err := c.get("/ssh_keys?fingerprint=" + fingerprint) + if err != nil { + return nil, fmt.Errorf("list SSH keys: %w", err) + } + + var resp struct { + SSHKeys []HetznerSSHKey `json:"ssh_keys"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse SSH keys response: %w", err) + } + + return resp.SSHKeys, nil +} + // GetSSHKey retrieves an SSH key by ID. func (c *HetznerClient) GetSSHKey(id int64) (*HetznerSSHKey, error) { body, err := c.get("/ssh_keys/" + strconv.FormatInt(id, 10)) @@ -408,6 +425,85 @@ func (c *HetznerClient) DeleteFirewall(id int64) error { return c.delete("/firewalls/" + strconv.FormatInt(id, 10)) } +// DeleteFloatingIP deletes a floating IP by ID. +func (c *HetznerClient) DeleteFloatingIP(id int64) error { + return c.delete("/floating_ips/" + strconv.FormatInt(id, 10)) +} + +// DeleteSSHKey deletes an SSH key by ID. +func (c *HetznerClient) DeleteSSHKey(id int64) error { + return c.delete("/ssh_keys/" + strconv.FormatInt(id, 10)) +} + +// --- Location & Server Type operations --- + +// HetznerLocation represents a Hetzner datacenter location. +type HetznerLocation struct { + ID int64 `json:"id"` + Name string `json:"name"` // e.g., "fsn1", "nbg1", "hel1" + Description string `json:"description"` // e.g., "Falkenstein DC Park 1" + City string `json:"city"` + Country string `json:"country"` // ISO 3166-1 alpha-2 +} + +// HetznerServerType represents a Hetzner server type with pricing. +type HetznerServerType struct { + ID int64 `json:"id"` + Name string `json:"name"` // e.g., "cx22", "cx23" + Description string `json:"description"` // e.g., "CX23" + Cores int `json:"cores"` + Memory float64 `json:"memory"` // GB + Disk int `json:"disk"` // GB + Architecture string `json:"architecture"` + Deprecation *struct { + Announced string `json:"announced"` + UnavailableAfter string `json:"unavailable_after"` + } `json:"deprecation"` // nil = not deprecated + Prices []struct { + Location string `json:"location"` + Hourly struct { + Gross string `json:"gross"` + } `json:"price_hourly"` + Monthly struct { + Gross string `json:"gross"` + } `json:"price_monthly"` + } `json:"prices"` +} + +// ListLocations returns all available Hetzner datacenter locations. +func (c *HetznerClient) ListLocations() ([]HetznerLocation, error) { + body, err := c.get("/locations") + if err != nil { + return nil, fmt.Errorf("list locations: %w", err) + } + + var resp struct { + Locations []HetznerLocation `json:"locations"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse locations response: %w", err) + } + + return resp.Locations, nil +} + +// ListServerTypes returns all available server types. +func (c *HetznerClient) ListServerTypes() ([]HetznerServerType, error) { + body, err := c.get("/server_types?per_page=50") + if err != nil { + return nil, fmt.Errorf("list server types: %w", err) + } + + var resp struct { + ServerTypes []HetznerServerType `json:"server_types"` + } + if err := json.Unmarshal(body, &resp); err != nil { + return nil, fmt.Errorf("parse server types response: %w", err) + } + + return resp.ServerTypes, nil +} + // --- Validation --- // ValidateToken checks if the API token is valid by making a simple request. diff --git a/pkg/cli/sandbox/reset.go b/pkg/cli/sandbox/reset.go new file mode 100644 index 0000000..dbc4dae --- /dev/null +++ b/pkg/cli/sandbox/reset.go @@ -0,0 +1,129 @@ +package sandbox + +import ( + "bufio" + "fmt" + "os" + "strings" +) + +// Reset tears down all sandbox infrastructure (floating IPs, firewall, SSH key) +// and removes the config file so the user can rerun setup from scratch. +// This is useful when switching datacenter locations (floating IPs are location-bound). +func Reset() error { + fmt.Println("Sandbox Reset") + fmt.Println("=============") + fmt.Println() + + cfg, err := LoadConfig() + if err != nil { + // Config doesn't exist — just clean up any local files + fmt.Println("No sandbox config found. Cleaning up local files...") + return resetLocalFiles() + } + + // Check for active sandboxes — refuse to reset if clusters are still running + active, _ := FindActiveSandbox() + if active != nil { + return fmt.Errorf("active sandbox %q exists — run 'orama sandbox destroy' first", active.Name) + } + + // Show what will be deleted + fmt.Println("This will delete the following Hetzner resources:") + for i, fip := range cfg.FloatingIPs { + fmt.Printf(" Floating IP %d: %s (ID: %d)\n", i+1, fip.IP, fip.ID) + } + if cfg.FirewallID != 0 { + fmt.Printf(" Firewall ID: %d\n", cfg.FirewallID) + } + if cfg.SSHKey.HetznerID != 0 { + fmt.Printf(" SSH Key ID: %d\n", cfg.SSHKey.HetznerID) + } + fmt.Println() + fmt.Println("Local files to remove:") + fmt.Println(" ~/.orama/sandbox.yaml") + fmt.Println(" ~/.orama/sandbox_key") + fmt.Println(" ~/.orama/sandbox_key.pub") + fmt.Println() + + reader := bufio.NewReader(os.Stdin) + fmt.Print("Delete all sandbox resources? [y/N]: ") + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(strings.ToLower(choice)) + if choice != "y" && choice != "yes" { + fmt.Println("Aborted.") + return nil + } + + client := NewHetznerClient(cfg.HetznerAPIToken) + + // Step 1: Delete floating IPs + fmt.Println() + fmt.Println("Deleting floating IPs...") + for _, fip := range cfg.FloatingIPs { + if err := client.DeleteFloatingIP(fip.ID); err != nil { + fmt.Fprintf(os.Stderr, " Warning: could not delete floating IP %s (ID %d): %v\n", fip.IP, fip.ID, err) + } else { + fmt.Printf(" Deleted %s (ID %d)\n", fip.IP, fip.ID) + } + } + + // Step 2: Delete firewall + if cfg.FirewallID != 0 { + fmt.Println("Deleting firewall...") + if err := client.DeleteFirewall(cfg.FirewallID); err != nil { + fmt.Fprintf(os.Stderr, " Warning: could not delete firewall (ID %d): %v\n", cfg.FirewallID, err) + } else { + fmt.Printf(" Deleted firewall (ID %d)\n", cfg.FirewallID) + } + } + + // Step 3: Delete SSH key from Hetzner + if cfg.SSHKey.HetznerID != 0 { + fmt.Println("Deleting SSH key from Hetzner...") + if err := client.DeleteSSHKey(cfg.SSHKey.HetznerID); err != nil { + fmt.Fprintf(os.Stderr, " Warning: could not delete SSH key (ID %d): %v\n", cfg.SSHKey.HetznerID, err) + } else { + fmt.Printf(" Deleted SSH key (ID %d)\n", cfg.SSHKey.HetznerID) + } + } + + // Step 4: Remove local files + if err := resetLocalFiles(); err != nil { + return err + } + + fmt.Println() + fmt.Println("Reset complete. All sandbox resources deleted.") + fmt.Println() + fmt.Println("Next: orama sandbox setup") + return nil +} + +// resetLocalFiles removes the sandbox config and SSH key files. +func resetLocalFiles() error { + dir, err := configDir() + if err != nil { + return err + } + + files := []string{ + dir + "/sandbox.yaml", + dir + "/sandbox_key", + dir + "/sandbox_key.pub", + } + + fmt.Println("Removing local files...") + for _, f := range files { + if err := os.Remove(f); err != nil { + if os.IsNotExist(err) { + continue + } + fmt.Fprintf(os.Stderr, " Warning: could not remove %s: %v\n", f, err) + } else { + fmt.Printf(" Removed %s\n", f) + } + } + + return nil +} diff --git a/pkg/cli/sandbox/setup.go b/pkg/cli/sandbox/setup.go index d4d07c1..f702422 100644 --- a/pkg/cli/sandbox/setup.go +++ b/pkg/cli/sandbox/setup.go @@ -8,7 +8,10 @@ import ( "fmt" "os" "os/exec" + "sort" + "strconv" "strings" + "time" "golang.org/x/crypto/ssh" ) @@ -56,9 +59,24 @@ func Setup() error { HetznerAPIToken: token, Domain: domain, } - cfg.Defaults() - // Step 3: Floating IPs + // Step 3: Location selection + fmt.Println() + location, err := selectLocation(client, reader) + if err != nil { + return err + } + cfg.Location = location + + // Step 4: Server type selection + fmt.Println() + serverType, err := selectServerType(client, reader, location) + if err != nil { + return err + } + cfg.ServerType = serverType + + // Step 5: Floating IPs fmt.Println() fmt.Println("Checking floating IPs...") floatingIPs, err := setupFloatingIPs(client, cfg.Location) @@ -67,7 +85,7 @@ func Setup() error { } cfg.FloatingIPs = floatingIPs - // Step 4: Firewall + // Step 6: Firewall fmt.Println() fmt.Println("Checking firewall...") fwID, err := setupFirewall(client) @@ -76,7 +94,7 @@ func Setup() error { } cfg.FirewallID = fwID - // Step 5: SSH key + // Step 7: SSH key fmt.Println() fmt.Println("Setting up SSH key...") sshKeyConfig, err := setupSSHKey(client) @@ -85,7 +103,7 @@ func Setup() error { } cfg.SSHKey = sshKeyConfig - // Step 6: Display DNS instructions + // Step 8: Display DNS instructions fmt.Println() fmt.Println("DNS Configuration") fmt.Println("-----------------") @@ -100,12 +118,12 @@ func Setup() error { fmt.Printf(" ns2.%s\n", domain) fmt.Println() - // Step 7: Verify DNS (optional) + // Step 9: Verify DNS (optional) fmt.Print("Verify DNS now? [y/N]: ") verifyChoice, _ := reader.ReadString('\n') verifyChoice = strings.TrimSpace(strings.ToLower(verifyChoice)) if verifyChoice == "y" || verifyChoice == "yes" { - verifyDNS(domain) + verifyDNS(domain, cfg.FloatingIPs, reader) } // Save config @@ -120,6 +138,180 @@ func Setup() error { return nil } +// selectLocation fetches available Hetzner locations and lets the user pick one. +func selectLocation(client *HetznerClient, reader *bufio.Reader) (string, error) { + fmt.Println("Fetching available locations...") + locations, err := client.ListLocations() + if err != nil { + return "", fmt.Errorf("list locations: %w", err) + } + + sort.Slice(locations, func(i, j int) bool { + return locations[i].Name < locations[j].Name + }) + + defaultLoc := "nbg1" + fmt.Println(" Available datacenter locations:") + for i, loc := range locations { + def := "" + if loc.Name == defaultLoc { + def = " (default)" + } + fmt.Printf(" %d) %s — %s, %s%s\n", i+1, loc.Name, loc.City, loc.Country, def) + } + + fmt.Printf("\n Select location [%s]: ", defaultLoc) + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(choice) + + if choice == "" { + fmt.Printf(" Using %s\n", defaultLoc) + return defaultLoc, nil + } + + // Try as number first + if num, err := strconv.Atoi(choice); err == nil && num >= 1 && num <= len(locations) { + loc := locations[num-1].Name + fmt.Printf(" Using %s\n", loc) + return loc, nil + } + + // Try as location name + for _, loc := range locations { + if strings.EqualFold(loc.Name, choice) { + fmt.Printf(" Using %s\n", loc.Name) + return loc.Name, nil + } + } + + return "", fmt.Errorf("unknown location %q", choice) +} + +// selectServerType fetches available server types for a location and lets the user pick one. +func selectServerType(client *HetznerClient, reader *bufio.Reader, location string) (string, error) { + fmt.Println("Fetching available server types...") + serverTypes, err := client.ListServerTypes() + if err != nil { + return "", fmt.Errorf("list server types: %w", err) + } + + // Filter to x86 shared-vCPU types available at the selected location, skip deprecated + type option struct { + name string + cores int + memory float64 + disk int + hourly string + monthly string + } + + var options []option + for _, st := range serverTypes { + if st.Architecture != "x86" { + continue + } + if st.Deprecation != nil { + continue + } + // Only show shared-vCPU types (cx/cpx prefixes) — skip dedicated (ccx/cx5x) + if !strings.HasPrefix(st.Name, "cx") && !strings.HasPrefix(st.Name, "cpx") { + continue + } + + // Find pricing for the selected location + hourly, monthly := "", "" + for _, p := range st.Prices { + if p.Location == location { + hourly = p.Hourly.Gross + monthly = p.Monthly.Gross + break + } + } + if hourly == "" { + continue // Not available in this location + } + + options = append(options, option{ + name: st.Name, + cores: st.Cores, + memory: st.Memory, + disk: st.Disk, + hourly: hourly, + monthly: monthly, + }) + } + + if len(options) == 0 { + return "", fmt.Errorf("no server types available in %s", location) + } + + // Sort by hourly price (cheapest first) + sort.Slice(options, func(i, j int) bool { + pi, _ := strconv.ParseFloat(options[i].hourly, 64) + pj, _ := strconv.ParseFloat(options[j].hourly, 64) + return pi < pj + }) + + defaultType := options[0].name // cheapest + fmt.Printf(" Available server types in %s:\n", location) + for i, opt := range options { + def := "" + if opt.name == defaultType { + def = " (default)" + } + fmt.Printf(" %d) %-8s %d vCPU / %4.0f GB RAM / %3d GB disk — €%s/hr (€%s/mo)%s\n", + i+1, opt.name, opt.cores, opt.memory, opt.disk, formatPrice(opt.hourly), formatPrice(opt.monthly), def) + } + + fmt.Printf("\n Select server type [%s]: ", defaultType) + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(choice) + + if choice == "" { + fmt.Printf(" Using %s (×5 nodes ≈ €%s/hr)\n", defaultType, multiplyPrice(options[0].hourly, 5)) + return defaultType, nil + } + + // Try as number + if num, err := strconv.Atoi(choice); err == nil && num >= 1 && num <= len(options) { + opt := options[num-1] + fmt.Printf(" Using %s (×5 nodes ≈ €%s/hr)\n", opt.name, multiplyPrice(opt.hourly, 5)) + return opt.name, nil + } + + // Try as name + for _, opt := range options { + if strings.EqualFold(opt.name, choice) { + fmt.Printf(" Using %s (×5 nodes ≈ €%s/hr)\n", opt.name, multiplyPrice(opt.hourly, 5)) + return opt.name, nil + } + } + + return "", fmt.Errorf("unknown server type %q", choice) +} + +// formatPrice trims trailing zeros from a price string like "0.0063000000000000" → "0.0063". +func formatPrice(price string) string { + f, err := strconv.ParseFloat(price, 64) + if err != nil { + return price + } + // Use enough precision then trim trailing zeros + s := fmt.Sprintf("%.4f", f) + s = strings.TrimRight(s, "0") + s = strings.TrimRight(s, ".") + return s +} + +// multiplyPrice multiplies a price string by n and returns formatted. +func multiplyPrice(price string, n int) string { + f, err := strconv.ParseFloat(price, 64) + if err != nil { + return "?" + } + return formatPrice(fmt.Sprintf("%.10f", f*float64(n))) +} + // setupFloatingIPs checks for existing floating IPs or creates new ones. func setupFloatingIPs(client *HetznerClient, location string) ([]FloatIP, error) { existing, err := client.ListFloatingIPsByLabel("orama-sandbox-dns=true") @@ -217,24 +409,24 @@ func setupSSHKey(client *HetznerClient) (SSHKeyConfig, error) { // Try to upload (will fail with uniqueness error if already exists) key, err := client.UploadSSHKey("orama-sandbox", strings.TrimSpace(string(pubData))) if err != nil { - // Key likely already exists on Hetzner — find it by listing - fmt.Printf(" SSH key may already be on Hetzner (upload: %v)\n", err) - fmt.Print(" Enter the Hetzner SSH key ID (or 0 to re-upload): ") - reader := bufio.NewReader(os.Stdin) - idStr, _ := reader.ReadString('\n') - idStr = strings.TrimSpace(idStr) - var hetznerID int64 - fmt.Sscanf(idStr, "%d", &hetznerID) + // Key already exists on Hetzner — find it by fingerprint + sshPubKey, _, _, _, parseErr := ssh.ParseAuthorizedKey(pubData) + if parseErr != nil { + return SSHKeyConfig{}, fmt.Errorf("parse public key to find fingerprint: %w", parseErr) + } + fingerprint := ssh.FingerprintLegacyMD5(sshPubKey) - if hetznerID == 0 { - return SSHKeyConfig{}, fmt.Errorf("could not resolve SSH key on Hetzner, try deleting and re-running setup") + existing, listErr := client.ListSSHKeysByFingerprint(fingerprint) + if listErr == nil && len(existing) > 0 { + fmt.Printf(" Found existing SSH key on Hetzner (ID: %d)\n", existing[0].ID) + return SSHKeyConfig{ + HetznerID: existing[0].ID, + PrivateKeyPath: "~/.orama/sandbox_key", + PublicKeyPath: "~/.orama/sandbox_key.pub", + }, nil } - return SSHKeyConfig{ - HetznerID: hetznerID, - PrivateKeyPath: "~/.orama/sandbox_key", - PublicKeyPath: "~/.orama/sandbox_key.pub", - }, nil + return SSHKeyConfig{}, fmt.Errorf("SSH key exists locally but could not find it on Hetzner (fingerprint: %s): %w", fingerprint, err) } fmt.Printf(" Uploaded to Hetzner (ID: %d)\n", key.ID) @@ -294,26 +486,118 @@ func setupSSHKey(client *HetznerClient) (SSHKeyConfig, error) { }, nil } -// verifyDNS checks if the sandbox domain resolves. -func verifyDNS(domain string) { - fmt.Printf(" Checking NS records for %s...\n", domain) - out, err := exec.Command("dig", "+short", "NS", domain, "@8.8.8.8").Output() - if err != nil { - fmt.Printf(" Warning: dig failed: %v\n", err) - fmt.Println(" DNS verification skipped. You can verify later with:") - fmt.Printf(" dig NS %s @8.8.8.8\n", domain) +// verifyDNS checks if glue records for the sandbox domain are configured. +// +// There's a chicken-and-egg problem: NS records can't fully resolve until +// CoreDNS is running on the floating IPs (which requires a sandbox cluster). +// So instead of resolving NS → A records, we check for glue records at the +// TLD level, which proves the registrar configuration is correct. +func verifyDNS(domain string, floatingIPs []FloatIP, reader *bufio.Reader) { + expectedIPs := make(map[string]bool) + for _, fip := range floatingIPs { + expectedIPs[fip.IP] = true + } + + // Find the TLD nameserver to query for glue records + findTLDServer := func() string { + // For "dbrs.space", the TLD is "space." — ask the root for its NS + parts := strings.Split(domain, ".") + if len(parts) < 2 { + return "" + } + tld := parts[len(parts)-1] + out, err := exec.Command("dig", "+short", "NS", tld+".", "@8.8.8.8").Output() + if err != nil { + return "" + } + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(lines) > 0 && lines[0] != "" { + return strings.TrimSpace(lines[0]) + } + return "" + } + + check := func() (glueFound bool, foundIPs []string) { + tldNS := findTLDServer() + if tldNS == "" { + return false, nil + } + + // Query the TLD nameserver for NS + glue of our domain + // dig NS domain @tld-server will include glue in ADDITIONAL section + out, err := exec.Command("dig", "NS", domain, "@"+tldNS, "+norecurse", "+additional").Output() + if err != nil { + return false, nil + } + + output := string(out) + remaining := make(map[string]bool) + for k, v := range expectedIPs { + remaining[k] = v + } + + // Look for our floating IPs in the ADDITIONAL section (glue records) + // or anywhere in the response + for _, fip := range floatingIPs { + if strings.Contains(output, fip.IP) { + foundIPs = append(foundIPs, fip.IP) + delete(remaining, fip.IP) + } + } + + return len(remaining) == 0, foundIPs + } + + fmt.Printf(" Checking glue records for %s at TLD nameserver...\n", domain) + matched, foundIPs := check() + + if matched { + fmt.Println(" ✓ Glue records configured correctly:") + for i, ip := range foundIPs { + fmt.Printf(" ns%d.%s → %s\n", i+1, domain, ip) + } + fmt.Println() + fmt.Println(" Note: Full DNS resolution will work once a sandbox is running") + fmt.Println(" (CoreDNS on the floating IPs needs to be up to answer queries).") return } - result := strings.TrimSpace(string(out)) - if result == "" { - fmt.Println(" Warning: No NS records found yet.") - fmt.Println(" DNS propagation can take up to 48 hours.") - fmt.Println(" The sandbox will still work once DNS is configured.") - } else { - fmt.Printf(" NS records:\n") - for _, line := range strings.Split(result, "\n") { - fmt.Printf(" %s\n", line) + if len(foundIPs) > 0 { + fmt.Println(" ⚠ Partial glue records found:") + for _, ip := range foundIPs { + fmt.Printf(" %s\n", ip) } + fmt.Println(" Missing floating IPs in glue:") + for _, fip := range floatingIPs { + if expectedIPs[fip.IP] { + fmt.Printf(" %s\n", fip.IP) + } + } + } else { + fmt.Println(" ✗ No glue records found yet.") + fmt.Println(" Make sure you configured at your registrar:") + fmt.Printf(" ns1.%s → %s\n", domain, floatingIPs[0].IP) + fmt.Printf(" ns2.%s → %s\n", domain, floatingIPs[1].IP) + } + + fmt.Println() + fmt.Print(" Wait for glue propagation? (polls every 30s, Ctrl+C to stop) [y/N]: ") + choice, _ := reader.ReadString('\n') + choice = strings.TrimSpace(strings.ToLower(choice)) + if choice != "y" && choice != "yes" { + fmt.Println(" Skipping. You can create the sandbox now — DNS will work once glue propagates.") + return + } + + fmt.Println(" Waiting for glue record propagation...") + for i := 1; ; i++ { + time.Sleep(30 * time.Second) + matched, _ = check() + if matched { + fmt.Printf("\n ✓ Glue records propagated after %d checks\n", i) + fmt.Println(" You can now create a sandbox: orama sandbox create") + return + } + fmt.Printf(" [%d] Not yet... checking again in 30s\n", i) } } From fd87eec47694b3255ea211be8c3b525419512a30 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 28 Feb 2026 15:40:43 +0200 Subject: [PATCH 08/13] feat(security): add manifest signing, TLS TOFU, refresh token migration - Invalidate plaintext refresh tokens (migration 019) - Add `--sign` flag to `orama build` for rootwallet manifest signing - Add `--ca-fingerprint` TOFU verification for production joins/invites - Save cluster secrets from join (RQLite auth, Olric key, IPFS peers) - Add RQLite auth config fields --- ...19_invalidate_plaintext_refresh_tokens.sql | 4 + pkg/cli/build/archive.go | 49 ++++++ pkg/cli/build/builder.go | 9 +- pkg/cli/build/command.go | 2 + pkg/cli/production/install/flags.go | 4 +- pkg/cli/production/install/orchestrator.go | 67 ++++++++- pkg/cli/production/invite/command.go | 35 ++++- pkg/config/database_config.go | 7 + pkg/coredns/rqlite/backend.go | 7 +- pkg/coredns/rqlite/client.go | 14 +- pkg/coredns/rqlite/setup.go | 26 +++- pkg/environments/production/config.go | 142 +++++++++++++++++- pkg/environments/production/firewall.go | 25 ++- pkg/environments/production/firewall_test.go | 8 +- .../production/installers/coredns.go | 17 ++- pkg/environments/production/orchestrator.go | 75 +++++++++ pkg/environments/production/paths.go | 3 +- pkg/environments/production/prebuilt.go | 83 ++++++++++ pkg/environments/production/provisioner.go | 32 ++++ pkg/environments/production/services.go | 65 ++++++-- pkg/environments/production/wireguard.go | 4 +- pkg/environments/production/wireguard_test.go | 4 +- pkg/environments/templates/olric.yaml | 3 + pkg/environments/templates/render.go | 1 + .../templates/systemd_gateway.service | 10 ++ .../templates/systemd_ipfs.service | 10 ++ .../templates/systemd_ipfs_cluster.service | 10 ++ .../templates/systemd_node.service | 10 ++ .../templates/systemd_olric.service | 10 ++ pkg/gateway/auth/crypto.go | 24 +++ pkg/gateway/auth/service.go | 51 +++++-- pkg/gateway/config.go | 9 ++ pkg/gateway/dependencies.go | 26 ++++ pkg/gateway/gateway.go | 2 +- pkg/gateway/handlers/join/handler.go | 77 ++++++++-- pkg/gateway/handlers/pubsub/ws_client.go | 27 +++- pkg/gateway/handlers/serverless/ws_handler.go | 27 +++- pkg/gateway/handlers/wireguard/handler.go | 23 +++ pkg/gateway/middleware.go | 37 +++-- pkg/gateway/rate_limiter.go | 9 +- pkg/gateway/rate_limiter_test.go | 4 +- pkg/ipfs/cluster.go | 117 +++++++++++++-- pkg/namespace/cluster_manager.go | 10 ++ pkg/namespace/cluster_manager_webrtc.go | 24 ++- pkg/node/gateway.go | 26 ++++ pkg/rqlite/adapter.go | 9 +- pkg/rqlite/instance_spawner.go | 6 + pkg/rqlite/process.go | 7 + pkg/secrets/encrypt.go | 98 ++++++++++++ 49 files changed, 1242 insertions(+), 107 deletions(-) create mode 100644 migrations/019_invalidate_plaintext_refresh_tokens.sql create mode 100644 pkg/gateway/auth/crypto.go create mode 100644 pkg/secrets/encrypt.go diff --git a/migrations/019_invalidate_plaintext_refresh_tokens.sql b/migrations/019_invalidate_plaintext_refresh_tokens.sql new file mode 100644 index 0000000..1864b26 --- /dev/null +++ b/migrations/019_invalidate_plaintext_refresh_tokens.sql @@ -0,0 +1,4 @@ +-- Invalidate all existing refresh tokens. +-- Tokens were stored in plaintext; the application now stores SHA-256 hashes. +-- Users will need to re-authenticate (tokens have 30-day expiry anyway). +UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE revoked_at IS NULL; diff --git a/pkg/cli/build/archive.go b/pkg/cli/build/archive.go index 25d8dd7..5c99642 100644 --- a/pkg/cli/build/archive.go +++ b/pkg/cli/build/archive.go @@ -10,6 +10,7 @@ import ( "io" "net/http" "os" + "os/exec" "path/filepath" "strings" "time" @@ -106,6 +107,14 @@ func (b *Builder) createArchive(outputPath string, manifest *Manifest) error { return err } + // Add manifest.sig if it exists (created by --sign) + sigPath := filepath.Join(b.tmpDir, "manifest.sig") + if _, err := os.Stat(sigPath); err == nil { + if err := addFileToTar(tw, sigPath, "manifest.sig"); err != nil { + return err + } + } + // Print summary fmt.Printf(" bin/: %d binaries\n", len(manifest.Checksums)) fmt.Printf(" systemd/: namespace templates\n") @@ -119,6 +128,46 @@ func (b *Builder) createArchive(outputPath string, manifest *Manifest) error { return nil } +// signManifest signs the manifest hash using rootwallet CLI. +// Produces manifest.sig containing the hex-encoded EVM signature. +func (b *Builder) signManifest(manifest *Manifest) error { + fmt.Printf("\nSigning manifest with rootwallet...\n") + + // Serialize manifest deterministically (compact JSON, sorted keys via json.Marshal) + manifestData, err := json.Marshal(manifest) + if err != nil { + return fmt.Errorf("failed to marshal manifest: %w", err) + } + + // Hash the manifest JSON + hash := sha256.Sum256(manifestData) + hashHex := hex.EncodeToString(hash[:]) + + // Call rw sign --chain evm + cmd := exec.Command("rw", "sign", hashHex, "--chain", "evm") + var stdout, stderr strings.Builder + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("rw sign failed: %w\n%s", err, stderr.String()) + } + + signature := strings.TrimSpace(stdout.String()) + if signature == "" { + return fmt.Errorf("rw sign produced empty signature") + } + + // Write signature file + sigPath := filepath.Join(b.tmpDir, "manifest.sig") + if err := os.WriteFile(sigPath, []byte(signature), 0644); err != nil { + return fmt.Errorf("failed to write manifest.sig: %w", err) + } + + fmt.Printf(" Manifest signed (SHA256: %s...)\n", hashHex[:16]) + return nil +} + // addDirToTar adds all files in a directory to the tar archive under the given prefix. func addDirToTar(tw *tar.Writer, srcDir, prefix string) error { return filepath.Walk(srcDir, func(path string, info os.FileInfo, err error) error { diff --git a/pkg/cli/build/builder.go b/pkg/cli/build/builder.go index 60737dd..2c306d4 100644 --- a/pkg/cli/build/builder.go +++ b/pkg/cli/build/builder.go @@ -117,7 +117,14 @@ func (b *Builder) Build() error { return fmt.Errorf("failed to generate manifest: %w", err) } - // Step 11: Create archive + // Step 11: Sign manifest (optional) + if b.flags.Sign { + if err := b.signManifest(manifest); err != nil { + return fmt.Errorf("failed to sign manifest: %w", err) + } + } + + // Step 12: Create archive outputPath := b.flags.Output if outputPath == "" { outputPath = fmt.Sprintf("/tmp/orama-%s-linux-%s.tar.gz", b.version, b.flags.Arch) diff --git a/pkg/cli/build/command.go b/pkg/cli/build/command.go index 97fe0f4..a7ee982 100644 --- a/pkg/cli/build/command.go +++ b/pkg/cli/build/command.go @@ -13,6 +13,7 @@ type Flags struct { Arch string Output string Verbose bool + Sign bool // Sign the archive manifest with rootwallet } // Handle is the entry point for the build command. @@ -42,6 +43,7 @@ func parseFlags(args []string) (*Flags, error) { fs.StringVar(&flags.Arch, "arch", "amd64", "Target architecture (amd64, arm64)") fs.StringVar(&flags.Output, "output", "", "Output archive path (default: /tmp/orama--linux-.tar.gz)") fs.BoolVar(&flags.Verbose, "verbose", false, "Verbose output") + fs.BoolVar(&flags.Sign, "sign", false, "Sign the manifest with rootwallet (requires rw in PATH)") if err := fs.Parse(args); err != nil { return nil, err diff --git a/pkg/cli/production/install/flags.go b/pkg/cli/production/install/flags.go index 3e4788c..50b844e 100644 --- a/pkg/cli/production/install/flags.go +++ b/pkg/cli/production/install/flags.go @@ -28,7 +28,8 @@ type Flags struct { IPFSClusterAddrs string // Security flags - SkipFirewall bool // Skip UFW firewall setup (for users who manage their own firewall) + SkipFirewall bool // Skip UFW firewall setup (for users who manage their own firewall) + CAFingerprint string // SHA-256 fingerprint of server TLS cert for TOFU verification // Anyone flags AnyoneClient bool // Run Anyone as client-only (SOCKS5 proxy on port 9050, no relay) @@ -74,6 +75,7 @@ func ParseFlags(args []string) (*Flags, error) { // Security flags fs.BoolVar(&flags.SkipFirewall, "skip-firewall", false, "Skip UFW firewall setup (for users who manage their own firewall)") + fs.StringVar(&flags.CAFingerprint, "ca-fingerprint", "", "SHA-256 fingerprint of server TLS cert (from orama invite output)") // Anyone flags fs.BoolVar(&flags.AnyoneClient, "anyone-client", false, "Install Anyone as client-only (SOCKS5 proxy on port 9050, no relay)") diff --git a/pkg/cli/production/install/orchestrator.go b/pkg/cli/production/install/orchestrator.go index 1689faa..7372bbe 100644 --- a/pkg/cli/production/install/orchestrator.go +++ b/pkg/cli/production/install/orchestrator.go @@ -2,8 +2,12 @@ package install import ( "bufio" + "bytes" "crypto/rand" + "crypto/sha256" "crypto/tls" + "crypto/x509" + "encoding/hex" "encoding/json" "fmt" "io" @@ -366,12 +370,35 @@ func (o *Orchestrator) callJoinEndpoint(wgPubKey string) (*joinhandlers.JoinResp } url := strings.TrimRight(o.flags.JoinAddress, "/") + "/v1/internal/join" + + tlsConfig := &tls.Config{} + if o.flags.CAFingerprint != "" { + // TOFU: verify the server's TLS cert fingerprint matches the one from the invite + expectedFP, err := hex.DecodeString(o.flags.CAFingerprint) + if err != nil { + return nil, fmt.Errorf("invalid --ca-fingerprint: must be hex-encoded SHA-256: %w", err) + } + tlsConfig.InsecureSkipVerify = true + tlsConfig.VerifyPeerCertificate = func(rawCerts [][]byte, _ [][]*x509.Certificate) error { + if len(rawCerts) == 0 { + return fmt.Errorf("server presented no TLS certificates") + } + hash := sha256.Sum256(rawCerts[0]) + if !bytes.Equal(hash[:], expectedFP) { + return fmt.Errorf("TLS certificate fingerprint mismatch: expected %s, got %x (possible MITM attack)", + o.flags.CAFingerprint, hash[:]) + } + return nil + } + } else { + // No fingerprint provided — fall back to insecure for backward compatibility + tlsConfig.InsecureSkipVerify = true + } + client := &http.Client{ Timeout: 30 * time.Second, Transport: &http.Transport{ - TLSClientConfig: &tls.Config{ - InsecureSkipVerify: true, // Self-signed certs during initial setup - }, + TLSClientConfig: tlsConfig, }, } @@ -419,6 +446,40 @@ func (o *Orchestrator) saveSecretsFromJoinResponse(resp *joinhandlers.JoinRespon } } + // Write API key HMAC secret + if resp.APIKeyHMACSecret != "" { + if err := os.WriteFile(filepath.Join(secretsDir, "api-key-hmac-secret"), []byte(resp.APIKeyHMACSecret), 0600); err != nil { + return fmt.Errorf("failed to write api-key-hmac-secret: %w", err) + } + } + + // Write RQLite password and generate auth JSON file + if resp.RQLitePassword != "" { + if err := os.WriteFile(filepath.Join(secretsDir, "rqlite-password"), []byte(resp.RQLitePassword), 0600); err != nil { + return fmt.Errorf("failed to write rqlite-password: %w", err) + } + // Also generate the auth JSON file that rqlited uses with -auth flag + authJSON := fmt.Sprintf(`[{"username": "orama", "password": "%s", "perms": ["all"]}]`, resp.RQLitePassword) + if err := os.WriteFile(filepath.Join(secretsDir, "rqlite-auth.json"), []byte(authJSON), 0600); err != nil { + return fmt.Errorf("failed to write rqlite-auth.json: %w", err) + } + } + + // Write Olric encryption key + if resp.OlricEncryptionKey != "" { + if err := os.WriteFile(filepath.Join(secretsDir, "olric-encryption-key"), []byte(resp.OlricEncryptionKey), 0600); err != nil { + return fmt.Errorf("failed to write olric-encryption-key: %w", err) + } + } + + // Write IPFS Cluster trusted peer IDs + if len(resp.IPFSClusterPeerIDs) > 0 { + content := strings.Join(resp.IPFSClusterPeerIDs, "\n") + "\n" + if err := os.WriteFile(filepath.Join(secretsDir, "ipfs-cluster-trusted-peers"), []byte(content), 0600); err != nil { + return fmt.Errorf("failed to write ipfs-cluster-trusted-peers: %w", err) + } + } + return nil } diff --git a/pkg/cli/production/invite/command.go b/pkg/cli/production/invite/command.go index 9234a02..aa3d71d 100644 --- a/pkg/cli/production/invite/command.go +++ b/pkg/cli/production/invite/command.go @@ -3,9 +3,12 @@ package invite import ( "bytes" "crypto/rand" + "crypto/sha256" + "crypto/tls" "encoding/hex" "encoding/json" "fmt" + "net" "net/http" "os" "time" @@ -59,13 +62,43 @@ func Handle(args []string) { os.Exit(1) } + // Get TLS certificate fingerprint for TOFU verification + certFingerprint := getTLSCertFingerprint(domain) + // Print the invite command fmt.Printf("\nInvite token created (expires in %s)\n\n", expiry) fmt.Printf("Run this on the new node:\n\n") - fmt.Printf(" sudo orama install --join https://%s --token %s --vps-ip --nameserver\n\n", domain, token) + if certFingerprint != "" { + fmt.Printf(" sudo orama install --join https://%s --token %s --ca-fingerprint %s --vps-ip --nameserver\n\n", domain, token, certFingerprint) + } else { + fmt.Printf(" sudo orama install --join https://%s --token %s --vps-ip --nameserver\n\n", domain, token) + } fmt.Printf("Replace with the new node's public IP address.\n") } +// getTLSCertFingerprint connects to the domain over TLS and returns the +// SHA-256 fingerprint of the leaf certificate. Returns empty string on failure. +func getTLSCertFingerprint(domain string) string { + conn, err := tls.DialWithDialer( + &net.Dialer{Timeout: 5 * time.Second}, + "tcp", + domain+":443", + &tls.Config{InsecureSkipVerify: true}, + ) + if err != nil { + return "" + } + defer conn.Close() + + certs := conn.ConnectionState().PeerCertificates + if len(certs) == 0 { + return "" + } + + hash := sha256.Sum256(certs[0].Raw) + return hex.EncodeToString(hash[:]) +} + // readNodeDomain reads the domain from the node config file func readNodeDomain() (string, error) { configPath := "/opt/orama/.orama/configs/node.yaml" diff --git a/pkg/config/database_config.go b/pkg/config/database_config.go index e905006..8383fd5 100644 --- a/pkg/config/database_config.go +++ b/pkg/config/database_config.go @@ -22,6 +22,13 @@ type DatabaseConfig struct { NodeCACert string `yaml:"node_ca_cert"` // Path to CA certificate (optional, uses system CA if not set) NodeNoVerify bool `yaml:"node_no_verify"` // Skip certificate verification (for testing/self-signed certs) + // RQLite HTTP Basic Auth credentials. + // When RQLiteAuthFile is set, rqlited is launched with `-auth `. + // Username/password are embedded in all client DSNs (harmless when auth not enforced). + RQLiteUsername string `yaml:"rqlite_username"` + RQLitePassword string `yaml:"rqlite_password"` + RQLiteAuthFile string `yaml:"rqlite_auth_file"` // Path to RQLite auth JSON file. Empty = auth not enforced. + // Raft tuning (passed through to rqlited CLI flags). // Higher defaults than rqlited's 1s suit WireGuard latency. RaftElectionTimeout time.Duration `yaml:"raft_election_timeout"` // default: 5s diff --git a/pkg/coredns/rqlite/backend.go b/pkg/coredns/rqlite/backend.go index 54696e5..5518b67 100644 --- a/pkg/coredns/rqlite/backend.go +++ b/pkg/coredns/rqlite/backend.go @@ -31,9 +31,10 @@ type Backend struct { healthy bool } -// NewBackend creates a new RQLite backend -func NewBackend(dsn string, refreshRate time.Duration, logger *zap.Logger) (*Backend, error) { - client, err := NewRQLiteClient(dsn, logger) +// NewBackend creates a new RQLite backend. +// Optional username/password enable HTTP basic auth for RQLite connections. +func NewBackend(dsn string, refreshRate time.Duration, logger *zap.Logger, username, password string) (*Backend, error) { + client, err := NewRQLiteClient(dsn, logger, username, password) if err != nil { return nil, fmt.Errorf("failed to create RQLite client: %w", err) } diff --git a/pkg/coredns/rqlite/client.go b/pkg/coredns/rqlite/client.go index f6f64b9..b61ad51 100644 --- a/pkg/coredns/rqlite/client.go +++ b/pkg/coredns/rqlite/client.go @@ -15,6 +15,8 @@ import ( // RQLiteClient is a simple HTTP client for RQLite type RQLiteClient struct { baseURL string + username string // HTTP basic auth username (empty = no auth) + password string // HTTP basic auth password httpClient *http.Client logger *zap.Logger } @@ -32,10 +34,13 @@ type QueryResult struct { Error string `json:"error"` } -// NewRQLiteClient creates a new RQLite HTTP client -func NewRQLiteClient(dsn string, logger *zap.Logger) (*RQLiteClient, error) { +// NewRQLiteClient creates a new RQLite HTTP client. +// Optional username/password enable HTTP basic auth on all requests. +func NewRQLiteClient(dsn string, logger *zap.Logger, username, password string) (*RQLiteClient, error) { return &RQLiteClient{ - baseURL: dsn, + baseURL: dsn, + username: username, + password: password, httpClient: &http.Client{ Timeout: 10 * time.Second, Transport: &http.Transport{ @@ -65,6 +70,9 @@ func (c *RQLiteClient) Query(ctx context.Context, query string, args ...interfac } req.Header.Set("Content-Type", "application/json") + if c.username != "" && c.password != "" { + req.SetBasicAuth(c.username, c.password) + } resp, err := c.httpClient.Do(req) if err != nil { diff --git a/pkg/coredns/rqlite/setup.go b/pkg/coredns/rqlite/setup.go index abcb1c1..f3576ab 100644 --- a/pkg/coredns/rqlite/setup.go +++ b/pkg/coredns/rqlite/setup.go @@ -38,11 +38,13 @@ func parseConfig(c *caddy.Controller) (*RQLitePlugin, error) { } var ( - dsn = "http://localhost:5001" - refreshRate = 10 * time.Second - cacheTTL = 30 * time.Second - cacheSize = 10000 - zones []string + dsn = "http://localhost:5001" + refreshRate = 10 * time.Second + cacheTTL = 30 * time.Second + cacheSize = 10000 + rqliteUsername string + rqlitePassword string + zones []string ) // Parse zone arguments @@ -90,6 +92,18 @@ func parseConfig(c *caddy.Controller) (*RQLitePlugin, error) { } cacheSize = size + case "username": + if !c.NextArg() { + return nil, c.ArgErr() + } + rqliteUsername = c.Val() + + case "password": + if !c.NextArg() { + return nil, c.ArgErr() + } + rqlitePassword = c.Val() + default: return nil, c.Errf("unknown property '%s'", c.Val()) } @@ -101,7 +115,7 @@ func parseConfig(c *caddy.Controller) (*RQLitePlugin, error) { } // Create backend - backend, err := NewBackend(dsn, refreshRate, logger) + backend, err := NewBackend(dsn, refreshRate, logger, rqliteUsername, rqlitePassword) if err != nil { return nil, fmt.Errorf("failed to create backend: %w", err) } diff --git a/pkg/environments/production/config.go b/pkg/environments/production/config.go index 8de319a..cb80560 100644 --- a/pkg/environments/production/config.go +++ b/pkg/environments/production/config.go @@ -2,6 +2,7 @@ package production import ( "crypto/rand" + "encoding/base64" "encoding/hex" "fmt" "net" @@ -239,8 +240,15 @@ func (cg *ConfigGenerator) GenerateGatewayConfig(peerAddresses []string, enableH return templates.RenderGatewayConfig(data) } -// GenerateOlricConfig generates Olric configuration +// GenerateOlricConfig generates Olric configuration. +// Reads the Olric encryption key from secrets if available. func (cg *ConfigGenerator) GenerateOlricConfig(serverBindAddr string, httpPort int, memberlistBindAddr string, memberlistPort int, memberlistEnv string, advertiseAddr string, peers []string) (string, error) { + // Read encryption key from secrets if available + encryptionKey := "" + if data, err := os.ReadFile(filepath.Join(cg.oramaDir, "secrets", "olric-encryption-key")); err == nil { + encryptionKey = strings.TrimSpace(string(data)) + } + data := templates.OlricConfigData{ ServerBindAddr: serverBindAddr, HTTPPort: httpPort, @@ -249,6 +257,7 @@ func (cg *ConfigGenerator) GenerateOlricConfig(serverBindAddr string, httpPort i MemberlistEnvironment: memberlistEnv, MemberlistAdvertiseAddr: advertiseAddr, Peers: peers, + EncryptionKey: encryptionKey, } return templates.RenderOlricConfig(data) } @@ -323,6 +332,137 @@ func (sg *SecretGenerator) EnsureClusterSecret() (string, error) { return secret, nil } +// EnsureRQLiteAuth generates the RQLite auth credentials and JSON auth file. +// Returns (username, password). The auth JSON file is written to secrets/rqlite-auth.json. +func (sg *SecretGenerator) EnsureRQLiteAuth() (string, string, error) { + passwordPath := filepath.Join(sg.oramaDir, "secrets", "rqlite-password") + authFilePath := filepath.Join(sg.oramaDir, "secrets", "rqlite-auth.json") + secretDir := filepath.Dir(passwordPath) + username := "orama" + + if err := os.MkdirAll(secretDir, 0700); err != nil { + return "", "", fmt.Errorf("failed to create secrets directory: %w", err) + } + if err := os.Chmod(secretDir, 0700); err != nil { + return "", "", fmt.Errorf("failed to set secrets directory permissions: %w", err) + } + + // Try to read existing password + var password string + if data, err := os.ReadFile(passwordPath); err == nil { + password = strings.TrimSpace(string(data)) + } + + // Generate new password if needed + if password == "" { + bytes := make([]byte, 32) + if _, err := rand.Read(bytes); err != nil { + return "", "", fmt.Errorf("failed to generate RQLite password: %w", err) + } + password = hex.EncodeToString(bytes) + + if err := os.WriteFile(passwordPath, []byte(password), 0600); err != nil { + return "", "", fmt.Errorf("failed to save RQLite password: %w", err) + } + if err := ensureSecretFilePermissions(passwordPath); err != nil { + return "", "", err + } + } + + // Always regenerate the auth JSON file to ensure consistency + authJSON := fmt.Sprintf(`[{"username": "%s", "password": "%s", "perms": ["all"]}]`, username, password) + if err := os.WriteFile(authFilePath, []byte(authJSON), 0600); err != nil { + return "", "", fmt.Errorf("failed to save RQLite auth file: %w", err) + } + if err := ensureSecretFilePermissions(authFilePath); err != nil { + return "", "", err + } + + return username, password, nil +} + +// EnsureOlricEncryptionKey gets or generates a 32-byte encryption key for Olric memberlist gossip. +// The key is stored as base64 on disk and returned as base64 (what Olric expects). +func (sg *SecretGenerator) EnsureOlricEncryptionKey() (string, error) { + secretPath := filepath.Join(sg.oramaDir, "secrets", "olric-encryption-key") + secretDir := filepath.Dir(secretPath) + + if err := os.MkdirAll(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to create secrets directory: %w", err) + } + if err := os.Chmod(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to set secrets directory permissions: %w", err) + } + + // Try to read existing key + if data, err := os.ReadFile(secretPath); err == nil { + key := strings.TrimSpace(string(data)) + if key != "" { + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + return key, nil + } + } + + // Generate new 32-byte key, base64 encoded + keyBytes := make([]byte, 32) + if _, err := rand.Read(keyBytes); err != nil { + return "", fmt.Errorf("failed to generate Olric encryption key: %w", err) + } + key := base64.StdEncoding.EncodeToString(keyBytes) + + if err := os.WriteFile(secretPath, []byte(key), 0600); err != nil { + return "", fmt.Errorf("failed to save Olric encryption key: %w", err) + } + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + + return key, nil +} + +// EnsureAPIKeyHMACSecret gets or generates the HMAC secret used to hash API keys. +// The secret is a 32-byte random value stored as 64 hex characters. +func (sg *SecretGenerator) EnsureAPIKeyHMACSecret() (string, error) { + secretPath := filepath.Join(sg.oramaDir, "secrets", "api-key-hmac-secret") + secretDir := filepath.Dir(secretPath) + + if err := os.MkdirAll(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to create secrets directory: %w", err) + } + if err := os.Chmod(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to set secrets directory permissions: %w", err) + } + + // Try to read existing secret + if data, err := os.ReadFile(secretPath); err == nil { + secret := strings.TrimSpace(string(data)) + if len(secret) == 64 { + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + return secret, nil + } + } + + // Generate new secret (32 bytes = 64 hex chars) + bytes := make([]byte, 32) + if _, err := rand.Read(bytes); err != nil { + return "", fmt.Errorf("failed to generate API key HMAC secret: %w", err) + } + secret := hex.EncodeToString(bytes) + + if err := os.WriteFile(secretPath, []byte(secret), 0600); err != nil { + return "", fmt.Errorf("failed to save API key HMAC secret: %w", err) + } + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + + return secret, nil +} + func ensureSecretFilePermissions(secretPath string) error { if err := os.Chmod(secretPath, 0600); err != nil { return fmt.Errorf("failed to set permissions on %s: %w", secretPath, err) diff --git a/pkg/environments/production/firewall.go b/pkg/environments/production/firewall.go index 484c345..36168b7 100644 --- a/pkg/environments/production/firewall.go +++ b/pkg/environments/production/firewall.go @@ -98,7 +98,12 @@ func (fp *FirewallProvisioner) GenerateRules() []string { } // Allow all traffic from WireGuard subnet (inter-node encrypted traffic) - rules = append(rules, "ufw allow from 10.0.0.0/8") + rules = append(rules, "ufw allow from 10.0.0.0/24") + + // Disable IPv6 — no ip6tables rules exist, so services bound to 0.0.0.0 + // may be reachable via IPv6. Disable it entirely at the kernel level. + rules = append(rules, "sysctl -w net.ipv6.conf.all.disable_ipv6=1") + rules = append(rules, "sysctl -w net.ipv6.conf.default.disable_ipv6=1") // Enable firewall rules = append(rules, "ufw --force enable") @@ -109,7 +114,7 @@ func (fp *FirewallProvisioner) GenerateRules() []string { // can be misclassified as "invalid" by conntrack due to reordering/jitter // (especially between high-latency peers), causing silent packet drops. // Inserting at position 1 in INPUT ensures this runs before UFW chains. - rules = append(rules, "iptables -I INPUT 1 -i wg0 -s 10.0.0.0/8 -j ACCEPT") + rules = append(rules, "iptables -I INPUT 1 -i wg0 -s 10.0.0.0/24 -j ACCEPT") return rules } @@ -130,6 +135,22 @@ func (fp *FirewallProvisioner) Setup() error { } } + // Persist IPv6 disable across reboots + if err := fp.persistIPv6Disable(); err != nil { + return fmt.Errorf("failed to persist IPv6 disable: %w", err) + } + + return nil +} + +// persistIPv6Disable writes a sysctl config to disable IPv6 on boot. +func (fp *FirewallProvisioner) persistIPv6Disable() error { + content := "# Orama Network: disable IPv6 (no ip6tables rules configured)\nnet.ipv6.conf.all.disable_ipv6 = 1\nnet.ipv6.conf.default.disable_ipv6 = 1\n" + cmd := exec.Command("tee", "/etc/sysctl.d/99-orama-disable-ipv6.conf") + cmd.Stdin = strings.NewReader(content) + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to write sysctl config: %w\n%s", err, string(output)) + } return nil } diff --git a/pkg/environments/production/firewall_test.go b/pkg/environments/production/firewall_test.go index 8aaba7f..2d4168b 100644 --- a/pkg/environments/production/firewall_test.go +++ b/pkg/environments/production/firewall_test.go @@ -18,9 +18,11 @@ func TestFirewallProvisioner_GenerateRules_StandardNode(t *testing.T) { assertContainsRule(t, rules, "ufw allow 51820/udp") assertContainsRule(t, rules, "ufw allow 80/tcp") assertContainsRule(t, rules, "ufw allow 443/tcp") - assertContainsRule(t, rules, "ufw allow from 10.0.0.0/8") + assertContainsRule(t, rules, "ufw allow from 10.0.0.0/24") + assertContainsRule(t, rules, "sysctl -w net.ipv6.conf.all.disable_ipv6=1") + assertContainsRule(t, rules, "sysctl -w net.ipv6.conf.default.disable_ipv6=1") assertContainsRule(t, rules, "ufw --force enable") - assertContainsRule(t, rules, "iptables -I INPUT 1 -i wg0 -s 10.0.0.0/8 -j ACCEPT") + assertContainsRule(t, rules, "iptables -I INPUT 1 -i wg0 -s 10.0.0.0/24 -j ACCEPT") // Should NOT contain DNS or Anyone relay for _, rule := range rules { @@ -76,7 +78,7 @@ func TestFirewallProvisioner_GenerateRules_WireGuardSubnetAllowed(t *testing.T) rules := fp.GenerateRules() - assertContainsRule(t, rules, "ufw allow from 10.0.0.0/8") + assertContainsRule(t, rules, "ufw allow from 10.0.0.0/24") } func TestFirewallProvisioner_GenerateRules_FullConfig(t *testing.T) { diff --git a/pkg/environments/production/installers/coredns.go b/pkg/environments/production/installers/coredns.go index 7876517..b64378f 100644 --- a/pkg/environments/production/installers/coredns.go +++ b/pkg/environments/production/installers/coredns.go @@ -9,6 +9,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/constants" @@ -323,8 +324,18 @@ rqlite:rqlite ` } -// generateCorefile creates the CoreDNS configuration (RQLite only) +// generateCorefile creates the CoreDNS configuration (RQLite only). +// If RQLite credentials exist on disk, they are included in the config. func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string { + // Read RQLite credentials from secrets if available + authBlock := "" + if data, err := os.ReadFile("/opt/orama/.orama/secrets/rqlite-password"); err == nil { + password := strings.TrimSpace(string(data)) + if password != "" { + authBlock = fmt.Sprintf(" username orama\n password %s\n", password) + } + } + return fmt.Sprintf(`# CoreDNS configuration for %s # Uses RQLite for ALL DNS records (static + dynamic) # Static records (SOA, NS, A) are seeded into RQLite during installation @@ -336,7 +347,7 @@ func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string { refresh 5s ttl 30 cache_size 10000 - } +%s } # Enable logging and error reporting log @@ -351,7 +362,7 @@ func (ci *CoreDNSInstaller) generateCorefile(domain, rqliteDSN string) string { cache 300 errors } -`, domain, domain, rqliteDSN) +`, domain, domain, rqliteDSN, authBlock) } // seedStaticRecords inserts static zone records into RQLite (non-destructive) diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index 65dd7c8..339e7d3 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -1,6 +1,7 @@ package production import ( + "encoding/json" "fmt" "io" "os" @@ -256,6 +257,13 @@ func (ps *ProductionSetup) Phase2ProvisionEnvironment() error { } ps.logf(" ✓ Directory structure created") + // Create dedicated orama user for running services (non-root) + if err := ps.fsProvisioner.EnsureOramaUser(); err != nil { + ps.logf(" ⚠️ Could not create orama user: %v (services will run as root)", err) + } else { + ps.logf(" ✓ orama user ensured") + } + return nil } @@ -477,6 +485,11 @@ func (ps *ProductionSetup) Phase2cInitializeServices(peerAddresses []string, vps return fmt.Errorf("failed to initialize IPFS Cluster: %w", err) } + // After init, save own IPFS Cluster peer ID to trusted peers file + if err := ps.saveOwnClusterPeerID(clusterPath); err != nil { + ps.logf(" ⚠️ Could not save IPFS Cluster peer ID to trusted peers: %v", err) + } + // Initialize RQLite data directory rqliteDataDir := filepath.Join(dataDir, "rqlite") if err := ps.binaryInstaller.InitializeRQLiteDataDir(rqliteDataDir); err != nil { @@ -487,6 +500,50 @@ func (ps *ProductionSetup) Phase2cInitializeServices(peerAddresses []string, vps return nil } +// saveOwnClusterPeerID reads this node's IPFS Cluster peer ID from identity.json +// and appends it to the trusted-peers file so EnsureConfig() can use it. +func (ps *ProductionSetup) saveOwnClusterPeerID(clusterPath string) error { + identityPath := filepath.Join(clusterPath, "identity.json") + data, err := os.ReadFile(identityPath) + if err != nil { + return fmt.Errorf("failed to read identity.json: %w", err) + } + + var identity struct { + ID string `json:"id"` + } + if err := json.Unmarshal(data, &identity); err != nil { + return fmt.Errorf("failed to parse identity.json: %w", err) + } + if identity.ID == "" { + return fmt.Errorf("peer ID not found in identity.json") + } + + // Read existing trusted peers + trustedPeersPath := filepath.Join(ps.oramaDir, "secrets", "ipfs-cluster-trusted-peers") + var existing []string + if fileData, err := os.ReadFile(trustedPeersPath); err == nil { + for _, line := range strings.Split(strings.TrimSpace(string(fileData)), "\n") { + line = strings.TrimSpace(line) + if line != "" { + if line == identity.ID { + return nil // already present + } + existing = append(existing, line) + } + } + } + + existing = append(existing, identity.ID) + content := strings.Join(existing, "\n") + "\n" + if err := os.WriteFile(trustedPeersPath, []byte(content), 0600); err != nil { + return fmt.Errorf("failed to write trusted peers file: %w", err) + } + + ps.logf(" ✓ IPFS Cluster peer ID saved to trusted peers: %s", identity.ID) + return nil +} + // Phase3GenerateSecrets generates shared secrets and keys func (ps *ProductionSetup) Phase3GenerateSecrets() error { ps.logf("Phase 3: Generating secrets...") @@ -503,6 +560,24 @@ func (ps *ProductionSetup) Phase3GenerateSecrets() error { } ps.logf(" ✓ IPFS swarm key ensured") + // RQLite auth credentials + if _, _, err := ps.secretGenerator.EnsureRQLiteAuth(); err != nil { + return fmt.Errorf("failed to ensure RQLite auth: %w", err) + } + ps.logf(" ✓ RQLite auth credentials ensured") + + // Olric gossip encryption key + if _, err := ps.secretGenerator.EnsureOlricEncryptionKey(); err != nil { + return fmt.Errorf("failed to ensure Olric encryption key: %w", err) + } + ps.logf(" ✓ Olric encryption key ensured") + + // API key HMAC secret + if _, err := ps.secretGenerator.EnsureAPIKeyHMACSecret(); err != nil { + return fmt.Errorf("failed to ensure API key HMAC secret: %w", err) + } + ps.logf(" ✓ API key HMAC secret ensured") + // Node identity (unified architecture) peerID, err := ps.secretGenerator.EnsureNodeIdentity() if err != nil { diff --git a/pkg/environments/production/paths.go b/pkg/environments/production/paths.go index a2cd310..9223ae6 100644 --- a/pkg/environments/production/paths.go +++ b/pkg/environments/production/paths.go @@ -13,7 +13,8 @@ const ( OramaLogs = "/opt/orama/.orama/logs" // Pre-built binary archive paths (created by `orama build`) - OramaManifest = "/opt/orama/manifest.json" + OramaManifest = "/opt/orama/manifest.json" + OramaManifestSig = "/opt/orama/manifest.sig" OramaArchiveBin = "/opt/orama/bin" // Pre-built binaries OramaSystemdDir = "/opt/orama/systemd" // Namespace service templates OramaPackagesDir = "/opt/orama/packages" // .deb packages (e.g., anon.deb) diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go index 04d4233..ac424b3 100644 --- a/pkg/environments/production/prebuilt.go +++ b/pkg/environments/production/prebuilt.go @@ -1,12 +1,17 @@ package production import ( + "crypto/sha256" + "encoding/hex" "encoding/json" "fmt" "io" "os" "os/exec" "path/filepath" + "strings" + + ethcrypto "github.com/ethereum/go-ethereum/crypto" ) // PreBuiltManifest describes the contents of a pre-built binary archive. @@ -40,6 +45,74 @@ func LoadPreBuiltManifest() (*PreBuiltManifest, error) { return &manifest, nil } +// OramaSignerAddress is the Ethereum address authorized to sign build archives. +// Archives signed by any other address are rejected during install. +// This is the DeBros deploy wallet — update if the signing key rotates. +const OramaSignerAddress = "0x0000000000000000000000000000000000000000" // TODO: set real address + +// VerifyArchiveSignature verifies that the pre-built archive was signed by the +// authorized Orama signer. Returns nil if the signature is valid, or if no +// signature file exists (unsigned archives are allowed but logged as a warning). +func VerifyArchiveSignature(manifest *PreBuiltManifest) error { + sigData, err := os.ReadFile(OramaManifestSig) + if os.IsNotExist(err) { + return nil // unsigned archive — caller decides whether to proceed + } + if err != nil { + return fmt.Errorf("failed to read manifest.sig: %w", err) + } + + // Reproduce the same hash used during signing: SHA256 of compact JSON + manifestJSON, err := json.Marshal(manifest) + if err != nil { + return fmt.Errorf("failed to marshal manifest: %w", err) + } + manifestHash := sha256.Sum256(manifestJSON) + hashHex := hex.EncodeToString(manifestHash[:]) + + // EVM personal_sign: keccak256("\x19Ethereum Signed Message:\n" + len + message) + msg := []byte(hashHex) + prefix := []byte("\x19Ethereum Signed Message:\n" + fmt.Sprintf("%d", len(msg))) + ethHash := ethcrypto.Keccak256(prefix, msg) + + // Decode signature + sigHex := strings.TrimSpace(string(sigData)) + if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") { + sigHex = sigHex[2:] + } + sig, err := hex.DecodeString(sigHex) + if err != nil || len(sig) != 65 { + return fmt.Errorf("invalid signature format in manifest.sig") + } + + // Normalize recovery ID + if sig[64] >= 27 { + sig[64] -= 27 + } + + // Recover public key from signature + pub, err := ethcrypto.SigToPub(ethHash, sig) + if err != nil { + return fmt.Errorf("signature recovery failed: %w", err) + } + + recovered := ethcrypto.PubkeyToAddress(*pub).Hex() + expected := strings.ToLower(OramaSignerAddress) + got := strings.ToLower(recovered) + + if got != expected { + return fmt.Errorf("archive signed by %s, expected %s — refusing to install", recovered, OramaSignerAddress) + } + + return nil +} + +// IsArchiveSigned returns true if a manifest.sig file exists alongside the manifest. +func IsArchiveSigned() bool { + _, err := os.Stat(OramaManifestSig) + return err == nil +} + // installFromPreBuilt installs all binaries from a pre-built archive. // The archive must already be extracted at /opt/orama/ with: // - /opt/orama/bin/ — all pre-compiled binaries @@ -49,6 +122,16 @@ func LoadPreBuiltManifest() (*PreBuiltManifest, error) { func (ps *ProductionSetup) installFromPreBuilt(manifest *PreBuiltManifest) error { ps.logf(" Using pre-built binary archive v%s (%s) linux/%s", manifest.Version, manifest.Commit, manifest.Arch) + // Verify archive signature if present + if IsArchiveSigned() { + if err := VerifyArchiveSignature(manifest); err != nil { + return fmt.Errorf("archive signature verification failed: %w", err) + } + ps.logf(" ✓ Archive signature verified") + } else { + ps.logf(" ⚠️ Archive is unsigned — consider using 'orama build --sign'") + } + // Install minimal system dependencies (no build tools needed) if err := ps.installMinimalSystemDeps(); err != nil { ps.logf(" ⚠️ System dependencies warning: %v", err) diff --git a/pkg/environments/production/provisioner.go b/pkg/environments/production/provisioner.go index 97e3089..15d8741 100644 --- a/pkg/environments/production/provisioner.go +++ b/pkg/environments/production/provisioner.go @@ -83,6 +83,38 @@ func (fp *FilesystemProvisioner) EnsureDirectoryStructure() error { return nil } +// EnsureOramaUser creates the 'orama' system user and group for running services. +// Sets ownership of the orama data directory to the new user. +func (fp *FilesystemProvisioner) EnsureOramaUser() error { + // Check if user already exists + if err := exec.Command("id", "orama").Run(); err == nil { + return nil // user already exists + } + + // Create system user with no login shell and home at /opt/orama + cmd := exec.Command("useradd", "--system", "--no-create-home", + "--home-dir", fp.oramaHome, "--shell", "/usr/sbin/nologin", "orama") + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("failed to create orama user: %w\n%s", err, string(output)) + } + + // Set ownership of orama directories + chown := exec.Command("chown", "-R", "orama:orama", fp.oramaDir) + if output, err := chown.CombinedOutput(); err != nil { + return fmt.Errorf("failed to chown %s: %w\n%s", fp.oramaDir, err, string(output)) + } + + // Also chown the bin directory + binDir := filepath.Join(fp.oramaHome, "bin") + if _, err := os.Stat(binDir); err == nil { + chown = exec.Command("chown", "-R", "orama:orama", binDir) + if output, err := chown.CombinedOutput(); err != nil { + return fmt.Errorf("failed to chown %s: %w\n%s", binDir, err, string(output)) + } + } + + return nil +} // StateDetector checks for existing production state type StateDetector struct { diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 2e47f66..6c08da9 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -8,6 +8,17 @@ import ( "strings" ) +// oramaServiceHardening contains common systemd security directives for orama services. +const oramaServiceHardening = `User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes` + // SystemdServiceGenerator generates systemd unit files type SystemdServiceGenerator struct { oramaHome string @@ -34,6 +45,8 @@ Wants=network-online.target [Service] Type=simple +%[6]s +ReadWritePaths=%[3]s Environment=HOME=%[1]s Environment=IPFS_PATH=%[2]s ExecStartPre=/bin/bash -c 'if [ -f %[3]s/secrets/swarm.key ] && [ ! -f %[2]s/swarm.key ]; then cp %[3]s/secrets/swarm.key %[2]s/swarm.key && chmod 600 %[2]s/swarm.key; fi' @@ -52,7 +65,7 @@ MemoryMax=4G [Install] WantedBy=multi-user.target -`, ssg.oramaHome, ipfsRepoPath, ssg.oramaDir, logFile, ipfsBinary) +`, ssg.oramaHome, ipfsRepoPath, ssg.oramaDir, logFile, ipfsBinary, oramaServiceHardening) } // GenerateIPFSClusterService generates the IPFS Cluster systemd unit @@ -75,6 +88,8 @@ Requires=orama-ipfs.service [Service] Type=simple +%[6]s +ReadWritePaths=%[7]s WorkingDirectory=%[1]s Environment=HOME=%[1]s Environment=IPFS_CLUSTER_PATH=%[2]s @@ -96,7 +111,7 @@ MemoryMax=2G [Install] WantedBy=multi-user.target -`, ssg.oramaHome, clusterPath, logFile, clusterBinary, clusterSecret) +`, ssg.oramaHome, clusterPath, logFile, clusterBinary, clusterSecret, oramaServiceHardening, ssg.oramaDir) } // GenerateRQLiteService generates the RQLite systemd unit @@ -128,6 +143,8 @@ Wants=network-online.target [Service] Type=simple +%[6]s +ReadWritePaths=%[7]s Environment=HOME=%[1]s ExecStart=%[5]s %[2]s Restart=always @@ -143,7 +160,7 @@ KillMode=mixed [Install] WantedBy=multi-user.target -`, ssg.oramaHome, args, logFile, dataDir, rqliteBinary) +`, ssg.oramaHome, args, logFile, dataDir, rqliteBinary, oramaServiceHardening, ssg.oramaDir) } // GenerateOlricService generates the Olric systemd unit @@ -158,6 +175,8 @@ Wants=network-online.target [Service] Type=simple +%[6]s +ReadWritePaths=%[4]s Environment=HOME=%[1]s Environment=OLRIC_SERVER_CONFIG=%[2]s ExecStart=%[5]s @@ -175,7 +194,7 @@ MemoryMax=4G [Install] WantedBy=multi-user.target -`, ssg.oramaHome, olricConfigPath, logFile, ssg.oramaDir, olricBinary) +`, ssg.oramaHome, olricConfigPath, logFile, ssg.oramaDir, olricBinary, oramaServiceHardening) } // GenerateNodeService generates the Orama Node systemd unit @@ -193,6 +212,8 @@ Requires=wg-quick@wg0.service [Service] Type=simple +%[5]s +ReadWritePaths=%[2]s WorkingDirectory=%[1]s Environment=HOME=%[1]s ExecStart=%[1]s/bin/orama-node --config %[2]s/configs/%[3]s @@ -211,7 +232,7 @@ OOMScoreAdjust=-500 [Install] WantedBy=multi-user.target -`, ssg.oramaHome, ssg.oramaDir, configFile, logFile) +`, ssg.oramaHome, ssg.oramaDir, configFile, logFile, oramaServiceHardening) } // GenerateVaultService generates the Orama Vault Guardian systemd unit. @@ -230,6 +251,16 @@ PartOf=orama-node.service [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths=%[2]s ExecStart=%[1]s/bin/vault-guardian --config %[2]s/vault.yaml Restart=on-failure RestartSec=5 @@ -238,9 +269,6 @@ StandardError=append:%[3]s SyslogIdentifier=orama-vault PrivateTmp=yes -ProtectSystem=strict -ReadWritePaths=%[2]s -NoNewPrivileges=yes LimitMEMLOCK=67108864 MemoryMax=512M TimeoutStopSec=30 @@ -261,6 +289,8 @@ Wants=orama-node.service orama-olric.service [Service] Type=simple +%[4]s +ReadWritePaths=%[2]s WorkingDirectory=%[1]s Environment=HOME=%[1]s ExecStart=%[1]s/bin/gateway --config %[2]s/data/gateway.yaml @@ -278,7 +308,7 @@ MemoryMax=4G [Install] WantedBy=multi-user.target -`, ssg.oramaHome, ssg.oramaDir, logFile) +`, ssg.oramaHome, ssg.oramaDir, logFile, oramaServiceHardening) } // GenerateAnyoneClientService generates the Anyone Client SOCKS5 proxy systemd unit. @@ -353,7 +383,7 @@ WantedBy=multi-user.target // GenerateCoreDNSService generates the CoreDNS systemd unit func (ssg *SystemdServiceGenerator) GenerateCoreDNSService() string { - return `[Unit] + return fmt.Sprintf(`[Unit] Description=CoreDNS DNS Server with RQLite backend Documentation=https://coredns.io After=network-online.target orama-node.service @@ -361,11 +391,16 @@ Wants=network-online.target orama-node.service [Service] Type=simple +%[1]s +ReadWritePaths=%[2]s +AmbientCapabilities=CAP_NET_BIND_SERVICE +CapabilityBoundingSet=CAP_NET_BIND_SERVICE ExecStart=/usr/local/bin/coredns -conf /etc/coredns/Corefile Restart=on-failure RestartSec=5 SyslogIdentifier=coredns +PrivateTmp=yes LimitNOFILE=65536 TimeoutStopSec=30 KillMode=mixed @@ -373,12 +408,12 @@ MemoryMax=1G [Install] WantedBy=multi-user.target -` +`, oramaServiceHardening, ssg.oramaDir) } // GenerateCaddyService generates the Caddy systemd unit for SSL/TLS func (ssg *SystemdServiceGenerator) GenerateCaddyService() string { - return `[Unit] + return fmt.Sprintf(`[Unit] Description=Caddy HTTP/2 Server Documentation=https://caddyserver.com/docs/ After=network-online.target orama-node.service coredns.service @@ -387,6 +422,10 @@ Wants=orama-node.service [Service] Type=simple +%[1]s +ReadWritePaths=%[2]s /var/lib/caddy /etc/caddy +AmbientCapabilities=CAP_NET_BIND_SERVICE +CapabilityBoundingSet=CAP_NET_BIND_SERVICE ExecStart=/usr/bin/caddy run --environ --config /etc/caddy/Caddyfile ExecReload=/usr/bin/caddy reload --config /etc/caddy/Caddyfile TimeoutStopSec=5s @@ -401,7 +440,7 @@ MemoryMax=2G [Install] WantedBy=multi-user.target -` +`, oramaServiceHardening, ssg.oramaDir) } // SystemdController manages systemd service operations diff --git a/pkg/environments/production/wireguard.go b/pkg/environments/production/wireguard.go index 08292e9..6fa2ed3 100644 --- a/pkg/environments/production/wireguard.go +++ b/pkg/environments/production/wireguard.go @@ -117,8 +117,8 @@ func (wp *WireGuardProvisioner) GenerateConfig() string { // Accept all WireGuard subnet traffic before UFW's conntrack "invalid" drop. // Without this, packets reordered by the tunnel get silently dropped. - sb.WriteString("PostUp = iptables -I INPUT 1 -i wg0 -s 10.0.0.0/8 -j ACCEPT\n") - sb.WriteString("PostDown = iptables -D INPUT -i wg0 -s 10.0.0.0/8 -j ACCEPT\n") + sb.WriteString("PostUp = iptables -I INPUT 1 -i wg0 -s 10.0.0.0/24 -j ACCEPT\n") + sb.WriteString("PostDown = iptables -D INPUT -i wg0 -s 10.0.0.0/24 -j ACCEPT\n") for _, peer := range wp.config.Peers { sb.WriteString("\n[Peer]\n") diff --git a/pkg/environments/production/wireguard_test.go b/pkg/environments/production/wireguard_test.go index e24f54c..9a460db 100644 --- a/pkg/environments/production/wireguard_test.go +++ b/pkg/environments/production/wireguard_test.go @@ -95,10 +95,10 @@ func TestWireGuardProvisioner_GenerateConfig_NoPeers(t *testing.T) { if !strings.Contains(config, "PrivateKey = dGVzdHByaXZhdGVrZXl0ZXN0cHJpdmF0ZWtleXM=") { t.Error("config should contain PrivateKey") } - if !strings.Contains(config, "PostUp = iptables -I INPUT 1 -i wg0 -s 10.0.0.0/8 -j ACCEPT") { + if !strings.Contains(config, "PostUp = iptables -I INPUT 1 -i wg0 -s 10.0.0.0/24 -j ACCEPT") { t.Error("config should contain PostUp iptables rule for WireGuard subnet") } - if !strings.Contains(config, "PostDown = iptables -D INPUT -i wg0 -s 10.0.0.0/8 -j ACCEPT") { + if !strings.Contains(config, "PostDown = iptables -D INPUT -i wg0 -s 10.0.0.0/24 -j ACCEPT") { t.Error("config should contain PostDown iptables cleanup rule") } if strings.Contains(config, "[Peer]") { diff --git a/pkg/environments/templates/olric.yaml b/pkg/environments/templates/olric.yaml index 57f15c7..bd8838f 100644 --- a/pkg/environments/templates/olric.yaml +++ b/pkg/environments/templates/olric.yaml @@ -15,3 +15,6 @@ memberlist: - "{{.}}" {{- end}} {{- end}} +{{- if .EncryptionKey}} + encryptionKey: "{{.EncryptionKey}}" +{{- end}} diff --git a/pkg/environments/templates/render.go b/pkg/environments/templates/render.go index 4253c26..d867955 100644 --- a/pkg/environments/templates/render.go +++ b/pkg/environments/templates/render.go @@ -65,6 +65,7 @@ type OlricConfigData struct { MemberlistEnvironment string // "local", "lan", or "wan" MemberlistAdvertiseAddr string // Advertise address (WG IP) so other nodes can reach us Peers []string // Seed peers for memberlist (host:port) + EncryptionKey string // Base64-encoded 32-byte key for memberlist gossip encryption (empty = no encryption) } // SystemdIPFSData holds parameters for systemd IPFS service rendering diff --git a/pkg/environments/templates/systemd_gateway.service b/pkg/environments/templates/systemd_gateway.service index 8cbc716..4018843 100644 --- a/pkg/environments/templates/systemd_gateway.service +++ b/pkg/environments/templates/systemd_gateway.service @@ -5,6 +5,16 @@ Wants=orama-node.service [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths={{.OramaDir}} WorkingDirectory={{.HomeDir}} Environment=HOME={{.HomeDir}} ExecStart={{.HomeDir}}/bin/gateway --config {{.OramaDir}}/data/gateway.yaml diff --git a/pkg/environments/templates/systemd_ipfs.service b/pkg/environments/templates/systemd_ipfs.service index 1436a27..471950e 100644 --- a/pkg/environments/templates/systemd_ipfs.service +++ b/pkg/environments/templates/systemd_ipfs.service @@ -5,6 +5,16 @@ Wants=network-online.target [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths={{.IPFSRepoPath}} {{.OramaDir}} Environment=HOME={{.HomeDir}} Environment=IPFS_PATH={{.IPFSRepoPath}} ExecStartPre=/bin/bash -c 'if [ -f {{.SecretsDir}}/swarm.key ] && [ ! -f {{.IPFSRepoPath}}/swarm.key ]; then cp {{.SecretsDir}}/swarm.key {{.IPFSRepoPath}}/swarm.key && chmod 600 {{.IPFSRepoPath}}/swarm.key; fi' diff --git a/pkg/environments/templates/systemd_ipfs_cluster.service b/pkg/environments/templates/systemd_ipfs_cluster.service index 7f31b75..9d10c2f 100644 --- a/pkg/environments/templates/systemd_ipfs_cluster.service +++ b/pkg/environments/templates/systemd_ipfs_cluster.service @@ -6,6 +6,16 @@ Requires=orama-ipfs-{{.NodeType}}.service [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths={{.ClusterPath}} {{.OramaDir}} WorkingDirectory={{.HomeDir}} Environment=HOME={{.HomeDir}} Environment=CLUSTER_PATH={{.ClusterPath}} diff --git a/pkg/environments/templates/systemd_node.service b/pkg/environments/templates/systemd_node.service index bb57e0d..c8a79a3 100644 --- a/pkg/environments/templates/systemd_node.service +++ b/pkg/environments/templates/systemd_node.service @@ -6,6 +6,16 @@ Requires=orama-ipfs-cluster-{{.NodeType}}.service [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths={{.OramaDir}} WorkingDirectory={{.HomeDir}} Environment=HOME={{.HomeDir}} ExecStart={{.HomeDir}}/bin/orama-node --config {{.OramaDir}}/configs/{{.ConfigFile}} diff --git a/pkg/environments/templates/systemd_olric.service b/pkg/environments/templates/systemd_olric.service index b85961b..ef15519 100644 --- a/pkg/environments/templates/systemd_olric.service +++ b/pkg/environments/templates/systemd_olric.service @@ -5,6 +5,16 @@ Wants=network-online.target [Service] Type=simple +User=orama +Group=orama +ProtectSystem=strict +ProtectHome=yes +NoNewPrivileges=yes +PrivateDevices=yes +ProtectKernelTunables=yes +ProtectKernelModules=yes +RestrictNamespaces=yes +ReadWritePaths={{.OramaDir}} Environment=HOME={{.HomeDir}} Environment=OLRIC_SERVER_CONFIG={{.ConfigPath}} ExecStart=/usr/local/bin/olric-server diff --git a/pkg/gateway/auth/crypto.go b/pkg/gateway/auth/crypto.go new file mode 100644 index 0000000..9f987fa --- /dev/null +++ b/pkg/gateway/auth/crypto.go @@ -0,0 +1,24 @@ +package auth + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/hex" +) + +// sha256Hex returns the lowercase hex-encoded SHA-256 hash of the input string. +// Used to hash refresh tokens before storage — deterministic so we can hash on +// insert and hash on lookup without storing the raw token. +func sha256Hex(s string) string { + h := sha256.Sum256([]byte(s)) + return hex.EncodeToString(h[:]) +} + +// HmacSHA256Hex computes HMAC-SHA256 of data with the given secret key and +// returns the result as a lowercase hex string. Used for API key hashing — +// fast and deterministic, allowing direct DB lookup by hash. +func HmacSHA256Hex(data, secret string) string { + mac := hmac.New(sha256.New, []byte(secret)) + mac.Write([]byte(data)) + return hex.EncodeToString(mac.Sum(nil)) +} diff --git a/pkg/gateway/auth/service.go b/pkg/gateway/auth/service.go index 0fe7176..2be287a 100644 --- a/pkg/gateway/auth/service.go +++ b/pkg/gateway/auth/service.go @@ -24,14 +24,15 @@ import ( // Service handles authentication business logic type Service struct { - logger *logging.ColoredLogger - orm client.NetworkClient - signingKey *rsa.PrivateKey - keyID string - edSigningKey ed25519.PrivateKey - edKeyID string - preferEdDSA bool - defaultNS string + logger *logging.ColoredLogger + orm client.NetworkClient + signingKey *rsa.PrivateKey + keyID string + edSigningKey ed25519.PrivateKey + edKeyID string + preferEdDSA bool + defaultNS string + apiKeyHMACSecret string // HMAC secret for hashing API keys before storage } func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signingKeyPEM string, defaultNS string) (*Service, error) { @@ -61,6 +62,21 @@ func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signing return s, nil } +// SetAPIKeyHMACSecret configures the HMAC secret used to hash API keys before storage. +// When set, API keys are stored as HMAC-SHA256(key, secret) in the database. +func (s *Service) SetAPIKeyHMACSecret(secret string) { + s.apiKeyHMACSecret = secret +} + +// HashAPIKey returns the HMAC-SHA256 hash of an API key if the HMAC secret is set, +// or returns the raw key for backward compatibility during rolling upgrade. +func (s *Service) HashAPIKey(key string) string { + if s.apiKeyHMACSecret == "" { + return key + } + return HmacSHA256Hex(key, s.apiKeyHMACSecret) +} + // SetEdDSAKey configures an Ed25519 signing key for EdDSA JWT support. // When set, new tokens are signed with EdDSA; RS256 is still accepted for verification. func (s *Service) SetEdDSAKey(privKey ed25519.PrivateKey) { @@ -207,9 +223,10 @@ func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (st internalCtx := client.WithInternalAuth(ctx) db := s.orm.Database() + hashedRefresh := sha256Hex(refresh) if _, err := db.Query(internalCtx, "INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))", - nsID, wallet, refresh, "gateway", + nsID, wallet, hashedRefresh, "gateway", ); err != nil { return "", "", 0, fmt.Errorf("failed to store refresh token: %w", err) } @@ -227,8 +244,9 @@ func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace stri return "", "", 0, err } + hashedRefresh := sha256Hex(refreshToken) q := "SELECT subject FROM refresh_tokens WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" - res, err := db.Query(internalCtx, q, nsID, refreshToken) + res, err := db.Query(internalCtx, q, nsID, hashedRefresh) if err != nil || res == nil || res.Count == 0 { return "", "", 0, fmt.Errorf("invalid or expired refresh token") } @@ -262,7 +280,8 @@ func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all } if token != "" { - _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, token) + hashedToken := sha256Hex(token) + _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, hashedToken) return err } @@ -335,19 +354,21 @@ func (s *Service) GetOrCreateAPIKey(ctx context.Context, wallet, namespace strin } apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + namespace - if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", apiKey, "", nsID); err != nil { + // Store the HMAC hash of the key (not the raw key) if HMAC secret is configured + hashedKey := s.HashAPIKey(apiKey) + if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", hashedKey, "", nsID); err != nil { return "", fmt.Errorf("failed to store api key: %w", err) } // Link wallet -> api_key - rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", apiKey) + rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", hashedKey) if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 { apiKeyID := rid.Rows[0][0] _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(wallet), apiKeyID) } - // Record ownerships - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, apiKey) + // Record ownerships — store the hash in ownership too + _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, hashedKey) _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, wallet) return apiKey, nil diff --git a/pkg/gateway/config.go b/pkg/gateway/config.go index e45a74b..41cdebb 100644 --- a/pkg/gateway/config.go +++ b/pkg/gateway/config.go @@ -39,9 +39,18 @@ type Config struct { IPFSReplicationFactor int // Replication factor for pins (default: 3) IPFSEnableEncryption bool // Enable client-side encryption before upload (default: true, discovered from node configs) + // RQLite authentication (basic auth credentials embedded in DSN) + RQLiteUsername string // RQLite HTTP basic auth username (default: "orama") + RQLitePassword string // RQLite HTTP basic auth password + // WireGuard mesh configuration ClusterSecret string // Cluster secret for authenticating internal WireGuard peer exchange + // API key HMAC secret for hashing API keys before storage. + // When set, API keys are stored as HMAC-SHA256(key, secret) in the database. + // Loaded from ~/.orama/secrets/api-key-hmac-secret. + APIKeyHMACSecret string + // WebRTC configuration (set when namespace has WebRTC enabled) WebRTCEnabled bool // Whether WebRTC endpoints are active on this gateway SFUPort int // Local SFU signaling port to proxy WebSocket connections to diff --git a/pkg/gateway/dependencies.go b/pkg/gateway/dependencies.go index a4bd097..eaad2dd 100644 --- a/pkg/gateway/dependencies.go +++ b/pkg/gateway/dependencies.go @@ -86,6 +86,7 @@ func NewDependencies(logger *logging.ColoredLogger, cfg *Config) (*Dependencies, if dsn == "" { dsn = "http://localhost:5001" } + dsn = injectRQLiteAuth(dsn, cfg.RQLiteUsername, cfg.RQLitePassword) cliCfg.DatabaseEndpoints = []string{dsn} } @@ -136,6 +137,9 @@ func initializeRQLite(logger *logging.ColoredLogger, cfg *Config, deps *Dependen dsn = "http://localhost:5001" } + // Inject basic auth credentials into DSN if available + dsn = injectRQLiteAuth(dsn, cfg.RQLiteUsername, cfg.RQLitePassword) + if strings.Contains(dsn, "?") { dsn += "&disableClusterDiscovery=true&level=none" } else { @@ -483,6 +487,12 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe logger.ComponentInfo(logging.ComponentGeneral, "EdDSA signing key loaded; new JWTs will use EdDSA") } + // Configure API key HMAC secret if available + if cfg.APIKeyHMACSecret != "" { + authService.SetAPIKeyHMACSecret(cfg.APIKeyHMACSecret) + logger.ComponentInfo(logging.ComponentGeneral, "API key HMAC secret loaded; new API keys will be hashed") + } + deps.AuthService = authService logger.ComponentInfo(logging.ComponentGeneral, "Serverless function engine ready", @@ -660,3 +670,19 @@ func discoverIPFSFromNodeConfigs(logger *zap.Logger) ipfsDiscoveryResult { return ipfsDiscoveryResult{} } + +// injectRQLiteAuth injects HTTP basic auth credentials into a RQLite DSN URL. +// If username or password is empty, the DSN is returned unchanged. +// Input: "http://localhost:5001" → Output: "http://orama:secret@localhost:5001" +func injectRQLiteAuth(dsn, username, password string) string { + if username == "" || password == "" { + return dsn + } + // Insert user:pass@ after the scheme (http:// or https://) + for _, scheme := range []string{"https://", "http://"} { + if strings.HasPrefix(dsn, scheme) { + return scheme + username + ":" + password + "@" + dsn[len(scheme):] + } + } + return dsn +} diff --git a/pkg/gateway/gateway.go b/pkg/gateway/gateway.go index d7088c3..0cecebc 100644 --- a/pkg/gateway/gateway.go +++ b/pkg/gateway/gateway.go @@ -313,7 +313,7 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { // Create client config for global namespace authCfg := client.DefaultClientConfig("default") // Use "default" namespace for global - authCfg.DatabaseEndpoints = []string{cfg.GlobalRQLiteDSN} + authCfg.DatabaseEndpoints = []string{injectRQLiteAuth(cfg.GlobalRQLiteDSN, cfg.RQLiteUsername, cfg.RQLitePassword)} if len(cfg.BootstrapPeers) > 0 { authCfg.BootstrapPeers = cfg.BootstrapPeers } diff --git a/pkg/gateway/handlers/join/handler.go b/pkg/gateway/handlers/join/handler.go index 2be17f9..301b39b 100644 --- a/pkg/gateway/handlers/join/handler.go +++ b/pkg/gateway/handlers/join/handler.go @@ -32,14 +32,18 @@ type JoinResponse struct { WGPeers []WGPeerInfo `json:"wg_peers"` // Secrets - ClusterSecret string `json:"cluster_secret"` - SwarmKey string `json:"swarm_key"` + ClusterSecret string `json:"cluster_secret"` + SwarmKey string `json:"swarm_key"` + APIKeyHMACSecret string `json:"api_key_hmac_secret,omitempty"` + RQLitePassword string `json:"rqlite_password,omitempty"` + OlricEncryptionKey string `json:"olric_encryption_key,omitempty"` // Cluster join info (all using WG IPs) - RQLiteJoinAddress string `json:"rqlite_join_address"` - IPFSPeer PeerInfo `json:"ipfs_peer"` - IPFSClusterPeer PeerInfo `json:"ipfs_cluster_peer"` - BootstrapPeers []string `json:"bootstrap_peers"` + RQLiteJoinAddress string `json:"rqlite_join_address"` + IPFSPeer PeerInfo `json:"ipfs_peer"` + IPFSClusterPeer PeerInfo `json:"ipfs_cluster_peer"` + IPFSClusterPeerIDs []string `json:"ipfs_cluster_peer_ids,omitempty"` + BootstrapPeers []string `json:"bootstrap_peers"` // Olric seed peers (WG IP:port for memberlist) OlricPeers []string `json:"olric_peers,omitempty"` @@ -155,6 +159,24 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { return } + // Read API key HMAC secret (optional — may not exist on older clusters) + apiKeyHMACSecret := "" + if data, err := os.ReadFile(h.oramaDir + "/secrets/api-key-hmac-secret"); err == nil { + apiKeyHMACSecret = strings.TrimSpace(string(data)) + } + + // Read RQLite password (optional — may not exist on older clusters) + rqlitePassword := "" + if data, err := os.ReadFile(h.oramaDir + "/secrets/rqlite-password"); err == nil { + rqlitePassword = strings.TrimSpace(string(data)) + } + + // Read Olric encryption key (optional — may not exist on older clusters) + olricEncryptionKey := "" + if data, err := os.ReadFile(h.oramaDir + "/secrets/olric-encryption-key"); err == nil { + olricEncryptionKey = strings.TrimSpace(string(data)) + } + // 7. Get all WG peers wgPeers, err := h.getWGPeers(ctx, req.WGPublicKey) if err != nil { @@ -181,6 +203,9 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { // 11. Read base domain from config baseDomain := h.readBaseDomain() + // 12. Read IPFS Cluster trusted peer IDs + ipfsClusterPeerIDs := h.readIPFSClusterTrustedPeers() + // Build Olric seed peers from all existing WG peer IPs (memberlist port 3322) var olricPeers []string for _, p := range wgPeers { @@ -191,16 +216,20 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { olricPeers = append(olricPeers, fmt.Sprintf("%s:3322", myWGIP)) resp := JoinResponse{ - WGIP: wgIP, - WGPeers: wgPeers, - ClusterSecret: strings.TrimSpace(string(clusterSecret)), - SwarmKey: strings.TrimSpace(string(swarmKey)), - RQLiteJoinAddress: fmt.Sprintf("%s:7001", myWGIP), - IPFSPeer: ipfsPeer, - IPFSClusterPeer: ipfsClusterPeer, - BootstrapPeers: bootstrapPeers, - OlricPeers: olricPeers, - BaseDomain: baseDomain, + WGIP: wgIP, + WGPeers: wgPeers, + ClusterSecret: strings.TrimSpace(string(clusterSecret)), + SwarmKey: strings.TrimSpace(string(swarmKey)), + APIKeyHMACSecret: apiKeyHMACSecret, + RQLitePassword: rqlitePassword, + OlricEncryptionKey: olricEncryptionKey, + RQLiteJoinAddress: fmt.Sprintf("%s:7001", myWGIP), + IPFSPeer: ipfsPeer, + IPFSClusterPeer: ipfsClusterPeer, + IPFSClusterPeerIDs: ipfsClusterPeerIDs, + BootstrapPeers: bootstrapPeers, + OlricPeers: olricPeers, + BaseDomain: baseDomain, } w.Header().Set("Content-Type", "application/json") @@ -454,6 +483,22 @@ func (h *Handler) buildBootstrapPeers(myWGIP, ipfsPeerID string) []string { } } +// readIPFSClusterTrustedPeers reads IPFS Cluster trusted peer IDs from the secrets file +func (h *Handler) readIPFSClusterTrustedPeers() []string { + data, err := os.ReadFile(h.oramaDir + "/secrets/ipfs-cluster-trusted-peers") + if err != nil { + return nil + } + var peers []string + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + line = strings.TrimSpace(line) + if line != "" { + peers = append(peers, line) + } + } + return peers +} + // readBaseDomain reads the base domain from node config func (h *Handler) readBaseDomain() string { data, err := os.ReadFile(h.oramaDir + "/configs/node.yaml") diff --git a/pkg/gateway/handlers/pubsub/ws_client.go b/pkg/gateway/handlers/pubsub/ws_client.go index c5127c4..6101ffd 100644 --- a/pkg/gateway/handlers/pubsub/ws_client.go +++ b/pkg/gateway/handlers/pubsub/ws_client.go @@ -4,6 +4,8 @@ import ( "encoding/base64" "encoding/json" "net/http" + "net/url" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/logging" @@ -14,8 +16,29 @@ import ( var wsUpgrader = websocket.Upgrader{ ReadBufferSize: 1024, WriteBufferSize: 1024, - // For early development we accept any origin; tighten later. - CheckOrigin: func(r *http.Request) bool { return true }, + CheckOrigin: checkWSOrigin, +} + +// checkWSOrigin validates WebSocket origins against the request's Host header. +// Non-browser clients (no Origin) are allowed. Browser clients must match the host. +func checkWSOrigin(r *http.Request) bool { + origin := r.Header.Get("Origin") + if origin == "" { + return true + } + host := r.Host + if host == "" { + return false + } + if idx := strings.LastIndex(host, ":"); idx != -1 { + host = host[:idx] + } + parsed, err := url.Parse(origin) + if err != nil { + return false + } + originHost := parsed.Hostname() + return originHost == host || strings.HasSuffix(originHost, "."+host) } // wsClient wraps a WebSocket connection with message handling diff --git a/pkg/gateway/handlers/serverless/ws_handler.go b/pkg/gateway/handlers/serverless/ws_handler.go index 45acae4..a8a10fa 100644 --- a/pkg/gateway/handlers/serverless/ws_handler.go +++ b/pkg/gateway/handlers/serverless/ws_handler.go @@ -4,6 +4,8 @@ import ( "context" "encoding/json" "net/http" + "net/url" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/serverless" @@ -12,6 +14,29 @@ import ( "go.uber.org/zap" ) +// checkWSOrigin validates WebSocket origins against the request's Host header. +// Non-browser clients (no Origin) are allowed. Browser clients must match the host. +func checkWSOrigin(r *http.Request) bool { + origin := r.Header.Get("Origin") + if origin == "" { + return true + } + host := r.Host + if host == "" { + return false + } + // Strip port from host if present + if idx := strings.LastIndex(host, ":"); idx != -1 { + host = host[:idx] + } + parsed, err := url.Parse(origin) + if err != nil { + return false + } + originHost := parsed.Hostname() + return originHost == host || strings.HasSuffix(originHost, "."+host) +} + // HandleWebSocket handles WebSocket connections for function streaming. // It upgrades HTTP connections to WebSocket and manages bi-directional communication // for real-time function invocation and streaming responses. @@ -28,7 +53,7 @@ func (h *ServerlessHandlers) HandleWebSocket(w http.ResponseWriter, r *http.Requ // Upgrade to WebSocket upgrader := websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { return true }, + CheckOrigin: checkWSOrigin, } conn, err := upgrader.Upgrade(w, r, nil) diff --git a/pkg/gateway/handlers/wireguard/handler.go b/pkg/gateway/handlers/wireguard/handler.go index cc31ca5..ad59fd1 100644 --- a/pkg/gateway/handlers/wireguard/handler.go +++ b/pkg/gateway/handlers/wireguard/handler.go @@ -6,6 +6,7 @@ import ( "fmt" "net/http" + "github.com/DeBrosOfficial/network/pkg/auth" "github.com/DeBrosOfficial/network/pkg/rqlite" "go.uber.org/zap" ) @@ -129,6 +130,11 @@ func (h *Handler) HandleListPeers(w http.ResponseWriter, r *http.Request) { return } + if !h.validateInternalRequest(r) { + http.Error(w, "unauthorized", http.StatusForbidden) + return + } + peers, err := h.ListPeers(r.Context()) if err != nil { h.logger.Error("failed to list WG peers", zap.Error(err)) @@ -147,6 +153,11 @@ func (h *Handler) HandleRemovePeer(w http.ResponseWriter, r *http.Request) { return } + if !h.validateInternalRequest(r) { + http.Error(w, "unauthorized", http.StatusForbidden) + return + } + nodeID := r.URL.Query().Get("node_id") if nodeID == "" { http.Error(w, "node_id parameter required", http.StatusBadRequest) @@ -165,6 +176,18 @@ func (h *Handler) HandleRemovePeer(w http.ResponseWriter, r *http.Request) { h.logger.Info("removed WireGuard peer", zap.String("node_id", nodeID)) } +// validateInternalRequest checks that the request comes from a WireGuard peer +// and includes a valid cluster secret. Both conditions must be met. +func (h *Handler) validateInternalRequest(r *http.Request) bool { + if !auth.IsWireGuardPeer(r.RemoteAddr) { + return false + } + if h.clusterSecret == "" { + return true + } + return r.Header.Get("X-Cluster-Secret") == h.clusterSecret +} + // ListPeers returns all registered WireGuard peers func (h *Handler) ListPeers(ctx context.Context) ([]PeerRecord, error) { var peers []PeerRecord diff --git a/pkg/gateway/middleware.go b/pkg/gateway/middleware.go index 65567ae..1cb5a07 100644 --- a/pkg/gateway/middleware.go +++ b/pkg/gateway/middleware.go @@ -74,7 +74,11 @@ func (g *Gateway) validateAuthForNamespaceProxy(r *http.Request) (namespace stri // lookupAPIKeyNamespace resolves an API key to its namespace using cache and DB. // dbClient controls which database is queried (global vs namespace-specific). // Returns the namespace name or an error if the key is invalid. +// +// Dual lookup strategy for rolling upgrade: tries HMAC-hashed key first (new keys), +// then falls back to raw key lookup (existing unhashed keys during transition). func (g *Gateway) lookupAPIKeyNamespace(ctx context.Context, key string, dbClient client.NetworkClient) (string, error) { + // Cache uses raw key as cache key (in-memory only, never persisted) if g.mwCache != nil { if cachedNS, ok := g.mwCache.GetAPIKeyNamespace(key); ok { return cachedNS, nil @@ -84,20 +88,33 @@ func (g *Gateway) lookupAPIKeyNamespace(ctx context.Context, key string, dbClien db := dbClient.Database() internalCtx := client.WithInternalAuth(ctx) q := "SELECT namespaces.name FROM api_keys JOIN namespaces ON api_keys.namespace_id = namespaces.id WHERE api_keys.key = ? LIMIT 1" - res, err := db.Query(internalCtx, q, key) - if err != nil || res == nil || res.Count == 0 || len(res.Rows) == 0 || len(res.Rows[0]) == 0 { - return "", fmt.Errorf("invalid API key") + + // Try HMAC-hashed lookup first (new keys stored as hashes) + hashedKey := g.authService.HashAPIKey(key) + res, err := db.Query(internalCtx, q, hashedKey) + if err == nil && res != nil && res.Count > 0 && len(res.Rows) > 0 && len(res.Rows[0]) > 0 { + if ns := getString(res.Rows[0][0]); ns != "" { + if g.mwCache != nil { + g.mwCache.SetAPIKeyNamespace(key, ns) + } + return ns, nil + } } - ns := getString(res.Rows[0][0]) - if ns == "" { - return "", fmt.Errorf("invalid API key") + // Fallback: try raw key lookup (existing unhashed keys during rolling upgrade) + if hashedKey != key { + res, err = db.Query(internalCtx, q, key) + if err == nil && res != nil && res.Count > 0 && len(res.Rows) > 0 && len(res.Rows[0]) > 0 { + if ns := getString(res.Rows[0][0]); ns != "" { + if g.mwCache != nil { + g.mwCache.SetAPIKeyNamespace(key, ns) + } + return ns, nil + } + } } - if g.mwCache != nil { - g.mwCache.SetAPIKeyNamespace(key, ns) - } - return ns, nil + return "", fmt.Errorf("invalid API key") } // isWebSocketUpgrade checks if the request is a WebSocket upgrade request diff --git a/pkg/gateway/rate_limiter.go b/pkg/gateway/rate_limiter.go index 8d05568..c1452de 100644 --- a/pkg/gateway/rate_limiter.go +++ b/pkg/gateway/rate_limiter.go @@ -6,13 +6,15 @@ import ( "strings" "sync" "time" + + "github.com/DeBrosOfficial/network/pkg/auth" ) // wireGuardNet is the WireGuard mesh subnet, parsed once at init. var wireGuardNet *net.IPNet func init() { - _, wireGuardNet, _ = net.ParseCIDR("10.0.0.0/8") + _, wireGuardNet, _ = net.ParseCIDR(auth.WireGuardSubnet) } // RateLimiter implements a token-bucket rate limiter per client IP. @@ -126,7 +128,7 @@ func (nrl *NamespaceRateLimiter) Allow(namespace string) bool { } // rateLimitMiddleware returns 429 when a client exceeds the rate limit. -// Internal traffic from the WireGuard subnet (10.0.0.0/8) is exempt. +// Internal traffic from the WireGuard subnet is exempt. func (g *Gateway) rateLimitMiddleware(next http.Handler) http.Handler { if g.rateLimiter == nil { return next @@ -170,7 +172,7 @@ func (g *Gateway) namespaceRateLimitMiddleware(next http.Handler) http.Handler { }) } -// isInternalIP returns true if the IP is in the WireGuard 10.0.0.0/8 subnet +// isInternalIP returns true if the IP is in the WireGuard subnet // or is a loopback address. func isInternalIP(ipStr string) bool { // Strip port if present @@ -187,6 +189,5 @@ func isInternalIP(ipStr string) bool { if ip.IsLoopback() { return true } - // 10.0.0.0/8 — WireGuard mesh return wireGuardNet.Contains(ip) } diff --git a/pkg/gateway/rate_limiter_test.go b/pkg/gateway/rate_limiter_test.go index 168f6e8..8d28ace 100644 --- a/pkg/gateway/rate_limiter_test.go +++ b/pkg/gateway/rate_limiter_test.go @@ -98,7 +98,9 @@ func TestIsInternalIP(t *testing.T) { }{ {"10.0.0.1", true}, {"10.0.0.254", true}, - {"10.255.255.255", true}, + {"10.0.0.255", true}, + {"10.0.1.1", false}, // outside /24 — VPS provider's internal range, not our WG mesh + {"10.255.255.255", false}, // outside /24 {"127.0.0.1", true}, {"192.168.1.1", false}, {"8.8.8.8", false}, diff --git a/pkg/ipfs/cluster.go b/pkg/ipfs/cluster.go index ea75509..66ff44c 100644 --- a/pkg/ipfs/cluster.go +++ b/pkg/ipfs/cluster.go @@ -1,6 +1,7 @@ package ipfs import ( + "encoding/json" "fmt" "net/http" "os" @@ -15,10 +16,11 @@ import ( // ClusterConfigManager manages IPFS Cluster configuration files type ClusterConfigManager struct { - cfg *config.Config - logger *zap.Logger - clusterPath string - secret string + cfg *config.Config + logger *zap.Logger + clusterPath string + secret string + trustedPeersPath string // path to ipfs-cluster-trusted-peers file } // NewClusterConfigManager creates a new IPFS Cluster config manager @@ -46,12 +48,14 @@ func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterCo } secretPath := filepath.Join(dataDir, "..", "cluster-secret") + trustedPeersPath := "" if strings.Contains(dataDir, ".orama") { home, err := os.UserHomeDir() if err == nil { secretsDir := filepath.Join(home, ".orama", "secrets") if err := os.MkdirAll(secretsDir, 0700); err == nil { secretPath = filepath.Join(secretsDir, "cluster-secret") + trustedPeersPath = filepath.Join(secretsDir, "ipfs-cluster-trusted-peers") } } } @@ -62,10 +66,11 @@ func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterCo } return &ClusterConfigManager{ - cfg: cfg, - logger: logger, - clusterPath: clusterPath, - secret: secret, + cfg: cfg, + logger: logger, + clusterPath: clusterPath, + secret: secret, + trustedPeersPath: trustedPeersPath, }, nil } @@ -114,7 +119,15 @@ func (cm *ClusterConfigManager) EnsureConfig() error { cfg.Cluster.Secret = cm.secret cfg.Cluster.ListenMultiaddress = []string{fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterListenPort)} cfg.Consensus.CRDT.ClusterName = "orama-cluster" - cfg.Consensus.CRDT.TrustedPeers = []string{"*"} + + // Use trusted peers from file if available, otherwise fall back to "*" (open trust) + trustedPeers := cm.loadTrustedPeersWithSelf() + if len(trustedPeers) > 0 { + cfg.Consensus.CRDT.TrustedPeers = trustedPeers + } else { + cfg.Consensus.CRDT.TrustedPeers = []string{"*"} + } + cfg.API.RestAPI.HTTPListenMultiaddress = fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", restAPIPort) cfg.API.IPFSProxy.ListenMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", proxyPort) cfg.API.IPFSProxy.NodeMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) @@ -198,3 +211,89 @@ func (cm *ClusterConfigManager) createTemplateConfig() *ClusterServiceConfig { cfg.Raw = make(map[string]interface{}) return cfg } + +// readClusterPeerID reads this node's IPFS Cluster peer ID from identity.json +func (cm *ClusterConfigManager) readClusterPeerID() (string, error) { + identityPath := filepath.Join(cm.clusterPath, "identity.json") + data, err := os.ReadFile(identityPath) + if err != nil { + return "", fmt.Errorf("failed to read identity.json: %w", err) + } + + var identity struct { + ID string `json:"id"` + } + if err := json.Unmarshal(data, &identity); err != nil { + return "", fmt.Errorf("failed to parse identity.json: %w", err) + } + if identity.ID == "" { + return "", fmt.Errorf("peer ID not found in identity.json") + } + return identity.ID, nil +} + +// loadTrustedPeers reads trusted peer IDs from the trusted-peers file (one per line) +func (cm *ClusterConfigManager) loadTrustedPeers() []string { + if cm.trustedPeersPath == "" { + return nil + } + data, err := os.ReadFile(cm.trustedPeersPath) + if err != nil { + return nil + } + var peers []string + for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") { + line = strings.TrimSpace(line) + if line != "" { + peers = append(peers, line) + } + } + return peers +} + +// addTrustedPeer appends a peer ID to the trusted-peers file if not already present +func (cm *ClusterConfigManager) addTrustedPeer(peerID string) error { + if cm.trustedPeersPath == "" || peerID == "" { + return nil + } + existing := cm.loadTrustedPeers() + for _, p := range existing { + if p == peerID { + return nil // already present + } + } + existing = append(existing, peerID) + return os.WriteFile(cm.trustedPeersPath, []byte(strings.Join(existing, "\n")+"\n"), 0600) +} + +// loadTrustedPeersWithSelf loads trusted peers from file and ensures this node's +// own peer ID is included. Returns nil if no trusted peers file exists. +func (cm *ClusterConfigManager) loadTrustedPeersWithSelf() []string { + peers := cm.loadTrustedPeers() + + // Try to read own peer ID and add it + ownID, err := cm.readClusterPeerID() + if err != nil { + cm.logger.Debug("Could not read own IPFS Cluster peer ID", zap.Error(err)) + return peers + } + + if ownID != "" { + if err := cm.addTrustedPeer(ownID); err != nil { + cm.logger.Warn("Failed to persist own peer ID to trusted peers file", zap.Error(err)) + } + // Check if already in the list + found := false + for _, p := range peers { + if p == ownID { + found = true + break + } + } + if !found { + peers = append(peers, ownID) + } + } + + return peers +} diff --git a/pkg/namespace/cluster_manager.go b/pkg/namespace/cluster_manager.go index 9ff9be2..136630f 100644 --- a/pkg/namespace/cluster_manager.go +++ b/pkg/namespace/cluster_manager.go @@ -34,6 +34,11 @@ type ClusterManagerConfig struct { IPFSAPIURL string // IPFS API URL (default: "http://localhost:4501") IPFSTimeout time.Duration // Timeout for IPFS operations (default: 60s) IPFSReplicationFactor int // IPFS replication factor (default: 3) + + // TurnEncryptionKey is a 32-byte AES-256 key for encrypting TURN shared secrets + // in RQLite. Derived from cluster secret via HKDF(clusterSecret, "turn-encryption"). + // If nil, TURN secrets are stored in plaintext (backward compatibility). + TurnEncryptionKey []byte } // ClusterManager orchestrates namespace cluster provisioning and lifecycle @@ -58,6 +63,9 @@ type ClusterManager struct { // Local node identity for distributed spawning localNodeID string + // AES-256 key for encrypting TURN secrets in RQLite (nil = plaintext) + turnEncryptionKey []byte + // Track provisioning operations provisioningMu sync.RWMutex provisioning map[string]bool // namespace -> in progress @@ -108,6 +116,7 @@ func NewClusterManager( ipfsAPIURL: ipfsAPIURL, ipfsTimeout: ipfsTimeout, ipfsReplicationFactor: ipfsReplicationFactor, + turnEncryptionKey: cfg.TurnEncryptionKey, logger: logger.With(zap.String("component", "cluster-manager")), provisioning: make(map[string]bool), } @@ -154,6 +163,7 @@ func NewClusterManagerWithComponents( ipfsAPIURL: ipfsAPIURL, ipfsTimeout: ipfsTimeout, ipfsReplicationFactor: ipfsReplicationFactor, + turnEncryptionKey: cfg.TurnEncryptionKey, logger: logger.With(zap.String("component", "cluster-manager")), provisioning: make(map[string]bool), } diff --git a/pkg/namespace/cluster_manager_webrtc.go b/pkg/namespace/cluster_manager_webrtc.go index 726bcf6..dde2c14 100644 --- a/pkg/namespace/cluster_manager_webrtc.go +++ b/pkg/namespace/cluster_manager_webrtc.go @@ -9,6 +9,7 @@ import ( "github.com/DeBrosOfficial/network/pkg/client" "github.com/DeBrosOfficial/network/pkg/gateway" + "github.com/DeBrosOfficial/network/pkg/secrets" "github.com/DeBrosOfficial/network/pkg/sfu" "github.com/google/uuid" "go.uber.org/zap" @@ -51,13 +52,23 @@ func (cm *ClusterManager) EnableWebRTC(ctx context.Context, namespaceName, enabl } turnSecret := base64.StdEncoding.EncodeToString(secretBytes) + // Encrypt TURN secret before storing in RQLite + storedSecret := turnSecret + if cm.turnEncryptionKey != nil { + encrypted, encErr := secrets.Encrypt(turnSecret, cm.turnEncryptionKey) + if encErr != nil { + return fmt.Errorf("failed to encrypt TURN secret: %w", encErr) + } + storedSecret = encrypted + } + // 4. Insert namespace_webrtc_config webrtcConfigID := uuid.New().String() _, err = cm.db.Exec(internalCtx, `INSERT INTO namespace_webrtc_config (id, namespace_cluster_id, namespace_name, enabled, turn_shared_secret, turn_credential_ttl, sfu_node_count, turn_node_count, enabled_by, enabled_at) VALUES (?, ?, ?, 1, ?, ?, ?, ?, ?, ?)`, webrtcConfigID, cluster.ID, namespaceName, - turnSecret, DefaultTURNCredentialTTL, + storedSecret, DefaultTURNCredentialTTL, DefaultSFUNodeCount, DefaultTURNNodeCount, enabledBy, time.Now(), ) @@ -297,6 +308,7 @@ func (cm *ClusterManager) DisableWebRTC(ctx context.Context, namespaceName strin } // GetWebRTCConfig returns the WebRTC configuration for a namespace. +// Transparently decrypts the TURN shared secret if it was encrypted at rest. func (cm *ClusterManager) GetWebRTCConfig(ctx context.Context, namespaceName string) (*WebRTCConfig, error) { internalCtx := client.WithInternalAuth(ctx) @@ -309,6 +321,16 @@ func (cm *ClusterManager) GetWebRTCConfig(ctx context.Context, namespaceName str if len(configs) == 0 { return nil, nil } + + // Decrypt TURN secret if encrypted (handles plaintext passthrough for backward compat) + if cm.turnEncryptionKey != nil && secrets.IsEncrypted(configs[0].TURNSharedSecret) { + decrypted, decErr := secrets.Decrypt(configs[0].TURNSharedSecret, cm.turnEncryptionKey) + if decErr != nil { + return nil, fmt.Errorf("failed to decrypt TURN secret: %w", decErr) + } + configs[0].TURNSharedSecret = decrypted + } + return &configs[0], nil } diff --git a/pkg/node/gateway.go b/pkg/node/gateway.go index 915cbb3..1911b28 100644 --- a/pkg/node/gateway.go +++ b/pkg/node/gateway.go @@ -6,6 +6,7 @@ import ( "net/http" "os" "path/filepath" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/gateway" @@ -13,6 +14,7 @@ import ( "github.com/DeBrosOfficial/network/pkg/ipfs" "github.com/DeBrosOfficial/network/pkg/logging" "github.com/DeBrosOfficial/network/pkg/namespace" + "github.com/DeBrosOfficial/network/pkg/secrets" "go.uber.org/zap" ) @@ -44,6 +46,18 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { clusterSecret = string(secretBytes) } + // Read API key HMAC secret for hashing API keys before storage + apiKeyHMACSecret := "" + if secretBytes, err := os.ReadFile(filepath.Join(oramaDir, "secrets", "api-key-hmac-secret")); err == nil { + apiKeyHMACSecret = strings.TrimSpace(string(secretBytes)) + } + + // Read RQLite credentials for authenticated DB connections + rqlitePassword := "" + if secretBytes, err := os.ReadFile(filepath.Join(oramaDir, "secrets", "rqlite-password")); err == nil { + rqlitePassword = strings.TrimSpace(string(secretBytes)) + } + gwCfg := &gateway.Config{ ListenAddr: n.config.HTTPGateway.ListenAddr, ClientNamespace: n.config.HTTPGateway.ClientNamespace, @@ -57,7 +71,10 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { IPFSTimeout: n.config.HTTPGateway.IPFSTimeout, BaseDomain: n.config.HTTPGateway.BaseDomain, DataDir: oramaDir, + RQLiteUsername: "orama", + RQLitePassword: rqlitePassword, ClusterSecret: clusterSecret, + APIKeyHMACSecret: apiKeyHMACSecret, WebRTCEnabled: n.config.HTTPGateway.WebRTC.Enabled, SFUPort: n.config.HTTPGateway.WebRTC.SFUPort, TURNDomain: n.config.HTTPGateway.WebRTC.TURNDomain, @@ -73,6 +90,14 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { // Wire up ClusterManager for per-namespace cluster provisioning if ormClient := apiGateway.GetORMClient(); ormClient != nil { baseDataDir := filepath.Join(os.ExpandEnv(n.config.Node.DataDir), "..", "data", "namespaces") + // Derive TURN encryption key from cluster secret (nil if no secret available) + var turnEncKey []byte + if clusterSecret != "" { + if key, keyErr := secrets.DeriveKey(clusterSecret, "turn-encryption"); keyErr == nil { + turnEncKey = key + } + } + clusterCfg := namespace.ClusterManagerConfig{ BaseDomain: n.config.HTTPGateway.BaseDomain, BaseDataDir: baseDataDir, @@ -81,6 +106,7 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { IPFSAPIURL: gwCfg.IPFSAPIURL, IPFSTimeout: gwCfg.IPFSTimeout, IPFSReplicationFactor: n.config.Database.IPFS.ReplicationFactor, + TurnEncryptionKey: turnEncKey, } clusterManager := namespace.NewClusterManager(ormClient, clusterCfg, n.logger.Logger) clusterManager.SetLocalNodeID(gwCfg.NodePeerID) diff --git a/pkg/rqlite/adapter.go b/pkg/rqlite/adapter.go index 3a23ba8..c0c8479 100644 --- a/pkg/rqlite/adapter.go +++ b/pkg/rqlite/adapter.go @@ -16,8 +16,13 @@ type RQLiteAdapter struct { // NewRQLiteAdapter creates a new adapter that provides sql.DB interface for RQLite func NewRQLiteAdapter(manager *RQLiteManager) (*RQLiteAdapter, error) { - // Use the gorqlite database/sql driver - db, err := sql.Open("rqlite", fmt.Sprintf("http://localhost:%d?disableClusterDiscovery=true&level=none", manager.config.RQLitePort)) + // Build DSN with optional basic auth credentials + dsn := fmt.Sprintf("http://localhost:%d?disableClusterDiscovery=true&level=none", manager.config.RQLitePort) + if manager.config.RQLiteUsername != "" && manager.config.RQLitePassword != "" { + dsn = fmt.Sprintf("http://%s:%s@localhost:%d?disableClusterDiscovery=true&level=none", + manager.config.RQLiteUsername, manager.config.RQLitePassword, manager.config.RQLitePort) + } + db, err := sql.Open("rqlite", dsn) if err != nil { return nil, fmt.Errorf("failed to open RQLite SQL connection: %w", err) } diff --git a/pkg/rqlite/instance_spawner.go b/pkg/rqlite/instance_spawner.go index 5739cbd..c98a78e 100644 --- a/pkg/rqlite/instance_spawner.go +++ b/pkg/rqlite/instance_spawner.go @@ -31,6 +31,7 @@ type InstanceConfig struct { JoinAddresses []string // Addresses to join (e.g., ["192.168.1.2:10001"]) DataDir string // Data directory for this instance IsLeader bool // Whether this is the first node (creates cluster) + AuthFile string // Path to RQLite auth JSON file. Empty = no auth enforcement. } // Instance represents a running RQLite instance @@ -91,6 +92,11 @@ func (is *InstanceSpawner) SpawnInstance(ctx context.Context, cfg InstanceConfig "-raft-leader-lease-timeout", "2s", ) + // RQLite HTTP Basic Auth + if cfg.AuthFile != "" { + args = append(args, "-auth", cfg.AuthFile) + } + // Add join addresses if not the leader (must be before data directory) if !cfg.IsLeader && len(cfg.JoinAddresses) > 0 { for _, addr := range cfg.JoinAddresses { diff --git a/pkg/rqlite/process.go b/pkg/rqlite/process.go index a70c692..d3fab87 100644 --- a/pkg/rqlite/process.go +++ b/pkg/rqlite/process.go @@ -137,6 +137,13 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string) "-raft-leader-lease-timeout", raftLeaderLease.String(), ) + // RQLite HTTP Basic Auth — when auth file exists, enforce authentication + if r.config.RQLiteAuthFile != "" { + r.logger.Info("Enabling RQLite HTTP Basic Auth", + zap.String("auth_file", r.config.RQLiteAuthFile)) + args = append(args, "-auth", r.config.RQLiteAuthFile) + } + if r.config.RQLiteJoinAddress != "" && !r.hasExistingState(rqliteDataDir) { r.logger.Info("First-time join to RQLite cluster", zap.String("join_address", r.config.RQLiteJoinAddress)) diff --git a/pkg/secrets/encrypt.go b/pkg/secrets/encrypt.go new file mode 100644 index 0000000..4aebb34 --- /dev/null +++ b/pkg/secrets/encrypt.go @@ -0,0 +1,98 @@ +// Package secrets provides application-level encryption for sensitive data stored in RQLite. +// Uses AES-256-GCM with HKDF key derivation from the cluster secret. +package secrets + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "crypto/sha256" + "encoding/base64" + "fmt" + "io" + "strings" + + "golang.org/x/crypto/hkdf" +) + +// Prefix for encrypted values to distinguish from plaintext during migration. +const encryptedPrefix = "enc:" + +// DeriveKey derives a 32-byte AES-256 key from the cluster secret using HKDF-SHA256. +// The purpose string provides domain separation (e.g., "turn-encryption"). +func DeriveKey(clusterSecret, purpose string) ([]byte, error) { + if clusterSecret == "" { + return nil, fmt.Errorf("cluster secret is empty") + } + reader := hkdf.New(sha256.New, []byte(clusterSecret), nil, []byte(purpose)) + key := make([]byte, 32) + if _, err := io.ReadFull(reader, key); err != nil { + return nil, fmt.Errorf("HKDF key derivation failed: %w", err) + } + return key, nil +} + +// Encrypt encrypts plaintext with AES-256-GCM using the given key. +// Returns a base64-encoded string prefixed with "enc:" for identification. +func Encrypt(plaintext string, key []byte) (string, error) { + block, err := aes.NewCipher(key) + if err != nil { + return "", fmt.Errorf("failed to create cipher: %w", err) + } + + gcm, err := cipher.NewGCM(block) + if err != nil { + return "", fmt.Errorf("failed to create GCM: %w", err) + } + + nonce := make([]byte, gcm.NonceSize()) + if _, err := io.ReadFull(rand.Reader, nonce); err != nil { + return "", fmt.Errorf("failed to generate nonce: %w", err) + } + + // nonce is prepended to ciphertext + ciphertext := gcm.Seal(nonce, nonce, []byte(plaintext), nil) + return encryptedPrefix + base64.StdEncoding.EncodeToString(ciphertext), nil +} + +// Decrypt decrypts an "enc:"-prefixed ciphertext string with AES-256-GCM. +// If the input is not prefixed with "enc:", it is returned as-is (plaintext passthrough +// for backward compatibility during migration). +func Decrypt(ciphertext string, key []byte) (string, error) { + if !strings.HasPrefix(ciphertext, encryptedPrefix) { + return ciphertext, nil // plaintext passthrough + } + + data, err := base64.StdEncoding.DecodeString(strings.TrimPrefix(ciphertext, encryptedPrefix)) + if err != nil { + return "", fmt.Errorf("failed to decode ciphertext: %w", err) + } + + block, err := aes.NewCipher(key) + if err != nil { + return "", fmt.Errorf("failed to create cipher: %w", err) + } + + gcm, err := cipher.NewGCM(block) + if err != nil { + return "", fmt.Errorf("failed to create GCM: %w", err) + } + + nonceSize := gcm.NonceSize() + if len(data) < nonceSize { + return "", fmt.Errorf("ciphertext too short") + } + + nonce, sealed := data[:nonceSize], data[nonceSize:] + plaintext, err := gcm.Open(nil, nonce, sealed, nil) + if err != nil { + return "", fmt.Errorf("decryption failed (wrong key or corrupted data): %w", err) + } + + return string(plaintext), nil +} + +// IsEncrypted returns true if the value has the "enc:" prefix. +func IsEncrypted(value string) bool { + return strings.HasPrefix(value, encryptedPrefix) +} From e2b6f7d721d26bac9c8716f57755e49194c44bd8 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 28 Feb 2026 15:41:04 +0200 Subject: [PATCH 09/13] docs: add security hardening and OramaOS deployment docs - Document WireGuard IPv6 disable, service auth, token security, process isolation - Introduce OramaOS architecture, enrollment flow, and management via Gateway API - Add troubleshooting for RQLite/Olric auth, OramaOS LUKS/enrollment issues --- docs/ARCHITECTURE.md | 54 ++- docs/CLEAN_NODE.md | 2 + docs/COMMON_PROBLEMS.md | 57 +++ docs/DEV_DEPLOY.md | 30 +- docs/ORAMAOS_DEPLOYMENT.md | 233 ++++++++++++ docs/SECURITY.md | 194 ++++++++++ pkg/cli/cmd/node/enroll.go | 26 ++ pkg/cli/cmd/node/node.go | 2 + pkg/cli/cmd/node/unlock.go | 26 ++ pkg/cli/production/enroll/command.go | 123 ++++++ pkg/cli/production/enroll/flags.go | 46 +++ pkg/cli/production/unlock/command.go | 166 +++++++++ pkg/environments/production/prebuilt.go | 2 +- pkg/gateway/gateway.go | 5 + pkg/gateway/handlers/enroll/handler.go | 435 ++++++++++++++++++++++ pkg/gateway/handlers/enroll/node_proxy.go | 272 ++++++++++++++ pkg/gateway/routes.go | 9 + 17 files changed, 1678 insertions(+), 4 deletions(-) create mode 100644 docs/ORAMAOS_DEPLOYMENT.md create mode 100644 docs/SECURITY.md create mode 100644 pkg/cli/cmd/node/enroll.go create mode 100644 pkg/cli/cmd/node/unlock.go create mode 100644 pkg/cli/production/enroll/command.go create mode 100644 pkg/cli/production/enroll/flags.go create mode 100644 pkg/cli/production/unlock/command.go create mode 100644 pkg/gateway/handlers/enroll/handler.go create mode 100644 pkg/gateway/handlers/enroll/node_proxy.go diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index cbe8e4c..afb09be 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -357,11 +357,36 @@ Function Invocation: All inter-node communication is encrypted via a WireGuard VPN mesh: -- **WireGuard IPs:** Each node gets a private IP (10.0.0.x) used for all cluster traffic +- **WireGuard IPs:** Each node gets a private IP (10.0.0.x/24) used for all cluster traffic - **UFW Firewall:** Only public ports are exposed: 22 (SSH), 53 (DNS, nameservers only), 80/443 (HTTP/HTTPS), 51820 (WireGuard UDP) +- **IPv6 disabled:** System-wide via sysctl to prevent bypass of IPv4 firewall rules - **Internal services** (RQLite 5001/7001, IPFS 4001/4501, Olric 3320/3322, Gateway 6001) are only accessible via WireGuard or localhost - **Invite tokens:** Single-use, time-limited tokens for secure node joining. No shared secrets on the CLI -- **Join flow:** New nodes authenticate via HTTPS (443), establish WireGuard tunnel, then join all services over the encrypted mesh +- **Join flow:** New nodes authenticate via HTTPS (443) with TOFU certificate pinning, establish WireGuard tunnel, then join all services over the encrypted mesh + +### Service Authentication + +- **RQLite:** HTTP basic auth on all queries/executions — credentials generated at genesis, distributed via join response +- **Olric:** Memberlist gossip encrypted with a shared 32-byte key +- **IPFS Cluster:** TrustedPeers restricted to known cluster peer IDs (not `*`) +- **Internal endpoints:** `/v1/internal/wg/peers` and `/v1/internal/wg/peer/remove` require cluster secret +- **Vault:** V1 push/pull endpoints require session token authentication when guardian is configured +- **WebSockets:** Origin header validated against the node's configured domain + +### Token & Key Security + +- **Refresh tokens:** Stored as SHA-256 hashes (never plaintext) +- **API keys:** Stored as HMAC-SHA256 hashes with a server-side secret +- **TURN secrets:** Encrypted at rest with AES-256-GCM (key derived from cluster secret) +- **Binary signing:** Build archives signed with rootwallet EVM signature, verified on install + +### Process Isolation + +- **Dedicated user:** All services run as `orama` user (not root) +- **systemd hardening:** `ProtectSystem=strict`, `NoNewPrivileges=yes`, `PrivateDevices=yes`, etc. +- **Capabilities:** Caddy and CoreDNS get `CAP_NET_BIND_SERVICE` for privileged ports + +See [SECURITY.md](SECURITY.md) for the full security hardening reference. ### TLS/HTTPS @@ -504,6 +529,31 @@ WebRTC uses a separate port allocation system from core namespace services: See [docs/WEBRTC.md](WEBRTC.md) for full details including client integration, API reference, and debugging. +## OramaOS + +For mainnet, devnet, and testnet environments, nodes run **OramaOS** — a custom minimal Linux image built with Buildroot. + +**Key properties:** +- No SSH, no shell — operators cannot access the filesystem +- LUKS full-disk encryption with Shamir key distribution across peers +- Read-only rootfs (SquashFS + dm-verity) +- A/B partition updates with cryptographic signature verification +- Service sandboxing via Linux namespaces + seccomp +- Single root process: the **orama-agent** + +**The orama-agent manages:** +- Boot sequence and LUKS key reconstruction +- WireGuard tunnel setup +- Service lifecycle in sandboxed namespaces +- Command reception from Gateway over WireGuard (port 9998) +- OS updates (download, verify, A/B swap, reboot with rollback) + +**Node enrollment:** OramaOS nodes join via `orama node enroll` instead of `orama node install`. The enrollment flow uses a registration code + invite token + wallet verification. + +See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for the full deployment guide. + +Sandbox clusters remain on Ubuntu for development convenience. + ## Future Enhancements 1. **GraphQL Support** - GraphQL gateway alongside REST diff --git a/docs/CLEAN_NODE.md b/docs/CLEAN_NODE.md index 8414394..d8b6a9f 100644 --- a/docs/CLEAN_NODE.md +++ b/docs/CLEAN_NODE.md @@ -2,6 +2,8 @@ How to completely remove all Orama Network state from a VPS so it can be reinstalled fresh. +> **OramaOS nodes:** This guide applies to Ubuntu-based nodes only. OramaOS has no SSH or shell access. To remove an OramaOS node: use `POST /v1/node/leave` via the Gateway API for graceful departure, or reflash the OramaOS image via your VPS provider's dashboard for a factory reset. See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for details. + ## Quick Clean (Copy-Paste) Run this as root or with sudo on the target VPS: diff --git a/docs/COMMON_PROBLEMS.md b/docs/COMMON_PROBLEMS.md index ae6d9ff..5d60f3e 100644 --- a/docs/COMMON_PROBLEMS.md +++ b/docs/COMMON_PROBLEMS.md @@ -150,6 +150,62 @@ ssh -n user@host 'command' --- +--- + +## 6. RQLite returns 401 Unauthorized + +**Symptom:** RQLite queries fail with HTTP 401 after security hardening. + +**Cause:** RQLite now requires basic auth. The client isn't sending credentials. + +**Fix:** Ensure the RQLite client is configured with the credentials from `/opt/orama/.orama/secrets/rqlite-auth.json`. The central RQLite client wrapper (`pkg/rqlite/client.go`) handles this automatically. If using a standalone client (e.g., CoreDNS plugin), ensure it's also configured. + +--- + +## 7. Olric cluster split after upgrade + +**Symptom:** Olric nodes can't gossip after enabling memberlist encryption. + +**Cause:** Olric memberlist encryption is all-or-nothing. Nodes with encryption can't communicate with nodes without it. + +**Fix:** All nodes must be restarted simultaneously when enabling Olric encryption. The cache will be lost (it rebuilds from DB). This is expected — Olric is a cache, not persistent storage. + +--- + +## 8. OramaOS: LUKS unlock fails + +**Symptom:** OramaOS node can't reconstruct its LUKS key after reboot. + +**Cause:** Not enough peer vault-guardians are online to meet the Shamir threshold (K = max(3, N/3)). + +**Fix:** Ensure enough cluster nodes are online and reachable over WireGuard. The agent retries with exponential backoff. For genesis nodes before 5+ peers exist, use: + +```bash +orama node unlock --genesis --node-ip +``` + +--- + +## 9. OramaOS: Enrollment timeout + +**Symptom:** `orama node enroll` hangs or times out. + +**Cause:** The OramaOS node's port 9999 isn't reachable, or the Gateway can't reach the node's WebSocket. + +**Fix:** Check that port 9999 is open in your VPS provider's external firewall (Hetzner firewall, AWS security groups, etc.). OramaOS opens it internally, but provider-level firewalls must be configured separately. + +--- + +## 10. Binary signature verification fails + +**Symptom:** `orama node install` rejects the binary archive with a signature error. + +**Cause:** The archive was tampered with, or the manifest.sig file is missing/corrupted. + +**Fix:** Rebuild the archive with `orama build` and re-sign with `make sign` (in the orama-os repo). Ensure you're using the rootwallet that matches the embedded signer address. + +--- + ## General Debugging Tips - **Always use `sudo orama node restart`** instead of raw `systemctl` commands @@ -158,3 +214,4 @@ ssh -n user@host 'command' - **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes - **Check gateway health:** `curl http://localhost:/v1/health` from the node itself - **Node IPs:** Check `scripts/remote-nodes.conf` for credentials, `wg show wg0` for WG IPs +- **OramaOS nodes:** No SSH access — use Gateway API endpoints (`/v1/node/status`, `/v1/node/logs`) for diagnostics diff --git a/docs/DEV_DEPLOY.md b/docs/DEV_DEPLOY.md index 07265a4..09bbbdc 100644 --- a/docs/DEV_DEPLOY.md +++ b/docs/DEV_DEPLOY.md @@ -320,7 +320,35 @@ is properly configured, always use the HTTPS domain URL. UFW from external access. The join request goes through Caddy on port 80 (HTTP) or 443 (HTTPS), which proxies to the gateway internally. -## Pre-Install Checklist +## OramaOS Enrollment + +For OramaOS nodes (mainnet, devnet, testnet), use the enrollment flow instead of `orama node install`: + +```bash +# 1. Flash OramaOS image to VPS (via provider dashboard) +# 2. Generate invite token on existing cluster node +orama node invite --expiry 24h + +# 3. Enroll the OramaOS node +orama node enroll --node-ip --token --gateway + +# 4. For genesis node reboots (before 5+ peers exist) +orama node unlock --genesis --node-ip +``` + +OramaOS nodes have no SSH access. All management happens through the Gateway API: + +```bash +# Status, logs, commands — all via Gateway proxy +curl "https://gateway.example.com/v1/node/status?node_id=" +curl "https://gateway.example.com/v1/node/logs?node_id=&service=gateway" +``` + +See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for the full guide. + +**Note:** `orama node clean` does not work on OramaOS nodes (no SSH). Use `orama node leave` for graceful departure, or reflash the image for a factory reset. + +## Pre-Install Checklist (Ubuntu Only) Before running `orama node install` on a VPS, ensure: diff --git a/docs/ORAMAOS_DEPLOYMENT.md b/docs/ORAMAOS_DEPLOYMENT.md new file mode 100644 index 0000000..ebdd3b3 --- /dev/null +++ b/docs/ORAMAOS_DEPLOYMENT.md @@ -0,0 +1,233 @@ +# OramaOS Deployment Guide + +OramaOS is a custom minimal Linux image built with Buildroot. It replaces the standard Ubuntu-based node deployment for mainnet, devnet, and testnet environments. Sandbox clusters remain on Ubuntu for development convenience. + +## What is OramaOS? + +OramaOS is a locked-down operating system designed specifically for Orama node operators. Key properties: + +- **No SSH, no shell** — operators cannot access the filesystem or run commands on the machine +- **LUKS full-disk encryption** — the data partition is encrypted; the key is split via Shamir's Secret Sharing across peer nodes +- **Read-only rootfs** — the OS image uses SquashFS with dm-verity integrity verification +- **A/B partition updates** — signed OS images are applied atomically with automatic rollback on failure +- **Service sandboxing** — each service runs in its own Linux namespace with seccomp syscall filtering +- **Signed binaries** — all updates are cryptographically signed with the Orama rootwallet + +## Architecture + +``` +Partition Layout: + /dev/sda1 — ESP (EFI System Partition, systemd-boot) + /dev/sda2 — rootfs-A (SquashFS, read-only, dm-verity) + /dev/sda3 — rootfs-B (standby, for A/B updates) + /dev/sda4 — data (LUKS2 encrypted, ext4) + +Boot Flow: + systemd-boot → dm-verity rootfs → orama-agent → WireGuard → services +``` + +The **orama-agent** is the only root process. It manages: +- Boot sequence and LUKS key reconstruction +- WireGuard tunnel setup +- Service lifecycle (start, stop, restart in sandboxed namespaces) +- Command reception from the Gateway over WireGuard +- OS updates (download, verify signature, A/B swap, reboot) + +## Enrollment Flow + +OramaOS nodes join the cluster through an enrollment process (different from the Ubuntu `orama node install` flow): + +### Step 1: Flash OramaOS to VPS + +Download the OramaOS image and flash it to your VPS: + +```bash +# Download image (URL provided upon acceptance) +wget https://releases.orama.network/oramaos-v1.0.0-amd64.qcow2 + +# Flash to VPS (provider-specific — Hetzner, Vultr, etc.) +# Most providers support uploading custom images via their dashboard +``` + +### Step 2: First Boot — Enrollment Mode + +On first boot, the agent: +1. Generates a random 8-character registration code +2. Starts a temporary HTTP server on port 9999 +3. Opens an outbound WebSocket to the Gateway +4. Waits for enrollment to complete + +The registration code is displayed on the VPS console (if available) and served at `http://:9999/`. + +### Step 3: Run Enrollment from CLI + +On your local machine (where you have the `orama` CLI and rootwallet): + +```bash +# Generate an invite token on any existing cluster node +orama node invite --expiry 24h + +# Enroll the OramaOS node +orama node enroll --node-ip --token --gateway +``` + +The enrollment command: +1. Fetches the registration code from the node (port 9999) +2. Sends the code + invite token to the Gateway +3. Gateway validates everything, assigns a WireGuard IP, and pushes config to the node +4. Node configures WireGuard, formats the LUKS-encrypted data partition +5. LUKS key is split via Shamir and distributed to peer vault-guardians +6. Services start in sandboxed namespaces +7. Port 9999 closes permanently + +### Step 4: Verify + +```bash +# Check the node is online and healthy +orama monitor report --env +``` + +## Genesis Node + +The first OramaOS node in a cluster is the **genesis node**. It has a special boot path because there are no peers yet for Shamir key distribution: + +1. Genesis generates a LUKS key and encrypts the data partition +2. The LUKS key is encrypted with a rootwallet-derived key and stored on the unencrypted rootfs +3. On reboot (before enough peers exist), the operator must manually unlock: + +```bash +orama node unlock --genesis --node-ip +``` + +This command: +1. Fetches the encrypted genesis key from the node +2. Decrypts it using the rootwallet (`rw decrypt`) +3. Sends the decrypted LUKS key to the agent over WireGuard + +Once 5+ peers have joined, the genesis node distributes Shamir shares to peers, deletes the local encrypted key, and transitions to normal Shamir-based unlock. After this transition, `orama node unlock` is no longer needed. + +## Normal Reboot (Shamir Unlock) + +When an enrolled OramaOS node reboots: + +1. Agent starts, brings up WireGuard +2. Contacts peer vault-guardians over WireGuard +3. Fetches K Shamir shares (K = threshold, typically `max(3, N/3)`) +4. Reconstructs LUKS key via Lagrange interpolation over GF(256) +5. Decrypts and mounts data partition +6. Starts all services +7. Zeros key from memory + +If not enough peers are available, the agent enters a degraded "waiting for peers" state and retries with exponential backoff (1s, 2s, 4s, 8s, 16s, max 5 retries per cycle). + +## Node Management + +Since OramaOS has no SSH, all management happens through the Gateway API: + +```bash +# Check node status +curl "https://gateway.example.com/v1/node/status?node_id=" + +# Send a command (e.g., restart a service) +curl -X POST "https://gateway.example.com/v1/node/command?node_id=" \ + -H "Content-Type: application/json" \ + -d '{"action":"restart","service":"rqlite"}' + +# View logs +curl "https://gateway.example.com/v1/node/logs?node_id=&service=gateway&lines=100" + +# Graceful node departure +curl -X POST "https://gateway.example.com/v1/node/leave" \ + -H "Content-Type: application/json" \ + -d '{"node_id":""}' +``` + +The Gateway proxies these requests to the agent over WireGuard (port 9998). The agent is never directly accessible from the public internet. + +## OS Updates + +OramaOS uses an A/B partition scheme for atomic, rollback-safe updates: + +1. Agent periodically checks for new versions +2. Downloads the signed image (P2P over WireGuard between nodes) +3. Verifies the rootwallet EVM signature against the embedded public key +4. Writes to the standby partition (if running from A, writes to B) +5. Sets systemd-boot to boot from B with `tries_left=3` +6. Reboots +7. If B boots successfully (agent starts, WG connects, services healthy): marks B as "good" +8. If B fails 3 times: systemd-boot automatically falls back to A + +No operator intervention is needed for updates. Failed updates are automatically rolled back. + +## Service Sandboxing + +Each service on OramaOS runs in an isolated environment: + +- **Mount namespace** — each service only sees its own data directory as writable; everything else is read-only +- **UTS namespace** — isolated hostname +- **Dedicated UID/GID** — each service runs as a different user (not root) +- **Seccomp filtering** — per-service syscall allowlist (initially in audit mode, then enforce mode) + +Services and their sandbox profiles: +| Service | Writable Path | Extra Syscalls | +|---------|--------------|----------------| +| RQLite | `/opt/orama/.orama/data/rqlite` | fsync, fdatasync (Raft + SQLite WAL) | +| Olric | `/opt/orama/.orama/data/olric` | sendmmsg, recvmmsg (gossip) | +| IPFS | `/opt/orama/.orama/data/ipfs` | sendfile, splice (data transfer) | +| Gateway | `/opt/orama/.orama/data/gateway` | sendfile, splice (HTTP) | +| CoreDNS | `/opt/orama/.orama/data/coredns` | sendmmsg, recvmmsg (DNS) | + +## OramaOS vs Ubuntu Deployment + +| Feature | Ubuntu | OramaOS | +|---------|--------|---------| +| SSH access | Yes | No | +| Shell access | Yes | No | +| Disk encryption | No | LUKS2 (Shamir) | +| OS updates | Manual (`orama node upgrade`) | Automatic (signed, A/B) | +| Service isolation | systemd only | Namespaces + seccomp | +| Rootfs integrity | None | dm-verity | +| Binary signing | Optional | Required | +| Operator data access | Full | None | +| Environments | All (including sandbox) | Mainnet, devnet, testnet | + +## Cleaning / Factory Reset + +OramaOS nodes cannot be cleaned with the standard `orama node clean` command (no SSH access). Instead: + +- **Graceful departure:** `orama node leave` via the Gateway API — stops services, redistributes Shamir shares, removes WG peer +- **Factory reset:** Reflash the OramaOS image on the VPS via the hosting provider's dashboard +- **Data is unrecoverable:** Since the LUKS key is distributed across peers, reflashing destroys all data permanently + +## Troubleshooting + +### Node stuck in enrollment mode +The node boots but enrollment never completes. + +**Check:** Can you reach `http://:9999/` from your machine? If not, the VPS firewall may be blocking port 9999. + +**Fix:** Ensure port 9999 is open in the VPS provider's firewall. OramaOS opens it automatically via its internal firewall, but external provider firewalls (Hetzner, AWS security groups) must be configured separately. + +### LUKS unlock fails (not enough peers) +After reboot, the node can't reconstruct its LUKS key. + +**Check:** How many peer nodes are online? The node needs at least K peers (threshold) to be reachable over WireGuard. + +**Fix:** Ensure enough cluster nodes are online. If this is the genesis node and fewer than 5 peers exist, use: +```bash +orama node unlock --genesis --node-ip +``` + +### Update failed, node rolled back +The node applied an update but reverted to the previous version. + +**Check:** The agent logs will show why the new partition failed to boot (accessible via `GET /v1/node/logs?service=agent`). + +**Common causes:** Corrupted download (signature verification should catch this), hardware issue, or incompatible configuration. + +### Services not starting after reboot +The node rebooted and LUKS unlocked, but services are unhealthy. + +**Check:** `GET /v1/node/status` — which services are down? + +**Fix:** Try restarting the specific service via `POST /v1/node/command` with `{"action":"restart","service":""}`. If the issue persists, check service logs. diff --git a/docs/SECURITY.md b/docs/SECURITY.md new file mode 100644 index 0000000..7eabc85 --- /dev/null +++ b/docs/SECURITY.md @@ -0,0 +1,194 @@ +# Security Hardening + +This document describes all security measures applied to the Orama Network, covering both Phase 1 (service hardening on existing Ubuntu nodes) and Phase 2 (OramaOS locked-down image). + +## Phase 1: Service Hardening + +These measures apply to all nodes (Ubuntu and OramaOS). + +### Network Isolation + +**CIDR Validation (Step 1.1)** +- WireGuard subnet restricted to `10.0.0.0/24` across all components: firewall rules, rate limiter, auth module, and WireGuard PostUp/PostDown iptables rules +- Prevents other tenants on shared VPS providers from bypassing the firewall via overlapping `10.x.x.x` ranges + +**IPv6 Disabled (Step 1.2)** +- IPv6 disabled system-wide via sysctl: `net.ipv6.conf.all.disable_ipv6=1` +- Prevents services bound to `0.0.0.0` from being reachable via IPv6 (which had no firewall rules) + +### Authentication + +**Internal Endpoint Auth (Step 1.3)** +- `/v1/internal/wg/peers` and `/v1/internal/wg/peer/remove` now require cluster secret validation +- Peer removal additionally validates the request originates from a WireGuard subnet IP + +**RQLite Authentication (Step 1.7)** +- RQLite runs with `-auth` flag pointing to a credentials file +- All RQLite HTTP requests include `Authorization: Basic ` headers +- Credentials generated at cluster genesis, distributed to joining nodes via join response +- Both the central RQLite client wrapper and the standalone CoreDNS RQLite client send auth + +**Olric Gossip Encryption (Step 1.8)** +- Olric memberlist uses a 32-byte encryption key for all gossip traffic +- Key generated at genesis, distributed via join response +- Prevents rogue nodes from joining the gossip ring and poisoning caches +- Note: encryption is all-or-nothing (coordinated restart required when enabling) + +**IPFS Cluster TrustedPeers (Step 1.9)** +- IPFS Cluster `TrustedPeers` populated with actual cluster peer IDs (was `["*"]`) +- New peers added to TrustedPeers on all existing nodes during join +- Prevents unauthorized peers from controlling IPFS pinning + +**Vault V1 Auth Enforcement (Step 1.14)** +- V1 push/pull endpoints require a valid session token when vault-guardian is configured +- Previously, auth was optional for backward compatibility — any WG peer could read/overwrite Shamir shares + +### Token & Key Storage + +**Refresh Token Hashing (Step 1.5)** +- Refresh tokens stored as SHA-256 hashes in RQLite (never plaintext) +- On lookup: hash the incoming token, query by hash +- On revocation: hash before revoking (both single-token and by-subject) +- Existing tokens invalidated on upgrade (users re-authenticate) + +**API Key Hashing (Step 1.6)** +- API keys stored as HMAC-SHA256 hashes using a server-side secret +- HMAC secret generated at cluster genesis, stored in `~/.orama/secrets/api-key-hmac-secret` +- On lookup: compute HMAC, query by hash — fast enough for every request (unlike bcrypt) +- In-memory cache uses raw key as cache key (never persisted) +- During rolling upgrade: dual lookup (HMAC first, then raw as fallback) until all nodes upgraded + +**TURN Secret Encryption (Step 1.15)** +- TURN shared secrets encrypted at rest in RQLite using AES-256-GCM +- Encryption key derived via HKDF from the cluster secret with purpose string `"turn-encryption"` + +### TLS & Transport + +**InsecureSkipVerify Fix (Step 1.10)** +- During node join, TLS verification uses TOFU (Trust On First Use) +- Invite token output includes the CA certificate fingerprint (SHA-256) +- Joining node verifies the server cert fingerprint matches before proceeding +- After join: CA cert stored locally for future connections + +**WebSocket Origin Validation (Step 1.4)** +- All WebSocket upgraders validate the `Origin` header against the node's configured domain +- Non-browser clients (no Origin header) are still allowed +- Prevents cross-site WebSocket hijacking attacks + +### Process Isolation + +**Dedicated User (Step 1.11)** +- All services run as the `orama` user (not root) +- Caddy and CoreDNS get `AmbientCapabilities=CAP_NET_BIND_SERVICE` for ports 80/443 and 53 +- WireGuard stays as root (kernel netlink requires it) +- vault-guardian already had proper hardening + +**systemd Hardening (Step 1.12)** +- All service units include: + ```ini + ProtectSystem=strict + ProtectHome=yes + NoNewPrivileges=yes + PrivateDevices=yes + ProtectKernelTunables=yes + ProtectKernelModules=yes + RestrictNamespaces=yes + ReadWritePaths=/opt/orama/.orama + ``` +- Applied to both template files (`pkg/environments/templates/`) and hardcoded unit generators (`pkg/environments/production/services.go`) + +### Supply Chain + +**Binary Signing (Step 1.13)** +- Build archives include `manifest.sig` — a rootwallet EVM signature of the manifest hash +- During install, the signature is verified against the embedded Orama public key +- Unsigned or tampered archives are rejected + +## Phase 2: OramaOS + +These measures apply only to OramaOS nodes (mainnet, devnet, testnet). + +### Immutable OS + +- **Read-only rootfs** — SquashFS with dm-verity integrity verification +- **No shell** — `/bin/sh` symlinked to `/bin/false`, no bash/ash/ssh +- **No SSH** — OpenSSH not included in the image +- **Minimal packages** — only what's needed for systemd, cryptsetup, and the agent + +### Full-Disk Encryption + +- **LUKS2** with AES-XTS-Plain64 on the data partition +- **Shamir's Secret Sharing** over GF(256) — LUKS key split across peer vault-guardians +- **Adaptive threshold** — K = max(3, N/3) where N is the number of peers +- **Key zeroing** — LUKS key wiped from memory immediately after use +- **Malicious share detection** — fetch K+1 shares when possible, verify consistency + +### Service Sandboxing + +Each service runs in isolated Linux namespaces: +- **CLONE_NEWNS** — mount namespace (filesystem isolation) +- **CLONE_NEWUTS** — hostname namespace +- **Dedicated UID/GID** — each service has its own user +- **Seccomp filtering** — per-service syscall allowlist + +Note: CLONE_NEWPID is intentionally omitted — it makes services PID 1 in their namespace, which changes signal semantics (SIGTERM ignored by default for PID 1). + +### Signed Updates + +- A/B partition scheme with systemd-boot and boot counting (`tries_left=3`) +- All updates signed with rootwallet EVM signature (secp256k1 + keccak256) +- Signer address: `0xb5d8a496c8b2412990d7D467E17727fdF5954afC` +- P2P distribution over WireGuard between nodes +- Automatic rollback on 3 consecutive boot failures + +### Zero Operator Access + +- Operators cannot read data on the machine (LUKS encrypted, no shell) +- Management only through Gateway API → agent over WireGuard +- All commands are logged and auditable +- No root access, no console access, no file system access + +## Rollout Strategy + +### Phase 1 Batches + +``` +Batch 1 (zero-risk, no restart): + - CIDR fix + - IPv6 disable + - Internal endpoint auth + - WebSocket origin check + +Batch 2 (medium-risk, restart needed): + - Hash refresh tokens + - Hash API keys + - Binary signing + - Vault V1 auth enforcement + - TURN secret encryption + +Batch 3 (high-risk, coordinated rollout): + - RQLite auth (followers first, leader last) + - Olric encryption (simultaneous restart) + - IPFS Cluster TrustedPeers + +Batch 4 (infrastructure changes): + - InsecureSkipVerify fix + - Dedicated user + - systemd hardening +``` + +### Phase 2 + +1. Build and test OramaOS image in QEMU +2. Deploy to sandbox cluster alongside Ubuntu nodes +3. Verify interop and stability +4. Gradual migration: testnet → devnet → mainnet (one node at a time, maintaining Raft quorum) + +## Verification + +All changes verified on sandbox cluster before production deployment: + +- `make test` — all unit tests pass +- `orama monitor report --env sandbox` — full cluster health +- Manual endpoint testing (e.g., curl without auth → 401) +- Security-specific checks (IPv6 listeners, RQLite auth, binary signatures) diff --git a/pkg/cli/cmd/node/enroll.go b/pkg/cli/cmd/node/enroll.go new file mode 100644 index 0000000..ea99230 --- /dev/null +++ b/pkg/cli/cmd/node/enroll.go @@ -0,0 +1,26 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/enroll" + "github.com/spf13/cobra" +) + +var enrollCmd = &cobra.Command{ + Use: "enroll", + Short: "Enroll an OramaOS node into the cluster", + Long: `Enroll a freshly booted OramaOS node into the cluster. + +The OramaOS node displays a registration code on port 9999. Provide this code +along with an invite token to complete enrollment. The Gateway pushes cluster +configuration (WireGuard, secrets, peer list) to the node. + +Usage: + orama node enroll --node-ip --code --token --env + +The node must be reachable over the public internet on port 9999 (enrollment only). +After enrollment, port 9999 is permanently closed and all communication goes over WireGuard.`, + Run: func(cmd *cobra.Command, args []string) { + enroll.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/node.go b/pkg/cli/cmd/node/node.go index 5520571..74f9744 100644 --- a/pkg/cli/cmd/node/node.go +++ b/pkg/cli/cmd/node/node.go @@ -30,4 +30,6 @@ func init() { Cmd.AddCommand(rolloutCmd) Cmd.AddCommand(cleanCmd) Cmd.AddCommand(recoverRaftCmd) + Cmd.AddCommand(enrollCmd) + Cmd.AddCommand(unlockCmd) } diff --git a/pkg/cli/cmd/node/unlock.go b/pkg/cli/cmd/node/unlock.go new file mode 100644 index 0000000..522a8a8 --- /dev/null +++ b/pkg/cli/cmd/node/unlock.go @@ -0,0 +1,26 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/unlock" + "github.com/spf13/cobra" +) + +var unlockCmd = &cobra.Command{ + Use: "unlock", + Short: "Unlock an OramaOS genesis node", + Long: `Manually unlock a genesis OramaOS node that cannot reconstruct its LUKS key +via Shamir shares (not enough peers online). + +This is only needed for the genesis node before enough peers have joined for +Shamir-based unlock. Once 5+ peers exist, the genesis node transitions to +normal Shamir unlock and this command is no longer needed. + +Usage: + orama node unlock --genesis --node-ip + +The node must be reachable over WireGuard on port 9998.`, + Run: func(cmd *cobra.Command, args []string) { + unlock.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/production/enroll/command.go b/pkg/cli/production/enroll/command.go new file mode 100644 index 0000000..438ea71 --- /dev/null +++ b/pkg/cli/production/enroll/command.go @@ -0,0 +1,123 @@ +// Package enroll implements the OramaOS node enrollment command. +// +// Flow: +// 1. Operator fetches registration code from the OramaOS node (port 9999) +// 2. Operator provides code + invite token to Gateway +// 3. Gateway validates, generates cluster config, pushes to node +// 4. Node configures WireGuard, encrypts data partition, starts services +package enroll + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "time" +) + +// Handle processes the enroll command. +func Handle(args []string) { + flags, err := ParseFlags(args) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + // Step 1: Fetch registration code from the OramaOS node + fmt.Printf("Fetching registration code from %s:9999...\n", flags.NodeIP) + + var code string + if flags.Code != "" { + // Code provided directly — skip fetch + code = flags.Code + } else { + fetchedCode, err := fetchRegistrationCode(flags.NodeIP) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: could not reach OramaOS node: %v\n", err) + fmt.Fprintf(os.Stderr, "Make sure the node is booted and port 9999 is reachable.\n") + os.Exit(1) + } + code = fetchedCode + } + + fmt.Printf("Registration code: %s\n", code) + + // Step 2: Send enrollment request to the Gateway + fmt.Printf("Sending enrollment to Gateway at %s...\n", flags.GatewayURL) + + if err := enrollWithGateway(flags.GatewayURL, flags.Token, code, flags.NodeIP); err != nil { + fmt.Fprintf(os.Stderr, "Error: enrollment failed: %v\n", err) + os.Exit(1) + } + + fmt.Printf("Node %s enrolled successfully.\n", flags.NodeIP) + fmt.Printf("The node is now configuring WireGuard and encrypting its data partition.\n") + fmt.Printf("This may take a few minutes. Check status with: orama node status --env %s\n", flags.Env) +} + +// fetchRegistrationCode retrieves the one-time registration code from the OramaOS node. +func fetchRegistrationCode(nodeIP string) (string, error) { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9999/", nodeIP)) + if err != nil { + return "", fmt.Errorf("GET failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusGone { + return "", fmt.Errorf("registration code already served (node may be partially enrolled)") + } + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("unexpected status %d", resp.StatusCode) + } + + var result struct { + Code string `json:"code"` + Expires string `json:"expires"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("invalid response: %w", err) + } + + return result.Code, nil +} + +// enrollWithGateway sends the enrollment request to the Gateway, which validates +// the code and token, then pushes cluster configuration to the OramaOS node. +func enrollWithGateway(gatewayURL, token, code, nodeIP string) error { + body, _ := json.Marshal(map[string]string{ + "code": code, + "token": token, + "node_ip": nodeIP, + }) + + req, err := http.NewRequest("POST", gatewayURL+"/v1/node/enroll", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+token) + + client := &http.Client{Timeout: 60 * time.Second} + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusUnauthorized { + return fmt.Errorf("invalid or expired invite token") + } + if resp.StatusCode == http.StatusBadRequest { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("bad request: %s", string(respBody)) + } + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("gateway returned %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} diff --git a/pkg/cli/production/enroll/flags.go b/pkg/cli/production/enroll/flags.go new file mode 100644 index 0000000..2277d6b --- /dev/null +++ b/pkg/cli/production/enroll/flags.go @@ -0,0 +1,46 @@ +package enroll + +import ( + "flag" + "fmt" + "os" +) + +// Flags holds the parsed command-line flags for the enroll command. +type Flags struct { + NodeIP string // Public IP of the OramaOS node + Code string // Registration code (optional — fetched automatically if not provided) + Token string // Invite token for cluster joining + GatewayURL string // Gateway HTTPS URL + Env string // Environment name (for display only) +} + +// ParseFlags parses the enroll command flags. +func ParseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("enroll", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + + fs.StringVar(&flags.NodeIP, "node-ip", "", "Public IP of the OramaOS node (required)") + fs.StringVar(&flags.Code, "code", "", "Registration code from the node (auto-fetched if not provided)") + fs.StringVar(&flags.Token, "token", "", "Invite token for cluster joining (required)") + fs.StringVar(&flags.GatewayURL, "gateway", "", "Gateway URL (required, e.g. https://gateway.example.com)") + fs.StringVar(&flags.Env, "env", "production", "Environment name") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.NodeIP == "" { + return nil, fmt.Errorf("--node-ip is required") + } + if flags.Token == "" { + return nil, fmt.Errorf("--token is required") + } + if flags.GatewayURL == "" { + return nil, fmt.Errorf("--gateway is required") + } + + return flags, nil +} diff --git a/pkg/cli/production/unlock/command.go b/pkg/cli/production/unlock/command.go new file mode 100644 index 0000000..b6111eb --- /dev/null +++ b/pkg/cli/production/unlock/command.go @@ -0,0 +1,166 @@ +// Package unlock implements the genesis node unlock command. +// +// When the genesis OramaOS node reboots before enough peers exist for +// Shamir-based LUKS key reconstruction, the operator must manually provide +// the LUKS key. This command reads the encrypted genesis key from the +// node's rootfs, decrypts it with the rootwallet, and sends it to the agent. +package unlock + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "flag" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + "time" +) + +// Flags holds parsed command-line flags. +type Flags struct { + NodeIP string // WireGuard IP of the OramaOS node + Genesis bool // Must be set to confirm genesis unlock + KeyFile string // Path to the encrypted genesis key file (optional override) +} + +// Handle processes the unlock command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if !flags.Genesis { + fmt.Fprintf(os.Stderr, "Error: --genesis flag is required to confirm genesis unlock\n") + os.Exit(1) + } + + // Step 1: Read the encrypted genesis key from the node + fmt.Printf("Fetching encrypted genesis key from %s...\n", flags.NodeIP) + encKey, err := fetchGenesisKey(flags.NodeIP) + if err != nil && flags.KeyFile == "" { + fmt.Fprintf(os.Stderr, "Error: could not fetch genesis key from node: %v\n", err) + fmt.Fprintf(os.Stderr, "You can provide the key file directly with --key-file\n") + os.Exit(1) + } + + if flags.KeyFile != "" { + data, readErr := os.ReadFile(flags.KeyFile) + if readErr != nil { + fmt.Fprintf(os.Stderr, "Error: could not read key file: %v\n", readErr) + os.Exit(1) + } + encKey = strings.TrimSpace(string(data)) + } + + // Step 2: Decrypt with rootwallet + fmt.Println("Decrypting genesis key with rootwallet...") + luksKey, err := decryptGenesisKey(encKey) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: decryption failed: %v\n", err) + os.Exit(1) + } + + // Step 3: Send LUKS key to the agent over WireGuard + fmt.Printf("Sending LUKS key to agent at %s:9998...\n", flags.NodeIP) + if err := sendUnlockKey(flags.NodeIP, luksKey); err != nil { + fmt.Fprintf(os.Stderr, "Error: unlock failed: %v\n", err) + os.Exit(1) + } + + fmt.Println("Genesis node unlocked successfully.") + fmt.Println("The node is decrypting and mounting its data partition.") +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("unlock", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.NodeIP, "node-ip", "", "WireGuard IP of the OramaOS node (required)") + fs.BoolVar(&flags.Genesis, "genesis", false, "Confirm genesis node unlock") + fs.StringVar(&flags.KeyFile, "key-file", "", "Path to encrypted genesis key file (optional)") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.NodeIP == "" { + return nil, fmt.Errorf("--node-ip is required") + } + + return flags, nil +} + +// fetchGenesisKey retrieves the encrypted genesis key from the node. +// The agent serves it at GET /v1/agent/genesis-key (only during genesis unlock mode). +func fetchGenesisKey(nodeIP string) (string, error) { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9998/v1/agent/genesis-key", nodeIP)) + if err != nil { + return "", fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return "", fmt.Errorf("status %d: %s", resp.StatusCode, string(body)) + } + + var result struct { + EncryptedKey string `json:"encrypted_key"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("invalid response: %w", err) + } + + return result.EncryptedKey, nil +} + +// decryptGenesisKey decrypts the AES-256-GCM encrypted LUKS key using rootwallet. +// The key was encrypted with: AES-256-GCM(luksKey, HKDF(rootwalletKey, "genesis-luks")) +// For now, we use `rw decrypt` if available, or a local HKDF+AES-GCM implementation. +func decryptGenesisKey(encryptedKey string) ([]byte, error) { + // Try rw decrypt first + cmd := exec.Command("rw", "decrypt", encryptedKey, "--purpose", "genesis-luks", "--chain", "evm") + output, err := cmd.Output() + if err == nil { + decoded, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(string(output))) + if decErr != nil { + return nil, fmt.Errorf("failed to decode decrypted key: %w", decErr) + } + return decoded, nil + } + + return nil, fmt.Errorf("rw decrypt failed: %w (is rootwallet installed and initialized?)", err) +} + +// sendUnlockKey sends the decrypted LUKS key to the agent's unlock endpoint. +func sendUnlockKey(nodeIP string, luksKey []byte) error { + body, _ := json.Marshal(map[string]string{ + "key": base64.StdEncoding.EncodeToString(luksKey), + }) + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Post( + fmt.Sprintf("http://%s:9998/v1/agent/unlock", nodeIP), + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go index ac424b3..6bbcba2 100644 --- a/pkg/environments/production/prebuilt.go +++ b/pkg/environments/production/prebuilt.go @@ -48,7 +48,7 @@ func LoadPreBuiltManifest() (*PreBuiltManifest, error) { // OramaSignerAddress is the Ethereum address authorized to sign build archives. // Archives signed by any other address are rejected during install. // This is the DeBros deploy wallet — update if the signing key rotates. -const OramaSignerAddress = "0x0000000000000000000000000000000000000000" // TODO: set real address +const OramaSignerAddress = "0xb5d8a496c8b2412990d7D467E17727fdF5954afC" // VerifyArchiveSignature verifies that the pre-built archive was signed by the // authorized Orama signer. Returns nil if the signature is valid, or if no diff --git a/pkg/gateway/gateway.go b/pkg/gateway/gateway.go index 0cecebc..389ab00 100644 --- a/pkg/gateway/gateway.go +++ b/pkg/gateway/gateway.go @@ -29,6 +29,7 @@ import ( deploymentshandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/deployments" pubsubhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/pubsub" serverlesshandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/serverless" + enrollhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/enroll" joinhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/join" webrtchandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/webrtc" vaulthandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/vault" @@ -133,6 +134,9 @@ type Gateway struct { // Node join handler joinHandler *joinhandlers.Handler + // OramaOS node enrollment handler + enrollHandler *enrollhandlers.Handler + // Cluster provisioning for namespace clusters clusterProvisioner authhandlers.ClusterProvisioner @@ -399,6 +403,7 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { if deps.ORMClient != nil { gw.wireguardHandler = wireguardhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.ClusterSecret) gw.joinHandler = joinhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.DataDir) + gw.enrollHandler = enrollhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.DataDir) gw.vaultHandlers = vaulthandlers.NewHandlers(logger, deps.Client) } diff --git a/pkg/gateway/handlers/enroll/handler.go b/pkg/gateway/handlers/enroll/handler.go new file mode 100644 index 0000000..1d4c2ff --- /dev/null +++ b/pkg/gateway/handlers/enroll/handler.go @@ -0,0 +1,435 @@ +// Package enroll implements the OramaOS node enrollment endpoint. +// +// Flow: +// 1. Operator's CLI sends POST /v1/node/enroll with code + token + node_ip +// 2. Gateway validates invite token (single-use) +// 3. Gateway assigns WG IP, registers peer, reads secrets +// 4. Gateway pushes cluster config to OramaOS node at node_ip:9999 +// 5. OramaOS node configures WG, encrypts data partition, starts services +package enroll + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/rqlite" + "go.uber.org/zap" +) + +// EnrollRequest is the request from the CLI. +type EnrollRequest struct { + Code string `json:"code"` + Token string `json:"token"` + NodeIP string `json:"node_ip"` +} + +// EnrollResponse is the configuration pushed to the OramaOS node. +type EnrollResponse struct { + NodeID string `json:"node_id"` + WireGuardConfig string `json:"wireguard_config"` + ClusterSecret string `json:"cluster_secret"` + Peers []PeerInfo `json:"peers"` +} + +// PeerInfo describes a cluster peer for LUKS key distribution. +type PeerInfo struct { + WGIP string `json:"wg_ip"` + NodeID string `json:"node_id"` +} + +// Handler handles OramaOS node enrollment. +type Handler struct { + logger *zap.Logger + rqliteClient rqlite.Client + oramaDir string +} + +// NewHandler creates a new enrollment handler. +func NewHandler(logger *zap.Logger, rqliteClient rqlite.Client, oramaDir string) *Handler { + return &Handler{ + logger: logger, + rqliteClient: rqliteClient, + oramaDir: oramaDir, + } +} + +// HandleEnroll handles POST /v1/node/enroll. +func (h *Handler) HandleEnroll(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + var req EnrollRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + if req.Code == "" || req.Token == "" || req.NodeIP == "" { + http.Error(w, "code, token, and node_ip are required", http.StatusBadRequest) + return + } + + ctx := r.Context() + + // 1. Validate invite token (single-use, same as join handler) + if err := h.consumeToken(ctx, req.Token, req.NodeIP); err != nil { + h.logger.Warn("enroll token validation failed", zap.Error(err)) + http.Error(w, "unauthorized: invalid or expired token", http.StatusUnauthorized) + return + } + + // 2. Verify registration code against the OramaOS node + if err := h.verifyCode(req.NodeIP, req.Code); err != nil { + h.logger.Warn("registration code verification failed", zap.Error(err)) + http.Error(w, "code verification failed: "+err.Error(), http.StatusBadRequest) + return + } + + // 3. Generate WG keypair for the OramaOS node + wgPrivKey, wgPubKey, err := generateWGKeypair() + if err != nil { + h.logger.Error("failed to generate WG keypair", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 4. Assign WG IP + wgIP, err := h.assignWGIP(ctx) + if err != nil { + h.logger.Error("failed to assign WG IP", zap.Error(err)) + http.Error(w, "failed to assign WG IP", http.StatusInternalServerError) + return + } + + nodeID := fmt.Sprintf("orama-node-%s", strings.ReplaceAll(wgIP, ".", "-")) + + // 5. Register WG peer in database + if _, err := h.rqliteClient.Exec(ctx, + "INSERT OR REPLACE INTO wireguard_peers (node_id, wg_ip, public_key, public_ip, wg_port) VALUES (?, ?, ?, ?, ?)", + nodeID, wgIP, wgPubKey, req.NodeIP, 51820); err != nil { + h.logger.Error("failed to register WG peer", zap.Error(err)) + http.Error(w, "failed to register peer", http.StatusInternalServerError) + return + } + + // 6. Add peer to local WireGuard interface + if err := h.addWGPeerLocally(wgPubKey, req.NodeIP, wgIP); err != nil { + h.logger.Warn("failed to add WG peer to local interface", zap.Error(err)) + } + + // 7. Read secrets + clusterSecret, err := os.ReadFile(h.oramaDir + "/secrets/cluster-secret") + if err != nil { + h.logger.Error("failed to read cluster secret", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 8. Build WireGuard config for the OramaOS node + wgConfig, err := h.buildWGConfig(ctx, wgPrivKey, wgIP) + if err != nil { + h.logger.Error("failed to build WG config", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 9. Get all peer WG IPs for LUKS key distribution + peers, err := h.getPeerList(ctx, wgIP) + if err != nil { + h.logger.Error("failed to get peer list", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 10. Push config to OramaOS node + enrollResp := EnrollResponse{ + NodeID: nodeID, + WireGuardConfig: wgConfig, + ClusterSecret: strings.TrimSpace(string(clusterSecret)), + Peers: peers, + } + + if err := h.pushConfigToNode(req.NodeIP, &enrollResp); err != nil { + h.logger.Error("failed to push config to node", zap.Error(err)) + http.Error(w, "failed to configure node: "+err.Error(), http.StatusInternalServerError) + return + } + + h.logger.Info("OramaOS node enrolled", + zap.String("node_id", nodeID), + zap.String("wg_ip", wgIP), + zap.String("public_ip", req.NodeIP)) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{ + "status": "enrolled", + "node_id": nodeID, + "wg_ip": wgIP, + }) +} + +// consumeToken validates and marks an invite token as used. +func (h *Handler) consumeToken(ctx context.Context, token, usedByIP string) error { + result, err := h.rqliteClient.Exec(ctx, + "UPDATE invite_tokens SET used_at = datetime('now'), used_by_ip = ? WHERE token = ? AND used_at IS NULL AND expires_at > datetime('now')", + usedByIP, token) + if err != nil { + return fmt.Errorf("database error: %w", err) + } + + rowsAffected, err := result.RowsAffected() + if err != nil { + return fmt.Errorf("failed to check result: %w", err) + } + + if rowsAffected == 0 { + return fmt.Errorf("token invalid, expired, or already used") + } + + return nil +} + +// verifyCode checks that the OramaOS node has the expected registration code. +func (h *Handler) verifyCode(nodeIP, expectedCode string) error { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9999/", nodeIP)) + if err != nil { + return fmt.Errorf("cannot reach node at %s:9999: %w", nodeIP, err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusGone { + return fmt.Errorf("node already served its registration code") + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("node returned status %d", resp.StatusCode) + } + + var result struct { + Code string `json:"code"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return fmt.Errorf("invalid response from node: %w", err) + } + + if result.Code != expectedCode { + return fmt.Errorf("registration code mismatch") + } + + return nil +} + +// pushConfigToNode sends cluster configuration to the OramaOS node. +func (h *Handler) pushConfigToNode(nodeIP string, config *EnrollResponse) error { + body, err := json.Marshal(config) + if err != nil { + return err + } + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Post( + fmt.Sprintf("http://%s:9999/v1/agent/enroll/complete", nodeIP), + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return fmt.Errorf("failed to push config: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("node returned status %d", resp.StatusCode) + } + + return nil +} + +// generateWGKeypair generates a WireGuard private/public keypair. +func generateWGKeypair() (privKey, pubKey string, err error) { + privOut, err := exec.Command("wg", "genkey").Output() + if err != nil { + return "", "", fmt.Errorf("wg genkey failed: %w", err) + } + privKey = strings.TrimSpace(string(privOut)) + + cmd := exec.Command("wg", "pubkey") + cmd.Stdin = strings.NewReader(privKey) + pubOut, err := cmd.Output() + if err != nil { + return "", "", fmt.Errorf("wg pubkey failed: %w", err) + } + pubKey = strings.TrimSpace(string(pubOut)) + + return privKey, pubKey, nil +} + +// assignWGIP finds the next available WG IP. +func (h *Handler) assignWGIP(ctx context.Context) (string, error) { + var rows []struct { + WGIP string `db:"wg_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, "SELECT wg_ip FROM wireguard_peers"); err != nil { + return "", fmt.Errorf("failed to query WG IPs: %w", err) + } + + if len(rows) == 0 { + return "10.0.0.2", nil + } + + maxD := 0 + maxC := 0 + for _, row := range rows { + var a, b, c, d int + if _, err := fmt.Sscanf(row.WGIP, "%d.%d.%d.%d", &a, &b, &c, &d); err != nil { + continue + } + if c > maxC || (c == maxC && d > maxD) { + maxC, maxD = c, d + } + } + + maxD++ + if maxD > 254 { + maxC++ + maxD = 1 + } + + return fmt.Sprintf("10.0.%d.%d", maxC, maxD), nil +} + +// addWGPeerLocally adds a peer to the local wg0 interface. +func (h *Handler) addWGPeerLocally(pubKey, publicIP, wgIP string) error { + cmd := exec.Command("wg", "set", "wg0", + "peer", pubKey, + "endpoint", fmt.Sprintf("%s:51820", publicIP), + "allowed-ips", fmt.Sprintf("%s/32", wgIP), + "persistent-keepalive", "25") + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("wg set failed: %w\n%s", err, string(output)) + } + return nil +} + +// buildWGConfig generates a wg0.conf for the OramaOS node. +func (h *Handler) buildWGConfig(ctx context.Context, privKey, nodeWGIP string) (string, error) { + // Get this node's public key and WG IP + myPubKey, err := exec.Command("wg", "show", "wg0", "public-key").Output() + if err != nil { + return "", fmt.Errorf("failed to get local WG public key: %w", err) + } + + myWGIP, err := h.getMyWGIP() + if err != nil { + return "", fmt.Errorf("failed to get local WG IP: %w", err) + } + + myPublicIP, err := h.getMyPublicIP(ctx) + if err != nil { + return "", fmt.Errorf("failed to get local public IP: %w", err) + } + + var config strings.Builder + config.WriteString("[Interface]\n") + config.WriteString(fmt.Sprintf("PrivateKey = %s\n", privKey)) + config.WriteString(fmt.Sprintf("Address = %s/24\n", nodeWGIP)) + config.WriteString("ListenPort = 51820\n") + config.WriteString("\n") + + // Add this gateway node as a peer + config.WriteString("[Peer]\n") + config.WriteString(fmt.Sprintf("PublicKey = %s\n", strings.TrimSpace(string(myPubKey)))) + config.WriteString(fmt.Sprintf("Endpoint = %s:51820\n", myPublicIP)) + config.WriteString(fmt.Sprintf("AllowedIPs = %s/32\n", myWGIP)) + config.WriteString("PersistentKeepalive = 25\n") + + // Add all existing peers + type peerRow struct { + WGIP string `db:"wg_ip"` + PublicKey string `db:"public_key"` + PublicIP string `db:"public_ip"` + } + var peers []peerRow + if err := h.rqliteClient.Query(ctx, &peers, + "SELECT wg_ip, public_key, public_ip FROM wireguard_peers WHERE wg_ip != ?", nodeWGIP); err != nil { + h.logger.Warn("failed to query peers for WG config", zap.Error(err)) + } + + for _, p := range peers { + if p.PublicKey == strings.TrimSpace(string(myPubKey)) { + continue // already added above + } + config.WriteString(fmt.Sprintf("\n[Peer]\nPublicKey = %s\nEndpoint = %s:51820\nAllowedIPs = %s/32\nPersistentKeepalive = 25\n", + p.PublicKey, p.PublicIP, p.WGIP)) + } + + return config.String(), nil +} + +// getPeerList returns all cluster peers for LUKS key distribution. +func (h *Handler) getPeerList(ctx context.Context, excludeWGIP string) ([]PeerInfo, error) { + type peerRow struct { + NodeID string `db:"node_id"` + WGIP string `db:"wg_ip"` + } + var rows []peerRow + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT node_id, wg_ip FROM wireguard_peers WHERE wg_ip != ?", excludeWGIP); err != nil { + return nil, err + } + + peers := make([]PeerInfo, 0, len(rows)) + for _, row := range rows { + peers = append(peers, PeerInfo{ + WGIP: row.WGIP, + NodeID: row.NodeID, + }) + } + return peers, nil +} + +// getMyWGIP gets this node's WireGuard IP. +func (h *Handler) getMyWGIP() (string, error) { + out, err := exec.Command("ip", "-4", "addr", "show", "wg0").CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get wg0 info: %w", err) + } + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "inet ") { + parts := strings.Fields(line) + if len(parts) >= 2 { + return strings.Split(parts[1], "/")[0], nil + } + } + } + return "", fmt.Errorf("could not find wg0 IP") +} + +// getMyPublicIP reads this node's public IP from the database. +func (h *Handler) getMyPublicIP(ctx context.Context) (string, error) { + myWGIP, err := h.getMyWGIP() + if err != nil { + return "", err + } + var rows []struct { + PublicIP string `db:"public_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT public_ip FROM wireguard_peers WHERE wg_ip = ?", myWGIP); err != nil { + return "", err + } + if len(rows) == 0 { + return "", fmt.Errorf("no peer entry for WG IP %s", myWGIP) + } + return rows[0].PublicIP, nil +} diff --git a/pkg/gateway/handlers/enroll/node_proxy.go b/pkg/gateway/handlers/enroll/node_proxy.go new file mode 100644 index 0000000..9ca6f1b --- /dev/null +++ b/pkg/gateway/handlers/enroll/node_proxy.go @@ -0,0 +1,272 @@ +package enroll + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os/exec" + "strings" + "time" + + "go.uber.org/zap" +) + +// HandleNodeStatus proxies GET /v1/node/status to the agent over WireGuard. +// Query param: ?node_id= or ?wg_ip= +func (h *Handler) HandleNodeStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Proxy to agent's status endpoint + body, statusCode, err := h.proxyToAgent(wgIP, "GET", "/v1/agent/status", nil) + if err != nil { + h.logger.Warn("failed to proxy status request", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeCommand proxies POST /v1/node/command to the agent over WireGuard. +func (h *Handler) HandleNodeCommand(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Read command body + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + cmdBody, err := io.ReadAll(r.Body) + if err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + // Proxy to agent's command endpoint + body, statusCode, err := h.proxyToAgent(wgIP, "POST", "/v1/agent/command", cmdBody) + if err != nil { + h.logger.Warn("failed to proxy command", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeLogs proxies GET /v1/node/logs to the agent over WireGuard. +// Query params: ?node_id=&service=&lines= +func (h *Handler) HandleNodeLogs(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Build query string for agent + service := r.URL.Query().Get("service") + lines := r.URL.Query().Get("lines") + agentPath := "/v1/agent/logs" + params := []string{} + if service != "" { + params = append(params, "service="+service) + } + if lines != "" { + params = append(params, "lines="+lines) + } + if len(params) > 0 { + agentPath += "?" + strings.Join(params, "&") + } + + body, statusCode, err := h.proxyToAgent(wgIP, "GET", agentPath, nil) + if err != nil { + h.logger.Warn("failed to proxy logs request", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeLeave handles POST /v1/node/leave — graceful node departure. +// Orchestrates: stop services → redistribute Shamir shares → remove WG peer. +func (h *Handler) HandleNodeLeave(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + var req struct { + NodeID string `json:"node_id"` + WGIP string `json:"wg_ip"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + wgIP := req.WGIP + if wgIP == "" && req.NodeID != "" { + resolved, err := h.nodeIDToWGIP(r.Context(), req.NodeID) + if err != nil { + http.Error(w, "node not found: "+err.Error(), http.StatusNotFound) + return + } + wgIP = resolved + } + if wgIP == "" { + http.Error(w, "node_id or wg_ip is required", http.StatusBadRequest) + return + } + + h.logger.Info("node leave requested", zap.String("wg_ip", wgIP)) + + // Step 1: Tell the agent to stop services + _, _, err := h.proxyToAgent(wgIP, "POST", "/v1/agent/command", + []byte(`{"action":"stop"}`)) + if err != nil { + h.logger.Warn("failed to stop services on leaving node", zap.Error(err)) + // Continue — node may already be down + } + + // Step 2: Remove WG peer from database + ctx := r.Context() + if _, err := h.rqliteClient.Exec(ctx, + "DELETE FROM wireguard_peers WHERE wg_ip = ?", wgIP); err != nil { + h.logger.Error("failed to remove WG peer from database", zap.Error(err)) + http.Error(w, "failed to remove peer", http.StatusInternalServerError) + return + } + + // Step 3: Remove from local WireGuard interface + // Get the peer's public key first + var rows []struct { + PublicKey string `db:"public_key"` + } + _ = h.rqliteClient.Query(ctx, &rows, + "SELECT public_key FROM wireguard_peers WHERE wg_ip = ?", wgIP) + // Peer already deleted above, but try to remove from wg0 anyway + h.removeWGPeerLocally(wgIP) + + h.logger.Info("node removed from cluster", zap.String("wg_ip", wgIP)) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{ + "status": "removed", + "wg_ip": wgIP, + }) +} + +// proxyToAgent sends an HTTP request to the OramaOS agent over WireGuard. +func (h *Handler) proxyToAgent(wgIP, method, path string, body []byte) ([]byte, int, error) { + url := fmt.Sprintf("http://%s:9998%s", wgIP, path) + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + var reqBody io.Reader + if body != nil { + reqBody = strings.NewReader(string(body)) + } + + req, err := http.NewRequestWithContext(ctx, method, url, reqBody) + if err != nil { + return nil, 0, err + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + client := &http.Client{Timeout: 15 * time.Second} + resp, err := client.Do(req) + if err != nil { + return nil, 0, fmt.Errorf("request to agent failed: %w", err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, resp.StatusCode, fmt.Errorf("failed to read agent response: %w", err) + } + + return respBody, resp.StatusCode, nil +} + +// resolveNodeIP extracts the WG IP from query parameters. +func (h *Handler) resolveNodeIP(r *http.Request) (string, error) { + wgIP := r.URL.Query().Get("wg_ip") + if wgIP != "" { + return wgIP, nil + } + + nodeID := r.URL.Query().Get("node_id") + if nodeID != "" { + return h.nodeIDToWGIP(r.Context(), nodeID) + } + + return "", fmt.Errorf("wg_ip or node_id query parameter is required") +} + +// nodeIDToWGIP resolves a node_id to its WireGuard IP. +func (h *Handler) nodeIDToWGIP(ctx context.Context, nodeID string) (string, error) { + var rows []struct { + WGIP string `db:"wg_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT wg_ip FROM wireguard_peers WHERE node_id = ?", nodeID); err != nil { + return "", err + } + if len(rows) == 0 { + return "", fmt.Errorf("no node found with id %s", nodeID) + } + return rows[0].WGIP, nil +} + +// removeWGPeerLocally removes a peer from the local wg0 interface by its allowed IP. +func (h *Handler) removeWGPeerLocally(wgIP string) { + // Find peer public key by allowed IP + out, err := exec.Command("wg", "show", "wg0", "dump").Output() + if err != nil { + log.Printf("failed to get wg dump: %v", err) + return + } + + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Split(line, "\t") + if len(fields) >= 4 && strings.Contains(fields[3], wgIP) { + pubKey := fields[0] + exec.Command("wg", "set", "wg0", "peer", pubKey, "remove").Run() + log.Printf("removed WG peer %s (%s)", pubKey[:8]+"...", wgIP) + return + } + } +} diff --git a/pkg/gateway/routes.go b/pkg/gateway/routes.go index a791eda..809b419 100644 --- a/pkg/gateway/routes.go +++ b/pkg/gateway/routes.go @@ -39,6 +39,15 @@ func (g *Gateway) Routes() http.Handler { mux.HandleFunc("/v1/internal/join", g.joinHandler.HandleJoin) } + // OramaOS node management (handler does its own auth) + if g.enrollHandler != nil { + mux.HandleFunc("/v1/node/enroll", g.enrollHandler.HandleEnroll) + mux.HandleFunc("/v1/node/status", g.enrollHandler.HandleNodeStatus) + mux.HandleFunc("/v1/node/command", g.enrollHandler.HandleNodeCommand) + mux.HandleFunc("/v1/node/logs", g.enrollHandler.HandleNodeLogs) + mux.HandleFunc("/v1/node/leave", g.enrollHandler.HandleNodeLeave) + } + // Namespace instance spawn/stop (internal, handler does its own auth) if g.spawnHandler != nil { mux.Handle("/v1/internal/namespace/spawn", g.spawnHandler) From 646801913690bc9c8c0c2f328b1d61d2d50f0634 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Sat, 7 Mar 2026 14:27:09 +0200 Subject: [PATCH 10/13] feat(sandbox): optimize archive upload via server-to-server fanout - add WithNoHostKeyCheck option for ephemeral server IPs - upload binary to genesis then distribute to other nodes (faster) - improve provisioning error handling for cleanup on partial failure --- pkg/cli/remotessh/ssh.go | 39 +++++-- pkg/cli/sandbox/create.go | 121 +++++++++++++------- pkg/cli/sandbox/rollout.go | 6 +- pkg/cli/sandbox/ssh_cmd.go | 3 +- pkg/environments/production/orchestrator.go | 17 ++- pkg/environments/production/services.go | 1 + 6 files changed, 131 insertions(+), 56 deletions(-) diff --git a/pkg/cli/remotessh/ssh.go b/pkg/cli/remotessh/ssh.go index 803c384..3ce5157 100644 --- a/pkg/cli/remotessh/ssh.go +++ b/pkg/cli/remotessh/ssh.go @@ -12,7 +12,8 @@ import ( type SSHOption func(*sshOptions) type sshOptions struct { - agentForward bool + agentForward bool + noHostKeyCheck bool } // WithAgentForward enables SSH agent forwarding (-A flag). @@ -21,22 +22,35 @@ func WithAgentForward() SSHOption { return func(o *sshOptions) { o.agentForward = true } } +// WithNoHostKeyCheck disables host key verification and uses /dev/null as known_hosts. +// Use for ephemeral servers (sandbox) where IPs are frequently recycled. +func WithNoHostKeyCheck() SSHOption { + return func(o *sshOptions) { o.noHostKeyCheck = true } +} + // UploadFile copies a local file to a remote host via SCP. // Requires node.SSHKey to be set (via PrepareNodeKeys). -func UploadFile(node inspector.Node, localPath, remotePath string) error { +func UploadFile(node inspector.Node, localPath, remotePath string, opts ...SSHOption) error { if node.SSHKey == "" { return fmt.Errorf("no SSH key for %s (call PrepareNodeKeys first)", node.Name()) } + var cfg sshOptions + for _, o := range opts { + o(&cfg) + } + dest := fmt.Sprintf("%s@%s:%s", node.User, node.Host, remotePath) - cmd := exec.Command("scp", - "-o", "StrictHostKeyChecking=accept-new", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, - localPath, dest, - ) + args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey} + if cfg.noHostKeyCheck { + args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...) + } else { + args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...) + } + args = append(args, localPath, dest) + cmd := exec.Command("scp", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr @@ -59,10 +73,11 @@ func RunSSHStreaming(node inspector.Node, command string, opts ...SSHOption) err o(&cfg) } - args := []string{ - "-o", "StrictHostKeyChecking=accept-new", - "-o", "ConnectTimeout=10", - "-i", node.SSHKey, + args := []string{"-o", "ConnectTimeout=10", "-i", node.SSHKey} + if cfg.noHostKeyCheck { + args = append([]string{"-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"}, args...) + } else { + args = append([]string{"-o", "StrictHostKeyChecking=accept-new"}, args...) } if cfg.agentForward { args = append(args, "-A") diff --git a/pkg/cli/sandbox/create.go b/pkg/cli/sandbox/create.go index 292e1d9..cac3bd0 100644 --- a/pkg/cli/sandbox/create.go +++ b/pkg/cli/sandbox/create.go @@ -138,10 +138,14 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt } servers := make([]ServerState, 5) + var firstErr error for i := 0; i < 5; i++ { r := <-results if r.err != nil { - return fmt.Errorf("server %d: %w", r.index+1, r.err) + if firstErr == nil { + firstErr = fmt.Errorf("server %d: %w", r.index+1, r.err) + } + continue } fmt.Printf(" Created %s (ID: %d, initializing...)\n", r.server.Name, r.server.ID) role := "node" @@ -154,6 +158,10 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt Role: role, } } + state.Servers = servers // populate before returning so cleanup can delete created servers + if firstErr != nil { + return firstErr + } // Wait for all servers to reach "running" fmt.Print(" Waiting for servers to boot...") @@ -210,12 +218,12 @@ func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxS } // Wait for SSH to be ready on freshly booted servers - if err := waitForSSH(node, 2*time.Minute); err != nil { + if err := waitForSSH(node, 5*time.Minute); err != nil { return fmt.Errorf("SSH not ready on %s: %w", srv.Name, err) } cmd := fmt.Sprintf("ip addr add %s/32 dev lo 2>/dev/null || true", fip.IP) - if err := remotessh.RunSSHStreaming(node, cmd); err != nil { + if err := remotessh.RunSSHStreaming(node, cmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("configure loopback on %s: %w", srv.Name, err) } } @@ -236,9 +244,9 @@ func waitForSSH(node inspector.Node, timeout time.Duration) error { return fmt.Errorf("timeout after %s", timeout) } -// phase3UploadArchive builds (if needed) and uploads the binary archive to all nodes. +// phase3UploadArchive uploads the binary archive to the genesis node, then fans out +// to the remaining nodes server-to-server (much faster than uploading from local machine). func phase3UploadArchive(cfg *Config, state *SandboxState) error { - // Find existing archive archivePath := findNewestArchive() if archivePath == "" { fmt.Println(" No binary archive found, run `orama build` first") @@ -250,40 +258,73 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error { sshKeyPath := cfg.ExpandedPrivateKeyPath() remotePath := "/tmp/" + filepath.Base(archivePath) + extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", + remotePath, remotePath) - // Upload to all 5 nodes in parallel - var wg sync.WaitGroup - errs := make([]error, len(state.Servers)) + // Step 1: Upload from local machine to genesis node + genesis := state.Servers[0] + genesisNode := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} - for i, srv := range state.Servers { - wg.Add(1) - go func(idx int, srv ServerState) { - defer wg.Done() - node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} - - if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { - errs[idx] = fmt.Errorf("upload to %s: %w", srv.Name, err) - return - } - - // Extract + install CLI - extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s && cp /opt/orama/bin/orama /usr/local/bin/orama && chmod +x /usr/local/bin/orama", - remotePath, remotePath) - if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { - errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) - return - } - fmt.Printf(" Uploaded to %s\n", srv.Name) - }(i, srv) + fmt.Printf(" Uploading to %s (genesis)...\n", genesis.Name) + if err := remotessh.UploadFile(genesisNode, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload to %s: %w", genesis.Name, err) } - wg.Wait() - for _, err := range errs { - if err != nil { - return err + // Step 2: Fan out from genesis to remaining nodes in parallel (server-to-server) + if len(state.Servers) > 1 { + fmt.Printf(" Fanning out from %s to %d nodes...\n", genesis.Name, len(state.Servers)-1) + + // Temporarily upload SSH key to genesis for server-to-server SCP + remoteKeyPath := "/tmp/.sandbox_key" + if err := remotessh.UploadFile(genesisNode, sshKeyPath, remoteKeyPath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload SSH key to genesis: %w", err) + } + // Always clean up the temporary key, even on panic/early return + defer remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("rm -f %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()) + + if err := remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("chmod 600 %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("chmod SSH key on genesis: %w", err) + } + + var wg sync.WaitGroup + errs := make([]error, len(state.Servers)) + + for i := 1; i < len(state.Servers); i++ { + wg.Add(1) + go func(idx int, srv ServerState) { + defer wg.Done() + // SCP from genesis to target using the uploaded key + scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i %s %s root@%s:%s", + remoteKeyPath, remotePath, srv.IP, remotePath) + if err := remotessh.RunSSHStreaming(genesisNode, scpCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("fanout to %s: %w", srv.Name, err) + return + } + // Extract on target + targetNode := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + if err := remotessh.RunSSHStreaming(targetNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) + return + } + fmt.Printf(" Distributed to %s\n", srv.Name) + }(i, state.Servers[i]) + } + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } } } + // Step 3: Extract on genesis + fmt.Printf(" Extracting on %s...\n", genesis.Name) + if err := remotessh.RunSSHStreaming(genesisNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("extract on %s: %w", genesis.Name, err) + } + + fmt.Println(" All nodes ready") return nil } @@ -294,10 +335,10 @@ func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) { node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} // Install genesis - installCmd := fmt.Sprintf("orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + installCmd := fmt.Sprintf("/opt/orama/bin/orama node install --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesis.IP, cfg.Domain, cfg.Domain) fmt.Printf(" Installing on %s (%s)...\n", genesis.Name, genesis.IP) - if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return nil, fmt.Errorf("install genesis: %w", err) } @@ -338,15 +379,15 @@ func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error { var installCmd string if srv.Role == "nameserver" { - installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", + installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --domain %s --base-domain %s --nameserver --skip-checks", genesisIP, token, srv.IP, cfg.Domain, cfg.Domain) } else { - installCmd = fmt.Sprintf("orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", + installCmd = fmt.Sprintf("/opt/orama/bin/orama node install --join http://%s --token %s --vps-ip %s --base-domain %s --skip-checks", genesisIP, token, srv.IP, cfg.Domain) } fmt.Printf(" [%d/%d] Joining %s (%s, %s)...\n", i, len(state.Servers)-1, srv.Name, srv.IP, srv.Role) - if err := remotessh.RunSSHStreaming(node, installCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, installCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("join %s: %w", srv.Name, err) } @@ -402,7 +443,7 @@ func waitForRQLiteHealth(node inspector.Node, timeout time.Duration) error { // generateInviteToken runs `orama node invite` on the node and parses the token. func generateInviteToken(node inspector.Node) (string, error) { - out, err := runSSHOutput(node, "orama node invite --expiry 1h 2>&1") + out, err := runSSHOutput(node, "/opt/orama/bin/orama node invite --expiry 1h 2>&1") if err != nil { return "", fmt.Errorf("invite command failed: %w", err) } @@ -451,10 +492,12 @@ func isHex(s string) bool { } // runSSHOutput runs a command via SSH and returns stdout as a string. +// Uses StrictHostKeyChecking=no because sandbox IPs are frequently recycled. func runSSHOutput(node inspector.Node, command string) (string, error) { args := []string{ "ssh", "-n", - "-o", "StrictHostKeyChecking=accept-new", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-o", "ConnectTimeout=10", "-o", "BatchMode=yes", "-i", node.SSHKey, diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go index 8c7ccfd..ac186ee 100644 --- a/pkg/cli/sandbox/rollout.go +++ b/pkg/cli/sandbox/rollout.go @@ -42,14 +42,14 @@ func Rollout(name string) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Uploading to %s...\n", i+1, len(state.Servers), srv.Name) - if err := remotessh.UploadFile(node, archivePath, remotePath); err != nil { + if err := remotessh.UploadFile(node, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upload to %s: %w", srv.Name, err) } // Extract archive extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", remotePath, remotePath) - if err := remotessh.RunSSHStreaming(node, extractCmd); err != nil { + if err := remotessh.RunSSHStreaming(node, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("extract on %s: %w", srv.Name, err) } } @@ -107,7 +107,7 @@ func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP) - if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart"); err != nil { + if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart", remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upgrade %s: %w", srv.Name, err) } diff --git a/pkg/cli/sandbox/ssh_cmd.go b/pkg/cli/sandbox/ssh_cmd.go index ed3bf61..f09ef08 100644 --- a/pkg/cli/sandbox/ssh_cmd.go +++ b/pkg/cli/sandbox/ssh_cmd.go @@ -36,7 +36,8 @@ func SSHInto(name string, nodeNum int) error { // Replace current process with SSH args := []string{ "ssh", - "-o", "StrictHostKeyChecking=accept-new", + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", "-i", sshKeyPath, fmt.Sprintf("root@%s", srv.IP), } diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index 339e7d3..fce62b0 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -706,6 +706,20 @@ func (ps *ProductionSetup) Phase4GenerateConfigs(peerAddresses []string, vpsIP s func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { ps.logf("Phase 5: Creating systemd services...") + // Re-chown all orama directories to the orama user. + // Phases 2b-4 create files as root (IPFS repo, configs, secrets, etc.) + // that must be readable/writable by the orama service user. + if err := exec.Command("id", "orama").Run(); err == nil { + for _, dir := range []string{ps.oramaDir, filepath.Join(ps.oramaHome, "bin")} { + if _, statErr := os.Stat(dir); statErr == nil { + if output, chownErr := exec.Command("chown", "-R", "orama:orama", dir).CombinedOutput(); chownErr != nil { + ps.logf(" ⚠️ Failed to chown %s: %v\n%s", dir, chownErr, string(output)) + } + } + } + ps.logf(" ✓ File ownership updated for orama user") + } + // Validate all required binaries are available before creating services ipfsBinary, err := ps.binaryInstaller.ResolveBinaryPath("ipfs", "/usr/local/bin/ipfs", "/usr/bin/ipfs") if err != nil { @@ -795,8 +809,9 @@ func (ps *ProductionSetup) Phase5CreateSystemdServices(enableHTTPS bool) error { // Caddy service on ALL nodes (any node may host namespaces and need TLS) if _, err := os.Stat("/usr/bin/caddy"); err == nil { - // Create caddy data directory + // Create caddy data directory and ensure orama user can write to it exec.Command("mkdir", "-p", "/var/lib/caddy").Run() + exec.Command("chown", "-R", "orama:orama", "/var/lib/caddy").Run() caddyUnit := ps.serviceGenerator.GenerateCaddyService() if err := ps.serviceController.WriteServiceUnit("caddy.service", caddyUnit); err != nil { diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 6c08da9..24c2a37 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -424,6 +424,7 @@ Wants=orama-node.service Type=simple %[1]s ReadWritePaths=%[2]s /var/lib/caddy /etc/caddy +Environment=XDG_DATA_HOME=/var/lib/caddy AmbientCapabilities=CAP_NET_BIND_SERVICE CapabilityBoundingSet=CAP_NET_BIND_SERVICE ExecStart=/usr/bin/caddy run --environ --config /etc/caddy/Caddyfile From 78d876e71bd3c4942c78bfd60930107819c628d9 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Mon, 9 Mar 2026 10:19:40 +0200 Subject: [PATCH 11/13] feat(monitor): add sandbox environment support - load nodes from active sandbox state for env=sandbox - extract fanoutArchive for efficient server-to-server distribution --- pkg/cli/monitor/collector.go | 76 ++++++++++--- pkg/cli/sandbox/create.go | 68 +----------- pkg/cli/sandbox/fanout.go | 84 +++++++++++++++ pkg/cli/sandbox/rollout.go | 32 +++--- pkg/client/database_client.go | 33 +++++- pkg/client/database_client_test.go | 82 ++++++++++++++ pkg/environments/production/orchestrator.go | 7 ++ pkg/environments/production/prebuilt.go | 8 ++ pkg/environments/production/services.go | 1 + pkg/gateway/handlers/join/handler.go | 96 +++++++++++++++-- pkg/gateway/handlers/join/handler_test.go | 112 ++++++++++++++++++++ pkg/gateway/status_handlers.go | 46 ++++---- pkg/gateway/status_handlers_test.go | 72 +++++++++++++ 13 files changed, 588 insertions(+), 129 deletions(-) create mode 100644 pkg/cli/sandbox/fanout.go create mode 100644 pkg/client/database_client_test.go create mode 100644 pkg/gateway/handlers/join/handler_test.go create mode 100644 pkg/gateway/status_handlers_test.go diff --git a/pkg/cli/monitor/collector.go b/pkg/cli/monitor/collector.go index 1e7ec53..1742667 100644 --- a/pkg/cli/monitor/collector.go +++ b/pkg/cli/monitor/collector.go @@ -9,6 +9,7 @@ import ( "github.com/DeBrosOfficial/network/pkg/cli/production/report" "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/cli/sandbox" "github.com/DeBrosOfficial/network/pkg/inspector" ) @@ -23,22 +24,9 @@ type CollectorConfig struct { // CollectOnce runs `sudo orama node report --json` on all matching nodes // in parallel and returns a ClusterSnapshot. func CollectOnce(ctx context.Context, cfg CollectorConfig) (*ClusterSnapshot, error) { - nodes, err := inspector.LoadNodes(cfg.ConfigPath) + nodes, cleanup, err := loadNodes(cfg) if err != nil { - return nil, fmt.Errorf("load nodes: %w", err) - } - nodes = inspector.FilterByEnv(nodes, cfg.Env) - if cfg.NodeFilter != "" { - nodes = filterByHost(nodes, cfg.NodeFilter) - } - if len(nodes) == 0 { - return nil, fmt.Errorf("no nodes found for env %q", cfg.Env) - } - - // Prepare wallet-derived SSH keys - cleanup, err := remotessh.PrepareNodeKeys(nodes) - if err != nil { - return nil, fmt.Errorf("prepare SSH keys: %w", err) + return nil, err } defer cleanup() @@ -121,3 +109,61 @@ func truncate(s string, maxLen int) string { } return s[:maxLen] + "..." } + +// loadNodes resolves the node list and SSH keys based on the environment. +// For "sandbox", nodes are loaded from the active sandbox state file with +// the sandbox SSH key already set. For other environments, nodes come from +// nodes.conf and use wallet-derived SSH keys. +func loadNodes(cfg CollectorConfig) ([]inspector.Node, func(), error) { + noop := func() {} + + if cfg.Env == "sandbox" { + return loadSandboxNodes(cfg) + } + + nodes, err := inspector.LoadNodes(cfg.ConfigPath) + if err != nil { + return nil, noop, fmt.Errorf("load nodes: %w", err) + } + nodes = inspector.FilterByEnv(nodes, cfg.Env) + if cfg.NodeFilter != "" { + nodes = filterByHost(nodes, cfg.NodeFilter) + } + if len(nodes) == 0 { + return nil, noop, fmt.Errorf("no nodes found for env %q", cfg.Env) + } + + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return nil, noop, fmt.Errorf("prepare SSH keys: %w", err) + } + return nodes, cleanup, nil +} + +// loadSandboxNodes loads nodes from the active sandbox state file. +func loadSandboxNodes(cfg CollectorConfig) ([]inspector.Node, func(), error) { + noop := func() {} + + sbxCfg, err := sandbox.LoadConfig() + if err != nil { + return nil, noop, fmt.Errorf("load sandbox config: %w", err) + } + + state, err := sandbox.FindActiveSandbox() + if err != nil { + return nil, noop, fmt.Errorf("find active sandbox: %w", err) + } + if state == nil { + return nil, noop, fmt.Errorf("no active sandbox found") + } + + nodes := state.ToNodes(sbxCfg.ExpandedPrivateKeyPath()) + if cfg.NodeFilter != "" { + nodes = filterByHost(nodes, cfg.NodeFilter) + } + if len(nodes) == 0 { + return nil, noop, fmt.Errorf("no nodes found for sandbox %q", state.Name) + } + + return nodes, noop, nil +} diff --git a/pkg/cli/sandbox/create.go b/pkg/cli/sandbox/create.go index cac3bd0..29434ac 100644 --- a/pkg/cli/sandbox/create.go +++ b/pkg/cli/sandbox/create.go @@ -6,7 +6,6 @@ import ( "os/exec" "path/filepath" "strings" - "sync" "time" "github.com/DeBrosOfficial/network/pkg/cli/remotessh" @@ -257,71 +256,8 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error { fmt.Printf(" Archive: %s (%s)\n", filepath.Base(archivePath), formatBytes(info.Size())) sshKeyPath := cfg.ExpandedPrivateKeyPath() - remotePath := "/tmp/" + filepath.Base(archivePath) - extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", - remotePath, remotePath) - - // Step 1: Upload from local machine to genesis node - genesis := state.Servers[0] - genesisNode := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} - - fmt.Printf(" Uploading to %s (genesis)...\n", genesis.Name) - if err := remotessh.UploadFile(genesisNode, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("upload to %s: %w", genesis.Name, err) - } - - // Step 2: Fan out from genesis to remaining nodes in parallel (server-to-server) - if len(state.Servers) > 1 { - fmt.Printf(" Fanning out from %s to %d nodes...\n", genesis.Name, len(state.Servers)-1) - - // Temporarily upload SSH key to genesis for server-to-server SCP - remoteKeyPath := "/tmp/.sandbox_key" - if err := remotessh.UploadFile(genesisNode, sshKeyPath, remoteKeyPath, remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("upload SSH key to genesis: %w", err) - } - // Always clean up the temporary key, even on panic/early return - defer remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("rm -f %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()) - - if err := remotessh.RunSSHStreaming(genesisNode, fmt.Sprintf("chmod 600 %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("chmod SSH key on genesis: %w", err) - } - - var wg sync.WaitGroup - errs := make([]error, len(state.Servers)) - - for i := 1; i < len(state.Servers); i++ { - wg.Add(1) - go func(idx int, srv ServerState) { - defer wg.Done() - // SCP from genesis to target using the uploaded key - scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i %s %s root@%s:%s", - remoteKeyPath, remotePath, srv.IP, remotePath) - if err := remotessh.RunSSHStreaming(genesisNode, scpCmd, remotessh.WithNoHostKeyCheck()); err != nil { - errs[idx] = fmt.Errorf("fanout to %s: %w", srv.Name, err) - return - } - // Extract on target - targetNode := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} - if err := remotessh.RunSSHStreaming(targetNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { - errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) - return - } - fmt.Printf(" Distributed to %s\n", srv.Name) - }(i, state.Servers[i]) - } - wg.Wait() - - for _, err := range errs { - if err != nil { - return err - } - } - } - - // Step 3: Extract on genesis - fmt.Printf(" Extracting on %s...\n", genesis.Name) - if err := remotessh.RunSSHStreaming(genesisNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("extract on %s: %w", genesis.Name, err) + if err := fanoutArchive(state.Servers, sshKeyPath, archivePath); err != nil { + return err } fmt.Println(" All nodes ready") diff --git a/pkg/cli/sandbox/fanout.go b/pkg/cli/sandbox/fanout.go new file mode 100644 index 0000000..be9fc16 --- /dev/null +++ b/pkg/cli/sandbox/fanout.go @@ -0,0 +1,84 @@ +package sandbox + +import ( + "fmt" + "path/filepath" + "sync" + + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// fanoutArchive uploads a binary archive to the first server, then fans out +// server-to-server in parallel to all remaining servers. This is much faster +// than uploading from the local machine to each node individually. +// After distribution, the archive is extracted on all nodes. +func fanoutArchive(servers []ServerState, sshKeyPath, archivePath string) error { + remotePath := "/tmp/" + filepath.Base(archivePath) + extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", + remotePath, remotePath) + + // Step 1: Upload from local machine to first node + first := servers[0] + firstNode := inspector.Node{User: "root", Host: first.IP, SSHKey: sshKeyPath} + + fmt.Printf(" Uploading to %s...\n", first.Name) + if err := remotessh.UploadFile(firstNode, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload to %s: %w", first.Name, err) + } + + // Step 2: Fan out from first node to remaining nodes in parallel (server-to-server) + if len(servers) > 1 { + fmt.Printf(" Fanning out from %s to %d nodes...\n", first.Name, len(servers)-1) + + // Temporarily upload SSH key for server-to-server SCP + remoteKeyPath := "/tmp/.sandbox_key" + if err := remotessh.UploadFile(firstNode, sshKeyPath, remoteKeyPath, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("upload SSH key to %s: %w", first.Name, err) + } + defer remotessh.RunSSHStreaming(firstNode, fmt.Sprintf("rm -f %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()) + + if err := remotessh.RunSSHStreaming(firstNode, fmt.Sprintf("chmod 600 %s", remoteKeyPath), remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("chmod SSH key on %s: %w", first.Name, err) + } + + var wg sync.WaitGroup + errs := make([]error, len(servers)) + + for i := 1; i < len(servers); i++ { + wg.Add(1) + go func(idx int, srv ServerState) { + defer wg.Done() + // SCP from first node to target + scpCmd := fmt.Sprintf("scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i %s %s root@%s:%s", + remoteKeyPath, remotePath, srv.IP, remotePath) + if err := remotessh.RunSSHStreaming(firstNode, scpCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("fanout to %s: %w", srv.Name, err) + return + } + // Extract on target + targetNode := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} + if err := remotessh.RunSSHStreaming(targetNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + errs[idx] = fmt.Errorf("extract on %s: %w", srv.Name, err) + return + } + fmt.Printf(" Distributed to %s\n", srv.Name) + }(i, servers[i]) + } + wg.Wait() + + for _, err := range errs { + if err != nil { + return err + } + } + } + + // Step 3: Extract on first node + fmt.Printf(" Extracting on %s...\n", first.Name) + if err := remotessh.RunSSHStreaming(firstNode, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("extract on %s: %w", first.Name, err) + } + + return nil +} diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go index ac186ee..396e8f4 100644 --- a/pkg/cli/sandbox/rollout.go +++ b/pkg/cli/sandbox/rollout.go @@ -34,24 +34,10 @@ func Rollout(name string) error { info, _ := os.Stat(archivePath) fmt.Printf("Archive: %s (%s)\n\n", filepath.Base(archivePath), formatBytes(info.Size())) - // Step 2: Push archive to all nodes + // Step 2: Push archive to all nodes (upload to first, fan out server-to-server) fmt.Println("Pushing archive to all nodes...") - remotePath := "/tmp/" + filepath.Base(archivePath) - - for i, srv := range state.Servers { - node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} - - fmt.Printf(" [%d/%d] Uploading to %s...\n", i+1, len(state.Servers), srv.Name) - if err := remotessh.UploadFile(node, archivePath, remotePath, remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("upload to %s: %w", srv.Name, err) - } - - // Extract archive - extractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", - remotePath, remotePath) - if err := remotessh.RunSSHStreaming(node, extractCmd, remotessh.WithNoHostKeyCheck()); err != nil { - return fmt.Errorf("extract on %s: %w", srv.Name, err) - } + if err := fanoutArchive(state.Servers, sshKeyPath, archivePath); err != nil { + return err } // Step 3: Rolling upgrade — followers first, leader last @@ -103,10 +89,22 @@ func findLeaderIndex(state *SandboxState, sshKeyPath string) int { } // upgradeNode performs `orama node upgrade --restart` on a single node. +// It pre-replaces the orama CLI binary before running the upgrade command +// to avoid ETXTBSY ("text file busy") errors when the old binary doesn't +// have the os.Remove fix in copyBinary(). func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP) + + // Pre-replace the orama CLI so the upgrade runs the NEW binary (with ETXTBSY fix). + // rm unlinks the old inode (kernel keeps it alive for the running process), + // cp creates a fresh inode at the same path. + preReplace := "rm -f /usr/local/bin/orama && cp /opt/orama/bin/orama /usr/local/bin/orama" + if err := remotessh.RunSSHStreaming(node, preReplace, remotessh.WithNoHostKeyCheck()); err != nil { + return fmt.Errorf("pre-replace orama binary on %s: %w", srv.Name, err) + } + if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart", remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upgrade %s: %w", srv.Name, err) } diff --git a/pkg/client/database_client.go b/pkg/client/database_client.go index cd8a85c..dc209d3 100644 --- a/pkg/client/database_client.go +++ b/pkg/client/database_client.go @@ -9,6 +9,31 @@ import ( "github.com/rqlite/gorqlite" ) +// safeWriteOne wraps gorqlite's WriteOneParameterized to recover from panics. +// gorqlite's WriteOne* functions access wra[0] without checking if the slice +// is empty, which panics when the server returns an error (e.g. "leader not found") +// with no result rows. +func safeWriteOne(conn *gorqlite.Connection, stmt gorqlite.ParameterizedStatement) (wr gorqlite.WriteResult, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("rqlite write failed (recovered panic): %v", r) + } + }() + wr, err = conn.WriteOneParameterized(stmt) + return +} + +// safeWriteOneRaw wraps gorqlite's WriteOne to recover from panics. +func safeWriteOneRaw(conn *gorqlite.Connection, sql string) (wr gorqlite.WriteResult, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("rqlite write failed (recovered panic): %v", r) + } + }() + wr, err = conn.WriteOne(sql) + return +} + // DatabaseClientImpl implements DatabaseClient type DatabaseClientImpl struct { client *Client @@ -79,7 +104,7 @@ func (d *DatabaseClientImpl) Query(ctx context.Context, sql string, args ...inte if isWriteOperation { // Execute write operation with parameters - _, err := conn.WriteOneParameterized(gorqlite.ParameterizedStatement{ + _, err := safeWriteOne(conn, gorqlite.ParameterizedStatement{ Query: sql, Arguments: args, }) @@ -293,7 +318,7 @@ func (d *DatabaseClientImpl) Transaction(ctx context.Context, queries []string) // Execute all queries in the transaction success := true for _, query := range queries { - _, err := conn.WriteOne(query) + _, err := safeWriteOneRaw(conn, query) if err != nil { lastErr = err success = false @@ -321,7 +346,7 @@ func (d *DatabaseClientImpl) CreateTable(ctx context.Context, schema string) err } return d.withRetry(func(conn *gorqlite.Connection) error { - _, err := conn.WriteOne(schema) + _, err := safeWriteOneRaw(conn, schema) return err }) } @@ -334,7 +359,7 @@ func (d *DatabaseClientImpl) DropTable(ctx context.Context, tableName string) er return d.withRetry(func(conn *gorqlite.Connection) error { dropSQL := fmt.Sprintf("DROP TABLE IF EXISTS %s", tableName) - _, err := conn.WriteOne(dropSQL) + _, err := safeWriteOneRaw(conn, dropSQL) return err }) } diff --git a/pkg/client/database_client_test.go b/pkg/client/database_client_test.go new file mode 100644 index 0000000..31de01b --- /dev/null +++ b/pkg/client/database_client_test.go @@ -0,0 +1,82 @@ +package client + +import ( + "fmt" + "testing" + + "github.com/rqlite/gorqlite" +) + +// mockPanicConnection simulates what gorqlite does when WriteParameterized +// returns an empty slice: accessing [0] panics. +func simulateGorqlitePanic() (gorqlite.WriteResult, error) { + var empty []gorqlite.WriteResult + return empty[0], fmt.Errorf("leader not found") // panics +} + +func TestSafeWriteOne_recoversPanic(t *testing.T) { + // We can't easily create a real gorqlite.Connection that panics, + // but we can verify our recovery wrapper works by testing the + // recovery pattern directly. + var recovered bool + func() { + defer func() { + if r := recover(); r != nil { + recovered = true + } + }() + simulateGorqlitePanic() + }() + + if !recovered { + t.Fatal("expected simulateGorqlitePanic to panic, but it didn't") + } +} + +func TestSafeWriteOne_nilConnection(t *testing.T) { + // safeWriteOne with nil connection should recover from panic, not crash. + _, err := safeWriteOne(nil, gorqlite.ParameterizedStatement{ + Query: "INSERT INTO test (a) VALUES (?)", + Arguments: []interface{}{"x"}, + }) + if err == nil { + t.Fatal("expected error from nil connection, got nil") + } +} + +func TestSafeWriteOneRaw_nilConnection(t *testing.T) { + // safeWriteOneRaw with nil connection should recover from panic, not crash. + _, err := safeWriteOneRaw(nil, "INSERT INTO test (a) VALUES ('x')") + if err == nil { + t.Fatal("expected error from nil connection, got nil") + } +} + +func TestIsWriteOperation(t *testing.T) { + d := &DatabaseClientImpl{} + + tests := []struct { + sql string + isWrite bool + }{ + {"INSERT INTO foo VALUES (1)", true}, + {" INSERT INTO foo VALUES (1)", true}, + {"UPDATE foo SET a = 1", true}, + {"DELETE FROM foo", true}, + {"CREATE TABLE foo (a TEXT)", true}, + {"DROP TABLE foo", true}, + {"ALTER TABLE foo ADD COLUMN b TEXT", true}, + {"SELECT * FROM foo", false}, + {" SELECT * FROM foo", false}, + {"EXPLAIN SELECT * FROM foo", false}, + } + + for _, tt := range tests { + t.Run(tt.sql, func(t *testing.T) { + got := d.isWriteOperation(tt.sql) + if got != tt.isWrite { + t.Errorf("isWriteOperation(%q) = %v, want %v", tt.sql, got, tt.isWrite) + } + }) + } +} diff --git a/pkg/environments/production/orchestrator.go b/pkg/environments/production/orchestrator.go index fce62b0..7458c75 100644 --- a/pkg/environments/production/orchestrator.go +++ b/pkg/environments/production/orchestrator.go @@ -997,6 +997,13 @@ func (ps *ProductionSetup) Phase6SetupWireGuard(isFirstNode bool) (privateKey, p } ps.logf(" ✓ WireGuard keypair generated") + // Save public key to orama secrets so the gateway (running as orama user) + // can read it without needing root access to /etc/wireguard/wg0.conf + pubKeyPath := filepath.Join(ps.oramaDir, "secrets", "wg-public-key") + if err := os.WriteFile(pubKeyPath, []byte(pubKey), 0600); err != nil { + return "", "", fmt.Errorf("failed to save WG public key: %w", err) + } + if isFirstNode { // First node: self-assign 10.0.0.1, no peers yet wp.config = WireGuardConfig{ diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go index 6bbcba2..a04fe4f 100644 --- a/pkg/environments/production/prebuilt.go +++ b/pkg/environments/production/prebuilt.go @@ -291,12 +291,20 @@ func (ps *ProductionSetup) installAnyonFromPreBuilt() error { } // copyBinary copies a file from src to dest, preserving executable permissions. +// It removes the destination first to avoid ETXTBSY ("text file busy") errors +// when overwriting a binary that is currently running. func copyBinary(src, dest string) error { // Ensure parent directory exists if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { return err } + // Remove the old binary first. On Linux, if the binary is running, + // rm unlinks the filename while the kernel keeps the inode alive for + // the running process. Writing a new file at the same path creates a + // fresh inode — no ETXTBSY conflict. + _ = os.Remove(dest) + srcFile, err := os.Open(src) if err != nil { return err diff --git a/pkg/environments/production/services.go b/pkg/environments/production/services.go index 24c2a37..3eca7e0 100644 --- a/pkg/environments/production/services.go +++ b/pkg/environments/production/services.go @@ -213,6 +213,7 @@ Requires=wg-quick@wg0.service [Service] Type=simple %[5]s +AmbientCapabilities=CAP_NET_ADMIN ReadWritePaths=%[2]s WorkingDirectory=%[1]s Environment=HOME=%[1]s diff --git a/pkg/gateway/handlers/join/handler.go b/pkg/gateway/handlers/join/handler.go index 301b39b..678c82f 100644 --- a/pkg/gateway/handlers/join/handler.go +++ b/pkg/gateway/handlers/join/handler.go @@ -2,8 +2,10 @@ package join import ( "context" + "encoding/base64" "encoding/json" "fmt" + "net" "net/http" "os" "os/exec" @@ -100,6 +102,24 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { return } + // Validate public IP format + if net.ParseIP(req.PublicIP) == nil || net.ParseIP(req.PublicIP).To4() == nil { + http.Error(w, "public_ip must be a valid IPv4 address", http.StatusBadRequest) + return + } + + // Validate WireGuard public key: must be base64-encoded 32 bytes (Curve25519) + // Also reject control characters (newlines) to prevent config injection + if strings.ContainsAny(req.WGPublicKey, "\n\r") { + http.Error(w, "wg_public_key contains invalid characters", http.StatusBadRequest) + return + } + wgKeyBytes, err := base64.StdEncoding.DecodeString(req.WGPublicKey) + if err != nil || len(wgKeyBytes) != 32 { + http.Error(w, "wg_public_key must be a valid base64-encoded 32-byte key", http.StatusBadRequest) + return + } + ctx := r.Context() // 1. Validate and consume the invite token (atomic single-use) @@ -177,7 +197,15 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { olricEncryptionKey = strings.TrimSpace(string(data)) } - // 7. Get all WG peers + // 7. Get this node's WG IP (needed before peer list to check self-inclusion) + myWGIP, err := h.getMyWGIP() + if err != nil { + h.logger.Error("failed to get local WG IP", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 8. Get all WG peers wgPeers, err := h.getWGPeers(ctx, req.WGPublicKey) if err != nil { h.logger.Error("failed to list WG peers", zap.Error(err)) @@ -185,12 +213,29 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { return } - // 8. Get this node's WG IP - myWGIP, err := h.getMyWGIP() - if err != nil { - h.logger.Error("failed to get local WG IP", zap.Error(err)) - http.Error(w, "internal error", http.StatusInternalServerError) - return + // Ensure this node (the join handler's host) is in the peer list. + // On a fresh genesis node, the WG sync loop may not have self-registered + // into wireguard_peers yet, causing 0 peers to be returned. + if !wgPeersContainsIP(wgPeers, myWGIP) { + myPubKey, err := h.getMyWGPublicKey() + if err != nil { + h.logger.Error("failed to get local WG public key", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + myPublicIP, err := h.getMyPublicIP() + if err != nil { + h.logger.Error("failed to get local public IP", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + wgPeers = append([]WGPeerInfo{{ + PublicKey: myPubKey, + Endpoint: fmt.Sprintf("%s:%d", myPublicIP, 51820), + AllowedIP: fmt.Sprintf("%s/32", myWGIP), + }}, wgPeers...) + h.logger.Info("self-injected into WG peer list (sync loop hasn't registered yet)", + zap.String("wg_ip", myWGIP)) } // 9. Query IPFS and IPFS Cluster peer info @@ -346,6 +391,17 @@ func (h *Handler) addWGPeerLocally(pubKey, publicIP, wgIP string) error { return nil } +// wgPeersContainsIP checks if any peer in the list has the given WG IP +func wgPeersContainsIP(peers []WGPeerInfo, wgIP string) bool { + target := fmt.Sprintf("%s/32", wgIP) + for _, p := range peers { + if p.AllowedIP == target { + return true + } + } + return false +} + // getWGPeers returns all WG peers except the requesting node func (h *Handler) getWGPeers(ctx context.Context, excludePubKey string) ([]WGPeerInfo, error) { type peerRow struct { @@ -403,6 +459,32 @@ func (h *Handler) getMyWGIP() (string, error) { return "", fmt.Errorf("could not find wg0 IP address") } +// getMyWGPublicKey reads the local WireGuard public key from the orama secrets +// directory. The key is saved there during install by Phase6SetupWireGuard. +// This avoids needing root/CAP_NET_ADMIN permissions that `wg show wg0` requires. +func (h *Handler) getMyWGPublicKey() (string, error) { + data, err := os.ReadFile(h.oramaDir + "/secrets/wg-public-key") + if err != nil { + return "", fmt.Errorf("failed to read WG public key from %s/secrets/wg-public-key: %w", h.oramaDir, err) + } + key := strings.TrimSpace(string(data)) + if key == "" { + return "", fmt.Errorf("WG public key file is empty") + } + return key, nil +} + +// getMyPublicIP determines this node's public IP by connecting to a public server +func (h *Handler) getMyPublicIP() (string, error) { + conn, err := net.DialTimeout("udp", "8.8.8.8:80", 3*time.Second) + if err != nil { + return "", fmt.Errorf("failed to determine public IP: %w", err) + } + defer conn.Close() + addr := conn.LocalAddr().(*net.UDPAddr) + return addr.IP.String(), nil +} + // queryIPFSPeerInfo gets the local IPFS node's peer ID and builds addrs with WG IP func (h *Handler) queryIPFSPeerInfo(myWGIP string) PeerInfo { client := &http.Client{Timeout: 5 * time.Second} diff --git a/pkg/gateway/handlers/join/handler_test.go b/pkg/gateway/handlers/join/handler_test.go new file mode 100644 index 0000000..a170aa7 --- /dev/null +++ b/pkg/gateway/handlers/join/handler_test.go @@ -0,0 +1,112 @@ +package join + +import ( + "encoding/base64" + "fmt" + "net" + "strings" + "testing" +) + +func TestWgPeersContainsIP_found(t *testing.T) { + peers := []WGPeerInfo{ + {PublicKey: "key1", Endpoint: "1.2.3.4:51820", AllowedIP: "10.0.0.1/32"}, + {PublicKey: "key2", Endpoint: "5.6.7.8:51820", AllowedIP: "10.0.0.2/32"}, + } + + if !wgPeersContainsIP(peers, "10.0.0.1") { + t.Error("expected to find 10.0.0.1 in peer list") + } + if !wgPeersContainsIP(peers, "10.0.0.2") { + t.Error("expected to find 10.0.0.2 in peer list") + } +} + +func TestWgPeersContainsIP_not_found(t *testing.T) { + peers := []WGPeerInfo{ + {PublicKey: "key1", Endpoint: "1.2.3.4:51820", AllowedIP: "10.0.0.1/32"}, + } + + if wgPeersContainsIP(peers, "10.0.0.2") { + t.Error("did not expect to find 10.0.0.2 in peer list") + } +} + +func TestWgPeersContainsIP_empty_list(t *testing.T) { + if wgPeersContainsIP(nil, "10.0.0.1") { + t.Error("did not expect to find any IP in nil peer list") + } + if wgPeersContainsIP([]WGPeerInfo{}, "10.0.0.1") { + t.Error("did not expect to find any IP in empty peer list") + } +} + +func TestAssignWGIP_format(t *testing.T) { + // Verify the WG IP format used in the handler matches what wgPeersContainsIP expects + wgIP := "10.0.0.1" + allowedIP := fmt.Sprintf("%s/32", wgIP) + peers := []WGPeerInfo{{AllowedIP: allowedIP}} + + if !wgPeersContainsIP(peers, wgIP) { + t.Errorf("format mismatch: wgPeersContainsIP(%q, %q) should match", allowedIP, wgIP) + } +} + +func TestValidatePublicIP(t *testing.T) { + tests := []struct { + name string + ip string + valid bool + }{ + {"valid IPv4", "46.225.234.112", true}, + {"loopback", "127.0.0.1", true}, + {"invalid string", "not-an-ip", false}, + {"empty", "", false}, + {"IPv6", "::1", false}, + {"with newline", "1.2.3.4\n5.6.7.8", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + parsed := net.ParseIP(tt.ip) + isValid := parsed != nil && parsed.To4() != nil && !strings.ContainsAny(tt.ip, "\n\r") + if isValid != tt.valid { + t.Errorf("IP %q: expected valid=%v, got %v", tt.ip, tt.valid, isValid) + } + }) + } +} + +func TestValidateWGPublicKey(t *testing.T) { + // Valid WireGuard key: 32 bytes, base64 encoded = 44 chars + validKey := base64.StdEncoding.EncodeToString(make([]byte, 32)) + + tests := []struct { + name string + key string + valid bool + }{ + {"valid 32-byte key", validKey, true}, + {"too short", base64.StdEncoding.EncodeToString(make([]byte, 16)), false}, + {"too long", base64.StdEncoding.EncodeToString(make([]byte, 64)), false}, + {"not base64", "not-a-valid-base64-key!!!", false}, + {"empty", "", false}, + {"newline injection", validKey + "\n[Peer]", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if strings.ContainsAny(tt.key, "\n\r") { + if tt.valid { + t.Errorf("key %q contains newlines but expected valid", tt.key) + } + return + } + decoded, err := base64.StdEncoding.DecodeString(tt.key) + isValid := err == nil && len(decoded) == 32 + if isValid != tt.valid { + t.Errorf("key %q: expected valid=%v, got %v", tt.key, tt.valid, isValid) + } + }) + } +} diff --git a/pkg/gateway/status_handlers.go b/pkg/gateway/status_handlers.go index dc0eced..7a8259b 100644 --- a/pkg/gateway/status_handlers.go +++ b/pkg/gateway/status_handlers.go @@ -129,7 +129,10 @@ func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) { if anyoneproxy.Running() { nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()} } else { - nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: "SOCKS5 proxy not reachable at " + anyoneproxy.Address()} + // SOCKS5 port not reachable — Anyone relay is not installed/running. + // Treat as "unavailable" rather than "error" so nodes without Anyone + // don't report as degraded. + nr.result = checkResult{Status: "unavailable"} } } ch <- nr @@ -142,25 +145,7 @@ func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) { checks[nr.name] = nr.result } - // Aggregate status. - // Critical: rqlite down → "unhealthy" - // Non-critical (olric, ipfs, libp2p) error → "degraded" - // "unavailable" means the client was never configured — not an error. - overallStatus := "healthy" - if c := checks["rqlite"]; c.Status == "error" { - overallStatus = "unhealthy" - } - if overallStatus == "healthy" { - for name, c := range checks { - if name == "rqlite" { - continue - } - if c.Status == "error" { - overallStatus = "degraded" - break - } - } - } + overallStatus := aggregateHealthStatus(checks) httpStatus := http.StatusOK if overallStatus != "healthy" { @@ -236,6 +221,27 @@ func (g *Gateway) versionHandler(w http.ResponseWriter, r *http.Request) { }) } +// aggregateHealthStatus determines the overall health status from individual checks. +// Critical: rqlite down → "unhealthy" +// Non-critical (olric, ipfs, libp2p, anyone) error → "degraded" +// "unavailable" means the client was never configured — not an error. +func aggregateHealthStatus(checks map[string]checkResult) string { + status := "healthy" + if c := checks["rqlite"]; c.Status == "error" { + return "unhealthy" + } + for name, c := range checks { + if name == "rqlite" { + continue + } + if c.Status == "error" { + status = "degraded" + break + } + } + return status +} + // tlsCheckHandler validates if a domain should receive a TLS certificate // Used by Caddy's on-demand TLS feature to prevent abuse func (g *Gateway) tlsCheckHandler(w http.ResponseWriter, r *http.Request) { diff --git a/pkg/gateway/status_handlers_test.go b/pkg/gateway/status_handlers_test.go new file mode 100644 index 0000000..e20b239 --- /dev/null +++ b/pkg/gateway/status_handlers_test.go @@ -0,0 +1,72 @@ +package gateway + +import "testing" + +func TestAggregateHealthStatus_allHealthy(t *testing.T) { + checks := map[string]checkResult{ + "rqlite": {Status: "ok"}, + "olric": {Status: "ok"}, + "ipfs": {Status: "ok"}, + "libp2p": {Status: "ok"}, + "anyone": {Status: "ok"}, + } + if got := aggregateHealthStatus(checks); got != "healthy" { + t.Errorf("expected healthy, got %s", got) + } +} + +func TestAggregateHealthStatus_rqliteError(t *testing.T) { + checks := map[string]checkResult{ + "rqlite": {Status: "error", Error: "connection refused"}, + "olric": {Status: "ok"}, + "ipfs": {Status: "ok"}, + } + if got := aggregateHealthStatus(checks); got != "unhealthy" { + t.Errorf("expected unhealthy, got %s", got) + } +} + +func TestAggregateHealthStatus_nonCriticalError(t *testing.T) { + checks := map[string]checkResult{ + "rqlite": {Status: "ok"}, + "olric": {Status: "error", Error: "timeout"}, + "ipfs": {Status: "ok"}, + } + if got := aggregateHealthStatus(checks); got != "degraded" { + t.Errorf("expected degraded, got %s", got) + } +} + +func TestAggregateHealthStatus_unavailableIsNotError(t *testing.T) { + // Key test: "unavailable" services (like Anyone in sandbox) should NOT + // cause degraded status. + checks := map[string]checkResult{ + "rqlite": {Status: "ok"}, + "olric": {Status: "ok"}, + "ipfs": {Status: "unavailable"}, + "libp2p": {Status: "unavailable"}, + "anyone": {Status: "unavailable"}, + } + if got := aggregateHealthStatus(checks); got != "healthy" { + t.Errorf("expected healthy when services are unavailable, got %s", got) + } +} + +func TestAggregateHealthStatus_emptyChecks(t *testing.T) { + checks := map[string]checkResult{} + if got := aggregateHealthStatus(checks); got != "healthy" { + t.Errorf("expected healthy for empty checks, got %s", got) + } +} + +func TestAggregateHealthStatus_rqliteErrorOverridesDegraded(t *testing.T) { + // rqlite error should take priority over other errors + checks := map[string]checkResult{ + "rqlite": {Status: "error", Error: "leader not found"}, + "olric": {Status: "error", Error: "timeout"}, + "anyone": {Status: "error", Error: "not reachable"}, + } + if got := aggregateHealthStatus(checks); got != "unhealthy" { + t.Errorf("expected unhealthy (rqlite takes priority), got %s", got) + } +} From 733b059681f516dadba4273ae848b8dde801f4d6 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Mon, 9 Mar 2026 10:59:15 +0200 Subject: [PATCH 12/13] feat(sandbox): add --anyone-client flag to rollout - propagate `--anyone-client` to `orama node upgrade` on all nodes - prioritize explicit `--anyone-client` over prefs/auto-detect in production - ensure mutual exclusivity between relay/client modes in prefs --- pkg/cli/cmd/sandboxcmd/sandbox.go | 6 +++- pkg/cli/production/upgrade/orchestrator.go | 20 ++++++++------ pkg/cli/sandbox/rollout.go | 32 ++++++++++++++++++---- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/pkg/cli/cmd/sandboxcmd/sandbox.go b/pkg/cli/cmd/sandboxcmd/sandbox.go index 42043a0..484a4a2 100644 --- a/pkg/cli/cmd/sandboxcmd/sandbox.go +++ b/pkg/cli/cmd/sandboxcmd/sandbox.go @@ -76,7 +76,10 @@ var rolloutCmd = &cobra.Command{ Short: "Build + push + rolling upgrade to sandbox cluster", RunE: func(cmd *cobra.Command, args []string) error { name, _ := cmd.Flags().GetString("name") - return sandbox.Rollout(name) + anyoneClient, _ := cmd.Flags().GetBool("anyone-client") + return sandbox.Rollout(name, sandbox.RolloutFlags{ + AnyoneClient: anyoneClient, + }) }, } @@ -121,6 +124,7 @@ func init() { // rollout flags rolloutCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") + rolloutCmd.Flags().Bool("anyone-client", false, "Enable Anyone client (SOCKS5 proxy) on all nodes") // ssh flags sshCmd.Flags().String("name", "", "Sandbox name (uses active if not specified)") diff --git a/pkg/cli/production/upgrade/orchestrator.go b/pkg/cli/production/upgrade/orchestrator.go index 459c12f..8c20bdb 100644 --- a/pkg/cli/production/upgrade/orchestrator.go +++ b/pkg/cli/production/upgrade/orchestrator.go @@ -41,7 +41,8 @@ func NewOrchestrator(flags *Flags) *Orchestrator { setup := production.NewProductionSetup(oramaHome, os.Stdout, flags.Force, flags.SkipChecks) setup.SetNameserver(isNameserver) - // Configure Anyone mode (flag > saved preference > auto-detect) + // Configure Anyone mode (explicit flags > saved preferences > auto-detect) + // Explicit flags always win — they represent the user's current intent. if flags.AnyoneRelay { setup.SetAnyoneRelayConfig(&production.AnyoneRelayConfig{ Enabled: true, @@ -55,6 +56,9 @@ func NewOrchestrator(flags *Flags) *Orchestrator { BandwidthPct: flags.AnyoneBandwidth, AccountingMax: flags.AnyoneAccounting, }) + } else if flags.AnyoneClient { + // Explicit --anyone-client flag overrides saved relay prefs and auto-detect. + setup.SetAnyoneClient(true) } else if prefs.AnyoneRelay { // Restore relay config from saved preferences (for firewall rules) orPort := prefs.AnyoneORPort @@ -65,6 +69,8 @@ func NewOrchestrator(flags *Flags) *Orchestrator { Enabled: true, ORPort: orPort, }) + } else if prefs.AnyoneClient { + setup.SetAnyoneClient(true) } else if detectAnyoneRelay(oramaDir) { // Auto-detect: relay is installed but preferences weren't saved. // This happens when upgrading from older versions that didn't persist @@ -79,8 +85,6 @@ func NewOrchestrator(flags *Flags) *Orchestrator { prefs.AnyoneORPort = orPort _ = production.SavePreferences(oramaDir, prefs) fmt.Printf(" Auto-detected Anyone relay (ORPort: %d), saved to preferences\n", orPort) - } else if flags.AnyoneClient || prefs.AnyoneClient { - setup.SetAnyoneClient(true) } return &Orchestrator{ @@ -207,15 +211,15 @@ func (o *Orchestrator) handleBranchPreferences() error { fmt.Printf(" Nameserver mode: enabled (CoreDNS + Caddy)\n") } - // If anyone-client was explicitly provided, update it + // Anyone client and relay are mutually exclusive — setting one clears the other. if o.flags.AnyoneClient { prefs.AnyoneClient = true + prefs.AnyoneRelay = false + prefs.AnyoneORPort = 0 prefsChanged = true - } - - // If anyone-relay was explicitly provided, update it - if o.flags.AnyoneRelay { + } else if o.flags.AnyoneRelay { prefs.AnyoneRelay = true + prefs.AnyoneClient = false prefs.AnyoneORPort = o.flags.AnyoneORPort if prefs.AnyoneORPort == 0 { prefs.AnyoneORPort = 9001 diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go index 396e8f4..8a15385 100644 --- a/pkg/cli/sandbox/rollout.go +++ b/pkg/cli/sandbox/rollout.go @@ -4,14 +4,20 @@ import ( "fmt" "os" "path/filepath" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/cli/remotessh" "github.com/DeBrosOfficial/network/pkg/inspector" ) +// RolloutFlags holds optional flags passed through to `orama node upgrade`. +type RolloutFlags struct { + AnyoneClient bool +} + // Rollout builds, pushes, and performs a rolling upgrade on a sandbox cluster. -func Rollout(name string) error { +func Rollout(name string, flags RolloutFlags) error { cfg, err := LoadConfig() if err != nil { return err @@ -34,6 +40,9 @@ func Rollout(name string) error { info, _ := os.Stat(archivePath) fmt.Printf("Archive: %s (%s)\n\n", filepath.Base(archivePath), formatBytes(info.Size())) + // Build extra flags string for upgrade command + extraFlags := flags.upgradeFlags() + // Step 2: Push archive to all nodes (upload to first, fan out server-to-server) fmt.Println("Pushing archive to all nodes...") if err := fanoutArchive(state.Servers, sshKeyPath, archivePath); err != nil { @@ -54,7 +63,7 @@ func Rollout(name string) error { if i == leaderIdx { continue // skip leader, do it last } - if err := upgradeNode(srv, sshKeyPath, i+1, len(state.Servers)); err != nil { + if err := upgradeNode(srv, sshKeyPath, i+1, len(state.Servers), extraFlags); err != nil { return err } // Wait between nodes @@ -67,7 +76,7 @@ func Rollout(name string) error { // Upgrade leader last if leaderIdx >= 0 { srv := state.Servers[leaderIdx] - if err := upgradeNode(srv, sshKeyPath, len(state.Servers), len(state.Servers)); err != nil { + if err := upgradeNode(srv, sshKeyPath, len(state.Servers), len(state.Servers), extraFlags); err != nil { return err } } @@ -76,6 +85,15 @@ func Rollout(name string) error { return nil } +// upgradeFlags builds the extra CLI flags string for `orama node upgrade`. +func (f RolloutFlags) upgradeFlags() string { + var parts []string + if f.AnyoneClient { + parts = append(parts, "--anyone-client") + } + return strings.Join(parts, " ") +} + // findLeaderIndex returns the index of the RQLite leader node, or -1 if unknown. func findLeaderIndex(state *SandboxState, sshKeyPath string) int { for i, srv := range state.Servers { @@ -92,7 +110,7 @@ func findLeaderIndex(state *SandboxState, sshKeyPath string) int { // It pre-replaces the orama CLI binary before running the upgrade command // to avoid ETXTBSY ("text file busy") errors when the old binary doesn't // have the os.Remove fix in copyBinary(). -func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { +func upgradeNode(srv ServerState, sshKeyPath string, current, total int, extraFlags string) error { node := inspector.Node{User: "root", Host: srv.IP, SSHKey: sshKeyPath} fmt.Printf(" [%d/%d] Upgrading %s (%s)...\n", current, total, srv.Name, srv.IP) @@ -105,7 +123,11 @@ func upgradeNode(srv ServerState, sshKeyPath string, current, total int) error { return fmt.Errorf("pre-replace orama binary on %s: %w", srv.Name, err) } - if err := remotessh.RunSSHStreaming(node, "orama node upgrade --restart", remotessh.WithNoHostKeyCheck()); err != nil { + upgradeCmd := "orama node upgrade --restart" + if extraFlags != "" { + upgradeCmd += " " + extraFlags + } + if err := remotessh.RunSSHStreaming(node, upgradeCmd, remotessh.WithNoHostKeyCheck()); err != nil { return fmt.Errorf("upgrade %s: %w", srv.Name, err) } From fa826f0d009643abe7bb997dae4710ef03e49f4b Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 10 Mar 2026 05:25:41 +0200 Subject: [PATCH 13/13] refactor(sandbox): integrate rootwallet SSH keys - replace standalone sandbox keys with "sandbox/root" vault entry - update inspector config to use vault targets (no passwords/keys) - make sandbox default active environment - add vault helpers and tests for remotessh --- docs/INSPECTOR.md | 10 +- docs/SANDBOX.md | 6 +- pkg/cli/environment.go | 18 +-- pkg/cli/monitor/collector.go | 9 +- pkg/cli/remotessh/wallet.go | 90 +++++++++++- pkg/cli/remotessh/wallet_test.go | 29 ++++ pkg/cli/sandbox/config.go | 34 +---- pkg/cli/sandbox/config_test.go | 53 +++++++ pkg/cli/sandbox/create.go | 45 +++--- pkg/cli/sandbox/reset.go | 24 +--- pkg/cli/sandbox/rollout.go | 7 +- pkg/cli/sandbox/setup.go | 119 +++++----------- pkg/cli/sandbox/ssh_cmd.go | 23 ++- pkg/cli/sandbox/state.go | 6 +- pkg/cli/sandbox/state_test.go | 9 +- pkg/cli/sandbox/status.go | 7 +- pkg/encryption/wallet_keygen.go | 194 ------------------------- pkg/encryption/wallet_keygen_test.go | 202 --------------------------- pkg/gateway/status_handlers.go | 57 ++++++-- pkg/gateway/status_handlers_test.go | 59 ++++++-- pkg/inspector/config.go | 1 + scripts/remote-nodes.conf.example | 29 ++-- 22 files changed, 416 insertions(+), 615 deletions(-) create mode 100644 pkg/cli/remotessh/wallet_test.go create mode 100644 pkg/cli/sandbox/config_test.go delete mode 100644 pkg/encryption/wallet_keygen.go delete mode 100644 pkg/encryption/wallet_keygen_test.go diff --git a/docs/INSPECTOR.md b/docs/INSPECTOR.md index 57224bb..aa05806 100644 --- a/docs/INSPECTOR.md +++ b/docs/INSPECTOR.md @@ -167,18 +167,18 @@ The inspector reads node definitions from a pipe-delimited config file (default: ### Format ``` -# environment|user@host|password|role|ssh_key -devnet|ubuntu@1.2.3.4|mypassword|node| -devnet|ubuntu@5.6.7.8|mypassword|nameserver-ns1|/path/to/key +# environment|user@host|role +devnet|ubuntu@1.2.3.4|node +devnet|ubuntu@5.6.7.8|nameserver-ns1 ``` | Field | Description | |-------|-------------| | `environment` | Cluster name (`devnet`, `testnet`) | | `user@host` | SSH credentials | -| `password` | SSH password | | `role` | `node` or `nameserver-ns1`, `nameserver-ns2`, etc. | -| `ssh_key` | Optional path to SSH private key | + +SSH keys are resolved from rootwallet (`rw vault ssh get / --priv`). Blank lines and lines starting with `#` are ignored. diff --git a/docs/SANDBOX.md b/docs/SANDBOX.md index a2df967..d929e55 100644 --- a/docs/SANDBOX.md +++ b/docs/SANDBOX.md @@ -69,8 +69,8 @@ This will: 2. Ask for your sandbox domain 3. Create or reuse 2 Hetzner Floating IPs (~$0.005/hr each) 4. Create a firewall with sandbox rules -5. Generate an SSH keypair at `~/.orama/sandbox_key` -6. Upload the public key to Hetzner +5. Create a rootwallet SSH entry (`sandbox/root`) if it doesn't exist +6. Upload the wallet-derived public key to Hetzner 7. Display DNS configuration instructions Config is saved to `~/.orama/sandbox.yaml`. @@ -143,7 +143,7 @@ Hetzner Floating IPs are persistent IPv4 addresses that can be reassigned betwee ### SSH Authentication -Sandbox uses a standalone ed25519 keypair at `~/.orama/sandbox_key`, separate from the production wallet-derived keys. The public key is uploaded to Hetzner during setup and injected into every server at creation time. +Sandbox uses a rootwallet-derived SSH key (`sandbox/root` vault entry), the same mechanism as production. The wallet must be unlocked (`rw unlock`) before running sandbox commands that use SSH. The public key is uploaded to Hetzner during setup and injected into every server at creation time. ### Server Naming diff --git a/pkg/cli/environment.go b/pkg/cli/environment.go index 5df2a2c..b92bc5f 100644 --- a/pkg/cli/environment.go +++ b/pkg/cli/environment.go @@ -26,16 +26,16 @@ type EnvironmentConfig struct { // Default environments var DefaultEnvironments = []Environment{ { - Name: "production", + Name: "sandbox", GatewayURL: "https://dbrs.space", - Description: "Production network (dbrs.space)", - IsActive: false, + Description: "Sandbox cluster (dbrs.space)", + IsActive: true, }, { Name: "devnet", GatewayURL: "https://orama-devnet.network", - Description: "Development network (testnet)", - IsActive: true, + Description: "Development network", + IsActive: false, }, { Name: "testnet", @@ -65,7 +65,7 @@ func LoadEnvironmentConfig() (*EnvironmentConfig, error) { if _, err := os.Stat(path); os.IsNotExist(err) { return &EnvironmentConfig{ Environments: DefaultEnvironments, - ActiveEnvironment: "devnet", + ActiveEnvironment: "sandbox", }, nil } @@ -120,9 +120,9 @@ func GetActiveEnvironment() (*Environment, error) { } } - // Fallback to devnet if active environment not found + // Fallback to sandbox if active environment not found for _, env := range envConfig.Environments { - if env.Name == "devnet" { + if env.Name == "sandbox" { return &env, nil } } @@ -184,7 +184,7 @@ func InitializeEnvironments() error { envConfig := &EnvironmentConfig{ Environments: DefaultEnvironments, - ActiveEnvironment: "devnet", + ActiveEnvironment: "sandbox", } return SaveEnvironmentConfig(envConfig) diff --git a/pkg/cli/monitor/collector.go b/pkg/cli/monitor/collector.go index 1742667..8fcec53 100644 --- a/pkg/cli/monitor/collector.go +++ b/pkg/cli/monitor/collector.go @@ -157,7 +157,7 @@ func loadSandboxNodes(cfg CollectorConfig) ([]inspector.Node, func(), error) { return nil, noop, fmt.Errorf("no active sandbox found") } - nodes := state.ToNodes(sbxCfg.ExpandedPrivateKeyPath()) + nodes := state.ToNodes(sbxCfg.SSHKey.VaultTarget) if cfg.NodeFilter != "" { nodes = filterByHost(nodes, cfg.NodeFilter) } @@ -165,5 +165,10 @@ func loadSandboxNodes(cfg CollectorConfig) ([]inspector.Node, func(), error) { return nil, noop, fmt.Errorf("no nodes found for sandbox %q", state.Name) } - return nodes, noop, nil + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return nil, noop, fmt.Errorf("prepare SSH keys: %w", err) + } + + return nodes, cleanup, nil } diff --git a/pkg/cli/remotessh/wallet.go b/pkg/cli/remotessh/wallet.go index bd8b0b5..5675110 100644 --- a/pkg/cli/remotessh/wallet.go +++ b/pkg/cli/remotessh/wallet.go @@ -36,14 +36,21 @@ func PrepareNodeKeys(nodes []inspector.Node) (cleanup func(), err error) { var allKeyPaths []string for i := range nodes { - key := nodes[i].Host + "/" + nodes[i].User + // Use VaultTarget if set, otherwise default to Host/User + var key string + if nodes[i].VaultTarget != "" { + key = nodes[i].VaultTarget + } else { + key = nodes[i].Host + "/" + nodes[i].User + } if existing, ok := keyPaths[key]; ok { nodes[i].SSHKey = existing continue } // Call rw to get the private key PEM - pem, err := resolveWalletKey(rw, nodes[i].Host, nodes[i].User) + host, user := parseVaultTarget(key) + pem, err := resolveWalletKey(rw, host, user) if err != nil { // Cleanup any keys already written before returning error cleanupKeys(tmpDir, allKeyPaths) @@ -81,7 +88,12 @@ func LoadAgentKeys(nodes []inspector.Node) error { seen := make(map[string]bool) var targets []string for _, n := range nodes { - key := n.Host + "/" + n.User + var key string + if n.VaultTarget != "" { + key = n.VaultTarget + } else { + key = n.Host + "/" + n.User + } if seen[key] { continue } @@ -104,6 +116,78 @@ func LoadAgentKeys(nodes []inspector.Node) error { return nil } +// EnsureVaultEntry creates a wallet SSH entry if it doesn't already exist. +// Checks existence via `rw vault ssh get --pub`, and if missing, +// runs `rw vault ssh add ` to create it. +func EnsureVaultEntry(vaultTarget string) error { + rw, err := rwBinary() + if err != nil { + return err + } + + // Check if entry exists by trying to get the public key + cmd := exec.Command(rw, "vault", "ssh", "get", vaultTarget, "--pub") + if err := cmd.Run(); err == nil { + return nil // entry already exists + } + + // Entry doesn't exist — try to create it + addCmd := exec.Command(rw, "vault", "ssh", "add", vaultTarget) + addCmd.Stdin = os.Stdin + addCmd.Stdout = os.Stderr + addCmd.Stderr = os.Stderr + if err := addCmd.Run(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + stderr := strings.TrimSpace(string(exitErr.Stderr)) + if strings.Contains(stderr, "not unlocked") || strings.Contains(stderr, "session") { + return fmt.Errorf("wallet is locked — run: rw unlock") + } + } + return fmt.Errorf("rw vault ssh add %s failed: %w", vaultTarget, err) + } + return nil +} + +// ResolveVaultPublicKey returns the OpenSSH public key string for a vault entry. +// Calls `rw vault ssh get --pub`. +func ResolveVaultPublicKey(vaultTarget string) (string, error) { + rw, err := rwBinary() + if err != nil { + return "", err + } + + cmd := exec.Command(rw, "vault", "ssh", "get", vaultTarget, "--pub") + out, err := cmd.Output() + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + stderr := strings.TrimSpace(string(exitErr.Stderr)) + if strings.Contains(stderr, "No SSH entry") { + return "", fmt.Errorf("no vault SSH entry for %s — run: rw vault ssh add %s", vaultTarget, vaultTarget) + } + if strings.Contains(stderr, "not unlocked") || strings.Contains(stderr, "session") { + return "", fmt.Errorf("wallet is locked — run: rw unlock") + } + return "", fmt.Errorf("%s", stderr) + } + return "", fmt.Errorf("rw command failed: %w", err) + } + + pubKey := strings.TrimSpace(string(out)) + if !strings.HasPrefix(pubKey, "ssh-") { + return "", fmt.Errorf("rw returned invalid public key for %s", vaultTarget) + } + return pubKey, nil +} + +// parseVaultTarget splits a "host/user" vault target string into host and user. +func parseVaultTarget(target string) (host, user string) { + idx := strings.Index(target, "/") + if idx < 0 { + return target, "" + } + return target[:idx], target[idx+1:] +} + // resolveWalletKey calls `rw vault ssh get / --priv` // and returns the PEM string. Requires an active rw session. func resolveWalletKey(rw string, host, user string) (string, error) { diff --git a/pkg/cli/remotessh/wallet_test.go b/pkg/cli/remotessh/wallet_test.go new file mode 100644 index 0000000..b3fece6 --- /dev/null +++ b/pkg/cli/remotessh/wallet_test.go @@ -0,0 +1,29 @@ +package remotessh + +import "testing" + +func TestParseVaultTarget(t *testing.T) { + tests := []struct { + target string + wantHost string + wantUser string + }{ + {"sandbox/root", "sandbox", "root"}, + {"192.168.1.1/ubuntu", "192.168.1.1", "ubuntu"}, + {"my-host/my-user", "my-host", "my-user"}, + {"noslash", "noslash", ""}, + {"a/b/c", "a", "b/c"}, + } + + for _, tt := range tests { + t.Run(tt.target, func(t *testing.T) { + host, user := parseVaultTarget(tt.target) + if host != tt.wantHost { + t.Errorf("parseVaultTarget(%q) host = %q, want %q", tt.target, host, tt.wantHost) + } + if user != tt.wantUser { + t.Errorf("parseVaultTarget(%q) user = %q, want %q", tt.target, user, tt.wantUser) + } + }) + } +} diff --git a/pkg/cli/sandbox/config.go b/pkg/cli/sandbox/config.go index 11eb410..7d89695 100644 --- a/pkg/cli/sandbox/config.go +++ b/pkg/cli/sandbox/config.go @@ -25,11 +25,10 @@ type FloatIP struct { IP string `yaml:"ip"` } -// SSHKeyConfig holds SSH key paths and the Hetzner resource ID. +// SSHKeyConfig holds the wallet vault target and Hetzner resource ID. type SSHKeyConfig struct { - HetznerID int64 `yaml:"hetzner_id"` - PrivateKeyPath string `yaml:"private_key_path"` - PublicKeyPath string `yaml:"public_key_path"` + HetznerID int64 `yaml:"hetzner_id"` + VaultTarget string `yaml:"vault_target"` // e.g. "sandbox/root" } // configDir returns ~/.orama/, creating it if needed. @@ -114,8 +113,8 @@ func (c *Config) validate() error { if len(c.FloatingIPs) < 2 { return fmt.Errorf("2 floating IPs required, got %d", len(c.FloatingIPs)) } - if c.SSHKey.PrivateKeyPath == "" { - return fmt.Errorf("ssh_key.private_key_path is required") + if c.SSHKey.VaultTarget == "" { + return fmt.Errorf("ssh_key.vault_target is required (run: orama sandbox setup)") } return nil } @@ -128,26 +127,7 @@ func (c *Config) Defaults() { if c.ServerType == "" { c.ServerType = "cx23" } -} - -// ExpandedPrivateKeyPath returns the absolute path to the SSH private key. -func (c *Config) ExpandedPrivateKeyPath() string { - return expandHome(c.SSHKey.PrivateKeyPath) -} - -// ExpandedPublicKeyPath returns the absolute path to the SSH public key. -func (c *Config) ExpandedPublicKeyPath() string { - return expandHome(c.SSHKey.PublicKeyPath) -} - -// expandHome replaces a leading ~ with the user's home directory. -func expandHome(path string) string { - if len(path) < 2 || path[:2] != "~/" { - return path + if c.SSHKey.VaultTarget == "" { + c.SSHKey.VaultTarget = "sandbox/root" } - home, err := os.UserHomeDir() - if err != nil { - return path - } - return filepath.Join(home, path[2:]) } diff --git a/pkg/cli/sandbox/config_test.go b/pkg/cli/sandbox/config_test.go new file mode 100644 index 0000000..dc5632b --- /dev/null +++ b/pkg/cli/sandbox/config_test.go @@ -0,0 +1,53 @@ +package sandbox + +import "testing" + +func TestConfig_Validate_EmptyVaultTarget(t *testing.T) { + cfg := &Config{ + HetznerAPIToken: "test-token", + Domain: "test.example.com", + FloatingIPs: []FloatIP{{ID: 1, IP: "1.1.1.1"}, {ID: 2, IP: "2.2.2.2"}}, + SSHKey: SSHKeyConfig{HetznerID: 1, VaultTarget: ""}, + } + if err := cfg.validate(); err == nil { + t.Error("validate() should reject empty VaultTarget") + } +} + +func TestConfig_Validate_WithVaultTarget(t *testing.T) { + cfg := &Config{ + HetznerAPIToken: "test-token", + Domain: "test.example.com", + FloatingIPs: []FloatIP{{ID: 1, IP: "1.1.1.1"}, {ID: 2, IP: "2.2.2.2"}}, + SSHKey: SSHKeyConfig{HetznerID: 1, VaultTarget: "sandbox/root"}, + } + if err := cfg.validate(); err != nil { + t.Errorf("validate() unexpected error: %v", err) + } +} + +func TestConfig_Defaults_SetsVaultTarget(t *testing.T) { + cfg := &Config{} + cfg.Defaults() + + if cfg.SSHKey.VaultTarget != "sandbox/root" { + t.Errorf("Defaults() VaultTarget = %q, want sandbox/root", cfg.SSHKey.VaultTarget) + } + if cfg.Location != "nbg1" { + t.Errorf("Defaults() Location = %q, want nbg1", cfg.Location) + } + if cfg.ServerType != "cx23" { + t.Errorf("Defaults() ServerType = %q, want cx23", cfg.ServerType) + } +} + +func TestConfig_Defaults_PreservesExistingVaultTarget(t *testing.T) { + cfg := &Config{ + SSHKey: SSHKeyConfig{VaultTarget: "custom/user"}, + } + cfg.Defaults() + + if cfg.SSHKey.VaultTarget != "custom/user" { + t.Errorf("Defaults() should preserve existing VaultTarget, got %q", cfg.SSHKey.VaultTarget) + } +} diff --git a/pkg/cli/sandbox/create.go b/pkg/cli/sandbox/create.go index 29434ac..2c26dac 100644 --- a/pkg/cli/sandbox/create.go +++ b/pkg/cli/sandbox/create.go @@ -19,6 +19,13 @@ func Create(name string) error { return err } + // Resolve wallet SSH key once for all phases + sshKeyPath, cleanup, err := resolveVaultKeyOnce(cfg.SSHKey.VaultTarget) + if err != nil { + return fmt.Errorf("prepare SSH key: %w", err) + } + defer cleanup() + // Check for existing active sandbox active, err := FindActiveSandbox() if err != nil { @@ -55,20 +62,20 @@ func Create(name string) error { // Phase 2: Assign floating IPs fmt.Println("\nPhase 2: Assigning floating IPs...") - if err := phase2AssignFloatingIPs(client, cfg, state); err != nil { + if err := phase2AssignFloatingIPs(client, cfg, state, sshKeyPath); err != nil { return fmt.Errorf("assign floating IPs: %w", err) } SaveState(state) // Phase 3: Upload binary archive fmt.Println("\nPhase 3: Uploading binary archive...") - if err := phase3UploadArchive(cfg, state); err != nil { + if err := phase3UploadArchive(state, sshKeyPath); err != nil { return fmt.Errorf("upload archive: %w", err) } // Phase 4: Install genesis node fmt.Println("\nPhase 4: Installing genesis node...") - tokens, err := phase4InstallGenesis(cfg, state) + tokens, err := phase4InstallGenesis(cfg, state, sshKeyPath) if err != nil { state.Status = StatusError SaveState(state) @@ -77,7 +84,7 @@ func Create(name string) error { // Phase 5: Join remaining nodes fmt.Println("\nPhase 5: Joining remaining nodes...") - if err := phase5JoinNodes(cfg, state, tokens); err != nil { + if err := phase5JoinNodes(cfg, state, tokens, sshKeyPath); err != nil { state.Status = StatusError SaveState(state) return fmt.Errorf("join nodes: %w", err) @@ -85,7 +92,7 @@ func Create(name string) error { // Phase 6: Verify cluster fmt.Println("\nPhase 6: Verifying cluster...") - phase6Verify(cfg, state) + phase6Verify(cfg, state, sshKeyPath) state.Status = StatusRunning SaveState(state) @@ -94,6 +101,18 @@ func Create(name string) error { return nil } +// resolveVaultKeyOnce resolves a wallet SSH key to a temp file. +// Returns the key path, cleanup function, and any error. +func resolveVaultKeyOnce(vaultTarget string) (string, func(), error) { + node := inspector.Node{User: "root", Host: "resolve-only", VaultTarget: vaultTarget} + nodes := []inspector.Node{node} + cleanup, err := remotessh.PrepareNodeKeys(nodes) + if err != nil { + return "", func() {}, err + } + return nodes[0].SSHKey, cleanup, nil +} + // phase1ProvisionServers creates 5 Hetzner servers in parallel. func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxState) error { type serverResult struct { @@ -190,9 +209,7 @@ func phase1ProvisionServers(client *HetznerClient, cfg *Config, state *SandboxSt } // phase2AssignFloatingIPs assigns floating IPs and configures loopback. -func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxState) error { - sshKeyPath := cfg.ExpandedPrivateKeyPath() - +func phase2AssignFloatingIPs(client *HetznerClient, cfg *Config, state *SandboxState, sshKeyPath string) error { for i := 0; i < 2 && i < len(cfg.FloatingIPs) && i < len(state.Servers); i++ { fip := cfg.FloatingIPs[i] srv := state.Servers[i] @@ -245,7 +262,7 @@ func waitForSSH(node inspector.Node, timeout time.Duration) error { // phase3UploadArchive uploads the binary archive to the genesis node, then fans out // to the remaining nodes server-to-server (much faster than uploading from local machine). -func phase3UploadArchive(cfg *Config, state *SandboxState) error { +func phase3UploadArchive(state *SandboxState, sshKeyPath string) error { archivePath := findNewestArchive() if archivePath == "" { fmt.Println(" No binary archive found, run `orama build` first") @@ -255,7 +272,6 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error { info, _ := os.Stat(archivePath) fmt.Printf(" Archive: %s (%s)\n", filepath.Base(archivePath), formatBytes(info.Size())) - sshKeyPath := cfg.ExpandedPrivateKeyPath() if err := fanoutArchive(state.Servers, sshKeyPath, archivePath); err != nil { return err } @@ -265,9 +281,8 @@ func phase3UploadArchive(cfg *Config, state *SandboxState) error { } // phase4InstallGenesis installs the genesis node and generates invite tokens. -func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) { +func phase4InstallGenesis(cfg *Config, state *SandboxState, sshKeyPath string) ([]string, error) { genesis := state.GenesisServer() - sshKeyPath := cfg.ExpandedPrivateKeyPath() node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} // Install genesis @@ -304,9 +319,8 @@ func phase4InstallGenesis(cfg *Config, state *SandboxState) ([]string, error) { } // phase5JoinNodes joins the remaining 4 nodes to the cluster (serial). -func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error { +func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string, sshKeyPath string) error { genesisIP := state.GenesisServer().IP - sshKeyPath := cfg.ExpandedPrivateKeyPath() for i := 1; i < len(state.Servers); i++ { srv := state.Servers[i] @@ -340,8 +354,7 @@ func phase5JoinNodes(cfg *Config, state *SandboxState, tokens []string) error { } // phase6Verify runs a basic cluster health check. -func phase6Verify(cfg *Config, state *SandboxState) { - sshKeyPath := cfg.ExpandedPrivateKeyPath() +func phase6Verify(cfg *Config, state *SandboxState, sshKeyPath string) { genesis := state.GenesisServer() node := inspector.Node{User: "root", Host: genesis.IP, SSHKey: sshKeyPath} diff --git a/pkg/cli/sandbox/reset.go b/pkg/cli/sandbox/reset.go index dbc4dae..9d04cd6 100644 --- a/pkg/cli/sandbox/reset.go +++ b/pkg/cli/sandbox/reset.go @@ -42,8 +42,6 @@ func Reset() error { fmt.Println() fmt.Println("Local files to remove:") fmt.Println(" ~/.orama/sandbox.yaml") - fmt.Println(" ~/.orama/sandbox_key") - fmt.Println(" ~/.orama/sandbox_key.pub") fmt.Println() reader := bufio.NewReader(os.Stdin) @@ -100,29 +98,21 @@ func Reset() error { return nil } -// resetLocalFiles removes the sandbox config and SSH key files. +// resetLocalFiles removes the sandbox config file. func resetLocalFiles() error { dir, err := configDir() if err != nil { return err } - files := []string{ - dir + "/sandbox.yaml", - dir + "/sandbox_key", - dir + "/sandbox_key.pub", - } - + configFile := dir + "/sandbox.yaml" fmt.Println("Removing local files...") - for _, f := range files { - if err := os.Remove(f); err != nil { - if os.IsNotExist(err) { - continue - } - fmt.Fprintf(os.Stderr, " Warning: could not remove %s: %v\n", f, err) - } else { - fmt.Printf(" Removed %s\n", f) + if err := os.Remove(configFile); err != nil { + if !os.IsNotExist(err) { + fmt.Fprintf(os.Stderr, " Warning: could not remove %s: %v\n", configFile, err) } + } else { + fmt.Printf(" Removed %s\n", configFile) } return nil diff --git a/pkg/cli/sandbox/rollout.go b/pkg/cli/sandbox/rollout.go index 8a15385..284b032 100644 --- a/pkg/cli/sandbox/rollout.go +++ b/pkg/cli/sandbox/rollout.go @@ -28,7 +28,12 @@ func Rollout(name string, flags RolloutFlags) error { return err } - sshKeyPath := cfg.ExpandedPrivateKeyPath() + sshKeyPath, cleanup, err := resolveVaultKeyOnce(cfg.SSHKey.VaultTarget) + if err != nil { + return fmt.Errorf("prepare SSH key: %w", err) + } + defer cleanup() + fmt.Printf("Rolling out to sandbox %q (%d nodes)\n\n", state.Name, len(state.Servers)) // Step 1: Find or require binary archive diff --git a/pkg/cli/sandbox/setup.go b/pkg/cli/sandbox/setup.go index f702422..9976dbe 100644 --- a/pkg/cli/sandbox/setup.go +++ b/pkg/cli/sandbox/setup.go @@ -2,9 +2,6 @@ package sandbox import ( "bufio" - "crypto/ed25519" - "crypto/rand" - "encoding/pem" "fmt" "os" "os/exec" @@ -13,7 +10,7 @@ import ( "strings" "time" - "golang.org/x/crypto/ssh" + "github.com/DeBrosOfficial/network/pkg/cli/remotessh" ) // Setup runs the interactive sandbox setup wizard. @@ -386,103 +383,53 @@ func setupFirewall(client *HetznerClient) (int64, error) { return fw.ID, nil } -// setupSSHKey generates an SSH keypair and uploads it to Hetzner. +// setupSSHKey ensures a wallet SSH entry exists and uploads its public key to Hetzner. func setupSSHKey(client *HetznerClient) (SSHKeyConfig, error) { - dir, err := configDir() - if err != nil { - return SSHKeyConfig{}, err - } + const vaultTarget = "sandbox/root" - privPath := dir + "/sandbox_key" - pubPath := privPath + ".pub" - - // Check for existing key - if _, err := os.Stat(privPath); err == nil { - fmt.Printf(" SSH key already exists: %s\n", privPath) - - // Read public key and check if it's on Hetzner - pubData, err := os.ReadFile(pubPath) - if err != nil { - return SSHKeyConfig{}, fmt.Errorf("read public key: %w", err) - } - - // Try to upload (will fail with uniqueness error if already exists) - key, err := client.UploadSSHKey("orama-sandbox", strings.TrimSpace(string(pubData))) - if err != nil { - // Key already exists on Hetzner — find it by fingerprint - sshPubKey, _, _, _, parseErr := ssh.ParseAuthorizedKey(pubData) - if parseErr != nil { - return SSHKeyConfig{}, fmt.Errorf("parse public key to find fingerprint: %w", parseErr) - } - fingerprint := ssh.FingerprintLegacyMD5(sshPubKey) - - existing, listErr := client.ListSSHKeysByFingerprint(fingerprint) - if listErr == nil && len(existing) > 0 { - fmt.Printf(" Found existing SSH key on Hetzner (ID: %d)\n", existing[0].ID) - return SSHKeyConfig{ - HetznerID: existing[0].ID, - PrivateKeyPath: "~/.orama/sandbox_key", - PublicKeyPath: "~/.orama/sandbox_key.pub", - }, nil - } - - return SSHKeyConfig{}, fmt.Errorf("SSH key exists locally but could not find it on Hetzner (fingerprint: %s): %w", fingerprint, err) - } - - fmt.Printf(" Uploaded to Hetzner (ID: %d)\n", key.ID) - return SSHKeyConfig{ - HetznerID: key.ID, - PrivateKeyPath: "~/.orama/sandbox_key", - PublicKeyPath: "~/.orama/sandbox_key.pub", - }, nil - } - - // Generate new ed25519 keypair - fmt.Print(" Generating ed25519 keypair... ") - pubKey, privKey, err := ed25519.GenerateKey(rand.Reader) - if err != nil { + // Ensure wallet entry exists (creates if missing) + fmt.Print(" Ensuring wallet SSH entry... ") + if err := remotessh.EnsureVaultEntry(vaultTarget); err != nil { fmt.Println("FAILED") - return SSHKeyConfig{}, fmt.Errorf("generate key: %w", err) - } - - // Marshal private key to OpenSSH format - pemBlock, err := ssh.MarshalPrivateKey(privKey, "") - if err != nil { - fmt.Println("FAILED") - return SSHKeyConfig{}, fmt.Errorf("marshal private key: %w", err) - } - - privPEM := pem.EncodeToMemory(pemBlock) - if err := os.WriteFile(privPath, privPEM, 0600); err != nil { - fmt.Println("FAILED") - return SSHKeyConfig{}, fmt.Errorf("write private key: %w", err) - } - - // Marshal public key to authorized_keys format - sshPubKey, err := ssh.NewPublicKey(pubKey) - if err != nil { - return SSHKeyConfig{}, fmt.Errorf("convert public key: %w", err) - } - pubStr := strings.TrimSpace(string(ssh.MarshalAuthorizedKey(sshPubKey))) - - if err := os.WriteFile(pubPath, []byte(pubStr+"\n"), 0644); err != nil { - return SSHKeyConfig{}, fmt.Errorf("write public key: %w", err) + return SSHKeyConfig{}, fmt.Errorf("ensure vault entry: %w", err) } fmt.Println("OK") - // Upload to Hetzner + // Get public key from wallet + fmt.Print(" Resolving public key from wallet... ") + pubStr, err := remotessh.ResolveVaultPublicKey(vaultTarget) + if err != nil { + fmt.Println("FAILED") + return SSHKeyConfig{}, fmt.Errorf("resolve public key: %w", err) + } + fmt.Println("OK") + + // Upload to Hetzner (will fail with uniqueness error if already exists) fmt.Print(" Uploading to Hetzner... ") key, err := client.UploadSSHKey("orama-sandbox", pubStr) if err != nil { + // Key may already exist on Hetzner — try to find by fingerprint + existing, listErr := client.ListSSHKeysByFingerprint("") // empty = list all + if listErr == nil { + for _, k := range existing { + if strings.TrimSpace(k.PublicKey) == pubStr { + fmt.Printf("already exists (ID: %d)\n", k.ID) + return SSHKeyConfig{ + HetznerID: k.ID, + VaultTarget: vaultTarget, + }, nil + } + } + } + fmt.Println("FAILED") return SSHKeyConfig{}, fmt.Errorf("upload SSH key: %w", err) } fmt.Printf("OK (ID: %d)\n", key.ID) return SSHKeyConfig{ - HetznerID: key.ID, - PrivateKeyPath: "~/.orama/sandbox_key", - PublicKeyPath: "~/.orama/sandbox_key.pub", + HetznerID: key.ID, + VaultTarget: vaultTarget, }, nil } diff --git a/pkg/cli/sandbox/ssh_cmd.go b/pkg/cli/sandbox/ssh_cmd.go index f09ef08..9b30115 100644 --- a/pkg/cli/sandbox/ssh_cmd.go +++ b/pkg/cli/sandbox/ssh_cmd.go @@ -3,7 +3,7 @@ package sandbox import ( "fmt" "os" - "syscall" + "os/exec" ) // SSHInto opens an interactive SSH session to a sandbox node. @@ -23,26 +23,35 @@ func SSHInto(name string, nodeNum int) error { } srv := state.Servers[nodeNum-1] - sshKeyPath := cfg.ExpandedPrivateKeyPath() + + sshKeyPath, cleanup, err := resolveVaultKeyOnce(cfg.SSHKey.VaultTarget) + if err != nil { + return fmt.Errorf("prepare SSH key: %w", err) + } fmt.Printf("Connecting to %s (%s, %s)...\n", srv.Name, srv.IP, srv.Role) // Find ssh binary sshBin, err := findSSHBinary() if err != nil { + cleanup() return err } - // Replace current process with SSH - args := []string{ - "ssh", + // Run SSH as a child process so cleanup runs after the session ends + cmd := exec.Command(sshBin, "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null", "-i", sshKeyPath, fmt.Sprintf("root@%s", srv.IP), - } + ) + cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr - return syscall.Exec(sshBin, args, os.Environ()) + err = cmd.Run() + cleanup() + return err } // findSSHBinary locates the ssh binary in PATH. diff --git a/pkg/cli/sandbox/state.go b/pkg/cli/sandbox/state.go index 34d4f87..064fe6a 100644 --- a/pkg/cli/sandbox/state.go +++ b/pkg/cli/sandbox/state.go @@ -165,8 +165,8 @@ func FindActiveSandbox() (*SandboxState, error) { } // ToNodes converts sandbox servers to inspector.Node structs for SSH operations. -// Sets SSHKey to the provided key path on each node. -func (s *SandboxState) ToNodes(sshKeyPath string) []inspector.Node { +// Sets VaultTarget on each node so PrepareNodeKeys resolves from the wallet. +func (s *SandboxState) ToNodes(vaultTarget string) []inspector.Node { nodes := make([]inspector.Node, len(s.Servers)) for i, srv := range s.Servers { nodes[i] = inspector.Node{ @@ -174,7 +174,7 @@ func (s *SandboxState) ToNodes(sshKeyPath string) []inspector.Node { User: "root", Host: srv.IP, Role: srv.Role, - SSHKey: sshKeyPath, + VaultTarget: vaultTarget, } } return nodes diff --git a/pkg/cli/sandbox/state_test.go b/pkg/cli/sandbox/state_test.go index e00adb4..84580f0 100644 --- a/pkg/cli/sandbox/state_test.go +++ b/pkg/cli/sandbox/state_test.go @@ -156,7 +156,7 @@ func TestToNodes(t *testing.T) { }, } - nodes := state.ToNodes("/tmp/key") + nodes := state.ToNodes("sandbox/root") if len(nodes) != 2 { t.Fatalf("ToNodes() returned %d nodes, want 2", len(nodes)) } @@ -166,8 +166,11 @@ func TestToNodes(t *testing.T) { if nodes[0].User != "root" { t.Errorf("node[0].User = %s, want root", nodes[0].User) } - if nodes[0].SSHKey != "/tmp/key" { - t.Errorf("node[0].SSHKey = %s, want /tmp/key", nodes[0].SSHKey) + if nodes[0].VaultTarget != "sandbox/root" { + t.Errorf("node[0].VaultTarget = %s, want sandbox/root", nodes[0].VaultTarget) + } + if nodes[0].SSHKey != "" { + t.Errorf("node[0].SSHKey = %s, want empty (set by PrepareNodeKeys)", nodes[0].SSHKey) } if nodes[0].Environment != "sandbox" { t.Errorf("node[0].Environment = %s, want sandbox", nodes[0].Environment) diff --git a/pkg/cli/sandbox/status.go b/pkg/cli/sandbox/status.go index 544ca60..fbc070f 100644 --- a/pkg/cli/sandbox/status.go +++ b/pkg/cli/sandbox/status.go @@ -77,7 +77,12 @@ func Status(name string) error { return err } - sshKeyPath := cfg.ExpandedPrivateKeyPath() + sshKeyPath, cleanup, err := resolveVaultKeyOnce(cfg.SSHKey.VaultTarget) + if err != nil { + return fmt.Errorf("prepare SSH key: %w", err) + } + defer cleanup() + fmt.Printf("Sandbox: %s (status: %s)\n\n", state.Name, state.Status) for _, srv := range state.Servers { diff --git a/pkg/encryption/wallet_keygen.go b/pkg/encryption/wallet_keygen.go deleted file mode 100644 index d65a182..0000000 --- a/pkg/encryption/wallet_keygen.go +++ /dev/null @@ -1,194 +0,0 @@ -package encryption - -import ( - "crypto/ed25519" - "crypto/sha256" - "fmt" - "io" - "os" - "os/exec" - "strings" - - "golang.org/x/crypto/curve25519" - "golang.org/x/crypto/hkdf" -) - -// NodeKeys holds all cryptographic keys derived from a wallet's master key. -type NodeKeys struct { - LibP2PPrivateKey ed25519.PrivateKey // Ed25519 for LibP2P identity - LibP2PPublicKey ed25519.PublicKey - WireGuardKey [32]byte // Curve25519 private key (clamped) - WireGuardPubKey [32]byte // Curve25519 public key - IPFSPrivateKey ed25519.PrivateKey - IPFSPublicKey ed25519.PublicKey - ClusterPrivateKey ed25519.PrivateKey // IPFS Cluster identity - ClusterPublicKey ed25519.PublicKey - JWTPrivateKey ed25519.PrivateKey // EdDSA JWT signing key - JWTPublicKey ed25519.PublicKey -} - -// DeriveNodeKeysFromWallet calls `rw derive` to get a master key from the user's -// Root Wallet, then expands it into all node keys. The wallet's private key never -// leaves the `rw` process. -// -// vpsIP is used as the HKDF info parameter, so each VPS gets unique keys from the -// same wallet. Stdin is passed through so rw can prompt for the wallet password. -func DeriveNodeKeysFromWallet(vpsIP string) (*NodeKeys, error) { - if vpsIP == "" { - return nil, fmt.Errorf("VPS IP is required for key derivation") - } - - // Check rw is installed - if _, err := exec.LookPath("rw"); err != nil { - return nil, fmt.Errorf("Root Wallet (rw) not found in PATH — install it first") - } - - // Call rw derive to get master key bytes - cmd := exec.Command("rw", "derive", "--salt", "orama-node", "--info", vpsIP) - cmd.Stdin = os.Stdin // pass through for password prompts - cmd.Stderr = os.Stderr // rw UI messages go to terminal - out, err := cmd.Output() - if err != nil { - return nil, fmt.Errorf("rw derive failed: %w", err) - } - - masterHex := strings.TrimSpace(string(out)) - if len(masterHex) != 64 { // 32 bytes = 64 hex chars - return nil, fmt.Errorf("rw derive returned unexpected output length: %d (expected 64 hex chars)", len(masterHex)) - } - - masterKey, err := hexToBytes(masterHex) - if err != nil { - return nil, fmt.Errorf("rw derive returned invalid hex: %w", err) - } - defer zeroBytes(masterKey) - - return ExpandNodeKeys(masterKey) -} - -// ExpandNodeKeys expands a 32-byte master key into all node keys using HKDF-SHA256. -// The master key should come from `rw derive --salt "orama-node" --info ""`. -// -// Each key type uses a different HKDF info string under the salt "orama-expand", -// ensuring cryptographic independence between key types. -func ExpandNodeKeys(masterKey []byte) (*NodeKeys, error) { - if len(masterKey) != 32 { - return nil, fmt.Errorf("master key must be 32 bytes, got %d", len(masterKey)) - } - - salt := []byte("orama-expand") - keys := &NodeKeys{} - - // Derive LibP2P Ed25519 key - seed, err := deriveBytes(masterKey, salt, []byte("libp2p-identity"), ed25519.SeedSize) - if err != nil { - return nil, fmt.Errorf("failed to derive libp2p key: %w", err) - } - priv := ed25519.NewKeyFromSeed(seed) - zeroBytes(seed) - keys.LibP2PPrivateKey = priv - keys.LibP2PPublicKey = priv.Public().(ed25519.PublicKey) - - // Derive WireGuard Curve25519 key - wgSeed, err := deriveBytes(masterKey, salt, []byte("wireguard-key"), 32) - if err != nil { - return nil, fmt.Errorf("failed to derive wireguard key: %w", err) - } - copy(keys.WireGuardKey[:], wgSeed) - zeroBytes(wgSeed) - clampCurve25519Key(&keys.WireGuardKey) - pubKey, err := curve25519.X25519(keys.WireGuardKey[:], curve25519.Basepoint) - if err != nil { - return nil, fmt.Errorf("failed to compute wireguard public key: %w", err) - } - copy(keys.WireGuardPubKey[:], pubKey) - - // Derive IPFS Ed25519 key - seed, err = deriveBytes(masterKey, salt, []byte("ipfs-identity"), ed25519.SeedSize) - if err != nil { - return nil, fmt.Errorf("failed to derive ipfs key: %w", err) - } - priv = ed25519.NewKeyFromSeed(seed) - zeroBytes(seed) - keys.IPFSPrivateKey = priv - keys.IPFSPublicKey = priv.Public().(ed25519.PublicKey) - - // Derive IPFS Cluster Ed25519 key - seed, err = deriveBytes(masterKey, salt, []byte("ipfs-cluster"), ed25519.SeedSize) - if err != nil { - return nil, fmt.Errorf("failed to derive cluster key: %w", err) - } - priv = ed25519.NewKeyFromSeed(seed) - zeroBytes(seed) - keys.ClusterPrivateKey = priv - keys.ClusterPublicKey = priv.Public().(ed25519.PublicKey) - - // Derive JWT EdDSA signing key - seed, err = deriveBytes(masterKey, salt, []byte("jwt-signing"), ed25519.SeedSize) - if err != nil { - return nil, fmt.Errorf("failed to derive jwt key: %w", err) - } - priv = ed25519.NewKeyFromSeed(seed) - zeroBytes(seed) - keys.JWTPrivateKey = priv - keys.JWTPublicKey = priv.Public().(ed25519.PublicKey) - - return keys, nil -} - -// deriveBytes uses HKDF-SHA256 to derive n bytes from the given IKM, salt, and info. -func deriveBytes(ikm, salt, info []byte, n int) ([]byte, error) { - hkdfReader := hkdf.New(sha256.New, ikm, salt, info) - out := make([]byte, n) - if _, err := io.ReadFull(hkdfReader, out); err != nil { - return nil, err - } - return out, nil -} - -// clampCurve25519Key applies the standard Curve25519 clamping to a private key. -func clampCurve25519Key(key *[32]byte) { - key[0] &= 248 - key[31] &= 127 - key[31] |= 64 -} - -// hexToBytes decodes a hex string to bytes. -func hexToBytes(hex string) ([]byte, error) { - if len(hex)%2 != 0 { - return nil, fmt.Errorf("odd-length hex string") - } - b := make([]byte, len(hex)/2) - for i := 0; i < len(hex); i += 2 { - var hi, lo byte - var err error - if hi, err = hexCharToByte(hex[i]); err != nil { - return nil, err - } - if lo, err = hexCharToByte(hex[i+1]); err != nil { - return nil, err - } - b[i/2] = hi<<4 | lo - } - return b, nil -} - -func hexCharToByte(c byte) (byte, error) { - switch { - case c >= '0' && c <= '9': - return c - '0', nil - case c >= 'a' && c <= 'f': - return c - 'a' + 10, nil - case c >= 'A' && c <= 'F': - return c - 'A' + 10, nil - default: - return 0, fmt.Errorf("invalid hex character: %c", c) - } -} - -// zeroBytes zeroes a byte slice to clear sensitive data from memory. -func zeroBytes(b []byte) { - for i := range b { - b[i] = 0 - } -} diff --git a/pkg/encryption/wallet_keygen_test.go b/pkg/encryption/wallet_keygen_test.go deleted file mode 100644 index d06cd86..0000000 --- a/pkg/encryption/wallet_keygen_test.go +++ /dev/null @@ -1,202 +0,0 @@ -package encryption - -import ( - "bytes" - "crypto/ed25519" - "testing" -) - -// testMasterKey is a deterministic 32-byte key for testing ExpandNodeKeys. -// In production, this comes from `rw derive --salt "orama-node" --info ""`. -var testMasterKey = bytes.Repeat([]byte{0xab}, 32) -var testMasterKey2 = bytes.Repeat([]byte{0xcd}, 32) - -func TestExpandNodeKeys_Determinism(t *testing.T) { - keys1, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys: %v", err) - } - keys2, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys (second): %v", err) - } - - if !bytes.Equal(keys1.LibP2PPrivateKey, keys2.LibP2PPrivateKey) { - t.Error("LibP2P private keys differ for same input") - } - if !bytes.Equal(keys1.WireGuardKey[:], keys2.WireGuardKey[:]) { - t.Error("WireGuard keys differ for same input") - } - if !bytes.Equal(keys1.IPFSPrivateKey, keys2.IPFSPrivateKey) { - t.Error("IPFS private keys differ for same input") - } - if !bytes.Equal(keys1.ClusterPrivateKey, keys2.ClusterPrivateKey) { - t.Error("Cluster private keys differ for same input") - } - if !bytes.Equal(keys1.JWTPrivateKey, keys2.JWTPrivateKey) { - t.Error("JWT private keys differ for same input") - } -} - -func TestExpandNodeKeys_Uniqueness(t *testing.T) { - keys1, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys(master1): %v", err) - } - keys2, err := ExpandNodeKeys(testMasterKey2) - if err != nil { - t.Fatalf("ExpandNodeKeys(master2): %v", err) - } - - if bytes.Equal(keys1.LibP2PPrivateKey, keys2.LibP2PPrivateKey) { - t.Error("LibP2P keys should differ for different master keys") - } - if bytes.Equal(keys1.WireGuardKey[:], keys2.WireGuardKey[:]) { - t.Error("WireGuard keys should differ for different master keys") - } - if bytes.Equal(keys1.IPFSPrivateKey, keys2.IPFSPrivateKey) { - t.Error("IPFS keys should differ for different master keys") - } - if bytes.Equal(keys1.ClusterPrivateKey, keys2.ClusterPrivateKey) { - t.Error("Cluster keys should differ for different master keys") - } - if bytes.Equal(keys1.JWTPrivateKey, keys2.JWTPrivateKey) { - t.Error("JWT keys should differ for different master keys") - } -} - -func TestExpandNodeKeys_KeysAreMutuallyUnique(t *testing.T) { - keys, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys: %v", err) - } - - privKeys := [][]byte{ - keys.LibP2PPrivateKey.Seed(), - keys.IPFSPrivateKey.Seed(), - keys.ClusterPrivateKey.Seed(), - keys.JWTPrivateKey.Seed(), - keys.WireGuardKey[:], - } - labels := []string{"LibP2P", "IPFS", "Cluster", "JWT", "WireGuard"} - - for i := 0; i < len(privKeys); i++ { - for j := i + 1; j < len(privKeys); j++ { - if bytes.Equal(privKeys[i], privKeys[j]) { - t.Errorf("%s and %s keys should differ", labels[i], labels[j]) - } - } - } -} - -func TestExpandNodeKeys_Ed25519Validity(t *testing.T) { - keys, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys: %v", err) - } - - msg := []byte("test message for verification") - - pairs := []struct { - name string - priv ed25519.PrivateKey - pub ed25519.PublicKey - }{ - {"LibP2P", keys.LibP2PPrivateKey, keys.LibP2PPublicKey}, - {"IPFS", keys.IPFSPrivateKey, keys.IPFSPublicKey}, - {"Cluster", keys.ClusterPrivateKey, keys.ClusterPublicKey}, - {"JWT", keys.JWTPrivateKey, keys.JWTPublicKey}, - } - - for _, p := range pairs { - signature := ed25519.Sign(p.priv, msg) - if !ed25519.Verify(p.pub, msg, signature) { - t.Errorf("%s key pair: signature verification failed", p.name) - } - } -} - -func TestExpandNodeKeys_WireGuardClamping(t *testing.T) { - keys, err := ExpandNodeKeys(testMasterKey) - if err != nil { - t.Fatalf("ExpandNodeKeys: %v", err) - } - - if keys.WireGuardKey[0]&7 != 0 { - t.Errorf("WireGuard key not properly clamped: low 3 bits of first byte should be 0, got %08b", keys.WireGuardKey[0]) - } - if keys.WireGuardKey[31]&128 != 0 { - t.Errorf("WireGuard key not properly clamped: high bit of last byte should be 0, got %08b", keys.WireGuardKey[31]) - } - if keys.WireGuardKey[31]&64 != 64 { - t.Errorf("WireGuard key not properly clamped: second-high bit of last byte should be 1, got %08b", keys.WireGuardKey[31]) - } - - var zero [32]byte - if keys.WireGuardPubKey == zero { - t.Error("WireGuard public key is all zeros") - } -} - -func TestExpandNodeKeys_InvalidMasterKeyLength(t *testing.T) { - _, err := ExpandNodeKeys(nil) - if err == nil { - t.Error("expected error for nil master key") - } - - _, err = ExpandNodeKeys([]byte{}) - if err == nil { - t.Error("expected error for empty master key") - } - - _, err = ExpandNodeKeys(make([]byte, 16)) - if err == nil { - t.Error("expected error for 16-byte master key") - } - - _, err = ExpandNodeKeys(make([]byte, 64)) - if err == nil { - t.Error("expected error for 64-byte master key") - } -} - -func TestHexToBytes(t *testing.T) { - tests := []struct { - input string - expected []byte - wantErr bool - }{ - {"", []byte{}, false}, - {"00", []byte{0}, false}, - {"ff", []byte{255}, false}, - {"FF", []byte{255}, false}, - {"0a1b2c", []byte{10, 27, 44}, false}, - {"0", nil, true}, // odd length - {"zz", nil, true}, // invalid chars - {"gg", nil, true}, // invalid chars - } - - for _, tt := range tests { - got, err := hexToBytes(tt.input) - if tt.wantErr { - if err == nil { - t.Errorf("hexToBytes(%q): expected error", tt.input) - } - continue - } - if err != nil { - t.Errorf("hexToBytes(%q): unexpected error: %v", tt.input, err) - continue - } - if !bytes.Equal(got, tt.expected) { - t.Errorf("hexToBytes(%q) = %v, want %v", tt.input, got, tt.expected) - } - } -} - -func TestDeriveNodeKeysFromWallet_EmptyIP(t *testing.T) { - _, err := DeriveNodeKeysFromWallet("") - if err == nil { - t.Error("expected error for empty VPS IP") - } -} diff --git a/pkg/gateway/status_handlers.go b/pkg/gateway/status_handlers.go index 7a8259b..19d1862 100644 --- a/pkg/gateway/status_handlers.go +++ b/pkg/gateway/status_handlers.go @@ -2,6 +2,8 @@ package gateway import ( "context" + "fmt" + "net" "net/http" "strings" "time" @@ -52,7 +54,8 @@ func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) { name string result checkResult } - ch := make(chan namedResult, 5) + const numChecks = 7 + ch := make(chan namedResult, numChecks) // RQLite go func() { @@ -138,9 +141,37 @@ func (g *Gateway) healthHandler(w http.ResponseWriter, r *http.Request) { ch <- nr }() + // Vault Guardian (TCP connect to localhost:7500) + go func() { + nr := namedResult{name: "vault"} + start := time.Now() + conn, err := net.DialTimeout("tcp", "localhost:7500", 2*time.Second) + if err != nil { + nr.result = checkResult{Status: "error", Latency: time.Since(start).String(), Error: fmt.Sprintf("vault-guardian unreachable on port 7500: %v", err)} + } else { + conn.Close() + nr.result = checkResult{Status: "ok", Latency: time.Since(start).String()} + } + ch <- nr + }() + + // WireGuard (check wg0 interface exists and has an IP) + go func() { + nr := namedResult{name: "wireguard"} + iface, err := net.InterfaceByName("wg0") + if err != nil { + nr.result = checkResult{Status: "error", Error: "wg0 interface not found"} + } else if addrs, err := iface.Addrs(); err != nil || len(addrs) == 0 { + nr.result = checkResult{Status: "error", Error: "wg0 has no addresses"} + } else { + nr.result = checkResult{Status: "ok"} + } + ch <- nr + }() + // Collect - checks := make(map[string]checkResult, 5) - for i := 0; i < 5; i++ { + checks := make(map[string]checkResult, numChecks) + for i := 0; i < numChecks; i++ { nr := <-ch checks[nr.name] = nr.result } @@ -222,24 +253,26 @@ func (g *Gateway) versionHandler(w http.ResponseWriter, r *http.Request) { } // aggregateHealthStatus determines the overall health status from individual checks. -// Critical: rqlite down → "unhealthy" -// Non-critical (olric, ipfs, libp2p, anyone) error → "degraded" +// Critical: rqlite or vault down → "unhealthy" +// Non-critical (olric, ipfs, libp2p, anyone, wireguard) error → "degraded" // "unavailable" means the client was never configured — not an error. func aggregateHealthStatus(checks map[string]checkResult) string { - status := "healthy" - if c := checks["rqlite"]; c.Status == "error" { - return "unhealthy" + // Critical services — any error means unhealthy + for _, name := range []string{"rqlite", "vault"} { + if c := checks[name]; c.Status == "error" { + return "unhealthy" + } } + // Non-critical services — any error means degraded for name, c := range checks { - if name == "rqlite" { + if name == "rqlite" || name == "vault" { continue } if c.Status == "error" { - status = "degraded" - break + return "degraded" } } - return status + return "healthy" } // tlsCheckHandler validates if a domain should receive a TLS certificate diff --git a/pkg/gateway/status_handlers_test.go b/pkg/gateway/status_handlers_test.go index e20b239..b7fcda1 100644 --- a/pkg/gateway/status_handlers_test.go +++ b/pkg/gateway/status_handlers_test.go @@ -4,11 +4,13 @@ import "testing" func TestAggregateHealthStatus_allHealthy(t *testing.T) { checks := map[string]checkResult{ - "rqlite": {Status: "ok"}, - "olric": {Status: "ok"}, - "ipfs": {Status: "ok"}, - "libp2p": {Status: "ok"}, - "anyone": {Status: "ok"}, + "rqlite": {Status: "ok"}, + "olric": {Status: "ok"}, + "ipfs": {Status: "ok"}, + "libp2p": {Status: "ok"}, + "anyone": {Status: "ok"}, + "vault": {Status: "ok"}, + "wireguard": {Status: "ok"}, } if got := aggregateHealthStatus(checks); got != "healthy" { t.Errorf("expected healthy, got %s", got) @@ -41,11 +43,13 @@ func TestAggregateHealthStatus_unavailableIsNotError(t *testing.T) { // Key test: "unavailable" services (like Anyone in sandbox) should NOT // cause degraded status. checks := map[string]checkResult{ - "rqlite": {Status: "ok"}, - "olric": {Status: "ok"}, - "ipfs": {Status: "unavailable"}, - "libp2p": {Status: "unavailable"}, - "anyone": {Status: "unavailable"}, + "rqlite": {Status: "ok"}, + "olric": {Status: "ok"}, + "vault": {Status: "ok"}, + "ipfs": {Status: "unavailable"}, + "libp2p": {Status: "unavailable"}, + "anyone": {Status: "unavailable"}, + "wireguard": {Status: "unavailable"}, } if got := aggregateHealthStatus(checks); got != "healthy" { t.Errorf("expected healthy when services are unavailable, got %s", got) @@ -70,3 +74,38 @@ func TestAggregateHealthStatus_rqliteErrorOverridesDegraded(t *testing.T) { t.Errorf("expected unhealthy (rqlite takes priority), got %s", got) } } + +func TestAggregateHealthStatus_vaultErrorIsUnhealthy(t *testing.T) { + // vault is critical — error should mean unhealthy, not degraded + checks := map[string]checkResult{ + "rqlite": {Status: "ok"}, + "vault": {Status: "error", Error: "vault-guardian unreachable on port 7500"}, + "olric": {Status: "ok"}, + } + if got := aggregateHealthStatus(checks); got != "unhealthy" { + t.Errorf("expected unhealthy (vault is critical), got %s", got) + } +} + +func TestAggregateHealthStatus_wireguardErrorIsDegraded(t *testing.T) { + // wireguard is non-critical — error should mean degraded, not unhealthy + checks := map[string]checkResult{ + "rqlite": {Status: "ok"}, + "vault": {Status: "ok"}, + "wireguard": {Status: "error", Error: "wg0 interface not found"}, + } + if got := aggregateHealthStatus(checks); got != "degraded" { + t.Errorf("expected degraded (wireguard is non-critical), got %s", got) + } +} + +func TestAggregateHealthStatus_bothCriticalDown(t *testing.T) { + checks := map[string]checkResult{ + "rqlite": {Status: "error", Error: "connection refused"}, + "vault": {Status: "error", Error: "unreachable"}, + "wireguard": {Status: "ok"}, + } + if got := aggregateHealthStatus(checks); got != "unhealthy" { + t.Errorf("expected unhealthy, got %s", got) + } +} diff --git a/pkg/inspector/config.go b/pkg/inspector/config.go index cad33c7..1aaf3cf 100644 --- a/pkg/inspector/config.go +++ b/pkg/inspector/config.go @@ -14,6 +14,7 @@ type Node struct { Host string // IP or hostname Role string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3 SSHKey string // populated at runtime by PrepareNodeKeys() + VaultTarget string // optional: override wallet key lookup (e.g. "sandbox/root") } // Name returns a short display name for the node (user@host). diff --git a/scripts/remote-nodes.conf.example b/scripts/remote-nodes.conf.example index 6065bc2..3a4a91b 100644 --- a/scripts/remote-nodes.conf.example +++ b/scripts/remote-nodes.conf.example @@ -1,26 +1,27 @@ # Remote node configuration -# Format: environment|user@host|password|role|ssh_key (optional) +# Format: environment|user@host|role # environment: devnet, testnet # role: node, nameserver-ns1, nameserver-ns2, nameserver-ns3 -# ssh_key: optional path to SSH key (if node requires key-based auth instead of sshpass) # -# Copy this file to remote-nodes.conf and fill in your credentials. -# The first node with an SSH key will be used as the hub (fan-out relay). +# SSH keys are resolved from rootwallet (rw vault ssh get / --priv). +# Ensure wallet entries exist: rw vault ssh add / +# +# Copy this file to remote-nodes.conf and fill in your node details. # --- Devnet nameservers --- -devnet|root@1.2.3.4|your_password_here|nameserver-ns1 -devnet|ubuntu@1.2.3.5|your_password_here|nameserver-ns2 -devnet|root@1.2.3.6|your_password_here|nameserver-ns3 +devnet|root@1.2.3.4|nameserver-ns1 +devnet|ubuntu@1.2.3.5|nameserver-ns2 +devnet|root@1.2.3.6|nameserver-ns3 # --- Devnet nodes --- -devnet|ubuntu@1.2.3.7|your_password_here|node -devnet|ubuntu@1.2.3.8|your_password_here|node|~/.ssh/my_key/id_ed25519 +devnet|ubuntu@1.2.3.7|node +devnet|ubuntu@1.2.3.8|node # --- Testnet nameservers --- -testnet|ubuntu@2.3.4.5|your_password_here|nameserver-ns1 -testnet|ubuntu@2.3.4.6|your_password_here|nameserver-ns2 -testnet|ubuntu@2.3.4.7|your_password_here|nameserver-ns3 +testnet|ubuntu@2.3.4.5|nameserver-ns1 +testnet|ubuntu@2.3.4.6|nameserver-ns2 +testnet|ubuntu@2.3.4.7|nameserver-ns3 # --- Testnet nodes --- -testnet|root@2.3.4.8|your_password_here|node -testnet|ubuntu@2.3.4.9|your_password_here|node +testnet|root@2.3.4.8|node +testnet|ubuntu@2.3.4.9|node