diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index cbe8e4c..afb09be 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -357,11 +357,36 @@ Function Invocation: All inter-node communication is encrypted via a WireGuard VPN mesh: -- **WireGuard IPs:** Each node gets a private IP (10.0.0.x) used for all cluster traffic +- **WireGuard IPs:** Each node gets a private IP (10.0.0.x/24) used for all cluster traffic - **UFW Firewall:** Only public ports are exposed: 22 (SSH), 53 (DNS, nameservers only), 80/443 (HTTP/HTTPS), 51820 (WireGuard UDP) +- **IPv6 disabled:** System-wide via sysctl to prevent bypass of IPv4 firewall rules - **Internal services** (RQLite 5001/7001, IPFS 4001/4501, Olric 3320/3322, Gateway 6001) are only accessible via WireGuard or localhost - **Invite tokens:** Single-use, time-limited tokens for secure node joining. No shared secrets on the CLI -- **Join flow:** New nodes authenticate via HTTPS (443), establish WireGuard tunnel, then join all services over the encrypted mesh +- **Join flow:** New nodes authenticate via HTTPS (443) with TOFU certificate pinning, establish WireGuard tunnel, then join all services over the encrypted mesh + +### Service Authentication + +- **RQLite:** HTTP basic auth on all queries/executions — credentials generated at genesis, distributed via join response +- **Olric:** Memberlist gossip encrypted with a shared 32-byte key +- **IPFS Cluster:** TrustedPeers restricted to known cluster peer IDs (not `*`) +- **Internal endpoints:** `/v1/internal/wg/peers` and `/v1/internal/wg/peer/remove` require cluster secret +- **Vault:** V1 push/pull endpoints require session token authentication when guardian is configured +- **WebSockets:** Origin header validated against the node's configured domain + +### Token & Key Security + +- **Refresh tokens:** Stored as SHA-256 hashes (never plaintext) +- **API keys:** Stored as HMAC-SHA256 hashes with a server-side secret +- **TURN secrets:** Encrypted at rest with 
AES-256-GCM (key derived from cluster secret) +- **Binary signing:** Build archives signed with rootwallet EVM signature, verified on install + +### Process Isolation + +- **Dedicated user:** All services run as `orama` user (not root) +- **systemd hardening:** `ProtectSystem=strict`, `NoNewPrivileges=yes`, `PrivateDevices=yes`, etc. +- **Capabilities:** Caddy and CoreDNS get `CAP_NET_BIND_SERVICE` for privileged ports + +See [SECURITY.md](SECURITY.md) for the full security hardening reference. ### TLS/HTTPS @@ -504,6 +529,31 @@ WebRTC uses a separate port allocation system from core namespace services: See [docs/WEBRTC.md](WEBRTC.md) for full details including client integration, API reference, and debugging. +## OramaOS + +For mainnet, devnet, and testnet environments, nodes run **OramaOS** — a custom minimal Linux image built with Buildroot. + +**Key properties:** +- No SSH, no shell — operators cannot access the filesystem +- LUKS full-disk encryption with Shamir key distribution across peers +- Read-only rootfs (SquashFS + dm-verity) +- A/B partition updates with cryptographic signature verification +- Service sandboxing via Linux namespaces + seccomp +- Single root process: the **orama-agent** + +**The orama-agent manages:** +- Boot sequence and LUKS key reconstruction +- WireGuard tunnel setup +- Service lifecycle in sandboxed namespaces +- Command reception from Gateway over WireGuard (port 9998) +- OS updates (download, verify, A/B swap, reboot with rollback) + +**Node enrollment:** OramaOS nodes join via `orama node enroll` instead of `orama node install`. The enrollment flow uses a registration code + invite token + wallet verification. + +See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for the full deployment guide. + +Sandbox clusters remain on Ubuntu for development convenience. + ## Future Enhancements 1. 
**GraphQL Support** - GraphQL gateway alongside REST diff --git a/docs/CLEAN_NODE.md b/docs/CLEAN_NODE.md index 8414394..d8b6a9f 100644 --- a/docs/CLEAN_NODE.md +++ b/docs/CLEAN_NODE.md @@ -2,6 +2,8 @@ How to completely remove all Orama Network state from a VPS so it can be reinstalled fresh. +> **OramaOS nodes:** This guide applies to Ubuntu-based nodes only. OramaOS has no SSH or shell access. To remove an OramaOS node: use `POST /v1/node/leave` via the Gateway API for graceful departure, or reflash the OramaOS image via your VPS provider's dashboard for a factory reset. See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for details. + ## Quick Clean (Copy-Paste) Run this as root or with sudo on the target VPS: diff --git a/docs/COMMON_PROBLEMS.md b/docs/COMMON_PROBLEMS.md index ae6d9ff..5d60f3e 100644 --- a/docs/COMMON_PROBLEMS.md +++ b/docs/COMMON_PROBLEMS.md @@ -150,6 +150,62 @@ ssh -n user@host 'command' --- +--- + +## 6. RQLite returns 401 Unauthorized + +**Symptom:** RQLite queries fail with HTTP 401 after security hardening. + +**Cause:** RQLite now requires basic auth. The client isn't sending credentials. + +**Fix:** Ensure the RQLite client is configured with the credentials from `/opt/orama/.orama/secrets/rqlite-auth.json`. The central RQLite client wrapper (`pkg/rqlite/client.go`) handles this automatically. If using a standalone client (e.g., CoreDNS plugin), ensure it's also configured. + +--- + +## 7. Olric cluster split after upgrade + +**Symptom:** Olric nodes can't gossip after enabling memberlist encryption. + +**Cause:** Olric memberlist encryption is all-or-nothing. Nodes with encryption can't communicate with nodes without it. + +**Fix:** All nodes must be restarted simultaneously when enabling Olric encryption. The cache will be lost (it rebuilds from DB). This is expected — Olric is a cache, not persistent storage. + +--- + +## 8. OramaOS: LUKS unlock fails + +**Symptom:** OramaOS node can't reconstruct its LUKS key after reboot. 
+ +**Cause:** Not enough peer vault-guardians are online to meet the Shamir threshold (K = max(3, N/3)). + +**Fix:** Ensure enough cluster nodes are online and reachable over WireGuard. The agent retries with exponential backoff. For genesis nodes before 5+ peers exist, use: + +```bash +orama node unlock --genesis --node-ip +``` + +--- + +## 9. OramaOS: Enrollment timeout + +**Symptom:** `orama node enroll` hangs or times out. + +**Cause:** The OramaOS node's port 9999 isn't reachable, or the Gateway can't reach the node's WebSocket. + +**Fix:** Check that port 9999 is open in your VPS provider's external firewall (Hetzner firewall, AWS security groups, etc.). OramaOS opens it internally, but provider-level firewalls must be configured separately. + +--- + +## 10. Binary signature verification fails + +**Symptom:** `orama node install` rejects the binary archive with a signature error. + +**Cause:** The archive was tampered with, or the manifest.sig file is missing/corrupted. + +**Fix:** Rebuild the archive with `orama build` and re-sign with `make sign` (in the orama-os repo). Ensure you're using the rootwallet that matches the embedded signer address. + +--- + ## General Debugging Tips - **Always use `sudo orama node restart`** instead of raw `systemctl` commands @@ -158,3 +214,4 @@ ssh -n user@host 'command' - **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes - **Check gateway health:** `curl http://localhost:/v1/health` from the node itself - **Node IPs:** Check `scripts/remote-nodes.conf` for credentials, `wg show wg0` for WG IPs +- **OramaOS nodes:** No SSH access — use Gateway API endpoints (`/v1/node/status`, `/v1/node/logs`) for diagnostics diff --git a/docs/DEV_DEPLOY.md b/docs/DEV_DEPLOY.md index 07265a4..09bbbdc 100644 --- a/docs/DEV_DEPLOY.md +++ b/docs/DEV_DEPLOY.md @@ -320,7 +320,35 @@ is properly configured, always use the HTTPS domain URL. UFW from external access. 
The join request goes through Caddy on port 80 (HTTP) or 443 (HTTPS), which proxies to the gateway internally. -## Pre-Install Checklist +## OramaOS Enrollment + +For OramaOS nodes (mainnet, devnet, testnet), use the enrollment flow instead of `orama node install`: + +```bash +# 1. Flash OramaOS image to VPS (via provider dashboard) +# 2. Generate invite token on existing cluster node +orama node invite --expiry 24h + +# 3. Enroll the OramaOS node +orama node enroll --node-ip <node-ip> --token <token> --gateway <gateway-url> + +# 4. For genesis node reboots (before 5+ peers exist) +orama node unlock --genesis --node-ip <node-ip> +``` + +OramaOS nodes have no SSH access. All management happens through the Gateway API: + +```bash +# Status, logs, commands — all via Gateway proxy +curl "https://gateway.example.com/v1/node/status?node_id=<node-id>" +curl "https://gateway.example.com/v1/node/logs?node_id=<node-id>&service=gateway" +``` + +See [ORAMAOS_DEPLOYMENT.md](ORAMAOS_DEPLOYMENT.md) for the full guide. + +**Note:** `orama node clean` does not work on OramaOS nodes (no SSH). Use `orama node leave` for graceful departure, or reflash the image for a factory reset. + +## Pre-Install Checklist (Ubuntu Only) Before running `orama node install` on a VPS, ensure: diff --git a/docs/ORAMAOS_DEPLOYMENT.md b/docs/ORAMAOS_DEPLOYMENT.md new file mode 100644 index 0000000..ebdd3b3 --- /dev/null +++ b/docs/ORAMAOS_DEPLOYMENT.md @@ -0,0 +1,233 @@ +# OramaOS Deployment Guide + +OramaOS is a custom minimal Linux image built with Buildroot. It replaces the standard Ubuntu-based node deployment for mainnet, devnet, and testnet environments. Sandbox clusters remain on Ubuntu for development convenience. + +## What is OramaOS? + +OramaOS is a locked-down operating system designed specifically for Orama node operators. 
Key properties: + +- **No SSH, no shell** — operators cannot access the filesystem or run commands on the machine +- **LUKS full-disk encryption** — the data partition is encrypted; the key is split via Shamir's Secret Sharing across peer nodes +- **Read-only rootfs** — the OS image uses SquashFS with dm-verity integrity verification +- **A/B partition updates** — signed OS images are applied atomically with automatic rollback on failure +- **Service sandboxing** — each service runs in its own Linux namespace with seccomp syscall filtering +- **Signed binaries** — all updates are cryptographically signed with the Orama rootwallet + +## Architecture + +``` +Partition Layout: + /dev/sda1 — ESP (EFI System Partition, systemd-boot) + /dev/sda2 — rootfs-A (SquashFS, read-only, dm-verity) + /dev/sda3 — rootfs-B (standby, for A/B updates) + /dev/sda4 — data (LUKS2 encrypted, ext4) + +Boot Flow: + systemd-boot → dm-verity rootfs → orama-agent → WireGuard → services +``` + +The **orama-agent** is the only root process. It manages: +- Boot sequence and LUKS key reconstruction +- WireGuard tunnel setup +- Service lifecycle (start, stop, restart in sandboxed namespaces) +- Command reception from the Gateway over WireGuard +- OS updates (download, verify signature, A/B swap, reboot) + +## Enrollment Flow + +OramaOS nodes join the cluster through an enrollment process (different from the Ubuntu `orama node install` flow): + +### Step 1: Flash OramaOS to VPS + +Download the OramaOS image and flash it to your VPS: + +```bash +# Download image (URL provided upon acceptance) +wget https://releases.orama.network/oramaos-v1.0.0-amd64.qcow2 + +# Flash to VPS (provider-specific — Hetzner, Vultr, etc.) +# Most providers support uploading custom images via their dashboard +``` + +### Step 2: First Boot — Enrollment Mode + +On first boot, the agent: +1. Generates a random 8-character registration code +2. Starts a temporary HTTP server on port 9999 +3. 
Opens an outbound WebSocket to the Gateway +4. Waits for enrollment to complete + +The registration code is displayed on the VPS console (if available) and served at `http://<node-ip>:9999/`. + +### Step 3: Run Enrollment from CLI + +On your local machine (where you have the `orama` CLI and rootwallet): + +```bash +# Generate an invite token on any existing cluster node +orama node invite --expiry 24h + +# Enroll the OramaOS node +orama node enroll --node-ip <node-ip> --token <token> --gateway <gateway-url> +``` + +The enrollment command: +1. Fetches the registration code from the node (port 9999) +2. Sends the code + invite token to the Gateway +3. Gateway validates everything, assigns a WireGuard IP, and pushes config to the node +4. Node configures WireGuard, formats the LUKS-encrypted data partition +5. LUKS key is split via Shamir and distributed to peer vault-guardians +6. Services start in sandboxed namespaces +7. Port 9999 closes permanently + +### Step 4: Verify + +```bash +# Check the node is online and healthy +orama monitor report --env <env> +``` + +## Genesis Node + +The first OramaOS node in a cluster is the **genesis node**. It has a special boot path because there are no peers yet for Shamir key distribution: + +1. Genesis generates a LUKS key and encrypts the data partition +2. The LUKS key is encrypted with a rootwallet-derived key and stored on the unencrypted rootfs +3. On reboot (before enough peers exist), the operator must manually unlock: + +```bash +orama node unlock --genesis --node-ip <node-ip> +``` + +This command: +1. Fetches the encrypted genesis key from the node +2. Decrypts it using the rootwallet (`rw decrypt`) +3. Sends the decrypted LUKS key to the agent over WireGuard + +Once 5+ peers have joined, the genesis node distributes Shamir shares to peers, deletes the local encrypted key, and transitions to normal Shamir-based unlock. After this transition, `orama node unlock` is no longer needed. + +## Normal Reboot (Shamir Unlock) + +When an enrolled OramaOS node reboots: + +1. 
Agent starts, brings up WireGuard +2. Contacts peer vault-guardians over WireGuard +3. Fetches K Shamir shares (K = threshold, typically `max(3, N/3)`) +4. Reconstructs LUKS key via Lagrange interpolation over GF(256) +5. Decrypts and mounts data partition +6. Starts all services +7. Zeros key from memory + +If not enough peers are available, the agent enters a degraded "waiting for peers" state and retries with exponential backoff (1s, 2s, 4s, 8s, 16s, max 5 retries per cycle). + +## Node Management + +Since OramaOS has no SSH, all management happens through the Gateway API: + +```bash +# Check node status +curl "https://gateway.example.com/v1/node/status?node_id=" + +# Send a command (e.g., restart a service) +curl -X POST "https://gateway.example.com/v1/node/command?node_id=" \ + -H "Content-Type: application/json" \ + -d '{"action":"restart","service":"rqlite"}' + +# View logs +curl "https://gateway.example.com/v1/node/logs?node_id=&service=gateway&lines=100" + +# Graceful node departure +curl -X POST "https://gateway.example.com/v1/node/leave" \ + -H "Content-Type: application/json" \ + -d '{"node_id":""}' +``` + +The Gateway proxies these requests to the agent over WireGuard (port 9998). The agent is never directly accessible from the public internet. + +## OS Updates + +OramaOS uses an A/B partition scheme for atomic, rollback-safe updates: + +1. Agent periodically checks for new versions +2. Downloads the signed image (P2P over WireGuard between nodes) +3. Verifies the rootwallet EVM signature against the embedded public key +4. Writes to the standby partition (if running from A, writes to B) +5. Sets systemd-boot to boot from B with `tries_left=3` +6. Reboots +7. If B boots successfully (agent starts, WG connects, services healthy): marks B as "good" +8. If B fails 3 times: systemd-boot automatically falls back to A + +No operator intervention is needed for updates. Failed updates are automatically rolled back. 
+ +## Service Sandboxing + +Each service on OramaOS runs in an isolated environment: + +- **Mount namespace** — each service only sees its own data directory as writable; everything else is read-only +- **UTS namespace** — isolated hostname +- **Dedicated UID/GID** — each service runs as a different user (not root) +- **Seccomp filtering** — per-service syscall allowlist (initially in audit mode, then enforce mode) + +Services and their sandbox profiles: +| Service | Writable Path | Extra Syscalls | +|---------|--------------|----------------| +| RQLite | `/opt/orama/.orama/data/rqlite` | fsync, fdatasync (Raft + SQLite WAL) | +| Olric | `/opt/orama/.orama/data/olric` | sendmmsg, recvmmsg (gossip) | +| IPFS | `/opt/orama/.orama/data/ipfs` | sendfile, splice (data transfer) | +| Gateway | `/opt/orama/.orama/data/gateway` | sendfile, splice (HTTP) | +| CoreDNS | `/opt/orama/.orama/data/coredns` | sendmmsg, recvmmsg (DNS) | + +## OramaOS vs Ubuntu Deployment + +| Feature | Ubuntu | OramaOS | +|---------|--------|---------| +| SSH access | Yes | No | +| Shell access | Yes | No | +| Disk encryption | No | LUKS2 (Shamir) | +| OS updates | Manual (`orama node upgrade`) | Automatic (signed, A/B) | +| Service isolation | systemd only | Namespaces + seccomp | +| Rootfs integrity | None | dm-verity | +| Binary signing | Optional | Required | +| Operator data access | Full | None | +| Environments | All (including sandbox) | Mainnet, devnet, testnet | + +## Cleaning / Factory Reset + +OramaOS nodes cannot be cleaned with the standard `orama node clean` command (no SSH access). 
Instead: + +- **Graceful departure:** `orama node leave` via the Gateway API — stops services, redistributes Shamir shares, removes WG peer +- **Factory reset:** Reflash the OramaOS image on the VPS via the hosting provider's dashboard +- **Data is unrecoverable:** Since the LUKS key is distributed across peers, reflashing destroys all data permanently + +## Troubleshooting + +### Node stuck in enrollment mode +The node boots but enrollment never completes. + +**Check:** Can you reach `http://<node-ip>:9999/` from your machine? If not, the VPS firewall may be blocking port 9999. + +**Fix:** Ensure port 9999 is open in the VPS provider's firewall. OramaOS opens it automatically via its internal firewall, but external provider firewalls (Hetzner, AWS security groups) must be configured separately. + +### LUKS unlock fails (not enough peers) +After reboot, the node can't reconstruct its LUKS key. + +**Check:** How many peer nodes are online? The node needs at least K peers (threshold) to be reachable over WireGuard. + +**Fix:** Ensure enough cluster nodes are online. If this is the genesis node and fewer than 5 peers exist, use: +```bash +orama node unlock --genesis --node-ip <node-ip> +``` + +### Update failed, node rolled back +The node applied an update but reverted to the previous version. + +**Check:** The agent logs will show why the new partition failed to boot (accessible via `GET /v1/node/logs?service=agent`). + +**Common causes:** Corrupted download (signature verification should catch this), hardware issue, or incompatible configuration. + +### Services not starting after reboot +The node rebooted and LUKS unlocked, but services are unhealthy. + +**Check:** `GET /v1/node/status` — which services are down? + +**Fix:** Try restarting the specific service via `POST /v1/node/command` with `{"action":"restart","service":"<service>"}`. If the issue persists, check service logs. 
diff --git a/docs/SECURITY.md b/docs/SECURITY.md new file mode 100644 index 0000000..7eabc85 --- /dev/null +++ b/docs/SECURITY.md @@ -0,0 +1,194 @@ +# Security Hardening + +This document describes all security measures applied to the Orama Network, covering both Phase 1 (service hardening on existing Ubuntu nodes) and Phase 2 (OramaOS locked-down image). + +## Phase 1: Service Hardening + +These measures apply to all nodes (Ubuntu and OramaOS). + +### Network Isolation + +**CIDR Validation (Step 1.1)** +- WireGuard subnet restricted to `10.0.0.0/24` across all components: firewall rules, rate limiter, auth module, and WireGuard PostUp/PostDown iptables rules +- Prevents other tenants on shared VPS providers from bypassing the firewall via overlapping `10.x.x.x` ranges + +**IPv6 Disabled (Step 1.2)** +- IPv6 disabled system-wide via sysctl: `net.ipv6.conf.all.disable_ipv6=1` +- Prevents services bound to `0.0.0.0` from being reachable via IPv6 (which had no firewall rules) + +### Authentication + +**Internal Endpoint Auth (Step 1.3)** +- `/v1/internal/wg/peers` and `/v1/internal/wg/peer/remove` now require cluster secret validation +- Peer removal additionally validates the request originates from a WireGuard subnet IP + +**RQLite Authentication (Step 1.7)** +- RQLite runs with `-auth` flag pointing to a credentials file +- All RQLite HTTP requests include `Authorization: Basic ` headers +- Credentials generated at cluster genesis, distributed to joining nodes via join response +- Both the central RQLite client wrapper and the standalone CoreDNS RQLite client send auth + +**Olric Gossip Encryption (Step 1.8)** +- Olric memberlist uses a 32-byte encryption key for all gossip traffic +- Key generated at genesis, distributed via join response +- Prevents rogue nodes from joining the gossip ring and poisoning caches +- Note: encryption is all-or-nothing (coordinated restart required when enabling) + +**IPFS Cluster TrustedPeers (Step 1.9)** +- IPFS Cluster 
`TrustedPeers` populated with actual cluster peer IDs (was `["*"]`) +- New peers added to TrustedPeers on all existing nodes during join +- Prevents unauthorized peers from controlling IPFS pinning + +**Vault V1 Auth Enforcement (Step 1.14)** +- V1 push/pull endpoints require a valid session token when vault-guardian is configured +- Previously, auth was optional for backward compatibility — any WG peer could read/overwrite Shamir shares + +### Token & Key Storage + +**Refresh Token Hashing (Step 1.5)** +- Refresh tokens stored as SHA-256 hashes in RQLite (never plaintext) +- On lookup: hash the incoming token, query by hash +- On revocation: hash before revoking (both single-token and by-subject) +- Existing tokens invalidated on upgrade (users re-authenticate) + +**API Key Hashing (Step 1.6)** +- API keys stored as HMAC-SHA256 hashes using a server-side secret +- HMAC secret generated at cluster genesis, stored in `~/.orama/secrets/api-key-hmac-secret` +- On lookup: compute HMAC, query by hash — fast enough for every request (unlike bcrypt) +- In-memory cache uses raw key as cache key (never persisted) +- During rolling upgrade: dual lookup (HMAC first, then raw as fallback) until all nodes upgraded + +**TURN Secret Encryption (Step 1.15)** +- TURN shared secrets encrypted at rest in RQLite using AES-256-GCM +- Encryption key derived via HKDF from the cluster secret with purpose string `"turn-encryption"` + +### TLS & Transport + +**InsecureSkipVerify Fix (Step 1.10)** +- During node join, TLS verification uses TOFU (Trust On First Use) +- Invite token output includes the CA certificate fingerprint (SHA-256) +- Joining node verifies the server cert fingerprint matches before proceeding +- After join: CA cert stored locally for future connections + +**WebSocket Origin Validation (Step 1.4)** +- All WebSocket upgraders validate the `Origin` header against the node's configured domain +- Non-browser clients (no Origin header) are still allowed +- Prevents cross-site 
WebSocket hijacking attacks + +### Process Isolation + +**Dedicated User (Step 1.11)** +- All services run as the `orama` user (not root) +- Caddy and CoreDNS get `AmbientCapabilities=CAP_NET_BIND_SERVICE` for ports 80/443 and 53 +- WireGuard stays as root (kernel netlink requires it) +- vault-guardian already had proper hardening + +**systemd Hardening (Step 1.12)** +- All service units include: + ```ini + ProtectSystem=strict + ProtectHome=yes + NoNewPrivileges=yes + PrivateDevices=yes + ProtectKernelTunables=yes + ProtectKernelModules=yes + RestrictNamespaces=yes + ReadWritePaths=/opt/orama/.orama + ``` +- Applied to both template files (`pkg/environments/templates/`) and hardcoded unit generators (`pkg/environments/production/services.go`) + +### Supply Chain + +**Binary Signing (Step 1.13)** +- Build archives include `manifest.sig` — a rootwallet EVM signature of the manifest hash +- During install, the signature is verified against the embedded Orama public key +- Unsigned or tampered archives are rejected + +## Phase 2: OramaOS + +These measures apply only to OramaOS nodes (mainnet, devnet, testnet). 
+ +### Immutable OS + +- **Read-only rootfs** — SquashFS with dm-verity integrity verification +- **No shell** — `/bin/sh` symlinked to `/bin/false`, no bash/ash/ssh +- **No SSH** — OpenSSH not included in the image +- **Minimal packages** — only what's needed for systemd, cryptsetup, and the agent + +### Full-Disk Encryption + +- **LUKS2** with AES-XTS-Plain64 on the data partition +- **Shamir's Secret Sharing** over GF(256) — LUKS key split across peer vault-guardians +- **Adaptive threshold** — K = max(3, N/3) where N is the number of peers +- **Key zeroing** — LUKS key wiped from memory immediately after use +- **Malicious share detection** — fetch K+1 shares when possible, verify consistency + +### Service Sandboxing + +Each service runs in isolated Linux namespaces: +- **CLONE_NEWNS** — mount namespace (filesystem isolation) +- **CLONE_NEWUTS** — hostname namespace +- **Dedicated UID/GID** — each service has its own user +- **Seccomp filtering** — per-service syscall allowlist + +Note: CLONE_NEWPID is intentionally omitted — it makes services PID 1 in their namespace, which changes signal semantics (SIGTERM ignored by default for PID 1). 
+ +### Signed Updates + +- A/B partition scheme with systemd-boot and boot counting (`tries_left=3`) +- All updates signed with rootwallet EVM signature (secp256k1 + keccak256) +- Signer address: `0xb5d8a496c8b2412990d7D467E17727fdF5954afC` +- P2P distribution over WireGuard between nodes +- Automatic rollback on 3 consecutive boot failures + +### Zero Operator Access + +- Operators cannot read data on the machine (LUKS encrypted, no shell) +- Management only through Gateway API → agent over WireGuard +- All commands are logged and auditable +- No root access, no console access, no file system access + +## Rollout Strategy + +### Phase 1 Batches + +``` +Batch 1 (zero-risk, no restart): + - CIDR fix + - IPv6 disable + - Internal endpoint auth + - WebSocket origin check + +Batch 2 (medium-risk, restart needed): + - Hash refresh tokens + - Hash API keys + - Binary signing + - Vault V1 auth enforcement + - TURN secret encryption + +Batch 3 (high-risk, coordinated rollout): + - RQLite auth (followers first, leader last) + - Olric encryption (simultaneous restart) + - IPFS Cluster TrustedPeers + +Batch 4 (infrastructure changes): + - InsecureSkipVerify fix + - Dedicated user + - systemd hardening +``` + +### Phase 2 + +1. Build and test OramaOS image in QEMU +2. Deploy to sandbox cluster alongside Ubuntu nodes +3. Verify interop and stability +4. 
Gradual migration: testnet → devnet → mainnet (one node at a time, maintaining Raft quorum) + +## Verification + +All changes verified on sandbox cluster before production deployment: + +- `make test` — all unit tests pass +- `orama monitor report --env sandbox` — full cluster health +- Manual endpoint testing (e.g., curl without auth → 401) +- Security-specific checks (IPv6 listeners, RQLite auth, binary signatures) diff --git a/pkg/cli/cmd/node/enroll.go b/pkg/cli/cmd/node/enroll.go new file mode 100644 index 0000000..ea99230 --- /dev/null +++ b/pkg/cli/cmd/node/enroll.go @@ -0,0 +1,26 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/enroll" + "github.com/spf13/cobra" +) + +var enrollCmd = &cobra.Command{ + Use: "enroll", + Short: "Enroll an OramaOS node into the cluster", + Long: `Enroll a freshly booted OramaOS node into the cluster. + +The OramaOS node displays a registration code on port 9999. Provide this code +along with an invite token to complete enrollment. The Gateway pushes cluster +configuration (WireGuard, secrets, peer list) to the node. + +Usage: + orama node enroll --node-ip --code --token --env + +The node must be reachable over the public internet on port 9999 (enrollment only). 
+After enrollment, port 9999 is permanently closed and all communication goes over WireGuard.`, + Run: func(cmd *cobra.Command, args []string) { + enroll.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/cmd/node/node.go b/pkg/cli/cmd/node/node.go index 5520571..74f9744 100644 --- a/pkg/cli/cmd/node/node.go +++ b/pkg/cli/cmd/node/node.go @@ -30,4 +30,6 @@ func init() { Cmd.AddCommand(rolloutCmd) Cmd.AddCommand(cleanCmd) Cmd.AddCommand(recoverRaftCmd) + Cmd.AddCommand(enrollCmd) + Cmd.AddCommand(unlockCmd) } diff --git a/pkg/cli/cmd/node/unlock.go b/pkg/cli/cmd/node/unlock.go new file mode 100644 index 0000000..522a8a8 --- /dev/null +++ b/pkg/cli/cmd/node/unlock.go @@ -0,0 +1,26 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/unlock" + "github.com/spf13/cobra" +) + +var unlockCmd = &cobra.Command{ + Use: "unlock", + Short: "Unlock an OramaOS genesis node", + Long: `Manually unlock a genesis OramaOS node that cannot reconstruct its LUKS key +via Shamir shares (not enough peers online). + +This is only needed for the genesis node before enough peers have joined for +Shamir-based unlock. Once 5+ peers exist, the genesis node transitions to +normal Shamir unlock and this command is no longer needed. + +Usage: + orama node unlock --genesis --node-ip + +The node must be reachable over WireGuard on port 9998.`, + Run: func(cmd *cobra.Command, args []string) { + unlock.Handle(args) + }, + DisableFlagParsing: true, +} diff --git a/pkg/cli/production/enroll/command.go b/pkg/cli/production/enroll/command.go new file mode 100644 index 0000000..438ea71 --- /dev/null +++ b/pkg/cli/production/enroll/command.go @@ -0,0 +1,123 @@ +// Package enroll implements the OramaOS node enrollment command. +// +// Flow: +// 1. Operator fetches registration code from the OramaOS node (port 9999) +// 2. Operator provides code + invite token to Gateway +// 3. Gateway validates, generates cluster config, pushes to node +// 4. 
Node configures WireGuard, encrypts data partition, starts services +package enroll + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "time" +) + +// Handle processes the enroll command. +func Handle(args []string) { + flags, err := ParseFlags(args) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + // Step 1: Fetch registration code from the OramaOS node + fmt.Printf("Fetching registration code from %s:9999...\n", flags.NodeIP) + + var code string + if flags.Code != "" { + // Code provided directly — skip fetch + code = flags.Code + } else { + fetchedCode, err := fetchRegistrationCode(flags.NodeIP) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: could not reach OramaOS node: %v\n", err) + fmt.Fprintf(os.Stderr, "Make sure the node is booted and port 9999 is reachable.\n") + os.Exit(1) + } + code = fetchedCode + } + + fmt.Printf("Registration code: %s\n", code) + + // Step 2: Send enrollment request to the Gateway + fmt.Printf("Sending enrollment to Gateway at %s...\n", flags.GatewayURL) + + if err := enrollWithGateway(flags.GatewayURL, flags.Token, code, flags.NodeIP); err != nil { + fmt.Fprintf(os.Stderr, "Error: enrollment failed: %v\n", err) + os.Exit(1) + } + + fmt.Printf("Node %s enrolled successfully.\n", flags.NodeIP) + fmt.Printf("The node is now configuring WireGuard and encrypting its data partition.\n") + fmt.Printf("This may take a few minutes. Check status with: orama node status --env %s\n", flags.Env) +} + +// fetchRegistrationCode retrieves the one-time registration code from the OramaOS node. 
+func fetchRegistrationCode(nodeIP string) (string, error) { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9999/", nodeIP)) + if err != nil { + return "", fmt.Errorf("GET failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusGone { + return "", fmt.Errorf("registration code already served (node may be partially enrolled)") + } + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("unexpected status %d", resp.StatusCode) + } + + var result struct { + Code string `json:"code"` + Expires string `json:"expires"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("invalid response: %w", err) + } + + return result.Code, nil +} + +// enrollWithGateway sends the enrollment request to the Gateway, which validates +// the code and token, then pushes cluster configuration to the OramaOS node. +func enrollWithGateway(gatewayURL, token, code, nodeIP string) error { + body, _ := json.Marshal(map[string]string{ + "code": code, + "token": token, + "node_ip": nodeIP, + }) + + req, err := http.NewRequest("POST", gatewayURL+"/v1/node/enroll", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+token) + + client := &http.Client{Timeout: 60 * time.Second} + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusUnauthorized { + return fmt.Errorf("invalid or expired invite token") + } + if resp.StatusCode == http.StatusBadRequest { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("bad request: %s", string(respBody)) + } + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("gateway returned %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} diff --git 
a/pkg/cli/production/enroll/flags.go b/pkg/cli/production/enroll/flags.go new file mode 100644 index 0000000..2277d6b --- /dev/null +++ b/pkg/cli/production/enroll/flags.go @@ -0,0 +1,46 @@ +package enroll + +import ( + "flag" + "fmt" + "os" +) + +// Flags holds the parsed command-line flags for the enroll command. +type Flags struct { + NodeIP string // Public IP of the OramaOS node + Code string // Registration code (optional — fetched automatically if not provided) + Token string // Invite token for cluster joining + GatewayURL string // Gateway HTTPS URL + Env string // Environment name (for display only) +} + +// ParseFlags parses the enroll command flags. +func ParseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("enroll", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + + fs.StringVar(&flags.NodeIP, "node-ip", "", "Public IP of the OramaOS node (required)") + fs.StringVar(&flags.Code, "code", "", "Registration code from the node (auto-fetched if not provided)") + fs.StringVar(&flags.Token, "token", "", "Invite token for cluster joining (required)") + fs.StringVar(&flags.GatewayURL, "gateway", "", "Gateway URL (required, e.g. https://gateway.example.com)") + fs.StringVar(&flags.Env, "env", "production", "Environment name") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.NodeIP == "" { + return nil, fmt.Errorf("--node-ip is required") + } + if flags.Token == "" { + return nil, fmt.Errorf("--token is required") + } + if flags.GatewayURL == "" { + return nil, fmt.Errorf("--gateway is required") + } + + return flags, nil +} diff --git a/pkg/cli/production/unlock/command.go b/pkg/cli/production/unlock/command.go new file mode 100644 index 0000000..b6111eb --- /dev/null +++ b/pkg/cli/production/unlock/command.go @@ -0,0 +1,166 @@ +// Package unlock implements the genesis node unlock command. 
+// +// When the genesis OramaOS node reboots before enough peers exist for +// Shamir-based LUKS key reconstruction, the operator must manually provide +// the LUKS key. This command reads the encrypted genesis key from the +// node's rootfs, decrypts it with the rootwallet, and sends it to the agent. +package unlock + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "flag" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + "time" +) + +// Flags holds parsed command-line flags. +type Flags struct { + NodeIP string // WireGuard IP of the OramaOS node + Genesis bool // Must be set to confirm genesis unlock + KeyFile string // Path to the encrypted genesis key file (optional override) +} + +// Handle processes the unlock command. +func Handle(args []string) { + flags, err := parseFlags(args) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + if !flags.Genesis { + fmt.Fprintf(os.Stderr, "Error: --genesis flag is required to confirm genesis unlock\n") + os.Exit(1) + } + + // Step 1: Read the encrypted genesis key from the node + fmt.Printf("Fetching encrypted genesis key from %s...\n", flags.NodeIP) + encKey, err := fetchGenesisKey(flags.NodeIP) + if err != nil && flags.KeyFile == "" { + fmt.Fprintf(os.Stderr, "Error: could not fetch genesis key from node: %v\n", err) + fmt.Fprintf(os.Stderr, "You can provide the key file directly with --key-file\n") + os.Exit(1) + } + + if flags.KeyFile != "" { + data, readErr := os.ReadFile(flags.KeyFile) + if readErr != nil { + fmt.Fprintf(os.Stderr, "Error: could not read key file: %v\n", readErr) + os.Exit(1) + } + encKey = strings.TrimSpace(string(data)) + } + + // Step 2: Decrypt with rootwallet + fmt.Println("Decrypting genesis key with rootwallet...") + luksKey, err := decryptGenesisKey(encKey) + if err != nil { + fmt.Fprintf(os.Stderr, "Error: decryption failed: %v\n", err) + os.Exit(1) + } + + // Step 3: Send LUKS key to the agent over WireGuard + fmt.Printf("Sending LUKS 
key to agent at %s:9998...\n", flags.NodeIP) + if err := sendUnlockKey(flags.NodeIP, luksKey); err != nil { + fmt.Fprintf(os.Stderr, "Error: unlock failed: %v\n", err) + os.Exit(1) + } + + fmt.Println("Genesis node unlocked successfully.") + fmt.Println("The node is decrypting and mounting its data partition.") +} + +func parseFlags(args []string) (*Flags, error) { + fs := flag.NewFlagSet("unlock", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + flags := &Flags{} + fs.StringVar(&flags.NodeIP, "node-ip", "", "WireGuard IP of the OramaOS node (required)") + fs.BoolVar(&flags.Genesis, "genesis", false, "Confirm genesis node unlock") + fs.StringVar(&flags.KeyFile, "key-file", "", "Path to encrypted genesis key file (optional)") + + if err := fs.Parse(args); err != nil { + return nil, err + } + + if flags.NodeIP == "" { + return nil, fmt.Errorf("--node-ip is required") + } + + return flags, nil +} + +// fetchGenesisKey retrieves the encrypted genesis key from the node. +// The agent serves it at GET /v1/agent/genesis-key (only during genesis unlock mode). +func fetchGenesisKey(nodeIP string) (string, error) { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9998/v1/agent/genesis-key", nodeIP)) + if err != nil { + return "", fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return "", fmt.Errorf("status %d: %s", resp.StatusCode, string(body)) + } + + var result struct { + EncryptedKey string `json:"encrypted_key"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return "", fmt.Errorf("invalid response: %w", err) + } + + return result.EncryptedKey, nil +} + +// decryptGenesisKey decrypts the AES-256-GCM encrypted LUKS key using rootwallet. 
+// The key was encrypted with: AES-256-GCM(luksKey, HKDF(rootwalletKey, "genesis-luks")) +// For now, we use `rw decrypt` if available, or a local HKDF+AES-GCM implementation. +func decryptGenesisKey(encryptedKey string) ([]byte, error) { + // Try rw decrypt first + cmd := exec.Command("rw", "decrypt", encryptedKey, "--purpose", "genesis-luks", "--chain", "evm") + output, err := cmd.Output() + if err == nil { + decoded, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(string(output))) + if decErr != nil { + return nil, fmt.Errorf("failed to decode decrypted key: %w", decErr) + } + return decoded, nil + } + + return nil, fmt.Errorf("rw decrypt failed: %w (is rootwallet installed and initialized?)", err) +} + +// sendUnlockKey sends the decrypted LUKS key to the agent's unlock endpoint. +func sendUnlockKey(nodeIP string, luksKey []byte) error { + body, _ := json.Marshal(map[string]string{ + "key": base64.StdEncoding.EncodeToString(luksKey), + }) + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Post( + fmt.Sprintf("http://%s:9998/v1/agent/unlock", nodeIP), + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return fmt.Errorf("request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + respBody, _ := io.ReadAll(resp.Body) + return fmt.Errorf("status %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} diff --git a/pkg/environments/production/prebuilt.go b/pkg/environments/production/prebuilt.go index ac424b3..6bbcba2 100644 --- a/pkg/environments/production/prebuilt.go +++ b/pkg/environments/production/prebuilt.go @@ -48,7 +48,7 @@ func LoadPreBuiltManifest() (*PreBuiltManifest, error) { // OramaSignerAddress is the Ethereum address authorized to sign build archives. // Archives signed by any other address are rejected during install. // This is the DeBros deploy wallet — update if the signing key rotates. 
-const OramaSignerAddress = "0x0000000000000000000000000000000000000000" // TODO: set real address +const OramaSignerAddress = "0xb5d8a496c8b2412990d7D467E17727fdF5954afC" // VerifyArchiveSignature verifies that the pre-built archive was signed by the // authorized Orama signer. Returns nil if the signature is valid, or if no diff --git a/pkg/gateway/gateway.go b/pkg/gateway/gateway.go index 0cecebc..389ab00 100644 --- a/pkg/gateway/gateway.go +++ b/pkg/gateway/gateway.go @@ -29,6 +29,7 @@ import ( deploymentshandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/deployments" pubsubhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/pubsub" serverlesshandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/serverless" + enrollhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/enroll" joinhandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/join" webrtchandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/webrtc" vaulthandlers "github.com/DeBrosOfficial/network/pkg/gateway/handlers/vault" @@ -133,6 +134,9 @@ type Gateway struct { // Node join handler joinHandler *joinhandlers.Handler + // OramaOS node enrollment handler + enrollHandler *enrollhandlers.Handler + // Cluster provisioning for namespace clusters clusterProvisioner authhandlers.ClusterProvisioner @@ -399,6 +403,7 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { if deps.ORMClient != nil { gw.wireguardHandler = wireguardhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.ClusterSecret) gw.joinHandler = joinhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.DataDir) + gw.enrollHandler = enrollhandlers.NewHandler(logger.Logger, deps.ORMClient, cfg.DataDir) gw.vaultHandlers = vaulthandlers.NewHandlers(logger, deps.Client) } diff --git a/pkg/gateway/handlers/enroll/handler.go b/pkg/gateway/handlers/enroll/handler.go new file mode 100644 index 0000000..1d4c2ff --- /dev/null +++ 
b/pkg/gateway/handlers/enroll/handler.go @@ -0,0 +1,435 @@ +// Package enroll implements the OramaOS node enrollment endpoint. +// +// Flow: +// 1. Operator's CLI sends POST /v1/node/enroll with code + token + node_ip +// 2. Gateway validates invite token (single-use) +// 3. Gateway assigns WG IP, registers peer, reads secrets +// 4. Gateway pushes cluster config to OramaOS node at node_ip:9999 +// 5. OramaOS node configures WG, encrypts data partition, starts services +package enroll + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/rqlite" + "go.uber.org/zap" +) + +// EnrollRequest is the request from the CLI. +type EnrollRequest struct { + Code string `json:"code"` + Token string `json:"token"` + NodeIP string `json:"node_ip"` +} + +// EnrollResponse is the configuration pushed to the OramaOS node. +type EnrollResponse struct { + NodeID string `json:"node_id"` + WireGuardConfig string `json:"wireguard_config"` + ClusterSecret string `json:"cluster_secret"` + Peers []PeerInfo `json:"peers"` +} + +// PeerInfo describes a cluster peer for LUKS key distribution. +type PeerInfo struct { + WGIP string `json:"wg_ip"` + NodeID string `json:"node_id"` +} + +// Handler handles OramaOS node enrollment. +type Handler struct { + logger *zap.Logger + rqliteClient rqlite.Client + oramaDir string +} + +// NewHandler creates a new enrollment handler. +func NewHandler(logger *zap.Logger, rqliteClient rqlite.Client, oramaDir string) *Handler { + return &Handler{ + logger: logger, + rqliteClient: rqliteClient, + oramaDir: oramaDir, + } +} + +// HandleEnroll handles POST /v1/node/enroll. 
+func (h *Handler) HandleEnroll(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + var req EnrollRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + if req.Code == "" || req.Token == "" || req.NodeIP == "" { + http.Error(w, "code, token, and node_ip are required", http.StatusBadRequest) + return + } + + ctx := r.Context() + + // 1. Validate invite token (single-use, same as join handler) + if err := h.consumeToken(ctx, req.Token, req.NodeIP); err != nil { + h.logger.Warn("enroll token validation failed", zap.Error(err)) + http.Error(w, "unauthorized: invalid or expired token", http.StatusUnauthorized) + return + } + + // 2. Verify registration code against the OramaOS node + if err := h.verifyCode(req.NodeIP, req.Code); err != nil { + h.logger.Warn("registration code verification failed", zap.Error(err)) + http.Error(w, "code verification failed: "+err.Error(), http.StatusBadRequest) + return + } + + // 3. Generate WG keypair for the OramaOS node + wgPrivKey, wgPubKey, err := generateWGKeypair() + if err != nil { + h.logger.Error("failed to generate WG keypair", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 4. Assign WG IP + wgIP, err := h.assignWGIP(ctx) + if err != nil { + h.logger.Error("failed to assign WG IP", zap.Error(err)) + http.Error(w, "failed to assign WG IP", http.StatusInternalServerError) + return + } + + nodeID := fmt.Sprintf("orama-node-%s", strings.ReplaceAll(wgIP, ".", "-")) + + // 5. 
Register WG peer in database + if _, err := h.rqliteClient.Exec(ctx, + "INSERT OR REPLACE INTO wireguard_peers (node_id, wg_ip, public_key, public_ip, wg_port) VALUES (?, ?, ?, ?, ?)", + nodeID, wgIP, wgPubKey, req.NodeIP, 51820); err != nil { + h.logger.Error("failed to register WG peer", zap.Error(err)) + http.Error(w, "failed to register peer", http.StatusInternalServerError) + return + } + + // 6. Add peer to local WireGuard interface + if err := h.addWGPeerLocally(wgPubKey, req.NodeIP, wgIP); err != nil { + h.logger.Warn("failed to add WG peer to local interface", zap.Error(err)) + } + + // 7. Read secrets + clusterSecret, err := os.ReadFile(h.oramaDir + "/secrets/cluster-secret") + if err != nil { + h.logger.Error("failed to read cluster secret", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 8. Build WireGuard config for the OramaOS node + wgConfig, err := h.buildWGConfig(ctx, wgPrivKey, wgIP) + if err != nil { + h.logger.Error("failed to build WG config", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 9. Get all peer WG IPs for LUKS key distribution + peers, err := h.getPeerList(ctx, wgIP) + if err != nil { + h.logger.Error("failed to get peer list", zap.Error(err)) + http.Error(w, "internal error", http.StatusInternalServerError) + return + } + + // 10. 
Push config to OramaOS node + enrollResp := EnrollResponse{ + NodeID: nodeID, + WireGuardConfig: wgConfig, + ClusterSecret: strings.TrimSpace(string(clusterSecret)), + Peers: peers, + } + + if err := h.pushConfigToNode(req.NodeIP, &enrollResp); err != nil { + h.logger.Error("failed to push config to node", zap.Error(err)) + http.Error(w, "failed to configure node: "+err.Error(), http.StatusInternalServerError) + return + } + + h.logger.Info("OramaOS node enrolled", + zap.String("node_id", nodeID), + zap.String("wg_ip", wgIP), + zap.String("public_ip", req.NodeIP)) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{ + "status": "enrolled", + "node_id": nodeID, + "wg_ip": wgIP, + }) +} + +// consumeToken validates and marks an invite token as used. +func (h *Handler) consumeToken(ctx context.Context, token, usedByIP string) error { + result, err := h.rqliteClient.Exec(ctx, + "UPDATE invite_tokens SET used_at = datetime('now'), used_by_ip = ? WHERE token = ? AND used_at IS NULL AND expires_at > datetime('now')", + usedByIP, token) + if err != nil { + return fmt.Errorf("database error: %w", err) + } + + rowsAffected, err := result.RowsAffected() + if err != nil { + return fmt.Errorf("failed to check result: %w", err) + } + + if rowsAffected == 0 { + return fmt.Errorf("token invalid, expired, or already used") + } + + return nil +} + +// verifyCode checks that the OramaOS node has the expected registration code. 
+func (h *Handler) verifyCode(nodeIP, expectedCode string) error { + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s:9999/", nodeIP)) + if err != nil { + return fmt.Errorf("cannot reach node at %s:9999: %w", nodeIP, err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusGone { + return fmt.Errorf("node already served its registration code") + } + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("node returned status %d", resp.StatusCode) + } + + var result struct { + Code string `json:"code"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return fmt.Errorf("invalid response from node: %w", err) + } + + if result.Code != expectedCode { + return fmt.Errorf("registration code mismatch") + } + + return nil +} + +// pushConfigToNode sends cluster configuration to the OramaOS node. +func (h *Handler) pushConfigToNode(nodeIP string, config *EnrollResponse) error { + body, err := json.Marshal(config) + if err != nil { + return err + } + + client := &http.Client{Timeout: 30 * time.Second} + resp, err := client.Post( + fmt.Sprintf("http://%s:9999/v1/agent/enroll/complete", nodeIP), + "application/json", + bytes.NewReader(body), + ) + if err != nil { + return fmt.Errorf("failed to push config: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("node returned status %d", resp.StatusCode) + } + + return nil +} + +// generateWGKeypair generates a WireGuard private/public keypair. 
+func generateWGKeypair() (privKey, pubKey string, err error) { + privOut, err := exec.Command("wg", "genkey").Output() + if err != nil { + return "", "", fmt.Errorf("wg genkey failed: %w", err) + } + privKey = strings.TrimSpace(string(privOut)) + + cmd := exec.Command("wg", "pubkey") + cmd.Stdin = strings.NewReader(privKey) + pubOut, err := cmd.Output() + if err != nil { + return "", "", fmt.Errorf("wg pubkey failed: %w", err) + } + pubKey = strings.TrimSpace(string(pubOut)) + + return privKey, pubKey, nil +} + +// assignWGIP finds the next available WG IP. +func (h *Handler) assignWGIP(ctx context.Context) (string, error) { + var rows []struct { + WGIP string `db:"wg_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, "SELECT wg_ip FROM wireguard_peers"); err != nil { + return "", fmt.Errorf("failed to query WG IPs: %w", err) + } + + if len(rows) == 0 { + return "10.0.0.2", nil + } + + maxD := 0 + maxC := 0 + for _, row := range rows { + var a, b, c, d int + if _, err := fmt.Sscanf(row.WGIP, "%d.%d.%d.%d", &a, &b, &c, &d); err != nil { + continue + } + if c > maxC || (c == maxC && d > maxD) { + maxC, maxD = c, d + } + } + + maxD++ + if maxD > 254 { + maxC++ + maxD = 1 + } + + return fmt.Sprintf("10.0.%d.%d", maxC, maxD), nil +} + +// addWGPeerLocally adds a peer to the local wg0 interface. +func (h *Handler) addWGPeerLocally(pubKey, publicIP, wgIP string) error { + cmd := exec.Command("wg", "set", "wg0", + "peer", pubKey, + "endpoint", fmt.Sprintf("%s:51820", publicIP), + "allowed-ips", fmt.Sprintf("%s/32", wgIP), + "persistent-keepalive", "25") + if output, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("wg set failed: %w\n%s", err, string(output)) + } + return nil +} + +// buildWGConfig generates a wg0.conf for the OramaOS node. 
+func (h *Handler) buildWGConfig(ctx context.Context, privKey, nodeWGIP string) (string, error) { + // Get this node's public key and WG IP + myPubKey, err := exec.Command("wg", "show", "wg0", "public-key").Output() + if err != nil { + return "", fmt.Errorf("failed to get local WG public key: %w", err) + } + + myWGIP, err := h.getMyWGIP() + if err != nil { + return "", fmt.Errorf("failed to get local WG IP: %w", err) + } + + myPublicIP, err := h.getMyPublicIP(ctx) + if err != nil { + return "", fmt.Errorf("failed to get local public IP: %w", err) + } + + var config strings.Builder + config.WriteString("[Interface]\n") + config.WriteString(fmt.Sprintf("PrivateKey = %s\n", privKey)) + config.WriteString(fmt.Sprintf("Address = %s/24\n", nodeWGIP)) + config.WriteString("ListenPort = 51820\n") + config.WriteString("\n") + + // Add this gateway node as a peer + config.WriteString("[Peer]\n") + config.WriteString(fmt.Sprintf("PublicKey = %s\n", strings.TrimSpace(string(myPubKey)))) + config.WriteString(fmt.Sprintf("Endpoint = %s:51820\n", myPublicIP)) + config.WriteString(fmt.Sprintf("AllowedIPs = %s/32\n", myWGIP)) + config.WriteString("PersistentKeepalive = 25\n") + + // Add all existing peers + type peerRow struct { + WGIP string `db:"wg_ip"` + PublicKey string `db:"public_key"` + PublicIP string `db:"public_ip"` + } + var peers []peerRow + if err := h.rqliteClient.Query(ctx, &peers, + "SELECT wg_ip, public_key, public_ip FROM wireguard_peers WHERE wg_ip != ?", nodeWGIP); err != nil { + h.logger.Warn("failed to query peers for WG config", zap.Error(err)) + } + + for _, p := range peers { + if p.PublicKey == strings.TrimSpace(string(myPubKey)) { + continue // already added above + } + config.WriteString(fmt.Sprintf("\n[Peer]\nPublicKey = %s\nEndpoint = %s:51820\nAllowedIPs = %s/32\nPersistentKeepalive = 25\n", + p.PublicKey, p.PublicIP, p.WGIP)) + } + + return config.String(), nil +} + +// getPeerList returns all cluster peers for LUKS key distribution. 
+func (h *Handler) getPeerList(ctx context.Context, excludeWGIP string) ([]PeerInfo, error) { + type peerRow struct { + NodeID string `db:"node_id"` + WGIP string `db:"wg_ip"` + } + var rows []peerRow + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT node_id, wg_ip FROM wireguard_peers WHERE wg_ip != ?", excludeWGIP); err != nil { + return nil, err + } + + peers := make([]PeerInfo, 0, len(rows)) + for _, row := range rows { + peers = append(peers, PeerInfo{ + WGIP: row.WGIP, + NodeID: row.NodeID, + }) + } + return peers, nil +} + +// getMyWGIP gets this node's WireGuard IP. +func (h *Handler) getMyWGIP() (string, error) { + out, err := exec.Command("ip", "-4", "addr", "show", "wg0").CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get wg0 info: %w", err) + } + for _, line := range strings.Split(string(out), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "inet ") { + parts := strings.Fields(line) + if len(parts) >= 2 { + return strings.Split(parts[1], "/")[0], nil + } + } + } + return "", fmt.Errorf("could not find wg0 IP") +} + +// getMyPublicIP reads this node's public IP from the database. 
+func (h *Handler) getMyPublicIP(ctx context.Context) (string, error) { + myWGIP, err := h.getMyWGIP() + if err != nil { + return "", err + } + var rows []struct { + PublicIP string `db:"public_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT public_ip FROM wireguard_peers WHERE wg_ip = ?", myWGIP); err != nil { + return "", err + } + if len(rows) == 0 { + return "", fmt.Errorf("no peer entry for WG IP %s", myWGIP) + } + return rows[0].PublicIP, nil +} diff --git a/pkg/gateway/handlers/enroll/node_proxy.go b/pkg/gateway/handlers/enroll/node_proxy.go new file mode 100644 index 0000000..9ca6f1b --- /dev/null +++ b/pkg/gateway/handlers/enroll/node_proxy.go @@ -0,0 +1,272 @@ +package enroll + +import ( + "context" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "os/exec" + "strings" + "time" + + "go.uber.org/zap" +) + +// HandleNodeStatus proxies GET /v1/node/status to the agent over WireGuard. +// Query param: ?node_id= or ?wg_ip= +func (h *Handler) HandleNodeStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Proxy to agent's status endpoint + body, statusCode, err := h.proxyToAgent(wgIP, "GET", "/v1/agent/status", nil) + if err != nil { + h.logger.Warn("failed to proxy status request", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeCommand proxies POST /v1/node/command to the agent over WireGuard. 
+func (h *Handler) HandleNodeCommand(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Read command body + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + cmdBody, err := io.ReadAll(r.Body) + if err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + // Proxy to agent's command endpoint + body, statusCode, err := h.proxyToAgent(wgIP, "POST", "/v1/agent/command", cmdBody) + if err != nil { + h.logger.Warn("failed to proxy command", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeLogs proxies GET /v1/node/logs to the agent over WireGuard. +// Query params: ?node_id=&service=&lines= +func (h *Handler) HandleNodeLogs(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + wgIP, err := h.resolveNodeIP(r) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + // Build query string for agent + service := r.URL.Query().Get("service") + lines := r.URL.Query().Get("lines") + agentPath := "/v1/agent/logs" + params := []string{} + if service != "" { + params = append(params, "service="+service) + } + if lines != "" { + params = append(params, "lines="+lines) + } + if len(params) > 0 { + agentPath += "?" 
+ strings.Join(params, "&") + } + + body, statusCode, err := h.proxyToAgent(wgIP, "GET", agentPath, nil) + if err != nil { + h.logger.Warn("failed to proxy logs request", zap.String("wg_ip", wgIP), zap.Error(err)) + http.Error(w, "node unreachable: "+err.Error(), http.StatusBadGateway) + return + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(body) +} + +// HandleNodeLeave handles POST /v1/node/leave — graceful node departure. +// Orchestrates: stop services → redistribute Shamir shares → remove WG peer. +func (h *Handler) HandleNodeLeave(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + + r.Body = http.MaxBytesReader(w, r.Body, 1<<20) + var req struct { + NodeID string `json:"node_id"` + WGIP string `json:"wg_ip"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "invalid request body", http.StatusBadRequest) + return + } + + wgIP := req.WGIP + if wgIP == "" && req.NodeID != "" { + resolved, err := h.nodeIDToWGIP(r.Context(), req.NodeID) + if err != nil { + http.Error(w, "node not found: "+err.Error(), http.StatusNotFound) + return + } + wgIP = resolved + } + if wgIP == "" { + http.Error(w, "node_id or wg_ip is required", http.StatusBadRequest) + return + } + + h.logger.Info("node leave requested", zap.String("wg_ip", wgIP)) + + // Step 1: Tell the agent to stop services + _, _, err := h.proxyToAgent(wgIP, "POST", "/v1/agent/command", + []byte(`{"action":"stop"}`)) + if err != nil { + h.logger.Warn("failed to stop services on leaving node", zap.Error(err)) + // Continue — node may already be down + } + + // Step 2: Remove WG peer from database + ctx := r.Context() + if _, err := h.rqliteClient.Exec(ctx, + "DELETE FROM wireguard_peers WHERE wg_ip = ?", wgIP); err != nil { + h.logger.Error("failed to remove WG peer from database", zap.Error(err)) + http.Error(w, "failed to remove 
peer", http.StatusInternalServerError) + return + } + + // Step 3: Remove from local WireGuard interface + // Get the peer's public key first + var rows []struct { + PublicKey string `db:"public_key"` + } + _ = h.rqliteClient.Query(ctx, &rows, + "SELECT public_key FROM wireguard_peers WHERE wg_ip = ?", wgIP) + // Peer already deleted above, but try to remove from wg0 anyway + h.removeWGPeerLocally(wgIP) + + h.logger.Info("node removed from cluster", zap.String("wg_ip", wgIP)) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(map[string]string{ + "status": "removed", + "wg_ip": wgIP, + }) +} + +// proxyToAgent sends an HTTP request to the OramaOS agent over WireGuard. +func (h *Handler) proxyToAgent(wgIP, method, path string, body []byte) ([]byte, int, error) { + url := fmt.Sprintf("http://%s:9998%s", wgIP, path) + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + var reqBody io.Reader + if body != nil { + reqBody = strings.NewReader(string(body)) + } + + req, err := http.NewRequestWithContext(ctx, method, url, reqBody) + if err != nil { + return nil, 0, err + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + client := &http.Client{Timeout: 15 * time.Second} + resp, err := client.Do(req) + if err != nil { + return nil, 0, fmt.Errorf("request to agent failed: %w", err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, resp.StatusCode, fmt.Errorf("failed to read agent response: %w", err) + } + + return respBody, resp.StatusCode, nil +} + +// resolveNodeIP extracts the WG IP from query parameters. 
+func (h *Handler) resolveNodeIP(r *http.Request) (string, error) { + wgIP := r.URL.Query().Get("wg_ip") + if wgIP != "" { + return wgIP, nil + } + + nodeID := r.URL.Query().Get("node_id") + if nodeID != "" { + return h.nodeIDToWGIP(r.Context(), nodeID) + } + + return "", fmt.Errorf("wg_ip or node_id query parameter is required") +} + +// nodeIDToWGIP resolves a node_id to its WireGuard IP. +func (h *Handler) nodeIDToWGIP(ctx context.Context, nodeID string) (string, error) { + var rows []struct { + WGIP string `db:"wg_ip"` + } + if err := h.rqliteClient.Query(ctx, &rows, + "SELECT wg_ip FROM wireguard_peers WHERE node_id = ?", nodeID); err != nil { + return "", err + } + if len(rows) == 0 { + return "", fmt.Errorf("no node found with id %s", nodeID) + } + return rows[0].WGIP, nil +} + +// removeWGPeerLocally removes a peer from the local wg0 interface by its allowed IP. +func (h *Handler) removeWGPeerLocally(wgIP string) { + // Find peer public key by allowed IP + out, err := exec.Command("wg", "show", "wg0", "dump").Output() + if err != nil { + log.Printf("failed to get wg dump: %v", err) + return + } + + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Split(line, "\t") + if len(fields) >= 4 && strings.Contains(fields[3], wgIP) { + pubKey := fields[0] + exec.Command("wg", "set", "wg0", "peer", pubKey, "remove").Run() + log.Printf("removed WG peer %s (%s)", pubKey[:8]+"...", wgIP) + return + } + } +} diff --git a/pkg/gateway/routes.go b/pkg/gateway/routes.go index a791eda..809b419 100644 --- a/pkg/gateway/routes.go +++ b/pkg/gateway/routes.go @@ -39,6 +39,15 @@ func (g *Gateway) Routes() http.Handler { mux.HandleFunc("/v1/internal/join", g.joinHandler.HandleJoin) } + // OramaOS node management (handler does its own auth) + if g.enrollHandler != nil { + mux.HandleFunc("/v1/node/enroll", g.enrollHandler.HandleEnroll) + mux.HandleFunc("/v1/node/status", g.enrollHandler.HandleNodeStatus) + mux.HandleFunc("/v1/node/command", 
g.enrollHandler.HandleNodeCommand) + mux.HandleFunc("/v1/node/logs", g.enrollHandler.HandleNodeLogs) + mux.HandleFunc("/v1/node/leave", g.enrollHandler.HandleNodeLeave) + } + // Namespace instance spawn/stop (internal, handler does its own auth) if g.spawnHandler != nil { mux.Handle("/v1/internal/namespace/spawn", g.spawnHandler)